Python Processor.annotate_entitiesの例

プログラミング言語: Python

名前空間/パッケージ名: processor

クラス/型: Processor

メソッド/関数: annotate_entities

hotexamples.comのコード掲載数: 1

Python Processor.annotate_entities - 1件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのprocessor.Processor.annotate_entitiesの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

Processor(30)

__init__(14)

addCard(6)

Task(5)

chargeCard(3)

canny(2)

add_tasks(2)

config(2)

add_next_process(2)

average_slope_intercept(2)

compress(2)

print_top_symbols(2)

checkProcessing(1)

count_first_attempts_per_skill(1)

count_by_boro(1)

costPerTimeUnit(1)

copy_one_file(1)

convert(1)

construct(1)

configure_notifiers(1)

configure_detectors(1)

clean_workbooks(1)

configure_client(1)

check_login(1)

clean_prepare_data(1)

compute_psd(1)

clean_words(1)

close(1)

cleanup(1)

count_messages_per_chat(1)

AddGroup(1)

count_per_boro(1)

join(1)

step_process(1)

sort_all_files(1)

save_ply(1)

save_all_workbooks(1)

process_scan(1)

process_html(1)

performance(1)

open(1)

load_from_file(1)

is_done(1)

count_per_nta(1)

get_results(1)

gather_all_files_and_split(1)

fit_transform(1)

filename(1)

done(1)

create_both_workbooks(1)

コード例 #1

ファイルを表示

ファイル: parser.py プロジェクト: Wushaowei001/article-tagger-system

class Parser(object):
	"""Pull story paragraphs out of HTML files and annotate their named entities.

	Text normalisation and entity annotation are delegated to a ``Processor``
	instance (``self.p``); annotated text is written to ``self.outfile``.
	"""

	# Matches the honorifics "Mr.", "Mrs." and "Dr." as whole tokens. The dot is
	# escaped and a word boundary is required so that ordinary words such as
	# "Drake" or "Mrs" (without a dot) are left untouched.
	_HONORIFIC_RE = re.compile(r"\b(?:Mrs|Mr|Dr)\.")

	def __init__(self,output):
		"""Create the text processor and, if *output* is given, open it for writing.

		When *output* is None no file is opened and ``self.outfile`` is None.
		"""
		self.p = Processor()
		self.outfile = None
		if output is not None:
			self.outfile = open(output,'w')

	def close(self):
		"""Close the output file if one is open; safe to call more than once."""
		if self.outfile is not None:
			self.outfile.close()
			self.outfile = None

	@staticmethod
	def _strip_honorifics(text):
		"""Return *text* with every "Mr.", "Mrs." and "Dr." token removed."""
		return Parser._HONORIFIC_RE.sub("", text)

	def _iter_story_texts(self,indir):
		"""Yield the text of every story paragraph found in the files of *indir*.

		'.DS_Store' entries are skipped. A paragraph is any ``<p>`` element whose
		class is "story-body-text".
		"""
		for fname in os.listdir(indir):
			if fname == '.DS_Store':
				continue
			path = os.path.join(indir, fname)
			with open(path, 'r') as inputfile:
				print("Parsing: " + path)
				content = inputfile.read()
			soup = BeautifulSoup(content,"html.parser")
			#class name is defined by the location of the story text in the html document
			for paragraph in soup.findAll("p", { "class" : "story-body-text" }):
				# NOTE(review): encode() yields bytes on Python 3, which the str
				# regex in extract_entities would reject - confirm the target
				# Python version before changing this.
				yield paragraph.get_text().encode('utf8')

	def parse_HTML(self,indir):
		"""Parses the HTML code of each document in *indir* and writes the body to the output file.

		Each story paragraph is processed, annotated with the entities detected
		in it, stripped of trailing whitespace and written on its own line.
		"""
		self.input = indir

		for text in self._iter_story_texts(indir):
			# Entities are detected on the raw text, before it is processed.
			entities = self.extract_entities(text)
			text = self.p.process(text)
			text = self.p.annotate_entities(entities,text)
			self.outfile.write(text.rstrip() + "\n")

	def parse_HTML_fix(self,indir,parsed_file):
		"""Annotate the contents of *parsed_file* with entities gathered from *indir*.

		Entities are collected from every story paragraph of every file in
		*indir* (the first label seen for an entity wins), then the whole text of
		*parsed_file* is annotated with them and written to the output file.
		"""
		self.input = indir
		with open(parsed_file, 'r') as f:
			input_text = f.read()

		total_entities = {}
		for text in self._iter_story_texts(indir):
			for ent, label in self.extract_entities(text).items():
				total_entities.setdefault(ent, label)

		input_text = self.p.annotate_entities(total_entities,input_text)
		self.outfile.write(input_text)

	def parse_recursive(self,indir,it):
		"""Parses HTML documents in different directories in a recursive manner.

		*it* is the number of directory levels still to descend: at 1 the files
		directly inside *indir* are parsed, otherwise every entry of *indir*
		(except '.DS_Store') is visited with ``it - 1``.
		"""
		if it == 1:
			# Joining with "" guarantees exactly one trailing separator.
			self.parse_HTML(os.path.join(indir, ""))
			return

		for fname in os.listdir(indir):
			if fname == '.DS_Store':
				continue
			# os.path.join inserts the separator that plain concatenation lost
			# at every level below the first.
			self.parse_recursive(os.path.join(indir, fname),(it-1))

	def extract_entities(self,text):
		"""Detects name entities (Person, Location, Organization) in text using the nltk library.

		Returns a dict mapping each entity, lower-cased with its words joined by
		"_", to the label of the first chunk in which it was seen.
		"""
		#remove honorifics in case they exist in the text
		text = self._strip_honorifics(text)

		entities = {}
		for sent in nltk.sent_tokenize(text):
			for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
				# Only subtrees are named entities; plain (word, tag) tuples are not.
				if not isinstance(chunk, nltk.Tree):
					continue
				ent = "_".join(leaf[0] for leaf in chunk.leaves()).lower()
				if ent not in entities:
					entities[ent] = chunk.label()

		return entities

	def parse_plain_text(self,text):
		"""Return *text* run through the processor with trailing whitespace removed."""
		return self.p.process(text).rstrip()