class Parser(object):
    """Extracts story text from HTML documents and annotates named entities.

    Text clean-up and entity annotation are delegated to a ``Processor``
    instance (``self.p``); entity detection uses nltk.
    """

    # Honorifics removed before entity detection. The word boundary and the
    # escaped dot keep ordinary words such as "Dresden" or "Mrozek" intact.
    _HONORIFICS = re.compile(r"\b(?:Mrs|Mr|Dr)\.")

    def __init__(self, output):
        """Create a parser.

        output -- path of the file to write results to, or None when the
                  instance is only used for methods that do not write
                  (e.g. ``parse_plain_text`` / ``extract_entities``).
        """
        self.p = Processor()
        # Always define the attribute so it can be inspected/closed safely.
        self.outfile = None
        if output is not None:
            self.outfile = open(output, 'w')

    def close(self):
        """Close the output file, if one was opened. Safe to call twice."""
        if self.outfile is not None:
            self.outfile.close()
            self.outfile = None

    def _iter_story_paragraphs(self, indir):
        """Yield the text of every story paragraph of every file in indir.

        Files named '.DS_Store' are skipped. A "Parsing: <path>" line is
        printed for each file that is read.
        """
        for fname in os.listdir(indir):
            if fname == '.DS_Store':
                continue
            path = os.path.join(indir, fname)
            with open(path, 'r') as inputfile:
                print("Parsing: " + path)
                content = inputfile.read()
            soup = BeautifulSoup(content, "html.parser")
            # The class name is defined by where the story text lives in
            # the HTML documents being parsed.
            for paragraph in soup.findAll("p", {"class": "story-body-text"}):
                # NOTE(review): encoding to UTF-8 bytes looks like a
                # Python 2 convention; under Python 3 the str regex in
                # extract_entities would reject bytes -- confirm the
                # target interpreter before changing this.
                yield paragraph.get_text().encode('utf8')

    def parse_HTML(self, indir):
        """Parses the HTML code of the documents in indir and writes the
        processed, entity-annotated body text to the output file, one
        paragraph per line."""
        self.input = indir
        for text in self._iter_story_paragraphs(indir):
            # Entities are detected on the raw text, before processing.
            entities = self.extract_entities(text)
            text = self.p.process(text)
            text = self.p.annotate_entities(entities, text)
            self.outfile.write(text.rstrip() + "\n")

    def parse_HTML_fix(self, indir, parsed_file):
        """Annotates an already parsed text file with the entities found in
        the HTML documents of indir and writes the result to the output
        file.

        indir       -- directory containing the HTML documents
        parsed_file -- path of the previously parsed text to annotate
        """
        self.input = indir
        with open(parsed_file, 'r') as f:
            input_text = f.read()

        total_entities = {}
        for text in self._iter_story_paragraphs(indir):
            for ent, label in self.extract_entities(text).items():
                # The first label seen for an entity wins.
                total_entities.setdefault(ent, label)

        input_text = self.p.annotate_entities(total_entities, input_text)
        self.outfile.write(input_text)

    def parse_recursive(self, indir, it):
        """Parses HTML documents in different directories in a recursive
        manner.

        indir -- directory to start from
        it    -- number of directory levels left; when it is 1, indir
                 itself holds the HTML files and is handed to parse_HTML
        """
        if it == 1:
            self.parse_HTML(indir + "/")
            return
        for fname in os.listdir(indir):
            if fname == '.DS_Store':
                continue
            # os.path.join inserts the separator that plain concatenation
            # dropped whenever indir did not end with one.
            self.parse_recursive(os.path.join(indir, fname), it - 1)

    @classmethod
    def _strip_honorifics(cls, text):
        """Return text with the honorifics "Mr.", "Mrs." and "Dr." removed."""
        return cls._HONORIFICS.sub("", text)

    def extract_entities(self, text):
        """Detects name entities (Person, Location, Organization) in text
        using the nltk library.

        Returns a dict mapping each entity (its words lower-cased and joined
        with "_") to the label nltk assigned to its first occurrence.
        """
        # Remove honorifics in case they exist in the text.
        text = self._strip_honorifics(text)

        entities = {}
        for sent in nltk.sent_tokenize(text):
            tagged = nltk.pos_tag(nltk.word_tokenize(sent))
            for chunk in nltk.ne_chunk(tagged):
                # Named entities come back as subtrees; plain tokens are
                # (word, tag) tuples.
                if chunk.__class__.__name__ != "Tree":
                    continue
                ent = "_".join(leaf[0] for leaf in chunk.leaves()).lower()
                if ent not in entities:
                    entities[ent] = chunk.label()
        return entities

    def parse_plain_text(self, text):
        """Returns text run through the processor, with trailing whitespace
        stripped."""
        return self.p.process(text).rstrip()