def extract_insert_info(self, document_id, path_to_html):
    """Extract entities from an HTML file and store them for a document.

    The file at ``path_to_html`` is read as UTF-8 text. Emails, phone
    numbers and named entities (people, organisations, locations) found in
    that text are each handed to ``self.clean_insert_entity`` together with
    ``document_id`` and an entity-type label. The database connection held
    in ``self.cur_db`` is committed once at the end.

    Args:
        document_id: Identifier passed through to every inserted entity.
        path_to_html: Path of the HTML file to scan.
    """
    cursor = self.cur_db.cursor()

    with codecs.open(path_to_html, 'rt', encoding='utf8') as html_file:
        raw_text = html_file.read()

    # Matches ending in an image extension are skipped.
    # NOTE(review): presumably the email pattern also hits image file
    # names that contain "@" -- confirm against _extract_email.
    image_suffixes = ('.png', '.gif')
    for address in self._extract_email(raw_text):
        if not address.endswith(image_suffixes):
            self.clean_insert_entity(cursor, document_id, address, 'Email')

    for number in self._extract_phone_no(raw_text):
        self.clean_insert_entity(cursor, document_id, number, 'Phone')

    # Named-entity pass: only these three tags are stored; any other tag
    # returned by html_ner is ignored.
    ner_tag_to_entity_type = {
        'PERSON': 'Name',
        'ORGANIZATION': 'Organisation',
        'LOCATION': 'Location',
    }
    for ner_tag, text in html_ner(raw_text):
        entity_type = ner_tag_to_entity_type.get(ner_tag)
        if entity_type is not None:
            self.clean_insert_entity(cursor, document_id, text, entity_type)

    self.cur_db.commit()
def test_html_ner():
    """Check that html_ner tags the name in a simple sentence as PERSON."""
    sentence = 'Hello, my name is Sinderella'
    entities = html_ner(sentence)
    # Exactly one (tag, text) pair is expected for this input.
    assert entities == [(u'PERSON', 'Sinderella')]