Beispiel #1
0
    def extract_insert_info(self, document_id, path_to_html):
        """Extract entities from an HTML file and store them for a document.

        The file is read as UTF-8 text. E-mail addresses, phone numbers and
        named entities found in it are each handed to
        ``self.clean_insert_entity`` together with ``document_id`` and an
        entity-type label, and the connection is committed once at the end.

        Args:
            document_id: Identifier forwarded unchanged to
                ``self.clean_insert_entity`` for every entity found.
            path_to_html: Path of the UTF-8 encoded HTML file to scan.
        """
        # codecs.open() appends 'b' to the mode whenever an encoding is given,
        # so the former 'rt' turned into 'rtb', which Python 3 rejects with
        # "can't have text and binary mode at once". Plain 'r' is equivalent.
        with codecs.open(path_to_html, 'r', encoding='utf8') as fp:
            raw_text = fp.read()

        # NER tag -> entity-type label passed on; tags not listed are ignored.
        entity_type_by_tag = {
            'PERSON': 'Name',
            'ORGANIZATION': 'Organisation',
            'LOCATION': 'Location',
        }

        cur_db_cursor = self.cur_db.cursor()
        try:
            for email in self._extract_email(raw_text):
                # Presumably image file names (e.g. "logo@2x.png") that the
                # e-mail pattern also matches -- skip them.
                if email.endswith(('.png', '.gif')):
                    continue
                self.clean_insert_entity(
                    cur_db_cursor, document_id, email, 'Email')

            for phone_no in self._extract_phone_no(raw_text):
                self.clean_insert_entity(
                    cur_db_cursor, document_id, phone_no, 'Phone')

            # extract using NLP
            for tag, data in html_ner(raw_text):
                entity_type = entity_type_by_tag.get(tag)
                if entity_type is None:
                    continue
                self.clean_insert_entity(
                    cur_db_cursor, document_id, data, entity_type)

            self.cur_db.commit()
        finally:
            # Release the cursor whether or not extraction/insertion succeeded.
            cur_db_cursor.close()
Beispiel #2
0
def test_html_ner():
    """Check that html_ner reports the name in a plain sentence as one PERSON entity."""
    sentence = 'Hello, my name is Sinderella'
    assert html_ner(sentence) == [(u'PERSON', 'Sinderella')]