Beispiel #1
0
 def __init__(self, db_name, lock):
     """Set up the extractor's initial state.

     :param db_name: stored unchanged on ``self.db_name``.
     :param lock: forwarded to the base-class initializer together with
         the literal name ``"document"``.
     """
     # The base class is initialised first so that anything it sets up is
     # in place before the extractor-specific attributes are added.
     super(Extractor, self).__init__("document", lock)

     # No database handle exists yet; the attribute starts out as None.
     self.cur_db = None
     self.db_name = db_name
     self.entity_cleaner = EntityCleaner()
Beispiel #2
0
 def __init__(self):
     """Create the cleaner under test and the shared expectations."""
     self.ec = EntityCleaner()

     # Every ASCII punctuation character except '@' and '.', in the same
     # order as they appear in ``string.punctuation``.
     kept = ('@', '.')
     self.exclude = [char for char in string.punctuation if char not in kept]

     # The value each cleaned string is compared against.
     self.expected = 'test'
Beispiel #3
0
class Extractor(ProcessBase):
    """Queue worker that pulls contact details and named entities out of HTML files.

    Every queue item is a ``(document_id, path_to_html)`` pair.  E-mail
    addresses and phone numbers are found with regular expressions; tagged
    entities come from ``html_ner``.  Each hit is passed through
    ``EntityCleaner.clean`` and, unless the cleaner returns ``None``, handed
    to ``insert_entity`` with a cursor on ``<db_name>/documents.db``.

    NOTE(review): ``queue``, ``lock``, ``added``, ``timer``, ``start_time``,
    ``running`` and ``completed`` are not defined in this class; presumably
    ``ProcessBase`` provides them -- confirm there.
    """

    # NER tags that are persisted, mapped to the entity-type label stored
    # with them.  Any other tag returned by ``html_ner`` is ignored.
    _NER_TAG_TYPES = {
        'PERSON': 'Name',
        'ORGANIZATION': 'Organisation',
        'LOCATION': 'Location',
    }

    # The e-mail pattern also matches asset names such as "logo@2x.png";
    # matches ending in one of these suffixes are discarded.
    _IMAGE_SUFFIXES = ('.png', '.gif')

    def __init__(self, db_name, lock):
        """Initialise the worker.

        :param db_name: directory prefix; ``run`` opens
            ``db_name + '/documents.db'``.
        :param lock: forwarded to the base-class initializer together with
            the literal name ``"document"``; ``run`` acquires and releases
            ``self.lock`` around each item.
        """
        super(Extractor, self).__init__("document", lock)
        self.entity_cleaner = EntityCleaner()
        self.db_name = db_name
        # Opened in run() and closed again before run() returns.
        self.cur_db = None

    def run(self):
        """Process queue items until the queue is empty after finishing an item.

        The lock is released, the dequeued item is marked done and the
        database connection is closed even when extraction raises; the
        exception itself still propagates to the caller.
        """
        self.cur_db = sqlite3.connect(self.db_name + '/documents.db')
        try:
            while True:
                self.running = self.lock.acquire()
                item_taken = False
                try:
                    item = self.queue.get()
                    item_taken = True
                    document_id, path_to_html = item
                    if not self.added:
                        self.timer()
                        self.added = True
                    self.extract_insert_info(document_id, path_to_html)
                finally:
                    self.running = False
                    self.lock.release()
                    # Only acknowledge an item that was actually dequeued,
                    # otherwise task_done() would raise ValueError.
                    if item_taken:
                        self.queue.task_done()
                if self.queue.empty():
                    # Original note: "Kill the timer".
                    # NOTE(review): presumably whatever self.timer() started
                    # watches this flag -- confirm in ProcessBase.
                    self.completed = False
                    end_time = time.time()
                    hours, rem = divmod(end_time - self.start_time, 3600)
                    minutes, seconds = divmod(rem, 60)
                    print("[*] Extraction elapsed time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
                    break
        finally:
            # Single close point for both the normal exit and the error path.
            self.cur_db.close()

    def clean_insert_entity(self, cursor, doc_id, entity, entity_type):
        """Clean ``entity`` and pass it to ``insert_entity``.

        Nothing is inserted when the cleaner returns ``None``.

        :param cursor: database cursor forwarded to ``insert_entity``.
        :param doc_id: identifier of the document the entity came from.
        :param entity: raw entity value to clean.
        :param entity_type: label forwarded with the entity, e.g. ``'Email'``.
        """
        cleaned_entity = self.entity_cleaner.clean(entity)
        if cleaned_entity is None:
            return
        insert_entity(cursor, doc_id, cleaned_entity, entity_type)

    def extract_insert_info(self, document_id, path_to_html):
        """Extract all entities from one HTML file and commit them.

        :param document_id: identifier passed along with every entity.
        :param path_to_html: path of a UTF-8 encoded file to read.
        """
        cursor = self.cur_db.cursor()
        # codecs.open() always opens the underlying file in binary mode and
        # appends 'b' itself, so a 't' in the mode makes Python 3 raise
        # ValueError ("can't have text and binary mode at once").
        with codecs.open(path_to_html, 'r', encoding='utf8') as fp:
            raw_text = fp.read()

        for email in self._extract_email(raw_text):
            # The e-mail regex is case-insensitive, so the suffix check has
            # to be as well ("LOGO@2X.PNG" must be skipped too).
            if email.lower().endswith(self._IMAGE_SUFFIXES):
                continue
            self.clean_insert_entity(cursor, document_id, email, 'Email')

        for phone_no in self._extract_phone_no(raw_text):
            self.clean_insert_entity(cursor, document_id, phone_no, 'Phone')

        # extract using NLP
        for tag, data in html_ner(raw_text):
            entity_type = self._NER_TAG_TYPES.get(tag)
            if entity_type is not None:
                self.clean_insert_entity(cursor, document_id, data, entity_type)

        # One commit per document.
        self.cur_db.commit()

    @staticmethod
    def _extract_email(text):
        """Return the set of e-mail-like substrings found in ``text``.

        Matching is case-insensitive; duplicates collapse because the result
        is a set.
        """
        results = re.findall(r'[a-z0-9\.]+@[a-z0-9\.]+\.[a-z]{2,}', text, flags=re.IGNORECASE)
        return set(results)

    @staticmethod
    def _extract_phone_no(text):
        """Return phone numbers found in ``text`` in international format.

        Candidates are located with a regular expression, filtered by a
        parenthesis sanity check, parsed with ``phonenumbers`` (no default
        region, so candidates the library cannot parse are dropped) and kept
        only when ``phonenumbers.is_possible_number`` accepts them.

        :returns: list of formatted numbers in order of appearance.
        """
        def _parens_look_sane(candidate):
            # At most one "(" and one ")", and either both or neither.
            open_count = candidate.count('(')
            close_count = candidate.count(')')
            if open_count > 1 or close_count > 1:
                return False
            return open_count == close_count

        phone_numbers = []
        for candidate in re.findall(r'\+?[0-9\- \(\)]{8,16}[0-9]', text):
            if not _parens_look_sane(candidate):
                continue
            try:
                parsed = phonenumbers.parse(candidate, None)
            except phonenumbers.phonenumberutil.NumberParseException:
                continue
            if phonenumbers.is_possible_number(parsed):
                phone_numbers.append(
                    phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL))
        return phone_numbers
Beispiel #4
0
class TestEntityCleaner(object):
    """Checks that ``EntityCleaner.clean`` strips trailing punctuation.

    Every test appends one or more punctuation characters to ``'test'`` and
    expects the cleaner to give back exactly ``'test'``.

    NOTE(review): pytest does not collect test classes that define
    ``__init__``; the constructor is kept so existing callers and runners
    that instantiate the class keep working -- confirm which runner is used.
    """

    def __init__(self):
        self.ec = EntityCleaner()
        # The value every cleaned string must equal.
        self.expected = 'test'
        # All ASCII punctuation except '@' and '.'; none of these characters
        # may survive cleaning.
        self.exclude = list(string.punctuation.replace('@', '').replace('.', ''))

    def _assert_suffix_stripped(self, suffix):
        """Clean ``expected + suffix`` and assert the suffix is gone."""
        cleaned = self.ec.clean(self.expected + suffix)
        assert not any(letter in self.exclude for letter in cleaned)
        assert cleaned == self.expected

    def test_entity_cleaner_exclamation(self):
        self._assert_suffix_stripped('!')

    def test_entity_cleaner_double_quote(self):
        self._assert_suffix_stripped('"')

    def test_entity_cleaner_sharp(self):
        self._assert_suffix_stripped('#')

    def test_entity_cleaner_dollar(self):
        self._assert_suffix_stripped('$')

    def test_entity_cleaner_percent(self):
        self._assert_suffix_stripped('%')

    def test_entity_cleaner_single_quote(self):
        # Previously appended '%', duplicating the percent test and leaving
        # the single quote untested.
        self._assert_suffix_stripped("'")

    def test_entity_cleaner_parentheses(self):
        self._assert_suffix_stripped('(){}[]')

    def test_entity_cleaner_operators(self):
        self._assert_suffix_stripped('*+-/')

    def test_entity_cleaner_colon(self):
        self._assert_suffix_stripped(':')

    def test_entity_cleaner_semi_colon(self):
        self._assert_suffix_stripped(';')

    def test_entity_cleaner_question(self):
        self._assert_suffix_stripped('?')

    def test_entity_cleaner_back_slash(self):
        self._assert_suffix_stripped('\\')

    def test_entity_cleaner_special(self):
        self._assert_suffix_stripped('^`_|~')