Ejemplo n.º 1
0
 def test_duplicate_document_with_parse_and_register_documents_multiprocessing(
         self):
     """Registering label 'A' twice must raise DuplicateDocumentError.

     The manager is closed in a ``finally`` clause, mirroring the explicit
     ``close()`` calls of the other multiprocessing tests, so it is released
     whether the expectation holds or not.
     """
     m = None
     try:
         with self.assertRaises(DuplicateDocumentError):
             m = holmes.MultiprocessingManager('en_core_web_sm',
                                               number_of_workers=2)
             m.parse_and_register_documents({'A': "A"})
             # Second registration under the same label; presumably the
             # call that triggers the error.
             m.parse_and_register_documents({'A': "A"})
     finally:
         # m stays None if the constructor itself raised.
         if m is not None:
             m.close()
    def test_deserialized_document_registration_multithreaded(self):
        """Register one serialized document per thread and wait for the labels.

        A document is parsed and serialized with a plain ``holmes.Manager``;
        ``NUMBER_OF_THREADS`` threads then each register that serialized
        document with a shared ``MultiprocessingManager`` under the label
        ``'Irrelevant <n>'``. The test polls ``document_labels()`` until all
        labels are present and fails if they are not after 50 polls.
        """
        def add_document(counter):
            # Runs inside a thread: registers the shared serialized document
            # under a label unique to this thread.
            m.deserialize_and_register_documents(
                {' '.join(('Irrelevant', str(counter))): irrelevant_doc})

        normal_m = holmes.Manager('en_core_web_sm',
                                  perform_coreference_resolution=False)
        normal_m.parse_and_register_document("People discuss irrelevancies",
                                             'irrelevant')
        irrelevant_doc = normal_m.serialize_document('irrelevant')
        m = holmes.MultiprocessingManager('en_core_web_sm',
                                          number_of_workers=4,
                                          perform_coreference_resolution=False)
        threads = []
        try:
            for i in range(NUMBER_OF_THREADS):
                t = Thread(target=add_document, args=(i, ))
                threads.append(t)
                t.start()

            # Poll up to 50 times, 0.5 s apart, until every label is visible.
            for counter in range(50):
                document_labels = m.document_labels()
                for label in document_labels:
                    self.assertTrue(label.startswith("Irrelevant"))
                if len(document_labels) == NUMBER_OF_THREADS:
                    break
                # Fail on the last poll instead of sleeping once more.
                self.assertFalse(
                    counter == 49,
                    "not all documents were registered in time")
                sleep(0.5)
        finally:
            # Best-effort: give the registering threads a bounded chance to
            # finish before the manager they are using is closed.
            for t in threads:
                t.join(timeout=5)
            m.close()
    def test_multithreading_filtering_with_topic_match_dictionaries(self):
        """Topic-match four identical documents under various label filters.

        The same sentence is registered as 'T11', 'T12', 'T21' and 'T22';
        the number of returned dictionaries is then checked without a filter
        and with ``document_label_filter`` set to 'T', 'T1', 'T22' and 'X'.
        """
        manager = holmes.MultiprocessingManager(
            'en_core_web_sm',
            number_of_workers=2,
            ontology=ontology,
            verbose=False)

        sentence = "The dog chased the cat"
        manager.parse_and_register_documents(
            {label: sentence for label in ('T11', 'T12', 'T21', 'T22')})

        search = manager.topic_match_documents_returning_dictionaries_against

        # Unfiltered query first.
        self.assertEqual(len(search(sentence)), 4)

        # (document_label_filter, expected number of result dictionaries)
        filter_expectations = (("T", 4), ("T1", 2), ("T22", 1), ("X", 0))
        for label_filter, expected_count in filter_expectations:
            hits = search(sentence, document_label_filter=label_filter)
            self.assertEqual(len(hits), expected_count)

        manager.close()
Ejemplo n.º 4
0
 def test_serialization_not_supported_on_serialization_multiprocessing(
         self):
     """Expect SerializationNotSupportedError from the sequence below.

     A document is serialized by a plain ``holmes.Manager`` created with
     ``perform_coreference_resolution=False`` and then handed to a
     ``MultiprocessingManager`` created without that argument. The
     multiprocessing manager is closed in a ``finally`` clause so it is
     released regardless of the outcome.
     """
     m = None
     try:
         with self.assertRaises(SerializationNotSupportedError):
             m_normal = holmes.Manager('en_core_web_sm',
                                       perform_coreference_resolution=False)
             m_normal.remove_all_documents()
             m_normal.parse_and_register_document("A", '')
             serialized_doc = m_normal.serialize_document('')
             m = holmes.MultiprocessingManager('en_core_web_sm',
                                               number_of_workers=2)
             m.deserialize_and_register_documents({'A': serialized_doc})
     finally:
         # m stays None if an earlier statement raised.
         if m is not None:
             m.close()
Ejemplo n.º 5
0
 def test_embedding_threshold_higher_than_relation_threshold_multiprocessing_manager(
         self):
     """Expect an error when the embedding limit exceeds the relation limit.

     Topic matching is requested with
     ``maximum_number_of_single_word_matches_for_embedding_matching`` (2)
     greater than
     ``maximum_number_of_single_word_matches_for_relation_matching`` (1).
     The manager is closed in a ``finally`` clause so it is released
     regardless of the outcome.
     """
     m = None
     try:
         with self.assertRaises(
                 EmbeddingThresholdGreaterThanRelationThresholdError):
             m = holmes.MultiprocessingManager('en_core_web_sm',
                                               number_of_workers=1)
             m.parse_and_register_documents({'': "a"})
             m.topic_match_documents_returning_dictionaries_against(
                 "b",
                 maximum_number_of_single_word_matches_for_relation_matching=1,
                 maximum_number_of_single_word_matches_for_embedding_matching=2)
     finally:
         # m stays None if the constructor itself raised.
         if m is not None:
             m.close()
import urllib.request
from bs4 import BeautifulSoup
import holmes_extractor as holmes
import os
import json
import falcon

if __name__ in ('__main__', 'example_search_DE_literature'):

    working_directory=REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
    HOLMES_EXTENSION = 'hdc'
    flag_filename = os.sep.join((working_directory,'STORY_PARSING_COMPLETE'))

    print('Initializing Holmes...')
    # Start the Holmes manager with the German model
    holmes_manager = holmes.MultiprocessingManager(model='de_core_news_md',
            overall_similarity_threshold=0.85, number_of_workers=4)
            # set number_of_workers to prevent memory exhaustion / swapping; it should never be more
            # than the number of cores on the machine

    def process_documents_from_front_page(manager, front_page_uri, front_page_label,
            labels_to_documents):
        """ Download and save all the stories from a front page."""

        front_page = urllib.request.urlopen(front_page_uri)
        front_page_soup = BeautifulSoup(front_page, 'html.parser')
        # For each story ...
        for anchor in front_page_soup.find_all('a'):
            if not anchor['href'].startswith('/') and not anchor['href'].startswith('https'):
                this_document_url = '/'.join((front_page_uri, anchor['href']))
                print('Downloading story', anchor.contents[0], 'from front page', front_page_label)
                # Get the HTML document for the story
Ejemplo n.º 7
0
 def test_coreference_resolution_not_supported_multiprocessing_manager_error(
         self):
     """Constructing the manager with these arguments must raise ValueError."""
     constructor_kwargs = {
         'model': 'de_core_news_md',
         'perform_coreference_resolution': True,
     }
     # Callable form of assertRaises: the constructor is invoked with the
     # keyword arguments above and is expected to raise.
     self.assertRaises(ValueError, holmes.MultiprocessingManager,
                       **constructor_kwargs)
    def _internal_test_multithreading_topic_matching(self, number_of_workers):
        """Run the same two topic-match queries from several threads at once.

        Four documents are registered with a ``MultiprocessingManager`` that
        uses ``number_of_workers`` workers. ``NUMBER_OF_THREADS`` threads then
        each issue two queries and put both results on a queue; every pair of
        results is compared with the expected dictionaries. Threads are
        joined (best-effort) and the manager is closed in a ``finally``
        clause so nothing is left running when an assertion fails.

        Args:
            number_of_workers: passed through to ``MultiprocessingManager``.
        """
        def topic_match_within_thread():
            # Thread body: run both queries and hand the results back.
            normal_dict = m.topic_match_documents_returning_dictionaries_against(
                "A dog chases an animal")
            reversed_dict = m.topic_match_documents_returning_dictionaries_against(
                "The animal chased the dog")
            queue.put((normal_dict, reversed_dict))

        def expected_hit(label, text, text_to_match, rank, end_index, score,
                         word_infos):
            # Build one expected result dictionary; the start index is 0 in
            # every expected result of this test.
            return {
                'document_label': label,
                'text': text,
                'text_to_match': text_to_match,
                'rank': rank,
                'sentences_character_start_index_in_document': 0,
                'sentences_character_end_index_in_document': end_index,
                'score': score,
                'word_infos': word_infos,
            }

        normal_query = 'A dog chases an animal'
        reversed_query = 'The animal chased the dog'

        expected_normal = [
            expected_hit(
                'exact', 'The dog chased the animal', normal_query, '1', 25,
                99.34666666666668,
                [[4, 7, 'overlapping_relation', False,
                  "Matches DOG directly."],
                 [8, 14, 'overlapping_relation', False,
                  "Matches CHASE directly."],
                 [19, 25, 'overlapping_relation', True,
                  "Matches ANIMAL directly."]]),
            expected_hit(
                'specific', 'I saw a dog. It was chasing a cat', normal_query,
                '2', 33, 81.94686666666668,
                [[8, 11, 'overlapping_relation', False,
                  "Matches DOG directly."],
                 [20, 27, 'overlapping_relation', False,
                  "Is a synonym of CHASE in the ontology."],
                 [30, 33, 'overlapping_relation', True,
                  "Is a child of ANIMAL in the ontology."]]),
            expected_hit(
                'exact-reversed', 'The animal chased the dog', normal_query,
                '3=', 25, 35.39866666666667,
                [[4, 10, 'single', False, "Matches ANIMAL directly."],
                 [11, 17, 'relation', False, "Matches CHASE directly."],
                 [22, 25, 'relation', True,
                  "Is a child of ANIMAL in the ontology."]]),
            expected_hit(
                'specific-reversed', 'The cat chased the dog', normal_query,
                '3=', 22, 34.486666666666665,
                [[4, 7, 'single', False,
                  "Is a child of ANIMAL in the ontology."],
                 [8, 14, 'relation', False, "Matches CHASE directly."],
                 [19, 22, 'relation', True,
                  "Is a child of ANIMAL in the ontology."]]),
        ]

        expected_reversed = [
            expected_hit(
                'exact-reversed', 'The animal chased the dog', reversed_query,
                '1=', 25, 96.93333333333334,
                [[4, 10, 'overlapping_relation', False,
                  "Matches ANIMAL directly."],
                 [11, 17, 'overlapping_relation', True,
                  "Matches CHASE directly."],
                 [22, 25, 'overlapping_relation', False,
                  "Matches DOG directly."]]),
            expected_hit(
                'specific-reversed', 'The cat chased the dog', reversed_query,
                '1=', 22, 87.446,
                [[4, 7, 'overlapping_relation', False,
                  "Is a child of ANIMAL in the ontology."],
                 [8, 14, 'overlapping_relation', True,
                  "Matches CHASE directly."],
                 [19, 22, 'overlapping_relation', False,
                  "Matches DOG directly."]]),
            expected_hit(
                'exact', 'The dog chased the animal', reversed_query, '3=', 25,
                30.598666666666666,
                [[4, 7, 'relation', False,
                  "Is a child of ANIMAL in the ontology."],
                 [8, 14, 'relation', False, "Matches CHASE directly."],
                 [19, 25, 'single', True, "Matches ANIMAL directly."]]),
            expected_hit(
                'specific', 'I saw a dog. It was chasing a cat',
                reversed_query, '3=', 33, 27.704,
                [[8, 11, 'relation', False,
                  "Is a child of ANIMAL in the ontology."],
                 [20, 27, 'relation', True,
                  "Is a synonym of CHASE in the ontology."],
                 [30, 33, 'single', False,
                  "Is a child of ANIMAL in the ontology."]]),
        ]

        queue = Queue()
        threads = []
        m = holmes.MultiprocessingManager('en_core_web_sm',
                                          ontology=ontology,
                                          number_of_workers=number_of_workers,
                                          verbose=False)
        try:
            m.parse_and_register_documents({
                'specific': "I saw a dog. It was chasing a cat",
                'exact': "The dog chased the animal",
                'specific-reversed': "The cat chased the dog",
                'exact-reversed': "The animal chased the dog",
            })
            for _ in range(NUMBER_OF_THREADS):
                t = Thread(target=topic_match_within_thread)
                threads.append(t)
                t.start()
            # One result pair is expected per thread; wait at most 60 s each.
            for _ in range(NUMBER_OF_THREADS):
                normal_dict, reversed_dict = queue.get(True, 60)
                self.assertEqual(normal_dict, expected_normal)
                self.assertEqual(reversed_dict, expected_reversed)
        finally:
            # Best-effort: give the query threads a bounded chance to finish
            # before the manager they are using is closed.
            for t in threads:
                t.join(timeout=5)
            m.close()
 def test_number_of_results(self):
     """Topic-match with ``number_of_results=3`` against four documents.

     Checks the registered labels and then that the query returns exactly
     the three expected result dictionaries, ranked '1', '2' and '3'.
     """
     query = 'A dog chases an animal'

     def expected_hit(label, text, rank, end_index, score, word_infos):
         # One expected result dictionary; the start index is always 0 here.
         return {
             'document_label': label,
             'text': text,
             'text_to_match': query,
             'rank': rank,
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': end_index,
             'score': score,
             'word_infos': word_infos,
         }

     documents = {
         'specific': "I saw a dog. It was chasing a cat",
         'exact': "The dog chased the animal",
         'specific-reversed': "The cat chased the dog",
         'exact-reversed': "The animal chased the dog",
     }
     expected_hits = [
         expected_hit(
             'exact', 'The dog chased the animal', '1', 25,
             99.34666666666668,
             [[4, 7, 'overlapping_relation', False,
               "Matches DOG directly."],
              [8, 14, 'overlapping_relation', False,
               "Matches CHASE directly."],
              [19, 25, 'overlapping_relation', True,
               "Matches ANIMAL directly."]]),
         expected_hit(
             'specific', 'I saw a dog. It was chasing a cat', '2', 33,
             81.94686666666668,
             [[8, 11, 'overlapping_relation', False,
               "Matches DOG directly."],
              [20, 27, 'overlapping_relation', False,
               "Is a synonym of CHASE in the ontology."],
              [30, 33, 'overlapping_relation', True,
               "Is a child of ANIMAL in the ontology."]]),
         expected_hit(
             'exact-reversed', 'The animal chased the dog', '3', 25,
             35.39866666666667,
             [[4, 10, 'single', False, "Matches ANIMAL directly."],
              [11, 17, 'relation', False, "Matches CHASE directly."],
              [22, 25, 'relation', True,
               "Is a child of ANIMAL in the ontology."]]),
     ]

     manager = holmes.MultiprocessingManager(
         'en_core_web_sm',
         ontology=ontology,
         number_of_workers=2,
         verbose=False)
     manager.parse_and_register_documents(documents)

     labels = manager.document_labels()
     self.assertEqual(
         labels,
         ['exact', 'exact-reversed', 'specific', 'specific-reversed'])

     hits = manager.topic_match_documents_returning_dictionaries_against(
         "A dog chases an animal", number_of_results=3)
     self.assertEqual(hits, expected_hits)
     manager.close()
 def test_deserialized_documents(self):
     """Topic-match against documents that were registered from serialized form.

     Four documents are parsed and serialized with a plain ``holmes.Manager``
     and then registered with a ``MultiprocessingManager`` through
     ``deserialize_and_register_documents``; the labels and the full
     topic-match result are compared with the expected values.
     """
     query = 'A dog chases an animal'

     def expected_hit(label, text, rank, end_index, score, word_infos):
         # One expected result dictionary; the start index is always 0 here.
         return {
             'document_label': label,
             'text': text,
             'text_to_match': query,
             'rank': rank,
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': end_index,
             'score': score,
             'word_infos': word_infos,
         }

     # label -> text, in the order in which they are parsed and serialized
     texts_by_label = {
         'specific': "I saw a dog. It was chasing a cat",
         'exact': "The dog chased the animal",
         'specific-reversed': "The cat chased the dog",
         'exact-reversed': "The animal chased the dog",
     }

     normal_manager = holmes.Manager('en_core_web_sm',
                                     perform_coreference_resolution=False)
     for label, text in texts_by_label.items():
         normal_manager.parse_and_register_document(text, label)
     serialized_by_label = {
         label: normal_manager.serialize_document(label)
         for label in texts_by_label
     }

     manager = holmes.MultiprocessingManager(
         'en_core_web_sm',
         ontology=ontology,
         number_of_workers=2,
         verbose=False,
         perform_coreference_resolution=False)
     manager.deserialize_and_register_documents(serialized_by_label)

     labels = manager.document_labels()
     self.assertEqual(
         labels,
         ['exact', 'exact-reversed', 'specific', 'specific-reversed'])

     expected_hits = [
         expected_hit(
             'exact', 'The dog chased the animal', '1', 25,
             99.34666666666668,
             [[4, 7, 'overlapping_relation', False,
               "Matches DOG directly."],
              [8, 14, 'overlapping_relation', False,
               "Matches CHASE directly."],
              [19, 25, 'overlapping_relation', True,
               "Matches ANIMAL directly."]]),
         expected_hit(
             'exact-reversed', 'The animal chased the dog', '2=', 25,
             35.39866666666667,
             [[4, 10, 'single', False, "Matches ANIMAL directly."],
              [11, 17, 'relation', False, "Matches CHASE directly."],
              [22, 25, 'relation', True,
               "Is a child of ANIMAL in the ontology."]]),
         expected_hit(
             'specific-reversed', 'The cat chased the dog', '2=', 22,
             34.486666666666665,
             [[4, 7, 'single', False,
               "Is a child of ANIMAL in the ontology."],
              [8, 14, 'relation', False, "Matches CHASE directly."],
              [19, 22, 'relation', True,
               "Is a child of ANIMAL in the ontology."]]),
         expected_hit(
             'specific', 'I saw a dog. It was chasing a cat', '2=', 33,
             31.88346666666667,
             [[8, 11, 'single', False, "Matches DOG directly."],
              [20, 27, 'relation', False,
               "Is a synonym of CHASE in the ontology."],
              [30, 33, 'relation', True,
               "Is a child of ANIMAL in the ontology."]]),
     ]
     hits = manager.topic_match_documents_returning_dictionaries_against(
         "A dog chases an animal")
     self.assertEqual(hits, expected_hits)
     manager.close()
import re
import os
import json
import falcon

if __name__ in ('__main__', 'example_search_EN_literature'):

    script_directory = os.path.dirname(os.path.realpath(__file__))
    ontology = holmes.Ontology(
        os.sep.join(
            (script_directory, 'example_search_EN_literature_ontology.owl')))
    print('Initializing Holmes...')
    #Start the Holmes manager with the English model
    holmes_manager = holmes.MultiprocessingManager(
        model='en_core_web_lg',
        overall_similarity_threshold=0.9,
        ontology=ontology,
        number_of_workers=4)

    # set number_of_workers to prevent memory exhaustion / swapping; it should never be more
    # than the number of cores


    def extract_chapters_from_book(book_uri, title):
        """ Download and save the chapters from a book."""

        print()
        print(title)
        print()
        book = urllib.request.urlopen(book_uri).read().decode()
        book = re.sub("\\nPage \|.+?Rowling \\n", "", book)
 def test_workers_specified(self):
     """Topic-match four documents with ``number_of_workers=2``.

     Checks the registered labels and then the complete list of result
     dictionaries returned for the query "A dog chases an animal".
     """
     query = 'A dog chases an animal'

     def expected_hit(label, text, rank, end_index, score, word_infos):
         # One expected result dictionary; the start index is always 0 here.
         return {
             'document_label': label,
             'text': text,
             'text_to_match': query,
             'rank': rank,
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': end_index,
             'score': score,
             'word_infos': word_infos,
         }

     documents = {
         'specific': "I saw a dog. It was chasing a cat",
         'exact': "The dog chased the animal",
         'specific-reversed': "The cat chased the dog",
         'exact-reversed': "The animal chased the dog",
     }
     expected_hits = [
         expected_hit(
             'exact', 'The dog chased the animal', '1=', 25,
             99.34666666666668,
             [[4, 7, 'overlapping_relation', False],
              [8, 14, 'overlapping_relation', False],
              [19, 25, 'overlapping_relation', True]]),
         expected_hit(
             'specific', 'I saw a dog. It was chasing a cat', '1=', 33,
             99.14666666666669,
             [[8, 11, 'overlapping_relation', False],
              [20, 27, 'overlapping_relation', False],
              [30, 33, 'overlapping_relation', True]]),
         expected_hit(
             'exact-reversed', 'The animal chased the dog', '3=', 25,
             40.946666666666665,
             [[4, 10, 'single', False],
              [11, 17, 'relation', False],
              [22, 25, 'relation', True]]),
         expected_hit(
             'specific-reversed', 'The cat chased the dog', '3=', 22,
             40.946666666666665,
             [[4, 7, 'single', False],
              [8, 14, 'relation', False],
              [19, 22, 'relation', True]]),
     ]

     manager = holmes.MultiprocessingManager(
         'en_core_web_sm',
         ontology=ontology,
         number_of_workers=2,
         verbose=False)
     manager.parse_and_register_documents(documents)

     labels = manager.document_labels()
     self.assertEqual(
         labels,
         ['exact', 'exact-reversed', 'specific', 'specific-reversed'])

     hits = manager.topic_match_documents_returning_dictionaries_against(
         "A dog chases an animal")
     self.assertEqual(hits, expected_hits)
     manager.close()