Exemple #1
0
 def __init__(self, ontology):
     """Build one Holmes manager per model configuration.

     :param ontology: ontology handed to the two ``*_ontology`` managers.
     """
     make_manager = holmes.Manager

     # English web model, without and with the ontology.
     self.en_core_web_lg = make_manager(model='en_core_web_lg')
     self.en_core_web_lg_ontology = make_manager(model='en_core_web_lg',
                                                 ontology=ontology)
     # Small German model.
     self.de_core_news_sm = make_manager(model='de_core_news_sm')
     # English coreference model, without and with the ontology.
     self.en_coref_lg = make_manager(model='en_coref_lg')
     self.en_coref_lg_ontology = make_manager(model='en_coref_lg',
                                              ontology=ontology)
    def test_deserialized_document_registration_multithreaded(self):
        """Register the same serialized document from several threads at once.

        A document is serialized by a plain manager, then NUMBER_OF_THREADS
        threads each register it with a multiprocessing manager under a
        distinct label. The test polls the registered labels until all of
        them are present.
        """
        def add_document(counter):
            # Thread body: register the shared serialized document under the
            # label "Irrelevant <counter>".
            m.deserialize_and_register_documents(
                {' '.join(('Irrelevant', str(counter))): irrelevant_doc})

        # Produce the serialized document with a single-process manager.
        normal_m = holmes.Manager('en_core_web_sm',
                                  perform_coreference_resolution=False)
        normal_m.parse_and_register_document("People discuss irrelevancies",
                                             'irrelevant')
        irrelevant_doc = normal_m.serialize_document('irrelevant')
        m = holmes.MultiprocessingManager('en_core_web_sm',
                                          number_of_workers=4,
                                          perform_coreference_resolution=False)

        # Start all registering threads; they are not joined here, the
        # polling loop below waits for their effect instead.
        for i in range(NUMBER_OF_THREADS):
            t = Thread(target=add_document, args=(i, ))
            t.start()

        # NOTE(review): last_number_of_matches is not read in the visible code.
        last_number_of_matches = 0
        # Poll up to 50 times, sleeping 0.5 seconds between attempts.
        for counter in range(50):
            document_labels = m.document_labels()
            for label in document_labels:
                self.assertTrue(label.startswith("Irrelevant"))
            if len(document_labels) == NUMBER_OF_THREADS:
                break
            # Fail on the final attempt if not every label has appeared.
            self.assertFalse(counter == 49)
            sleep(0.5)
Exemple #3
0
 def test_initial_question_word_embedding_match_threshold_out_of_range(
         self):
     """Expect ``ValueError`` for a threshold of -1.2 in topic matching."""
     # Setup stays outside the assertRaises block so that a ValueError raised
     # while building or populating the manager cannot satisfy the assertion.
     m = holmes.Manager('en_core_web_sm', number_of_workers=1)
     m.parse_and_register_document("a")
     # NOTE(review): the query is sent to the module-level
     # coref_holmes_manager, not to ``m`` - confirm this is intended.
     with self.assertRaises(ValueError):
         coref_holmes_manager.topic_match_documents_against(
             "b", initial_question_word_embedding_match_threshold=-1.2)
Exemple #4
0
 def test_relation_threshold_too_low(self):
     """Expect ``ValueError`` for negative matching frequency thresholds."""
     # Setup stays outside the assertRaises block so that a ValueError raised
     # while building or populating the manager cannot satisfy the assertion.
     m = holmes.Manager('en_core_web_sm', number_of_workers=1)
     m.parse_and_register_document("a")
     # NOTE(review): the query is sent to the module-level
     # coref_holmes_manager, not to ``m`` - confirm this is intended.
     with self.assertRaises(ValueError):
         coref_holmes_manager.topic_match_documents_against(
             "b",
             relation_matching_frequency_threshold=-0.75,
             embedding_matching_frequency_threshold=-0.5)
Exemple #5
0
 def test_embedding_threshold_less_than_relation_threshold(self):
     """Expect the dedicated error when the embedding threshold (0.5) is
     below the relation threshold (0.75)."""
     # Setup stays outside the assertRaises block so that an exception raised
     # while building or populating the manager cannot satisfy the assertion.
     m = holmes.Manager('en_core_web_sm', number_of_workers=1)
     m.parse_and_register_document("a")
     # NOTE(review): the query is sent to the module-level
     # coref_holmes_manager, not to ``m`` - confirm this is intended.
     with self.assertRaises(
             EmbeddingThresholdLessThanRelationThresholdError):
         coref_holmes_manager.topic_match_documents_against(
             "b",
             relation_matching_frequency_threshold=0.75,
             embedding_matching_frequency_threshold=0.5)
 def test_serialization_not_supported_on_serialization_multiprocessing(
         self):
     """Expect ``SerializationNotSupportedError`` somewhere in the round trip.

     A document is serialized by a plain manager created without coreference
     resolution and then handed to a multiprocessing manager created with
     default settings.
     """
     # NOTE(review): which statement raises cannot be told from here, so the
     # whole sequence stays inside the assertRaises context.
     with self.assertRaises(SerializationNotSupportedError):
         source_manager = holmes.Manager(
             'en_core_web_sm', perform_coreference_resolution=False)
         source_manager.remove_all_documents()
         source_manager.parse_and_register_document("A", '')
         serialized_doc = source_manager.serialize_document('')
         target_manager = holmes.MultiprocessingManager(
             'en_core_web_sm', number_of_workers=2)
         target_manager.deserialize_and_register_documents(
             {'A': serialized_doc})
 def test_embedding_threshold_higher_than_relation_threshold_normal_manager(
         self):
     """Expect the dedicated error when the embedding-matching maximum (2)
     exceeds the relation-matching maximum (1)."""
     # Setup stays outside the assertRaises block so that an exception raised
     # while building or populating the manager cannot satisfy the assertion.
     m = holmes.Manager('en_core_web_sm')
     m.parse_and_register_document("a")
     # NOTE(review): the query is sent to the module-level
     # coref_holmes_manager, not to ``m`` - confirm this is intended.
     with self.assertRaises(
             EmbeddingThresholdGreaterThanRelationThresholdError):
         coref_holmes_manager.topic_match_documents_returning_dictionaries_against(
             "b",
             maximum_number_of_single_word_matches_for_relation_matching=1,
             maximum_number_of_single_word_matches_for_embedding_matching=2)
Exemple #8
0
 def __init__(
     self,
     model,
     overall_similarity_threshold=1.0,
     embedding_based_matching_on_root_words=False,
     analyze_derivational_morphology=True,
     perform_coreference_resolution=None,
     debug=False,
 ):
     """Wrap a ``holmes.Manager`` built without an ontology.

     Every argument is forwarded unchanged to ``holmes.Manager`` under the
     keyword of the same name; the result is stored on ``self.manager``.
     """
     manager_options = {
         'model': model,
         'ontology': None,
         'overall_similarity_threshold': overall_similarity_threshold,
         'embedding_based_matching_on_root_words':
         embedding_based_matching_on_root_words,
         'analyze_derivational_morphology': analyze_derivational_morphology,
         'perform_coreference_resolution': perform_coreference_resolution,
         'debug': debug,
     }
     self.manager = holmes.Manager(**manager_options)
 def test_model_does_not_support_embeddings(self):
     """Expect ``ValueError`` for 'en_core_web_sm' with a 0.85 threshold."""
     self.assertRaises(ValueError,
                       holmes.Manager,
                       model='en_core_web_sm',
                       overall_similarity_threshold=0.85)
import unittest
import holmes_extractor as holmes
from holmes_extractor.extensive_matching import TopicMatcher
import os

# Module-level fixtures: everything below runs once, at import time.

# Directory containing this file; the ontology file is looked up next to it.
script_directory = os.path.dirname(os.path.realpath(__file__))
# Ontology loaded from 'test_ontology.owl' with symmetric matching enabled.
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')),
                           symmetric_matching=True)
# Manager with similarity threshold 0.65 and coreference resolution on.
holmes_manager_coref = holmes.Manager(model='en_core_web_lg',
                                      ontology=ontology,
                                      overall_similarity_threshold=0.65,
                                      perform_coreference_resolution=True)
# Same threshold, with embedding-based matching on root words enabled;
# perform_coreference_resolution is left at its default here.
holmes_manager_coref_embedding_on_root = holmes.Manager(
    model='en_core_web_lg',
    ontology=ontology,
    overall_similarity_threshold=0.65,
    embedding_based_matching_on_root_words=True)
# Similarity threshold 1 with coreference resolution on.
holmes_manager_coref_no_embeddings = holmes.Manager(
    model='en_core_web_lg',
    ontology=ontology,
    overall_similarity_threshold=1,
    perform_coreference_resolution=True)


class EnglishTopicMatchingTest(unittest.TestCase):
    def _check_equals(self, text_to_match, document_text, highest_score,
                      manager):
        manager.remove_all_documents()
        manager.parse_and_register_document(document_text)
        topic_matches = manager.topic_match_documents_against(
 def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(
         self):
     """Expect ``ValueError`` when root-word embedding matching is requested
     together with an overall similarity threshold of 1.0."""
     self.assertRaises(ValueError,
                       holmes.Manager,
                       model='en_core_web_lg',
                       overall_similarity_threshold=1.0,
                       embedding_based_matching_on_root_words=True)
import unittest
import holmes_extractor as holmes
from holmes_extractor.errors import *
import jsonpickle

# Module-level managers, constructed once at import time.

# English manager with derivational morphology analysis and coreference
# resolution both switched off.
nocoref_holmes_manager = holmes.Manager('en_core_web_lg',
                                        analyze_derivational_morphology=False,
                                        perform_coreference_resolution=False)
# English manager with coreference resolution switched on.
coref_holmes_manager = holmes.Manager('en_core_web_lg',
                                      perform_coreference_resolution=True)
# German manager with default settings.
german_holmes_manager = holmes.Manager('de_core_news_md')


class ErrorsTest(unittest.TestCase):
    """Each test expects ``holmes.Manager`` to reject its arguments."""

    def test_overall_similarity_threshold_out_of_range(self):
        # Threshold of 1.2 must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_lg',
                          overall_similarity_threshold=1.2)

    def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(
            self):
        # Root-word embedding matching combined with a threshold of 1.0
        # must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_lg',
                          overall_similarity_threshold=1.0,
                          embedding_based_matching_on_root_words=True)

    def test_model_does_not_support_embeddings(self):
        # 'en_core_web_sm' combined with a threshold of 0.85 must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_sm',
                          overall_similarity_threshold=0.85)
 def test_overall_similarity_threshold_out_of_range(self):
     """Expect ``ValueError`` for an overall similarity threshold of 1.2."""
     self.assertRaises(ValueError,
                       holmes.Manager,
                       model='en_core_web_lg',
                       overall_similarity_threshold=1.2)
 def test_coreference_resolution_not_supported_error(self):
     """Expect ``ValueError`` when coreference resolution is requested for
     the 'de_core_news_md' model."""
     self.assertRaises(ValueError,
                       holmes.Manager,
                       model='de_core_news_md',
                       perform_coreference_resolution=True)
Exemple #15
0
import unittest
import holmes_extractor as holmes
import os
from holmes_extractor.tests.testing_utils import HolmesInstanceManager

# Module-level fixtures: everything below runs once, at import time.

# Directory containing this file; the ontology file is looked up next to it.
script_directory = os.path.dirname(os.path.realpath(__file__))
# Ontology loaded from 'test_ontology.owl' with default settings.
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')))
# Manager taken from the HolmesInstanceManager helper's
# en_core_web_lg_ontology attribute.
ontology_holmes_manager = HolmesInstanceManager(
    ontology).en_core_web_lg_ontology
# The same ontology file loaded again with symmetric matching enabled.
symmetric_ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')),
                                     symmetric_matching=True)
symmetric_ontology_holmes_manager = holmes.Manager(model='en_core_web_lg',
                                                   ontology=symmetric_ontology)
# Manager on the 'en_coref_lg' model without any ontology.
no_ontology_coref_holmes_manager = holmes.Manager(model='en_coref_lg')


class EnglishPhraseletProductionTest(unittest.TestCase):
    def _check_equals(self,
                      manager,
                      text_to_match,
                      phraselet_labels,
                      replace_with_hypernym_ancestors=True,
                      match_all_words=False):
        manager.remove_all_search_phrases()
        doc = manager.semantic_analyzer.parse(text_to_match)
        manager.structural_matcher.register_phraselets(
            doc,
            replace_with_hypernym_ancestors=replace_with_hypernym_ancestors,
            match_all_words=match_all_words,
import urllib.request
from bs4 import BeautifulSoup
import holmes_extractor as holmes

def download_and_register(url, label):
    """Download an HTML page and register its text with ``holmes_manager``.

    :param url: address of the HTML document to fetch.
    :param label: label the document is registered under; also shown in the
        progress output.
    """
    print('Downloading', label)
    # Download the content; the context manager closes the HTTP response
    # once the HTML has been read, instead of leaving it open.
    with urllib.request.urlopen(url) as page:
        # Extract the raw text from the HTML document
        soup = BeautifulSoup(page, 'html.parser')
    # Register the document with Holmes
    print('Parsing and registering', label)
    holmes_manager.parse_and_register_document(soup.get_text(), label)

# Start the Holmes Manager with the German model
# The guard also accepts the module name 'example_search_DE_law', so the
# script body runs when the file is imported under that name as well.
if __name__ in ('__main__', 'example_search_DE_law'):
    holmes_manager = holmes.Manager(model='de_core_news_lg', number_of_workers=2)
    # Fetch and register two German statutes, labelled 'VVG_2008' and 'VAG'.
    download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html', 'VVG_2008')
    download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html', 'VAG')
    # Hand control to the topic-matching console.
    holmes_manager.start_topic_matching_search_mode_console(initial_question_word_embedding_match_threshold=0.7)

# Example queries:
#
# Der Versicherer darf den Vertrag fristlos kündigen, wenn der Versicherungsnehmer beim Abschluss des Vertrags die vorvertragliche Anzeigepflicht verletzt hat.
# Der Versicherer darf Leistungen verweigern.
# Der Versicherer darf die Prämie anpassen.
# Eine Richtlinie einer ENTITYORG
Exemple #17
0
 def test_unrecognized_initial_question_word_behaviour(self):
     """Expect ``ValueError`` for ``initial_question_word_behaviour='r'``."""
     # Setup stays outside the assertRaises block so that a ValueError raised
     # while building or populating the manager cannot satisfy the assertion.
     m = holmes.Manager('en_core_web_sm', number_of_workers=1)
     m.parse_and_register_document("a")
     # NOTE(review): the query is sent to the module-level
     # coref_holmes_manager, not to ``m`` - confirm this is intended.
     with self.assertRaises(ValueError):
         coref_holmes_manager.topic_match_documents_against(
             "b", initial_question_word_behaviour='r')
import unittest
import holmes_extractor as holmes
from holmes_extractor.errors import *
import jsonpickle

# Module-level managers, constructed once at import time.

# English manager with coreference resolution switched off.
nocoref_holmes_manager = holmes.Manager('en_core_web_lg',
                                        perform_coreference_resolution=False)
# English manager with coreference resolution switched on.
coref_holmes_manager = holmes.Manager('en_core_web_lg',
                                      perform_coreference_resolution=True)
# German manager with default settings.
german_holmes_manager = holmes.Manager('de_core_news_md')


class ErrorsTest(unittest.TestCase):
    """Each test expects ``holmes.Manager`` to reject its arguments."""

    def test_overall_similarity_threshold_out_of_range(self):
        # Threshold of 1.2 must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_lg',
                          overall_similarity_threshold=1.2)

    def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(
            self):
        # Root-word embedding matching combined with a threshold of 1.0
        # must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_lg',
                          overall_similarity_threshold=1.0,
                          embedding_based_matching_on_root_words=True)

    def test_model_does_not_support_embeddings(self):
        # 'en_core_web_sm' combined with a threshold of 0.85 must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_sm',
                          overall_similarity_threshold=0.85)

    def test_language_not_supported(self):
                    filename for filename in bbc_zipfile.namelist() if
                    filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
                category, document_number = get_document_filename_info(filename)
                if is_training_data(document_number):
                    with bbc_zipfile.open(filename, 'r') as training_doc:
                        training_contents = str(training_doc.read())
                        training_contents = training_contents.replace('\n', ' ').replace('\r', ' ')
                    training_basis.parse_and_register_training_document(
                        training_contents, category, filename)
        training_basis.prepare()
        classifier = training_basis.train().classifier()
        output_filename = os.sep.join((working_directory, 'model.json'))
        with open(output_filename, "w") as file:
            file.write(classifier.serialize_model())
        evaluate_classifier(zip_filename, classifier)
    # Single-worker manager on the large English model.
    holmes_manager = holmes.Manager('en_core_web_lg', number_of_workers=1)

    # Create the working directory when missing; refuse a non-directory path.
    if not os.path.exists(working_directory):
        os.mkdir(working_directory)
    elif not os.path.isdir(working_directory):
        raise RuntimeError(' '.join((working_directory, 'must be a directory')))
    zip_filename = os.sep.join((working_directory, 'bbc-fulltext.zip'))
    # Download the corpus archive only when it is not already on disk.
    if not os.path.exists(zip_filename):
        url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
        with urllib.request.urlopen(url) as response, open(zip_filename, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
    model_filename = os.sep.join((working_directory, 'model.json'))
    if not os.path.exists(model_filename):
        train_model(working_directory, zip_filename)
    else:
Exemple #20
0
import unittest
import holmes_extractor as holmes
import os

# Module-level fixtures: everything below runs once, at import time.

# Directory containing this file; the ontology file is looked up next to it.
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')))
# German manager using the ontology loaded above.
holmes_manager = holmes.Manager('de_core_news_md', ontology=ontology)


class GermanPhraseletProductionTest(unittest.TestCase):
    def _check_equals(self,
                      text_to_match,
                      phraselet_labels,
                      match_all_words=False,
                      include_reverse_only=False,
                      replace_with_hypernym_ancestors=False):
        """Parse *text_to_match* and collect its phraselets into a dict.

        The three boolean flags are forwarded to ``add_phraselets_to_dict``.
        NOTE(review): ``phraselet_labels`` is not read in the code shown
        here - the comparison step may be missing; confirm.
        """
        analyzer = holmes_manager.semantic_analyzer
        doc = analyzer.parse(text_to_match)
        phraselet_labels_to_phraselet_infos = {}
        # Stop lemmas and reverse-only parent lemmas come from the analyzer.
        phraselet_options = {
            'phraselet_labels_to_phraselet_infos':
            phraselet_labels_to_phraselet_infos,
            'replace_with_hypernym_ancestors': replace_with_hypernym_ancestors,
            'match_all_words': match_all_words,
            'ignore_relation_phraselets': False,
            'include_reverse_only': include_reverse_only,
            'stop_lemmas': analyzer.topic_matching_phraselet_stop_lemmas,
            'reverse_only_parent_lemmas':
            analyzer.topic_matching_reverse_only_parent_lemmas,
        }
        holmes_manager.structural_matcher.add_phraselets_to_dict(
            doc, **phraselet_options)
Exemple #21
0
import unittest
import holmes_extractor as holmes
from holmes_extractor.errors import *
import jsonpickle

# Module-level managers, constructed once at import time.

# English transformer-model manager with derivational morphology analysis
# and coreference resolution switched off, using two workers.
nocoref_holmes_manager = holmes.Manager('en_core_web_trf',
                                        analyze_derivational_morphology=False,
                                        perform_coreference_resolution=False,
                                        number_of_workers=2)
# English transformer-model manager with coreference resolution, one worker.
coref_holmes_manager = holmes.Manager('en_core_web_trf',
                                      perform_coreference_resolution=True,
                                      number_of_workers=1)
# German manager with one worker.
german_holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=1)


class ErrorsTest(unittest.TestCase):
    """Each test expects ``holmes.Manager`` to reject its arguments."""

    def test_overall_similarity_threshold_out_of_range(self):
        # Threshold of 1.2 must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_lg',
                          overall_similarity_threshold=1.2)

    def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(
            self):
        # Root-word embedding matching combined with a threshold of 1.0
        # must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_lg',
                          overall_similarity_threshold=1.0,
                          embedding_based_matching_on_root_words=True)

    def test_number_of_workers_out_of_range(self):
        # Zero workers must be rejected.
        self.assertRaises(ValueError,
                          holmes.Manager,
                          model='en_core_web_sm',
                          number_of_workers=0)
                filename for filename in bbc_zipfile.namelist() if
                filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
            category, document_number = get_document_filename_info(filename)
            if is_training_data(document_number):
                with bbc_zipfile.open(filename, 'r') as training_doc:
                    training_contents = str(training_doc.read())
                    training_contents = training_contents.replace('\n', ' ').replace('\r', ' ')
                training_basis.parse_and_register_training_document(
                    training_contents, category, filename)
    training_basis.prepare()
    classifier = training_basis.train().classifier()
    output_filename = os.sep.join((working_directory, 'model.json'))
    with open(output_filename, "w") as file:
        file.write(classifier.serialize_model())
    evaluate_classifier(zip_filename, classifier)
# Manager on the large English model.
holmes_manager = holmes.Manager('en_core_web_lg')

# Create the working directory when missing; refuse a non-directory path.
if os.path.exists(working_directory):
    if not os.path.isdir(working_directory):
        # str.join takes exactly one iterable, so both message parts must
        # sit in the same tuple; passing them as two arguments raised
        # TypeError instead of the intended RuntimeError.
        raise RuntimeError(' '.join((working_directory, 'must be a directory')))
else:
    os.mkdir(working_directory)
zip_filename = (os.sep.join((working_directory, 'bbc-fulltext.zip')))
# Download the corpus archive only when it is not already on disk.
if not os.path.exists(zip_filename):
    url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
    with urllib.request.urlopen(url) as response, open(zip_filename, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
model_filename = os.sep.join((working_directory, 'model.json'))
if not os.path.exists(model_filename):
    train_model(working_directory, zip_filename)
else:
Exemple #23
0
import unittest
import holmes_extractor as holmes

# Module-level fixture, built once at import time: a German manager with a
# fixed set of search phrases registered on it.
holmes_manager = holmes.Manager(model='de_core_news_sm')
holmes_manager.register_search_phrase("Ein Hund jagt eine Katze")
holmes_manager.register_search_phrase("Ein Hund jagt einen Bären")
holmes_manager.register_search_phrase("Ein Hund frisst einen Knochen")
holmes_manager.register_search_phrase("Ein Mann ist schlau")
holmes_manager.register_search_phrase("Der reiche Mann")
holmes_manager.register_search_phrase("Jemand hat einen Berg gesehen")
# The only phrase registered with a second argument ("excursion").
holmes_manager.register_search_phrase("Ein Student geht aus", "excursion")
holmes_manager.register_search_phrase("Der Abschluss einer Versicherung")
holmes_manager.register_search_phrase("Die Kündigung von einer Versicherung")
holmes_manager.register_search_phrase("Jemand schließt eine Versicherung ab")
holmes_manager.register_search_phrase("Wer war traurig?")
holmes_manager.register_search_phrase("Das Fahrzeug hat einen Fehler")
holmes_manager.register_search_phrase(
    "Jemand braucht eine Versicherung für fünf Jahre")
holmes_manager.register_search_phrase("Jemand braucht etwas für fünf Jahre")
holmes_manager.register_search_phrase("Jemand braucht für fünf Jahre")
# Second German manager; no search phrases are registered on it here.
holmes_manager_with_variable_search_phrases = holmes.Manager(
    model='de_core_news_sm')


class GermanStructuralMatchingTest(unittest.TestCase):
    def _get_matches(self, holmes_manager, text):
        """Return the matches for *text* as the manager's only document.

        All previously registered documents are removed from
        *holmes_manager* before *text* is parsed and registered.
        """
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document(document_text=text)
        matches = holmes_manager.match()
        return matches

    def test_direct_matching(self):
import unittest
import holmes_extractor as holmes
import os

# Module-level fixtures: everything below runs once, at import time.

# Directory containing this file; the ontology file is looked up next to it.
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')))
# English manager with the ontology and coreference resolution switched off.
nocoref_holmes_manager = holmes.Manager(model='en_core_web_lg',
                                        ontology=ontology,
                                        perform_coreference_resolution=False)
# Fixed set of search phrases registered on the manager above.
nocoref_holmes_manager.register_search_phrase("A dog chases a cat")
nocoref_holmes_manager.register_search_phrase("The man was poor")
nocoref_holmes_manager.register_search_phrase("The rich man")
nocoref_holmes_manager.register_search_phrase("Someone eats a sandwich")
nocoref_holmes_manager.register_search_phrase("A colleague's computer")
nocoref_holmes_manager.register_search_phrase(
    "An ENTITYPERSON opens an account")
nocoref_holmes_manager.register_search_phrase("A dog eats a bone")
nocoref_holmes_manager.register_search_phrase("Who fell asleep?")
nocoref_holmes_manager.register_search_phrase("Who is sad?")
nocoref_holmes_manager.register_search_phrase("Insurance for years")
nocoref_holmes_manager.register_search_phrase(
    "An employee needs insurance for the next five years")
nocoref_holmes_manager.register_search_phrase(
    "Somebody gives a file to an employee")
nocoref_holmes_manager.register_search_phrase("Somebody gives a boss a file")
nocoref_holmes_manager.register_search_phrase("Serendipity")
nocoref_holmes_manager.register_search_phrase("Somebody eats at an office")
nocoref_holmes_manager.register_search_phrase("A holiday is hard to book")
nocoref_holmes_manager.register_search_phrase("A man sings")
nocoref_holmes_manager.register_search_phrase("Somebody finds insurance")
 def test_language_not_supported(self):
     """Expect ``ValueError`` for the French model 'fr_core_news_sm'."""
     self.assertRaises(ValueError, holmes.Manager, model='fr_core_news_sm')
import urllib.request
from bs4 import BeautifulSoup
import holmes_extractor as holmes

def download_and_register(url, label):
    """Download an HTML page and register its text with ``holmes_manager``.

    :param url: address of the HTML document to fetch.
    :param label: label the document is registered under; also shown in the
        progress output.
    """
    print('Downloading', label)
    # Download the content; the context manager closes the HTTP response
    # once the HTML has been read, instead of leaving it open.
    with urllib.request.urlopen(url) as page:
        # Extract the raw text from the HTML document
        soup = BeautifulSoup(page, 'html.parser')
    # Register the document with Holmes
    print('Parsing and registering', label)
    holmes_manager.parse_and_register_document(soup.get_text(), label)

# Start the Holmes Manager with the German model
# These statements are unguarded, so they run whenever the module is loaded.
holmes_manager = holmes.Manager(model='de_core_news_sm')
# Fetch and register two German statutes, labelled 'VVG_2008' and 'VAG'.
download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html', 'VVG_2008')
download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html', 'VAG')
# Hand control to the search-mode console.
holmes_manager.start_search_mode_console()
Exemple #27
0
    (script_directory, 'test_ontology.owl')))
# Module-level fixtures, built once at import time.

# Manager taken from the HolmesInstanceManager helper's en_coref_lg_ontology
# attribute, with a fixed set of search phrases registered on it.
coref_holmes_manager = HolmesInstanceManager(ontology).en_coref_lg_ontology
coref_holmes_manager.register_search_phrase("A dog chases a cat")
coref_holmes_manager.register_search_phrase("A big horse chases a cat")
coref_holmes_manager.register_search_phrase("A tiger chases a little cat")
coref_holmes_manager.register_search_phrase("A big lion chases a cat")
coref_holmes_manager.register_search_phrase("An ENTITYPERSON needs insurance")
coref_holmes_manager.register_search_phrase("University for four years")
coref_holmes_manager.register_search_phrase("A big company makes a loss")
coref_holmes_manager.register_search_phrase(
    "A dog who chases rats chases mice")
coref_holmes_manager.register_search_phrase("A tired dog")
coref_holmes_manager.register_search_phrase("A panther chases a panther")
coref_holmes_manager.register_search_phrase("A leopard chases a leopard")
# Same model and ontology with coreference resolution switched off.
no_coref_holmes_manager = holmes.Manager(model='en_coref_lg',
                                         ontology=ontology,
                                         perform_coreference_resolution=False)
no_coref_holmes_manager.register_search_phrase("A dog chases a cat")
# Manager without ontology, overall similarity threshold 0.85.
embeddings_coref_holmes_manager = holmes.Manager(
    model='en_coref_lg', overall_similarity_threshold=0.85)
embeddings_coref_holmes_manager.register_search_phrase('A man loves a woman')


class CoreferenceEnglishMatchingTest(unittest.TestCase):
    def _check_word_match(self, match, word_match_index, document_token_index,
                          extracted_word):
        """Assert the token index and extracted word of one word match.

        The word match is ``match.word_matches[word_match_index]``; its
        document token index is checked first, then its extracted word.
        """
        selected = match.word_matches[word_match_index]
        token_index = selected.document_token.i
        self.assertEqual(token_index, document_token_index)
        self.assertEqual(selected.extracted_word, extracted_word)

    def test_simple_pronoun_coreference_same_sentence(self):
 def test_deserialized_documents(self):
     """Topic-match against documents registered from serialized form.

     Four documents are parsed and serialized by a plain manager, registered
     with a multiprocessing manager, and the labels and topic-matching
     dictionaries returned for "A dog chases an animal" are compared with
     fixed expected values.
     """
     normal_manager = holmes.Manager('en_core_web_sm',
                                     perform_coreference_resolution=False)
     normal_manager.parse_and_register_document(
         "I saw a dog. It was chasing a cat", 'specific')
     normal_manager.parse_and_register_document("The dog chased the animal",
                                                'exact')
     normal_manager.parse_and_register_document("The cat chased the dog",
                                                'specific-reversed')
     normal_manager.parse_and_register_document("The animal chased the dog",
                                                'exact-reversed')
     specific = normal_manager.serialize_document('specific')
     exact = normal_manager.serialize_document('exact')
     specific_reversed = normal_manager.serialize_document(
         'specific-reversed')
     exact_reversed = normal_manager.serialize_document('exact-reversed')
     m = holmes.MultiprocessingManager('en_core_web_sm',
                                       ontology=ontology,
                                       number_of_workers=2,
                                       verbose=False,
                                       perform_coreference_resolution=False)
     # Register the close as a cleanup so the manager is shut down even when
     # one of the assertions below fails; a trailing m.close() would be
     # skipped in that case.
     self.addCleanup(m.close)
     m.deserialize_and_register_documents({
         'specific': specific,
         'exact': exact,
         'specific-reversed': specific_reversed,
         'exact-reversed': exact_reversed
     })
     self.assertEqual(
         m.document_labels(),
         ['exact', 'exact-reversed', 'specific', 'specific-reversed'])
     expected_matches = [
         {
             'document_label': 'exact',
             'text': 'The dog chased the animal',
             'text_to_match': 'A dog chases an animal',
             'rank': '1',
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': 25,
             'score': 99.34666666666668,
             'word_infos': [
                 [4, 7, 'overlapping_relation', False,
                  "Matches DOG directly."],
                 [8, 14, 'overlapping_relation', False,
                  "Matches CHASE directly."],
                 [19, 25, 'overlapping_relation', True,
                  "Matches ANIMAL directly."],
             ],
         },
         {
             'document_label': 'exact-reversed',
             'text': 'The animal chased the dog',
             'text_to_match': 'A dog chases an animal',
             'rank': '2=',
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': 25,
             'score': 35.39866666666667,
             'word_infos': [
                 [4, 10, 'single', False, "Matches ANIMAL directly."],
                 [11, 17, 'relation', False, "Matches CHASE directly."],
                 [22, 25, 'relation', True,
                  "Is a child of ANIMAL in the ontology."],
             ],
         },
         {
             'document_label': 'specific-reversed',
             'text': 'The cat chased the dog',
             'text_to_match': 'A dog chases an animal',
             'rank': '2=',
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': 22,
             'score': 34.486666666666665,
             'word_infos': [
                 [4, 7, 'single', False,
                  "Is a child of ANIMAL in the ontology."],
                 [8, 14, 'relation', False, "Matches CHASE directly."],
                 [19, 22, 'relation', True,
                  "Is a child of ANIMAL in the ontology."],
             ],
         },
         {
             'document_label': 'specific',
             'text': 'I saw a dog. It was chasing a cat',
             'text_to_match': 'A dog chases an animal',
             'rank': '2=',
             'sentences_character_start_index_in_document': 0,
             'sentences_character_end_index_in_document': 33,
             'score': 31.88346666666667,
             'word_infos': [
                 [8, 11, 'single', False, "Matches DOG directly."],
                 [20, 27, 'relation', False,
                  "Is a synonym of CHASE in the ontology."],
                 [30, 33, 'relation', True,
                  "Is a child of ANIMAL in the ontology."],
             ],
         },
     ]
     self.assertEqual(
         m.topic_match_documents_returning_dictionaries_against(
             "A dog chases an animal"), expected_matches)
                with open(long_filename, "r") as f:
                    contents = f.read()
                serialized_documents[label] = contents
        holmes_manager.deserialize_and_register_documents(serialized_documents)

    # Create the working directory when missing; refuse a non-directory path.
    if os.path.exists(working_directory):
        if not os.path.isdir(working_directory):
            # str.join takes exactly one iterable, so both message parts must
            # sit in the same tuple; passing them as two arguments raised
            # TypeError instead of the intended RuntimeError.
            raise RuntimeError(' '.join((working_directory, 'must be a directory')))
    else:
        os.mkdir(working_directory)
    # Passed to every loader/processing call below.
    labels_to_documents={}

    # The flag file marks that an earlier run already processed the documents.
    if os.path.isfile(flag_filename):
        load_documents_from_working_directory(labels_to_documents)
    else:
        # First run: process each front page with a German manager.
        normal_holmes_manager = holmes.Manager(model='de_core_news_md')
        process_documents_from_front_page(normal_holmes_manager,
                "https://maerchen.com/grimm/", 'Gebrüder Grimm', labels_to_documents)
        process_documents_from_front_page(normal_holmes_manager,
                "https://maerchen.com/grimm2/", 'Gebrüder Grimm', labels_to_documents)
        process_documents_from_front_page(normal_holmes_manager,
                "https://maerchen.com/andersen/", 'Hans Christian Andersen', labels_to_documents)
        process_documents_from_front_page(normal_holmes_manager,
                "https://maerchen.com/bechstein/", 'Ludwig Bechstein', labels_to_documents)
        process_documents_from_front_page(normal_holmes_manager,
                "https://maerchen.com/wolf/", 'Johann Wilhelm Wolf', labels_to_documents)
        # Generate flag file to indicate files can be reloaded on next run
        open(flag_filename, 'a').close()
        load_documents_from_working_directory(labels_to_documents)

    #Comment following line in to activate interactive console
Exemple #30
0
 def test_number_of_workers_out_of_range(self):
     """Expect ``ValueError`` when ``number_of_workers`` is 0."""
     self.assertRaises(ValueError,
                       holmes.Manager,
                       model='en_core_web_sm',
                       number_of_workers=0)