def test_mention_longest_match():
    """Test longest match filtering in mention extraction.

    Runs the same Place extraction twice — once with
    ``longest_match_only=False`` (sub-spans like "Farm" are kept) and once
    with ``longest_match_only=True`` (only the maximal span
    "Sinking Spring Farm" survives) — and checks the resulting span sets.
    """
    file_name = "lincoln_short"
    docs_path = f"tests/data/pure_html/{file_name}.html"
    doc = parse_doc(docs_path, file_name)

    # Mention Extraction
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)
    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        """Match mentions that sit in a table row labeled "birth_place"."""
        if not mention.sentence.is_tabular():
            return False
        ngrams = get_row_ngrams(mention, lower=True)
        # Direct boolean return instead of if/else returning True/False.
        return "birth_place" in ngrams

    birthplace_matcher = LambdaFunctionMatcher(
        func=is_birthplace_table_row, longest_match_only=False
    )
    mention_extractor_udf = MentionExtractorUDF(
        [Name, Place],
        [name_ngrams, place_ngrams],
        [PersonMatcher(), birthplace_matcher],
    )
    doc = mention_extractor_udf.apply(doc)
    mentions = doc.places
    mention_spans = [x.context.get_span() for x in mentions]
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" in mention_spans
    assert len(mention_spans) == 23

    # Clear the previous results before re-extracting. In-place slice
    # deletion empties the collection in one O(n) pass, unlike the
    # remove-one-at-a-time loop, which is O(n^2).
    del doc.places[:]

    birthplace_matcher = LambdaFunctionMatcher(
        func=is_birthplace_table_row, longest_match_only=True
    )
    mention_extractor_udf = MentionExtractorUDF(
        [Name, Place],
        [name_ngrams, place_ngrams],
        [PersonMatcher(), birthplace_matcher],
    )
    doc = mention_extractor_udf.apply(doc)
    mentions = doc.places
    mention_spans = [x.context.get_span() for x in mentions]
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" not in mention_spans
    assert len(mention_spans) == 4
def test_ngrams():
    """Test ngram limits in mention extraction.

    Extracts Person mentions first with ``n_max=3`` (unigrams allowed),
    then with ``n_min=2`` (unigrams excluded), asserting the expected
    mention counts and word-length distributions each time.
    """
    file_name = "lincoln_short"
    docs_path = f"tests/data/pure_html/{file_name}.html"
    doc = parse_doc(docs_path, file_name)

    # Mention Extraction
    Person = mention_subclass("Person")
    person_ngrams = MentionNgrams(n_max=3)
    person_matcher = PersonMatcher()

    mention_extractor_udf = MentionExtractorUDF(
        [Person], [person_ngrams], [person_matcher]
    )
    doc = mention_extractor_udf.apply(doc)
    assert len(doc.persons) == 118
    mentions = doc.persons
    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 49
    assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0

    # Test for unigram exclusion.
    # In-place slice deletion empties the collection in one O(n) pass,
    # unlike removing elements one at a time, which is O(n^2).
    del doc.persons[:]
    assert len(doc.persons) == 0

    person_ngrams = MentionNgrams(n_min=2, n_max=3)
    mention_extractor_udf = MentionExtractorUDF(
        [Person], [person_ngrams], [person_matcher]
    )
    doc = mention_extractor_udf.apply(doc)
    assert len(doc.persons) == 69
    mentions = doc.persons
    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 0
    assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0
def test_ngrams(caplog):
    """Test ngram limits in mention extraction"""
    caplog.set_level(logging.INFO)

    PARALLEL = 4
    max_docs = 1
    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    docs_path = "tests/data/pure_html/lincoln_short.html"

    # Parse the corpus into the database first.
    logger.info("Parsing...")
    preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    html_parser = Parser(session, structural=True, lingual=True)
    html_parser.apply(preprocessor, parallelism=PARALLEL)
    assert session.query(Document).count() == max_docs
    assert session.query(Sentence).count() == 503
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    Person = mention_subclass("Person")
    person_ngrams = MentionNgrams(n_max=3)
    person_matcher = PersonMatcher()

    extractor = MentionExtractor(
        session, [Person], [person_ngrams], [person_matcher]
    )
    extractor.apply(docs, parallelism=PARALLEL)

    # With n_max=3: unigrams are allowed, nothing longer than 3 words.
    assert session.query(Person).count() == 118
    extracted = session.query(Person).all()
    assert sum(1 for m in extracted if m.context.get_num_words() == 1) == 49
    assert sum(1 for m in extracted if m.context.get_num_words() > 3) == 0

    # Test for unigram exclusion
    person_ngrams = MentionNgrams(n_min=2, n_max=3)
    extractor = MentionExtractor(
        session, [Person], [person_ngrams], [person_matcher]
    )
    extractor.apply(docs, parallelism=PARALLEL)
    assert session.query(Person).count() == 69
    extracted = session.query(Person).all()
    assert sum(1 for m in extracted if m.context.get_num_words() == 1) == 0
    assert sum(1 for m in extracted if m.context.get_num_words() > 3) == 0
def test_mention_longest_match(caplog):
    """Test longest match filtering in mention extraction."""
    caplog.set_level(logging.INFO)
    # SpaCy on mac has issue on parallel parsing
    PARALLEL = 1
    max_docs = 1
    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    docs_path = "tests/data/pure_html/lincoln_short.html"

    # Parsing
    logger.info("Parsing...")
    preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    html_parser = Parser(session, structural=True, lingual=True)
    html_parser.apply(preprocessor, parallelism=PARALLEL)
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)
    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        """True for tabular mentions whose row contains "birth_place"."""
        return mention.sentence.is_tabular() and "birth_place" in get_row_ngrams(
            mention, lower=True
        )

    # First pass: keep every matching span, including sub-spans.
    birthplace_matcher = LambdaFunctionMatcher(
        func=is_birthplace_table_row, longest_match_only=False
    )
    extractor = MentionExtractor(
        session,
        [Name, Place],
        [name_ngrams, place_ngrams],
        [PersonMatcher(), birthplace_matcher],
    )
    extractor.apply(docs, parallelism=PARALLEL)
    spans = [place.context.get_span() for place in session.query(Place).all()]
    assert "Sinking Spring Farm" in spans
    assert "Farm" in spans
    assert len(spans) == 23

    # Second pass: only the maximal span should survive.
    birthplace_matcher = LambdaFunctionMatcher(
        func=is_birthplace_table_row, longest_match_only=True
    )
    extractor = MentionExtractor(
        session,
        [Name, Place],
        [name_ngrams, place_ngrams],
        [PersonMatcher(), birthplace_matcher],
    )
    extractor.apply(docs, parallelism=PARALLEL)
    spans = [place.context.get_span() for place in session.query(Place).all()]
    assert "Sinking Spring Farm" in spans
    assert "Farm" not in spans
    assert len(spans) == 4
def test_ner_matchers():
    """Test different ner type matchers.

    Builds a small in-memory document, attaches fixed NER tags (so the
    test does not depend on the spaCy model's output), and checks that
    each NER-type matcher selects exactly the expected span.
    """
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join([
        "Tim Cook was born in USA in 1960.",
        "He is the CEO of Apple.",
        "He sold 100 million of iPhone.",
    ])
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)

    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = [
        "O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"
    ]

    # The length of words and that of ner_tags should match for every
    # sentence (the third sentence was previously left unchecked).
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)
    assert len(doc.sentences[2].words) == len(doc.sentences[2].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"iPhone"}