Ejemplo n.º 1
0
def get_subclasses(experiment):
  """Assemble the mention/candidate configuration for ``experiment``.

  Returns:
    A 5-tuple ``(mention_classes, mention_spaces, matchers,
    candidate_classes, throttlers)``. The first three lists are ordered
    Data, Row, Col; the last two are ordered Row, Col.
  """
  any_text_rgx = r"^.*$"
  data_rgx = r"[0-9-,.%$#]+( to | )?[0-9-,.%$#]*|^x$"

  def _labelled(rgx, label):
    # A span must pass the regex AND the experiment-specific label check.
    return Intersect(
      RegexMatchSpan(rgx=rgx, longest_match_only=True),
      LambdaFunctionMatcher(func=get_label_matcher(label, experiment)),
    )

  # 1.) Mention subclasses
  Data, Row, Col = [mention_subclass(name) for name in ("Data", "Row", "Col")]

  # 2.) Mention spaces: every mention type is drawn from whole sentences.
  sentence_spaces = [MentionSentences() for _ in range(3)]

  # 3.) Matchers: rows and columns both use the "Header" label.
  all_matchers = [
    _labelled(data_rgx, "Data"),
    _labelled(any_text_rgx, "Header"),
    _labelled(any_text_rgx, "Header"),
  ]

  # 4.) Candidate classes pair a Data mention with a Row or a Col mention.
  RowCandidate = candidate_subclass("RowCandidate", [Data, Row])
  ColCandidate = candidate_subclass("ColCandidate", [Data, Col])

  # 5.) Throttlers, one per candidate class.
  return (
    [Data, Row, Col],
    sentence_spaces,
    all_matchers,
    [RowCandidate, ColCandidate],
    [row_filter, col_filter],
  )
Ejemplo n.º 2
0
def get_supply_current_matcher():
    """Build the matcher used for supply-current values.

    The result is the intersection of four matchers: the project's
    ``_condition`` and ``_first_page_or_table`` predicates, a numeric regex,
    and the unit/keyword context check defined below.
    """

    def current_units(attr):
        """Return True if ``attr`` is near a current unit and a supply keyword."""
        # NOTE: the two mu symbols below are distinct characters, not duplicates.
        unit_tokens = ["ma", "μa", "ua", "µa", "\uf06da"]
        supply_words = ["supply", "quiescent", "iq", "is", "idd", "icc"]
        excluded_words = ["offset", "bias", "logic", "shutdown"]

        # Unigrams to the right of the span plus those of rows within +/-5.
        nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
        nearby.update(
            get_row_ngrams(attr, n_max=1, spread=[-5, 5], lower=True)
        )

        # A bare "0" is never accepted.
        if attr.get_span().strip() == "0":
            return False

        # Rows mentioning any excluded word are rejected outright.
        if overlap(excluded_words, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        return bool(
            overlap(unit_tokens, nearby) and overlap(supply_words, nearby)
        )

    # Optional "±", then either d{1,2}.d{1,2} or an integer of 1-4 digits.
    value_rgx = RegexMatchSpan(
        rgx=r"(±?\d{1,2}\.\d{1,2}|±?\d{1,4})", longest_match_only=False
    )
    units_matcher = LambdaFunctionMatcher(func=current_units)
    condition_matcher = LambdaFunctionMatcher(func=_condition)
    location_matcher = LambdaFunctionMatcher(func=_first_page_or_table)

    return Intersect(
        condition_matcher, location_matcher, value_rgx, units_matcher
    )
Ejemplo n.º 3
0
def phone_extract(docs, session, phone_subclass, parallelism, clear=True):
    """Run phone-number mention extraction over ``docs``.

    A span is accepted when either ``regexMatch`` or ``matcher_number_phone``
    accepts it. ``parallelism`` and ``clear`` are forwarded unchanged to
    ``MentionExtractor.apply``. Returns None.
    """
    number_predicate = LambdaFunctionMatcher(func=matcher_number_phone)
    regex_predicate = LambdaFunctionMatcher(func=regexMatch)
    either_matcher = Union(regex_predicate, number_predicate)

    extractor = MentionExtractor(
        session,
        [phone_subclass],
        [MentionPhoneNumber()],
        [either_matcher],
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Ejemplo n.º 4
0
def phone_extract_server(document, phone_subclass):
    """Apply phone-number mention extraction to a single document.

    Uses the same matcher as ``phone_extract`` (either ``regexMatch`` or
    ``matcher_number_phone`` must accept the span) but runs the UDF directly
    and returns whatever ``MentionExtractorUDF.apply`` returns.
    """
    number_predicate = LambdaFunctionMatcher(func=matcher_number_phone)
    regex_predicate = LambdaFunctionMatcher(func=regexMatch)
    either_matcher = Union(regex_predicate, number_predicate)

    extractor_udf = MentionExtractorUDF(
        [phone_subclass], [MentionPhoneNumber()], [either_matcher]
    )
    return extractor_udf.apply(document)
Ejemplo n.º 5
0
def test_mention_longest_match():
    """Check that ``longest_match_only`` prunes nested Place mentions."""
    file_name = "lincoln_short"
    doc = parse_doc(f"tests/data/pure_html/{file_name}.html", file_name)

    # Mention spaces and classes shared by both extraction runs.
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)
    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        # Only tabular sentences can belong to the "birth_place" row.
        if not mention.sentence.is_tabular():
            return False
        return "birth_place" in get_row_ngrams(mention, lower=True)

    def extract_place_spans(document, longest_match_only):
        # Run one extraction pass; return the updated doc and its Place spans.
        place_matcher = LambdaFunctionMatcher(
            func=is_birthplace_table_row,
            longest_match_only=longest_match_only,
        )
        udf = MentionExtractorUDF(
            [Name, Place],
            [name_ngrams, place_ngrams],
            [PersonMatcher(), place_matcher],
        )
        document = udf.apply(document)
        return document, [m.context.get_span() for m in document.places]

    # Without longest-match filtering, sub-spans such as "Farm" survive.
    doc, mention_spans = extract_place_spans(doc, longest_match_only=False)
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" in mention_spans
    assert len(mention_spans) == 23

    # Clear manually (iterate over a copy because the list is mutated).
    for stale in list(doc.places):
        doc.places.remove(stale)

    # With longest-match filtering, only the maximal spans remain.
    doc, mention_spans = extract_place_spans(doc, longest_match_only=True)
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" not in mention_spans
    assert len(mention_spans) == 4
Ejemplo n.º 6
0
def address_extract_server(document, address_subclass):
    """Apply address mention extraction to a single document.

    A sentence is accepted when at least one of ``has_province_address``,
    ``has_geographic_term_address`` or ``address_prefix`` accepts it, and both
    ``is_collection_of_number_and_geographical_term_and_provinces_name_address``
    and ``hasnt_ignor_words`` accept it as well. Returns the result of
    ``MentionExtractorUDF.apply``.
    """
    # Any one of these three predicates is sufficient ...
    any_address_cue = Union(
        LambdaFunctionMatcher(func=has_province_address),
        LambdaFunctionMatcher(func=has_geographic_term_address),
        LambdaFunctionMatcher(func=address_prefix),
    )
    # ... but these two are always required.
    address_matcher = Intersect(
        any_address_cue,
        LambdaFunctionMatcher(
            func=is_collection_of_number_and_geographical_term_and_provinces_name_address
        ),
        LambdaFunctionMatcher(func=hasnt_ignor_words),
    )

    extractor_udf = MentionExtractorUDF(
        [address_subclass], [MentionSentences()], [address_matcher]
    )
    return extractor_udf.apply(document)
Ejemplo n.º 7
0
def address_extract(docs, session, address_subclass, parallelism, clear=True):
    """Run address mention extraction over ``docs``.

    A sentence is accepted when at least one of ``has_province_address``,
    ``has_geographic_term_address`` or ``address_prefix`` accepts it, and both
    ``is_collection_of_number_and_geographical_term_and_provinces_name_address``
    and ``hasnt_ignor_words`` accept it as well. ``parallelism`` and ``clear``
    are forwarded unchanged to ``MentionExtractor.apply``. Returns None.
    """
    # Any one of these three predicates is sufficient ...
    any_address_cue = Union(
        LambdaFunctionMatcher(func=has_province_address),
        LambdaFunctionMatcher(func=has_geographic_term_address),
        LambdaFunctionMatcher(func=address_prefix),
    )
    # ... but these two are always required.
    address_matcher = Intersect(
        any_address_cue,
        LambdaFunctionMatcher(
            func=is_collection_of_number_and_geographical_term_and_provinces_name_address
        ),
        LambdaFunctionMatcher(func=hasnt_ignor_words),
    )

    extractor = MentionExtractor(
        session, [address_subclass], [MentionSentences()], [address_matcher]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def birthday_extract_server(document, birthday_subclass):
    """Apply birthday mention extraction to a single document.

    A span from the ``MentionDates`` space is accepted only when both
    ``filter_birthday`` and ``birthday_conditions`` accept it; each predicate
    is wrapped with ``longest_match_only=True``. Returns the result of
    ``MentionExtractorUDF.apply``.
    """
    both_conditions = Intersect(
        LambdaFunctionMatcher(func=filter_birthday, longest_match_only=True),
        LambdaFunctionMatcher(
            func=birthday_conditions, longest_match_only=True
        ),
    )
    extractor_udf = MentionExtractorUDF(
        [birthday_subclass], [MentionDates()], [both_conditions]
    )
    return extractor_udf.apply(document)
Ejemplo n.º 9
0
def _get_part_matcher():
    """Return the part matcher.

    A span matches when any of the following holds: it matches one of the
    transistor naming regexes, it is in the set returned by
    ``_get_digikey_parts_set(DICT_PATH)``, or it matches the generic
    part-number regex and ``_part_file_name_conditions`` accepts it.
    """
    # Transistor naming conventions as regular expressions, one per scheme.
    naming_patterns = [
        # EECA
        (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
         r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"),
        # JEDEC
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # JIS
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # Others
        (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
         r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
         r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"),
    ]

    by_naming_rule = RegexMatchSpan(
        rgx="|".join(naming_patterns), longest_match_only=True
    )
    by_dictionary = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))
    # 5-15 characters of upper-case letters, digits and dashes, additionally
    # gated by the file-name predicate.
    by_file_name = Intersect(
        RegexMatchSpan(rgx=r"^[A-Z0-9\-]{5,15}$", longest_match_only=True),
        LambdaFunctionMatcher(func=_part_file_name_conditions),
    )
    return Union(by_naming_rule, by_dictionary, by_file_name)
Ejemplo n.º 10
0
def birthday_extract(docs,
                     session,
                     birthday_subclass,
                     parallelism,
                     clear=True):
    """Run birthday mention extraction over ``docs``.

    A span from the ``MentionDates`` space is accepted only when both
    ``filter_birthday`` and ``birthday_conditions`` accept it; each predicate
    is wrapped with ``longest_match_only=True``. ``parallelism`` and ``clear``
    are forwarded unchanged to ``MentionExtractor.apply``. Returns None.
    """
    both_conditions = Intersect(
        LambdaFunctionMatcher(func=filter_birthday, longest_match_only=True),
        LambdaFunctionMatcher(
            func=birthday_conditions, longest_match_only=True
        ),
    )
    extractor = MentionExtractor(
        session, [birthday_subclass], [MentionDates()], [both_conditions]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Ejemplo n.º 11
0
def email_extract(docs, session, email_subclass, parallelism, clear=True):
    """Run e-mail mention extraction over ``docs``.

    Spans come from the ``MentionEmails`` space and are filtered by
    ``email_mc`` with ``longest_match_only=True``. ``parallelism`` and
    ``clear`` are forwarded unchanged to ``MentionExtractor.apply``.
    Returns None.
    """
    only_emails = LambdaFunctionMatcher(
        func=email_mc, longest_match_only=True
    )
    extractor = MentionExtractor(
        session, [email_subclass], [MentionEmails()], [only_emails]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Ejemplo n.º 12
0
def test_row_col_ngram_extraction():
    """Check row/column ngrams for mentions inside and outside of tables.

    Outside a table both helpers must yield exactly one ``None``; inside a
    table neither may yield ``None``. The checks run inside the matcher
    function, so they are exercised for every candidate span.
    """
    file_name = "lincoln_short"
    doc = parse_doc(f"tests/data/pure_html/{file_name}.html", file_name)

    # Mention Extraction
    place_ngrams = MentionNgramsTemp(n_max=4)
    Place = mention_subclass("Place")

    def get_row_and_column_ngrams(mention):
        rows = list(get_row_ngrams(mention))
        cols = list(get_col_ngrams(mention))
        if mention.sentence.is_tabular():
            assert all(x is not None for x in rows)
            assert all(x is not None for x in cols)
        else:
            assert len(rows) == 1 and rows[0] is None
            assert len(cols) == 1 and cols[0] is None
        return "birth_place" in rows

    udf = MentionExtractorUDF(
        [Place],
        [place_ngrams],
        [LambdaFunctionMatcher(func=get_row_and_column_ngrams)],
    )
    doc = udf.apply(doc)
Ejemplo n.º 13
0
def email_extract_server(document, email_subclass):
    """Apply e-mail mention extraction to a single document.

    Spans come from the ``MentionEmails`` space and are filtered by
    ``email_mc`` with ``longest_match_only=True``. Returns the result of
    ``MentionExtractorUDF.apply``.
    """
    only_emails = LambdaFunctionMatcher(
        func=email_mc, longest_match_only=True
    )
    extractor_udf = MentionExtractorUDF(
        [email_subclass], [MentionEmails()], [only_emails]
    )
    return extractor_udf.apply(document)
Ejemplo n.º 14
0
def get_gain_matcher():
    """Build the matcher used for gain-bandwidth values.

    The result is the intersection of four matchers: the unit/keyword context
    check defined below, a numeric regex, and the project's
    ``_first_page_or_table`` and ``_condition`` predicates.
    """

    def hertz_units(attr):
        """Return True if ``attr`` is near a hertz unit and a gain keyword."""
        unit_tokens = ["mhz", "khz"]
        # NOTE(review): "gain" appears twice; kept as-is, presumably a
        # harmless duplicate -- confirm against overlap().
        gain_words = [
            "product", "gain", "gain", "unity",
            "bandwidth", "gbp", "gbw", "gbwp",
        ]
        excluded_words = ["-3 db", "maximum", "minimum", "impedance"]

        # Unigrams to the right of the span plus those of rows within +/-2.
        nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
        nearby.update(
            get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True)
        )
        in_cell = set(get_cell_ngrams(attr, n_max=1, lower=True))

        # Cells containing both "f" and "=" are rejected.
        if {"f", "="} <= in_cell:
            return False

        # A bare "0" is never accepted.
        if attr.get_span().strip() == "0":
            return False

        # Rows mentioning any excluded word are rejected outright.
        if overlap(excluded_words, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        return bool(
            overlap(unit_tokens, nearby) and overlap(gain_words, nearby)
        )

    # Whole-span match: d{1,2}.d{1,2} or an integer of 1-3 digits.
    value_rgx = RegexMatchSpan(
        rgx=r"^(?:\d{1,2}\.\d{1,2}|\d{1,3})$", longest_match_only=False
    )
    units_matcher = LambdaFunctionMatcher(func=hertz_units)
    condition_matcher = LambdaFunctionMatcher(func=_condition)
    location_matcher = LambdaFunctionMatcher(func=_first_page_or_table)

    return Intersect(
        units_matcher, value_rgx, location_matcher, condition_matcher
    )
Ejemplo n.º 15
0
def test_lambda_function_matcher(doc_setup):
    """Test LambdaFunctionMatcher: accept-all predicate and invalid usage."""
    doc = doc_setup
    unigram_space = MentionNgrams(n_min=1, n_max=1)

    # An always-true predicate keeps every unigram of the document.
    accept_all = LambdaFunctionMatcher(func=lambda x: True)
    matched_spans = {
        tc.get_span() for tc in accept_all.apply(unigram_space.apply(doc))
    }
    assert matched_spans == {"This", "is", "apple"}

    # Applying the matcher to something other than TemporarySpanMention
    # objects (here: raw words) must raise.
    with pytest.raises(ValueError):
        list(accept_all.apply(doc.sentences[0].words))

    # Constructing the matcher without ``func`` must raise.
    with pytest.raises(Exception):
        LambdaFunctionMatcher()
Ejemplo n.º 16
0
def _get_polarity_matcher():
    """Return the polarity matcher.

    A span matches when it matches ``NPN|PNP`` (case-insensitive) and its
    sentence ngrams contain neither "complement" nor "complementary".
    """
    excluded_words = ["complement", "complementary"]

    def polarity_conditions(attr):
        # Reject spans whose sentence mentions a complementary device.
        return not overlap(excluded_words, get_sentence_ngrams(attr))

    return Intersect(
        RegexMatchSpan(
            rgx=r"NPN|PNP", longest_match_only=False, ignore_case=True
        ),
        LambdaFunctionMatcher(func=polarity_conditions),
    )
Ejemplo n.º 17
0
def _get_ce_v_max_matcher():
    """Return a collector-emitter voltage max matcher.

    A span matches when it matches the numeric regex, ``_attr_in_table``
    accepts it, and the row-context check below passes.
    """
    # Keywords and abbreviations, merged once instead of on every call.
    ce_terms = {
        "collector emitter",
        "collector-emitter",
        "collector - emitter",
        "ceo",
        "vceo",
    }
    rejected_substrings = ("vcb", "base")

    def ce_v_max_conditions(attr):
        # The row must mention a collector-emitter term ...
        row_unigrams = set(get_row_ngrams(attr, n_max=1))
        if not overlap(ce_terms, row_unigrams):
            return False
        # ... and the sentence must not mention the rejected substrings.
        sentence_text = attr.sentence.text.lower()
        return not any(s in sentence_text for s in rejected_substrings)

    # Word boundary, one or two digits, then a final 0 or 5.
    value_rgx = RegexMatchSpan(rgx=r"\b\d{1,2}[05]", longest_match_only=False)
    row_matcher = LambdaFunctionMatcher(func=ce_v_max_conditions)
    table_matcher = LambdaFunctionMatcher(func=_attr_in_table)

    return Intersect(value_rgx, row_matcher, table_matcher)
Ejemplo n.º 18
0
def test_row_col_ngram_extraction(caplog):
    """Check row/column ngrams for mentions inside and outside of tables.

    Outside a table both helpers must yield exactly one ``None``; inside a
    table neither may yield ``None``. The checks run inside the matcher
    function, so they are exercised for every candidate span.
    """
    caplog.set_level(logging.INFO)
    PARALLEL = 1
    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    # Parsing
    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(
        "tests/data/pure_html/lincoln_short.html", max_docs=1
    )
    Parser(session, structural=True, lingual=True).apply(
        doc_preprocessor, parallelism=PARALLEL
    )
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    place_ngrams = MentionNgramsTemp(n_max=4)
    Place = mention_subclass("Place")

    def get_row_and_column_ngrams(mention):
        rows = list(get_row_ngrams(mention))
        cols = list(get_col_ngrams(mention))
        if mention.sentence.is_tabular():
            assert all(x is not None for x in rows)
            assert all(x is not None for x in cols)
        else:
            assert len(rows) == 1 and rows[0] is None
            assert len(cols) == 1 and cols[0] is None
        return "birth_place" in rows

    extractor = MentionExtractor(
        session,
        [Place],
        [place_ngrams],
        [LambdaFunctionMatcher(func=get_row_and_column_ngrams)],
    )
    extractor.apply(docs, parallelism=PARALLEL)
Ejemplo n.º 19
0
def name_extract(docs, session, name_subclass, parallelism, clear=True):
    """Run name mention extraction over ``docs``.

    A span from the ``MentionName`` space is accepted when ``check_name``
    accepts it and at least one of these holds:

    * ``last_name`` plus the three form checks (``length_name``,
      ``position_name``, ``capitalize_name``),
    * ``name_common`` plus the same three form checks,
    * ``prefix_name`` on its own.

    ``parallelism`` and ``clear`` are forwarded unchanged to
    ``MentionExtractor.apply``. Returns None.
    """
    form_ok = Intersect(
        LambdaFunctionMatcher(func=length_name),
        LambdaFunctionMatcher(func=position_name),
        LambdaFunctionMatcher(func=capitalize_name),
    )
    name_like = Union(
        Intersect(LambdaFunctionMatcher(func=last_name), form_ok),
        Intersect(LambdaFunctionMatcher(func=name_common), form_ok),
        LambdaFunctionMatcher(func=prefix_name),
    )
    name_matcher = Intersect(
        name_like, LambdaFunctionMatcher(func=check_name)
    )

    extractor = MentionExtractor(
        session, [name_subclass], [MentionName()], [name_matcher]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Ejemplo n.º 20
0
def name_extract_server(document, name_subclass):
    """Apply name mention extraction to a single document.

    A span from the ``MentionName`` space is accepted when ``check_name``
    accepts it and at least one of these holds:

    * ``last_name`` plus the three form checks (``length_name``,
      ``position_name``, ``capitalize_name``),
    * ``name_common`` plus the same three form checks,
    * ``prefix_name`` on its own.

    Returns the result of ``MentionExtractorUDF.apply``.
    """
    form_ok = Intersect(
        LambdaFunctionMatcher(func=length_name),
        LambdaFunctionMatcher(func=position_name),
        LambdaFunctionMatcher(func=capitalize_name),
    )
    name_like = Union(
        Intersect(LambdaFunctionMatcher(func=last_name), form_ok),
        Intersect(LambdaFunctionMatcher(func=name_common), form_ok),
        LambdaFunctionMatcher(func=prefix_name),
    )
    name_matcher = Intersect(
        name_like, LambdaFunctionMatcher(func=check_name)
    )

    extractor_udf = MentionExtractorUDF(
        [name_subclass], [MentionName()], [name_matcher]
    )
    return extractor_udf.apply(document)
Ejemplo n.º 21
0
# Create the two mention classes, named "Presidentname" and "Placeofbirth".
Presidentname = mention_subclass("Presidentname")
Placeofbirth = mention_subclass("Placeofbirth")


def mention_span_matches_file_name(mention):
    """Return True if the mention text equals its document's name.

    Underscores in the document name are replaced with spaces before the
    comparison, which is exact and case-sensitive.
    """
    return mention.get_span() == mention.sentence.document.name.replace("_", " ")


from fonduer.candidates.matchers import LambdaFunctionMatcher, Intersect, Union

# Wrap mention_span_matches_file_name as a matcher object.
president_name_matcher = LambdaFunctionMatcher(
    func=mention_span_matches_file_name)

from fonduer.utils.data_model_utils import get_row_ngrams


def is_in_birthplace_table_row(mention):
    """Return True if the mention is in a table row with "birth" and "place".

    Mentions whose sentence is not tabular are rejected before any row
    ngrams are computed. Row ngrams are lower-cased.
    """
    if not mention.sentence.is_tabular():
        return False
    # Both words must be present among the row's ngrams.
    return {"birth", "place"}.issubset(get_row_ngrams(mention, lower=True))

Ejemplo n.º 22
0
# Getting all documents parsed by Snorkel
# Loads every Document row through `session`; both names come from outside
# this excerpt.
print("Getting documents and sentences...")
docs = session.query(Document).all()
# Sentences are not loaded: the query below is disabled, even though the
# message above mentions them.
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union, LambdaFunctionMatcher
from dataset_utils import price_match

# Defining ngrams for candidates
extraction_name = 'price'
# Candidate spans are ngrams of at most 5 tokens.
ngrams = MentionNgrams(n_max=5)

# Define matchers
# Despite the plural name this is a single matcher wrapping price_match.
matchers = LambdaFunctionMatcher(func=price_match)

# Getting candidates
PriceMention = mention_subclass("PriceMention")
mention_extractor = MentionExtractor(
        session, [PriceMention], [ngrams], [matchers]
    )
# NOTE(review): clear_all() runs before apply(); presumably it removes
# previously extracted mentions -- confirm against the fonduer docs.
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)
# A "Price" candidate consists of one PriceMention.
candidate_class = candidate_subclass("Price", [PriceMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Applying candidate extractors
# Every document is assigned to split 0.
candidate_extractor.apply(docs, split=0, parallelism=parallelism)
print("==============================")
print(f"Candidate extraction results for {postgres_db_name}:")
Ejemplo n.º 23
0
from fonduer.candidates.matchers import LambdaFunctionMatcher


def person_name_matcher(mention):
    """Return True if every NER tag of the mention's sentence is 'PERSON'.

    The tags are read from ``mention.sentence.ner_tags`` (the whole sentence,
    not only the mention span); an empty tag list yields False.
    """
    return set(mention.sentence.ner_tags) == {'PERSON'}


# Matcher object wrapping person_name_matcher; returned by get_matchers().
person_name_function = LambdaFunctionMatcher(func=person_name_matcher)


def get_matchers():
    """Return the matchers of this module as a new one-element list."""
    matchers = [person_name_function]
    return matchers
Ejemplo n.º 24
0
def test_mention_longest_match(caplog):
    """Check that ``longest_match_only`` prunes nested Place mentions."""
    caplog.set_level(logging.INFO)
    # SpaCy on mac has issue on parallel parsing
    PARALLEL = 1
    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    # Parsing
    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(
        "tests/data/pure_html/lincoln_short.html", max_docs=1
    )
    Parser(session, structural=True, lingual=True).apply(
        doc_preprocessor, parallelism=PARALLEL
    )
    docs = session.query(Document).order_by(Document.name).all()

    # Mention spaces and classes shared by both extraction runs.
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)
    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        # Only tabular sentences can belong to the "birth_place" row.
        if not mention.sentence.is_tabular():
            return False
        return "birth_place" in get_row_ngrams(mention, lower=True)

    def extract_place_spans(longest_match_only):
        # Run one extraction pass and return the spans of all stored Places.
        place_matcher = LambdaFunctionMatcher(
            func=is_birthplace_table_row,
            longest_match_only=longest_match_only,
        )
        extractor = MentionExtractor(
            session,
            [Name, Place],
            [name_ngrams, place_ngrams],
            [PersonMatcher(), place_matcher],
        )
        extractor.apply(docs, parallelism=PARALLEL)
        return [m.context.get_span() for m in session.query(Place).all()]

    # Without longest-match filtering, sub-spans such as "Farm" survive.
    mention_spans = extract_place_spans(longest_match_only=False)
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" in mention_spans
    assert len(mention_spans) == 23

    # With longest-match filtering, only the maximal spans remain.
    mention_spans = extract_place_spans(longest_match_only=True)
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" not in mention_spans
    assert len(mention_spans) == 4
Ejemplo n.º 25
0
from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from dataset_utils import price_match_hour, price_match_half
from fonduer.candidates.matchers import LambdaFunctionMatcher

# Defining ngrams for candidates and the matcher
# The extraction label and the price predicate are chosen in one place so
# they cannot disagree. An unsupported mode fails with a clear error instead
# of leaving `price_matcher` unbound and raising a NameError below.
if args['name'] == 'hour':
    extraction_name = 'price_per_hour'
    price_matcher = LambdaFunctionMatcher(func=price_match_hour)
elif args['name'] == 'half_hour':
    extraction_name = 'price_per_half_hour'
    price_matcher = LambdaFunctionMatcher(func=price_match_half)
else:
    raise ValueError(
        "Unsupported price mode {!r}; expected 'hour' or 'half_hour'".format(
            args['name']))

# Candidate spans are single tokens.
ngrams = MentionNgrams(n_max=1)

# Define matchers
# Despite the plural name this is a single matcher.
matchers = price_matcher

# Getting candidates
PriceMention = mention_subclass("PriceMention")
mention_extractor = MentionExtractor(session, [PriceMention], [ngrams],
                                     [matchers])
# NOTE(review): clear_all() runs before apply(); presumably it removes
# previously extracted mentions -- confirm against the fonduer docs.
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)
# A "Price" candidate consists of one PriceMention.
candidate_class = candidate_subclass("Price", [PriceMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Applying candidate extractors
Ejemplo n.º 26
0
def part_file_name_conditions(attr):
    """Check part file name conditions.

    Args:
        attr: A mention-like object providing ``get_span()`` and
            ``sentence.document.name``.

    Returns:
        True only if all of the following hold, otherwise False:

        * the document name splits on "_" into exactly two parts,
        * the span does not start with "-",
        * the span, with dashes removed, contains at least one digit and at
          least one letter,
        * ``common_prefix_length_diff`` between the second part of the
          document name and the dash-less span is at most 2.
    """
    name_parts = attr.sentence.document.name.split("_")
    if len(name_parts) != 2:
        return False

    span = attr.get_span()
    # startswith() also handles an empty span, where indexing [0] would
    # raise IndexError.
    if span.startswith("-"):
        return False

    name = span.replace("-", "")
    return (any(char.isdigit() for char in name)
            and any(char.isalpha() for char in name)
            and common_prefix_length_diff(name_parts[1], name) <= 2)


# Generic part-number shape: 5-15 characters drawn from upper-case letters,
# digits and dashes, matching the whole span.
add_rgx = r"^[A-Z0-9\-]{5,15}$"

# A span matches by file name only if it has the generic shape AND
# part_file_name_conditions accepts it.
part_file_name_lambda_matcher = LambdaFunctionMatcher(
    func=part_file_name_conditions)
part_file_name_matcher = Intersect(
    RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
    part_file_name_lambda_matcher)

# Final part matcher: any of the three strategies is sufficient.
# part_rgx_matcher and part_dict_matcher are defined outside this excerpt.
part_matcher = Union(part_rgx_matcher, part_dict_matcher,
                     part_file_name_matcher)

# CE Voltage Matcher
ce_keywords = set(
    ["collector emitter", "collector-emitter", "collector - emitter"])
ce_abbrevs = set(["ceo", "vceo"])
# One or two digits followed by a final 0 or 5 (no anchoring).
ce_v_max_rgx_matcher = RegexMatchSpan(rgx=r"\d{1,2}[05]",
                                      longest_match_only=False)

Ejemplo n.º 27
0
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from dataset_utils import LocationMatcher, city_index
from fonduer.candidates.matchers import Union, LambdaFunctionMatcher, Intersect
from emmental_utils import get_posting_html_fast

# Defining ngrams for candidates
extraction_name = 'location'
# Candidate spans are ngrams of at most 3 tokens.
ngrams = MentionNgrams(n_max=3)

# Define matchers
# Geolocation matcher
# Builds a city index from the given file and uses its fast_loc method as
# the match predicate.
cities = city_index('../utils/data/cities15000.txt')
geo_location_matcher = LambdaFunctionMatcher(func=cities.fast_loc)

# In raw text matcher
# NOTE(review): pickle.load executes arbitrary code from the file; make sure
# char_dict.pkl under config['prediction_model_path'] is a trusted artifact.
with open(f"{config['prediction_model_path']}/char_dict.pkl", 'rb') as fl:
    char_dict = pickle.load(fl)
# load_data_from_db, config and postgres_db_name come from outside this
# excerpt.
dataset = load_data_from_db(postgres_db_name,
                            config['postgres_location'], {},
                            char_dict=char_dict,
                            clobber_label=True)
# Map each item's uid to its text; only the first element of every dataset
# entry is read.
text_dict = {a[0]['uid']: a[0]['text'] for a in dataset}


def post_matcher_fun(m):
    term = r"([Ll]ocation:[\w\W]{1,200}</.{0,20}>|\W[cC]ity:[\w\W]{1,200}</.{0,20}>|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)"
    #if m.get_span() in get_posting_html_fast(m.sentence.document.text, term):
    if m.get_span() in text_dict[m.sentence.document.name]: