def get_subclasses(experiment):
    """Assemble the mention/candidate configuration for one experiment.

    Args:
        experiment: Forwarded to ``get_label_matcher`` for every label
            matcher that is built here.

    Returns:
        The 5-tuple ``(mention_classes, mention_spaces, matchers,
        candidate_classes, throttlers)``. The first three lists are ordered
        Data, Row, Col; the last two are ordered Row, Col.
    """

    def labelled(rgx, label):
        # A span must pass both the regex and the label lambda for `label`.
        regex_part = RegexMatchSpan(rgx=rgx, longest_match_only=True)
        label_part = LambdaFunctionMatcher(
            func=get_label_matcher(label, experiment)
        )
        return Intersect(regex_part, label_part)

    # 1.) Mention subclasses
    Data = mention_subclass("Data")
    Row = mention_subclass("Row")
    Col = mention_subclass("Col")

    # 2.) Mention spaces: one independent MentionSentences per mention class.
    spaces = [MentionSentences() for _ in range(3)]

    # 3.) Matchers: data cells use a numeric-looking pattern (or a lone "x");
    #     rows and columns accept any text carrying the "Header" label.
    any_text = r"^.*$"
    all_matchers = [
        labelled(r"[0-9-,.%$#]+( to | )?[0-9-,.%$#]*|^x$", "Data"),
        labelled(any_text, "Header"),
        labelled(any_text, "Header"),
    ]

    # 4.) Candidate classes pair a Data mention with a Row or a Col mention.
    RowCandidate = candidate_subclass("RowCandidate", [Data, Row])
    ColCandidate = candidate_subclass("ColCandidate", [Data, Col])

    # 5.) Throttlers, one per candidate class.
    return (
        [Data, Row, Col],
        spaces,
        all_matchers,
        [RowCandidate, ColCandidate],
        [row_filter, col_filter],
    )
def _get_part_matcher():
    """Build the matcher used for transistor part numbers.

    Returns:
        A ``Union`` of three matchers: a regex over common part naming
        conventions, a dictionary matcher built from
        ``_get_digikey_parts_set(DICT_PATH)``, and a generic part-shaped
        regex intersected with ``_part_file_name_conditions``.
    """
    # Transistor naming conventions, one capture group per convention.
    eeca = (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
            r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)")
    jedec = r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)"
    jis = r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})"
    others = (
        r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
        r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
        r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)")
    convention_rgx = "|".join([eeca, jedec, jis, others])

    # Looser shape: 5-15 characters drawn from capitals, digits and dashes.
    generic_rgx = r"^[A-Z0-9\-]{5,15}$"

    file_name_check = LambdaFunctionMatcher(func=_part_file_name_conditions)
    by_convention = RegexMatchSpan(rgx=convention_rgx, longest_match_only=True)
    by_dictionary = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))

    # The generic shape only counts when the file-name lambda also accepts it.
    by_file_name = Intersect(
        RegexMatchSpan(rgx=generic_rgx, longest_match_only=True),
        file_name_check,
    )

    return Union(by_convention, by_dictionary, by_file_name)
def _get_temp_matcher(temp_type):
    """Return the temperature matcher for ``temp_type``.

    Args:
        temp_type: ``"max"`` or ``"min"``.

    Returns:
        A ``RegexMatchSpan`` for the requested kind. For any other value a
        warning is logged and ``None`` is returned.
    """
    known_patterns = (
        ("max", r"(?:[1][5-9]|20)[05]"),
        ("min", r"-[56][05]"),
    )
    for kind, pattern in known_patterns:
        if temp_type == kind:
            return RegexMatchSpan(rgx=pattern, longest_match_only=False)

    logger.warning(f"{temp_type} is not a valid temperature type.")
    return None
def get_supply_current_matcher():
    """Build the matcher for supply-current values.

    Returns:
        An ``Intersect`` of four matchers: ``_condition``,
        ``_first_page_or_table``, a numeric regex, and the unit/keyword
        lambda defined below.
    """

    def current_units(attr):
        # Accept `attr` only when a current unit and a supply-current keyword
        # both appear near it and no filter keyword appears in its row.
        # NOTE: These two symbols for mu are unique, not duplicates.
        unit_tokens = ["ma", "μa", "ua", "µa", "\uf06da"]
        wanted = ["supply", "quiescent", "iq", "is", "idd", "icc"]
        unwanted = ["offset", "bias", "logic", "shutdown"]

        # Unigrams to the right of the span plus those from the row window.
        nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
        nearby.update(get_row_ngrams(attr, n_max=1, spread=[-5, 5], lower=True))

        # A bare zero is never taken as a value.
        if attr.get_span().strip() == "0":
            return False

        if overlap(unwanted, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        return bool(overlap(unit_tokens, nearby) and overlap(wanted, nearby))

    # Up to two digits, a dot and up to two decimals, or an integer of up to
    # four digits; either form may carry a leading "±".
    number_matcher = RegexMatchSpan(
        rgx=r"(±?\d{1,2}\.\d{1,2}|±?\d{1,4})", longest_match_only=False
    )
    units_matcher = LambdaFunctionMatcher(func=current_units)
    condition_matcher = LambdaFunctionMatcher(func=_condition)
    location_matcher = LambdaFunctionMatcher(func=_first_page_or_table)

    return Intersect(
        condition_matcher, location_matcher, number_matcher, units_matcher
    )
def test_cancat(doc_setup):
    """Test Concat matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(m):
        # Text of every mention `m` keeps from the 1- and 2-gram space.
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # Left and right children: rgx "this" and rgx "is", both configured with
    # search=False and full_match=False.
    left = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    right = RegexMatchSpan(
        rgx=r"is", search=False, full_match=False, longest_match_only=False
    )

    matcher = Concat(left, right)
    assert spans(matcher) == {"This is"}

    # Input that is not a TemporarySpanMention must be rejected.
    with pytest.raises(ValueError):
        list(matcher.apply(doc.sentences[0].words))

    # Anything other than exactly two child matchers is an error.
    matcher = Concat(left)
    with pytest.raises(ValueError):
        list(matcher.apply(space.apply(doc)))

    # left_required=False
    matcher = Concat(left, right, left_required=False)
    assert spans(matcher) == {"This is", "is apple"}

    # right_required=False
    matcher = Concat(left, right, right_required=False)
    assert spans(matcher) == {"This is"}

    # Children swapped: nothing is found with permutations=False ...
    matcher = Concat(right, left, permutations=False)
    assert set(matcher.apply(space.apply(doc))) == set()

    # ... and "This is" is found again with permutations=True.
    matcher = Concat(right, left, permutations=True)
    assert spans(matcher) == {"This is"}
def test_inverse(doc_setup):
    """Test inverse matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(source):
        # `source` is a mention space or a matcher; both expose `apply`.
        return {mention.get_span() for mention in source.apply(doc)}

    def matched(m):
        # Text of every mention `m` keeps from the 1- and 2-gram space.
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # The raw mention space: every unigram and bigram of the sentence.
    assert spans(space) == {"This is", "is apple", "This", "is", "apple"}

    # Child matcher for "apple" with longest_match_only=False.
    apple_all = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched(apple_all) == {"is apple", "apple"}

    # Its inverse keeps exactly the spans the child rejected.
    assert matched(Inverse(apple_all, longest_match_only=False)) == {
        "This is",
        "This",
        "is",
    }

    # With longest_match_only=True only the longest rejected span remains.
    assert matched(Inverse(apple_all, longest_match_only=True)) == {"This is"}

    # Child matcher for "apple" with longest_match_only=True.
    apple_longest = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=True
    )
    assert matched(apple_longest) == {"is apple"}

    # The Inverse's own longest_match_only setting is the one in effect.
    assert matched(Inverse(apple_longest, longest_match_only=False)) == {
        "This is",
        "This",
        "is",
    }
    assert matched(Inverse(apple_longest, longest_match_only=True)) == {"This is"}

    # Inverse needs exactly one child matcher: zero is an error ...
    with pytest.raises(ValueError):
        Inverse()

    # ... and so is two.
    with pytest.raises(ValueError):
        Inverse(apple_longest, apple_longest)
def _get_polarity_matcher():
    """Build the matcher for transistor polarity mentions.

    Returns:
        An ``Intersect`` of a case-insensitive ``NPN|PNP`` regex and a lambda
        that rejects spans whose sentence n-grams include "complement" or
        "complementary".
    """

    def polarity_conditions(attr):
        excluded = ["complement", "complementary"]
        if overlap(excluded, get_sentence_ngrams(attr)):
            return False
        return True

    rgx_part = RegexMatchSpan(
        rgx=r"NPN|PNP", longest_match_only=False, ignore_case=True
    )
    lambda_part = LambdaFunctionMatcher(func=polarity_conditions)
    return Intersect(rgx_part, lambda_part)
def test_regex_match(doc_setup):
    """Test RegexMatch matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    # A wrong option name (``regex`` instead of ``rgx``) should raise an
    # exception.
    with pytest.raises(Exception):
        RegexMatchSpan(regex=r"apple")

    # Both regex matchers must reject input that is not a
    # TemporarySpanMention.
    for matcher_cls in (RegexMatchSpan, RegexMatchEach):
        matcher = matcher_cls(rgx=r"apple")
        with pytest.raises(ValueError):
            list(matcher.apply(doc.sentences[0].words))

    # `matcher` is now the RegexMatchEach instance; check what it matches.
    found = {mention.get_span() for mention in matcher.apply(space.apply(doc))}
    assert found == {"apple"}

    # Test ignore_case option: with ignore_case=False, "Apple" matches nothing.
    matcher = RegexMatchEach(rgx=r"Apple", ignore_case=False)
    assert list(matcher.apply(space.apply(doc))) == []
def test_union(doc_setup):
    """Test union matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def matched(m):
        # Text of every mention `m` keeps from the 1- and 2-gram space.
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # The raw mention space: every unigram and bigram of the sentence.
    assert {mention.get_span() for mention in space.apply(doc)} == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # First child: rgx "apple" with search=True and full_match=True.
    apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched(apple) == {"is apple", "apple"}

    # Second child: rgx "this" with search=False and full_match=False; the
    # capitalised "This" spans are expected, so matching is case-insensitive.
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched(this) == {"This is", "This"}

    # The union keeps everything either child keeps.
    assert matched(Union(apple, this, longest_match_only=False)) == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each child matcher is ignored; the Union's own
    # setting decides.
    assert matched(Union(apple, this, longest_match_only=True)) == {
        "This is",
        "is apple",
    }

    # An unsupported option name should raise an exception.
    with pytest.raises(Exception):
        Union(apple, this, long_match_only=False)
def get_gain_matcher():
    """Build the matcher for gain-bandwidth values.

    Returns:
        An ``Intersect`` of four matchers: the unit/keyword lambda defined
        below, a numeric regex, ``_first_page_or_table`` and ``_condition``.
    """

    def hertz_units(attr):
        # Accept `attr` only when a frequency unit and a gain keyword both
        # appear near it and none of the exclusion rules fire.
        unit_tokens = ["mhz", "khz"]
        # NOTE(review): "gain" is listed twice; kept as in the original.
        wanted = [
            "product",
            "gain",
            "gain",
            "unity",
            "bandwidth",
            "gbp",
            "gbw",
            "gbwp",
        ]
        unwanted = ["-3 db", "maximum", "minimum", "impedance"]

        # Unigrams to the right of the span plus those from the row window.
        nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
        nearby.update(get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))

        # Reject cells containing both "f" and "=" as unigrams.
        in_cell = set(get_cell_ngrams(attr, n_max=1, lower=True))
        if "f" in in_cell and "=" in in_cell:
            return False

        # A bare zero is never taken as a value.
        if attr.get_span().strip() == "0":
            return False

        if overlap(unwanted, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        return bool(overlap(unit_tokens, nearby) and overlap(wanted, nearby))

    # The whole span must be an integer of up to three digits, or up to two
    # digits, a dot and up to two decimals.
    number_matcher = RegexMatchSpan(
        rgx=r"^(?:\d{1,2}\.\d{1,2}|\d{1,3})$", longest_match_only=False
    )
    units_matcher = LambdaFunctionMatcher(func=hertz_units)
    condition_matcher = LambdaFunctionMatcher(func=_condition)
    location_matcher = LambdaFunctionMatcher(func=_first_page_or_table)

    return Intersect(
        units_matcher, number_matcher, location_matcher, condition_matcher
    )
def _get_ce_v_max_matcher():
    """Return a collector-emitter voltage max matcher.

    Returns:
        An ``Intersect`` of a numeric regex, a row/sentence keyword lambda
        and the ``_attr_in_table`` lambda.
    """
    ce_keywords = {"collector emitter", "collector-emitter", "collector - emitter"}
    ce_abbrevs = {"ceo", "vceo"}
    ce_terms = ce_keywords | ce_abbrevs

    def ce_v_max_conditions(attr):
        # NOTE(review): only unigrams are collected (n_max=1), so the
        # keywords containing spaces look unable to match here; confirm.
        row_unigrams = set(get_row_ngrams(attr, n_max=1))
        if not overlap(ce_terms, row_unigrams):
            return False

        # Reject values whose sentence mentions "vcb" or "base".
        sentence = attr.sentence.text.lower()
        return not ("vcb" in sentence or "base" in sentence)

    # One or two digits followed by a final 0 or 5, starting at a word
    # boundary.
    value_matcher = RegexMatchSpan(rgx=r"\b\d{1,2}[05]", longest_match_only=False)
    row_matcher = LambdaFunctionMatcher(func=ce_v_max_conditions)
    table_matcher = LambdaFunctionMatcher(func=_attr_in_table)

    return Intersect(value_matcher, row_matcher, table_matcher)
def test_union(caplog, doc_setup):
    """Check Union over two regex matchers, with and without longest match."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def matched(m):
        # Text of every mention `m` keeps from the 1- and 2-gram space.
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # The raw mention space: every unigram and bigram of the sentence.
    assert {mention.get_span() for mention in space.apply(doc)} == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # First child: rgx "apple" with search=True and full_match=True.
    apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched(apple) == {"is apple", "apple"}

    # Second child: rgx "this" with search=False and full_match=False; the
    # capitalised "This" spans are expected, so matching is case-insensitive.
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched(this) == {"This is", "This"}

    # The union keeps everything either child keeps.
    assert matched(Union(apple, this, longest_match_only=False)) == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each child matcher is ignored; the Union's own
    # setting decides.
    assert matched(Union(apple, this, longest_match_only=True)) == {
        "This is",
        "is apple",
    }
def test_intersect(doc_setup):
    """Test intersect matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=3)

    def matched(m):
        # Text of every mention `m` keeps from the 1- to 3-gram space.
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # First child: rgx "apple" with search=True and full_match=True.
    apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    apple_spans = {"This is apple", "is apple", "apple"}
    assert matched(apple) == apple_spans

    # Second child: rgx "this" with search=False and full_match=False; the
    # capitalised "This" spans are expected, so matching is case-insensitive.
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched(this) == {"This is apple", "This is", "This"}

    # Only the full sentence satisfies both children.
    assert matched(Intersect(apple, this, longest_match_only=False)) == {
        "This is apple"
    }

    # Intersecting a matcher with itself keeps everything it keeps.
    assert matched(Intersect(apple, apple, longest_match_only=False)) == apple_spans

    # longest_match_only=True overrides that of child matchers.
    assert matched(Intersect(apple, apple, longest_match_only=True)) == {
        "This is apple"
    }
def _f(self, m: TemporaryContext) -> bool:
    """The internal (non-composed) version of filter function f.

    Span mentions are delegated to ``RegexMatchSpan._f``. For figure and
    document mentions the compiled pattern ``self.r`` is searched in the
    figure URL or the document name respectively.

    Raises:
        ValueError: If ``m`` is none of the three supported mention types.
    """
    if isinstance(m, TemporarySpanMention):
        return RegexMatchSpan._f(self, m)

    if isinstance(m, TemporaryFigureMention):
        target = m.figure.url
    elif isinstance(m, TemporaryDocumentMention):
        target = m.document.name
    else:
        raise ValueError(
            f""" {self.__class__.__name__} only supports TemporarySpanMention, TemporaryFigureMention and TemporaryDocumentMention """
        )

    # search (not match) because e.g. the "_" split operator is used.
    return self.r.search(target) is not None
# Getting all documents parsed by Snorkel
print("Getting documents and sentences...")
docs = session.query(Document).all()
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union

# Defining ngrams for candidates: spans of up to three tokens.
extraction_name = "age"
age_ngrams = MentionNgrams(n_max=3)

# Define matchers.
#
# The first three patterns used to contain a "^" after mandatory leading text
# ("... (is|am) ^(...)"). Without re.MULTILINE "^" only matches at the very
# start of the string, so those patterns could never match anything. The
# stray anchor is removed so they can match "<cue> <two digits>".
# NOTE(review): the trailing "*" on the digit group is kept from the original;
# it allows the two-digit group to repeat -- confirm that this is intended.
m = RegexMatchSpan(rgx=r'.*(I|He|She) (is|am) ([0-9]{2})*')
p = RegexMatchSpan(rgx=r'.*(age|is|@|was) ([0-9]{2})*')
q = RegexMatchSpan(rgx=r'.*(age:) ([0-9]{2})*')
# NOTE(review): this pattern also carries a mid-pattern "^", but it can still
# match when ".*" consumes nothing (span starts with the digits). It is left
# unchanged so the currently extracted mentions do not shift; confirm whether
# leading text should be allowed here as well.
r = RegexMatchSpan(
    rgx=r'.*^([0-9]{2}) (yrs|years|year|yr|old|year-old|yr-old|Years|Year|Yr)*'
)
# "age", up to four non-word characters, then a two-digit number not
# starting with 0, delimited by non-word characters or the string ends.
s = RegexMatchSpan(rgx=r'(^|\W)age\W{0,4}[1-9]\d(\W|$)')

# Union matchers and create candidate extractor
age_matchers = Union(m, p, r, q, s)

# Getting candidates
AgeMention = mention_subclass("AgeMention")
mention_extractor = MentionExtractor(session, [AgeMention], [age_ngrams],
                                     [age_matchers])
mention_extractor.clear_all()
# Load every parsed document from the session.
print("Getting documents and sentences...")
docs = session.query(Document).all()
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union

# Mention space: single-token spans only.
extraction_name = "call"
ngrams = MentionNgrams(n_max=1)

# One regex covering the "incall" / "outcall" spellings.
regex_matcher_1 = RegexMatchSpan(
    rgx=r"(incalls?|outcalls?|incalls?outcalls?|in calls?|out calls?)"
)

# Only one matcher is defined, so no Union is needed.
matchers = regex_matcher_1

# Clear previously stored mentions, then extract them for all documents.
CallMention = mention_subclass("CallMention")
mention_extractor = MentionExtractor(
    session,
    [CallMention],
    [ngrams],
    [matchers],
)
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)

# A candidate is a single CallMention.
candidate_class = candidate_subclass("Call", [CallMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Extract candidates into split 0.
candidate_extractor.apply(docs, split=0, parallelism=parallelism)
"""Hardware matchers.""" import csv from fonduer.candidates.matchers import ( DictionaryMatch, Intersect, LambdaFunctionMatcher, RegexMatchSpan, Union, ) from fonduer.utils.data_model_utils import get_row_ngrams, overlap temp_matcher = RegexMatchSpan(rgx=r"(?:[1][5-9]|20)[05]", longest_match_only=False) # Transistor Naming Conventions as Regular Expressions ### eeca_rgx = (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}" r"[0-9]?[A-Z]?(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)") jedec_rgx = r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)" jis_rgx = r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})" others_rgx = (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|" r"TIPL|DTC|MMBT|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}" r"[\d]{2,4}[A-Z]{0,5}(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)") part_rgx = "|".join([eeca_rgx, jedec_rgx, jis_rgx, others_rgx]) part_rgx_matcher = RegexMatchSpan(rgx=part_rgx, longest_match_only=True) def get_digikey_parts_set(path): """Get all transistor parts from digikey part dictionary.""" all_parts = set()
# 2.) Mention spaces
# Station spans cover up to four tokens. The split token for a dot is written
# as a raw string: the former "\." was an invalid escape sequence in a normal
# string literal (the runtime value, backslash + dot, is unchanged).
station_ngrams = MentionNgrams(
    n_max=4, split_tokens=[" ", "_", r"\.", "%"])
# StationMentionSpace(n_max=4)
# price_ngrams = MentionNgrams(n_max=1)

# 3.) Matcher functions
station_matcher = RegexMatchFull(
    rgx=station_rgx,
    ignore_case=True,
    # search=True,
    # full_match=False,
    # longest_match_only=False,
)  # DictionaryMatch(d=stations_list)

# Prices: one to four digits, then a mandatory dot and one to five decimals.
price_matcher = RegexMatchSpan(rgx=r"\d{1,4}(\.\d{1,5})",
                               longest_match_only=True)

# 4.) Candidate classes
StationPrice = candidate_subclass("StationPrice", [Station, Price])


# 5.) Throttlers
def my_throttler(c):
    (station, price) = c
    if 'volume' in get_aligned_ngrams(price, lower=True):
        return False
    if 'date' in get_aligned_ngrams(price, lower=True):
        return False
    if 'non' in get_aligned_ngrams(price, lower=True):
        return False
    html_tags = get_ancestor_tag_names(station)
# Load every parsed document from the session.
print("Getting documents and sentences...")
docs = session.query(Document).all()
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union

# Mention space: single-token spans only.
extraction_name = "ethnicity"
ngrams = MentionNgrams(n_max=1)

# One regex listing the accepted terms as alternatives.
regex_matcher_1 = RegexMatchSpan(
    rgx=r"(black|ebony|chocolate|mocha|cocoa|white|blonde|asian|latina|arab)"
)

# Only one matcher is defined, so no Union is needed.
matchers = regex_matcher_1

# Clear previously stored mentions, then extract them for all documents.
EthnicityMention = mention_subclass("EthnicityMention")
mention_extractor = MentionExtractor(
    session,
    [EthnicityMention],
    [ngrams],
    [matchers],
)
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)

# A candidate is a single EthnicityMention.
candidate_class = candidate_subclass("Ethnicity", [EthnicityMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Extract candidates into split 0.
candidate_extractor.apply(docs, split=0, parallelism=parallelism)