def _get_part_matcher():
    """Build the matcher used to recognise transistor part numbers.

    Three matchers are combined with ``Union``:

    * a regex matcher over the EECA, JEDEC, JIS and "other" naming patterns,
    * a dictionary matcher over the parts loaded from ``DICT_PATH``,
    * a file-name matcher: the intersection of a regex matcher for
      ``^[A-Z0-9\\-]{5,15}$`` and a lambda matcher wrapping
      ``_part_file_name_conditions``.

    Returns:
        The ``Union`` of the three matchers described above.
    """
    # Transistor naming conventions as regular expressions. Each entry is one
    # capturing group; the entries are OR-ed together into a single pattern.
    naming_conventions = (
        # EECA
        r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
        r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)",
        # JEDEC
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # JIS
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # Other manufacturer prefixes
        r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
        r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
        r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)",
    )
    combined_pattern = "|".join(naming_conventions)
    candidate_pattern = r"^[A-Z0-9\-]{5,15}$"

    # The matchers are created in the same order as before: lambda, regex,
    # dictionary, and finally the intersection for file-name based matches.
    conditions_matcher = LambdaFunctionMatcher(func=_part_file_name_conditions)
    regex_matcher = RegexMatchSpan(rgx=combined_pattern, longest_match_only=True)
    dictionary_matcher = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))
    file_name_matcher = Intersect(
        RegexMatchSpan(rgx=candidate_pattern, longest_match_only=True),
        conditions_matcher,
    )

    return Union(regex_matcher, dictionary_matcher, file_name_matcher)
def test_dictionary_match(doc_setup):
    """Exercise DictionaryMatch with and without a stemmer, and its errors."""
    doc = doc_setup
    unigrams = MentionNgrams(n_min=1, n_max=1)

    def matched_spans(matcher):
        # Collect the distinct span texts the matcher yields over the
        # unigram candidates of the document.
        return {mention.get_span() for mention in matcher.apply(unigrams.apply(doc))}

    # A dictionary given as a list of str.
    matcher = DictionaryMatch(d=["this"])
    assert matched_spans(matcher) == {"This"}

    # Constructing the matcher without a dictionary must fail.
    with pytest.raises(Exception):
        DictionaryMatch()

    # TODO: test with plural words
    matcher = DictionaryMatch(d=["is"], stemmer=PorterStemmer())
    assert matched_spans(matcher) == {"is"}

    # Applying the matcher to something other than TemporarySpanMention
    # objects must raise a ValueError.
    matcher = DictionaryMatch(d=["this"])
    with pytest.raises(ValueError):
        list(matcher.apply(doc.sentences[0].words))
def test_do_not_use_stemmer_when_UnicodeDecodeError():
    """Check that _stem returns the word unchanged if the stemmer fails.

    The stemmer's ``stem`` method is replaced by a mock that raises
    ``UnicodeDecodeError``.
    """
    porter = PorterStemmer()

    # With a working stemmer, _stem(w) returns the stem of w.
    working_matcher = DictionaryMatch(d=["is"], stemmer=porter)
    assert working_matcher._stem("caresses") == "caress"

    # Make every call to stem() raise UnicodeDecodeError.
    decode_error = UnicodeDecodeError("dummycodec", b"\x00\x00", 1, 2, "Dummy !")
    porter.stem = Mock(side_effect=decode_error)

    # With the failing stemmer, _stem(w) returns w itself.
    failing_matcher = DictionaryMatch(d=["is"], stemmer=porter)
    assert failing_matcher._stem("caresses") == "caresses"
def get_digikey_parts_set(path):
    """Read the part names from a two-column CSV part dictionary.

    Args:
        path: Location of the CSV file; every row must have exactly two
            fields (part, url), otherwise unpacking raises ``ValueError``.

    Returns:
        A set containing the first field of every row.
    """
    with open(path, "r") as handle:
        # Only the part name is kept; the second column is discarded.
        return {part for part, _url in csv.reader(handle)}


# Dictionary of known transistor parts ###
dict_path = "tests/data/digikey_part_dictionary.csv"
part_dict_matcher = DictionaryMatch(d=get_digikey_parts_set(dict_path))


def common_prefix_length_diff(str1, str2):
    """Return how much of the shorter string lies at or after the first mismatch.

    The strings are compared position by position up to the length of the
    shorter one. At the first differing position ``i`` the result is
    ``min(len(str1), len(str2)) - i``; if no position differs, it is 0.
    """
    shorter_length = min(len(str1), len(str2))
    for position, (left, right) in enumerate(zip(str1, str2)):
        if left != right:
            return shorter_length - position
    return 0


def part_file_name_conditions(attr):
    """Check part file name conditions."""
    file_name = attr.sentence.document.name
    if len(file_name.split("_")) != 2:
        return False