Esempio n. 1
0
def _get_part_matcher():
    """Build the matcher that recognises transistor part numbers.

    The returned matcher is the union of three matchers:

    * a regex covering several transistor naming conventions,
    * a dictionary built from the parts loaded from ``DICT_PATH``,
    * a generic uppercase-alphanumeric token regex intersected with the
      ``_part_file_name_conditions`` predicate.
    """
    # One capture-group pattern per transistor naming convention.
    naming_conventions = (
        # eeca
        (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
         r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"),
        # jedec
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # jis
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # others (vendor-specific prefixes)
        (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
         r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
         r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"),
    )

    file_name_predicate = LambdaFunctionMatcher(func=_part_file_name_conditions)

    by_convention = RegexMatchSpan(
        rgx="|".join(naming_conventions),
        longest_match_only=True,
    )
    by_dictionary = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))
    # Whole-span tokens of 5-15 uppercase letters, digits or dashes, kept only
    # when the file-name predicate also accepts them.
    by_file_name = Intersect(
        RegexMatchSpan(rgx=r"^[A-Z0-9\-]{5,15}$", longest_match_only=True),
        file_name_predicate,
    )
    return Union(by_convention, by_dictionary, by_file_name)
Esempio n. 2
0
def test_dictionary_match(doc_setup):
    """Exercise DictionaryMatch against unigram mentions of the fixture doc."""
    unigrams = MentionNgrams(n_min=1, n_max=1)

    def matched_spans(dictionary_matcher):
        # Collect the span text of every unigram the matcher accepts.
        return {
            mention.get_span()
            for mention in dictionary_matcher.apply(unigrams.apply(doc_setup))
        }

    # A plain list of str is accepted as the dictionary.
    assert matched_spans(DictionaryMatch(d=["this"])) == {"This"}

    # Constructing the matcher without a dictionary must fail.
    with pytest.raises(Exception):
        DictionaryMatch()

    # TODO: test with plural words
    stemmed_matcher = DictionaryMatch(d=["is"], stemmer=PorterStemmer())
    assert matched_spans(stemmed_matcher) == {"is"}

    # Feeding the matcher something other than TemporarySpanMention objects
    # (here: raw words of a sentence) must raise ValueError.
    word_matcher = DictionaryMatch(d=["this"])
    with pytest.raises(ValueError):
        list(word_matcher.apply(doc_setup.sentences[0].words))
Esempio n. 3
0
def test_do_not_use_stemmer_when_UnicodeDecodeError():
    """Check DictionaryMatch._stem when the stemmer raises UnicodeDecodeError."""
    porter = PorterStemmer()

    # With a working stemmer, _stem(w) returns the word stem.
    working_matcher = DictionaryMatch(d=["is"], stemmer=porter)
    assert working_matcher._stem("caresses") == "caress"

    # Replace stem() with a mock that always raises UnicodeDecodeError.
    decode_failure = UnicodeDecodeError(
        "dummycodec", b"\x00\x00", 1, 2, "Dummy  !")
    porter.stem = Mock(side_effect=decode_failure)

    # With the failing stemmer, _stem(w) must return w unchanged.
    failing_matcher = DictionaryMatch(d=["is"], stemmer=porter)
    assert failing_matcher._stem("caresses") == "caresses"
Esempio n. 4
0

def get_digikey_parts_set(path):
    """Get all transistor parts from digikey part dictionary.

    Args:
        path: Path to a CSV file whose rows are ``part,url`` pairs.

    Returns:
        A set containing the first field (the part name) of every non-blank
        row in the file.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields.
    """
    all_parts = set()
    # newline="" hands newline handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(path, "r", newline="") as csvinput:
        for row in csv.reader(csvinput):
            if not row:
                # csv.reader yields an empty list for blank lines (e.g. a
                # trailing newline at the end of the file); skip them instead
                # of failing on the unpacking below.
                continue
            # Keep the strict two-field check so malformed rows still surface.
            part, _url = row
            all_parts.add(part)
    return all_parts


# Dictionary of known transistor parts.
# NOTE: the path is relative, so it is resolved against the current working
# directory; the CSV is read right here, when this statement executes.
dict_path = "tests/data/digikey_part_dictionary.csv"
# Dictionary matcher built from the part names returned by
# get_digikey_parts_set for the CSV above.
part_dict_matcher = DictionaryMatch(d=get_digikey_parts_set(dict_path))


def common_prefix_length_diff(str1, str2):
    """Return how much of the shorter string lies past the common prefix.

    The result is ``min(len(str1), len(str2))`` minus the index of the first
    position at which the two strings differ. When no differing position
    exists within the shorter string (one string is a prefix of the other,
    or either is empty), the result is 0.
    """
    shorter_len = min(len(str1), len(str2))
    # Index of the first mismatch; defaults to shorter_len so that the
    # subtraction below yields 0 when the overlapping parts are identical.
    first_mismatch = next(
        (pos for pos, (left, right) in enumerate(zip(str1, str2))
         if left != right),
        shorter_len,
    )
    return shorter_len - first_mismatch


def part_file_name_conditions(attr):
    """Check part file name conditions."""
    file_name = attr.sentence.document.name
    if len(file_name.split("_")) != 2:
        return False