def test_compression():
    """Exercise config compression: round trip, bad levels, per-level output."""
    transliterator = GraphTransliterator.from_yaml(test_config)
    # Round trip: compress -> decompress -> load must reproduce the original.
    packed = compression.compress_config(transliterator.dump())
    unpacked = compression.decompress_config(packed)
    restored = GraphTransliterator.load(unpacked)
    # Compare canonical JSON renderings (sorted keys) of both dumps.
    original_json = json.dumps(transliterator.dump(), sort_keys=True)
    restored_json = json.dumps(restored.dump(), sort_keys=True)
    assert original_json == restored_json
    # A compression level above the supported maximum must be rejected.
    too_high = graphtransliterator.HIGHEST_COMPRESSION_LEVEL + 1
    with pytest.raises(ValueError):
        transliterator.dump(compression_level=too_high)
    # Level 0 should leave the settings uncompressed (this path is unlikely
    # to be hit in practice but is still covered).
    level_zero = compression.compress_config(
        transliterator.dump(), compression_level=0
    )
    assert "compressed_settings" not in level_zero
    # Level 0 keeps plain keys; higher levels switch to compressed settings.
    assert '"tokens": ' in transliterator.dumps(compression_level=0)
    for level in (1, 2):
        assert '"compressed_settings"' in transliterator.dumps(
            compression_level=level
        )
    # Every supported level must survive a dumps/loads round trip.
    for level in range(0, graphtransliterator.HIGHEST_COMPRESSION_LEVEL + 1):
        serialized = transliterator.dumps(compression_level=level)
        reloaded = transliterator.loads(serialized)
        assert reloaded.transliterate("a") == "A"
# Example #2 (0)
def test_serialization():
    """Test serialization of graphtransliterator.

    Covers dump/dumps/load/loads, field ordering, version stamping, and
    that optional settings can be dropped (compacted) before loading.
    """
    # Field definitions: required fields must precede optional ones in a dump.
    required_fields = ["tokens", "rules", "whitespace"]
    optional_fields = [
        "onmatch_rules",
        "metadata",
        "ignore_errors",
        "onmatch_rules_lookup",
        "tokens_by_class",
        "graph",
        "tokenizer_pattern",
        "graphtransliterator_version",
    ]
    ordered_fields = required_fields + optional_fields
    yaml_ = """
        tokens:
          a: [vowel]
          ' ': [wb]
        rules:
          a: A
          ' ': ' '
        whitespace:
          default: " "
          consolidate: false
          token_class: wb
        onmatch_rules:
          - <vowel> + <vowel>: ','  # add a comma between vowels
        metadata:
          author: "Author McAuthorson"
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    # test dump
    dump = gt.dump()
    assert dump["graph"]["edge"]
    # test ordering of dump fields
    assert list(dump.keys()) == ordered_fields
    # test dump version: matches the package version and a semver pattern
    assert dump["graphtransliterator_version"] == graphtransliterator.__version__
    assert re.match(r"\d+\.\d+\.\d+$", gt.dump()["graphtransliterator_version"])
    # test dumps
    x = gt.dumps()
    assert "graph" in x
    assert type(x) == str
    # test loads
    new_gt = GraphTransliterator.loads(x)
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert type(new_gt) == GraphTransliterator
    # test load
    settings = gt.dump()
    assert type(GraphTransliterator.load(settings)) == GraphTransliterator
    # confirm settings not affected by load
    # (previously compared `settings` to itself, which is always true;
    # compare against a fresh dump instead)
    assert settings == gt.dump()
    # confirm compacting (dropping) optional settings works
    for length in range(1, len(optional_fields)):
        for to_drop in combinations(optional_fields, length):
            settings = gt.dump()
            for _ in to_drop:
                settings.pop(_)
            # Confirm ValidationError if onmatch_rules_lookup but not onmatch_rules
            # (chances of this ever being the case are slim!)
            if settings.get("onmatch_rules_lookup") and not settings.get(
                "onmatch_rules"
            ):
                with pytest.raises(ValidationError):
                    assert GraphTransliterator.load(settings)
            else:
                assert GraphTransliterator.load(settings)

    # Dropping a required optional dependency (onmatch_rules while its
    # lookup remains) must fail validation.
    bad_settings = gt.dump()
    bad_settings.pop("onmatch_rules")
    with pytest.raises(ValidationError):
        assert GraphTransliterator.load(bad_settings)
# Example #3 (0)
def test_GraphTransliterator_transliterate(tmpdir):
    """Test GraphTransliterator transliterate."""
    YAML = r"""
    tokens:
        a: [class_a]
        b: [class_b]
        c: [class_c]
        " ": [wb]
        d: []
        Aa: [contrained_rule]
    rules:
        a: A
        b: B
        <class_c> <class_c> a: A(AFTER_CLASS_C_AND_CLASS_C)
        (<class_c> b) a: A(AFTER_B_AND_CLASS_C)
        (<class_c> b b) a a: AA(AFTER_BB_AND_CLASS_C)
        a <class_c>: A(BEFORE_CLASS_C)
        a b (c <class_b>): AB(BEFORE_C_AND_CLASS_B)
        c: C
        c c: C*2
        a (b b b): A(BEFORE_B_B_B)
        d (c <class_a>): D(BEFORE_C_AND_CLASS_A)
        (b b) a: A(AFTER_B_B)
        <wb> Aa: A(ONLY_A_CONSTRAINED_RULE)
    onmatch_rules:
        -
            <class_a> <class_b> + <class_a> <class_b>: "!"
        -
            <class_a> + <class_b>: ","
    whitespace:
        default: ' '
        consolidate: True
        token_class: wb
    """
    gt = GraphTransliterator.from_yaml(YAML)
    # (input, expected output) pairs, checked in order. The final pair must
    # stay "abab": the last_* attribute checks below inspect the state left
    # by the most recent transliteration.
    cases = [
        # rules with single token
        ("a", "A"),
        # rules with multiple tokens
        ("aa", "AA"),
        # rules with multiple tokens (for rule_key)
        ("cc", "C*2"),
        # # rules with multiple tokens overlapping end of tokens
        # ("c", "C"),
        # rules with prev class
        ("ca", "CA"),
        # rules with prev class and prev token
        ("dca", "D(BEFORE_C_AND_CLASS_A)CA"),
        # rules with prev class and prev tokens
        ("cbba", "CBBA(AFTER_B_B)"),
        # rules with next class
        ("ac", "A(BEFORE_CLASS_C)C"),
        # rules with next class and next tokens
        ("acb", "A(BEFORE_CLASS_C)CB"),
        # rules with onmatch rule of length 1
        ("ab", "A,B"),
        # rules that only have constraints on first element
        ("Aa", "A(ONLY_A_CONSTRAINED_RULE)"),
        # whitespace consolidation (leading)
        (" a", "A"),
        # whitespace consolidation (following)
        ("a ", "A"),
        # rules with longer onmatch rules
        ("abab", "A,B!A,B"),
    ]
    for source, expected in cases:
        assert gt.transliterate(source) == expected

    # State recorded from the most recent ("abab") transliteration.
    assert gt.last_input_tokens == [" ", "a", "b", "a", "b", " "]
    assert gt.last_matched_rule_tokens == [["a"], ["b"], ["a"], ["b"]]
    assert len(gt.last_matched_rules) == 4

    # Serialization round trips (dump/load and dumps/loads).
    assert gt.dump()["graph"]["edge"]
    assert type(GraphTransliterator.load(gt.dump())) == GraphTransliterator
    assert "graph" in gt.dumps()
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    # Version stamp: semver-shaped and equal to the package version.
    version = gt.dump()["graphtransliterator_version"]
    assert re.match(r"\d+\.\d+\.\d+$", version)
    assert version == graphtransliterator.__version__
    payload = gt.dumps()
    assert type(payload) == str
    reloaded = GraphTransliterator.loads(payload)
    assert type(reloaded) == GraphTransliterator