Example #1
0
def load_transliterator(source, **kwargs):
    """Loads transliterator (format, parameter)."""
    format, parameter = source
    if format == "bundled":
        mod = __import__("graphtransliterator.transliterators")
        transliterators_mod = mod.transliterators
        transliterator_class = getattr(transliterators_mod, parameter)
        return transliterator_class(**kwargs)
    elif format == "json":
        return GraphTransliterator.loads(parameter, **kwargs)
    elif format == "json_file":
        with open(parameter, "r") as f:
            return GraphTransliterator.loads(f.read(), **kwargs)
    elif format == "yaml_file":
        return GraphTransliterator.from_yaml_file(parameter, **kwargs)
Example #2
0
def test_cli_dump():
    """Test `dump` command."""
    runner = CliRunner()
    dump_result = runner.invoke(cli.main,
                                ["dump", "--from", "bundled", "Example"])
    assert dump_result.exit_code == 0
    json_ = dump_result.output
    assert GraphTransliterator.loads(json_).transliterate("a") == "A"
    # check that dump remains the same (important for version control)
    for i in range(0, 50):
        _ = runner.invoke(cli.main, ["dump", "--from", "bundled", "Example"])
        assert _.output == json_, "JSON dump varies"
Example #3
0
def test_serialization():
    """Test serialization of graphtransliterator"""
    # Field definitions
    required_fields = ["tokens", "rules", "whitespace"]
    optional_fields = [
        "onmatch_rules",
        "metadata",
        "ignore_errors",
        "onmatch_rules_lookup",
        "tokens_by_class",
        "graph",
        "tokenizer_pattern",
        "graphtransliterator_version",
    ]
    ordered_fields = required_fields + optional_fields
    yaml_ = """
        tokens:
          a: [vowel]
          ' ': [wb]
        rules:
          a: A
          ' ': ' '
        whitespace:
          default: " "
          consolidate: false
          token_class: wb
        onmatch_rules:
          - <vowel> + <vowel>: ','  # add a comma between vowels
        metadata:
          author: "Author McAuthorson"
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    # test dump
    dump = gt.dump()
    assert dump["graph"]["edge"]
    # test ordering of dump fields
    assert list(dump.keys()) == ordered_fields
    # test dump version
    assert dump["graphtransliterator_version"] == graphtransliterator.__version__
    assert re.match(r"\d+\.\d+\.\d+$", gt.dump()["graphtransliterator_version"])
    # test dumps
    x = gt.dumps()
    assert "graph" in gt.dumps()
    assert type(x) == str
    # test loads
    new_gt = GraphTransliterator.loads(x)
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert type(new_gt) == GraphTransliterator
    # test load
    settings = gt.dump()
    assert type(GraphTransliterator.load(settings)) == GraphTransliterator
    # confirm settings not affected by load
    assert settings == settings
    # confirm compacting (dropping) optional settings works
    for length in range(1, len(optional_fields)):
        for to_drop in combinations(optional_fields, length):
            settings = gt.dump()
            for _ in to_drop:
                settings.pop(_)
            # Confirm ValidationError if onmatch_rules_lookup but not onmatch_rules
            # (chances of this every being the case are slim!)
            if settings.get("onmatch_rules_lookup") and not settings.get(
                "onmatch_rules"
            ):
                with pytest.raises(ValidationError):
                    assert GraphTransliterator.load(settings)
            else:
                assert GraphTransliterator.load(settings)

    bad_settings = gt.dump()
    bad_settings.pop("onmatch_rules")
    with pytest.raises(ValidationError):
        assert GraphTransliterator.load(bad_settings)
Example #4
0
def test_GraphTransliterator_transliterate(tmpdir):
    """Test GraphTransliterator transliterate."""
    YAML = r"""
    tokens:
        a: [class_a]
        b: [class_b]
        c: [class_c]
        " ": [wb]
        d: []
        Aa: [contrained_rule]
    rules:
        a: A
        b: B
        <class_c> <class_c> a: A(AFTER_CLASS_C_AND_CLASS_C)
        (<class_c> b) a: A(AFTER_B_AND_CLASS_C)
        (<class_c> b b) a a: AA(AFTER_BB_AND_CLASS_C)
        a <class_c>: A(BEFORE_CLASS_C)
        a b (c <class_b>): AB(BEFORE_C_AND_CLASS_B)
        c: C
        c c: C*2
        a (b b b): A(BEFORE_B_B_B)
        d (c <class_a>): D(BEFORE_C_AND_CLASS_A)
        (b b) a: A(AFTER_B_B)
        <wb> Aa: A(ONLY_A_CONSTRAINED_RULE)
    onmatch_rules:
        -
            <class_a> <class_b> + <class_a> <class_b>: "!"
        -
            <class_a> + <class_b>: ","
    whitespace:
        default: ' '
        consolidate: True
        token_class: wb
    """
    gt = GraphTransliterator.from_yaml(YAML)
    # rules with single token
    assert gt.transliterate("a") == "A"
    # rules with multiple tokens
    assert gt.transliterate("aa") == "AA"
    # rules with multiple tokens (for rule_key)
    assert gt.transliterate("cc") == "C*2"
    # # rules with multiple tokens overlapping end of tokens
    # assert gt.transliterate('c') == 'C'

    # rules with prev class
    assert gt.transliterate("ca") == "CA"
    # rules with prev class and prev token
    assert gt.transliterate("dca") == "D(BEFORE_C_AND_CLASS_A)CA"
    # rules with prev class and prev tokens
    assert gt.transliterate("cbba") == "CBBA(AFTER_B_B)"
    # rules with next class
    assert gt.transliterate("ac") == "A(BEFORE_CLASS_C)C"
    # rules with next class and next tokens
    assert gt.transliterate("acb") == "A(BEFORE_CLASS_C)CB"
    # rules with onmatch rule of length 1
    assert gt.transliterate("ab") == "A,B"
    # rules that only have constraints on first element
    assert gt.transliterate("Aa") == "A(ONLY_A_CONSTRAINED_RULE)"
    # test whitespace consolidation
    assert gt.transliterate(" a") == "A"
    # test whitespace consolidation following
    assert gt.transliterate("a ") == "A"

    # rules with longer onmatch rules
    assert gt.transliterate("abab") == "A,B!A,B"

    # test last_matched_input_tokens
    assert gt.last_input_tokens == [" ", "a", "b", "a", "b", " "]
    # test last_matched_tokens
    assert gt.last_matched_rule_tokens == [["a"], ["b"], ["a"], ["b"]]

    # test last_matched_rules
    assert len(gt.last_matched_rules) == 4

    # test dump
    assert gt.dump()["graph"]["edge"]
    assert type(GraphTransliterator.load(gt.dump())) == GraphTransliterator
    assert "graph" in gt.dumps()
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert re.match(r"\d+\.\d+\.\d+$",
                    gt.dump()["graphtransliterator_version"])
    assert gt.dump(
    )["graphtransliterator_version"] == graphtransliterator.__version__
    x = gt.dumps()
    assert type(x) == str
    new_gt = GraphTransliterator.loads(x)
    assert type(new_gt) == GraphTransliterator