Ejemplo n.º 1
0
def test_GraphTransliterator_ignore_errors():
    """Test the ignore_errors flag of GraphTransliterator.

    With ignore_errors=False, transliteration raises
    NoMatchingTransliterationRuleException when no rule matches and
    UnrecognizableInputTokenException when input cannot be tokenized.
    With ignore_errors=True, both conditions are skipped silently.
    """
    # if ignore_errors is not set and no matching transliteration rule
    # raise NoMatchingTransliterationRule exception
    yaml_str = """
        tokens:
           a: [class1]
           b: [class1]
           ' ': [wb]
        rules:
           a a: B2
           b: B
        whitespace:
           default: ' '
           consolidate: true
           token_class: wb
           """
    # check that ignore_errors works: a single "a" matches no rule (only
    # "a a" and "b" are defined), so output is empty rather than an error
    assert (GraphTransliterator.from_yaml(
        yaml_str, ignore_errors=True).transliterate("a") == "")

    # single "a" has no matching rule -> exception when errors not ignored
    with pytest.raises(NoMatchingTransliterationRuleException):
        gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=False)
        assert gt.ignore_errors is False
        gt.transliterate("a")

    # "!" is not a defined token -> exception when errors not ignored
    with pytest.raises(UnrecognizableInputTokenException):
        gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=False)
        assert gt.ignore_errors is False
        gt.transliterate("!")

    # with ignore_errors=True the unrecognized "!" is dropped by tokenize
    gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=True)
    assert gt.ignore_errors is True
    assert gt.tokenize("b!b") == [" ", "b", "b", " "]
    assert gt.transliterate("b!b") == "BB"

    with pytest.raises(UnrecognizableInputTokenException):
        gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=False)
        assert gt.ignore_errors is False
        gt.transliterate("b!")

    # # test ignore_errors keyword value checking on init
    # with pytest.raises(ValueError):
    #     GraphTransliterator.from_yaml(yaml_str, ignore_errors="maybe")
    # test ignore_errors keyword property

    # test ignore_errors setter and property
    gt.ignore_errors = True
    assert gt.ignore_errors is True
    gt.ignore_errors = False
    assert gt.ignore_errors is False
Ejemplo n.º 2
0
def load_transliterator(source, **kwargs):
    """Load a transliterator from a (format, parameter) pair.

    Parameters
    ----------
    source : tuple
        Pair of ``(format, parameter)``. ``format`` is one of
        ``"bundled"``, ``"json"``, ``"json_file"``, or ``"yaml_file"``;
        ``parameter`` is, respectively, a bundled class name, a JSON
        string, a JSON file path, or a YAML file path.
    **kwargs
        Extra keyword arguments forwarded to the loader/constructor.

    Returns
    -------
    GraphTransliterator

    Raises
    ------
    ValueError
        If ``format`` is not one of the recognized values.
    """
    fmt, parameter = source  # avoid shadowing the builtin `format`
    if fmt == "bundled":
        # import the package; bundled classes live in its
        # `transliterators` submodule
        mod = __import__("graphtransliterator.transliterators")
        transliterators_mod = mod.transliterators
        transliterator_class = getattr(transliterators_mod, parameter)
        return transliterator_class(**kwargs)
    elif fmt == "json":
        return GraphTransliterator.loads(parameter, **kwargs)
    elif fmt == "json_file":
        with open(parameter, "r") as f:
            return GraphTransliterator.loads(f.read(), **kwargs)
    elif fmt == "yaml_file":
        return GraphTransliterator.from_yaml_file(parameter, **kwargs)
    # previously an unrecognized format silently returned None; fail loudly
    raise ValueError("Unknown transliterator format: {!r}".format(fmt))
Ejemplo n.º 3
0
def test_GraphTransliterator_productions():
    """Check that the transliterator exposes its rule productions."""
    settings = {
        "tokens": {"ab": ["class_ab"], " ": ["wb"]},
        "rules": {"ab": "AB", " ": "_"},
        "whitespace": {"default": " ", "token_class": "wb", "consolidate": True},
    }
    transliterator = GraphTransliterator.from_easyreading_dict(settings)
    assert set(transliterator.productions) == {"AB", "_"}
Ejemplo n.º 4
0
def test_GraphTransliterator_graph():
    """Check the internal graph is built and rooted at a Start node."""
    settings = {
        "tokens": {"ab": ["class_ab"], " ": ["wb"]},
        "rules": {"ab": "AB", " ": "_"},
        "whitespace": {"default": " ", "token_class": "wb", "consolidate": True},
    }
    transliterator = GraphTransliterator.from_easyreading_dict(settings)
    assert transliterator._graph
    # node 0 of the parsing graph is always the "Start" node
    assert transliterator._graph.node[0]["type"] == "Start"
    assert transliterator
Ejemplo n.º 5
0
def test_GraphParser_check_ambiguity():
    """Test that ambiguous transliteration rules raise an exception.

    Two rules are ambiguous when the same input can match both.
    """

    yaml_for_test = r"""
        tokens:
          a: [token, class1, class2]
          b: [token, class1, class2]
          ' ': [wb]
        rules:
          a <class1>: A<class1> # these should be ambiguous
          a <class2>: A<class2>

          <class1> a: <class1>A  # these should be ambiguous
          <class2> a: <class2>A # these should be ambiguous

          (<class1> b) a (b <class2>): A # ambigous
          (<class2> b) a (b <class1>): A # ambiguous
          a: A # not ambiguous
        whitespace:
          default: ' '
          token_class: 'wb'
          consolidate: true
        """
    with pytest.raises(AmbiguousTransliterationRulesException):
        GraphTransliterator.from_yaml(yaml_for_test, check_for_ambiguity=True)
    # check that ambiguity matches if rules are of different shape
    yaml = """
        tokens:
          a: []
          ' ': [wb]
        rules:
          <wb> a: _A
          a <wb>: A_
          a: a
          ' ': ' '
        whitespace:
          default: " "        # default whitespace token
          consolidate: true  # whitespace should be consolidated
          token_class: wb     # whitespace token class
        """
    with pytest.raises(AmbiguousTransliterationRulesException):
        GraphTransliterator.from_yaml(yaml, check_for_ambiguity=True)
Ejemplo n.º 6
0
def test_cli_transliterate_tests(tmpdir):
    """Tests transliterate command, and loading of all formats."""
    transliterator = GraphTransliterator.from_yaml(test_yaml)

    runner = CliRunner()

    # test bundled: the bundled "Example" transliterator maps "a" -> "A"
    bundled_result = runner.invoke(
        cli.main, ["transliterate", "--from", "bundled", "Example", "a"])
    assert bundled_result.exit_code == 0
    assert bundled_result.output.strip() == "A"

    # test multiple inputs with python output (a Python list repr)
    bundled_multiple_result = runner.invoke(
        cli.main, ["transliterate", "--from", "bundled", "Example", "a", "a"])
    assert bundled_multiple_result.exit_code == 0
    assert bundled_multiple_result.output.strip() == str(["A", "A"])

    # test json output for multiple inputs
    bundled_multiple_json_result = runner.invoke(
        cli.main,
        [
            "transliterate", "--from", "bundled", "Example", "--to", "json",
            "a", "a"
        ],
    )
    assert bundled_multiple_json_result.exit_code == 0
    assert bundled_multiple_json_result.output.strip() == json.dumps(
        ["A", "A"])

    # test transliterate from JSON (serialized settings passed inline)
    json_ = transliterator.dumps()
    json_result = runner.invoke(
        cli.main, ["transliterate", "--from", "json", json_, "a"])
    assert json_result.exit_code == 0
    assert json_result.output.strip() == "A"

    # test transliterate from json file
    json_file = tmpdir.mkdir("sub").join("test.json")
    json_file.write(json_)
    json_file_result = runner.invoke(
        cli.main,
        ["transliterate", "--from", "json_file", json_file.strpath, "a"])
    assert json_file_result.exit_code == 0
    assert json_file_result.output.strip() == "A"

    # test transliterate from yaml file
    yaml_file = tmpdir.join("test.yaml")
    yaml_file.write(test_yaml)
    yaml_file_result = runner.invoke(
        cli.main,
        ["transliterate", "--from", "yaml_file", yaml_file.strpath, "a"])
    assert yaml_file_result.exit_code == 0
    assert yaml_file_result.output.strip() == "A"
Ejemplo n.º 7
0
def test_cli_dump():
    """Test `dump` command."""
    runner = CliRunner()
    dump_args = ["dump", "--from", "bundled", "Example"]
    first_result = runner.invoke(cli.main, dump_args)
    assert first_result.exit_code == 0
    json_ = first_result.output
    # the dumped settings must load and transliterate correctly
    assert GraphTransliterator.loads(json_).transliterate("a") == "A"
    # check that dump remains the same (important for version control)
    for _ in range(50):
        repeat_result = runner.invoke(cli.main, dump_args)
        assert repeat_result.output == json_, "JSON dump varies"
Ejemplo n.º 8
0
 def __init__(
     self, meters_list=None, find_feet=None, meters_filter=None, with_mir=True
 ):
     """Initialize the scanner with meter definitions and bundled parsers.

     Parameters
     ----------
     meters_list : list, optional
         Meter definitions; loaded from the bundled YAML files when
         not given.
     find_feet : callable, optional
         Foot-finding function; defaults to ``self.find_feet`` when the
         bundled meters are used.
     meters_filter : callable, optional
         Filter applied to ``meters_list`` before scanning.
     with_mir : bool, optional
         Whether to also load the mir meters (default True).
     """
     if not meters_list:
         # fall back to the bundled meter definitions
         meters_list = _load_yaml(meters_filename)
         if with_mir:
             mir_meters = _load_yaml(mir_meters_filename)
             meters_list = meters_list + mir_meters
         # precompute possible feet (only done for the bundled meters)
         self._scans_with_feet = _gen_possible_feet(meters_list)
         find_feet = self.find_feet
     if meters_filter:
         meters_list = meters_filter(meters_list)
     # delegate to Scanner with the bundled transcription/long/short
     # parsers, constraints, and the assembled meters list
     Scanner.__init__(
         self,
         GraphTransliterator.from_yaml_file(transcription_filename),
         GraphTransliterator.from_yaml_file(long_parser_filename),
         GraphTransliterator.from_yaml_file(short_parser_filename),
         _load_yaml(constraints_filename),
         meters_list,
         find_feet=find_feet,
         post_scan_filter=filter_scans,
     )
def test_compression():
    """Test compression and decompression of transliterator configs."""
    gt = GraphTransliterator.from_yaml(test_config)
    compressed_config = compression.compress_config(gt.dump())
    decompressed_config = compression.decompress_config(compressed_config)
    gt_from_decompressed = GraphTransliterator.load(decompressed_config)
    # Compare JSON dumps with sorted keys: round trip must be lossless.
    assert (json.dumps(gt.dump(), sort_keys=True) == json.dumps(
        gt_from_decompressed.dump(), sort_keys=True))
    # Test bad compression level (one above the highest supported)
    with pytest.raises(ValueError):
        gt.dump(
            compression_level=graphtransliterator.HIGHEST_COMPRESSION_LEVEL +
            1)
    # Test compression at level 0 (should likely not be called)
    assert "compressed_settings" not in compression.compress_config(
        gt.dump(), compression_level=0)
    # Test compression levels: level 0 keeps plain settings, higher
    # levels wrap them in "compressed_settings"
    assert '"tokens": ' in gt.dumps(compression_level=0)
    assert '"compressed_settings"' in gt.dumps(compression_level=1)
    assert '"compressed_settings"' in gt.dumps(compression_level=2)
    # Round-trip dumps/loads at every supported compression level
    for i in range(0, graphtransliterator.HIGHEST_COMPRESSION_LEVEL + 1):
        x = gt.dumps(compression_level=i)
        y = gt.loads(x)
        assert y.transliterate("a") == "A"
Ejemplo n.º 10
0
def test_GraphTransliterator_pruned_of():
    """Check pruned_of removes rules by production."""
    yaml_ = """
            tokens:
               a: [class1]
               b: [class2]
               ' ': [wb]
            rules:
               a: A
               b: B
            whitespace:
               default: ' '
               consolidate: true
               token_class: wb
        """
    transliterator = GraphTransliterator.from_yaml(yaml_)
    assert len(transliterator.rules) == 2
    pruned = transliterator.pruned_of("B")
    assert len(pruned.rules) == 1
    assert pruned.rules[0].production == "A"
    # pruning every production still yields a valid transliterator
    assert transliterator.pruned_of(["A", "B"])
Ejemplo n.º 11
0
def test_match_all():
    """Test matching all rules at a token position via match_at."""
    YAML = r"""
    tokens:
        a: [class_a]
        " ": [wb]
    rules:
        a: A
        a a: A*2
    whitespace:
        default: ' '
        consolidate: True
        token_class: wb
    """
    transliterator = GraphTransliterator.from_yaml(YAML)
    # the single-token rule must cost less than the two-token rule
    first_rule, second_rule = transliterator.rules[0], transliterator.rules[1]
    assert first_rule.cost < second_rule.cost

    tokens = transliterator.tokenize("aa")
    # match_all=False returns only the first matching rule key
    assert transliterator.match_at(1, tokens, match_all=False) == 0
    # match_all=True returns every matching rule key
    assert transliterator.match_at(1, tokens, match_all=True) == [0, 1]
Ejemplo n.º 12
0
def test_GraphTransliterator_from_YAML():
    """Test YAML loading of GraphTransliterator.

    A valid configuration loads; each malformed configuration below must
    raise a ValidationError.
    """
    good_yaml = """
      tokens:
        a: [class1]
        ' ': [wb]
      rules:
        a: A
      whitespace:
        default: ' '
        consolidate: true
        token_class: wb
    """
    assert GraphTransliterator.from_yaml(good_yaml)
    bad_yaml = """
      tokens:
        a: class1
        ' ': wb
      rules:
        a: A
      whitespace:
        default: ' '
        consolidate: true
        token_class: wb
    """
    # token values are scalars, not lists of classes
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    bad_yaml = """
      tokens:
        a: class1
        ' ': wb
      rules:
        a: A
      whitespace:
        default: ' '
        consolidate: true
        token_class: wb
    """
    # tokens values are not lists
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    bad_yaml = """
          rules:
            a: A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    # missing required "tokens" section
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)
    bad_yaml = """
          rules:
            a: A
          tokens:
            a: [token]
            ' ': [wb]
          whitespace:
            default: 'BAD'
            consolidate: true
            token_class: bad
    """
    # whitespace errors
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)
    bad_yaml = """
          tokens:
            a: [class1]
            ' ': [wb]
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    # missing required "rules" section
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)
    bad_yaml = """
          tokens:
            a: [class1]
            ' ': [wb]
          rules:
            b: A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    # rule uses token "b" that is not defined in tokens
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    bad_yaml = """
          tokens:
            a: [class1]
            ' ': [wb]
          rules:
            (b) a: A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    # previous-token constraint "(b)" references an undefined token
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    bad_yaml = """
          tokens:
            a: [class1]
            ' ': [wb]
          rules:
            a (b): A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    # next-token constraint "(b)" references an undefined token
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    bad_yaml = """
          tokens:
            a: [class1]
            ' ': [wb]
          rules:
            a <class_nonexisting>: A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    # rule references an undefined token class
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # test for bad tokens
    bad_yaml = """
          tokens: '7'
          rules:
            a <class_nonexisting>: A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)
Ejemplo n.º 13
0
def test_GraphTransliterator(tmpdir):
    """Test GraphTransliterator construction, properties, and YAML I/O."""
    yaml_str = r"""
    tokens:
      a: [token, class1]
      b: [token, class2]
      u: [token]
      ' ': [wb]
    rules:
      a: A
      b: B
      <wb> u: \N{DEVANAGARI LETTER U}
    onmatch_rules:
      -
        <class1> + <class2>: ","
      -
        <class1> + <token>: \N{DEVANAGARI SIGN VIRAMA}
    whitespace:
      default: ' '
      token_class: 'wb'
      consolidate: true
    metadata:
      author: Author
    """

    input_dict = yaml.safe_load(yaml_str)
    # construction from an easy-reading dict exposes the tokens
    assert "a" in GraphTransliterator.from_easyreading_dict(input_dict).tokens.keys()
    gt = GraphTransliterator.from_easyreading_dict(input_dict)
    # first onmatch rule inserts "," between class1 and class2 tokens
    assert gt.onmatch_rules[0].production == ","
    assert gt.tokens
    assert gt.rules
    assert gt.whitespace
    assert gt.whitespace.default
    assert gt.whitespace.token_class
    assert gt.whitespace.consolidate
    assert gt.metadata["author"] == "Author"
    assert type(gt.graph) == DirectedGraph
    # round-trip the YAML through a temporary file
    yaml_file = tmpdir.join("yaml_test.yaml")
    yaml_filename = str(yaml_file)
    yaml_file.write(yaml_str)

    assert yaml_file.read() == yaml_str

    assert GraphTransliterator.from_yaml_file(yaml_filename)

    # four tokens defined: a, b, u, and ' '
    assert len(set(GraphTransliterator.from_easyreading_dict(input_dict).tokens)) == 4

    # "ab" -> "A,B": onmatch rule inserts the comma between A and B
    assert GraphTransliterator.from_yaml(yaml_str).transliterate("ab") == "A,B"
    assert (
        GraphTransliterator.from_yaml_file(yaml_filename).transliterate("ab") == "A,B"
    )
    assert (
        GraphTransliterator.from_easyreading_dict(
            {
                "tokens": {"a": ["class_a"], "b": ["class_b"], " ": ["wb"]},
                "onmatch_rules": [{"<class_a> + <class_b>": ","}],
                "whitespace": {
                    "default": " ",
                    "token_class": "wb",
                    "consolidate": True,
                },
                "rules": {"a": "A", "b": "B"},
            }
        ).transliterate("ab")
        == "A,B"
    )
Ejemplo n.º 14
0
def test_serialization():
    """Test serialization of graphtransliterator."""
    # Field definitions: required fields must always be present in a dump;
    # optional fields may be dropped ("compacted").
    required_fields = ["tokens", "rules", "whitespace"]
    optional_fields = [
        "onmatch_rules",
        "metadata",
        "ignore_errors",
        "onmatch_rules_lookup",
        "tokens_by_class",
        "graph",
        "tokenizer_pattern",
        "graphtransliterator_version",
    ]
    ordered_fields = required_fields + optional_fields
    yaml_ = """
        tokens:
          a: [vowel]
          ' ': [wb]
        rules:
          a: A
          ' ': ' '
        whitespace:
          default: " "
          consolidate: false
          token_class: wb
        onmatch_rules:
          - <vowel> + <vowel>: ','  # add a comma between vowels
        metadata:
          author: "Author McAuthorson"
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    # test dump
    dump = gt.dump()
    assert dump["graph"]["edge"]
    # test ordering of dump fields (stable order matters for diffs)
    assert list(dump.keys()) == ordered_fields
    # test dump version
    assert dump["graphtransliterator_version"] == graphtransliterator.__version__
    assert re.match(r"\d+\.\d+\.\d+$", gt.dump()["graphtransliterator_version"])
    # test dumps
    x = gt.dumps()
    assert "graph" in gt.dumps()
    assert type(x) == str
    # test loads
    new_gt = GraphTransliterator.loads(x)
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert type(new_gt) == GraphTransliterator
    # test load
    settings = gt.dump()
    assert type(GraphTransliterator.load(settings)) == GraphTransliterator
    # confirm settings not affected by load
    # NOTE(review): `settings == settings` is trivially true; this was
    # likely meant to compare against a fresh gt.dump() — confirm intent.
    assert settings == settings
    # confirm compacting (dropping) optional settings works, for every
    # combination of dropped optional fields
    for length in range(1, len(optional_fields)):
        for to_drop in combinations(optional_fields, length):
            settings = gt.dump()
            for _ in to_drop:
                settings.pop(_)
            # Confirm ValidationError if onmatch_rules_lookup but not onmatch_rules
            # (chances of this ever being the case are slim!)
            if settings.get("onmatch_rules_lookup") and not settings.get(
                "onmatch_rules"
            ):
                with pytest.raises(ValidationError):
                    assert GraphTransliterator.load(settings)
            else:
                assert GraphTransliterator.load(settings)

    # dropping onmatch_rules while onmatch_rules_lookup remains must fail
    bad_settings = gt.dump()
    bad_settings.pop("onmatch_rules")
    with pytest.raises(ValidationError):
        assert GraphTransliterator.load(bad_settings)
Ejemplo n.º 15
0
def test_GraphTransliterator_transliterate(tmpdir):
    """Test GraphTransliterator transliterate with context-sensitive rules."""
    # Note: "Aa" below is a single multi-character token, used to test
    # rules constrained only on their first element.
    YAML = r"""
    tokens:
        a: [class_a]
        b: [class_b]
        c: [class_c]
        " ": [wb]
        d: []
        Aa: [contrained_rule]
    rules:
        a: A
        b: B
        <class_c> <class_c> a: A(AFTER_CLASS_C_AND_CLASS_C)
        (<class_c> b) a: A(AFTER_B_AND_CLASS_C)
        (<class_c> b b) a a: AA(AFTER_BB_AND_CLASS_C)
        a <class_c>: A(BEFORE_CLASS_C)
        a b (c <class_b>): AB(BEFORE_C_AND_CLASS_B)
        c: C
        c c: C*2
        a (b b b): A(BEFORE_B_B_B)
        d (c <class_a>): D(BEFORE_C_AND_CLASS_A)
        (b b) a: A(AFTER_B_B)
        <wb> Aa: A(ONLY_A_CONSTRAINED_RULE)
    onmatch_rules:
        -
            <class_a> <class_b> + <class_a> <class_b>: "!"
        -
            <class_a> + <class_b>: ","
    whitespace:
        default: ' '
        consolidate: True
        token_class: wb
    """
    gt = GraphTransliterator.from_yaml(YAML)
    # rules with single token
    assert gt.transliterate("a") == "A"
    # rules with multiple tokens
    assert gt.transliterate("aa") == "AA"
    # rules with multiple tokens (for rule_key)
    assert gt.transliterate("cc") == "C*2"
    # # rules with multiple tokens overlapping end of tokens
    # assert gt.transliterate('c') == 'C'

    # rules with prev class
    assert gt.transliterate("ca") == "CA"
    # rules with prev class and prev token
    assert gt.transliterate("dca") == "D(BEFORE_C_AND_CLASS_A)CA"
    # rules with prev class and prev tokens
    assert gt.transliterate("cbba") == "CBBA(AFTER_B_B)"
    # rules with next class
    assert gt.transliterate("ac") == "A(BEFORE_CLASS_C)C"
    # rules with next class and next tokens
    assert gt.transliterate("acb") == "A(BEFORE_CLASS_C)CB"
    # rules with onmatch rule of length 1
    assert gt.transliterate("ab") == "A,B"
    # rules that only have constraints on first element
    assert gt.transliterate("Aa") == "A(ONLY_A_CONSTRAINED_RULE)"
    # test whitespace consolidation (leading whitespace dropped)
    assert gt.transliterate(" a") == "A"
    # test whitespace consolidation following (trailing whitespace dropped)
    assert gt.transliterate("a ") == "A"

    # rules with longer onmatch rules: "!" between the "ab" pairs
    assert gt.transliterate("abab") == "A,B!A,B"

    # test last_matched_input_tokens (includes whitespace padding)
    assert gt.last_input_tokens == [" ", "a", "b", "a", "b", " "]
    # test last_matched_tokens
    assert gt.last_matched_rule_tokens == [["a"], ["b"], ["a"], ["b"]]

    # test last_matched_rules
    assert len(gt.last_matched_rules) == 4
Ejemplo n.º 16
0
def test_validator():
    """Test Scanner validation of parsers, constraints, and meters.

    "_ok" fixtures must construct a Scanner; each "_bad" fixture must
    make Scanner raise ValueError.
    """
    # transcription parser: maps letters to scansion symbols (s/c/l/b)
    transcriptionYAML_ok = r"""
    whitespace:
        token_class: wb
        default: ' '
        consolidate: True
    tokens:
        a: [short_vowel]
        b: [consonant]
        aa: [long_vowel]
        ' ': [wb]
        '\t': [wb]
    rules:
        a: s
        b: c
        aa: l
        ' ': b
    """

    # bad: produces "X", which downstream parsers do not recognize
    transcriptionYAML_bad = r"""
    whitespace:
        token_class: wb
        default: ' '
        consolidate: True
    tokens:
        a: [short_vowel]
        b: [consonant]
        aa: [long_vowel]
        ' ': [wb]
        '\t': [wb]
    rules:
        a: s
        b: c
        aa: X
    """

    shortYAML_ok = r"""
    whitespace:
        token_class: wb
        default: 'b'
        consolidate: True
    tokens:
        b: [wb]
        s: [short_vowel]
        c: [consonant]
        l: [long_vowel]
    rules:
        c: s<c>
        b c s: s<bcs>
        (l) c (b): s<(l)c(b)>
        (c) c (b): s<(c)c(b)>
        b c l (b): s<bcl(b)>
        c l (b): s<cl(b)>
    """

    # bad: defines an extra token "X" not handled by the rules
    shortYAML_bad = r"""
    whitespace:
        token_class: wb
        default: 'b'
        consolidate: True
    tokens:
        b: [wb]
        s: [short_vowel]
        c: [consonant]
        l: [long_vowel]
        X: [extra bad token]
    rules:
        b c s: s<bcs>
        (l) c (b): s<(l)c(b)>
        (c) c (b): s<(c)c(b)>
        b c l (b): s<bcl(b)>
        c l (b): s<cl(b)>
    """

    longYAML_ok = r"""
    whitespace:
        token_class: wb
        default: 'b'
        consolidate: True
    tokens:
        b: [wb]
        s: [short_vowel]
        c: [consonant]
        l: [long_vowel]
    rules:
        b c l: l<bcl>
        c l: l<cl>
    """

    # bad: defines an extra token "X" not handled by the rules
    longYAML_bad = r"""
    whitespace:
        token_class: wb
        default: 'b'
        consolidate: True
    tokens:
        b: [wb]
        s: [short_vowel]
        c: [consonant]
        l: [long_vowel]
        X: [extra bad token]
    rules:
        b c l: l<bcl>
        c l: l<cl>
    """
    constraints = None
    # yaml.safe_load("""
    #     "-":
    #         "-":
    #             "<bcss>": [s<c>]
    # """)

    # malformed constraints: must be rejected by Scanner
    constraints_bad = {"bad": "constraints"}

    meters_list = yaml.safe_load("""
    -
      id : "1"
      regex_pattern : ===(-)
      name : three longs and maybe a short
    -
      id : "2"
      regex_pattern : (-|=)==(=|-)
      name : a long or short, two longs, and a long or short
    -
      id : "2"
      regex_pattern : (=-=|===)+==(=|-)
      name : meter with cycles
    """)
    # malformed meters list: must be rejected by Scanner
    meters_list_bad = {"bad": "meters_list"}

    transcription_parser_ok = GraphTransliterator.from_yaml(
        transcriptionYAML_ok)
    long_parser_ok = GraphTransliterator.from_yaml(longYAML_ok)
    short_parser_ok = GraphTransliterator.from_yaml(shortYAML_ok)

    transcription_parser_bad = GraphTransliterator.from_yaml(
        transcriptionYAML_bad)
    long_parser_bad = GraphTransliterator.from_yaml(longYAML_bad)
    short_parser_bad = GraphTransliterator.from_yaml(shortYAML_bad)

    # all-ok inputs construct a Scanner
    assert Scanner(
        transcription_parser_ok,
        long_parser_ok,
        short_parser_ok,
        constraints,
        meters_list,
    )

    # test bad long parser
    with pytest.raises(ValueError):
        Scanner(
            transcription_parser_ok,
            long_parser_bad,
            short_parser_ok,
            constraints,
            meters_list,
        )

    # test bad short parser
    with pytest.raises(ValueError):
        Scanner(
            transcription_parser_ok,
            long_parser_ok,
            short_parser_bad,
            constraints,
            meters_list,
        )

    # test bad transcription parser
    with pytest.raises(ValueError):
        Scanner(
            transcription_parser_bad,
            long_parser_ok,
            short_parser_ok,
            constraints,
            meters_list,
        )
    # test bad constraints
    with pytest.raises(ValueError):
        Scanner(
            transcription_parser_ok,
            long_parser_ok,
            short_parser_ok,
            constraints_bad,
            meters_list,
        )
    # test bad meters_list
    with pytest.raises(ValueError):
        Scanner(
            transcription_parser_ok,
            long_parser_ok,
            short_parser_ok,
            constraints,
            meters_list_bad,
        )
    # NOTE(review): the loop below references names (final, a_id, final2)
    # that are undefined in this function — it appears to be leftover code
    # pasted in from another function; confirm and remove.
    for i in range(len(final) - a_id):
        #print(final[a_id+i])
        final2.append(final[a_id + i])

    return final2


## ConfusionMatrix
confusion_matrix = ConfusionDictionary()
possibilities = set(x for x in confusion_matrix.getPreds())
for x in confusion_matrix.getGolds():
    possibilities.add(x)
import json
from graphtransliterator import GraphTransliterator
gt = GraphTransliterator.from_yaml_file(
    "/Users/mosaix/orthographic-ASR/transliterate/transliterators/latin_prealignment.yml"
)
tf = GraphTransliterator.from_yaml_file(
    "/Users/mosaix/orthographic-ASR/transliterate/transliterators/tifinagh_to_latin.yml"
)
no_lm_store = {}

gold_aligned = []
pred_aligned = []
with open('transliterate/output/latin_norm/no_lm/inferences.json') as f:
    data = json.load(f)
with open('transliterate/output/latin_norm/no_lm/alignments.txt', "w+") as l:
    for i in data:
        try:
            wavfile = i['wav_filename'].split('/')[-1]
            compare_tuple = (gt.transliterate(i['src']),
    norm_string = re.sub(r"ṭ", "ṭ", norm_string)
    translated = norm_string.translate(
        norm_string.maketrans(normalization_dict))
    norm_string = re.sub(r"-", " ", translated)
    norm_string = re.sub(r'( ){2,}', " ", norm_string)
    return norm_string


df = pd.read_csv(input_file, sep='\t')
df_augmented = df.copy()
df_augmented["normalized"] = df["sentence"] = df.apply(
    lambda row: normalize(row['sentence'], orthography), axis=1)
if orthography in [
        'tifinagh_ahaggar', 'tifinagh_ahaggar_lig', 'tifinagh_ircam', 'arabic'
]:
    gt = GraphTransliterator.from_yaml_file(paths[orthography])
    df_augmented['transliteration'] = df["sentence"] = df_augmented.apply(
        lambda row: gt.transliterate(row['normalized']), axis=1)
df.to_csv(output_folder + '/' + os.path.basename(input_file),
          sep='\t',
          index=False,
          header=True)

input_base = os.path.splitext(os.path.basename(input_file))[0]
with open(output_folder + '/' + input_base + "_compare.txt", "w+") as f:
    for (idx, row) in df_augmented.iterrows():
        f.write(row.sentence + "\n")
        f.write("\t" + row.normalized + "\n")
        if orthography in [
                'tifinagh_ahaggar', 'tifinagh_ahaggar_lig', 'tifinagh_ircam',
                'arabic'
Ejemplo n.º 19
0
def test_constraints():
    """Test constraints.

    Constraints restrict which parse sequences are allowed; the scanner
    builds constrained parsers that exclude forbidden continuations.
    """
    transcription_parser = GraphTransliterator.from_yaml("""
            whitespace:
                token_class: wb
                default: ' '
                consolidate: True
            tokens:
                A: []
                ' ': [wb]
            rules:
                A: "a"
                ' ': "b"
        """)
    short_parser = GraphTransliterator.from_yaml("""
        whitespace:
            token_class: wb
            default: 'b'
            consolidate: True
        tokens:
            a: []
            b: [wb]
        rules:
            b a: s<ba>
            a: s<a>
        """)
    long_parser = GraphTransliterator.from_yaml("""
        whitespace:
            token_class: wb
            default: 'b'
            consolidate: True
        tokens:
            a: []
            b: [wb]
        rules:
            b a a: "l<baa>"
            a a: "l<aa>"
        """)
    constraints = yaml.safe_load("""
        '-':
            '-':
                's<a>': [s<ba>, s<a>]
        """

                                 # cannot have s<ba> s<a>; it must be long
                                 )
    meters_list = yaml.safe_load("""
        -
          id: "1"
          name: long long
          regex_pattern: "=="
        -
          id: "2"
          name: short short long
          notes: should not be possible due to constraints
          regex_pattern: "--="
        """)

    scanner = Scanner(transcription_parser, long_parser, short_parser,
                      constraints, meters_list)

    # the constrained short parser for "-"/"-"/"s<a>" is rebuilt: node 0
    # of its graph is a bare Start node with no ordered children
    assert scanner._constrained_parsers["-"]["-"]["s<a>"]._graph.node[0] == {
        "ordered_children": {},
        "type": "Start",
    }