def test_compression():
    gt = GraphTransliterator.from_yaml(test_config)
    compressed_config = compression.compress_config(gt.dump())
    decompressed_config = compression.decompress_config(compressed_config)
    gt_from_decompressed = GraphTransliterator.load(decompressed_config)
    # Compare JSON dumps with sorted keys.
    assert json.dumps(gt.dump(), sort_keys=True) == json.dumps(
        gt_from_decompressed.dump(), sort_keys=True
    )
    # Test bad compression level
    with pytest.raises(ValueError):
        gt.dump(compression_level=graphtransliterator.HIGHEST_COMPRESSION_LEVEL + 1)
    # Compression at level 0 should not produce compressed settings
    # (compress_config would normally not be called at this level)
    assert "compressed_settings" not in compression.compress_config(
        gt.dump(), compression_level=0
    )
    # Test compression levels
    assert '"tokens": ' in gt.dumps(compression_level=0)
    assert '"compressed_settings"' in gt.dumps(compression_level=1)
    assert '"compressed_settings"' in gt.dumps(compression_level=2)
    for i in range(0, graphtransliterator.HIGHEST_COMPRESSION_LEVEL + 1):
        x = gt.dumps(compression_level=i)
        y = gt.loads(x)
        assert y.transliterate("a") == "A"
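
# A supplementary size check (a sketch, not part of the suite above): it
# assumes that levels above 0 replace the verbose settings with the more
# compact "compressed_settings" form, so their JSON should be no longer than
# the uncompressed dump. Reuses the same `test_config` fixture as above.
def test_compression_size():
    gt = GraphTransliterator.from_yaml(test_config)
    uncompressed = gt.dumps(compression_level=0)
    for level in range(1, graphtransliterator.HIGHEST_COMPRESSION_LEVEL + 1):
        # Hedged assumption: compression never inflates the serialization.
        assert len(gt.dumps(compression_level=level)) <= len(uncompressed)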
def test_serialization():
    """Test serialization of graphtransliterator."""
    # Field definitions
    required_fields = ["tokens", "rules", "whitespace"]
    optional_fields = [
        "onmatch_rules",
        "metadata",
        "ignore_errors",
        "onmatch_rules_lookup",
        "tokens_by_class",
        "graph",
        "tokenizer_pattern",
        "graphtransliterator_version",
    ]
    ordered_fields = required_fields + optional_fields
    yaml_ = """
    tokens:
      a: [vowel]
      ' ': [wb]
    rules:
      a: A
      ' ': ' '
    whitespace:
      default: " "
      consolidate: false
      token_class: wb
    onmatch_rules:
      - <vowel> + <vowel>: ','  # add a comma between vowels
    metadata:
      author: "Author McAuthorson"
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    # test dump
    dump = gt.dump()
    assert dump["graph"]["edge"]
    # test ordering of dump fields
    assert list(dump.keys()) == ordered_fields
    # test dump version
    assert dump["graphtransliterator_version"] == graphtransliterator.__version__
    assert re.match(r"\d+\.\d+\.\d+$", gt.dump()["graphtransliterator_version"])
    # test dumps
    x = gt.dumps()
    assert "graph" in gt.dumps()
    assert type(x) == str
    # test loads
    new_gt = GraphTransliterator.loads(x)
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert type(new_gt) == GraphTransliterator
    # test load
    settings = gt.dump()
    assert type(GraphTransliterator.load(settings)) == GraphTransliterator
    # confirm settings are not modified by load
    assert settings == gt.dump()
    # confirm compacting (dropping) optional settings works
    for length in range(1, len(optional_fields)):
        for to_drop in combinations(optional_fields, length):
            settings = gt.dump()
            for _ in to_drop:
                settings.pop(_)
            # Confirm ValidationError if onmatch_rules_lookup is present
            # without onmatch_rules (chances of this ever being the case
            # are slim!)
            if settings.get("onmatch_rules_lookup") and not settings.get(
                "onmatch_rules"
            ):
                with pytest.raises(ValidationError):
                    assert GraphTransliterator.load(settings)
            else:
                assert GraphTransliterator.load(settings)
    bad_settings = gt.dump()
    bad_settings.pop("onmatch_rules")
    with pytest.raises(ValidationError):
        assert GraphTransliterator.load(bad_settings)
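
# A complementary round-trip sketch (an added check, assuming only the public
# dump()/dumps()/load()/loads() API exercised above): reloading a serialized
# transliterator should preserve its transliteration output, not merely
# produce a valid GraphTransliterator instance.
def test_serialization_roundtrip_output():
    yaml_ = """
    tokens:
      a: [vowel]
      ' ': [wb]
    rules:
      a: A
      ' ': ' '
    whitespace:
      default: " "
      consolidate: false
      token_class: wb
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    for clone in (
        GraphTransliterator.loads(gt.dumps()),
        GraphTransliterator.load(gt.dump()),
    ):
        assert clone.transliterate("a a") == gt.transliterate("a a") == "A A"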
def test_GraphTransliterator_transliterate():
    """Test GraphTransliterator transliterate."""
    YAML = r"""
    tokens:
      a: [class_a]
      b: [class_b]
      c: [class_c]
      " ": [wb]
      d: []
      Aa: [constrained_rule]
    rules:
      a: A
      b: B
      <class_c> <class_c> a: A(AFTER_CLASS_C_AND_CLASS_C)
      (<class_c> b) a: A(AFTER_B_AND_CLASS_C)
      (<class_c> b b) a a: AA(AFTER_BB_AND_CLASS_C)
      a <class_c>: A(BEFORE_CLASS_C)
      a b (c <class_b>): AB(BEFORE_C_AND_CLASS_B)
      c: C
      c c: C*2
      a (b b b): A(BEFORE_B_B_B)
      d (c <class_a>): D(BEFORE_C_AND_CLASS_A)
      (b b) a: A(AFTER_B_B)
      <wb> Aa: A(ONLY_A_CONSTRAINED_RULE)
    onmatch_rules:
      - <class_a> <class_b> + <class_a> <class_b>: "!"
      - <class_a> + <class_b>: ","
    whitespace:
      default: ' '
      consolidate: True
      token_class: wb
    """
    gt = GraphTransliterator.from_yaml(YAML)
    # rules with single token
    assert gt.transliterate("a") == "A"
    # rules with multiple tokens
    assert gt.transliterate("aa") == "AA"
    # rules with multiple tokens (for rule_key)
    assert gt.transliterate("cc") == "C*2"
    # # rules with multiple tokens overlapping end of tokens
    # assert gt.transliterate('c') == 'C'
    # rules with prev class
    assert gt.transliterate("ca") == "CA"
    # rules with prev class and prev token
    assert gt.transliterate("dca") == "D(BEFORE_C_AND_CLASS_A)CA"
    # rules with prev class and prev tokens
    assert gt.transliterate("cbba") == "CBBA(AFTER_B_B)"
    # rules with next class
    assert gt.transliterate("ac") == "A(BEFORE_CLASS_C)C"
    # rules with next class and next tokens
    assert gt.transliterate("acb") == "A(BEFORE_CLASS_C)CB"
    # rules with onmatch rule of length 1
    assert gt.transliterate("ab") == "A,B"
    # rules that only have constraints on first element
    assert gt.transliterate("Aa") == "A(ONLY_A_CONSTRAINED_RULE)"
    # test whitespace consolidation
    assert gt.transliterate(" a") == "A"
    # test whitespace consolidation following
    assert gt.transliterate("a ") == "A"
    # rules with longer onmatch rules
    assert gt.transliterate("abab") == "A,B!A,B"
    # test last_matched_input_tokens
    assert gt.last_input_tokens == [" ", "a", "b", "a", "b", " "]
    # test last_matched_tokens
    assert gt.last_matched_rule_tokens == [["a"], ["b"], ["a"], ["b"]]
    # test last_matched_rules
    assert len(gt.last_matched_rules) == 4
    # test dump
    assert gt.dump()["graph"]["edge"]
    assert type(GraphTransliterator.load(gt.dump())) == GraphTransliterator
    assert "graph" in gt.dumps()
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert re.match(r"\d+\.\d+\.\d+$", gt.dump()["graphtransliterator_version"])
    assert gt.dump()["graphtransliterator_version"] == graphtransliterator.__version__
    x = gt.dumps()
    assert type(x) == str
    new_gt = GraphTransliterator.loads(x)
    assert type(new_gt) == GraphTransliterator
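
# A minimal, isolated sketch of the onmatch behavior exercised above (an
# added example, reusing the grammar above in reduced form): the single
# onmatch rule inserts "," only between a <class_a> match and a following
# <class_b> match, so the reverse order is left unchanged.
def test_onmatch_insertion_minimal():
    yaml_ = """
    tokens:
      a: [class_a]
      b: [class_b]
      ' ': [wb]
    rules:
      a: A
      b: B
    onmatch_rules:
      - <class_a> + <class_b>: ","
    whitespace:
      default: ' '
      consolidate: True
      token_class: wb
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    assert gt.transliterate("ab") == "A,B"
    assert gt.transliterate("ba") == "BA"  # no rule for <class_b> + <class_a>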