def test_profile_with_bad_metadata(tmpdir):
    mdpath = tmpdir / 'md.json'
    md = deepcopy(Profile.MD)
    md['tables'].append({'tableSchema': {'columns': []}})
    jsonlib.dump(md, str(mdpath))
    with pytest.raises(ValueError):
        Profile.from_file(str(mdpath))
def profile(args):
    """
    Create an orthography profile for a string (passed as argument or read from stdin)

    segments profile [STRING]
    """
    _write(args, Profile.from_text(_read(args)))
def test_normalization():
    specs = [
        {'Grapheme': 'ä'},
        {'Grapheme': 'aa'},
        {'Grapheme': 'a'},
    ]
    prf = Profile(*specs, **{'form': 'NFD'})
    t = Tokenizer(profile=prf)
    # "aa" matches, because the "ä" is decomposed:
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
    # A composed "ä" doesn't match anymore:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ' + REPLACEMENT_MARKER

    prf = Profile(*specs, **{'form': 'NFC'})
    t = Tokenizer(profile=prf)
    # "aa" doesn't match here, this is typically the behaviour one wants:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ä'
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
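# The assertions above follow from how Unicode normalization splits or composes "ä".
# A minimal standalone sketch (not part of the test suite above) showing the
# code points involved:
import unicodedata

# Under NFD, "ä" decomposes into the base letter plus a combining diaeresis, so an
# NFD profile can match the plain "a" and leaves U+0308 behind as unmatched residue.
assert [hex(ord(c)) for c in unicodedata.normalize('NFD', 'ä')] == ['0x61', '0x308']
# Under NFC, "ä" stays a single precomposed code point (U+00E4).
assert [hex(ord(c)) for c in unicodedata.normalize('NFC', 'ä')] == ['0xe4']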
def _orthography_profile(s):
    graphemes = {c: c for c in s}
    graphemes[NEWLINE] = '\n'
    graphemes.update(basic_confusables)
    graphemes.update(epa_confusables)
    graphemes.update(ipa_confusables)
    return Profile(*(
        {'Grapheme': k, 'IPA': SPACE if v == ' ' else v}
        for k, v in graphemes.items()))
def test_profile():
    prf = Profile(
        {'Grapheme': 'bischen', 'Out': 'b i s ch e n'},
        {'Grapheme': 'sch', 'Out': 'sch'},
        {'Grapheme': 'n', 'Out': 'n'},
        {'Grapheme': 'a', 'Out': 'a'},
        {'Grapheme': 'e', 'Out': 'e'},
        {'Grapheme': 'n', 'Out': 'n'},
    )
    t = Tokenizer(profile=prf)
    assert t('bischen', column='Out') == 'b i s ch e n'
    assert t('naschen', column='Out') == 'n a sch e n'
    assert t('x', column='Out') == REPLACEMENT_MARKER

    prf = Profile(
        {'Grapheme': 'uu'},
        {'Grapheme': 'b'},
        {'Grapheme': 'o'},
    )
    t = Tokenizer(profile=prf)
    assert t('uubo uubo') == 'uu b o # uu b o'
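# A small illustrative sketch (assumed data, not taken from the test above):
# profile matching is greedy longest-match, which is why "bischen" above is
# consumed as a single grapheme while "naschen" falls back to the shorter entries.
from segments import Profile, Tokenizer

_t = Tokenizer(profile=Profile(
    {'Grapheme': 'sch'},
    {'Grapheme': 's'},
    {'Grapheme': 'c'},
    {'Grapheme': 'h'},
))
print(_t('schsh'))  # expected: 'sch s h' -- the trigraph wins over its parts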
def tokenizer(self):
    """
    Datasets can provide support for segmentation (aka tokenization) in two ways:

    - by providing an orthography profile at etc/orthography.tsv, or
    - by overwriting this method to return a custom tokenizer callable.

    :return: A callable to do segmentation.

    The expected signature of the callable is

        def t(item, string, **kw)

    where

    - `item` is a `dict` representing the complete CLDF FormTable row,
    - `string` is the string to be segmented,
    - `kw` may be used to pass any context info to the tokenizer, when called
      explicitly.
    """
    profile = self.dir / 'etc' / 'orthography.tsv'
    if profile.exists():
        profile = Profile.from_file(str(profile), form='NFC')
        default_spec = list(next(iter(profile.graphemes.values())).keys())
        for grapheme in ['^', '$']:
            if grapheme not in profile.graphemes:
                profile.graphemes[grapheme] = {k: None for k in default_spec}
        profile.tree = Tree(list(profile.graphemes.keys()))
        tokenizer = Tokenizer(
            profile=profile, errors_replace=lambda c: '<{0}>'.format(c))

        def _tokenizer(item, string, **kw):
            kw.setdefault("column", "IPA")
            kw.setdefault("separator", " + ")
            return tokenizer(
                unicodedata.normalize('NFC', '^' + string + '$'), **kw).split()

        return _tokenizer
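# A minimal sketch of the contract described in the docstring above (the profile
# rows here are assumptions, not read from etc/orthography.tsv): '^' and '$' are
# mapped to None so the boundary markers disappear from the IPA output, and the
# callable returns a list of segments.
import unicodedata
from segments import Profile, Tokenizer

_profile = Profile(
    {'Grapheme': '^', 'IPA': None},
    {'Grapheme': '$', 'IPA': None},
    {'Grapheme': 'ch', 'IPA': 'tʃ'},
    {'Grapheme': 'a', 'IPA': 'a'},
)
_tok = Tokenizer(profile=_profile, errors_replace=lambda c: '<{0}>'.format(c))

def t(item, string, **kw):
    kw.setdefault("column", "IPA")
    kw.setdefault("separator", " + ")
    return _tok(unicodedata.normalize('NFC', '^' + string + '$'), **kw).split()

print(t(None, 'acha'))  # expected: ['a', 'tʃ', 'a']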
def main(args):
    # Initiate tokenizer and profile
    profile = Profile.from_file(args.profile)
    tokenizer = Tokenizer(profile=profile)

    # Open file and check items
    errors = []
    with open(args.wordlist) as handler:
        reader = csv.DictReader(handler, delimiter="\t")
        for count, row in enumerate(reader):
            segments = my_tokenizer(row[args.form], tokenizer)
            reference = row[args.segments]
            if segments != reference:
                errors.append([row["ID"], row[args.form], segments, reference])

            if args.l:
                if count > args.l:
                    break

    # Output
    print(tabulate(errors, headers=["ID", "Form", "Result", "Reference"]))
    print("Errors: %i/%i (%.2f%%)" %
          (len(errors), count + 1, (len(errors) / (count + 1)) * 100))
def test_from_text():
    res = Profile.from_text('abcdabcab')
    assert 'a' in res.graphemes
    assert res.graphemes['a']['frequency'] == 3
def test_Profile_from_metadata(testdata):
    res = Profile.from_file(testdata / 'profile.json', form='NFD')
    assert 'ch' in res.graphemes
    assert res.graphemes['ch']['XSAMPA'] == 'tS'
    assert res.graphemes['-']['XSAMPA'] is None
    assert res.metadata['dc:language'] == 'abcd1234'
def test_Profile_from_file(profile_path):
    res = Profile.from_file(profile_path)
    assert 'aa' in res.graphemes
    assert res.graphemes['aa']['XSAMPA'] == 'a:'
    assert res.graphemes['-']['XSAMPA'] is None
def test_missing_grapheme():
    with pytest.raises(ValueError):
        Profile({})
    with pytest.raises(ValueError):
        Profile({'Grapheme': ''})
def test_duplicate_grapheme(mocker):
    logging = mocker.patch('segments.profile.logging')
    Profile({'Grapheme': 'a'}, {'Grapheme': 'a'})
    assert logging.getLogger.return_value.warning.call_args[0][0].startswith(
        'line 3')
"aĭ", "e͡i", "a͡i", "o͡i", "u͡i", "a͡e", "o͡e", "e͡o", "a͡o", "i͡u", "e͡u", "a͡u", "o͡u", ]) tokenizer = Tokenizer(Profile(*({ "Grapheme": x, "mapping": x } for x in sounds)), errors_ignore=lambda c: c) from pylexirumah import get_dataset, repository def needleman_wunsch(x, y, lodict={}, gop=-2.5, gep=-1.75, local=False, indel=''): """Needleman-Wunsch algorithm with affine gaps penalties.
def cmd_install(self, **kw):
    # Read the individual orthography profiles, extract the corresponding
    # doculect ids (here, glottocodes), and build the appropriate tokenizers
    profile_files = sorted(glob.glob(str(self.dir / "etc" / "*.prof")))
    doculect_codes = [
        os.path.splitext(os.path.basename(pf))[0] for pf in profile_files
    ]
    self.doc_tokenizers = {
        doculect: Tokenizer(
            profile=Profile.from_file(pf, form="NFC"),
            errors_replace=lambda c: "<{0}>".format(c),
        )
        for pf, doculect in zip(profile_files, doculect_codes)
    }

    # Cache the Concepticon IDs
    concepticon = {
        x.attributes["wold_id"]: x.concepticon_id
        for x in self.conceptlist.concepts.values()
    }

    # Cache the field names for CLDF output
    fields = self.lexeme_class.fieldnames()

    # Write data to CLDF
    with self.cldf as ds:
        vocab_ids = [v["ID"] for v in self.original_cldf["contributions.csv"]]

        # Add sources
        self.add_sources(ds)

        # Add languages and build a map for choosing the right profile
        lang_map = {}
        for row in self.original_cldf["LanguageTable"]:
            gc, iso = row["Glottocode"], row["ISO639P3code"]
            if gc == "tzot1264":
                gc, iso = "tzot1259", "tzo"
            if row["ID"] in vocab_ids:
                ds.add_language(
                    ID=row["ID"], Name=row["Name"], Glottocode=gc, ISO639P3code=iso)
            # Add to the map only those languages which are receivers
            if int(row["ID"]) <= 41:
                lang_map[row["ID"]] = gc

        # Add parameters
        for row in self.original_cldf["ParameterTable"]:
            ds.add_concept(
                ID=row["ID"],
                Name=row.pop("Name"),
                Concepticon_ID=concepticon.get(row["ID"]),
            )

        # Add forms, being explicit about what we are adding
        for row in self.original_cldf["FormTable"]:
            if row["Language_ID"] in vocab_ids:
                # Copy the raw Form to Value, clean the form, and tokenize
                row["Value"] = row["Form"]
                row["Form"] = self.clean_form(row["Form"])
                row["Segments"] = self.tokenizer(
                    row["Form"], lang_map[row["Language_ID"]])

                # Note: We count words marked as "probably borrowed" as loans.
                row["Loan"] = float(row["BorrowedScore"]) > 0.6

                ds.add_form_with_segments(
                    **{k: v for k, v in row.items() if k in fields})
def test_from_textfile(testdata):
    res = Profile.from_textfile(testdata / 'Kabiye_input.txt')
    assert 'à̙' in res.graphemes
    assert res.graphemes['à̙']['frequency'] == 20
"""Create an orthography profile for grapheme tokenization.""" from collections import OrderedDict from segments import Profile from filenames import GRAPHEME_PROFILE from preprocessing import clean from utils import read # Read in all EvaLatin training data into a single pyconll CoNLL structure conll = read() # Collect all the word forms text = "" for sentence in conll: for token in sentence: text += clean(token.form) + " " # Create orthography profile profile = Profile.from_text(text) profile.column_labels.remove("frequency") profile.graphemes.pop(" ") for key in ["ch", "qu", "th", "rh", "ph", "gn"]: profile.graphemes[key] = OrderedDict([("mapping", key[0].upper())]) profile.graphemes.move_to_end(key, last=False) with open(GRAPHEME_PROFILE, "w") as file: file.write(str(profile))
def test_tokenize_with_profile_from_object():
    prf = Profile(
        dict(Grapheme='aa', mapping=['x', 'y']),
        dict(Grapheme='b', mapping='z'))
    assert Tokenizer(profile=prf)('aab', column='mapping') == 'x y z'
def get_orthography(name):
    return Tokenizer(Profile.from_file(profile_path(name + '.tsv'), form='NFD'))