def writeToFile():
    """Build a LexStat training wordlist from IELex and evaluate clustering.

    Loads the annotated IELex-2016 word list, removes the held-out
    validation concepts, writes the remainder in LingPy's tab-separated
    wordlist format (CONCEPT/IPA/DOCULECT/COGID), then runs a LexStat
    cognate clustering and prints B-cubed evaluation scores.

    Side effects: writes ``lexstat_wordlist.txt`` and ``lexstat_ielex.tsv``
    in the working directory.

    Note: the original file contained two identical definitions of this
    function back to back; the duplicate (which only shadowed the first)
    has been removed.
    """
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"
    print("LOAD WORDLIST")
    # pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))
    # Global concept ids reserved for validation -- must not be trained on.
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}
    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for l, w, gi, cog in zip(languages, words, global_ids, cognate_classes):
            f.write(str(gi) + "\t" + w + "\t" + l + "\t" + cog + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    # Spot-check one concept to confirm the wordlist parsed as expected.
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")
    from lingpy.evaluate.acd import bcubes, diff
    # bcubes prints its report as a side effect; the first call compares
    # gold classes against the COGID column, the second against LexStat's
    # predictions.
    bcubes(lex, "cognate_class", "COGID")
    print(bcubes(lex, "cognate_class", "cognate_class_pred"))
def main():
    """Cluster the ARM_GRE dataset with LexStat at a range of thresholds.

    Each run reloads the data, rebuilds the scorer, clusters, and writes
    the result to ``ARM_GRE_lexstat_<run>.csv``.
    """
    cutoffs = [-1, 0.0, 0.0001, 0.01, 0.1]
    for run_idx, cutoff in enumerate(cutoffs):
        print(run_idx)
        analysis = LexStat("../data/ARM_GRE.csv")
        analysis.get_scorer()
        analysis.cluster(method="lexstat", threshold=cutoff,
                         ref="cognates", verbose=False)
        analysis.output("csv", filename="ARM_GRE_lexstat_{}".format(run_idx),
                        ignore="all", prettify=True)
def cognate_detection_lexstat(output_path, output_cognates_path, input_type):
    """Detect cognates across the entire dataset with LingPy's LexStat.

    Parameters
    ----------
    output_path : str
        Path of the wordlist file LexStat reads as input.
    output_cognates_path : str
        Path of the cognates file to produce. If it already exists, the
        expensive classification step is skipped entirely (cache hit).
    input_type : str
        Name of the transcription column in the wordlist.

    Side effects: writes ``<output_cognates_path stem>.tsv`` with a
    COGNATES_LEXSTAT column.
    """
    print(" - Detect cognates in entire dataset using LexStat.")
    # Cache: reuse an existing cognates file instead of reclustering.
    if os.path.exists(output_cognates_path):
        print(f"Using existing cognates file {output_cognates_path}, nothing is generated.")
        return
    print("Perform cognate classification, this can take a long time!")
    # TODO: Columns are NorthEuraLex-specific (at least classes=list)
    lex = LexStat(output_path, model="sca", segments="token",
                  transcription=input_type, classes="list", langid="doculect")
    lex.get_scorer(method="markov")
    lex.cluster(method="lexstat", threshold=0.6, ref="COGNATES_LEXSTAT")
    print(f"Output cognates to {output_cognates_path}.")
    # BUG FIX: the original split(".")[0] truncated at the FIRST dot anywhere
    # in the path (e.g. "./out/file.tsv" -> "", "data.v2/x.tsv" -> "data");
    # splitext strips only the final extension of the last component.
    output_cognates_path_no_extension = os.path.splitext(output_cognates_path)[0]
    lex.output('tsv', filename=output_cognates_path_no_extension,
               ignore="all", prettify=False)
class TestLexStat(WithTempDir):
    """Unit tests for lingpy's LexStat cognate-detection class.

    Runs against the bundled KSL test wordlist; scorer runs are kept
    tiny via ``get_scorer_kw`` so the suite stays fast.
    """

    def setUp(self):
        # Fresh LexStat instance and mock log for every test; small
        # scorer parameters keep the permutation test cheap.
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.log = get_log()
        self.get_scorer_kw = dict(runs=10, rands=10, limit=100)

    def test_init(self):
        """Constructor accepts dicts and instances, and flags bad input."""
        LexStat({0: ['ID', 'doculect', 'concept', 'IPA']}, model='sca')
        ls = LexStat({0: ['ID', 'doculect', 'concept', 'IPA']})
        self.assertIn('lexstat', repr(ls))
        LexStat(ls)
        LexStat({0: ['ID', 'doculect', 'concept', 'tokens']})
        # A header without any transcription column must be rejected.
        self.assertRaises(
            AssertionError, LexStat, {0: ['ID', 'doculect', 'concept']})
        LexStat(test_data('phybo.qlc'), check=True)
        with patch('lingpy.compare.lexstat.log', self.log):
            LexStat(test_data('KSL.qlc'), check=True)
            assert self.log.info.called
        # Malformed rows (empty tokens, mismatched segments) go to an
        # error log instead of aborting, once the user confirms.
        error_log = self.tmp_path('errors')
        with patch('lingpy.util.confirm', Mock(return_value=True)):
            lex = LexStat({
                0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
                1: ['1', 'deu', 'hand', 'hand', ['']],
                2: ['2', 'eng', 'hand', 'hand', ['abc']],
                3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
            }, check=True, errors='%s' % error_log)
        assert error_log.exists()
        self.assertEquals(len(lex._meta['errors']), 2)

    def test_init2(self):
        """Character frequencies and scorer matrices match stored fixtures."""
        freqs = self.lex.freqs['Hawaiian']
        for char, n in {'5.W.C': 19, '5.I.V': 87, '5.Y.V': 75,
                        '5.U.V': 87}.items():
            self.assertEquals(freqs[char], n)
        self.assertEquals(len(self.lex.chars), 187)
        self.assertEquals(len(self.lex.rchars), 35)
        self.maxDiff = None
        for name in 'bscorer rscorer pairs'.split():
            obj = jsonload(test_data('KSL.%s.json' % name))
            if name != 'pairs':
                self.assertEquals(getattr(self.lex, name).matrix, obj)
            else:
                for key, values in self.lex.pairs.items():
                    values = set(values)
                    ovalues = set(tuple(v) for v in obj['---'.join(key)])
                    if 'TRAVIS' not in os.environ and sys.version_info < (3, 5):
                        # For some reason this assertion fails when run on
                        # travis-ci with python 3.3
                        self.assertEquals(values, ovalues)

    def test_getitem(self):
        """Unknown keys yield None rather than raising."""
        self.assertIsNone(self.lex['xyz'])

    def test_get_scorer(self):
        """get_scorer builds cscorer, warns on recompute, supports markov."""
        self.lex.get_scorer(**self.get_scorer_kw)
        assert hasattr(self.lex, "cscorer")
        with patch('lingpy.compare.lexstat.log', self.log):
            self.lex.get_scorer(**self.get_scorer_kw)
            # Re-running with an existing scorer should emit a warning.
            assert self.log.warn.called
        del self.lex.cscorer
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.get_scorer(method='markov', **self.get_scorer_kw)

    def test_cluster(self):
        """Every clustering method adds its id column; bad method raises."""
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)
        self.assertRaises(ValueError, self.lex.cluster, method="fuzzy")
        # guess_threshold prompts interactively, so patch input().
        with patch('lingpy.basic.parser.input', Mock(return_value='y')):
            self.lex.cluster(method="sca", guess_threshold=True,
                             gt_mode='nulld')
        assert 'scaid' in self.lex.header \
            and 'lexstatid' in self.lex.header \
            and 'editid' in self.lex.header \
            and 'turchinid' in self.lex.header

    def test_align_pairs(self):
        """Pairwise alignment between two doculects runs without error."""
        self.lex.align_pairs('English', 'German', method='sca')

    def test_get_subset(self):
        """Subset selection is empty for [] and matches the stored pairs."""
        self.lex.get_subset([])
        self.assertEquals([v for v in self.lex.subsets.values() if v], [])
        pairs = jsonload(test_data('KSL.pairs.json'))
        self.assertEquals(
            sorted('---'.join(k) for k in self.lex.subsets.keys()),
            sorted(pairs.keys()))

    def test_get_distances(self):
        """Distance computation works for all methods and aggregations."""
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.get_random_distances()
        self.lex.get_distances()
        self.lex.get_distances(method='turchin')
        self.lex.get_distances(aggregate=False)

    def test_get_frequencies(self):
        """Frequency statistics have the expected sizes and types."""
        f = self.lex.get_frequencies('sounds')
        assert len(f) == self.lex.width
        f = self.lex.get_frequencies('sounds', aggregated=True)
        tokens = []
        for k in self.lex:
            for t in self.lex[k, 'tokens']:
                tokens += [t]
        assert len(f) == len(set(tokens))
        d = self.lex.get_frequencies('diversity', ref='cogid')
        assert isinstance(d, float)
        w = self.lex.get_frequencies('wordlength')
        assert len(w) == self.lex.width
        w = self.lex.get_frequencies('wordlength', aggregated=True)
        assert isinstance(w, float)

    def test_output(self):
        """Wordlist and scorer can be written to temporary files."""
        self.lex.output('csv', filename='%s' % self.tmp_path('test_lexstat'))
        self.lex.output('scorer',
                        filename='%s' % self.tmp_path('test_lexstat'))

    def test_correctness(self):
        """Clustering assigns the expected cognate ids on toy datasets."""
        lex = LexStat({
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hand'],
            2: ['2', 'eng', 'hand', 'hand'],
            3: ['3', 'xyz', 'hand', 'xyz']})
        lex.get_scorer(**self.get_scorer_kw)
        lex.cluster(ref='cogid')
        self.assertEquals(lex.get_entries('cogid'), [[1, 1, 3]])
        lex = LexStat({
            0: ['ID', 'concept', 'ipa', 'doculect'],
            1: ['5424', 'Abend::N', 'swar', 'FRA'],
            2: ['5425', 'Abend::N', 'sware', 'FRA'],
            3: ['5426', 'Abend::N', 'sear3', 'RON'],
            4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
            5: ['5428', 'Abend::N', 'noyt3', 'POR'],
            6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
            7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
        })
        lex.get_scorer()
        lex.cluster(method='lexstat', threshold=0.8, ref='cogid')
        self.assertEquals(lex.get_entries('cogid'),
                          [[1, 2, 3, 4, 5], [0, 0, 3, 3, 0]])