def test_init3(test_data, lextstat_factory):  # with kw check=True
    """A malformed input file raises, unless apply_checks cleans it up."""
    bad_file = test_data / 'bad_file.tsv'

    # Without checks the constructor must refuse the bad input.
    with pytest.raises(ValueError):
        LexStat(str(bad_file))

    # With check/apply_checks the factory repairs the file and records errors.
    lexstat_obj = lextstat_factory(str(bad_file), check=True, apply_checks=True)
    assert hasattr(lexstat_obj, 'errors')

    # A "_cleaned" sibling file is written next to the bad input; remove it.
    cleaned_path = bad_file.parent.joinpath(bad_file.name + '_cleaned.tsv')
    assert cleaned_path.exists()
    cleaned_path.unlink()

    # A header row lacking the required columns is also rejected.
    with pytest.raises(ValueError):
        LexStat({0: ['concept', 'language', 'ipa']})
def writeToFile():
    """Dump the annotated IELex wordlist to a TSV file and score LexStat clusters on it."""
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"
    print("LOAD WORDLIST")
    # Alternate data set:
    # pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))

    # Concepts held out for validation are dropped before clustering.
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}
    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))

    # Write one TSV row per word form in the column order of the header.
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for lang, word, gid, cog in zip(languages, words, global_ids,
                                        cognate_classes):
            f.write("\t".join([str(gid), word, lang, cog]) + "\n")

    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))

    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")

    # Evaluate predicted clusters against the gold cognate classes.
    from lingpy.evaluate.acd import bcubes, diff
    bcubes(lex, "cognate_class", "COGID")
    print(bcubes(lex, "cognate_class", "cognate_class_pred"))
def main():
    """Print B-cubed scores for every dataset/partition "updating" run.

    For each dataset name and each partition index n (1..6), loads the
    precomputed LexStat file and evaluates the predicted ``newcogid``
    column against the gold ``cogid`` column.
    """
    # NOTE(review): the original also initialized an outer ``results = []``
    # that was immediately shadowed inside the loop — removed as dead code.
    for name in [
            "ROM", "BAI", "GER", "JAP", "OUG", "PIE", "SLV", "IEL", "KSL",
            "PAN"
    ]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # modify_ref maps signed ids onto unsigned cognate-set ids.
                results.append((
                    i,
                    bcubes(
                        lex,
                        "cogid",
                        "newcogid",
                        pprint=False,
                        modify_ref=lambda x: abs(int(x)),
                    ),
                ))
            for r in results:
                print(r)
            print()
def setUp(self):
    """Create the LexStat and Partial fixtures used by the tests."""
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.part = Partial(test_data('partial_cognates.tsv'),
                        segments='segments')
    # pid1 copies the partial cognate sets verbatim.
    self.part.add_entries('pid1', 'partial_cognate_sets', lambda value: value)
    # pid2 parses the space-separated id string into a list of ints.
    self.part.add_entries('pid2', 'partialids2',
                          lambda value: [int(item) for item in value.split(' ')])
def test_get_confidence(test_data, alm, tmppath):
    """get_confidence followed by HTML output runs without error."""
    lex = LexStat(str(test_data / 'KSL3.qlc'))
    # Copy the 'numbers' column from the LexStat object into the alignment.
    numbers_by_key = {key: lex[key, 'numbers'] for key in lex}
    alm.add_entries('numbers', numbers_by_key, lambda value: value)
    # Run get_confidence to populate the output variable.
    # TODO: Check and document side-effects of this.
    _ = alm.get_confidence(lex.rscorer, ref='cogid')
    alm.output('html', filename=str(tmppath / 'alm'), confidence=True)
def test_get_confidence(self):
    """get_confidence followed by HTML output runs without error."""
    lex = LexStat(test_data('KSL3.qlc'))
    # snake_case name and dict comprehension for consistency with the
    # sibling test; the old ``corrs`` binding was never used, so the
    # return value is explicitly discarded.
    tmp_dict = {k: lex[k, 'numbers'] for k in lex}
    self.alm.add_entries('numbers', tmp_dict, lambda x: x)
    # Run get_confidence to populate the output variable.
    # TODO: Check and document side-effects of this.
    _ = self.alm.get_confidence(lex.rscorer, ref='cogid')
    self.alm.output('html', filename=text_type(self.tmp_path('alm')),
                    confidence=True)
def test_get_confidence(self):
    """get_confidence followed by HTML output runs without error."""
    lex = LexStat(test_data('KSL3.qlc'))
    # Transfer the 'numbers' column into the alignment object.
    numbers_by_key = {key: lex[key, 'numbers'] for key in lex}
    self.alm.add_entries('numbers', numbers_by_key, lambda value: value)
    # Run get_confidence to populate the output variable.
    # TODO: Check and document side-effects of this.
    _ = self.alm.get_confidence(lex.rscorer, ref='cogid')
    self.alm.output('html', filename=text_type(self.tmp_path('alm')),
                    confidence=True)
def main():
    """Cluster ARM_GRE at several thresholds, writing one CSV per run."""
    thresholds = [-1, 0.0, 0.0001, 0.01, 0.1]
    for run_index, threshold in enumerate(thresholds):
        print(run_index)
        lex = LexStat("../data/ARM_GRE.csv")
        lex.get_scorer()
        lex.cluster(method="lexstat",
                    threshold=threshold,
                    ref="cognates",
                    verbose=False)
        lex.output("csv",
                   filename="ARM_GRE_lexstat_{}".format(run_index),
                   ignore="all",
                   prettify=True)
def main():
    """Print B-cubed scores for the ARM_GRE dataset runs.

    Loads each precomputed LexStat file and evaluates the predicted
    ``newcogid`` column against the gold ``cogid`` column.
    """
    # NOTE(review): a large triple-quoted block of commented-out "OG"
    # evaluation code (with a stray ``:`` token inside it) and a dead
    # outer ``results = []`` were removed — neither affected behavior.
    for name in ["ARM_GRE"]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_{}_{}.csv".format(name, i, n))
                # modify_ref maps signed ids onto unsigned cognate-set ids.
                results.append((
                    i,
                    bcubes(
                        lex,
                        "cogid",
                        "newcogid",
                        pprint=False,
                        modify_ref=lambda x: abs(int(x)),
                    ),
                ))
            for r in results:
                print(r)
            print()
def cognate_detection_lexstat(output_path, output_cognates_path, input_type):
    """Run LexStat cognate detection over the whole dataset.

    Reads the wordlist at ``output_path``, clusters cognates into the
    ``COGNATES_LEXSTAT`` column, and writes the result as TSV next to
    ``output_cognates_path``. Skips all work if that file already exists.
    """
    print(" - Detect cognates in entire dataset using LexStat.")
    if os.path.exists(output_cognates_path):
        print(f"Using existing cognates file {output_cognates_path}, nothing is generated.")
        return
    print("Perform cognate classification, this can take a long time!")
    # TODO: Columns are NorthEuraLex-specific (at least classes=list)
    lex = LexStat(output_path,
                  model="sca",
                  segments="token",
                  transcription=input_type,
                  classes="list",
                  langid="doculect")
    lex.get_scorer(method="markov")
    lex.cluster(method="lexstat", threshold=0.6, ref="COGNATES_LEXSTAT")
    print(f"Output cognates to {output_cognates_path}.")
    # FIX: splitext strips only the final extension. The previous
    # ``output_cognates_path.split(".")[0]`` truncated any path containing
    # a dot elsewhere (e.g. "./out/data.tsv" -> "" and "a.v2.tsv" -> "a").
    output_cognates_path_no_extension = os.path.splitext(output_cognates_path)[0]
    lex.output('tsv',
               filename=output_cognates_path_no_extension,
               ignore="all",
               prettify=False)
def test_init(lextstat_factory, test_data, mocker, log, tmppath):
    """LexStat construction from dicts, files and existing instances."""
    simple_rows = {
        0: ['ID', 'doculect', 'concept', 'IPA'],
        1: ['1', 'deu', 'hand', 'hant']
    }
    # Construction with an explicit sound-class model.
    lextstat_factory(simple_rows, model='sca')
    # Construction with the default model; repr mentions the class.
    ls = lextstat_factory(simple_rows)
    assert 'lexstat' in repr(ls)
    # A LexStat instance is itself valid constructor input.
    lextstat_factory(ls)
    # Pre-tokenized input is accepted.
    lextstat_factory({
        0: ['ID', 'doculect', 'concept', 'tokens'],
        1: ['1', 'deu', 'hand', ['h', 'a', 'n', 't']]
    })
    # Missing the transcription column is rejected.
    with pytest.raises(AssertionError):
        LexStat({0: ['ID', 'doculect', 'concept'], 1: ['1', 'deu', 'hand']})

    # File-based construction with consistency checks enabled.
    lextstat_factory(str(test_data / 'phybo.qlc'), check=True)
    mocker.patch('lingpy.compare.lexstat.log', log)
    lextstat_factory(str(test_data / 'KSL.qlc'), check=True)
    assert log.info.called

    # Broken rows get logged to the error file and stripped into a
    # "_cleaned" copy of the data.
    error_log = tmppath / 'errors'
    mocker.patch('lingpy.util.confirm', mocker.Mock(return_value=True))
    lex = lextstat_factory(
        {
            0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
            1: ['1', 'deu', 'hand', 'hand', ['']],
            2: ['2', 'eng', 'hand', 'hand', ['abc']],
            3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
        },
        check=True,
        errors='%s' % error_log)
    assert error_log.exists()
    assert lex.filename.endswith('_cleaned.tsv')
    cleaned_file = pathlib.Path(lex.filename)
    assert cleaned_file.exists()
    cleaned_file.unlink()
    assert len(lex._meta['errors']) == 2
def lex(test_data):
    """Fixture: LexStat object built from the KSL test wordlist."""
    ksl_path = test_data / 'KSL.qlc'
    return LexStat(str(ksl_path))
def make(*args, **kw):
    """Build a LexStat, defaulting the error log into the temp directory."""
    if 'errors' not in kw:
        kw['errors'] = str(tmppath / 'errors.log')
    return LexStat(*args, **kw)
def _make_one(self, *args, **kw):
    """Build a LexStat, defaulting the error log into the temp directory."""
    if 'errors' not in kw:
        kw['errors'] = self.tmp_path('errors.log').as_posix()
    return LexStat(*args, **kw)
def main():
    """Evaluate all dataset/partition runs and plot precision/recall curves.

    Collects B-cubed precision, recall and F-score for each dataset and
    iteration, then saves a two-panel seaborn figure to ``bbb.png``.
    """
    # NOTE(review): removed from the original — a triple-quoted block of
    # commented-out "OG" code, an outer ``results = []`` that was never
    # used, and an inner ``results`` list that was never appended to (its
    # print loop was a no-op). Output behavior is unchanged.
    records = []
    f_names = [
        "SLV", "GER", "ROM", "OUG", "KSL", "BAI", "JAP", "PIE", "IEL", "PAN"
    ]
    for name in f_names:
        print(name)
        for n in range(1, 7):
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # modify_ref maps signed ids onto unsigned cognate-set ids.
                precision, recall, f_score = bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                )
                records.append([name, n, precision, recall, f_score])
            print()

    f, axes = plt.subplots(1, 2, figsize=(20, 8))
    # Build the DataFrame once; both panels plot the same records.
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])

    # Left panel: precision vs. recall, one line per dataset, with the
    # first and last iteration annotated on each curve.
    sns.lineplot(x="Recall",
                 y="Precision",
                 hue="Partition",
                 data=df,
                 marker="o",
                 ax=axes[0])
    for _, i, precision, recall, _ in records:
        if i == 1 or i == 6:
            axes[0].annotate(str(i), (recall, precision))

    # Right panel: F-score over iterations.
    sns.lineplot(x="Iteration", y="F-score", hue="Partition", data=df,
                 ax=axes[1])

    # Single shared legend outside the right panel.
    axes[0].get_legend().remove()
    lgd = plt.legend(bbox_to_anchor=(1.02, 1.00), loc="upper left")
    plt.savefig("bbb.png", bbox_extra_artists=[lgd], bbox_inches="tight")
def setUp(self):
    """Load the KSL test wordlist into a LexStat fixture."""
    ksl_file = test_data('KSL.qlc')
    self.lex = LexStat(ksl_file)