Example #1
def test_init3(test_data, lextstat_factory):  # with kw check=True
    bad_file = test_data / 'bad_file.tsv'
    with pytest.raises(ValueError):
        LexStat(str(bad_file))
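    # With check=True plus apply_checks=True the factory is expected to record
    # the problems and write a cleaned copy of the file instead of raising.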
    ls = lextstat_factory(str(bad_file), check=True, apply_checks=True)
    assert hasattr(ls, 'errors')
    cleaned = bad_file.parent.joinpath(bad_file.name + '_cleaned.tsv')
    assert cleaned.exists()
    cleaned.unlink()
    with pytest.raises(ValueError):
        LexStat({0: ['concept', 'language', 'ipa']})
Example #2
def writeToFile():
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"

    print("LOAD WORDLIST")
    #pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}

    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for l, w, gi, cog in zip(languages, words, global_ids,
                                 cognate_classes):
            f.write(str(gi) + "\t" + w + "\t" + l + "\t" + cog + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")

    from lingpy.evaluate.acd import bcubes
    # Score the LexStat prediction against the gold classes, which were
    # written to the COGID column (lingpy lowercases headers to "cogid").
    print(bcubes(lex, "cogid", "cognate_class_pred"))
Example #3
def main():
    for name in [
            "ROM", "BAI", "GER", "JAP", "OUG", "PIE", "SLV", "IEL", "KSL",
            "PAN"
    ]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
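                # bcubes() returns B-cubed (precision, recall, f-score) for
                # the gold column ("cogid") against the predicted one
                # ("newcogid"); modify_ref=abs folds negative cognate ids back
                # into their positive counterparts before scoring.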
                results.append((
                    i,
                    bcubes(
                        lex,
                        "cogid",
                        "newcogid",
                        pprint=False,
                        modify_ref=lambda x: abs(int(x)),
                    ),
                ))

            for r in results:
                print(r)
        print()
Example #4
def setUp(self):
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.part = Partial(test_data('partial_cognates.tsv'),
                        segments='segments')
    self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
    self.part.add_entries('pid2', 'partialids2',
                          lambda x: [int(y) for y in x.split(' ')])
Example #5
def test_get_confidence(test_data, alm, tmppath):
    lex = LexStat(str(test_data / 'KSL3.qlc'))
    tmp_dict = dict([(k, lex[k, 'numbers']) for k in lex])
    alm.add_entries('numbers', tmp_dict, lambda x: x)
    # Run get_confidence to populate the output variable.
    # TODO: Check and document side-effects of this.
    _ = alm.get_confidence(lex.rscorer, ref='cogid')
    alm.output('html', filename=str(tmppath / 'alm'), confidence=True)
Example #6
    def test_get_confidence(self):
        lex = LexStat(test_data('KSL3.qlc'))
        tmpDict = dict([(k, lex[k, 'numbers']) for k in lex])
        self.alm.add_entries('numbers', tmpDict, lambda x: x)
        corrs = self.alm.get_confidence(lex.rscorer, ref='cogid')
        self.alm.output('html',
                        filename=text_type(self.tmp_path('alm')),
                        confidence=True)
Example #7
def test_get_confidence(self):
    lex = LexStat(test_data('KSL3.qlc'))
    tmp_dict = dict([(k, lex[k, 'numbers']) for k in lex])
    self.alm.add_entries('numbers', tmp_dict, lambda x: x)
    # Run get_confidence to populate the output variable.
    # TODO: Check and document side-effects of this.
    _ = self.alm.get_confidence(lex.rscorer, ref='cogid')
    self.alm.output('html',
                    filename=text_type(self.tmp_path('alm')),
                    confidence=True)
Example #8
def main():
    for i, threshold in enumerate([-1, 0.0, 0.0001, 0.01, 0.1]):
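        # Try a range of clustering thresholds; the threshold is the maximum
        # LexStat distance at which two words may join the same cognate set.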
        print(i)
        lex = LexStat("../data/ARM_GRE.csv")
        lex.get_scorer()
        lex.cluster(method="lexstat",
                    threshold=threshold,
                    ref="cognates",
                    verbose=False)
        lex.output("csv",
                   filename="ARM_GRE_lexstat_{}".format(i),
                   ignore="all",
                   prettify=True)
Example #9
def main():
    results = []
    """for i in range(20):
        lex = LexStat("../PIE_scored_{}_og.csv".format(i))
        # lex.get_scorer()
        # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
        print(".", end="", flush=True)
        results.append( (
                i,
                bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                ),
            )
        )
    print()
    print("OG")
    for r in results:
        print(r)
"""
    for name in ["ARM_GRE"]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
                results.append(
                    (
                        i,
                        bcubes(
                            lex,
                            "cogid",
                            "newcogid",
                            pprint=False,
                            modify_ref=lambda x: abs(int(x)),
                        ),
                    )
                )

            for r in results:
                print(r)
        print()
Example #10
def cognate_detection_lexstat(output_path, output_cognates_path, input_type):
    print(" - Detect cognates in entire dataset using LexStat.")
    if os.path.exists(output_cognates_path):
        print(f"Using existing cognates file {output_cognates_path}, nothing is generated.")
        return
    print("Perform cognate classification, this can take a long time!")
    # TODO: Columns are NorthEuraLex-specific (at least classes=list)
    lex = LexStat(output_path,
                  model="sca",
                  segments="token",
                  transcription=input_type,
                  classes="list",
                  langid="doculect")
    
    lex.get_scorer(method="markov")
    lex.cluster(method="lexstat", threshold=0.6, ref="COGNATES_LEXSTAT")
    
    print(f"Output cognates to {output_cognates_path}.")
    output_cognates_path_no_extension = os.path.splitext(output_cognates_path)[0]
    lex.output('tsv', filename=output_cognates_path_no_extension, ignore="all", prettify=False)
Example #11
def test_init(lextstat_factory, test_data, mocker, log, tmppath):
    lextstat_factory(
        {
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hant']
        },
        model='sca')
    ls = lextstat_factory({
        0: ['ID', 'doculect', 'concept', 'IPA'],
        1: ['1', 'deu', 'hand', 'hant']
    })
    assert 'lexstat' in repr(ls)
    lextstat_factory(ls)
    lextstat_factory({
        0: ['ID', 'doculect', 'concept', 'tokens'],
        1: ['1', 'deu', 'hand', ['h', 'a', 'n', 't']]
    })
    with pytest.raises(AssertionError):
        LexStat({0: ['ID', 'doculect', 'concept'], 1: ['1', 'deu', 'hand']})
    lextstat_factory(str(test_data / 'phybo.qlc'), check=True)
    mocker.patch('lingpy.compare.lexstat.log', log)
    lextstat_factory(str(test_data / 'KSL.qlc'), check=True)
    assert log.info.called
    error_log = tmppath / 'errors'
    mocker.patch('lingpy.util.confirm', mocker.Mock(return_value=True))
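    # Patching lingpy.util.confirm to always return True silently accepts the
    # interactive prompt asking whether the offending entries should be removed.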
    lex = lextstat_factory(
        {
            0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
            1: ['1', 'deu', 'hand', 'hand', ['']],
            2: ['2', 'eng', 'hand', 'hand', ['abc']],
            3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
        },
        check=True,
        errors='%s' % error_log)
    assert error_log.exists()
    assert lex.filename.endswith('_cleaned.tsv')
    assert pathlib.Path(lex.filename).exists()
    pathlib.Path(lex.filename).unlink()
    assert len(lex._meta['errors']) == 2
Example #12
def lex(test_data):
    return LexStat(str(test_data / 'KSL.qlc'))
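
A minimal sketch of how a fixture like this is typically consumed in a pytest
module; the test body below is illustrative and not part of the source (the
`custom_cogid` column name and the small `runs` value are assumptions):

def test_cluster_roundtrip(lex):
    # Keep the permutation test short so the test suite stays fast.
    lex.get_scorer(runs=100)
    lex.cluster(method="lexstat", threshold=0.6, ref="custom_cogid")
    # cluster() adds the new cognate-id column to the wordlist header.
    assert "custom_cogid" in lex.header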
Example #13
def make(*args, **kw):
    kw.setdefault('errors', str(tmppath / 'errors.log'))
    return LexStat(*args, **kw)
Example #14
def _make_one(self, *args, **kw):
    kw.setdefault('errors', self.tmp_path('errors.log').as_posix())
    return LexStat(*args, **kw)
Example #15
def main():
    # Assumed imports: LexStat from lingpy; bcubes from lingpy.evaluate.acd;
    # matplotlib.pyplot as plt; pandas as pd; seaborn as sns.
    results = []
    """for i in range(20):
        lex = LexStat("../PIE_scored_{}_og.csv".format(i))
        # lex.get_scorer()
        # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
        print(".", end="", flush=True)
        results.append( (
                i,
                bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                ),
            )
        )
    print()
    print("OG")
    for r in results:
        print(r)
"""
    records = []
    f_names = [
        "SLV", "GER", "ROM", "OUG", "KSL", "BAI", "JAP", "PIE", "IEL", "PAN"
    ]
    for name in f_names:
        print(name)
        for n in range(1, 7):
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")

                precision, recall, f_score = bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                )
                records.append([name, n, precision, recall, f_score])

        print()

    f, axes = plt.subplots(1, 2, figsize=(20, 8))

    # markers = {i: "${}$".format(i) for i in range(1, 7)}
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    sns.lineplot(x="Recall",
                 y="Precision",
                 hue="Partition",
                 data=df,
                 marker="o",
                 ax=axes[0])
    # plt.subplots_adjust(right=0.7)
    # plt.legend(bbox_to_anchor=(1.02, 1.02), loc="upper left")

    for _, i, precision, recall, _ in records:
        if i == 1 or i == 6:
            axes[0].annotate(str(i), (recall, precision))

    # Reuse the records DataFrame built above for the F-score panel.
    sns.lineplot(x="Iteration",
                 y="F-score",
                 hue="Partition",
                 data=df,
                 ax=axes[1])
    # plt.subplots_adjust(right=0.7)
    axes[0].get_legend().remove()
    lgd = plt.legend(bbox_to_anchor=(1.02, 1.00), loc="upper left")

    plt.savefig("bbb.png", bbox_extra_artists=[lgd], bbox_inches="tight")
Example #16
def setUp(self):
    self.lex = LexStat(test_data('KSL.qlc'))