Ejemplo n.º 1
0
from lexibank_deepadungpalaung import Dataset
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import bcubes

# CLDF columns to load, and the mapping from CLDF names onto LingPy's
# conventional namespace (doculect / concept / tokens / ...).
columns = (
    'concept_name',
    'language_id',
    'value',
    'form',
    'segments',
    'language_glottocode',
    'cogid_cognateset_id',
)
namespace = (
    ('concept_name', 'concept'),
    ('language_id', 'doculect'),
    ('segments', 'tokens'),
    ('language_glottocode', 'glottolog'),
    ('concept_concepticon_id', 'concepticon'),
    ('language_latitude', 'latitude'),
    ('language_longitude', 'longitude'),
    ('cognacy', 'cognacy'),
    ('cogid_cognateset_id', 'cog'),
)

# Load the dataset as a partial-cognate wordlist.
part = Partial.from_cldf(
    Dataset().cldf_dir.joinpath('cldf-metadata.json'),
    columns=columns,
    namespace=namespace,
)

# Turn the expert cognate-set labels into numeric ids under 'cogid'.
part.renumber('cog')

method = input('method: ')

# type 'cogid' or 'cog' for method to see a tree based on Deepadung et al.'s
# cognate judgements

if method == 'lexstatcogids':
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat', ref="lexstatcogids", threshold=0.55)
elif method == 'lexstatcogid':
    part.get_scorer(runs=10000)
    # NOTE(review): no clustering call follows this branch — the snippet
    # appears truncated here; confirm against the full script.
Ejemplo n.º 2
0
from lexibank_deepadungpalaung import Dataset
from lingpy import *

# Load the CLDF dataset as a plain LingPy wordlist.
wl = Wordlist.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'))

# Report every row whose token string contains an empty chunk.
# NOTE(review): tokens.n presumably yields the morpheme-level chunks of the
# token string — confirm against lingpy's basictypes documentation.
for idx, tokens in wl.iter_rows('tokens'):
    for chunk in tokens.n:
        if not chunk:
            print(idx, tokens)
Ejemplo n.º 3
0
    Mmatrix.append(NewM)
    return Mmatrix


def average_of_lists(list1, list2, wti, wtj):
    """Return the element-wise weighted average of two number lists.

    Each output element is ``(list1[a] * wti + list2[a] * wtj) / (wti + wtj)``,
    i.e. the weighted mean of the two values at index ``a``.

    Args:
        list1: first sequence of numbers.
        list2: second sequence of numbers; expected to be the same length
            as ``list1`` (extra trailing elements are ignored).
        wti: weight applied to elements of ``list1``.
        wtj: weight applied to elements of ``list2``.

    Returns:
        A new list of floats, one weighted mean per index of ``list1``.
    """
    # Hoist the loop-invariant denominator out of the comprehension.
    total_weight = wti + wtj
    return [(x * wti + y * wtj) / total_weight
            for x, y in zip(list1, list2)]


root_node = None
if __name__ == "__main__":
    method = input('method: ')
    # Distance matrix produced by an earlier step of the pipeline:
    # output/distmat_<method>.csv, first column = language name.
    filepath = Dataset().cldf_dir._path('../output/distmat_' + method +
                                        '.csv').resolve()
    with open(filepath, encoding="utf-8") as infile:
        reader = csv.reader(infile)
        lang_names = []
        dist_matrix = []
        header_row = next(reader)  # skip the column-label row
        for row in reader:
            # First cell is the language name; the rest are distances.
            lang_names.append(row[0])
            dist_matrix.append([float(x) for x in row[1:]])
    root_node = UPGMA(dist_matrix, lang_names)
    root_XML = XML_node(root_node)
    outfilepath = Dataset().cldf_dir._path('../output/tree_' + method +
                                           '.xml').resolve()
    # BUG FIX: the output file must be opened for writing ('w'); the default
    # read mode made outfile.write() raise io.UnsupportedOperation.
    with open(outfilepath, 'w', encoding='utf-8') as outfile:
        outfile.write(root_XML)
Ejemplo n.º 4
0
from lexibank_deepadungpalaung import Dataset
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import bcubes

# CLDF columns to load, and the mapping from CLDF names onto LingPy's
# conventional namespace (doculect / concept / tokens / ...).
columns = (
    'concept_name',
    'language_id',
    'value',
    'form',
    'segments',
    'language_glottocode',
    'cogid_cognateset_id',
)
namespace = (
    ('concept_name', 'concept'),
    ('language_id', 'doculect'),
    ('segments', 'tokens'),
    ('language_glottocode', 'glottolog'),
    ('concept_concepticon_id', 'concepticon'),
    ('language_latitude', 'latitude'),
    ('language_longitude', 'longitude'),
    ('cognacy', 'cognacy'),
    ('cogid_cognateset_id', 'cog'),
)

# Load the dataset as a partial-cognate wordlist.
part = Partial.from_cldf(
    Dataset().cldf_dir.joinpath('cldf-metadata.json'),
    columns=columns,
    namespace=namespace,
)

# Turn the expert cognate-set labels into numeric ids under 'cogid'.
part.renumber('cog')

method = input('method: ')

# type 'cogid' or 'cog' for method to see a tree based on Deepadung et al.'s
# cognate judgements

if method == 'lexstatcogids':
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat', ref="lexstatcogids", threshold=0.55)
elif method == 'lexstatcogid':
    part.get_scorer(runs=10000)
    # NOTE(review): no clustering call follows this branch — the snippet
    # appears truncated here; confirm against the full script.
Ejemplo n.º 5
0
from lingpy import *
from lexibank_deepadungpalaung import Dataset
from lingpy.evaluate.acd import bcubes

ds = Dataset()

# Columns to load from the CLDF dataset and the mapping onto LingPy's
# conventional namespace (doculect / concept / tokens / ...).
_columns = [
    'language_id', 'concept_name', 'value', 'form',
    'segments', 'cogid_cognateset_id',
]
_namespace = {
    'language_id': 'doculect',
    'concept_name': 'concept',
    'value': 'value',
    'form': 'form',
    'segments': 'tokens',
    'cogid_cognateset_id': 'cog',
}

wl = Wordlist.from_cldf(ds.dir.joinpath('cldf', 'cldf-metadata.json'),
                        columns=_columns,
                        namespace=_namespace)
# Turn the expert cognate-set labels into numeric ids under 'cogid'.
wl.renumber('cog')

lex = LexStat(wl)
lex.get_scorer(runs=10000)

# Cluster at thresholds 0.05, 0.10, ..., 0.95 with both SCA and LexStat,
# storing each partition under a threshold-tagged reference column.
for step in range(1, 20):
    threshold = step * 0.05
    label = '{0}'.format(int(threshold * 100 + 0.5))
    for algo, prefix in (('sca', 'sca_'), ('lexstat', 'ls_')):
        lex.cluster(method=algo,
                    threshold=threshold,
                    ref=prefix + label,
                    restricted_chars='')