Example #1
0
    def test_align(self):
        """Alignments built via a custom reference column must agree with
        alignments recomputed directly through lingpy's aligners."""
        self.alm.add_entries('cugid', self.alm._ref, lambda x: text_type(x))
        self.alm.add_alignments(ref="cugid")

        # Run the standard alignment against the alternative reference column.
        self.alm.align(ref="cugid", alignment="alignment2")
        assert (self.alm.msa["cugid"]["1"]["ID"]
                == self.alm.msa["cogid"][1]["ID"])

        # Re-align every cognate set by hand and compare with the stored MSA.
        for cogid, msa_data in self.alm.msa['cogid'].items():
            # Stored SCA alignment vs. a fresh progressive alignment.
            stored = lp.SCA(msa_data)
            recomputed = lp.Multiple(msa_data['seqs'])
            recomputed.prog_align()
            assert stored == recomputed

            # Library alignment from tokens vs. from joined strings
            # with vowel merging disabled.
            from_tokens = lp.Multiple(
                [self.alm[idx, 'tokensb'] for idx in msa_data['ID']])
            from_strings = lp.Multiple([''.join(s) for s in msa_data['seqs']],
                                       merge_vowels=False)
            from_tokens.lib_align()
            from_strings.lib_align()
            assert from_tokens == from_strings
Example #2
0
    def test_align(self):
        """Default alignments must be reproducible by running lingpy's
        Multiple aligner manually on each cognate set."""
        # Align everything with the standard parameters.
        self.alm.align()

        # Re-align each cognate set manually and compare.
        for entry in self.alm.msa['cogid'].values():
            # Stored SCA alignment vs. a fresh progressive alignment.
            stored = lp.SCA(entry)
            fresh = lp.Multiple(entry['seqs'])
            fresh.prog_align()
            assert stored == fresh

            # Library alignment from tokens vs. from joined strings
            # with vowel merging disabled.
            token_msa = lp.Multiple(
                [self.alm[idx, 'tokensb'] for idx in entry['ID']])
            string_msa = lp.Multiple([''.join(s) for s in entry['seqs']],
                                     merge_vowels=False)
            token_msa.lib_align()
            string_msa.lib_align()
            assert token_msa == string_msa
    def cmd_makecldf(self, args):
        """Build the CLDF dataset from the raw TULED sources.

        Steps: load the bibliography, register concepts and languages,
        align all partial cognate sets, validate every wordlist row
        (count consistency, BIPA sounds, known concept/language), write
        forms and cognate judgements, and dump all collected problems to
        ``errors.md``.

        ``args`` is the cldfbench command context (provides ``writer``,
        ``log`` and ``clts``) — assumed from usage, confirm against the
        cldfbench documentation.
        """
        from pybtex import errors, database
        # Be lenient with malformed BibTeX entries instead of raising.
        errors.strict = False
        bibdata = database.parse_file(
            str(self.raw_dir.joinpath('bibliography', 'sources.bib')))
        args.writer.add_sources(bibdata)
        # Segments: one or more space-separated non-whitespace tokens.
        args.writer["FormTable", "Segments"].datatype = Datatype.fromvalue({
            "base":
            "string",
            "format":
            "([\\S]+)( [\\S]+)*"
        })
        args.writer["FormTable", "Morphemes"].separator = " "
        args.writer["FormTable", "PartialCognates"].separator = " "

        concepts = {}  # concept gloss -> concept ID
        # NOTE(review): this rebinds ``errors`` — the pybtex module imported
        # above — to a set of error-report strings. The module is no longer
        # needed at this point, but the shadowing is easy to misread.
        errors, blacklist = set(), set()
        for concept in self.conceptlists[0].concepts.values():
            # Concept IDs combine the list number with a slug of the gloss.
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Portuguese_Gloss=concept.attributes["portuguese"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                EOL_ID=concept.attributes["eol"],
                Semantic_Field=concept.attributes["semantic_field"])
            concepts[concept.english] = idx

        languages = {}  # language name -> language ID
        sources = {}  # language name -> list of BibTeX keys
        for row in self.languages:
            # Reject rows with out-of-range coordinates up front.
            if not -90 < float(row['Latitude']) < 90:
                errors.add('LATITUDE {0}'.format(row['Name']))
            elif not -180 < float(row['Longitude']) < 180:
                errors.add('LONGITUDE {0}'.format(row['Name']))
            else:
                try:
                    args.writer.add_language(
                        ID=row['ID'],
                        Name=row['Name'],
                        SubGroup=row['SubGroup'],
                        Latitude=row['Latitude'],
                        Longitude=row['Longitude'],
                        Glottocode=row['Glottocode']
                        if row['Glottocode'] != '???' else None,
                    )
                    languages[row['Name']] = row['ID']
                    sources[row['Name']] = []
                    # Keep only sources that actually occur in the
                    # bibliography; report missing keys.
                    for source in row['Sources'].split(','):
                        if source in bibdata.entries:
                            sources[row['Name']] += [source]
                        else:
                            errors.add('BIBTEX MISSING {0}'.format(source))
                except ValueError:
                    errors.add('LANGUAGE ID {0}'.format(row['ID'], ))
                    args.log.warn('Invalid Language ID {0}'.format(row['ID']))

        wl = lingpy.Wordlist(self.raw_dir.joinpath('tuled.tsv').as_posix())
        # Etymological dict: partial cognate ID -> rows containing it.
        etd = wl.get_etymdict(ref='cogids')
        alignments, problems = {}, set()
        for cogid, vals in progressbar(etd.items(), desc='aligning data'):
            idxs = []
            for idx in vals:
                if idx:
                    idxs += idx
            # Position of this cogid within each row's cogids list —
            # selects which morpheme of the row belongs to this set.
            positions = [wl[idx, 'cogids'].index(cogid) for idx in idxs]
            alms, new_idxs = [], []
            for idx, pos in zip(idxs, positions):
                try:
                    # Morpheme ``pos`` of the row's segmented tokens.
                    tks = lingpy.basictypes.lists(wl[idx, 'tokens']).n[pos]
                    # Treat an all-whitespace morpheme like a missing one.
                    if not ' '.join(tks).strip():
                        raise IndexError
                    alms += [tks]
                    new_idxs += [(idx, pos)]
                except IndexError:
                    problems.add((idx, pos))
            if alms:
                # Progressive multiple alignment of the cognate set.
                msa = lingpy.Multiple(alms)
                msa.prog_align()
                for i, alm in enumerate(msa.alm_matrix):
                    alignments[new_idxs[i][0], new_idxs[i][1],
                               cogid] = ' '.join(alm)
            else:
                errors.add('ALIGNMENT empty {0}'.format(cogid))

        bipa = CLTS(args.clts.dir).bipa
        # Validate rows: morpheme/gloss/cognate/alignment counts must all
        # agree, and every token must be a known BIPA sound; offending
        # rows are blacklisted and not written out below.
        for idx, tokens, glosses, cogids, alignment in wl.iter_rows(
                'tokens', 'morphemes', 'cogids', 'alignment'):
            tl, gl, cl, al = (len(lingpy.basictypes.lists(tokens).n),
                              len(glosses), len(cogids),
                              len(lingpy.basictypes.lists(alignment).n))
            if tl != gl or tl != cl or gl != cl or al != gl or al != cl:
                errors.add('LENGTH: {0} {1} {2}'.format(
                    idx, wl[idx, 'language'], wl[idx, 'concept']))
                blacklist.add(idx)
            for token in tokens:
                if bipa[token].type == 'unknownsound':
                    errors.add('SOUND: {0}'.format(token))
                    blacklist.add(idx)

        visited = set()
        for idx in wl:
            # Report unknown concepts/languages only once each; write
            # forms and cognates for the rest.
            if wl[idx, 'concept'] not in concepts:
                if wl[idx, 'concept'] not in visited:
                    args.log.warn('Missing concept {0}'.format(wl[idx,
                                                                  'concept']))
                    visited.add(wl[idx, 'concept'])
                    errors.add('CONCEPT {0}'.format(wl[idx, 'concept']))
            elif wl[idx, 'doculect'] not in languages:
                if wl[idx, 'doculect'] not in visited:
                    args.log.warn("Missing language {0}".format(
                        wl[idx, 'doculect']))
                    visited.add(wl[idx, 'doculect'])
                    errors.add('LANGUAGE {0}'.format(wl[idx, 'doculect']))
            else:
                # Skip rows with empty segments or earlier validation errors.
                if ''.join(wl[idx, 'tokens']).strip() and idx not in blacklist:
                    lex = args.writer.add_form_with_segments(
                        Language_ID=languages[wl[idx, 'doculect']],
                        Parameter_ID=concepts[wl[idx, 'concept']],
                        Value=wl[idx, 'value'] or ''.join(wl[idx, 'tokens']),
                        Form=wl[idx, 'form'] or ''.join(wl[idx, 'tokens']),
                        Segments=wl[idx, 'tokens'],
                        Morphemes=wl[idx, 'morphemes'],
                        SimpleCognate=wl[idx, 'cogid'],
                        PartialCognates=wl[idx, 'cogids'],
                        Source=sources[wl[idx, 'doculect']],
                    )
                    # One cognate judgement per morpheme slice; slices are
                    # 1-based in CLDF.
                    for gloss_index, cogid in enumerate(wl[idx, 'cogids']):
                        args.writer.add_cognate(lexeme=lex,
                                                Cognateset_ID=cogid,
                                                Segment_Slice=gloss_index + 1,
                                                Alignment=alignments.get(
                                                    (idx, gloss_index, cogid),
                                                    ''),
                                                Alignment_Method='SCA')
                else:
                    args.log.warn(
                        'Entry ID={0}, concept={1}, language={2} is empty'.
                        format(idx, wl[idx, 'concept'], wl[idx, 'doculect']))

        # Dump every collected problem to a Markdown report.
        with open(self.dir.joinpath('errors.md'), 'w', encoding="utf-8") as f:
            f.write('# Error Analysis for TULED\n')
            for error in sorted(errors):
                f.write('* ' + error + '\n')
Example #4
0
# Map language ID (first CSV column) -> language name from the CLDF table.
name = {}
with open('cldf/languages.csv') as fin:
    reader = csv.reader(fin)
    for i, row in enumerate(reader):
        if i == 0: continue  # skip the header row
        name[row[0]] = row[1]

res = defaultdict(lambda: defaultdict(int))
# NOTE(review): ``forms`` is defined outside this snippet — presumably a
# mapping of form key -> list of (something, transcription) pairs; verify
# against the code that builds it. Output is appended to test.out.
with open('test.out', 'a') as fout:
    for _, form in enumerate(forms):
        print(_)
        # Only inspect forms whose first variant contains the cluster 'kṣ'.
        if 'kṣ' not in forms[form][0][1]: continue
        try:
            # A single variant cannot be aligned against anything.
            if len(forms[form]) == 1: continue
            # forms[i].sort()
            # Progressive multiple alignment of all variants of this form.
            m = lingpy.Multiple([x[1] for x in forms[form]])
            m.prog_align()
            # Pad each aligned row with '#' boundary markers.
            strs = [['#'] + list(x.split()) + ['#']
                    for x in str(m).split('\n')]
            print(strs)

            # Find a 'k' ... 'ṣ' span in row 0 with only gaps ('-') between;
            # l/r hold the span's column indices, -1 means not found.
            l, r = -1, -1
            for i in range(len(strs[0])):
                if strs[0][i] == 'k':
                    l = i
                    for j in range(i + 1, len(strs[0])):
                        if strs[0][j] == 'ṣ':
                            r = j
                            break
                        elif strs[0][j] != '-':
                            l = -1
                            # NOTE(review): snippet is truncated here in the
                            # source under review; the loop continues beyond
                            # this point.
Example #5
0
import lingpy as lp
from lingpy.algorithm import squareform

# Demo 1: progressive multiple alignment of three cognate words
# ('woman/queen' in Norwegian, Norwegian/Danish, English).
seqs = ["kona", "kvinne", "queen"]
msa = lp.Multiple(seqs)
msa.prog_align()

print(msa)

# Demo 2: neighbor-joining tree from a pairwise distance matrix.
languages = ['Norwegian', 'Swedish', 'Icelandic', 'Dutch', 'English']

# Condensed upper-triangle distances (10 values for 5 taxa) expanded
# into a full square matrix.
distances = squareform([0.5, 0.67, 0.8, 0.2, 0.4, 0.7, 0.6, 0.8, 0.8, 0.3])

# Build the tree in Newick form, then pretty-print it as ASCII art.
tree = lp.neighbor(distances, languages)
print(tree)
tree = lp.Tree(tree)
print(tree.asciiArt())