Example #1
def test_renumber(self):
    from lingpy.basic.ops import renumber  # functional counterpart of the method used below
    tmp = Wordlist(test_data('good_file.tsv'))
    tmp.renumber('cogid', 'newcogid')
    assert 'newcogid' in tmp.header
    # without an explicit target, the new column is named '<source>id'
    tmp.renumber('mock')
    assert 'mockid' in tmp.header
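For readers new to lingpy, here is a minimal self-contained sketch (my own, not from the indexed project) of the dict input format the tests above rely on: key 0 holds the header row, positive integer keys hold the records, and renumber maps an existing column onto consecutive integer IDs.

from lingpy import Wordlist

wl = Wordlist({
    0: ['doculect', 'concept', 'ipa', 'cogid'],  # key 0 is the header
    1: ['German', 'hand', 'hant', 'A'],
    2: ['English', 'hand', 'hænd', 'A'],
    3: ['Russian', 'hand', 'ruka', 'B'],
})
wl.renumber('cogid', 'numid')  # 'A'/'B' labels become integer IDs
assert all(isinstance(wl[k, 'numid'], int) for k in wl)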
Example #2
def main():
    debug = 'debug' in argv or '--debug' in argv
    if 'pinyin' in argv:
        py = sinopy.pinyin(argv[argv.index('pinyin') + 1])
        print(py)
    if 'profile' in argv:
        if '--cldf' in argv:
            wl = Wordlist.from_cldf(argv[argv.index('profile') + 1],
                                    col='language_id',
                                    row='parameter_id')
            wl.add_entries('doculect', 'language_name', lambda x: x)
        else:
            wl = Wordlist(argv[argv.index('profile') + 1])
        column = 'ipa'
        language = None
        filename = 'orthography.tsv'
        if '--column' in argv:
            column = argv[argv.index('--column') + 1]
        if '--language' in argv:
            language = argv[argv.index('--language') + 1]
        if '-l' in argv:
            language = argv[argv.index('-l') + 1]
        if '-o' in argv:
            filename = argv[argv.index('-o') + 1]
        if '--filename' in argv:
            filename = argv[argv.index('--filename') + 1]

        segments.write_structure_profile(wl,
                                         column=column,
                                         filename=filename,
                                         debug=debug,
                                         language=language)
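The manual argv scanning above breaks with an IndexError when a flag is given without its value. As an alternative, here is a hedged sketch of the same command-line surface using the standard-library argparse (flag names are taken from the example; the sinopy/segments calls themselves stay as above):

import argparse

def parse_args(argv=None):
    # mirrors the options consumed by main() above
    parser = argparse.ArgumentParser()
    parser.add_argument('command', choices=['pinyin', 'profile'])
    parser.add_argument('target')  # character string, or wordlist/CLDF path
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--cldf', action='store_true')
    parser.add_argument('--column', default='ipa')
    parser.add_argument('-l', '--language', default=None)
    parser.add_argument('-o', '--filename', default='orthography.tsv')
    return parser.parse_args(argv)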
Example #3
def prepare(ds):
    errs = 0
    wl = Wordlist(ds.raw('bds.tsv'))
    W = {}
    for k in wl:
        value = wl[k, 'value']
        tokens = wl[k, 'tokens']
        doc = wl[k, 'doculect']
        if value:
            morphemes = []
            for a, b in _get_slices(tokens):
                ipa = ''.join(tokens[a:b])
                morphemes += [ipa]
            ipa = ' '.join(morphemes)

            clpa = ds.transform(ipa, 'CLPA')
            struc = ds.transform(ipa, 'Structure')
            try:
                assert len(clpa.split(' ')) == len(struc.split(' '))
            except AssertionError:
                errs += 1
                print(errs, clpa, struc)
            if '«' in clpa:
                errs += 1
                print(errs, ipa, clpa, struc)
            W[k] = [
                doc, wl[k, 'concept'], wl[k, 'concepticon_id'], value, clpa,
                struc, wl[k, 'partial_ids']
            ]
    W[0] = [
        'doculect', 'concept', 'concepticon_id', 'value', 'segments',
        'structure', 'cogids'
    ]
    ds.write_wordlist(Wordlist(W))
Example #4
def get_lexibase(path,
                 name,
                 columns=None,
                 preprocessing=None,
                 namespace=None,
                 lexibase=False):

    wordlist = Wordlist.from_cldf(path,
                                  columns=columns
                                  or ("language_id", "concept_name", "value",
                                      "form", "segments", "comment"),
                                  namespace=namespace or {
                                      "language_id": "doculect",
                                      "concept_name": "concept",
                                      "value": "value",
                                      "form": "form",
                                      "segments": "tokens",
                                      "comment": "note",
                                  })

    if preprocessing:
        D = preprocessing(wordlist)
    else:
        D = {idx: wordlist[idx] for idx in wordlist}
        D[0] = wordlist.columns

    if not lexibase:
        Wordlist(D).output("tsv", filename=name, ignore="all", prettify=False)
    else:
        lex = LexiBase(D, dbase=name + ".sqlite3")
        lex.create(name)
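A sketch of how get_lexibase might be invoked; the metadata path and output name are placeholders, and LexiBase is the external dependency used by the original project, so the lexibase=True branch only works where that package is installed.

# write mydata.tsv from a CLDF dataset, using the default column mapping
get_lexibase('cldf/cldf-metadata.json', 'mydata')

# or hook in a preprocessing step that rewrites the row dictionary
def keep_all(wordlist):
    D = {idx: wordlist[idx] for idx in wordlist}
    D[0] = wordlist.columns
    return D

get_lexibase('cldf/cldf-metadata.json', 'mydata', preprocessing=keep_all)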
Example #5
def test_wordlist2cognates(repos, mocker):
    @attr.s
    class Lexeme(dataset.Lexeme):
        Concept = attr.ib(default=None)
        Segments = attr.ib(default=[])

    @attr.s
    class Lexeme2(dataset.Lexeme):
        Concept = attr.ib(default=None)

    dsdir = repos / 'datasets' / 'test_dataset'
    if not dsdir.joinpath('cldf').exists():
        dsdir.joinpath('cldf').mkdir()
    ds = Dataset(mocker.Mock(
        lexeme_class=Lexeme,
        cognate_class=dataset.Cognate,
        language_class=dataset.Language,
        concept_class=dataset.Concept,
        split_forms=lambda _, s: [s],
        tokenize=lambda _, x: [],
        dir=dsdir,
        tr_analyses={},
        cldf_dir=dsdir.joinpath('cldf')))
    ds2 = Dataset(mocker.Mock(
        lexeme_class=Lexeme2,
        cognate_class=dataset.Cognate,
        language_class=dataset.Language,
        concept_class=dataset.Concept,
        split_forms=lambda _, s: [s],
        dir=dsdir,
        tr_analyses={},
        cldf_dir=dsdir.joinpath('cldf')))
    ds2.add_form_with_segments(
        Value='form,form2',
        Concept='meaning',
        Language_ID='1',
        Parameter_ID='p',
        Form='form',
        Segments=['f', 'o'])
    # needs to be fixed XXX
    ds2.tokenize = lambda _, x: [x]
    ds2.add_form(
        Value='form,form2',
        Concept='meaning',
        Language_ID='1',
        Parameter_ID='p',
        Form='form')
    # needs to be fixed XXX
    ds.tokenize = lambda _, x: []
    ds.add_forms_from_value(
        Value='form,form2',
        Concept='meaning',
        Language_ID='1',
        Parameter_ID='p')
    # lid, ipa, concept
    wl = Wordlist(lingpy_util._cldf2wld(ds2), row='concept', col='language_id')
    res = list(lingpy_util.wordlist2cognates(wl, 'src'))
    assert isinstance(res[0], dict)
Example #6
def test_med(test_data):
    wl = Wordlist(str(test_data / 'KSL.qlc'))

    assert med(wl, gold='gloss', test='gloss',
               classes=False) == pytest.approx(0.0)
    assert med(wl, gold='tokens', test='tokens',
               classes=True) == pytest.approx(0.0)
Example #7
def run(args):
    bipa = args.clts.api.bipa
    func = profile.simple_profile
    cols = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']
    kw = {'ref': 'form', 'clts': bipa}
    if args.context:
        func = profile.context_profile
        cols = [
            'Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence',
            'Codepoints'
        ]
        kw['col'] = 'language_id'

    ds = get_dataset(args)
    profile_path = ds.etc_dir / 'orthography.tsv'
    if profile_path.exists() and not args.force:
        raise ParserError(
            'Orthography profile exists already. To overwrite, pass "-f" flag')

    header, D = [], {}
    for i, row in enumerate(ds.cldf_reader()['FormTable'], start=1):
        if i == 1:
            header = [f for f in row.keys() if f != 'ID']
            D = {0: ['lid'] + [h.lower() for h in header]}

        row['Segments'] = ' '.join(row['Segments'])
        D[i] = [row['ID']] + [row[h] for h in header]

    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(cols)
        for row in func(Wordlist(D, row='parameter_id', col='language_id'),
                        **kw):
            writer.writerow(row)
    args.log.info('Orthography profile written to {0}'.format(profile_path))
Example #8
    def test_clean_taxnames(self):
        tmp = Wordlist({
            0: ['doculect', 'concept', 'counterpart'],
            1: ['l1', 'hand', 'hand'],
            2: ['l2 - a (taxon) name)', 'hand', 'hand']
        })

        clean_taxnames(tmp)
        assert tmp.cols[-1] == 'l2___a_taxon_name'
Example #9
    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        concepts, wl_concepts = {}, {}
        visited = set()
        for concept in self.concepts:
            cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
            if cid not in visited:
                visited.add(cid)
                args.writer.add_concept(
                    ID=cid,
                    Name=concept['ENGLISH'],
                    Glosses_in_Source=concept['GLOSSES_IN_SOURCE'],
                    Concepticon_ID=concept['CONCEPTICON_ID'],
                    Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
                for gloss in concept['GLOSSES_IN_SOURCE'].split(' // '):
                    concepts[gloss] = cid
                    wl_concepts[gloss] = concept['ENGLISH']

        languages = args.writer.add_languages(lookup_factory="Name_in_Source")
        args.writer.add_sources()

        # make a wordlist for edictor to inspect the data
        D = {0: ['doculect', 'concept', 'ipa', 'cogid']}
        idx = 1

        for i, row in progressbar(
                enumerate(
                    self.raw_dir.read_csv('data.tsv',
                                          delimiter='\t',
                                          dicts=True))):
            for language, lid in languages.items():
                form = row[language].strip()
                if form:
                    lexemes = args.writer.add_forms_from_value(
                        Language_ID=lid,
                        Parameter_ID=concepts[row['Meaning']],
                        Value=form,
                        Source='Holm2017')
                    if lexemes:
                        args.writer.add_cognate(
                            lexeme=lexemes[0],
                            Cognateset_ID=str(i + 1),
                            Cognate_Detection_Method='expert',
                            Source='Holm2017')
                        D[idx] = [
                            language, wl_concepts[row['Meaning']], form, i + 1
                        ]
                        idx += 1
        Wordlist(D).output(
            'tsv', filename=self.raw_dir.joinpath('wordlist').as_posix())
Example #11
def test_wordlist2cognates(repos, mocker, dataset):
    @attr.s
    class Lexeme(pbds.Lexeme):
        Concept = attr.ib(default=None)
        Segments = attr.ib(default=[])

    @attr.s
    class Lexeme2(pbds.Lexeme):
        Concept = attr.ib(default=None)

    dsdir = repos / 'datasets' / 'test_dataset'
    if not dsdir.joinpath('cldf').exists():
        dsdir.joinpath('cldf').mkdir()
    dataset.cognate_class = pbds.Cognate
    dataset.language_class = pbds.Language
    dataset.concept_class = pbds.Concept
    dataset.split_forms = lambda _, s: [s]
    dataset.dir = dsdir
    dataset.tr_analyses = {}
    dataset.cldf_dir = dsdir.joinpath('cldf')

    dataset.lexeme_class = Lexeme
    with dataset.cldf_writer(mocker.Mock()) as ds:
        # needs to be fixed XXX
        ds.tokenize = lambda _, x: []
        ds.add_forms_from_value(Value='form,form2',
                                Concept='meaning',
                                Language_ID='1',
                                Parameter_ID='p')

    dataset.lexeme_class = Lexeme2
    with dataset.cldf_writer(
            mocker.Mock(clts=mocker.Mock(api=pyclts.CLTS(repos)))) as ds2:
        ds2.add_form_with_segments(Value='form,form2',
                                   Concept='meaning',
                                   Language_ID='1',
                                   Parameter_ID='p',
                                   Form='form',
                                   Segments=['f', 'o'])
        # needs to be fixed XXX
        ds2.tokenize = lambda _, x: [x]
        ds2.add_form(
            Value='form,form2',
            Concept='meaning',
            Language_ID='1',
            Parameter_ID='p',
            Form='form',
        )
    # lid, ipa, concept
    wl = Wordlist(lingpy_util._cldf2wld(ds2), row='concept', col='language_id')
    res = list(lingpy_util.wordlist2cognates(wl, 'src'))
    assert isinstance(res[0], dict)
Example #12
    def test_wl2dst(self):
        res = wl2dst(self.wordlist, mode='jaccard')
        self.assertIsInstance(res, list)
        res = wl2dst(self.wordlist, mode='jaccard', refB='glossid')
        self.assertIsInstance(res, list)

        _ = wl2dst(self.wordlist, mode='swadesh')
        _ = wl2dst(self.wordlist, mode='shared')
        _ = wl2dst(self.wordlist, mode='swadesh', ignore_missing=True)

        # trigger zero-division-warning in wl2dst
        tmp = Wordlist({
            0: ['doculect', 'concept', 'counterpart', 'cogid'],
            1: ['l1', 'hand', 'hand', '1'],
            2: ['l2 - a (taxon) name)', 'hand', 'hand', '2'],
            3: ['l3', 'foot', 'foot', '3']
        })
        dst = wl2dst(tmp)
        assert dst[0][2] == 1
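For orientation: wl2dst returns a square matrix as a list of lists, one row and column per doculect in the order of wl.cols. A small sketch (assuming lingpy's lingpy.basic.ops module, as imported in other examples here):

from lingpy import Wordlist
from lingpy.basic.ops import wl2dst

tmp = Wordlist({
    0: ['doculect', 'concept', 'counterpart', 'cogid'],
    1: ['l1', 'hand', 'hand', '1'],
    2: ['l2', 'hand', 'hand', '2'],
    3: ['l3', 'foot', 'foot', '3'],
})
dst = wl2dst(tmp)
assert len(dst) == len(tmp.cols)  # one row per doculect
assert dst[0][0] == 0             # zero distance to itself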
Example #13
def prepare(dataset):
    concepts = {
        x.english: x.concepticon_id
        for x in Concepticon().conceptlists['Wang-2004-100a'].concepts.values()
    }

    # correct wrong pinyins in sinopy
    pinyin = {
        "虱": "shī",
        "咯": "gē",
        "強": "qiáng",
        "哩": "lǐ",
        "喏": "nuò",
        "鳧": "fú",
        "伲": "nǐ",
        "黃": "huáng",
        "哋": "dì",
        "阿": "ā",
        "卵": "luǎn",
        "說": "shuō",
        "喙": "huì",
        "頸": "jǐng",
        "唔": "wú",
        "雞": "jī",
        "黒": "hēi",
        "哪": "nǎ",
        "麼": "me",
        "蔃": "qiáng",
        "葷": "hūn",
        "鳥": "niǎo",
        "舌": "huà",
        "吃": "chī",
        "膘": "biǎo",
        "綠": "lǜ",
        "羽": "yǔ",
        "們": "men",
        "焦": "jiāo",
        "腳": "jiǎo",
        "乜": "miē",
        "即": "jí",
        "佬": "lǎo",
    }

    with UnicodeReader(dataset.get_path('raw', 'Wang2004.csv'),
                       delimiter='\t') as reader:
        lines = list(reader)
    D = {}
    idx = 1
    cogids = {0: 0}
    for line in lines[1:]:
        concept = line[0]
        cid = concepts[concept]
        for t, cogs in zip(lines[0][1:], line[1:]):
            taxon = varieties_in_source[t]
            for cog in cogs.split('/'):
                if cog in cogids:
                    cogid = cogids[cog]
                else:
                    cogid = max(list(cogids.values()) or [0]) + 1
                    cogids[cog] = cogid
                D[idx] = [taxon, t, concept, cid, cog, cogid]
                idx += 1
    D[0] = [
        'doculect', 'doculect_in_source', 'concept', 'concepticon_id', 'value',
        'cogid'
    ]
    wl = Wordlist(D)

    # renumber for partial cognates
    pcogs, idx = {}, 1
    converter = {}
    for k in wl:
        chars = sinopy.gbk2big5(wl[k, 'value'])
        concept = wl[k, 'concept']
        cogids = []
        for char in chars:
            if sinopy.is_chinese(char):
                if char not in pcogs:
                    pcogs[char] = idx
                    idx += 1
                cchar = concept + ':' + str(pcogs[char])
                if cchar not in pcogs:
                    pcogs[cchar] = pcogs[char]
            else:
                cchar = concept + ':' + char
                if cchar not in pcogs:
                    pcogs[cchar] = idx
                    idx += 1
            cogids += [pcogs[cchar]]
        converter[k] = ' '.join([str(x) for x in cogids])
    wl.add_entries('cogids', converter, lambda x: x)
    wl.output('tsv',
              filename=dataset.get_path('words'),
              prettify=False,
              ignore='all')

    # we also write the characters
    C = [[
        'ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
        'DOCULECT', 'POSITION'
    ]]
    idx = 1
    errors = {}
    for k in wl:
        concept = wl[k, 'concept']
        doculect = wl[k, 'doculect']
        chars = sinopy.gbk2big5(wl[k, 'value'])
        cogids = wl[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, cogids)):
            if sinopy.is_chinese(char):
                py = sinopy.pinyin(char)
                py = pinyin.get(char, py)
                if '?' in py or '{' in py:
                    errors.setdefault(char, py)
                C += [[idx, char, py, cogid, k, concept, doculect, i]]
                idx += 1
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    with open(dataset.get_path('characters.tsv'), 'w', encoding='utf-8') as f:
        for line in C:
            f.write('\t'.join([str(x) for x in line]) + '\n')

    # prepare the trees
    for tree in ('tree-100.tre', 'tree-95.tre'):
        with open(dataset.get_path('raw', tree)) as f1:
            with open(dataset.get_path('trees', tree), 'w') as f2:
                f2.write(''.join(
                    [varieties_in_source.get(x, x) for x in f1.read()]))
Example #14
import pytest

@pytest.fixture
def wordlist(test_data):
    return Wordlist(str(test_data / 'KSL.qlc'))
Example #15
def test_renumber(self):
    tmp = Wordlist(test_data('good_file.tsv'))
    tmp.renumber('cogid', 'newcogid')
    assert 'newcogid' in tmp.header
    tmp.renumber('mock')
    assert 'mockid' in tmp.header
Example #16
def setUp(self):
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('KSL.qlc'))
    self.wordlist2 = Wordlist(test_data('KSL4.qlc'))
Example #18
class TestWordlist(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        assert sum([self.wordlist.distances[x][x] for x in
                    range(self.wordlist.width)]) == 0

        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_coverage(self):
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        gerL = self.wordlist.get_list(doculect='German', entry='ipa', flat=True)
        gerD = self.wordlist.get_dict(col='German', entry='ipa')
        gerT = self.wordlist.get_list(doculect='German', entry="ipa")

        assert sorted(gerL) == sorted([v[0] for v in gerD.values()])
        assert sorted(gerT) == sorted(gerL)

        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        hand3 = self.wordlist.get_list(concept="hand", flat=True)
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])

        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(concept='hand', entry="language",
                flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(self.wordlist2.get_list(col="l1", flat=True, entry="concept")) == 2

        assert_raises(ValueError, self.wordlist2.get_list, col="l1",
                row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row" : "Hand"})

    def test_get_dict(self):
        gerD = self.wordlist.get_dict(col='German')

        assert sorted(gerD.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row" : "Hand"})

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid', entry='ipa', modify_ref=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                modify_ref=abs)

        assert len(etd1) > len(etd2) and len(set([abs(x) for x in etd1])) == \
                                         len(etd2)
        assert len([x for x in etd2 if x < 0]) == 0

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=False)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)

        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'tsv taxa tre dst starling paps.nex paps.csv separated multistate.nex groups'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(
                    fmt, filename=fn,
                    cols=sorted(self.wordlist.header)[:2],
                    rows=dict(ID=" > 10"), **kw)

    def test_export(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
Example #19
def setUp(self):
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('KSL.qlc'))
    self.wordlist2 = Wordlist(test_data('good_file.tsv'))
Example #20
@pytest.fixture
def wordlist2(test_data):
    return Wordlist(str(test_data / 'good_file.tsv'))
Example #21
class TestWordlist(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        assert sum([
            self.wordlist.distances[x][x] for x in range(self.wordlist.width)
        ]) == 0

        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_coverage(self):
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        ger_l = self.wordlist.get_list(doculect='German',
                                       entry='ipa',
                                       flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        ger_t = self.wordlist.get_list(doculect='German', entry="ipa")

        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])
        assert sorted(ger_t) == sorted(ger_l)

        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])

        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(concept='hand',
                                       entry="language",
                                       flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(
            self.wordlist2.get_list(col="l1", flat=True, entry="concept")) == 2

        assert_raises(ValueError,
                      self.wordlist2.get_list,
                      col="l1",
                      row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        ger_d = self.wordlist.get_dict(col='German')

        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid',
                                          entry='ipa',
                                          modify_ref=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid',
                                          entry='ipa',
                                          modify_ref=abs)

        assert (len(etd1) > len(etd2)
                and len(set([abs(x) for x in etd1])) == len(etd2))
        assert len([x for x in etd2 if x < 0]) == 0

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          modify_ref=False)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)

        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'tsv taxa tre dst starling paps.nex paps.csv ' \
                   'separated multistate.nex groups'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)

            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(fmt,
                                     filename=fn,
                                     cols=sorted(self.wordlist.header)[:2],
                                     rows=dict(ID=" > 10"),
                                     **kw)

    def test_export(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
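As a reading aid for the etymdict assertions above, here is a minimal sketch (assuming lingpy) of the structure get_etymdict returns: a mapping from cognate-set ID to a list with one slot per doculect, where each slot holds the matching entries or 0 for a gap.

from lingpy import Wordlist

wl = Wordlist({
    0: ['doculect', 'concept', 'ipa', 'cogid'],
    1: ['German', 'hand', 'hant', 1],
    2: ['English', 'hand', 'hænd', 1],
    3: ['Russian', 'hand', 'ruka', 2],
})
etd = wl.get_etymdict(ref='cogid', entry='ipa')
assert len(etd[1]) == len(wl.cols)  # one slot per doculect
assert sorted(x[0] for x in etd[1] if x) == ['hant', 'hænd']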
Example #22
class TestWordlist(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist,'distances')
        assert sum([self.wordlist.distances[x][x] for x in
            range(self.wordlist.width)]) == 0

        self.wordlist.calculate('tree')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist,'groups')
        assert type(self.wordlist.groups) == dict

    def test_get_list(self):
        gerL = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        gerD = self.wordlist.get_dict(col='German',entry='ipa')

        assert sorted(gerL) == sorted([v[0] for v in gerD.values()])

    def test_get_dict(self):
        gerD = self.wordlist.get_dict(col='German')

        assert sorted(gerD.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        self.wordlist.renumber('cogid','dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                loans=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                loans=True)

        assert len(etd1) > len(etd2) and len(set([abs(x) for x in etd1])) == \
                len(etd2)
        assert len([x for x in etd2 if x < 0]) == 0

        # make "fuzzy" cognate sets
        self.wordlist.add_entries(
                'fuzzyid',
                'cogid',
                lambda x: [x]
                )

        etd3 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                loans=False, fuzzy=True)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                loans=True, fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)
        
        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False
    
    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
Example #23
    def cmd_makecldf(self, args):
        args.writer.add_sources()

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
        languages = args.writer.add_languages(lookup_factory="Name")

        # we combine with the manually edited wordlist to retrieve the lexeme
        # values
        wl = Wordlist(self.raw_dir.joinpath('deepadungpalaung.tsv').as_posix())
        mapper = {
            (concept, language, normalize("NFD", form)): segments
            for (idx, concept, language, form, segments
                 ) in wl.iter_rows('concept', 'doculect', 'form', 'tokens')
        }
        data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
        for i, row in progressbar(enumerate(data[4:])):
            number = row[0].strip().strip('.')
            concept = row[1].strip()
            for j in range(0, len(row) - 2, 2):
                language = data[2][j + 2]
                value = row[j + 2]
                if value.strip() and value.strip() not in ['-----']:
                    if ',' in row[j + 2]:
                        forms = [v.strip() for v in value.split(',')]
                        cogids = [
                            str(int(float(x)))
                            for x in row[j + 3].split(' or ')
                        ]
                    else:
                        forms = [value.strip()]
                        cogids = [str(int(float(row[j + 3].split(' or ')[0])))]

                    for form, cogid in zip(forms, cogids):
                        try:
                            segments = mapper[concept, languages[language],
                                              form]
                            lexeme = args.writer.add_form_with_segments(
                                Parameter_ID=concepts[number],
                                Language_ID=languages[language],
                                Value=value.strip(),
                                Form=form,
                                Segments=segments,
                                Source="Deepadung2015")
                        except KeyError:
                            args.log.warning(
                                'lexeme missing {0} / {1} / {2}'.format(
                                    concept, language, form))
                            lexeme = args.writer.add_form(
                                Parameter_ID=concepts[number],
                                Language_ID=languages[language],
                                Value=value.strip(),
                                Form=form,
                                Source="Deepadung2015")
                        args.writer.add_cognate(
                            lexeme=lexeme,
                            Cognateset_ID=cogid + '-' + number,
                            Source="Deepadung2015")
Example #24
def setUp(self):
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('KSL.qlc'))
Example #26
class TestWordlist(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        assert sum([
            self.wordlist.distances[x][x] for x in range(self.wordlist.width)
        ]) == 0

        self.wordlist.calculate('tree')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_get_list(self):
        gerL = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        gerD = self.wordlist.get_dict(col='German', entry='ipa')

        assert sorted(gerL) == sorted([v[0] for v in gerD.values()])

    def test_get_dict(self):
        gerD = self.wordlist.get_dict(col='German')

        assert sorted(gerD.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid',
                                          entry='ipa',
                                          loans=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa', loans=True)

        assert len(etd1) > len(etd2) and len(set([abs(x) for x in etd1])) == \
                len(etd2)
        assert len([x for x in etd2 if x < 0]) == 0

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          loans=False,
                                          fuzzy=True)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          loans=True,
                                          fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)

        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
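Finally, a compact end-to-end sketch (my own, assuming lingpy and a local copy of the KSL.qlc test file) tying together the calculate/output calls that recur in the test classes above:

from lingpy import Wordlist

wl = Wordlist('KSL.qlc')           # path is a placeholder
wl.calculate('dst')                # pairwise distances -> wl.distances
wl.calculate('tree')               # tree from the distances -> wl.tree
wl.output('dst', filename='ksl')   # writes ksl.dst
wl.output('tre', filename='ksl')   # writes ksl.tre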