Beispiel #1
0
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` using the ``KSL.qlc`` and ``good_file.tsv`` data."""

    def setUp(self):
        """Run the base-class set-up and load both wordlists."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        """``len()`` of the KSL wordlist is 1400."""
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` provides ``distances``, ``tree`` and ``groups``."""
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix must sum to zero.
        assert sum(
            self.wordlist.distances[x][x] for x in range(self.wordlist.width)
        ) == 0

        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_coverage(self):
        """``coverage`` can be called with the default, 'ratio' and 'mean'."""
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        """``get_list`` agrees with ``get_dict`` and rejects bad arguments."""
        ger_l = self.wordlist.get_list(doculect='German',
                                       entry='ipa',
                                       flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        ger_t = self.wordlist.get_list(doculect='German', entry="ipa")

        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])
        assert sorted(ger_t) == sorted(ger_l)

        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])

        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(concept='hand',
                                       entry="language",
                                       flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(
            self.wordlist2.get_list(col="l1", flat=True, entry="concept")) == 2

        # These argument combinations are expected to raise ValueError.
        assert_raises(ValueError,
                      self.wordlist2.get_list,
                      col="l1",
                      row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        """``get_dict`` for a column is keyed by the wordlist's rows."""
        ger_d = self.wordlist.get_dict(col='German')

        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        """``renumber`` keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        """``get_entries`` returns a ``height`` x ``width`` table."""
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        """``get_etymdict`` with and without ``modify_ref=abs``."""
        etd1 = self.wordlist.get_etymdict(ref='cogid',
                                          entry='ipa',
                                          modify_ref=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid',
                                          entry='ipa',
                                          modify_ref=abs)

        # Separate asserts so a failure points at the condition that broke.
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          modify_ref=False)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        """Every etymdict key (as absolute value) is present in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)

        for key in cogs:
            # The offending key is reported in the assertion message.
            assert abs(key) in paps, key

    def test_output(self):
        """``output`` runs for every listed format."""
        fn = text_type(self.tmp_path('test'))
        # An explicit list: the former split of two implicitly concatenated
        # string literals lacked a separating space and produced the bogus
        # format name 'paps.csvseparated'.
        formats = [
            'tsv', 'taxa', 'tre', 'dst', 'starling', 'paps.nex', 'paps.csv',
            'separated', 'multistate.nex', 'groups',
        ]
        for fmt in formats:
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)

            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(fmt,
                                     filename=fn,
                                     cols=sorted(self.wordlist.header)[:2],
                                     rows=dict(ID=" > 10"),
                                     **kw)

    def test_export(self):
        """``export`` runs for the 'txt', 'tex' and 'html' formats."""
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        """CSV files with and without ids load to matching wordlists."""
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
Beispiel #2
0
def prepare(dataset):
    """Build the wordlist, character table and renamed trees for a dataset.

    Steps, in order:

    1. Read the tab-separated ``raw/Wang2004.csv``. The first row holds the
       variety labels (from the second column on); every further row holds a
       concept in the first column and, per variety, ``/``-separated values.
    2. Assign a numeric ``cogid`` to each distinct value and build a
       ``Wordlist`` from the resulting rows.
    3. Derive per-character ids (``cogids``) for each value and write the
       wordlist via ``wl.output('tsv', filename=dataset.get_path('words'))``.
    4. Write one row per Chinese character to ``characters.tsv``.
    5. Copy ``tree-100.tre`` and ``tree-95.tre`` from ``raw`` to ``trees``,
       substituting characters through ``varieties_in_source``.

    Relies on the module-level mapping ``varieties_in_source``.

    :param dataset: object whose ``get_path(*parts)`` yields file paths; no
        other attribute of it is used here.
    """
    concepts = {
        c.english: c.concepticon_id
        for c in Concepticon().conceptlists['Wang-2004-100a'].concepts.values()
    }

    # correct wrong pinyins in sinopy
    # (three entries used to end in a stray "}" -- "wú}", "niǎo}", "biǎo}" --
    # which would have been written verbatim to characters.tsv)
    pinyin = {
        "虱": "shī",
        "咯": "gē",
        "強": "qiáng",
        "哩": "lǐ",
        "喏": "nuò",
        "鳧": "fú",
        "伲": "nǐ",
        "黃": "huáng",
        "哋": "dì",
        "阿": "ā",
        "卵": "luǎn",
        "說": "shuō",
        "喙": "huì",
        "頸": "jǐng",
        "唔": "wú",
        "雞": "jī",
        "黒": "hēi",
        "哪": "nǎ",
        "麼": "me",
        "蔃": "qiáng",
        "葷": "hūn",
        "鳥": "niǎo",
        # NOTE(review): the usual reading of 舌 is "shé" -- confirm "huà".
        "舌": "huà",
        "吃": "chī",
        "膘": "biǎo",
        "綠": "lǜ",
        "羽": "yǔ",
        "們": "men",
        "焦": "jiāo",
        "腳": "jiǎo",
        "乜": "miē",
        "即": "jí",
        "佬": "lǎo",
    }

    with UnicodeReader(dataset.get_path('raw', 'Wang2004.csv'),
                       delimiter='\t') as reader:
        lines = list(reader)

    D = {}
    idx = 1
    # Maps each distinct value to its id; 0 is reserved for itself.
    cogids = {0: 0}
    for line in lines[1:]:
        concept = line[0]
        cid = concepts[concept]
        for t, cogs in zip(lines[0][1:], line[1:]):
            taxon = varieties_in_source[t]
            for cog in cogs.split('/'):
                if cog not in cogids:
                    # Ids are dense (0..len-1), so the next free id equals
                    # the current size; this avoids rescanning all values.
                    cogids[cog] = len(cogids)
                D[idx] = [taxon, t, concept, cid, cog, cogids[cog]]
                idx += 1
    D[0] = [
        'doculect', 'doculect_in_source', 'concept', 'concepticon_id', 'value',
        'cogid'
    ]
    wl = Wordlist(D)

    # renumber for partial cognates
    # pcogs holds both plain characters and "concept:..." keys.
    pcogs, idx = {}, 1
    converter = {}
    for k in wl:
        chars = sinopy.gbk2big5(wl[k, 'value'])
        concept = wl[k, 'concept']
        partial_ids = []
        for char in chars:
            if sinopy.is_chinese(char):
                # A Chinese character gets one id shared across concepts.
                if char not in pcogs:
                    pcogs[char] = idx
                    idx += 1
                cchar = concept + ':' + str(pcogs[char])
                if cchar not in pcogs:
                    pcogs[cchar] = pcogs[char]
            else:
                # Any other character gets an id per concept.
                cchar = concept + ':' + char
                if cchar not in pcogs:
                    pcogs[cchar] = idx
                    idx += 1
            partial_ids.append(pcogs[cchar])
        converter[k] = ' '.join(str(x) for x in partial_ids)
    wl.add_entries('cogids', converter, lambda x: x)
    wl.output('tsv',
              filename=dataset.get_path('words'),
              prettify=False,
              ignore='all')

    # we also write the characters
    C = [[
        'ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
        'DOCULECT', 'POSITION'
    ]]
    idx = 1
    # First problematic pinyin seen per character ('?' or '{' in the value).
    errors = {}
    for k in wl:
        concept = wl[k, 'concept']
        doculect = wl[k, 'doculect']
        chars = sinopy.gbk2big5(wl[k, 'value'])
        char_ids = wl[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, char_ids)):
            if sinopy.is_chinese(char):
                # The manual corrections take precedence over sinopy.
                py = pinyin.get(char, sinopy.pinyin(char))
                if '?' in py or '{' in py:
                    errors.setdefault(char, py)
                C.append([idx, char, py, cogid, k, concept, doculect, i])
                idx += 1
    # Printed in the same shape as the entries of the ``pinyin`` dict above.
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    # Explicit UTF-8: the rows contain Chinese characters and accented
    # pinyin, which a non-UTF-8 default locale encoding cannot represent.
    with open(dataset.get_path('characters.tsv'), 'w', encoding='utf-8') as f:
        for line in C:
            f.write('\t'.join(str(x) for x in line) + '\n')

    # prepare the trees
    # NOTE(review): substitution is per single character of the file, so only
    # one-character keys of varieties_in_source can ever match -- confirm the
    # tree labels are single characters.
    for tree_name in ('tree-100.tre', 'tree-95.tre'):
        with open(dataset.get_path('raw', tree_name), encoding='utf-8') as f1:
            with open(dataset.get_path('trees', tree_name), 'w',
                      encoding='utf-8') as f2:
                f2.write(''.join(
                    varieties_in_source.get(x, x) for x in f1.read()))
Beispiel #3
0
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` using the ``KSL.qlc`` and ``good_file.tsv`` data."""

    def setUp(self):
        """Run the base-class set-up and load both wordlists."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        """``len()`` of the KSL wordlist is 1400."""
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` provides ``distances``, ``tree`` and ``groups``."""
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix must sum to zero.
        assert sum(
            self.wordlist.distances[x][x] for x in range(self.wordlist.width)
        ) == 0

        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_coverage(self):
        """``coverage`` can be called with the default, 'ratio' and 'mean'."""
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        """``get_list`` agrees with ``get_dict`` and rejects bad arguments."""
        ger_l = self.wordlist.get_list(
            doculect='German', entry='ipa', flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        ger_t = self.wordlist.get_list(doculect='German', entry="ipa")

        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])
        assert sorted(ger_t) == sorted(ger_l)

        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        # The result is not inspected; the call only has to succeed when
        # ``entry`` is omitted.
        self.wordlist.get_list(concept="hand", flat=True)
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])

        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(
            concept='hand', entry="language", flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(
            self.wordlist2.get_list(col="l1", flat=True, entry="concept")) == 2

        # These argument combinations are expected to raise ValueError.
        assert_raises(
            ValueError, self.wordlist2.get_list, col="l1", row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        """``get_dict`` for a column is keyed by the wordlist's rows."""
        ger_d = self.wordlist.get_dict(col='German')

        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        """``renumber`` keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        """``get_entries`` returns a ``height`` x ``width`` table."""
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        """``get_etymdict`` with and without ``modify_ref=abs``."""
        etd1 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', modify_ref=False)
        etd2 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', modify_ref=abs)

        # Separate asserts so a failure points at the condition that broke.
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=False)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            # assertEqual, not the deprecated alias assertEquals, which was
            # removed from unittest in Python 3.12.
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        """Every etymdict key (as absolute value) is present in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)

        for key in cogs:
            # The offending key is reported in the assertion message.
            assert abs(key) in paps, key

    def test_output(self):
        """``output`` runs for every listed format."""
        fn = text_type(self.tmp_path('test'))
        formats = [
            'tsv', 'taxa', 'tre', 'dst', 'starling', 'paps.nex', 'paps.csv',
            'separated', 'multistate.nex', 'groups',
        ]
        for fmt in formats:
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(
                    fmt,
                    filename=fn,
                    cols=sorted(self.wordlist.header)[:2],
                    rows=dict(ID=" > 10"),
                    **kw)

    def test_export(self):
        """``export`` runs for the 'txt', 'tex' and 'html' formats."""
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        """CSV files with and without ids load to matching wordlists."""
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` using the ``KSL.qlc`` test data."""

    def setUp(self):
        """Run the base-class set-up and load the KSL wordlist."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        """``len()`` of the KSL wordlist is 1400."""
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` provides ``distances``, ``tree`` and ``groups``."""
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix must sum to zero.
        assert sum(
            self.wordlist.distances[x][x] for x in range(self.wordlist.width)
        ) == 0

        self.wordlist.calculate('tree')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_get_list(self):
        """Flat ``get_list`` matches the first items of ``get_dict`` values."""
        ger_l = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')

        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])

    def test_get_dict(self):
        """``get_dict`` for a column is keyed by the wordlist's rows."""
        ger_d = self.wordlist.get_dict(col='German')

        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        """``renumber`` keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        """``get_entries`` returns a ``height`` x ``width`` table."""
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    # Carries the ``test_`` prefix so that the test runner collects it; as
    # plain ``get_etymdict`` it was never executed.
    def test_get_etymdict(self):
        """``get_etymdict`` with ``loans`` off/on, plain and fuzzy."""
        etd1 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', loans=False)
        etd2 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', loans=True)

        # Separate asserts so a failure points at the condition that broke.
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', loans=False, fuzzy=True)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', loans=True, fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        """Every etymdict key (as absolute value) is present in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)

        for key in cogs:
            # The offending key is reported in the assertion message.
            assert abs(key) in paps, key

    def test_output(self):
        """``output`` runs for every listed format."""
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
Beispiel #5
0
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` using the ``KSL.qlc`` test data."""

    def setUp(self):
        """Run the base-class set-up and load the KSL wordlist."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        """``len()`` of the KSL wordlist is 1400."""
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` provides ``distances``, ``tree`` and ``groups``."""
        self.wordlist.calculate('dst')

        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix must sum to zero.
        diagonal = (
            self.wordlist.distances[x][x] for x in range(self.wordlist.width)
        )
        assert sum(diagonal) == 0

        self.wordlist.calculate('tree')

        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')

        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_get_list(self):
        """Flat ``get_list`` matches the first items of ``get_dict`` values."""
        ger_l = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')

        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])

    def test_get_dict(self):
        """``get_dict`` for a column is keyed by the wordlist's rows."""
        ger_d = self.wordlist.get_dict(col='German')

        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        """``renumber`` keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')

        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)

        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        """``get_entries`` returns a ``height`` x ``width`` table."""
        ger = self.wordlist.get_entries('cogid')

        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    # Carries the ``test_`` prefix so that the test runner collects it; as
    # plain ``get_etymdict`` it was never executed.
    def test_get_etymdict(self):
        """``get_etymdict`` with ``loans`` off/on, plain and fuzzy."""
        etd1 = self.wordlist.get_etymdict(ref='cogid',
                                          entry='ipa',
                                          loans=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa', loans=True)

        # Separate asserts so a failure points at the condition that broke.
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])

        etd3 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          loans=False,
                                          fuzzy=True)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid',
                                          entry='ipa',
                                          loans=True,
                                          fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        """Every etymdict key (as absolute value) is present in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)

        for key in cogs:
            # The offending key is reported in the assertion message.
            assert abs(key) in paps, key

    def test_output(self):
        """``output`` runs for every listed format."""
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)