class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` run against the bundled fixture files."""

    def setUp(self):
        """Load the two fixture wordlists shared by the tests."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` must attach distances, a tree and groups."""
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix has to sum to zero.
        assert sum(
            self.wordlist.distances[x][x]
            for x in range(self.wordlist.width)) == 0

        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_coverage(self):
        """Smoke test: every ``stats`` variant must run without raising."""
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        """``get_list`` must agree with ``get_dict`` and reject bad input."""
        ger_l = self.wordlist.get_list(
            doculect='German', entry='ipa', flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        ger_t = self.wordlist.get_list(doculect='German', entry="ipa")
        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])
        assert sorted(ger_t) == sorted(ger_l)

        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])

        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(
            concept='hand', entry="language", flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(self.wordlist2.get_list(
            col="l1", flat=True, entry="concept")) == 2

        assert_raises(
            ValueError, self.wordlist2.get_list, col="l1", row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        ger_d = self.wordlist.get_dict(col='German')
        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        """Renumbering keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        """Compare etymological dictionaries with and without ``modify_ref``."""
        etd1 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', modify_ref=False)
        etd2 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', modify_ref=abs)
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=False)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        """Every cognate set id must also show up in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)
        for key in cogs:
            # Report the offending key in the failure message itself.
            assert abs(key) in paps, key

    def test_output(self):
        """Smoke test: write the wordlist in every listed output format."""
        fn = text_type(self.tmp_path('test'))
        # The trailing space after 'paps.csv' matters: without it the two
        # adjacent literals fuse into the bogus format 'paps.csvseparated'.
        formats = (
            'tsv taxa tre dst starling paps.nex paps.csv '
            'separated multistate.nex groups').split()
        for fmt in formats:
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(
                    fmt,
                    filename=fn,
                    cols=sorted(self.wordlist.header)[:2],
                    rows=dict(ID=" > 10"),
                    **kw)

    def test_export(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        """CSV files with and without explicit ids must load identically."""
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
def prepare(dataset):
    """Convert the raw Wang2004 data into a wordlist, a character table and trees.

    Steps, in order:

    1. Read ``raw/Wang2004.csv`` (tab-separated). The first row holds the
       variety labels used in the source, the first column of every further
       row holds the concept; each cell may list several cognate labels
       separated by ``/``.
    2. Build a ``Wordlist`` with one entry per (variety, concept, label) and
       a numeric ``cogid`` per distinct label.
    3. Add a ``cogids`` column with one partial cognate id per character of
       the value and write the wordlist as TSV to ``dataset.get_path('words')``.
    4. Write ``characters.tsv`` with one row per Chinese character, including
       its pinyin reading. Readings that still contain ``?`` or ``{`` are
       printed in a form that can be pasted into the correction table below.
    5. Copy the two raw tree files to ``trees/``, replacing source labels by
       the names in ``varieties_in_source``.

    :param dataset: object providing ``get_path(*parts)`` for locating the
        input and output files.
    """
    concepts = {
        c.english: c.concepticon_id
        for c in Concepticon().conceptlists['Wang-2004-100a'].concepts.values()
    }

    # correct wrong pinyins in sinopy
    pinyin = {
        "虱": "shī",
        "咯": "gē",
        "強": "qiáng",
        "哩": "lǐ",
        "喏": "nuò",
        "鳧": "fú",
        "伲": "nǐ",
        "黃": "huáng",
        "哋": "dì",
        "阿": "ā",
        "卵": "luǎn",
        "說": "shuō",
        "喙": "huì",
        "頸": "jǐng",
        "唔": "wú",  # stray "}" removed
        "雞": "jī",
        "黒": "hēi",
        "哪": "nǎ",
        "麼": "me",
        "蔃": "qiáng",
        "葷": "hūn",
        "鳥": "niǎo",  # stray "}" removed
        # NOTE(review): the usual reading of 舌 is "shé" - confirm that
        # "huà" is really intended here.
        "舌": "huà",
        "吃": "chī",
        "膘": "biǎo",  # stray "}" removed
        "綠": "lǜ",
        "羽": "yǔ",
        "們": "men",
        "焦": "jiāo",
        "腳": "jiǎo",
        "乜": "miē",
        "即": "jí",
        "佬": "lǎo",
    }

    with UnicodeReader(
            dataset.get_path('raw', 'Wang2004.csv'), delimiter='\t') as reader:
        lines = list(reader)

    # Variety labels as they appear in the header row of the source table.
    source_labels = lines[0][1:] if lines else []

    data = {}
    idx = 1
    # Cognate label -> numeric id. Ids are handed out consecutively starting
    # at 1, so the next free id is always ``len(label_ids) + 1``.
    label_ids = {}
    for line in lines[1:]:
        concept = line[0]
        cid = concepts[concept]
        for t, cogs in zip(source_labels, line[1:]):
            taxon = varieties_in_source[t]
            for cog in cogs.split('/'):
                if cog not in label_ids:
                    label_ids[cog] = len(label_ids) + 1
                data[idx] = [taxon, t, concept, cid, cog, label_ids[cog]]
                idx += 1
    data[0] = [
        'doculect',
        'doculect_in_source',
        'concept',
        'concepticon_id',
        'value',
        'cogid',
    ]
    wl = Wordlist(data)

    # renumber for partial cognates
    # ``pcogs`` holds two kinds of keys in one dict: bare Chinese characters
    # and concept-scoped keys of the form "<concept>:<id or char>".
    pcogs, idx = {}, 1
    converter = {}
    for k in wl:
        chars = sinopy.gbk2big5(wl[k, 'value'])
        concept = wl[k, 'concept']
        partial_ids = []
        for char in chars:
            if sinopy.is_chinese(char):
                if char not in pcogs:
                    pcogs[char] = idx
                    idx += 1
                cchar = concept + ':' + str(pcogs[char])
                if cchar not in pcogs:
                    pcogs[cchar] = pcogs[char]
            else:
                cchar = concept + ':' + char
                if cchar not in pcogs:
                    pcogs[cchar] = idx
                    idx += 1
            partial_ids.append(pcogs[cchar])
        converter[k] = ' '.join(str(x) for x in partial_ids)
    wl.add_entries('cogids', converter, lambda x: x)
    wl.output(
        'tsv',
        filename=dataset.get_path('words'),
        prettify=False,
        ignore='all')

    # we also write the characters
    table = [[
        'ID',
        'CHARACTER',
        'PINYIN',
        'WORDS_COGIDS',
        'WORDS_ID',
        'CONCEPT',
        'DOCULECT',
        'POSITION',
    ]]
    idx = 1
    errors = {}
    for k in wl:
        concept = wl[k, 'concept']
        doculect = wl[k, 'doculect']
        chars = sinopy.gbk2big5(wl[k, 'value'])
        word_cogids = wl[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, word_cogids)):
            if not sinopy.is_chinese(char):
                continue
            # The manual correction wins over whatever sinopy returns.
            py = pinyin.get(char, sinopy.pinyin(char))
            if '?' in py or '{' in py:
                # Remember only the first unresolved reading per character.
                errors.setdefault(char, py)
            table.append([idx, char, py, cogid, k, concept, doculect, i])
            idx += 1

    # Emit unresolved readings in the literal syntax of the table above.
    for char, py in errors.items():
        print('"' + char + '" : "' + py + '",')

    # The table contains Chinese characters, so do not rely on the locale's
    # default encoding.
    with open(
            dataset.get_path('characters.tsv'), 'w', encoding='utf-8') as f:
        for row in table:
            f.write('\t'.join(str(x) for x in row) + '\n')

    # prepare the trees
    # The substitution works character by character, so only labels that
    # are a single character long get replaced.
    for name in ('tree-100.tre', 'tree-95.tre'):
        with open(dataset.get_path('raw', name), encoding='utf-8') as src:
            newick = src.read()
        with open(
                dataset.get_path('trees', name), 'w', encoding='utf-8') as dst:
            dst.write(''.join(
                varieties_in_source.get(ch, ch) for ch in newick))
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` run against the bundled fixture files."""

    def setUp(self):
        """Load the two fixture wordlists shared by the tests."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` must attach distances, a tree and groups."""
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix has to sum to zero.
        assert sum(
            self.wordlist.distances[x][x]
            for x in range(self.wordlist.width)) == 0

        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_coverage(self):
        """Smoke test: every ``stats`` variant must run without raising."""
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        """``get_list`` must agree with ``get_dict`` and reject bad input."""
        ger_l = self.wordlist.get_list(
            doculect='German', entry='ipa', flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        ger_t = self.wordlist.get_list(doculect='German', entry="ipa")
        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])
        assert sorted(ger_t) == sorted(ger_l)

        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        # Called without ``entry``; only checks that the call does not raise.
        self.wordlist.get_list(concept="hand", flat=True)
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])

        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(
            concept='hand', entry="language", flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(self.wordlist2.get_list(
            col="l1", flat=True, entry="concept")) == 2

        assert_raises(
            ValueError, self.wordlist2.get_list, col="l1", row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        ger_d = self.wordlist.get_dict(col='German')
        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        """Renumbering keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        """Compare etymological dictionaries with and without ``modify_ref``."""
        etd1 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', modify_ref=False)
        etd2 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', modify_ref=abs)
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=False)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            # ``assertEquals`` is a deprecated alias that newer Python
            # versions no longer provide; use the canonical name.
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        """Every cognate set id must also show up in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)
        for key in cogs:
            # Report the offending key in the failure message itself.
            assert abs(key) in paps, key

    def test_output(self):
        """Smoke test: write the wordlist in every listed output format."""
        fn = text_type(self.tmp_path('test'))
        formats = (
            'tsv taxa tre dst starling paps.nex paps.csv '
            'separated multistate.nex groups').split()
        for fmt in formats:
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(
                    fmt,
                    filename=fn,
                    cols=sorted(self.wordlist.header)[:2],
                    rows=dict(ID=" > 10"),
                    **kw)

    def test_export(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        """CSV files with and without explicit ids must load identically."""
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` run against the ``KSL.qlc`` fixture."""

    def setUp(self):
        """Load the fixture wordlist shared by the tests."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` must attach distances, a tree and groups."""
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # The diagonal of the distance matrix has to sum to zero.
        assert sum(
            self.wordlist.distances[x][x]
            for x in range(self.wordlist.width)) == 0

        self.wordlist.calculate('tree')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)

        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert isinstance(self.wordlist.groups, dict)

    def test_get_list(self):
        """The flat list for a column must match the ``get_dict`` values."""
        ger_l = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])

    def test_get_dict(self):
        ger_d = self.wordlist.get_dict(col='German')
        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        """Renumbering keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        assert all(isinstance(x, int) for x in ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    # NOTE(review): this method lacks the ``test`` prefix, so default test
    # discovery presumably never runs it - confirm whether that is
    # intentional before renaming it.
    def get_etymdict(self):
        """Compare etymological dictionaries with and without ``loans``."""
        etd1 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', loans=False)
        etd2 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', loans=True)
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', loans=False, fuzzy=True)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', loans=True, fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        """Every cognate set id must also show up in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)
        for key in cogs:
            # Report the offending key in the failure message itself.
            assert abs(key) in paps, key

    def test_output(self):
        """Smoke test: write the wordlist in every listed output format."""
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
class TestWordlist(WithTempDir):
    """Tests for ``Wordlist`` run against the ``KSL.qlc`` fixture."""

    def setUp(self):
        """Load the fixture wordlist shared by the tests."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        """``calculate`` must attach distances, a tree and groups."""
        wordlist = self.wordlist

        wordlist.calculate('dst')
        assert hasattr(wordlist, 'distances')
        # The diagonal of the distance matrix has to sum to zero.
        diagonal = [wordlist.distances[x][x] for x in range(wordlist.width)]
        assert sum(diagonal) == 0

        wordlist.calculate('tree')
        assert sorted(wordlist.tree.taxa) == sorted(wordlist.cols)

        wordlist.calculate('groups')
        assert hasattr(wordlist, 'groups')
        assert isinstance(wordlist.groups, dict)

    def test_get_list(self):
        """The flat list for a column must match the ``get_dict`` values."""
        as_list = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        as_dict = self.wordlist.get_dict(col='German', entry='ipa')
        assert sorted(as_list) == sorted(v[0] for v in as_dict.values())

    def test_get_dict(self):
        as_dict = self.wordlist.get_dict(col='German')
        assert sorted(as_dict.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        """Renumbering keeps the number of distinct ids and yields ints."""
        self.wordlist.renumber('cogid', 'dummy')
        original = self.wordlist.get_list(
            col='German', entry='cogid', flat=True)
        renumbered = self.wordlist.get_list(
            col='German', entry='dummy', flat=True)
        assert len(set(original)) == len(set(renumbered))
        assert all(isinstance(x, int) for x in renumbered)

    def test_get_entries(self):
        entries = self.wordlist.get_entries('cogid')
        assert len(entries) == self.wordlist.height
        assert len(entries[0]) == self.wordlist.width

    # NOTE(review): this method lacks the ``test`` prefix, so default test
    # discovery presumably never runs it - confirm whether that is
    # intentional before renaming it.
    def get_etymdict(self):
        """Compare etymological dictionaries with and without ``loans``."""
        etd1 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', loans=False)
        etd2 = self.wordlist.get_etymdict(
            ref='cogid', entry='ipa', loans=True)
        assert len(etd1) > len(etd2)
        assert len({abs(x) for x in etd1}) == len(etd2)
        assert not any(x < 0 for x in etd2)

        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', loans=False, fuzzy=True)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', loans=True, fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        """Every cognate set id must also show up in the paps."""
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)
        for key in cogs:
            # Report the offending key in the failure message itself.
            assert abs(key) in paps, key

    def test_output(self):
        """Smoke test: write the wordlist in every listed output format."""
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)