def test_renumber(self):
    """Renumbering a column adds the new numeric column to the header.

    Fix: dropped the unused ``from lingpy.basic.ops import renumber`` —
    the test exercises the :class:`Wordlist` *method*, never the imported
    function (the duplicate of this test elsewhere in the suite has no
    such import either).
    """
    tmp = Wordlist(test_data('good_file.tsv'))
    # explicit target column name
    tmp.renumber('cogid', 'newcogid')
    assert 'newcogid' in tmp.header
    # default target name is the source column plus an "id" suffix
    tmp.renumber('mock')
    assert 'mockid' in tmp.header
def main():
    """Command-line entry point dispatching on keywords found in ``argv``.

    Supported invocations (each value is the positional argument that
    immediately follows its keyword/flag):

    * ``pinyin <char>`` -- print the pinyin reading of a character.
    * ``profile <wordlist>`` -- write a structure profile for a wordlist;
      modifiers: ``--cldf``, ``--column``, ``--language``/``-l``,
      ``-o``/``--filename``, and ``debug``/``--debug``.

    NOTE(review): ``argv`` is presumably ``sys.argv`` imported at module
    level -- confirm; index lookups will raise IndexError when a keyword
    is given without its value argument.
    """
    debug = False
    if 'debug' in argv or '--debug' in argv:
        debug = True
    if 'pinyin' in argv:
        # value is the token right after the 'pinyin' keyword
        py = sinopy.pinyin(argv[argv.index('pinyin') + 1])
        print(py)
    if 'profile' in argv:
        if '--cldf' in argv:
            wl = Wordlist.from_cldf(
                argv[argv.index('profile') + 1],
                col='language_id',
                row='parameter_id')
            # mirror the CLDF language name into the column lingpy expects
            wl.add_entries('doculect', 'language_name', lambda x: x)
        else:
            wl = Wordlist(argv[argv.index('profile') + 1])
        # defaults; each may be overridden by a flag below
        column = 'ipa'
        language = None
        filename = 'orthography.tsv'
        if '--column' in argv:
            column = argv[argv.index('--column') + 1]
        if '--language' in argv:
            language = argv[argv.index('--language') + 1]
        if '-l' in argv:
            language = argv[argv.index('-l') + 1]
        if '-o' in argv:
            filename = argv[argv.index('-o') + 1]
        if '--filename' in argv:
            filename = argv[argv.index('--filename') + 1]
        segments.write_structure_profile(
            wl, column=column, filename=filename, debug=debug,
            language=language)
def prepare(ds):
    """Convert the raw ``bds.tsv`` wordlist into the dataset's output format.

    For every entry with a value, morphemes are re-joined from the token
    slices, transcribed to CLPA and to a CV-structure string, and both are
    sanity-checked (they must have the same number of segments, and CLPA
    must not contain the ``«`` marker for unconvertible sounds).  Problems
    are counted and printed, but do not abort the conversion.

    Fix: replaced ``try: assert ... except:`` with an explicit comparison —
    the bare ``except`` swallowed unrelated errors and the check silently
    disappeared under ``python -O``.
    """
    errs = 0
    wl = Wordlist(ds.raw('bds.tsv'))
    W = {}
    for k in wl:
        value = wl[k, 'value']
        tokens = wl[k, 'tokens']
        doc = wl[k, 'doculect']
        if value:
            # re-join the tokens morpheme by morpheme, separated by spaces
            morphemes = []
            for a, b in _get_slices(wl[k, 'tokens']):
                ipa = ''.join(tokens[a:b])
                morphemes += [ipa]
            ipa = ' '.join(morphemes)
            clpa = ds.transform(ipa, 'CLPA')
            struc = ds.transform(ipa, 'Structure')
            # CLPA and structure strings must align segment by segment
            if len(clpa.split(' ')) != len(struc.split(' ')):
                errs += 1
                print(errs, clpa, struc)
            if '«' in clpa:
                # '«' marks segments that could not be converted
                errs += 1
                print(errs, ipa, clpa, struc)
            W[k] = [
                doc, wl[k, 'concept'], wl[k, 'concepticon_id'], value, clpa,
                struc, wl[k, 'partial_ids']
            ]
    # row 0 is the wordlist header
    W[0] = [
        'doculect', 'concept', 'concepticon_id', 'value', 'segments',
        'structure', 'cogids'
    ]
    ds.write_wordlist(Wordlist(W))
def get_lexibase(path, name, columns=None, preprocessing=None,
                 namespace=None, lexibase=False):
    """Load a CLDF dataset and dump it as a TSV wordlist or a LexiBase db.

    :param path: path to the CLDF metadata file.
    :param name: basename for the output (TSV file or SQLite database).
    :param columns: CLDF columns to read (defaults to the standard six).
    :param preprocessing: optional callable mapping the loaded wordlist to
        a complete wordlist dict (including the header row 0).
    :param namespace: optional CLDF-column -> wordlist-column mapping.
    :param lexibase: if True, write a LexiBase SQLite database instead of
        a plain TSV file.

    Fix: ``dict([(k, v), ...])`` replaced by a dict literal (same content,
    idiomatic form).
    """
    wordlist = Wordlist.from_cldf(
        path,
        columns=columns or (
            "language_id", "concept_name", "value", "form", "segments",
            "comment"),
        namespace=namespace or {
            "language_id": "doculect",
            "concept_name": "concept",
            "value": "value",
            "form": "form",
            "segments": "tokens",
            "comment": "note",
        })
    if preprocessing:
        # preprocessing is expected to return a complete dict, header
        # (row 0) included
        D = preprocessing(wordlist)
    else:
        D = {idx: wordlist[idx] for idx in wordlist}
        D[0] = wordlist.columns
    if not lexibase:
        Wordlist(D).output("tsv", filename=name, ignore="all", prettify=False)
    else:
        lex = LexiBase(D, dbase=name + ".sqlite3")
        lex.create(name)
def test_wordlist2cognates(repos, mocker):
    """``wordlist2cognates`` yields dicts for a mocked CLDF dataset.

    Fix: ``attr.ib(default=[])`` used a single shared list as the default
    for every ``Lexeme`` instance; replaced with ``attr.Factory(list)`` so
    each instance gets its own list.
    """
    @attr.s
    class Lexeme(dataset.Lexeme):
        Concept = attr.ib(default=None)
        # factory: one fresh list per instance, never shared
        Segments = attr.ib(default=attr.Factory(list))

    @attr.s
    class Lexeme2(dataset.Lexeme):
        Concept = attr.ib(default=None)

    dsdir = repos / 'datasets' / 'test_dataset'
    if not dsdir.joinpath('cldf').exists():
        dsdir.joinpath('cldf').mkdir()
    ds = Dataset(mocker.Mock(
        lexeme_class=Lexeme,
        cognate_class=dataset.Cognate,
        language_class=dataset.Language,
        concept_class=dataset.Concept,
        split_forms=lambda _, s: [s],
        tokenize=lambda _, x: [],
        dir=dsdir,
        tr_analyses={},
        cldf_dir=dsdir.joinpath('cldf')))
    ds2 = Dataset(mocker.Mock(
        lexeme_class=Lexeme2,
        cognate_class=dataset.Cognate,
        language_class=dataset.Language,
        concept_class=dataset.Concept,
        split_forms=lambda _, s: [s],
        dir=dsdir,
        tr_analyses={},
        cldf_dir=dsdir.joinpath('cldf')))
    ds2.add_form_with_segments(
        Value='form,form2',
        Concept='meaning',
        Language_ID='1',
        Parameter_ID='p',
        Form='form',
        Segments=['f', 'o']
    )
    # needs to be fixed XXX
    ds2.tokenize = lambda _, x: [x]
    ds2.add_form(
        Value='form,form2',
        Concept='meaning',
        Language_ID='1',
        Parameter_ID='p',
        Form='form',
    )
    # needs to be fixed XXX
    ds.tokenize = lambda _, x: []
    ds.add_forms_from_value(
        Value='form,form2',
        Concept='meaning',
        Language_ID='1',
        Parameter_ID='p'
    )
    # lid, ipa, concept
    wl = Wordlist(lingpy_util._cldf2wld(ds2), row='concept', col='language_id')
    res = list(lingpy_util.wordlist2cognates(wl, 'src'))
    assert isinstance(res[0], dict)
def test_med(test_data):
    """Comparing a column against itself yields zero mean edit distance."""
    wordlist = Wordlist(str(test_data / 'KSL.qlc'))
    for column, use_classes in (('gloss', False), ('tokens', True)):
        score = med(wordlist, gold=column, test=column, classes=use_classes)
        assert score == pytest.approx(0.0)
def run(args):
    """Write an orthography profile for the dataset selected by *args*.

    Builds either a simple profile or, with ``args.context``, a
    context-aware one from the dataset's CLDF FormTable, and writes it to
    ``etc/orthography.tsv``.  Refuses to overwrite an existing profile
    unless ``args.force`` is set.
    """
    bipa = args.clts.api.bipa
    func = profile.simple_profile
    cols = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']
    kw = {'ref': 'form', 'clts': bipa}
    if args.context:
        # context profiles additionally report examples and languages
        func = profile.context_profile
        cols = [
            'Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence',
            'Codepoints'
        ]
        kw['col'] = 'language_id'
    ds = get_dataset(args)
    profile_path = ds.etc_dir / 'orthography.tsv'
    if profile_path.exists() and not args.force:
        raise ParserError(
            'Orthography profile exists already. To overwrite, pass "-f" flag')
    header, D = [], {}
    for i, row in enumerate(ds.cldf_reader()['FormTable'], start=1):
        if i == 1:
            # derive the wordlist header from the first row's columns;
            # row 0 of D is the lingpy header (lowercased)
            header = [f for f in row.keys() if f != 'ID']
            D = {0: ['lid'] + [h.lower() for h in header]}
        # lingpy expects segments as a space-separated string
        row['Segments'] = ' '.join(row['Segments'])
        D[i] = [row['ID']] + [row[h] for h in header]
    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(cols)
        for row in func(Wordlist(D, row='parameter_id', col='language_id'),
                        **kw):
            writer.writerow(row)
    args.log.info('Orthography profile written to {0}'.format(profile_path))
def test_clean_taxnames(self):
    """Taxon names containing punctuation are normalized in place."""
    wordlist = Wordlist({
        0: ['doculect', 'concept', 'counterpart'],
        1: ['l1', 'hand', 'hand'],
        2: ['l2 - a (taxon) name)', 'hand', 'hand'],
    })
    clean_taxnames(wordlist)
    assert wordlist.cols[-1] == 'l2___a_taxon_name'
def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.

    Concepts are deduplicated by their ``NUMBER_slug(ENGLISH)`` id; every
    source gloss of a concept is mapped back to that id.  Forms are read
    from ``data.tsv``, one cognate set per raw row, and a parallel TSV
    wordlist is written for inspection with EDICTOR.

    Fix: the ``if cid in visited: pass / else: ...`` guard inverted to the
    direct ``if cid not in visited:`` form (dead ``pass`` branch removed).
    """
    concepts, wl_concepts = {}, {}
    visited = set()
    for concept in self.concepts:
        cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
        # register each concept id only once, even if it recurs
        if cid not in visited:
            visited.add(cid)
            args.writer.add_concept(
                ID=cid,
                Name=concept['ENGLISH'],
                Glosses_in_Source=concept['GLOSSES_IN_SOURCE'],
                Concepticon_ID=concept['CONCEPTICON_ID'],
                Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
            # every gloss used in the source maps back to this concept
            for gloss in concept['GLOSSES_IN_SOURCE'].split(' // '):
                concepts[gloss] = cid
                wl_concepts[gloss] = concept['ENGLISH']
    languages = args.writer.add_languages(lookup_factory="Name_in_Source")
    args.writer.add_sources()
    # make a wordlist for edictor to inspect the data
    D = {0: ['doculect', 'concept', 'ipa', 'cogid']}
    idx = 1
    for i, row in progressbar(
            enumerate(
                self.raw_dir.read_csv('data.tsv', delimiter='\t',
                                      dicts=True))):
        for language, lid in languages.items():
            form = row[language].strip()
            if form:
                lexemes = args.writer.add_forms_from_value(
                    Language_ID=lid,
                    Parameter_ID=concepts[row['Meaning']],
                    Value=form,
                    Source='Holm2017')
                if lexemes:
                    # one expert cognate set per row of the raw data
                    args.writer.add_cognate(
                        lexeme=lexemes[0],
                        Cognateset_ID=str(i + 1),
                        Cognate_Detection_Method='expert',
                        Source='Holm2017')
                D[idx] = [
                    language, wl_concepts[row['Meaning']], form, i + 1
                ]
                idx += 1
    Wordlist(D).output(
        'tsv', filename=self.raw_dir.joinpath('wordlist').as_posix())
def test_wordlist2cognates(repos, mocker, dataset):
    """``wordlist2cognates`` yields dicts for a mocked CLDF writer pair.

    Fix: ``attr.ib(default=[])`` used a single shared list as the default
    for every ``Lexeme`` instance; replaced with ``attr.Factory(list)`` so
    each instance gets its own list.
    """
    @attr.s
    class Lexeme(pbds.Lexeme):
        Concept = attr.ib(default=None)
        # factory: one fresh list per instance, never shared
        Segments = attr.ib(default=attr.Factory(list))

    @attr.s
    class Lexeme2(pbds.Lexeme):
        Concept = attr.ib(default=None)

    dsdir = repos / 'datasets' / 'test_dataset'
    if not dsdir.joinpath('cldf').exists():
        dsdir.joinpath('cldf').mkdir()
    dataset.cognate_class = pbds.Cognate
    dataset.language_class = pbds.Language
    dataset.concept_class = pbds.Concept
    dataset.split_forms = lambda _, s: [s]
    dataset.dir = dsdir
    dataset.tr_analyses = {}
    dataset.cldf_dir = dsdir.joinpath('cldf')
    dataset.lexeme_class = Lexeme
    with dataset.cldf_writer(mocker.Mock()) as ds:
        # needs to be fixed XXX
        ds.tokenize = lambda _, x: []
        ds.add_forms_from_value(Value='form,form2',
                                Concept='meaning',
                                Language_ID='1',
                                Parameter_ID='p')
    dataset.lexeme_class = Lexeme2
    with dataset.cldf_writer(
            mocker.Mock(clts=mocker.Mock(api=pyclts.CLTS(repos)))) as ds2:
        ds2.add_form_with_segments(Value='form,form2',
                                   Concept='meaning',
                                   Language_ID='1',
                                   Parameter_ID='p',
                                   Form='form',
                                   Segments=['f', 'o'])
        # needs to be fixed XXX
        ds2.tokenize = lambda _, x: [x]
        ds2.add_form(
            Value='form,form2',
            Concept='meaning',
            Language_ID='1',
            Parameter_ID='p',
            Form='form',
        )
        # lid, ipa, concept
        wl = Wordlist(lingpy_util._cldf2wld(ds2), row='concept',
                      col='language_id')
        res = list(lingpy_util.wordlist2cognates(wl, 'src'))
        assert isinstance(res[0], dict)
def test_wl2dst(self):
    """wl2dst returns a distance matrix (list of lists) for every mode."""
    for kwargs in (dict(mode='jaccard'),
                   dict(mode='jaccard', refB='glossid')):
        matrix = wl2dst(self.wordlist, **kwargs)
        self.assertIsInstance(matrix, list)
    # remaining modes: smoke tests only
    wl2dst(self.wordlist, mode='swadesh')
    wl2dst(self.wordlist, mode='shared')
    wl2dst(self.wordlist, mode='swadesh', ignore_missing=True)
    # trigger zero-division-warning in wl2dst
    small = Wordlist({
        0: ['doculect', 'concept', 'counterpart', 'cogid'],
        1: ['l1', 'hand', 'hand', '1'],
        2: ['l2 - a (taxon) name)', 'hand', 'hand', '2'],
        3: ['l3', 'foot', 'foot', '3'],
    })
    distances = wl2dst(small)
    assert distances[0][2] == 1
def prepare(dataset):
    """Convert the raw Wang-2004 Chinese dialect data to the dataset format.

    Steps: read the raw CSV, assign numeric cognate ids per cognate label,
    build a wordlist, renumber partial (per-character) cognates, write the
    wordlist, dump a character table, and rename taxa in the two raw trees.
    """
    # map English gloss -> Concepticon id for the Wang-2004-100a list
    concepts = dict(
        [(x.english, x.concepticon_id) for x in
         Concepticon().conceptlists['Wang-2004-100a'].concepts.values()]
    )
    # correct wrong pinyins in sinopy
    # NOTE(review): several values carry a trailing '}' (e.g. "wú}") --
    # looks like residue from sinopy's "{...}" output; confirm these are
    # the intended replacement readings.
    pinyin = {
        "虱": "shī", "咯": "gē", "強": "qiáng", "哩": "lǐ", "喏": "nuò",
        "鳧": "fú", "伲": "nǐ", "黃": "huáng", "哋": "dì", "阿": "ā",
        "卵": "luǎn", "說": "shuō", "喙": "huì", "頸": "jǐng", "唔": "wú}",
        "雞": "jī", "黒": "hēi", "哪": "nǎ", "麼": "me", "蔃": "qiáng",
        "葷": "hūn", "鳥": "niǎo}", "舌": "huà", "吃": "chī", "膘": "biǎo}",
        "綠": "lǜ", "羽": "yǔ", "們": "men", "焦": "jiāo", "腳": "jiǎo",
        "乜": "miē", "即": "jí", "佬": "lǎo",
    }
    with UnicodeReader(dataset.get_path('raw', 'Wang2004.csv'),
                       delimiter='\t') as reader:
        lines = list(reader)
    D = {}
    idx = 1
    # cognate label -> numeric id; seeded so max() below never sees an
    # empty sequence
    cogids = {0: 0}
    for line in lines[1:]:
        concept = line[0]
        cid = concepts[concept]
        # first raw row holds the variety names, aligned with the cells
        for t, cogs in zip(lines[0][1:], line[1:]):
            taxon = varieties_in_source[t]
            # a cell may list several cognate labels separated by '/'
            for cog in cogs.split('/'):
                if cog in cogids:
                    cogid = cogids[cog]
                else:
                    # NOTE(review): 'or 0' would make max() fail on an
                    # empty values list, but cogids is never empty here.
                    cogid = max(list(cogids.values()) or 0) + 1
                    cogids[cog] = cogid
                D[idx] = [taxon, t, concept, cid, cog, cogid]
                idx += 1
    D[0] = [
        'doculect', 'doculect_in_source', 'concept', 'concepticon_id',
        'value', 'cogid'
    ]
    wl = Wordlist(D)
    # renumber for partial cognates
    # pcogs maps both raw characters and "concept:key" strings to ids
    pcogs, idx = {}, 1
    converter = {}
    for k in wl:
        chars = sinopy.gbk2big5(wl[k, 'value'])
        concept = wl[k, 'concept']
        cogids = []
        for char in chars:
            if sinopy.is_chinese(char):
                # character-based partial cognate id
                if char not in pcogs:
                    pcogs[char] = idx
                    idx += 1
                cchar = concept + ':' + str(pcogs[char])
                if cchar not in pcogs:
                    pcogs[cchar] = pcogs[char]
            else:
                # non-Chinese material: key on the raw character
                cchar = concept + ':' + char
                if cchar not in pcogs:
                    pcogs[cchar] = idx
                    idx += 1
            cogids += [pcogs[cchar]]
        converter[k] = ' '.join([str(x) for x in cogids])
    wl.add_entries('cogids', converter, lambda x: x)
    wl.output('tsv', filename=dataset.get_path('words'), prettify=False,
              ignore='all')
    # we also write the characters
    C = [[
        'ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
        'DOCULECT', 'POSITION'
    ]]
    idx = 1
    errors = {}
    for k in wl:
        concept = wl[k, 'concept']
        doculect = wl[k, 'doculect']
        chars = sinopy.gbk2big5(wl[k, 'value'])
        cogids = wl[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, cogids)):
            if sinopy.is_chinese(char):
                py = sinopy.pinyin(char)
                # manual corrections take precedence over sinopy
                py = pinyin.get(char, py)
                # '?' / '{' mark readings sinopy could not resolve
                if '?' in py or '{' in py:
                    if char in errors:
                        pass
                    else:
                        errors[char] = py
                C += [[idx, char, py, cogid, k, concept, doculect, i]]
                idx += 1
    # print unresolved readings in copy-paste-able dict-entry form
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join([str(x) for x in line]) + '\n')
    # prepare the trees: rename taxa to the canonical variety names
    with open(dataset.get_path('raw', 'tree-100.tre')) as f1:
        with open(dataset.get_path('trees', 'tree-100.tre'), 'w') as f2:
            f2.write(''.join(
                [varieties_in_source.get(x, x) for x in f1.read()]))
    with open(dataset.get_path('raw', 'tree-95.tre')) as f1:
        with open(dataset.get_path('trees', 'tree-95.tre'), 'w') as f2:
            f2.write(''.join(
                [varieties_in_source.get(x, x) for x in f1.read()]))
def wordlist(test_data):
    """Fixture: the KSL reference wordlist."""
    path = test_data / 'KSL.qlc'
    return Wordlist(str(path))
def test_renumber(self):
    """Renumbering adds the (explicitly or implicitly named) id column."""
    wordlist = Wordlist(test_data('good_file.tsv'))
    # explicit target column name
    wordlist.renumber('cogid', 'newcogid')
    assert 'newcogid' in wordlist.header
    # implicit target name: source column plus 'id' suffix
    wordlist.renumber('mock')
    assert 'mockid' in wordlist.header
def setUp(self):
    """Load the two reference wordlists used by the tests."""
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('KSL.qlc'))
    self.wordlist2 = Wordlist(test_data('KSL4.qlc'))
class TestWordlist(WithTempDir):
    """Tests for :class:`Wordlist`, using the KSL and good_file fixtures.

    Fix: deprecated ``self.assertEquals`` replaced by ``assertEqual``.
    """

    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # self-distances on the diagonal must all be zero
        assert sum([self.wordlist.distances[x][x]
                    for x in range(self.wordlist.width)]) == 0
        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)
        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_coverage(self):
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        gerL = self.wordlist.get_list(doculect='German', entry='ipa',
                                      flat=True)
        gerD = self.wordlist.get_dict(col='German', entry='ipa')
        gerT = self.wordlist.get_list(doculect='German', entry="ipa")
        assert sorted(gerL) == sorted([v[0] for v in gerD.values()])
        assert sorted(gerT) == sorted(gerL)
        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        # smoke test: get_list without an explicit entry
        hand3 = self.wordlist.get_list(concept="hand", flat=True)
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])
        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(
            concept='hand', entry="language", flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(self.wordlist2.get_list(col="l1", flat=True,
                                           entry="concept")) == 2
        assert_raises(ValueError, self.wordlist2.get_list, col="l1",
                      row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        gerD = self.wordlist.get_dict(col='German')
        assert sorted(gerD.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        # renumbered ids are plain ints
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          modify_ref=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          modify_ref=abs)
        # abs() merges positive/negative (borrowed) cognate ids
        assert len(etd1) > len(etd2) and len(set([abs(x) for x in etd1])) == \
            len(etd2)
        assert len([x for x in etd2 if x < 0]) == 0
        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=False)
        etd4 = self.wordlist.get_etymdict(
            ref='fuzzyid', entry='ipa', modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)
        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in ('tsv taxa tre dst starling paps.nex paps.csv separated '
                    'multistate.nex groups').split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(
                    fmt, filename=fn,
                    cols=sorted(self.wordlist.header)[:2],
                    rows=dict(ID=" > 10"), **kw)

    def test_export(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
def setUp(self):
    """Load the two reference wordlists used by the tests."""
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('KSL.qlc'))
    self.wordlist2 = Wordlist(test_data('good_file.tsv'))
def wordlist2(test_data):
    """Fixture: the good_file test wordlist."""
    path = test_data / 'good_file.tsv'
    return Wordlist(str(path))
class TestWordlist(WithTempDir):
    """Tests for :class:`Wordlist`, using the KSL and good_file fixtures.

    Fix: in ``test_output`` the implicit string concatenation
    ``'...paps.csv' 'separated...'`` was missing a space, producing the
    bogus format token ``paps.csvseparated`` — the ``paps.csv`` and
    ``separated`` exports were therefore never exercised.
    """

    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))
        self.wordlist2 = Wordlist(test_data('good_file.tsv'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # self-distances on the diagonal must all be zero
        assert sum([
            self.wordlist.distances[x][x]
            for x in range(self.wordlist.width)
        ]) == 0
        self.wordlist.calculate('tree')
        assert str(self.wordlist.tree).endswith(';')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)
        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_coverage(self):
        self.wordlist.coverage()
        self.wordlist.coverage(stats='ratio')
        self.wordlist.coverage(stats='mean')

    def test_get_list(self):
        ger_l = self.wordlist.get_list(doculect='German', entry='ipa',
                                       flat=True)
        ger_d = self.wordlist.get_dict(col='German', entry='ipa')
        ger_t = self.wordlist.get_list(doculect='German', entry="ipa")
        assert sorted(ger_l) == sorted([v[0] for v in ger_d.values()])
        assert sorted(ger_t) == sorted(ger_l)
        hand1 = self.wordlist.get_list(concept="hand", entry="ipa", flat=True)
        hand2 = self.wordlist.get_dict(row="hand", entry="ipa")
        assert sorted(hand1) == sorted([v[0] for v in hand2.values()])
        # test for synonym lines, which are flattened
        assert self.wordlist2.get_list(
            concept='hand', entry="language", flat=True).count('l6') == 2
        nonflat = self.wordlist2.get_list(concept="hand", entry="language")
        assert nonflat[0][-1] == nonflat[1][-1]
        assert len(self.wordlist2.get_list(col="l1", entry="concept")) == 3
        assert len(
            self.wordlist2.get_list(col="l1", flat=True,
                                    entry="concept")) == 2
        assert_raises(ValueError, self.wordlist2.get_list, col="l1",
                      row="hand")
        assert_raises(ValueError, self.wordlist2.get_list)
        assert_raises(ValueError, self.wordlist.get_list, **{"row": "Hand"})

    def test_get_dict(self):
        ger_d = self.wordlist.get_dict(col='German')
        assert sorted(ger_d.keys()) == sorted(self.wordlist.rows)
        assert_raises(ValueError, self.wordlist.get_dict, **{"row": "Hand"})

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        # renumbered ids are plain ints
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    def test_get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          modify_ref=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          modify_ref=abs)
        # abs() merges positive/negative (borrowed) cognate ids
        assert (len(etd1) > len(etd2)
                and len(set([abs(x) for x in etd1])) == len(etd2))
        assert len([x for x in etd2 if x < 0]) == 0
        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                                          modify_ref=False)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                                          modify_ref=abs)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            self.assertEqual(etd2[key], etd4[key])

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", modify_ref=abs)
        cogs = self.wordlist.get_etymdict(ref="cogid", modify_ref=abs)
        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        # note the trailing space inside the first literal (bug fix)
        for fmt in ('tsv taxa tre dst starling paps.nex paps.csv '
                    'separated multistate.nex groups').split():
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
            if fmt == 'starling':
                self.wordlist.output(fmt, filename=fn, cognates='cogid', **kw)
            if fmt == 'tsv':
                kw['subset'] = True
                self.wordlist.output(fmt, filename=fn, cols=[], rows={}, **kw)
                self.wordlist.output(
                    fmt, filename=fn,
                    cols=sorted(self.wordlist.header)[:2],
                    rows=dict(ID=" > 10"), **kw)

    def test_export(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'txt tex html'.split():
            self.wordlist.export(fmt, filename=fn)

    def test_get_wordlist(self):
        from lingpy.basic.wordlist import get_wordlist
        wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
        wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
        assert wl1.height == wl2.height
        for k in wl1:
            assert wl1[k, 'concept'] == wl2[k, 'concept']
class TestWordlist(WithTempDir):
    """Tests for :class:`Wordlist` (older API: ``loans``/``fuzzy`` kwargs)."""

    def setUp(self):
        """Load the KSL reference wordlist."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # self-distances on the diagonal must all be zero
        assert sum([self.wordlist.distances[x][x]
                    for x in range(self.wordlist.width)]) == 0
        self.wordlist.calculate('tree')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)
        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_get_list(self):
        gerL = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        gerD = self.wordlist.get_dict(col='German', entry='ipa')
        assert sorted(gerL) == sorted([v[0] for v in gerD.values()])

    def test_get_dict(self):
        gerD = self.wordlist.get_dict(col='German')
        assert sorted(gerD.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        # renumbered ids are plain ints
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    # NOTE(review): missing the ``test_`` prefix, so test runners never
    # execute this method -- confirm whether that is intentional.
    def get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          loans=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          loans=True)
        # loans=True merges positive/negative (borrowed) cognate ids
        assert len(etd1) > len(etd2) and len(set([abs(x) for x in etd1])) == \
            len(etd2)
        assert len([x for x in etd2 if x < 0]) == 0
        # make "fuzzy" cognate sets
        self.wordlist.add_entries(
            'fuzzyid',
            'cogid',
            lambda x: [x]
        )
        etd3 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                                          loans=False, fuzzy=True)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                                          loans=True, fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)
        # every cognate set (made positive) must show up in the paps
        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            # starling export needs a reference column
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)
def cmd_makecldf(self, args):
    """Convert the raw Palaung data to a CLDF dataset.

    Forms come from ``100item-phylo.Sheet1.csv`` (languages in every second
    column, the cognate id in the column after each form); segments are
    looked up in the manually edited wordlist ``deepadungpalaung.tsv``.
    When no segments are found, the form is added unsegmented and a
    warning is logged.

    Fix: the bare ``except:`` around the mapper lookup narrowed to
    ``except KeyError`` — it previously swallowed any error raised while
    writing the segmented form, including KeyboardInterrupt.
    """
    args.writer.add_sources()
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Number=concept.number,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.number] = idx
    languages = args.writer.add_languages(lookup_factory="Name")
    # we combine with the manually edited wordlist to retrieve the lexeme
    # values
    wl = Wordlist(self.raw_dir.joinpath('deepadungpalaung.tsv').as_posix())
    mapper = {
        (concept, language, normalize("NFD", form)): segments
        for (idx, concept, language, form, segments
             ) in wl.iter_rows('concept', 'doculect', 'form', 'tokens')
    }
    data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
    for i, row in progressbar(enumerate(data[4:])):
        number = row[0].strip().strip('.')
        concept = row[1].strip()
        # language names sit in header row 2, one in every second column
        for j in range(0, len(row) - 2, 2):
            language = data[2][j + 2]
            value = row[j + 2]
            if value.strip() and value.strip() not in ['-----']:
                if ',' in row[j + 2]:
                    # several variants in one cell; cognate ids may be
                    # given as "x or y"
                    forms = [v.strip() for v in value.split(',')]
                    cogids = [
                        str(int(float(x))) for x in row[j + 3].split(' or ')
                    ]
                else:
                    forms = [value.strip()]
                    cogids = [str(int(float(row[j + 3].split(' or ')[0])))]
                for form, cogid in zip(forms, cogids):
                    try:
                        # NOTE(review): mapper keys are NFD-normalized but
                        # this lookup uses the raw form -- confirm the raw
                        # CSV is already NFD.
                        segments = mapper[concept, languages[language], form]
                        lexeme = args.writer.add_form_with_segments(
                            Parameter_ID=concepts[number],
                            Language_ID=languages[language],
                            Value=value.strip(),
                            Form=form,
                            Segments=segments,
                            Source="Deepadung2015")
                    except KeyError:
                        # no segmented counterpart: fall back to the raw form
                        args.log.warn(
                            'lexeme missing {0} / {1} / {2}'.format(
                                concept, language, form))
                        lexeme = args.writer.add_form(
                            Parameter_ID=concepts[number],
                            Language_ID=languages[language],
                            Value=value.strip(),
                            Form=form,
                            Source="Deepadung2015")
                    args.writer.add_cognate(lexeme=lexeme,
                                            Cognateset_ID=cogid + '-' + number,
                                            Source="Deepadung2015")
def setUp(self):
    """Load the KSL reference wordlist."""
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('KSL.qlc'))
class TestWordlist(WithTempDir):
    """Tests for :class:`Wordlist` (older API: ``loans``/``fuzzy`` kwargs)."""

    def setUp(self):
        """Load the KSL reference wordlist."""
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('KSL.qlc'))

    def test___len__(self):
        assert len(self.wordlist) == 1400

    def test_calculate(self):
        self.wordlist.calculate('dst')
        assert hasattr(self.wordlist, 'distances')
        # self-distances on the diagonal must all be zero
        assert sum([
            self.wordlist.distances[x][x]
            for x in range(self.wordlist.width)
        ]) == 0
        self.wordlist.calculate('tree')
        assert sorted(self.wordlist.tree.taxa) == sorted(self.wordlist.cols)
        self.wordlist.calculate('groups')
        assert hasattr(self.wordlist, 'groups')
        assert type(self.wordlist.groups) == dict

    def test_get_list(self):
        gerL = self.wordlist.get_list(col='German', entry='ipa', flat=True)
        gerD = self.wordlist.get_dict(col='German', entry='ipa')
        assert sorted(gerL) == sorted([v[0] for v in gerD.values()])

    def test_get_dict(self):
        gerD = self.wordlist.get_dict(col='German')
        assert sorted(gerD.keys()) == sorted(self.wordlist.rows)

    def test_renumber(self):
        self.wordlist.renumber('cogid', 'dummy')
        ger1 = self.wordlist.get_list(col='German', entry='cogid', flat=True)
        ger2 = self.wordlist.get_list(col='German', entry='dummy', flat=True)
        assert len(set(ger1)) == len(set(ger2))
        # renumbered ids are plain ints
        assert sum([1 for x in ger2 if type(x) == int]) == len(ger2)

    def test_get_entries(self):
        ger = self.wordlist.get_entries('cogid')
        assert len(ger) == self.wordlist.height
        assert len(ger[0]) == self.wordlist.width

    # NOTE(review): missing the ``test_`` prefix, so test runners never
    # execute this method -- confirm whether that is intentional.
    def get_etymdict(self):
        etd1 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          loans=False)
        etd2 = self.wordlist.get_etymdict(ref='cogid', entry='ipa',
                                          loans=True)
        # loans=True merges positive/negative (borrowed) cognate ids
        assert len(etd1) > len(etd2) and len(set([abs(x) for x in etd1])) == \
            len(etd2)
        assert len([x for x in etd2 if x < 0]) == 0
        # make "fuzzy" cognate sets
        self.wordlist.add_entries('fuzzyid', 'cogid', lambda x: [x])
        etd3 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                                          loans=False, fuzzy=True)
        etd4 = self.wordlist.get_etymdict(ref='fuzzyid', entry='ipa',
                                          loans=True, fuzzy=True)
        for key in etd1:
            assert etd1[key] == etd3[key]
        for key in etd2:
            assert etd2[key] == etd4[key]

    def test_get_paps(self):
        paps = self.wordlist.get_paps(ref="cogid", loans=True)
        cogs = self.wordlist.get_etymdict(ref="cogid", loans=True)
        # every cognate set (made positive) must show up in the paps
        for key in cogs:
            if abs(key) in paps:
                assert True
            else:
                print(key)
                assert False

    def test_output(self):
        fn = text_type(self.tmp_path('test'))
        for fmt in 'taxa tre dst starling paps.nex paps.csv'.split():
            # starling export needs a reference column
            kw = {'ref': 'word'} if fmt == 'starling' else {}
            self.wordlist.output(fmt, filename=fn, **kw)