def test_cv_templates(test_data):
    assert cv_templates(
        Wordlist(str(test_data / 'KSL5.qlc')), 'French', output='markdown')
    patterns, _, sounds = cv_templates(
        Wordlist(str(test_data / 'KSL5.qlc')), 'French', output=None)

def test_plots(self):
    plot_gls(self.gls, self.tree, filename=text_type(self.tmp_path('test')))
    plot_tree(self.tree, filename=text_type(self.tmp_path('test')))
    plot_concept_evolution(self.scenarios, self.tree,
                           filename=text_type(self.tmp_path('test')))

    wl = Wordlist(test_data('KSL.qlc'))
    wl.calculate('tree')
    plot_heatmap(wl, filename=text_type(self.tmp_path('test')),
                 ref="cogid", refB="cogid", steps=1)

def test_colexification_network(test_data, tmppath):
    graph = colexification_network(
        Wordlist(str(test_data / 'colexification.tsv')))
    assert "hand" in graph and "arm" in graph

    graph = colexification_network(
        Wordlist(str(test_data / 'colexification.tsv')), bipartite=True)
    assert 'arm' in graph['l4.4'] and 'hand' in graph['l4.4']

    _ = colexification_network(
        Wordlist(str(test_data / 'colexification.tsv')),
        output="gml", filename=str(tmppath / "test"))

def test_cache(self):
    filename = 'lingpy_test.qlc'
    self.parser.pickle(filename=filename)
    from_cache = QLCParser.unpickle(filename)
    self.assertEqual(self.parser.header, from_cache.header)
    os.remove(str(path(filename)))

    wl = Wordlist(test_data('KSL.qlc'))
    wl.pickle(filename=filename)
    from_cache = Wordlist.unpickle(filename)
    self.assertTrue(from_cache._class)
    os.remove(str(path(filename)))

def test_plots(mocker, Plt, Sch, gls, tree, scenarios, tmppath, test_data):
    mocker.patch('lingpy.convert.plot.mpl', new=mocker.MagicMock())
    mocker.patch('lingpy.convert.plot.plt', new=Plt)
    mocker.patch('lingpy.convert.plot.sch', new=Sch)

    plot_gls(gls, tree, filename=str(tmppath / 'test'))
    plot_tree(tree, filename=str(tmppath / 'test'))
    plot_concept_evolution(scenarios, tree, filename=str(tmppath / 'test'))

    wl = Wordlist(str(test_data / 'KSL.qlc'))
    wl.calculate('tree')
    plot_heatmap(wl, filename=str(tmppath / 'test'),
                 ref="cogid", refB="cogid", steps=1)

def test_colexification_network(self):
    graph = colexification_network(
        Wordlist(test_data('colexification.tsv')))
    assert "hand" in graph and "arm" in graph

    graph = colexification_network(
        Wordlist(test_data('colexification.tsv')), bipartite=True)
    assert 'arm' in graph['l4.4'] and 'hand' in graph['l4.4']

    _ = colexification_network(
        Wordlist(test_data('colexification.tsv')),
        output="gml", filename=text_type(self.tmp_path("test")))

def test_load_from_cldf_metadata(test_data):
    wl = Wordlist.from_cldf(
        str(test_data / 'cldf/test-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())

    assert wl.width == 29
    assert wl.height == 1
    assert wl.entries[0] == 'alignment'
    assert wl.cols[0] == 'anuta'.lower()
    assert wl.cols[28] == 'wallisian'

def test_load_from_cldf_metadatafree(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/forms.csv'),
        col="Language_ID".lower(), row="Parameter_ID".lower())

    assert wl.width == 29
    assert wl.height == 1
    assert wl.entries[0] == 'alignment'
    assert wl.cols[0] == 'anuta'.lower()
    assert wl.cols[28] == 'wallisian'

def test_load_from_cldf_metadata(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/test-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())

    assert wl.width == 29
    assert wl.height == 1
    assert wl.entries[0] == 'alignment'
    assert wl.cols[0] == 'anuta'.lower()
    assert wl.cols[28] == 'wallisian'

def test_load_from_cldf_metadatafree(test_data):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wl = Wordlist.from_cldf(
            str(test_data / 'cldf/forms.csv'),
            col="Language_ID".lower(), row="Parameter_ID".lower())

    assert wl.width == 29
    assert wl.height == 1
    assert wl.entries[0] == 'alignment'
    assert wl.cols[0] == 'anuta'.lower()
    assert wl.cols[28] == 'wallisian'

def filter_wordlist(wordlist, lang1, lang2):
    """
    Expects a Wordlist instance and returns a new one that retains only
    the entries of the two given languages.
    """
    new_data = {}  # the data formatted as LexStat wants it
    new_data[0] = ['doculect', 'concept', 'ipa', 'index', 'tokens']  # header

    key = 1
    for entry in wordlist._data.values():
        if entry[0] in (lang1, lang2):
            new_data[key] = entry
            key += 1

    return Wordlist(new_data)

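# A minimal usage sketch for filter_wordlist. The file name and the two
# doculect labels below are hypothetical placeholders, not taken from the
# code above.
def _demo_filter_wordlist():
    wl = Wordlist('dataset.tsv')  # hypothetical multi-language wordlist
    pair = filter_wordlist(wl, 'German', 'English')
    # the filtered wordlist retains only the two requested doculects
    assert set(pair.cols) <= {'German', 'English'}
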
def make_wordlist(data, dataset_path, schema='ipa'):
    """
    Expects {lang: {gloss: [ipa,]}}; returns a Wordlist instance.
    The last column of the header is needed for the sample ID.
    """
    try:
        tokens = load_tokens(dataset_path, schema)
        assert len(tokens) == len(data)
    except AssertionError:
        raise ValueError('Could not find tokens in {}'.format(dataset_path))

    new_data = {}  # the data formatted as LexStat wants it
    new_data[0] = ['doculect', 'concept', 'ipa', 'index', 'tokens']  # header

    key = 1
    for lang in sorted(data.keys()):
        for gloss in sorted(data[lang].keys()):
            for index, ipa in enumerate(data[lang][gloss]):
                new_data[key] = [lang, gloss, ipa, index + 1]
                new_data[key].append(tokens[lang][gloss][index])
                key += 1

    return Wordlist(new_data)

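# A minimal usage sketch for make_wordlist. The data dict and the dataset
# path are hypothetical; load_tokens is assumed to return a matching
# {lang: {gloss: [tokens,]}} mapping for the same dataset.
def _demo_make_wordlist():
    data = {
        'English': {'hand': ['hænd']},
        'German': {'hand': ['hant']},
    }
    wl = make_wordlist(data, 'datasets/germanic')  # hypothetical path
    assert wl.height == 1  # one concept ('hand') across both doculects
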
def test_load_cldf_and_write(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/test-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())
    wl.output('tsv', filename=str(self.tmp_path('lingpycldf')))

def test_simple_profile(test_data):
    wl = Wordlist(str(test_data / 'KSL6.qlc'))
    prf = list(simple_profile(wl))
    assert ('a', 'a', '7', 'U+0061') in prf

    prf = list(simple_profile(wl, clts={'a': 'A'}))
    assert prf[0][1] == 'A'

def test_load_non_wordlist_cldf(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/non-wordlist-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())

def test_load_noexisting_cldf(test_data):
    with pytest.raises(FileNotFoundError):
        wl = Wordlist.from_cldf(
            str(test_data / 'cldf/test-missing-metadata.json'),
            col="Language_ID".lower(), row="Parameter_ID".lower())

def test_load_non_wordlist_cldf(test_data):
    with pytest.raises(ValueError):
        wl = Wordlist.from_cldf(
            str(test_data / 'cldf/non-wordlist-metadata.json'),
            col="Language_ID".lower(), row="Parameter_ID".lower())

def wordlist(test_data):
    return Wordlist(str(test_data / 'GER.tsv'))

def setUp(self):
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('GER.tsv'))

def test_load_cldf_and_write(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/test-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())
    wl.output('tsv', filename=str(self.tmp_path('lingpycldf')))

def test_load_non_wordlist_cldf(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/non-wordlist-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())

def wordlist(test_data):
    return Wordlist(str(test_data / 'colexification.tsv'))

def test_load_noexisting_cldf(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/test-missing-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())

def wl(test_data):
    return Wordlist(str(test_data / 'KSL5.qlc'))

def test_load_cldf_and_write(test_data, tmppath):
    wl = Wordlist.from_cldf(
        str(test_data / 'cldf/test-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())
    wl.output('tsv', filename=str(tmppath / 'lingpycldf'))

def __init__(self, filepath, ngram_length=2):
    """
    Matrix module to format wordlist data into various 2D matrices.
    """
    # TODO: add a debug parameter

    # Get a Wordlist object given the specified input file.
    self.wl = Wordlist(filepath)
    self.ngram_length = ngram_length

    # Check for language, concept, and counterpart in the Wordlist object;
    # if data is missing, fail.
    self.wl_header = self.wl.header
    print(self.wl_header)

    # TODO: check for the items in the header
    """
    if not all(k in self.wl_header for k in ("doculect", "concept", "orthoparse")):
        print("Matrix module input requires language ('doculect'), "
              "concept ('meaning'), and a qlc-format orthographic parse "
              "of the counterpart ('translation') in the wordlist object")
        sys.exit(1)
    """
    # if not, add one
    # print(self.wl.__getitem__(1))
    # print(self.wl[1, 'orthoparse'])
    # a = lambda x: x.split('a')
    # wl.add_entries('juicyIPA', 'ipa', lambda x: x + x)

    # Data structures to store various counts.
    # No default factory, so this behaves like a plain dict: {"pb": "p_b"}
    self._ngram_to_split_ngram = collections.defaultdict()

    # { word_id : { "#w_id": 1, "wo_id": 1, ...} } -- not ordered
    self._words_ngrams_counts = collections.defaultdict(
        lambda: collections.defaultdict(int))

    # { word_id : ["#w_id", "wo_id", ...] } -- ordered
    self._words_ngrams = collections.defaultdict(list)

    # Data stored: {language: {counterpart: count}}
    self._languages_words_counts = collections.defaultdict(
        lambda: collections.defaultdict(int))

    # Data stored: {concept: {counterpart: count}}
    self._concepts_words_counts = collections.defaultdict(
        lambda: collections.defaultdict(int))

    # Data containers -- using sets to discard duplicate ngrams.
    non_unique_parsed_words = set()
    non_unique_ngrams = set()
    languages = set()
    concepts = set()
    unique_ngrams = set()

    # Loop over the wordlist data and parse it into the data structures.
    for key in self.wl:
        language = self.wl[key, 'doculect']
        language = language.replace("_", "-")  # fix
        concept = self.wl[key, 'concept']
        counterpart = self.wl[key, 'orthoparse']
        # print(taxa, gloss, counterpart)

        # Loop over the corpus reader data and parse it into data structures.
        """
        for language, concept, counterpart in language_concept_counterpart_iterator:
            # First do orthography parsing.
            if gram_type == "graphemes":
                parsed_counterpart_tuple = orthography_parser.parse_string_to_graphemes(counterpart)
            elif gram_type == "phonemes":
                parsed_counterpart_tuple = orthography_parser.parse_string_to_ipa_phonemes(counterpart)
            else:
                sys.exit('\ninvalid gram type: specify "phonemes" or "graphemes"\n')

            # TODO: move this to the orthography parser
            # If the string is unparsable, write it to file.
            if parsed_counterpart_tuple[0] == False:
                invalid_parse_string = qlc.ngram.formatted_string_from_ngrams(parsed_counterpart_tuple[1])
                unparsables.write(language+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string+"\n")
                continue
        """
        # parsed_counterpart = parsed_counterpart_tuple[1]
        counterpart = "# " + counterpart + " #"
        parsed_counterpart = tuple(counterpart.split())

        # Get ngrams as a tuple of tuples.
        # ngram_tuples = qlc.ngram.ngrams_from_graphemes(parsed_counterpart, ngram_length)
        ngram_tuples = ng.ngrams_from_graphemes(parsed_counterpart, ngram_length)

        # Format that tuple of tuples into a space-delimited string.
        ngrams_string = ng.formatted_string_from_ngrams(ngram_tuples)
        # print(ngrams_string)

        # Format the tuple into unigrams split on "_" as a space-delimited string.
        split_ngrams_string = ng.split_formatted_string_from_ngrams(ngram_tuples)
        # print(split_ngrams_string)

        # Check that the ngrams string ("#a ab b#") and the split ngrams
        # string ("#_a a_b b_#") have the same length.
        ngrams_string_list = ngrams_string.split()
        split_ngrams_string_list = split_ngrams_string.split()
        if len(ngrams_string_list) != len(split_ngrams_string_list):
            print("ngrams string and split ngrams string do not match")
            sys.exit(1)

        # Store key-value pairs of ngram and split ngram; for unigrams, store the same.
        for i in range(0, len(ngrams_string_list)):
            if self.ngram_length > 1:
                self._ngram_to_split_ngram[ngrams_string_list[i]] = split_ngrams_string_list[i]
            else:
                self._ngram_to_split_ngram[ngrams_string_list[i]] = ngrams_string_list[i]

        # Get the parsed version of the counterpart.
        parsed_word = ng.formatted_string_from_ngrams(parsed_counterpart)
        # print("og: ", parsed_word)
        parsed_word = parsed_word.replace(" ", "")
        parsed_word = parsed_word.lstrip("#")
        parsed_word = parsed_word.rstrip("#")
        parsed_word = parsed_word.replace("#", " ")
        # print("pg: ", parsed_word)

        # flipped
        # parsed_word_id = parsed_word+"_"+language
        parsed_word_id = language + "_" + parsed_word

        # if parsed_word not in dict:
        if parsed_word_id not in self._words_ngrams_counts:
            for ngram in ngrams_string.split():
                # flipped
                # non_unique_ngram = language+"_"+ngram
                non_unique_ngram = language + "_" + ngram
                non_unique_ngrams.add(non_unique_ngram)
                self._words_ngrams_counts[parsed_word_id][non_unique_ngram] += 1
                self._words_ngrams[parsed_word_id].append(non_unique_ngram)

        # Update the data structures.
        # self._languages_words_counts[language][parsed_word+"_"+language] += 1
        # self._concepts_words_counts[concept][parsed_word+"_"+language] += 1
        # flipped
        self._languages_words_counts[language][language + "_" + parsed_word] += 1
        self._concepts_words_counts[concept][language + "_" + parsed_word] += 1

        # Add to the header lists.
        languages.add(language)  # append language to the unique set of languages
        concepts.add(concept)  # append concept to the unique set of concepts
        unique_ngrams.update(set(ngram_tuples))  # add all elements of ngram_tuples

        # Add to the non-unique header lists.
        # non_unique_parsed_words.add(parsed_word+"_"+language)
        # flipped
        non_unique_parsed_words.add(language + "_" + parsed_word)

    # Listify to sort.
    self.languages = sorted(languages)
    self.concepts = sorted(concepts)
    self.non_unique_parsed_words = sorted(non_unique_parsed_words)
    self.non_unique_ngrams = sorted(non_unique_ngrams)
    self.unique_ngrams = sorted(unique_ngrams)

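# A minimal usage sketch for the constructor above. The class name `Matrix`
# and the input file name are assumptions for illustration; the input
# wordlist must carry 'doculect', 'concept', and 'orthoparse' columns.
def _demo_matrix():
    m = Matrix('wordlist.qlc', ngram_length=2)  # hypothetical input file
    print(m.languages)  # sorted doculect names
    print(m.concepts)  # sorted concept glosses
    print(len(m.unique_ngrams))  # number of distinct ngram tuples
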
def setUp(self):
    self.wl = Wordlist(test_data('KSL5.qlc'))

def test_context_profile(test_data):
    wl = Wordlist(str(test_data / 'KSL6.qlc'))
    prf = list(context_profile(wl))
    assert prf[2][-2] == '4'  # first line of profile

    prf = list(context_profile(wl, clts={'a': 'A'}))
    assert prf[2][1] == 'A'

def setUp(self):
    WithTempDir.setUp(self)
    self.wordlist = Wordlist(test_data('colexification.tsv'))
    self.cols = colx._get_colexifications(self.wordlist)

def test_load_noexisting_cldf(self):
    wl = Wordlist.from_cldf(
        test_data('cldf/test-missing-metadata.json'),
        col="Language_ID".lower(), row="Parameter_ID".lower())