def calculate_scores(*languages):
    """Compute pairwise LexStat similarity scores for the given languages
    and persist them to the database.

    Builds a LingPy ``Partial`` wordlist from all ``Form`` rows of the
    given languages, trains a scorer on it, stores the serialized scorer
    as a ``Scorer`` row, and stores one ``Similarity`` row per form pair.

    NOTE(review): relies on a module-level SQLAlchemy ``session`` and the
    ORM classes ``Form``, ``Scorer``, ``Similarity`` — not visible here.
    """
    # Row 0 is LingPy's mandatory header row; subsequent integer keys are
    # the wordlist entries.
    pair_of_languages = {
        0: ["backreference", "doculect", "concept", "ipa", "tokens"]
    }
    for l in languages:
        # len(pair_of_languages) offsets the new ids past the rows added
        # for the previous languages (the dict is only updated after the
        # comprehension completes, so the offset is stable per language).
        pair_of_languages.update({
            i + len(pair_of_languages): [
                form.id,
                form.language_id,
                form.concept_id,
                form.transcription,
                form.soundclasses.split()
            ]
            for i, form in enumerate(
                session.query(Form).filter_by(language=l))
        })
    lex = Partial(pair_of_languages,
                  model=lingpy.data.model.Model("asjp"),
                  check=True, apply_checks=True)
    lex.get_scorer(runs=10000, ratio=(3, 1), threshold=0.7)
    # This does not generalize to non-two languages yet
    session.add(
        Scorer(language1=languages[0], language2=languages[1],
               scorer=scorer2str(lex.bscorer)))
    # _get_matrices is a private LingPy API; it yields, per concept, the
    # forms sharing that concept and their pairwise distance matrix.
    for concept, forms, matrix in lex._get_matrices(method='lexstat',
                                                    scale=0.5,
                                                    factor=0.3,
                                                    restricted_chars="_T",
                                                    mode="overlap",
                                                    gop=-2,
                                                    restriction=""):
        for (i1, f1), (i2, f2) in itertools.combinations(
                enumerate(forms), 2):
            f1 = lex[f1][0]  # Index 0 contains the 'backref', ie. our ID
            f2 = lex[f2][0]  # Index 0 contains the 'backref', ie. our ID
            session.add(
                Similarity(form1_id=f1, form2_id=f2,
                           score=matrix[i1][i2]))
    session.commit()
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------
    model : str
        A string indicating the name of the model which shall be created.
    path : str
        A string indicating the path where the model-folder is stored.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains three files: the file :file:`converter`, the file
    :file:`INFO`, and the optional file :file:`scorer`. The format
    requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key
        is preceded by an ``@`` and followed by a colon and the value is
        written right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, precede by the key (preferably an ASCII-letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary. The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt
    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models (a plain def instead of an assigned lambda,
    # per PEP 8 E731; behavior is identical)
    def new_path(*cmps):
        return os.path.join(path or util.data_path('models'), model, *cmps)

    # FIX: previously logged the function object itself; log the resolved
    # model directory instead.
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance: collect the distinct
        # class characters and fill a full (redundant) matrix
        chars = sorted({s[0] for s in score_dict})
        matrix = [[0 for _ in chars] for _ in chars]
        for (i, charA), (j, charB) in util.multicombinations2(
                enumerate(chars)):
            if i < j:
                # -100 is the penalty for unconnected class pairs
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        # persist the derived matrix so it can be re-read directly next time
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.

    Parameters
    ----------
    header : sequence of str
        Column names written as the header row of the output file.
    data : dict
        Maps row keys to sequences of cell values (one per header column).
    filename : str
        Basename of the output file; the extension is taken from the
        ``fileformat`` keyword. Falls back to ``rcParams['filename']``.
    formatter : str
        A column name (or two comma-separated column names) used to sort
        the rows and to insert ``#`` separator lines between groups.
    **keywords
        ``ignore`` (list of meta sections to skip, or ``'all'``),
        ``fileformat`` (default ``'qlc'``), ``prettify`` (default True),
        ``meta`` (dict of meta data sections), ``stamp`` (trailing text).
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()

    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})

    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works; values that json cannot
            # serialize are silently dropped from the output
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    # NOTE(review): ``taxa`` is never assigned a non-empty value above, so
    # this branch is currently dead code — confirm before removing.
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"
    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'

    if trees:
        out += '\n# TREES\n' + trees

    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(),
                             key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        # NOTE(review): when no formatter matched, idx is False; since
        # False == 0, ``False in range(len(line))`` is True for non-empty
        # lines, so grouping silently falls back to the first column.
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += str(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                # FIX: narrow the former bare ``except:`` — ' '.join only
                # raises TypeError for non-string items.
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([str(v) for v in value])
            elif type(value) == int:
                out += '\t' + str(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
def test_scorer2str(self):
    """
    Test conversion of scorers to strings.
    """
    serialized = scorer2str(lingpy.rc('dolgo').scorer)
    expected = read_text_file(test_data('dolgo.scorer'))
    self.assertEqual(serialized, expected)
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.

    Parameters
    ----------
    header : sequence of str
        Column names written as the header row of the output file.
    data : dict
        Maps row keys to sequences of cell values (one per header column).
    filename : str
        Basename of the output file; the extension is taken from the
        ``fileformat`` keyword. Falls back to ``rcParams['filename']``.
    formatter : str
        A column name (or two comma-separated column names) used to sort
        the rows and to insert ``#`` separator lines between groups.
    **keywords
        ``ignore`` (list of meta sections to skip, or ``'all'``),
        ``fileformat`` (default ``'qlc'``), ``prettify`` (default True),
        ``meta`` (dict of meta data sections), ``stamp`` (trailing text).
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()

    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})

    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works; values that json cannot
            # serialize are silently dropped from the output
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    # NOTE(review): ``taxa`` is never assigned a non-empty value above, so
    # this branch is currently dead code — confirm before removing.
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"
    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'

    if trees:
        out += '\n# TREES\n' + trees

    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        # NOTE(review): when no formatter matched, idx is False; since
        # False == 0, ``False in range(len(line))`` is True for non-empty
        # lines, so grouping silently falls back to the first column.
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += text_type(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                # FIX: narrow the former bare ``except:`` — ' '.join only
                # raises TypeError for non-string items.
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([text_type(v) for v in value])
            elif type(value) == int:
                out += '\t' + text_type(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
def test_scorer2str(test_data):
    """Check that the 'dolgo' scorer serializes to its stored reference."""
    expected = read_text_file(str(test_data / 'dolgo.scorer'))
    actual = scorer2str(rc('dolgo').scorer)
    assert actual == expected
def test_scorer2str(self):
    """
    Test conversion of scorers to strings.
    """
    actual = scorer2str(rc('dolgo').scorer)
    expected = read_text_file(test_data('dolgo.scorer'))
    self.assertEqual(actual, expected)