def test_class2tokens(self):
    """Check that gaps in a class string are projected onto the tokens."""
    sound_classes = 'T-VKTV-R'
    segments = 'tʰ ɔ x t ə r'.split(' ')
    aligned = class2tokens(sound_classes, segments)
    # also exercise nested-list input together with the local-alignment mode
    _ = class2tokens([['T'], ['-VKTV-'], ['R']], 'th o x t e r'.split(),
                     local=True)
    assert aligned[1] == '-' and aligned[-2] == '-'
def test_class2tokens():
    """Gaps in the class string must show up as '-' in the token output."""
    aligned = class2tokens('T-VKTV-R', 'tʰ ɔ x t ə r'.split(' '))
    assert aligned[1] == '-' and aligned[-2] == '-'
def get_correspondences(alms, ref='lexstatid'):
    """
    Compute sound correspondences for a given set of aligned cognates.

    Parameters
    ----------
    alms : :py:class:`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.

    Returns
    -------
    (corrs, occs) : tuple of dict
        ``corrs`` maps each character string (``taxon.segment.class``) to a
        dict counting the characters that co-occur with it in the same
        alignment column; ``occs`` maps each character to the list of
        (escaped) concepts it occurs in.

    Notes
    -----
    As a side effect, a character matrix is stored under ``'_charmat'`` in
    each MSA of ``alms.msa[ref]``.
    """
    corrs = {}  # character -> {co-occurring character: count}
    occs = {}   # character -> [concept, ...]

    for key, msa in alms.msa[ref].items():
        idxs = msa['ID']
        taxa = msa['taxa']
        # NOTE(review): cgi.escape was removed in Python 3.8; html.escape
        # (with quote=True) is the modern replacement -- confirm target
        # runtime before switching, as html.escape also escapes "'".
        concept = cgi.escape(alms[idxs[0], 'concept'], True)

        # prefer the numerical representation of the alignments if available
        if 'numbers' in alms.header:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        character_matrix = []

        for i, taxon in enumerate(taxa):
            nums = alignment[i]
            chars = []

            for j, num in enumerate(nums):
                # the full alignment column at position j
                col = [alm[j] for alm in alignment]

                if num != '-':
                    charA = dotjoin(taxa[i], msa['alignment'][i][j],
                                    num.split('.')[2])
                    chars.append(charA)
                    occs.setdefault(charA, []).append(concept)
                else:
                    chars.append('-')

                for k, numB in enumerate(col):
                    # only count pairs of real segments from distinct taxa;
                    # gap/gap and gap/segment pairs carry no correspondence
                    if k != i and num != '-' and numB != '-':
                        charB = dotjoin(taxa[k], msa['alignment'][k][j],
                                        numB.split('.')[2])
                        bucket = corrs.setdefault(charA, {})
                        bucket[charB] = bucket.get(charB, 0) + 1

            character_matrix.append(chars)

        # append character matrix to alignments
        alms.msa[ref][key]['_charmat'] = character_matrix

    return corrs, occs
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : int (default=1)
        Weight applied when one of the two compared cells is a gap.

    Returns
    -------
    jsond : dict
        Maps each character to a pair ``[html_rows, occurrences]``, where
        ``html_rows`` is a string of HTML ``<tr>`` rows for its best
        correspondences and ``occurrences`` its list of concepts.
    """
    # store all values for average scores
    values = []
    # store all correspondences
    corrs = {}
    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        # NOTE(review): cgi.escape was removed in Python 3.8 --
        # html.escape(..., quote=True) is the modern replacement.
        concept = cgi.escape(alms[idxs[0], 'concept'], True)

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new array for confidence
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # get the numerical sequence
            nums = alignment[i]

            # store confidences per line
            confidences = []
            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(nums):
                # the full alignment column at position j
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char
                if num != '-':
                    charA = dotjoin(taxa[i], msa['alignment'][i][j],
                                    num.split('.')[2])
                    chars += [charA]
                    try:
                        occs[charA] += [concept]
                    except:
                        occs[charA] = [concept]
                else:
                    chars += ['-']

                for k, numB in enumerate(col):
                    if k != i:
                        if num == '-' and numB == '-':
                            # gap against gap carries no information
                            pass
                        else:
                            if numB != '-' and num != '-':
                                # get the second char and count the
                                # correspondence pair
                                charB = dotjoin(
                                    taxa[k], msa['alignment'][k][j],
                                    numB.split('.')[2])
                                try:
                                    corrs[charA][charB] += 1
                                except:
                                    try:
                                        corrs[charA][charB] = 1
                                    except:
                                        corrs[charA] = {charB: 1}

                            # replace a gap by the taxon's gap character so
                            # the scorer can be queried
                            gaps = False
                            if num == '-' and numB != '-':
                                numA = charstring(idx)
                                gaps = True
                            elif numB == '-' and num != '-':
                                numB = charstring(alms.taxa.index(taxa[k]))
                                numA = num
                                gaps = True
                            else:
                                numA = num

                            # scorer may be asymmetric: take the better of
                            # the two lookup directions
                            scoreA = scorer[numA, numB]
                            scoreB = scorer[numB, numA]
                            this_score = max(scoreA, scoreB)

                            if not gaps:
                                score += this_score
                                count += 1
                            else:
                                score += this_score * gap_weight
                                count += gap_weight

                if count:
                    score = score / count
                else:
                    # no comparable cells in this column: fixed penalty
                    score = -25

                confidences += [int(score + 0.5)]
                values += [int(score + 0.5)]

            confidence_matrix += [confidences]
            character_matrix += [chars]

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values (1 is forced in as the pivot between the two scales)
    values = sorted(set(values + [1]))

    # make conversion to scale of 100 values: scores below 1 are squeezed
    # into 0..50, scores from 1 upwards into 50..100
    converter = {}
    valsA = values[:values.index(1)]
    valsB = values[values.index(1):]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):  # values[:values.index(0)):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again and rescale every confidence cell in place
    for key, msa in alms.msa[ref].items():
        # get basic stuff
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    jsond = {}
    for key, corr in corrs.items():
        # split each correspondence back into (taxon, segment, class, count)
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))
        new_sorts = []

        # check for rowspan: keep at most 3 rows per taxon, and only
        # correspondences attested more than once
        spans = {}
        for a, b, c, d in sorts:
            if a in spans:
                if spans[a] < 3 and d > 1:
                    spans[a] += 1
                    new_sorts += [[a, b, c, d]]
            else:
                if d > 1:
                    spans[a] = 1
                    new_sorts += [[a, b, c, d]]

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            new_lang = a
            if new_lang != old_lang:
                # first row for this taxon: include the rowspan cell
                old_lang = new_lang
                tmp = '<tr class="display">'
                tmp += '<td class="display" rowspan={0}>'.format(spans[a])
                tmp += a + '</td>'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">'
                tmp += c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'

                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1
            elif counter > 0:
                # continuation row for the same taxon: no rowspan cell
                tmp = '<tr class="display">'
                tmp += '<td class="display" onclick="show({0});"><span '.format(
                    "'" + dotjoin(a, b, c) + "'")
                tmp += 'class="char {0}">' + b + '</span></td>'
                tmp += '<td class="display">' + c + '</td>'
                tmp += '<td class="display">' + str(d) + '</td>'
                tmp += '<td class="display">' + str(len(occs[dotjoin(a, b, c)])) + '</td>'
                tmp += '</tr>'

                t = 'dolgo_' + token2class(b, rcParams['dolgo'])

                # bad check for three classes named differently
                if t == 'dolgo__':
                    t = 'dolgo_X'
                elif t == 'dolgo_1':
                    t = 'dolgo_TONE'
                elif t == 'dolgo_0':
                    t = 'dolgo_ERROR'

                bestis += [tmp.format(t)]
                counter += 1
                old_lang = new_lang
            else:
                old_lang = new_lang
                counter = 0

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
def align(self, **keywords):
    """
    Align a pair of sequences or multiple sequence pairs.

    Parameters
    ----------
    gop : int (default=-1)
        The gap opening penalty (GOP).
    scale : float (default=0.5)
        The gap extension penalty (GEP), calculated with help of a scaling
        factor.
    mode : {"global","local","overlap","dialign"}
        The alignment mode, see :evobib:`List2012a` for details.
    factor : float (default=0.3)
        The factor by which matches in identical prosodic position are
        increased.
    restricted_chars : str (default="T_")
        The restricted chars that function as an indicator of syllable or
        morpheme breaks for secondary alignment, see :evobib:`List2012c` for
        details.
    distance : bool (default=False)
        If set to *True*, return the distance instead of the similarity
        score. Distance is calculated using the formula by
        :evobib:`Downey2008`.
    model : { None, ~lingpy.data.model.Model }
        Specify the sound class model that shall be used for the analysis.
        If no model is specified, the default model of :evobib:`List2012a`
        will be used.
    pprint : bool (default=False)
        If set to *True*, the alignments are printed to the screen.
    """
    setdefaults(
        keywords,
        gop=-1,
        scale=0.5,
        mode='global',
        factor=0.3,
        restricted_chars='T_',
        distance=False,
        model=rcParams['sca'],
        pprint=False,
        transform=rcParams['align_transform'])

    # (re-)initialize the sound-class model when it is missing or differs
    # from the requested one
    if not hasattr(self, 'model') or keywords['model'] != self.model:
        self._set_model(**keywords)

    # run the pairwise alignment on the numerical class representation
    self._alignments = calign.align_pairs(
        self.classes,
        self.weights,
        self.prostrings,
        keywords['gop'],
        keywords['scale'],
        keywords['factor'],
        self.scoredict,
        keywords['mode'],
        keywords['restricted_chars'],
        distance=1 if keywords['distance'] else 0)

    # project the class-level alignments back onto the original tokens
    is_local = keywords['mode'] == "local"
    self.alignments = [
        (class2tokens(self.tokens[pos][0], almA, local=is_local),
         class2tokens(self.tokens[pos][1], almB, local=is_local),
         sim)
        for pos, (almA, almB, sim) in enumerate(self._alignments)]

    # print the alignments, if this is chosen
    as_string(self, pprint=keywords['pprint'])
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : int (default=1)
        Weight applied when one of the two compared cells is a gap.

    Notes
    -----
    Raw per-cell scores are stored under ``'confidence'`` in each MSA of
    ``alms.msa[ref]`` and then rescaled in place onto a 0-100 scale.
    """
    # store all values for average scores
    values = []

    for key, msa in alms.msa[ref].items():
        idxs = msa['ID']
        taxa = msa['taxa']

        # prefer the numerical representation of the alignments when a
        # scorer is given, since scorer keys are the numerical characters
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new array for confidence
        confidence_matrix = []

        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1
            nums = alignment[i]
            confidences = []

            for j, num in enumerate(nums):
                # the full alignment column at position j
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                for k, numB in enumerate(col):
                    if k == i:
                        continue
                    if num == '-' and numB == '-':
                        # gap against gap carries no information
                        continue

                    # substitute gaps with the taxon's gap character so the
                    # scorer can be queried
                    gaps = False
                    if num == '-' and numB != '-':
                        numA = str(idx) + '.X.-'
                        gaps = True
                    elif numB == '-' and num != '-':
                        numB = str(alms.taxa.index(taxa[k])) + '.X.-'
                        numA = num
                        gaps = True
                    else:
                        numA = num

                    # scorer may be asymmetric: take the better direction
                    scoreA = scorer[numA, numB]
                    scoreB = scorer[numB, numA]
                    this_score = max(scoreA, scoreB)

                    if not gaps:
                        score += this_score
                        count += 1
                    else:
                        score += this_score * gap_weight
                        count += gap_weight

                if count:
                    score = score / count
                else:
                    # no comparable cells in this column: fixed penalty
                    score = -25

                confidences += [int(score + 0.5)]
                values += [int(score + 0.5)]

            confidence_matrix += [confidences]

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix

    # sort the values
    values = sorted(set(values))

    # make conversion to scale of 100 values.
    # BUGFIX: step by the *rank* i of each score, not by the raw score,
    # which can be negative (e.g. the -25 sentinel) and would yield values
    # far outside 0..100; this matches the rank-based conversion used by the
    # HTML-emitting variant of this function.
    converter = {}
    step = 100 / (len(values) + 1)
    for i, score in enumerate(values):
        converter[score] = int(step * i + 0.5)

    # iterate over keys again and rescale each cell in place.
    # BUGFIX: the original assigned the converted scalar to
    # ``msa['confidence']``, destroying the whole matrix on the first cell;
    # the cell itself must be updated instead.
    for key, msa in alms.msa[ref].items():
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                msa['confidence'][i][j] = converter[cell]
def align(self, **keywords):
    """
    Align a pair of sequences or multiple sequence pairs.

    Parameters
    ----------
    gop : int (default=-1)
        The gap opening penalty (GOP).
    scale : float (default=0.5)
        The gap extension penalty (GEP), calculated with help of a scaling
        factor.
    mode : {"global","local","overlap","dialign"}
        The alignment mode, see :evobib:`List2012a` for details.
    factor : float (default=0.3)
        The factor by which matches in identical prosodic position are
        increased.
    restricted_chars : str (default="T_")
        The restricted chars that function as an indicator of syllable or
        morpheme breaks for secondary alignment, see :evobib:`List2012c` for
        details.
    distance : bool (default=False)
        If set to *True*, return the distance instead of the similarity
        score. Distance is calculated using the formula by
        :evobib:`Downey2008`.
    model : { None, ~lingpy.data.model.Model }
        Specify the sound class model that shall be used for the analysis.
        If no model is specified, the default model of :evobib:`List2012a`
        will be used.
    pprint : bool (default=False)
        If set to *True*, the alignments are printed to the screen.
    """
    setdefaults(
        keywords,
        gop=-1,
        scale=0.5,
        mode='global',
        factor=0.3,
        restricted_chars='T_',
        distance=False,
        model=rcParams['sca'],
        pprint=False,
        transform=rcParams['align_transform'])

    # (re-)initialize the sound-class model if it is missing or differs
    # from the requested one
    if hasattr(self, 'model'):
        if keywords['model'] != self.model:
            self._set_model(**keywords)
    else:
        self._set_model(**keywords)

    # create the alignments array (pairwise alignment on the numerical
    # class representation)
    self._alignments = calign.align_pairs(
        self.classes,
        self.weights,
        self.prostrings,
        keywords['gop'],
        keywords['scale'],
        keywords['factor'],
        self.scoredict,
        keywords['mode'],
        keywords['restricted_chars'],
        distance=1 if keywords['distance'] else 0)

    # switch back to alignments: project the class-level alignments onto
    # the original token sequences
    self.alignments = []
    for i, (almA, almB, sim) in enumerate(self._alignments):
        self.alignments.append((
            class2tokens(self.tokens[i][0], almA,
                         local=keywords['mode'] == "local"),
            class2tokens(self.tokens[i][1], almB,
                         local=keywords['mode'] == "local"),
            sim))

    # print the alignments, if this is chosen
    as_string(self, pprint=keywords['pprint'])