def test_dotjoin(self):
    """Check ``util.dotjoin`` with positional, sequence and generator input."""
    join = util.dotjoin
    joined = '1.2'

    # separate positional arguments and already materialised sequences
    self.assertEqual(join(1, 2), joined)
    self.assertEqual(join([1, 2]), joined)
    self.assertEqual(join((1, 2)), joined)

    # a generator whose items are filtered through ``condition``
    filtered = join(
        (number for number in range(1, 3)),
        condition=lambda j: j > 1)
    self.assertEqual(filtered, '2')

    # a bare generator expression passed as the only argument
    self.assertEqual(join(number for number in range(1, 3)), joined)
def _make_graph(colexifications, bipartite=False):
    """
    Return a graph-object from colexification data.

    Parameters
    ----------
    colexifications : iterable
        Records of five items ``(c1, c2, t, f, entry)``. ``c1`` and ``c2``
        become concept nodes; ``t``, ``f`` and ``entry`` are stored as
        doculect, family and word respectively.
    bipartite : bool (default=False)
        If false, build a concept graph in which every edge accumulates the
        families, doculects and words of all records linking its two
        concepts. If true, build a graph with one additional "word" node per
        record, linked to both of its concepts.

    Returns
    -------
    G : networkx.Graph
        The resulting graph.

    Notes
    -----
    Edge data is accessed through ``G[u][v]`` and ``G.has_edge`` instead of
    the ``G.edge`` attribute, which was removed in networkx 2.0. The former
    bare ``except:`` hid the resulting error and silently overwrote the edge
    attributes instead of extending them.
    """
    G = nx.Graph()
    if not bipartite:
        for c1, c2, t, f, entry in colexifications:
            if G.has_edge(c1, c2):
                data = G[c1][c2]
                data['families'].append(f)
                data['doculects'].append(t)
                data['words'].append(entry)
            else:
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(c1, c2, families=[f], doculects=[t], words=[entry])
        for _, _, d in G.edges(data=True):
            d['familyWeight'] = len(set(d['families']))
            d['wordWeight'] = len(d['words'])
            d['doculectWeight'] = len(set(d['doculects']))
            # 'families' keeps the raw list; the deduplicated, sorted list is
            # stored under the singular key 'family'.
            d['family'] = sorted(set(d['families']))
            d['doculects'] = sorted(set(d['doculects']))
    else:
        for idx, (c1, c2, t, f, entry) in enumerate(colexifications):
            # the word node is named after the doculect and the 1-based
            # position of the record
            nindex = dotjoin(t, idx + 1)
            if G.has_edge(nindex, c1) and G.has_edge(nindex, c2):
                G[nindex][c1]['weight'] += 1
                G[nindex][c2]['weight'] += 1
            else:
                G.add_node(nindex, ntype='word', entry=entry, doculect=t,
                           family=f)
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(nindex, c1, weight=1)
                G.add_edge(nindex, c2, weight=1)
    return G
def _make_graph(colexifications, bipartite=False):
    """
    Return a graph-object from colexification data.

    Parameters
    ----------
    colexifications : iterable
        Records of five items ``(c1, c2, t, f, entry)``. ``c1`` and ``c2``
        become concept nodes; ``t``, ``f`` and ``entry`` are stored as
        doculect, family and word respectively.
    bipartite : bool (default=False)
        If false, edges connect concepts directly and collect the families,
        doculects and words of every record that links the two concepts. If
        true, each record gets its own "word" node which is connected to
        both concepts.

    Returns
    -------
    G : networkx.Graph
        The resulting graph.

    Notes
    -----
    Existing edges are detected with ``G.has_edge`` and modified through
    ``G[u][v]``. The ``G.edge`` attribute used before is gone since
    networkx 2.0, and a bare ``except:`` around it masked that failure.
    """
    G = nx.Graph()

    if bipartite:
        for idx, (c1, c2, t, f, entry) in enumerate(colexifications):
            # word nodes are labelled by doculect and 1-based record position
            nindex = dotjoin(t, idx + 1)
            if G.has_edge(nindex, c1) and G.has_edge(nindex, c2):
                G[nindex][c1]['weight'] += 1
                G[nindex][c2]['weight'] += 1
                continue
            G.add_node(nindex, ntype='word', entry=entry, doculect=t,
                       family=f)
            G.add_node(c1, ntype='concept')
            G.add_node(c2, ntype='concept')
            G.add_edge(nindex, c1, weight=1)
            G.add_edge(nindex, c2, weight=1)
        return G

    for c1, c2, t, f, entry in colexifications:
        if G.has_edge(c1, c2):
            attributes = G[c1][c2]
            attributes['families'].append(f)
            attributes['doculects'].append(t)
            attributes['words'].append(entry)
            continue
        G.add_node(c1, ntype='concept')
        G.add_node(c2, ntype='concept')
        G.add_edge(c1, c2, families=[f], doculects=[t], words=[entry])

    # derive the weights once all records have been collected
    for _, _, d in G.edges(data=True):
        d['familyWeight'] = len(set(d['families']))
        d['wordWeight'] = len(d['words'])
        d['doculectWeight'] = len(set(d['doculects']))
        # the raw list stays in 'families'; 'family' holds the sorted,
        # deduplicated version
        d['family'] = sorted(set(d['families']))
        d['doculects'] = sorted(set(d['doculects']))
    return G
def get_correspondences(alms, ref='lexstatid'):
    """
    Compute sound correspondences for a given set of aligned cognates.

    Parameters
    ----------
    alms : object
        Must provide ``alms.msa[ref]`` (a dict of alignments, each with the
        keys ``'ID'``, ``'taxa'`` and ``'alignment'``), ``alms.header`` and
        item access of the form ``alms[idx, column]``.
    ref : str (default="lexstatid")
        Key selecting the set of alignments in ``alms.msa``.

    Returns
    -------
    corrs : dict
        Maps each character label to a dict counting how often every other
        character label occurs in the same alignment column.
    occs : dict
        Maps each character label to the list of (HTML-escaped) concepts in
        which it occurs, one entry per occurrence.

    Notes
    -----
    As a side effect, the matrix of character labels is stored under
    ``alms.msa[ref][key]['_charmat']`` for each alignment. A character label
    is ``dotjoin(taxon, segment, x)``, where ``x`` is the third dot-separated
    field of the numeric token. On Python 3 the concept is escaped with
    ``html.escape`` (``cgi.escape`` was removed in Python 3.8), which also
    escapes single quotes.
    """
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    # store all correspondences
    corrs = {}
    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        concept = escape(alms[idxs[0], 'concept'], True)

        # get numerical representation of alignments
        if 'numbers' in alms.header:
            alignment = [
                class2tokens(alms[row_id, 'numbers'], msa['alignment'][i])
                for i, row_id in enumerate(idxs)]
        else:
            alignment = msa['alignment']

        character_matrix = []
        for i, taxon in enumerate(taxa):
            chars = []
            for j, num in enumerate(alignment[i]):
                # gaps carry no character and take part in no correspondence
                if num == '-':
                    chars.append('-')
                    continue

                charA = dotjoin(
                    taxon, msa['alignment'][i][j], num.split('.')[2])
                chars.append(charA)
                occs.setdefault(charA, []).append(concept)

                # count every non-gap character of the other rows in column j
                for k, other in enumerate(alignment):
                    numB = other[j]
                    if k == i or numB == '-':
                        continue
                    charB = dotjoin(
                        taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                    partners = corrs.setdefault(charA, {})
                    partners[charB] = partners.get(charB, 0) + 1
            character_matrix.append(chars)

        # append character matrix to alignments
        alms.msa[ref][key]['_charmat'] = character_matrix

    return corrs, occs
def _confidence_table_row(lang, char, cls, freq, n_occs, rowspan=None):
    """Render one ``<tr>`` of the correspondence table as an HTML string."""
    css = 'dolgo_' + token2class(char, rcParams['dolgo'])
    # bad check for three classes named differently
    if css == 'dolgo__':
        css = 'dolgo_X'
    elif css == 'dolgo_1':
        css = 'dolgo_TONE'
    elif css == 'dolgo_0':
        css = 'dolgo_ERROR'

    row = '<tr class="display">'
    if rowspan is not None:
        # only the first row of a language carries the language cell
        row += '<td class="display" rowspan={0}>'.format(rowspan)
        row += lang + '</td>'
    row += '<td class="display" onclick="show({0});"><span '.format(
        "'" + dotjoin(lang, char, cls) + "'")
    # the class name is concatenated directly rather than running
    # ``str.format`` over the finished row, so braces in the data are not
    # misread as format fields
    row += 'class="char ' + css + '">' + char + '</span></td>'
    row += '<td class="display">' + cls + '</td>'
    row += '<td class="display">' + str(freq) + '</td>'
    row += '<td class="display">' + str(n_occs) + '</td>'
    row += '</tr>'
    return row


def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : number (default=1)
        Factor applied to the score of a comparison involving a gap, and the
        amount by which such a comparison increases the divisor of the
        column average.

    Returns
    -------
    jsond : dict
        Maps every character label that has correspondences to a two-item
        list: the HTML table rows describing its most frequent
        correspondences, and the list of concepts in which it occurs.

    Notes
    -----
    As a side effect, ``alms.msa[ref][key]['confidence']`` (scores rescaled
    via a rank-based conversion table) and ``alms.msa[ref][key]['_charmat']``
    (character labels) are set for every alignment. On Python 3 the concept
    is escaped with ``html.escape`` (``cgi.escape`` was removed in
    Python 3.8), which also escapes single quotes.
    """
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    # store all values for average scores
    values = []
    # store all correspondences
    corrs = {}
    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']
        concept = escape(alms[idxs[0], 'concept'], True)

        # get numerical representation of alignments
        if scorer:
            alignment = [
                class2tokens(alms[row_id, 'numbers'], msa['alignment'][i])
                for i, row_id in enumerate(idxs)]
        else:
            alignment = msa['alignment']

        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1
            confidences = []
            chars = []

            # iterate over the numerical sequence
            for j, num in enumerate(alignment[i]):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char
                if num != '-':
                    charA = dotjoin(
                        taxon, msa['alignment'][i][j], num.split('.')[2])
                    chars.append(charA)
                    occs.setdefault(charA, []).append(concept)
                else:
                    chars.append('-')

                for k, numB in enumerate(col):
                    # skip the row itself and gap-gap comparisons
                    if k == i or (num == '-' and numB == '-'):
                        continue

                    if numB != '-' and num != '-':
                        # get the second char
                        charB = dotjoin(
                            taxa[k], msa['alignment'][k][j],
                            numB.split('.')[2])
                        partners = corrs.setdefault(charA, {})
                        partners[charB] = partners.get(charB, 0) + 1

                    gaps = False
                    if num == '-' and numB != '-':
                        numA = charstring(idx)
                        gaps = True
                    elif numB == '-' and num != '-':
                        # NOTE(review): no "+ 1" here, unlike ``idx`` above
                        # -- confirm which taxon index charstring expects.
                        numB = charstring(alms.taxa.index(taxa[k]))
                        numA = num
                        gaps = True
                    else:
                        numA = num

                    # the scorer is queried in both directions
                    this_score = max(scorer[numA, numB], scorer[numB, numA])

                    if not gaps:
                        score += this_score
                        count += 1
                    else:
                        score += this_score * gap_weight
                        count += gap_weight

                # columns without any comparison get a fixed score
                if count:
                    score = score / count
                else:
                    score = -25

                confidences.append(int(score + 0.5))
                values.append(int(score + 0.5))
            confidence_matrix.append(confidences)
            character_matrix.append(chars)

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values; 1 is always present since it is the pivot below
    values = sorted(set(values + [1]))

    # make conversion to scale of 100 values
    converter = {}
    pivot = values.index(1)
    valsA = values[:pivot]
    valsB = values[pivot:]
    # NOTE(review): on Python 2 without ``from __future__ import division``
    # these are integer divisions -- confirm the intended behaviour.
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again and replace raw scores by converted ones
    for key, msa in alms.msa[ref].items():
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    jsond = {}
    for key, corr in corrs.items():
        splits = [c.split('.') + [o] for c, o in corr.items()]
        # by language, most frequent correspondence first
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))

        # check for rowspan: keep at most three correspondences per
        # language, and only those seen more than once
        spans = {}
        new_sorts = []
        for a, b, c, d in sorts:
            if d <= 1:
                continue
            if a not in spans:
                spans[a] = 1
            elif spans[a] < 3:
                spans[a] += 1
            else:
                continue
            new_sorts.append([a, b, c, d])

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            n_occs = len(occs[dotjoin(a, b, c)])
            if a != old_lang:
                # first row of a language carries the spanning language cell
                old_lang = a
                bestis.append(
                    _confidence_table_row(a, b, c, d, n_occs,
                                          rowspan=spans[a]))
                counter += 1
            elif counter > 0:
                bestis.append(_confidence_table_row(a, b, c, d, n_occs))
                counter += 1

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
def test_dotjoin():
    """Check ``util.dotjoin`` with positional, sequence and generator input."""
    join = util.dotjoin
    joined = '1.2'

    # separate positional arguments and already materialised sequences
    assert join(1, 2) == joined
    assert join([1, 2]) == joined
    assert join((1, 2)) == joined

    # a generator whose items are filtered through ``condition``
    filtered = join(
        (number for number in range(1, 3)),
        condition=lambda j: j > 1)
    assert filtered == '2'

    # a bare generator expression passed as the only argument
    assert join(number for number in range(1, 3)) == joined