def test_convenience():
    """Call each convenience helper once with dummy arguments."""
    # These four helpers are all invoked with one positional message.
    for emit in (info, warning, debug, error):
        emit('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def test_convenience():
    """Exercise every convenience helper with placeholder arguments."""
    single_message_helpers = [info, warning, debug, error]
    for helper in single_message_helpers:
        helper('m')
    # The remaining helpers are called with their own argument lists.
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def colexification_network(
        wordlist,
        entry='ipa',
        concept='concept',
        output='',
        filename='network',
        bipartite=False,
        **keywords):
    """
    Build a colexification network from a wordlist.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        The wordlist object containing the data.
    entry : str (default="ipa")
        The reference point for the language entry.
    concept : str (default="concept")
        The reference point for the name of the row containing the concepts.
    output : str (default='')
        If empty, the graph is returned. If set to "gml", the graph is
        written to ``filename + '.gml'`` instead.
    filename : str (default='network')
        Base name (without extension) of the GML file.
    bipartite : bool (default=False)
        Passed on to the helper that builds the graph.

    Returns
    -------
    G : networkx.Graph
        The graph, returned only when `output` is empty. In every other case
        the function returns None.
    """
    def _collapse_lists(attributes):
        # Replace every list value by the result of ``join('//', *items)``.
        for key, value in attributes.items():
            if isinstance(value, list):
                attributes[key] = join('//', *value)

    colexified = _get_colexifications(wordlist, entry, concept)
    concept_stats = _get_statistics(wordlist, entry, concept)
    graph = _make_graph(colexified, bipartite=bipartite)

    # attach the per-concept statistics as meta-data to the concept nodes
    for name, attributes in graph.nodes(data=True):
        if attributes['ntype'] == 'concept':
            attributes.update(concept_stats[name])

    if not output:
        return graph

    if output == 'gml':
        target = filename + '.gml'
        for _, attributes in graph.nodes(data=True):
            _collapse_lists(attributes)
        for _, _, attributes in graph.edges(data=True):
            _collapse_lists(attributes)
        nx.write_gml(graph, target)
        log.file_written(target)
def colexification_network(wordlist, entry='ipa', concept='concept',
                           output='', filename='network', bipartite=False,
                           **keywords):
    """
    Calculate a colexification network from a given wordlist object.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        The wordlist object containing the data.
    entry : str (default="ipa")
        The reference point for the language entry.
    concept : str (default="concept")
        The reference point for the name of the row containing the concepts.
    output : str (default='')
        Leave empty to get the graph back; use "gml" to write the graph to
        the file ``filename + '.gml'``.
    filename : str (default='network')
        Name of the output file without its extension.
    bipartite : bool (default=False)
        Forwarded to the graph construction helper.

    Returns
    -------
    G : networkx.Graph
        Only returned if `output` is empty; otherwise nothing is returned.
    """
    links = _get_colexifications(wordlist, entry, concept)
    stats = _get_statistics(wordlist, entry, concept)
    network = _make_graph(links, bipartite=bipartite)

    # enrich all concept nodes with their statistics
    for node, node_data in network.nodes(data=True):
        if node_data['ntype'] == 'concept':
            node_data.update(stats[node])

    if not output:
        return network
    if output != 'gml':
        # any other output value is silently ignored
        return None

    def listless(record):
        # list values are handed to ``join`` with '//' as first argument
        for field in record:
            if isinstance(record[field], list):
                record[field] = join('//', *record[field])

    for node, node_data in network.nodes(data=True):
        listless(node_data)
    for source, target, edge_data in network.edges(data=True):
        listless(edge_data)

    gml_file = filename + '.gml'
    nx.write_gml(network, gml_file)
    log.file_written(gml_file)
def test_convenience(self):
    """Call every convenience function imported from ``lingpy.log`` once."""
    from lingpy.log import (
        info, warn, debug, error, deprecated, missing_module, file_written)

    # These four functions are all called with one positional message.
    for emit in (info, warn, debug, error):
        emit('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def test_convenience(self):
    """Exercise the convenience functions of ``lingpy.log`` with dummy input."""
    from lingpy.log import info, warn, debug, error
    from lingpy.log import deprecated, missing_module, file_written

    message_functions = [info, warn, debug, error]
    for function in message_functions:
        function('m')
    # The remaining functions are called with their own argument lists.
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def write_text_file(path, content, normalize=None, log=True):
    """Write text to a file encoded in utf-8.

    :param path: File-system path of the file.
    :param content: The text content to be written. Content that is not an \
        instance of `text_type` is first converted with `lines_to_text`.
    :param normalize: If not `None` a valid unicode normalization mode must \
        be passed.
    :param log: If true, `file_written` is called for the path after writing.
    """
    if isinstance(content, text_type):
        text = content
    else:
        text = lines_to_text(content)

    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as handle:
        # normalization happens only after the file has been opened
        if normalize:
            handle.write(unicodedata.normalize(normalize, text))
        else:
            handle.write(text)

    if not log:
        return
    file_written(_str_path(path))
def diff(self, **keywords):
    """
    Write all differences between the gold and the test alignments to a file.

    Parameters
    ----------
    filename : str
        Name of the output file. ``self.gold.infile`` is handed to
        ``setdefaults`` as the fallback value. The extension ``.diff`` is
        appended unless the name already ends with it.

    Notes
    -----
    A pair is reported when its first or its second aligned sequence differs
    between gold standard and test set. The success message is logged only
    after the file has actually been written.
    """
    setdefaults(keywords, filename=self.gold.infile)
    if not keywords['filename'].endswith('.diff'):
        keywords['filename'] = keywords['filename'] + '.diff'

    out = []
    pairs = zip(self.gold.alignments, self.test.alignments)
    for i, (a, b) in enumerate(pairs):
        # the third element of each alignment is not used for the comparison
        g1, g2, _ = a
        t1, t2, _ = b
        if g1 == t1 and g2 == t2:
            continue

        maxL = max(len(g1), len(t1))
        taxA, taxB = self.gold.taxa[i]
        taxlen = max(len(taxA), len(taxB))
        seq_id = self.gold.seq_ids[i]
        # separator row between the gold and the test alignment
        separator = '{0}\t{1}'.format(taxlen * ' ', '\t'.join(['=='] * maxL))
        out.append(
            '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'.format(
                seq_id,
                taxA,
                '\t'.join(g1),
                taxB,
                '\t'.join(g2),
                separator,
                '\t'.join(t1),
                '\t'.join(t2),
            ))

    # write first, then log: a failing write must not be reported as success
    write_text_file(keywords['filename'], out)
    log.file_written(keywords['filename'])
def write_text_file(path, content, normalize=None, log=True):
    """Write a text file encoded in utf-8.

    Parameters
    ----------
    path : str
        File-system path of the file.
    content : str
        The text content to be written. Anything that is not an instance of
        `text_type` is passed through `lines_to_text` first.
    normalize : { None, "NFC", "NFD" } (default=None)
        If not `None` a valid unicode normalization mode must be passed.
    log : bool (default=True)
        Indicate whether you want to log the result of the file writing
        process.
    """
    needs_conversion = not isinstance(content, text_type)
    if needs_conversion:
        content = lines_to_text(content)

    target = _str_path(path, mkdir=True)
    with io.open(target, 'w', encoding='utf8') as stream:
        if normalize:
            payload = unicodedata.normalize(normalize, content)
        else:
            payload = content
        stream.write(payload)

    if log:
        # the path is resolved a second time, as in the write step above
        file_written(_str_path(path))
def diff(self, **keywords):
    """
    Write all differences between two sets to a file.

    Parameters
    ----------
    filename : str
        Name of the output file; ``self.gold.infile`` is passed to
        ``setdefaults`` as the fallback. ``.diff`` is appended if the name
        does not end with it already.

    Notes
    -----
    Gold and test alignments are compared pairwise. A pair is written out
    when its first or second sequence differs. The "file written" message is
    emitted after the write, so it is never logged for a failed write.
    """
    setdefaults(keywords, filename=self.gold.infile)
    filename = keywords['filename']
    if not filename.endswith('.diff'):
        filename = filename + '.diff'
        keywords['filename'] = filename

    template = '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'
    out = []
    for i, (a, b) in enumerate(
            zip(self.gold.alignments, self.test.alignments)):
        # only the first two members of each triple take part in the diff
        g1, g2, _ = a
        t1, t2, _ = b
        if g1 != t1 or g2 != t2:
            taxA, taxB = self.gold.taxa[i]
            # one "==" cell per column of the longer first sequence
            ruler = '\t'.join('==' for _ in range(max(len(g1), len(t1))))
            out.append(template.format(
                self.gold.seq_ids[i],
                taxA,
                '\t'.join(g1),
                taxB,
                '\t'.join(g2),
                '{0}\t{1}'.format(max(len(taxA), len(taxB)) * ' ', ruler),
                '\t'.join(t1),
                '\t'.join(t2),
            ))

    # log after writing so that the message reflects what really happened
    write_text_file(filename, out)
    log.file_written(filename)
def write_text_file(path, content, normalize=None, log=True):
    """Write text content to a utf-8 encoded file.

    Parameters
    ----------
    path : str
        File-system path of the file.
    content : str
        The text content to be written. Non-text content is converted with
        `lines_to_text` before writing.
    normalize : { None, "NFC", "NFD" } (default=None)
        If not `None` a valid unicode normalization mode must be passed.
    log : bool (default=True)
        Indicate whether you want to log the result of the file writing
        process.
    """
    text = content if isinstance(content, text_type) else lines_to_text(content)

    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as out:
        if not normalize:
            out.write(text)
        else:
            out.write(unicodedata.normalize(normalize, text))

    if log:
        file_written(_str_path(path))
def plot_gls(gls, treestring, degree=90, fileformat='pdf', **keywords):
    """
    Plot a gain-loss scenario for a given reference tree.

    Parameters
    ----------
    gls : object
        The gain-loss scenario; it is handed unchanged to ``gls2gml``.
    treestring : str
        The reference tree. It is passed to ``cg.LoadTree`` and to
        ``radial_layout``. If it cannot be loaded as a tree, the value itself
        is used in place of the tree object.
    degree : int (default=90)
        Passed on to ``radial_layout``.
    fileformat : str (default='pdf')
        Extension of the output file.
    **keywords
        Plot settings that override the defaults listed at the top of the
        function body (``figsize``, ``radius``, ``gain_color``,
        ``filename``, ...).

    Notes
    -----
    The figure is saved to ``filename + '.' + fileformat``. Nothing is
    returned.
    """
    # get keywords
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        radius=0.5,
        textsize=8,
        edgewidth=5,
        linewidth=2,
        scale_radius=1.2,
        ylim=1,
        xlim=1,
        text=True,
        gain_color='white',
        loss_color='black',
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        filename=rcParams['filename'])
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # Try both calling conventions of LoadTree and fall back to the raw
    # input. Only ordinary errors are caught, so KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        tree = cg.LoadTree(treestring=treestring)
    except Exception:
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = treestring

    tgraph = radial_layout(treestring, degree=degree)
    graph = gls2gml(gls, tgraph, tree)

    # collect coordinates and state of every node
    nodes = [
        (data['graphics']['x'], data['graphics']['y'], data['state'])
        for _, data in graph.nodes(data=True)]

    # now plot the stuff
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    # set the axes linewidth
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    # NOTE(review): ``graph.node`` is the old networkx accessor; newer
    # networkx releases only offer ``graph.nodes`` -- confirm the supported
    # version before changing it.
    for nA, nB in graph.edges():
        xA = graph.node[nA]['graphics']['x']
        xB = graph.node[nB]['graphics']['x']
        yA = graph.node[nA]['graphics']['y']
        yB = graph.node[nB]['graphics']['y']
        plt.plot(
            [xA, xB],
            [yA, yB],
            '-',
            color='black',
            linewidth=keywords['edgewidth'],
            zorder=1)

    # now, iterate over nodes
    for x, y, s in nodes:
        if s == 'O':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'],
                0,
                360,
                facecolor=keywords['gain_color'],
                linewidth=keywords['linewidth'],
                linestyle=keywords['gain_linestyle'])
        elif s == 'o':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] / keywords['scale_radius'],
                0,
                360,
                facecolor=keywords['gain_color'],
                linewidth=keywords['linewidth'])
        elif s == 'L':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'],
                0,
                360,
                facecolor=keywords['loss_color'],
                linewidth=keywords['linewidth'],
                linestyle=keywords['loss_linestyle'])
        else:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] / keywords['scale_radius'],
                0,
                360,
                facecolor=keywords['loss_color'],
                linewidth=keywords['linewidth'])
        figsp.add_artist(w)

        # if text is chosen as argument
        if keywords['text']:
            # substring test kept as in the original code
            if s in 'Oo':
                t, c = '1', 'black'
            else:
                t, c = '0', 'white'
            plt.text(
                x,
                y,
                t,
                size=keywords['textsize'],
                color=c,
                va="center",
                ha="center",
                fontweight='bold')

    # set x and y-values
    xvals = [node[0] for node in nodes]
    yvals = [node[1] for node in nodes]

    plt.xlim(min(xvals) - keywords['xlim'], max(xvals) + keywords['xlim'])
    plt.ylim(min(yvals) - keywords['ylim'], max(yvals) + keywords['ylim'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
def plot_heatmap(wordlist, filename="heatmap", fileformat="pdf", ref='cogid',
                 normalized=False, refB='', **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written. The matrix is
        additionally written to ``filename + '.matrix'``.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.
    refB : str (default='')
        If given, the cells below the diagonal are computed from this column
        instead of mirroring the values computed from `ref`.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the
        axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    matrix : (default=False)
        A precomputed matrix that is used instead of the computed one.
    distances : bool (default=False)
        If true, every cell ``c`` is replaced by ``1 - c`` before plotting.

    Raises
    ------
    ValueError
        If no tree is available or `normalized` has an unsupported value.

    Notes
    -----
    This function plots shared cognate percentages.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False."
            )

    def shared_cognates(taxon_a, taxon_b, column):
        """Return the (optionally normalized) cognate overlap of two taxa."""
        if normalized in [False, "jaccard"]:
            cogs_a = set(
                wordlist.get_list(taxa=taxon_a, flat=True, entry=column))
            cogs_b = set(
                wordlist.get_list(taxa=taxon_b, flat=True, entry=column))
            overlap = len(cogs_a.intersection(cogs_b))
            if normalized:
                overlap = overlap / len(cogs_a.union(cogs_b))
            return overlap

        cogs_a = wordlist.get_dict(taxa=taxon_a, entry=column)
        cogs_b = wordlist.get_dict(taxa=taxon_b, entry=column)
        overlap = 0
        slots = 0
        # iterate over cognate sets in meaning slots; we follow the STARLING
        # procedure in ignoring missing data
        for key in cogs_a:
            if key in cogs_b:
                # check for shared items
                if [k for k in cogs_a[key] if k in cogs_b[key]]:
                    overlap += 1
                slots += 1
        try:
            return overlap / slots
        except ZeroDivisionError:
            log.warning(
                str([
                    overlap, slots, len(cogs_a), len(cogs_b), taxon_a,
                    taxon_b
                ]))
            return 0.0

    # create an empty matrix
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'], keywords['bottom'], 0.25 * keywords['width'],
            keywords['height']
        ])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        # the taxa are taken from the dendrogram leaves in reversed order
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in
    # the matrix with numbers of shared cognates
    # NOTE(review): a numpy array passed as ``matrix`` has no unambiguous
    # truth value, so this check presumably expects a list -- confirm.
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = shared_cognates(taxonA, taxonB, ref)
                    matrix[i][j] = shared
                    # without refB the matrix is symmetric
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    matrix[i][j] = shared_cognates(taxonA, taxonB, refB)
                elif i == j:
                    cogs = wordlist.get_list(
                        taxa=taxonA, flat=True, entry=ref)
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes([
        left,  # keywords['left']+0.25 * keywords['width']+0.05,
        keywords['bottom'],
        keywords['width'],
        keywords['height']
    ])

    if keywords['distances']:
        # iterate over the cells of each row, not over the rows again, so
        # that the column index always matches the row length
        for i, row in enumerate(matrix):
            for j in range(len(row)):
                matrix[i][j] = 1 - matrix[i][j]

    nmatrix = [[keywords['vmax'], keywords['vmin']],
               [keywords['vmin'], keywords['vmax']]]

    im = ax2.matshow(
        nmatrix,
        aspect='auto',
        origin='lower',
        interpolation='nearest',
        cmap=keywords['cmap'],
        vmax=keywords['vmax'],
        vmin=keywords['vmin'])

    # set the xticks; the step width must be at least 1, because ``range``
    # rejects a step of 0 (which the rounding yields for few taxa)
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(
            matrix,
            cmap=keywords['cmap'],
            visible=False,
            vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(
            keywords["colorbar_label"], size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # the context manager closes the file even if a write fails
    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for cell in matrix[i]:
                if not normalized:
                    f.write('\t{0:3}'.format(int(cell)))
                else:
                    f.write('\t{0:.2f}'.format(cell))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)
def plot_tree(treestring, degree=90, fileformat='pdf', root="root",
              **keywords):
    """
    Plot a Newick tree to PDF or other graphical formats.

    Parameters
    ----------
    treestring : str
        A string in Newick format.
    degree : int
        Determine the degree of the tree (this determines how "circular" the
        tree will be).
    fileformat : str (default="pdf")
        Select the fileformat to which the tree shall be written.
    root : str (default="root")
        Passed on to ``radial_layout``. Nodes whose label starts with this
        string are drawn as plain markers, like nodes whose label starts
        with "edge".
    filename : str
        Determine the name of the file to which the data shall be written.
        Defaults to ``rcParams['filename']``.
    figsize : tuple (default=(10,10))
        Determine the size of the figure.
    node_dict : dict (default={})
        Per-node settings that override the general keywords for that node.
    labels : (default=[])
        Lookup from node label to the text that is displayed instead.

    Notes
    -----
    The figure is saved to ``filename + '.' + fileformat``. Nothing is
    returned.
    """
    default = dict(
        ax_linewidth=0,
        bg='black',
        bottom=0.05,
        change=lambda x: x ** 1.75,
        edge_list=[],
        figsize=(10, 10),
        filename=rcParams['filename'],
        fontweight='bold',
        frameon=False,
        ha='center',
        labels=[],
        left=0.05,
        linecolor='black',
        linewidth=5,
        no_labels=False,
        node_dict={},
        nodecolor='black',
        nodesize=10,
        right=0.95,
        start=0,
        textcolor='white',
        textsize='10',
        top=0.95,
        usetex=False,
        va='center',
        xlim=5,
        xliml=False,
        xlimr=False,
        ylim=5,
        ylimb=False,
        ylimt=False,
        rotation_mode='anchor',
        latex_preamble=False,
    )
    for k in default:
        if k not in keywords:
            keywords[k] = default[k]

    # set filename as variable for convenience
    filename = keywords['filename']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
        mpl.rcParams['text.latex.unicode'] = True
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # get the tree-graph
    graph = radial_layout(
        treestring,
        degree=degree,
        change=keywords['change'],
        start=keywords['start'],
        root=root)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)
    for s in figsp.spines.values():
        s.set_linewidth(keywords['ax_linewidth'])

    # plt.axes(frameon=keywords['frameon'])
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])

    # get xlim and ylim
    xvals, yvals = [], []

    # start iterating over edges
    for nA, nB, d in list(graph.edges(data=True)) + keywords['edge_list']:
        # get the coordinates
        xA = graph.node[nA]['graphics']['x']
        yA = graph.node[nA]['graphics']['y']
        xB = graph.node[nB]['graphics']['x']
        yB = graph.node[nB]['graphics']['y']

        if 'color' in d:
            # edge data carrying a color is passed on completely to plot
            plt.plot([xA, xB], [yA, yB], '-', **d)
        else:
            plt.plot(
                [xA, xB],
                [yA, yB],
                '-',
                color=keywords['linecolor'],
                linewidth=keywords['linewidth'],
            )

    # get the nodes
    for n, d in graph.nodes(data=True):
        g = d['graphics']
        x, y = g['x'], g['y']
        xvals += [x]
        yvals += [y]

        # Try to get information from the node-dict. Only ordinary errors
        # (missing node, unusable entry) fall back to empty settings;
        # KeyboardInterrupt and SystemExit propagate.
        try:
            settings = dict(keywords['node_dict'][n])
        except Exception:
            settings = {}

        # fill up with the general keywords
        for k in keywords:
            if k not in settings:
                settings[k] = keywords[k]

        if d['label'].startswith('edge') \
                or d['label'].startswith(root) or keywords['no_labels']:
            plt.plot(
                x,
                y,
                'o',
                markersize=settings['nodesize'],
                color=settings['nodecolor'],
                markeredgewidth=settings['linewidth'])
        else:
            # ``labels`` defaults to a list, so the lookup can fail with a
            # KeyError as well as a TypeError
            try:
                label = keywords['labels'][d['label']]
            except Exception:
                label = d['label']

            if 'rotation' in settings:
                r = settings['rotation']
            else:
                r = g['angle']

            plt.text(
                x,
                y,
                label,
                color=settings['textcolor'],
                fontweight=settings['fontweight'],
                va=settings['va'],
                ha=g['s'],
                bbox=dict(
                    facecolor=settings['bg'],
                    boxstyle='square,pad=0.2',
                    ec="none",
                ),
                size=settings['textsize'],
                rotation=r,
                rotation_mode=settings['rotation_mode'])

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
def diff(
        lex,
        gold='cogid',
        test='lexstatid',
        loans=False,
        pprint=True,
        filename='',
        tofile=True,
        fuzzy=False):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    lex : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented
        cognate assignments.
    loans : bool (default=False)
        If set to c{False}, loans (indicated by negative IDs in the gold
        standard) will be treated as separate cognates, otherwise, loans will
        be treated as cognates.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to c{False}, no data will be written to file, but instead, the
        data will be returned.
    fuzzy : bool (default=False)
        If true, only the first element of every cognate entry is used.

    Returns
    -------
    t : tuple
        A nested tuple consisting of two further tuples. The first containing
        precision, recall, and harmonic mean (F-scores) of the B-Cubed
        scores, the second containing the same values for the pair-scores.
        The tuple is only returned if **tofile** is false; otherwise the
        function returns nothing.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    .. This function also calculates the "transformation" score. This score is
    .. based on the calculation of steps that are needed to transform one cluster
    .. for one set of meanings into the other. Ideally, if there are *n* different
    .. cognate sets covering one gloss in the gold standard, the minimal length of
    .. a mapping to convert the *m* cognate sets of the test set into the gold standard
    .. is *n*. In this case, both gold standard and test set are identical.
    .. However, if gold standard and test set differ, the number of mappings
    .. necessarily exceeds *m* and *n*. Based on this, the transformation
    .. precision is defined as :math:`\frac{m}{M}`, where *m* is the number of
    .. distinct clusters in the test set and *M* is the length of the mapping.
    .. Accordingly, the recall is defined as :math:`\frac{n}{M}`, where *n* is the
    .. number of clusters in the gold standard.

    .. Note that if precision is lower than 1.0, this means there are false
    .. positive decisions in the test set. Accordingly, a recall lower than 1.0
    .. indicates that there are false negative decisions in the test set.
    .. The drawback of this score is that it is not sensitive regarding the
    .. distinct number of decisions in which gold standard and test set differ, so
    .. the recall can be very low although most of the words have been grouped
    .. accurately. The advantage is that it can be directly interpreted in terms
    .. of 'false positive/false negative' decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or lex.filename
    # with loans=True negative and positive IDs are conflated via abs()
    loan = abs if loans else identity

    # open file
    # NOTE(review): the handle is only closed at the very end of the
    # function; an exception raised in between leaves it open.
    if tofile:
        f = codecs.open(filename + '.diff', 'w', 'utf-8')

    # get a formatter for language names
    lform = '{0:' + str(max([len(l) for l in lex.cols])) + '}'

    # per-concept scores: T = transformation, B = B-Cubed, P = pairs
    # NOTE(review): preT and recT are filled below but not read again in
    # this function.
    preT, recT = [], []
    preB, recB = [], []
    preP, recP = [], []

    def get_cogs(ref, bidx):
        # Renumber the cognate IDs of column `ref`: every ID is replaced by
        # the index from `bidx` at which it is first seen. `concept` is read
        # from the enclosing loop below.
        cogs = lex.get_list(row=concept, entry=ref, flat=True)
        if fuzzy:
            cogs = [i[0] for i in cogs]
        tmp = {}
        for a, b in zip(cogs, bidx):
            if loan(a) not in tmp:
                tmp[loan(a)] = b
        return [tmp[loan(i)] for i in cogs]

    def get_pairs(cogs, idxs):
        # Yield every sorted pair of indices that share the same cognate ID.
        tmp = defaultdict(list)
        for x, y in zip(cogs, idxs):
            tmp[x].append(y)
        for x in tmp:
            for yA, yB in combinations(tmp[x], r=2):
                yield tuple(sorted([yA, yB]))

    def get_bcubed_score(one, other):
        # Group the IDs in `other` by the IDs in `one`, sum up for every item
        # the share of its group that carries the same ID in `other`, and
        # divide by the number of items. `idxs` is read from the enclosing
        # loop below.
        tmp = defaultdict(list)
        for x, y in zip(one, other):
            tmp[x].append(y)
        bcp = 0.0
        for x in tmp:
            for y in tmp[x]:
                bcp += tmp[x].count(y) / len(tmp[x])
        return bcp / len(idxs)

    for concept in lex.concepts:
        idxs = lex.get_list(row=concept, flat=True)
        # get the basic index for all seqs
        bidx = [i + 1 for i in range(len(idxs))]

        cogsG = get_cogs(gold, bidx)
        cogsT = get_cogs(test, bidx)

        if cogsG != cogsT:
            # calculate the transformation distance of the sets
            tramGT = len(set(zip(cogsG, cogsT)))
            tramG = len(set(cogsG))
            tramT = len(set(cogsT))
            preT += [tramT / tramGT]
            recT += [tramG / tramGT]

            # calculate the bcubed precision for the sets
            preB += [get_bcubed_score(cogsT, cogsG)]

            # calculate b-cubed recall
            recB += [get_bcubed_score(cogsG, cogsT)]

            # calculate pair precision and recall; an empty pair set counts
            # as a perfect score
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))
            preP.append(len(pairsT.intersection(pairsG)) / len(pairsT) if pairsT else 1.0)
            recP.append(len(pairsT.intersection(pairsG)) / len(pairsG) if pairsG else 1.0)
            fp = "no" if preP[-1] == 1.0 else "yes"
            fn = "no" if recP[-1] == 1.0 else "yes"

            if tofile:
                f.write(
                    "Concept: {0}, False Positives: {1}, False Negatives: {2}\n".format(
                        concept, fp, fn))

            # get the words
            words = [lex[i, 'ipa'] for i in idxs]
            langs = [lex[i, 'taxa'] for i in idxs]

            # get a word-formatter
            wform = '{0:' + str(max([len(w) for w in words])) + '}'

            # write differences to file, sorted by gold ID, then test ID
            if tofile:
                for word, lang, cG, cT in sorted(
                        zip(words, langs, cogsG, cogsT),
                        key=lambda x: (x[2], x[3])):
                    f.write('{0}\t{1}\t{2:4}\t{3:4}\n'.format(
                        lform.format(lang), wform.format(word), cG, cT))
                f.write('#\n')
        else:
            # identical classifications get a perfect score on all measures
            preT += [1.0]
            recT += [1.0]
            preB += [1.0]
            recB += [1.0]
            preP += [1.0]
            recP += [1.0]

    # average the per-concept scores and derive the F-scores
    # NOTE(review): these divisions fail if lex.concepts is empty or if
    # precision and recall are both 0.
    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    bf = 2 * (bp * br) / (bp + br)
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = 2 * (pp * pr) / (pp + pr)

    if pprint:
        print('**************************')
        print('* B-Cubed-Scores *')
        print('* ---------------------- *')
        print('* B-C.-Precision: {0:.4f} *'.format(bp))
        print('* B-C.-Recall: {0:.4f} *'.format(br))
        print('* B-C.-F-Scores: {0:.4f} *'.format(bf))
        print('**************************')
        print('')
        print('**************************')
        print('* Pair-Scores *')
        print('* ---------------------- *')
        print('* Pair-Precision: {0:.4f} *'.format(pp))
        print('* Pair-Recall: {0:.4f} *'.format(pr))
        print('* Pair-F-Scores: {0:.4f} *'.format(pf))
        print('**************************')

    if tofile:
        f.write('B-Cubed Scores:\n')
        f.write('Precision: {0:.4f}\n'.format(bp))
        f.write('Recall: {0:.4f}\n'.format(br))
        f.write('F-Score: {0:.4f}\n'.format(bf))
        f.write('#\n')
        f.write('Pair Scores:\n')
        f.write('Precision: {0:.4f}\n'.format(pp))
        f.write('Recall: {0:.4f}\n'.format(pr))
        f.write('F-Score: {0:.4f}\n'.format(pf))
        f.close()
        log.file_written(filename + '.diff')
    else:
        # the scores are only returned when nothing is written to file
        return (bp, br, bf), (pp, pr, pf)
def __exit__(self, type, value, traceback):
    """Close the wrapped file object and, if logging is on, report the path."""
    self.fp.close()
    if not self.log:
        return
    file_written(_str_path(self.path))
def plot_gls(gls, treestring, degree=90, fileformat='pdf', **keywords):
    """
    Plot a gain-loss scenario for a given reference tree.

    Parameters
    ----------
    gls : object
        The gain-loss scenario, forwarded to ``gls2gml``.
    treestring : str
        The reference tree, forwarded to ``cg.LoadTree`` and
        ``radial_layout``. If loading fails, the raw value is used as tree.
    degree : int (default=90)
        Forwarded to ``radial_layout``.
    fileformat : str (default='pdf')
        Extension of the file the plot is saved to.
    **keywords
        Overrides for the plot defaults defined in the function body, e.g.
        ``figsize``, ``radius``, ``scale_radius``, ``text`` or ``filename``.

    Notes
    -----
    Nodes in state 'O' and 'L' are drawn with the full radius and a line
    style. All other nodes are drawn with ``radius / scale_radius``. The plot
    is saved to ``filename + '.' + fileformat``. Nothing is returned.
    """
    # get keywords
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        radius=0.5,
        textsize=8,
        edgewidth=5,
        linewidth=2,
        scale_radius=1.2,
        ylim=1,
        xlim=1,
        text=True,
        gain_color='white',
        loss_color='black',
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        filename=rcParams['filename']
    )
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # set filename as variable for convenience
    filename = keywords['filename']

    # Load the tree, first by keyword, then positionally. Catching only
    # Exception keeps KeyboardInterrupt and SystemExit from being swallowed.
    try:
        tree = cg.LoadTree(treestring=treestring)
    except Exception:
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = treestring

    tgraph = radial_layout(treestring, degree=degree)
    graph = gls2gml(gls, tgraph, tree)

    # assign nodes: (x, y, state) for every node of the graph
    nodes = []
    for _, data in graph.nodes(data=True):
        graphics = data['graphics']
        nodes.append((graphics['x'], graphics['y'], data['state']))

    # now plot the stuff
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    # set the axes linewidth
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    # NOTE(review): ``graph.node`` is the legacy networkx accessor; check
    # the supported networkx version before switching to ``graph.nodes``.
    for nA, nB in graph.edges():
        startpoint = graph.node[nA]['graphics']
        endpoint = graph.node[nB]['graphics']
        plt.plot(
            [startpoint['x'], endpoint['x']],
            [startpoint['y'], endpoint['y']],
            '-',
            color='black',
            linewidth=keywords['edgewidth'],
            zorder=1
        )

    # now, iterate over nodes
    for x, y, s in nodes:
        # the reduced radius is computed only in the branches that use it
        if s == 'O':
            radius = keywords['radius']
            style = dict(
                facecolor=keywords['gain_color'],
                linestyle=keywords['gain_linestyle'])
        elif s == 'o':
            radius = keywords['radius'] / keywords['scale_radius']
            style = dict(facecolor=keywords['gain_color'])
        elif s == 'L':
            radius = keywords['radius']
            style = dict(
                facecolor=keywords['loss_color'],
                linestyle=keywords['loss_linestyle'])
        else:
            radius = keywords['radius'] / keywords['scale_radius']
            style = dict(facecolor=keywords['loss_color'])

        w = mpl.patches.Wedge(
            (x, y), radius, 0, 360, linewidth=keywords['linewidth'], **style)
        figsp.add_artist(w)

        # if text is chosen as argument
        if keywords['text']:
            # substring test kept as in the original code
            if s in 'Oo':
                t = '1'
                c = 'black'
            else:
                t = '0'
                c = 'white'
            plt.text(
                x,
                y,
                t,
                size=keywords['textsize'],
                color=c,
                va="center",
                ha="center",
                fontweight='bold'
            )

    # set x and y-values
    xvals = [node[0] for node in nodes]
    yvals = [node[1] for node in nodes]

    plt.xlim(min(xvals) - keywords['xlim'], max(xvals) + keywords['xlim'])
    plt.ylim(min(yvals) - keywords['ylim'], max(yvals) + keywords['ylim'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    outfile = filename + '.' + fileformat
    plt.savefig(outfile)
    plt.clf()
    log.file_written(outfile)
def __exit__(self, type, value, traceback):
    """Close the underlying file; log the written path when requested."""
    handle = self.fp
    handle.close()
    should_log = self.log
    if should_log:
        file_written(_str_path(self.path))
def plot_tree(treestring, degree=90, fileformat='pdf', root="root", **keywords):
    """
    Plot a Newick tree to PDF or other graphical formats.

    Parameters
    ----------
    treestring : str
        A string in Newick format.
    degree : int
        Determine the degree of the tree (this determines how "circular" the
        tree will be).
    fileformat : str (default="pdf")
        Select the fileformat to which the tree shall be written.
    root : str (default="root")
        Passed on to ``radial_layout``; nodes whose label starts with this
        string are drawn as plain markers instead of text labels.
    filename : str
        Determine the name of the file to which the data shall be written.
        Defaults to ``rcParams['filename']``.
    figsize : tuple (default=(10,10))
        Determine the size of the figure.

    Notes
    -----
    All further options are read from ``keywords``; missing keys are filled
    in from the ``default`` dictionary below. The figure is written to
    ``filename + '.' + fileformat``; nothing is returned.
    """
    default = dict(
        ax_linewidth=0,
        bg='black',
        bottom=0.05,
        change=lambda x: x**1.75,
        edge_list=[],
        figsize=(10, 10),
        filename=rcParams['filename'],
        fontweight='bold',
        frameon=False,
        ha='center',
        labels=[],
        left=0.05,
        linecolor='black',
        linewidth=5,
        no_labels=False,
        node_dict={},
        nodecolor='black',
        nodesize=10,
        right=0.95,
        start=0,
        textcolor='white',
        textsize='10',
        top=0.95,
        usetex=False,
        va='center',
        xlim=5,
        xliml=False,
        xlimr=False,
        ylim=5,
        ylimb=False,
        ylimt=False,
        rotation_mode='anchor',
        latex_preamble=False,
    )
    for key, value in default.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
        # This rcParam no longer exists in recent matplotlib releases;
        # only set it where it is still known, so the call does not fail.
        if 'text.latex.unicode' in mpl.rcParams:
            mpl.rcParams['text.latex.unicode'] = True
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # get the tree-graph
    graph = radial_layout(
        treestring,
        degree=degree,
        change=keywords['change'],
        start=keywords['start'],
        root=root,
    )

    # Coordinates by node name. This replaces ``graph.node[...]``, which is
    # not available in recent networkx releases.
    coords = {
        n: (d['graphics']['x'], d['graphics']['y'])
        for n, d in graph.nodes(data=True)
    }

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    # plt.axes(frameon=keywords['frameon'])
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])

    # get xlim and ylim
    xvals, yvals = [], []

    # start iterating over edges (tree edges plus user-supplied extra edges)
    all_edges = list(graph.edges(data=True)) + list(keywords['edge_list'])
    for nA, nB, d in all_edges:
        # get the coordinates
        xA, yA = coords[nA]
        xB, yB = coords[nB]

        if 'color' in d:
            # the edge carries its own plot settings
            plt.plot([xA, xB], [yA, yB], '-', **d)
        else:
            plt.plot(
                [xA, xB],
                [yA, yB],
                '-',
                color=keywords['linecolor'],
                linewidth=keywords['linewidth'],
            )

    # get the nodes
    for n, d in graph.nodes(data=True):
        g = d['graphics']
        x, y = g['x'], g['y']
        xvals.append(x)
        yvals.append(y)

        # try to get node-specific settings from the node-dict
        settings = {}
        try:
            settings.update(keywords['node_dict'][n])
        except (KeyError, TypeError, ValueError):
            settings = {}

        # fall back to the global keywords for everything not overridden
        for key in keywords:
            if key not in settings:
                settings[key] = keywords[key]

        is_plain_marker = (
            d['label'].startswith('edge')
            or d['label'].startswith(root)
            or keywords['no_labels']
        )
        if is_plain_marker:
            plt.plot(
                x,
                y,
                'o',
                markersize=settings['nodesize'],
                color=settings['nodecolor'],
                markeredgewidth=settings['linewidth'],
            )
        else:
            # ``labels`` may be a mapping (or the empty default list); fall
            # back to the node's own label when no replacement is found
            try:
                label = keywords['labels'][d['label']]
            except (KeyError, IndexError, TypeError):
                label = d['label']

            r = settings['rotation'] if 'rotation' in settings else g['angle']

            plt.text(
                x,
                y,
                label,
                color=settings['textcolor'],
                fontweight=settings['fontweight'],
                va=settings['va'],
                ha=g['s'],
                bbox=dict(
                    facecolor=settings['bg'],
                    boxstyle='square,pad=0.2',
                    ec="none",
                ),
                size=settings['textsize'],
                rotation=r,
                rotation_mode=settings['rotation_mode'],
            )

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'],
    )
    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
def plot_concept_evolution(scenarios, tree, fileformat='pdf', degree=90,
                           **keywords):
    """
    Plot the evolution according to the MLN method of all words for a given
    concept.

    Parameters
    ----------
    scenarios : list
        A sequence of ``(pap, gls)`` pairs. Each ``gls`` is passed to
        ``gls2gml``; each ``pap`` gets its own color and wedge segment.
    tree : str
        A tree representation in Newick format.
    fileformat : str (default="pdf")
        A valid fileformat according to Matplotlib.
    degree : int (default=90)
        The degree by which the tree is drawn. 360 yields a circular tree, 180
        yields a tree filling half of the space of a circle.
    **keywords
        Plot options. Keys missing from the call are filled in from the
        ``defaults`` dictionary below; values supplied by the caller are
        kept.

    Raises
    ------
    ValueError
        If ``scenarios`` is empty (there would be nothing to lay out).

    Notes
    -----
    The figure is written to ``filename + '.' + fileformat``; nothing is
    returned.
    """
    if len(scenarios) == 0:
        raise ValueError("At least one scenario is required for plotting.")

    # make defaults
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        colormap=mpl.cm.jet,
        edgewidth=5,
        radius=2.5,
        outer_radius=0.5,
        inner_radius=0.25,
        cognates='',
        usetex=False,
        latex_preamble=False,
        textsize=8,
        change=lambda x: x**1.75,
        xlim=0,
        ylim=0,
        xlimr=False,
        xliml=False,
        ylimt=False,
        ylimb=False,
        rootsize=10,
        legend=True,
        legendsize=5,
        legendAloc='upper right',
        legendBloc='lower right',
        markeredgewidth=2.5,
        wedgeedgewidth=2,
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        labels={},
        _prefix='- ',
        _suffix=' -',
        colors={},
        start=0,
        filename=rcParams['filename'],
        loss_alpha=0.1,
        loss_background='0.75',
        edges=[],
        hedge_color="black",
        hedge_width=5,
        hedge_linestyle='dashed',
    )
    # Only fill in what the caller did not pass: ``keywords.update(defaults)``
    # would silently overwrite every user-supplied option.
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # XXX customize later XXX
    colormap = keywords['colormap']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    # check for preamble settings
    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # get the tgraph
    tgraph = radial_layout(
        tree,
        degree=degree,
        change=keywords['change'],
        start=keywords['start'],
    )
    # node data of the layout graph by node name (replaces ``tgraph.node``,
    # which is not available in recent networkx releases)
    tnodes = {n: d for n, d in tgraph.nodes(data=True)}

    # get the taxa
    taxa = [n for n, d in tnodes.items() if d['tip']]

    # set the labels: user-supplied label if present, the taxon name otherwise
    labels = {taxon: keywords['labels'].get(taxon, taxon) for taxon in taxa}

    # get the number of paps in order to get the right colors
    cfunc = np.array(np.linspace(10, 256, len(scenarios)), dtype='int')

    if not keywords['colors']:
        colors = {
            scenarios[i][0]: mpl.colors.rgb2hex(colormap(cfunc[i]))
            for i in range(len(scenarios))
        }
    else:
        colors = keywords['colors']

    # get the wedges for the paps: each pap gets an equal slice of the circle
    wedges = {}
    linsp = np.linspace(0, 360, len(scenarios) + 1)
    for i, scenario in enumerate(scenarios):
        wedges[scenario[0]] = (linsp[i], linsp[i + 1])

    if keywords['legend']:
        # Translate the linestyle name into a plot format string for the
        # legend; unknown names fall back to the dotted default instead of
        # leaving the variable undefined.
        linestyle_formats = {
            'dotted': ':',
            'dashed': '--',
            'solid': '-',
            'dashdot': '-.',
        }
        ls = linestyle_formats.get(keywords['gain_linestyle'], ':')

        legendEntriesA = []
        legendTextA = []

        # add stuff for the legend
        for pap, gls in scenarios:
            w = mpl.patches.Wedge(
                (0, 0),
                1,
                wedges[pap][0],
                wedges[pap][1],
                facecolor=colors[pap],
                zorder=1,
                linewidth=keywords['wedgeedgewidth'],
                edgecolor='black',
            )
            legendEntriesA.append(w)
            legendTextA.append(pap)

        # second legend explains evolution
        legendEntriesB = []
        legendTextB = []
        p = mpl.patches.Wedge(
            (0, 0),
            1,
            0,
            360,
            facecolor='0.5',
            linewidth=keywords['wedgeedgewidth'],
            edgecolor='black',
        )
        legendEntriesB.append(p)
        legendTextB.append('Loss Event')

        p, = plt.plot(
            0, 0, ls, color='black', linewidth=keywords['wedgeedgewidth'])
        legendEntriesB.append(p)
        legendTextB.append('Gain Event')

        # overwrite stuff
        plt.plot(0, 0, 'o', markersize=2, zorder=2, color='white')

    # iterate over the paps and collect the state of every pap per node
    states = {}
    g = None
    for pap, gls in scenarios:
        # get the graph with the model
        g = gls2gml(gls, tgraph, tree, filename='')
        for n, d in g.nodes(data=True):
            states.setdefault(n, {})[pap] = d['state']

    # graphics data of the last model graph by node name (replaces ``g.node``)
    layout = {n: d['graphics'] for n, d in g.nodes(data=True)}

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    xvals = []
    yvals = []

    # iterate over edges first
    for nA, nB in g.edges():
        gA, gB = layout[nA], layout[nB]
        plt.plot(
            [gA['x'], gB['x']],
            [gA['y'], gB['y']],
            '-',
            color='black',
            linewidth=keywords['edgewidth'],
        )

    # add horizontal edges if this option is chosen
    if keywords['edges']:
        for nA, nB in keywords['edges']:
            gA, gB = layout[nA], layout[nB]
            plt.plot(
                [gA['x'], gB['x']],
                [gA['y'], gB['y']],
                '-',
                color=keywords['hedge_color'],
                linewidth=keywords["hedge_width"],
                linestyle=keywords['hedge_linestyle'],
            )

    # now iterate over the nodes
    for n, cpaps in states.items():
        x, y = layout[n]['x'], layout[n]['y']

        # get z-value which serves as zorder attribute; fall back to 0 when
        # ``tree`` does not offer ``getConnectingEdges`` or the lookup fails
        try:
            z = 6 * len(tree.getConnectingEdges('root', n))
        except Exception:
            z = 0

        xvals.append(x)
        yvals.append(y)

        # plot the default marker
        plt.plot(
            x,
            y,
            'o',
            markersize=keywords['rootsize'],
            color='black',
            zorder=50,
        )

        node_states = list(cpaps.values())
        outer = keywords['radius'] + keywords['outer_radius']

        # check for origins in cpaps
        if 'O' in node_states:
            w = mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor='white',
                zorder=57 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle=keywords['gain_linestyle'],
            )
            figsp.add_artist(w)
        # check for retentions
        elif 'o' in node_states:
            w = mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor='white',
                zorder=56 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle='solid',
            )
            figsp.add_artist(w)

        # losses get an additional background ring
        if 'L' in node_states and 'O' in node_states:
            w = mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=58 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
                linestyle=keywords['loss_linestyle'],
            )
            figsp.add_artist(w)
        elif 'L' in node_states:
            w = mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=59 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
            )
            figsp.add_artist(w)

        # plot all wedges
        for pap in cpaps:
            theta1, theta2 = wedges[pap]
            color = colors[pap]

            # check for characteristics of this pap
            if cpaps[pap] == 'L':
                # a loss: drawn semi-transparent
                w = mpl.patches.Wedge(
                    (x, y),
                    keywords['radius'],
                    theta1,
                    theta2,
                    facecolor=color,
                    zorder=61 + z,
                    alpha=keywords['loss_alpha'],
                    linewidth=keywords['wedgeedgewidth'],
                    edgecolor='black',
                    linestyle=keywords['loss_linestyle'],
                )
                figsp.add_artist(w)
            elif cpaps[pap] == 'o':
                w = mpl.patches.Wedge(
                    (x, y),
                    keywords['radius'],
                    theta1,
                    theta2,
                    facecolor=color,
                    zorder=61 + z,
                    linewidth=keywords['wedgeedgewidth'],
                    edgecolor='black',
                )
                figsp.add_artist(w)
            elif cpaps[pap] == 'O':
                w = mpl.patches.Wedge(
                    (x, y),
                    keywords['radius'],
                    theta1,
                    theta2,
                    facecolor=color,
                    zorder=61 + z,
                    linewidth=keywords['wedgeedgewidth'],
                    edgecolor='black',
                    linestyle=keywords['gain_linestyle'],
                )
                figsp.add_artist(w)

        # add the labels if this option is chosen and the node is a tip
        if keywords['labels'] and tnodes[n]['tip']:
            gf = tnodes[n]['graphics']
            ha = gf['s']

            # modify the text
            if ha == 'left':
                text = keywords['_prefix'] + labels[n]
            else:
                text = labels[n] + keywords['_suffix']

            # plot the text
            plt.text(
                gf['x'],
                gf['y'],
                text,
                size=keywords['textsize'],
                va='center',
                ha=ha,
                fontweight='bold',
                color='black',
                rotation=gf['angle'],
                rotation_mode='anchor',
                zorder=z,
            )

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    prop = mpl.font_manager.FontProperties(size=keywords['legendsize'])

    if keywords['legend']:
        legend1 = plt.legend(
            legendEntriesA,
            legendTextA,
            loc=keywords['legendAloc'],
            numpoints=1,
            prop=prop,
        )
        plt.legend(
            legendEntriesB,
            legendTextB,
            loc=keywords['legendBloc'],
            prop=prop,
        )
        # the second call replaces the first legend, so re-add it explicitly
        figsp.add_artist(legend1)

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'],
    )

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
def diff(wordlist, gold='cogid', test='lexstatid', modify_ref=False,
         pprint=True, filename='', tofile=True, transcription="ipa"):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented
        cognate assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with
        the extension ``diff``.
    tofile : bool (default=True)
        If set to ``False``, no data will be written to file, but instead,
        the data will be returned.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (should be a
        string, no segmentized version, for convenience of writing to
        file).

    Returns
    -------
    t : tuple
        Only if **tofile** is ``False``: a nested tuple consisting of two
        further tuples. The first containing precision, recall, and harmonic
        mean (F-scores) of the B-Cubed scores, the second containing the
        same values for the pair-scores. If **tofile** is ``True``, nothing
        is returned.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a
    specific file with the extension ``diff``. This file contains all
    cognate sets in which there are differences between gold standard and
    test sets. It also gives detailed information regarding false positives,
    false negatives, and the words involved in these wrong decisions.

    .. This function also calculates the "transformation" score. This score is
    .. based on the calculation of steps that are needed to transform one cluster
    .. for one set of meanings into the other. Ideally, if there are *n* different
    .. cognate sets covering one gloss in the gold standard, the minimal length of
    .. a mapping to convert the *m* cognate sets of the test set into the gold standard
    .. is *n*. In this case, both gold standard and test set are identical.
    .. However, if gold standard and test set differ, the number of mappings
    .. necessarily exceeds *m* and *n*. Based on this, the transformation
    .. precision is defined as :math:`\frac{m}{M}`, where *m* is the number of
    .. distinct clusters in the test set and *M* is the length of the mapping.
    .. Accordingly, the recall is defined as :math:`\frac{n}{M}`, where *n* is the
    .. number of clusters in the gold standard.

    .. Note that if precision is lower than 1.0, this means there are false
    .. positive decisions in the test set. Accordingly, a recall lower than 1.0
    .. indicates that there are false negative decisions in the test set.
    .. The drawback of this score is that it is not sensitive regarding the
    .. distinct number of decisions in which gold standard and test set differ, so
    .. the recall can be very low although most of the words have been grouped
    .. accurately. The advantage is that it can be directly interpreted in terms
    .. of 'false positive/false negative' decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    def get_pairs(cogs, idxs):
        """Yield every sorted pair of indices that share a cognate ID."""
        tmp = defaultdict(list)
        for cog, idx in zip(cogs, idxs):
            tmp[cog].append(idx)
        for members in tmp.values():
            for yA, yB in combinations(members, r=2):
                yield tuple(sorted([yA, yB]))

    def f_score(precision, recall):
        """Harmonic mean of precision and recall; 0.0 if both are zero."""
        total = precision + recall
        return 2 * (precision * recall) / total if total else 0.0

    # open file
    out = codecs.open(filename + '.diff', 'w', 'utf-8') if tofile else None
    try:
        # get a formatter for language names
        lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}'

        preB, recB = [], []
        preP, recP = [], []

        for concept in wordlist.rows:
            idxs = wordlist.get_list(row=concept, flat=True)

            cogsG = _get_cogs(gold, concept, loan, wordlist)
            cogsT = _get_cogs(test, concept, loan, wordlist)

            if cogsG == cogsT:
                # identical clusterings score perfectly on every measure
                preB.append(1.0)
                recB.append(1.0)
                preP.append(1.0)
                recP.append(1.0)
                continue

            # calculate the bcubed precision for the sets
            preB.append(_get_bcubed_score(cogsT, cogsG))

            # calculate b-cubed recall
            recB.append(_get_bcubed_score(cogsG, cogsT))

            # calculate pair precision and recall
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))
            shared = len(pairsT.intersection(pairsG))

            preP.append(shared / len(pairsT) if pairsT else 1.0)
            recP.append(shared / len(pairsG) if pairsG else 1.0)

            if not tofile:
                continue

            fp = "no" if preP[-1] == 1.0 else "yes"
            fn = "no" if recP[-1] == 1.0 else "yes"

            out.write(
                "Concept: {0}, False Positives: {1}, False Negatives: {2}\n"
                .format(concept, fp, fn))

            # get the words (from the column the caller asked for)
            words = [wordlist[i, transcription] for i in idxs]
            langs = [wordlist[i, 'taxa'] for i in idxs]

            # get a word-formatter
            wform = '{0:' + str(max([len(w) for w in words])) + '}'

            # write differences to file, grouped by gold and test cognate ID
            rows = sorted(
                zip(words, langs, cogsG, cogsT),
                key=lambda x: (x[2], x[3]))
            for word, lang, cG, cT in rows:
                out.write('{0}\t{1}\t{2:4}\t{3:4}\n'.format(
                    lform.format(lang), wform.format(word), cG, cT))
            out.write('#\n')

        bp = sum(preB) / len(preB)
        br = sum(recB) / len(recB)
        bf = f_score(bp, br)
        pp = sum(preP) / len(preP)
        pr = sum(recP) / len(recP)
        pf = f_score(pp, pr)

        as_string(
            _format_results('B-Cubed', bp, br, bf)
            + _format_results('Pair', pp, pr, pf),
            pprint=pprint)

        if tofile:
            out.write('B-Cubed Scores:\n')
            out.write('Precision: {0:.4f}\n'.format(bp))
            out.write('Recall:    {0:.4f}\n'.format(br))
            out.write('F-Score:   {0:.4f}\n'.format(bf))
            out.write('#\n')
            out.write('Pair Scores:\n')
            out.write('Precision: {0:.4f}\n'.format(pp))
            out.write('Recall:    {0:.4f}\n'.format(pr))
            out.write('F-Score:   {0:.4f}\n'.format(pf))
    finally:
        # close the handle even if scoring or writing failed half-way
        if out is not None:
            out.close()

    if tofile:
        log.file_written(filename + '.diff')
        return None
    return (bp, br, bf), (pp, pr, pf)
def plot_concept_evolution(scenarios, tree, fileformat='pdf', degree=90,
                           **keywords):
    """
    Plot the evolution according to the MLN method of all words for a given
    concept.

    Parameters
    ----------
    scenarios : list
        A sequence of ``(pap, gls)`` pairs. Each ``gls`` is passed to
        ``gls2gml``; each ``pap`` gets its own color and wedge segment.
    tree : str
        A tree representation in Newick format.
    fileformat : str (default="pdf")
        A valid fileformat according to Matplotlib.
    degree : int (default=90)
        The degree by which the tree is drawn. 360 yields a circular tree, 180
        yields a tree filling half of the space of a circle.
    **keywords
        Plot options. Keys missing from the call are filled in from the
        ``defaults`` dictionary below; values supplied by the caller are
        kept.

    Raises
    ------
    ValueError
        If ``scenarios`` is empty (there would be nothing to lay out).

    Notes
    -----
    The figure is written to ``filename + '.' + fileformat``; nothing is
    returned.
    """
    if len(scenarios) == 0:
        raise ValueError("At least one scenario is required for plotting.")

    # make defaults
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        colormap=mpl.cm.jet,
        edgewidth=5,
        radius=2.5,
        outer_radius=0.5,
        inner_radius=0.25,
        cognates='',
        usetex=False,
        latex_preamble=False,
        textsize=8,
        change=lambda x: x ** 1.75,
        xlim=0,
        ylim=0,
        xlimr=False,
        xliml=False,
        ylimt=False,
        ylimb=False,
        rootsize=10,
        legend=True,
        legendsize=5,
        legendAloc='upper right',
        legendBloc='lower right',
        markeredgewidth=2.5,
        wedgeedgewidth=2,
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        labels={},
        _prefix='- ',
        _suffix=' -',
        colors={},
        start=0,
        filename=rcParams['filename'],
        loss_alpha=0.1,
        loss_background='0.75',
        edges=[],
        hedge_color="black",
        hedge_width=5,
        hedge_linestyle='dashed',
    )
    # Merge without clobbering: a plain ``keywords.update(defaults)`` would
    # discard every option the caller passed in.
    for option, fallback in defaults.items():
        if option not in keywords:
            keywords[option] = fallback

    # set filename as variable for convenience
    filename = keywords['filename']

    # XXX customize later XXX
    colormap = keywords['colormap']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    # check for preamble settings
    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # get the tgraph
    tgraph = radial_layout(
        tree,
        degree=degree,
        change=keywords['change'],
        start=keywords['start'],
    )
    # Index the layout graph's node data by name once; ``tgraph.node`` is not
    # available in recent networkx releases.
    tree_nodes = dict(tgraph.nodes(data=True))

    # get the taxa
    taxa = [name for name, data in tree_nodes.items() if data['tip']]

    # set the labels
    labels = {}
    for taxon in taxa:
        if taxon in keywords['labels']:
            labels[taxon] = keywords['labels'][taxon]
        else:
            labels[taxon] = taxon

    # get the number of paps in order to get the right colors
    cfunc = np.array(np.linspace(10, 256, len(scenarios)), dtype='int')

    if not keywords['colors']:
        colors = {
            scenarios[i][0]: mpl.colors.rgb2hex(colormap(cfunc[i]))
            for i in range(len(scenarios))
        }
    else:
        colors = keywords['colors']

    # get the wedges for the paps
    wedges = {}
    linsp = np.linspace(0, 360, len(scenarios) + 1)
    for i, scenario in enumerate(scenarios):
        pap = scenario[0]
        wedges[pap] = (linsp[i], linsp[i + 1])

    if keywords['legend']:
        # set the linestyle for the legend; names without a known format
        # string fall back to dotted so that ``ls`` is always defined
        if keywords['gain_linestyle'] == 'dashed':
            ls = '--'
        elif keywords['gain_linestyle'] == 'solid':
            ls = '-'
        elif keywords['gain_linestyle'] == 'dashdot':
            ls = '-.'
        else:
            ls = ':'

        legendEntriesA = []
        legendTextA = []

        # add stuff for the legend
        for pap, gls in scenarios:
            w = mpl.patches.Wedge(
                (0, 0),
                1,
                wedges[pap][0],
                wedges[pap][1],
                facecolor=colors[pap],
                zorder=1,
                linewidth=keywords['wedgeedgewidth'],
                edgecolor='black',
            )
            legendEntriesA += [w]
            legendTextA += [pap]

        # second legend explains evolution
        legendEntriesB = []
        legendTextB = []
        p = mpl.patches.Wedge(
            (0, 0),
            1,
            0,
            360,
            facecolor='0.5',
            linewidth=keywords['wedgeedgewidth'],
            edgecolor='black',
        )
        legendEntriesB += [p]
        legendTextB += ['Loss Event']

        p, = plt.plot(
            0,
            0,
            ls,
            color='black',
            linewidth=keywords['wedgeedgewidth'],
        )
        legendEntriesB += [p]
        legendTextB += ['Gain Event']

        # overwrite stuff
        plt.plot(0, 0, 'o', markersize=2, zorder=2, color='white')

    # iterate over the paps and record, per node, the state of each pap
    pap_states = {}
    g = None
    for pap, gls in scenarios:
        # get the graph with the model
        g = gls2gml(gls, tgraph, tree, filename='')

        # iterate over the graph
        for n, d in g.nodes(data=True):
            if n not in pap_states:
                pap_states[n] = {}
            pap_states[n][pap] = d['state']

    # graphics of the last model graph, indexed by node (replaces ``g.node``)
    graphics = {}
    for n, d in g.nodes(data=True):
        graphics[n] = d['graphics']

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)
    for s in figsp.spines.values():
        s.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    xvals = []
    yvals = []

    # iterate over edges first
    for nA, nB in g.edges():
        xA, yA = graphics[nA]['x'], graphics[nA]['y']
        xB, yB = graphics[nB]['x'], graphics[nB]['y']
        plt.plot(
            [xA, xB],
            [yA, yB],
            '-',
            color='black',
            linewidth=keywords['edgewidth'],
        )

    # add horizontal edges if this option is chosen
    if keywords['edges']:
        # get the coordinates
        for nA, nB in keywords['edges']:
            xA, yA = graphics[nA]['x'], graphics[nA]['y']
            xB, yB = graphics[nB]['x'], graphics[nB]['y']
            plt.plot(
                [xA, xB],
                [yA, yB],
                '-',
                color=keywords['hedge_color'],
                linewidth=keywords["hedge_width"],
                linestyle=keywords['hedge_linestyle'],
            )

    # now iterate over the nodes
    for n, cpaps in pap_states.items():
        x, y = graphics[n]['x'], graphics[n]['y']

        # get z-value which serves as zorder attribute; best effort only,
        # as ``tree`` may not provide ``getConnectingEdges``
        try:
            z = 6 * len(tree.getConnectingEdges('root', n))
        except Exception:
            z = 0

        xvals += [x]
        yvals += [y]

        # plot the default marker
        plt.plot(
            x,
            y,
            'o',
            markersize=keywords['rootsize'],
            color='black',
            zorder=50,
        )

        has_origin = 'O' in cpaps.values()
        has_retention = 'o' in cpaps.values()
        has_loss = 'L' in cpaps.values()

        # check for origins in cpaps
        if has_origin:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] + keywords['outer_radius'],
                0,
                360,
                facecolor='white',
                zorder=57 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle=keywords['gain_linestyle'],
            )
            figsp.add_artist(w)
        # check for retentions
        elif has_retention:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] + keywords['outer_radius'],
                0,
                360,
                facecolor='white',
                zorder=56 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle='solid',
            )
            figsp.add_artist(w)

        # background ring for nodes with at least one loss
        if has_loss and has_origin:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] + keywords['outer_radius'],
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=58 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
                linestyle=keywords['loss_linestyle'],
            )
            figsp.add_artist(w)
        elif has_loss:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] + keywords['outer_radius'],
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=59 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
            )
            figsp.add_artist(w)

        # plot all wedges
        for pap in cpaps:
            theta1, theta2 = wedges[pap]
            color = colors[pap]

            # check for characteristics of this pap

            # if it's a loss
            if cpaps[pap] == 'L':
                w = mpl.patches.Wedge(
                    (x, y),
                    keywords['radius'],
                    theta1,
                    theta2,
                    facecolor=color,
                    zorder=61 + z,
                    alpha=keywords['loss_alpha'],
                    linewidth=keywords['wedgeedgewidth'],
                    edgecolor='black',
                    linestyle=keywords['loss_linestyle'],
                )
                figsp.add_artist(w)

            elif cpaps[pap] == 'o':
                w = mpl.patches.Wedge(
                    (x, y),
                    keywords['radius'],
                    theta1,
                    theta2,
                    facecolor=color,
                    zorder=61 + z,
                    linewidth=keywords['wedgeedgewidth'],
                    edgecolor='black',
                )
                figsp.add_artist(w)

            elif cpaps[pap] == 'O':
                w = mpl.patches.Wedge(
                    (x, y),
                    keywords['radius'],
                    theta1,
                    theta2,
                    facecolor=color,
                    zorder=61 + z,
                    linewidth=keywords['wedgeedgewidth'],
                    edgecolor='black',
                    linestyle=keywords['gain_linestyle'],
                )
                figsp.add_artist(w)

        # add the labels if this option is chosen
        if keywords['labels']:
            # if node is a tip
            if tree_nodes[n]['tip']:
                # get the values
                gf = tree_nodes[n]['graphics']
                r = gf['angle']
                x, y = gf['x'], gf['y']
                ha = gf['s']

                # modify the text
                if ha == 'left':
                    text = keywords['_prefix'] + labels[n]
                else:
                    text = labels[n] + keywords['_suffix']

                # plot the text
                plt.text(
                    x,
                    y,
                    text,
                    size=keywords['textsize'],
                    va='center',
                    ha=ha,
                    fontweight='bold',
                    color='black',
                    rotation=r,
                    rotation_mode='anchor',
                    zorder=z,
                )

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    prop = mpl.font_manager.FontProperties(size=keywords['legendsize'])

    if keywords['legend']:
        legend1 = plt.legend(
            legendEntriesA,
            legendTextA,
            loc=keywords['legendAloc'],
            numpoints=1,
            prop=prop,
        )
        plt.legend(
            legendEntriesB,
            legendTextB,
            loc=keywords['legendBloc'],
            prop=prop,
        )
        # the second ``plt.legend`` call replaces the first legend on the
        # axes, so add the first one back as a separate artist
        figsp.add_artist(legend1)

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'],
    )

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
def plot_heatmap(
        wordlist,
        filename="heatmap",
        fileformat="pdf",
        ref='cogid',
        normalized=False,
        refB='',
        **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written. The matrix
        itself is additionally written to ``filename + '.matrix'``.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to ``False``, don't normalize the data and report raw counts
        of shared cognate IDs. Otherwise, select the normalization method,
        choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        Name of a second cognate-ID column. If given, the cells below the
        diagonal are computed from this column; otherwise they mirror the
        cells above the diagonal (which are always computed from ``ref``).
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        Controls how many taxon names are written to the axes: every
        ``round(len(taxa) / steps)``-th taxon is labelled (at least every
        taxon if that value would be zero).
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    colorbar_label : str (default="Shared Cognates")
        The label of the colorbar.
    colorbar_shrink : float (default=0.75)
        Passed as ``shrink`` to the colorbar.
    colorbar_textsize : int (default=10)
        Font size of the colorbar label.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    show_tree : bool (default=True)
        If True, a dendrogram of the reference tree is drawn left of the
        heatmap and determines the order of the taxa.
    labels : dict (default={})
        Replacement labels for taxon names on the axes.
    matrix : list or numpy.ndarray (default=False)
        A precomputed matrix; if given (and non-empty), it is used instead of
        computing shared cognates from the wordlist.
    distances : bool (default=False)
        If True, every cell ``x`` of the matrix is replaced by ``1 - x``
        (in place) before plotting and writing.
    textsize : int (default=5)
        Font size of the taxon names on the axes.
    vmin, vmax : float (default=0.0, 1.0)
        Value range of the color scale.
    left, right, top, bottom, width, height, scale : float
        Layout parameters for the axes and ``subplots_adjust``.

    Raises
    ------
    ValueError
        If no tree is passed and none can be retrieved from the wordlist, or
        if `normalized` is truthy but neither "jaccard" nor "swadesh".

    Notes
    -----
    This function plots shared cognate percentages.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False
    )
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            # Any lookup failure is reported as ValueError (as before), but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix: raw counts are integers, normalized scores are
    # floats
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        # the leaf order of the dendrogram (reversed) defines the taxon order
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        # NOTE(review): this requires `tree` to expose a `.taxa` attribute;
        # a plain Newick string passed via `tree=` would fail here -- confirm.
        taxa = tree.taxa

    # A user-supplied matrix may be a numpy array, whose truth value is
    # ambiguous; test arrays by size and everything else by truthiness.
    user_matrix = keywords['matrix']
    if isinstance(user_matrix, np.ndarray):
        has_user_matrix = user_matrix.size > 0
    else:
        has_user_matrix = bool(user_matrix)

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if has_user_matrix:
        matrix = user_matrix
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = _shared_cognate_score(
                        wordlist, taxonA, taxonB, ref, normalized)
                    matrix[i][j] = shared
                    # without a second reference column the matrix is
                    # symmetric
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    matrix[i][j] = _shared_cognate_score(
                        wordlist, taxonA, taxonB, refB, normalized)
                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=ref
                        )
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )

    # [0.15,0.1,0.7,0.7])
    if 'distances' in keywords and keywords['distances']:
        # turn similarities into distances, cell by cell and in place
        for i, row in enumerate(matrix):
            for j, _ in enumerate(row):
                matrix[i][j] = 1 - matrix[i][j]

    # NOTE(review): the image drawn on ax2 is this 2x2 placeholder spanning
    # vmin/vmax; the real matrix is only handed to plt.imshow (invisible)
    # in the colorbar branch below -- confirm this is intended.
    nmatrix = [
        [keywords['vmax'], keywords['vmin']],
        [keywords['vmin'], keywords['vmax']]
    ]

    im = ax2.matshow(nmatrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks; the step is clamped to 1 because a rounded value of 0
    # (fewer than steps / 2 taxa) would make range() raise a ValueError
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    # write the matrix as a tab-separated table, one row per taxon; the
    # context manager closes the file even if a write fails
    with open(filename + '.matrix', 'w') as handle:
        for i, taxon in enumerate(taxa):
            handle.write('{0:20}'.format(taxon))
            for cell in matrix[i]:
                if not normalized:
                    handle.write('\t{0:3}'.format(int(cell)))
                else:
                    handle.write('\t{0:.2f}'.format(cell))
            handle.write('\n')
    log.file_written(filename + '.' + fileformat)


def _shared_cognate_score(wordlist, taxonA, taxonB, ref, normalized):
    """
    Compute the shared-cognate score of two taxa for one cognate-ID column.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        The wordlist queried via ``get_list`` / ``get_dict``.
    taxonA, taxonB : str
        The two taxa to compare.
    ref : str
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str}
        ``False`` for the raw number of shared cognate IDs, "jaccard" for the
        number of shared IDs divided by the size of the union, anything else
        for the "swadesh" proportion of meaning slots with a shared cognate.

    Returns
    -------
    score : {int, float}
        The raw count or the normalized score. If a normalized score cannot
        be computed because the denominator is zero, a warning is logged and
        0.0 is returned.
    """
    if normalized in [False, "jaccard"]:
        cogsA = set(wordlist.get_list(taxa=taxonA, flat=True, entry=ref))
        cogsB = set(wordlist.get_list(taxa=taxonB, flat=True, entry=ref))
        shared = len(cogsA.intersection(cogsB))
        if normalized:
            try:
                shared = shared / len(cogsA.union(cogsB))
            except ZeroDivisionError:
                # neither taxon has any cognate IDs; handled like the
                # empty-slot case of the "swadesh" branch below
                log.warning(str(
                    [shared, 0, len(cogsA), len(cogsB), taxonA, taxonB]))
                shared = 0.0
        return shared

    cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
    cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)

    shared = 0
    slots = 0

    # iterate over cognate sets in meaning slots
    for key in cogsA:
        # we follow the STARLING procedure in ignoring missing data: only
        # meanings attested for both taxa count as a slot
        if key in cogsB:
            # check for shared items
            if any(k in cogsB[key] for k in cogsA[key]):
                shared += 1
            slots += 1
    try:
        shared = shared / slots
    except ZeroDivisionError:
        log.warning(str(
            [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
        shared = 0.0
    return shared