def _get_brackets(brackets): out = defaultdict(str) for b in brackets: out[b] = unicodedata.lookup(unicodedata.name(b).replace('LEFT', 'RIGHT')) if b == out[b]: log.warn('lingpy.sequence.sound_classes.get_brackets' + \ 'Item «{0}» does not have a counterpart!'.format(b)) return out
def _get_brackets(brackets): out = defaultdict(str) for b in brackets: out[b] = unicodedata.lookup(unicodedata.name(b).replace('LEFT', 'RIGHT')) if b == out[b]: log.warn('lingpy.sequence.sound_classes.get_brackets' + \ 'Item «{0}» does not have a counterpart!'.format(b)) return out
def test_convenience(self):
    """Smoke-test every convenience wrapper exposed by lingpy.log."""
    from lingpy.log import (
        info, warn, debug, error, deprecated, missing_module, file_written)

    # exercise each wrapper once, in the same order as before
    calls = [
        (info, ('m',)),
        (warn, ('m',)),
        (debug, ('m',)),
        (error, ('m',)),
        (deprecated, ('o', 'n')),
        (missing_module, ('m',)),
        (file_written, ('f',)),
    ]
    for func, args in calls:
        func(*args)
def test_convenience(self):
    """Ensure each convenience logging helper in lingpy.log can be invoked."""
    from lingpy.log import info
    from lingpy.log import warn
    from lingpy.log import debug
    from lingpy.log import error
    from lingpy.log import deprecated
    from lingpy.log import missing_module
    from lingpy.log import file_written

    info('m')
    warn('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
def string2html(taxon, string, swaps=None, tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.

    Parameters
    ----------
    taxon : str
        The taxon (language) name rendered in the leading table cell.
    string : iterable
        The (aligned) sequence whose segments are rendered as colored cells.
    swaps : iterable (default=None)
        Indices of alignment positions to be highlighted as swaps.
    tax_len : int (default=None)
        Scaling length for the taxon cell; defaults to len(taxon).

    Returns
    -------
    out : str
        HTML table cells (one taxon cell plus one cell per segment).

    @deprecated
    """
    # fixed: avoid the mutable default argument ([]); an empty tuple behaves
    # identically for the "i in swaps" membership test below
    swaps = swaps or ()

    # determine the length of the string
    if not tax_len:
        tax_len = len(taxon)

    # set the td_taxon-line (unused tr/perc locals removed)
    td_taxon = '<td class="taxon" width="' + str(15 * tax_len) + '">{0}</td>\n'

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = td_taxon.format(taxon)

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:  # narrowed from a bare except
            # fall back to the color of the first symbol of the segment
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warn("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'
        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)
    return out
def string2html(taxon, string, swaps=None, tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.

    Parameters
    ----------
    taxon : str
        The taxon (language) name rendered in the leading table cell.
    string : iterable
        The (aligned) sequence whose segments are rendered as colored cells.
    swaps : iterable (default=None)
        Indices of alignment positions to be highlighted as swaps.
    tax_len : int (default=None)
        Scaling length for the taxon cell; defaults to len(taxon).

    Returns
    -------
    out : str
        HTML table cells (one taxon cell plus one cell per segment).

    @deprecated
    """
    # fixed: avoid the mutable default argument ([]); an empty tuple behaves
    # identically for the "i in swaps" membership test below
    swaps = swaps or ()

    # determine the length of the string
    if not tax_len:
        tax_len = len(taxon)

    # set the td_taxon-line (unused tr/perc locals removed)
    td_taxon = '<td class="taxon" width="' + str(15 * tax_len) + '">{0}</td>\n'

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = td_taxon.format(taxon)

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:  # narrowed from a bare except
            # fall back to the color of the first symbol of the segment
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warn("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'
        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)
    return out
def tokens2html(string, swaps=None, tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.

    Parameters
    ----------
    string : iterable
        The (aligned) token sequence rendered as colored table cells.
    swaps : iterable (default=None)
        Indices of alignment positions to be highlighted as swaps.
    tax_len : int (default=None)
        Unused; kept only for backward compatibility of the signature.

    Returns
    -------
    out : str
        A complete <table> string of colored cells.

    Notes
    -----
    This function is currently not used by any other program. So it might be
    useful to just deprecate it.

    @deprecated
    """
    # fixed: avoid the mutable default argument ([]); an empty tuple behaves
    # identically for the "i in swaps" membership test below
    swaps = swaps or ()

    # get vals for residue and swaps (unused tr/perc locals removed)
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start the table
    out = '<table>'

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:  # narrowed from a bare except
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warn("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'
        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)
    return out + '</table>'
def tokens2html(string, swaps=None, tax_len=None):
    """
    Function converts an (aligned) string into colored html-format.

    Parameters
    ----------
    string : iterable
        The (aligned) token sequence rendered as colored table cells.
    swaps : iterable (default=None)
        Indices of alignment positions to be highlighted as swaps.
    tax_len : int (default=None)
        Unused; kept only for backward compatibility of the signature.

    Returns
    -------
    out : str
        A complete <table> string of colored cells.

    Notes
    -----
    This function is currently not used by any other program. So it might be
    useful to just deprecate it.

    @deprecated
    """
    # fixed: avoid the mutable default argument ([]); an empty tuple behaves
    # identically for the "i in swaps" membership test below
    swaps = swaps or ()

    # get vals for residue and swaps (unused tr/perc locals removed)
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start the table
    out = '<table>'

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:  # narrowed from a bare except
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warn("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'
        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)
    return out + '</table>'
def npoint_ap(scores, cognates, reverse=False):
    """
    Calculate the n-point average precision.

    Parameters
    ----------
    scores : list
        The scores of your algorithm for pairwise string comparison.
    cognates : list
        The cognate codings of the word pairs you compared. 1 indicates that
        the pair is cognate, 0 indicates that it is not cognate.
    reverse : bool (default=False)
        The order of your ranking mechanism. If your algorithm yields high
        scores for words which are probably cognate, and low scores for
        non-cognate words, you should set this keyword to "True".

    Notes
    -----
    This follows the description in :evobib:`Kondrak2002`. The n-point average
    precision is useful to compare the discriminative force of different
    algorithms for string similarity, or to train the parameters of a given
    algorithm.

    Examples
    --------
    >>> scores = [1, 2, 3, 4, 5]
    >>> cognates = [1, 1, 1, 0, 0]
    >>> from lingpy.evaluate.acd import npoint_ap
    >>> npoint_ap(scores, cognates)
    1.0
    """
    # rank the pairs by score and accumulate precision at each cognate hit
    ranked = sorted(zip(scores, cognates), key=lambda pair: pair[0],
                    reverse=reverse)
    precision_sum, hits = 0.0, 0
    for rank, (_, is_cognate) in enumerate(ranked, start=1):
        if is_cognate == 1:
            hits += 1
            precision_sum += hits / (rank + 0.0)

    total = cognates.count(1)
    if not total:
        log.warn(
            "Encountered Zero Division in npoint_ap, your data seems to "
            "contain no cognates.")
        return 0
    return precision_sum / total
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).
    """
    # make sure the etymological dictionary has been computed
    wordlist.get_etymdict(ref=ref)

    # the state alphabet is limited to letters and digits, unfortunately
    symbols = ascii_letters + digits

    rows = []
    for concept in wordlist.concepts:
        cogsets = wordlist.get_dict(concept=concept, entry=ref)

        # collect all distinct states attested for this concept
        states = set()
        for language in wordlist.taxa:
            states.update(cogsets.get(language, [0]))

        if len(states) > len(symbols):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we will get a
            # KeyError down below, since zip just returns a list of length
            # len(symbols)!
            log.warn('more distinct states than available characters!')

        encoding = dict(zip(sorted(states), symbols))
        encoding['-'] = '-'

        row = []
        for language in wordlist.taxa:
            attested = set(cogsets.get(language, ['-']))
            # exclude the case len(cogsets[language]) == 0
            if len(attested) == 1:
                row.append(encoding[attested.pop()])
            elif not attested:
                row.append(missing)
            else:
                row.append('({0})'.format("".join(
                    [encoding[s] for s in sorted(attested)])))
        rows.append(row)
    return misc.transpose(rows)
def npoint_ap(scores, cognates, reverse=False):
    """
    Calculate the n-point average precision.

    Parameters
    ----------
    scores : list
        The scores of your algorithm for pairwise string comparison.
    cognates : list
        The cognate codings of the word pairs you compared. 1 indicates that
        the pair is cognate, 0 indicates that it is not cognate.
    reverse : bool (default=False)
        The order of your ranking mechanism. If your algorithm yields high
        scores for words which are probably cognate, and low scores for
        non-cognate words, you should set this keyword to "True".

    Notes
    -----
    This follows the description in :evobib:`Kondrak2002`. The n-point average
    precision is useful to compare the discriminative force of different
    algorithms for string similarity, or to train the parameters of a given
    algorithm.

    Examples
    --------
    >>> scores = [1, 2, 3, 4, 5]
    >>> cognates = [1, 1, 1, 0, 0]
    >>> from lingpy.evaluate.acd import npoint_ap
    >>> npoint_ap(scores, cognates)
    1.0
    """
    pairs = sorted(zip(scores, cognates), key=lambda item: item[0],
                   reverse=reverse)
    p = 0.0
    seen = 0
    rank = 0
    for _, judgement in pairs:
        rank += 1
        if judgement == 1:
            seen += 1
            p += seen / (rank + 0.0)
    try:
        return p / cognates.count(1)
    except ZeroDivisionError:
        log.warn("Encountered Zero Division in npoint_ap, your data seems to contain no cognates.")
        return 0
def wl2multistate(wordlist, ref, missing):
    """
    Helper function converts a wordlist to multistate format (compatible with
    PAUP).
    """
    # ensure the etymological dictionary exists
    wordlist.get_etymdict(ref=ref)

    # we only have a limited alphabet of state symbols, unfortunately
    alphabet = ascii_letters + digits

    columns = []
    for concept in wordlist.concepts:
        by_taxon = wordlist.get_dict(concept=concept, entry=ref)

        # gather every state occurring for this concept across all taxa
        observed = set()
        for t in wordlist.taxa:
            observed.update(by_taxon.get(t, [0]))

        if len(observed) > len(alphabet):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we will get a
            # KeyError down below, since zip just returns a list of length
            # len(alphabet)!
            log.warn('more distinct states than available characters!')

        mapping = dict(zip(sorted(observed), alphabet))
        mapping['-'] = '-'

        cells = []
        for t in wordlist.taxa:
            tset = set(by_taxon.get(t, ['-']))
            # exclude the case len(by_taxon[t]) == 0
            if not tset:
                cells.append(missing)
            elif len(tset) == 1:
                cells.append(mapping[tset.pop()])
            else:
                cells.append('({0})'.format("".join(
                    [mapping[x] for x in sorted(tset)])))
        columns.append(cells)
    return misc.transpose(columns)
def normalize_alignment(alignment):
    """
    Function normalizes an alignment.

    Normalization here means that columns consisting only of gaps will be
    deleted, and all sequences will be stretched to equal length by adding
    additional gap characters in the end of smaller sequences.
    """
    # work on a copy so the caller's data stays untouched
    rows = [list(seq) for seq in alignment]
    lengths = [len(row) for row in rows]

    # sequences given as single space-separated strings are split up first
    if lengths.count(1) == len(lengths):
        for i, row in enumerate(rows):
            rows[i] = row[0].split(' ')
            lengths[i] = len(rows[i])

    # pad shorter sequences with gaps at the end
    if len(set(lengths)) > 1:
        longest = max(lengths)
        for i, row in enumerate(rows):
            rows[i] = (row + ['-'] * longest)[:longest]

    # locate columns that consist exclusively of gaps
    gap_cols = []
    for col_idx, column in enumerate(misc.transpose(rows)):
        if set(column) == set('-'):
            gap_cols.append(col_idx)

    # delete them from right to left so remaining indices stay valid
    for col_idx in gap_cols[::-1]:
        for row in rows:
            del row[col_idx]

    if alignment == rows:
        return alignment

    # report the change to the user
    lgtxt = 'Modified the alignment:\n'
    for i in range(len(alignment)):
        lgtxt += '[!] ' + ' '.join(alignment[i]) + '->'
        lgtxt += ' '.join(rows[i]) + '\n'
    warn(lgtxt)
    return rows
def context_profile(wordlist, ref='ipa', col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
                    brackets=None, splitters='/,;~', merge_geminates=True,
                    clts=False, bad_word="<???>", bad_sound="<?>",
                    unknown_sound="!{0}", examples=2, max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into single segments.
    clts : dict (default=False)
        A dictionary(like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can
        add formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class be
        handled. Note that both "bad_word" and "bad_sound" are format-strings,
        so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If with_clts is set to True, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        Maximal number of (language, word) entries retained per segment when
        computing frequencies and collecting examples.

    Returns
    -------
    profile : generator
        A generator of six-tuples: the segment (with ^/$ context markers),
        its IPA representation (or a placeholder), example words, the
        languages it occurs in, its frequency, and the unicode-codepoints.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    # NOTE(review): the string default here overrides any dict passed as
    # `brackets`; only membership tests are performed on it below
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words', total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        # segmented input may arrive as a list of tokens
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word, semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels, brackets=None,
                    ignore_brackets=False, split_entries=False,
                    preparse=None, rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    # attach word-initial (^) and word-final ($) context
                    # markers to the first and last segment, respectively
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(
                            context_pre, context_post, cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    # characters of the raw word that vanished during
                    # cleaning are recorded as NULL segments
                    for segment in [x for x in word
                                    if x not in ' '.join(cleaned_string)]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except:
                # deliberately broad: any parsing failure is logged and the
                # word is skipped (best-effort profile creation)
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    # the bare context markers themselves are emitted as NULL entries
    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    # yield the profile entries, most frequent segments first
    for idx, (s, entries) in pb(
            enumerate(sorted(profile.items(), key=lambda x: len(x[1]),
                             reverse=True)),
            desc='yielding entries', total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words, langs = [l[1] for l in entries][:max_entries], \
            [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            # '0' is the Dolgopolsky class for unrecognized sounds
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = text_type(sound)
        else:
            ipa = s.strip('^$')
        yield s, ipa, examples_, languages, frequency, codepoints
def plot_heatmap(
        wordlist,
        filename="heatmap",
        fileformat="pdf",
        ref='cogid',
        normalized=False,
        refB='',
        **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid')
        The name of the column that contains the cognate identifiers.
    normalized : {bool str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        Optional second cognate-ID column; if given, the lower triangle of
        the matrix is filled from this column instead of mirroring `ref`.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=5)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=45)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,10))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.

    Notes
    -----
    This function plots shared cognate percentages. It also writes a plain
    text version of the matrix to "<filename>.matrix".
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90
    )
    # fill in missing keyword defaults without overwriting caller values
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix (ints for raw counts, floats for normalized)
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree as a dendrogram on the left-hand side
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        # reorder the taxa according to the dendrogram leaf order
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    # upper triangle: shared cognates according to `ref`
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA, flat=True, entry=ref)
                        cogsB = wordlist.get_list(
                            taxa=taxonB, flat=True, entry=ref)
                        cogsA, cogsB = set(cogsA), set(cogsB)
                        shared = len(cogsA.intersection(cogsB))
                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
                        cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)
                        shared = 0
                        slots = 0
                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:
                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warn(str(
                                [shared, slots, len(cogsA), len(cogsB),
                                 taxonA, taxonB]))
                            shared = 0.0
                    matrix[i][j] = shared
                    # if refB is also a possibiltiy: only mirror into the
                    # lower triangle when no second reference is requested
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    # lower triangle: same computation, based on `refB`
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA, flat=True, entry=refB)
                        cogsB = wordlist.get_list(
                            taxa=taxonB, flat=True, entry=refB)
                        cogsA, cogsB = set(cogsA), set(cogsB)
                        shared = len(cogsA.intersection(cogsB))
                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(taxa=taxonA, entry=refB)
                        cogsB = wordlist.get_dict(taxa=taxonB, entry=refB)
                        shared = 0
                        slots = 0
                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:
                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warn(str(
                                [shared, slots, len(cogsA), len(cogsB),
                                 taxonA, taxonB]))
                            shared = 0.0
                    matrix[i][j] = shared
                elif i == j:
                    # diagonal: cognate-set count (or 1.0 when normalized)
                    cogs = wordlist.get_list(taxa=taxonA, flat=True, entry=ref)
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )
    cmap = keywords['cmap']

    # [0.15,0.1,0.7,0.7])
    im = ax2.matshow(matrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks: only every `steps`-th taxon gets a label
    steps = int(len(taxa) / keywords['steps'] + 0.5)
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # also dump a plain-text version of the matrix
    f = open(filename + '.matrix', 'w')
    for i, t in enumerate(taxa):
        f.write('{0:20}'.format(t))
        for j, c in enumerate(matrix[i]):
            if not normalized:
                f.write('\t{0:3}'.format(int(c)))
            else:
                f.write('\t{0:.2f}'.format(c))
        f.write('\n')
    f.close()
    log.file_written(filename + '.' + fileformat)
def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled
    version.

    Parameters
    ----------
    filename : dict, Wordlist-like object, or str
        The input data: a raw data dictionary, another parser/wordlist
        object (anything exposing `_data`, `_meta`, and `header`), or the
        path of a QLC file.
    conf : str (default='')
        Path of the configuration file; defaults to the bundled
        conf/qlc.rc.

    Raises
    ------
    ValueError
        If a dict input's header row and data rows have different lengths.
    IOError
        If a string path does not point to an existing file.
    TypeError
        If `filename` is of an unrecognized type.
    """
    self.log = log.get_logger()

    # try to load the data
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True

        # make check for correct input, there was a bug with a wrong
        # evaluation which is hopefully fixed by now
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        input_data = dict(filename._data.items())
        input_data.update(filename._meta.items())
        input_data[0] = [a for a, b in sorted(
            filename.header.items(), key=lambda x: x[1], reverse=False)]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
            type(filename).__name__))

    # load the configuration file
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    # define two attributes, _alias, and _class which store the aliases and
    # the datatypes (classes) of the given entries
    self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there
        self._alias[name.lower()] = name
        self._alias[name.upper()] = name
        # NOTE: eval() is applied to class names read from the trusted,
        # bundled configuration file (e.g. "str", "int")
        self._class[name.lower()] = eval(cls)
        self._class[name.upper()] = eval(cls)
        self._class_string[name.lower()] = cls
        self._class_string[name.upper()] = cls

        # add the aliases
        for a in alias.split(','):
            self._alias[a.lower()] = name
            self._alias[a.upper()] = name
            self._class[a.lower()] = eval(cls)
            self._class[a.upper()] = eval(cls)
            self._class_string[a.lower()] = cls
            self._class_string[a.upper()] = cls

        self._alias2[name] = sorted(set(alias.split(','))) + [name]

    # append the names in data[0] to self.conf to make sure that all data
    # is covered, even the types which are not specifically defined in the
    # conf file. the datatype defaults here to "str"
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add empty alias for empty strings XXX why was that? I can't remember
    # why this was important XXX
    self._alias[''] = ''

    # the header stores the indices of the data in the original data
    # dictionary
    self.header = dict(
        zip([self._alias[x] for x in input_data[0]],
            range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = dict([(k, v) for k, v in self.header.items()])

    # assign all aliases to the header
    for alias in self._alias:
        try:
            idx = self._header[self._alias[alias]]
            self._header[alias] = idx
        except:
            pass

    # assign the data as attribute to the word list class. Note that we
    # need to check for the type here, but since numpy also offers integer
    # types, we don't check for type(x) == int, but instead use the
    # str.numeric-function that returns numeric values only if it is an
    # integer
    self._data = {
        int(k): v for k, v in input_data.items()
        if k != 0 and str(k).isnumeric()}

    # iterate over self._data and change the values according to the
    # functions (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            check = []
            for head, i in heads:
                if i not in check:
                    try:
                        self._data[key][i] = self._class[head](self._data[key][i])
                        check.append(i)
                    except:  # pragma: no cover
                        # fixed: .format() previously applied only to the
                        # last of three concatenated literals, so the
                        # emitted message kept raw {0}/{1} placeholders;
                        # parenthesize the whole message before formatting
                        log.warn(
                            ('Problem with row {0} in col {1}, expected'
                             ' «{4}» as datatype but received «{3}» '
                             ' (ROW: {2}, entry {5}).').format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i], self._class[head], head))

    # create entry attribute of the wordlist
    self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it
        will be ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key. Metadata
        (``@key: value`` lines and ``<dtype>...</dtype>`` sections) is stored
        under its string keys.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        # skip comments and blank lines
        if line.startswith(comment) or not line:
            continue

        # simple metadata: "@key: value"
        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # repeated keys are collected into a list, with a warning
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        warn(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords: attributes are given as
            # key="value" pairs after the tag name
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]  # strip the surrounding quotes
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            # consume all lines up to the matching closing tag
            tmp = []
            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]
            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}
                if not keys:
                    keys["id"] = "1"
                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))
                # NOTE(review): dtype attribute is eval'ed — assumes the
                # input file is trusted
                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}
                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    # fall back to the file name if no dataset key was read
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # prefer an integer id, fall back to the raw string
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa
            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # symmetrize the upper-triangular distance data
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer
            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            # an ordinary tab-separated data row
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) +
                        '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    # merge the metadata into the result dictionary
    for m in meta:
        d[m] = meta[m]

    # expose the first tree as the default 'tree' entry
    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled version.

    Parameters
    ----------
    filename : dict, parser-like object, or str
        Either a data dictionary (header row stored under key 0, rows under
        integer keys), another parser/wordlist-like object (anything exposing
        ``_data`` and ``_meta``), or the path of a QLC file to read.
    conf : str (default='')
        Path of the configuration file; falls back to the bundled
        ``conf/qlc.rc`` when empty.

    Raises
    ------
    ValueError
        If a dict input's header length does not match its rows, or if any
        row's field count disagrees with the header.
    IOError
        If ``filename`` is a string but no such file exists.
    TypeError
        If ``filename`` is of an unrecognized type.
    """
    self.log = log.get_logger()

    # try to load the data; internal_import marks data that is already typed
    # (dict or another wordlist) so per-cell type conversion is skipped below
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True
        # make check for correct input, there was a bug with a wrong
        # evaluation which is hopefully fixed by now
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            print(input_data[0], input_data[tmp_keys[0]])
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        # deep-copy the row lists so the new object does not share state
        input_data = dict([(key, [v for v in value]) for key, value in
                           filename._data.items()])
        input_data.update(filename._meta.items())
        # rebuild the header row (key 0) in column-index order
        input_data[0] = [a for a, b in sorted(
            filename.header.items(), key=lambda x: x[1], reverse=False)]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError(
            "Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

    # load the configuration file
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf; each line is
    # name<TAB>class<TAB>comma-separated-aliases
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    # define two attributes, _alias, and _class which store the aliases and
    # the datatypes (classes) of the given entries
    # NOTE(review): eval(cls) executes class names taken from the conf file —
    # safe only because the conf file is trusted, bundled input.
    self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there
        self._alias[name.lower()] = self._alias[name.upper()] = name
        self._class[name.lower()] = self._class[name.upper()] = eval(cls)
        self._class_string[name.lower()] = self._class_string[
            name.upper()] = cls
        # add the aliases
        for a in alias.split(','):
            self._alias[a.lower()] = self._alias[a.upper()] = name
            self._class[a.lower()] = self._class[a.upper()] = eval(cls)
            self._class_string[a.lower()] = self._class_string[
                a.upper()] = cls
        self._alias2[name] = sorted(set(alias.split(','))) + [name]

    # append the names in data[0] to self.conf to make sure that all data
    # is covered, even the types which are not specifically defined in the
    # conf file. the datatype defaults here to "str"
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add empty alias for empty strings XXX why was that? I can't remember
    # why this was important XXX
    self._alias[''] = ''

    # the header stores the indices of the data in the original data dictionary
    self.header = dict(
        zip([self._alias[x] for x in input_data[0]],
            range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = {k: v for k, v in self.header.items()}

    # add a sorted header for reference
    self.columns = sorted(self.header, key=lambda x: self.header[x])

    # assign all aliases to the header
    # NOTE(review): the bare except silently skips aliases whose canonical
    # name is not in the header — presumably intentional, but worth confirming.
    for alias in self._alias:
        try:
            self._header[alias] = self._header[self._alias[alias]]
        except:
            pass

    # assign the data as attribute to the word list class. Note that we
    # need to check for the type here, but since numpy also offers integer
    # types, we don't check for type(x) == int, but instead use the
    # str.numeric-function that returns numeric values only if it is an
    # integer
    self._data = {
        int(k): v for k, v in input_data.items()
        if k != 0 and str(k).isnumeric()
    }
    # check for same length of all columns
    check_errors = ''
    for k, v in self._data.items():
        if len(v) != len(self.header):
            check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                k, len(v), len(self.header))
    if check_errors:
        raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

    # iterate over self._data and change the values according to the
    # functions (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            # 'check' remembers converted column indices so aliased heads
            # pointing at the same column are not converted twice
            check = []
            for head, i in heads:
                if i not in check:
                    logstring = 'Problem with row {0} in col {1}, expected' + \
                        ' «{4}» as datatype but received «{3}» ' + \
                        ' (ROW: {2}, entry {5}).'
                    try:
                        self._data[key][i] = self._class[head](
                            self._data[key][i])
                        check.append(i)
                    except KeyError:
                        log.warn(
                            logstring.format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i],
                                self._class[head],
                                head))
                    except ValueError:
                        log.warn(
                            logstring.format(
                                key, i,
                                '|'.join([str(x) for x in self._data[key]]),
                                self._data[key][i],
                                self._class[head],
                                head))

    # create entry attribute of the wordlist
    self.entries = sorted(
        set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data (everything stored under non-integer keys)
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it
        will be ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key. Metadata
        (``@key: value`` lines and ``<dtype ...>...</dtype>`` blocks) is
        stored under the corresponding string keys.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        # skip comments and blank lines
        if line.startswith(comment) or not line:
            continue

        # "@key: value" lines carry simple metadata
        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # repeated keys are collected into a list, with a warning
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        warn(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff: a "<dtype key="v" ...>" block that
        # runs until the matching "</dtype>" closing tag
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1] for k, v in
                        [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            # consume lines up to the closing tag
            tmp = []
            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]
            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}
                if not keys:
                    keys["id"] = "1"
                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))
                # NOTE(review): eval of a dtype name from the input file —
                # only safe for trusted files; verify this is intended.
                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str
                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}
                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}
                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    # no @dataset metadata seen yet; fall back to the filename
                    tmp_msa['dataset'] = infile.replace('.csv', '')
                tmp_msa['seq_id'] = keys['id']
                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']
                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)
                # index by int id where possible, else by the raw string id
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa
            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # symmetrize the (upper-triangular) distance matrix
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer
            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            # a plain tab-separated data row
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception(
            "Error processing line {0}:\n".format(j) +
            str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    # promote the first tree in 'trees' to the singular 'tree' slot
    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
def plot_heatmap(
        wordlist,
        filename="heatmap",
        fileformat="pdf",
        ref='cogid',
        normalized=False,
        refB='',
        **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        An optional second cognate-ID column; when given, the lower triangle
        of the matrix is filled from this column instead of mirroring.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    distances : bool (default=False)
        If set to c{True}, convert the matrix of similarities into distances
        (1 - x) before plotting.

    Notes
    -----
    This function plots shared cognate percentages.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    def _shared(taxonA, taxonB, entry):
        # Compute the (possibly normalized) number of cognates the two taxa
        # share in column `entry`.
        if normalized in [False, "jaccard"]:
            cogsA = set(wordlist.get_list(taxa=taxonA, flat=True, entry=entry))
            cogsB = set(wordlist.get_list(taxa=taxonB, flat=True, entry=entry))
            shared = len(cogsA.intersection(cogsB))
            if normalized:
                shared = shared / len(cogsA.union(cogsB))
        else:
            # "swadesh": per-meaning comparison; we follow the STARLING
            # procedure in ignoring missing data
            cogsA = wordlist.get_dict(taxa=taxonA, entry=entry)
            cogsB = wordlist.get_dict(taxa=taxonB, entry=entry)
            shared, slots = 0, 0
            for key in cogsA.keys():
                if key in cogsA and key in cogsB:
                    # check for shared items
                    if [k for k in cogsA[key] if k in cogsB[key]]:
                        shared += 1
                    slots += 1
            try:
                shared = shared / slots
            except ZeroDivisionError:
                log.warn(str(
                    [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                shared = 0.0
        return shared

    # create an empty matrix (int for raw counts, float for normalized scores)
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree; the dendrogram's leaf order determines the
    # taxon order on both axes
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'],
            keywords['bottom'],
            0.25 * keywords['width'],
            keywords['height']])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left')
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # fill the matrix with numbers of shared cognates, unless a precomputed
    # matrix was passed in
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = _shared(taxonA, taxonB, ref)
                    matrix[i][j] = shared
                    # mirror to the lower triangle unless refB fills it
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    matrix[i][j] = _shared(taxonA, taxonB, refB)
                elif i == j:
                    cogs = wordlist.get_list(taxa=taxonA, flat=True, entry=ref)
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes([
        left,
        keywords['bottom'],
        keywords['width'],
        keywords['height']])
    cmap = keywords['cmap']

    # BUGFIX: the old check `if 'distances' in keywords:` was always true
    # (the defaults-fill above inserts the key), so similarities were ALWAYS
    # inverted. Only convert to distances when the flag is actually set.
    if keywords['distances']:
        for i, line in enumerate(matrix):
            # BUGFIX: iterate the row's cells, not the matrix's rows
            for j, cell in enumerate(line):
                matrix[i][j] = 1 - matrix[i][j]

    im = ax2.matshow(
        matrix,
        aspect='auto',
        origin='lower',
        interpolation='nearest',
        cmap=keywords['cmap'],
        vmax=keywords['vmax'],
        vmin=keywords['vmin'])

    # set the xticks; guard against a zero step for small taxon sets
    steps = int(len(taxa) / keywords['steps'] + 0.5) or 1
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'])

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # also dump the raw matrix next to the figure (with-block closes the file
    # even if a write fails — the old code leaked the handle on error)
    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for j, c in enumerate(matrix[i]):
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.

    Parameters
    ----------
    infile : str
        Path of the PSA input file; the output filename defaults to the same
        path with the last four characters replaced by '.html'.
    template, css : str or False (keyword)
        Optional paths of an HTML template and stylesheet; bundled defaults
        are used when falsy.
    comment : str (default='#')
        Lines starting with this character are skipped.
    compact : bool (default=True)
        If true, collapse all whitespace in the generated HTML.
    """
    util.setdefaults(kw, template=False, css=False, comment='#',
                     filename=infile[:-4] + '.html', compact=True)
    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    # read all non-comment lines
    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    # drop the file header line
    del data[0]

    # records are groups of 4 lines: id, row A, row B, separator; on a
    # malformed record (unequal alignment lengths) advance by one line to
    # re-synchronize
    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])
            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')
            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]
            taxa.append((taxonA, taxonB))
            pairs.append(('.'.join([k for k in almA if k != '-']),
                          '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warn("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        # Map each alignment token to a colored <div> carrying its
        # Dolgopolsky sound-class as a CSS class.
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    # assemble one table with three rows (header, taxon A, taxon B) plus a
    # spacer per alignment
    out = '<table>\n'
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)
        # percentage identity, rounded to the nearest integer
        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'
    out += '</table>'

    html = template.format(alignments=out, css=css)
    if kw['compact']:
        # collapse all whitespace and trim around tag boundaries
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')
    util.write_text_file(kw['filename'], html)
def psa2html(infile, **kw):
    """Render a PSA alignment file as a colored HTML table.

    Reads the pairwise alignments from ``infile``, colors every token by its
    Dolgopolsky sound-class, and writes the resulting HTML page to
    ``kw['filename']`` (default: the input path with its last four characters
    replaced by '.html').
    """
    util.setdefaults(
        kw, template=False, css=False, comment='#',
        filename=infile[:-4] + '.html', compact=True)
    page_tmpl = util.read_text_file(kw['template'] or template_path('psa.html'))
    stylesheet = util.read_text_file(kw['css'] or template_path('psa.css'))

    # keep non-comment lines only, then drop the header line
    rows = [ln for ln in util.read_text_file(infile, lines=True)
            if not ln.startswith(kw['comment'])]
    del rows[0]

    seq_ids, pairs, taxa, alignments = [], [], [], []

    # records come in groups of four lines (id, row A, row B, separator);
    # on a malformed record, step forward one line to re-synchronize
    cursor = 0
    while cursor <= len(rows) - 3:
        try:
            seq_ids.append(rows[cursor])
            fieldsA = rows[cursor + 1].split('\t')
            fieldsB = rows[cursor + 2].split('\t')
            almA, almB = fieldsA[1:], fieldsB[1:]
            taxa.append((fieldsA[0].strip('.'), fieldsB[0].strip('.')))
            pairs.append((
                '.'.join([tok for tok in almA if tok != '-']),
                '.'.join([tok for tok in almB if tok != '-'])))
            alignments.append(
                ([str(s) for s in almA], [str(s) for s in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            cursor += 4
        except AssertionError:
            log.warn("Line {0} of the data is probably miscoded.".format(cursor + 1))
            cursor += 1

    def render_residues(alm):
        # One colored <div> per token, CSS-classed by Dolgopolsky class.
        cell = '<div class="residue {1}">{0}</div>'
        chunks = []
        for token in alm:
            if token == '-':
                cls = 'dolgo_GAP'
            else:
                cls = 'dolgo_' + token2class(token, rcParams['dolgo'])
                # three classes carry awkward names; remap them
                if cls == 'dolgo__':
                    cls = 'dolgo_X'
                elif cls == 'dolgo_1':
                    cls = 'dolgo_TONE'
                elif cls == 'dolgo_0':
                    cls = 'dolgo_ERROR'
            chunks.append(cell.format(token, cls))
        return ''.join(chunks)

    parts = ['<table>\n']
    for idx, (rowA, rowB, _) in enumerate(alignments):
        score = int(100 * pid(rowA, rowB) + 0.5)
        parts.append('<tr class="head">')
        parts.append(
            '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
                idx + 1, seq_ids[idx], score))
        parts.append('<tr class="psa">')
        parts.append('<td class="taxon">{0}</td>'.format(taxa[idx][0]))
        parts.append('<td class="psa">{0}</td>'.format(render_residues(rowA)))
        parts.append('</tr>')
        parts.append('<tr class="psa">')
        parts.append('<td class="taxon">{0}</td>'.format(taxa[idx][1]))
        parts.append('<td class="psa">{0}</td>'.format(render_residues(rowB)))
        parts.append('</tr>')
        parts.append('<tr><td colspan=2></td></tr>')
    parts.append('</table>')

    html = page_tmpl.format(alignments=''.join(parts), css=stylesheet)
    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')
    util.write_text_file(kw['filename'], html)
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------
    almA, almB : string or list
        The aligned sequences which can be either a string or a list.
    mode : { 1, 2, 3, 4, 5 }
        Indicate which of the four possible PID scores described in
        :evobib:`Raghava2006` should be calculated, the fifth possibility is
        added for linguistic purposes:

        1. identical positions / aligned positions,
        2. identical positions / (aligned positions + internal gap positions),
        3. identical positions / shortest sequence (excluding gaps),
        4. identical positions / shortest sequence (including internal gap
           positions),
        5. identical positions / length of the alignment.

        (The docstring previously listed modes 1 and 2 swapped relative to
        the implementation; the list above matches the code.)

    Returns
    -------
    score : float
        The PID score of the given alignment as a floating point number between
        0 and 1.

    Raises
    ------
    ValueError
        If `mode` is not one of 1-5 (previously an invalid mode silently
        returned ``None``).

    Notes
    -----
    The PID score is a common measure for the diversity of a given alignment.
    The implementation employed by LingPy follows the description of
    :evobib:`Raghava2006` where four different variants of PID scores are
    distinguished. Essentially, the PID score is based on the comparison of
    identical residue pairs with the total number of residue pairs in a given
    alignment.

    Examples
    --------
    Load an alignment from the test suite.

    >>> from lingpy import *
    >>> pairs = PSA(get_file('test.psa'))

    Extract the alignments of the first aligned sequence pair.

    >>> almA,almB,score = pairs.alignments[0]

    Calculate the PID score of the alignment.

    >>> pid(almA,almB)
    0.44444444444444442

    See also
    --------
    lingpy.compare.Multiple.get_pid
    """
    def _ratio(num, den):
        # Shared divide-with-logging; every mode (now including 5) falls
        # back to 0 instead of raising on an empty denominator.
        try:
            return num / den
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    # count identical, aligned (gap-free) and internal-gap columns;
    # columns where both tokens are gaps are ignored entirely
    idn_pos = 0
    int_gps = 0
    aln_pos = 0
    for charA, charB in zip(almA, almB):
        gaps = [charA, charB].count('-')
        if gaps == 1:
            int_gps += 1
        elif gaps == 0:
            aln_pos += 1
            if charA == charB:
                idn_pos += 1

    if mode == 1:
        return _ratio(idn_pos, aln_pos)
    if mode == 2:
        return _ratio(idn_pos, aln_pos + int_gps)
    if mode == 3:
        # shortest sequence without gaps
        srt_seq = min(
            len([i for i in almA if i != '-']),
            len([i for i in almB if i != '-']))
        return _ratio(idn_pos, srt_seq)
    if mode == 4:
        # shortest sequence with leading/trailing gaps stripped
        srt_seq = min(
            len(''.join([i[0] for i in almA]).strip('-')),
            len(''.join([i[0] for i in almB]).strip('-')))
        return _ratio(idn_pos, srt_seq)
    if mode == 5:
        return _ratio(idn_pos, len(almA))
    raise ValueError("Keyword 'mode' must be one of 1, 2, 3, 4, 5.")
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------
    almA, almB : string or list
        The aligned sequences, given as strings or token lists.
    mode : { 1, 2, 3, 4, 5 }
        Select the PID variant (cf. :evobib:`Raghava2006`; the fifth variant
        is a linguistic addition):

        1. identical positions / aligned positions,
        2. identical positions / (aligned positions + internal gap positions),
        3. identical positions / shortest gapless sequence,
        4. identical positions / shortest sequence with edge gaps stripped,
        5. identical positions / alignment length.

    Returns
    -------
    score : float
        A value between 0 and 1; division-by-zero cases yield 0 with a
        warning (except mode 5, which is unguarded).

    See also
    --------
    lingpy.compare.Multiple.get_pid
    """
    # tally columns: both-gap columns are skipped; single-gap columns count
    # as internal gaps; gap-free columns count as aligned (and identical
    # when the two tokens match)
    identical = aligned = gap_cols = 0
    for tokA, tokB in zip(almA, almB):
        n_gaps = (tokA == '-') + (tokB == '-')
        if n_gaps == 1:
            gap_cols += 1
        elif n_gaps == 0:
            aligned += 1
            if tokA == tokB:
                identical += 1

    def _score(numerator, denominator):
        try:
            return numerator / denominator
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    if mode == 2:
        return _score(identical, aligned + gap_cols)
    if mode == 1:
        return _score(identical, aligned)
    if mode == 3:
        shortest = min(
            sum(1 for s in almA if s != '-'),
            sum(1 for s in almB if s != '-'))
        return _score(identical, shortest)
    if mode == 4:
        shortest = min(
            len(''.join(s[0] for s in almA).strip('-')),
            len(''.join(s[0] for s in almB).strip('-')))
        return _score(identical, shortest)
    if mode == 5:
        # intentionally unguarded, matching the established behavior
        return identical / len(almA)