def getLetters(typ='nt', fontname='monospace', dpi=500):
    '''
    Generates a temporary image file for every letter of the nt or aa
    colour scheme.

    One figure of 1 x 1 inch is drawn per key of the colour dictionary
    returned by utilityFunctions; the letter is drawn in its colour with
    a font size obtained from getFontSize (scaled by 0.95).  Each image
    is written to the current working directory as "<letter>_temp.png";
    the "*" character is written as "stop_temp.png".

    Parameters
    ----------
    typ: string
        nt (default) or aa
    fontname: string
        name of a font (default: monospace)
    dpi: int
        DPI value used for the figure and for the saved image
        (default: 500)

    Returns
    -------
    none

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # obtain color scheme depending on nt or aa alignment
    if typ == 'nt':
        colours = utilityFunctions.getNtColours()
    elif typ == 'aa':
        colours = utilityFunctions.getAAColours()
    else:
        # fail early with a clear message instead of an unbound-name error
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))

    # for each possible base/aa create temporary plot
    for base, colour in colours.items():
        f = plt.figure(figsize=(1, 1), dpi=dpi, edgecolor='black')
        a = f.add_subplot(1, 1, 1)
        a.set_xlim(0, 1)
        a.set_ylim(0, 1)
        fs = getFontSize(f, a, 1)
        a.text(0.5, 0.02, base, fontsize=fs * 0.95,
               fontdict={'family': 'monospace', 'name': fontname},
               color=colour, va='baseline', ha='center')
        # strip every decoration so only the letter itself remains
        a.set_axis_off()
        a.margins(0, 0)
        f.subplots_adjust(top=1, bottom=0, right=1, left=0,
                          wspace=None, hspace=None)
        a.set_frame_on(False)

        # temporarily save plot in working directory
        # "*" is replaced because it is not safe in file names
        file_stem = base.replace("*", "stop")
        # honour the dpi argument rather than a hard-coded value
        f.savefig("%s_temp.png" % file_stem, dpi=dpi, pad_inches=0.1)
        # close exactly this figure, not whichever one is current
        plt.close(f)
def arrNumeric(arr, typ):
    '''
    Turns the sequence array into a matrix of integer codes plus a
    matching colour map, so that matplotlib can draw the alignment as an
    image (similar to https://bit.ly/2CIKOEr)

    The rows are reversed first, so the rendered image lists the
    sequences in the same order as the input alignment.

    Parameters
    ----------
    arr: np.array
        The alignment stored as a numpy array
    typ: str
        'nt' for nucleotide; any other value selects the amino acid
        colour scheme

    Returns
    -------
    arr2: np.array
        The flipped alignment as an array of numeric codes
    cmap: matplotlib.colors.ListedColormap
        A colour map with the colours corresponding to each base
        or amino acid that occurs in the alignment
    '''
    # turn the array upside down
    flipped = np.flip(arr, axis=0)

    palette = (utilityFunctions.getNtColours() if typ == 'nt'
               else utilityFunctions.getAAColours())

    n_rows, n_cols = np.shape(flipped)

    # only characters that actually occur get a code, in palette order,
    # so code k lines up with the k-th colour of the colour map
    present = [char for char in palette if char in flipped]
    codes = {char: code for code, char in enumerate(present)}
    colours = [palette[char] for char in present]

    # numeric version of the alignment array
    arr2 = np.empty([n_rows, n_cols])
    for col in range(n_cols):
        for row in range(n_rows):
            arr2[row, col] = codes[flipped[row, col]]

    return (arr2, matplotlib.colors.ListedColormap(colours))
def calc_entropy(count, seq_count, typ):
    '''
    Calculates, for a single alignment column, the height of every
    nt/aa in a sequence logo from the entropy of the column.

    The information content of the column is the maximum entropy minus
    the observed entropy minus a small sample size correction.  Each
    letter's height is its frequency times that information content,
    scaled down by the proportion of non-gap characters in the column.

    Parameters
    ----------
    count: dict
        nt/aa (and optionally "-") mapped to their counts in the column
    seq_count: int
        number of sequences in alignment
    typ: str
        nt or aa

    Returns
    -------
    height_per_base: dictionary
        height for each nt/aa
    info_per_base: dictionary
        information content for each nt/aa

    Both dictionaries contain every letter of the colour scheme (0 when
    absent from the column) plus any other non-gap key found in count.
    If seq_count is 0, or the column holds only gaps, all values are 0.

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # obtain nt/aa lists, use colour scheme for that w/o using colours here
    # just because another list of nt/aa would be obsolete
    if typ == "nt":
        element_list = utilityFunctions.getNtColours()
        s = 4
    elif typ == "aa":
        element_list = utilityFunctions.getAAColours()
        s = 20
    else:
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))
    max_entropy = log(s, 2)

    info_per_base = dict.fromkeys(element_list, 0)
    freq_per_base = dict.fromkeys(element_list, 0)
    height_per_base = dict.fromkeys(element_list, 0)

    # an empty alignment carries no information; returning here also
    # avoids dividing by zero in the sample size correction below
    if seq_count == 0:
        return height_per_base, info_per_base

    # correct for small sample sizes (uses the full number of sequences,
    # gaps included)
    sample_size_correction = (1 / max_entropy) * ((s - 1) / (2 * seq_count))

    total_count = seq_count
    gap_count = count.get("-")
    if gap_count:
        seq_count -= gap_count
    # correct for gaps, since they lower the information content
    gap_correction = seq_count / total_count

    # column consists of gaps only
    if seq_count == 0:
        return height_per_base, info_per_base

    # calculate entropy, from that information, from that height
    entropy = 0
    for base, quantity in count.items():
        if base == "-":
            continue
        frequency = quantity / seq_count
        weighted_log = frequency * log(frequency, 2)
        freq_per_base[base] = frequency
        entropy -= weighted_log
        info_per_base[base] = max_entropy + weighted_log

    information_per_column = max_entropy - entropy - sample_size_correction

    # the sample size correction can push the information content below
    # zero; negative heights carry no meaning, so they are set to 0
    for base, frequency in freq_per_base.items():
        if frequency * information_per_column < 0:
            height_per_base[base] = 0
        else:
            # scale to accommodate gaps
            height_per_base[base] = (gap_correction * frequency *
                                     information_per_column)

    return height_per_base, info_per_base
def sequence_bar_logo(alignment, figname, typ='nt', figdpi=300,
                      figrowlength=50, start=0, end=0):
    '''
    Creates a sequence logo based on an entropy calculation using bars
    Scales the bars according to the information content of the alignment
    Representation of the consensus sequence of the alignment

    The selected columns are split into rows of at most figrowlength
    columns; each row is drawn as stacked bars, one bar segment per
    nt/aa, and the figure is saved to figname.

    Parameters
    ----------
    alignment: np.array
        The alignment stored as a numpy array
    figname: str
        name of figure
    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide
    figdpi: int
        DPI (default: 300)
    figrowlength: int
        number of alignment columns per figure row (default: 50)
    start: int
        start pos to be turned into logo
    end: int
        end pos to be turned into logo (0 means up to the last column)

    Returns
    -------
    none

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # colour scheme and y axis range depend on the alphabet
    if typ == 'nt':
        colours = utilityFunctions.getNtColours()
        y_max = 2.1
    elif typ == 'aa':
        colours = utilityFunctions.getAAColours()
        y_max = 4.6
    else:
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))

    if start == 0 and end == 0:
        alignment_width = len(alignment[0, :])
    else:
        if end == 0:
            end = len(alignment[0, :])
        alignment_width = len(alignment[0, start:end])

    if alignment_width < figrowlength:
        figrowlength = alignment_width
    nsegs = math.ceil(alignment_width / figrowlength)
    f = plt.figure(figsize=(figrowlength / 5, nsegs * 2), dpi=figdpi)
    gs = gridspec.GridSpec(ncols=1, nrows=nsegs)

    # values that do not change from row to row
    seq_count = len(alignment[:, 0])
    width = 0.75
    y_ticks = np.arange(0, y_max, 1)
    last_column = alignment_width + start

    rstart = start
    rend = rstart + figrowlength
    for n in range(nsegs):
        # the last row may be shorter than figrowlength
        rend = min(rend, last_column)
        axes = f.add_subplot(gs[n])
        axes.set_xlim(rstart - 0.5, rend - 0.5)
        axes.set_ylim(0, y_max)
        # the ticks follow the alphabet-specific range (previously they
        # were reset to the nt range at the end of every row)
        axes.set_yticks(y_ticks)

        ind = np.arange(rstart, rend)
        # need a list of each nt/aa separately to plot them as bars
        height_list = {element: [] for element in colours}

        # for each column calculate heights via entropy
        # and scale bars accordingly
        for i in range(rstart, rend):
            unique, counts = np.unique(alignment[:, i], return_counts=True)
            count = dict(zip(unique, counts))
            height_per_base, _ = calc_entropy(count, seq_count, typ)
            for base, height in height_per_base.items():
                height_list[base].append(height)

        # stack bars on top of each other
        bottom_height = [0] * (rend - rstart)
        for base, height in height_list.items():
            axes.bar(ind, height, width, bottom=bottom_height,
                     color=colours[base])
            bottom_height = [low + h
                             for low, h in zip(bottom_height, height)]

        # tick at the first and last bar, labelled with 1-based positions
        axes.set_xticks([rstart, rend - 1])
        axes.set_xticklabels([rstart + 1, rend])
        axes.set_xlabel("Position")
        axes.set_ylabel("Bit Score")
        axes.spines['right'].set_visible(False)
        axes.spines['top'].set_visible(False)

        rstart += figrowlength
        rend += figrowlength

    # save plot as figname
    f.savefig(figname, bbox_inches='tight', dpi=figdpi)
    plt.close(f)
def sequence_logo(alignment, figname, typ='nt', figfontname='Arial',
                  figdpi=300, figrowlength=50, start=0, end=0):
    '''
    Creates a sequence logo based on an entropy calculation using letters
    Scales the letters according to the information content of the alignment
    Representation of the consensus sequence of the alignment

    Letter images are first generated with getLetters as temporary
    "<letter>_temp.png" files in the working directory, then stacked per
    column (smallest at the bottom) with heights from calc_entropy.
    The temporary files are removed again, also when drawing fails.

    Parameters
    ----------
    alignment: np.array
        The alignment stored as a numpy array
    figname: str
        name of figure
    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide
    figfontname: str
        Name of font, default: Arial
    figdpi: int
        DPI (default: 300)
    figrowlength: int
        number of alignment columns per figure row (default: 50)
    start: int
        start pos to be turned into logo
    end: int
        end pos to be turned into logo (0 means up to the last column)

    Returns
    -------
    none

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # letters of the alphabet (needed for clean-up) and y axis range
    if typ == 'nt':
        allbases = utilityFunctions.getNtColours()
        y_max = 2.1
    elif typ == 'aa':
        allbases = utilityFunctions.getAAColours()
        y_max = 4.6
    else:
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))

    if start == 0 and end == 0:
        alignment_width = len(alignment[0, :])
    else:
        if end == 0:
            end = len(alignment[0, :])
        alignment_width = len(alignment[0, start:end])

    if alignment_width < figrowlength:
        figrowlength = alignment_width
    nsegs = math.ceil(alignment_width / figrowlength)
    f = plt.figure(figsize=(figrowlength, nsegs * 2), dpi=figdpi)
    gs = gridspec.GridSpec(ncols=1, nrows=nsegs)

    seq_count = len(alignment[:, 0])
    last_column = alignment_width + start
    # each letter image is read from disk once and then reused
    letter_images = {}

    try:
        getLetters(typ=typ, fontname=figfontname, dpi=figdpi)
        rstart = start
        rend = rstart + figrowlength
        for n in range(nsegs):
            # the last row may be shorter than figrowlength
            rend = min(rend, last_column)
            # add the axes to f explicitly; getLetters opens and closes
            # other figures, so the pyplot "current figure" is not relied on
            a = f.add_subplot(gs[n])
            a.set_xlim(rstart, rstart + figrowlength)
            a.set_ylim(0, y_max)
            a.set_yticks(np.arange(0, y_max, 1))
            limits = a.axis()

            # for each column calculate heights via entropy
            # and scale letters accordingly
            for i in range(rstart, rend):
                unique, counts = np.unique(alignment[:, i],
                                           return_counts=True)
                count = dict(zip(unique, counts))
                height_per_base, _ = calc_entropy(count, seq_count,
                                                  typ=typ)
                height_sum_higher = 0
                # smallest letters go to the bottom of the stack
                stacked = sorted(height_per_base.items(),
                                 key=lambda item: item[1])
                for base, height in stacked:
                    if height > 0:
                        b = base.replace("*", "stop")
                        if b not in letter_images:
                            letter_images[b] = plt.imread(
                                "%s_temp.png" % b)
                        a.imshow(letter_images[b],
                                 extent=(i, i + 1, height_sum_higher,
                                         height_sum_higher + height),
                                 filternorm=False)
                        height_sum_higher += height

            # imshow changes the axis limits, so restore them
            a.axis(limits)
            a.set_xticks([rstart, rend])
            a.set_xticklabels([rstart, rend])
            a.spines['right'].set_visible(False)
            a.spines['top'].set_visible(False)
            if n == (nsegs - 1):
                a.set_xlabel("Position")
            a.set_ylabel("Bit Score")

            rstart += figrowlength
            rend += figrowlength

        # save plot using figname
        f.savefig(figname, dpi=figdpi, bbox_inches='tight')
    finally:
        # remove the temporary letter images, even after a failure
        for base in allbases:
            b = base.replace("*", "stop")
            try:
                os.unlink("%s_temp.png" % b)
            except FileNotFoundError:
                # the image was never written (getLetters failed early);
                # do not let clean-up hide the original error
                pass
        plt.close(f)
def testGetAAColours(self):
    '''
    The amino acid colour scheme is expected to contain 28 entries
    '''
    n_colours = len(utilityFunctions.getAAColours())
    self.assertEqual(n_colours, 28)