def tsv_plot_output_aa(wtfile, cond, df, df_se=pd.DataFrame(), outfilename='test.pdf'):
    """
    Plot a per-position amino-acid enrichment heatmap and save it to a file.

    wtfile: WT sequence file; handed to wt_codon() to look up the WT codon of
        every plotted position (used for the y tick labels and WT markers).
    cond: experiment condition, appended to the figure title.
    df: enrich score dataframe (from the plot_input folder or customized).
        Must contain a 'pos' column; every other column is drawn as one
        heatmap column. Cells are numeric scores, NaN (drawn grey) or the
        string 'deplete'.
    df_se: optional standard-error dataframe with a 'pos' column. It is
        matched to df by 'pos' value and column name; the caller's frame is
        left untouched. An empty frame (the default) or None skips the SE
        overlay.
    outfilename: output file path + filename.

    Raises ValueError if df holds no numeric score besides 'deplete'/NaN.

    This function is customized for aa.
    """
    # 'deplete' cells are replaced by this arbitrarily large sentinel so that
    # they fall above vmax and pick up the colormap's "over" colour.
    depleted = 1000

    row_number, col_number = df.shape

    # Figure layout: the heatmap on top and a thin horizontal colour bar
    # below. Sizes are expressed in 0.2 inch units so that, together with
    # subplots_adjust, every heatmap cell ends up 0.2 x 0.2 inches.
    grid = GridSpec(2, 1, height_ratios=[row_number, 1],
                    hspace=1 / (row_number + 1) * 2)
    fig = plt.figure()
    fig.set_size_inches((col_number + 8) * 0.2, (10 + row_number) * 0.2)
    mesh_ax = plt.subplot(grid[0])
    cbar_ax = plt.subplot(grid[1])

    # Margins (in cell units): room for title + amino acid grouping on top,
    # the colour bar at the bottom and labels on the left/right.
    fig.subplots_adjust(top=(row_number + 5) / (row_number + 10),
                        bottom=3 / (row_number + 10),
                        left=5 / (col_number + 8),
                        right=(col_number + 5) / (col_number + 8))

    def to_score(cell):
        # Convert one raw cell into a float, mapping 'deplete' to the sentinel.
        if isinstance(cell, str) and cell == 'deplete':
            return float(depleted)
        return float(cell)

    scores = df.set_index(['pos']).apply(lambda column: column.map(to_score))
    values = scores.to_numpy(dtype=float)

    # Score range: the sentinel must not stretch the upper colour limit.
    present = values[~np.isnan(values)]
    measured = present[present != depleted]
    if measured.size == 0:
        raise ValueError("df contains no numeric scores to plot")
    colorlim = max(-present.min(), measured.max())

    # NaNs are masked so that they are painted with the "bad" colour.
    arr_masked = np.ma.array(values, mask=np.isnan(values))

    # Symmetric, recentred colour map; only entries 65..199 are kept, which
    # "shrinks" the spectrum to lighter tones for better printing.
    cmap = recentered_cmap(plt.get_cmap("RdBu_r"), -colorlim, colorlim)
    colors = [cmap(i) for i in range(65, 200)]
    cmap_new = LinearSegmentedColormap.from_list('place_holder', colors)
    cmap_new.set_bad("#808080", 1)      # NaN cells -> grey
    cmap_new.set_over(cmap_new(0))      # sentinel cells -> lowest-end colour

    # The returned mesh is the mappable needed by the colour bar.
    mesh_pcolor = mesh_ax.pcolormesh(arr_masked, cmap=cmap_new,
                                     vmin=-colorlim, vmax=colorlim)
    cbar = fig.colorbar(mesh_pcolor, cax=cbar_ax, orientation='horizontal')
    cbar.set_label("Enrich2 Score")

    mesh_ax.set_ylabel("Position in WT")
    # pad is given in points; 50 leaves room for the grouping labels.
    mesh_ax.set_title("Amino Acid Distribution Map for Experiment: " + cond, pad=50)

    # Column labels, one per plotted column, just above the heatmap.
    column_index = {}
    for i, name in enumerate(scores.columns):
        column_index.setdefault(name, i)
        mesh_ax.text(i + 0.5, len(scores.index) + 1, name,
                     horizontalalignment="center",
                     verticalalignment="center",
                     rotation=90)

    # Amino acid property groups: consecutive column spans, one per property
    # that has at least one member among df's columns. This relies on the
    # columns being ordered group by group, in AA_GROUPS order.
    group_spans = []
    next_col = 0
    for group in AA_GROUPS:
        aa_property = group[0]
        members = Property2AA[aa_property]
        count = sum(1 for name in df.columns.values if name in members)
        if count:
            group_spans.append((aa_property, next_col, next_col + count - 1))
            next_col += count

    for label, start, end in group_spans:
        mesh_ax.text((end - start + 1) / 2 + start, row_number + 2.5, label,
                     horizontalalignment="center",
                     verticalalignment="center")
        bar = Line2D([start + 0.125, end + 1 - 0.125],
                     [row_number + 2, row_number + 2],
                     color="black")
        bar.set_clip_on(False)      # the bar lives outside the axes area
        mesh_ax.add_line(bar)

    # WT codon / residue of every plotted position.
    wt_codons = [str(wt_codon(wtfile, int(pos * 3 - 2))) for pos in df['pos'].values]
    wt_residues = [CODON2AA[codon][0] for codon in wt_codons]

    # y tick labels: "<position>-<WT residue>".
    mesh_ax.set_yticks(np.arange(len(df['pos'])) + 0.5)
    label_pos = [str(int(pos)) for pos in df['pos']]
    label_aa = [str(residue) for residue in wt_residues]
    mesh_ax.set_yticklabels(['-'.join(pair) for pair in zip(label_pos, label_aa)],
                            ha='right')

    # White horizontal delimiter below every row.
    for i in range(scores.shape[0]):
        delim_bar = Line2D([0, scores.shape[1]], [i, i],
                           transform=mesh_ax.transData, color="white")
        delim_bar.set_clip_on(False)
        mesh_ax.add_line(delim_bar)

    # Mark the WT residue of every row. Rows whose WT residue is not among
    # the plotted columns are skipped (NaN can never be detected with ==).
    for row_idx, residue in enumerate(wt_residues):
        col_idx = column_index.get(residue)
        if col_idx is None:
            continue
        mesh_ax.add_patch(Circle((col_idx + 0.5, row_idx + 0.5), .1666,
                                 fill=True, facecolor="black",
                                 edgecolor="none", alpha=0.5))

    # Make the figure cleaner by removing ticks.
    mesh_ax.tick_params(bottom=False, left=False)
    mesh_ax.get_xaxis().set_visible(False)
    cbar_ax.tick_params(bottom=False)

    # Optional SE overlay: a diagonal per cell that shrinks as SE approaches
    # 1. Cells are matched to the heatmap by 'pos' value and column name so
    # that a truncated or differently ordered df still lines up.
    if df_se is not None and not df_se.empty:
        se_columns = [name for name in df_se.columns if name in column_index]
        se_rows = df_se.loc[df_se['pos'].isin(df['pos'])]
        row_index = {}
        for i, pos in enumerate(scores.index):
            row_index.setdefault(pos, i)

        for pos, se_row in zip(se_rows['pos'], se_rows[se_columns].to_numpy()):
            row = row_index.get(pos)
            if row is None:
                continue
            for name, se_value in zip(se_columns, se_row):
                col = column_index[name]
                # SE at or below 0.02 (or NaN) and totally depleted cells
                # are not decorated.
                if not se_value > 0.02 or values[row, col] == depleted:
                    continue
                corner_dist = (1 - se_value) / 2
                mesh_ax.add_line(Line2D([col + corner_dist, col + 1 - corner_dist],
                                        [row + corner_dist, row + 1 - corner_dist],
                                        color="black"))

    fig.savefig(outfilename)
def tsv_plot_output_aa_double(wtfile, wt_mut, cond, df, df_se=pd.DataFrame(),
                              outfilename='test.pdf', version=1, scale='max'):
    """
    Plot the amino-acid enrichment heatmap of a double mutant and save it.

    Same as tsv_plot_output_double but for aa mode.

    wtfile: WT sequence file; handed to wt_codon() to look up the WT codons
        of the two mutated sites.
    wt_mut: indexable whose first element holds the two site coordinates,
        wt_mut[0][0] (row site) and wt_mut[0][1] (column site). They are
        passed to wt_codon() and turned into a residue number with
        int((x + 2) / 3) -- presumably nucleotide positions; confirm
        against the caller.
    cond: experiment condition, appended to the figure title.
    df: score dataframe with a 'pos' column (matched against the second
        field of CODON_GROUPS entries) plus one column per amino acid.
        Cells are numeric scores, NaN (drawn grey) or the string 'deplete'.
    df_se: optional standard-error dataframe with a 'pos' column. It is
        matched to df by 'pos' value and column name; the caller's frame is
        left untouched. An empty frame (the default) or None skips the SE
        overlay.
    outfilename: output file path + filename.
    version: when 2, property group labels are translated through CODON321.
    scale: 'max' to derive a symmetric colour limit from the data, otherwise
        a number used as the colour limit; larger scores are clipped to it.

    Raises ValueError if scale is 'max' and df holds no numeric score.

    NOTE(review): the heatmap itself is drawn from df in the row order it is
    given, while row labels are placed assuming the Polar/Charged/Non-polar/
    Stop order (first entry at the top). Confirm callers pass df ordered
    accordingly.
    """
    # 'deplete' cells are replaced by this arbitrarily large sentinel so that
    # they fall above vmax and pick up the colormap's "over" colour.
    depleted = 1000

    row_number, col_number = df.shape

    # Figure layout: the heatmap on top and a thin horizontal colour bar
    # below. Sizes are expressed in 0.2 inch units so that, together with
    # subplots_adjust, every heatmap cell ends up 0.2 x 0.2 inches.
    grid = GridSpec(2, 1, height_ratios=[row_number, 1],
                    hspace=1 / (row_number + 1) * 2)
    fig = plt.figure()
    fig.set_size_inches((col_number + 10) * 0.2, (10 + row_number) * 0.2)
    mesh_ax = plt.subplot(grid[0])
    cbar_ax = plt.subplot(grid[1])
    fig.subplots_adjust(top=(row_number + 4.2) / (row_number + 10),
                        bottom=2.2 / (row_number + 10),
                        left=2 / (col_number + 10),
                        right=(col_number + 2) / (col_number + 10))

    def to_score(cell):
        # Convert one raw cell into a float, mapping 'deplete' to the sentinel.
        if isinstance(cell, str) and cell == 'deplete':
            return float(depleted)
        return float(cell)

    scores = df.set_index(['pos']).apply(lambda column: column.map(to_score))
    values = scores.to_numpy(dtype=float)

    if scale == 'max':
        # Symmetric limit from the data; the sentinel must not stretch it.
        present = values[~np.isnan(values)]
        measured = present[present != depleted]
        if measured.size == 0:
            raise ValueError("df contains no numeric scores to plot")
        colorlim = max(-present.min(), measured.max())
    else:
        # Fixed limit: clip real scores above it so that they are not
        # painted with the "over" colour reserved for depleted cells.
        colorlim = float(scale)
        with np.errstate(invalid='ignore'):
            overflow = (values > colorlim) & (values != depleted)
        values = np.where(overflow, colorlim, values)

    # NaNs are masked so that they are painted with the "bad" colour.
    arr_masked = np.ma.array(values, mask=np.isnan(values))

    # Symmetric, recentred colour map; only entries 65..199 are kept, which
    # "shrinks" the spectrum to lighter tones for better printing.
    cmap = recentered_cmap(plt.get_cmap("RdBu_r"), -colorlim, colorlim)
    colors = [cmap(i) for i in range(65, 200)]
    cmap_new = LinearSegmentedColormap.from_list('place_holder', colors)
    cmap_new.set_bad("#808080", 1)      # NaN cells -> grey
    cmap_new.set_over(cmap_new(0))      # sentinel cells -> lowest-end colour

    # The returned mesh is the mappable needed by the colour bar.
    mesh_pcolor = mesh_ax.pcolormesh(arr_masked, cmap=cmap_new,
                                     vmin=-colorlim, vmax=colorlim)
    cbar = fig.colorbar(mesh_pcolor, cax=cbar_ax, orientation='horizontal')
    cbar.set_label("Enrich2 Score")

    # WT codons of the two sites: site 1 selects the row, site 2 the column.
    WT_codon = [str(wt_codon(wtfile, wt_mut[0][0])),
                str(wt_codon(wtfile, wt_mut[0][1]))]
    wt_row_aa = CODON2AA[WT_codon[0]][0]
    wt_col_aa = CODON2AA[WT_codon[1]][0]
    # None means "not found"; NaN cannot be used because NaN == NaN is False.
    WT_site1 = None
    WT_site2 = None

    Col_site = " (Column: " + str(int((wt_mut[0][1] + 2)/3)) + "-" + wt_col_aa + " (" + WT_codon[1] + "))"
    # pad is given in points; 50 leaves room for the grouping labels.
    mesh_ax.set_title("Amino Acid Enrichment (Double) for : " + cond + "\n" + Col_site, pad=50)

    # Rows of df re-ordered by property (used for the row labels only).
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    PropertyOrderList = Property2AA['Polar'] + Property2AA['Charged'] + Property2AA['Non-polar'] + ('Stop',)
    ordered_rows = []
    for aa in PropertyOrderList:
        pos = [group[1] for group in CODON_GROUPS if group[0] == aa][0]
        ordered_rows.append(df[df['pos'] == pos])
    ordered = pd.concat(ordered_rows) if ordered_rows else df.iloc[0:0]
    n_ordered = len(ordered.index)

    # Column labels, one per plotted column; remember the WT column.
    column_index = {}
    for i, name in enumerate(scores.columns):
        column_index.setdefault(name, i)
        mesh_ax.text(i + 0.5, n_ordered + 1, name,
                     horizontalalignment="center",
                     verticalalignment="center",
                     rotation=90)
        if name == wt_col_aa:
            WT_site2 = i

    # Amino acid property groups: consecutive spans, one per property that
    # has at least one member among df's columns. The same spans are used
    # for the columns and for the rows.
    group_spans = []
    next_slot = 0
    for group in AA_GROUPS:
        aa_property = group[0]
        members = Property2AA[aa_property]
        count = sum(1 for name in df.columns.values if name in members)
        if count:
            group_spans.append((aa_property, next_slot, next_slot + count - 1))
            next_slot += count

    # Column groups: label, bar and a white delimiter after each group.
    for label, start, end in group_spans:
        if version == 2:
            label = CODON321[label]
        mesh_ax.text((end - start + 1) / 2 + start, row_number + 2.5, label,
                     horizontalalignment="center",
                     verticalalignment="center")
        bar = Line2D([start + 0.125, end + 1 - 0.125],
                     [row_number + 2, row_number + 2],
                     color="black")
        bar.set_clip_on(False)      # the bar lives outside the axes area
        mesh_ax.add_line(bar)
        delim_bar = Line2D([end + 1, end + 1], [0, n_ordered + 1], color="white")
        delim_bar.set_clip_on(False)
        mesh_ax.add_line(delim_bar)

    # Row labels, bottom-up (hence the reversed order); remember the WT row.
    reversed_order = list(PropertyOrderList[::-1])
    for i, pos in enumerate(ordered['pos'].tolist()[::-1]):
        aa_text = [group[0] for group in CODON_GROUPS if group[1] == pos][0]
        # An offset of 0.5 is visually not perfect but the best achievable.
        mesh_ax.text(col_number, i + 1 - 0.5, aa_text,
                     horizontalalignment="center",
                     verticalalignment="center")
        if CODON2AA[CodonList[int(pos) - 1]][0] == wt_row_aa and wt_row_aa in reversed_order:
            WT_site1 = reversed_order.index(wt_row_aa)

    # Row groups: label, bar and a white delimiter after each group.
    for label, start, end in group_spans:
        if version == 2:
            label = CODON321[label]
        mesh_ax.text(col_number + 1.25,
                     row_number - (end - start) / 2 - start - 0.5,
                     label,
                     horizontalalignment="left",
                     verticalalignment="center")
        bar = Line2D([col_number + 1, col_number + 1],
                     [row_number - start - 1 + 0.875, row_number - end - 0.875],
                     color="black")
        bar.set_clip_on(False)
        mesh_ax.add_line(bar)
        delim_bar = Line2D([0, col_number],
                           [row_number - end - 1, row_number - end - 1],
                           color="white")
        delim_bar.set_clip_on(False)
        mesh_ax.add_line(delim_bar)

    # First mutation site (per row) info on the right margin of the figure.
    mesh_ax.text(col_number + 5, row_number / 2,
                 "(Row: " + str(int((wt_mut[0][0] + 2)/3)) + "-" + wt_row_aa + " (" + WT_codon[0] + "))",
                 horizontalalignment="center",
                 verticalalignment="center",
                 rotation=270)

    # WT marker, only when both the row and the column were located.
    if WT_site1 is not None and WT_site2 is not None:
        mesh_ax.add_patch(Circle((WT_site2 + 0.5, WT_site1 + 0.5), .1666,
                                 fill=True, facecolor="black",
                                 edgecolor="none", alpha=0.5))

    # Make the figure cleaner by removing ticks and both axes.
    mesh_ax.tick_params(bottom=False, left=False)
    mesh_ax.get_xaxis().set_visible(False)
    mesh_ax.get_yaxis().set_visible(False)
    cbar_ax.tick_params(bottom=False)

    # Optional SE overlay: a diagonal per cell that shrinks as SE approaches
    # 1. Cells are matched to the plotted scores by 'pos' value and column
    # name, and the depleted test uses the plotted numeric values.
    if df_se is not None and not df_se.empty:
        se_columns = [name for name in df_se.columns if name in column_index]
        se_rows = df_se.loc[df_se['pos'].isin(df['pos'])]
        row_index = {}
        for i, pos in enumerate(scores.index):
            row_index.setdefault(pos, i)

        for pos, se_row in zip(se_rows['pos'], se_rows[se_columns].to_numpy()):
            row = row_index.get(pos)
            if row is None:
                continue
            for name, se_value in zip(se_columns, se_row):
                col = column_index[name]
                # SE at or below 0.02 (or NaN) and totally depleted cells
                # are not decorated.
                if not se_value > 0.02 or values[row, col] == depleted:
                    continue
                corner_dist = (1 - se_value) / 2
                mesh_ax.add_line(Line2D([col + corner_dist, col + 1 - corner_dist],
                                        [row + corner_dist, row + 1 - corner_dist],
                                        color="black"))

    fig.savefig(outfilename)