def tsv_plot_output_aa(wtfile,
                       cond,
                       df,
                       df_se=pd.DataFrame(),
                       outfilename='test.pdf'):
    """
    Draw an amino-acid score heatmap (optionally with SE marks) and save it.

    wtfile: WT sequence file, passed to wt_codon() to look up the WT codon of
        each position (used for y-labeling and for the WT markers)
    cond: The experiment condition, appended to the figure title
    df: score dataframe with a 'pos' column; every other column becomes one
        heatmap column and every row one heatmap row. Cells may be numbers,
        NaN (drawn grey) or the string 'deplete'
    df_se: standard-error dataframe with a 'pos' column. It is subset to the
        columns and positions present in df. The default (empty) dataframe
        disables the SE overlay. The caller's dataframe is never modified
    outfilename: outputfile path + filename, where to store the output
    This function is customized for aa.

    Raises ValueError when df holds no score other than NaN / 'deplete',
    because no color range can be derived in that case.
    """
    # 'deplete' cells are replaced by this arbitrarily large value so that they
    # fall above vmax and pick up the colormap's "over" color.
    depleted_score = 1000

    row_number, col_number = df.shape

    # Create Figure object that consists of two axes (heatmap + color bar).
    # Each heatmap cell is meant to be 0.2 x 0.2 inch.
    grid = GridSpec(2,
                    1,
                    height_ratios=[row_number, 1],
                    hspace=1 / (row_number + 1) * 2)
    fig = plt.figure()
    fig.set_size_inches((col_number + 8) * 0.2, (10 + row_number) * 0.2)
    mesh_ax = fig.add_subplot(grid[0])
    cbar_ax = fig.add_subplot(grid[1])

    # Margins, in units of 0.2 inch: top 5 (title + amino acid grouping),
    # bottom 3, left 5, right 3.
    fig.subplots_adjust(top=(row_number + 5) / (row_number + 10),
                        bottom=3 / (row_number + 10),
                        left=5 / (col_number + 8),
                        right=(col_number + 5) / (col_number + 8))

    # Convert every score to a number, mapping 'deplete' to the sentinel.
    # set_index returns a new frame, so the caller's df is left untouched.
    df_raw = df.set_index(['pos'])
    for col in df_raw.columns:
        df_raw[col] = [
            depleted_score if value == 'deplete' else float(value)
            for value in df_raw[col]
        ]
    score_values = df_raw.values.astype(float)
    nan_mask = np.isnan(score_values)

    # Find score range, ignoring NaNs and the depletion sentinel:
    measured = score_values[~nan_mask & (score_values != depleted_score)]
    if measured.size == 0:
        raise ValueError("df contains no numeric score to derive a color "
                         "range from")
    vmin = float(measured.min())
    vmax = float(measured.max())

    # Mask NaNs so that they are drawn with the colormap's "bad" color:
    arr_masked = np.ma.array(score_values, mask=nan_mask)

    # Symmetric color limits so that the colormap is centered on zero:
    colorlim = max(-vmin, vmax)
    cmap = recentered_cmap(plt.get_cmap("RdBu_r"), -colorlim, colorlim)

    # Keep only the middle part of the spectrum (entries 65..199) to get
    # lighter colors that print better.
    colors = [cmap(i) for i in range(65, 200)]
    cmap_new = LinearSegmentedColormap.from_list('place_holder', colors)

    # Grey for NaN values; depleted cells (above vmax) reuse the first color
    # of the new map, i.e. cmap(65).
    cmap_new.set_bad("#808080", 1)
    cmap_new.set_over(cmap_new(0))

    # Keep the returned mappable: the color bar is built from it.
    mesh_pcolor = mesh_ax.pcolormesh(arr_masked,
                                     cmap=cmap_new,
                                     vmin=-colorlim,
                                     vmax=colorlim)

    # Add in the color bar
    cbar = fig.colorbar(mesh_pcolor, cax=cbar_ax, orientation='horizontal')
    cbar.set_label("Enrich2 Score")

    mesh_ax.set_ylabel("Position in WT")

    # pad is given in points; 50 leaves room for the column labels and the
    # grouping bars drawn above the heatmap.
    mesh_ax.set_title("Amino Acid Distribution Map for Experiment: " + cond,
                      pad=50)

    # Column names, written vertically one unit above the heatmap:
    aa_columns = list(df_raw.columns)
    for i, name in enumerate(aa_columns):
        mesh_ax.text(i + 0.5,
                     len(df_raw.index) + 1,
                     name,
                     horizontalalignment="center",
                     verticalalignment="center",
                     rotation=90)

    # Amino acid grouping: each property in AA_GROUPS gets a contiguous span
    # as wide as the number of df columns belonging to it; properties without
    # any column are skipped. Spans are laid out in AA_GROUPS order.
    group_spans = []
    next_start = 0
    for group in AA_GROUPS:
        aa_property = group[0]
        members = Property2AA[aa_property]
        count = sum(1 for item in df.columns.values if item in members)
        if count:
            group_spans.append(
                (aa_property, next_start, next_start + count - 1))
            next_start += count

    for aa_property, start, end in group_spans:
        mesh_ax.text((end - start + 1) / 2 + start,
                     row_number + 2.5,
                     aa_property,
                     horizontalalignment="center",
                     verticalalignment="center")

        bar = Line2D([start + 0.125, end + 1 - 0.125],
                     [row_number + 2, row_number + 2],
                     color="black")
        bar.set_clip_on(False)  # the bar lives outside the axes area
        mesh_ax.add_line(bar)

    # WT codon / amino acid of every plotted position. 'pos' is an amino acid
    # position; pos * 3 - 2 is handed to wt_codon() as the codon's first base.
    wt_codons = [
        str(wt_codon(wtfile, int(x * 3 - 2))) for x in df['pos'].values
    ]
    wt_aa = [str(CODON2AA[codon][0]) for codon in wt_codons]

    # y tick labels: "<position>-<WT amino acid>", one per row:
    mesh_ax.set_yticks(np.arange(len(df['pos'])) + 0.5)
    tick_labels = [
        '-'.join((str(int(pos)), aa)) for pos, aa in zip(df['pos'], wt_aa)
    ]
    mesh_ax.set_yticklabels(tick_labels, ha='right')

    # White horizontal delimiters between rows:
    for i in range(df_raw.shape[0]):
        delim_bar = Line2D([0, df_raw.shape[1]], [i, i],
                           transform=mesh_ax.transData,
                           color="white")
        delim_bar.set_clip_on(False)
        mesh_ax.add_line(delim_bar)

    # Mark the WT amino acid of every row with a dot. Rows whose WT amino
    # acid has no column are skipped. The column index is taken from the
    # plotted frame (df_raw), so it matches the heatmap regardless of where
    # 'pos' sits in df.
    for row_idx, aa in enumerate(wt_aa):
        if aa not in aa_columns:
            continue
        mesh_ax.add_patch(
            Circle((aa_columns.index(aa) + 0.5, row_idx + 0.5),
                   .1666,
                   fill=True,
                   facecolor="black",
                   edgecolor="none",
                   alpha=0.5))

    # Make the figure cleaner by removing ticks:
    mesh_ax.tick_params(bottom=False, left=False)
    mesh_ax.get_xaxis().set_visible(False)
    cbar_ax.tick_params(bottom=False)

    # Add in SE if specified (an empty (0, 0) frame means "no SE"):
    if df_se.shape != (0, 0):
        # Subset a copy of df_se to the columns / positions present in df,
        # which might be truncated. Rows and columns are then matched to the
        # heatmap by position, so df_se is expected to share df's ordering.
        kept_cols = [col for col in df_se.columns if col in df.columns]
        se_matched = df_se.loc[df_se['pos'].isin(df['pos']), kept_cols]
        se_values = se_matched.drop(columns=['pos']).values

        n_se_rows, n_se_cols = se_values.shape
        for row in range(n_se_rows):
            for col in range(n_se_cols):
                se_value = se_values[row, col]
                # SE of 0.02 or less (or NaN) is not displayed, and neither
                # is SE of totally depleted cells.
                if not se_value > 0.02:
                    continue
                if score_values[row, col] == depleted_score:
                    continue
                # Diagonal centered in the cell; its length grows with SE and
                # an SE of 1 spans the full cell diagonal.
                corner_dist = (1 - se_value) / 2
                diag = Line2D([col + corner_dist, col + 1 - corner_dist],
                              [row + corner_dist, row + 1 - corner_dist],
                              color="black")
                mesh_ax.add_line(diag)

    fig.savefig(outfilename)
# Example #2
# 0
def tsv_plot_output_aa_double(wtfile, wt_mut, cond, df, df_se=pd.DataFrame(), outfilename='test.pdf', version=1, scale='max'):
    """
    Draw the amino-acid heatmap of a double-mutation site pair and save it.

    Same as tsv_plot_output_double but for aa mode.

    wtfile: WT sequence file, passed to wt_codon() to look up WT codons
    wt_mut: wt_mut[0][0] and wt_mut[0][1] are the two mutated sites (row site
        and column site). They are passed to wt_codon() and converted to a
        residue number with int((n + 2) / 3), so they look like 1-based
        nucleotide positions -- TODO confirm with the caller
    cond: The experiment condition, used for title info
    df: score dataframe with a 'pos' column; every other column becomes one
        heatmap column. Cells may be numbers, NaN (drawn grey) or the string
        'deplete'
    df_se: standard-error dataframe with a 'pos' column. It is subset to the
        columns and positions present in df. The default (empty) dataframe
        disables the SE overlay. The caller's dataframe is never modified
    outfilename: outputfile path + filename, where to store the output
    version: when 2, property group names are translated through CODON321
        before being written on the figure
    scale: 'max' to derive a symmetric color limit from the data, otherwise
        a value accepted by float() that is used as the color limit; scores
        above the limit are clipped to it

    Raises ValueError when df holds no score other than NaN / 'deplete'.
    """
    # 'deplete' cells are replaced by this arbitrarily large value so that they
    # fall above vmax and pick up the colormap's "over" color.
    depleted_score = 1000

    row_number, col_number = df.shape

    # Create Figure object that consists of two axes (heatmap + color bar).
    # Each heatmap cell is meant to be 0.2 x 0.2 inch.
    grid = GridSpec(2, 1, height_ratios=[row_number, 1], hspace=1 / (row_number + 1) * 2)
    fig = plt.figure()
    fig.set_size_inches((col_number + 10) * 0.2, (10 + row_number) * 0.2)
    mesh_ax = fig.add_subplot(grid[0])
    cbar_ax = fig.add_subplot(grid[1])

    # Margins, in units of 0.2 inch: top 5.8 (title + amino acid grouping),
    # bottom 2.2, left 2, right 8 (row labels, grouping and row-site text).
    fig.subplots_adjust(top=(row_number + 4.2) / (row_number + 10),
                        bottom=2.2 / (row_number + 10),
                        left=2 / (col_number + 10),
                        right=(col_number + 2) / (col_number + 10))

    # Convert every score to a number, mapping 'deplete' to the sentinel.
    # set_index returns a new frame, so the caller's df is left untouched.
    scores = df.set_index(['pos'])
    for col in scores.columns:
        scores[col] = [
            depleted_score if value == 'deplete' else float(value)
            for value in scores[col]
        ]
    score_values = scores.values.astype(float)  # astype() returns a copy
    nan_mask = np.isnan(score_values)

    # Find score range, ignoring NaNs and the depletion sentinel:
    measured = score_values[~nan_mask & (score_values != depleted_score)]
    if measured.size == 0:
        raise ValueError("df contains no numeric score to derive a color "
                         "range from")
    vmin = float(measured.min())
    vmax = float(measured.max())

    if scale == 'max':
        colorlim = max(-vmin, vmax)
    else:
        colorlim = float(scale)

    # Clip real scores above the limit: otherwise they would be drawn with the
    # "over" color that is reserved for depleted cells. NaN compares False and
    # is therefore left alone.
    with np.errstate(invalid='ignore'):
        overflow = (score_values > colorlim) & (score_values != depleted_score)
    score_values[overflow] = colorlim

    # Mask NaNs so that they are drawn with the colormap's "bad" color:
    arr_masked = np.ma.array(score_values, mask=nan_mask)

    # Recenter the color map on zero, then keep only the middle part of the
    # spectrum (entries 65..199) to get lighter colors that print better.
    cmap = recentered_cmap(plt.get_cmap("RdBu_r"), -colorlim, colorlim)
    colors = [cmap(i) for i in range(65, 200)]
    cmap_new = LinearSegmentedColormap.from_list('place_holder', colors)

    # Grey for NaN values; depleted cells (above vmax) reuse the first color
    # of the new map, i.e. cmap(65).
    cmap_new.set_bad("#808080", 1)
    cmap_new.set_over(cmap_new(0))

    # Keep the returned mappable: the color bar is built from it.
    mesh_pcolor = mesh_ax.pcolormesh(arr_masked, cmap=cmap_new, vmin=-colorlim, vmax=colorlim)

    # Add in the color bar
    cbar = fig.colorbar(mesh_pcolor, cax=cbar_ax, orientation='horizontal')
    cbar.set_label("Enrich2 Score")

    # WT codons / amino acids of the two sites: site 1 determines the row of
    # the WT marker, site 2 its column.
    row_site, col_site = wt_mut[0][0], wt_mut[0][1]
    wt_codons = [str(wt_codon(wtfile, row_site)), str(wt_codon(wtfile, col_site))]
    row_wt_aa = CODON2AA[wt_codons[0]][0]
    col_wt_aa = CODON2AA[wt_codons[1]][0]

    # The column site goes into the title. pad is given in points; 50 leaves
    # room for the column labels and grouping bars above the heatmap.
    col_site_text = " (Column: " + str(int((col_site + 2) / 3)) + "-" + col_wt_aa + " (" + wt_codons[1] + "))"
    mesh_ax.set_title("Amino Acid Enrichment (Double) for : " + cond + "\n" + col_site_text, pad=50)

    # Reorder the rows according to the property order list. pd.concat is used
    # instead of the removed DataFrame.append and keeps df's column order.
    # NOTE(review): only the row labels follow this order; the heatmap itself
    # is drawn in df's row order, so df presumably arrives sorted accordingly
    # -- confirm with the caller.
    property_order = Property2AA['Polar'] + Property2AA['Charged'] + Property2AA['Non-polar'] + ('Stop',)
    ordered_rows = []
    for aa in property_order:
        pos = [group[1] for group in CODON_GROUPS if group[0] == aa][0]
        ordered_rows.append(df[df['pos'] == pos])
    df_ordered = pd.concat(ordered_rows)
    n_ordered_rows = len(df_ordered.index)

    # Add in aa information: columns (names of the plotted columns, written
    # vertically above the heatmap). Remember where the WT aa of site 2 is.
    WT_site2 = None
    for i, name in enumerate(scores.columns):
        mesh_ax.text(i + 0.5, n_ordered_rows + 1, name,
                     horizontalalignment="center",
                     verticalalignment="center",
                     rotation=90)
        if name == col_wt_aa:
            WT_site2 = i

    # Amino acid property grouping: each property in AA_GROUPS gets a
    # contiguous span as wide as the number of df columns belonging to it;
    # properties without any column are skipped. The same spans are used for
    # the columns and, mirrored, for the rows.
    group_spans = []
    next_start = 0
    for group in AA_GROUPS:
        aa_property = group[0]
        members = Property2AA[aa_property]
        count = sum(1 for item in df.columns.values if item in members)
        if count:
            group_spans.append((aa_property, next_start, next_start + count - 1))
            next_start += count

    # Property grouping above the columns, plus a white vertical delimiter
    # after each group:
    for aa_property, start, end in group_spans:
        label = CODON321[aa_property] if version == 2 else aa_property
        mesh_ax.text((end - start + 1) / 2 + start,
                     row_number + 2.5, label,
                     horizontalalignment="center",
                     verticalalignment="center")

        bar = Line2D([start + 0.125, end + 1 - 0.125],
                     [row_number + 2, row_number + 2], color="black")
        bar.set_clip_on(False)  # the bar lives outside the axes area
        mesh_ax.add_line(bar)

        delim_bar = Line2D([end + 1, end + 1],
                           [0, n_ordered_rows + 1], color="white")
        delim_bar.set_clip_on(False)
        mesh_ax.add_line(delim_bar)

    # Add in aa information: rows (bottom to top = reversed property order).
    # Remember where the WT aa of site 1 is.
    WT_site1 = None
    reversed_order = property_order[::-1]
    for i, pos in enumerate(list(df_ordered['pos'][::-1])):
        aa_text = [group[0] for group in CODON_GROUPS if group[1] == pos][0]
        # x = col_number is visually not perfect but the best achievable.
        mesh_ax.text(col_number, i + 0.5, aa_text,
                     horizontalalignment="center",
                     verticalalignment="center")
        if CODON2AA[CodonList[int(pos) - 1]][0] == row_wt_aa:
            WT_site1 = [j for j, aa in enumerate(reversed_order) if aa == row_wt_aa][0]

    # Property grouping right of the rows (first group at the top), plus a
    # white horizontal delimiter below each group:
    for aa_property, start, end in group_spans:
        label = CODON321[aa_property] if version == 2 else aa_property
        mesh_ax.text(col_number + 1.25, row_number - (end - start) / 2 - start - 0.5, label,
                     horizontalalignment="left",
                     verticalalignment="center")

        bar = Line2D([col_number + 1, col_number + 1],
                     [row_number - start - 1 + 0.875, row_number - end - 0.875],
                     color="black")
        bar.set_clip_on(False)
        mesh_ax.add_line(bar)

        delim_bar = Line2D([0, col_number],
                           [row_number - end - 1, row_number - end - 1],
                           color="white")
        delim_bar.set_clip_on(False)
        mesh_ax.add_line(delim_bar)

    # Add in first mutation site (per row) info onto figure right margin
    mesh_ax.text(col_number + 5, row_number / 2,
                 "(Row: " + str(int((row_site + 2) / 3)) + "-" + row_wt_aa + " (" + wt_codons[0] + "))",
                 horizontalalignment="center",
                 verticalalignment="center",
                 rotation=270)

    # Add in WT label onto the corresponding cell, only when both coordinates
    # were found above:
    if WT_site1 is not None and WT_site2 is not None:
        mesh_ax.add_patch(Circle((WT_site2 + 0.5, WT_site1 + 0.5), .1666,
                                 fill=True, facecolor="black",
                                 edgecolor="none", alpha=0.5))

    # Make the figure cleaner by removing ticks:
    mesh_ax.tick_params(bottom=False, left=False)
    mesh_ax.get_xaxis().set_visible(False)
    mesh_ax.get_yaxis().set_visible(False)
    cbar_ax.tick_params(bottom=False)

    # Add in SE if specified (an empty (0, 0) frame means "no SE"):
    if df_se.shape != (0, 0):
        # Subset a copy of df_se to the columns / positions present in df,
        # which might be truncated. Rows and columns are then matched to the
        # heatmap by position, so df_se is expected to share df's ordering.
        kept_cols = [col for col in df_se.columns if col in df.columns]
        se_matched = df_se.loc[df_se['pos'].isin(df['pos']), kept_cols]
        se_values = se_matched.drop(columns=['pos']).values

        n_se_rows, n_se_cols = se_values.shape
        for row in range(n_se_rows):
            for col in range(n_se_cols):
                se_value = se_values[row, col]
                # SE of 0.02 or less (or NaN) is not displayed, and neither
                # is SE of totally depleted cells. The check uses the plotted
                # numeric scores, not the raw frame holding 'deplete' strings.
                if not se_value > 0.02:
                    continue
                if score_values[row, col] == depleted_score:
                    continue
                # Diagonal centered in the cell; its length grows with SE and
                # an SE of 1 spans the full cell diagonal.
                corner_dist = (1 - se_value) / 2
                diag = Line2D([col + corner_dist, col + 1 - corner_dist],
                              [row + corner_dist, row + 1 - corner_dist], color="black")
                mesh_ax.add_line(diag)

    fig.savefig(outfilename)