def plotDistributions(data, cutoff, palette, pdf):
    # Open new figureHandler instance
    fh = figureHandler(proj='2d', figsize=(14, 8))

    # Get xmin and xmax
    xmin = -np.nanpercentile(data['cv'].values, 99) * 0.2
    xmax = np.nanpercentile(data['cv'].values, 99) * 1.5

    # Split design file by treatment group and plot a density plot per group
    for name, group in palette.design.groupby(palette.combName):
        dist.plotDensityDF(data=data["cv_" + name], ax=fh.ax[0],
                           colors=palette.ugColors[name], lb="{0}".format(name))

    # Plot legend
    fh.makeLegendLabel(ax=fh.ax[0])

    # Give format to the axis
    fh.formatAxis(yTitle="Density", xlim=(xmin, xmax), ylim="ignore",
                  figTitle="Density Plot of Coefficients of Variation by {0}".format(
                      palette.combName))

    # Shrink figure to fit the legend
    fh.shrink()

    # Add figure to PDF
    fh.addToPdf(pdfPages=pdf)
def plotScatterplot3D(data, palette, pdf):
    """
    Plots a 3D scatterplot of the first three principal components of the PCA
    loadings.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.
    """
    # Open figure handler with 3D projection
    fh = figureHandler(proj="3d", figsize=(14, 8))

    # Plot scatterplot3D
    ax = scatter.scatter3D(ax=fh.ax[0], colorList=palette.design.colors.tolist(),
                           x=list(data["PC1"]), y=list(data["PC2"]),
                           z=list(data["PC3"]))

    # Make legends
    fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

    # Add titles to the PCA axes
    fh.format3D(xTitle="PC1", yTitle="PC2", zTitle="PC3")

    # Add figure to the PDF
    fh.addToPdf(dpi=600, pdfPages=pdf)
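# NOTE: Illustrative sketch, not part of the original module.  plotScatterplot3D
# relies on the project-specific figureHandler/scatter helpers; for readers
# without those helpers, this is a minimal, hedged stand-in using plain
# matplotlib to draw a 3D scatter of the first three principal components.
# Function and argument names here are hypothetical.
def _sketch_pca_scatter3d(data, colors, out_pdf):
    """Minimal 3D scatter of PC1/PC2/PC3; assumes `data` has those columns."""
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages

    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(data["PC1"], data["PC2"], data["PC3"], c=colors)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    with PdfPages(out_pdf) as pdf:
        pdf.savefig(fig, dpi=600)
    plt.close(fig)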
def plotDensityDistribution(pdf, wide, palette):
    # Instantiating figureHandler object
    figure = figureHandler(proj="2d", figsize=(12, 7))

    # Formatting axis
    figure.formatAxis(figTitle="Distribution by Samples Density", xlim="ignore",
                      ylim="ignore", grid=False)

    # Plotting density plot
    density.plotDensityDF(colors=palette.design["colors"], ax=figure.ax[0], data=wide)

    # Add legend to the plot
    figure.makeLegend(ax=figure.ax[0], ucGroups=palette.ugColors,
                      group=palette.combName)

    # Shrinking figure to fit the legend
    figure.shrink()

    # Adding to PDF
    figure.addToPdf(pdf, dpi=600)
def plotDensity(data, name, pdf):
    """
    This function takes a pandas dataframe and plots a density plot and a boxplot.
    """
    # Establishing figure layout (x, y, colspan, rowspan)
    axisLayout = [(0, 0, 1, 3), (3, 0, 1, 1)]

    # Creating a figure template
    figure = figureHandler(proj='2d', numAx=2, numRow=4, numCol=1, figsize=(8, 13),
                           arrangement=axisLayout)

    # Adding figure title
    figure.formatAxis(figTitle="Distribution by Features {0}".format(name),
                      xlim="ignore", ylim="ignore", axnum=0, showX=False)

    # Creating a list with one color (the group color) per feature in the data
    colors = [palette.ugColors[name]] * len(data.index)

    # Plotting boxplot
    box.boxDF(ax=figure.ax[0], colors=colors, dat=data.T, vert=False, rot=0)

    # Plotting density plot
    density.plotDensityDF(data=data.T.unstack(), ax=figure.ax[1], colors=colors[0])

    # Adding figure to pdf object
    figure.addToPdf(pdf)
def plotVenn2(data, title, name1, name2, innerLabels=None, circles=None): """ Plots venn diagram for 2 sets (2 circles). :Arguments: :type data: list :param data: list of values for venn circles [1,2,3] = [Ab,AB,aB] :type title: str :param title: Title for the plot :type name1: str :param name1: Name of the first category (circle) :type name2: str :param name2: Name of the second category (circle) :type innerLabels: list :param innerLabels: List of labels for the inside of circles. :type circles: boolean :param circles: If true draws the edge of the circles :Returns: :rtype figInstance: figureHandler object :returns figInstance: Outputs a figureHandler object with the plot. """ #Get figure instances figInstance = figureHandler(proj="2d") #Setting format of the figure figInstance.formatAxis(xTitle=name1, yTitle=name2, axTitle=title) #Plotting venn venn2fig = venn2(subsets=data, set_labels=(name1, name2), ax=figInstance.ax[0]) #Plot circles if circles: circles = venn2_circles(subsets=data, linestyle='dotted', ax=figInstance.ax[0]) # If not inner labels are provided use the data as a string if innerLabels is None: innerLabels = list(map(str, data)) #Art of the venn diagram _artVenn2(venn2fig, innerLabels=innerLabels) #Return Plot return figInstance
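# NOTE: Illustrative sketch, not part of the original module.  A minimal,
# standalone usage example of matplotlib_venn's venn2/venn2_circles, which
# plotVenn2 wraps.  matplotlib_venn documents the 3-element subsets tuple as
# (only-A, only-B, A-and-B); if the ordering stated in the docstring above
# differs, the library documentation should be taken as authoritative.
def _sketch_venn2(out_pdf):
    import matplotlib.pyplot as plt
    from matplotlib_venn import venn2, venn2_circles

    fig, ax = plt.subplots()
    subsets = (3, 2, 1)  # (only A, only B, A intersect B)
    venn2(subsets=subsets, set_labels=("Group A", "Group B"), ax=ax)
    venn2_circles(subsets=subsets, linestyle='dotted', ax=ax)
    fig.savefig(out_pdf)
    plt.close(fig)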
def main(args): # Loading design if args.design: design = pd.DataFrame.from_csv(args.design, sep="\t") design.reset_index(inplace=True) else: design = False # Loading wide file wide = pd.DataFrame.from_csv(args.input, sep="\t") # Open Figure handler fh = figureHandler(proj="3d", figsize=(14, 8)) # If design file with group and the uniqID is "sampleID" then color by group if args.group and args.uniqID == "sampleID": glist = list(design[args.group]) colorList, ucGroups = palette.getColorsByGroup(design=design, group=args.group, uGroup=sorted( set(glist))) else: glist = list() colorList = palette.mpl_colors[0] ucGroups = dict() # Plot scatterplot 3D scatter.scatter3D(ax=fh.ax[0], x=list(wide[args.x]), y=list(wide[args.y]), z=list(wide[args.z]), colorList=colorList) # Despine axis (spine = tick) fh.despine(fh.ax[0]) # Give format to the plot fh.format3D(title=args.x + " vs " + args.y + " vs " + args.z, xTitle=args.x, yTitle=args.y, zTitle=args.z, rotation=float(args.rotation), elevation=float(args.elevation)) # If groups are provided create a legend if args.group and args.uniqID == "sampleID": fh.makeLegend(ax=fh.ax[0], ucGroups=ucGroups, group=args.group) fh.shrink() # Saving figure to file with PdfPages(args.figure) as pdfOut: fh.addToPdf(dpi=600, pdfPages=pdfOut) logger.info("Script Complete!")
def main(args): """Runs eveything""" # Importing data dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger) # Cleaning from missing data dat.dropMissing() # Getting labels to drop from arguments x = True y = True if "x" in args.labels: x = False if "y" in args.labels: y = False print("x =", x) print("y =", y) #Plotting with dendogram Hierarchical cluster heatmap (HCH) logger.info("Plotting heatmaps") if args.dendogram == True: fh = hm.plotHCHeatmap(dat.wide, hcheatmap=True, cmap=palette.mpl_colormap, xlbls=x, ylbls=y) fh.savefig(args.fig, format="pdf") #Plotting without a dendogram single heatmap else: # Creating figure Handler object fh = figureHandler(proj='2d', figsize=(14, 14)) # Creating plot hm.plotHeatmap(dat.wide, fh.ax[0], cmap=palette.mpl_colormap, xlbls=x, ylbls=y) # formating axis fh.formatAxis(xTitle="sampleID") # Saving figure fh.export(out=args.fig, dpi=300) # Finishing script logger.info("Script Complete!")
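# NOTE: Illustrative sketch, not part of the original module.  The heatmap
# main() above uses the project-specific hm.plotHeatmap / hm.plotHCHeatmap
# helpers.  As a rough stand-in for the "no dendrogram" path, this sketch draws
# a plain matplotlib heatmap of a wide matrix with optional axis labels; names
# are hypothetical.
def _sketch_heatmap(wide, out_pdf, show_xlabels=True, show_ylabels=True):
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages

    fig, ax = plt.subplots(figsize=(14, 14))
    im = ax.imshow(wide.values, aspect='auto', interpolation='nearest')
    fig.colorbar(im, ax=ax)
    if show_xlabels:
        ax.set_xticks(range(len(wide.columns)))
        ax.set_xticklabels(wide.columns, rotation=90)
    if show_ylabels:
        ax.set_yticks(range(len(wide.index)))
        ax.set_yticklabels(wide.index)
    ax.set_xlabel("sampleID")
    with PdfPages(out_pdf) as pdf:
        pdf.savefig(fig, bbox_inches='tight')
    plt.close(fig)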
def plotCVplots(data, cutoff, palette, pdf):
    # Iterate over groups
    for name, group in palette.design.groupby(palette.combName):
        # Open figure handler
        fh = figureHandler(proj='2d', figsize=(14, 8))

        # Get xmin and xmax
        xmin = -np.nanpercentile(data['cv_' + name].values, 99) * 0.2
        xmax = np.nanpercentile(data['cv_' + name].values, 99) * 1.5

        # Plot histogram
        hist.serHist(ax=fh.ax[0], dat=data['cv_' + name], color='grey', normed=1,
                     range=(xmin, xmax), bins=15)

        # Plot density plot
        dist.plotDensityDF(data=data['cv_' + name], ax=fh.ax[0], lb="CV density",
                           colors=palette.ugColors[name])

        # Plot cutoff
        lines.drawCutoffVert(ax=fh.ax[0], x=cutoff[name],
                             lb="Cutoff at: {0}".format(cutoff[name]))

        # Plot legend
        fh.makeLegendLabel(ax=fh.ax[0])

        # Give format to the axis
        fh.formatAxis(yTitle='Density', xlim=(xmin, xmax), ylim="ignore",
                      figTitle="Density Plot of Coefficients of Variation in {0}".format(name))

        # Shrink figure to fit legend
        fh.shrink()

        # Add plot to PDF
        fh.addToPdf(pdfPages=pdf)
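# NOTE: Illustrative sketch, not part of the original module.  plotCVplots and
# plotDistributions above expect per-group coefficient-of-variation columns
# named "cv_<group>".  This sketch shows one plausible way such columns could
# be computed from a features-by-samples `wide` table and a sample-indexed
# `design` table; the exact convention used upstream may differ.
def _sketch_add_cv_columns(wide, design, group_col):
    """Return a DataFrame of per-feature CV values, one column per group."""
    import pandas as pd

    cv = pd.DataFrame(index=wide.index)
    for name, sub in design.groupby(group_col):
        samples = sub.index.intersection(wide.columns)
        group_data = wide[samples]
        # CV = standard deviation / mean, computed feature-wise (per row).
        cv["cv_" + str(name)] = group_data.std(axis=1) / group_data.mean(axis=1)
    return cv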
def plotScores(data, palette, pdf):
    """
    This function creates a PDF file with 3 scatter plots for the pairwise
    combinations of the 3 principal components: PC1 vs PC2, PC1 vs PC3,
    PC2 vs PC3.

    :Arguments:
        :type data: pandas.core.frame.DataFrame
        :param data: Data frame with the data to plot.

        :type palette: palette object
        :param palette: Object containing the design colors and group information.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: PDF object to save all the generated figures.

    :Return:
        :rtype PDF: file
        :return PDF: file with the 3 scatter plots for PC1 vs PC2, PC1 vs PC3,
                     PC2 vs PC3.
    """
    for x, y in list(itertools.combinations(data.columns.tolist(), 2)):
        # Creating a figure handler object
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Creating title for the figure
        title = "{0} vs {1}".format(x, y)

        # Creating the scatterplot 2D
        scatter.scatter2D(ax=fh.ax[0], x=list(data[x]), y=list(data[y]),
                          colorList=palette.design.colors.tolist())

        # Despine axis
        fh.despine(fh.ax[0])

        # Make legend
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

        # Shrinking the plot so everything fits
        fh.shrink()

        # Format axis
        fh.formatAxis(figTitle=title, xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y), grid=False)

        # Adding figure to pdf
        fh.addToPdf(dpi=90, pdfPages=pdf)
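# NOTE: Illustrative sketch, not part of the original module.  plotScores
# iterates over all pairwise combinations of the score columns; a quick
# standalone illustration of that iteration scheme:
def _sketch_pc_pairs(columns=("PC1", "PC2", "PC3")):
    """Return the pairwise combinations plotted by plotScores, e.g.
    [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC2', 'PC3')]."""
    import itertools
    return list(itertools.combinations(columns, 2))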
def plotScatterplot2D(data, palette, pdf, nloads=3):
    """
    Plots 2D scatterplots for a number of loadings of the PCA.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type nloads: int
        :param nloads: Number of principal components to create pairwise
                       combinations from.
    """
    # Selecting the pairwise combinations of loadings to plot scatterplots for.
    for x, y in list(combinations(data.columns.tolist()[:nloads], 2)):
        # Create a single-figure figure handler object
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Create a title for the figure
        title = "{0} vs {1}".format(x, y)

        # Plot the scatterplot based on data
        scatter.scatter2D(x=list(data[x]), y=list(data[y]),
                          colorList=palette.design.colors.tolist(), ax=fh.ax[0])

        # Create legend
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

        # Shrink axis to fit legend
        fh.shrink()

        # Despine axis
        fh.despine(fh.ax[0])

        # Formatting axis
        fh.formatAxis(figTitle=title, xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y), grid=False)

        # Adding figure to pdf
        fh.addToPdf(dpi=600, pdfPages=pdf)
def plotBoxplotDistribution(pdf, wide, palette):
    # Instantiating figureHandler object
    figure = figureHandler(proj="2d", figsize=(max(len(wide.columns) / 4, 12), 7))

    # Formatting axis
    figure.formatAxis(figTitle="Distribution by Samples Boxplot", ylim="ignore",
                      grid=False, xlim="ignore")

    # Plotting boxplot
    box.boxDF(ax=figure.ax[0], colors=palette.design["colors"], dat=wide)

    # Shrinking figure
    figure.shrink()

    # Adding to PDF
    figure.addToPdf(pdf, dpi=600)
def plotCDhistogram(count, pdf, group):
    """
    This function plots a histogram of the difference in digit counts
    (max - min) across features for a given group.

    :Arguments:
        :type count: pandas.DataFrame
        :param count: DataFrame with the counted digits and min, max and diff
                      among rows.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: PDF object to plot figures in.

        :type group: str
        :param group: Name of the group to plot.
    """
    # Creating title
    title = "Distribution of difference between \n(min and max) for {0} compounds".\
            format(group)

    if count['diff'].any():
        # Opening figure handler
        fh = figureHandler(proj='2d')

        # Plot histogram
        hist.quickHist(ax=fh.ax[0], dat=count['diff'])

        # Giving format to the axis
        fh.formatAxis(xTitle='Difference in Number of Digits (max - min)',
                      yTitle='Number of Features', figTitle=title, ylim="ignore")

        # Export figure
        fh.addToPdf(pdf, dpi=600)
    else:
        logger.warn("There were no differences in digit counts for {0}, "
                    "no plot will be generated".format(group))
def main(args): # Loading data trought Interface dat = wideToDesign(args.input, args.design, args.uniqueID, group=args.group, logger=logger) # Treat everything as numeric dat.wide = dat.wide.applymap(float) # Cleaning from missing data dat.dropMissing() # Getting the uinique pairs and all pairwise prermutations # son that we will feed them to Kruscal-Wallis. group_values_series = dat.transpose()[dat.group].T.squeeze() group_values_series_unique = group_values_series.unique() number_of_unique_groups = group_values_series_unique.shape[0] groups_pairwise = list(combinations(group_values_series_unique, 2)) number_of_groups_pairwise = len(groups_pairwise) # Extracting data from the interface. data_frame = dat.transpose() # Extracting number of features. number_of_features = data_frame.shape[1] - 1 # Saving treatment group name from the arguments. # Running overall Kruscall-Wallis test for all group levels combined. # Creating p_values_all and flag_values_all for 3 significance levels as emply lists of length equal to the number_of_features. # This will be used for all groups. p_value_all = [0] * number_of_features H_value_all = [0] * number_of_features mean_value_all = [0] * number_of_features variance_value_all = [0] * number_of_features flag_value_all_0p01 = [0] * number_of_features flag_value_all_0p05 = [0] * number_of_features flag_value_all_0p10 = [0] * number_of_features for j in range(0, number_of_features): # Creating duplicate for manipulation. data_frame_manipulate = data_frame # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. data_frame_manipulate_transpose = data_frame_manipulate.drop( args.group, 1).transpose() # Pulling indexes list from the current data frame. indexes_list_complete = data_frame_manipulate_transpose.index.tolist() # Computing dataset summaries. mean_value_all[j] = np.mean( data_frame_manipulate_transpose.loc[indexes_list_complete[j]]) variance_value_all[j] = np.var( data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1) for i in range(0, number_of_unique_groups): # Extracting the pieces of the data frame that belong to ith unique group. data_frame_current_group = data_frame.loc[data_frame[ args.group].isin([group_values_series_unique[i]])] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. data_frame_current_group = data_frame_current_group.drop( args.group, 1).transpose() # Pulling indexes list from the current data frame. indexes_list = data_frame_current_group.index.tolist() # Series current for group i and row (feature) j. series_current = data_frame_current_group.loc[indexes_list[j]] # This piece of code depends on whether it is the first group in the list or not. if i == 0: series_total = [series_current] else: series_total.append(series_current) # Checking if the compared elements are different. # Combining for checking. combined_list = data_frame_manipulate_transpose.loc[ indexes_list_complete[j]].tolist() combined_list_unique = np.unique(combined_list) # Checking if the number of unique elements is exactly 1. if len(combined_list_unique) == 1: # Performing Kruscal-Wallis for all groups for feature j. 
p_value_all[j] = float("nan") H_value_all[j] = float("nan") if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1 if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1 if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1 else: # Performing Kruscal-Wallis for all groups for feature j. kruscal_wallis_args = series_total p_value_all[j] = kruskalwallis(*kruscal_wallis_args)[1] H_value_all[j] = kruskalwallis(*kruscal_wallis_args)[0] if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1 if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1 if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1 # The loop over features has to be finished by now. Converting them into the data frame. # The pariwise results will be added later. summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list) summary_df['SampleVariance'] = variance_value_all summary_df['H_value_for_all'] = H_value_all summary_df['prob_greater_than_H_for_all'] = p_value_all flag_df = pd.DataFrame(data=flag_value_all_0p01, columns=["flag_significant_0p01_on_all_groups"], index=indexes_list) flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05 flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10 # Informing that KW for all group has been performed. logger.info( u"Kruscal-Wallis test for all groups together has been performed.") # Computing means for each group # This part just produces sumamry statistics for the output table. # This has nothing to do with Kruscal-Wallis for i in range(0, number_of_unique_groups): # Extracting the pieces of the data frame that belong to ith group. data_frame_current_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]])] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. data_frame_current_group = data_frame_current_group.drop( args.group, 1).transpose() # Pulling indexes list from the current group. indexes_list = data_frame_current_group.index.tolist() # Creating array of means for the current group that will be filled. means_value = [0] * number_of_features for j in range(0, number_of_features): series_current = data_frame_current_group.loc[indexes_list[j]] means_value[j] = series_current.mean() means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[ i] summary_df[means_value_column_name_current] = means_value # Running pairwise Kruscall-Wallis test for all pairs of group levels that are saved in groups_pairwise. for i in range(0, number_of_groups_pairwise): # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair. groups_subset = groups_pairwise[i] data_frame_first_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]])] data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]])] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. data_frame_first_group = data_frame_first_group.drop(args.group, 1).transpose() data_frame_second_group = data_frame_second_group.drop(args.group, 1).transpose() # Pulling indexes list from the first one (they are the same) indexes_list = data_frame_first_group.index.tolist() # Creating p_values, neg_log10_p_value, flag_values, difference_value lists filled wiht 0es. 
p_value = [0] * number_of_features H_value = [0] * number_of_features neg_log10_p_value = [0] * number_of_features flag_value_0p01 = [0] * number_of_features flag_value_0p05 = [0] * number_of_features flag_value_0p10 = [0] * number_of_features difference_value = [0] * number_of_features for j in range(0, number_of_features): series_first = data_frame_first_group.loc[indexes_list[j]] series_second = data_frame_second_group.loc[indexes_list[j]] # Checking if the compared elements are different. # Combining for checking. first_list = data_frame_first_group.loc[indexes_list[j]].tolist() second_list = data_frame_second_group.loc[indexes_list[j]].tolist() combined_list = first_list + second_list combined_list_unique = np.unique(combined_list) # Checking if the number of unique elements is exactly 1. if len(combined_list_unique) == 1: p_value[j] = float("nan") H_value[j] = float("nan") # Possible alternative for two groups. # p_value[j] = kruskalwallis(series_first, series_second)[1] neg_log10_p_value[j] = -np.log10(p_value[j]) difference_value[j] = series_first.mean() - series_second.mean( ) if p_value[j] < 0.01: flag_value_0p01[j] = 1 if p_value[j] < 0.05: flag_value_0p05[j] = 1 if p_value[j] < 0.10: flag_value_0p10[j] = 1 else: kruscal_wallis_args = [series_first, series_second] p_value[j] = kruskalwallis(*kruscal_wallis_args)[1] H_value[j] = kruskalwallis(*kruscal_wallis_args)[0] # Possible alternative for two groups. # p_value[j] = kruskalwallis(series_first, series_second)[1] neg_log10_p_value[j] = -np.log10(p_value[j]) difference_value[j] = series_first.mean() - series_second.mean( ) if p_value[j] < 0.01: flag_value_0p01[j] = 1 if p_value[j] < 0.05: flag_value_0p05[j] = 1 if p_value[j] < 0.10: flag_value_0p10[j] = 1 # Adding current p_value and flag_value column to the data frame and assigning the name p_value_column_name_current = 'prob_greater_than_H_for_diff_' + groups_subset[ 0] + '_' + groups_subset[1] H_value_column_name_current = 'H_value_for_diff_' + groups_subset[ 0] + '_' + groups_subset[1] neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[ 0] + '_' + groups_subset[1] difference_value_column_name_current = 'diff_of_' + groups_subset[ 0] + '_' + groups_subset[1] summary_df[p_value_column_name_current] = p_value summary_df[H_value_column_name_current] = H_value summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value summary_df[difference_value_column_name_current] = difference_value flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[ 0] + '_' + groups_subset[1] flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[ 0] + '_' + groups_subset[1] flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[ 0] + '_' + groups_subset[1] flag_df[flag_value_column_name_current_0p01] = flag_value_0p01 flag_df[flag_value_column_name_current_0p05] = flag_value_0p05 flag_df[flag_value_column_name_current_0p10] = flag_value_0p10 # Roundign the results up to 4 precison digits. summary_df = summary_df.apply(lambda x: x.round(4)) # Adding name for the unique ID column that was there oroginally. summary_df.index.name = args.uniqueID flag_df.index.name = args.uniqueID # Save summary_df to the ouptut summary_df.to_csv(args.summaries, sep="\t") # Save flag_df to the output flag_df.to_csv(args.flags, sep="\t") # Informing that KW for pairwise group has been performed. 
logger.info( u"Kruscal-Wallis test for all groups pairwise has been performed.") # Generating Indexing for volcano plots. # Getting data for lpvals lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \ if col.startswith("neg_log10_p_value")} # Gettign data for diffs difs = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \ if col.startswith("diff_of_")} # The cutoff value for significance. cutoff = 2 # Making volcano plots with PdfPages(args.volcano) as pdf: for i in range(0, number_of_groups_pairwise): # Set Up Figure volcanoPlot = figureHandler(proj="2d") groups_subset = groups_pairwise[i] current_key = groups_subset[0] + '_' + groups_subset[1] # Plot all results scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), colorList=list('b'), ax=volcanoPlot.ax[0]) # Color results beyond treshold red cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff] if not cutLpvals.empty: cutDiff = difs[current_key][cutLpvals.index] scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), colorList=list('r'), ax=volcanoPlot.ax[0]) # Drawing cutoffs lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0]) # Format axis (volcanoPlot) volcanoPlot.formatAxis( axTitle=current_key, grid=False, yTitle="-log10(p-value) for Diff of treatment means for {0}". format(current_key), xTitle="Difference of treatment means for {0}".format( current_key)) # Add figure to PDF volcanoPlot.addToPdf(pdfPages=pdf) # Informing that the volcano plots are done logger.info(u"Pairwise volcano plots have been created.") # Ending script logger.info(u"Finishing running of Kruscal-Wallis tests.")
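# NOTE: Illustrative sketch, not part of the original module.  The main() above
# applies a Kruskal-Wallis test per feature, first across all groups and then
# for each pair of groups, and flags p-values at the 0.01/0.05/0.10 levels.
# This is a condensed, standalone sketch of the per-feature loop using
# scipy.stats.kruskal (the `kruskalwallis` name used above is assumed to be the
# equivalent scipy alias), for a features-by-samples `wide` table and a
# sample-indexed `design` table.
def _sketch_kruskal_per_feature(wide, design, group_col):
    """Return per-feature H, p and 0.05 significance flags across all groups."""
    import numpy as np
    import pandas as pd
    from scipy.stats import kruskal

    rows = []
    for feature, row in wide.iterrows():
        # Split the feature's values into one array per group.
        samples = [row[sub.index.intersection(wide.columns)].astype(float).values
                   for _, sub in design.groupby(group_col)]
        # A constant feature has no valid test; mirror the NaN handling above.
        if len(np.unique(np.concatenate(samples))) == 1:
            H, p = float("nan"), float("nan")
        else:
            H, p = kruskal(*samples)
        rows.append({"feature": feature, "H": H, "p": p,
                     "flag_0p05": int(p < 0.05) if p == p else 0})
    return pd.DataFrame(rows).set_index("feature")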
def qqPlot(tresid, tfit, oname): """ Plot the residual diagnostic plots by sample. Output q-q plot, boxplots and distributions of the residuals. These plots will be used diagnose if residuals are approximately normal. :Arguments: :type tresid: pandas.Series :param tresid: Pearson normalized residuals. (transposed) (residuals / sqrt(MSE)) :type tfit: pandas DataFrame :param tfit: output of the ANOVA (transposed) :type oname: string :param oname: Name of the output file in pdf format. :Returns: :rtype: PDF :returns: Outputs a pdf file containing all plots. """ #Open pdf with PdfPages(oname) as pdf: # Stablishing axisLayout axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)] # Start plotting for col in tresid.columns: #Creating figure fig = figureHandler(proj='2d', numAx=4, numRow=2, numCol=3, arrangement=axisLayout) data = tresid[col].values.ravel() noColors = list() for j in range(0, len(data)): noColors.append('b') #blue df_data = pd.DataFrame(data) # Removing missing so that it will plot correctly. mask_nan_data = np.isnan(data) data = data[~mask_nan_data] # Plot qqplot on axis 0 sm.graphics.qqplot(data, fit=True, line='r', ax=fig.ax[0]) # Plot boxplot on axis 1 box.boxSeries(ser=data, ax=fig.ax[1]) # Plot histogram on axis 2 hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal') # Plot scatterplot on axis 3 scatter.scatter2D(ax=fig.ax[3], x=tfit[col], y=tresid[col], colorList=list('b')) # Draw cutoff line for scatterplot on axis 3 lines.drawCutoffHoriz(ax=fig.ax[3], y=0) # Format axis 0 fig.formatAxis(figTitle=col, axnum=0, grid=False, showX=True, yTitle="Sample Quantiles", xTitle=" ") # Format axis 1 fig.formatAxis(axnum=1, axTitle="Standardized Residuals", grid=False, showX=False, showY=True, xTitle=" ") # Format axis 2 fig.formatAxis(axnum=2, grid=False, showX=True, showY=True, axTitle=" ", xTitle=" ") # Format axis 3 fig.formatAxis(axnum=3, axTitle="Predicted Values vs Residual Values", xTitle="Predicted Values", yTitle="Residual Values", grid=False) #Add figure to pdf fig.addToPdf(pdfPages=pdf)
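# NOTE: Illustrative sketch, not part of the original module.  qqPlot above
# builds a 4-panel residual diagnostic (q-q plot, boxplot, histogram,
# residual-vs-fitted scatter) per sample using project-specific helpers.  This
# minimal sketch reproduces the q-q plot and residual-vs-fitted panels with
# statsmodels/matplotlib only; names are hypothetical.
def _sketch_residual_diagnostics(residuals, fitted, out_pdf):
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.api as sm
    from matplotlib.backends.backend_pdf import PdfPages

    resid = np.asarray(residuals, dtype=float)
    resid = resid[~np.isnan(resid)]

    fig, (ax_qq, ax_fit) = plt.subplots(1, 2, figsize=(10, 4))
    sm.graphics.qqplot(resid, fit=True, line='r', ax=ax_qq)
    ax_fit.scatter(fitted, residuals, color='b', s=10)
    ax_fit.axhline(0, color='r', linestyle='--')
    ax_fit.set_xlabel("Predicted Values")
    ax_fit.set_ylabel("Residual Values")
    with PdfPages(out_pdf) as pdf:
        pdf.savefig(fig, bbox_inches='tight')
    plt.close(fig)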
def nontechnical_analysis(args, df, mask, C, clustering): # Re-order things more palatably for the user, # based on the results of the technical analysis. # Get the map from the name to the original row index. all_row_names = df.index.values row_index_map = {s: i for i, s in enumerate(all_row_names)} # If some variables are uninformative for clustering, # the correlation matrix and the cluster vector will have smaller # dimensions than the number of rows in the original data frame. remaining_row_names = df[mask].index.values # Count the variables included in the clustering. p = clustering.shape[0] # Count the clusters. k = clustering.max() + 1 # To sort the modules and to sort the variables within the modules, # we want to use absolute values of correlations. C_abs = np.abs(C) # For each cluster, get its indices and its submatrix of C_abs. selections = [] submatrices = [] degrees = np.zeros(p, dtype=float) for i in range(k): selection = np.flatnonzero(clustering == i) selections.append(selection) submatrix = C_abs[np.ix_(selection, selection)] submatrices.append(submatrix) if selection.size > 1: denom = selection.size - 1 degrees[selection] = (submatrix.sum(axis=0) - 1) / denom # Modules should be reordered according to decreasing "average degree". cluster_sizes = [] average_degrees = [] for selection in selections: cluster_sizes.append(selection.size) average_degrees.append(degrees[selection].mean()) module_to_cluster = np.argsort(average_degrees)[::-1] cluster_to_module = {v: k for k, v in enumerate(module_to_cluster)} triples = [( cluster_to_module[clustering[i]], -degrees[i], i, ) for i in range(p)] _a, _b, new_to_old_idx = zip(*sorted(triples)) # Make a csv file if requested. header = ('Gene', 'Module', 'Entry Index', 'Average Degree', 'Degree') with open(args.out, 'wb') as fout: writer = csv.writer( fout, 'excel-tab') #problematic; need to switch to tsv file! writer.writerow(header) for old_i in new_to_old_idx: name = remaining_row_names[old_i] cluster = clustering[old_i] row = ( name, cluster_to_module[cluster] + 1, row_index_map[name] + 1, average_degrees[cluster], degrees[old_i], ) writer.writerow(row) #Create Output fh1 = figureHandler(proj="2d") fh2 = figureHandler(proj="2d") fh3 = figureHandler(proj="2d") # Prepare to create the sorted heatmaps. (fh2) C_sorted = C[np.ix_(new_to_old_idx, new_to_old_idx)] clustering_new = clustering[np.ix_(new_to_old_idx)] # Draw the third heatmap (smoothed). # Make a smoothed correlation array. (fh3) S = expansion(clustering_new) block_mask = S.dot(S.T) denom = np.outer(S.sum(axis=0), S.sum(axis=0)) small = S.T.dot(C_sorted).dot(S) / denom C_all_smoothed = S.dot(small).dot(S.T) C_smoothed = (C_all_smoothed * (1 - block_mask) + C_sorted * block_mask) # Getting list of names for heatmaps 2 and 3 hpnames = [remaining_row_names[old_i] for old_i in new_to_old_idx] # Plot using something like http://stackoverflow.com/questions/15988413/ # Drawing heatmaps # Draw first heatmap [C] hm.plotHeatmap(C, fh1.ax[0], cmap=palette.mpl_colormap, xlbls=remaining_row_names, ylbls=remaining_row_names) fh1.formatAxis(xTitle="sampleID", figTitle="Correlations") # Draw second heatmap [C_sorted](reordered according to the clustering). 
hm.plotHeatmap(C_sorted, fh2.ax[0], cmap=palette.mpl_colormap, xlbls=hpnames, ylbls=hpnames) fh2.formatAxis(xTitle="sampleID", figTitle="Re-Ordered correlations") # Draw the heatmap [C_smoothed](smoothed version of C_sorted) hm.plotHeatmap(C_smoothed, fh3.ax[0], cmap=palette.mpl_colormap, xlbls=hpnames, ylbls=hpnames) fh3.formatAxis(xTitle="sampleID", figTitle="Smoothed correlations") #Create output from maps with PdfPages(args.figure) as pdf: fh1.addToPdf(pdf) fh2.addToPdf(pdf) fh3.addToPdf(pdf)
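# NOTE: Illustrative sketch, not part of the original module.  The smoothed
# heatmap above replaces between-module blocks of the sorted correlation matrix
# with their block averages while keeping within-module values intact.
# Assuming `expansion(clustering)` builds the n-by-k one-hot cluster indicator
# matrix, the smoothing step amounts to the following:
def _sketch_block_smooth(C_sorted, clustering):
    """Average each between-cluster block of a sorted correlation matrix."""
    import numpy as np

    clustering = np.asarray(clustering)
    k = clustering.max() + 1
    # One-hot indicator: S[i, c] == 1 when variable i belongs to cluster c.
    S = np.zeros((clustering.size, k))
    S[np.arange(clustering.size), clustering] = 1.0

    block_mask = S.dot(S.T)                       # 1 inside a module, 0 between modules
    counts = S.sum(axis=0)
    block_means = S.T.dot(C_sorted).dot(S) / np.outer(counts, counts)
    C_all_smoothed = S.dot(block_means).dot(S.T)  # every entry -> its block mean
    # Keep original values inside modules, block means between modules.
    return C_all_smoothed * (1 - block_mask) + C_sorted * block_mask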
def main(args): # Loading data trought Interface logger.info("Loading data with the Interface") dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, runOrder=args.order, logger=logger) # Treat everything as numeric dat.wide = dat.wide.applymap(float) # Cleaning from missing data dat.dropMissing() # SCENARIO 1: Unpaired t-test. In this case there can be as many groups as possible. # Order variable is ignored and t-tests are performed pairwise for each pair of groups. if args.pairing == "unpaired": logger.info("Unpaired t-test will be performed for all groups pairwise.") # Getting the uinique pairs and all pairwise prermutations # son that we will feed them to pairwise unpaired t-tests. group_values_series = dat.transpose()[dat.group].T.squeeze() group_values_series_unique = group_values_series.unique() number_of_unique_groups = group_values_series_unique.shape[0] groups_pairwise = list(combinations(group_values_series_unique,2) ) number_of_groups_pairwise = len(groups_pairwise) # Extracting data from the interface. data_frame = dat.transpose() # Extracting number of features. This will depend on whether the user has provided ordering variable or not. # This variable is useless for unpared test. it just adds extra column to the data frame. if args.order == False: number_of_features = data_frame.shape[1] - 1 else: number_of_features = data_frame.shape[1] - 2 # Saving treatment group name from the arguments. # Computing overall summaries (mean and variance). # This part just produces sumamry statistics for the output table. # This has nothing to do with unpaired t-test. This is just summary for the table. mean_value_all = [0] * number_of_features variance_value_all = [0] * number_of_features for j in range(0, number_of_features ): # Creating duplicate for manipulation. data_frame_manipulate = data_frame # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. # We should either drop 1 or 2 columns depending whether we fed the second one. if args.order == False: data_frame_manipulate_transpose = data_frame_manipulate.drop( args.group, 1 ).transpose() else: data_frame_manipulate_transpose = data_frame_manipulate.drop( [args.group, args.order], 1 ).transpose() # Pulling indexes list from the current data frame. indexes_list_complete = data_frame_manipulate_transpose.index.tolist() # Computing dataset summaries. mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1) # Creating the table and putting the results there. summary_df = pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete ) summary_df['SampleVariance'] = variance_value_all # Computing means for each group and outputting them. # This part just produces summary statistics for the output table. # This has nothing to do with unpaired t-test. This is just summary for the table. for i in range(0, number_of_unique_groups ): # Extracting the pieces of the data frame that belong to the ith group. data_frame_current_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]] )] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. # We should either drop 1 or 2 columns depending whether we fed the second one. 
if args.order == False: data_frame_current_group = data_frame_current_group.drop( args.group, 1 ).transpose() else: data_frame_current_group = data_frame_current_group.drop( [args.group, args.order], 1 ).transpose() # Pulling indexes list from the current group. indexes_list = data_frame_current_group.index.tolist() # Creating array of means for the current group that will be filled. means_value = [0] * number_of_features for j in range(0, number_of_features ): series_current = data_frame_current_group.loc[ indexes_list[j] ] means_value[j] = series_current.mean() # Adding current mean_value column to the data frame and assigning the name. means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i] summary_df[means_value_column_name_current] = means_value # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise. for i in range(0, number_of_groups_pairwise ): # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair. groups_subset = groups_pairwise[i] data_frame_first_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]] )] data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]] )] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. # We should either drop 1 or 2 columns depending whether we fed the second one. if args.order == False: data_frame_first_group = data_frame_first_group.drop( args.group, 1 ).transpose() data_frame_second_group = data_frame_second_group.drop( args.group, 1 ).transpose() else: data_frame_first_group = data_frame_first_group.drop( [args.group, args.order], 1 ).transpose() data_frame_second_group = data_frame_second_group.drop( [args.group, args.order], 1 ).transpose() # Pulling indexes list from the first one (they are the same) indexes_list = data_frame_first_group.index.tolist() # Creating p_values, neg_log10_p_value, flag_values, difference_value lists filled wiht 0es. p_value = [0] * number_of_features t_value = [0] * number_of_features neg_log10_p_value = [0] * number_of_features flag_value_0p01 = [0] * number_of_features flag_value_0p05 = [0] * number_of_features flag_value_0p10 = [0] * number_of_features difference_value = [0] * number_of_features for j in range(0, number_of_features ): series_first = data_frame_first_group.loc[ indexes_list[j] ] series_second = data_frame_second_group.loc[ indexes_list[j] ] ttest_ind_args = [series_first, series_second] p_value[j] = ttest_ind( *ttest_ind_args )[1] t_value[j] = ttest_ind( *ttest_ind_args )[0] # Possible alternative for two groups. # p_value[j] = ttest_ind_args(series_first, series_second)[1] neg_log10_p_value[j] = - np.log10(p_value[j]) difference_value[j] = series_first.mean() - series_second.mean() if p_value[j] < 0.01: flag_value_0p01[j] = 1 if p_value[j] < 0.05: flag_value_0p05[j] = 1 if p_value[j] < 0.10: flag_value_0p10[j] = 1 # Creating column names for the data frame. 
p_value_column_name_current = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1] t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1] neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1] difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1] flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] # Adding current p_value and flag_value column to the data frame and assigning the name. # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it. if i == 0: flag_df = pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list ) else: flag_df[flag_value_column_name_current_0p01] = flag_value_0p01 # At this point data frame exists so only columns are added to the existing data frame. summary_df[p_value_column_name_current] = p_value summary_df[t_value_column_name_current] = t_value summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value summary_df[difference_value_column_name_current] = difference_value flag_df[flag_value_column_name_current_0p05] = flag_value_0p05 flag_df[flag_value_column_name_current_0p10] = flag_value_0p10 # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO groups. # Each sample in one group should have exacty one matching pair in the other group. # The matching is controlled by args.order variable. if args.pairing == "paired": logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order)) # Getting the number of unique groups. If it is bigger than 2 return the warning and exit. group_values_series = dat.transpose()[dat.group].T.squeeze() group_values_series_unique = group_values_series.unique() number_of_unique_groups = group_values_series_unique.shape[0] if number_of_unique_groups != 2: logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups) ) exit() # This piece of code will be executed only if the number_of_unique_groups is exactly 2 so the group check is passed. # Creating pairwise combination of our two groups that we will use in the future. groups_pairwise = list( combinations(group_values_series_unique,2) ) number_of_groups_pairwise = len(groups_pairwise) # Extracting data from the interface. data_frame = dat.transpose() # Extracting number of features. This will depend on whether the user has provided ordering variable or not. # Checking that the requred pairing variable has been provided. if args.order == False: logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.") exit() # This piece of code will be executed only if the args.order has been provided and the check is passed. # Defining the number of features. It should be the dimension of the data frame minus 2 columns that stand for arg.group and args.order number_of_features = data_frame.shape[1] - 2 # At this point is is confirmed that there are only 2 group and that pairing variable args.order has been provided. # Now we need to check that pairing is correct i.e. 
that each pairID corresponds to only two samples from different groups. # Getting the unique pairs and deleting those theat have more or less than three. pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze() pairid_values_series_unique = pairid_values_series.unique() number_of_unique_pairid = pairid_values_series_unique.shape[0] # Extracting data from the interface. data_frame = dat.transpose() # Extracting the number of samples in the final frame. number_of_samples = data_frame.shape[0] # Performing the cleaning of the original data. We are removing samples that are not paired and not belonging to the two groups. # If the dataset has 1 or 3 or more matches for a pairid those samples are removed with a warning. # If pairdid corresponds to exactly two samples (which is correct) but groupid-s are NOT different those values will be also removed. for i in range(0, number_of_unique_pairid ): # Extracting the pieces of the data frame that belong to ith unique pairid. data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin( [ pairid_values_series_unique[i] ] )] # We transpose here so it will be easier to operate with. data_frame_current_pairid = data_frame_current_pairid.transpose() sample_names_current_pairid = list(data_frame_current_pairid.columns.values) if data_frame_current_pairid.shape[1] != 2: # Pulling indexes list from the current data frame. logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.shape[1], sample_names_current_pairid) ) # Getting indexes we are trying to delete. boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid ) # Deleting the indexes and in the for loop going to next iteration. data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True) # This piece is executed if the numbe is correct i.e. data_frame_current_group.shape[1] == 2: # Here we are checking if the groupID-s for the given pair are indeed different. elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]: logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.transpose()[args.group][1], data_frame_current_pairid.transpose()[args.group][0], sample_names_current_pairid) ) # Getting indexes we are trying to delete. boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid ) # Deleting the indexes. data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True) # Cheching if the data frame bacame empty after cleaning. if data_frame.shape[0] == 0: logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the desing file for accuracy! Exiting the program." ) exit() # Computing overall summaries (mean and variance). # This part just produces sumamry statistics for the output table. # This has nothing to do with paired t-test. This is just summary for the table. mean_value_all = [0] * number_of_features variance_value_all = [0] * number_of_features for j in range(0, number_of_features ): # Creating duplicate for manipulation. data_frame_manipulate = data_frame # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. 
data_frame_manipulate_transpose = data_frame_manipulate.drop( [args.group,args.order], 1 ).transpose() # Pulling indexes list from the current data frame. indexes_list_complete = data_frame_manipulate_transpose.index.tolist() # Computing dataset summaries. mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1) # Creating the table and putting the results there. summary_df = pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete ) summary_df['SampleVariance'] = variance_value_all # Computing means for each group and outputting them. # This part just produces summary statistics for the output table. # This has nothing to do with paired t-test. This is just summary for the table. for i in range(0, number_of_unique_groups ): # Extracting the pieces of the data frame that belong to the ith group. data_frame_current_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]] )] # Dropping columns that characterize group. Only feature columns will remain. data_frame_current_group = data_frame_current_group.drop( [args.group, args.order], 1 ).transpose() # Pulling indexes list from the current group. indexes_list = data_frame_current_group.index.tolist() # Creating array of means for the current group that will be filled. means_value = [0] * number_of_features for j in range(0, number_of_features ): series_current = data_frame_current_group.loc[ indexes_list[j] ] means_value[j] = series_current.mean() # Adding current mean_value column to the data frame and assigning the name. means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i] summary_df[means_value_column_name_current] = means_value # Performing paired t-test for the two groups and saving the results. # Creating p_values and flag_values emply list of length number_of_features. # This will be used for thw two groups in paired t-test. p_value = [0] * number_of_features t_value = [0] * number_of_features flag_value_0p01 = [0] * number_of_features flag_value_0p05 = [0] * number_of_features flag_value_0p10 = [0] * number_of_features neg_log10_p_value = [0] * number_of_features difference_value = [0] * number_of_features # Performing paired t-test for each pair of features. for j in range(0, number_of_features ): # Extracting the pieces of the data frame that belong to 1st group. data_frame_first_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[0]] )] data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[1]] )] # Sorting data frame by args.group index # This will ensure datasets are aligned by pair when fed to the t-test. data_frame_first_group = data_frame_first_group.sort(args.order) data_frame_second_group = data_frame_second_group.sort(args.order) # Sorting data frame by args.group index data_frame_first_group = data_frame_first_group.drop( [args.group,args.order], 1 ).transpose() data_frame_second_group = data_frame_second_group.drop( [args.group,args.order], 1 ).transpose() # Pulling list of indexes. This is the same list for the first and for the second. 
indexes_list = data_frame_first_group.index.tolist() # Pullinng the samples out series_first = data_frame_first_group.loc[ indexes_list[j] ] series_second = data_frame_second_group.loc[ indexes_list[j] ] # Running t-test for the two given samples paired_ttest_args = [series_first, series_second] p_value[j] = ttest_rel( *paired_ttest_args )[1] t_value[j] = ttest_rel( *paired_ttest_args )[0] neg_log10_p_value[j] = - np.log10(p_value[j]) difference_value[j] = series_first.mean() - series_second.mean() if p_value[j] < 0.01: flag_value_0p01[j] = 1 if p_value[j] < 0.05: flag_value_0p05[j] = 1 if p_value[j] < 0.10: flag_value_0p10[j] = 1 # The loop over features has to be finished by now. Converting them into the data frame. # Creating column names for the data frame. p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] difference_value_column_name_current = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01' flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05' flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10' summary_df[t_value_column_name_current] = t_value summary_df[p_value_column_name_current] = p_value summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value summary_df[difference_value_column_name_current] = difference_value flag_df = pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list ) flag_df[flag_value_column_name_current_0p05] = flag_value_0p05 flag_df[flag_value_column_name_current_0p10] = flag_value_0p10 # Roundign the results up to 4 precision digits. summary_df = summary_df.apply(lambda x: x.round(4)) # Adding name for the unique ID column that was there oroginally. summary_df.index.name = args.uniqueID flag_df.index.name = args.uniqueID # Save summary_df to the ouptut summary_df.to_csv(args.summaries, sep="\t") # Save flag_df to the output flag_df.to_csv(args.flags, sep="\t") # Generating Indexing for volcano plots. # Getting data for lpvals lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \ if col.startswith("neg_log10_p_value")} # Gettign data for diffs difs = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \ if col.startswith("diff_of_")} # The cutoff value for significance. 
cutoff=2 # Making volcano plots with PdfPages( args.volcano ) as pdf: for i in range(0, number_of_groups_pairwise ): # Set Up Figure volcanoPlot = figureHandler(proj="2d") groups_subset = groups_pairwise[i] current_key = groups_subset[0] + '_' + groups_subset[1] # Plot all results scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), colorList=list('b'), ax=volcanoPlot.ax[0]) # Color results beyond treshold red cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff] if not cutLpvals.empty: cutDiff = difs[current_key][cutLpvals.index] scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), colorList=list('r'), ax=volcanoPlot.ax[0]) # Drawing cutoffs lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0]) # Format axis (volcanoPlot) volcanoPlot.formatAxis(axTitle=current_key, grid=False, yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key), xTitle="Difference of treatment means for {0}".format(current_key)) # Add figure to PDF volcanoPlot.addToPdf(pdfPages=pdf) # Informing that the volcano plots are done logger.info(u"Pairwise volcano plots have been created.") # Ending script logger.info(u"Finishing running of t-test.")
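# NOTE: Illustrative sketch, not part of the original module.  Per feature, the
# t-test main() above reduces to a scipy call plus p-value flagging; this
# condensed sketch shows that core for both scenarios (unpaired two-sample and
# paired), leaving out the summary/means bookkeeping.
def _sketch_feature_ttest(series_first, series_second, paired=False):
    """Return t, p, -log10(p), mean difference and significance flags for one feature."""
    import numpy as np
    from scipy.stats import ttest_ind, ttest_rel

    test = ttest_rel if paired else ttest_ind
    t, p = test(series_first, series_second)
    return {
        "t_value": t,
        "p_value": p,
        "neg_log10_p_value": -np.log10(p),
        "diff": series_first.mean() - series_second.mean(),
        "flags": {alpha: int(p < alpha) for alpha in (0.01, 0.05, 0.10)},
    }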
def plotDistances(df_distance, palette, plotType, disType, cutoff, p, pdf):
    # Getting number of samples in dataframe (ns stands for number of samples)
    ns = len(df_distance.index)

    # Calculate the width of the figure based on the number of samples
    figWidth = max(ns / 2, 16)

    # Keeping the order on the colors
    df_distance["colors"] = palette.design["colors"]

    # Create figure object with a single axis
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))

    # Getting type of distance file
    if "distance_to_mean" in df_distance.columns:
        dataType = "to the mean"
    else:
        dataType = "pairwise"

    # Getting type of distance header
    if disType == "Mahalanobis":
        distType1 = "Penalized"
        distType2 = disType
    else:
        distType1 = "Standardized"
        distType2 = disType

    # Adds figure title and x axis limits, and sets the xticks
    figure.formatAxis(figTitle="{0} for {1} {2} Distance for {3} {4}".format(
                          plotType, distType1, distType2, df_distance.name, dataType),
                      yTitle="{0} {1} Distance".format(distType1, distType2),
                      xTitle="Index", ylim="ignore", xlim=(-0.5, -0.5 + ns),
                      xticks=df_distance.index)

    # If distance to mean
    if dataType == "to the mean":
        # Plot scatterplot quickplot
        scatter.scatter2D(ax=figure.ax[0], colorList=df_distance["colors"],
                          x=range(len(df_distance.index)),
                          y=df_distance["distance_to_mean"])
    # If pairwise
    else:
        if plotType == "Scatterplot":
            # Plot scatterplot
            for index in df_distance.index:
                scatter.scatter2D(ax=figure.ax[0],
                                  colorList=df_distance["colors"][index],
                                  x=range(len(df_distance.index)),
                                  y=df_distance[index])
        elif plotType == "Box-plots":
            # Plot box plot
            box.boxDF(ax=figure.ax[0], colors=df_distance["colors"], dat=df_distance)

    # Shrink figure
    figure.shrink()

    # Plot legend
    figure.makeLegend(figure.ax[0], palette.ugColors, palette.combName)

    # Add a cutoff line
    cutoff.apply(lambda x: plotCutoffs(x, ax=figure.ax[0], p=p), axis=0)

    # Add figure to PDF and close the figure afterwards
    figure.addToPdf(pdf)

    # Drop "colors" column so it does not pollute the results
    df_distance.drop("colors", axis=1, inplace=True)
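# NOTE: Illustrative sketch, not part of the original module.  plotDistances
# only draws distances that were computed upstream.  For context, this is one
# plausible way a "distance_to_mean" value could be obtained per sample
# (standardized Euclidean distance of each sample to the feature-wise mean);
# the actual upstream computation (including the penalized Mahalanobis variant)
# may differ.
def _sketch_distance_to_mean(wide):
    """Return a Series of standardized Euclidean distances, one per sample (column)."""
    import numpy as np

    centered = wide.sub(wide.mean(axis=1), axis=0)            # center each feature
    scaled = centered.div(wide.std(axis=1, ddof=1), axis=0)   # scale by feature SD
    dist = np.sqrt((scaled ** 2).sum(axis=0))                 # one value per sample
    dist.name = "distance_to_mean"
    return dist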
def iterateCombo(dat, combo, pdf):
    """
    A function that generates all plots and flags for one pairwise combination
    of samples.

    :Arguments:
        :type dat: interface.wideToDesign
        :param dat: A wideToDesign object containing wide and design information.

        :param tuple combo: A tuple of pairwise combination for current sample.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Updates:
        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Returns:
        :rtype flag: interface.Flags
        :param flag: A Flags object with outlier flags.
    """
    # Current combination
    c1 = combo[0]
    c2 = combo[1]

    # Set up figure with 2 subplots
    fh = figureHandler(proj='2d', numAx=2, numRow=2, numCol=2,
                       arrangement=[(0, 0, 1, 2), (0, 1, 1, 2)])

    # Scatter plot of c1 vs c2
    makeScatter(dat.wide.loc[:, c1], dat.wide.loc[:, c2], fh.ax[0], fh)

    # BA plot of c1 vs c2
    outlier, pearson, cooks, dffits = makeBA(dat.wide.loc[:, c1], dat.wide.loc[:, c2],
                                             fh.ax[1], fh)

    # Build plot title
    title = buildTitle(dat, c1, c2)

    # Add plot title to the figure
    fh.formatAxis(figTitle=title)

    # Establishing a tight layout for the figure
    plt.tight_layout(pad=2, w_pad=.05)

    # Shrinking figure
    fh.shrink(top=.85, bottom=.25, left=.15, right=.9)

    # Output figure to pdf
    fh.addToPdf(dpi=90, pdfPages=pdf)

    # Create flags
    flag = Flags(index=dat.wide.index)
    flag.addColumn(column='flag_{0}_{1}'.format(c1, c2), mask=outlier)
    flag.addColumn(column='flag_pearson_{0}_{1}'.format(c1, c2), mask=pearson)
    flag.addColumn(column='flag_cooks_{0}_{1}'.format(c1, c2), mask=cooks)
    flag.addColumn(column='flag_dffits_{0}_{1}'.format(c1, c2), mask=dffits)

    return flag.df_flags
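# NOTE: Illustrative sketch, not part of the original module.  makeBA above is a
# project-specific helper that draws a Bland-Altman comparison of two samples
# and returns outlier masks.  The basic plot it builds on can be reproduced with
# plain matplotlib: the mean of the two samples on x, their difference on y,
# with the mean difference and +/- 1.96 SD limits of agreement.
def _sketch_bland_altman(sample1, sample2, ax):
    import numpy as np

    s1 = np.asarray(sample1, dtype=float)
    s2 = np.asarray(sample2, dtype=float)
    mean = (s1 + s2) / 2.0
    diff = s1 - s2

    ax.scatter(mean, diff, s=10, color='b')
    ax.axhline(np.nanmean(diff), color='k')
    ax.axhline(np.nanmean(diff) + 1.96 * np.nanstd(diff), color='r', linestyle='--')
    ax.axhline(np.nanmean(diff) - 1.96 * np.nanstd(diff), color='r', linestyle='--')
    ax.set_xlabel("Mean of the two samples")
    ax.set_ylabel("Difference")
    return ax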
def plotFlagDist(propSample, propFeature, pdf):
    """
    Plot the distribution of the proportion of samples and features that were
    flagged as outliers.

    :Arguments:
        :type propSample: pandas.DataFrame
        :param propSample: Data frame of the proportion of samples flagged as
            an outlier.

        :type propFeature: pandas.DataFrame
        :param propFeature: Data frame of the proportion of features flagged as
            an outlier.

        :type pdf: string
        :param pdf: Filename of pdf to save plots.

    :Returns:
        :rtype: matplotlib.backends.backend_pdf.PdfPages
        :returns: Saves two bar plots to pdf.
    """
    # Sort samples
    propSample.sort_values(inplace=True, ascending=False)

    # Sort compounds
    propFeature.sort_values(inplace=True, ascending=False)

    # Make plots
    ## Open pdf for plotting
    ppFlag = PdfPages(pdf)

    ## Plot samples
    # Open figure handler instance
    fh = figureHandler(proj='2d')
    keys = list(propSample.head(30).keys())

    # Plotting quickBar
    bar.quickBar(ax=fh.ax[0], y=list(propSample.head(30).get_values()), x=keys)

    # Formatting axis
    fh.formatAxis(xlim=(0, len(keys) + 1), ylim="ignore", xTitle="Sample ID",
                  yTitle="Proportion of features that were outliers.")

    # Save figure in PDF
    ppFlag.savefig(fh.fig, bbox_inches='tight')

    ## Plot features
    # Open figure handler instance
    fh = figureHandler(proj='2d')
    keys = list(propFeature.head(30).keys())

    # Plot bar plot
    bar.quickBar(ax=fh.ax[0], y=list(propFeature.head(30).get_values()), x=keys)

    # Format axis
    fh.formatAxis(xlim=(0, len(keys) + 1), ylim="ignore", xTitle="Feature ID",
                  yTitle="Proportion of samples that a feature was an outlier.")

    # Save figure in PDF
    ppFlag.savefig(fh.fig, bbox_inches="tight")

    ## Close pdf
    ppFlag.close()
def main(args):
    # If the user provides a grouping variable we test each group against the null
    # (mu supplied by the user, 0 is the default).
    if args.group != False:
        logger.info(u"t-test will be performed for all groups saved in [{0}] variable "
                    u"in the design file against the null hypothesis H_0: mu = {1}.".format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID, group=args.group, logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning from the missing data.
        dat.dropMissing()

        # Getting the unique group values so that we can feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # Extracting number of features. We subtract 1 since args.group adds one
        # extra (non-feature) column to the data frame.
        number_of_features = data_frame.shape[1] - 1

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()

            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running single sample t-test for all groups.
        # We are also computing means for each group and outputting them.
        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating lists (filled with zeros) for the means, differences, p-values,
            # t-values, -log10(p-values) and flags of the current group.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing one sample t-test
                ttest_1samp_args = [series_current, float(args.mu)]
                p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
                t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                if p_value[j] < 0.01:
                    flag_value_0p01[j] = 1
                if p_value[j] < 0.05:
                    flag_value_0p05[j] = 1
                if p_value[j] < 0.10:
                    flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[i] + '_' + args.mu

            # Adding flag_value column to the data frame and assigning the name.
            # If the flags data frame has not been created yet we create it on the fly, i.e. if i == 0 create it.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01, columns=[flag_value_column_name_current_0p01], index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point both data frames (summary and flags) exist so only columns are added.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable we test the entire dataset as a
    # single group against the null (mu supplied by the user, 0 is the default).
    if args.group == False:
        logger.info(u"t-test will be performed for the entire dataset since a grouping "
                    u"variable was not provided.")

        # Loading data through the interface
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID, logger=logger)

        # Treat everything as numeric
        dat.wide = dat.wide.applymap(float)

        # Cleaning from missing data
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since we did not feed any grouping variable it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()

        # Extracting number of features. We do not subtract 1 since args.group was not provided.
        number_of_features = data_frame.shape[1]

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        # Creating lists (filled with zeros) for the p-values, t-values,
        # -log10(p-values), differences and flags.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        for j in range(0, number_of_features):
            # We transpose here so data will be easier to operate on.
            data_frame_manipulate_transpose = data_frame.transpose()

            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

            # Performing one sample t-test for the entire dataset.
            ttest_1samp_args = [data_frame_manipulate_transpose.loc[indexes_list_complete[j]], float(args.mu)]
            p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
            t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding result columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01, columns=[flag_value_column_name_current_0p01], index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")

    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]: summary_df[col] for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs = {col.split("_of_")[-1]: summary_df[col] for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set up figure
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable is provided there is a single "all" comparison.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                                   yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                                   xTitle="Difference of the means from H0 for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finished running the t-test.")
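# --- Example (not part of the original module) -----------------------------
# A minimal, self-contained sketch of the per-feature computation performed in
# main() above: every feature (row) of one group is tested against H0: mu = mu0
# with a one-sample t-test, and the -log10(p-value) used for the volcano plot
# is derived from it. The toy data, mu0 and helper name are made up for
# illustration only.
def _example_single_group_ttest(mu0=0.0):
    import numpy as np
    import pandas as pd
    from scipy.stats import ttest_1samp

    # Hypothetical wide data: rows are features, columns are samples of one group.
    rng = np.random.default_rng(1)
    wide = pd.DataFrame(rng.normal(loc=0.5, scale=1.0, size=(5, 8)),
                        index=["feat_{0}".format(i) for i in range(5)])

    # One test per feature, as in the j-loop above (here vectorized over rows).
    t_stat, p_val = ttest_1samp(wide, popmean=mu0, axis=1)

    results = pd.DataFrame({
        "mean": wide.mean(axis=1),
        "diff_from_mu0": wide.mean(axis=1) - mu0,
        "t_value": t_stat,
        "p_value": p_val,
        "neg_log10_p_value": -np.log10(p_val),
    })

    # Flags mirror the 0.01 / 0.05 / 0.10 thresholds used above.
    for alpha in (0.01, 0.05, 0.10):
        results["flag_{0}".format(alpha)] = (results["p_value"] < alpha).astype(int)
    return results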
def main(args):
    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group=args.group, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Unpaired permutation t-test. Any number of groups is allowed; the order
    # variable is ignored and t-tests are performed pairwise for each pair of groups.
    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Getting the unique group values and all pairwise combinations to feed to the pairwise unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()

    # Extracting number of features. We subtract 1 since the grouping variable adds
    # one extra (non-feature) column; the order variable is not used in the unpaired test.
    number_of_features = data_frame.shape[1] - 1

    # Computing overall summaries (mean and variance).
    # This part just produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features

    for j in range(0, number_of_features):
        # Creating duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()

        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

    # Creating the table and putting the results there.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all

    # Computing means for each group and outputting them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups):
        # Extracting the pieces of the data frame that belong to the ith group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()

        # Pulling indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Creating array of means for the current group that will be filled.
        means_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        # Adding current mean_value column to the data frame and assigning the name.
        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
        summary_df[means_value_column_name_current] = means_value

    # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise):
        # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin([groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin([groups_subset[1]])]

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_first_group = data_frame_first_group.drop(args.group, axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop(args.group, axis=1).transpose()

        # Pulling indexes list from the first one (they are the same).
        indexes_list = data_frame_first_group.index.tolist()

        # Creating p_value, t_value, neg_log10_p_value, flag_value and difference_value lists filled with zeros.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Performing the permutation two-sample test once and pulling the p-value
            # and the observed t statistic from the same call.
            two_sample_result = two_sample(series_first, series_second, reps=int(args.reps),
                                           stat='t', alternative='two-sided', seed=None)
            p_value[j] = two_sample_result[0]
            t_value[j] = two_sample_result[1]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # Creating column names for the data frame.
        p_value_column_name_current = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

        # Adding current p_value and flag_value columns to the data frame and assigning the names.
        # If the flags data frame has not been created yet we create it on the fly, i.e. if i == 0 create it.
        if i == 0:
            flag_df = pd.DataFrame(data=flag_value_0p01, columns=[flag_value_column_name_current_0p01], index=indexes_list)
        else:
            flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

        # At this point the data frame exists so only columns are added to it.
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")

    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]: summary_df[col] for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs = {col.split("_of_")[-1]: summary_df[col] for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set up figure
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                                   yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                                   xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
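# --- Example (not part of the original module) -----------------------------
# A minimal sketch of what the permutation two-sample test above is doing,
# written with plain numpy instead of permute.core.two_sample so it is
# self-contained. Group labels are shuffled, the difference of means is
# recomputed each time, and the two-sided permutation p-value is reported
# together with the observed t statistic. The inputs and helper name are
# hypothetical.
def _example_permutation_two_sample(reps=1000, seed=2):
    import numpy as np
    from scipy.stats import ttest_ind

    rng = np.random.default_rng(seed)
    first = rng.normal(loc=0.0, scale=1.0, size=10)
    second = rng.normal(loc=0.8, scale=1.0, size=12)

    # Observed statistics on the original labeling.
    observed_diff = first.mean() - second.mean()
    observed_t = ttest_ind(first, second)[0]

    # Permutation loop: shuffle the pooled values and re-split into two groups.
    pooled = np.concatenate([first, second])
    count = 0
    for _ in range(reps):
        rng.shuffle(pooled)
        perm_diff = pooled[:first.size].mean() - pooled[first.size:].mean()
        if abs(perm_diff) >= abs(observed_diff):
            count += 1

    p_value = count / reps
    return p_value, observed_t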
def volcano(combo, results, oname, cutoff=2):
    """
    Plot volcano plots.

    Creates volcano plots to compare means, for all pairwise differences.

    :Arguments:
        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible pairwise
            combinations. Used to create the various column headers in the
            results table.

        :type results: pandas DataFrame
        :param results: Results table containing a "diff_of_<key>" and a
            "-log10_p-value_<key>" column for every pairwise comparison.

        :type oname: string
        :param oname: Name of the output file in pdf format.

        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: None
        :returns: Outputs a pdf file containing all plots.
    """
    # Getting data for lpvals
    lpvals = {col.split("_")[-1]: results[col] for col in results.columns.tolist()
              if col.startswith("-log10_p-value_")}

    # Getting data for diffs
    difs = {col.split("_")[-1]: results[col] for col in results.columns.tolist()
            if col.startswith("diff_of")}

    # Making plots
    with PdfPages(oname) as pdf:
        for key in sorted(difs.keys()):
            # Set up figure
            volcanoPlot = figureHandler(proj="2d")

            # Plot all results
            scatter.scatter2D(x=list(difs[key]), y=list(lpvals[key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[key][lpvals[key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=key, grid=False,
                                   yTitle="-log10(p-value) for Diff of treatment = {0}".format(key),
                                   xTitle="Diff of treatment = {0}".format(key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)
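# --- Example (not part of the original module) -----------------------------
# A minimal sketch of the column-naming convention that volcano() relies on:
# every comparison key appears once in a "-log10_p-value_<key>" column and once
# in a "diff_of_<key>" column, and the dictionary comprehensions above re-key
# both by the trailing comparison name. The toy table and helper name below
# are made up.
def _example_volcano_columns():
    import numpy as np
    import pandas as pd

    results = pd.DataFrame({
        "diff_of_A-B": np.array([0.1, -1.5, 2.3]),
        "-log10_p-value_A-B": np.array([0.2, 3.1, 4.0]),
        "diff_of_A-C": np.array([-0.4, 0.9, 1.1]),
        "-log10_p-value_A-C": np.array([0.5, 1.8, 2.6]),
    }, index=["feat_1", "feat_2", "feat_3"])

    lpvals = {col.split("_")[-1]: results[col] for col in results.columns
              if col.startswith("-log10_p-value_")}
    difs = {col.split("_")[-1]: results[col] for col in results.columns
            if col.startswith("diff_of")}

    # Both dictionaries end up keyed by the comparison name, e.g. "A-B".
    assert sorted(lpvals) == sorted(difs) == ["A-B", "A-C"]
    return lpvals, difs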
def plotSignificantROR(data, pdf, palette):
    """
    Plot a scatter plot of the fitted run-order regression for every feature
    with a significant slope (p-value <= 0.05).

    :Arguments:
        :type data: pandas DataFrame
        :param data: Table with one row per feature containing the regression
            results (x, y, fitted values, residual object, slope, p-value, R^2).

        :type pdf: PdfPages
        :param pdf: pdf object to store scatterplots.

        :type palette: palette object
        :param palette: Object containing the colors and group information used
            for plotting.
    """
    # Iterates over all rows in the dataframe.
    # A scatter plot is made only if the p-value is less than or equal to 0.05.
    for index, row in data.iterrows():
        if row["pval"] > 0.05:
            continue

        # Get 95% CI
        prstd, lower, upper = wls_prediction_std(row["res"])

        # Sort CIs for plotting
        toPlot = pd.DataFrame({"x": row["x"], "lower": lower, "upper": upper})
        toPlot.sort_values(by="x", inplace=True)

        # Create plot
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Plot scatterplot
        scatter.scatter2D(ax=fh.ax[0], x=row["x"], y=row["y"], colorList=palette.list_colors)

        # Plot fitted line and prediction bands
        lines.drawCutoff(ax=fh.ax[0], x=row["x"], y=row["fitted"], c="c")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["lower"], c="r")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["upper"], c="r")

        # Formatting
        ymin, ymax = fh.ax[0].get_ylim()
        fh.formatAxis(xTitle="Run Order", yTitle="Value", ylim=(ymin, ymax * 1.2),
                      figTitle=u"{} Scatter plot (fitted regression line and prediction bands"
                               " included)".format(row["name"]))

        # Shrink figure
        fh.shrink()

        # Add legend to figure
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

        # Add text to the ax
        fh.ax[0].text(.7, .85,
                      u"Slope= {0:.4f}\n(p-value = {1:.4f})\n$R^2$ = {2:4f}".format(
                          round(row["slope"], 4), round(row["pval"], 4), round(row["rsq"], 4)),
                      transform=fh.ax[0].transAxes, fontsize=12)

        # Save to PDF
        fh.addToPdf(pdf)
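# --- Example (not part of the original module) -----------------------------
# A minimal sketch of how one row consumed by plotSignificantROR could be
# produced: fit an ordinary least squares regression of a feature's values
# against run order, then derive the 95% prediction bands with
# wls_prediction_std, the same helper used above. The toy data and helper
# name are made up for illustration.
def _example_run_order_regression():
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    rng = np.random.default_rng(3)
    run_order = np.arange(1, 21)
    values = 0.05 * run_order + rng.normal(scale=0.2, size=run_order.size)

    # Fit value ~ run order with an intercept.
    model = sm.OLS(values, sm.add_constant(run_order)).fit()
    prstd, lower, upper = wls_prediction_std(model)

    row = pd.Series({
        "name": "example_feature",
        "x": run_order,
        "y": values,
        "fitted": model.fittedvalues,
        "res": model,                  # fitted results object fed to wls_prediction_std above
        "slope": model.params[1],
        "pval": model.pvalues[1],
        "rsq": model.rsquared,
    })
    return row, lower, upper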
def makePlots(SEDData, design, pdf, groupName, cutoff, p, plotType, ugColors, levels):
    """
    Manage all the plots for this script.

    :Arguments:
        :type SEDData: pandas.dataFrame
        :param SEDData: Contains SED data, either to the mean or pairwise.

        :type design: pandas.dataFrame
        :param design: Design file after getColor.

        :type pdf: PDF object
        :param pdf: PDF for output plots.

        :type groupName: string
        :param groupName: Name of the group (figure title).

        :type cutoff: pandas.dataFrame
        :param cutoff: Cutoff values, beta, chi-sqr and normal.

        :type p: float
        :param p: Percentile for cutoff.

        :type plotType: string
        :param plotType: Type of plot; the possible types are scatterplot to
            mean, scatterplot pairwise and boxplot pairwise.
    """
    # Getting number of features in dataframe
    nFeatures = len(SEDData.index)

    # Calculate the width of the figure based on the number of features
    figWidth = max(nFeatures / 2, 16)

    # Create figure object with a single axis
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))

    # Keeping the order on the colors
    SEDData["colors"] = design["colors"]

    # Choose type of plot
    # Plot scatterplot to mean
    if plotType == "scatterToMean":
        # Add figure title, x axis limits and set the xticks
        figure.formatAxis(
            figTitle="Standardized Euclidean Distance from samples {} to the mean".format(groupName),
            xlim=(-0.5, -0.5 + nFeatures), ylim="ignore",
            xticks=SEDData.index.values, xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        # Plot scatterplot quickplot
        scatter.scatter2D(ax=figure.ax[0], colorList=SEDData["colors"],
                          x=range(len(SEDData.index)), y=SEDData["SED_to_Mean"])

    # Plot scatterplot pairwise
    elif plotType == "scatterPairwise":
        # Add figure title, x axis limits and set the xticks
        figure.formatAxis(
            figTitle="Pairwise standardized Euclidean Distance from samples {}".format(groupName),
            xlim=(-0.5, -0.5 + nFeatures), ylim="ignore",
            xticks=SEDData.index.values, xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        # Plot scatterplot
        for index in SEDData.index.values:
            scatter.scatter2D(ax=figure.ax[0], colorList=design["colors"][index],
                              x=range(len(SEDData.index)), y=SEDData[index])

    # Plot boxplot pairwise
    elif plotType == "boxplotPairwise":
        # Add figure title, x axis limits and set the xticks
        figure.formatAxis(
            figTitle="Box-plots for pairwise standardized Euclidean Distance from samples {}".format(groupName),
            xlim=(-0.5, -0.5 + nFeatures), ylim="ignore",
            xticks=SEDData.index.values, xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        # Plot box plot
        box.boxDF(ax=figure.ax[0], colors=SEDData["colors"].values, dat=SEDData)

    # Add a cutoff line
    cutoff.apply(lambda x: plotCutoffs(x, ax=figure.ax[0], p=p), axis=0)

    # Shrink figure to make room for the legend
    figure.shrink()

    # Plot legend
    figure.makeLegend(figure.ax[0], ugColors, levels)

    # Add figure to PDF and close the figure afterwards
    figure.addToPdf(pdf)
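# --- Example (not part of the original module) -----------------------------
# A minimal sketch of the quantity plotted by the "scatterToMean" branch above:
# the standardized Euclidean distance from every sample to the mean sample,
# i.e. a Euclidean distance in which each feature is scaled by its variance.
# The toy data, column names and helper name here are hypothetical.
def _example_sed_to_mean():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(4)
    # Hypothetical wide data: rows are features, columns are samples.
    wide = pd.DataFrame(rng.normal(size=(50, 8)),
                        columns=["sample_{0}".format(i) for i in range(8)])

    mean_profile = wide.mean(axis=1)
    feature_var = wide.var(axis=1, ddof=1)

    # SED of sample s to the mean: sqrt( sum_f (x_fs - mean_f)^2 / var_f )
    sed_to_mean = np.sqrt(((wide.sub(mean_profile, axis=0) ** 2)
                           .div(feature_var, axis=0)).sum(axis=0))

    # One value per sample, matching the SEDData["SED_to_Mean"] column used above.
    return pd.DataFrame({"SED_to_Mean": sed_to_mean})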