def plotCutoffs(cut_S, ax, p):
    """ Draw a cutoff line on the given plot.

    :Arguments:
        :type cut_S: pandas.Series
        :param cut_S: contains a cutoff value, name and color

        :type ax: matplotlib.axes._subplots.AxesSubplot
        :param ax: Axis on which the cutoff line is drawn.

        :type p: float
        :param p: percentile of cutoff
    """
    lines.drawCutoffHoriz(ax=ax,
                          y=float(cut_S.values[0]),
                          cl=cutPalette.ugColors[cut_S.name],
                          lb="{0} {1}% Threshold: {2}".format(
                              cut_S.name,
                              round(p * 100, 3),
                              round(float(cut_S.values[0]), 1)),
                          ls="--",
                          lw=2)
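# A minimal, hypothetical usage sketch (not part of the original module):
# plotCutoffs expects a one-element Series whose .name selects the color from
# cutPalette and whose single value is the cutoff height. Plain matplotlib's
# axhline stands in here for the project-specific lines.drawCutoffHoriz; the
# Series name, value and percentile below are made up for illustration.
def _demo_cutoff_line():
    import pandas as pd
    import matplotlib.pyplot as plt

    cut_S = pd.Series([3.5], name="cutoff")  # hypothetical cutoff Series
    fig, ax = plt.subplots()
    ax.axhline(y=float(cut_S.values[0]), ls="--", lw=2,
               label="{0} {1}% Threshold: {2}".format(
                   cut_S.name, 95.0, round(float(cut_S.values[0]), 1)))
    ax.legend()
    return fig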
def qqPlot(tresid, tfit, oname):
    """ Plot the residual diagnostic plots by sample.

    Output q-q plot, boxplots and distributions of the residuals. These plots
    are used to diagnose whether the residuals are approximately normal.

    :Arguments:
        :type tresid: pandas.DataFrame
        :param tresid: Pearson normalized residuals (transposed).
            (residuals / sqrt(MSE))

        :type tfit: pandas.DataFrame
        :param tfit: Output of the ANOVA (transposed).

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.
    """
    # Open the pdf.
    with PdfPages(oname) as pdf:
        # Establishing the axis layout.
        axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)]

        # Start plotting.
        for col in tresid.columns:
            # Creating the figure.
            fig = figureHandler(proj='2d', numAx=4, numRow=2, numCol=3,
                                arrangement=axisLayout)

            data = tresid[col].values.ravel()
            noColors = ['b'] * len(data)  # blue
            df_data = pd.DataFrame(data)

            # Removing missing values so the data plot correctly.
            mask_nan_data = np.isnan(data)
            data = data[~mask_nan_data]

            # Plot qqplot on axis 0.
            sm.graphics.qqplot(data, fit=True, line='r', ax=fig.ax[0])

            # Plot boxplot on axis 1.
            box.boxSeries(ser=data, ax=fig.ax[1])

            # Plot histogram on axis 2.
            hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal')

            # Plot scatterplot on axis 3.
            scatter.scatter2D(ax=fig.ax[3], x=tfit[col], y=tresid[col],
                              colorList=list('b'))

            # Draw cutoff line for scatterplot on axis 3.
            lines.drawCutoffHoriz(ax=fig.ax[3], y=0)

            # Format axis 0.
            fig.formatAxis(figTitle=col, axnum=0, grid=False, showX=True,
                           yTitle="Sample Quantiles", xTitle=" ")

            # Format axis 1.
            fig.formatAxis(axnum=1, axTitle="Standardized Residuals",
                           grid=False, showX=False, showY=True, xTitle=" ")

            # Format axis 2.
            fig.formatAxis(axnum=2, grid=False, showX=True, showY=True,
                           axTitle=" ", xTitle=" ")

            # Format axis 3.
            fig.formatAxis(axnum=3,
                           axTitle="Predicted Values vs Residual Values",
                           xTitle="Predicted Values",
                           yTitle="Residual Values", grid=False)

            # Add figure to the pdf.
            fig.addToPdf(pdfPages=pdf)
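# A minimal, self-contained sketch of the q-q diagnostic step above, using
# only statsmodels and matplotlib on synthetic residuals (no figureHandler).
# The function name and output path are hypothetical.
def _demo_qqplot(path="qq_demo.pdf"):
    import numpy as np
    import statsmodels.api as sm
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages

    # Synthetic Pearson-style residuals.
    resid = np.random.RandomState(1).normal(size=200)

    with PdfPages(path) as pdf:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # Same call as in qqPlot: fitted q-q plot with a regression line.
        sm.graphics.qqplot(resid, fit=True, line='r', ax=ax)
        pdf.savefig(fig)
        plt.close(fig)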
def main(args):
    # Loading data through the Interface.
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, runOrder=args.order, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data.
    dat.dropMissing()

    # SCENARIO 1: Unpaired t-test. There can be any number of groups.
    # The order variable is ignored and t-tests are performed pairwise for
    # each pair of groups.
    if args.pairing == "unpaired":
        logger.info("Unpaired t-test will be performed for all groups pairwise.")

        # Getting the unique group values and all their pairwise combinations
        # so that we can feed them to pairwise unpaired t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of features. This depends on whether the user
        # has provided the ordering variable; it is unused for the unpaired
        # test and just adds an extra column to the data frame.
        if args.order == False:
            number_of_features = data_frame.shape[1] - 1
        else:
            number_of_features = data_frame.shape[1] - 2

        # Computing overall summaries (mean and variance).
        # This only produces summary statistics for the output table;
        # it is independent of the unpaired t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping the columns that characterize the group so only feature
            # columns remain, and transposing so it is easier to operate on.
            # We drop 1 or 2 columns depending on whether the order variable
            # was provided.
            if args.order == False:
                data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()
            else:
                data_frame_manipulate_transpose = data_frame_manipulate.drop([args.group, args.order], axis=1).transpose()

            # Pulling the list of indexes from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # Again, this only produces summary statistics for the output table.
        for i in range(0, number_of_unique_groups):
            # Extracting the piece of the data frame that belongs to the i-th group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Dropping the columns that characterize the group so only feature
            # columns remain, and transposing so it is easier to operate on.
            # We drop 1 or 2 columns depending on whether the order variable
            # was provided.
            if args.order == False:
                data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()
            else:
                data_frame_current_group = data_frame_current_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling the list of indexes from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating the array of means for the current group that will be filled.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding the current means column to the data frame under its name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Running the pairwise unpaired (two-sample) t-test for all pairs of
        # group levels saved in groups_pairwise.
        for i in range(0, number_of_groups_pairwise):
            # Extracting the pieces of the data frame that belong to the
            # groups in the i-th unique pair.
            groups_subset = groups_pairwise[i]
            data_frame_first_group = data_frame.loc[data_frame[args.group].isin([groups_subset[0]])]
            data_frame_second_group = data_frame.loc[data_frame[args.group].isin([groups_subset[1]])]

            # Dropping the group columns and transposing, as above.
            if args.order == False:
                data_frame_first_group = data_frame_first_group.drop(args.group, axis=1).transpose()
                data_frame_second_group = data_frame_second_group.drop(args.group, axis=1).transpose()
            else:
                data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
                data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling the list of indexes from the first group (both lists are the same).
            indexes_list = data_frame_first_group.index.tolist()

            # Creating p_value, t_value, neg_log10_p_value, flag and
            # difference lists filled with 0s.
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features
            difference_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_first = data_frame_first_group.loc[indexes_list[j]]
                series_second = data_frame_second_group.loc[indexes_list[j]]
                ttest_ind_args = [series_first, series_second]
                p_value[j] = ttest_ind(*ttest_ind_args)[1]
                t_value[j] = ttest_ind(*ttest_ind_args)[0]
                # Possible alternative for two groups:
                # p_value[j] = ttest_ind(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Creating column names for the data frame.
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
            difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

            # Adding the current p_value and flag_value columns to the data frames.
            # If the flag data frame has not been created yet (i.e. i == 0) we
            # create it on the fly.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01,
                                       columns=[flag_value_column_name_current_0p01],
                                       index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point the data frames exist, so columns are simply added to them.
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # SCENARIO 2: Paired t-test. There must be EXACTLY TWO groups, and each
    # sample in one group must have exactly one matching pair in the other
    # group. The matching is controlled by the args.order variable.
    if args.pairing == "paired":
        logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))

        # Getting the number of unique groups. If it is not 2, warn and exit.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        if number_of_unique_groups != 2:
            logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups))
            exit()

        # This code runs only if number_of_unique_groups is exactly 2, i.e.
        # the group check passed.
        # Creating the pairwise combination of our two groups for later use.
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # Checking that the required pairing variable has been provided.
        if args.order == False:
            logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
            exit()

        # This code runs only if args.order has been provided, i.e. the check
        # passed. Defining the number of features: the data frame dimension
        # minus the two columns that hold args.group and args.order.
        number_of_features = data_frame.shape[1] - 2

        # At this point it is confirmed that there are exactly 2 groups and
        # that the pairing variable args.order has been provided. Now we need
        # to check that the pairing is correct, i.e. that each pairID
        # corresponds to exactly two samples from different groups.
        # Getting the unique pairIDs; those without exactly two samples will be deleted.
        pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
        pairid_values_series_unique = pairid_values_series.unique()
        number_of_unique_pairid = pairid_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of samples in the final frame.
        number_of_samples = data_frame.shape[0]

        # Cleaning the original data: removing samples that are not paired or
        # that do not belong to the two groups.
        # If a pairID has 1 or 3+ matching samples, those samples are removed
        # with a warning. If a pairID has exactly two samples (correct) but
        # their group IDs are NOT different, those samples are also removed.
        for i in range(0, number_of_unique_pairid):
            # Extracting the piece of the data frame that belongs to the i-th unique pairID.
            data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin([pairid_values_series_unique[i]])]

            # Transposing so it is easier to operate on.
            data_frame_current_pairid = data_frame_current_pairid.transpose()
            sample_names_current_pairid = list(data_frame_current_pairid.columns.values)

            if data_frame_current_pairid.shape[1] != 2:
                logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.shape[1], sample_names_current_pairid))

                # Getting the indexes we need to delete.
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                # Deleting the indexes; the loop then continues to the next iteration.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

            # This branch runs when the count is correct (shape[1] == 2):
            # here we check whether the group IDs for the given pair are indeed different.
            elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:
                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.transpose()[args.group][1], data_frame_current_pairid.transpose()[args.group][0], sample_names_current_pairid))

                # Getting the indexes we need to delete.
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

        # Checking whether the data frame became empty after cleaning.
        if data_frame.shape[0] == 0:
            logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the design file for accuracy! Exiting the program.")
            exit()

        # Computing overall summaries (mean and variance).
        # This only produces summary statistics for the output table;
        # it is independent of the paired t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping the columns that characterize the group so only feature
            # columns remain, and transposing so it is easier to operate on.
            data_frame_manipulate_transpose = data_frame_manipulate.drop([args.group, args.order], axis=1).transpose()

            # Pulling the list of indexes from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # This only produces summary statistics for the output table;
        # it is independent of the paired t-test itself.
        for i in range(0, number_of_unique_groups):
            # Extracting the piece of the data frame that belongs to the i-th group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Dropping the columns that characterize the group. Only feature columns will remain.
            data_frame_current_group = data_frame_current_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling the list of indexes from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating the array of means for the current group that will be filled.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding the current means column to the data frame under its name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Performing the paired t-test for the two groups and saving the results.
        # Creating p_value, t_value and flag lists of length
        # number_of_features, filled with 0s. These are used for the two
        # groups in the paired t-test.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features

        # Performing the paired t-test for each feature.
        for j in range(0, number_of_features):
            # Extracting the pieces of the data frame that belong to the 1st and 2nd groups.
            data_frame_first_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[0]])]
            data_frame_second_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[1]])]

            # Sorting both data frames by the args.order variable.
            # This ensures the datasets are aligned by pair when fed to the
            # t-test. (sort_values replaces the long-deprecated DataFrame.sort.)
            data_frame_first_group = data_frame_first_group.sort_values(args.order)
            data_frame_second_group = data_frame_second_group.sort_values(args.order)

            # Dropping the group and order columns and transposing.
            data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
            data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling the list of indexes. It is the same for both groups.
            indexes_list = data_frame_first_group.index.tolist()

            # Pulling the samples out.
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Running the paired t-test for the two given samples.
            paired_ttest_args = [series_first, series_second]
            p_value[j] = ttest_rel(*paired_ttest_args)[1]
            t_value[j] = ttest_rel(*paired_ttest_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # The loop over features is finished; converting the results into data frames.
        # Creating column names for the data frame.
        p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        difference_value_column_name_current = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
        flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
        flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'

        summary_df[t_value_column_name_current] = t_value
        summary_df[p_value_column_name_current] = p_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Naming the unique ID index column as it was named originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Saving summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Saving flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Setting up the figure.
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plotting all results.
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Coloring results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Formatting the axis.
            volcanoPlot.formatAxis(
                axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Adding the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done.
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending the script.
    logger.info(u"Finishing running of t-test.")
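# A minimal, self-contained sketch of the pairwise unpaired t-test loop above,
# using only numpy/pandas/scipy on synthetic data. All names here (the
# _demo_* function, feature and sample labels) are hypothetical and not part
# of this tool; scipy's ttest_ind vectorizes over features via axis=1, in
# contrast to the per-feature loop used in main().
def _demo_pairwise_unpaired_ttest():
    import numpy as np
    import pandas as pd
    from itertools import combinations
    from scipy.stats import ttest_ind

    rng = np.random.RandomState(0)
    # Wide format: rows are features, columns are samples.
    wide = pd.DataFrame(rng.normal(size=(4, 9)),
                        index=["feat1", "feat2", "feat3", "feat4"],
                        columns=["s%d" % k for k in range(9)])
    # Three groups of three samples each.
    groups = pd.Series(["A"] * 3 + ["B"] * 3 + ["C"] * 3, index=wide.columns)

    for g1, g2 in combinations(groups.unique(), 2):
        first = wide.loc[:, groups[groups == g1].index]
        second = wide.loc[:, groups[groups == g2].index]
        t, p = ttest_ind(first, second, axis=1)
        print(g1, g2, np.round(p, 3))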
def main(args):
    # If the user provides a grouping variable we test each group against the
    # null hypothesis (mu supplied by the user; 0 is the default).
    if args.group != False:
        logger.info(u"t-test will be performed for all groups saved in [{0}] "
                    u"variable in the design file pairwise with the H_0: mu = {1}."
                    .format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID,
                           group=args.group, logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning from missing data.
        dat.dropMissing()

        # Getting the unique group values so that we can feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of features. We subtract 1 since args.group was provided.
        number_of_features = data_frame.shape[1] - 1

        # Computing overall summaries (mean and variance).
        # This only produces summary statistics for the output table;
        # it is independent of the single-sample t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping the columns that characterize the group so only feature
            # columns remain, and transposing so it is easier to operate on.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()

            # Pulling the list of indexes from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running the single-sample t-test for all groups.
        # We also compute the mean of each group and output it.
        for i in range(0, number_of_unique_groups):
            # Extracting the piece of the data frame that belongs to the i-th group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Dropping the columns that characterize the group so only feature
            # columns remain, and transposing so it is easier to operate on.
            data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()

            # Pulling the list of indexes from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating means, differences, p-values, t-values,
            # neg_log10_p_value and flag lists filled with 0s.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing the one-sample t-test.
                ttest_1samp_args = [series_current, float(args.mu)]
                p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
                t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding the
            # result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[i] + '_' + args.mu

            # Adding the flag_value column to the data frame under its name.
            # If the flag data frame has not been created yet (i.e. i == 0) we
            # create it on the fly.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01,
                                       columns=[flag_value_column_name_current_0p01],
                                       index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point both data frames (summary and flags) exist, so
            # columns are simply added to them.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable we test the entire
    # dataset as a single group against the null (mu supplied by the user; 0 is the default).
    if args.group == False:
        logger.info(u"t-test will be performed for the entire dataset since a "
                    u"grouping variable was not provided.")

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID, logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning from missing data.
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since no grouping variable was fed, it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Extracting the number of features. We do not subtract 1 since
        # args.group was not provided.
        number_of_features = data_frame.shape[1]

        # Computing overall summaries (mean and variance).
        # This only produces summary statistics for the output table;
        # it is independent of the single-sample t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        # Creating p-values, t-values, neg_log10_p_value, difference and flag
        # lists filled with 0s.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        for j in range(0, number_of_features):
            # We transpose here so the data will be easier to operate on.
            data_frame_manipulate_transpose = data_frame.transpose()

            # Pulling the list of indexes from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

            # Performing the one-sample t-test for the entire dataset.
            ttest_1samp_args = [data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                                float(args.mu)]
            p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
            t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding the
        # result columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Naming the unique ID index column as it was named originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Saving summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Saving flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Setting up the figure.
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable was provided there is a single key.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plotting all results.
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Coloring results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Formatting the axis.
            volcanoPlot.formatAxis(
                axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of the means from H0 for {0}".format(current_key))

            # Adding the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
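# A minimal, self-contained sketch of the one-sample test applied per feature
# above, using scipy on synthetic data. The function name and data are
# hypothetical; ttest_1samp returns (statistic, p-value), indexed as [0] and
# [1] in main().
def _demo_one_sample_ttest(mu=0.0):
    import numpy as np
    from scipy.stats import ttest_1samp

    rng = np.random.RandomState(2)
    feature = rng.normal(loc=0.5, size=10)  # one synthetic feature

    t, p = ttest_1samp(feature, mu)
    print("t = {0:.3f}, p = {1:.3f}, -log10(p) = {2:.3f}".format(
        t, p, -np.log10(p)))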
def volcano(combo, results, oname, cutoff=2):
    """ Plot volcano plots.

    Creates volcano plots to compare means, for all pairwise differences.

    :Arguments:
        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible pairwise
            combinations. Used to create the various column headers in the
            results table.

        :type results: pandas.DataFrame
        :param results: TODO

        :type oname: string
        :param oname: Name of the output file in pdf format.

        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.
    """
    # Getting data for lpvals.
    lpvals = {col.split("_")[-1]: results[col]
              for col in results.columns.tolist()
              if col.startswith("-log10_p-value_")}
    # Getting data for diffs.
    difs = {col.split("_")[-1]: results[col]
            for col in results.columns.tolist()
            if col.startswith("diff_of")}

    # Making plots.
    with PdfPages(oname) as pdf:
        for key in sorted(difs.keys()):
            # Setting up the figure.
            volcanoPlot = figureHandler(proj="2d")

            # Plotting all results.
            scatter.scatter2D(x=list(difs[key]), y=list(lpvals[key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Coloring results beyond the threshold red.
            cutLpvals = lpvals[key][lpvals[key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Formatting the axis.
            volcanoPlot.formatAxis(
                axTitle=key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment = {0}".format(key),
                xTitle="Diff of treatment = {0}".format(key))

            # Adding the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)
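# A small sketch of the column-name convention volcano() relies on: columns
# prefixed "-log10_p-value_" and "diff_of" are split on "_" and keyed by the
# trailing comparison label, so both dictionaries end up with matching keys.
# The results frame and its "A-B" label below are hypothetical.
def _demo_volcano_indexing():
    import pandas as pd

    results = pd.DataFrame({
        "-log10_p-value_A-B": [0.5, 3.1],
        "diff_of_A-B": [0.1, -2.0],
    })
    lpvals = {col.split("_")[-1]: results[col]
              for col in results.columns.tolist()
              if col.startswith("-log10_p-value_")}
    difs = {col.split("_")[-1]: results[col]
            for col in results.columns.tolist()
            if col.startswith("diff_of")}
    # Both dictionaries are keyed by the comparison label, here "A-B".
    assert set(lpvals.keys()) == set(difs.keys()) == {"A-B"}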
def main(args):
    # Loading data through the Interface.
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data.
    dat.dropMissing()

    # Unpaired permutation t-test. There can be any number of groups.
    # The order variable is ignored and tests are performed pairwise for each
    # pair of groups.
    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Getting the unique group values and all their pairwise combinations to
    # feed to the pairwise unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting the number of features. We subtract 1 for the args.group column.
    number_of_features = data_frame.shape[1] - 1

    # Computing overall summaries (mean and variance).
    # This only produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features

    for j in range(0, number_of_features):
        # Creating a duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping the columns that characterize the group so only feature
        # columns remain, and transposing so it is easier to operate on.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()

        # Pulling the list of indexes from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

    # Creating the table and putting the results there.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                              index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all

    # Computing means for each group and outputting them.
    # This only produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups):
        # Extracting the piece of the data frame that belongs to the i-th group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

        # Dropping the columns that characterize the group so only feature
        # columns remain, and transposing so it is easier to operate on.
        data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()

        # Pulling the list of indexes from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Creating the array of means for the current group that will be filled.
        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        # Adding the current means column to the data frame under its name.
        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
        summary_df[means_value_column_name_current] = means_value

    # Running the pairwise unpaired (two-sample) permutation t-test for all
    # pairs of group levels saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise):
        # Extracting the pieces of the data frame that belong to the groups in
        # the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin([groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin([groups_subset[1]])]

        # Dropping the group column and transposing, as above.
        data_frame_first_group = data_frame_first_group.drop(args.group, axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop(args.group, axis=1).transpose()

        # Pulling the list of indexes from the first group (both lists are the same).
        indexes_list = data_frame_first_group.index.tolist()

        # Creating p_value, t_value, neg_log10_p_value, flag and difference
        # lists filled with 0s.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # two_sample returns (p-value, test statistic); a single call
            # computes both from the same set of permutations.
            p_value[j], t_value[j] = two_sample(series_first, series_second,
                                                reps=int(args.reps), stat='t',
                                                alternative='two-sided',
                                                seed=None)
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Creating column names for the data frame.
        p_value_column_name_current = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

        # Adding the current p_value and flag_value columns to the data frames.
        # If the flag data frame has not been created yet (i.e. i == 0) we
        # create it on the fly.
        if i == 0:
            flag_df = pd.DataFrame(data=flag_value_0p01,
                                   columns=[flag_value_column_name_current_0p01],
                                   index=indexes_list)
        else:
            flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

        # At this point the data frame exists, so columns are simply added to it.
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Naming the unique ID index column as it was named originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Saving summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Saving flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Setting up the figure.
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plotting all results.
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Coloring results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Formatting the axis.
            volcanoPlot.formatAxis(
                axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Adding the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done.
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending the script.
    logger.info(u"Finishing t-test run.")
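# A minimal sketch of the permutation call used in the loop above, mirroring
# the two_sample signature as called in this module (assumed to be
# permute.core.two_sample, returning (p-value, test statistic)). The demo
# function name, sample sizes and seed are hypothetical.
def _demo_permutation_ttest(reps=1000):
    import numpy as np
    from permute.core import two_sample

    rng = np.random.RandomState(3)
    x = rng.normal(0.0, 1.0, size=8)  # synthetic group 1
    y = rng.normal(1.0, 1.0, size=8)  # synthetic group 2

    # Fixing the seed makes the permutation p-value reproducible.
    p, t = two_sample(x, y, reps=reps, stat='t',
                      alternative='two-sided', seed=42)
    print("p = {0:.4f}, t = {1:.4f}".format(p, t))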
def main(args):
    # Loading data through the Interface.
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data.
    dat.dropMissing()

    # Getting the unique group values and all their pairwise combinations so
    # that we can feed them to Kruskal-Wallis.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting the number of features.
    number_of_features = data_frame.shape[1] - 1

    # Running the overall Kruskal-Wallis test for all group levels combined.
    # Creating p_value_all and flag_value_all lists for 3 significance levels,
    # each of length number_of_features. These are used for all groups.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features

    for j in range(0, number_of_features):
        # Creating a duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping the columns that characterize the group so only feature
        # columns remain, and transposing so it is easier to operate on.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()

        # Pulling the list of indexes from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        for i in range(0, number_of_unique_groups):
            # Extracting the piece of the data frame that belongs to the i-th unique group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Dropping the columns that characterize the group so only feature
            # columns remain, and transposing so it is easier to operate on.
            data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()

            # Pulling the list of indexes from the current data frame.
            indexes_list = data_frame_current_group.index.tolist()

            # The current series for group i and feature j.
            series_current = data_frame_current_group.loc[indexes_list[j]]

            # Start a new list for the first group; append for the rest.
            if i == 0:
                series_total = [series_current]
            else:
                series_total.append(series_current)

        # Checking whether the compared elements are all identical.
        # Combining the values for the check.
        combined_list = data_frame_manipulate_transpose.loc[indexes_list_complete[j]].tolist()
        combined_list_unique = np.unique(combined_list)

        # Checking whether the number of unique elements is exactly 1;
        # if so, the Kruskal-Wallis statistic is undefined for feature j.
        if len(combined_list_unique) == 1:
p_value_all[j] = float("nan") H_value_all[j] = float("nan") if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1 if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1 if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1 else: # Performing Kruscal-Wallis for all groups for feature j. kruscal_wallis_args = series_total p_value_all[j] = kruskalwallis(*kruscal_wallis_args)[1] H_value_all[j] = kruskalwallis(*kruscal_wallis_args)[0] if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1 if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1 if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1 # The loop over features has to be finished by now. Converting them into the data frame. # The pariwise results will be added later. summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list) summary_df['SampleVariance'] = variance_value_all summary_df['H_value_for_all'] = H_value_all summary_df['prob_greater_than_H_for_all'] = p_value_all flag_df = pd.DataFrame(data=flag_value_all_0p01, columns=["flag_significant_0p01_on_all_groups"], index=indexes_list) flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05 flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10 # Informing that KW for all group has been performed. logger.info( u"Kruscal-Wallis test for all groups together has been performed.") # Computing means for each group # This part just produces sumamry statistics for the output table. # This has nothing to do with Kruscal-Wallis for i in range(0, number_of_unique_groups): # Extracting the pieces of the data frame that belong to ith group. data_frame_current_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]])] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. data_frame_current_group = data_frame_current_group.drop( args.group, 1).transpose() # Pulling indexes list from the current group. indexes_list = data_frame_current_group.index.tolist() # Creating array of means for the current group that will be filled. means_value = [0] * number_of_features for j in range(0, number_of_features): series_current = data_frame_current_group.loc[indexes_list[j]] means_value[j] = series_current.mean() means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[ i] summary_df[means_value_column_name_current] = means_value # Running pairwise Kruscall-Wallis test for all pairs of group levels that are saved in groups_pairwise. for i in range(0, number_of_groups_pairwise): # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair. groups_subset = groups_pairwise[i] data_frame_first_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]])] data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]])] # Dropping columns that characterize group. Only feature columns will remain. # We also trnaspose here so it will be easier to operate with. data_frame_first_group = data_frame_first_group.drop(args.group, 1).transpose() data_frame_second_group = data_frame_second_group.drop(args.group, 1).transpose() # Pulling indexes list from the first one (they are the same) indexes_list = data_frame_first_group.index.tolist() # Creating p_values, neg_log10_p_value, flag_values, difference_value lists filled wiht 0es. 
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Checking whether the compared elements are all identical.
            # Combining the values for the check.
            first_list = data_frame_first_group.loc[indexes_list[j]].tolist()
            second_list = data_frame_second_group.loc[indexes_list[j]].tolist()
            combined_list = first_list + second_list
            combined_list_unique = np.unique(combined_list)

            # Checking whether the number of unique elements is exactly 1;
            # if so, the test statistic is undefined and NaN is stored instead.
            if len(combined_list_unique) == 1:
                p_value[j] = float("nan")
                H_value[j] = float("nan")
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1
            else:
                kruskal_wallis_args = [series_first, series_second]
                p_value[j] = kruskalwallis(*kruskal_wallis_args)[1]
                H_value[j] = kruskalwallis(*kruskal_wallis_args)[0]
                # Possible alternative for two groups:
                # p_value[j] = kruskalwallis(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Adding the current p_value and flag_value columns to the data frames
        # under their names.
        p_value_column_name_current = 'prob_greater_than_H_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        H_value_column_name_current = 'H_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]

        summary_df[p_value_column_name_current] = p_value
        summary_df[H_value_column_name_current] = H_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

        flag_df[flag_value_column_name_current_0p01] = flag_value_0p01
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Naming the unique ID index column as it was named originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Saving summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Saving flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Informing that KW for pairwise groups has been performed.
logger.info( u"Kruscal-Wallis test for all groups pairwise has been performed.") # Generating Indexing for volcano plots. # Getting data for lpvals lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \ if col.startswith("neg_log10_p_value")} # Gettign data for diffs difs = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \ if col.startswith("diff_of_")} # The cutoff value for significance. cutoff = 2 # Making volcano plots with PdfPages(args.volcano) as pdf: for i in range(0, number_of_groups_pairwise): # Set Up Figure volcanoPlot = figureHandler(proj="2d") groups_subset = groups_pairwise[i] current_key = groups_subset[0] + '_' + groups_subset[1] # Plot all results scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), colorList=list('b'), ax=volcanoPlot.ax[0]) # Color results beyond treshold red cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff] if not cutLpvals.empty: cutDiff = difs[current_key][cutLpvals.index] scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), colorList=list('r'), ax=volcanoPlot.ax[0]) # Drawing cutoffs lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0]) # Format axis (volcanoPlot) volcanoPlot.formatAxis( axTitle=current_key, grid=False, yTitle="-log10(p-value) for Diff of treatment means for {0}". format(current_key), xTitle="Difference of treatment means for {0}".format( current_key)) # Add figure to PDF volcanoPlot.addToPdf(pdfPages=pdf) # Informing that the volcano plots are done logger.info(u"Pairwise volcano plots have been created.") # Ending script logger.info(u"Finishing running of Kruscal-Wallis tests.")