def plotScores(data, palette, pdf):
    """
    Add a 2D scatter plot to *pdf* for every pairwise combination of the
    columns in *data* — e.g. PC1 vs PC2, PC1 vs PC3, PC2 vs PC3 when the
    frame holds three principal components.

    :Arguments:
        :type data: pandas.core.frame.DataFrame
        :param data: Data frame with the scores to plot (one column per
            principal component).

        :type palette: colorHandler palette
        :param palette: Palette carrying the design colors
            (``palette.design.colors``), the unique group colors
            (``palette.ugColors``) and the combined group name
            (``palette.combName``).

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Open PDF object the generated figures are appended to.

    :Return:
        :rtype: None
        :return: Appends one scatter plot per pair of columns to *pdf*.
    """
    # combinations() is already iterable -- no need to materialize a list.
    for x, y in itertools.combinations(data.columns.tolist(), 2):
        # Creating a figure handler object
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Creating title for the figure
        title = "{0} vs {1}".format(x, y)

        # Creating the scatterplot 2D
        scatter.scatter2D(ax=fh.ax[0], x=list(data[x]), y=list(data[y]),
                          colorList=palette.design.colors.tolist())

        # Despine axis
        fh.despine(fh.ax[0])

        # Legend keyed by the unique group colors
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

        # Shrinking the plot so everything fits
        fh.shrink()

        # Format Axis
        fh.formatAxis(figTitle=title, xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y), grid=False)

        # Adding figure to pdf
        fh.addToPdf(dpi=90, pdfPages=pdf)
def main(args):
    """Draw a scatter plot of two columns of a wide file, colored by group.

    Reads the wide dataset (and an optional design file), plots
    ``args.x`` vs ``args.y``, colors the points by ``args.group`` when the
    unique identifier is ``"sampleID"``, and saves a single-page PDF to
    ``args.figure``.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Command line arguments; uses ``input``, ``design``,
            ``uniqID``, ``group``, ``x``, ``y`` and ``figure``.
    """
    # Loading design (if provided); otherwise mark it as absent.
    if args.design:
        # pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed
        # in 1.0; read_csv with index_col=0 is the documented replacement.
        # NOTE(review): from_csv also parsed dates by default -- assumed
        # irrelevant for these tab-delimited numeric files; confirm.
        design = pd.read_csv(args.design, sep="\t", index_col=0)
        design.reset_index(inplace=True)
    else:
        design = False

    # Loading wide file
    wide = pd.read_csv(args.input, sep="\t", index_col=0)

    # Create figureHandler object
    fh = figureHandler(proj="2d", figsize=(14, 8))

    # If design file with group and the uniqID is "sampleID" then color by group.
    # NOTE(review): this branch reads `design[args.group]` and will fail if
    # args.group is set without a design file -- confirm upstream validation.
    if args.group and args.uniqID == "sampleID":
        glist = list(design[args.group])
        colorList, ucGroups = palette.getColorsByGroup(design=design,
                                group=args.group, uGroup=sorted(set(glist)))
    else:
        glist = list()
        colorList = palette.mpl_colors[0]
        ucGroups = dict()

    # Plot scatterplot 2D
    scatter.scatter2D(ax=fh.ax[0], x=list(wide[args.x]), y=list(wide[args.y]),
                      colorList=colorList)

    # Despine axis (spine = tick)
    fh.despine(fh.ax[0])

    # Formatting axis
    fh.formatAxis(figTitle=args.x + " vs " + args.y, xTitle=args.x,
                  yTitle=args.y, grid=False)

    # If groups are provided create a legend and shrink the axis to fit it.
    if args.group and args.uniqID == "sampleID":
        fh.makeLegend(ax=fh.ax[0], ucGroups=ucGroups, group=args.group)
        fh.shrink()

    # Saving figure to file
    with PdfPages(args.figure) as pdfOut:
        fh.addToPdf(dpi=600, pdfPages=pdfOut)
    logger.info("Script Complete!")
# Example #3
def plotScatterplot2D(data, palette, pdf, nloads=3):
    """
    Plots 2D scatterplots for the pairwise combinations of the first
    *nloads* PCA loading columns.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type nloads: int
        :param nloads: Number of principal components to create pairwise combs.
    """
    # Only the first nloads columns take part in the pairwise combinations.
    selected_columns = data.columns.tolist()[:nloads]

    for first, second in combinations(selected_columns, 2):
        # One single-axis figure per pair of loadings.
        handler = figureHandler(proj="2d", figsize=(14,8))

        # Scatter the two loading columns against each other.
        scatter.scatter2D(x=list(data[first]), y=list(data[second]),
                          colorList=palette.design.colors.tolist(),
                          ax=handler.ax[0])

        # Legend keyed by the unique group colors.
        handler.makeLegend(ax=handler.ax[0], ucGroups=palette.ugColors,
                           group=palette.combName)

        # Shrink axis so the legend fits, then remove the spines.
        handler.shrink()
        handler.despine(handler.ax[0])

        # Title and axis labels.
        # NOTE(review): the labels say "Scores" although this function plots
        # loadings -- confirm the intended wording.
        handler.formatAxis(figTitle="{0} vs {1}".format(first, second),
                           xTitle="Scores on {0}".format(first),
                           yTitle="Scores on {0}".format(second),
                           grid=False)

        # Append the finished figure to the PDF.
        handler.addToPdf(dpi=600, pdfPages=pdf)
# Example #4
def makeScatter(x, y, ax, fh):
    """ Plot a scatter plot of x vs y with regression bounds.

    :Arguments:
        :type x: pandas.Series
        :param x: Series of first sample, treated as independent variable.

        :type y: pandas.Series
        :param y: Series of second sample, treated as dependent variables.

        :type ax: matplotlib.axis
        :param ax: Axis which to plot.

        :type fh: figureHandler
        :param fh: figure to draw BA plots onto.

    :Returns:
        :rtype: matplotlib.axis
        :returns: A matplotlib axis with a scatter plot.

    """
    # Regression supplies the confidence bounds and fitted values.
    lower, upper, fitted, resid, infl = runRegression(x, y)

    # Scatter the raw points in blue.
    scatter.scatter2D(x=x, y=y, ax=ax, colorList=list("b"))

    # Missing data can make the regression output shorter than x, so align
    # x with the fitted index before drawing the lines.
    x_aligned = x.loc[fitted.index]

    # Draw the lower bound, fitted line and upper bound.
    for curve in (lower, fitted, upper):
        lines.drawCutoff(x=x_aligned, y=curve, ax=ax)

    # Label the axes after the data series and title the plot.
    fh.formatAxis(axnum=0,
                  xTitle=x.name,
                  yTitle=y.name,
                  axTitle='Scatter plot',
                  grid=False)
# Example #5
def qqPlot(tresid, tfit, oname):
    """ 
    Plot the residual diagnostic plots by sample.

    Output q-q plot, boxplot, histogram and a fitted-vs-residual
    scatterplot for each column. These plots will be used to diagnose
    whether the residuals are approximately normal.

    :Arguments:
        :type tresid: pandas.Series
        :param tresid: Pearson normalized residuals. (transposed)
                        (residuals / sqrt(MSE))

        :type tfit: pandas DataFrame
        :param tfit: output of the ANOVA (transposed)

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.

    """
    # Open pdf
    with PdfPages(oname) as pdf:

        # Layout: three small axes on the top row, one wide axis below.
        axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)]

        # One page of diagnostics per column.
        for col in tresid.columns:
            # Creating figure
            fig = figureHandler(proj='2d',
                                numAx=4,
                                numRow=2,
                                numCol=3,
                                arrangement=axisLayout)

            data = tresid[col].values.ravel()
            # NOTE(review): df_data is built before NaN removal, so the
            # histogram may see missing values the other panels do not --
            # confirm this is intended.
            df_data = pd.DataFrame(data)

            # Removing missing so that it will plot correctly.
            mask_nan_data = np.isnan(data)
            data = data[~mask_nan_data]

            # Plot qqplot on axis 0 (fit a normal, draw the regression line)
            sm.graphics.qqplot(data, fit=True, line='r', ax=fig.ax[0])

            # Plot boxplot on axis 1
            box.boxSeries(ser=data, ax=fig.ax[1])

            # Plot histogram on axis 2
            hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal')

            # Plot scatterplot (fitted vs residual) on axis 3
            scatter.scatter2D(ax=fig.ax[3],
                              x=tfit[col],
                              y=tresid[col],
                              colorList=list('b'))

            # Draw cutoff line at zero residual for scatterplot on axis 3
            lines.drawCutoffHoriz(ax=fig.ax[3], y=0)

            # Format axis 0
            fig.formatAxis(figTitle=col,
                           axnum=0,
                           grid=False,
                           showX=True,
                           yTitle="Sample Quantiles",
                           xTitle=" ")

            # Format axis 1
            fig.formatAxis(axnum=1,
                           axTitle="Standardized Residuals",
                           grid=False,
                           showX=False,
                           showY=True,
                           xTitle=" ")

            # Format axis 2
            fig.formatAxis(axnum=2,
                           grid=False,
                           showX=True,
                           showY=True,
                           axTitle=" ",
                           xTitle=" ")

            # Format axis 3
            fig.formatAxis(axnum=3,
                           axTitle="Predicted Values vs Residual Values",
                           xTitle="Predicted Values",
                           yTitle="Residual Values",
                           grid=False)

            # Add figure to pdf
            fig.addToPdf(pdfPages=pdf)
# Example #6
def main(args):
    """Run pairwise t-tests (unpaired or paired) on a wide dataset.

    Loads the wide/design files through the ``wideToDesign`` interface,
    computes per-feature summary statistics (grand mean, sample variance,
    per-group means) and t-tests, writes a summary table and a
    significance-flag table as TSV, and draws one volcano plot per pair
    of groups into a PDF.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Command line arguments; uses ``input``, ``design``,
            ``uniqueID``, ``group``, ``order``, ``pairing``,
            ``summaries``, ``flags`` and ``volcano``.

    :Returns:
        :rtype: None
        :returns: Writes two TSV files and a PDF of volcano plots.
    """
    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group=args.group,
                       runOrder=args.order, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Remove missing data so the tests get complete vectors.
    dat.dropMissing()

    # SCENARIO 1: Unpaired t-test. Any number of groups is allowed; the
    # ordering variable is ignored and t-tests are performed pairwise for
    # each pair of groups.
    if args.pairing == "unpaired":
        logger.info("Unpaired t-test will be performed for all groups pairwise.")

        # Unique group levels and all of their pairwise combinations,
        # which feed the pairwise unpaired t-tests below.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # Number of feature columns depends on whether the user provided
        # the ordering variable (it adds one extra non-feature column;
        # it is otherwise unused for the unpaired test).
        if args.order == False:
            number_of_features = data_frame.shape[1] - 1
        else:
            number_of_features = data_frame.shape[1] - 2

        # Computing overall summaries (grand mean and sample variance).
        # This is summary material for the output table only; it has
        # nothing to do with the unpaired t-test itself.
        # Drop the non-feature columns and transpose ONCE (the original
        # redid this per feature) so features become rows.
        if args.order == False:
            data_frame_manipulate_transpose = data_frame.drop(args.group, axis=1).transpose()
        else:
            data_frame_manipulate_transpose = data_frame.drop([args.group, args.order], axis=1).transpose()
        # Pulling the feature indexes from the transposed frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        for j in range(0, number_of_features):
            feature_series = data_frame_manipulate_transpose.loc[indexes_list_complete[j]]
            mean_value_all[j] = np.mean(feature_series)
            variance_value_all[j] = np.var(feature_series, ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # Again, this is summary material for the output table only.
        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[
                data_frame[args.group].isin([group_values_series_unique[i]])]

            # Keep only feature columns (drop 1 or 2 non-feature columns
            # depending on whether args.order was provided) and transpose.
            if args.order == False:
                data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()
            else:
                data_frame_current_group = data_frame_current_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Per-feature means for the current group.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

            # Adding current mean_value column to the data frame and assigning the name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Running pairwise unpaired (two-sample) t-tests for all pairs of
        # group levels saved in groups_pairwise.
        for i in range(0, number_of_groups_pairwise):

            # Extracting the pieces of the data frame that belong to the i-th unique pair.
            groups_subset = groups_pairwise[i]
            data_frame_first_group = data_frame.loc[data_frame[args.group].isin([groups_subset[0]])]
            data_frame_second_group = data_frame.loc[data_frame[args.group].isin([groups_subset[1]])]

            # Keep only feature columns and transpose.
            if args.order == False:
                data_frame_first_group = data_frame_first_group.drop(args.group, axis=1).transpose()
                data_frame_second_group = data_frame_second_group.drop(args.group, axis=1).transpose()
            else:
                data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
                data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling indexes list from the first group (they are the same for both).
            indexes_list = data_frame_first_group.index.tolist()

            # Per-feature result containers, filled with 0es.
            p_value           = [0] * number_of_features
            t_value           = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01   = [0] * number_of_features
            flag_value_0p05   = [0] * number_of_features
            flag_value_0p10   = [0] * number_of_features
            difference_value  = [0] * number_of_features

            for j in range(0, number_of_features):

                series_first = data_frame_first_group.loc[indexes_list[j]]
                series_second = data_frame_second_group.loc[indexes_list[j]]

                # Run the test ONCE and unpack both statistics (the
                # original called ttest_ind twice per feature).
                t_value[j], p_value[j] = ttest_ind(series_first, series_second)
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Creating column names for the data frame.
            p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
            difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

            # The flag table is created on the first pair and extended afterwards.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01,
                                       columns=[flag_value_column_name_current_0p01],
                                       index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point both tables exist, so only columns are added.
            summary_df[p_value_column_name_current]           = p_value
            summary_df[t_value_column_name_current]           = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current]  = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO
    # groups. Each sample in one group should have exactly one matching
    # pair in the other group; the matching is controlled by args.order.
    if args.pairing == "paired":
        logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))

        # Getting the number of unique groups. If it is not 2, warn and exit.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        if number_of_unique_groups != 2:
            logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups))
            exit()

        # Only reached when the group count check above passed.

        # Pairwise combination of our two groups (used for the volcano plots).
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # The pairing variable is required for the paired test.
        if args.order == False:
            logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
            exit()

        # Only reached when args.order has been provided.

        # Two non-feature columns stand for args.group and args.order.
        number_of_features = data_frame.shape[1] - 2

        # There are exactly 2 groups and the pairing variable is present.
        # Now check that each pairID corresponds to exactly two samples
        # from different groups; anything else is removed below.
        pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
        pairid_values_series_unique = pairid_values_series.unique()
        number_of_unique_pairid = pairid_values_series_unique.shape[0]

        # Cleaning of the original data: samples that are not properly
        # paired are removed with a warning. A pairID with 1 or 3+ samples
        # is dropped; so is a pairID whose two samples share a groupID.
        for i in range(0, number_of_unique_pairid):

            # Extracting the pieces of the data frame that belong to the ith unique pairid.
            data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin([pairid_values_series_unique[i]])]

            # We transpose here so it will be easier to operate with.
            data_frame_current_pairid = data_frame_current_pairid.transpose()
            sample_names_current_pairid = list(data_frame_current_pairid.columns.values)

            if data_frame_current_pairid.shape[1] != 2:

                logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i],
                                 data_frame_current_pairid.shape[1], sample_names_current_pairid))

                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

            # Exactly two samples: check that the groupIDs of the pair differ.
            elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:

                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.transpose()[args.group][1], data_frame_current_pairid.transpose()[args.group][0], sample_names_current_pairid))

                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

        # Checking if the data frame became empty after cleaning.
        if data_frame.shape[0] == 0:
            logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the design file for accuracy! Exiting the program.")
            exit()

        # Computing overall summaries (grand mean and sample variance).
        # Summary material for the output table only. Drop the non-feature
        # columns and transpose ONCE (loop invariant).
        data_frame_manipulate_transpose = data_frame.drop([args.group, args.order], axis=1).transpose()
        # Pulling the feature indexes from the transposed frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        for j in range(0, number_of_features):
            feature_series = data_frame_manipulate_transpose.loc[indexes_list_complete[j]]
            mean_value_all[j] = np.mean(feature_series)
            variance_value_all[j] = np.var(feature_series, ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them (table summary only).
        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            data_frame_current_group = data_frame_current_group.drop([args.group, args.order], axis=1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Per-feature means for the current group.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

            # Adding current mean_value column to the data frame and assigning the name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Performing the paired t-test for the two groups and saving the results.

        # Per-feature result containers of length number_of_features.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        flag_value_0p01   = [0] * number_of_features
        flag_value_0p05   = [0] * number_of_features
        flag_value_0p10   = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value  = [0] * number_of_features

        # Split by group, align by the pairing variable and drop the
        # non-feature columns ONCE -- this is loop invariant (the original
        # redid all of it for every feature).
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[1]])]

        # Sorting by args.order aligns the two frames pair-by-pair so the
        # paired t-test compares matching samples. (DataFrame.sort was
        # removed from pandas; sort_values is the replacement.)
        data_frame_first_group = data_frame_first_group.sort_values(args.order)
        data_frame_second_group = data_frame_second_group.sort_values(args.order)

        # Keep only feature columns and transpose.
        data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

        # Pulling list of indexes. This is the same list for the first and for the second.
        indexes_list = data_frame_first_group.index.tolist()

        # Performing paired t-test for each feature.
        for j in range(0, number_of_features):

            # Pulling the samples out.
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Run the paired test ONCE and unpack both statistics (the
            # original called ttest_rel twice per feature).
            t_value[j], p_value[j] = ttest_rel(series_first, series_second)
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # The loop over features is finished; convert the results into data frames.

        # Creating column names for the data frame.
        p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        t_value_column_name_current           = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        difference_value_column_name_current  = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
        flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
        flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'

        summary_df[t_value_column_name_current] = t_value
        summary_df[p_value_column_name_current] = p_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # NOTE(review): summary_df / flag_df / groups_pairwise are only defined
    # when args.pairing is "unpaired" or "paired"; any other value raises a
    # NameError below -- confirm upstream argument validation.

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restore the name of the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.

    # -log10(p-value) columns keyed by the group-pair suffix.
    lpvals = {col.split("_value_")[-1]: summary_df[col] for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Difference-of-means columns keyed by the group-pair suffix.
    difs = {col.split("_of_")[-1]: summary_df[col] for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots (one per pair of groups).
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing running of t-test.")
def plotDistances(df_distance, palette, plotType, disType, cutoff, p, pdf):
    """
    Draw a distance plot (scatter or box) for the given distance table and
    append the finished figure to an open PDF.

    :Arguments:
        :type df_distance: pandas.DataFrame
        :param df_distance: Distance values; either a single
            "distance_to_mean" column or a pairwise distance matrix.

        :type palette: palette object
        :param palette: Provides the design colors, the unique-group color
            mapping (ugColors) and the combined group name (combName).

        :type plotType: string
        :param plotType: "Scatterplot" or "Box-plots" (pairwise data only).

        :type disType: string
        :param disType: Distance flavor, e.g. "Mahalanobis".

        :type cutoff: pandas.DataFrame
        :param cutoff: Cutoff values; each column is drawn as a cutoff line.

        :type p: float
        :param p: Percentile used for the cutoff lines.

        :type pdf: PdfPages
        :param pdf: Open PDF object the figure is appended to.
    """
    # Number of samples drives the figure width (never narrower than 16).
    n_samples = len(df_distance.index)
    fig_width = max(n_samples / 2, 16)

    # Attach the design colors so each sample keeps its assigned color.
    df_distance["colors"] = palette.design["colors"]

    # Single-axis figure sized to the sample count.
    figure = figureHandler(proj='2d', figsize=(fig_width, 8))

    # Distinguish "distance to the mean" tables from pairwise matrices.
    to_mean = "distance_to_mean" in df_distance.columns
    dataType = "to the mean" if to_mean else "pairwise"

    # Mahalanobis distances are penalized; everything else is standardized.
    distType1 = "Penalized" if disType == "Mahalanobis" else "Standardized"
    distType2 = disType

    # Figure title, axis labels, x limits and ticks.
    figure.formatAxis(figTitle="{0} for {1} {2} Distance for {3} {4}".format(
        plotType, distType1, distType2, df_distance.name, dataType),
                      yTitle="{0} {1} Distance".format(distType1, distType2),
                      xTitle="Index",
                      ylim="ignore",
                      xlim=(-0.5, -0.5 + n_samples),
                      xticks=df_distance.index)

    positions = range(n_samples)
    if to_mean:
        # One point per sample, colored by design group.
        scatter.scatter2D(ax=figure.ax[0],
                          colorList=df_distance["colors"],
                          x=positions,
                          y=df_distance["distance_to_mean"])
    elif plotType == "Scatterplot":
        # Pairwise scatter: one series per sample row.
        for sample in df_distance.index:
            scatter.scatter2D(ax=figure.ax[0],
                              colorList=df_distance["colors"][sample],
                              x=positions,
                              y=df_distance[sample])
    elif plotType == "Box-plots":
        # Pairwise box plot over the whole table.
        box.boxDF(ax=figure.ax[0],
                  colors=df_distance["colors"],
                  dat=df_distance)

    # Make room for the legend, then draw it.
    figure.shrink()
    figure.makeLegend(figure.ax[0], palette.ugColors, palette.combName)

    # Draw one cutoff line per cutoff column.
    cutoff.apply(lambda col: plotCutoffs(col, ax=figure.ax[0], p=p), axis=0)

    # Append the figure to the PDF (the handler closes it afterwards).
    figure.addToPdf(pdf)

    # Remove the helper column so the caller's table is left untouched.
    df_distance.drop("colors", axis=1, inplace=True)
# Example #8
def makeBA(x, y, ax, fh):
    """ Function to make BA Plot comparing x vs y.

    :Arguments:
        :type x: pandas.Series
        :param x: Series of first sample, treated as independent variable.

        :type y: pandas.Series
        :param y: Series of second sample, treated as dependent variables.

        :type ax: matplotlib.axis
        :param ax: Axis which to plot.

        :type fh: figureHandler
        :param fh: figure to draw BA plots onto.

    :Returns:
        :rtype: tuple of pandas.Series
        :returns: (mask, mask1, mask2, mask3) — Boolean Series flagging
            outlying points.  ``mask1`` flags large Pearson residuals,
            ``mask2`` and ``mask3`` flag influence diagnostics (Cook's
            distance p-value and DFFITS respectively), and ``mask`` is
            their element-wise union.

    """
    # Coerce both samples to float so the arithmetic below is numeric.
    x = x.apply(float)
    y = y.apply(float)

    # Bland-Altman coordinates: difference vs mean of the two samples.
    diff = x - y
    mean = (x + y) / 2

    # Drop missing for current comparison
    diff.dropna(inplace=True)
    mean.dropna(inplace=True)

    # Regress diff on mean to get CI bands and outlier diagnostics.
    # NOTE(review): `cutoff` is not defined in this function — it must come
    # from an enclosing/module scope; confirm it is set before calling.
    lower, upper, fitted, resid, infl = runRegression(mean, diff)
    mask1 = abs(resid['resid_pearson']) > cutoff
    # NOTE(review): mask2/mask3 semantics depend on runRegression's output;
    # mask3 uses infl['dffits'] directly as a boolean mask — confirm it is
    # already thresholded upstream.
    mask2 = infl['cooks_pval'] <= 0.5
    mask3 = infl['dffits']
    mask = mask1 | mask2 | mask3

    # Plot inliers in blue, flagged points in red (red drawn on top).
    scatter.scatter2D(ax=ax, x=mean[~mask], y=diff[~mask], colorList='b')
    scatter.scatter2D(ax=ax, x=mean[mask], y=diff[mask], colorList='r')

    # Regression line with CI bands, plus the zero-difference reference.
    ax.plot(mean, lower, 'r:')
    ax.plot(mean, fitted, 'r')
    ax.axhline(0, color='k')
    ax.plot(mean, upper, 'r:')

    #Adjust axes
    fh.formatAxis(axnum=1,
                  xlim='ignore',
                  ylim='ignore',
                  axTitle='Bland-Altman Plot',
                  xTitle='Mean\n{0} & {1}'.format(x.name, y.name),
                  yTitle='Difference\n{0} - {1}'.format(x.name, y.name),
                  grid=False)

    return mask, mask1, mask2, mask3
def main(args):
    """
    Run one-sample t-tests (H_0: mu = args.mu) on every feature of a wide
    dataset and write summary tables, significance flags and volcano plots.

    When ``args.group`` names a design-file column, each group is tested
    separately against the null; otherwise the whole dataset is treated as
    one group.  ``args.mu`` is expected to be a string (it is concatenated
    into output column names) that parses as a float.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line arguments.  Uses input, design,
            uniqueID, group, mu, summaries, flags and volcano.

    :Returns:
        :rtype: None
        :returns: Writes the summary and flag tables as TSV files and the
            volcano plots as a PDF.
    """
    # Parse the null-hypothesis mean once instead of at every use.
    mu = float(args.mu)

    # If the user provides a grouping variable we test each group against the
    # null (mu supplied by user, 0 is the default).
    if args.group != False:
        logger.info(
            u"""t-test will be performed for all groups saved in [{0}] variable in the desing file pairwise with the H_0: mu = {1}."""
            .format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           group=args.group,
                           logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning the missing data.
        dat.dropMissing()

        # Unique group values that will be fed to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Number of features: subtract 1 for the args.group column.
        number_of_features = data_frame.shape[1] - 1

        # Feature-only table transposed so each feature is a row.  Computed
        # once: it does not depend on the loop variable (the original rebuilt
        # it for every feature, which was quadratic in the table size).
        data_frame_manipulate_transpose = data_frame.drop(args.group,
                                                          axis=1).transpose()
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Overall summaries (mean and variance).  These only feed the output
        # table; they play no role in the t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            feature_values = data_frame_manipulate_transpose.loc[
                indexes_list_complete[j]]
            mean_value_all[j] = np.mean(feature_values)
            variance_value_all[j] = np.var(feature_values, ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running the single-sample t-test for every group; per-group means
        # are recorded alongside the test results.
        for i in range(0, number_of_unique_groups):

            # Rows belonging to the ith group, feature columns only,
            # transposed so each feature is a row.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]
            data_frame_current_group = data_frame_current_group.drop(
                args.group, axis=1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Result arrays, one slot per feature.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # One-sample t-test, run once and unpacked (the original
                # called ttest_1samp twice per feature).
                t_statistic, p_val = ttest_1samp(series_current, mu)
                p_value[j] = p_val
                t_value[j] = t_statistic
                neg_log10_p_value[j] = -np.log10(p_val)
                difference_value[j] = means_value[j] - mu
                if p_val < 0.01: flag_value_0p01[j] = 1
                if p_val < 0.05: flag_value_0p05[j] = 1
                if p_val < 0.10: flag_value_0p10[j] = 1

            # Column names for the current group's results.  args.mu is a
            # string here, so concatenation (not addition) is intended.
            group_suffix = group_values_series_unique[i] + '_' + args.mu
            means_value_column_name_current = (
                'mean_treatment_' + group_values_series_unique[i])
            p_value_column_name_current = (
                'prob_greater_than_t_for_diff_' + group_suffix)
            t_value_column_name_current = 't_value_for_diff_' + group_suffix
            neg_log10_p_value_column_name_current = (
                'neg_log10_p_value_' + group_suffix)
            difference_value_column_name_current = 'diff_of_' + group_suffix
            flag_value_column_name_current_0p01 = (
                'flag_significant_0p01_on_' + group_suffix)
            flag_value_column_name_current_0p05 = (
                'flag_significant_0p05_on_' + group_suffix)
            flag_value_column_name_current_0p10 = (
                'flag_significant_0p10_on_' + group_suffix)

            # The flag table is created on the first group (i == 0) and
            # extended with one 0p01 column per group afterwards.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # Both tables exist at this point; add the result columns.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[
                neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Without a grouping variable the entire dataset is tested as a single
    # group against the null (mu supplied by user, 0 is the default).
    else:
        logger.info(
            u"""t-test will be performed for the entire dataset since goruping variable was not provided."""
        )

        # Loading data through the interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning the missing data.
        dat.dropMissing()

        # Exactly one "group" since no grouping variable was supplied; the
        # count is still needed for the volcano-plot loop below.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Number of features: nothing to subtract, there is no group column.
        number_of_features = data_frame.shape[1]

        # Transposed once so each feature is a row (hoisted out of the loop;
        # the original recomputed the transpose for every feature).
        data_frame_manipulate_transpose = data_frame.transpose()
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Overall summaries (mean and variance) for the output table, plus
        # the test result arrays, one slot per feature.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        for j in range(0, number_of_features):
            feature_values = data_frame_manipulate_transpose.loc[
                indexes_list_complete[j]]

            # Dataset summaries for feature j.
            mean_value_all[j] = np.mean(feature_values)
            variance_value_all[j] = np.var(feature_values, ddof=1)

            # One-sample t-test for the entire dataset, run once and unpacked.
            t_statistic, p_val = ttest_1samp(feature_values, mu)
            p_value[j] = p_val
            t_value[j] = t_statistic
            neg_log10_p_value[j] = -np.log10(p_val)
            difference_value[j] = mean_value_all[j] - mu
            if p_val < 0.01: flag_value_0p01[j] = 1
            if p_val < 0.05: flag_value_0p05[j] = 1
            if p_val < 0.10: flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Column names for the single "all" comparison.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Round the results to 4 decimal digits.
    summary_df = summary_df.round(4)

    # Restore the name of the original unique-ID column.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Volcano-plot data: -log10(p-values) and mean differences keyed by the
    # "<group>_<mu>" suffix of the summary column names.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The -log10(p-value) cutoff for significance.
    cutoff = 2

    # Making the volcano plots, one page per group.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            # Key is "all_<mu>" when no grouping variable was provided.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results.
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Draw the significance cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of the means from H0 for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
def plotSignificantROR(data, pdf, palette):
    """
    Plot run-order regression scatter plots for every feature whose
    regression p-value is below 0.05.

    For each significant row the scatter of value vs run order is drawn
    together with the fitted regression line and the 95% prediction bands.

    :Arguments:

        :type data: pandas DataFrame
        :param data: One row per feature with columns "x", "y", "res",
            "fitted", "slope", "pval", "rsq" and "name".

        :type pdf: PdfPages
        :param pdf: pdf object to store scatterplots

        :type palette: palette object
        :param palette: Supplies per-sample colors (list_colors) and the
            group/legend information (ugColors, combName).
    """
    # Iterate over all rows; plot only when the p-value is significant.
    for index, row in data.iterrows():
        if row["pval"] > 0.05: continue

        # Get 95% CI of the fitted regression.
        prstd, lower, upper = wls_prediction_std(row["res"])

        # Sort CIs by run order so the band lines draw left to right.
        toPlot = pd.DataFrame({"x": row["x"], "lower": lower, "upper": upper})
        toPlot.sort_values(by="x", inplace=True)

        # Create plot
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Plot scatterplot of value vs run order.
        scatter.scatter2D(ax=fh.ax[0],
                          x=row["x"],
                          y=row["y"],
                          colorList=palette.list_colors)

        # Fitted line (cyan) and the lower/upper prediction bands (red).
        lines.drawCutoff(ax=fh.ax[0], x=row["x"], y=row["fitted"], c="c")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["lower"], c="r")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["upper"], c="r")

        # Formatting; stretch the y-limit to leave room for the annotation.
        ymin, ymax = fh.ax[0].get_ylim()
        fh.formatAxis(xTitle="Run Order", yTitle="Value", ylim=(ymin,ymax*1.2),
        figTitle=u"{} Scatter plot (fitted regression line and prediction bands"\
        " included)".format(row["name"]))

        # Shrink figure
        fh.shrink()

        # Add legend to figure
        fh.makeLegend(ax=fh.ax[0],
                      ucGroups=palette.ugColors,
                      group=palette.combName)

        # Annotate the axis with slope, p-value and R^2.
        fh.ax[0].text(.7, .85, u"Slope= {0:.4f}\n(p-value = {1:.4f})\n"\
            "$R^2$ = {2:4f}".format(round(row["slope"],4), round(row["pval"],4),
            round(row["rsq"],4)),transform=fh.ax[0].transAxes, fontsize=12)

        # Save to PDF
        fh.addToPdf(pdf)
def makePlots(SEDData, design, pdf, groupName, cutoff, p, plotType, ugColors,
              levels):
    """
    Manage all the plots for this script

    :Arguments:
        :type SEDData: pandas.dataFrame
        :param SEDData: Contains SED data either to Mean or pairwise

        :type design: pandas.dataFrame
        :param design: Design file after getColor

        :type pdf: PDF object
        :param pdf: PDF for output plots

        :type groupName: string
        :param groupName: Name of the group (figure title).

        :type cutoff: pandas.dataFrame
        :param cutoff: Cutoff values, beta, chi-sqr and normal.

        :type p: float
        :param p: Percentil for cutoff.

        :type plotType: string
        :param plotType: Type of plot; one of "scatterToMean",
            "scatterPairwise" or "boxplotPairwise".

        :type ugColors: dictionary
        :param ugColors: Unique-group color mapping for the legend.

        :type levels: string
        :param levels: Group levels / combined name shown in the legend.
    """
    # Number of features drives the figure width (never narrower than 16).
    n_features = len(SEDData.index)
    fig_width = max(n_features / 2, 16)

    # Single-axis figure sized to the feature count.
    figure = figureHandler(proj='2d', figsize=(fig_width, 8))

    # Keep the color ordering from the design file.
    SEDData["colors"] = design["colors"]

    # Axis settings shared by all three plot types.
    shared_axis = dict(xlim=(-0.5, -0.5 + n_features),
                       ylim="ignore",
                       xticks=SEDData.index.values,
                       xTitle="Index",
                       yTitle="Standardized Euclidean Distance")

    if plotType == "scatterToMean":
        # Scatter of each sample's distance to the group mean.
        figure.formatAxis(
            figTitle=
            "Standardized Euclidean Distance from samples {} to the mean".
            format(groupName),
            **shared_axis)
        scatter.scatter2D(ax=figure.ax[0],
                          colorList=SEDData["colors"],
                          x=range(n_features),
                          y=SEDData["SED_to_Mean"])

    elif plotType == "scatterPairwise":
        # One scatter series per sample of its pairwise distances.
        figure.formatAxis(
            figTitle="Pairwise standardized Euclidean Distance from samples {}"
            .format(groupName),
            **shared_axis)
        for sample in SEDData.index.values:
            scatter.scatter2D(ax=figure.ax[0],
                              colorList=design["colors"][sample],
                              x=range(n_features),
                              y=SEDData[sample])

    elif plotType == "boxplotPairwise":
        # Box plot over the whole pairwise distance table.
        figure.formatAxis(
            figTitle=
            "Box-plots for pairwise standardized Euclidean Distance from samples {}"
            .format(groupName),
            **shared_axis)
        box.boxDF(ax=figure.ax[0],
                  colors=SEDData["colors"].values,
                  dat=SEDData)

    # Draw one cutoff line per cutoff column.
    cutoff.apply(lambda col: plotCutoffs(col, ax=figure.ax[0], p=p), axis=0)

    # Make room for the legend, then draw it.
    figure.shrink()
    figure.makeLegend(figure.ax[0], ugColors, levels)

    # Append the figure to the PDF (the handler closes it afterwards).
    figure.addToPdf(pdf)
# Example #12
def volcano(combo, results, oname, cutoff=2):
    """
    Plot volcano plots.

    Creates one volcano plot per pairwise comparison found in the results
    table and writes them all to a single PDF.

    :Arguments:

        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible pairwise
            combinations; kept for interface compatibility (the column names
            in *results* drive the plots).

        :type results: pandas DataFrame
        :param results: Table with "-log10_p-value_*" and "diff_of*" columns,
            one pair per comparison.

        :type oname: string
        :param oname: Name of the output file in pdf format.

        :type cutoff: int
        :param cutoff: The -log10(p-value) cutoff value for significance.

    :Returns:
        :rtype: PD
        :returns: Outputs a pdf file containing all plots.

    """
    # Collect the -log10(p-value) and difference columns keyed by comparison.
    lpvals = {}
    difs = {}
    for col in results.columns.tolist():
        key = col.split("_")[-1]
        if col.startswith("-log10_p-value_"):
            lpvals[key] = results[col]
        elif col.startswith("diff_of"):
            difs[key] = results[col]

    # One page per comparison, in sorted key order.
    with PdfPages(oname) as pdf:
        for key in sorted(difs.keys()):
            fig = figureHandler(proj="2d")

            # All points in blue.
            scatter.scatter2D(x=list(difs[key]),
                              y=list(lpvals[key]),
                              colorList=list('b'),
                              ax=fig.ax[0])

            # Re-plot the significant points in red on top.
            cutLpvals = lpvals[key][lpvals[key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=fig.ax[0])

            # Horizontal significance cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=fig.ax[0])

            # Titles and labels for this comparison.
            fig.formatAxis(
                axTitle=key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment = {0}".format(
                    key),
                xTitle="Diff of treatment = {0}".format(key))

            # Append the finished page to the PDF.
            fig.addToPdf(pdfPages=pdf)
# Example #13
def main(args):
    """
    Run unpaired permutation t-tests for every pairwise combination of
    group levels, then write summary/flag tables and volcano plots.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line arguments. Uses ``input``,
            ``design``, ``uniqueID``, ``group``, ``reps``, ``summaries``,
            ``flags`` and ``volcano``.

    :Return:
        :rtype: None
        :return: Writes ``args.summaries`` and ``args.flags`` (TSV) and
            ``args.volcano`` (PDF) as side effects.
    """
    # Loading data through Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data.
    dat.dropMissing()

    # Unpaired permuted t-test.  There can be as many groups as needed;
    # t-tests are performed pairwise for each pair of groups.
    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Unique group levels and all their pairwise combinations.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface: samples x (features + group).
    data_frame = dat.transpose()
    # Number of feature columns (all columns except the group column).
    number_of_features = data_frame.shape[1] - 1

    # Features x samples frame for the whole dataset.  This is loop
    # invariant, so it is computed once instead of once per feature.
    data_frame_transpose = data_frame.drop(args.group, 1).transpose()
    indexes_list_complete = data_frame_transpose.index.tolist()

    # Overall summaries (grand mean and sample variance per feature).
    # This part just produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    for j in range(0, number_of_features):
        feature_row = data_frame_transpose.loc[indexes_list_complete[j]]
        mean_value_all[j] = np.mean(feature_row)
        variance_value_all[j] = np.var(feature_row, ddof=1)

    # Creating the results table.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                              index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all

    # Computing means for each group and outputting them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups):
        # Features x samples frame restricted to the i-th group.
        data_frame_current_group = data_frame.loc[
            data_frame[args.group].isin([group_values_series_unique[i]])]
        data_frame_current_group = data_frame_current_group.drop(
            args.group, 1).transpose()
        indexes_list = data_frame_current_group.index.tolist()

        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

        # Adding the current mean column to the summary table.
        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
        summary_df[means_value_column_name_current] = means_value

    # Running pairwise unpaired (two-sample) permutation t-tests for all
    # pairs of group levels that are saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise):
        # Pieces of the data frame that belong to the i-th pair of groups.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[
            data_frame[args.group].isin([groups_subset[0]])]
        data_frame_second_group = data_frame.loc[
            data_frame[args.group].isin([groups_subset[1]])]

        # Keep only feature columns and transpose to features x samples.
        data_frame_first_group = data_frame_first_group.drop(
            args.group, 1).transpose()
        data_frame_second_group = data_frame_second_group.drop(
            args.group, 1).transpose()

        # Indexes of the first frame (both frames share the same features).
        indexes_list = data_frame_first_group.index.tolist()

        # Result lists, pre-filled with zeroes.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # BUG FIX: the permutation test used to run twice per feature
            # (once for the p-value, once for the t statistic).  With
            # seed=None the two runs used different random permutations, so
            # the reported p and t came from inconsistent runs -- and the
            # work was doubled.  A single call yields a consistent pair.
            test_result = two_sample(series_first, series_second,
                                     reps=int(args.reps), stat='t',
                                     alternative='two-sided', seed=None)
            p_value[j] = test_result[0]
            t_value[j] = test_result[1]

            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Creating column names for the data frame.
        pair_suffix = groups_subset[0] + '_' + groups_subset[1]
        p_value_column_name_current = 'perm_greater_than_t_for_diff_' + pair_suffix
        t_value_column_name_current = 't_value_for_diff_' + pair_suffix
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + pair_suffix
        difference_value_column_name_current = 'diff_of_' + pair_suffix
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + pair_suffix
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + pair_suffix
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + pair_suffix

        # The flag table is created for the first pair and extended after.
        if i == 0:
            flag_df = pd.DataFrame(data=flag_value_0p01,
                                   columns=[flag_value_column_name_current_0p01],
                                   index=indexes_list)
        else:
            flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

        # At this point both tables exist, so columns are simply added.
        summary_df[p_value_column_name_current]           = p_value
        summary_df[t_value_column_name_current]           = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current]  = difference_value
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding the name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the ouptut
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
def main(args):
    """
    Run Kruskal-Wallis tests: one overall test across all group levels and
    one test for every pairwise combination of levels, then write
    summary/flag tables and volcano plots.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line arguments. Uses ``input``,
            ``design``, ``uniqueID``, ``group``, ``summaries``, ``flags``
            and ``volcano``.

    :Return:
        :rtype: None
        :return: Writes ``args.summaries`` and ``args.flags`` (TSV) and
            ``args.volcano`` (PDF) as side effects.
    """
    # Loading data through Interface
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqueID,
                       group=args.group,
                       logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data.
    dat.dropMissing()

    # Unique group levels and all their pairwise combinations that will be
    # fed to Kruskal-Wallis.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface: samples x (features + group).
    data_frame = dat.transpose()
    # Number of feature columns (all columns except the group column).
    number_of_features = data_frame.shape[1] - 1

    # Features x samples frame for the whole dataset.  Loop invariant, so
    # it is computed once instead of once per feature.
    data_frame_transpose = data_frame.drop(args.group, 1).transpose()
    indexes_list_complete = data_frame_transpose.index.tolist()

    # Per-group features x samples frames, built once instead of once per
    # (feature, group) pair as before.
    group_frames = []
    for i in range(0, number_of_unique_groups):
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin(
            [group_values_series_unique[i]])]
        group_frames.append(
            data_frame_current_group.drop(args.group, 1).transpose())

    # Running the overall Kruskal-Wallis test for all group levels combined.
    # Result lists of length number_of_features, pre-filled with zeroes.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features

    for j in range(0, number_of_features):
        feature_row = data_frame_transpose.loc[indexes_list_complete[j]]

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(feature_row)
        variance_value_all[j] = np.var(feature_row, ddof=1)

        # One series of values per group for the j-th feature, in the same
        # order as group_values_series_unique.
        series_total = [frame.loc[indexes_list_complete[j]]
                        for frame in group_frames]

        # A feature that takes a single value across every sample cannot be
        # tested; record NaN (the significance flags stay 0 because a NaN
        # comparison is always False).
        if len(np.unique(feature_row.tolist())) == 1:
            p_value_all[j] = float("nan")
            H_value_all[j] = float("nan")
        else:
            # Performing Kruskal-Wallis for all groups for feature j.
            # A single call returns both the H statistic and the p-value;
            # the test used to be evaluated twice for the same numbers.
            H_value_all[j], p_value_all[j] = kruskalwallis(*series_total)
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1

    # Converting the overall results into data frames.
    # The pairwise results will be added later.
    summary_df = pd.DataFrame(data=mean_value_all,
                              columns=["GrandMean"],
                              index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all
    summary_df['H_value_for_all'] = H_value_all
    summary_df['prob_greater_than_H_for_all'] = p_value_all
    flag_df = pd.DataFrame(data=flag_value_all_0p01,
                           columns=["flag_significant_0p01_on_all_groups"],
                           index=indexes_list_complete)
    flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05
    flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10

    # Informing that KW for all groups has been performed.
    logger.info(
        u"Kruscal-Wallis test for all groups together has been performed.")

    # Computing means for each group.  This part just produces summary
    # statistics for the output table; it has nothing to do with the test.
    for i in range(0, number_of_unique_groups):
        data_frame_current_group = group_frames[i]
        indexes_list = data_frame_current_group.index.tolist()

        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

        means_value_column_name_current = ('mean_treatment_' +
                                           group_values_series_unique[i])
        summary_df[means_value_column_name_current] = means_value

    # Running pairwise Kruskal-Wallis tests for all pairs of group levels
    # that are saved in groups_pairwise.  The per-group frames were already
    # built above, so look them up by group label.
    frames_by_group = dict(zip(group_values_series_unique, group_frames))

    for i in range(0, number_of_groups_pairwise):
        groups_subset = groups_pairwise[i]
        data_frame_first_group = frames_by_group[groups_subset[0]]
        data_frame_second_group = frames_by_group[groups_subset[1]]

        # Pulling indexes list from the first one (they are the same).
        indexes_list = data_frame_first_group.index.tolist()

        # Result lists, pre-filled with zeroes.
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Checking whether the compared elements differ at all.
            combined_list = series_first.tolist() + series_second.tolist()
            if len(np.unique(combined_list)) == 1:
                # Constant data cannot be tested; record NaN (flags stay 0).
                p_value[j] = float("nan")
                H_value[j] = float("nan")
            else:
                # A single call returns both the H statistic and the
                # p-value; the test used to be evaluated twice.
                H_value[j], p_value[j] = kruskalwallis(series_first,
                                                       series_second)
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()

        # Adding the current columns to the tables under their pair names.
        pair_suffix = groups_subset[0] + '_' + groups_subset[1]
        summary_df['prob_greater_than_H_for_diff_' + pair_suffix] = p_value
        summary_df['H_value_for_diff_' + pair_suffix] = H_value
        summary_df['neg_log10_p_value_' + pair_suffix] = neg_log10_p_value
        summary_df['diff_of_' + pair_suffix] = difference_value

        flag_df['flag_significant_0p01_on_' + pair_suffix] = flag_value_0p01
        flag_df['flag_significant_0p05_on_' + pair_suffix] = flag_value_0p05
        flag_df['flag_significant_0p10_on_' + pair_suffix] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding the name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the ouptut
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Informing that KW for pairwise groups has been performed.
    logger.info(
        u"Kruscal-Wallis test for all groups pairwise has been performed.")

    # Generating indexing for volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing running of Kruscal-Wallis tests.")