Example no. 1
def main(args):
    # Import data with interface
    logger.info("Importig data with interface")
    dat = wideToDesign(args.input, args.design, uniqID=args.uniqID, group=args.group,
                        logger=logger)

    # Preprocessing
    logger.info("Preprocessing")
    dat.wide = preprocess(noz=args.noZero, non=args.noNegative, ex=args.exclude,
                            data=dat.wide)

    # Choosing knn as imputation method
    logger.info("Inpute")
    if args.strategy == "knn":
        pdFull = imputeKNN(rc=float(args.rowCutoff), cc=float(args.colCutoff),
                            k=int(args.knn), dat=dat)
    else:
        # Iterate over groups and perform either a mean or median imputation.
        pdFull = iterateGroups(dat=dat, strategy=args.strategy, dist=args.dist, 
                                rc=args.rowCutoff) 
        
    # Convert dataframe to float and round results to 4 digits
    pdFull = pdFull.applymap(float)
    pdFull = pdFull.round(4)

    # Make sure that the output has the same uniqID
    pdFull.index.name = args.uniqID

    # Saving imputed data
    pdFull.to_csv(args.output, sep="\t")
    logger.info("Script Complete!")
Example no. 2
def main(args):
    """ Function to input all the arguments"""
    # Checking if levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Import data
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       group=args.group,
                       anno=args.levels,
                       logger=logger)

    # Remove groups with just one element
    dat.removeSingle()

    # Cleaning from missing data
    dat.dropMissing()

    # Treat everything as float and round it to 3 digits
    dat.wide = dat.wide.applymap(lambda x: round(x, 3))

    # Get colors
    palette.getColors(dat.design, levels)

    # Use group separation or not depending on user input
    CV, CVcutoff = calculateCV(data=dat.wide,
                               design=palette.design,
                               cutoff=args.CVcutoff,
                               levels=palette.combName)

    # Plot CVplots for each group and a distribution plot for all groups together
    logger.info("Plotting Data")
    with PdfPages(args.figure) as pdf:
        plotCVplots(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)
        plotDistributions(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)

    # Create flag file instance and output flags by group
    logger.info("Creatting Flags")
    flag = Flags(index=CV['cv'].index)
    for name, group in palette.design.groupby(palette.combName):
        flag.addColumn(column="flag_feature_big_CV_{0}".format(name),
                       mask=((CV['cv_' + name].get_values() > CVcutoff[name])
                             | CV['cv_' + name].isnull()))

    # Write flag file
    flag.df_flags.to_csv(args.flag, sep='\t')

    # Finishing script
    logger.info("Script Complete!")
def main(args):

    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Convert objects to numeric
    norm = dat.wide.applymap(float)

    # The following steps depend on whether we perform a log or a g-log transformation.

    # LOG Transformation
    if args.transformation == 'log':

        # Perform the log transformation according to the selected log base
        if args.log_base == 'log':
            logger.info(u"Running Log transformation with log e")
            norm = norm.apply(lambda x: np.log(x))
        elif args.log_base == 'log2':
            logger.info(u"Running Log transformation with log 2")
            norm = norm.apply(lambda x: np.log2(x))
        elif args.log_base == 'log10':
            logger.info(u"Running Log transformation with log 10")
            norm = norm.apply(lambda x: np.log10(x))

    # G-LOG Transformation
    # Generalized log transformation formula is:  log(y + sqrt(y^2 + lambda_value))
    # When lambda_value = 0 it reduces to an ordinary log shifted by a constant:
    # log(y + sqrt(y^2)) = log(2y) = log(2) + log(y)
    if args.transformation == 'glog':

        # Perform the g-log transformation according to the selected log base
        if args.log_base == 'log':
            logger.info(u"Running G-Log transformation with log e")
            norm = np.log(norm +
                          np.sqrt(np.square(norm) + float(args.lambda_value)))
        elif args.log_base == 'log2':
            logger.info(u"Running G-Log transformation with log 2")
            norm = np.log2(norm +
                           np.sqrt(np.square(norm) + float(args.lambda_value)))
        elif args.log_base == 'log10':
            logger.info(u"Running G-Log transformation with log 10")
            norm = np.log10(
                norm + np.sqrt(np.square(norm) + float(args.lambda_value)))

    # Round results to 8 digits
    norm = norm.apply(lambda x: x.round(8))

    # Treat inf as NaN
    norm.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Debugging step
    # print "norm", norm

    # Save file to CSV
    norm.to_csv(args.oname, sep="\t")
    logger.info("Finishing Script")
def main(args):
    """ 
    Main Script 
    """

    #Getting palettes for data and cutoffs
    global cutPalette
    cutPalette = ch.colorHandler(pal="tableau", col="TrafficLight_9")

    # Checking if levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    #Parsing data with interface
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       group=args.group,
                       anno=args.levels,
                       logger=logger,
                       runOrder=args.order)

    # Drop missing values and remove groups with just one sample
    dat.dropMissing()
    if args.group:
        dat.removeSingle()

    #Select colors for data
    dataPalette.getColors(design=dat.design, groups=levels)
    dat.design = dataPalette.design

    # Open PdfPages and calculate SED
    with PdfPages(os.path.abspath(args.figure)) as pdf:
        SEDtoMean, SEDpairwise = calculateSED(dat, dataPalette.ugColors,
                                              dataPalette.combName, pdf,
                                              args.p)

    # Output results to TSV files
    SEDtoMean.to_csv(os.path.abspath(args.toMean),
                     index_label="sampleID",
                     columns=["SED_to_Mean"],
                     sep='\t')
    SEDpairwise.drop(["colors"], axis=1, inplace=True)
    if args.group:
        SEDpairwise.drop(["colors_x", "colors_y"], axis=1, inplace=True)
    SEDpairwise.to_csv(os.path.abspath(args.pairwise),
                       index_label="sampleID",
                       sep='\t')

    #Ending script
    logger.info("Script complete.")
def main(args):
    #Importing data
    logger.info("Importing data with the Interface")
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Calculate the means of each group except the blanks
    logger.info("Calcualting group means")
    df_nobMeans = pd.DataFrame(index=dat.wide.index)
    for name, group in dat.design.groupby(dat.group):
        if name == args.blank:
            df_blank = dat.wide[group.index].copy()
        else:
            df_nobMeans[name] = dat.wide[group.index].mean(axis=1)

    # Calculating the LOD
    # Calculates the average of the blanks plus 3 times the SD of the same.
    # If the value calculated is 0 then use the default lod (default = 5000)
    # NOTE: the ["lod"]!=0 expression means that everything that is not 0 is fine
    # and should remain as it is, and everything that is 0 should be replaced
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.where.html
    logger.info(
        "Calculating limit of detection for each group default value [{0}].".
        format(args.bff))
    df_blank.loc[:, "lod"] = np.average(
        df_blank, axis=1) + (3 * np.std(df_blank, ddof=1, axis=1))
    df_blank["lod"].where(df_blank["lod"] != 0, args.bff, inplace=True)

    # Apply the limit of detection to the rest of the data; these values will be
    # compared against the criteria value for flagging.
    logger.info(
        "Comparing value of limit of detection to criteria [{0}].".format(
            args.criteria))
    nob_bff = pd.DataFrame(index=dat.wide.index, columns=df_nobMeans.columns)
    for group in nob_bff:
        nob_bff.loc[:, group] = (df_nobMeans[group] -
                                 df_blank["lod"]) / df_blank["lod"]

    # We create flags based on the criteria value (user customizable)
    logger.info("Creating flags.")
    df_offFlags = Flags(index=nob_bff.index)
    for group in nob_bff:
        df_offFlags.addColumn(column='flag_bff_' + group + '_off',
                              mask=(nob_bff[group] < args.criteria))

    # Output BFF values and flags
    nob_bff.to_csv(args.outbff, sep='\t')
    df_offFlags.df_flags.to_csv(args.outflags, sep='\t')
    logger.info("Script Complete!")
Example no. 6
def main(args):
    # Import data through the SECIMTools interface
    dat = wideToDesign(wide=args.input,
                       design=args.design,
                       uniqID=args.uniqID,
                       logger=logger)
    logger.info('Number of variables: {0}'.format(dat.wide.shape[0]))
    logger.info('Number of observations per variable: {0}'.format(
        dat.wide.shape[1]))

    ## If there is no variance in a row, the correlations cannot be computed.
    dat.wide["variance"] = dat.wide.apply(lambda x: ((x - x.mean()).sum()**2),
                                          axis=1)
    dat.wide = dat.wide[dat.wide["variance"] != 0.0]
    dat.wide.drop("variance", axis=1, inplace=True)
    logger.info("Table arranged")

    # Compute the matrix of correlation coefficients.
    C = dat.wide.T.corr(method=args.correlation).values
    logger.info("Correlated")

    # For now, ignore the possibility that a variable
    # will have negligible variation.
    mask = np.ones(dat.wide.shape[0], dtype=bool)

    # Count the number of variables not excluded from the clustering.
    p = np.count_nonzero(mask)

    # Consider all values of tuning parameter sigma in this array.
    sigmas, step = np.linspace(args.sigmaLow,
                               args.sigmaHigh,
                               num=args.sigmaNum,
                               retstep=True)

    # Compute the clustering for each of the several values of sigma.
    # Each sigma corresponds to a different affinity matrix,
    # so the modularity matrix is also different for each sigma.
    # The goal is to find the clustering whose modularity is greatest
    # across all joint (sigma, partition) pairs.
    # In practice, we will look for an approximation of this global optimum.
    #exit()
    logger.info("Begin clustering")
    clustering, sigma, m = get_clustering(C, sigmas)

    # Report a summary of the results of the technical analysis.
    logger.info("After partition refinement:")
    logger.info("Sigma: {0}".format(sigma))
    logger.info("Number of clusters: {0}".format(clustering.max() + 1))
    logger.info("Modulated modularity: {0}".format(m))

    # Run the nontechnical analysis using the data frame and the less nerdy
    # of the outputs from the technical analysis.
    nontechnical_analysis(args, dat.wide, mask, C, clustering)
    logger.info("Script Complete!")
def main(args):
    # Checking if levels
    if args.levels and args.group:
        levels = [args.group]+args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    #Parsing data with interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group, 
                        runOrder=args.order, anno=args.levels, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Get colors
    palette.getColors(dat.design,levels)

    # Transpose Data so compounds are columns, set the runOrder as index
    # and drop the column with the groups from the transposed wide.
    trans = dat.transpose()
    trans.set_index(dat.runOrder, inplace=True)
    trans.drop(dat.group, axis=1, inplace=True)

    # Run regressions
    logger.info("Running Regressions")
    ror_df = runRegression(trans)

    # Creating flags for p-values 0.05 and 0.01
    ror_flags = Flags(index=ror_df.index)    
    ror_flags.addColumn(column="flag_feature_runOrder_pval_05",
                        mask=(ror_df["pval"]<=0.05))
    ror_flags.addColumn(column="flag_feature_runOrder_pval_01",
                        mask=(ror_df["pval"]<=0.01))

    # Plot Results
    # Open a multiple page PDF for plots
    logger.info("Plotting Results")
    with PdfPages(args.figure) as pdf:
        plotSignificantROR(ror_df, pdf, palette)
        
        # If no pages were created, add a placeholder page
        if pdf.get_pagecount() == 0:
            fig = plt.figure()
            fig.text(0.5, 0.4, "There were no features significant for plotting.", fontsize=12)
            pdf.savefig(fig)

    # Write results and flags to TSV files
    ror_df.to_csv(args.table, sep="\t", float_format="%.4f", index_label=args.uniqID,
                columns=["pval","rsq","slope"])
    ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID)
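
runRegression is not shown above; as a hedged sketch of what it is assumed to do, each feature is regressed on run order and the p-value, r-squared and slope are kept, with flags at the 0.05 and 0.01 levels. Everything below is illustrative.

# Minimal sketch of a per-feature run-order regression (assumed behavior).
import numpy as np
import pandas as pd
from scipy import stats

run_order = np.arange(1, 9)
rng = np.random.default_rng(0)
trans = pd.DataFrame(
    {"feat_A": run_order * 2.0 + rng.normal(0, 0.1, 8),   # drifts with run order
     "feat_B": rng.normal(50, 5, 8)},                      # no run-order effect
    index=run_order)

rows = []
for feature in trans.columns:
    fit = stats.linregress(trans.index.values, trans[feature].values)
    rows.append({"feature": feature, "pval": fit.pvalue,
                 "rsq": fit.rvalue ** 2, "slope": fit.slope})
ror_df = pd.DataFrame(rows).set_index("feature")

ror_df["flag_runOrder_pval_05"] = (ror_df["pval"] <= 0.05).astype(int)
ror_df["flag_runOrder_pval_01"] = (ror_df["pval"] <= 0.01).astype(int)
print(ror_df.round(4))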
Example no. 8
def main(args):
    """
    Function to call all other functions
    """
    # Checking if levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Parsing files with interface
    logger.info(u"Loading data with the Interface")
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       anno=args.levels,
                       runOrder=args.order,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Sort data by runOrder if provided
    if args.order:
        logger.info(u"Sorting by runOrder")
        design_final = dat.design.sort_values(by=args.order, axis=0)
        wide_final = dat.wide.reindex(columns=design_final.index)

    else:
        design_final = dat.design
        wide_final = dat.wide

    # Get colors for each sample based on the group
    palette.getColors(design=design_final, groups=levels)

    # Open PDF pages to output figures
    with PdfPages(args.figure) as pdf:

        # Plot density plot
        logger.info(u"Plotting density for sample distribution")
        plotDensityDistribution(pdf=pdf, wide=wide_final, palette=palette)

        # Plot boxplots
        logger.info(u"Plotting boxplot for sample distribution")
        plotBoxplotDistribution(pdf=pdf, wide=wide_final, palette=palette)

    logger.info(u"Script complete!")
def main(args):
    # Checking if levels
    if args.levels and args.group:
        levels = [args.group]+args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Loading data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group, 
                        anno=args.levels, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Transpose data
    dat.trans = dat.transpose()

    # Run PLS
    df_scores,df_weights,df_classification = runPLS(dat.trans,dat.group,args.toCompare,args.nComp,args.cross_validation)

    # Update palette after dropping groups not selected in toCompare
    palette.design   =  palette.design.T[df_scores.index].T
    palette.ugColors =  {ugc:palette.ugColors[ugc] for ugc in palette.ugColors.keys() if ugc in args.toCompare}

    # Plotting scatter plot for scores
    with PdfPages(args.figure) as pdfOut:
        logger.info(u"Plotting PLS scores")
        plotScores(data=df_scores, palette=palette, pdf=pdfOut)

    # Save df_scores, df_weights and df_classification to tsv files.
    df_scores.to_csv( args.outScores,                 sep = "\t", index_label = 'sampleID' )
    df_weights.to_csv( args.outWeights,               sep = "\t", index_label = dat.uniqID )
    df_classification.to_csv( args.outClassification, sep = "\t", index_label = 'sampleID' )

    # Computing the percentage of agreement between observed and predicted groups.
    classification_mismatch_percent = 100 * sum( df_classification['Group_Observed'] == df_classification['Group_Predicted_Rounded'] )/df_classification.shape[0]
    classification_mismatch_percent_string = str( classification_mismatch_percent ) + ' Percent'
    os.system("echo %s > %s"%( classification_mismatch_percent_string, args.outClassificationAccuracy ) )


    #Ending script
    logger.info(u"Finishing running of PLS")
Example no. 10
def main(args):
    # import data with interface
    dat = wideToDesign(wide=args.input,
                       design=args.design,
                       uniqID=args.uniqID,
                       logger=logger)

    kpd_wide = dropRowCol(df_col_UPD=dat.wide,
                          rowID=args.row,
                          colID=args.col,
                          args=args)

    # output new wide dataset
    kpd_wide.to_csv(args.outWide, sep='\t')
Example no. 11
def main(args):
    """Runs eveything"""
    # Importing data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Getting labels to drop from arguments
    x = True
    y = True
    if "x" in args.labels:
        x = False
    if "y" in args.labels:
        y = False

    print("x =", x)
    print("y =", y)

    # Plotting hierarchical cluster heatmap (HCH) with dendrogram
    logger.info("Plotting heatmaps")
    if args.dendogram == True:
        fh = hm.plotHCHeatmap(dat.wide,
                              hcheatmap=True,
                              cmap=palette.mpl_colormap,
                              xlbls=x,
                              ylbls=y)
        fh.savefig(args.fig, format="pdf")

    # Plotting a single heatmap without a dendrogram
    else:
        # Creating figure Handler object
        fh = figureHandler(proj='2d', figsize=(14, 14))

        # Creating plot
        hm.plotHeatmap(dat.wide,
                       fh.ax[0],
                       cmap=palette.mpl_colormap,
                       xlbls=x,
                       ylbls=y)

        # Formatting axis
        fh.formatAxis(xTitle="sampleID")

        # Saving figure
        fh.export(out=args.fig, dpi=300)

    # Finishing script
    logger.info("Script Complete!")
Example no. 12
def main(args):
    """
    Function to call all other functions
    """
    # Loading files with interface
    logger.info(u"Loading data with the Interface")
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       group=args.group,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Subsetting wide to 50 random features when the wide file has more than 50 features
    if len(dat.wide.index) > 50:
        wide = dat.wide.sample(n=50, axis=0)
        wide = wide.T
    else:
        wide = dat.wide.T

    # Saving figure
    with PdfPages(args.figure) as pdf:

        # Iterating over groups
        if args.group:

            # Getting colors for groups
            palette.getColors(design=dat.design, groups=[dat.group])

            # Iterating over groups
            for name, group in dat.design.groupby(args.group):
                logger.info(u"Plotting for group {0}".format(name))

                # Plotting Density and Box plot for the group
                plotDensity(data=wide.T[group.index], name=name, pdf=pdf)

        # Get colors for each feature for "All groups"
        logger.info(u"Plotting for group {0}".format("samples"))
        palette.getColors(design=dat.design, groups=[])

        # Plotting density and boxplots for all
        plotDensity(data=wide, name="samples", pdf=pdf)

        #Ending script
        logger.info(u"Ending script")
def main(args):
    # Import data
    logger.info("Importing data with the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID)

    # Cleaning from missing data
    dat.dropMissing()

    # Iterate through each group to add flags for if a group has over half of
    # its data above the cutoff
    logger.info("Running threshold based flags")
    df_offFlags = Flags(index=dat.wide.index)
    for title, group in dat.design.groupby(args.group):
        mask = (dat.wide[group.index] < args.cutoff)
        meanOn = mask.mean(axis=1)
        df_offFlags.addColumn(column='flag_feature_' + title + '_off',
                              mask=meanOn > 0.5)

    logger.info("Creating output")
    df_offFlags.df_flags.to_csv(args.output, sep="\t")
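
The flag rule above is simple enough to sketch on its own: within each group, a feature is flagged "off" when more than half of that group's samples fall below the cutoff. Data below are illustrative.

# Minimal sketch of the threshold-based flag used above.
import pandas as pd

wide = pd.DataFrame(
    {"s1": [10, 900], "s2": [20, 950], "s3": [800, 30]},
    index=["feat_A", "feat_B"])
group_samples = ["s1", "s2", "s3"]    # samples belonging to one group
cutoff = 100

mask = wide[group_samples] < cutoff   # True where the value is below the cutoff
fraction_below = mask.mean(axis=1)    # per-feature fraction of samples below
flag_off = (fraction_below > 0.5).astype(int)
print(flag_off)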
Example no. 14
def main(args):
    # Import data with the interface
    dat = wideToDesign(wide=args.input,
                       design=args.design,
                       uniqID=args.uniqID,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Read flag file
    df_flags = pd.read_table(args.flags)

    # Set the index on the flag file; if none is provided, raise an error
    if args.flagUniqID:
        df_flags.set_index(args.flagUniqID, inplace=True)
    else:
        logger.error("Not flagUniqID provided")
        raise

    # Drop either rows or columns
    logger.info("Running drop flags by {0}".format(args.flagfiletype))
    if args.flagfiletype == "column":
        kpd_wide, kpd_flag = dropColumns(df_wide=dat.wide,
                                         df_flags=df_flags,
                                         cut_value=args.value,
                                         condition=args.condition,
                                         args=args)
    else:
        kpd_wide, kpd_flag = dropRows(df_wide=dat.wide,
                                      df_flags=df_flags,
                                      cut_value=args.value,
                                      condition=args.condition,
                                      args=args)

    # Wide and flags
    kpd_wide.to_csv(args.outWide, sep='\t')
    kpd_flag.to_csv(args.outFlags, sep='\t')

    # Finishing script
    logger.info("Script complete.")
Example no. 15
def main(args):
    # Importing data through the interface
    logger.info("Importing data through wideToDesign data manager")
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Making sure all the groups to drop actually exist on the design column
    if args.group:
        for todrop in args.drops:
            if todrop in list(set(dat.design[args.group].values)):
                pass
            else:
                logger.error("The group '{0}' is not located in the column '{1}' "\
                            "of your design file".format(todrop,args.group))
                raise ValueError

    # If the subsetting is going to be made by group then select the sampleIDs
    # from the design file
    logger.info(u"Getting sampleNames to drop")
    if args.group:
        iToDrop = list()
        for name, group in dat.design.groupby(args.group):
            if name in args.drops:
                iToDrop += (group.index.tolist())
    else:
        iToDrop = args.drops

    # Remove weird characters
    iToDrop = [cleanStr(x) for x in iToDrop]

    # Dropping elements
    selectedDesign = dat.design.drop(iToDrop, axis=0, inplace=False)

    # Output the subset design file
    logger.info("Output design file")
    selectedDesign.to_csv(args.out, sep='\t')
    logger.info("Script Complete!")
Example no. 16
def main(args):
    # Checking if levels
    if args.levels and args.group:
        levels = [args.group]+args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Loading data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group, 
                        anno=args.levels, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Transposing matrix
    dat.wide = dat.wide.T

    # RunPCA
    df_scores, df_loadings, df_summary = runPCA(dat.wide)

    # Plotting 2D and 3D scatter plots
    logger.info(u"Plotting PCA scores")
    with PdfPages(args.figure) as pdfOut:
        plotScatterplot2D(data=df_scores, palette=palette, pdf=pdfOut)
        plotScatterplot3D(data=df_scores, palette=palette, pdf=pdfOut)

    # Save Scores, Loadings and Summary
    df_scores.to_csv(args.score_out, sep="\t", index_label='sampleID')
    df_loadings.to_csv(args.load_out, sep="\t", index_label=dat.uniqID)
    df_summary.to_csv(args.summary_out, sep="\t", index_label="PCs")

    #Ending script
    logger.info(u"Finishing running of PCA")
def main(args):
    #parsing data with interface
    dat = wideToDesign(wide=args.input,
                       design=args.design,
                       uniqID=args.uniqID,
                       group=args.group,
                       logger=logger)

    # Removing groups with just one element from dat
    dat.removeSingle()

    # Create folder for counts if html found
    if args.html is not None:
        logger.info(u"Using html output file")
        folderDir = args.htmlPath
        try:
            os.makedirs(folderDir)
        except Exception as e:
            logger.error("Error. {}".format(e))

        # Initializing the HTML output
        html = createHTML()
        folderDir = folderDir + "/" + args.counts
Example no. 18
def main(args):

    # Loading data through the Interface
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqueID,
                       group=args.group,
                       logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Getting the unique groups and all pairwise combinations
    # so that we can feed them to Kruskal-Wallis.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting number of features.
    number_of_features = data_frame.shape[1] - 1
    # Saving treatment group name from the arguments.

    # Running the overall Kruskal-Wallis test for all group levels combined.

    # Creating p_value_all and flag_value_all for 3 significance levels as zero-filled
    # lists of length equal to number_of_features. These will be used for all groups.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features

    for j in range(0, number_of_features):

        # Creating duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(
            args.group, 1).transpose()
        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
            ddof=1)

        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to ith unique group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, 1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list = data_frame_current_group.index.tolist()

            # Series current for group i and row (feature) j.
            series_current = data_frame_current_group.loc[indexes_list[j]]

            # This piece of code depends on whether it is the first group in the list or not.
            if i == 0:
                series_total = [series_current]
            else:
                series_total.append(series_current)

        # Checking if the compared elements are different.
        # Combining for checking.
        combined_list = data_frame_manipulate_transpose.loc[
            indexes_list_complete[j]].tolist()
        combined_list_unique = np.unique(combined_list)
        # Checking if the number of unique elements is exactly 1.
        if len(combined_list_unique) == 1:
            # All values are identical: the Kruskal-Wallis test is undefined, so assign NaN.
            p_value_all[j] = float("nan")
            H_value_all[j] = float("nan")
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1

        else:
            # Performing Kruskal-Wallis for all groups for feature j.
            kruscal_wallis_args = series_total
            p_value_all[j] = kruskalwallis(*kruscal_wallis_args)[1]
            H_value_all[j] = kruskalwallis(*kruscal_wallis_args)[0]
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1

    # The loop over features has to be finished by now. Converting them into the data frame.
    # The pairwise results will be added later.
    summary_df = pd.DataFrame(data=mean_value_all,
                              columns=["GrandMean"],
                              index=indexes_list)
    summary_df['SampleVariance'] = variance_value_all
    summary_df['H_value_for_all'] = H_value_all
    summary_df['prob_greater_than_H_for_all'] = p_value_all
    flag_df = pd.DataFrame(data=flag_value_all_0p01,
                           columns=["flag_significant_0p01_on_all_groups"],
                           index=indexes_list)
    flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05
    flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10

    # Informing that KW for all groups has been performed.
    logger.info(
        u"Kruskal-Wallis test for all groups together has been performed.")

    # Computing means for each group.
    # This part just produces summary statistics for the output table.
    # This has nothing to do with Kruskal-Wallis.

    for i in range(0, number_of_unique_groups):

        # Extracting the pieces of the data frame that belong to ith group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin(
            [group_values_series_unique[i]])]

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_current_group = data_frame_current_group.drop(
            args.group, 1).transpose()
        # Pulling indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Creating array of means for the current group that will be filled.
        means_value = [0] * number_of_features

        for j in range(0, number_of_features):

            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
            i]
        summary_df[means_value_column_name_current] = means_value

    # Running pairwise Kruskal-Wallis tests for all pairs of group levels that are saved in groups_pairwise.

    for i in range(0, number_of_groups_pairwise):

        # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[1]])]

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_first_group = data_frame_first_group.drop(args.group,
                                                             1).transpose()
        data_frame_second_group = data_frame_second_group.drop(args.group,
                                                               1).transpose()
        # Pulling indexes list from the first one (they are the same)
        indexes_list = data_frame_first_group.index.tolist()

        # Creating p_value, neg_log10_p_value, flag_value, difference_value lists filled with 0s.
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):

            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Checking if the compared elements are different.
            # Combining for checking.
            first_list = data_frame_first_group.loc[indexes_list[j]].tolist()
            second_list = data_frame_second_group.loc[indexes_list[j]].tolist()
            combined_list = first_list + second_list
            combined_list_unique = np.unique(combined_list)
            # Checking if the number of unique elements is exactly 1.
            if len(combined_list_unique) == 1:
                p_value[j] = float("nan")
                H_value[j] = float("nan")
                # Possible alternative for two groups.
                # p_value[j] = kruskalwallis(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean(
                )
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            else:
                kruscal_wallis_args = [series_first, series_second]
                p_value[j] = kruskalwallis(*kruscal_wallis_args)[1]
                H_value[j] = kruskalwallis(*kruscal_wallis_args)[0]
                # Possible alternative for two groups.
                # p_value[j] = kruskalwallis(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean(
                )
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Adding current p_value and flag_value column to the data frame and assigning the name
        p_value_column_name_current = 'prob_greater_than_H_for_diff_' + groups_subset[
            0] + '_' + groups_subset[1]
        H_value_column_name_current = 'H_value_for_diff_' + groups_subset[
            0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[
            0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[
            0] + '_' + groups_subset[1]
        summary_df[p_value_column_name_current] = p_value
        summary_df[H_value_column_name_current] = H_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[
            0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[
            0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[
            0] + '_' + groups_subset[1]
        flag_df[flag_value_column_name_current_0p01] = flag_value_0p01
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Informing that KW for all pairwise group comparisons has been performed.
    logger.info(
        u"Kruskal-Wallis test for all groups pairwise has been performed.")

    # Generating Indexing for volcano plots.

    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of treatment means for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing running of Kruscal-Wallis tests.")
Example no. 19
def main(args):
    #Get R ready
    # Get current path
    myPath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    # Establish path for the LASSO script
    my_r_script_path = os.path.join(myPath, "lasso_enet.R")
    logger.info(my_r_script_path)

    # Activate pandas2ri
    pandas2ri.activate()

    # Loading the LASSO R script
    with open(my_r_script_path, 'r') as f:
        rFile = f.read()
    lassoEnetScript = STAP(rFile, "lasso_enet")

    # Importing data through the interface
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       group=args.group,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Transposing data
    dat.trans = dat.transpose()
    dat.trans.columns.name = ""

    # Dropping samples whose group is 'nan' from the design
    removed = dat.design[dat.design[dat.group] == "nan"]
    dat.design = dat.design[dat.design[dat.group] != "nan"]
    dat.trans.drop(removed.index.values, axis=0, inplace=True)

    logger.info("{0} removed from analysis".format(removed.index.values))
    dat.design.rename(columns={dat.group: "group"}, inplace=True)
    dat.trans.rename(columns={dat.group: "group"}, inplace=True)

    #Generate a group List
    groupList = [
        title for title, group in dat.design.groupby("group")
        if len(group.index) > 2
    ]

    #Turn group list into pairwise combinations
    comboMatrix = np.array(list(it.combinations(groupList, 2)))
    comboLength = len(comboMatrix)

    #Run R
    correct_list_of_names = np.array(dat.trans.columns.values.tolist())
    returns = lassoEnetScript.lassoEN(dat.trans, dat.design, args.uniqID,
                                      correct_list_of_names, comboMatrix,
                                      comboLength, args.alpha, args.plots)
    robjects.r['write.table'](returns[0],
                              file=args.coefficients,
                              sep='\t',
                              quote=False,
                              row_names=False,
                              col_names=True)
    robjects.r['write.table'](returns[1],
                              file=args.flags,
                              sep='\t',
                              quote=False,
                              row_names=False,
                              col_names=True)
    # Finishing
    logger.info("Script Complete!")
Example no. 20
def main(args):

    # Loading data through Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Unpaired permuted t-test. In this case there can be any number of groups.
    # Order variable is ignored and t-tests are performed pairwise for each pair of groups.

    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Getting the unique groups and all pairwise combinations to feed to pairwise unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique,2) ) 
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting number of features.
    # This variable is not used in the unpaired test; it just adds an extra column to the data frame.
#    if args.order == False:
    number_of_features = data_frame.shape[1] - 1

    # Saving treatment group name from the arguments.
    # Computing overall summaries (mean and variance).
    # This part just produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features

    for j in range(0, number_of_features ):
        # Creating duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        # We should either drop 1 or 2 columns depending whether we fed the second one.
        data_frame_manipulate_transpose  = data_frame_manipulate.drop(  args.group, 1 ).transpose()

        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)

    # Creating the table and putting the results there.
    summary_df = pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
    summary_df['SampleVariance'] =  variance_value_all


    # Computing means for each group and outputting them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups ):
       # Extracting the pieces of the data frame that belong to the ith group.
       data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

       # Dropping columns that characterize group. Only feature columns will remain.
       # We also transpose here so it will be easier to operate with.
       # We should either drop 1 or 2 columns depending whether we fed the second one.
       data_frame_current_group  = data_frame_current_group.drop(  args.group, 1 ).transpose()

       # Pulling indexes list from the current group.
       indexes_list = data_frame_current_group.index.tolist()

       # Creating array of means for the current group that will be filled.
       means_value  = [0] * number_of_features

       for j in range(0, number_of_features ):
           series_current = data_frame_current_group.loc[ indexes_list[j] ] 
           means_value[j] = series_current.mean()

       # Adding current mean_value column to the data frame and assigning the name.
       means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
       summary_df[means_value_column_name_current] = means_value


    # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise ):
       # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
       groups_subset = groups_pairwise[i]
       data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]]  )]
       data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]]  )]

       # Dropping columns that characterize group. Only feature columns will remain.
       # We also transpose here so it will be easier to operate with.
       # We should either drop 1 or 2 columns depending whether we fed the second one.
       data_frame_first_group  = data_frame_first_group.drop(  args.group, 1 ).transpose()
       data_frame_second_group = data_frame_second_group.drop( args.group, 1 ).transpose()

       # Pulling indexes list from the first one (they are the same)
       indexes_list = data_frame_first_group.index.tolist()

       # Creating p_value, neg_log10_p_value, flag_value, difference_value lists filled with 0s.
       p_value           = [0] * number_of_features
       t_value           = [0] * number_of_features
       neg_log10_p_value = [0] * number_of_features
       flag_value_0p01   = [0] * number_of_features
       flag_value_0p05   = [0] * number_of_features
       flag_value_0p10   = [0] * number_of_features
       difference_value  = [0] * number_of_features

       for j in range(0, number_of_features ):
           series_first  = data_frame_first_group.loc[ indexes_list[j] ]
           series_second = data_frame_second_group.loc[ indexes_list[j] ]

           p_value[j] = two_sample(series_first, series_second, reps=int(args.reps), stat='t', alternative='two-sided', seed=None)[0]    
#           print j
#           print p_value[j]
           t_value[j] = two_sample(series_first, series_second, reps=int(args.reps), stat='t', alternative='two-sided', seed=None)[1]
#           print j
#           print t_value[j]

           neg_log10_p_value[j] = - np.log10(p_value[j])
           difference_value[j] = series_first.mean() - series_second.mean()
           if p_value[j] < 0.01: flag_value_0p01[j] = 1
           if p_value[j] < 0.05: flag_value_0p05[j] = 1
           if p_value[j] < 0.10: flag_value_0p10[j] = 1

       # Creating column names for the data frame.
       p_value_column_name_current           = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
       t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
       neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
       difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
       flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] 
       flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] 
       flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] 

       # Adding current p_value and flag_value column to the data frame and assigning the name.
       # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it.
       if i == 0:
          flag_df = pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
       else:
          flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

       # At this point data frame exists so only columns are added to the existing data frame.
       summary_df[p_value_column_name_current]           = p_value
       summary_df[t_value_column_name_current]           = t_value
       summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
       summary_df[difference_value_column_name_current]  = difference_value
       flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
       flag_df[flag_value_column_name_current_0p10] = flag_value_0p10




    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name    =  args.uniqueID
    flag_df.index.name =  args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")



    # Generating Indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
        if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
        if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff=2

    # Making volcano plots
    with PdfPages( args.volcano ) as pdf:
        for i in range(0, number_of_groups_pairwise ):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key =  groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
Example no. 21
def main(args):

    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, 
                        runOrder=args.order, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)
    
    # Cleaning from missing data
    dat.dropMissing()


    # SCENARIO 1: Unpaired t-test. In this case there can be any number of groups.
    # Order variable is ignored and t-tests are performed pairwise for each pair of groups.

    if args.pairing == "unpaired":
       logger.info("Unpaired t-test will be performed for all groups pairwise.")
  

       # Getting the unique groups and all pairwise combinations
       # so that we can feed them to pairwise unpaired t-tests.
       group_values_series = dat.transpose()[dat.group].T.squeeze()
       group_values_series_unique = group_values_series.unique()
       number_of_unique_groups = group_values_series_unique.shape[0]
       groups_pairwise = list(combinations(group_values_series_unique,2) ) 
       number_of_groups_pairwise = len(groups_pairwise)

       # Extracting data from the interface.
       data_frame = dat.transpose()
       # Extracting number of features. This will depend on whether the user has provided ordering variable or not.
       # This variable is not used in the unpaired test; it just adds an extra column to the data frame.
       if args.order == False:
          number_of_features = data_frame.shape[1] - 1
       else:
          number_of_features = data_frame.shape[1] - 2
       # Saving treatment group name from the arguments.



       # Computing overall summaries (mean and variance).
       # This part just produces summary statistics for the output table.
       # This has nothing to do with unpaired t-test. This is just summary for the table.
       mean_value_all = [0] * number_of_features
       variance_value_all = [0] * number_of_features

       for j in range(0, number_of_features ):
  

           # Creating duplicate for manipulation.
           data_frame_manipulate = data_frame

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We should either drop 1 or 2 columns depending whether we fed the second one.
           if args.order == False:
              data_frame_manipulate_transpose  = data_frame_manipulate.drop(  args.group, 1 ).transpose()
           else:
              data_frame_manipulate_transpose  = data_frame_manipulate.drop(  [args.group, args.order], 1 ).transpose()
           # Pulling indexes list from the current data frame.
           indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

           # Computing dataset summaries.
           mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
           variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)



       # Creating the table and putting the results there.
       summary_df     =  pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
       summary_df['SampleVariance'] =  variance_value_all


       # Computing means for each group and outputting them.
       # This part just produces summary statistics for the output table.
       # This has nothing to do with unpaired t-test. This is just summary for the table.

       for i in range(0, number_of_unique_groups ):
        

           # Extracting the pieces of the data frame that belong to the ith group.
           data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We should either drop 1 or 2 columns depending whether we fed the second one.
           if args.order == False:
              data_frame_current_group  = data_frame_current_group.drop(  args.group, 1 ).transpose()
           else:
              data_frame_current_group  = data_frame_current_group.drop(  [args.group, args.order], 1 ).transpose()

           # Pulling indexes list from the current group.
           indexes_list = data_frame_current_group.index.tolist()

           # Creating array of means for the current group that will be filled.
           means_value  = [0] * number_of_features
    
           for j in range(0, number_of_features ):
  
               series_current = data_frame_current_group.loc[ indexes_list[j] ] 
               means_value[j] = series_current.mean()


           # Adding current mean_value column to the data frame and assigning the name.
           means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
           summary_df[means_value_column_name_current] = means_value
           
           


       # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
       for i in range(0, number_of_groups_pairwise ):
        
           # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
           groups_subset = groups_pairwise[i] 
           data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]]  )]
           data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We should either drop 1 or 2 columns depending whether we fed the second one.
           if args.order == False:
              data_frame_first_group  = data_frame_first_group.drop(  args.group, 1 ).transpose()
              data_frame_second_group = data_frame_second_group.drop( args.group, 1 ).transpose()
           else:
              data_frame_first_group  = data_frame_first_group.drop(  [args.group, args.order], 1 ).transpose()
              data_frame_second_group = data_frame_second_group.drop( [args.group, args.order], 1 ).transpose()

           # Pulling indexes list from the first one (they are the same)
           indexes_list = data_frame_first_group.index.tolist()

           # Creating p_value, t_value, neg_log10_p_value, flag_value, and difference_value lists filled with zeros.
           p_value           = [0] * number_of_features
           t_value           = [0] * number_of_features
           neg_log10_p_value = [0] * number_of_features
           flag_value_0p01   = [0] * number_of_features
           flag_value_0p05   = [0] * number_of_features
           flag_value_0p10   = [0] * number_of_features
           difference_value  = [0] * number_of_features


           for j in range(0, number_of_features ):
       
               series_first  = data_frame_first_group.loc[ indexes_list[j] ] 
               series_second = data_frame_second_group.loc[ indexes_list[j] ]

               ttest_ind_args = [series_first, series_second]
               p_value[j] = ttest_ind( *ttest_ind_args )[1]
               t_value[j] = ttest_ind( *ttest_ind_args )[0]
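               # scipy.stats.ttest_ind returns (t-statistic, two-sided p-value) and
               # assumes equal group variances by default (equal_var=True).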
               # Possible alternative for two groups.
               # p_value[j] = ttest_ind_args(series_first, series_second)[1]
               neg_log10_p_value[j] = - np.log10(p_value[j])
               difference_value[j] = series_first.mean() - series_second.mean()
               if p_value[j] < 0.01: flag_value_0p01[j] = 1
               if p_value[j] < 0.05: flag_value_0p05[j] = 1
               if p_value[j] < 0.10: flag_value_0p10[j] = 1


           # Creating column names for the data frame.
           p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
           t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
           neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
           difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
           flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] 
           flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] 
           flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] 

           # Adding current p_value and flag_value column to the data frame and assigning the name.
           # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it.
           if i == 0:
              flag_df     =  pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
           else:
              flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

           # At this point data frame exists so only columns are added to the existing data frame.
           summary_df[p_value_column_name_current]           = p_value
           summary_df[t_value_column_name_current]           = t_value
           summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
           summary_df[difference_value_column_name_current]  = difference_value
           flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
           flag_df[flag_value_column_name_current_0p10] = flag_value_0p10
  


    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO groups.
    # Each sample in one group should have exactly one matching pair in the other group.
    # The matching is controlled by args.order variable.

    if args.pairing == "paired":
       logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))


       # Getting the number of unique groups. If it is bigger than 2 return the warning and exit.
       group_values_series = dat.transpose()[dat.group].T.squeeze()
       group_values_series_unique = group_values_series.unique()
       number_of_unique_groups = group_values_series_unique.shape[0]
       if number_of_unique_groups != 2:
          logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups) )
          exit()	
 
       # This piece of code will be executed only if the number_of_unique_groups is exactly 2 so the group check is passed. 

       # Creating pairwise combination of our two groups that we will use in the future.
       groups_pairwise = list( combinations(group_values_series_unique,2) ) 
       number_of_groups_pairwise = len(groups_pairwise)

       # Extracting data from the interface.
       data_frame = dat.transpose()
       # Extracting number of features. This will depend on whether the user has provided ordering variable or not.
       # Checking that the required pairing variable has been provided.
       if args.order == False:
          logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
          exit()	


       # This piece of code will be executed only if the args.order has been provided and the check is passed. 

       # Defining the number of features. It is the number of data frame columns minus the 2 columns that stand for args.group and args.order.
       number_of_features = data_frame.shape[1] - 2

       # At this point it is confirmed that there are only 2 groups and that the pairing variable args.order has been provided.
       # Now we need to check that pairing is correct i.e. that each pairID corresponds to only two samples from different groups.

       # Getting the unique pairIDs so that we can later remove those that do not have exactly two samples.
       pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
       pairid_values_series_unique = pairid_values_series.unique()
       number_of_unique_pairid = pairid_values_series_unique.shape[0]


       # Extracting data from the interface.
       data_frame = dat.transpose()
  
       # Extracting the number of samples in the final frame.
       number_of_samples = data_frame.shape[0]


       # Performing the cleaning of the original data. We are removing samples that are not paired and not belonging to the two groups.
       # If the dataset has 1 or 3 or more matches for a pairid those samples are removed with a warning.
       # If a pairID corresponds to exactly two samples (which is correct) but the groupIDs are NOT different, those samples will also be removed.
       for i in range(0, number_of_unique_pairid ):
       
           # Extracting the pieces of the data frame that belong to ith unique pairid.
           data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin( [ pairid_values_series_unique[i] ]  )]

           # We transpose here so it will be easier to operate with.
           data_frame_current_pairid  = data_frame_current_pairid.transpose()
           sample_names_current_pairid = list(data_frame_current_pairid.columns.values)
       
           if data_frame_current_pairid.shape[1] != 2:

              # Pulling indexes list from the current data frame.
              logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i],
                               data_frame_current_pairid.shape[1], sample_names_current_pairid)  )

              # Getting indexes we are trying to delete.
              boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid )  
              # Deleting the indexes and in the for loop going to next iteration.
              data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)
    
           # This piece is executed if the number is correct i.e. data_frame_current_pairid.shape[1] == 2.
           # Here we are checking if the groupID-s for the given pair are indeed different.

           elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:

                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(pairid_values_series_unique[i],       		                         data_frame_current_pairid.transpose()[args.group][1], data_frame_current_pairid.transpose()[args.group][0], sample_names_current_pairid)  )
                   
                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid )  
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)


        
       # Checking if the data frame became empty after cleaning.
       if data_frame.shape[0] == 0:
          logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the design file for accuracy! Exiting the program."  )
          exit()	
   


       # Computing overall summaries (mean and variance).
       # This part just produces summary statistics for the output table.
       # This has nothing to do with paired t-test. This is just summary for the table.
       mean_value_all = [0] * number_of_features
       variance_value_all = [0] * number_of_features

       for j in range(0, number_of_features ):

           # Creating duplicate for manipulation.
           data_frame_manipulate = data_frame

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           data_frame_manipulate_transpose  = data_frame_manipulate.drop( [args.group,args.order], 1 ).transpose()
           # Pulling indexes list from the current data frame.
           indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

           # Computing dataset summaries.
           mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
           variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)



       # Creating the table and putting the results there.
       summary_df     =  pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
       summary_df['SampleVariance'] =  variance_value_all


       # Computing means for each group and outputting them.
       # This part just produces summary statistics for the output table.
       # This has nothing to do with paired t-test. This is just summary for the table.

       for i in range(0, number_of_unique_groups ):
        

           # Extracting the pieces of the data frame that belong to the ith group.
           data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           data_frame_current_group  = data_frame_current_group.drop(  [args.group, args.order], 1 ).transpose()

           # Pulling indexes list from the current group.
           indexes_list = data_frame_current_group.index.tolist()

           # Creating array of means for the current group that will be filled.
           means_value  = [0] * number_of_features
    
           for j in range(0, number_of_features ):
  
               series_current = data_frame_current_group.loc[ indexes_list[j] ] 
               means_value[j] = series_current.mean()


           # Adding current mean_value column to the data frame and assigning the name.
           means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
           summary_df[means_value_column_name_current] = means_value




       # Performing paired t-test for the two groups and saving the results.

       # Creating p_value and flag_value lists of length number_of_features, filled with zeros.
       # These will be used for the two groups in the paired t-test.
       p_value = [0] * number_of_features
       t_value = [0] * number_of_features
       flag_value_0p01   = [0] * number_of_features
       flag_value_0p05   = [0] * number_of_features
       flag_value_0p10   = [0] * number_of_features
       neg_log10_p_value = [0] * number_of_features
       difference_value  = [0] * number_of_features

       # Performing paired t-test for each pair of features.
       for j in range(0, number_of_features ):

  
           # Extracting the pieces of the data frame that belong to 1st group.
           data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[0]]  )]
           data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[1]]  )]

        
           # Sorting both data frames by the pairing variable args.order.
           # This will ensure datasets are aligned by pair when fed to the t-test.
           data_frame_first_group  = data_frame_first_group.sort_values(by=args.order)
           data_frame_second_group = data_frame_second_group.sort_values(by=args.order)


           # Dropping the group and pairing columns. Only feature columns will remain.
           data_frame_first_group  = data_frame_first_group.drop(  [args.group,args.order], 1 ).transpose()
           data_frame_second_group = data_frame_second_group.drop( [args.group,args.order], 1 ).transpose()
         
           # Pulling list of indexes. This is the same list for the first and for the second.
           indexes_list = data_frame_first_group.index.tolist()

           # Pulling the samples out.
           series_first  = data_frame_first_group.loc[ indexes_list[j] ] 
           series_second = data_frame_second_group.loc[ indexes_list[j] ]


           # Running t-test for the two given samples
           paired_ttest_args = [series_first, series_second]
           p_value[j] = ttest_rel( *paired_ttest_args )[1]
           t_value[j] = ttest_rel( *paired_ttest_args )[0]
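           # scipy.stats.ttest_rel runs the paired (dependent-samples) t-test on the
           # per-pair differences; the two series must be aligned and of equal length.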
           neg_log10_p_value[j] = - np.log10(p_value[j])
           difference_value[j] = series_first.mean() - series_second.mean()
           if p_value[j] < 0.01: flag_value_0p01[j] = 1
           if p_value[j] < 0.05: flag_value_0p05[j] = 1
           if p_value[j] < 0.10: flag_value_0p10[j] = 1


       # By this point the loop over features has finished. Converting the results into data frames.


       # Creating column names for the data frame.
       p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       t_value_column_name_current           = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       difference_value_column_name_current  = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
       flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
       flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'


       summary_df[t_value_column_name_current] = t_value
       summary_df[p_value_column_name_current] = p_value
       summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
       summary_df[difference_value_column_name_current] = difference_value

       flag_df  =  pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
       flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
       flag_df[flag_value_column_name_current_0p10] = flag_value_0p10



   
    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding the name of the unique ID column that was there originally.
    summary_df.index.name    =  args.uniqueID
    flag_df.index.name =  args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")




    # Generating Indexing for volcano plots.

    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff=2
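    # A -log10(p-value) cutoff of 2 corresponds to flagging features with p < 0.01.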



    # Making volcano plots
    with PdfPages( args.volcano ) as pdf:
         for i in range(0, number_of_groups_pairwise ):
             # Set Up Figure
             volcanoPlot = figureHandler(proj="2d")


             groups_subset = groups_pairwise[i] 
             current_key =  groups_subset[0] + '_' + groups_subset[1]
             
             # Plot all results
             scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), 
                                colorList=list('b'), ax=volcanoPlot.ax[0])

             # Color results beyond the threshold red
             cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff]
             if not cutLpvals.empty:
                    cutDiff = difs[current_key][cutLpvals.index]
                    scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), 
                                      colorList=list('r'), ax=volcanoPlot.ax[0])

             # Drawing cutoffs
             lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

             # Format axis (volcanoPlot)
             volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                 yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                 xTitle="Difference of treatment means for {0}".format(current_key))

             # Add figure to PDF
             volcanoPlot.addToPdf(pdfPages=pdf)
  
    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")



    # Ending script
    logger.info(u"Finishing running of t-test.")
def main(args):
    """ 
    Main Script 
    """
    #Checking if levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Parsing data with interface
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       group=args.group,
                       anno=args.levels,
                       logger=logger,
                       runOrder=args.order)

    # Removing groups with just one sample and then clean from missing data.
    dat.removeSingle()
    dat.dropMissing()

    # Select colors for data (dataPalette.design contains a copy of dat.design with an
    # additional column for colors).
    dataPalette.getColors(design=dat.design, groups=levels)

    # Getting list of indexes to subset the wide file
    if args.group:
        disGroups = [(group.index, level) for level, group in
                     dataPalette.design.groupby(dataPalette.combName)]
    else:
        disGroups = [(dat.design.index, "samples")]

    # Iterating over subgroups
    pairwise_disCuts = list()
    toMean_disCuts = list()
    for indexes, name in disGroups:
        # If less than 3 elements in the group skip to the next
        if len(indexes) < 3:
            logger.error("Group {0} has less than 3 elements, it will not be"\
                        " included in the analysis".format(level))
            continue

        #Subsetting wide
        currentFrame = pd.DataFrame(dat.wide[indexes].copy())
        currentFrame.name = name

        # Calculate Penalized Sigma
        penalizedSigma = calculatePenalizedSigma(data=currentFrame,
                                                 penalty=args.penalty)

        # Calculate Distances (dis stands for distance)
        disToMean, disPairwise = calculateDistances(data=currentFrame,
                                                    V_VI=penalizedSigma)

        # Calculate cutoffs
        cutoff1, cutoff2 = calculateCutoffs(currentFrame, args.p)

        # Appending results
        pairwise_disCuts.append([disPairwise, cutoff2])
        toMean_disCuts.append([disToMean, cutoff1])

    if args.group:
        # Splitting results to mean and pairwise
        pairwise_dis = [distance for distance, cutoff in pairwise_disCuts]
        toMean_dis = [distance for distance, cutoff in toMean_disCuts]

        # Merging to get distance for all pairwise
        pairwise_dis_all = pd.DataFrame(columns=["group"])
        for dis in pairwise_dis:
            dis.loc[:, "group"] = [dis.name] * len(dis.columns)
            pairwise_dis_all = pd.DataFrame.merge(pairwise_dis_all,
                                                  dis,
                                                  on=["group"],
                                                  left_index=True,
                                                  right_index=True,
                                                  how='outer',
                                                  sort=False)
        pairwise_dis_all.sort_values(by="group", inplace=True)
        pairwise_dis_all.drop("group", axis=1, inplace=True)
        pairwise_dis_all.name = "samples"

        # Merging to get distance for all to mean
        toMean_dis_all = pd.DataFrame(columns=["group", "distance_to_mean"])
        for dis in toMean_dis:
            dis.loc[:, "group"] = [dis.name] * len(dis.columns)
            toMean_dis_all = pd.DataFrame.merge(
                toMean_dis_all,
                dis,
                on=['distance_to_mean', 'group'],
                left_index=True,
                right_index=True,
                how='outer',
                sort=False)
        toMean_dis_all.sort_values(by="group", inplace=True)
        toMean_dis_all.drop("group", axis=1, inplace=True)
        toMean_dis_all.name = "samples"

        # Getting cutoffs for distances
        cutoff1, cutoff2 = calculateCutoffs(dat.wide, args.p)

        # Appending toMean_dis_all and pairwise_dis_all to toMean_dis_cuts and
        # pairwise_dis_cuts respectively.
        toMean_disCuts.append([toMean_dis_all, cutoff1])
        pairwise_disCuts.append([pairwise_dis_all, cutoff2])

    # Iterating over each pair of (distance,cutoff) for toMean and pairwise to
    # plot  distances.
    with PdfPages((args.figure)) as pdf:
        # Iterating over toMean,pairwise distances in parallel
        for toMean, pairwise in zip(toMean_disCuts, pairwise_disCuts):
            # Making plots
            plotDistances(df_distance=toMean[0],
                          palette=dataPalette,
                          p=args.p,
                          plotType="Scatterplot",
                          disType="Mahalanobis",
                          cutoff=toMean[1],
                          pdf=pdf)
            plotDistances(df_distance=pairwise[0],
                          palette=dataPalette,
                          p=args.p,
                          plotType="Scatterplot",
                          disType="Mahalanobis",
                          cutoff=pairwise[1],
                          pdf=pdf)
            plotDistances(df_distance=pairwise[0],
                          palette=dataPalette,
                          p=args.p,
                          plotType="Box-plots",
                          disType="Mahalanobis",
                          cutoff=pairwise[1],
                          pdf=pdf)

    # Since it is a list of dataframes and we are only interested in the last one,
    # we use [-1] to access it and [0] to get it out of the list.
    # Outputting distances to mean and pairwise
    toMean_disCuts[-1][0].to_csv(args.toMean, index_label="sampleID", sep='\t')
    pairwise_disCuts[-1][0].to_csv(args.pairwise,
                                   index_label="sampleID",
                                   sep='\t')

    # Ending script
    logger.info("Script complete.")
Example no. 23
def main(args):
    # Import data
    dat = wideToDesign(args.input,args.design,args.uniqID,logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Generate formula
    preFormula,categorical,numerical,levels,dat.design = preProcessing(design=dat.design,
                        factorTypes=args.ftypes, factorNames=args.factors)

    # Transpose data
    dat.trans  = dat.transpose()

    # if interactions
    if args.interactions:
        logger.info("Running ANOVA on interactions")
        dat.trans["_treatment_"] = dat.trans.apply(lambda x: \
                                "_".join(map(str,x[categorical].values)),axis=1)

        dat.design["_treatment_"] = dat.design.apply(lambda x: \
                                "_".join(map(str,x[categorical].values)),axis=1)

        # If there are numerical covariates, add them to the formula
        if len(numerical)>0:
            formula = ["C(_treatment_)"]+numerical
        else:
            formula = ["C(_treatment_)"]

        # Concatenating the formula
        formula = "+".join(formula)

        # Getting new formula for interactions
        dictFormula = {feature:"{0}~{1}".format(str(feature),formula) \
                    for feature in dat.wide.index.tolist()}

        # Creating the sorted list of treatment levels
        levels=sorted(list(set(dat.trans["_treatment_"].tolist())))

        # Creating levelCombinations
        reverseLevels = copy.copy(levels)
        reverseLevels.reverse()
        lvlComb = list()
        generateDinamicCmbs([levels],lvlComb)

        # Running anova
        logger.info('Running anova models')
        results,significant,residDat,fitDat = runANOVA(dat=dat, categorical=["_treatment_"],
                                levels=[levels], lvlComb=lvlComb, formula=dictFormula, 
                                numerical=numerical)
    else:
        logger.info("Running ANOVA without interactions")
        # Create combination of groups
        nLevels =  [list(itertools.chain.from_iterable(levels))]
        reverseLevels = copy.copy(nLevels)
        reverseLevels.reverse()
        lvlComb = list()
        generateDinamicCmbs(reverseLevels,lvlComb)

        # Maps every metabolite to its formulas
        dictFormula = {feature:"{0}~{1}".format(str(feature),preFormula) for feature \
                        in dat.wide.index.values}

        # running anova
        logger.info('Running anova models')
        results,significant,residDat,fitDat = runANOVA(dat=dat, categorical=categorical,
                                levels=levels, lvlComb=lvlComb, formula=dictFormula, 
                                numerical=numerical)

    # QQ plots    
    logger.info('Generating q-q plots.')
    qqPlot(residDat.T, fitDat.T, args.ofig)

    # Generate Volcano plots
    logger.info('Generating volcano plots.')
    volcano(lvlComb, results, args.ofig2)

    # Round results to 4 digits and save
    results = results.round(4)
    results.index.name = dat.uniqID
    results.to_csv(args.oname, sep="\t")

    # Flags
    significant.index.name = dat.uniqID
    significant.to_csv(args.flags, sep="\t")
Example no. 24
def main(args):
    # Import data
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       logger=logger)

    # Get a list of samples to process, if processOnly is specified only
    # analyze specified group.
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Create dataframe with sampleIDs that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get list of pairwise combinations. If group is specified, only do
    # within group combinations.
    combos = list()
    if args.group:
        # If group is given, only do within group pairwise combinations
        logger.info('Only doing within group, pairwise comparisons.')
        for groupName, dfGroup in dat.design.groupby(dat.group):
            combos.extend(list(combinations(dfGroup.index, 2)))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        # Get all pairwise combinations for all samples
        combos.extend(list(combinations(dat.sampleIDs, 2)))

    # Open a multiple page PDF for plots
    ppBA = PdfPages(args.baName)

    # Loop over combinations and generate plots and return a list of flags.
    logger.info('Generating flags and plots.')
    flags = list(map(lambda combo: iterateCombo(dat, combo, ppBA), combos))

    # Close PDF with plots
    ppBA.close()

    # Merge flags
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags
    logger.info('Summarizing outlier flags.')
    propSample, propFeature, propSample_p, propFeature_p, propSample_c, propFeature_c, propSample_d, propFeature_d = summarizeFlags(
        dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample level flags
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite level flags
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish Script
    logger.info("Script Complete!")
Example no. 25
def main(args):
    # If the user provides a grouping variable we test each group against the null (mu supplied by the user; 0 is the default).
    if args.group != False:
        logger.info(
            u"""t-test will be performed for all groups saved in [{0}] variable in the desing file pairwise with the H_0: mu = {1}."""
            .format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           group=args.group,
                           logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning from the missing data.
        dat.dropMissing()

        # Getting the unique group values so that we can feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting number of features. We subtract 1 since we have provided args.group
        number_of_features = data_frame.shape[1] - 1
        # Saving treatment group name from the arguments.

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(
                args.group, 1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist(
            )

            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running single sample t-test for all groups.
        # We are also computing means for each group and outputting them.
        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, 1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating array of means for the current group that will be filled.
            # Creating p_value, difference_value, neg_log10_p_value, t_value, and flag_value lists filled with zeros.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing one sample t-test
                ttest_1samp_args = [series_current, float(args.mu)]
                p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
                t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
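                # scipy.stats.ttest_1samp tests whether the mean of series_current differs
                # from the user-supplied null value args.mu and returns (t-statistic, two-sided p-value).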
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
                i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[
                i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[
                i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[
                i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[
                i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[
                i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[
                i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[
                i] + '_' + args.mu

            # Adding flag_value column to the data frame and assigning the name.
            # If the data frame for flags has not been created yet we create it on the fly. i.e. if i == 0 create it.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point data frames (summary and flags) exist so only columns are added to the existing data frame.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[
                neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable we test the whole dataset as a single group against the null (mu supplied by the user; 0 is the default).
    if args.group == False:
        logger.info(
            u"""t-test will be performed for the entire dataset since goruping variable was not provided."""
        )

        # Loading data through the interface
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           logger=logger)

        # Treat everything as numeric
        dat.wide = dat.wide.applymap(float)

        # Cleaning from missing data
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since we did not feed any grouping variable it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Extracting number of features. We do not subtract 1 since we have not provided args.group
        number_of_features = data_frame.shape[1]
        # Saving treatment group name from the arguments.

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with single sample t-test. This is just summary for the table.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        # Creating array of means for the current group that will be filled.
        # Creating p_value, t_value, neg_log10_p_value, flag_value, and difference_value lists filled with zeros.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        for j in range(0, number_of_features):
            # We transpose here so data will be easier to operate on.
            data_frame_manipulate_transpose = data_frame.transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist(
            )

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

            # Performing one sample t-test for the entire dataset.
            ttest_1samp_args = [
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                float(args.mu)
            ]
            p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
            t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding result columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding the name of the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating Indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2
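    # A -log10(p-value) cutoff of 2 corresponds to flagging features with p < 0.01.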

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable is provided.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of the means from H0 for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
Example no. 26
def main(args):
    target = wideToDesign(wide=args.test_wide,
                          design=args.test_design,
                          uniqID=args.uniqID,
                          group=args.group,
                          logger=logger)
    train = wideToDesign(wide=args.train_wide,
                         design=args.train_design,
                         uniqID=args.uniqID,
                         group=args.group,
                         logger=logger)
    train.wide = train.wide.applymap(float)
    target.wide = target.wide.applymap(float)
    train.dropMissing()
    train = train.transpose()
    target.dropMissing()
    target = target.transpose()
    for i in target.columns:
        if i not in train.columns:
            del target[i]
    cv_status = args.cross_validation
    kernel_final = args.kernel
    gamma_final = float(args.a)
    coef0_final = float(args.b)
    degree_final = int(args.degree)

    # Defining the data to use for the model training.
    train_classes_to_feed = train[args.group].copy()
    train_data_to_feed = train
    del train_data_to_feed[args.group]
    # Defining the data to use for the model target.
    target_classes_to_feed = target[args.group].copy()
    target_data_to_feed = target
    del target_data_to_feed[args.group]
    if cv_status == "none":
        logger.info(u"Using the value of C specified by the user.")
        C_final = float(args.C)
    if cv_status == "single":
        logger.info(u"Using the value of C determined via a single \
                    cross-validation.")
        if (len(train_classes_to_feed) < 100):
            logger.info(u"The required number of samples for a single \
                        cross-validation procedure is at least 100. The \
                        dataset has {0}.".format(len(train_classes_to_feed)))
            logger.info(u"Exiting the tool.")
            exit(1)
        C_lower = float(args.C_lower_bound)
        C_upper = float(args.C_upper_bound)
        C_list_of_values = np.linspace(C_lower, C_upper, 20)
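        # np.linspace builds a uniform grid of 20 candidate C values between the
        # user-supplied lower and upper bounds (both endpoints included).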
        gamma_param_dict = {
            "kernel": [kernel_final],
            "C": C_list_of_values,
            "gamma": [gamma_final],
            "coef0": [coef0_final],
            "degree": [degree_final]
        }
        auto_gamma_param_dict = {
            "kernel": [kernel_final],
            "C": C_list_of_values,
            "gamma": ["auto"],
            "coef0": [coef0_final],
            "degree": [degree_final]
        }
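        # GridSearchCV performs an exhaustive search over the parameter grid with
        # internal k-fold cross-validation when fit() is called; the selected C is
        # then read from best_params_.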
        try:
            logger.info("Running SVM model")
            internal_cv = GridSearchCV(estimator=SVC(),
                                       param_grid=gamma_param_dict)
        except ValueError:
            logger.info("Model failed with gamma = {0} trying automatic gamma \
                        instead of.".format(float(args.a)))
            internal_cv = GridSearchCV(estimator=SVC(),
                                       param_grid=auto_gamma_param_dict)
        internal_cv.fit(train_data_to_feed, train_classes_to_feed)
        C_final = internal_cv.best_params_['C']
    if cv_status == "double":
        logger.info(u"Using the value of C determined via a double \
                    cross-validation.")
        if (len(train_classes_to_feed) < 100):
            logger.info(u"The required number of samples for a double \
                        cross-validation procedure is at least 100. The \
                        dataset has {0}.".format(len(train_classes_to_feed)))
            logger.info(u"Exiting the tool.")
            exit()
        C_lower = float(args.C_lower_bound)
        C_upper = float(args.C_upper_bound)
        C_list_of_values = np.linspace(C_lower, C_upper, 20)
        C_final = C_list_of_values[0]
        for index_current in range(0, 20):
            C_list_of_values_current = np.linspace(
                C_list_of_values[0], C_list_of_values[index_current],
                (index_current + 1))
            # Creating dictionary for the single cross-validation procedure.
            # In this dictionary gamma is specified by the user.
            gamma_param_dict = {
                "kernel": [kernel_final],
                "C": C_list_of_values_current,
                "gamma": [gamma_final],
                "coef0": [coef0_final],
                "degree": [degree_final]
            }
            # gamma is determined automatically if the first dictionary fails.
            auto_gamma_param_dict = {
                "kernel": [kernel_final],
                "C": C_list_of_values_current,
                "gamma": ["auto"],
                "coef0": [coef0_final],
                "degree": [degree_final]
            }
            try:
                logger.info("Running SVM model")
                internal_cv = GridSearchCV(estimator=SVC(),
                                           param_grid=gamma_param_dict)
            except ValueError:
                logger.info("Model failed with gamma = {0} trying automatic \
                            gamma instead of.".format(float(args.a)))
                internal_cv = GridSearchCV(estimator=SVC(),
                                           param_grid=auto_gamma_param_dict)
            internal_cv.fit(train_data_to_feed, train_classes_to_feed)
            external_cv = cross_val_score(internal_cv, train_data_to_feed,
                                          train_classes_to_feed)
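            # cross_val_score evaluates the grid-searched estimator on held-out folds and
            # returns one score per fold; the fold mean below is used to pick the best C
            # (a double, i.e. nested, cross-validation).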
            if index_current == 0:
                best_predction_proportion = external_cv.mean()
            else:
                if external_cv.mean() > best_predction_proportion:
                    best_predction_proportion = external_cv.mean()
                    C_final = C_list_of_values[index_current]
    C_final = float(C_final)
    print("The value of C used for the SVM classifier is {}".format(C_final))
    try:
        logger.info("Running SVM model")
        svm_model = svm.SVC(kernel=args.kernel,
                            C=C_final,
                            gamma=float(args.a),
                            coef0=float(args.b),
                            degree=int(args.degree))
    except ValueError:
        logger.info("Model failed with gamma = {0} trying automatic gamma \
                    instead.".format(float(args.a)))
        svm_model = svm.SVC(kernel=args.kernel,
                            C=C_final,
                            gamma="auto",
                            coef0=float(args.b),
                            degree=int(args.degree))
    svm_model.fit(train_data_to_feed, train_classes_to_feed)
    train_fitted_values = svm_model.predict(train_data_to_feed)
    train_fitted_values_series = pd.Series(train_fitted_values,
                                           index=train_classes_to_feed.index)
    train_classes_to_feed_series = pd.Series(train_classes_to_feed,
                                             index=train_classes_to_feed.index)
    classification_df = pd.DataFrame({
        'Group_Observed':
        train_classes_to_feed_series,
        'Group_Predicted':
        train_fitted_values_series
    })
    classification_df.to_csv(args.outClassification,
                             index_label='sampleID',
                             sep='\t')
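    # Percentage of training samples whose predicted group matches the observed
    # group; this value is written out as the classification accuracy.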
    classification_accuracy_percent = 100 * sum(
        classification_df['Group_Observed'] ==
        classification_df['Group_Predicted']) / classification_df.shape[0]
    classification_accuracy_percent_string = str(
        classification_accuracy_percent) + ' Percent'
    with open(args.outClassificationAccuracy, 'w') as accuracy_file:
        accuracy_file.write(classification_accuracy_percent_string + '\n')
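    # Predict group membership for the target data with the trained model.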
    target_fitted_values = svm_model.predict(target_data_to_feed)
    target_fitted_values_series = pd.Series(target_fitted_values,
                                            index=target_classes_to_feed.index)
    target_classes_to_feed_series = pd.Series(
        target_classes_to_feed, index=target_classes_to_feed.index)
    prediction_df = pd.DataFrame({
        'Group_Observed': target_classes_to_feed_series,
        'Group_Predicted': target_fitted_values_series
    })
    prediction_df.to_csv(args.outPrediction, index_label='sampleID', sep='\t')
    prediction_accuracy_percent = 100 * sum(
        prediction_df['Group_Observed'] ==
        prediction_df['Group_Predicted']) / prediction_df.shape[0]
    prediction_accuracy_percent_string = str(
        prediction_accuracy_percent) + ' Percent'
    with open(args.outPredictionAccuracy, 'w') as accuracy_file:
        accuracy_file.write(prediction_accuracy_percent_string + '\n')
    logger.info("Script Complete!")
Example n. 27
0
def main(args):

    # Importing data through the interface
    logger.info("Loading data through the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Transpose data to normalize
    toNormalize_df = dat.wide.T

    # Telling the user about the selected normalization method.

    logger.info("Normalizing data using {0} method.".format(args.method))

    # Mean, median, and sum normalization are applied per sample across features.
    if args.method == "mean" or args.method == "sum" or args.method == "median":

        if args.method == "mean":
            toNormalize_df[args.method] = toNormalize_df.mean(axis=1)
            logger.info(
                "Mean scaling is used for each sample across features.")

        if args.method == "sum":
            toNormalize_df[args.method] = toNormalize_df.sum(axis=1)
            logger.info("Sum scaling is used for each sample across features.")

        if args.method == "median":
            toNormalize_df[args.method] = toNormalize_df.median(axis=1)
            logger.info(
                "Median scaling is used for each sample across features.")

        # Dividing by factor
        toNormalize_df = toNormalize_df.apply(lambda x: x / x[args.method],
                                              axis=1)

        # Dropping extra column
        toNormalize_df.drop(args.method, axis=1, inplace=True)

    # "centering", "auto", "range", "pareto", "level", "vast" are performed per feature across samples!!!!
    else:

        # Computing mean for each feature.
        feature_value_means = toNormalize_df.mean(axis=0)

        if args.method == "centering":

            # Performing centering for each feature.
            # In this case the value of each feature will have mean zero across samples.
            logger.info("Centering is used for each feature across samples.")
            toNormalize_df = toNormalize_df - feature_value_means

        if args.method == "auto":

            # Computing standard deviation for each feature.
            feature_value_std = toNormalize_df.std(axis=0, ddof=1)

            # Performing auto-scaling.
            # In this case the value of each feature will have mean zero and std = 1 across samples.
            logger.info("Autoscaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df -
                              feature_value_means) / feature_value_std

        if args.method == "pareto":

            # Computing standard deviation and the square root of it for each feature.
            feature_value_std = toNormalize_df.std(axis=0, ddof=1)
            feature_value_std_sqrt = np.sqrt(feature_value_std)

            # Performing Pareto scaling. The only difference from auto-scaling is that we divide by sqrt(standard_deviation).
            # In this case the value of each feature will have mean zero, but its std will NOT be 1 across samples.
            logger.info(
                "Pareto scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df -
                              feature_value_means) / feature_value_std_sqrt

        if args.method == "range":

            # Computing min, max, and range for each feature.
            feature_value_min = toNormalize_df.min(axis=0)
            feature_value_max = toNormalize_df.max(axis=0)
            feature_value_max_min = feature_value_max - feature_value_min

            # Performing range scaling. Each feature is centered and divided by the range of that feature.
            logger.info(
                "Range scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df -
                              feature_value_means) / feature_value_max_min

        if args.method == "level":

            # Performing level scaling. Each feature is centered and divided by the mean of that feature.
            logger.info(
                "Level scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df -
                              feature_value_means) / feature_value_means

        if args.method == "vast":

            # Computing standard deviation and coefficient of variation for each feature.
            feature_value_std = toNormalize_df.std(axis=0, ddof=1)
            feature_value_cv = feature_value_std / feature_value_means

            # Performing VAST (variance stabilizing) scaling. Each feature is auto-scaled and then divided by its coefficient of variation.
            logger.info(
                "VAST scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df -
                              feature_value_means) / feature_value_std
            toNormalize_df = toNormalize_df / feature_value_cv

    # Transposing normalized data
    normalized_df = toNormalize_df.T

    # Saving data
    normalized_df.to_csv(args.out, sep="\t")
    logger.info("Script Complete!")