# Example #1
def main(args):
    """Merge multiple flag files into a single wide flag table.

    Args:
        args: argparse.Namespace with attributes:
            flagFiles (list[str]): paths to flag files. A single
                comma-separated string is also accepted (Galaxy style).
            filename (list[str] | None): display names used to suffix the
                column names (intended for Galaxy only).
            flagUniqID (str | None): column name to set as the index of
                each flag file before merging.
            mergedFile (str): output path for the merged table (TSV).

    Side effects:
        Writes the merged flag table to ``args.mergedFile``.
    """
    # Need to take each arg, turn it into a data frame, and collect them.
    flagDataFrameList = []
    logger.info("Importing data")

    # Galaxy passes the file list as one comma-separated string; if a
    # comma is present, split it back into a real list.
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')

    # Clean weird characters from the provided display names, replacing
    # them with '_'. Fall back to the flag file paths themselves so that
    # `filenames` is always bound.
    # BUG FIX: originally `filenames` was only assigned when
    # args.filename was set, which raised NameError in the zip() below
    # whenever it was omitted.
    if args.filename:
        filenames = [cleanStr(x=fname) for fname in args.filename]
    else:
        filenames = [cleanStr(x=fname) for fname in args.flagFiles]

    # Convert each file into a dataframe and populate the list.
    for flagFile, filename in zip(args.flagFiles, filenames):
        # Read table
        dataFrame = pd.read_table(flagFile)

        # Set the unique-ID column as the index when requested.
        if args.flagUniqID:
            try:
                dataFrame.set_index(args.flagUniqID, inplace=True)
            except KeyError:
                # Narrowed from a bare `except:`. Log and re-raise:
                # continuing with the wrong index would silently produce
                # a bad merge downstream.
                logger.error("Index {0} does not exist on file.".format(args.flagUniqID))
                raise

        # Suffix every column with its source name so columns from
        # different files stay distinguishable after the merge.
        dataFrame.columns = [name + "_" + filename for name in dataFrame.columns]

        # Collect the frame.
        flagDataFrameList.append(dataFrame)

    # Merge flags using the Flags class.
    mergedFlags = Flags.merge(flagDataFrameList)

    # Export merged flags.
    # NOTE: Pandas cannot store NaNs in an int column. If the merge
    # introduces NaNs, the column becomes float; format floats to look
    # like ints in the output.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
    logger.info("Script Complete!")
def main(args):
    """Run pairwise Bland-Altman outlier detection and write flag tables.

    Args:
        args: argparse.Namespace with attributes:
            input, design, uniqID, group: inputs for wideToDesign.
            processOnly (list | None): restrict analysis to these groups.
            baName (str): output path for the multi-page BA plot PDF.
            distName (str): output path for the flag-distribution plot.
            sampleCutoff (float): proportion cutoff for sample-level flags.
            featureCutoff (float): proportion cutoff for feature-level flags.
            flagSample (str): output TSV path for sample-level flags.
            flagFeature (str): output TSV path for feature-level flags.

    Side effects:
        Writes a PDF of plots, a distribution plot, and two flag TSVs.
    """
    # Import data
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       logger=logger)

    # Get a list of samples to process; if processOnly is specified only
    # analyze the specified group(s).
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Keep only the sampleIDs that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get the list of pairwise combinations. If group is specified, only
    # do within-group combinations.
    combos = list()
    if args.group:
        logger.info('Only doing within group, pairwise comparisons.')
        for groupName, dfGroup in dat.design.groupby(dat.group):
            combos.extend(list(combinations(dfGroup.index, 2)))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        # Get all pairwise combinations for all samples
        combos.extend(list(combinations(dat.sampleIDs, 2)))

    # Open a multiple-page PDF for plots.
    ppBA = PdfPages(args.baName)

    # Loop over combinations, generate plots, and collect flags.
    logger.info('Generating flags and plots.')
    # BUG FIX: the original used `map(...)`, which is lazy in Python 3 —
    # it would not execute until Flags.merge() below, i.e. AFTER
    # ppBA.close(), so plots were drawn against a closed PDF. Materialize
    # eagerly while the PDF is still open.
    flags = [iterateCombo(dat, combo, ppBA) for combo in combos]

    # Close PDF with plots.
    ppBA.close()

    # Merge flags.
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags.
    logger.info('Summarizing outlier flags.')
    propSample, propFeature, propSample_p, propFeature_p, propSample_c, propFeature_c, propSample_d, propFeature_d = summarizeFlags(
        dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample-level flags: one column per outlier statistic, flagged
    # when the proportion of flagged comparisons reaches the cutoff.
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite (feature) level flags, mirroring the sample flags.
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish script.
    logger.info("Script Complete!")