def main(args):
    """Merge several flag files into a single wide flag table.

    Each file in ``args.flagFiles`` is read as a tab-separated table,
    optionally indexed by ``args.flagUniqID``, optionally renamed using
    ``args.filename`` (Galaxy mode), merged with ``Flags.merge``, and
    written to ``args.mergedFile`` as TSV.

    :param args: argparse-style namespace with ``flagFiles``, ``filename``,
        ``flagUniqID`` and ``mergedFile`` attributes.
    """
    # Need to take each arg and turn into a data frame and add to new list
    flagDataFrameList = []
    logger.info("Importing data")

    # Galaxy passes multiple files as one comma-separated string; if there
    # are commas, split the single entry back into a list of paths.
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')

    # If args.filename is provided then use it to add its name to column
    # names. This parameter should be used only on Galaxy.
    if args.filename:
        # Cleaning weird characters on file names and replacing them with '_'.
        filenames = [cleanStr(x=fname) for fname in args.filename]

        # Convert files into dataframes and populate into new list.
        # NOTE: the loop depends on `filenames`, so it only runs when
        # args.filename was supplied.
        for flagFile, filename in zip(args.flagFiles, filenames):
            # Read table
            dataFrame = pd.read_table(flagFile)

            # Flag uniqID. set_index raises KeyError when the column is
            # missing; log it and continue with the unindexed frame
            # (preserves the original best-effort behavior, but no longer
            # swallows unrelated exceptions with a bare except).
            if args.flagUniqID:
                try:
                    dataFrame.set_index(args.flagUniqID, inplace=True)
                except KeyError:
                    logger.error("Index {0} does not exist on file.".format(args.flagUniqID))

            # Suffix every column with its (cleaned) file name so the origin
            # of each flag column survives the merge.
            dataFrame.columns = [name + "_" + filename for name in dataFrame.columns]

            # List of frames
            flagDataFrameList.append(dataFrame)

    # Merge flags using Flags class
    mergedFlags = Flags.merge(flagDataFrameList)

    # Export merged flags
    # NOTE: Pandas cannot store NaNs as an int. If there are NaNs from the
    # merge, then the column becomes a float. Format the float output to
    # look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
    logger.info("Script Complete!")
def main(args):
    """Run pairwise Bland-Altman outlier detection and emit flag tables.

    Builds the sample list (optionally restricted to ``args.processOnly``
    groups), generates every pairwise sample comparison (within-group only
    when ``args.group`` is set), plots each comparison into a multi-page
    PDF, merges the per-comparison flags, summarizes them, and writes
    sample-level and feature-level flag TSVs.

    :param args: argparse-style namespace with ``input``, ``design``,
        ``uniqID``, ``group``, ``processOnly``, ``baName``, ``distName``,
        ``sampleCutoff``, ``featureCutoff``, ``flagSample`` and
        ``flagFeature`` attributes.
    """
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       logger=logger)

    # Get a list of samples to process; if processOnly is specified only
    # analyze the specified group(s).
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Keep only the sampleIDs that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get list of pairwise combinations. If group is specified, only do
    # within-group combinations.
    combos = list()
    if args.group:
        # If group is given, only do within group pairwise combinations
        logger.info('Only doing within group, pairwise comparisons.')
        for groupName, dfGroup in dat.design.groupby(dat.group):
            combos.extend(list(combinations(dfGroup.index, 2)))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        # Get all pairwise combinations for all samples
        combos.extend(list(combinations(dat.sampleIDs, 2)))

    # Open a multiple-page PDF for plots; the context manager guarantees the
    # PDF is closed even if flag generation raises (the original leaked the
    # open file on error).
    logger.info('Generating flags and plots.')
    with PdfPages(args.baName) as ppBA:
        # Loop over combinations, generate plots, and collect a list of
        # flags. A list comprehension (not a lazy `map`) materializes the
        # results so they can be iterated more than once downstream on
        # Python 3.
        flags = [iterateCombo(dat, combo, ppBA) for combo in combos]

    # Merge flags
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags
    logger.info('Summarizing outlier flags.')
    (propSample, propFeature,
     propSample_p, propFeature_p,
     propSample_c, propFeature_c,
     propSample_d, propFeature_d) = summarizeFlags(dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample-level flags
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite (feature)-level flags
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish Script
    logger.info("Script Complete!")