def setCVflagByGroup(args, wide, dat):
    """
    Compute the coefficient of variation (CV) of every feature within each
    treatment group, plot the CV distributions and write a flag file.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command line arguments. Reads ``args.group``
            (design column used for grouping), ``args.CVcutoff`` (optional
            user supplied cutoff), ``args.CVplot`` (output PDF path) and
            ``args.CVflag`` (output flag file path).

        :type wide: pandas.DataFrame
        :param wide: Wide data, features in rows and samples in columns.

        :param dat: Object exposing ``design`` (a DataFrame indexed by sample
            ID) and ``sampleIDs``. NOTE: ``dat.sampleIDs`` is overwritten with
            each group's sample IDs and is left pointing at the last group.

    :Returns:
        Nothing. A multi-page PDF is written to ``args.CVplot`` (one page per
        group plus one overlay page) and a tab-separated flag file to
        ``args.CVflag``.
    """
    # Split design file by treatment group
    grouped = dat.design.groupby(args.group)

    # One 'cv_<group>' column per group, indexed like the wide data
    CV = pd.DataFrame(index=wide.index)
    for title, group in grouped:
        # Change dat.sampleIDs to match the design file
        dat.sampleIDs = group.index
        # Only the CV series is kept; the cutoff that setCVflag derives from
        # a single group is discarded because a common one is computed below.
        CV['cv_' + title], _ = setCVflag(args, wide[group.index], dat,
                                         groupName=title)

    # Largest CV of each feature across the groups (NaNs are skipped)
    CV['cv'] = CV.max(axis=1)

    if not args.CVcutoff:
        CVcutoff = np.nanpercentile(CV['cv'].values, q=90)
        # Round to 3 significant digits. A cutoff of 0 or NaN has no order of
        # magnitude, so it is left untouched instead of crashing in log/floor.
        if CVcutoff and np.isfinite(CVcutoff):
            CVcutoff = round(CVcutoff, -int(floor(log(abs(CVcutoff), 10))) + 2)
    else:
        CVcutoff = float(args.CVcutoff)

    cutoffLabel = "Cutoff at: {0}".format(CVcutoff)

    # The context manager guarantees the PDF is finalised even on failure
    with PdfPages(args.CVplot) as pdfOut:
        # One page per group: histogram + density of that group's CVs
        for title, _ in grouped:
            groupCV = CV['cv_' + title]
            fig, ax = plt.subplots()
            try:
                p99 = np.nanpercentile(groupCV.values, 99)
                xmin = -p99 * 0.2
                xmax = p99 * 1.5
                ax.set_xlim(xmin, xmax)
                # 'density' replaces the 'normed' keyword that newer
                # matplotlib releases no longer accept.
                groupCV.plot(kind='hist', range=(xmin, xmax), bins=15,
                             density=True, color='grey', ax=ax,
                             label="CV histogram")
                groupCV.plot(kind='kde',
                             title="Density Plot of Coefficients of Variation in " + args.group + " " + title,
                             ax=ax, label="CV density")
                ax.axvline(x=CVcutoff, color='red', linestyle='dashed',
                           label=cutoffLabel)
                ax.legend()
                pdfOut.savefig(fig, bbox_inches='tight')
            finally:
                plt.close(fig)

        # Final page: density curves of all groups on a shared axis
        fig, ax = plt.subplots()
        try:
            p99 = np.nanpercentile(CV['cv'].values, 99)
            ax.set_xlim(-p99 * 0.2, p99 * 1.5)
            for title, _ in grouped:
                CV['cv_' + title].plot(kind='kde',
                                       title="Density Plot of Coefficients of Variation by " + args.group,
                                       ax=ax,
                                       label="CV density in group " + title)
            ax.axvline(x=CVcutoff, color='red', linestyle='dashed',
                       label=cutoffLabel)
            ax.legend()
            pdfOut.savefig(fig, bbox_inches='tight')
        finally:
            plt.close(fig)

    # Create flag file instance with one flag column per group
    CVflag = Flags(index=CV['cv'].index)
    for title, _ in grouped:
        groupCV = CV['cv_' + title]
        # A feature is flagged when its CV exceeds the cutoff or is missing
        CVflag.addColumn(column='flag_feature_big_CV_' + title,
                         mask=((groupCV > CVcutoff) | groupCV.isnull()))

    # Write flag file
    CVflag.df_flags.to_csv(args.CVflag, sep='\t')
def mergeFlags(args):
    """
    Merge several tab-separated flag files into one flag file.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command line arguments. ``args.flagFiles`` is a
            list of flag file paths, or a list whose first element is a
            comma-separated string of paths; ``args.mergedFile`` is the
            path of the merged output. ``args.flagFiles`` is replaced by
            the expanded list when a comma-separated string is given.

    :Returns:
        Nothing. The merged flags are written as a tab-separated file to
        ``args.mergedFile``.
    """
    logger.info("Importing data")

    # Check for commas, commas are used in galaxy. If there are commas
    # separate the list by commas
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')

    # Read every flag file with its first column as the index.
    # pd.read_csv replaces DataFrame.from_csv, which newer pandas no longer
    # provides. Unlike from_csv, the index is not date-parsed, so the
    # identifiers are kept exactly as written in the files.
    flagDataFrameList = [
        pd.read_csv(flagFile, sep='\t', index_col=0)
        for flagFile in args.flagFiles
    ]

    logger.info("Checking all indexes are the same")

    # Merge flags using Flags class
    mergedFlags = Flags.merge(flagDataFrameList)

    # NOTE: Pandas cannot store NANs as an int. If there are NANs from the
    # merge, then the column becomes a float. The float format below makes
    # the output look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
def main(args):
    """
    Flag features that are "off" (mostly below a cutoff) within each group.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command line arguments. Reads ``args.fname``,
            ``args.dname`` and ``args.uniqID`` (passed to ``wideToDesign``),
            ``args.group`` (design column used for grouping),
            ``args.cutoff`` (value threshold) and ``args.output`` (path of
            the tab-separated flag file).

    :Returns:
        Nothing. The flag table is written to ``args.output``.
    """
    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)
    offFlags = Flags(index=dat.wide.index)

    # One flag per group: set when more than half of the group's samples
    # have a value strictly below the cutoff for that feature.
    for groupName, samples in dat.design.groupby(args.group):
        fractionBelow = dat.wide[samples.index].lt(args.cutoff).mean(axis=1)
        offFlags.addColumn(column='flag_feature_' + groupName + '_off',
                           mask=fractionBelow.gt(0.5))

    # Feature is off in at least one group
    offInAnyGroup = offFlags.df_flags.any(axis=1)
    offFlags.addColumn(column='flag_feature_off', mask=offInAnyGroup)

    # Every flag column present at this point is set; this includes the
    # 'flag_feature_off' column added just above.
    offInAllGroups = offFlags.df_flags.all(axis=1)
    offFlags.addColumn(column='flag_feature_all_off', mask=offInAllGroups)

    offFlags.df_flags.to_csv(args.output, sep="\t")
def setCVflag(args, wide, dat, groupName = ''):
    """
    Compute the coefficient of variation (CV) of every feature and either
    plot and flag it (no group) or hand it back to the caller (group given).

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command line arguments. Reads ``args.CVcutoff``
            always, and ``args.CVplot`` / ``args.CVflag`` (output paths) when
            no group name is given.

        :type wide: pandas.DataFrame
        :param wide: Wide data, features in rows and samples in columns.

        :param dat: Not used by this function; kept for interface
            compatibility.

        :type groupName: str
        :param groupName: Name of the group being processed. The empty
            string (default) means the data are not split by group.

    :Returns:
        :rtype: tuple or None
        :returns: ``(cv, CVcutoff)`` when ``groupName`` is not empty, where
            ``cv`` is the per-feature CV Series. When ``groupName`` is empty
            nothing is returned: a PDF is saved to ``args.CVplot`` and a
            tab-separated flag file is written to ``args.CVflag``.
    """
    # Get std, mean and calculate CV for every feature (row). The values are
    # used as given; no rounding is applied to the data.
    DATstat = pd.DataFrame(index=wide.index)
    # ddof=0 is the population standard deviation, which is what np.std uses
    DATstat['std'] = wide.std(axis=1, ddof=0)
    DATstat['mean'] = wide.mean(axis=1)
    DATstat['cv'] = (DATstat['std'] / DATstat['mean']).abs()

    if not args.CVcutoff:
        CVcutoff = np.nanpercentile(DATstat['cv'].values, q=90)
        # Round to 3 significant digits. A cutoff of 0 or NaN has no order of
        # magnitude, so it is left untouched instead of crashing in log/floor.
        if CVcutoff and np.isfinite(CVcutoff):
            CVcutoff = round(CVcutoff, -int(floor(log(abs(CVcutoff), 10))) + 2)
    else:
        CVcutoff = float(args.CVcutoff)

    # Grouped call: the caller does the plotting and flagging
    if groupName != '':
        return DATstat['cv'], CVcutoff

    # Plot CVs
    fig, ax = plt.subplots()
    try:
        p99 = np.nanpercentile(DATstat['cv'].values, 99)
        xmin = -p99 * 0.2
        xmax = p99 * 1.5
        ax.set_xlim(xmin, xmax)
        # 'density' replaces the 'normed' keyword that newer matplotlib
        # releases no longer accept.
        DATstat['cv'].plot(kind='hist', range=(xmin, xmax), bins=15,
                           density=True, color='grey', ax=ax,
                           label="CV histogram")
        DATstat['cv'].plot(kind='kde',
                           title="Density Plot of Coefficients of Variation",
                           ax=ax, label="CV density")
        ax.axvline(x=CVcutoff, color='red', linestyle='dashed',
                   label="Cutoff at: {0}".format(CVcutoff))
        ax.legend()

        # Export the figure as pdf
        fig.savefig(args.CVplot, format='pdf')
    finally:
        plt.close(fig)

    # Create flag instance
    CVflag = Flags(index=DATstat.index)

    # A feature is flagged when its CV exceeds the cutoff or is missing
    CVflag.addColumn(column='flag_feature_big_CV',
                     mask=((DATstat['cv'] > CVcutoff) | DATstat['cv'].isnull()))

    # Write output
    CVflag.df_flags.to_csv(args.CVflag, sep='\t')
htmlContents.append('<div style=\"background-color:black; color:white; text-align:center; margin-bottom:5% padding:4px;\">' '<h1>Output</h1>' '</div>') htmlContents.append('<ul style=\"text-align:left; margin-left:5%;\">') # Import data logger.info(u'html system path: {}'.format(args.htmlPath)) logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}'.format(args.fname, args.dname, args.uniqID)) dat = wideToDesign(args.fname, args.dname, args.uniqID) # Only interested in samples wide = dat.wide[dat.sampleIDs] # Global flag file global flag flag = Flags(index=wide.index) flag.addColumn(column='flag_feature_count_digits') # Use group separation or not depending on user input if args.group: countDigitsByGroups(args, wide, dat, dir=directory) else: countDigits(wide, dat, dir=directory) # Create a zip archive with the inputted zip file name of the temp file if args.noZip: pass else: shutil.make_archive(directory + '/Archive_of_Results', 'zip', directory) # Add zip of all the files to the list
def setRTflag(args, wide, dat, dir):
    """
    Compute per-feature retention time (RT) statistics, write RT outlier
    flags and plot the distribution of the RT coefficients of variation.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command line arguments. Reads ``args.p90p10``
            (use the 90th-10th instead of the 95th-5th percentile range),
            ``args.minutes`` (allowed RT window), ``args.CVcutoff`` (optional
            user supplied CV cutoff), ``args.RTflag`` (flag file path) and
            ``args.RTplot`` (plot path).

        :type wide: pandas.DataFrame
        :param wide: Wide retention time data, features in rows and samples
            in columns.

        :param dat: Not used by this function; kept for interface
            compatibility.

        :param dir: Not used by this function; kept for interface
            compatibility.

    :Returns:
        Nothing. A tab-separated flag file is written to ``args.RTflag`` and
        the figure is handed to ``galaxySavefig`` together with
        ``args.RTplot``.
    """
    # Pass every value through ifZero (helper defined elsewhere; presumably
    # it handles zero retention times -- confirm), then round to 2 decimals
    RTround = wide.applymap(ifZero)
    RTround = RTround.applymap(lambda x: round(x, 2))

    # Get percentiles, min, max, mean, median
    RTstat = pd.DataFrame(index=RTround.index)
    RTstat['min'] = RTround.apply(np.min, axis=1)
    RTstat['max'] = RTround.apply(np.max, axis=1)
    for column, q in (('p95', 95), ('p90', 90), ('p10', 10), ('p05', 5)):
        RTstat[column] = RTround.apply(np.nanpercentile, q=q, axis=1)
    RTstat['std'] = RTround.apply(np.std, axis=1)
    RTstat['mean'] = RTround.apply(np.mean, axis=1)
    RTstat['median'] = RTround.apply(np.median, axis=1)
    RTstat['cv'] = RTstat['std'] / RTstat['mean']
    RTstat['p95p05'] = RTstat['p95'] - RTstat['p05']
    RTstat['p90p10'] = RTstat['p90'] - RTstat['p10']

    # Set RT flags
    RTflag = Flags(index=RTround.index)
    if args.p90p10:
        RTflag.addColumn(column='flag_RT_Q90Q10_outlier',
                         mask=(RTstat['p90p10'] > args.minutes))
    else:
        RTflag.addColumn(column='flag_RT_Q95Q05_outlier',
                         mask=(RTstat['p95p05'] > args.minutes))

    # Half of the allowed window on each side of the median. Dividing by 2.0
    # keeps the bound exact (and symmetric) even when args.minutes is an
    # integer and '/' performs integer division.
    halfWindow = args.minutes / 2.0
    RTflag.addColumn(column='flag_RT_max_gt_threshold',
                     mask=(RTstat['max'] - RTstat['median'] > halfWindow))
    RTflag.addColumn(column='flag_RT_min_lt_threshold',
                     mask=(RTstat['min'] - RTstat['median'] < -halfWindow))

    # Min or max further than 3 standard deviations from the mean
    RTflag.addColumn(column='flag_RT_min_max_outlier',
                     mask=((RTstat['max'] - RTstat['mean'] > 3 * RTstat['std']) |
                           (RTstat['min'] - RTstat['mean'] < -3 * RTstat['std'])))

    if not args.CVcutoff:
        CVcutoff = np.nanpercentile(RTstat['cv'].values, q=90)
        # Round to 3 significant digits. abs() keeps log() defined for a
        # negative cutoff; a cutoff of 0 or NaN has no order of magnitude and
        # is left untouched instead of crashing in log/floor.
        if CVcutoff and np.isfinite(CVcutoff):
            CVcutoff = round(CVcutoff, -int(floor(log(abs(CVcutoff), 10))) + 2)
    else:
        # Make sure the user supplied cutoff is numeric before comparing
        CVcutoff = float(args.CVcutoff)
    RTflag.addColumn(column='flag_RT_big_CV',
                     mask=(RTstat['cv'] > CVcutoff))

    # Output flags
    RTflag.df_flags.to_csv(args.RTflag, sep="\t")

    # Plot RT CVs
    fig, ax = plt.subplots()
    p99 = np.nanpercentile(RTstat['cv'].values, 99)
    xmin = -p99 * 0.2
    xmax = p99 * 1.5
    ax.set_xlim(xmin, xmax)
    # 'density' replaces the 'normed' keyword that newer matplotlib releases
    # no longer accept.
    RTstat['cv'].plot(kind='hist', range=(xmin, xmax), bins=15, density=True,
                      color='grey', ax=ax, label="CV histogram")
    RTstat['cv'].plot(kind='kde',
                      title="Density Plot of Coefficients of Variation of the Retention Time",
                      ax=ax, label="CV density")
    ax.axvline(x=CVcutoff, color='red', linestyle='dashed',
               label="Cutoff at: {0}".format(CVcutoff))
    ax.legend()

    galaxySavefig(fig, args.RTplot)