def main(args):
    # Import data
    logger.info(u'Importing data with following parameters: '
                u'\n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(
                    args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group)

    # convert_objects returns a new DataFrame, so the result must be assigned back
    dat.wide = dat.wide.convert_objects(convert_numeric=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 20))
    plt.subplots_adjust(hspace=0.3)

    # If there is group information, color by group.
    if hasattr(dat, 'group'):
        logger.info('Plotting sample distributions by group')
        legend1 = pltByTrt(dat, ax1)
    else:
        logger.info('Plotting sample distributions')
        pltBySample(dat, ax1)

    # Create legend
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles, labels, ncol=5, loc='upper right', fontsize=10)

    # Create second legend if there is group information
    if hasattr(dat, 'group'):
        ax1.add_artist(legend1)

    # Plot boxplot of samples
    pltBoxplot(dat, ax2)

    plt.savefig(args.ofig, format='pdf')
    mpld3.save_html(fig, args.ofig2, template_type='simple')
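# pltByTrt, pltBySample, and pltBoxplot are helpers defined elsewhere in this
# tool. Below is a minimal hypothetical sketch of pltBySample, assuming it draws
# one density curve per sample on the shared axis; the name and signature of
# this sketch are assumptions, not the tool's actual implementation.
def pltBySample_sketch(dat, ax):
    # Draw a kernel density estimate of each sample's values
    for sampleID in dat.sampleIDs:
        dat.wide[sampleID].plot(kind='kde', ax=ax, label=sampleID)
    ax.set_xlabel('Value')
    ax.set_title('Distribution of values by sample')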
def main(args):
    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)
    df_offFlags = Flags(index=dat.wide.index)

    # Iterate through each group and flag a feature if more than half of the
    # group's values fall below the cutoff
    for title, group in dat.design.groupby(args.group):
        # Create mask of the current frame containing True/False values where
        # the values are less than the cutoff
        mask = (dat.wide[group.index] < args.cutoff)

        # Row means of the boolean mask give the fraction of samples below the cutoff
        meanOn = mask.mean(axis=1)

        # Flag the feature for this group when more than half of its samples
        # are below the cutoff
        df_offFlags.addColumn(column='flag_feature_' + title + '_off', mask=meanOn > 0.5)

    # flag_feature_off column: feature is off in at least one group
    maskFlagMetOff = df_offFlags.df_flags.any(axis=1)
    df_offFlags.addColumn('flag_feature_off', maskFlagMetOff)

    # flag_feature_all_off column: feature is off in every group
    maskFlagMetAllOff = df_offFlags.df_flags.all(axis=1)
    df_offFlags.addColumn('flag_feature_all_off', maskFlagMetAllOff)

    df_offFlags.df_flags.to_csv(args.output, sep="\t")
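# Flags is imported from the package; below is a minimal sketch of the
# interface used above, assuming flags live in a 0/1 DataFrame (df_flags)
# indexed by feature. This illustrates the assumed interface only, not the
# package's actual code.
import pandas as pd

class FlagsSketch(object):
    def __init__(self, index):
        # Empty flag table indexed by feature
        self.df_flags = pd.DataFrame(index=index)

    def addColumn(self, column, mask):
        # Store a boolean mask as a 0/1 flag column
        self.df_flags[column] = mask.astype(int)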
def main(args):
    # Import data with clean_string set to True
    df_cleanedData = wideToDesign(wide=args.fname, design=args.dname,
                                  uniqID=args.uniqID, clean_string=True)

    # Export cleaned data
    df_cleanedData.wide.to_csv(args.wideOut, sep="\t")
    df_cleanedData.design.to_csv(args.designOut, sep="\t")
def main(args): """ """ directory = args.RTflag # Import data dat = wideToDesign(args.fname, args.dname, args.uniqID) # Only interested in samples wide = dat.wide[dat.sampleIDs] # Set RT flags setRTflag(args, wide, dat, dir = directory)
def main(args):
    # Import data
    logger.info('Importing Data')
    dat = wideToDesign(args.fname, args.dname, args.uniqID)
    df_wide = dat.wide[dat.sampleIDs].copy()

    # Drop missing values
    if np.isnan(df_wide.values).any():
        nRows = df_wide.shape[0]        # Number of rows before dropping missing
        df_wide.dropna(inplace=True)    # Drop missing rows in place
        nRowsNoMiss = df_wide.shape[0]  # Number of rows after dropping missing
        logger.warn('{} rows were dropped because of missing values.'.format(nRows - nRowsNoMiss))

    # Run PCA
    # Initialize PCA class with default values
    pca = PCA()

    # Fit PCA
    scores = pca.fit_transform(df_wide)

    # Get loadings
    loadings = pca.components_

    # Get additional information
    sd = scores.std(axis=0)
    propVar = pca.explained_variance_ratio_
    cumPropVar = propVar.cumsum()

    # Create header block for output. This replicates the output from R, which
    # includes the additional information (above) at the top of the output file.
    labels = np.array(['#Std. deviation',
                       '#Proportion of variance explained',
                       '#Cumulative proportion of variance explained'])
    blockDat = np.vstack([sd, propVar, cumPropVar])
    block = np.column_stack([labels, blockDat])

    # Create header for output
    header = np.array(['PC{}'.format(x + 1) for x in range(loadings.shape[1])])
    compoundIndex = np.hstack([df_wide.index.name, df_wide.index])
    sampleIndex = np.hstack(['sampleID', df_wide.columns])

    # Create loadings output
    loadHead = np.vstack([header, loadings])
    loadIndex = np.column_stack([sampleIndex, loadHead])
    loadOut = np.vstack([block, loadIndex])

    # Create scores output
    scoreHead = np.vstack([header, scores])
    scoreIndex = np.column_stack([compoundIndex, scoreHead])
    scoreOut = np.vstack([block, scoreIndex])

    # Save output
    np.savetxt(args.lname, loadOut, fmt='%s', delimiter='\t')
    np.savetxt(args.sname, scoreOut, fmt='%s', delimiter='\t')
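# A self-contained illustration of the summary block assembled above: per-
# component standard deviations, proportion of variance, and the cumulative
# proportion, as computed by sklearn's PCA (mirroring R's summary(prcomp(x))).
# The toy data below is for demonstration only.
import numpy as np
from sklearn.decomposition import PCA

x = np.random.RandomState(0).rand(10, 4)        # 10 observations, 4 variables
pca = PCA().fit(x)
scores = pca.transform(x)
print scores.std(axis=0)                        # Std. deviation
print pca.explained_variance_ratio_             # Proportion of variance explained
print pca.explained_variance_ratio_.cumsum()    # Cumulative proportion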
def main(args):
    # Import data
    logger.info(u'Importing data with following parameters: '
                u'\n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(
                    args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group, clean_string=True)
    results = initResults(dat)

    # Transpose the data
    dat.trans = dat.transpose()

    # Group by treatment
    grp = dat.trans.groupby(dat.group)
    grpMeans = grp.mean().T

    combo = createCbn(dat)

    resids = list()
    fitted = list()

    # Iterate over compounds
    logger.info('Running row-by-row analysis.')
    for compound in dat.wide.index.tolist():
        # Get overall mean
        results.ix[compound, 'GrandMean'] = dat.trans[compound].mean()

        # Run one-way ANOVA
        resid, fit = oneWay(dat, compound, results)
        resids.append(resid)
        fitted.append(fit)

        # Calculate mean differences
        calcDiff(dat, compound, grpMeans, combo, results)

        # Calculate SE of the difference between means
        calcDiffSE(dat, compound, combo, results)

        # Calculate t-test
        tTest(compound, combo, results)

    residDat = pd.concat(resids, axis=1)
    fitDat = pd.concat(fitted, axis=1)

    # Generate q-q plots
    logger.info('Generating q-q plots.')
    qqPlot(residDat, fitDat, args.ofig)

    # Generate volcano plots
    logger.info('Generating volcano plots.')
    volcano(combo, results, args.ofig2)

    # Write results table
    results = results.convert_objects(convert_numeric=True)
    results.index = pd.Series([dat.revertStr(x) for x in results.index])
    results = results.apply(lambda x: x.round(4))
    results.to_csv(args.oname, sep="\t")
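# oneWay, calcDiff, calcDiffSE, and tTest are defined elsewhere. Below is a
# hypothetical sketch of oneWay, assuming it fits a one-way ANOVA of a single
# compound against the group factor and returns residuals and fitted values;
# the statsmodels formula approach and the 'pval' column name are assumptions.
from statsmodels.formula.api import ols

def oneWay_sketch(dat, compound, results):
    # Model one compound's values as a function of treatment group
    df = dat.trans[[dat.group, compound]].rename(columns={compound: 'y'})
    model = ols('y ~ C({})'.format(dat.group), data=df).fit()
    results.ix[compound, 'pval'] = model.f_pvalue   # Overall F-test p-value
    return model.resid, model.fittedvalues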
def main(args):
    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)

    # Only interested in samples
    wide = dat.wide[dat.sampleIDs]

    # Use group separation or not depending on user input
    if not args.group:
        setCVflag(args, wide, dat)
    else:
        setCVflagByGroup(args, wide, dat)
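# setCVflag and setCVflagByGroup are defined elsewhere; below is a hypothetical
# sketch of setCVflag, assuming a feature is flagged when its coefficient of
# variation (std/mean across samples) exceeds a cutoff. The args.CVcutoff and
# args.CVflag names are assumptions.
def setCVflag_sketch(args, wide, dat):
    # Coefficient of variation per feature
    cv = wide.std(axis=1) / wide.mean(axis=1)
    flags = Flags(index=wide.index)
    flags.addColumn(column='flag_big_CV', mask=cv > args.CVcutoff)
    flags.df_flags.to_csv(args.CVflag, sep='\t')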
def main(args):
    # Execute wideToDesign to format all of the data uniformly
    formatted_data = wideToDesign(wide=args.fname, design=args.dname,
                                  uniqID=args.uniqID, group=args.group)

    # Convert flag file to DataFrame
    df_flags = pd.DataFrame.from_csv(args.flagFile, sep='\t')

    # If the user specified rows, run dropRows
    if args.dropRow:
        dropRows(df_wide=formatted_data.wide, df_flags=df_flags,
                 cutoffValue=args.cutoff, args=args)
    # Otherwise the user specified columns, so run dropColumns
    else:
        dropColumns(df_wide=formatted_data.wide, df_design=formatted_data.design,
                    df_flags=df_flags, cutoffValue=args.cutoff, args=args)
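# dropRows and dropColumns are defined elsewhere; below is a hypothetical
# sketch of dropRows, assuming a feature (row) is kept when the fraction of
# raised flags is at or below the cutoff. The drop rule and args.wideOut are
# assumptions for illustration.
def dropRows_sketch(df_wide, df_flags, cutoffValue, args):
    # Fraction of raised flags per feature
    flagged = df_flags.mean(axis=1)
    keep = flagged.index[flagged <= cutoffValue]
    df_wide.loc[keep].to_csv(args.wideOut, sep='\t')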
def main(args): """ Main Script """ # Import data dat = wideToDesign(args.fname, args.dname, args.uniqID) # Only interested in samples wide = dat.wide[dat.sampleIDs] # Put warnings and get rid of rows with missing values if wide.isnull().sum().sum(): nOriginal = wide.shape[0] print "Missing values detected. All missing rows removed. " wide = wide.dropna() print "Original rows: {0}; # of rows after drop: {1}".format(nOriginal, wide.shape[0]) # Calculate SED by group or not SEDbyGroup(dat, wide, args)
def main(args):
    # Import data and transpose
    logger.info(u'Importing data with following parameters: '
                u'\n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(
                    args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group, clean_string=True)
    data = dat.transpose()
    data.dropna(axis=1, inplace=True)

    # Pull classifications out of dataset
    classes = data[dat.group].copy()
    data.drop(dat.group, axis=1, inplace=True)

    # TODO: Random forest does not handle NaNs; need to figure out the proper way to impute values.

    # Build random forest classifier
    logger.info('Creating classifier')
    model = RandomForestClassifier(n_estimators=args.num)
    model.fit(data, classes)

    # Identify features
    importance = pd.DataFrame([data.columns, model.feature_importances_]).T.sort(columns=1, ascending=False)

    # Export features ranked by importance
    logger.info('Exporting features')
    rev = importance.applymap(lambda x: dat.revertStr(x))
    rev.columns = ('feature', 'ranked_importance')
    rev.to_csv(args.oname2, index=False, sep='\t')

    # Select data based on features
    data = data[importance.ix[:, 0].tolist()]
    selected_data = pd.DataFrame(model.transform(data, threshold=0))
    selected_data.columns = [dat.revertStr(x) for x in data.columns]

    # Merge on classes and export
    logger.info('Exporting transformed data')
    clDf = pd.DataFrame(classes)
    clDf.reset_index(inplace=True)
    out = clDf.join(selected_data)
    out.to_csv(args.oname, index=False, sep='\t', float_format="%.4f")
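# The TODO above notes that random forests cannot handle NaNs; this script
# currently drops columns containing missing values instead. A minimal
# hypothetical alternative is per-feature mean imputation, shown here as one
# option only, not as this script's method.
def impute_means_sketch(data):
    # Replace each missing value with its column (feature) mean
    return data.fillna(data.mean())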
parser.add_argument("--train_wide", dest="train_wide", action='store', required=True, help="wide part of the train dataset.") parser.add_argument("--train_design", dest="train_design", action='store', required=True, help="design part of the train dataset.") parser.add_argument("--test_wide", dest="test_wide", action='store', required=True, help="wide part of the test dataset.") parser.add_argument("--test_design", dest="test_design", action='store', required=True, help="design part of the test dataset.") parser.add_argument("--class_column_name", dest="class_column_name", action='store', required=True, help="Name of column in design file with Group/treatment information.") parser.add_argument("--ID", dest="uniqID", action='store', required=True, help="Name of the column with unique identifiers.") parser.add_argument("--kernel", dest="kernel", action='store', required=True, help="choice of kernel function: rbf, linear, poly, sigmoid.") parser.add_argument("--degree", dest="degree", action='store', required=True, help="(integer) degree for the polynomial kernel, default 3.") parser.add_argument("--C", dest="C", action='store', required=True, help="positive regularization parameter.") parser.add_argument("--a", dest="a", action='store', required=True, help=" positive coefficient in kernel function.") parser.add_argument("--b", dest="b", action='store', required=True, help=" independent term coefficient in kernel function.") parser.add_argument("--outfile1", dest="outfile1", action='store', required=True, help="Output traget set with predicted_class labels.") parser.add_argument("--accuracy_on_training", dest="accuracy_on_training", action='store', required=True, help="Output accuracy value on the training set.") args = parser.parse_args() train = wideToDesign(wide=args.train_wide, design= args.train_design, uniqID=args.uniqID, group=args.class_column_name).transpose() test_design=read_table(args.test_design) if args.class_column_name in test_design.columns: target = wideToDesign(wide=args.test_wide,design = args.test_design, uniqID=args.uniqID, group=args.class_column_name).transpose() else: target = wideToDesign(wide=args.test_wide,design=args.test_design, uniqID=args.uniqID).transpose() #target=target.loc[:, (target.dtypes== np.int) | (target.dtypes== np.float)] # make sure test and train have the same features for i in target.columns: if i not in train.columns: del target[i] ###################### trainig the SVM #######################################
logger.error("Error. {}".format(e)) htmlFile = file(args.html, 'w') global htmlContents # universe_wsgi.ini file's html_sanitizing must be false to allow for styling htmlContents = ["<html><head><title>Count Digits Results List</title></head><body>"] htmlContents.append('<div style=\"background-color:black; color:white; text-align:center; margin-bottom:5% padding:4px;\">' '<h1>Output</h1>' '</div>') htmlContents.append('<ul style=\"text-align:left; margin-left:5%;\">') # Import data logger.info(u'html system path: {}'.format(args.htmlPath)) logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}'.format(args.fname, args.dname, args.uniqID)) dat = wideToDesign(args.fname, args.dname, args.uniqID) # Only interested in samples wide = dat.wide[dat.sampleIDs] # Global flag file global flag flag = Flags(index=wide.index) flag.addColumn(column='flag_feature_count_digits') # Use group separation or not depending on user input if args.group: countDigitsByGroups(args, wide, dat, dir=directory) else: countDigits(wide, dat, dir=directory)