def MaxTE(self, Cursor, Date, Limit=100):
    """Compute the row-wise maximum price for 'TE' energy offers on a date.

    Parameters
    ----------
    Cursor : DB-API cursor usable with as_pandas (Impala/Hive style).
    Date : str
        Matched against ofertas_energia_david.fecha_inicial.
    Limit : int, default 100
        Maximum number of rows fetched; coerced to int so it cannot
        inject SQL.

    Side effects: sources ./Rfunctions/max.R, binds the result frame as
    `tabla` in R's global environment, and prints the R object `Out`.
    """
    # NOTE(review): Date is interpolated directly into the SQL text; it must
    # come from trusted code (parameterized queries would be safer).
    Qry = """ select id_ofertas_energia_david, hora, central, precio_per_mw_1, precio_per_mw_2, precio_per_mw_3, precio_per_mw_4, precio_per_mw_5, precio_per_mw_6, precio_per_mw_7, precio_per_mw_8, precio_per_mw_9, precio_per_mw_10, precio_per_mw_11 from ofertas_energia_david where tipo_reporte='TE' and fecha_inicial='{0}' limit {1} """.format(Date, int(Limit))
    Cursor.execute(Qry)
    # Fetch the result set as a pandas DataFrame.
    df = as_pandas(Cursor)
    # Convert to an R data.frame and hand it to the R helper Rmax().
    df = com.convert_to_r_dataframe(df)
    ro.r('source("./Rfunctions/max.R")')
    ro.globalenv['tabla'] = df
    ro.r('Out <- Rmax(tabla)')
    # fix: Python-2-only `print x` statement -> form valid in both 2 and 3.
    print(ro.r('Out'))
def ccaPermuteOutcomesVsControls(self, groupFreqThresh = 0, nPerms = 25, penaltyXs = None , penaltyZs = None): (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes(groupFreqThresh) # groups: set(group_ids) # allOutcomes: {outcome: {group_id: value}} # controls: {control: {group_id: value}} # X contains feature group_norms, Z contains outcome values Zdict = allOutcomes Xdict = controls # R doesn't handle '$'s in column names Xdict = {k.replace('$','.'):v for k, v in Xdict.iteritems()} Zdict = {k.replace('$','.'):v for k, v in Zdict.iteritems()} # X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),pd.DataFrame(data=Zdict), softImputeXtoo=True) X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(pd.DataFrame(data=Xdict), pd.DataFrame(data=Zdict)) X = com.convert_to_r_dataframe(X) Z = com.convert_to_r_dataframe(Z) Ngroups = com.convert_robj(ro.r["nrow"](X)[0]) kwParams = {"nperms": nPerms} kwParams['penaltyxs'] = penaltyXs if penaltyXs else ro.vectors.FloatVector(np.arange(.1,.91,.05)) kwParams['penaltyzs'] = penaltyZs if penaltyZs else ro.vectors.FloatVector(np.arange(.1,.91,.05)) self._ccaPermute(X,Z, **kwParams)
def venn_diagram(first, second, name1, name2, save_path): """Wrapper arround R's VennDiagram.""" # define function in R to make venn diagram ro.r('''venn_diag <- function(df1, df2, save_path){ library(VennDiagram) png() venn.diagram( x = list( %s = df1$X0, %s = df2$X0 ), filename = '%s', lwd = 4, fill = c("cornflowerblue", "darkorchid1"), alpha = 0.75, label.col = "black", cex = 4, fontfamily = "serif", fontface = "bold", cat.col = c("cornflowerblue", "darkorchid1"), cat.cex = 3, cat.fontfamily = "serif", cat.fontface = "bold", cat.dist = c(0.03, 0.03), cat.pos = c(-20, 14) ); dev.off() }''' % (name1, name2, save_path)) venn_diag = ro.r['venn_diag'] # venn diagram function # convert to R data frame first_rdf = com.convert_to_r_dataframe(first) second_rdf = com.convert_to_r_dataframe(second) venn_diag(first_rdf, second_rdf, save_path)
def runDESeq(infile, outfiles, outfileRoot):
    """Run DESeq2 for every pair of sample types with >= 5 replicates each.

    Reads a gene-by-sample count table, derives sample types from the suffix
    after the last '-' in each column name, and writes one DESeq2 result table
    per comparison under `outfileRoot`. Python 2 code (iteritems).
    """
    # Report progress
    print('Doing ' + infile + '...')
    # Read count dataframe (genes x samples)
    countDataframe = pd.read_table(infile, index_col='gene_symbol')
    # Count samples per type; used both for annotation and for deciding which
    # comparisons have enough replicates.
    # (fix: the original computed this Counter twice -- redundant second copy removed)
    sampleCounts = collections.Counter(
        [x.split('-')[-1] for x in countDataframe.columns])
    # Make annotation dataframe mapping sample_id -> sample_type
    annotationDataframe = pd.DataFrame.from_dict([{
        'sample_id': x,
        'sample_type': x.split('-')[-1]
    } for x in countDataframe.columns]).set_index('sample_id')
    # All ordered pairs of sample types having at least 5 samples each.
    comparisons = [
        list(x[::-1]) for x in itertools.combinations(
            [key for key, value in sampleCounts.iteritems() if value >= 5], 2)
    ]
    # Loop through comparisons
    for comparison in comparisons:
        # Subset annotation and counts to the two groups being compared.
        annotationDataframeSubset = annotationDataframe[
            annotationDataframe['sample_type'].isin(comparison)]
        countDataframeSubset = countDataframe[annotationDataframeSubset.index]
        # Run the R DESeq2 wrapper and convert the result back to pandas.
        deseqDataframe = r.runDESeq2(
            com.convert_to_r_dataframe(countDataframeSubset),
            com.convert_to_r_dataframe(annotationDataframeSubset),
            '~ sample_type')
        deseqDataframe = com.convert_robj(deseqDataframe)
        # Build the per-comparison output path and ensure its directory exists.
        comparisonString = 'v'.join(comparison)
        outfile = '{outfileRoot}{comparisonString}.txt'.format(**locals())
        outDir = os.path.dirname(outfile)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        # Write
        deseqDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
def _cca(self, X, Z, **params):
    """Given two Pandas dataframes and a set of parameters, performs CCA

    returns CCA dict (converted from R CCA named list object)

    X, Z may be pandas DataFrames or rpy2 DataFrames; **params override the
    PMA::CCA defaults set below. Python 2 variant (print statement).
    """
    pma = importr("PMA")
    # Defaults:
    kwParams = {"typex": "standard", "typez": "standard", "trace": False, "K": self.numComponents, }
    kwParams.update(params)
    # Accept pandas frames and convert; R frames pass through untouched.
    if isinstance(X, pd.core.frame.DataFrame):
        X = com.convert_to_r_dataframe(X)
    if isinstance(Z, pd.core.frame.DataFrame):
        Z = com.convert_to_r_dataframe(Z)
    assert isinstance(X, ro.vectors.DataFrame) and isinstance(Z, ro.vectors.DataFrame), "X, Z need to be either Pandas DataFrames or R dataframes!"
    assert self.numComponents <= min(len(X.names),len(Z.names)), "Number of components must be smaller than the minimum of columns in each of your matrices"
    # Record the number of observations (rows) before fitting.
    nGroups = com.convert_robj(ro.r["nrow"](X)[0])
    print "\tCCA parameters:", kwParams
    cca = pma.CCA(X, Z, **kwParams)
    # Convert the R named list into a plain Python dict and tag the row count.
    cca = {k:v for k, v in cca.items()}
    cca['nGroups'] = nGroups
    return cca
def runComBat(infiles, outfile, excludeSamples=('B8N', 'B10C')):
    """Run ComBat batch correction on a VST expression matrix.

    Parameters
    ----------
    infiles : (vstFile, annotationFile) paths
    outfile : path for the corrected expression table
    excludeSamples : iterable of column names dropped from the expression
        matrix before correction (generalized from the previously hard-coded
        ['B8N', 'B10C']; default preserves old behaviour).
    """
    # Split infiles
    vstFile, annotationFile = infiles
    # Read expression dataframe, dropping excluded sample columns.
    vstDataframe = pd.read_table(
        vstFile, index_col='gene_symbol').drop(list(excludeSamples), axis=1)
    # Read annotation dataframe
    annotationDataframe = pd.read_table(annotationFile, index_col='sample_name')
    # Keep only annotation rows for samples present in the expression matrix.
    annotationDataframe = annotationDataframe.loc[vstDataframe.columns]
    # Run the R ComBat wrapper: correct for 'patient' batches while preserving
    # the 'treatment' covariate.
    combatMatrix = r.runComBat(com.convert_to_r_dataframe(vstDataframe),
                               com.convert_to_r_dataframe(annotationDataframe),
                               covariateFormula='~treatment',
                               batchColumn='patient')
    # Convert to dataframe
    combatDataframe = com.convert_robj(combatMatrix)
    # Write file
    combatDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
def _cca(self, X, Z, **params):
    """Given two Pandas dataframes and a set of parameters, performs CCA

    returns CCA dict (converted from R CCA named list object)
    """
    pma = importr("PMA")
    # Sensible defaults; caller-supplied params take precedence.
    ccaArgs = {
        "typex": "standard",
        "typez": "standard",
        "trace": False,
        "K": self.numComponents,
    }
    ccaArgs.update(params)
    # Normalise inputs: pandas frames are converted, R frames pass through.
    if isinstance(X, pd.core.frame.DataFrame):
        X = com.convert_to_r_dataframe(X)
    if isinstance(Z, pd.core.frame.DataFrame):
        Z = com.convert_to_r_dataframe(Z)
    assert isinstance(X, ro.vectors.DataFrame) and isinstance(Z, ro.vectors.DataFrame), "X, Z need to be either Pandas DataFrames or R dataframes!"
    assert self.numComponents <= min(len(X.names), len(Z.names)), "Number of components must be smaller than the minimum of columns in each of your matrices"
    # Number of observations, recorded before fitting.
    rowCount = com.convert_robj(ro.r["nrow"](X)[0])
    print("\tCCA parameters:", ccaArgs)
    fitted = pma.CCA(X, Z, **ccaArgs)
    # Flatten the R named list into a plain dict and tag the row count.
    result = dict(list(fitted.items()))
    result['nGroups'] = rowCount
    return result
def ccaPermuteOutcomesVsControls(self, nPerms = 25, penaltyXs = None , penaltyZs = None): (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes() # groups: set(group_ids) # allOutcomes: {outcome: {group_id: value}} # controls: {control: {group_id: value}} # X contains feature group_norms, Z contains outcome values Zdict = allOutcomes Xdict = controls # R doesn't handle '$'s in column names Xdict = {k.replace('$','.'):v for k, v in Xdict.items()} Zdict = {k.replace('$','.'):v for k, v in Zdict.items()} # X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),pd.DataFrame(data=Zdict), softImputeXtoo=True) X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(pd.DataFrame(data=Xdict), pd.DataFrame(data=Zdict)) try: X = com.convert_to_r_dataframe(X) Z = com.convert_to_r_dataframe(Z) Ngroups = com.convert_robj(ro.r["nrow"](X)[0]) except NameError: warn("pandas.rpy.common cannot be imported") sys.exit(1) kwParams = {"nperms": nPerms} kwParams['penaltyxs'] = penaltyXs if penaltyXs else ro.vectors.FloatVector(np.arange(.1,.91,.05)) kwParams['penaltyzs'] = penaltyZs if penaltyZs else ro.vectors.FloatVector(np.arange(.1,.91,.05)) self._ccaPermute(X,Z, **kwParams)
def run_gams_model_single_run(proc_data, feature_list):
    """Fit a binomial GAM (mgcv::bam) on a 70% random split, predict on all rows.

    Parameters
    ----------
    proc_data : pd.DataFrame with an 'outcome' column plus feature columns.
    feature_list : frame with 'feature_name' / 'feature_type' columns.

    Returns [model, predicted_values, status] on success, or
    [None, None, "FAIL"] when the outcome column is missing.
    """
    if proc_data['outcome'] is not None:
        print("outcome variable found")
    else:
        print("outcome variable missing")
        # fix: the original `list(None, None, status="FAIL")` raises TypeError
        # (list() takes one iterable, no keywords) -- return a plain list.
        return [None, None, "FAIL"]
    # Prepare formula from the feature list, skipping hard-coded excluded indices.
    s = "outcome~"
    for i in range(len(feature_list)):
        if i in [37, 49, 48, 51, 72, 50]:
            continue
        # (fix: both branches of the original feature_type if/else appended the
        # same text, so the branch was redundant)
        s += feature_list['feature_name'][i] + '+'
    s = s[:-1]
    fmla = Formula(s)
    print(s)
    # NOTE(review): dropna() returns a new frame; this call discards its result,
    # so NaNs are NOT actually removed. Kept as-is to preserve behaviour.
    proc_data.dropna()
    response = proc_data['outcome']
    positiveOutcomes = np.sum(np.asarray(response))
    # NOTE(review): range(0, len-1) omits the final row index -- confirm intended.
    full_range = range(0, len(proc_data.index) - 1)
    thres_tune = round((positiveOutcomes / len(proc_data.index)), 3)
    # Random 70/30 train/test split over row positions.
    train_ind = random.sample(range(0, len(proc_data.index) - 1),
                              int(math.floor(0.7 * len(proc_data.index))))
    test_ind = [val for val in full_range if val not in train_ind]
    rdf = com.convert_to_r_dataframe(proc_data.iloc[train_ind])
    # Run the model
    print("Running GAMs model")
    model = mgcv.bam(formula=fmla, data=rdf, family=statsf.binomial(link="logit"))
    print("Model building completed")
    # Predict on the full data set (train + test rows).
    tdf = com.convert_to_r_dataframe(proc_data)
    predval = statsf.predict(model, tdf, type="response")
    predicted_values = pd.concat([
        pd.DataFrame({'pred': predval}, index=[response.index.get_values()]),
        response
    ], axis=1)
    predicted_values.columns = ['prediction', 'observed']
    # Returning the list [model, predictions, status]
    status = "pass"
    res = [model, predicted_values, status]
    print("+++++++++++++++++ Completed +++++++++++++++++++")
    return res
def internal_cluster_evaluation(feature_points, cluster_labels, metrics):
    """Push feature points and cluster labels into the R session.

    Converts both inputs to R data frames and assigns them to the R variables
    `feature_points` and `cluster_labels`. The `metrics` argument is accepted
    but not used here.
    """
    points_rdf = com.convert_to_r_dataframe(DataFrame(feature_points))
    r.assign("feature_points", points_rdf)
    labels_rdf = com.convert_to_r_dataframe(DataFrame(cluster_labels))
    r.assign("cluster_labels", labels_rdf)
    # Evaluate the assigned variable in R (result intentionally discarded).
    r('cluster_labels')
def create_model(input_json):
    """Build Holt-Winters demand models from a JSON list of event timestamps.

    Side effects: sets the module-global `hourly_volume`, and creates
    `hr_model` (168-hour weekly seasonality) and -- when at least 8 weeks of
    data exist -- `dy_model` (28-day monthly seasonality) in the R workspace.
    """
    global hourly_volume
    # Loads JSON file (a list of timestamp strings)
    print('Loading Data...')
    # fix: local variable no longer shadows the `json` module name
    events = loads(input_json)
    # Converts to Pandas Time Series Dataframe which can be converted to be used by R
    df = pd.DataFrame(events)
    df.columns = ['time']
    df['time'] = df['time'].apply(dateutil.parser.parse)
    df.set_index('time', inplace=True)
    df['t'] = 1
    # Resamples Dataframe into hourly (for weekly model) and daily (for monthly model) buckets
    hourly_volume = df.resample('1H', how=np.count_nonzero)
    daily_volume = df.resample('1D', how=np.count_nonzero)
    print('Creating Model...')
    # Converts Pandas Dataframe to R Dataframe and brings them into the R workspace
    demand_data_daily = com.convert_to_r_dataframe(daily_volume)
    demand_data_hourly = com.convert_to_r_dataframe(hourly_volume)
    r.assign('train_data_hourly', demand_data_hourly)
    r.assign('train_data_daily', demand_data_daily)
    # Assigns values to required input variables in R
    r('start_index = ' + str(get_friday_index(hourly_volume)))
    r('month_index = ' + str(get_first_of_month(hourly_volume)))
    # Reorganizes hourly dataframe to seasonal time series w/ 168 hr weekly intervals starting at the 1st Fri
    r('train_data_ts <- ts(train_data_hourly[,1],start=c(1,(168-start_index+2)),frequency=168)')
    # Adds 0.01 as model input data must be non-zero
    r('train_data_ts = train_data_ts+ 0.01')
    # R creates hourly model; we set beta=0 as we assume no global trend (Holt-Winters)
    r('hr_model <- HoltWinters(train_data_ts,beta=0,seasonal="m",start.periods=(168+start_index-1))')
    # R creates a monthly model IFF there is enough data (min 8 weeks)
    r('dy_model = NULL')
    # 1st Fri of hourly dataset translated for daily dataset
    r('start_index = (start_index-1)/24+1')
    if (r('length(train_data_daily[,1])>(28*2+start_index-1)')[0]):
        # if the first fri of the month precedes the start of the dataset,
        # fall back to the prior month's first fri
        r('if(month_index<1){month_index = 28-month_index }')
        # Reorganizes daily dataframe to seasonal time series
        # (fix: the broken string continuation is rejoined into one R call)
        r('train_data_ts <- ts(train_data_daily[,1],start=c(1,month_index),frequency=28)')
        # R creates monthly model, again we assume no global trend
        r('dy_model <- HoltWinters(train_data_ts,seasonal="m",start.periods=(28+start_index-1))')
    print('Model Created!')
def run_gams_model_single_run(proc_data, feature_list):
    """Fit a binomial GAM (mgcv::bam) on a 70% random split, predict on all rows.

    Returns [model, predicted_values, status] on success, or
    [None, None, "FAIL"] when the outcome column is missing.
    """
    if proc_data["outcome"] is not None:
        print("outcome variable found")
    else:
        print("outcome variable missing")
        # fix: `list(None, None, status="FAIL")` raises TypeError (list() takes
        # one iterable, no keywords) -- return a plain 3-element list.
        return [None, None, "FAIL"]
    # Prepare Formula from feature list, skipping hard-coded excluded indices.
    s = "outcome~"
    for i in range(len(feature_list)):
        if i in [37, 49, 48, 51, 72, 50]:
            continue
        # (fix: both branches of the original feature_type if/else appended the
        # same text, so the branch was redundant)
        s += feature_list["feature_name"][i] + "+"
    s = s[:-1]
    fmla = Formula(s)
    print(s)
    # NOTE(review): dropna() returns a new frame; its result is discarded here,
    # so NaNs are NOT actually removed. Kept as-is to preserve behaviour.
    proc_data.dropna()
    response = proc_data["outcome"]
    positiveOutcomes = np.sum(np.asarray(response))
    # NOTE(review): range(0, len-1) omits the final row index -- confirm intended.
    full_range = range(0, len(proc_data.index) - 1)
    thres_tune = round((positiveOutcomes / len(proc_data.index)), 3)
    # Random 70/30 train/test split over row positions.
    train_ind = random.sample(range(0, len(proc_data.index) - 1),
                              int(math.floor(0.7 * len(proc_data.index))))
    test_ind = [val for val in full_range if val not in train_ind]
    rdf = com.convert_to_r_dataframe(proc_data.iloc[train_ind])
    # Run the model
    print("Running GAMs model")
    model = mgcv.bam(formula=fmla, data=rdf, family=statsf.binomial(link="logit"))
    print("Model building completed")
    # Predict on the full data set (train + test rows).
    tdf = com.convert_to_r_dataframe(proc_data)
    predval = statsf.predict(model, tdf, type="response")
    predicted_values = pd.concat(
        [pd.DataFrame({"pred": predval}, index=[response.index.get_values()]), response],
        axis=1
    )
    predicted_values.columns = ["prediction", "observed"]
    # Returning the list [model, predictions, status]
    status = "pass"
    res = [model, predicted_values, status]
    print("+++++++++++++++++ Completed +++++++++++++++++++")
    return res
def create_model(input_json):
    """Build Holt-Winters demand models from a JSON list of event timestamps.

    Side effects: sets the module-global `hourly_volume`, and creates
    `hr_model` (168-hour weekly seasonality) and -- when at least 8 weeks of
    data exist -- `dy_model` (28-day monthly seasonality) in the R workspace.
    """
    global hourly_volume
    # Loads JSON file (a list of timestamp strings)
    print('Loading Data...')
    # fix: local variable no longer shadows the `json` module name
    events = loads(input_json)
    # Converts to Pandas Time Series Dataframe which can be converted to be used by R
    df = pd.DataFrame(events)
    df.columns = ['time']
    df['time'] = df['time'].apply(dateutil.parser.parse)
    df.set_index('time', inplace=True)
    df['t'] = 1
    # Resamples Dataframe into hourly (for weekly model) and daily (for monthly model) buckets
    hourly_volume = df.resample('1H', how=np.count_nonzero)
    daily_volume = df.resample('1D', how=np.count_nonzero)
    print('Creating Model...')
    # Converts Pandas Dataframe to R Dataframe and brings them into the R workspace
    demand_data_daily = com.convert_to_r_dataframe(daily_volume)
    demand_data_hourly = com.convert_to_r_dataframe(hourly_volume)
    r.assign('train_data_hourly', demand_data_hourly)
    r.assign('train_data_daily', demand_data_daily)
    # Assigns values to required input variables in R
    r('start_index = ' + str(get_friday_index(hourly_volume)))
    r('month_index = ' + str(get_first_of_month(hourly_volume)))
    # Reorganizes hourly dataframe to seasonal time series w/ 168 hr weekly intervals starting at the 1st Fri
    r('train_data_ts <- ts(train_data_hourly[,1],start=c(1,(168-start_index+2)),frequency=168)')
    # Adds 0.01 as model input data must be non-zero
    r('train_data_ts = train_data_ts+ 0.01')
    # R creates hourly model; we set beta=0 as we assume no global trend (Holt-Winters)
    r('hr_model <- HoltWinters(train_data_ts,beta=0,seasonal="m",start.periods=(168+start_index-1))')
    # R creates a monthly model IFF there is enough data (min 8 weeks)
    r('dy_model = NULL')
    # 1st Fri of hourly dataset translated for daily dataset
    r('start_index = (start_index-1)/24+1')
    if (r('length(train_data_daily[,1])>(28*2+start_index-1)')[0]):
        # if the first fri of the month precedes the start of the dataset,
        # fall back to the prior month's first fri
        r('if(month_index<1){month_index = 28-month_index }')
        # Reorganizes daily dataframe to seasonal time series
        # (fix: the broken string continuation is rejoined into one R call)
        r('train_data_ts <- ts(train_data_daily[,1],start=c(1,month_index),frequency=28)')
        # R creates monthly model, again we assume no global trend
        r('dy_model <- HoltWinters(train_data_ts,seasonal="m",start.periods=(28+start_index-1))')
    print('Model Created!')
def ccaPermute(self, nPerms=25, penaltyXs=None, penaltyZs=None, controlsWithFeats=False): (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes() # groups: set(group_ids) # allOutcomes: {outcome: {group_id: value}} # controls: {control: {group_id: value}} (groupNorms, featureNames ) = self.featureGetter.getGroupNormsWithZerosFeatsFirst(groups) Zdict = allOutcomes Xdict = groupNorms if controlsWithFeats: print("Appending controls to X") Xdict.update(controls) else: print("Appending controls to Z") Zdict.update(controls) # TO DO: get topic frequencies? # groupNorms: {feat: {group_id: group_norm}} # featureNames: list of possible feature names # X contains feature group_norms, Z contains outcome values X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict), pd.DataFrame(data=Zdict)) try: X = com.convert_to_r_dataframe(X) Z = com.convert_to_r_dataframe(Z) Ngroups = com.convert_robj(ro.r["nrow"](X)[0]) except NameError: warn("pandas.rpy.common cannot be imported") sys.exit(1) kwParams = {"nperms": nPerms} kwParams[ 'penaltyxs'] = penaltyXs if penaltyXs else ro.vectors.FloatVector( np.arange(.1, .91, .05)) kwParams[ 'penaltyzs'] = penaltyZs if penaltyZs else ro.vectors.FloatVector( np.arange(.1, .91, .05)) self._ccaPermute(X, Z, **kwParams)
def gams_for_individual_run(proc_data, feature_list):
    """Fit a binomial GAM (mgcv::bam) on a 70%/30% split and evaluate on the
    held-out 30%.

    Returns [model, predicted_values, auc] on success, or
    [None, None, "FAIL"] when the outcome column is missing.
    """
    if proc_data['outcome'] is not None:
        print("outcome variable found")
    else:
        print("outcome variable missing")
        # fix: `list(None, None, status = "FAIL")` raises TypeError (list()
        # takes one iterable, no keywords) -- return a plain 3-element list.
        return [None, None, "FAIL"]
    # Prepare Formula from feature list, skipping hard-coded excluded indices.
    s = "outcome~"
    for i in range(len(feature_list)):
        if i in [37, 49, 48, 51, 72, 50]:
            continue
        # (fix: both branches of the original feature_type if/else appended the
        # same text, so the branch was redundant)
        s += feature_list['feature_name'][i] + '+'
    s = s[:-1]
    fmla = Formula(s)
    # NOTE(review): dropna() returns a new frame; its result is discarded here,
    # so NaNs are NOT actually removed. Kept as-is to preserve behaviour.
    proc_data.dropna()
    # NOTE(review): range(0, len-1) omits the final row index -- confirm intended.
    full_range = range(0, len(proc_data.index) - 1)
    # thres_tune=round((positiveOutcomes/len(traindata.index)),3)
    train_ind = random.sample(range(0, len(proc_data.index) - 1),
                              int(math.floor(0.7 * len(proc_data.index))))
    test_ind = [val for val in full_range if val not in train_ind]
    response = proc_data['outcome'].iloc[test_ind]
    rdf = com.convert_to_r_dataframe(proc_data.iloc[train_ind])
    # Run the model
    print("Running GAMs model")
    model = mgcv.bam(formula=fmla, data=rdf, family=statsf.binomial(link="logit"))
    print("Model building completed")
    # Predict on the held-out test rows only.
    tdf = com.convert_to_r_dataframe(proc_data.iloc[test_ind])
    predval = statsf.predict(model, tdf, type="response")
    predicted_values = pd.concat([pd.DataFrame({'pred': predval},
                                               index=[response.index.get_values()]),
                                  response], axis=1)
    predicted_values.columns = ['prediction', 'observed']
    # Out-of-sample discrimination on the test split.
    auc = roc_auc_score(response, predval)
    status = "pass"
    res = [model, predicted_values, auc]
    print("+++++++++++++++++ Completed +++++++++++++++++++")
    return res
def check(self):
    """Run Box's M test of equality of covariance matrices (R 'biotools').

    Stores the test's p-value on self.boxMp. (Original note: "some issue
    with output however".)
    """
    importr('biotools')
    # Wrap the raw data and class labels as frames for the R round-trip.
    pan_data = pandas.DataFrame(self.data)
    pan_classes = pandas.DataFrame(self.classes)
    r_data = com.convert_to_r_dataframe(pan_data)
    r_classes = com.convert_to_r_dataframe(pan_classes)
    ro.globalenv['r_data'] = r_data
    ro.globalenv['r_classes'] = r_classes
    # Box's M test, then extract the scalar p-value inside R.
    ro.r('boxM_test = boxM(data=r_data,grouping=r_classes)')
    ro.r('pvals = boxM_test$p.value')
    # NOTE(review): com.load_data is used here to pull the R object 'pvals'
    # back into Python -- confirm it returns the p-value as element [0].
    pan_data = com.load_data('pvals')
    ##P-value of box test of equal covariances
    self.boxMp = pan_data[0]
def removeBatchEffects(infile, outfile):
    """Read a VST expression table, build a sample annotation frame, and call
    the R batch-effect-removal routine, writing its output to `outfile`."""
    # Load the expression matrix, dropping known excluded sample columns.
    vstDataframe = pd.read_table(infile).set_index('gene_symbol').drop(
        ['NK1', 'NK2', 'B8N', 'B10C'], axis=1)
    # Annotation: sample id, patient id (all but last char), treatment (last char).
    annotationRows = [[col, col[:-1], col[-1]] for col in vstDataframe.columns]
    annotationDataframe = pd.DataFrame(
        annotationRows, columns=['sample_id', 'patient_id', 'treatment'])
    # Hand both frames (as R data frames) to the R routine with the output path.
    r.remove_batch_effects(com.convert_to_r_dataframe(vstDataframe),
                           com.convert_to_r_dataframe(annotationDataframe),
                           outfile)
def test_converting_to_factors():
    """Generator test: string columns marked as factors become 2-level R
    factors, while numeric columns stay numeric."""
    frame = DataFrame(
        {
            'colA': Series(randn(1, 5000).flatten() > 0),
            'colB': Series(100 * randn(1, 5000).flatten()),
            'colC': Series(100 + randn(1, 5000).flatten()),
            'colD': Series(randn(1, 5000).flatten() > 0),
        },
    )
    # Stringify the boolean columns so they can act as factors.
    for col in ('colA', 'colD'):
        frame[col] = frame[col].map(str)
    factor_cols = [('colA', 'True'), ('colD', 'True')]
    r_frame = com.convert_to_r_dataframe(frame)
    converted = Rtools.convert_columns_to_factors(r_frame, factor_cols)
    expected = [('colA', 'factor'), ('colB', 'numeric'),
                ('colC', 'numeric'), ('colD', 'factor')]
    for col, kind in expected:
        if kind == 'factor':
            # Boolean-derived factors must have exactly the two levels.
            yield eq_, converted.rx2(col).nlevels, 2
        elif kind == 'numeric':
            # Numeric vectors must not have been converted to factors.
            yield ok_, (not hasattr(converted.rx2(col), 'nlevels'))
def pd_py2ri(o):
    """Convert a pandas/numpy/Python object to its rpy2 (R) equivalent.

    Series are widened to single-column DataFrames first. DatetimeIndex-indexed
    frames become xts objects, other frames become R data.frames; a
    DatetimeIndex or Timestamp gets its dedicated converter. Anything else
    falls back to numpy2ri, then to rpy2's default converter.
    """
    res = None
    if isinstance(o, pd.Series):
        # Promote to a one-column frame so the DataFrame branch handles it.
        o = pd.DataFrame(o, index=o.index)
    if isinstance(o, pd.DataFrame):
        if isinstance(o.index, pd.DatetimeIndex):
            res = rconv.convert_df_to_xts(o)
        else:
            res = rcom.convert_to_r_dataframe(o)
    elif isinstance(o, pd.DatetimeIndex):
        res = rconv.convert_datetime_index(o)
    elif isinstance(o, pd.Timestamp):
        res = rconv.convert_timestamp(o)
    if res is None:
        try:
            res = numpy2ri.py2ri(o)
        except Exception:
            # fix: was a bare `except:` that would also swallow SystemExit /
            # KeyboardInterrupt; narrowed to Exception.
            res = robjects.default_converter.py2ri(o)
    return res
def run_earth(X, y, **kwargs):
    '''
    Run with the R package earth.
    Return prediction value, training time, and number of forward pass iterations.
    '''
    r = robjects.r
    n_rows, _ = X.shape
    # Assemble one frame holding the features plus the response column 'y'.
    frame = pandas.DataFrame(X)
    frame['y'] = y
    r_frame = com.convert_to_r_dataframe(frame)
    r('library(earth)')
    # Define the R driver: fits earth, times it, and counts forward-pass terms.
    r_func = '''
    run <- function(data, degree=1, fast.k=0, penalty=3.0){
        time = system.time(model <- earth(y~.,data=data,degree=degree,penalty=penalty))[3]
        forward_terms = dim(summary(model)$prune.terms)[1]
        y_pred = predict(model,data)
        return(list(y_pred, time, forward_terms, model))
    }
    '''
    r(r_func)
    run = r('run')
    result = run(**{
        'data': r_frame,
        'degree': kwargs['max_degree'],
        'fast.k': 0,
        'penalty': kwargs['penalty'],
    })
    y_pred = numpy.array(result[0]).reshape(n_rows)
    time = result[1][0]
    forward_terms = result[2][0]
    # Each forward-pass iteration adds two terms, hence (terms - 1) / 2.
    return y_pred, time, (forward_terms - 1) / 2
def test_mixed_model():
    """Test: fitting a linear mixed-effects model through Rtools yields a
    tTable containing both the factor and the numeric term."""
    frame = DataFrame(
        {
            'colA': Series(randn(1, 5000).flatten() > 0),
            'colB': Series(100 * randn(1, 5000).flatten()),
            'colC': Series(100 + randn(1, 5000).flatten()),
            'colD': Series(randn(1, 5000).flatten() > 0),
        },
    )
    # Stringify the boolean columns so they can be treated as factors.
    for col in ('colA', 'colD'):
        frame[col] = frame[col].map(str)
    factor_cols = [('colA', 'True'), ('colD', 'True')]
    r_frame = com.convert_to_r_dataframe(frame)
    r_frame = Rtools.convert_columns_to_factors(r_frame, factor_cols)
    # Fixed effects on colA/colB; random intercept grouped by colD.
    base_formula = Formula('colC ~ as.factor(colA) + colB')
    rand_formula = Formula('~1|colD')
    results = Rtools.R_linear_mixed_effects_model(r_frame, base_formula,
                                                  rand_formula)
    print(results['tTable'])
    ok_(('tTable' in results), 'Did not have the tTable in the results')
    ok_(('as.factor(colA)False' in results['tTable'].index), 'Did not have the factor in the tTable')
    ok_(('colB' in results['tTable'].index), 'Did not have the variable in the tTable')
def set_cv_fold(self, df):
    """Send which genes are valid test sets for each CV fold."""
    # Use whichever pandas<->R bridge matches the installed pandas version.
    converted = pandas2ri.py2ri(df) if new_pandas_flag else com.convert_to_r_dataframe(df)
    # Expose the fold table to R under the name the R scripts expect.
    ro.globalenv['cvFoldDf'] = converted
def predict(self, xtest):
    """Predicts class via majority vote.

    Parameters
    ----------
    xtest : pd.DataFrame
        features for test set
    """
    # Convert features with whichever pandas<->R bridge is available.
    if new_pandas_flag:
        r_xtest = pandas2ri.py2ri(xtest)
    else:
        r_xtest = com.convert_to_r_dataframe(xtest)
    pred = self.rf_pred(self.rf, r_xtest)
    # Unpack gene names and predicted classes from the R result.
    if new_pandas_flag:
        genes = pandas2ri.ri2py(pred[1])
        pred_class = pandas2ri.ri2py(pred[0])
    else:
        converted = com.convert_robj(pred)
        genes, pred_class = zip(*converted.items())
    result = pd.DataFrame({'pred_class': pred_class}, index=genes)
    result = result.reindex(xtest.index)
    # Shift to zero-based classes (the R class numbers start at 1).
    result -= 1
    return result['pred_class']
def run_earth(X, y, **kwargs):
    '''Run with the R package earth.
    Return prediction value, training time, and number of forward pass iterations.'''
    r = robjects.r
    n_rows, _ = X.shape
    # One frame holding features plus the response column 'y'.
    frame = pandas.DataFrame(X)
    frame['y'] = y
    r_frame = com.convert_to_r_dataframe(frame)
    r('library(earth)')
    # R driver: fit earth, time the fit, count forward-pass terms, predict.
    r_func = '''
    run <- function(data, degree=1, fast.k=0, penalty=3.0){
        time = system.time(model <- earth(y~.,data=data,degree=degree,penalty=penalty))[3]
        forward_terms = dim(summary(model)$prune.terms)[1]
        y_pred = predict(model,data)
        return(list(y_pred, time, forward_terms, model))
    }
    '''
    r(r_func)
    run = r('run')
    result = run(**{'data': r_frame,
                    'degree': kwargs['max_degree'],
                    'fast.k': 0,
                    'penalty': kwargs['penalty']})
    y_pred = numpy.array(result[0]).reshape(n_rows)
    time = result[1][0]
    forward_terms = result[2][0]
    # Each forward-pass iteration adds two terms, hence (terms - 1) / 2.
    return y_pred, time, (forward_terms - 1) / 2
def fit(self, xtrain, ytrain):
    """The fit method trains R's random forest classifier.

    NOTE: the method name ("fit") and method signature were choosen to
    be consistent with scikit learn's fit method.

    Parameters
    ----------
    xtrain : pd.DataFrame
        features for training set (NOTE: mutated -- a 'true_class' column
        is added to the caller's frame)
    ytrain : pd.DataFrame
        true class labels (as integers) for training set

    Raises
    ------
    ValueError
        if neither is_onco_pred nor is_tsg_pred is set.
    """
    label_counts = ytrain.value_counts()
    # Per-class sample sizes for the random forest, depending on which
    # prediction modes are enabled.
    if self.is_onco_pred and self.is_tsg_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.onco_num],
                    label_counts[self.tsg_num]]
    elif self.is_onco_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.onco_num]]
    elif self.is_tsg_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.tsg_num]]
    else:
        # fix: previously `sampsize` was left unbound here, producing an
        # opaque UnboundLocalError below -- fail with a clear message instead.
        raise ValueError('At least one of is_onco_pred / is_tsg_pred must be set')
    self.set_sample_size(sampsize)
    ytrain.index = xtrain.index  # ensure indexes match
    xtrain['true_class'] = ytrain
    # Train the random forest in R on the combined frame.
    r_xtrain = com.convert_to_r_dataframe(xtrain)
    self.rf = self.rf_fit(r_xtrain, self.ntrees, self.sample_size)
    r_imp = self.rf_imp(self.rf)  # importance dataframe in R
    self.feature_importances_ = com.convert_robj(r_imp)
def skm_permute(data):
    """rpy2 wrapper for R function: KMeansSparseCluster.permute (sparcl).

    The tuning parameter controls the L1 bound on w, the feature weights;
    a permutation approach selects it.

    Parameters
    ----------
    data : pandas DataFrame
        n x p frame (observations x features, e.g. ROIs), subject codes as
        index and features as columns.

    Returns
    -------
    best_L1bound : float
        tuning parameter with the highest gap statistic (more features
        given non-zero weights)
    lowest_L1bound : float
        smallest tuning parameter whose gap statistic is within one sdgap
        of the largest (sparser result)
    """
    sparcl = import_sparcl()
    r_data = com.convert_to_r_dataframe(data)
    km_perm = sparcl.KMeansSparseCluster_permute(r_data, K=2, nperms=25)
    best_L1bound = km_perm.rx2('bestw')[0]
    wbounds = km_perm.rx2('wbounds')
    gaps = km_perm.rx2('gaps')
    sdgaps = km_perm.rx2('sdgaps')
    bestgap = max(gaps)
    # Smallest wbound whose gap statistic is within one sd of the best gap.
    candidates = [w for w, g, s in zip(wbounds, gaps, sdgaps) if g + s >= bestgap]
    lowest_L1bound = min(candidates)
    return best_L1bound, lowest_L1bound
def write_r_dataframe(count_dict, outputfile):
    """Write R-compatible dataframe to output file.

    Parameters
    ----------
    count_dict : dict convertible via pd.DataFrame.from_dict
    outputfile : destination path (truncated/created)
    """
    df = pd.DataFrame.from_dict(count_dict)
    r_dataframe = com.convert_to_r_dataframe(df)
    # fix: the original opened the file without ever closing it; `with`
    # guarantees the handle is flushed and closed even on error.
    with open(outputfile, 'w+') as f1:
        # `print >>fh, x` in the original wrote str(x) plus a newline.
        f1.write(str(r_dataframe) + '\n')
def process_covariates(surv, feature=None, cov=None):
    '''
    Coerce covariates and feature into format suitable for R's
    survival functions. Returns (R data.frame, list of factor names).
    Python 2 code (uses .ix and an eager map() comparison).
    '''
    # Default feature: empty series indexed by the patients in `surv`.
    if type(feature) is type(None):
        feature = pd.Series(index=surv.index.levels[0])
    # Default covariates: empty frame aligned with the feature index.
    if type(cov) is type(None):
        cov = pd.DataFrame(index=feature.index)
    # Normalise cov to a DataFrame (accepts a Series or a list of Series).
    if type(cov) == pd.Series:
        cov = pd.concat([cov], axis=1)
    elif type(cov) == list:
        assert map(type, cov) == ([pd.Series] * len(cov))
        cov = pd.concat(cov, axis=1)
    cov = cov.apply(sanitize_lr)
    feature = sanitize_lr(feature)
    # z-score the real-valued (float/int dtype) covariate columns.
    c_real = cov.ix[:, cov.dtypes.isin([np.dtype(float), np.dtype(int)])]
    c_real = (c_real - c_real.mean()) / c_real.std()
    if c_real.shape[1] > 0:
        cov[c_real.columns] = c_real
    # Drop covariate columns that are entirely missing.
    cov = cov.dropna(1, how='all')
    # Join survival columns, convert days to years, one row per patient.
    df = cov.join(surv.unstack()).dropna()
    df.loc[:, 'days'] = df.loc[:, 'days'] / 365
    df = df.groupby(level=0).first()
    # An all-missing feature is treated as absent.
    if len(feature.dropna()) == 0:
        feature = None
    df, factors = process_factors(df, feature, list(cov.columns))
    df = df[factors + ['days', 'event']]
    df = df.dropna(axis=1, how='all')
    # Hand the final design matrix to R.
    df = convert_to_r_dataframe(df)
    return df, factors
def sarima_test(steps, path):
    """Fit a seasonal ARIMA(1,1,1)(0,1,0)[52] in R on a weekly search-trend
    series, forecast `steps` periods ahead, plot actual vs predicted, and
    return the forecast MAPE (percent)."""
    index_name, my_trend = parse_csv(path)
    dta = pd.DataFrame(my_trend)
    dta.index = index_name
    dta = dta.rename(columns={0: 'search'})
    # Move the series into R as a ts object.
    r_df = com.convert_to_r_dataframe(dta)
    y = stats.ts(r_df)
    order = R.IntVector((1, 1, 1))
    season = R.ListVector({'order': R.IntVector((0, 1, 0)), 'period': 52})
    # Train on the last 5 years excluding the final `steps` observations.
    model = stats.arima(y[-5 * 52:-steps], order=order, seasonal=season)
    f = forecast.forecast(model, h=steps)
    # f[3] holds the point forecasts -- TODO confirm against forecast's layout.
    future = [var for var in f[3]]
    y_pred = np.array(future)
    y_true = np.array(my_trend[-steps:])
    metrics_result = {
        'sarima_MAE': metrics.mean_absolute_error(y_true, y_pred),
        'sarima_MSE': metrics.mean_squared_error(y_true, y_pred),
        'sarima_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    }
    # Plot actual vs predicted for visual inspection (left on current figure).
    p1 = plt.plot(my_trend[-steps:], '*-')
    p2 = plt.plot(future)
    # p1 = plt.plot(index_name,my_trend,'r-')
    # p2 = plt.plot(index_name_future,future,'g-')
    plt.ylabel('Search Intensity')
    plt.xlabel('Year')
    plt.title('Search Prediction of ' + path.split('/')[-1][:-4])
    plt.legend((p1[0], p2[0]), ["Actual", "Predicted"], loc=1)
    plt.grid(True)
    # print metrics_result['sarima_MAPE']
    return metrics_result['sarima_MAPE']
def process_covariates(surv, feature=None, cov=None):
    '''
    Coerce covariates and feature into format suitable for R's
    survival functions. Returns (R data.frame, list of factor names).
    Python 2 code (uses .ix and an eager map() comparison).
    '''
    # Default feature: empty series indexed by the patients in `surv`.
    if type(feature) is type(None):
        feature = pd.Series(index=surv.index.levels[0])
    # Default covariates: empty frame aligned with the feature index.
    if type(cov) is type(None):
        cov = pd.DataFrame(index=feature.index)
    # Normalise cov to a DataFrame (accepts a Series or a list of Series).
    if type(cov) == pd.Series:
        cov = pd.concat([cov], axis=1)
    elif type(cov) == list:
        assert map(type, cov) == ([pd.Series] * len(cov))
        cov = pd.concat(cov, axis=1)
    cov = cov.apply(sanitize_lr)
    feature = sanitize_lr(feature)
    # z-score the real-valued (float/int dtype) covariate columns.
    c_real = cov.ix[:, cov.dtypes.isin([np.dtype(float), np.dtype(int)])]
    c_real = (c_real - c_real.mean()) / c_real.std()
    if c_real.shape[1] > 0:
        cov[c_real.columns] = c_real
    # Drop covariate columns that are entirely missing.
    cov = cov.dropna(1, how='all')
    # Join survival columns, convert days to years, one row per patient.
    df = cov.join(surv.unstack()).dropna()
    df['days'] = df['days'] / 365.
    df = df.groupby(level=0).first()
    # An all-missing feature is treated as absent.
    if len(feature.dropna()) == 0:
        feature = None
    df, factors = process_factors(df, feature, list(cov.columns))
    df = df[factors + ['days', 'event']]
    df = df.dropna(axis=1, how='all')
    # Hand the final design matrix to R.
    df = convert_to_r_dataframe(df)
    return df, factors
def case_classifyCascade(self): """ A individual case classification function""" ########### To R for classification os.chdir("Z:\Cristina\MassNonmass\codeProject\codeBase\extractFeatures\casesDatabase") cF = pd.read_csv('casesFrames_toclasify.csv') cF['finding.mri_mass_yn'] = cF['finding.mri_mass_yn'].astype('int32') cF['finding.mri_nonmass_yn'] = cF['finding.mri_nonmass_yn'].astype('int32') cF['finding.mri_foci_yn'] = cF['finding.mri_foci_yn'].astype('int32') cF['finding.mri_foci_yn'] = cF['finding.mri_foci_yn'].astype('int32') cF['is_insitu'] = cF['is_insitu'].astype('int32') cF['is_invasive'] = cF['is_invasive'].astype('int32') self.rpycasesFrame = com.convert_to_r_dataframe(cF) base = importr('base') base.source('Z:/Cristina/MassNonmass/codeProject/codeBase/finalClassifier/finalClassifier_classifyCascade.R') RFcascade = globalenv['finalClassifier_classifyCascade'](self.rpycasesFrame) self.RFcascade_probs = com.convert_robj(RFcascade) print "\n========================" print self.RFcascade_probs # proccess possible outcome [veredict, caseoutcome] = self.parse_classes(self.RFcascade_probs) print "\n========================\nCascade classification result:" print veredict print caseoutcome return
def write_r_dataframe(count_dict, outputfile):
    """Write R-compatible dataframe to output file.

    Parameters
    ----------
    count_dict : dict convertible via pd.DataFrame.from_dict
    outputfile : destination path (truncated/created)
    """
    df = pd.DataFrame.from_dict(count_dict)
    r_dataframe = com.convert_to_r_dataframe(df)
    # fix: the original opened the file without ever closing it; `with`
    # guarantees the handle is flushed and closed even on error.
    with open(outputfile, 'w+') as f1:
        # `print >>fh, x` in the original wrote str(x) plus a newline.
        f1.write(str(r_dataframe) + '\n')
def RsoftImpute(self, X):
    """Fill in missing entries of X using R's softImpute package.

    The frame is bi-scaled (row/column centering and scaling), a low-rank
    soft-thresholded SVD is fit, and the completed matrix is converted
    back to a Python object.
    """
    si = importr("softImpute")
    r_frame = com.convert_to_r_dataframe(X)
    scaled = si.biScale(r_frame, maxit=100)
    fit = si.softImpute(scaled)
    completed = si.complete(r_frame, fit)
    return com.convert_robj(completed)
def av(data, formula, model='', output='', as_strings='',
       title='Title for Your Output', label='Label for Your Output',
       pythontex=True):
    """Run an R ANOVA (aov) on `data` and return a LaTeX table of its summary.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table, converted to an R data frame (string columns become
        factors during conversion).
    formula : str
        R model formula.
    model : str
        Model type; empty defaults to 'aov'.  NOTE(review): any other value
        leaves `av_model` undefined and raises NameError below — confirm
        whether other models were ever intended.
    output : str
        Output engine; empty defaults to 'xtable'.  NOTE(review): the
        'stargazer'/'texreg' branches only import the packages and never use
        them — presumably unfinished.
    as_strings :
        Unused here.
    title, label : str
        Caption and label passed to xtable.
    pythontex : bool
        If True return the R xtable object; otherwise a joined LaTeX string.
    """
    if not output:
        output = 'xtable'
    if not model:
        model = 'aov'
    # NOTE(review): these imports are bound but never used afterwards.
    if output == 'stargazer':
        stargazer = importr('stargazer')
    elif output == 'texreg':
        texreg = importr('texreg')
    formula = robjects.Formula(formula)
    # convert from pandas to R and make string columns factors
    dfr = com.convert_to_r_dataframe(data)
    if model == 'aov':
        output = 'xtable'  # aov only works with xtable
        av_model = stats.aov(formula, data=dfr)
        av_model_sum = base.summary(av_model)
    if output == 'xtable':
        xtable = importr('xtable')
        latex = xtable.xtable(av_model_sum, caption=title, label=label)
        if pythontex:
            return latex
        else:
            return '\n'.join(np.array(latex))
def sarima_test(steps, path):
    """Backtest a weekly SARIMA(1,1,1)x(0,1,0)[52] forecast on a held-out tail.

    Parameters
    ----------
    steps : int
        Forecast horizon in weeks; also the size of the held-out tail used
        for scoring.
    path : str
        CSV file understood by parse_csv(); the file stem is used in the
        plot title.

    Returns
    -------
    float
        MAPE of the forecast over the held-out tail.

    Side effects: draws actual-vs-predicted onto the current matplotlib figure.
    """
    index_name, my_trend = parse_csv(path)
    dta = pd.DataFrame(my_trend)
    dta.index = index_name
    dta = dta.rename(columns={0: 'search'})
    # Hand the series to R: data.frame -> ts object.
    r_df = com.convert_to_r_dataframe(dta)
    y = stats.ts(r_df)
    # Non-seasonal (p,d,q) and seasonal (P,D,Q) with a 52-week period.
    order = R.IntVector((1, 1, 1))
    season = R.ListVector({'order': R.IntVector((0, 1, 0)), 'period': 52})
    # Train on the last five years of data minus the held-out `steps` weeks.
    model = stats.arima(y[-5 * 52:-steps], order=order, seasonal=season)
    f = forecast.forecast(model, h=steps)
    # f[3] is assumed to be the point-forecast slot of the R forecast
    # object — NOTE(review): confirm against the forecast package layout.
    future = [var for var in f[3]]
    y_pred = np.array(future)
    y_true = np.array(my_trend[-steps:])
    metrics_result = {'sarima_MAE': metrics.mean_absolute_error(y_true, y_pred),
                      'sarima_MSE': metrics.mean_squared_error(y_true, y_pred),
                      'sarima_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100}
    # Plot held-out actuals against the forecast.
    p1 = plt.plot(my_trend[-steps:], '*-')
    p2 = plt.plot(future)
    # p1 = plt.plot(index_name,my_trend,'r-')
    # p2 = plt.plot(index_name_future,future,'g-')
    plt.ylabel('Search Intensity')
    plt.xlabel('Year')
    plt.title('Search Prediction of ' + path.split('/')[-1][:-4])
    plt.legend((p1[0], p2[0]), ["Actual", "Predicted"], loc=1)
    plt.grid(True)
    # print metrics_result['sarima_MAPE']
    return metrics_result['sarima_MAPE']
def spades_assembly_qc_plot(args):
    """Scatter-plot contig length vs. coverage (log-log) for a SPAdes assembly.

    args[0] is the input FASTA, args[1] the output PDF. Coverage is parsed
    from the last '_'-separated field of each record's description (SPAdes
    header convention); red vertical guides mark 1 kb, 2.5 kb and 10 kb.
    """
    from pandas import DataFrame
    import pandas.rpy.common as com
    import rpy2.robjects as ro
    import rpy2.robjects.lib.ggplot2 as ggplot2

    fasta_in = args[0]
    outfile = args[1]

    def gc_fraction(rec):
        # Fraction of G/C bases in the contig.
        return float(rec.seq.count("G") + rec.seq.count("C")) / len(rec.seq)

    def coverage(rec):
        # SPAdes encodes coverage as the trailing '_'-separated field.
        return float(rec.description.split("_")[-1])

    with open(fasta_in) as handle:
        records = [rec for rec in SeqIO.parse(handle, "fasta")]

    per_contig = {rec.id: {'GC': gc_fraction(rec),
                           'length': len(rec.seq),
                           'cov': coverage(rec)}
                  for rec in records}
    frame = DataFrame.from_dict(per_contig).transpose()
    r_frame = com.convert_to_r_dataframe(frame)

    ro.r.library('ggplot2')
    figure = (ro.r.ggplot(r_frame, ro.r.aes_string(x='length', y='cov'))
              + ro.r.geom_point()
              + ro.r.scale_y_log10()
              + ro.r.scale_x_log10()
              + ro.r.geom_vline(xintercept=1000, color="red")
              + ro.r.geom_vline(xintercept=2500, color="red")
              + ro.r.geom_vline(xintercept=10000, color="red")
              + ro.r.theme_bw())
    figure.plot()
    ro.r('dev.copy(pdf,"%s")' % (outfile))
    ro.r('dev.off()')
def __init__(self, name, data):
    """
    Dataset class for R data inputs

    Parameters
    ----------
    name : str
        Name of dataset
    data : 2-tuple
        data[0]: data format, e.g. 'gslib', 'bhdf'
        data[1]: data (or path/location)
    """
    self.name = name
    self.dataformat = data[0]
    self.datapath = data[1]
    xyz = ['x', 'y', 'z']
    # Fix: the original used `data[0] is 'table'`, which tests object
    # identity and only works by accident of string interning; use equality.
    if data[0] == 'table':
        self.dataframe = pd.read_csv(self.datapath)
        self.col_names = self.dataframe.columns.values.tolist()
        self.ncol = len(self.col_names)
        # Positions of the coordinate columns (raises ValueError if absent).
        self.xyz_cols = [i for i in map(self.col_names.index, xyz)]
        # Every non-coordinate column is treated as a variable.
        self.variables = [i for i in self.col_names if i not in xyz]
        self.var_cols = [self.col_names.index(i) for i in self.variables]
        self.nvar = len(self.variables)
        self.rdf = com.convert_to_r_dataframe(self.dataframe)
def test_convert_r_dataframe(self):
    """Round-trip check: a pandas DataFrame converted to an R data.frame
    preserves index, columns, numeric values, and NaN -> NA handling."""
    is_na = robj.baseenv.get("is.na")

    seriesd = tm.getSeriesData()
    frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])

    # Null data
    frame["E"] = [np.nan for item in frame["A"]]
    # Some mixed type data
    # NOTE(review): column "F" is created but never asserted on below.
    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]

    r_dataframe = com.convert_to_r_dataframe(frame)

    # Index and column labels must survive the conversion.
    assert np.array_equal(
        com.convert_robj(r_dataframe.rownames), frame.index)
    assert np.array_equal(
        com.convert_robj(r_dataframe.colnames), frame.columns)
    # The all-NaN column must become all-NA on the R side.
    assert all(is_na(item) for item in r_dataframe.rx2("E"))

    # Numeric columns convert value-for-value.
    for column in frame[["A", "B", "C", "D"]]:
        coldata = r_dataframe.rx2(column)
        original_data = frame[column]
        assert np.array_equal(com.convert_robj(coldata), original_data)

    # Element-wise check including the NaN column: NaN maps to NA,
    # everything else compares equal.
    for column in frame[["D", "E"]]:
        for original, converted in zip(frame[column],
                                       r_dataframe.rx2(column)):
            if pd.isnull(original):
                assert is_na(converted)
            else:
                assert original == converted
def test_convert_r_dataframe(self):
    """Verify pandas -> R data.frame conversion keeps labels, values and NAs."""
    is_na = robj.baseenv.get("is.na")

    seriesd = tm.getSeriesData()
    frame = pd.DataFrame(seriesd, columns=["D", "C", "B", "A"])
    # One fully-null column and one mixed text/NaN column.
    frame["E"] = [np.nan for _ in frame["A"]]
    frame["F"] = ["text" if i % 2 == 0 else np.nan for i in range(30)]

    rdf = com.convert_to_r_dataframe(frame)

    # Labels round-trip.
    assert np.array_equal(com.convert_robj(rdf.rownames), frame.index)
    assert np.array_equal(com.convert_robj(rdf.colnames), frame.columns)
    # The all-NaN column is all-NA in R.
    assert all(is_na(v) for v in rdf.rx2("E"))

    # Numeric columns round-trip value-for-value.
    for name in frame[["A", "B", "C", "D"]]:
        assert np.array_equal(com.convert_robj(rdf.rx2(name)), frame[name])

    # Per-element check: NaN becomes NA, everything else is equal.
    for name in frame[["D", "E"]]:
        for py_val, r_val in zip(frame[name], rdf.rx2(name)):
            if pd.isnull(py_val):
                assert is_na(r_val)
            else:
                assert py_val == r_val
def to_r_obj(dataframe):
    """Convert the input to an R data frame.

    Non-DataFrame inputs are first wrapped in a pandas DataFrame.  A
    single-column result is unwrapped to the bare column vector.
    """
    if not isinstance(dataframe, pd.DataFrame):
        dataframe = pd.DataFrame(dataframe)
    converted = rpycom.convert_to_r_dataframe(dataframe)
    # One-column frames collapse to their single column.
    return converted[0] if len(converted.colnames) == 1 else converted
def pathway_mutation_section_exp(cancer, gene_sets, cutoff=.25):
    """Build a nozzle report section relating pathway-level expression to
    clinical features (age, survival), with per-pathway drill-down figures.

    Parameters
    ----------
    cancer : object
        Study object; must expose report_folder, q_pathways, data_matrix
        and clinical — attributes inferred from use here, confirm against
        the class definition.
    gene_sets : dict
        Pathway name -> list of genes.
    cutoff : float
        q-value threshold below which per-pathway figures are generated.

    Returns
    -------
    A nozzle (nz) section object.
    """
    #Format data for report
    path = cancer.report_folder + '/'
    pathway_table_file = path + 'pathway_table.csv'
    pathway_table = format_pathway_table_exp(cancer, gene_sets)
    if 'survival' in pathway_table:
        # NOTE(review): the result of .sort is discarded here, so the CSV
        # below is written unsorted — likely a bug.
        pathway_table.sort(columns='survival')
    pathway_table.to_csv(pathway_table_file)
    # Keep only pathways significant (q < .25) for at least one feature.
    # NOTE(review): .25 is hard-coded rather than using `cutoff`.
    keepers = cancer.q_pathways[(cancer.q_pathways < .25).sum(1) > 0].index
    pathway_table = pathway_table.ix[keepers]
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table = pathway_table.head(20)
    # NaNs break the R conversion; 1.23 acts as a sentinel placeholder.
    pathway_table_r = com.convert_to_r_dataframe(pathway_table.replace(nan, 1.23)) #@UndefinedVariable
    if len(pathway_table) == 0:
        return nz.addTo(nz.newSubSection('Expressed Pathways'),
                        nz.newParagraph(''))

    #Overview
    # NOTE(review): adjacent string literals concatenate without a space
    # ("patientclinical") — probable caption typo.
    tableCaption1 = ('Association of pathway level expression patterns with patient' +
                     'clinical features.')
    table1 = nz.newTable(pathway_table_r, tableCaption1,
                         file=pathway_table_file, significantDigits=2);

    #Fill in the details: map pathway/column names to 0-based positions.
    pathway_pos = dict((p,i) for i,p in enumerate(pathway_table.index))
    col_pos = dict((c,i) for i,c in enumerate(pathway_table.columns))

    #age scatter plots for pathways beating the cutoff on age
    for p in (pathway_table['age'][pathway_table['age'] < cutoff]).index:
        fig_file = path + FIG_EXT + p + '_age.png'
        draw_pathway_age_scatter(p, cancer, fig_file)
        age_fig1 = nz.newFigure(fig_file, 'Age of patients with or without' +
                                'mutation to pathway.')
        result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                           nz.addTo(nz.newSection(p), age_fig1))
        # Nozzle table cells are 1-indexed, hence the +1.
        table1 = nz.addTo(table1, result1, row=pathway_pos[p]+1,
                          column=col_pos['age']+1)

    #survival curves for pathways beating the cutoff on survival
    for p in (pathway_table['survival'][pathway_table['survival'] < cutoff]).index:
        fig_file = path + FIG_EXT + p + '_survival.png'
        data_frame = cancer.data_matrix.ix[gene_sets[p]].dropna()
        # SVD of the z-scored expression sub-matrix; vH[0] is the first
        # eigen-patient loading vector.
        U,S,vH = frame_svd(((data_frame.T - data_frame.mean(1)) /
                            data_frame.std(1)).T)
        # Stratify patients into 0/1/2 (low/mid/high) by +-1 SD of loading.
        strat = (vH[0] > vH[0].std()).astype(int) - (vH[0] < -vH[0].std()) + 1
        draw_survival_curves(cancer.clinical,
                             Series(strat, name='pc'),
                             labels=['low','mid','high'],
                             filename=fig_file)
        sv_fig1 = nz.newFigure(fig_file, 'Survival of patients with ' +
                               'varying levels of pathway expression.')
        fig_file2 = path + FIG_EXT + p + '.svg'
        draw_pathway_eig_bar(U, fig_file2)
        sv_fig_2 = nz.newFigure(fig_file2, 'Loading for first eigen-patient.')
        result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                           nz.addTo(nz.newSection(p), sv_fig1, sv_fig_2))
        table1 = nz.addTo(table1, result1, row=pathway_pos[p]+1,
                          column=col_pos['survival']+1)

    section = nz.addTo(nz.newSubSection('Pathway Mutations'), table1)
    return section
def set_cv_fold(self, df):
    """Send which genes are valid test sets for each CV fold.

    Converts `df` with whichever pandas/rpy2 bridge is available and binds
    it into the R global environment as `cvFoldDf`.
    """
    if new_pandas_flag:
        # Modern rpy2: explicit converter context.
        converter = ro.default_converter + pandas2ri.converter
        with localconverter(converter):
            converted = ro.conversion.py2rpy(df)
    else:
        # Legacy pandas.rpy bridge.
        converted = com.convert_to_r_dataframe(df)
    ro.globalenv['cvFoldDf'] = converted
def _from_python(obj):
    """Map a pandas/numpy object onto its rpy2 equivalent.

    DataFrames become R data frames, Series and ndarrays become R vectors;
    anything else is passed through unchanged.
    """
    if isinstance(obj, DataFrame):
        return convert_to_r_dataframe(obj)
    if isinstance(obj, Series):
        return numpy2ri(obj.values)
    if isinstance(obj, np.ndarray):
        return numpy2ri(obj)
    return obj
def runCharacteristicDirection(infiles, outfile):
    """Run R characteristic-direction analysis per timepoint vs. day-0 controls.

    Parameters
    ----------
    infiles : (str, str)
        Paths to the VST expression table (indexed by gene_symbol) and the
        sample annotation table (indexed by sample_name, with a 'day' column).
    outfile : str
        Destination TSV: a genes x timepoints matrix of CD values.
    """
    # Split infiles
    vstFile, annotationFile = infiles
    # Read expression data
    vstDataframe = pd.read_table(vstFile, index_col='gene_symbol')
    # Read annotation data
    annotationDataframe = pd.read_table(annotationFile, index_col='sample_name')
    # Get timepoint samples: day label -> list of sample names
    timepointSampleDict = {
        'day' + str(day): annotationDataframe.index[annotationDataframe['day'] == day].tolist()
        for day in set(annotationDataframe['day'])
    }
    # Pool days 4 and 5 into a single timepoint
    timepointSampleDict[
        'day4-5'] = timepointSampleDict['day4'] + timepointSampleDict['day5']
    del timepointSampleDict['day4']
    del timepointSampleDict['day5']
    # Day 0 serves as the control group for every comparison
    controlColumns = timepointSampleDict.pop('day0')
    # Initialize empty dataframe
    resultDataframe = pd.DataFrame()
    # Loop through timepoints
    for timepoint in timepointSampleDict.keys():
        # Get experiment samples
        experimentColumns = timepointSampleDict[timepoint]
        # Run characteristic direction.  NOTE(review): the meaning of the
        # trailing 0.1 argument is taken on faith — confirm against the R
        # function's signature.
        cdResults = r.runCharacteristicDirection(
            com.convert_to_r_dataframe(vstDataframe), experimentColumns,
            controlColumns, 0.1)
        # Convert to dataframe
        cdDataframe = com.convert_robj(cdResults).reset_index()
        # Add timepoint column
        cdDataframe['timepoint'] = timepoint
        # Append
        resultDataframe = pd.concat([resultDataframe, cdDataframe])
    # Pivot to a genes x timepoints matrix of CD values
    resultDataframeCast = resultDataframe.pivot(index='index',
                                                columns='timepoint',
                                                values='CD')
    # Save
    resultDataframeCast.to_csv(outfile, sep='\t', index_label='gene_symbol')
def preview(fileid):
    """Fetch a Dataverse datafile, build an HTML preview plus per-variable
    summary statistics (computed in R), and render the preview template.

    Parameters
    ----------
    fileid : int or str
        Dataverse datafile id appended to the access URL.

    Side effects: sets the module globals `df_global` and `metadata_all`.
    """
    download_url = 'http://dataverse.harvard.edu/api/access/datafile/' + str(fileid)
    request = requests.get(download_url)
    d = request.text
    # Normalize to ASCII so StringIO/read_table don't choke on odd glyphs.
    data_unicode = unicodedata.normalize("NFKD", d).encode("ascii",'ignore')
    data_string = StringIO(data_unicode)
    df = pd.read_table(data_string)
    global df_global
    df_global = df
    #create variable names for variable table
    variables = df.columns
    total_rows = df.shape[0]
    total_cols = df.shape[1]
    #Create HTML of pandas dataframe (1-based row numbering for display)
    df.index += 1
    df_html = df.to_html()
    # Tag the table so the front-end DataTable JS can find it.
    start = df_html.find('class="dataframe"')
    df_html = df_html[:start] + 'id = "preview_DataTable"' + df_html[start+1:]
    r_dataframe = com.convert_to_r_dataframe(df_global)
    robjects.r(''' source('preprocess.R') ''')
    r_preprocess = robjects.globalenv['preprocess']
    meta = str(r_preprocess(testdata =r_dataframe))
    # Strip R string-escaping artifacts so the payload parses as JSON.
    meta= meta.replace('\\', '')
    meta=meta.replace('"\n', '')
    meta = meta.replace('[1] "', '')
    global metadata_all
    metadata_all = pandas.io.json.read_json(meta)
    metadata_subset = [0,3,4,9,12,13,14,16,17,20,24,25] #the summary metrics I want for the summary stats
    variable_info_dict = dict()
    for var in variables:
        metadata_variable_series = metadata_all[var]
        metadata_variable = pandas.DataFrame(metadata_variable_series)
        sumstats_variable = metadata_variable.ix[metadata_subset]
        sumstats_variable_html = str(sumstats_variable.to_html(header = False))
        start = sumstats_variable_html.find('class="')
        stop =sumstats_variable_html.find(">")
        # NOTE(review): the replacement class attribute embeds a stray '>"',
        # and the two .replace calls below look garbled (possibly mangled
        # HTML entities like '&lt;'/'&gt;') — verify the intended escaping.
        sumstats_variable_html= sumstats_variable_html[:start] + 'class = "table-condensed table-striped>"' + sumstats_variable_html[stop+1:]
        sumstats_variable_html = sumstats_variable_html.replace('<','<')
        sumstats_variable_html = sumstats_variable_html.replace('<','>')
        variable_info_dict[var] = sumstats_variable_html
    d = {"data": df_html, 'variables' : variables, 'fileid':str(fileid),
         'variable_info_dict':variable_info_dict,
         'total_rows':total_rows, 'total_cols':total_cols}
    return render_template('gui_redo.html', **d)
def line_plot(pdf_file, data, x, y, var, null_label="N/A", linetype=None,
              title=None, xlab=None, ylab=None, colorname=None, linename=None,
              **extra_aes_params):
    """Draw a ggplot2 point+path plot of `data` to an A4-landscape PDF.

    Parameters
    ----------
    pdf_file : str
        Output PDF path.
    data : pandas.DataFrame
        Source table.  NOTE(review): mutated in place (x column recoded,
        'sortcol'/'group' columns added, rows re-sorted).
    x, y : str
        Column names for the axes.
    var : str
        Column mapped to point colour (and grouping when `linetype` is unset).
    null_label : str
        Label substituted for missing values in the x column.
    linetype : str, optional
        Column mapped to line type; combined with `var` to form line groups.
    title, xlab, ylab, colorname, linename : str, optional
        Labels; fall back to the column names when omitted.
    extra_aes_params :
        NOTE(review): accepted but never used in this function.
    """
    pdf(pdf_file, width=11.7, height=8.3, paper="a4r")

    if any(data[x].isnull()):
        # Recode missing x values to `null_label` and order the category
        # labels with the null label first.
        labels = [null_label] + map(str, sorted(set(
            data[data[x].notnull()][x])))
        labels = robjects.StrVector(labels)
        nulls = data[x].isnull()
        label_vals = dict(zip(labels, range(len(labels))))
        data[x] = data[x].astype("str")
        # NOTE(review): chained assignment; modern pandas needs .loc here.
        data[x][nulls] = null_label
        data['sortcol'] = data[x].map(label_vals.__getitem__)
        data.sort('sortcol', inplace=True)
    else:
        labels = None

    # Line groups: colour variable alone, or colour x linetype combination.
    if linetype and linetype != var:
        data['group'] = data[var].map(str) + data[linetype].map(str)
    else:
        data['group'] = data[var]

    rdata = common.convert_to_r_dataframe(data)
    if labels:
        # Replace the x column with an ordered factor so ggplot keeps order.
        ix = rdata.names.index(x)
        rdata[ix] = ordered(rdata[ix], levels=labels)

    gp = gg2.ggplot(rdata)
    pp = (
        gp + gg2.geom_point(size=3) +
        gg2.scale_colour_hue(name=(colorname or var)) +
        #gg2.scale_colour_continuous(low="black") +
        gg2.aes_string(x=x, y=y, color=var, variable=var) +
        ggtitle(title or "") +
        xlabel(xlab or x) +
        ylabel(ylab or y)
        #+
        #gg2.scale_y_continuous(breaks=seq(0.0, 1.0, 0.05))
    )
    # line type stuff
    if linetype:
        pp += gg2.geom_path(gg2.aes_string(group='group', linetype=linetype),
                            size=0.5)
        pp += gg2.scale_linetype(name=(linename or linetype))
    else:
        pp += gg2.geom_path(gg2.aes_string(group='group'), size=0.5)
    pp.plot()
    dev_off()
def sarima(steps, path):
    """Fit a weekly SARIMA(1,1,1)x(0,1,0)[52] model in R and forecast ahead.

    Parameters
    ----------
    steps : int
        Number of weekly periods to forecast beyond the end of the series.
    path : str
        CSV input understood by parse_csv().

    Returns
    -------
    (index_name, dt, my_trend, future)
        Original index, forecast date index, original series values, and
        the forecast values.

    NOTE(review): an identical `sarima` definition appears again later in
    this module and shadows this one at import time.
    """
    index_name, my_trend = parse_csv(path)
    dta = pd.DataFrame(my_trend)
    dta.index = index_name
    dta = dta.rename(columns={0: 'search'})
    #dta.plot(figsize=(10,4))
    #==========================================================================
    # check stationarity
    #==========================================================================
    #r_df = com.convert_to_r_dataframe(DataFrame(dta))
    #y = stats.ts(r_df)
    #ad = tseries.adf_test(y, alternative="stationary", k=52)
    #a = ad.names[:5]
    #{ad.names[i]:ad[i][0] for i in xrange(len(a))}
    #==========================================================================
    # check the seasonality
    #==========================================================================
    #diff1lev = dta.diff(periods=1).dropna()
    #diff1lev.plot(figsize=(12,6))
    #diff1lev_season = diff1lev.diff(52).dropna()
    #r_df = com.convert_to_r_dataframe(DataFrame(diff1lev_season))
    #diff1lev_season1lev = diff1lev_season.diff().dropna()
    #==========================================================================
    # check stationarity after difference
    #==========================================================================
    #y = stats.ts(r_df)
    #ad = tseries.adf_test(y, alternative="stationary", k=52)
    #a = ad.names[:5]
    #{ad.names[i]:ad[i][0] for i in xrange(len(a))}
    #==========================================================================
    # plot acf and pacf
    #==========================================================================
    #fig = plt.figure(figsize=(12,8))
    #ax1 = fig.add_subplot(211)
    #fig = sm.graphics.tsa.plot_acf(diff1lev_season1lev.values.squeeze(), lags=150, ax=ax1)
    #ax2 = fig.add_subplot(212)
    #fig = sm.graphics.tsa.plot_pacf(diff1lev_season1lev, lags=150, ax=ax2)
    #fig
    # Hand the series to R and fit the seasonal ARIMA model.
    r_df = com.convert_to_r_dataframe(dta)
    y = stats.ts(r_df)
    order = R.IntVector((1, 1, 1))
    season = R.ListVector({'order': R.IntVector((0, 1, 0)), 'period': 52})
    a = time.time()
    model = stats.arima(y, order=order, seasonal=season)
    # Report the model-fitting wall time.
    print time.time() - a
    f = forecast.forecast(model, h=steps)
    # f[3] is assumed to be the point-forecast slot of the R forecast object
    # — NOTE(review): confirm against the forecast package layout.
    future = [var for var in f[3]]
    # build a date index for the forecast horizon (weekly frequency)
    dt = date_range(dta.index[-1], periods=len(future) + 1, freq='W')[1:]
    pr = Series(future, index=dt)
    # dta.plot(figsize=(12,6))
    # pr.plot(color = 'red')
    return index_name, dt, my_trend, future
def sarima(steps,path): index_name,my_trend = parse_csv(path) dta = pd.DataFrame(my_trend) dta.index = index_name dta=dta.rename(columns = {0:'search'}) #dta.plot(figsize=(10,4)) #============================================================================== # check stationarity #============================================================================== #r_df = com.convert_to_r_dataframe(DataFrame(dta)) #y = stats.ts(r_df) #ad = tseries.adf_test(y, alternative="stationary", k=52) #a = ad.names[:5] #{ad.names[i]:ad[i][0] for i in xrange(len(a))} #============================================================================== # check the seasonality #============================================================================== #diff1lev = dta.diff(periods=1).dropna() #diff1lev.plot(figsize=(12,6)) #diff1lev_season = diff1lev.diff(52).dropna() #r_df = com.convert_to_r_dataframe(DataFrame(diff1lev_season)) #diff1lev_season1lev = diff1lev_season.diff().dropna() #============================================================================== # check stationarity after difference #============================================================================== #y = stats.ts(r_df) #ad = tseries.adf_test(y, alternative="stationary", k=52) #a = ad.names[:5] #{ad.names[i]:ad[i][0] for i in xrange(len(a))} #============================================================================== # plot acf and pacf #============================================================================== #fig = plt.figure(figsize=(12,8)) #ax1 = fig.add_subplot(211) #fig = sm.graphics.tsa.plot_acf(diff1lev_season1lev.values.squeeze(), lags=150, ax=ax1) #ax2 = fig.add_subplot(212) #fig = sm.graphics.tsa.plot_pacf(diff1lev_season1lev, lags=150, ax=ax2) #fig r_df = com.convert_to_r_dataframe(dta) y = stats.ts(r_df) order = R.IntVector((1,1,1)) season = R.ListVector({'order': R.IntVector((0,1,0)), 'period' : 52}) a = time.time() model = stats.arima(y, order = order, seasonal=season) print time.time()-a 
f = forecast.forecast(model,h=steps) future = [var for var in f[3]] dt = date_range(dta.index[-1], periods=len(future)+1,freq='W')[1:] #создаем индекс из дат pr = Series(future, index = dt) # dta.plot(figsize=(12,6)) # pr.plot(color = 'red') return index_name,dt,my_trend,future
def save_to_R(X, filename):
    """Persist X as a compressed R data file.

    The array-like X is bound in R as the object 'X' and saved to
    '<filename>.gz' via R's save().
    """
    import numpy as np
    from rpy2.robjects import r
    import pandas.rpy.common as com
    from pandas import DataFrame

    r_frame = com.convert_to_r_dataframe(DataFrame(np.array(X)))
    r.assign("X", r_frame)
    r("save(X, file='%s.gz', compress=TRUE)" % (filename))
def simpleNetworkx(G):
    """
    D3 JavaScript networkx graphs using python.

    This is a python interface to Christopher Gandrud's R package networkD3.

    Parameters
    ----------
    G : A networkx graph.

    Returns
    -------
    None. Writes 'Net.html', an HTML page containing an interactive
    visual of the graph.

    Example
    -------
    >>> G = nx.Graph()
    >>> H = ["A","B","C","D","E","F","G", "H","I","J"]
    >>> G.add_nodes_from(H)
    >>> G.add_edges_from([("A","B"), ("A","C"), ("A","D"), ("A","J"),
    ...                   ("B","E"), ("B","F"), ("C","G"),("C","H"), ("D","I")])
    >>> simpleNetworkx(G)

    References
    ----------
    [1] Christopher Gandrud - https://github.com/christophergandrud/networkD3
    """
    # Stage the edge list in R as two parallel vectors (src, target).
    ro.r('src = c()')
    ro.r('target =c()')
    ro.r('rdf=data.frame()')
    df = p.DataFrame(data=G.edges())
    df_r = com.convert_to_r_dataframe(df)
    ro.globalenv['src'] = df_r[0]
    ro.globalenv['target'] = df_r[1]
    ro.r('rdf=data.frame(src,target)')

    utils = importr('utils')
    utils.chooseCRANmirror(ind=1)

    # Import networkD3/magrittr, installing each on first use.  Fix: the
    # original used bare `except:` clauses, which would also swallow
    # KeyboardInterrupt/SystemExit.
    try:
        networkD3 = importr('networkD3')
    except Exception:
        utils.install_packages('networkD3')
        networkD3 = importr('networkD3')
    try:
        magrittr = importr('magrittr')  # provides the %>% pipe used below
    except Exception:
        utils.install_packages('magrittr')
        magrittr = importr('magrittr')

    ro.r('''simpleNetwork(rdf) %>% saveNetwork(file = 'Net.html')''')
    return None
def Sizezonematrixfeatures(ngrl,mask):
    '''Compute gray-level size-zone matrix (GLSZM) features for every *.sim
    image in the current directory and write them to "SZM_features.csv".

    Parameters
    ----------
    ngrl : number
        Desired number of gray levels; 0 selects square-root re-binning
        instead of linear rescaling.
    mask : str
        Which mask file to apply: 'drug', 'mim' or 'tic'.
    '''
    dircontent = os.listdir('.')
    selionimg = grep(dircontent,'.sim')
    fielid = []; szmfeature=[]
    for f in selionimg:
        sname = f[:-4]
        print('PROCESSING %s' %sname)
        Img = np.genfromtxt(f,dtype=float,delimiter=',')
        if mask == 'drug':
            print('<--- Using drug mask -->')
            Mask = np.genfromtxt(sname + '_drug.msk',dtype=float,delimiter=',')
        if mask == 'mim':
            print('<--- Using MIM tissue mask -->')
            Mask = np.genfromtxt(sname + '_mim.msk',dtype=float,delimiter=',')
        if mask == 'tic':
            print('<--- Using TIC tissue mask -->')
            # NOTE(review): this branch announces the TIC mask but reads the
            # '_mim.msk' file — confirm whether '_tic.msk' was intended.
            Mask = np.genfromtxt(sname + '_mim.msk',dtype=float,delimiter=',')
        ## rescaling to the desired number of gray levels
        if (ngrl != 0):
            m = ngrl/Img.max()
            scaledImg = Img*m
            binnedImg = np.rint(scaledImg)
            Img = (binnedImg + 1)  # +1 keeps tissue levels strictly positive
        else:
            # Default: square-root compression of intensities.
            Img = np.sqrt(Img)
            Img = np.rint(Img)
            Img = (Img +1)
        # Apply the mask; masked-out pixels become gray level 0.
        tissue = np.multiply(Img,Mask)
        tissue = pd.DataFrame(tissue)
        rdf = com.convert_to_r_dataframe(tissue)
        ro.globalenv['tissue'] = rdf
        ro.r('tissue <- as.matrix(tissue)')
        ro.r('library(radiomics)')
        ro.r('szmatrix <- glszm(tissue)')
        ro.r('szmatrix[0,] <- 0') ### Assign zero value to first row which belongs to mask region
        # Collect the 11 GLSZM scalar features into an 11x1 R array.
        ro.r('szmfeature <- array(NA,dim=c(11,1))')
        ro.r('szmfeature[1,1] <- glszm_SAE(szmatrix)')
        ro.r('szmfeature[2,1] <- glszm_LAE(szmatrix)')
        ro.r('szmfeature[3,1] <- glszm_IV(szmatrix)')
        ro.r('szmfeature[4,1] <- glszm_HILAE(szmatrix)')
        ro.r('szmfeature[5,1] <- glszm_LILAE(szmatrix)')
        ro.r('szmfeature[6,1] <- glszm_HISAE(szmatrix)')
        ro.r('szmfeature[7,1] <- glszm_LISAE(szmatrix)')
        ro.r('szmfeature[8,1] <- glszm_HIE(szmatrix)')
        ro.r('szmfeature[9,1] <- glszm_LIE(szmatrix)')
        ro.r('szmfeature[10,1] <- glszm_ZP(szmatrix)')
        ro.r('szmfeature[11,1] <- glszm_SZV(szmatrix)')
        szm = ro.r.matrix(ro.r('szmfeature'))
        szm = np.array(szm)
        szmfeature.append(szm.transpose())
        fielid.append(sname)
    szmfeature = np.array(szmfeature)
    # NOTE(review): the column labels below do not match the fill order above
    # (e.g. slot 4 holds HILAE but is labelled "szv") — verify the mapping.
    output = pd.DataFrame(szmfeature.reshape(szmfeature.shape[0],szmfeature.shape[2]),
                          columns = ["sae","lae","iv","szv","zp","lie","hie","lisae","hisae","lilae","hilae"])
    output['Id'] = fielid
    # NOTE(review): DataFrame.to_csv takes `sep`, not `delimiter` — confirm
    # this keyword is accepted by the pandas version in use.
    output.to_csv("SZM_features.csv",delimiter=",")
def contextGeneDistances(cdhitProc):
    """Collect per-cluster offset intervals between paired features and draw
    a per-cluster violin plot of the distances ('violin.png') via R/ggplot2.

    For every member of every CD-HIT cluster, parses four coordinates
    (bst, bend, ast, aend) from the sequence tag, skips overlapping pairs,
    and records the offset interval of the 'a' feature relative to the
    midpoint of the 'b' feature.

    Parameters
    ----------
    cdhitProc : object exposing `clusters`, each with a `seqs` list of tags.
        Tag format is inferred from the parsing below — confirm against the
        CD-HIT wrapper producing them.
    """
    clusterIDs = []
    dists = []
    #heats = defaultdict( Counter )
    for i,cluster in enumerate(cdhitProc.clusters):
        members = cluster.seqs
        for mem in members:
            # Strip the trailing 3 characters, then take the last four
            # '|'-separated fields as integer coordinates.
            tag = cluster_id_reg.findall(mem)[0][:-3]
            bst,bend,ast,aend = tag.split('|')[-4:]
            bst,bend,ast,aend = int(bst),int(bend),int(ast),int(aend)
            if overlap(bst,bend,ast,aend):
                continue
            bmid = (bst+bend)/2  # integer midpoint (Python 2 division)
            int_st,int_end = ast-bmid,aend-bmid
            interval = xrange(int_st,int_end)
            #heats[i][(int_st+int_end)/2]+=1
            # One distance sample per position in the interval.
            dists+=interval
            clusterIDs+=[i]*len(interval)
            #dists+=[(int_st+int_end)/2]
            #clusterIDs+=[i]
            #dists+= [(int_st+int_end)/2]
            #clusterIDs+=[i]
    data = zip(clusterIDs,dists)
    # NOTE(review): sorted() returns a new list and its result is discarded
    # here; harmless only because `data` is already ordered by cluster id.
    sorted(data,key=lambda x:x[0])
    clusterIDs,dists = zip(*data)
    #heats = pd.DataFrame(heats)
    heats = pd.DataFrame({'clusters':clusterIDs,'distances':dists})
    print heats
    #heats = heats.fillna(0)
    heats_R=com.convert_to_r_dataframe(heats)
    #print heats_R
    importr("ggplot2")
    # Define an R closure that renders the violin plot to disk, then call it.
    plotViolinFunc = robj.r("""
    library(ggplot2)
    function(df){
        png(filename="violin.png")
        p <- ggplot(df,aes(x=as.character(clusters), y=distances)) +
            geom_violin(aes(x=as.character(clusters), y=distances),
                stat="ydensity", adjust=40, trim=TRUE, fill="red") +
            coord_flip()
        print(p)
        dev.off()
        print(p)
    }
    """)
    plotViolinFunc(heats_R)
    # Block until the user presses Enter so the plot can be inspected.
    raw_input()
def cad_queryset_to_r(qset, outfile='from_python.gzip'):
    """Serialize a CAD queryset to a compressed R data file.

    Each row holds seconds since the earliest incident plus the incident's
    coordinates; the table is saved in R as the object 'foo'.
    """
    from rpy2.robjects import r
    import pandas.rpy.common as com
    from pandas import DataFrame

    # Reference time: the earliest incident in the queryset.
    base_dt = np.min([item.inc_datetime for item in qset])
    rows = np.array(
        [[(item.inc_datetime - base_dt).total_seconds()] + list(item.att_map.coords)
         for item in qset])
    r_frame = com.convert_to_r_dataframe(DataFrame(rows))
    r.assign("foo", r_frame)
    r("save(foo, file='%s', compress=TRUE)" % outfile)
def plot_ROC(self, path):
    """Render the ROC curve stored in self.df (columns 'fpr'/'tpr') to a PDF."""
    robjects.r["pdf"](path, width=14, height=8)
    roc_frame = convert_to_r_dataframe(self.df, strings_as_factors=True)
    figure = ggplot2.ggplot(roc_frame)
    figure += ggplot2.aes_string(x="fpr", y="tpr")
    figure += ggplot2.geom_line(color="blue")
    figure += ggplot2.geom_point(size=2)
    figure.plot()
def plot_ROC(self, path):
    """Plot the ROC curve held in self.df ('fpr'/'tpr' columns) into a PDF."""
    robjects.r['pdf'](path, width=14, height=8)
    curve = self.df
    print(curve)  # debug dump of the ROC table
    figure = (ggplot2.ggplot(convert_to_r_dataframe(curve,
                                                    strings_as_factors=True))
              + ggplot2.aes_string(x='fpr', y='tpr')
              + ggplot2.geom_line(color='blue')
              + ggplot2.geom_point(size=2))
    figure.plot()