def load_from_Rdata(path, to_csv=False):
    import datetime
    import pandas.rpy.common as com
    from rpy2 import robjects as r
    input_pensipp = path + 'dataALL.RData'
    dates_to_col = [year * 100 + 1 for year in range(1901, 2061)]
    r.r("load('" + str(input_pensipp) + "')")
    statut = com.load_data('statut')
    statut.columns = dates_to_col
    salaire = com.load_data('salaire')
    salaire.columns = dates_to_col
    info = com.load_data('ind')
    info['naiss'] = [datetime.date(1900 + int(year), 1, 1)
                     for year in info['t_naiss']]
    info['id'] = info.index
    id_enf = com.load_data('enf')
    id_enf.columns = ['enf' + str(i) for i in range(id_enf.shape[1])]
    info_child = build_info_child(id_enf, info)
    if to_csv:
        # map table names to objects explicitly rather than using eval()
        tables = {'info': info, 'info_child': info_child,
                  'salaire': salaire, 'statut': statut}
        for name, table in tables.items():
            table.to_csv(pensipp_comparison_path + name + '.csv', sep=',')
    return info, info_child, salaire, statut
def r_analisis(path):
    r('''
    regresion <- function(path) {
        require(fitdistrplus)  # provides fitdist()
        myData <- read.csv(path, header = TRUE)
        names(myData)[1] <- "Fecha"
        names(myData)[2] <- "VelViento"
        myData$time <- strptime(myData$Fecha, "%d-%m-%Y %H:%M")
        fit_year <- fitdist(myData$VelViento, "weibull")
        shape <- c()
        scale <- c()
        error_shape <- c()
        error_scale <- c()
        vel_min <- c()
        vel_max <- c()
        vel_prom <- c()
        for (i in 0:11) {
            fit <- fitdist(myData$VelViento[myData$time$mon == i], "weibull")
            shape <- c(shape, fit$estimate[1])
            scale <- c(scale, fit$estimate[2])
            error_shape <- c(error_shape, fit$sd[1])
            error_scale <- c(error_scale, fit$sd[2])
            vel_min <- c(vel_min, min(myData$VelViento[myData$time$mon == i]))
            vel_max <- c(vel_max, max(myData$VelViento[myData$time$mon == i]))
            vel_prom <- c(vel_prom, mean(myData$VelViento[myData$time$mon == i]))
        }
        mes <- c(1:12)
        dt_meses <<- data.frame(mes, shape, scale, error_shape, error_scale,
                                vel_max, vel_min, vel_prom)
        shape_year <- c(fit_year$estimate[1], fit_year$sd[1])
        scale_year <- c(fit_year$estimate[2], fit_year$sd[2])
        dt_anio <<- data.frame(shape_year, scale_year)
        his <<- hist(myData$VelViento, plot = FALSE)
        histo_breaks <<- data.frame(breaks = his$breaks)
        histo_counts <<- data.frame(counts = his$counts, density = his$density,
                                    mids = his$mids)
    }
    ''')
    r_regresion = r['regresion']
    r_regresion(path)
    df_meses = com.load_data('dt_meses')
    df_anio = com.load_data('dt_anio')
    histo_breaks = com.load_data('histo_breaks')
    histo_counts = com.load_data('histo_counts')
    return (df_anio, df_meses, histo_breaks, histo_counts)
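# The same round-trip pattern in miniature, as a minimal self-contained
# sketch: define an R function that writes a data.frame into the R global
# environment with `<<-`, call it from Python, then pull the result back with
# com.load_data. The names 'summarise' and 'stats' are made up for
# illustration.
import pandas.rpy.common as com
from rpy2.robjects import r

r('''
summarise <- function(n) {
    x <- rnorm(n)
    stats <<- data.frame(mean = mean(x), sd = sd(x))
}
''')
r['summarise'](100)
print com.load_data('stats')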
def setUp(self):
    try:
        import rpy2
    except ImportError:
        raise nose.SkipTest("No rpy2")
    self.rdata = rpy.load_data('Canada', package='vars', convert=False)
    self.data = rpy.load_data('Canada', package='vars', convert=True)
    self.res = VAR(self.data)
    self.ref = RVAR(self.rdata)
def read_rdata(rdata_fullpath):
    """Load the 'model_summary' table from an RData file and return it as a
    pandas DataFrame, with NA values filled with 0."""
    import pandas
    import pandas.rpy.common as com
    from rpy2 import robjects as r

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print "Loading %s" % rdata_fullpath_forR

    # read in the data from the R session with python
    r.r("load('%s')" % rdata_fullpath_forR)
    # check that it's there
    # print "Dimensions are %s" % str(r.r('dim(model_summary)'))
    table_df = com.load_data("model_summary")

    # fillna
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print "  Found %5d NA values in column %s" % (nullcount, col)
    table_df = table_df.fillna(0)
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print "  -> Found %5d NA values in column %s" % (nullcount, col)

    print "Read %d lines from %s" % (len(table_df), rdata_fullpath)
    return table_df
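# A self-contained round trip for read_rdata, as a minimal sketch: build a
# small 'model_summary' data.frame in R, save it to a temporary .RData file,
# then read it back. Only base-R save()/load() and the function above are
# assumed.
import os
import tempfile
from rpy2 import robjects

tmpfile = os.path.join(tempfile.mkdtemp(), 'model_summary.RData')
robjects.r("model_summary <- data.frame(a=1:3, b=c(0.5, NA, 1.5))")
robjects.r("save(model_summary, file='%s')" % tmpfile.replace("\\", "/"))
print read_rdata(tmpfile)  # reports one NA in column b and fills it with 0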
def convert_datafiles(datasets_folder):
    '''convert .RData files to .csv files and clean up'''
    print "Convert .RData to .csv and clean up files..."
    for root, dirs, files in os.walk(datasets_folder):
        for name in files:
            # sort out .RData files
            if name.endswith('.RData'):
                name_ = os.path.splitext(name)[0]
                name_path = os.path.join(datasets_folder, name_)
                # create sub-directory
                if not os.path.exists(name_path):
                    os.makedirs(name_path)
                file_path = os.path.join(root, name)
                robj = robjects.r.load(file_path)
                # check out subfiles in the data frame
                for var in robj:
                    # convert to DataFrame
                    myRData = com.load_data(var)
                    if not isinstance(myRData, pd.DataFrame):
                        myRData = pd.DataFrame(myRData)
                    var_path = os.path.join(datasets_folder, name_, var + '.csv')
                    myRData.to_csv(var_path)
                # clean up
                os.remove(os.path.join(datasets_folder, name))
    print "=> Success!"
def test_dist(self):
    for name in ("eurodist",):
        df = com.load_data(name)
        dist = r[name]
        labels = r["labels"](dist)
        assert np.array_equal(df.index, labels)
        assert np.array_equal(df.columns, labels)
def encode_data():
    robj = robjects.r.load('heart-proc.RData')
    heart = com.load_data(robj[2])

    # Take care of missing data
    heart = heart.fillna(method='ffill')
    heart.iloc[:, 11] = heart.iloc[:, 11].fillna(method='ffill')
    heart.iloc[:, 12] = heart.iloc[:, 12].fillna(method='ffill')

    # Encoding categorical data; LabelEncoder expects a 1-d column,
    # so index with heart[column] rather than heart[[column]]
    from sklearn.preprocessing import LabelEncoder
    for column in ['sex', 'fbs', 'exang', 'cp', 'restecg', 'slope', 'thal']:
        heart[column] = LabelEncoder().fit_transform(heart[column])

    transform_to_csv(heart)

    # feature matrix and target vector
    x_heart = heart.iloc[:, 0:13].values
    y_heart = heart.iloc[:, 13].values
    return x_heart, y_heart
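# What each LabelEncoder call above does, shown on a toy column: distinct
# labels are sorted and mapped to integer codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print le.fit_transform(['male', 'female', 'female', 'male'])  # [1 0 0 1]
print le.classes_                                             # ['female' 'male']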
def main():
    # simulated brownian motion
    df = pd.DataFrame(randn(1000, 4),
                      index=pd.date_range('1/1/2000', periods=1000),
                      columns=['Legend A', 'Legend B', 'Legend C', 'Legend D'])
    df = df.cumsum()

    # iris data set
    iris = com.load_data('iris')
    feature_means = iris.groupby('Species').mean()

    # str.strip('.png') strips *characters* from both ends, not the suffix,
    # so remove the extension explicitly
    base = args.output[:-4] if args.output.endswith('.png') else args.output

    with mpl.rc_context(fname=args.style):
        # line plot of brownian motion
        df.plot()
        plt.title('Brownian Motion')
        plt.ylabel('Simulated Values')
        plt.xlabel('Dates')
        plt.tight_layout()
        plt.savefig(base + '.line.png')
        plt.close()

        # bar plot of iris data set
        feature_means.plot(kind='bar')
        plt.title('Mean Features of Iris Data Set')
        plt.ylabel('Values')
        plt.tight_layout()
        plt.savefig(base + '.bar.png')
        plt.close()
def transform_data(r_datfile, full_dump_name="", train_sampl_size=0,
                   test_sampl_size=0, sample_only=False):
    if sample_only:
        full_data = cPickle.load(open('noncoding.data.py.save', 'rb'))
        train_data = full_data['train_data']
        test_data = full_data['test_data']
        ## Sample the data set (~3.6M -> sampl_size)
        train_subset, train_left = sample_split_data(
            train_data, float(train_sampl_size) / train_data.shape[0])
        test_subset, test_left = sample_split_data(
            test_data, float(test_sampl_size) / test_data.shape[0])
        print('train_subset:{},test_subset:{}'.format(train_subset.shape,
                                                      test_subset.shape))
        # dump file name for the sampled subset, derived from the sample sizes
        sampl_dump_name = ('noncoding.data.py.' +
                           str(train_sampl_size / 10 ** 3) + 'k.' +
                           str(test_sampl_size / 10 ** 3) + 'k.save')
        dump_data_subset = {'train_data': train_subset,
                            'test_data': test_subset}
        cPickle.dump(dump_data_subset, open(sampl_dump_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        return dump_data_subset
    else:
        ## Load the R-stored data
        robj = robjects.r.load(r_datfile)
        myRData = com.load_data(robj[0])
        train_data = np.array(pd.DataFrame(myRData['train']))
        test_data = np.array(pd.DataFrame(myRData['test']))
        dump_data = {'train_data': train_data, 'test_data': test_data}
        print('train:{}, test:{}'.format(train_data.shape, test_data.shape))
        cPickle.dump(dump_data, open(full_dump_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        return dump_data
def region_deform_tensor(self, roi_name):
    roi_dir = DB_path + self.name + '/shear_contrib/' + roi_name
    if 'avgDeformTensorsWide.tsv' in os.listdir(roi_dir):
        df_DB_shear = pp.read_csv(roi_dir + '/avgDeformTensorsWide.tsv',
                                  sep='\t')
    else:
        ro.r('load("' + roi_dir + '/avgDeformTensorsWide.RData")')
        df_DB_shear = com.load_data('avgDeformTensorsWide')
    return df_DB_shear
def fill_hdf_from_Rdata(self, table):
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)
    assert table in self.tables, "Table {} is not a known table".format(table)
    Rdata_table = self.tables[table]["Rdata_table"]
    Rdata_file = self.tables[table]["Rdata_file"]
    # look for an optional variable subset in this table's entry
    if 'variables' in self.tables[table]:
        variables = self.tables[table]['variables']
    else:
        variables = None
    if not os.path.isfile(Rdata_file):
        raise Exception("file path {} does not exist".format(Rdata_file))
    rpy.r.load(Rdata_file)
    stored_dataframe = com.load_data(Rdata_table)
    store_path = table
    log.info("Inserting {} in HDF file {} at point {}".format(
        Rdata_table,
        self.hdf5_file_path,
        table,
        ))
    if variables is not None:
        log.info('variables asked by the user: {}'.format(variables))
        variables_stored = list(
            set(variables).intersection(set(stored_dataframe.columns)))
        log.info('variables stored: {}'.format(variables_stored))
        stored_dataframe = stored_dataframe[variables_stored].copy()
    stored_dataframe.to_hdf(self.hdf5_file_path, store_path,
                            format='table', append=False)
    gc.collect()
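# A hypothetical example of the `self.tables` registry consumed by
# fill_hdf_from_Rdata above. The key names ('Rdata_table', 'Rdata_file',
# 'variables') come from the method itself; the table name and file path are
# invented for illustration, and 'wprm' appears elsewhere in this collection
# as a survey weighting variable.
tables = {
    'menage': {
        'Rdata_table': 'menage2006',                     # R object in the file
        'Rdata_file': '/data/R/erf/2006/menage2006.Rdata',
        'variables': ['wprm'],                           # optional column subset
    },
}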
def r_to_py(r_df_name):
    """
    :param r_df_name: the name of an R data frame saved in the R environment
    :return: the data frame converted to a pandas DataFrame
    """
    # the original wrapped the name in literal quote characters, which made
    # com.load_data look up a variable that does not exist; pass the bare name
    return com.load_data(str(r_df_name))
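# Usage sketch for r_to_py: create a data frame in the R global environment,
# then pull it into pandas. The name 'toy_df' is made up for illustration.
import rpy2.robjects as ro

ro.r('toy_df <- data.frame(x=1:3, y=c("a", "b", "c"))')
print r_to_py('toy_df')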
def test_factor(self):
    for name in ('state.division', 'state.region'):
        vector = r[name]
        factors = list(r['factor'](vector))
        level = list(r['levels'](vector))
        # map 1-based R factor codes back to their level labels
        factors = [level[index - 1] for index in factors]
        result = com.load_data(name)
        assert np.array_equal(result, factors)
def covarFilter(infile, time_points, replicates, quantile):
    '''
    Filter gene list based on the distribution of the sums of the
    covariance of each gene.  This is highly recommended to reduce
    the total number of genes used in the dynamic time warping
    clustering to reduce the computational time.  The threshold is
    placed at the intersection of the expected and observed value
    for the given quantile.
    '''
    time_points.sort()
    time_rep_comb = [x for x in itertools.product(time_points, replicates)]
    time_cond = ro.StrVector([x[0] for x in time_rep_comb])
    rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df.drop(['replicates'], inplace=True, axis=1)
    df.drop(['times'], inplace=True, axis=1)
    df = df.fillna(0.0)

    R.assign('diff_data', df)
    E.info("loading data frame")

    # need to be careful about column headers and transposing data frames
    R('''trans_data <- data.frame(diff_data)''')
    R('''times <- c(%s)''' % time_cond.r_repr())
    R('''replicates <- c(%s)''' % rep_cond.r_repr())

    # calculate the covariance matrix for all genes
    # sum each gene's covariance vector
    E.info("calculating sum of covariance of expression")
    R('''covar.mat <- abs(cov(trans_data))''')
    R('''sum.covar <- rowSums(covar.mat)''')
    R('''exp.covar <- abs(qnorm(ppoints(sum.covar),'''
      '''mean=mean(sum.covar), sd=sd(sum.covar)))''')
    R('''sum.covar.quant <- quantile(sum.covar)''')
    R('''exp.covar.quant <- quantile(exp.covar)''')

    E.info("filter on quantile")
    R('''filtered_genes <- names(sum.covar[sum.covar > '''
      '''sum.covar.quant[%(quantile)i]'''
      ''' & sum.covar > exp.covar.quant[%(quantile)i]])''' % locals())
    R('''filtered_frame <- data.frame(diff_data[, filtered_genes],'''
      '''times, replicates)''')
    filtered_frame = com.load_data('filtered_frame').T
    return filtered_frame
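# The covarFilter threshold rule restated in numpy, as a minimal sketch with
# toy data: keep genes whose summed absolute covariance exceeds both the
# observed quantile and the fitted-normal ("expected") quantile. Here the
# 75th percentile stands in for the quantile index passed to the function.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
expr = rng.randn(20, 50)                        # 20 samples x 50 "genes"
sum_covar = np.abs(np.cov(expr.T)).sum(axis=1)  # summed covariance per gene
probs = (np.arange(1, 51) - 0.5) / 50.0         # R's ppoints() for n > 10
exp_covar = np.abs(stats.norm.ppf(probs, loc=sum_covar.mean(),
                                  scale=sum_covar.std(ddof=1)))
keep = ((sum_covar > np.percentile(sum_covar, 75)) &
        (sum_covar > np.percentile(exp_covar, 75)))
print keep.sum(), "of", keep.size, "genes kept"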
def _build_df(self, file_pointer):
    try:
        r = robjects
        r.r("require(foreign)")
        r.r('x <- read.spss("{}",to.data.frame=T)'.format(file_pointer.name))
        r.r('row.names(x) = 0:(nrow(x)-1)')
        return {"dataframe": com.load_data('x')}
    except (RRuntimeError, TypeError):
        raise BlankOrCorruptTableError("Is this a valid SPSS file?")
def get_prediction_for_daterange(ed='2012-05-20 23:00:00+00:00', asutc=False):
    # start date for forecast is necessarily the period following the last
    # period of the dataset
    sd = hourly_volume.index[-1:][0].isoformat()
    print 'Forecasting...'

    # strip time zone info
    sd = dateutil.parser.parse(sd).replace(tzinfo=None)
    ed = dateutil.parser.parse(ed).replace(tzinfo=None)

    # compute number of days to forecast ahead
    td = ed - sd
    td = td.days + 1

    # bring days to forecast into R
    r('d_forecast = ' + str(td) + '  # of days forecast')

    # R creates hourly forecasts for the specified number of days ahead
    r('HRforecast <- forecast.HoltWinters(hr_model, h=(d_forecast*24))')
    r('forecast_result <- HRforecast$mean')

    # if a daily model was built (min 8 weeks of data), reconcile the hourly
    # forecast with the daily forecast
    if r('!is.null(dy_model)')[0]:
        # R creates a daily forecast for the number of days ahead
        r('dy_forecast <- forecast.HoltWinters(dy_model, h=d_forecast)')
        # extract daily forecast vector from time-series forecast object
        r('dy_fcst_result <- dy_forecast$mean')
        r('dy_fcst_result <- unclass(dy_fcst_result)')
        r('dy_fcst_result <- dy_fcst_result[]')
        # re-bucket hourly forecast into daily forecast
        r('hr_fcst_by_day <- unname(tapply(forecast_result, '
          '(seq_along(forecast_result)-1) %/% 24, sum))')
        # scale hourly model by daily model
        r('scale_coefs <- dy_fcst_result/hr_fcst_by_day')
        # expand scaling-coefficient vector to length of hourly forecast vector
        r('scale_coefs <- as.vector(matrix(rep(scale_coefs, each=24), nrow=24))')
        # extract hourly forecast vector from time-series forecast object
        r('hr_fcst_result <- unclass(forecast_result)')
        r('hr_fcst_result <- hr_fcst_result[]')
        # multiply hourly forecast by scaling coefficients for the final result
        r('forecast_result <- hr_fcst_result*scale_coefs')

    # load prediction from R into the Python workspace
    pred = com.load_data('forecast_result')

    # convert prediction to a pandas time series
    pred = pd.Series(pred)
    rng = pd.date_range(sd, periods=td * 24, freq='H')
    pred.index = pd.Series(rng, name='time')
    ret = pred.reset_index()
    if asutc:
        ret.index = ret.time.apply(lambda x: str(x))
    else:
        ret.index = ret.time.apply(lambda x: int(x.strftime('%s')))
    ret = ret[0]
    return ret  # .to_dict()
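# The hourly-to-daily reconciliation above, restated in pandas with toy
# numbers (no Holt-Winters models involved): re-bucket the hourly forecast
# into days, compute per-day scaling coefficients against the daily forecast,
# and expand them back to hours.
import numpy as np
import pandas as pd

hourly = pd.Series(np.ones(48))                       # 2 days of hourly values
daily = np.array([30.0, 18.0])                        # daily forecasts
by_day = hourly.groupby(np.arange(48) // 24).sum()    # tapply(..., %/% 24, sum)
coefs = np.repeat(daily / by_day.values, 24)          # rep(scale_coefs, each=24)
reconciled = hourly * coefs
print reconciled.groupby(np.arange(48) // 24).sum()   # now matches `daily`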
def robjectify(fp):
    """Create a dataframe object using R"""
    import pandas.rpy.common as common
    import rpy2.robjects as robjects
    r = robjects
    r.r("require(foreign)")
    r.r('x <- read.spss("{}",to.data.frame=T)'.format(fp.name))
    r.r('row.names(x) = 0:(nrow(x)-1)')
    return common.load_data('x')
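# Usage sketch for robjectify (the .sav path here is hypothetical; R's
# "foreign" package must be installed for read.spss to exist):
#
#     with open('/data/survey.sav') as fp:
#         df = robjectify(fp)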
def transform_data(r_datfile, full_dump_name, train_sampl_size,
                   test_sampl_size, sample_only=True):
    #train_sampl_size = 100000
    #test_sampl_size = 20000
    #full_dump_name = 'noncoding.data.py.save'
    sampl_dump_name = ('noncoding.data.py.' +
                       str(train_sampl_size / 10 ** 3) + 'k.' +
                       str(test_sampl_size / 10 ** 3) + 'k.save')
    #print sampl_dump_name
    if sample_only:
        full_data = cPickle.load(open('noncoding.data.py.save', 'rb'))
        train_data = full_data['train_data']
        test_data = full_data['test_data']
        ## Sample the data set (~3.6M -> sampl_size)
        #train_data_subset = train_data[random.sample(range(train_data.shape[0]), train_sampl_size), ]
        #test_data_subset = test_data[random.sample(range(test_data.shape[0]), test_sampl_size), ]
        train_subset, train_left = sample_split_data(
            train_data, float(train_sampl_size) / train_data.shape[0])
        test_subset, test_left = sample_split_data(
            test_data, float(test_sampl_size) / test_data.shape[0])
        print('train_subset:{},test_subset:{}'.format(train_subset.shape,
                                                      test_subset.shape))
        dump_data_subset = {
            'train_data': train_subset,
            'test_data': test_subset
        }
        cPickle.dump(dump_data_subset, open(sampl_dump_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        return dump_data_subset
    else:
        ## Load the R-stored data
        robj = robjects.r.load(r_datfile)
        myRData = com.load_data(robj[0])
        #from rpy2.robjects import r
        #r.data('train_test.rda')
        #myRData = pandas2ri.ripy(r[0])
        train_data = np.array(pd.DataFrame(myRData['train']))
        test_data = np.array(pd.DataFrame(myRData['test']))
        dump_data = {'train_data': train_data, 'test_data': test_data}
        print('train:{}, test:{}'.format(train_data.shape, test_data.shape))
        cPickle.dump(dump_data, open(full_dump_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        return dump_data
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression
    '''
    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns
    counts = com.convert_to_r_dataframe(data_frame)
    des_times = ro.IntVector([int(x.split(".")[1]) for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame
    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula.  Use effect
    # size moderation
    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir, header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')
    R('''png("%s/%s-MAplot.png")''' % (res_dir, header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    R('''sink(file=NULL)''')

    df = com.load_data('res.df')
    return df
def treeCutting(infile, expression_file, cluster_file, cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])
    rdf = com.convert_to_r_dataframe(df)
    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')

    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')
    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = com.load_data('cluster_matched')
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)
    return cluster_frame
def test_timeseries(self):
    """
    Test that the series has an informative index.
    Unfortunately the code currently does not build a DateTimeIndex
    """
    for name in ('austres', 'co2', 'fdeaths', 'freeny.y',
                 'JohnsonJohnson', 'ldeaths', 'mdeaths', 'nottem',
                 'presidents', 'sunspot.month', 'sunspots',
                 'UKDriverDeaths', 'UKgas', 'USAccDeaths', 'airmiles',
                 'discoveries', 'EuStockMarkets', 'LakeHuron', 'lh',
                 'lynx', 'nhtemp', 'Nile', 'Seatbelts', 'sunspot.year',
                 'treering', 'uspop'):
        series = com.load_data(name)
        ts = r[name]
        assert np.array_equal(series.index, r['time'](ts))
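# A minimal sketch of what building a real index could look like for the
# monthly series above: R's time() returns fractional years, so recover the
# year and month and construct a DatetimeIndex. This assumes `r` as used in
# the tests, only handles frequency-12 series, and falls back to fractional
# years otherwise.
import datetime
import numpy as np
import pandas as pd

def ts_to_series(name):
    ts = r[name]
    freq = int(r['frequency'](ts)[0])
    times = np.asarray(r['time'](ts))      # fractional years, e.g. 1959.083
    years = times.astype(int)
    if freq == 12:
        months = np.rint((times - years) * 12).astype(int) + 1
        index = pd.DatetimeIndex([datetime.datetime(y, m, 1)
                                  for y, m in zip(years, months)])
    else:
        index = pd.Index(times)
    return pd.Series(np.asarray(ts), index=index)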
def load_pensipp_result(pensipp_path, to_csv=False):
    path = os.path.join(pensipp_path, 'result_pensipp.csv')
    try:
        result_pensipp = read_table(path, sep=',', index_col=0)
    except Exception:
        import pandas.rpy.common as com
        from rpy2 import robjects as r
        print("Data are loaded from the RData file rather than the csv")
        output_pensipp = os.path.join(pensipp_path, 'output20.RData')
        r.r['load'](output_pensipp)
        result_pensipp = com.load_data('output1')
        result_pensipp.rename(columns={
            'dec_rg': 'decote_RG',
            'surc_rg': 'surcote_RG',
            'taux': 'taux_RG',
            'sam_rg': 'salref_RG',
            'pliq_rg': 'pension_RG',
            'prorat_rg': 'CP_RG',
            'pts_ar': 'nb_points_arrco',
            'pts_ag': 'nb_points_agirc',
            'pliq_ar': 'pension_arrco',
            'pliq_ag': 'pension_agirc',
            'DA_rg_maj': 'DA_RG',
            'taux_rg': 'taux_RG',
            'pliq_fp': 'pension_FP',
            'prorat_fp': 'CP_FP',
            'taux_fp': 'taux_FP',
            'surc_fp': 'surcote_FP',
            'dec_fp': 'decote_FP',
            'DA_fp_maj': 'DA_FP',
            'DA_in': 'DA_RSI_brute',
            'DA_in_maj': 'DA_RSI',
            'DAcible_rg': 'n_trim_RG',
            'DAcible_fp': 'n_trim_FP',
            'CPcible_rg': 'N_CP_RG',
            'sam_fp': 'salref_FP'
        }, inplace=True)
        if to_csv:
            result_pensipp.to_csv(path, sep=',')
    return result_pensipp
def build_erf_aggregates():
    """
    Fetch the relevant aggregates from erf data
    """
    # Uses rpy2.
    # On MS Windows, the environment variables R_HOME and R_USER should be set
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)
    country = 'france'
    for year in range(2006, 2008):
        menageXX = "menage" + str(year)[2:]
        menageRdata = menageXX + ".Rdata"
        filename = os.path.join(os.path.dirname(DATA_DIR), 'R', 'erf',
                                str(year), menageRdata)
        yr = str(year)
        simu = SurveySimulation()
        simu.set_config(year=yr, country=country)
        simu.set_param()
        agg = Aggregates()
        agg.set_simulation(simu)
        # print agg.varlist

        rpy.r.load(filename)
        menage = com.load_data(menageXX)
        cols = []
        print year
        for col in agg.varlist:
            erf_var = "m_" + col + "m"
            if erf_var in menage.columns:
                cols += [erf_var]
        df = menage[cols]
        wprm = menage["wprm"]
        for col in df.columns:
            tot = (df[col] * wprm).sum() / 1e9
            print col, tot
def test_table(self):
    iris3 = pd.DataFrame(
        {'X0': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'},
         'X1': {0: 'Sepal L.', 1: 'Sepal L.', 2: 'Sepal L.',
                3: 'Sepal L.', 4: 'Sepal L.'},
         'X2': {0: 'Setosa', 1: 'Setosa', 2: 'Setosa',
                3: 'Setosa', 4: 'Setosa'},
         'value': {0: '5.1', 1: '4.9', 2: '4.7', 3: '4.6', 4: '5.0'}})
    hec = pd.DataFrame(
        {'Eye': {0: 'Brown', 1: 'Brown', 2: 'Brown', 3: 'Brown', 4: 'Blue'},
         'Hair': {0: 'Black', 1: 'Brown', 2: 'Red', 3: 'Blond', 4: 'Black'},
         'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Male'},
         'value': {0: '32.0', 1: '53.0', 2: '10.0', 3: '3.0', 4: '11.0'}})
    titanic = pd.DataFrame(
        {'Age': {0: 'Child', 1: 'Child', 2: 'Child', 3: 'Child', 4: 'Child'},
         'Class': {0: '1st', 1: '2nd', 2: '3rd', 3: 'Crew', 4: '1st'},
         'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Female'},
         'Survived': {0: 'No', 1: 'No', 2: 'No', 3: 'No', 4: 'No'},
         'value': {0: '0.0', 1: '0.0', 2: '35.0', 3: '0.0', 4: '0.0'}})
    for name, expected in zip(('HairEyeColor', 'Titanic', 'iris3'),
                              (hec, titanic, iris3)):
        df = com.load_data(name)
        table = r[name]
        names = r['dimnames'](table)
        try:
            columns = list(r['names'](names))[::-1]
        except TypeError:
            columns = ['X{:d}'.format(i) for i in range(len(names))][::-1]
        columns.append('value')
        assert np.array_equal(df.columns, columns)
        result = df.head()
        cond = (result.sort(axis=1) == expected.sort(axis=1)).values
        assert np.all(cond)
def load_data(self):
    """
    Retrieve the data from the DoOR database, but only those odorants
    that have been measured in Hallem et al., 2006.
    """
    self.hallemresponses = {}
    measured = None
    for OR in self.hallems:
        dataframe = com.load_data(OR)
        dataframe_hallemonly = dataframe[['Hallem.2006.EN', 'CAS', 'Name']]
        responses = numpy.array(dataframe[['Hallem.2006.EN']], dtype=float)
        cas = numpy.array(dataframe[['CAS']], dtype=type('a'))
        names = numpy.array(dataframe[['Name']], dtype=type('a'))
        if measured is None:
            measured = ~numpy.isnan(responses)
        responses = responses[measured]
        cas = cas[measured]
        names = names[measured]
        ordict = {}
        ordict['responses'] = responses
        ordict['names'] = names
        ordict['cas'] = cas
        self.hallemresponses[OR] = ordict
    self.measured = measured
def store_survey(self, survey_name, R_table_name, destination_table_name,
                 data_dir, variables=None, force_recreation=True):
    """
    Store an R data table in an HDF5 file

    Parameters
    ----------
    survey_name : string
        the name of the survey
    R_table_name : string
        the name of the R data table
    destination_table_name : string
        the name of the table in the HDFStore
    data_dir : path
        the directory where to find the RData file
    variables : list of string, default None
        When not None, list of the variables to keep
    """
    gc.collect()
    year = self.year

    def get_survey_year(survey_name, year):
        if survey_name == "logement":
            if year == 2003:
                return 2003
            elif year in range(2006, 2010):
                return 2006
        if survey_name == "patrimoine":
            return 2004
        else:
            return year

    print "creating %s" % destination_table_name
    table_Rdata = R_table_name + ".Rdata"
    filename = os.path.join(data_dir,
                            str(get_survey_year(survey_name, year)),
                            table_Rdata)
    print filename
    if not os.path.isfile(filename):
        raise Exception("filename %s does not exist" % filename)

    rpy.r.load(filename)
    stored_table = com.load_data(R_table_name)
    store = HDFStore(self.hdf5_filename)
    store_path = str(self.year) + "/" + destination_table_name

    if store_path in store:
        if force_recreation is not True:
            print store_path + " already exists, do not re-create and exit"
            store.close()
            return

    if variables is not None:
        print store
        print store_path
        print variables
        variables_stored = list(
            set(variables).intersection(set(stored_table.columns)))
        print list(set(variables).difference(set(stored_table.columns)))
        store[store_path] = stored_table[variables_stored]
    else:
        store[store_path] = stored_table
    store.close()
    del stored_table
    gc.collect()
def fill_zeros(s, k):
    """Left-pad the string s with zeros up to length k."""
    while len(s) < k:
        s = '0' + s
    return s


inPath = '/data/biophys/etournay/'
inName = 'WT_25deg_111102'
Rdata_path = ('/home/mpopovic/Documents/Work/Projects/'
              'drosophila_wing_analysis/WT_25deg_111102/shear_contrib/')
triList_name = 'triList.RData'
Ta_name = 'Ta_t.RData'

ro.r('load("' + Rdata_path + triList_name + '")')
triList_df = com.load_data('triList')
ro.r('load("' + Rdata_path + Ta_name + '")')
Ta_df = com.load_data('Ta_t')
ro.r('load("/home/mpopovic/Documents/Work/Projects/drosophila_wing_analysis/'
     'WT_25deg_111102/roi_bt/lgRoiInclDead.RData")')
roi_df = com.load_data('lgRoiInclDead')
roi_blade = roi_df[roi_df['roi'] == 'blade']

inDB = inPath + 'DB/' + inName + '/' + inName + '.sqlite'
con = lite.connect(inDB)
cells_df = psql.frame_query('SELECT * FROM cells WHERE cell_id>10000', con)
cells_df_blade = cells_df[cells_df['cell_id'].isin(roi_blade['cell_id'])]
time_data = psql.frame_query('SELECT * FROM timepoints', con)
def test_numeric(self):
    for name in ('euro', 'islands', 'precip'):
        series = com.load_data(name)
        numeric = r[name]
        names = numeric.names
        assert np.array_equal(series.index, names)
        subplot_kw['sharey'] = ax0
    axarr[0] = ax0

    # Note off-by-one counting because add_subplot uses the MATLAB 1-based
    # convention.
    for i in range(1, nplots):
        axarr[i] = fig.add_subplot(nrows, ncols, i + 1, **subplot_kw)

    if squeeze:
        # Reshape the array to have the final desired dimension (nrow, ncol),
        # though discarding unneeded dimensions that equal 1.  If we only have
        # one subplot, just return it instead of a 1-element array.
        if nplots == 1:
            return fig, axarr[0]
        else:
            return fig, axarr.reshape(nrows, ncols).squeeze()
    else:
        # returned axis array will always be 2-d, even if nrows=ncols=1
        return fig, axarr.reshape(nrows, ncols)


if __name__ == '__main__':
    import pandas.rpy.common as com

    sales = com.load_data('sanfrancisco.home.sales', package='nutshell')
    top10 = sales['zip'].value_counts()[:10].index
    sales2 = sales[sales.zip.isin(top10)]
    fig = scatter_plot(sales2, 'squarefeet', 'price', by='zip')
    # plt.show()
def consensusClustering(infile, cutHeight, cluster_algorithm,
                        min_size=30, deepsplit=False):
    '''
    Hierarchical clustering based on gene-cluster correlation across
    resampled datasets.  Cut the tree with dynamic tree cut.
    TODO: change this to cutHeight?
    i.e. 0.2 = 80% clustering agreement OR
    use dynamic tree cut without deepsplit.
    '''
    condition = infile.split("/")[1].split("-")[0]
    wgcna_out = "tmp.dir/consensus-WGCNA.out"
    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressMessages(library("WGCNA"))''')
    R('''suppressMessages(library("flashClust"))''')

    E.info("loading distance matrix")
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    labels = df.index.tolist()
    labels_r = ro.StrVector([l for l in labels])
    df_r = com.convert_to_r_dataframe(df)
    R.assign("distance.frame", df_r)
    R.assign("labels", labels_r)

    # large matrices/distance objects may need more
    # memory - allocate 10GB
    R('''memory.limit(10000)''')
    R('''rownames(distance.frame) <- labels''')
    R('''distance_data <- data.matrix(distance.frame)''')

    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''clustering <- flashClust(as.dist(1-distance_data),'''
      '''method='%(cluster_algorithm)s')''' % locals())
    if cutHeight > float(0.01):
        R('''cluster_cut <- cutreeStatic(dendro=clustering, '''
          '''minSize=%(min_size)i, cutHeight=%(cutHeight)s)''' % locals())
    elif deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=T, minClusterSize=%(min_size)i)''' % locals())
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=F, minClusterSize=%(min_size)i)''' % locals())
    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')

    # plot and save dendrogram of clustering
    R('''png("plots.dir/%(condition)s-dendrogram-consensus_clustering.png")'''
      % locals())
    R('''plotDendroAndColors(dendro=clustering, colors=color_cut,'''
      '''groupLabels="Dynamic tree cut",'''
      '''dendroLabels=F, addGuide=T, guideHang=0.05, '''
      '''hang=0.03, main="%(condition)s")''' % locals())
    R('''dev.off()''')
    R('''sink(file=NULL)''')

    cluster_frame = com.load_data('cluster_matched')
    return cluster_frame
def clusterPCA(infile, cluster_file, image_dir):
    '''
    PCA for each module within an experimental condition across
    the time series.  Take PC1 as the module eigengene and return
    the loadings and proportion of variance explained for the eigengene.
    The eigengene expression across the time series is taken to be the
    value for PC1 at each timepoint as a vector.  This is basically what
    WGCNA's moduleEigengenes does, but it does not recover the PC loadings.

    Warning: this script will error if there is only one cluster.  Make
    sure you have more than one cluster before trying to perform
    uninformative analyses.
    '''
    header = cluster_file.split("/")[-1].split("-")[0]

    # reshape data
    R('''sink(file='sink_file.txt')''')
    R('''suppressMessages(library("reshape2"))''')
    R('''suppressMessages(library("WGCNA"))''')
    R('''source("%s")''' % os.path.join(Timeseries.get_r_path, "summarySE.R"))
    R('''source("%s")''' % os.path.join(Timeseries.get_r_path,
                                        "clusterEigengenes.R"))
    R('''cluster_match <- read.table('%(cluster_file)s', h=T, '''
      '''row.names=1)''' % locals())
    R('''express_data <- read.table('%(infile)s', '''
      '''h=T, row.names=1, stringsAsFactors=F)''' % locals())
    R('''sink(file=NULL)''')
    R('''colnames(cluster_match) <- c("genes", "cluster")''')
    R('''express_data <- data.frame(t(express_data))''')
    R('''express_data$times <- as.numeric(as.character(express_data$times))''')
    R('''data_melt <- melt(express_data, '''
      '''id.vars=c("times", "replicates"))''')

    # sometimes data is read in as a factor/string.
    # Explicitly convert to numeric
    R('''data_melt$value <- as.numeric(as.character(data_melt$value))''')
    R('''data_sum <- summarySE(data_melt, measurevar="value", '''
      '''groupvars=c("times", "variable"))''')
    R('''data_mod <- data.frame(data_sum$times,'''
      ''' data_sum$variable, data_sum$value)''')
    R('''colnames(data_mod) <- c("times", "gene", "value")''')
    R('''data_wide <- dcast(data_mod, gene ~ times, value.var="value")''')
    R('''rownames(data_wide) <- data_wide$gene''')
    R('''times <- as.numeric(as.character(unique(express_data$times)))''')
    R('''data_wide <- data.frame(data_wide[,-1])''')
    R('''colnames(data_wide) <- times''')

    # derive module eigengenes - return a dataframe of eigengene expression
    R('''eigen_clustered <- clusterPCA(cluster_frame=cluster_match, '''
      '''expression_frame=data_wide, n=times)''')
    R('''eigen_frame <- eigenExpress(eigen_clustered, n=times)''')

    # generate loadings plot for each eigengene
    R('''eigenLoad(clusterPCA(cluster_frame=cluster_match, '''
      '''expression_frame=data_wide, n=times), image.dir="%(image_dir)s", '''
      '''condition="%(header)s")''' % locals())

    # generate expression profile plots for all eigengenes
    R('''eigenPlot(eigen_frame, image.dir="%(image_dir)s", '''
      '''condition="%(header)s")''' % locals())

    eigen_frame = com.load_data("eigen_frame")
    eigen_frame.index = eigen_frame['cluster']
    eigen_frame.drop(['cluster'], inplace=True, axis=1)
    return eigen_frame
import pandas.rpy.common as com
import rpy2.robjects as ro
from scipy.stats import kruskal
import matplotlib.pyplot as plt
import numpy as np
from scipy.constants import c

# R's morley dataset: Michelson's speed-of-light measurements,
# recorded as km/s minus 299000
ro.r('data(morley)')
df = com.load_data('morley')
df['Speed'] = df['Speed'] + 299000

samples = dict(list(df.groupby('Expt')))
samples = np.array([samples[i]['Speed'].values for i in list(samples.keys())])
print("Kruskal", kruskal(samples[0], samples[1], samples[2],
                         samples[3], samples[4]))

plt.title('Speed of light')
plt.plot(samples.min(axis=1), 'x', label='min')
plt.plot(samples.mean(axis=1), 'o', label='mean')
plt.plot(np.ones(5) * samples.mean(), '--', label='All mean')
plt.plot(np.ones(5) * c / 1000, lw=2, label='Actual')
plt.plot(samples.max(axis=1), 'v', label='max')
plt.grid(True)
plt.legend()
plt.show()
def deseqNormalize(infile, time_points, reps, conditions=None):
    '''
    Library size normalisation and variance stabilizing transformation
    of timeseries RNA-seq data

    :param infile: count table from NGS-seq experiment
    :type infile: str
    :param time_points: time point labels
    :type time_points: str list
    :param reps: replicate labels
    :type reps: str list
    :param conditions: if multiple experimental conditions
    are to be normalised at the same time
    :type conditions: str list
    '''
    # MM: NB - this should be split into separate library size
    # normalisation and VST transformations
    # maybe add in different transformation options.

    pandas2ri.activate()

    # load library
    R('''suppressMessages(library("DESeq"))''')

    # generate lists for the design data frame of the proper length;
    # these need to be rpy2 objects to be parsed properly in the
    # string formatting
    E.info("converting to pandas dataframe object")

    if infile.split(".")[-1] == "gz":
        comp = "gzip"
    else:
        comp = None

    data_frame = pd.read_table(infile, index_col=0, header=0,
                               sep="\t", compression=comp)
    rdf = com.convert_to_r_dataframe(data_frame)

    if not conditions:
        time_rep_comb = [x for x in itertools.product(time_points, reps)]
        time_cond = ro.StrVector([x[0] for x in time_rep_comb])
        rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
        R.assign('countsTable', rdf)
        R('''design <- data.frame(row.names=colnames(countsTable),'''
          '''times=%s, replicates=%s)''' % (time_cond.r_repr(),
                                            rep_cond.r_repr()))
    elif conditions:
        design_dict = {}
        for x in data_frame.columns.values:
            sample_dict = {}
            sample_dict['condition'] = str(x).split(".")[0]
            sample_dict['times'] = int(str(x).split(".")[1])
            sample_dict['replicates'] = str(x).split(".")[2]
            design_dict[x] = sample_dict
        design_frame = pd.DataFrame(design_dict)
        design_frame = design_frame.T
        des_cond = design_frame['condition'].values.tolist()
        des_time = design_frame['times'].values.tolist()
        des_reps = design_frame['replicates'].values.tolist()
        cond_cond = ro.StrVector([x for x in des_cond])
        time_cond = ro.StrVector([x for x in des_time])
        rep_cond = ro.StrVector([x for x in des_reps])
        R.assign('countsTable', rdf)
        R.assign('design', design_frame)

    # create the count data set and normalize to library size
    # transform with variance stabilizing transformation
    # only select genes with more than one read mapping on average
    E.info("calculating size factors and dispersion")
    R('''notZero <- (rowMeans(countsTable) > 1)''')
    R('''cds <- newCountDataSet(countsTable[notZero, ], design)''')
    R('''cds_size <- estimateSizeFactors(cds)''')
    R('''cds_disp <- estimateDispersions(cds_size, method="blind")''')

    E.info("applying variance stabilizing transformation")
    R('''vst <- varianceStabilizingTransformation(cds_disp)''')

    # format data set to long format with condition and replicate labels
    # convert to a numpy array
    R('''replicates <- c(%s)''' % rep_cond.r_repr())
    R('''times <- c(%s)''' % time_cond.r_repr())
    if conditions:
        R('''conditions <- c(%s)''' % cond_cond.r_repr())
        R('''trans_vst = data.frame(t(exprs(vst)), '''
          '''times, replicates, conditions)''')
    else:
        R('''trans_vst = data.frame(t(exprs(vst)), times, replicates)''')

    data_file = com.load_data('trans_vst')
    return data_file
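# Usage sketch for deseqNormalize (the file name and labels here are
# hypothetical): the count table is expected to have one column per sample,
# and without `conditions` the column order must match
# itertools.product(time_points, reps).
#
#     vst_frame = deseqNormalize("counts.tsv.gz",
#                                time_points=["0", "1", "3", "6"],
#                                reps=["R1", "R2", "R3"])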
def maSigPro(infile, order_terms=1, fdr=0.01, adjust="BH",
             stepwise="backward", include_p=0.01, rsq=0.2,
             var_group="all"):
    '''
    Generate differentially expressed genes for each experimental
    condition across a time series.  Uses the bioconductor
    package maSigPro to derive a set of genes of interest.
    '''
    ref_gtf = str(infile).split("-")[1]
    data_frame = pd.read_table(infile, sep="\t", index_col=0, header=0)
    design_dict = {}
    for x in data_frame.index.values:
        sample_dict = {}
        condition = str(x).split(".")[0]
        sample_dict[condition] = 1
        sample_dict['times'] = int(str(x).split(".")[1])
        sample_dict['replicates'] = str(x).split(".")[2]
        design_dict[x] = sample_dict
    design_frame = pd.DataFrame(design_dict)
    design_frame = design_frame.T
    cols = ['times', 'replicates', condition]
    design_frame = design_frame[cols]
    design_file = "deseq.dir/%s-%s-design.tsv" % (condition, ref_gtf)
    design_frame.to_csv(design_file, sep="\t")
    data_file = "deseq.dir/%s-%s-data.tsv" % (condition, ref_gtf)
    results_file = "deseq.dir/%s-%s-maSigPro.tsv" % (condition, ref_gtf)

    # data frame columns must be in the order time-replicate-condition
    # for maSigPro
    # define the number of higher-order terms included in the models
    masigpro_out = "deseq.dir/maSigPro.out"

    R('''suppressMessages(library("maSigPro"))''')
    R('''input_data <- read.table('%(infile)s', sep="\t", '''
      '''h=T, row.names=1)''' % locals())
    R('''input_data <- t(input_data[0:(length(input_data)-2)])''')
    E.info("constructing experimental design matrix")
    R('''input_design <- data.matrix(read.table('%(design_file)s', '''
      '''sep="\t", h=T, row.names=1))''' % locals())
    R('''%(condition)s_mat <- make.design.matrix(input_design, '''
      '''degree = %(order_terms)i )''' % locals())
    R('''sink(file = '%(masigpro_out)s')''' % locals())
    E.info("fitting linear model for each gene with "
           "%i polynomial terms" % order_terms)
    R('''%(condition)s_fit <- p.vector(input_data, %(condition)s_mat, '''
      '''Q = %(fdr)f, MT.adjust = '%(adjust)s')''' % locals())

    # fit a linear model to each of the genes called as
    # differentially expressed
    # report genes with model R-squared > threshold
    # maSigPro gives an un-suppressable output to stdout,
    # therefore sink is used to shunt this to a temporary file 'maSigPro.out'
    R('''%(condition)s_step <- T.fit(%(condition)s_fit, '''
      '''step.method='%(stepwise)s', alfa=%(include_p)f)''' % locals())
    E.info("selecting significantly differentially "
           "expressed genes at FDR=%0.3f" % fdr)
    R('''sink(file=NULL)''')
    R('''%(condition)s_sigs <- get.siggenes(%(condition)s_step, '''
      '''rsq=%(rsq)f, vars='%(var_group)s')''' % locals())
    R('''write.table(%(condition)s_sigs$sig.genes$%(condition)s$group.coeffs'''
      ''',file="deseq.dir/%(condition)s-%(ref_gtf)s-coefficients.tsv", '''
      '''sep="\t")''' % locals())
    R('''write.table(%(condition)s_sigs$sig.genes$%(condition)s$sig.pvalues,'''
      '''file="deseq.dir/%(condition)s-%(ref_gtf)s-pvalues.tsv",'''
      ''' sep="\t")''' % locals())
    R('''write.table(%(condition)s_sigs$summary, '''
      '''file='deseq.dir/%(condition)s-%(ref_gtf)s-geneids.tsv', '''
      '''sep="\t")''' % locals())

    # merge the p-value and coefficient results into a single file
    p_file = "deseq.dir/%(condition)s-%(ref_gtf)s-pvalues.tsv" % locals()
    coef_file = "deseq.dir/%s-%s-coefficients.tsv" % (condition, ref_gtf)
    p_frame = pd.read_table(p_file, sep="\t")
    coef_frame = pd.read_table(coef_file, sep="\t")
    results_frame = pd.merge(coef_frame, p_frame, how='right',
                             left_index=True, right_index=True)
    results_frame.to_csv(results_file, sep="\t")

    R('''diff_genes <- data.frame(%(condition)s_fit$SELEC)''' % locals())
    diff_genes = com.load_data('diff_genes')
    return diff_genes