Example #1
def load_from_Rdata(path, to_csv=False):
    import datetime
    import pandas.rpy.common as com
    from rpy2 import robjects as r
    input_pensipp = path + 'dataALL.RData'
    dates_to_col = [year * 100 + 1 for year in range(1901, 2061)]
    r.r("load('" + str(input_pensipp) + "')")
    statut = com.load_data('statut')
    statut.columns = dates_to_col
    salaire = com.load_data('salaire')
    salaire.columns = dates_to_col
    info = com.load_data('ind')
    info['naiss'] = [
        datetime.date(1900 + int(year), 1, 1) for year in info['t_naiss']
    ]
    info['id'] = info.index
    id_enf = com.load_data('enf')
    id_enf.columns = ['enf' + str(i) for i in range(id_enf.shape[1])]
    info_child = build_info_child(id_enf, info)

    if to_csv:
        for table in ['info', 'info_child', 'salaire', 'statut']:
            temp = eval(table)  # look up the DataFrame by its local name
            temp.to_csv(pensipp_comparison_path + table + '.csv', sep=',')

    return info, info_child, salaire, statut
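A note that applies to most examples on this page: pandas.rpy.common was deprecated in pandas 0.16 and later removed, so on current stacks the same load-and-convert step goes through rpy2's own converter instead. A minimal Python 3 sketch of the modern equivalent, assuming an .RData file that contains a data frame named 'statut':

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

def load_rdata_frame(path, name):
    """Load one R data frame from an .RData file as a pandas DataFrame."""
    ro.r['load'](path)  # populate the R global environment
    with localconverter(ro.default_converter + pandas2ri.converter):
        return ro.conversion.rpy2py(ro.r[name])

statut = load_rdata_frame('dataALL.RData', 'statut')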
Example #2
def r_analisis(path):
    import pandas.rpy.common as com
    from rpy2.robjects import r
    r('''
        regresion <- function(path) {
            library(fitdistrplus)  # provides fitdist()
            myData <- read.csv(path, header = TRUE)
            names(myData)[1] <- "Fecha"
            names(myData)[2] <- "VelViento"
            myData$time <- strptime(myData$Fecha, "%d-%m-%Y %H:%M")

            fit_year <- fitdist(myData$VelViento, "weibull")

            shape <- c()
            scale <- c()
            error_shape <- c()
            error_scale <- c()
            vel_min <- c()
            vel_max <- c()
            vel_prom <- c()

            for(i in 0:11){

              fit <- fitdist(myData$VelViento[myData$time$mon == i], "weibull") 
              shape <- c(shape, fit$estimate[1])
              scale <- c(scale, fit$estimate[2])
              error_shape <- c(error_shape, fit$sd[1])
              error_scale <- c(error_scale, fit$sd[2])
              vel_min <- c(vel_min, min(myData$VelViento[myData$time$mon == i]))
              vel_max <- c(vel_max, max(myData$VelViento[myData$time$mon == i]))
              vel_prom <- c(vel_prom, mean(myData$VelViento[myData$time$mon == i]))
            }

            mes <- c(1:12)

            dt_meses <<- data.frame(mes, shape, scale, error_shape, 
                                  error_scale, vel_max, vel_min, vel_prom)

            shape_year <- c(fit_year$estimate[1], fit_year$sd[1])
            scale_year <- c(fit_year$estimate[2],fit_year$sd[2])

            dt_anio <<- data.frame(shape_year, scale_year)
            
            his <<- hist(myData$VelViento, plot = FALSE)
            histo_breaks <<- data.frame(breaks = his$breaks)
            histo_counts <<- data.frame(counts = his$counts, density = his$density, mids = his$mids)
        }
    ''')
    
    r_regresion = r['regresion']
    r_regresion(path)
    
    df_meses = com.load_data('dt_meses')
    df_anio = com.load_data('dt_anio')
    histo_breaks = com.load_data('histo_breaks')
    histo_counts = com.load_data('histo_counts')

    return(df_anio, df_meses, histo_breaks, histo_counts)
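For context, a hedged usage sketch (the csv path is an assumption; the function presumes that rpy2 and the fitdistrplus R package are installed, that `r` is rpy2.robjects.r and `com` is pandas.rpy.common, and that the csv holds date and wind-speed columns):

df_anio, df_meses, histo_breaks, histo_counts = r_analisis('viento.csv')
print(df_meses[['mes', 'shape', 'scale']])  # monthly Weibull parameters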
Example #3
    def setUp(self):
        try:
            import rpy2
        except ImportError:
            raise nose.SkipTest("No rpy2")

        self.rdata = rpy.load_data('Canada', package='vars', convert=False)
        self.data = rpy.load_data('Canada', package='vars', convert=True)

        self.res = VAR(self.data)
        self.ref = RVAR(self.rdata)
Example #5
def read_rdata(rdata_fullpath):
    """
    Returns the pandas DataFrame
    """
    import pandas
    import pandas.rpy.common as com
    from rpy2 import robjects as r

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print "Loading %s" % rdata_fullpath_forR

    # read in the data from the R session with python
    r.r("load('%s')" % rdata_fullpath_forR)
    # check that it's there
    # print "Dimensions are %s" % str(r.r('dim(model_summary)'))

    table_df = com.load_data("model_summary")

    # fillna
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print "  Found %5d NA values in column %s" % (nullcount, col)
    table_df = table_df.fillna(0)
    # sanity check: after fillna(0) the loop below should print nothing
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print "  -> Found %5d NA values in column %s" % (nullcount, col)

    print "Read %d lines from %s" % (len(table_df), rdata_fullpath)
    return table_df
Example #6
def convert_datafiles(datasets_folder):
    '''convert .RData files to .csv files and clean up'''
    print "Convert .RData to .csv and clean up files..."

    for root, dirs, files in os.walk(datasets_folder):
        for name in files:
            # sort out .RData files
            if name.endswith('.RData'):
                name_ = os.path.splitext(name)[0]
                name_path = os.path.join(datasets_folder, name_)
                # create sub-directory
                if not os.path.exists(name_path):
                    os.makedirs(name_path)
                file_path = os.path.join(root, name)
                robj = robjects.r.load(file_path)
                # check out subfiles in the data frame
                for var in robj:
                    myRData = com.load_data(var)
                    # convert to DataFrame
                    if not isinstance(myRData, pd.DataFrame):
                        myRData = pd.DataFrame(myRData)
                    var_path = os.path.join(datasets_folder, name_,
                                            var + '.csv')
                    myRData.to_csv(var_path)
                os.remove(os.path.join(datasets_folder, name))  # clean up

    print "=> Success!"
Example #7
 def test_dist(self):
     for name in ("eurodist",):
         df = com.load_data(name)
         dist = r[name]
         labels = r["labels"](dist)
         assert np.array_equal(df.index, labels)
         assert np.array_equal(df.columns, labels)
Example #8
def encode_data():
    robj = robjects.r.load('heart-proc.RData')

    heart = com.load_data(robj[2])

    heart = heart.fillna(method='ffill')
    #Take care of missing data
    heart.iloc[:, 11] = heart.iloc[:, 11].fillna(method='ffill')
    heart.iloc[:, 12] = heart.iloc[:, 12].fillna(method='ffill')

    #Encoding categorical data
    from sklearn.preprocessing import LabelEncoder
    # LabelEncoder expects 1-D input, so index single columns, not frames
    label_X_gender = LabelEncoder()
    heart['sex'] = label_X_gender.fit_transform(heart['sex'])

    label_X_fbs = LabelEncoder()
    heart['fbs'] = label_X_fbs.fit_transform(heart['fbs'])

    label_X_exang = LabelEncoder()
    heart['exang'] = label_X_exang.fit_transform(heart['exang'])

    label_X_cp = LabelEncoder()
    heart['cp'] = label_X_cp.fit_transform(heart['cp'])
    label_X_restecg = LabelEncoder()
    heart['restecg'] = label_X_restecg.fit_transform(heart['restecg'])
    label_X_slope = LabelEncoder()
    heart['slope'] = label_X_slope.fit_transform(heart['slope'])
    label_X_thal = LabelEncoder()
    heart['thal'] = label_X_thal.fit_transform(heart['thal'])
    transform_to_csv(heart)

    #x features vector
    x_heart = heart.iloc[:, 0:13].values
    y_heart = heart.iloc[:, 13].values
    return x_heart, y_heart
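The seven encoder blocks above all follow one pattern, so they can be collapsed into a loop; a sketch, assuming the same column names and keeping each fitted encoder around for a later inverse_transform:

from sklearn.preprocessing import LabelEncoder

categorical = ['sex', 'fbs', 'exang', 'cp', 'restecg', 'slope', 'thal']
encoders = {}
for col in categorical:
    encoders[col] = LabelEncoder()
    heart[col] = encoders[col].fit_transform(heart[col])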
Example #10
def main():
    # simulated brownian motion
    df = pd.DataFrame(randn(1000, 4),
                      index=pd.date_range('1/1/2000', periods=1000),
                      columns=['Legend A', 'Legend B', 'Legend C', 'Legend D'])
    df = df.cumsum()

    # iris data set
    iris = com.load_data('iris')
    feature_means = iris.groupby('Species').mean()

    with mpl.rc_context(fname=args.style):
        # line plot of brownian motion
        df.plot()
        plt.title('Brownian Motion')
        plt.ylabel('Simulated Values')
        plt.xlabel('Dates')
        plt.tight_layout()
        # str.strip('.png') removes characters, not a suffix, so use splitext
        plt.savefig(os.path.splitext(args.output)[0] + '.line.png')
        plt.close()

        # bar plot of iris data set
        feature_means.plot(kind='bar')
        plt.title('Mean Features of Iris Data Set')
        plt.ylabel('Values')
        plt.tight_layout()
        plt.savefig(os.path.splitext(args.output)[0] + '.bar.png')
        plt.close()
Example #11
def transform_data(r_datfile, full_dump_name="", sampl_dump_name="", train_sampl_size=0, test_sampl_size=0, sample_only=False):
	if sample_only:
		full_data = cPickle.load(open('noncoding.data.py.save','rb'))
		train_data = full_data['train_data']
		test_data = full_data['test_data']

		## Sample the data set (~3.6M -> sampl_size)
		train_subset,train_left = sample_split_data(train_data, float(train_sampl_size)/train_data.shape[0])
		test_subset,test_left = sample_split_data(test_data,float(test_sampl_size)/test_data.shape[0])

		print('train_subset:{},test_subset:{}'.format(train_subset.shape,test_subset.shape))
		dump_data_subset = {'train_data': train_subset,
		                    'test_data': test_subset}
		cPickle.dump(dump_data_subset,open(sampl_dump_name,'wb'),protocol=cPickle.HIGHEST_PROTOCOL)

		return dump_data_subset
	else:
		## Load the R-store data
		robj = robjects.r.load(r_datfile)
		myRData = com.load_data(robj[0])

		train_data = np.array(pd.DataFrame(myRData['train']))
		test_data = np.array(pd.DataFrame(myRData['test']))
		dump_data = {'train_data': train_data,
  		           'test_data': test_data}

		print('train:{}, test:{}'.format(train_data.shape, test_data.shape))
		cPickle.dump(dump_data,open(full_dump_name,'wb'),protocol=cPickle.HIGHEST_PROTOCOL)
		return dump_data
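(One portability note: cPickle is the Python 2 accelerated module; on Python 3 the same calls work unchanged through the built-in pickle, e.g. pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL).)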
Example #12
 def region_deform_tensor(self, roi_name):
     if 'avgDeformTensorsWide.tsv' in os.listdir(DB_path+self.name+'/shear_contrib/'+roi_name):
         df_DB_shear = pp.read_csv(DB_path+self.name+'/shear_contrib/'+roi_name+'/avgDeformTensorsWide.tsv', sep='\t')
     else:
         ro.r('load("'+DB_path+self.name+'/shear_contrib/'+roi_name+'/avgDeformTensorsWide.RData")')
         df_DB_shear = com.load_data('avgDeformTensorsWide')
     return df_DB_shear
Example #13
    def fill_hdf_from_Rdata(self, table):
        import pandas.rpy.common as com
        import rpy2.rpy_classic as rpy
        rpy.set_default_mode(rpy.NO_CONVERSION)
        assert table in self.tables, "Table {} is not a filled table".format(table)
        Rdata_table = self.tables[table]["Rdata_table"]
        Rdata_file = self.tables[table]["Rdata_file"]
        if 'variables' in self.tables[table]:
            variables = self.tables[table]['variables']
        else:
            variables = None
        if not os.path.isfile(Rdata_file):
            raise Exception("file_path do not exists")
        rpy.r.load(Rdata_file)
        stored_dataframe = com.load_data(Rdata_table)
        store_path = table

        log.info("Inserting {} in HDF file {} at point {}".format(
            Rdata_table,
            self.hdf5_file_path,
            table,
            )
        )
        if variables is not None:
            log.info('variables asked by the user: {}'.format(variables))
            variables_stored = list(set(variables).intersection(set(stored_dataframe.columns)))
            log.info('variables stored: {}'.format(variables_stored))
            stored_dataframe = stored_dataframe[variables_stored].copy()

        stored_dataframe.to_hdf(self.hdf5_file_path, store_path, format = 'table', append = False)
        gc.collect()
Example #16
 def r_to_py(r_df_name):
     """
     :param r_df_name: the NAME of an R data frame already present in the R
         global environment
     :return: the corresponding pandas DataFrame
     """
     return com.load_data(str(r_df_name))
Example #17
 def test_factor(self):
     for name in ('state.division', 'state.region'):
         vector = r[name]
         factors = list(r['factor'](vector))
         level = list(r['levels'](vector))
         factors = [level[index - 1] for index in factors]
         result = com.load_data(name)
         assert np.array_equal(result, factors)
Example #20
def covarFilter(infile,
                time_points,
                replicates,
                quantile):
    '''
    Filter the gene list based on the distribution of the sums of
    the covariance of each gene.  This is highly recommended to
    reduce the total number of genes used in the dynamic time
    warping clustering, and hence the computational time.  The
    threshold is placed at the intersection of the expected and
    observed values for the given quantile.
    '''

    time_points.sort()
    time_rep_comb = [x for x in itertools.product(time_points, replicates)]
    time_cond = ro.StrVector([x[0] for x in time_rep_comb])
    rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)

    df.drop(['replicates'], inplace=True, axis=1)
    df.drop(['times'], inplace=True, axis=1)
    df = df.fillna(0.0)

    R.assign('diff_data', df)

    E.info("loading data frame")

    # need to be careful about column headers and transposing data frames

    R('''trans_data <- data.frame(diff_data)''')
    R('''times <- c(%s)''' % time_cond.r_repr())
    R('''replicates <- c(%s)''' % rep_cond.r_repr())

    # calculate the covariance matrix for all genes
    # sum each gene's covariance vector

    E.info("calculating sum of covariance of expression")

    R('''covar.mat <- abs(cov(trans_data))''')
    R('''sum.covar <- rowSums(covar.mat)''')
    R('''exp.covar <- abs(qnorm(ppoints(sum.covar),'''
      '''mean=mean(sum.covar), sd=sd(sum.covar)))''')
    R('''sum.covar.quant <- quantile(sum.covar)''')
    R('''exp.covar.quant <- quantile(exp.covar)''')

    E.info("filter on quantile")

    R('''filtered_genes <- names(sum.covar[sum.covar > '''
      '''sum.covar.quant[%(quantile)i]'''
      ''' & sum.covar > exp.covar.quant[%(quantile)i]])''' % locals())
    R('''filtered_frame <- data.frame(diff_data[, filtered_genes],'''
      '''times, replicates)''')

    filtered_frame = com.load_data('filtered_frame').T

    return filtered_frame
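Stripped of the R round trip, the filter computes the absolute covariance matrix of the expression frame, sums it per gene, and keeps genes above a quantile threshold. A simplified pandas sketch of that core step (it omits the fitted-normal comparison the R code also applies, and assumes genes are the columns of df):

def covar_filter_simple(df, q=0.75):
    """Keep columns whose summed absolute covariance exceeds the q-quantile."""
    sum_covar = df.cov().abs().sum(axis=1)
    keep = sum_covar[sum_covar > sum_covar.quantile(q)].index
    return df[keep]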
Example #23
def get_prediction_for_daterange(ed='2012-05-20 23:00:00+00:00', asutc=False):
    #start date for forecast is necessarily the period following the last period of the dataset
    sd = hourly_volume.index[-1:][0].isoformat()
    print 'Forecasting...'

    #Time Zone info is annoying
    sd = dateutil.parser.parse(sd).replace(tzinfo=None)
    ed = dateutil.parser.parse(ed).replace(tzinfo=None)

    #Compute # of days to forecast ahead
    td = ed - sd
    td = td.days + 1
    #Brings days to forecast into R
    r('d_forecast = ' + str(td) + ' # of days forecast')

    #R creates hourly forecasts for the specified number of days to forecast ahead
    r('HRforecast <- forecast.HoltWinters(hr_model,h=(d_forecast*24))')
    r('forecast_result <- HRforecast$mean')

    #If monthly model was built (min 8 weeks of data), reconciles hourly forecast with daily forecast over month
    if (r('!is.null(dy_model)')[0]):

        #R creates daily forecast for the number of days to forecast ahead
        r('dy_forecast <- forecast.HoltWinters(dy_model,h=d_forecast)')
        #Extract daily forecast vector from time-series forecast object
        r('dy_fcst_result <- dy_forecast$mean')
        r('dy_fcst_result  <- unclass(dy_fcst_result)')
        r('dy_fcst_result <- dy_fcst_result[]')

        #Re-bucket hourly forecast into daily forecast
        r('hr_fcst_by_day <- unname(tapply(forecast_result, (seq_along(forecast_result)-1) %/% 24, sum))'
          )
        #scale hourly model by daily model
        r('scale_coefs <- dy_fcst_result/hr_fcst_by_day')
        #Scale scaling-coefficient vector to length of hourly forecast vector
        r('scale_coefs <- as.vector(matrix(rep(scale_coefs,each=24),nrow=24))')
        #Extract hourly forecast vector from time-series forecast object
        r('hr_fcst_result <- unclass(forecast_result)')
        r('hr_fcst_result <- hr_fcst_result[]')
        #multiply hourly forecast vector by scaling coefficients to get final result
        r('forecast_result <- hr_fcst_result*scale_coefs')

    #Load prediction from R into Python workspace
    pred = com.load_data('forecast_result')
    #Convert prediction to Pandas time series
    pred = pd.Series(pred)
    rng = pd.date_range(sd, periods=td * 24, freq='H')
    pred.index = pd.Series(rng, name='time')

    ret = pred.reset_index()
    if asutc:
        ret.index = ret.time.apply(lambda x: str(x))
    else:
        ret.index = ret.time.apply(lambda x: int(x.strftime('%s')))
    ret = ret[0]
    return ret  #.to_dict()
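The hourly/daily reconciliation in the middle of this function is plain arithmetic: sum the hourly forecast into day buckets, divide the daily forecast by those sums, and stretch the resulting coefficients back out to hours. The same step in numpy, as a sketch with assumed array names:

import numpy as np

def reconcile(hourly_fcst, daily_fcst):
    """Scale an hourly forecast so each 24h block sums to the daily forecast."""
    by_day = hourly_fcst.reshape(-1, 24).sum(axis=1)  # re-bucket into days
    scale = daily_fcst / by_day                       # one coefficient per day
    return hourly_fcst * np.repeat(scale, 24)         # stretch back to hours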
Example #24
def robjectify(fp):
    """Create a dataframe object using R"""

    import pandas.rpy.common as common
    import rpy2.robjects as robjects
    r = robjects
    r.r("require(foreign)")
    r.r('x <- read.spss("{}",to.data.frame=T)'.format(fp.name))
    r.r('row.names(x) = 0:(nrow(x)-1)')
    return common.load_data('x')
Example #25
 def _build_df(self, file_pointer):
     try:
         r = robjects
         r.r("require(foreign)")
         r.r('x <- read.spss("{}",to.data.frame=T)'.format(
             file_pointer.name))
         r.r('row.names(x) = 0:(nrow(x)-1)')
         return {"dataframe": com.load_data('x')}
     except (RRuntimeError, TypeError):
         raise BlankOrCorruptTableError("Is this a valid SPSS file?")
Example #27
def transform_data(r_datfile,
                   full_dump_name,
                   train_sampl_size,
                   test_sampl_size,
                   sample_only=True):
    #train_sampl_size = 100000
    #test_sampl_size = 20000

    #full_dump_name = 'noncoding.data.py.save'
    # sampl_dump_name is needed below, so derive it from the sample sizes
    sampl_dump_name = 'noncoding.data.py.' + str(train_sampl_size / 10**3) + 'k.' + str(test_sampl_size / 10**3) + 'k.save'

    if sample_only:
        full_data = cPickle.load(open('noncoding.data.py.save', 'rb'))
        train_data = full_data['train_data']
        test_data = full_data['test_data']

        ## Sample the data set (~3.6M -> sampl_size)
        #train_data_subset = train_data[random.sample(range(train_data.shape[0]),train_sampl_size),]
        #test_data_subset = test_data[random.sample(range(test_data.shape[0]),test_sampl_size),]
        train_subset, train_left = sample_split_data(
            train_data,
            float(train_sampl_size) / train_data.shape[0])
        test_subset, test_left = sample_split_data(
            test_data,
            float(test_sampl_size) / test_data.shape[0])

        print('train_subset:{},test_subset:{}'.format(train_subset.shape,
                                                      test_subset.shape))

        dump_data_subset = {
            'train_data': train_subset,
            'test_data': test_subset
        }
        cPickle.dump(dump_data_subset,
                     open(sampl_dump_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    else:
        ## Load the R-store data
        robj = robjects.r.load(r_datfile)
        myRData = com.load_data(robj[0])
        #from rpy2.robjects import r
        #r.data('train_test.rda')
        #myRData = pandas2ri.ripy(r[0])

        train_data = np.array(pd.DataFrame(myRData['train']))
        test_data = np.array(pd.DataFrame(myRData['test']))
        dump_data = {'train_data': train_data, 'test_data': test_data}

        print('train:{}, test:{}'.format(train_data.shape, test_data.shape))
        cPickle.dump(dump_data,
                     open(full_dump_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
Example #28
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression
    '''

    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns
    counts = com.convert_to_r_dataframe(data_frame)
    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame

    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula.  Use effect
    # size moderation

    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir,
                                            header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')

    R('''png("%s/%s-MAplot.png")''' % (res_dir,
                                       header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    R('''sink(file=NULL)''')

    df = com.load_data('res.df')

    return df
Example #29
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t",
                       header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])
    rdf = com.convert_to_r_dataframe(df)
    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = com.load_data('cluster_matched')
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
Example #30
 def test_timeseries(self):
     """
     Test that the series has an informative index.
     Unfortunately the code currently does not build a DateTimeIndex
     """
     for name in ('austres', 'co2', 'fdeaths', 'freeny.y', 'JohnsonJohnson',
                  'ldeaths', 'mdeaths', 'nottem', 'presidents',
                  'sunspot.month', 'sunspots', 'UKDriverDeaths', 'UKgas',
                  'USAccDeaths', 'airmiles', 'discoveries',
                  'EuStockMarkets', 'LakeHuron', 'lh', 'lynx', 'nhtemp',
                  'Nile', 'Seatbelts', 'sunspot.year', 'treering', 'uspop'):
         series = com.load_data(name)
         ts = r[name]
         assert np.array_equal(series.index, r['time'](ts))
Example #31
def load_pensipp_result(pensipp_path, to_csv=False):
    path = os.path.join(pensipp_path, 'result_pensipp.csv')
    try:
        result_pensipp = read_table(path, sep=',', index_col=0)
    except Exception:
        import pandas.rpy.common as com
        from rpy2 import robjects as r
        print("Data are loaded from the RData file rather than from the csv")
        output_pensipp = os.path.join(pensipp_path, 'output20.RData')
        r.r['load'](output_pensipp)
        result_pensipp = com.load_data('output1')
        result_pensipp.rename(columns={
            'dec_rg': 'decote_RG',
            'surc_rg': 'surcote_RG',
            'taux': 'taux_RG',
            'sam_rg': 'salref_RG',
            'pliq_rg': 'pension_RG',
            'prorat_rg': 'CP_RG',
            'pts_ar': 'nb_points_arrco',
            'pts_ag': 'nb_points_agirc',
            'pliq_ar': 'pension_arrco',
            'pliq_ag': 'pension_agirc',
            'DA_rg_maj': 'DA_RG',
            'taux_rg': 'taux_RG',
            'pliq_fp': 'pension_FP',
            'prorat_fp': 'CP_FP',
            'taux_fp': 'taux_FP',
            'surc_fp': 'surcote_FP',
            'dec_fp': 'decote_FP',
            'DA_fp_maj': 'DA_FP',
            'DA_in': 'DA_RSI_brute',
            'DA_in_maj': 'DA_RSI',
            'DAcible_rg': 'n_trim_RG',
            'DAcible_fp': 'n_trim_FP',
            'CPcible_rg': 'N_CP_RG',
            'sam_fp': 'salref_FP'
        }, inplace=True)
    if to_csv:
        result_pensipp.to_csv(path, sep=',')

    return result_pensipp
Example #32
def build_erf_aggregates():
    """
    Fetch the relevant aggregates from erf data
    """
    #    Uses rpy2.
    #    On MS Windows, the environment variables R_HOME and R_USER should be set
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)

    country = 'france'
    for year in range(2006, 2008):
        menageXX = "menage" + str(year)[2:]
        menageRdata = menageXX + ".Rdata"
        filename = os.path.join(os.path.dirname(DATA_DIR), 'R', 'erf',
                                str(year), menageRdata)
        yr = str(year)
        simu = SurveySimulation()
        simu.set_config(year=yr, country=country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)
        # print agg.varlist
        rpy.r.load(filename)

        menage = com.load_data(menageXX)
        cols = []
        print year
        for col in agg.varlist:
            #print col
            erf_var = "m_" + col + "m"
            if erf_var in menage.columns:
                cols += [erf_var]

        df = menage[cols]
        wprm = menage["wprm"]
        for col in df.columns:

            tot = (df[col] * wprm).sum() / 1e9
            print col, tot
Example #33
 def test_table(self):
     iris3 = pd.DataFrame({'X0': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'},
                           'X1': {0: 'Sepal L.',
                                  1: 'Sepal L.',
                                  2: 'Sepal L.',
                                  3: 'Sepal L.',
                                  4: 'Sepal L.'},
                           'X2': {0: 'Setosa',
                                  1: 'Setosa',
                                  2: 'Setosa',
                                  3: 'Setosa',
                                  4: 'Setosa'},
                           'value': {0: '5.1', 1: '4.9', 2: '4.7', 3: '4.6', 4: '5.0'}})
     hec = pd.DataFrame(
         {
             'Eye': {0: 'Brown', 1: 'Brown', 2: 'Brown', 3: 'Brown', 4: 'Blue'},
             'Hair': {0: 'Black', 1: 'Brown', 2: 'Red', 3: 'Blond', 4: 'Black'},
             'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Male'},
             'value': {0: '32.0', 1: '53.0', 2: '10.0', 3: '3.0', 4: '11.0'}})
     titanic = pd.DataFrame(
         {
             'Age': {0: 'Child', 1: 'Child', 2: 'Child', 3: 'Child', 4: 'Child'},
             'Class': {0: '1st', 1: '2nd', 2: '3rd', 3: 'Crew', 4: '1st'},
             'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Female'},
             'Survived': {0: 'No', 1: 'No', 2: 'No', 3: 'No', 4: 'No'},
             'value': {0: '0.0', 1: '0.0', 2: '35.0', 3: '0.0', 4: '0.0'}})
     for name, expected in zip(('HairEyeColor', 'Titanic', 'iris3'),
                              (hec, titanic, iris3)):
         df = com.load_data(name)
         table = r[name]
         names = r['dimnames'](table)
         try:
             columns = list(r['names'](names))[::-1]
         except TypeError:
             columns = ['X{:d}'.format(i) for i in range(len(names))][::-1]
         columns.append('value')
         assert np.array_equal(df.columns, columns)
         result = df.head()
         cond = ((result.sort(axis=1) == expected.sort(axis=1))).values
         assert np.all(cond)
Example #34
 def load_data(self):
     """
     retrieve the data from the DoOR database, but only those odorants that 
     have been measured in Hallem et al., 2006.
     """
     self.hallemresponses = {}
     measured = None
     for OR in self.hallems:
         dataframe = com.load_data(OR)
         dataframe_hallemonly = dataframe[['Hallem.2006.EN', 'CAS', 'Name']]
         responses = numpy.array(dataframe[['Hallem.2006.EN']], dtype=float)
         cas = numpy.array(dataframe[['CAS']], dtype=type('a'))
         names = numpy.array(dataframe[['Name']], dtype=type('a'))
         if measured is None:
             measured = ~numpy.isnan(responses)
         responses = responses[measured]
         cas = cas[measured]
         names = names[measured]
         ordict = {}
         ordict['responses'] = responses
         ordict['names'] = names
         ordict['cas'] = cas
         self.hallemresponses[OR] = ordict
     self.measured = measured
Example #35
    def store_survey(self,
                     survey_name,
                     R_table_name,
                     destination_table_name,
                     data_dir,
                     variables=None,
                     force_recreation=True):
        """
        Store a R data table in an HDF5 file

        Parameters
        ----------

        survey_name : string
                       the name of the survey
        R_table_name : string
                       the name of the R data table
        destination_table_name : string
                                 the name of the table in the HDFStore
        data_dir : path
                   the directory where to find the RData file

        variables : list of string, default None
                    When not None, list of the variables to keep
        """
        gc.collect()
        year = self.year

        def get_survey_year(survey_name, year):
            if survey_name == "logement":
                if year == 2003:
                    return 2003
                elif year in range(2006, 2010):
                    return 2006
            if survey_name == "patrimoine":
                return 2004
            else:
                return year

        print "creating %s" % (destination_table_name)
        table_Rdata = R_table_name + ".Rdata"
        filename = os.path.join(data_dir,
                                str(get_survey_year(survey_name, year)),
                                table_Rdata)
        print filename
        if not os.path.isfile(filename):
            raise Exception("filename do  not exists")

        rpy.r.load(filename)
        stored_table = com.load_data(R_table_name)
        store = HDFStore(self.hdf5_filename)
        store_path = str(self.year) + "/" + destination_table_name

        if store_path in store:
            if force_recreation is not True:
                print store_path + "already exists, do not re-create and exit"
                store.close()
                return

        if variables is not None:

            print store
            print store_path
            print variables
            variables_stored = list(
                set(variables).intersection(set(stored_table.columns)))
            print list(set(variables).difference((set(stored_table.columns))))
            store[store_path] = stored_table[variables_stored]
        else:
            store[store_path] = stored_table
        store.close()
        del stored_table
        gc.collect()
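A side note on the store handling above: on later pandas versions the explicit open/close (and the early return that must remember to close) is usually written as a context manager. A sketch, with a hypothetical filename and the stored_table variable from the function:

from pandas import HDFStore

with HDFStore('survey.h5') as store:       # hypothetical file
    store['2006/menage'] = stored_table    # store closes automatically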
Example #36

def fill_zeros(s, k):
    while len(s) < k:
        s = '0' + s
    return s
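(For what it's worth, fill_zeros duplicates the built-in str.zfill: fill_zeros('7', 3) == '7'.zfill(3) == '007'.)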


inPath = '/data/biophys/etournay/'
inName = 'WT_25deg_111102'

Rdata_path = '/home/mpopovic/Documents/Work/Projects/drosophila_wing_analysis/WT_25deg_111102/shear_contrib/'
triList_name = 'triList.RData'
Ta_name = 'Ta_t.RData'
ro.r('load("' + Rdata_path + triList_name + '")')
triList_df = com.load_data('triList')
ro.r('load("' + Rdata_path + Ta_name + '")')
Ta_df = com.load_data('Ta_t')
ro.r(
    'load("/home/mpopovic/Documents/Work/Projects/drosophila_wing_analysis/WT_25deg_111102/roi_bt/lgRoiInclDead.RData")'
)
roi_df = com.load_data('lgRoiInclDead')
roi_blade = roi_df[roi_df['roi'] == 'blade']

inDB = inPath + 'DB/' + inName + '/' + inName + '.sqlite'
con = lite.connect(inDB)
cells_df = psql.frame_query('SELECT * FROM cells WHERE cell_id>10000', con)
cells_df_blade = cells_df[cells_df['cell_id'].isin(roi_blade['cell_id'])]
time_data = psql.frame_query('SELECT * FROM timepoints', con)
Example #37
 def test_numeric(self):
     for name in ('euro', 'islands', 'precip'):
         series = com.load_data(name)
         numeric = r[name]
         names = numeric.names
         assert np.array_equal(series.index, names)
Example #38
        subplot_kw['sharey'] = ax0
    axarr[0] = ax0

    # Note off-by-one counting because add_subplot uses the MATLAB 1-based
    # convention.
    for i in range(1, nplots):
        axarr[i] = fig.add_subplot(nrows, ncols, i + 1, **subplot_kw)

    if squeeze:
        # Reshape the array to have the final desired dimension (nrow,ncol),
        # though discarding unneeded dimensions that equal 1.  If we only have
        # one subplot, just return it instead of a 1-element array.
        if nplots == 1:
            return fig, axarr[0]
        else:
            return fig, axarr.reshape(nrows, ncols).squeeze()
    else:
        # returned axis array will be always 2-d, even if nrows=ncols=1
        return fig, axarr.reshape(nrows, ncols)


if __name__ == '__main__':
    import pandas.rpy.common as com
    sales = com.load_data('sanfrancisco.home.sales', package='nutshell')
    top10 = sales['zip'].value_counts()[:10].index
    sales2 = sales[sales.zip.isin(top10)]

    fig = scatter_plot(sales2, 'squarefeet', 'price', by='zip')

    # plt.show()
Example #39
def consensusClustering(infile,
                        cutHeight,
                        cluster_algorithm,
                        min_size=30,
                        deepsplit=False):
    '''
    Hierarchical clustering based on gene-cluster correlation across
    resampled datasets.  The tree is cut with the dynamic tree cut
    algorithm.
    TODO: change this to cutHeight?  i.e. 0.2 = 80% clustering
    agreement OR use dynamic tree cut without deepsplit.
    '''
    condition = infile.split("/")[1].split("-")[0]
    wgcna_out = "tmp.dir/consensus-WGCNA.out"

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressMessages(library("WGCNA"))''')
    R('''suppressMessages(library("flashClust"))''')

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    labels = df.index.tolist()
    labels_r = ro.StrVector([l for l in labels])
    df_r = com.convert_to_r_dataframe(df)
    R.assign("distance.frame", df_r)
    R.assign("labels", labels_r)

    # large matrices/distance objects may need more
    # memory - raise the limit to ~10GB (memory.limit takes MB)
    R('''memory.limit(10000)''')
    R('''rownames(distance.frame) <- labels''')
    R('''distance_data <- data.matrix(distance.frame)''')

    E.info("clustering data by %s linkage" % cluster_algorithm)

    R('''clustering <- flashClust(as.dist(1-distance_data),'''
      '''method='%(cluster_algorithm)s')''' % locals())

    if cutHeight > float(0.01):
        R('''cluster_cut <- cutreeStatic(dendro=clustering, '''
          '''minSize=%(min_size)i, cutHeight=%(cutHeight)s)''' % locals())

    elif deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=T, minClusterSize=%(min_size)i)''' % locals())
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=F, minClusterSize=%(min_size)i)''' % locals())

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')

    # plot and save dendrogram of clustering

    R('''png("plots.dir/%(condition)s-dendrogram-consensus_clustering.png")'''
      % locals())
    R('''plotDendroAndColors(dendro=clustering, colors=color_cut,'''
      '''groupLabels="Dynamic tree cut",'''
      '''dendroLabels=F, addGuide=T, guideHang=0.05, '''
      '''hang=0.03, main="%(condition)s")''' % locals())
    R('''dev.off()''')
    R('''sink(file=NULL)''')
    cluster_frame = com.load_data('cluster_matched')

    return cluster_frame
Example #40
def clusterPCA(infile,
               cluster_file,
               image_dir):
    '''
    PCA for each module within an experimental condition across
    the time series.
    Take PC1 as the module eigengene and return the loadings and proportion
    of variance explained for the eigengene.
    The eigengene expression across the time series is taken to be the
    value for PC1 at each timepoint as a vector.
    This is basically what WGCNA moduleEigengenes does but it
    does not recover the PC loadings.

    Warning: this script will error if there is only one
    cluster. Make sure you have more than one cluster before
    trying to perform uninformative analyses.
    '''

    header = cluster_file.split("/")[-1].split("-")[0]

    # reshape data
    R('''sink(file='sink_file.txt')''')
    R('''suppressMessages(library("reshape2"))''')
    R('''suppressMessages(library("WGCNA"))''')
    R('''source("%s")''' % os.path.join(Timeseries.get_r_path,
                                        "summarySE.R"))
    R('''source("%s")''' % os.path.join(Timeseries.get_r_path,
                                        "clusterEigengenes.R"))
    R('''cluster_match <- read.table('%(cluster_file)s', h=T, '''
      '''row.names=1)''' % locals())
    R('''express_data <- read.table('%(infile)s', '''
      '''h=T, row.names=1, stringsAsFactors=F)''' % locals())
    R('''sink(file=NULL)''')
    R('''colnames(cluster_match) <- c("genes", "cluster")''')
    R('''express_data <- data.frame(t(express_data))''')
    R('''express_data$times <- as.numeric(as.character(express_data$times))''')
    R('''data_melt <- melt(express_data, '''
      '''id.vars=c("times", "replicates"))''')

    # sometimes data is read in as a factor/string.
    # Explicitly convert to numeric

    R('''data_melt$value <- as.numeric(as.character(data_melt$value))''')
    R('''data_sum <- summarySE(data_melt, measurevar="value", '''
      '''groupvars=c("times", "variable"))''')
    R('''data_mod <- data.frame(data_sum$times,'''
      ''' data_sum$variable, data_sum$value)''')
    R('''colnames(data_mod) <- c("times", "gene", "value")''')
    R('''data_wide <- dcast(data_mod, gene ~ times, value.var="value")''')
    R('''rownames(data_wide) <- data_wide$gene''')
    R('''times <- as.numeric(as.character(unique(express_data$times)))''')
    R('''data_wide <- data.frame(data_wide[,-1])''')
    R('''colnames(data_wide) <- times''')

    # derive module eigengenes - return a dataframe of eigengene expression
    R('''eigen_clustered <- clusterPCA(cluster_frame=cluster_match, '''
      '''expression_frame=data_wide, n=times)''')
    R('''eigen_frame <- eigenExpress(eigen_clustered, n=times)''')

    # generate loadings plot for each eigengene
    R('''eigenLoad(clusterPCA(cluster_frame=cluster_match, '''
      '''expression_frame=data_wide, n=times), image.dir="%(image_dir)s", '''
      '''condition="%(header)s")''' % locals())

    # generate expression profile plots for all eigengenes
    R('''eigenPlot(eigen_frame, image.dir="%(image_dir)s", '''
      '''condition="%(header)s")''' % locals())

    eigen_frame = com.load_data("eigen_frame")
    eigen_frame.index = eigen_frame['cluster']
    eigen_frame.drop(['cluster'], inplace=True, axis=1)

    return eigen_frame
Example #41
import pandas.rpy.common as com
import rpy2.robjects as ro
from scipy.stats import kruskal
import matplotlib.pyplot as plt
import numpy as np
from scipy.constants import c

ro.r('data(morley)')
df = com.load_data('morley')
df['Speed'] = df['Speed'] + 299000

samples = dict(list(df.groupby('Expt')))
samples = np.array([samples[i]['Speed'].values for i in list(samples.keys())])
print("Kruskal",
      kruskal(samples[0], samples[1], samples[2], samples[3], samples[4]))

plt.title('Speed of light')
plt.plot(samples.min(axis=1), 'x', label='min')
plt.plot(samples.mean(axis=1), 'o', label='mean')
plt.plot(np.ones(5) * samples.mean(), '--', label='All mean')
plt.plot(np.ones(5) * c / 1000, lw=2, label='Actual')
plt.plot(samples.max(axis=1), 'v', label='max')
plt.grid(True)
plt.legend()
plt.show()
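The morley measurements can also be fetched without an R installation; a sketch using statsmodels' Rdatasets helper (assumptions: statsmodels is installed and network access is available; cache=True keeps a local copy after the first download):

import statsmodels.api as sm

morley = sm.datasets.get_rdataset('morley', cache=True).data  # pandas DataFrame
morley['Speed'] = morley['Speed'] + 299000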
Example #42
def deseqNormalize(infile,
                   time_points,
                   reps,
                   conditions=None):
    '''
    Library size normalisation and variance stabilizing transformation of
    timeseries RNA-seq data

    :param infile: count table from NGS-seq experiment
    :type infile: str
    :param time_points: time point labels
    :type time_points: str list
    :param reps: replicates labels
    :type reps: str list
    :param conditions: set when multiple experimental conditions
        are to be normalised at the same time
    :type conditions: str list
    '''
    # MM: NB - this should be split into separate library size
    # normalisation and VST transformations
    # maybe add in different transformation options.

    pandas2ri.activate()

    # load library
    R('''suppressMessages(library("DESeq"))''')

    # generates a lists for the design data frame
    # of the proper length
    # these need to be rpy2 objects to be parsed
    # properly in the string formatting

    E.info("converting to pandas dataframe object")

    if infile.split(".")[-1] == "gz":
        comp = "gzip"
    else:
        comp = None

    data_frame = pd.read_table(infile,
                               index_col=0,
                               header=0,
                               sep="\t",
                               compression=comp)
    rdf = com.convert_to_r_dataframe(data_frame)

    if not conditions:
        time_rep_comb = [x for x in itertools.product(time_points, reps)]
        time_cond = ro.StrVector([x[0] for x in time_rep_comb])
        rep_cond = ro.StrVector([x[1] for x in time_rep_comb])

        R.assign('countsTable', rdf)
        R('''design <- data.frame(row.names=colnames(countsTable),'''
          '''times=%s, replicates=%s)''' % (time_cond.r_repr(),
                                            rep_cond.r_repr()))
    elif conditions:
        design_dict = {}
        for x in data_frame.columns.values:
            sample_dict = {}
            sample_dict['condition'] = str(x).split(".")[0]
            sample_dict['times'] = int(str(x).split(".")[1])
            sample_dict['replicates'] = str(x).split(".")[2]
            design_dict[x] = sample_dict

        # build the design frame once, after the dict is complete
        design_frame = pd.DataFrame(design_dict)
        design_frame = design_frame.T

        des_cond = design_frame['condition'].values.tolist()
        des_time = design_frame['times'].values.tolist()
        des_reps = design_frame['replicates'].values.tolist()

        cond_cond = ro.StrVector([x for x in des_cond])
        time_cond = ro.StrVector([x for x in des_time])
        rep_cond = ro.StrVector([x for x in des_reps])

        R.assign('countsTable', rdf)
        R.assign('design', design_frame)

    # create the count data set and normalize to library size
    # transform with variance stabilizing transformation
    # only select genes with an average of ten reads mapping

    E.info("calculating size factors and dispersion")
    R('''notZero <- (rowMeans(countsTable) > 1)''')
    R('''cds <- newCountDataSet(countsTable[notZero, ], design)''')
    R('''cds_size <- estimateSizeFactors(cds)''')
    R('''cds_disp <- estimateDispersions(cds_size, method="blind")''')

    E.info("applying variance stabilizing transformation")

    R('''vst <- varianceStabilizingTransformation(cds_disp)''')

    # format data set to long format with condition and replicate labels
    # convert to a numpy array

    R('''replicates <- c(%s)''' % rep_cond.r_repr())
    R('''times <- c(%s)''' % time_cond.r_repr())
    if conditions:
        R('''conditions <- c(%s)''' % cond_cond.r_repr())
        R('''trans_vst = data.frame(t(exprs(vst)), '''
          '''times, replicates, conditions)''')
    else:
        R('''trans_vst = data.frame(t(exprs(vst)), times, replicates)''')

    data_file = com.load_data('trans_vst')

    return data_file
Example #43
def maSigPro(infile,
             order_terms=1,
             fdr=0.01,
             adjust="BH",
             stepwise="backward",
             include_p=0.01,
             rsq=0.2,
             var_group="all"):
    '''
    Generate differentially expressed genes for each experimental
    condition across a time series.  Uses the bioconductor
    package maSigPro to derive a set of genes of interest.
    '''

    ref_gtf = str(infile).split("-")[1]
    data_frame = pd.read_table(infile, sep="\t", index_col=0, header=0)
    design_dict = {}

    for x in data_frame.index.values:
        sample_dict = {}
        condition = str(x).split(".")[0]
        sample_dict[condition] = 1
        sample_dict['times'] = int(str(x).split(".")[1])
        sample_dict['replicates'] = str(x).split(".")[2]
        design_dict[x] = sample_dict

    design_frame = pd.DataFrame(design_dict)
    design_frame = design_frame.T
    cols = ['times', 'replicates', condition]
    design_frame = design_frame[cols]
    design_file = "deseq.dir/%s-%s-design.tsv" % (condition, ref_gtf)
    design_frame.to_csv(design_file, sep="\t")
    data_file = "deseq.dir/%s-%s-data.tsv" % (condition, ref_gtf)
    results_file = "deseq.dir/%s-%s-maSigPro.tsv" % (condition, ref_gtf)

    # data frame columns must be in the order time-replicate-condition
    # for maSigPro
    # define the number of higher-order terms included in the models

    masigpro_out = "deseq.dir/maSigPro.out"

    R('''suppressMessages(library("maSigPro"))''')
    R('''input_data <- read.table('%(infile)s', sep="\t", '''
      '''h=T, row.names=1)''' % locals())
    R('''input_data <- t(input_data[0:(length(input_data)-2)])''')

    E.info("constructing experimental design matrix")

    R('''input_design <- data.matrix(read.table('%(design_file)s', '''
      '''sep="\t", h=T, row.names=1))''' % locals())
    R('''%(condition)s_mat <- make.design.matrix(input_design, '''
      '''degree = %(order_terms)i )''' % locals())
    R('''sink(file = '%(masigpro_out)s')''' % locals())

    E.info("fitting linear model for each gene with "
           "%i polynomial terms" % order_terms)

    R('''%(condition)s_fit <- p.vector(input_data, %(condition)s_mat, '''
      '''Q = %(fdr)f, MT.adjust = '%(adjust)s')''' % locals())

    # fit a linear model to each of the genes called as
    # differentially expressed
    # report genes with model R-squared > threshold
    # maSigPro gives an un-suppressable output to stdout
    # therefore sink is used to shunt this to a temporary file 'maSigPro.out'

    R('''%(condition)s_step <- T.fit(%(condition)s_fit, '''
      '''step.method='%(stepwise)s', alfa=%(include_p)f)''' % locals())

    E.info("selecting significantly differentially "
           "expressed genes at FDR=%0.3f" % fdr)

    R('''sink(file=NULL)''')
    R('''%(condition)s_sigs <- get.siggenes(%(condition)s_step, '''
      '''rsq=%(rsq)f, vars='%(var_group)s')''' % locals())
    R('''write.table(%(condition)s_sigs$sig.genes$%(condition)s$group.coeffs'''
      ''',file="deseq.dir/%(condition)s-%(ref_gtf)s-coefficients.tsv", '''
      '''sep="\t")''' % locals())
    R('''write.table(%(condition)s_sigs$sig.genes$%(condition)s$sig.pvalues,'''
      '''file="deseq.dir/%(condition)s-%(ref_gtf)s-pvalues.tsv",'''
      ''' sep="\t")''' % locals())
    R('''write.table(%(condition)s_sigs$summary, '''
      '''file='deseq.dir/%(condition)s-%(ref_gtf)s-geneids.tsv', '''
      '''sep="\t")''' % locals())
    # merge the p-value and coefficient results into a single file
    p_file = "deseq.dir/%(condition)s-%(ref_gtf)s-pvalues.tsv" % locals()
    coef_file = "deseq.dir/%s-%s-coefficients.tsv" % (condition,
                                                      ref_gtf)
    p_frame = pd.read_table(p_file, sep="\t")
    coef_frame = pd.read_table(coef_file, sep="\t")
    results_frame = pd.merge(coef_frame, p_frame,
                             how='right',
                             left_index=True,
                             right_index=True)

    results_frame.to_csv(results_file, sep="\t")

    R('''diff_genes <- data.frame(%(condition)s_fit$SELEC)''' % locals())
    diff_genes = com.load_data('diff_genes')

    return diff_genes