Example no. 1
File: Qrys.py Project: P1R/RHPy
 def MaxTE(self, Cursor, Date, Limit=100):
     '''Computes the maximum price per row;
     must be given a date and a limit.
     '''
     Qry = """
     select id_ofertas_energia_david,
     hora,
     central,
     precio_per_mw_1,
     precio_per_mw_2,
     precio_per_mw_3,
     precio_per_mw_4,
     precio_per_mw_5,
     precio_per_mw_6,
     precio_per_mw_7,
     precio_per_mw_8,
     precio_per_mw_9,
     precio_per_mw_10,
     precio_per_mw_11
     from ofertas_energia_david 
     where tipo_reporte='TE' and fecha_inicial='{0}' limit {1}
     """.format(Date, Limit)
     Cursor.execute(Qry)
     # convert the cursor result to a pandas DataFrame
     df = as_pandas(Cursor)
     # convert to an R data frame and hand it to R
     df = com.convert_to_r_dataframe(df)
     #print type(rdf)
     ro.r('source("./Rfunctions/max.R")')
     ro.globalenv['tabla'] = df
     ro.r('Out <- Rmax(tabla)')
     print ro.r('Out')
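Note: every example on this page uses pandas.rpy.common, which was deprecated in pandas 0.16 and later removed. Below is a minimal sketch of the equivalent conversion with a current rpy2 (assuming rpy2 >= 3.0 is installed; the column names are illustrative), matching the pandas2ri/localconverter pattern that appears in some of the later examples:

import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

df = pd.DataFrame({'hora': [1, 2], 'precio_per_mw_1': [10.0, 12.5]})
with localconverter(ro.default_converter + pandas2ri.converter):
    r_df = ro.conversion.py2rpy(df)  # replaces com.convert_to_r_dataframe(df)
ro.globalenv['tabla'] = r_df  # same hand-off to R as in the example above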
Example no. 2
    def ccaPermuteOutcomesVsControls(self, groupFreqThresh=0, nPerms=25, penaltyXs=None, penaltyZs=None):
        (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes(groupFreqThresh)
        # groups: set(group_ids)
        # allOutcomes: {outcome: {group_id: value}}
        # controls: {control: {group_id: value}}

        # X contains feature group_norms, Z contains outcome values
        Zdict = allOutcomes
        Xdict = controls
        
        # R doesn't handle '$'s in column names
        Xdict = {k.replace('$','.'):v for k, v in Xdict.iteritems()}
        Zdict = {k.replace('$','.'):v for k, v in Zdict.iteritems()}
        
        # X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),pd.DataFrame(data=Zdict), softImputeXtoo=True)
        X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(pd.DataFrame(data=Xdict), pd.DataFrame(data=Zdict))
        X = com.convert_to_r_dataframe(X)
        Z = com.convert_to_r_dataframe(Z)

        Ngroups = com.convert_robj(ro.r["nrow"](X)[0])
        
        kwParams = {"nperms": nPerms}
        kwParams['penaltyxs'] = penaltyXs if penaltyXs else ro.vectors.FloatVector(np.arange(.1,.91,.05))
        kwParams['penaltyzs'] = penaltyZs if penaltyZs else ro.vectors.FloatVector(np.arange(.1,.91,.05))
        
        self._ccaPermute(X,Z, **kwParams)
Example no. 3
def venn_diagram(first, second, name1, name2, save_path):
    """Wrapper arround R's VennDiagram."""
    # define function in R to make venn diagram
    ro.r('''venn_diag <- function(df1, df2, save_path){
            library(VennDiagram)
            png()
            venn.diagram(
                x = list(
                    %s = df1$X0,
                    %s = df2$X0
                    ),
                filename = '%s',
                lwd = 4,
                fill = c("cornflowerblue", "darkorchid1"),
                alpha = 0.75,
                label.col = "black",
                cex = 4,
                fontfamily = "serif",
                fontface = "bold",
                cat.col = c("cornflowerblue", "darkorchid1"),
                cat.cex = 3,
                cat.fontfamily = "serif",
                cat.fontface = "bold",
                cat.dist = c(0.03, 0.03),
                cat.pos = c(-20, 14)
                );
            dev.off()
    }''' % (name1, name2, save_path))
    venn_diag = ro.r['venn_diag']  # venn diagram function

    # convert to R data frame
    first_rdf = com.convert_to_r_dataframe(first)
    second_rdf = com.convert_to_r_dataframe(second)

    venn_diag(first_rdf, second_rdf, save_path)
Example no. 4
def runDESeq(infile, outfiles, outfileRoot):

    # Report
    print 'Doing ' + infile + '...'

    # Read dataframe
    countDataframe = pd.read_table(infile, index_col='gene_symbol')

    # Sample counts
    sampleCounts = collections.Counter(
        [x.split('-')[-1] for x in countDataframe.columns])

    # Make annotation dataframe
    annotationDataframe = pd.DataFrame.from_dict([{
        'sample_id':
        x,
        'sample_type':
        x.split('-')[-1]
    } for x in countDataframe.columns]).set_index('sample_id')

    # Get comparisons
    comparisons = [
        list(x[::-1]) for x in itertools.combinations(
            [key for key, value in sampleCounts.iteritems() if value >= 5], 2)
    ]

    # Loop through comparisons
    for comparison in comparisons:

        # Filter
        annotationDataframeSubset = annotationDataframe[
            annotationDataframe['sample_type'].isin(comparison)]
        countDataframeSubset = countDataframe[annotationDataframeSubset.index]

        # Run function
        deseqDataframe = r.runDESeq2(
            com.convert_to_r_dataframe(countDataframeSubset),
            com.convert_to_r_dataframe(annotationDataframeSubset),
            '~ sample_type')

        # Convert to dataframe
        deseqDataframe = com.convert_robj(deseqDataframe)

        # Get comparison string
        comparisonString = 'v'.join(comparison)

        # Get outfile
        outfile = '{outfileRoot}{comparisonString}.txt'.format(**locals())

        # Create outdir
        outDir = os.path.dirname(outfile)
        if not os.path.exists(outDir):
            os.makedirs(outDir)

        # Write
        deseqDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
Example no. 5
    def _cca(self, X, Z, **params):
        """Given two Pandas dataframes and a set of parameters, performs CCA
        returns CCA dict (converted from R CCA named list object)
        """
        
        pma = importr("PMA")
        
        # Defaults:
        kwParams = {"typex": "standard",
                    "typez": "standard",
                    "trace": False,
                    "K": self.numComponents,
        }
        kwParams.update(params)
        
        if isinstance(X, pd.core.frame.DataFrame):
            X = com.convert_to_r_dataframe(X)
        if isinstance(Z, pd.core.frame.DataFrame):
            Z = com.convert_to_r_dataframe(Z)

        assert isinstance(X, ro.vectors.DataFrame) and isinstance(Z, ro.vectors.DataFrame), "X, Z need to be either Pandas DataFrames or R dataframes!"

        assert self.numComponents <= min(len(X.names),len(Z.names)), "Number of components must be smaller than the minimum of columns in each of your matrices"

        nGroups = com.convert_robj(ro.r["nrow"](X)[0])
        
        print "\tCCA parameters:", kwParams
        cca = pma.CCA(X, Z, **kwParams)
        cca = {k:v for k, v in cca.items()}
        cca['nGroups'] = nGroups
        return cca
Example no. 6
def runComBat(infiles, outfile):

    # Split infiles
    vstFile, annotationFile = infiles

    # Read expression dataframe
    vstDataframe = pd.read_table(vstFile,
                                 index_col='gene_symbol').drop(['B8N', 'B10C'],
                                                               axis=1)

    # Read annotation dataframe
    annotationDataframe = pd.read_table(annotationFile,
                                        index_col='sample_name')

    # Get common samples
    annotationDataframe = annotationDataframe.loc[vstDataframe.columns]

    # Run function
    combatMatrix = r.runComBat(com.convert_to_r_dataframe(vstDataframe),
                               com.convert_to_r_dataframe(annotationDataframe),
                               covariateFormula='~treatment',
                               batchColumn='patient')

    # Convert to dataframe
    combatDataframe = com.convert_robj(combatMatrix)

    # Write file
    combatDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
Example no. 7
    def _cca(self, X, Z, **params):
        """Given two Pandas dataframes and a set of parameters, performs CCA
        returns CCA dict (converted from R CCA named list object)
        """
        
        pma = importr("PMA")
        
        # Defaults:
        kwParams = {"typex": "standard",
                    "typez": "standard",
                    "trace": False,
                    "K": self.numComponents,
        }
        kwParams.update(params)
        
        if isinstance(X, pd.core.frame.DataFrame):
            X = com.convert_to_r_dataframe(X)
        if isinstance(Z, pd.core.frame.DataFrame):
            Z = com.convert_to_r_dataframe(Z)

        assert isinstance(X, ro.vectors.DataFrame) and isinstance(Z, ro.vectors.DataFrame), "X, Z need to be either Pandas DataFrames or R dataframes!"

        assert self.numComponents <= min(len(X.names),len(Z.names)), "Number of components must be smaller than the minimum of columns in each of your matrices"

        nGroups = com.convert_robj(ro.r["nrow"](X)[0])
        
        print("\tCCA parameters:", kwParams)
        cca = pma.CCA(X, Z, **kwParams)
        cca = {k:v for k, v in list(cca.items())}
        cca['nGroups'] = nGroups
        return cca
Example no. 8
    def ccaPermuteOutcomesVsControls(self, nPerms=25, penaltyXs=None, penaltyZs=None):
        (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes()
        # groups: set(group_ids)
        # allOutcomes: {outcome: {group_id: value}}
        # controls: {control: {group_id: value}}

        # X contains feature group_norms, Z contains outcome values
        Zdict = allOutcomes
        Xdict = controls
        
        # R doesn't handle '$'s in column names
        Xdict = {k.replace('$','.'):v for k, v in Xdict.items()}
        Zdict = {k.replace('$','.'):v for k, v in Zdict.items()}
        
        # X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),pd.DataFrame(data=Zdict), softImputeXtoo=True)
        X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(pd.DataFrame(data=Xdict), pd.DataFrame(data=Zdict))
        try:
            X = com.convert_to_r_dataframe(X)
            Z = com.convert_to_r_dataframe(Z)
            Ngroups = com.convert_robj(ro.r["nrow"](X)[0])
        except NameError:
            warn("pandas.rpy.common cannot be imported")
            sys.exit(1)
        
        kwParams = {"nperms": nPerms}
        kwParams['penaltyxs'] = penaltyXs if penaltyXs else ro.vectors.FloatVector(np.arange(.1,.91,.05))
        kwParams['penaltyzs'] = penaltyZs if penaltyZs else ro.vectors.FloatVector(np.arange(.1,.91,.05))
        
        self._ccaPermute(X,Z, **kwParams)
Example no. 9
def run_gams_model_single_run(proc_data, feature_list):

    if proc_data['outcome'] is not None:
        print("outcome variable found")
    else:
        print("outcome variable missing")
        return [None, None, "FAIL"]

    #Prepare Formula from feature list

    s = "outcome~"
    for i in range(len(feature_list)):
        if (i in [37, 49, 48, 51, 72, 50]):
            continue
        # numeric and categorical features currently enter the formula identically
        s += feature_list['feature_name'][i] + '+'
    s = s[:-1]
    fmla = Formula(s)
    print s

    proc_data = proc_data.dropna()  # dropna() returns a copy; reassign to keep the effect
    response = proc_data['outcome']
    positiveOutcomes = np.sum(np.asarray(response))
    full_range = range(0, len(proc_data.index) - 1)
    thres_tune = round((positiveOutcomes / len(proc_data.index)), 3)
    train_ind = random.sample(range(0,
                                    len(proc_data.index) - 1),
                              int(math.floor(0.7 * len(proc_data.index))))
    test_ind = [val for val in full_range if val not in train_ind]
    rdf = com.convert_to_r_dataframe(proc_data.iloc[train_ind])
    #Run the model

    print("Running GAMs model")
    model = mgcv.bam(formula=fmla,
                     data=rdf,
                     family=statsf.binomial(link="logit"))
    print("Model building completed")

    #Predict the values

    tdf = com.convert_to_r_dataframe(proc_data)
    predval = statsf.predict(model, tdf, type="response")
    predicted_values = pd.concat([
        pd.DataFrame({'pred': predval}, index=[response.index.get_values()]),
        response
    ],
                                 axis=1)
    predicted_values.columns = ['prediction', 'observed']

    #Returning the List
    status = "pass"
    res = list()
    res.append(model)
    res.append(predicted_values)
    res.append(status)
    print("+++++++++++++++++  Completed +++++++++++++++++++")
    return res
Example no. 10
def internal_cluster_evaluation(feature_points, cluster_labels, metrics):
    fp = DataFrame(feature_points)
    fp = com.convert_to_r_dataframe(fp)
    r.assign("feature_points", fp)
    
    cl = DataFrame(cluster_labels)
    cl = com.convert_to_r_dataframe(cl)
    r.assign("cluster_labels", cl)
    
    r('cluster_labels')
Example no. 11
def create_model(input_json):
    global hourly_volume

    #Loads JSON file
    print 'Loading Data...'
    json = loads(input_json)

    #Converts to Pandas Time Series Dataframe which can be converted to be used by R
    df = pd.DataFrame(json)
    df.columns = ['time']
    df['time'] = df['time'].apply(dateutil.parser.parse)
    df.set_index('time', inplace=True)
    df['t'] = 1

    #Resamples Dataframe into hourly (for weekly model) and daily (for monthly model) buckets
    hourly_volume = df.resample('1H', how=np.count_nonzero)
    daily_volume = df.resample('1D', how=np.count_nonzero)

    print 'Creating Model...'

    #Converts Pandas Dataframe to R Dataframe
    demand_data_daily = com.convert_to_r_dataframe(daily_volume)
    demand_data_hourly = com.convert_to_r_dataframe(hourly_volume)

    #Brings Dataframes into R workspace
    r.assign('train_data_hourly', demand_data_hourly)
    r.assign('train_data_daily', demand_data_daily)
    #Assigns values to required input variables in R
    r('start_index = ' + str(get_friday_index(hourly_volume)))
    r('month_index = ' + str(get_first_of_month(hourly_volume)))

    #Reorganizes hourly dataframe to seasonal time series w/ 168 hr weekly intervals starting at the 1st Fri
    r('train_data_ts <- ts(train_data_hourly[,1],start=c(1,(168-start_index+2)),frequency=168)'
      )
    #Adds 0.01 as model input data must be non-zero
    r('train_data_ts = train_data_ts+ 0.01')
    #R creates hourly model; we set beta=0 as we assume no global trend (HOLT-WINTERS MODEL)
    r('hr_model <- HoltWinters(train_data_ts,beta=0,seasonal="m",start.periods=(168+start_index-1))'
      )

    #R creates a monthly model IFF there is enough data (min 8 weeks)
    r('dy_model = NULL')
    #1st Fri of hourly dataset translated for daily dataset
    r('start_index = (start_index-1)/24+1')
    if (r('length(train_data_daily[,1])>(28*2+start_index-1)')[0]):
        #if the first Fri of the month precedes the dataset's start date, set to the prior month's first Fri
        r('if(month_index<1){month_index = 28-month_index }')
        #Reorganizes daily dataframe to seasonal time series
        r('train_data_ts <- ts(train_data_daily[,1],start=c(1,month_index),frequency=28)'
          )
        #R creates monthly model, again we assume no global trend
        r('dy_model <- HoltWinters(train_data_ts,seasonal="m",start.periods=(28+start_index-1))'
          )

        print 'Model Created!'
Example no. 12
def run_gams_model_single_run(proc_data, feature_list):

    if proc_data["outcome"] is not None:
        print ("outcome variable found")
    else:
        print ("outcome variable missing")
        return [None, None, "FAIL"]

    # Prepare Formula from feature list

    s = "outcome~"
    for i in range(len(feature_list)):
        if i in [37, 49, 48, 51, 72, 50]:
            continue
        # numeric and categorical features currently enter the formula identically
        s += feature_list["feature_name"][i] + "+"
    s = s[:-1]
    fmla = Formula(s)
    print s

    proc_data = proc_data.dropna()  # reassign; dropna() is not in-place
    response = proc_data["outcome"]
    positiveOutcomes = np.sum(np.asarray(response))
    full_range = range(0, len(proc_data.index) - 1)
    thres_tune = round((positiveOutcomes / len(proc_data.index)), 3)
    train_ind = random.sample(range(0, len(proc_data.index) - 1), int(math.floor(0.7 * len(proc_data.index))))
    test_ind = [val for val in full_range if val not in train_ind]
    rdf = com.convert_to_r_dataframe(proc_data.iloc[train_ind])
    # Run the model

    print ("Running GAMs model")
    model = mgcv.bam(formula=fmla, data=rdf, family=statsf.binomial(link="logit"))
    print ("Model building completed")

    # Predict the values

    tdf = com.convert_to_r_dataframe(proc_data)
    predval = statsf.predict(model, tdf, type="response")
    predicted_values = pd.concat(
        [pd.DataFrame({"pred": predval}, index=[response.index.get_values()]), response], axis=1
    )
    predicted_values.columns = ["prediction", "observed"]

    # Returning the List
    status = "pass"
    res = list()
    res.append(model)
    res.append(predicted_values)
    res.append(status)
    print ("+++++++++++++++++  Completed +++++++++++++++++++")
    return res
Example no. 13
File: timeser.py Project: GSng/Uber
def create_model(input_json):
    global hourly_volume
    
    #Loads JSON file
    print 'Loading Data...'
    json = loads(input_json)
    
    #Converts to Pandas Time Series Dataframe which can be converted to be used by R
    df = pd.DataFrame(json)
    df.columns = ['time']
    df['time'] = df['time'].apply(dateutil.parser.parse)
    df.set_index('time', inplace=True)
    df['t'] = 1
    
    #Resamples Dataframe into hourly (for weekly model) and daily (for monthly model) buckets
    hourly_volume = df.resample('1H', how=np.count_nonzero)
    daily_volume =  df.resample('1D', how=np.count_nonzero)
    
    print 'Creating Model...'
	
    #Converts Pandas Dataframe to R Dataframe
    demand_data_daily = com.convert_to_r_dataframe(daily_volume)
    demand_data_hourly = com.convert_to_r_dataframe(hourly_volume)
    
    #Brings Dataframes into R workspace
    r.assign('train_data_hourly',demand_data_hourly)
    r.assign('train_data_daily',demand_data_daily)
    #Assigns values to required input variables in R
    r('start_index = ' +str(get_friday_index(hourly_volume)))
    r('month_index = ' +str(get_first_of_month(hourly_volume)))
    
    #Reorganizes hourly dataframe to seasonal time series w/ 168 hr weekly intervals starting at the 1st Fri    
    r('train_data_ts <- ts(train_data_hourly[,1],start=c(1,(168-start_index+2)),frequency=168)')
    #Adds 0.01 as model input data must be non-zero
    r('train_data_ts = train_data_ts+ 0.01')
    #R creates hourly model; we set beta=0 as we assume no global trend (HOLT-WINTERS MODEL)
    r('hr_model <- HoltWinters(train_data_ts,beta=0,seasonal="m",start.periods=(168+start_index-1))')
    
    #R creates a monthly model IFF there is enough data (min 8 weeks)
    r('dy_model = NULL')
    #1st Fri of hourly dataset translated for daily dataset
    r('start_index = (start_index-1)/24+1')
    if (r('length(train_data_daily[,1])>(28*2+start_index-1)')[0]):
        #if the first Fri of the month precedes the dataset's start date, set to the prior month's first Fri
        r('if(month_index<1){month_index = 28-month_index }')
        #Reorganizes daily dataframe to seasonal time series
        r('train_data_ts <- ts(train_data_daily[,1],start=c(1,month_index),frequency=28)')
        #R creates monthly model, again we assume no global trend
        r('dy_model <- HoltWinters(train_data_ts,seasonal="m",start.periods=(28+start_index-1))')
    
        print 'Model Created!'
Example no. 14
    def ccaPermute(self,
                   nPerms=25,
                   penaltyXs=None,
                   penaltyZs=None,
                   controlsWithFeats=False):
        (groups, allOutcomes,
         controls) = self.outcomeGetter.getGroupsAndOutcomes()
        # groups: set(group_ids)
        # allOutcomes: {outcome: {group_id: value}}
        # controls: {control: {group_id: value}}

        (groupNorms, featureNames
         ) = self.featureGetter.getGroupNormsWithZerosFeatsFirst(groups)

        Zdict = allOutcomes
        Xdict = groupNorms

        if controlsWithFeats:
            print("Appending controls to X")
            Xdict.update(controls)
        else:
            print("Appending controls to Z")
            Zdict.update(controls)

        # TO DO: get topic frequencies?

        # groupNorms: {feat: {group_id: group_norm}}
        # featureNames: list of possible feature names

        # X contains feature group_norms, Z contains outcome values
        X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),
                                                 pd.DataFrame(data=Zdict))

        try:
            X = com.convert_to_r_dataframe(X)
            Z = com.convert_to_r_dataframe(Z)
            Ngroups = com.convert_robj(ro.r["nrow"](X)[0])
        except NameError:
            warn("pandas.rpy.common cannot be imported")
            sys.exit(1)

        kwParams = {"nperms": nPerms}
        kwParams[
            'penaltyxs'] = penaltyXs if penaltyXs else ro.vectors.FloatVector(
                np.arange(.1, .91, .05))
        kwParams[
            'penaltyzs'] = penaltyZs if penaltyZs else ro.vectors.FloatVector(
                np.arange(.1, .91, .05))

        self._ccaPermute(X, Z, **kwParams)
Example no. 15
def gams_for_individual_run(proc_data, feature_list):

    if proc_data['outcome'] is not None:
        print("outcome variable found")
    else:
        print("outcome variable missing")
        return [None, None, "FAIL"]



    #Prepare Formula from feature list
    
    s="outcome~"
    for i in range(len(feature_list)):
        if i in [37, 49, 48, 51, 72, 50]:
            continue
        # numeric and categorical features currently enter the formula identically
        s += feature_list['feature_name'][i] + '+'
    s = s[:-1]
    fmla = Formula(s)

    proc_data = proc_data.dropna()  # reassign; dropna() is not in-place
    full_range = range(0, len(proc_data.index) - 1)
    #thres_tune=round((positiveOutcomes/len(traindata.index)),3)
    train_ind =random.sample(range(0,len(proc_data.index)-1),int(math.floor(0.7*len(proc_data.index))))
    test_ind = [val for val in full_range if val not in train_ind]
    response = proc_data['outcome'].iloc[test_ind]
    rdf = com.convert_to_r_dataframe(proc_data.iloc[train_ind])
    #Run the model

    print("Running GAMs model")
    model = mgcv.bam(formula=fmla, data=rdf, family=statsf.binomial(link="logit"))
    print("Model building completed")

    #Predict the values

    tdf = com.convert_to_r_dataframe(proc_data.iloc[test_ind])
    predval = statsf.predict(model, tdf, type="response")
    predicted_values = pd.concat([pd.DataFrame({'pred':predval}, index=[response.index.get_values()]), response], axis=1)
    predicted_values.columns = ['prediction', 'observed']
    auc=roc_auc_score(response, predval)
    status = "pass"
    res = list()
    res.append(model)
    res.append(predicted_values)
    res.append(auc)
    print("+++++++++++++++++  Completed +++++++++++++++++++")
    return res
Example no. 16
 def check(self):
     """Performs check, some issue with output however."""
     importr('biotools')
     pan_data = pandas.DataFrame(self.data)
     pan_classes = pandas.DataFrame(self.classes)
     r_data = com.convert_to_r_dataframe(pan_data)
     r_classes = com.convert_to_r_dataframe(pan_classes)
     ro.globalenv['r_data'] = r_data
     ro.globalenv['r_classes'] = r_classes
     ro.r('boxM_test = boxM(data=r_data,grouping=r_classes)')
     ro.r('pvals = boxM_test$p.value')
     pan_data = com.load_data('pvals')
     ##P-value of box test of equal covariances
     self.boxMp = pan_data[0]
Example no. 17
def removeBatchEffects(infile, outfile):

    # Read data
    vstDataframe = pd.read_table(infile).set_index('gene_symbol').drop(
        ['NK1', 'NK2', 'B8N', 'B10C'], axis=1)

    # Create annotation dataframe
    annotationDataframe = pd.DataFrame(
        [[x, x[:-1], x[-1]] for x in vstDataframe.columns],
        columns=['sample_id', 'patient_id', 'treatment'])

    # Run function
    r.remove_batch_effects(com.convert_to_r_dataframe(vstDataframe),
                           com.convert_to_r_dataframe(annotationDataframe),
                           outfile)
Example no. 18
def test_converting_to_factors():

    test_data = DataFrame(
        {
            'colA': Series(randn(1, 5000).flatten() > 0),
            'colB': Series(100 * randn(1, 5000).flatten()),
            'colC': Series(100 + randn(1, 5000).flatten()),
            'colD': Series(randn(1, 5000).flatten() > 0),
        },
    )

    test_data['colA'] = test_data['colA'].map(str)
    test_data['colD'] = test_data['colD'].map(str)

    factor_cols = [('colA', 'True'),
                   ('colD', 'True')]

    rpy_test_df = com.convert_to_r_dataframe(test_data)

    rpy_out_df = Rtools.convert_columns_to_factors(rpy_test_df, factor_cols)
    test_cols = [('colA', 'factor'),
                 ('colB', 'numeric'),
                 ('colC', 'numeric'),
                 ('colD', 'factor')]

    for col, typ in test_cols:
        if typ == 'factor':
            yield eq_, rpy_out_df.rx2(col).nlevels, 2
        elif typ == 'numeric':
            yield ok_, (not hasattr(rpy_out_df.rx2(col), 'nlevels'))
Example no. 19
def pd_py2ri(o):
    """ 
    """
    res = None
    if isinstance(o, pd.Series):
        o = pd.DataFrame(o, index=o.index)

    if isinstance(o, pd.DataFrame):
        if isinstance(o.index, pd.DatetimeIndex):
            res = rconv.convert_df_to_xts(o)
        else:
            res = rcom.convert_to_r_dataframe(o)

    if isinstance(o, pd.DatetimeIndex):
        res = rconv.convert_datetime_index(o)

    if isinstance(o, pd.Timestamp):
        res = rconv.convert_timestamp(o)

    if res is None:
        try:
            res = numpy2ri.py2ri(o)
        except:
            res = robjects.default_converter.py2ri(o)

    return res
Example no. 20
def run_earth(X, y, **kwargs):
    '''
    Run with the R package earth.
    Return prediction value, training time, and number
    of forward pass iterations.
    '''
    r = robjects.r
    m, n = X.shape
    data = pandas.DataFrame(X)
    data['y'] = y
    r_data = com.convert_to_r_dataframe(data)
    r('library(earth)')
    r_func = '''
    run <-  function(data, degree=1, fast.k=0, penalty=3.0){
       time = system.time(model <- earth(y~.,data=data,degree=degree,penalty=penalty))[3]
       forward_terms = dim(summary(model)$prune.terms)[1]
       y_pred = predict(model,data)
       return(list(y_pred, time, forward_terms, model))
     }
     '''
    r(r_func)
    run = r('run')
    r_list = run(
        **{
            'data': r_data,
            'degree': kwargs['max_degree'],
            'fast.k': 0,
            'penalty': kwargs['penalty']
        })
    y_pred = numpy.array(r_list[0]).reshape(m)
    time = r_list[1][0]
    forward_terms = r_list[2][0]
    return y_pred, time, (forward_terms - 1) / 2
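A hypothetical call for run_earth above, on synthetic data (assumes numpy is installed, the R earth package is available, and run_earth is in scope; max_degree and penalty match the kwargs the function reads):

import numpy
X = numpy.random.randn(100, 3)  # 100 samples, 3 features
y = 2.0 * X[:, 0] + 0.1 * numpy.random.randn(100)
y_pred, train_time, n_iter = run_earth(X, y, max_degree=1, penalty=3.0)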
Example no. 21
def test_mixed_model():

    test_data = DataFrame(
        {
            'colA': Series(randn(1, 5000).flatten() > 0),
            'colB': Series(100 * randn(1, 5000).flatten()),
            'colC': Series(100 + randn(1, 5000).flatten()),
            'colD': Series(randn(1, 5000).flatten() > 0),
            },
        )

    test_data['colA'] = test_data['colA'].map(str)
    test_data['colD'] = test_data['colD'].map(str)

    factor_cols = [('colA', 'True'),
                   ('colD', 'True')]

    rpy_test_df = com.convert_to_r_dataframe(test_data)
    rpy_test_df = Rtools.convert_columns_to_factors(rpy_test_df, factor_cols)

    base_formula = Formula('colC ~ as.factor(colA) + colB')
    rand_formula = Formula('~1|colD')

    results = Rtools.R_linear_mixed_effects_model(rpy_test_df, base_formula, rand_formula)

    print results['tTable']
    ok_(('tTable' in results), 'Did not have the tTable in the results')
    ok_(('as.factor(colA)False' in results['tTable'].index), 'Did not have the factor in the tTable')
    ok_(('colB' in results['tTable'].index), 'Did not have the variable in the tTable')
Example no. 22
 def set_cv_fold(self, df):
     """Send which genes are valid test sets for each CV fold."""
     if new_pandas_flag:
         r_df = pandas2ri.py2ri(df)
     else:
         r_df = com.convert_to_r_dataframe(df)
     ro.globalenv['cvFoldDf'] = r_df
Example no. 23
    def predict(self, xtest):
        """Predicts class via majority vote.

        Parameters
        ----------
        xtest : pd.DataFrame
            features for test set
        """
        if new_pandas_flag:
            r_xtest = pandas2ri.py2ri(xtest)
        else:
            r_xtest = com.convert_to_r_dataframe(xtest)
        #r_xtest = pandas2ri.py2ri(xtest)
        pred = self.rf_pred(self.rf, r_xtest)
        if new_pandas_flag:
            #py_pred = pandas2ri.ri2py(pred)
            tmp_genes = pred[1]
            tmp_pred_class = pred[0]
            genes = pandas2ri.ri2py(tmp_genes)
            pred_class = pandas2ri.ri2py(tmp_pred_class)
        else:
            py_pred = com.convert_robj(pred)
            genes, pred_class = zip(*py_pred.items())
            #genes = com.convert_robj(tmp_genes)
            #pred_class = com.convert_robj(tmp_pred_class)
        tmp_df = pd.DataFrame({'pred_class': pred_class},
                              index=genes)
        tmp_df = tmp_df.reindex(xtest.index)
        tmp_df -= 1  # for some reason the class numbers start at 1
        return tmp_df['pred_class']
Example no. 24
def run_earth(X, y, **kwargs):
    '''Run with the R package earth.  Return prediction value, training time, and number of forward pass iterations.'''
    r = robjects.r
    m, n = X.shape
    data = pandas.DataFrame(X)
    data['y'] = y
    r_data = com.convert_to_r_dataframe(data)
    r('library(earth)')
    r_func = '''
        run <-  function(data, degree=1, fast.k=0, penalty=3.0){
                    time = system.time(model <- earth(y~.,data=data,degree=degree,penalty=penalty))[3]
                    forward_terms = dim(summary(model)$prune.terms)[1]
                    y_pred = predict(model,data)
                    return(list(y_pred, time, forward_terms, model))
                }
        '''
    r(r_func)
    run = r('run')
    r_list = run(
        **{'data': r_data,
           'degree': kwargs['max_degree'],
           'fast.k': 0,
           'penalty': kwargs['penalty']})
    y_pred = numpy.array(r_list[0]).reshape(m)
    time = r_list[1][0]
    forward_terms = r_list[2][0]
    return y_pred, time, (forward_terms - 1) / 2
Example no. 25
    def fit(self, xtrain, ytrain):
        """The fit method trains R's random forest classifier.

        NOTE: the method name ("fit") and method signature were choosen
        to be consistent with scikit learn's fit method.

        Parameters
        ----------
        xtrain : pd.DataFrame
            features for training set
        ytrain : pd.DataFrame
            true class labels (as integers) for training set
        """
        label_counts = ytrain.value_counts()
        if self.is_onco_pred and self.is_tsg_pred:
            sampsize = [label_counts[self.other_num],
                        label_counts[self.onco_num],
                        label_counts[self.tsg_num]]
        elif self.is_onco_pred:
            sampsize = [label_counts[self.other_num],
                        label_counts[self.onco_num]]
        elif self.is_tsg_pred:
            sampsize = [label_counts[self.other_num],
                        label_counts[self.tsg_num]]

        self.set_sample_size(sampsize)
        ytrain.index = xtrain.index  # ensure indexes match
        xtrain['true_class'] = ytrain
        r_xtrain = com.convert_to_r_dataframe(xtrain)
        #r_xtrain = pandas2ri.py2ri(xtrain)
        self.rf = self.rf_fit(r_xtrain, self.ntrees, self.sample_size)
        r_imp = self.rf_imp(self.rf)  # importance dataframe in R
        self.feature_importances_ = com.convert_robj(r_imp)
Example no. 27
def skm_permute(data):
    """
    rpy2 wrapper for R function: KMeansSparseCluster.permute from the sparcl package.
    The tuning parameter controls the L1 bound on w, the feature weights. A permutation 
    approach is used to select the tuning parameter. 
    
    Infile:
    ---------------
    data: pandas Dataframe
            nxp dataframe where n is observations and p is features (i.e. ROIs)
            Should be a pandas DataFrame with subject codes as index and features 
            as columns.
    
    Returns:
    ---------------
    best_L1bound: float
        tuning parameter that returns the highest gap statistic
        (more features given non-zero weights)

    lowest_L1bound: float
        smallest tuning parameter that gives a gap statistic within
        one sdgap of the largest gap statistic (sparser result)
    """
    sparcl = import_sparcl()
    r_data = com.convert_to_r_dataframe(data)
    km_perm = sparcl.KMeansSparseCluster_permute(r_data,K=2,nperms=25)
    best_L1bound = km_perm.rx2('bestw')[0]
    wbounds = km_perm.rx2('wbounds')
    gaps = km_perm.rx2('gaps')
    bestgap = max(gaps)
    sdgaps = km_perm.rx2('sdgaps')
    # Calculate smallest wbound that returns gap stat within one sdgap of best wbound
    wbound_rnge = [wbounds[i] for i in range(len(gaps)) if (gaps[i]+sdgaps[i]>=bestgap)]
    lowest_L1bound = min(wbound_rnge)
    return best_L1bound, lowest_L1bound
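A hypothetical call for skm_permute above (assumes the sparcl R package is installed and import_sparcl/com resolve as in the function body; the subject codes and ROI columns are synthetic):

import numpy as np
import pandas as pd
data = pd.DataFrame(np.random.randn(20, 5),
                    index=['subj%02d' % i for i in range(20)],
                    columns=['roi%d' % j for j in range(5)])
best_w, sparse_w = skm_permute(data)  # best gap statistic vs. sparsest within one sdgap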
Example no. 28
def write_r_dataframe(count_dict,outputfile):
    """Write R-compatible dataframe to output file"""
    df = pd.DataFrame.from_dict(count_dict)
    r_dataframe = com.convert_to_r_dataframe(df)

    f1 = open(outputfile,'w+')
    print >>f1, r_dataframe
Example no. 29
def process_covariates(surv, feature=None, cov=None):
    '''
    Coerce covariates and feature into format suitable for R's
    survival functions. 
    '''
    if type(feature) is type(None):
        feature = pd.Series(index=surv.index.levels[0])
    if type(cov) is type(None):
        cov = pd.DataFrame(index=feature.index)
    if type(cov) == pd.Series:
        cov = pd.concat([cov], axis=1)
    elif type(cov) == list:
        assert map(type, cov) == ([pd.Series] * len(cov))
        cov = pd.concat(cov, axis=1)
    cov = cov.apply(sanitize_lr)
    feature = sanitize_lr(feature)
    c_real = cov.ix[:, cov.dtypes.isin([np.dtype(float), np.dtype(int)])]
    c_real = (c_real - c_real.mean()) / c_real.std()
    if c_real.shape[1] > 0:
        cov[c_real.columns] = c_real
    cov = cov.dropna(1, how='all')
    df = cov.join(surv.unstack()).dropna()
    df.loc[:, 'days'] = df.loc[:, 'days'] / 365
    df = df.groupby(level=0).first()
    if len(feature.dropna()) == 0:
        feature = None
    df, factors = process_factors(df, feature, list(cov.columns))
    df = df[factors + ['days', 'event']]
    df = df.dropna(axis=1, how='all')
    df = convert_to_r_dataframe(df)
    return df, factors
Example no. 30
def sarima_test(steps, path):
    index_name, my_trend = parse_csv(path)
    dta = pd.DataFrame(my_trend)
    dta.index = index_name
    dta = dta.rename(columns={0: 'search'})
    r_df = com.convert_to_r_dataframe(dta)
    y = stats.ts(r_df)
    order = R.IntVector((1, 1, 1))
    season = R.ListVector({'order': R.IntVector((0, 1, 0)), 'period': 52})
    model = stats.arima(y[-5 * 52:-steps], order=order, seasonal=season)
    f = forecast.forecast(model, h=steps)
    future = [var for var in f[3]]
    y_pred = np.array(future)
    y_true = np.array(my_trend[-steps:])
    metrics_result = {
        'sarima_MAE': metrics.mean_absolute_error(y_true, y_pred),
        'sarima_MSE': metrics.mean_squared_error(y_true, y_pred),
        'sarima_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    }
    p1 = plt.plot(my_trend[-steps:], '*-')
    p2 = plt.plot(future)
    #    p1 = plt.plot(index_name,my_trend,'r-')
    #    p2 = plt.plot(index_name_future,future,'g-')
    plt.ylabel('Search Intensity')
    plt.xlabel('Year')
    plt.title('Search Prediction of ' + path.split('/')[-1][:-4])
    plt.legend((p1[0], p2[0]), ["Actual", "Predicted"], loc=1)
    plt.grid(True)
    #    print metrics_result['sarima_MAPE']
    return metrics_result['sarima_MAPE']
Example no. 31
def process_covariates(surv, feature=None, cov=None):
    '''
    Coerce covariates and feature into format suitable for R's
    survival functions. 
    '''
    if type(feature) is type(None):
        feature = pd.Series(index=surv.index.levels[0])
    if type(cov) is type(None):
        cov = pd.DataFrame(index=feature.index)
    if type(cov) == pd.Series:
        cov = pd.concat([cov], axis=1)
    elif type(cov) == list:
        assert map(type, cov) == ([pd.Series] * len(cov))
        cov = pd.concat(cov, axis=1)
    cov = cov.apply(sanitize_lr)
    feature = sanitize_lr(feature)
    c_real = cov.ix[:, cov.dtypes.isin([np.dtype(float), np.dtype(int)])]
    c_real = (c_real - c_real.mean()) / c_real.std()
    if c_real.shape[1] > 0:
        cov[c_real.columns] = c_real
    cov = cov.dropna(1, how='all')
    df = cov.join(surv.unstack()).dropna()
    df['days'] = df['days'] / 365.
    df = df.groupby(level=0).first()
    if len(feature.dropna()) == 0:
        feature = None 
    df, factors = process_factors(df, feature, list(cov.columns))
    df = df[factors + ['days', 'event']]
    df = df.dropna(axis=1, how='all')
    df = convert_to_r_dataframe(df)
    return df, factors
Example no. 32
 def case_classifyCascade(self):
     """ A individual case classification function"""
     ########### To R for classification
     os.chdir("Z:\Cristina\MassNonmass\codeProject\codeBase\extractFeatures\casesDatabase")        
     cF = pd.read_csv('casesFrames_toclasify.csv')
     
     cF['finding.mri_mass_yn'] = cF['finding.mri_mass_yn'].astype('int32')
     cF['finding.mri_nonmass_yn'] = cF['finding.mri_nonmass_yn'].astype('int32')
     cF['finding.mri_foci_yn'] = cF['finding.mri_foci_yn'].astype('int32')
     cF['is_insitu'] = cF['is_insitu'].astype('int32')
     cF['is_invasive'] = cF['is_invasive'].astype('int32')
             
     self.rpycasesFrame = com.convert_to_r_dataframe(cF)
     base = importr('base')
     base.source('Z:/Cristina/MassNonmass/codeProject/codeBase/finalClassifier/finalClassifier_classifyCascade.R')
     
     RFcascade = globalenv['finalClassifier_classifyCascade'](self.rpycasesFrame)
     
     self.RFcascade_probs = com.convert_robj(RFcascade)
     print "\n========================"
     print self.RFcascade_probs
     
     # proccess possible outcome
     [veredict, caseoutcome] = self.parse_classes(self.RFcascade_probs)
     print "\n========================\nCascade classification result:"
     print veredict
     print caseoutcome
     
     return
Example no. 33
def write_r_dataframe(count_dict, outputfile):
    """Write R-compatible dataframe to output file"""
    df = pd.DataFrame.from_dict(count_dict)
    r_dataframe = com.convert_to_r_dataframe(df)

    f1 = open(outputfile, 'w+')
    print >> f1, r_dataframe
Example no. 34
 def RsoftImpute(self, X):
     softImpute = importr("softImpute")
     X = com.convert_to_r_dataframe(X)
     X = softImpute.complete(
         X, softImpute.softImpute(softImpute.biScale(X, maxit=100)))
     X = com.convert_robj(X)
     return X
Example no. 35
def av(data, formula, model='', output='', as_strings='', title='Title for Your Output', label='Label for Your Output', pythontex=True):
    if not output:
        output = 'xtable'
    if not model:
        model = 'aov'

    if output == 'stargazer':
        stargazer = importr('stargazer')
    elif output == 'texreg':
        texreg = importr('texreg')

    formula = robjects.Formula(formula)
    dfr = com.convert_to_r_dataframe(data)  # convert from pandas to R and make string columns factors

    if model == 'aov':
        output = 'xtable'  # aov only works with xtable
        av_model = stats.aov(formula, data=dfr)
        av_model_sum = base.summary(av_model)

    if output == 'xtable':
        xtable = importr('xtable')
        latex = xtable.xtable(av_model_sum, caption=title, label=label)
    if pythontex:
        return latex
    else:
        return '\n'.join(np.array(latex))
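A hypothetical call for av above (assumes the module-level importr, robjects, stats, base, com, and np names used in the body are in scope, and that R's xtable package is installed):

import pandas as pd
df = pd.DataFrame({'score': [3.1, 4.2, 2.8, 5.0, 3.9, 4.4],
                   'group': ['a', 'a', 'b', 'b', 'c', 'c']})
latex = av(df, 'score ~ group', title='ANOVA of score by group', label='tab:aov')
print(latex)  # LaTeX table produced by xtable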
Example no. 36
def sarima_test(steps,path):
    index_name,my_trend = parse_csv(path)
    dta = pd.DataFrame(my_trend)
    dta.index = index_name
    dta=dta.rename(columns = {0:'search'})
    r_df = com.convert_to_r_dataframe(dta)
    y = stats.ts(r_df)
    order = R.IntVector((1,1,1))
    season = R.ListVector({'order': R.IntVector((0,1,0)), 'period' : 52})
    model = stats.arima(y[-5*52:-steps], order = order, seasonal=season)
    f = forecast.forecast(model,h=steps) 
    future = [var for var in f[3]]
    y_pred = np.array(future)
    y_true = np.array(my_trend[-steps:])
    metrics_result = {'sarima_MAE':metrics.mean_absolute_error(y_true, y_pred),'sarima_MSE':metrics.mean_squared_error(y_true, y_pred),
                  'sarima_MAPE':np.mean(np.abs((y_true - y_pred) / y_true)) * 100}	
    p1 = plt.plot(my_trend[-steps:],'*-')
    p2 = plt.plot(future)
#    p1 = plt.plot(index_name,my_trend,'r-')
#    p2 = plt.plot(index_name_future,future,'g-')
    plt.ylabel('Search Intensity')
    plt.xlabel('Year')
    plt.title('Search Prediction of '+path.split('/')[-1][:-4])
    plt.legend((p1[0], p2[0]), ["Actual","Predicted"], loc=1)
    plt.grid(True)
#    print metrics_result['sarima_MAPE']
    return metrics_result['sarima_MAPE']
Example no. 37
def spades_assembly_qc_plot(args) :
    from pandas import DataFrame
    import pandas.rpy.common as com
    import rpy2.robjects as ro
    import rpy2.robjects.lib.ggplot2 as ggplot2

    fasta_in = args[0]
    outfile = args[1]

    get_gc = lambda s : float(s.seq.count("G")+s.seq.count("C"))/len(s.seq)
    get_cov = lambda s : float(s.description.split("_")[-1])
    get_len = lambda s : len(s.seq)

    with open(fasta_in) as handle:
        seqs = [s for s in SeqIO.parse(handle,"fasta")]

    seqs_data = DataFrame.from_dict({s.id : {'GC' : get_gc(s), 'length' : get_len(s), 'cov' : get_cov(s)} for s in seqs}).transpose()

    r_data = com.convert_to_r_dataframe(seqs_data)
    ro.r.library('ggplot2')

    x=ro.r.ggplot(r_data, ro.r.aes_string(x='length',y='cov')) + ro.r.geom_point() + ro.r.scale_y_log10() + ro.r.scale_x_log10()+ro.r.geom_vline(xintercept=1000, color="red")+ro.r.geom_vline(xintercept=2500, color="red")+ro.r.geom_vline(xintercept=10000, color="red") + ro.r.theme_bw()
    x.plot()
    ro.r('dev.copy(pdf,"%s")'%(outfile))
    ro.r('dev.off()')
Example no. 38
    def __init__(self,
                 name,
                 data):
        """
        Dataset class for R data inputs

        Parameters
        ----------
        name : str
            Name of dataset
        data : 2-tuple
            data[0]: data format, e.g. 'gslib', 'bhdf'
            data[1]: data  (or path/location)

        """
        self.name = name
        self.dataformat = data[0]
        self.datapath = data[1]

        xyz = ['x', 'y', 'z']

        if data[0] == 'table':  # 'is' tests identity, not string equality
            self.dataframe = pd.read_csv(self.datapath)
            self.col_names = self.dataframe.columns.values.tolist()
            self.ncol = len(self.col_names)
            self.xyz_cols = [self.col_names.index(i) for i in xyz]
            self.variables = [i for i in self.col_names if i not in xyz]
            self.var_cols = [self.col_names.index(i) for i in self.variables]
            self.nvar = len(self.variables)

        self.rdf = com.convert_to_r_dataframe(self.dataframe)
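A hypothetical instantiation of the Dataset class above ('points.csv' is an illustrative path; per the docstring it must hold x, y, z plus at least one variable column):

ds = Dataset('demo', ('table', 'points.csv'))
print(ds.variables, ds.nvar)  # non-coordinate columns and their count
print(ds.rdf)  # the converted R data frame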
Example no. 39
    def test_convert_r_dataframe(self):

        is_na = robj.baseenv.get("is.na")

        seriesd = tm.getSeriesData()
        frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])

        # Null data
        frame["E"] = [np.nan for item in frame["A"]]
        # Some mixed type data
        frame["F"] = ["text" if item %
                      2 == 0 else np.nan for item in range(30)]

        r_dataframe = com.convert_to_r_dataframe(frame)

        assert np.array_equal(
            com.convert_robj(r_dataframe.rownames), frame.index)
        assert np.array_equal(
            com.convert_robj(r_dataframe.colnames), frame.columns)
        assert all(is_na(item) for item in r_dataframe.rx2("E"))

        for column in frame[["A", "B", "C", "D"]]:
            coldata = r_dataframe.rx2(column)
            original_data = frame[column]
            assert np.array_equal(com.convert_robj(coldata), original_data)

        for column in frame[["D", "E"]]:
            for original, converted in zip(frame[column],
                                           r_dataframe.rx2(column)):

                if pd.isnull(original):
                    assert is_na(converted)
                else:
                    assert original == converted
Example no. 41
    def test_convert_r_dataframe(self):

        is_na = robj.baseenv.get("is.na")

        seriesd = tm.getSeriesData()
        frame = pd.DataFrame(seriesd, columns=["D", "C", "B", "A"])

        # Null data
        frame["E"] = [np.nan for item in frame["A"]]
        # Some mixed type data
        frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]

        r_dataframe = com.convert_to_r_dataframe(frame)

        assert np.array_equal(com.convert_robj(r_dataframe.rownames), frame.index)
        assert np.array_equal(com.convert_robj(r_dataframe.colnames), frame.columns)
        assert all(is_na(item) for item in r_dataframe.rx2("E"))

        for column in frame[["A", "B", "C", "D"]]:
            coldata = r_dataframe.rx2(column)
            original_data = frame[column]
            assert np.array_equal(com.convert_robj(coldata), original_data)

        for column in frame[["D", "E"]]:
            for original, converted in zip(frame[column], r_dataframe.rx2(column)):

                if pd.isnull(original):
                    assert is_na(converted)
                else:
                    assert original == converted
Example no. 43
def to_r_obj(dataframe):
    if not isinstance(dataframe, pd.DataFrame):
        dataframe = pd.DataFrame(dataframe)
    rdata = rpycom.convert_to_r_dataframe(dataframe)
    if len(rdata.colnames) == 1:
        rdata = rdata[0]
    return rdata
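A quick hypothetical call for to_r_obj above, showing that a single-column input collapses to a plain R vector rather than a data frame:

import pandas as pd
rvec = to_r_obj(pd.DataFrame({'x': [1, 2, 3]}))  # one column -> rdata[0], an R vector
rdf = to_r_obj(pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}))  # stays an R data frame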
Example no. 44
def pathway_mutation_section_exp(cancer, gene_sets, cutoff=.25):
    #Format data for report
    path = cancer.report_folder + '/'
    pathway_table_file = path + 'pathway_table.csv'
    pathway_table = format_pathway_table_exp(cancer, gene_sets) 
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')  # sort() returns a copy
    pathway_table.to_csv(pathway_table_file)
    keepers = cancer.q_pathways[(cancer.q_pathways < .25).sum(1) > 0].index
    pathway_table = pathway_table.ix[keepers]
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table = pathway_table.head(20)
    pathway_table_r = com.convert_to_r_dataframe(pathway_table.replace(nan, 1.23)) #@UndefinedVariable
    if len(pathway_table) == 0:
        return nz.addTo(nz.newSubSection('Expressed Pathways'), nz.newParagraph(''))
    
    #Overview
    tableCaption1 = ('Association of pathway level expression patterns with ' +
                     'patient clinical features.')
    table1 = nz.newTable(pathway_table_r, tableCaption1, file=pathway_table_file,
                         significantDigits=2)
   
    #Fill in the details
    pathway_pos = dict((p,i) for i,p in enumerate(pathway_table.index))
    col_pos = dict((c,i) for i,c in enumerate(pathway_table.columns))
    
    #age scatter plots
    for p in (pathway_table['age'][pathway_table['age'] < cutoff]).index:
        fig_file = path + FIG_EXT + p + '_age.png'
        draw_pathway_age_scatter(p, cancer, fig_file)
        age_fig1 = nz.newFigure(fig_file, 'Age of patients with or without ' +
                                          'mutation to pathway.')
        result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                           nz.addTo(nz.newSection(p), age_fig1))
        table1 = nz.addTo(table1, result1, row=pathway_pos[p]+1, 
                          column=col_pos['age']+1)
        
    #survival curves
    for p in (pathway_table['survival'][pathway_table['survival'] < cutoff]).index:
        fig_file = path + FIG_EXT + p + '_survival.png'
        data_frame = cancer.data_matrix.ix[gene_sets[p]].dropna()
        U,S,vH = frame_svd(((data_frame.T - data_frame.mean(1)) / data_frame.std(1)).T)
        
        strat = (vH[0] > vH[0].std()).astype(int) - (vH[0] < -vH[0].std()) + 1
        draw_survival_curves(cancer.clinical, Series(strat, name='pc'), 
                             labels=['low','mid','high'], filename=fig_file)
        sv_fig1 = nz.newFigure(fig_file, 'Survival of patients with ' + 
                                          'varying levels of pathway expression.')
        fig_file2 = path + FIG_EXT + p + '.svg'
        draw_pathway_eig_bar(U, fig_file2)
        sv_fig_2 = nz.newFigure(fig_file2, 'Loading for first eigen-patient.')
        result1 = nz.addTo(nz.newResult('', isSignificant='TRUE'),
                           nz.addTo(nz.newSection(p), sv_fig1, sv_fig_2))
        table1 = nz.addTo(table1, result1, row=pathway_pos[p]+1, 
                          column=col_pos['survival']+1)
        
    section = nz.addTo(nz.newSubSection('Pathway Mutations'), table1)
    return section
Example no. 45
 def set_cv_fold(self, df):
     """Send which genes are valid test sets for each CV fold."""
     if new_pandas_flag:
         with localconverter(ro.default_converter + pandas2ri.converter):
             r_df = ro.conversion.py2rpy(df)
     else:
         r_df = com.convert_to_r_dataframe(df)
     ro.globalenv['cvFoldDf'] = r_df
Example no. 46
 def _from_python(obj):
     if isinstance(obj, DataFrame):
         obj = convert_to_r_dataframe(obj)
     elif isinstance(obj, Series):
         obj = numpy2ri(obj.values)
     elif isinstance(obj, np.ndarray):
         obj = numpy2ri(obj)
     return obj
Example no. 48
def runCharacteristicDirection(infiles, outfile):

    # Split infiles
    vstFile, annotationFile = infiles

    # Read expression data
    vstDataframe = pd.read_table(vstFile, index_col='gene_symbol')

    # Read annotation data
    annotationDataframe = pd.read_table(annotationFile,
                                        index_col='sample_name')

    # Get timepoint samples
    timepointSampleDict = {
        'day' + str(day):
        annotationDataframe.index[annotationDataframe['day'] == day].tolist()
        for day in set(annotationDataframe['day'])
    }

    # Group 4 and 5 days
    timepointSampleDict[
        'day4-5'] = timepointSampleDict['day4'] + timepointSampleDict['day5']
    del timepointSampleDict['day4']
    del timepointSampleDict['day5']

    # Get controls
    controlColumns = timepointSampleDict.pop('day0')

    # Initialize empty dataframe
    resultDataframe = pd.DataFrame()

    # Loop through timepoints
    for timepoint in timepointSampleDict.keys():

        # Get experiment samples
        experimentColumns = timepointSampleDict[timepoint]

        # Run characteristic direction
        cdResults = r.runCharacteristicDirection(
            com.convert_to_r_dataframe(vstDataframe), experimentColumns,
            controlColumns, 0.1)

        # Convert to dataframe
        cdDataframe = com.convert_robj(cdResults).reset_index()

        # Add timepoint column
        cdDataframe['timepoint'] = timepoint

        # Append
        resultDataframe = pd.concat([resultDataframe, cdDataframe])

    # Pivot
    resultDataframeCast = resultDataframe.pivot(index='index',
                                                columns='timepoint',
                                                values='CD')

    # Save
    resultDataframeCast.to_csv(outfile, sep='\t', index_label='gene_symbol')
Example no. 49
def preview(fileid):


	download_url = 'http://dataverse.harvard.edu/api/access/datafile/' + str(fileid)
	request = requests.get(download_url)
	d =  request.text


	data_unicode = unicodedata.normalize("NFKD", d).encode("ascii",'ignore')
	data_string = StringIO(data_unicode)
	df = pd.read_table(data_string)

	global df_global
	df_global = df

	
	#create variable names for variable table
	variables = df.columns   

	total_rows = df.shape[0]
	total_cols = df.shape[1]
	#Create HTML of pandas dataframe
	df.index += 1 
	df_html = df.to_html()
	start = df_html.find('class="dataframe"')
	df_html = df_html[:start] + 'id = "preview_DataTable"' + df_html[start+1:]
	
	r_dataframe = com.convert_to_r_dataframe(df_global)
	robjects.r('''
       source('preprocess.R')
	''')
	r_preprocess =  robjects.globalenv['preprocess']
	meta = str(r_preprocess(testdata =r_dataframe))
	meta= meta.replace('\\', '')
	meta=meta.replace('"\n', '')
	meta = meta.replace('[1] "', '')
	global metadata_all 
	metadata_all = pandas.io.json.read_json(meta)

	
	metadata_subset = [0,3,4,9,12,13,14,16,17,20,24,25] #the summary metrics I want for the summary stats
	variable_info_dict = dict()
	for var in variables:
		metadata_variable_series = metadata_all[var]
		metadata_variable = pandas.DataFrame(metadata_variable_series)
		sumstats_variable = metadata_variable.ix[metadata_subset]
		sumstats_variable_html = str(sumstats_variable.to_html(header = False))
		start = sumstats_variable_html.find('class="')
		stop =sumstats_variable_html.find(">")
		sumstats_variable_html= sumstats_variable_html[:start] + 'class = "table-condensed table-striped>"' + sumstats_variable_html[stop+1:]
		sumstats_variable_html = sumstats_variable_html.replace('<', '&lt;')
		sumstats_variable_html = sumstats_variable_html.replace('>', '&gt;')
		variable_info_dict[var] = sumstats_variable_html


	d = {"data": df_html, 'variables' : variables, 'fileid':str(fileid), 'variable_info_dict':variable_info_dict, 'total_rows':total_rows, 'total_cols':total_cols}

	return render_template('gui_redo.html', **d)
Example no. 50
def line_plot(pdf_file,
              data,
              x,
              y,
              var,
              null_label="N/A",
              linetype=None,
              title=None,
              xlab=None,
              ylab=None,
              colorname=None,
              linename=None,
              **extra_aes_params):

    pdf(pdf_file, width=11.7, height=8.3, paper="a4r")
    if any(data[x].isnull()):
        labels = [null_label] + map(str, sorted(set(
            data[data[x].notnull()][x])))
        labels = robjects.StrVector(labels)
        nulls = data[x].isnull()
        label_vals = dict(zip(labels, range(len(labels))))
        data[x] = data[x].astype("str")
        data[x][nulls] = null_label
        data['sortcol'] = data[x].map(label_vals.__getitem__)
        data.sort('sortcol', inplace=True)
    else:
        labels = None

    if linetype and linetype != var:
        data['group'] = data[var].map(str) + data[linetype].map(str)
    else:
        data['group'] = data[var]

    rdata = common.convert_to_r_dataframe(data)
    if labels:
        ix = rdata.names.index(x)
        rdata[ix] = ordered(rdata[ix], levels=labels)

    gp = gg2.ggplot(rdata)
    pp = (
        gp + gg2.geom_point(size=3) +
        gg2.scale_colour_hue(name=(colorname or var)) +
        #gg2.scale_colour_continuous(low="black") +
        gg2.aes_string(x=x, y=y, color=var, variable=var) +
        ggtitle(title or "") + xlabel(xlab or x) + ylabel(ylab or y)  #+
        #gg2.scale_y_continuous(breaks=seq(0.0, 1.0, 0.05))
    )

    # line type stuff
    if linetype:
        pp += gg2.geom_path(gg2.aes_string(group='group', linetype=linetype),
                            size=0.5)
        pp += gg2.scale_linetype(name=(linename or linetype))
    else:
        pp += gg2.geom_path(gg2.aes_string(group='group'), size=0.5)

    pp.plot()
    dev_off()
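A hedged usage sketch; the column names are illustrative, and it assumes the module-level bindings the snippet relies on (gg2, pdf, dev_off, common, ordered, ggtitle, xlabel, ylabel) are already configured:

import pandas as pd

frame = pd.DataFrame({
    'week':   [1, 2, 3, 1, 2, 3],
    'rate':   [0.2, 0.5, 0.7, 0.1, 0.4, 0.9],
    'method': ['a', 'a', 'a', 'b', 'b', 'b'],
})
line_plot('rates.pdf', frame, x='week', y='rate', var='method',
          title='Rate by week', linename='Method')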
Esempio n. 51
0
def sarima(steps, path):
    index_name, my_trend = parse_csv(path)
    dta = pd.DataFrame(my_trend)
    dta.index = index_name
    dta = dta.rename(columns={0: 'search'})
    #dta.plot(figsize=(10,4))

    #==============================================================================
    # check stationarity
    #==============================================================================
    #r_df = com.convert_to_r_dataframe(DataFrame(dta))
    #y = stats.ts(r_df)
    #ad = tseries.adf_test(y, alternative="stationary", k=52)
    #a = ad.names[:5]
    #{ad.names[i]:ad[i][0] for i in xrange(len(a))}

    #==============================================================================
    # check the seasonality
    #==============================================================================
    #diff1lev = dta.diff(periods=1).dropna()
    #diff1lev.plot(figsize=(12,6))
    #diff1lev_season = diff1lev.diff(52).dropna()
    #r_df = com.convert_to_r_dataframe(DataFrame(diff1lev_season))
    #diff1lev_season1lev = diff1lev_season.diff().dropna()

    #==============================================================================
    # check stationarity after difference
    #==============================================================================
    #y = stats.ts(r_df)
    #ad = tseries.adf_test(y, alternative="stationary", k=52)
    #a = ad.names[:5]
    #{ad.names[i]:ad[i][0] for i in xrange(len(a))}

    #==============================================================================
    # plot acf and pacf
    #==============================================================================
    #fig = plt.figure(figsize=(12,8))
    #ax1 = fig.add_subplot(211)
    #fig = sm.graphics.tsa.plot_acf(diff1lev_season1lev.values.squeeze(), lags=150, ax=ax1)
    #ax2 = fig.add_subplot(212)
    #fig = sm.graphics.tsa.plot_pacf(diff1lev_season1lev, lags=150, ax=ax2)
    #fig

    r_df = com.convert_to_r_dataframe(dta)
    y = stats.ts(r_df)
    order = R.IntVector((1, 1, 1))
    season = R.ListVector({'order': R.IntVector((0, 1, 0)), 'period': 52})
    a = time.time()
    model = stats.arima(y, order=order, seasonal=season)
    print time.time() - a
    f = forecast.forecast(model, h=steps)
    future = [var for var in f[3]]
    dt = date_range(dta.index[-1], periods=len(future) + 1,
                    freq='W')[1:]  # build a date index for the forecast
    pr = Series(future, index=dt)
    #    dta.plot(figsize=(12,6))
    #    pr.plot(color = 'red')
    return index_name, dt, my_trend, future
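The example leans on module-level rpy2 bindings that are not shown; a plausible reconstruction, inferred from usage and therefore an assumption:

import time
import pandas as pd
from pandas import DataFrame, Series, date_range
import rpy2.robjects as R
from rpy2.robjects.packages import importr
import pandas.rpy.common as com

stats = importr('stats')        # ts(), arima()
forecast = importr('forecast')  # forecast()
tseries = importr('tseries')    # adf.test(), used only in the commented-out checks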
Esempio n. 53
0
def save_to_R(X, filename):
	import numpy as np
	from rpy2.robjects import r
	import pandas.rpy.common as com
	from pandas import DataFrame
	df = DataFrame(np.array(X))
	df = com.convert_to_r_dataframe(df)
	r.assign("X", df)
	r("save(X, file='%s.gz', compress=TRUE)"%(filename))
Esempio n. 54
0
def simpleNetworkx(G):
	"""
	D3 JavaScript networkx graphs using Python.
	This is a Python interface to Christopher Gandrud's R
	package networkD3.
	Parameters
	----------
	G : A networkx graph.
	Returns
	-------
	An HTML page (Net.html) containing an interactive visual of the graph.
	Example
	-------
	>>> G = nx.Graph()
	>>> H = ["A","B","C","D","E","F","G","H","I","J"]
	>>> G.add_nodes_from(H)
	>>> G.add_edges_from([("A","B"), ("A","C"), ("A","D"), ("A","J"), ("B","E"), ("B","F"),
		("C","G"), ("C","H"), ("D","I")])
	>>> simpleNetworkx(G)  # writes Net.html
	References
	----------
	[1] Christopher Gandrud - https://github.com/christophergandrud/networkD3
	"""

	ro.r('src = c()')
	ro.r('target =c()')
	ro.r('rdf=data.frame()')

	df = p.DataFrame(data=G.edges())

	df_r = com.convert_to_r_dataframe(df)

	ro.globalenv['src'] = df_r[0]
	ro.globalenv['target'] = df_r[1]

	ro.r('rdf=data.frame(src,target)')

	utils = importr('utils')
	utils.chooseCRANmirror(ind=1)


	try:
		networkD3 = importr('networkD3')
	except Exception:
		utils.install_packages('networkD3')
		networkD3 = importr('networkD3')

	try:
		magrittr = importr('magrittr')
	except Exception:
		utils.install_packages('magrittr')
		magrittr = importr('magrittr')


	ro.r('''simpleNetwork(rdf) %>% saveNetwork(file = 'Net.html')''')
	return None
Esempio n. 55
0
def Sizezonematrixfeatures(ngrl, mask):
    '''Calculate gray-level size zone matrix (GLSZM) based features.'''
    dircontent = os.listdir('.')
    selionimg = grep(dircontent, '.sim')
    fileid = []; szmfeature = []
    for f in selionimg:
      sname = f[:-4]
      print('PROCESSING %s' % sname)
      Img = np.genfromtxt(f, dtype=float, delimiter=',')
      if mask == 'drug':
        print('<--- Using drug mask -->')
        Mask = np.genfromtxt(sname + '_drug.msk', dtype=float, delimiter=',')
      elif mask == 'mim':
        print('<--- Using MIM tissue mask -->')
        Mask = np.genfromtxt(sname + '_mim.msk', dtype=float, delimiter=',')
      elif mask == 'tic':
        print('<--- Using TIC tissue mask -->')
        Mask = np.genfromtxt(sname + '_tic.msk', dtype=float, delimiter=',')
      ## rescaling to the desired number of gray levels
      if (ngrl != 0):
        m = ngrl/Img.max()
        scaledImg = Img*m
        binnedImg = np.rint(scaledImg)
        Img = (binnedImg + 1)  
      else:
        Img = np.sqrt(Img)
        Img = np.rint(Img)
        Img = (Img +1)   
      tissue = np.multiply(Img,Mask) 
      tissue = pd.DataFrame(tissue)
      rdf = com.convert_to_r_dataframe(tissue)
      ro.globalenv['tissue'] = rdf
      ro.r('tissue <- as.matrix(tissue)')
      ro.r('library(radiomics)')
      ro.r('szmatrix <- glszm(tissue)')
      ro.r('szmatrix[1,] <- 0')                          ### Zero the first row, which belongs to the mask region (R is 1-indexed)
      ro.r('szmfeature <- array(NA,dim=c(11,1))')
      ro.r('szmfeature[1,1] <- glszm_SAE(szmatrix)')
      ro.r('szmfeature[2,1] <- glszm_LAE(szmatrix)')
      ro.r('szmfeature[3,1] <- glszm_IV(szmatrix)')
      ro.r('szmfeature[4,1] <- glszm_HILAE(szmatrix)')
      ro.r('szmfeature[5,1] <- glszm_LILAE(szmatrix)')
      ro.r('szmfeature[6,1] <- glszm_HISAE(szmatrix)')
      ro.r('szmfeature[7,1] <- glszm_LISAE(szmatrix)')
      ro.r('szmfeature[8,1] <- glszm_HIE(szmatrix)')
      ro.r('szmfeature[9,1] <- glszm_LIE(szmatrix)')
      ro.r('szmfeature[10,1] <- glszm_ZP(szmatrix)')
      ro.r('szmfeature[11,1] <- glszm_SZV(szmatrix)')
      szm = ro.r.matrix(ro.r('szmfeature'))
      szm = np.array(szm)
      szmfeature.append(szm.transpose())
      fileid.append(sname)
    szmfeature = np.array(szmfeature)
    # Column names follow the fill order of szmfeature above
    output = pd.DataFrame(szmfeature.reshape(szmfeature.shape[0], szmfeature.shape[2]),
                          columns=["sae", "lae", "iv", "hilae", "lilae", "hisae",
                                   "lisae", "hie", "lie", "zp", "szv"])
    output['Id'] = fileid
    output.to_csv("SZM_features.csv", sep=",")
Esempio n. 56
0
def contextGeneDistances(cdhitProc):
    clusterIDs = []
    dists = []
    #heats = defaultdict( Counter )
    for i,cluster in enumerate(cdhitProc.clusters):
        members = cluster.seqs
        for mem in members:
            tag = cluster_id_reg.findall(mem)[0][:-3]
            bst,bend,ast,aend = tag.split('|')[-4:]
            bst,bend,ast,aend = int(bst),int(bend),int(ast),int(aend)
            if overlap(bst,bend,ast,aend): continue                
            bmid = (bst+bend)/2
            int_st,int_end = ast-bmid,aend-bmid
            interval = xrange(int_st,int_end)
            #heats[i][(int_st+int_end)/2]+=1
            dists+=interval
            clusterIDs+=[i]*len(interval)
            #dists+=[(int_st+int_end)/2]
            #clusterIDs+=[i]
            #dists+= [(int_st+int_end)/2]
            #clusterIDs+=[i]
            
    data = zip(clusterIDs, dists)
    data.sort(key=lambda x: x[0])
    clusterIDs, dists = zip(*data)
    #heats = pd.DataFrame(heats)
    heats = pd.DataFrame({'clusters':clusterIDs,'distances':dists})
    
    print heats 
    #heats = heats.fillna(0)
    heats_R=com.convert_to_r_dataframe(heats)
    #print heats_R
    importr("ggplot2")
    
    plotViolinFunc = robj.r("""
                            library(ggplot2)
                            function(df){
                            png(filename="violin.png")
                            
                            p <- ggplot(df,aes(x=as.character(clusters),
                                               y=distances)) + 
                                    geom_violin(aes(x=as.character(clusters),
                                               y=distances),
                                               stat="ydensity",
                                               adjust=40,
                                               trim=TRUE,
                                               fill="red") + 
                                    coord_flip()
                            print(p)
                            dev.off()
                            print(p)
                            }
                            """)
    
    plotViolinFunc(heats_R)
    raw_input()
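The snippet assumes an overlap helper and a cluster_id_reg regex defined elsewhere; the regex is not recoverable from the code, but a plausible sketch of the interval test is:

def overlap(bst, bend, ast, aend):
    # True when the intervals [bst, bend] and [ast, aend] intersect
    return max(bst, ast) <= min(bend, aend)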
Esempio n. 57
0
def cad_queryset_to_r(qset, outfile='from_python.gzip'):
    import numpy as np
    from rpy2.robjects import r
    import pandas.rpy.common as com
    from pandas import DataFrame
    rel_dt = np.min([x.inc_datetime for x in qset])
    res = np.array([[(x.inc_datetime - rel_dt).total_seconds()] +
                    list(x.att_map.coords) for x in qset])
    df = com.convert_to_r_dataframe(DataFrame(res))
    r.assign("foo", df)
    r("save(foo, file='%s', compress=TRUE)" % outfile)
Esempio n. 58
0
    def plot_ROC(self, path):
        robjects.r["pdf"](path, width=14, height=8)

        df = self.df
        # print(df)
        gp = ggplot2.ggplot(convert_to_r_dataframe(df, strings_as_factors=True))
        gp += ggplot2.aes_string(x="fpr", y="tpr")
        gp += ggplot2.geom_line(color="blue")
        gp += ggplot2.geom_point(size=2)
        gp.plot()
        robjects.r["dev.off"]()  # close the PDF device so the file is flushed