Example no. 1
    def predict(self, x):
        """
        Get the predicted values for the given input values
        Returns an n x m np.array with the predicted y values corresponding to the given x, where m is the number of dependent variables and n is the number of observations in x

        NOTE: assumes that the dimensions of the predicted y match what was seen during training, i.e. the same number of dependent variables
        """
        ## Check the estimator has been fit before calling this function
        check_is_fitted(self, "gammodels")

        # input is converted to an at-least-2-d numpy array by the sklearn utility function; this is needed to handle 1-dimensional x inputs correctly (otherwise the number of rows and columns may come out wrong)
        x = check_array(x, accept_sparse=["csr", "csc", "coo"])

        # Convert to R matrices
        if (
            x.ndim == 1
        ):  # If we're only looking at 1 x at a time, shape[1] will give an error for one-dimensional arrays. Sklearn input validation doesn't change that.
            rx = r.matrix(x, nrow=x.shape[0], ncol=1)
        else:
            rx = r.matrix(x, nrow=x.shape[0], ncol=x.shape[1])
        r.assign("newxdata", rx)  # Put data in R environment for the functions to use
        r("newxdataframe<-data.frame(newxdata)")

        # Use gammodels list to predict each dependent variable and put together in R matrix
        for i, gammodel in enumerate(self.gammodels):
            r.assign("gmodel", gammodel)
            if i == 0:  # array is empty
                r("predmatrix<-predict(gmodel, newxdataframe)")
            else:
                r("predmatrix<-cbind(predmatrix,predict(gmodel,newxdataframe))")
        result = np.asarray(r["predmatrix"])
        return result
Example no. 2
	def plot(self, filename):
		l = len(self.ts)
		r.assign('l', l)
		r.assign('rfilename', filename)
		r.assign('img', filename + '.png')
		
		#Label creation
		r('lbl <- rep(NA, l)')
		lastindex = dict()
		
		for key in range(0, l):
			if self.ts[key] > 0:
				value = self.ts[key]
				if not value in lastindex or (key - lastindex[value]) > 8:
					r.assign('stamp', self.getLabel(key))
					r.assign('i', key+1)	#Indexes start at 1 in R
					r('lbl[i] <- stamp')
					lastindex[value] = key
		
		
		r('temp <- scan(rfilename)')
		r('timeseries <- ts(temp)')
		r('png(img, width = 800, height = 800)')
		r('plot.ts(timeseries, type = "p")')
		r('text(timeseries, labels = lbl, pos = 3)')
		r('dev.off()')
Example no. 3
 def computeGAM(self, rX, ry):
     """
     Put together R code iteratively for getting GAM models for each dependent variable in ry
     Returns list of R GAM models    
     """
     # For every dependent variable, construct model and put into list
     models = list()
     r.assign("xdata", rX)  # Put data in R environment for the functions to use
     r.assign("ydata", ry)  # Put data in R environment for the functions to use
     r("alldataframe<-data.frame(cbind(xdata,ydata))")
     self.xcolnames = np.asarray(r("colnames(alldataframe)[1:ncol(xdata)]"))
     self.ycolnames = np.asarray(
         r("colnames(alldataframe)[-(1:ncol(xdata))]")
     )  # All the cols that are after xcol are ycol
     for ycolname in self.ycolnames:
         rcode = "gam(%s~" % ycolname.replace(
             "-", "."
         )  # replace "-" with "." because R does not allow "-" in column names (it converts them to ".")
         for xcolname in self.xcolnames:
             rcode += "s(%s" % xcolname.replace(
                 "-", "."
             )  # replace "-" with "." because R does not allow "-" in column names (it converts them to ".")
             n_unique = r("length(unique(alldataframe$%s))" % xcolname)[0]  # because R starts indexes at 1
             if n_unique < 10:  # If there aren't enough levels, need to set k lower
                 rcode += ",k=5"
             rcode += ")+"
         rcode = rcode[:-1]
         rcode += ",data=alldataframe)"
         gammodel = r(rcode)
         models.append(gammodel)
     return models
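As a hedged illustration of the R formula string the loop above assembles (the column names below are made up, and "body-mass" stands in for a column with fewer than 10 unique values):

xcolnames = ["age", "body-mass"]
ycolname = "risk-score"
few_levels = {"body-mass"}  # columns with fewer than 10 unique values in the data frame

rcode = "gam(%s~" % ycolname.replace("-", ".")
for xcolname in xcolnames:
    rcode += "s(%s" % xcolname.replace("-", ".")
    if xcolname in few_levels:
        rcode += ",k=5"
    rcode += ")+"
rcode = rcode[:-1] + ",data=alldataframe)"
print(rcode)  # gam(risk.score~s(age)+s(body.mass,k=5),data=alldataframe)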
Example no. 4
def test_1():
    x = generate_AR1(0.95, 1, 50**2+25, random_state=0)
    val = integrated_autocorr5(x, size='sqrt')[0]

    r.assign('x', x)
    ref = r('(bm(x)$se)^2 * length(x) / var(x)')[0]
    np.testing.assert_almost_equal(val, ref)
Example no. 5
 def gof(self, method='Sn', simulation='pb'):
     """
     Goodness-of-fit tests for copulas 
     gofCopula from the R copula package
     
     Input:
         method: "Sn" -> test statistic from Genest, Rémillard, Beaudoin (2009)
                 "SnB"-> test statistic from Genest, Rémillard, Beaudoin (2009)
                 "SnC" -> test statistic from Genest et al. (2009).
                 "AnChisq" -> Anderson-Darling test statistic
                 "AnGamma -> similar to "AnChisq" but based on the gamma distribution 
         simulation: "pb" -> parametric bootstrap
                     "mult" -> multiplier
     Output:
         statistic -> test statistic
         p_value -> p_value of the test
         parameter -> estimates of the parameters for the hypothesized copula family
         
     """
     family = self.family
             
     xy = self.xy
     r.assign('xy',xy.T)
     theta = self.theta
     r('foo <- gofCopula(%sCopula(%f), xy, estim.method="itau", method="%s", simulation="%s")'%(family,theta,method,simulation))
     p_value = float(r('foo$p.value')[0])
     statistic = float(r('foo$statistic')[0])
     parameter = float(r('foo$parameter')[0])
     
     self.p_value = p_value
     self.statistic = statistic 
     self.parameter = parameter
     
     return statistic, p_value, parameter
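A hedged sketch of the same gofCopula call issued directly through rpy2, outside the class (it assumes R's copula package is installed, that numpy arrays are transferred to R as in the class code, and it uses an illustrative Clayton parameter of 2.0 with only N=100 bootstrap replicates):

import numpy as np
from rpy2.robjects import r

r("library(copula)")
x = np.random.normal(size=200)
y = 0.5 * x + np.random.normal(size=200)
r.assign("xy", np.column_stack([x, y]))  # relies on the same numpy-to-R conversion used above
res = r('gofCopula(claytonCopula(2.0), as.matrix(xy), N=100, '
        'estim.method="itau", method="Sn", simulation="pb")')
print(float(res.rx2("statistic")[0]), float(res.rx2("p.value")[0]))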
Example no. 6
    def getCorrelations(self, dataframe):
        """
        Compute pair-wise correlations across samples
        from a dataframe of expression values

        Arguments
        ---------
        dataframe: pandas.DataFrame
          a dataframe containing gene IDs, sample IDs
          and gene expression values

        Returns
        -------
        corr_frame: pandas.DataFrame
          a dataframe of a pair-wise correlation matrix
          across samples.  Uses the Pearson correlation.
        """

        # set sample_id to index
        pivot = dataframe.pivot(index="sample_name", columns="transcript_id", values="TPM")
        transpose = pivot.T
        # why do I have to resort to R????
        r_df = py2ri.py2ri_pandasdataframe(transpose)
        R.assign("p.df", r_df)
        R("""p.mat <- apply(p.df, 2, as.numeric)""")
        R("""cor.df <- cor(p.mat)""")
        r_cor = R["cor.df"]
        py_cor = py2ri.ri2py_dataframe(r_cor)
        corr_frame = py_cor

        return corr_frame
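A hedged sketch of the long-format input this method expects (column names taken from the pivot call above; the instance name is hypothetical):

import pandas as pd

expr = pd.DataFrame({
    "sample_name":   ["s1", "s1", "s2", "s2"],
    "transcript_id": ["t1", "t2", "t1", "t2"],
    "TPM":           [1.0, 5.0, 2.0, 4.0],
})
# corr = clusterer.getCorrelations(expr)  # 'clusterer' is a hypothetical instance of the class above
# corr is then a samples x samples Pearson correlation matrix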
Example no. 7
def save_matrix_R(filename, matrix):

    rmatrix = npr.numpy2ri(matrix)

    r.assign('data', rmatrix)

    r.save('data', file=filename)    
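A hedged round-trip sketch: the matrix is saved under the R name 'data', so reloading the file in a later R session restores it under that name (the filename is illustrative, and the module-level rpy2/numpy2ri setup used above is assumed):

import numpy as np
from rpy2.robjects import r

save_matrix_R("mat.RData", np.eye(3))  # writes an R workspace file containing 'data'
r("load('mat.RData')")                 # restores the object named 'data'
print(r("dim(data)"))                  # [1] 3 3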
Example no. 8
def covarFilter(infile,
                time_points,
                replicates,
                quantile):
    '''
    Filter gene list based on the distribution of the
    sums of the covariance of each gene.  This is highly
    recommended to reduce the total number of genes used
    in the dynamic time warping clustering to reduce the
    computational time.  The threshold is placed at the
    intersection of the expected and observed value
    for the given quantile.
    '''

    time_points.sort()
    time_rep_comb = [x for x in itertools.product(time_points, replicates)]
    time_cond = ro.StrVector([x[0] for x in time_rep_comb])
    rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)

    df.drop(['replicates'], inplace=True, axis=1)
    df.drop(['times'], inplace=True, axis=1)
    df = df.fillna(0.0)

    R.assign('diff_data', df)

    E.info("loading data frame")

    # need to be careful about column headers and transposing data frames

    R('''trans_data <- data.frame(diff_data)''')
    R('''times <- c(%s)''' % time_cond.r_repr())
    R('''replicates <- c(%s)''' % rep_cond.r_repr())

    # calculate the covariance matrix for all genes
    # sum each gene's covariance vector

    E.info("calculating sum of covariance of expression")

    R('''covar.mat <- abs(cov(trans_data))''')
    R('''sum.covar <- rowSums(covar.mat)''')
    R('''exp.covar <- abs(qnorm(ppoints(sum.covar),'''
      '''mean=mean(sum.covar), sd=sd(sum.covar)))''')
    R('''sum.covar.quant <- quantile(sum.covar)''')
    R('''exp.covar.quant <- quantile(exp.covar)''')

    E.info("filter on quantile")

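    # In R, quantile() returns the five quartiles (0%, 25%, 50%, 75%, 100%), so the
    # %(quantile)i placeholder below indexes into that five-element vector (1-based)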
    R('''filtered_genes <- names(sum.covar[sum.covar > '''
      '''sum.covar.quant[%(quantile)i]'''
      ''' & sum.covar > exp.covar.quant[%(quantile)i]])''' % locals())
    R('''filtered_frame <- data.frame(diff_data[, filtered_genes],'''
      '''times, replicates)''')

    filtered_frame = com.load_data('filtered_frame').T

    return filtered_frame
Example no. 9
def test_1():
    r("require('coda')")

    random = np.random.RandomState(1)
    for i in range(10):
        x = generate_AR1(phi=0.95, sigma=1, n_steps=1000, c=0, y0=0, random_state=random)
        r.assign('x', x)
        tau = r('nrow(x)/effectiveSize(x)')[0]
        np.testing.assert_approx_equal(tau, integrated_autocorr2(x))
Example no. 10
def save_to_R(X, filename):
	import numpy as np
	from rpy2.robjects import r
	import pandas.rpy.common as com
	from pandas import DataFrame
	df = DataFrame(np.array(X))
	df = com.convert_to_r_dataframe(df)
	r.assign("X", df)
	r("save(X, file='%s.gz', compress=TRUE)"%(filename))
Example no. 11
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t",
                       header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
Example no. 12
def test_1():
    y = generate_AR1(0.95, 1, 10000)
    tau = integrated_autocorr3(y)

    r.assign('x', y)
    r('popvar = (var(x)*(nrow(x)-1)/nrow(x))')
    r('init = initseq(x)')
    tau_ref = r('initseq(x)$var.pos / popvar')[0]
    print(tau, tau_ref)
    np.testing.assert_array_almost_equal(tau, tau_ref)
Example no. 13
def create_model(input_json):
    global hourly_volume

    #Loads JSON file
    print 'Loading Data...'
    json = loads(input_json)

    #Converts to Pandas Time Series Dataframe which can be converted to be used by R
    df = pd.DataFrame(json)
    df.columns = ['time']
    df['time'] = df['time'].apply(dateutil.parser.parse)
    df.set_index('time', inplace=True)
    df['t'] = 1

    #Resamples Dataframe into hourly (for weekly model) and daily (for monthly model) buckets
    hourly_volume = df.resample('1H', how=np.count_nonzero)
    daily_volume = df.resample('1D', how=np.count_nonzero)

    print 'Creating Model...'

    #Converts Pandas Dataframe to R Dataframe
    demand_data_daily = com.convert_to_r_dataframe(daily_volume)
    demand_data_hourly = com.convert_to_r_dataframe(hourly_volume)

    #Brings Dataframes into R workspace
    r.assign('train_data_hourly', demand_data_hourly)
    r.assign('train_data_daily', demand_data_daily)
    #Assigns values to required input variables in R
    r('start_index = ' + str(get_friday_index(hourly_volume)))
    r('month_index = ' + str(get_first_of_month(hourly_volume)))

    #Reorganizes hourly dataframe to seasonal time series w/ 168 hr weekly intervals starting at the 1st Fri
    r('train_data_ts <- ts(train_data_hourly[,1],start=c(1,(168-start_index+2)),frequency=168)'
      )
    #Adds 0.01 as model input data must be non-zero
    r('train_data_ts = train_data_ts+ 0.01')
    #R creates the hourly model; we set beta=0 as we assume no global trend (HOLT-WINTERS MODEL)
    r('hr_model <- HoltWinters(train_data_ts,beta=0,seasonal="m",start.periods=(168+start_index-1))'
      )

    #R creates a monthly model IFF there is enough data (min 8 weeks)
    r('dy_model = NULL')
    #1st Fri of hourly dataset translated for daily dataset
    r('start_index = (start_index-1)/24+1')
    if (r('length(train_data_daily[,1])>(28*2+start_index-1)')[0]):
        #if the first Friday of the month precedes the start date of the dataset, set it to the prior month's first Friday
        r('if(month_index<1){month_index = 28-month_index }')
        #Reorganizes daily dataframe to seasonal time series
        r('train_data_ts <- ts(train_data_daily[,1],start=c(1,month_index),frequency=28)'
          )
        #R creates monthly model, again we assume no global trend
        r('dy_model <- HoltWinters(train_data_ts,seasonal="m",start.periods=(28+start_index-1))'
          )

        print 'Model Created!'
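A hedged follow-up sketch (not part of the original function): once hr_model exists in the R session, an hourly forecast for the following week could be pulled back into Python with base R's predict method for HoltWinters fits:

import numpy as np
from rpy2.robjects import r

week_ahead = np.asarray(r('predict(hr_model, n.ahead = 168)'))  # next 168 hourly values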
Example no. 14
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
Example no. 15
def save_simmat_R(filename, simmat):

    rmatrix = npr.numpy2ri(simmat.matrix)

    r.assign('data', rmatrix)
    
    r("rownames(%s) <- c%s" % ('data', tuple(simmat.labels)))

    r("colnames(%s) <- c%s" % ('data', tuple(simmat.labels)))

    r.save('data', file=filename)    
Example no. 17
def determineNumberOfFactorsToExtract(data):
    # Given some data, determine the number of factors you should extract.
    # Creates a graph and displays it so you can choose
    # Stolen from
    # http://www.statmethods.net/advstats/factor.html
    r('library(nFactors)')
    r.assign('data', data)
    r('ev = eigen(cor(data))')
    r('ap = parallel(subject=nrow(data),var=ncol(data),rep=100,cent=0.5)')
    r('nS = nScree(x=ev$values, aparallel=ap$eigen$qevpea)')
    r('plotnScree(nS)')
    return
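A hedged usage sketch (it assumes R's nFactors package is installed and that pandas2ri is activated so r.assign() can transfer the data frame; the column names are illustrative):

import numpy as np
import pandas as pd
from rpy2.robjects import pandas2ri

pandas2ri.activate()
survey = pd.DataFrame(np.random.normal(size=(200, 6)),
                      columns=['q%d' % i for i in range(1, 7)])
determineNumberOfFactorsToExtract(survey)  # displays the scree plot drawn by plotnScree()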
Example no. 18
    def testAgainstQValue(self):

        R.assign("pvalues", self.pvalues)
        qvalue = R('''qvalue( pvalues )''')
        r_qvalues = qvalue[2]
        r_pi0 = qvalue[1][0]

        new = Stats.doFDRPython(self.pvalues)
        self.assertTrue(getRelativeError(r_pi0, new.mPi0) < self.max_error)

        for a, b in zip(r_qvalues, new.mQValues):
            self.assertAlmostEqual(a, b, places=self.nplaces)
def plotFilteredSamples(infiles, outfiles):
    '''Create a plot of the SNP profiles for each filtered sample'''

    error_profile, otu_assignment = infiles

    otu_assignment = P.snip(otu_assignment, '.fasta') + '_up.txt'
    otu_dict = {}
    for row in open(otu_assignment):
        sample_id = row.split()[0].split(';')[0]
        otu_id = row.split().pop()
        otu_dict[sample_id] = otu_id

    def _fetch_loci(infile):
        # Some samples have no snps...
        if not open(infile).readline():
            L.warn('Sample %s has no SNPs' % infile)
            idx = [i for i in range(1, 1501)]
            snp = [
                0,
            ] * 1500
            df = pd.DataFrame([idx, snp]).transpose()
        else:
            df = pd.DataFrame(
                [x.split(',') for x in open(infile).readline().split('\t')])

        df.columns = ['Locus', 'Frequency']
        df = df.applymap(float)

        return df

    sample_id = P.snip(error_profile, '_true_snps.tsv', strip_path=True)
    otu_id = otu_dict[sample_id]
    outfile = os.path.join('14_filter_sample_error_profiles.dir',
                           otu_id + '_' + \
                           sample_id + '.pdf')

    R('''rm(list=ls())''')
    R('''require('ggplot2')''')
    df = _fetch_loci(error_profile)
    R.assign('df', df)

    R('''require('ggplot2')
         pl <- ggplot(df, aes(x=Locus, xend=Locus, y=0, yend=Frequency)) + geom_segment()
         pl <- pl + theme_bw() + theme(panel.grid=element_blank())
         pl <- pl + xlim(0, 1500) + scale_y_continuous(expand=c(0,0), limits=c(0, 100))
         pl <- pl + xlab('Position Along 16S Gene') + ylab('Frequency (%%)')
         pl <- pl + ggtitle('%s\n%s')
         pdf('%s', height=3, width=5)
         plot(pl)
         dev.off()
      ''' % (otu_id, sample_id, outfile))

    R('''rm(list=ls())''')
Example no. 20
 def kl_s(self, thres=float("inf")):
     r.assign('X', self.data_a)
     r.assign('Y', self.data_b)
     r_ret = r('''
             X = as.numeric(t(X))
             Y = as.numeric(t(Y))
             library(FNN)
             kl = na.omit(KL.divergence(X, Y, k = 10, algorithm=c("kd_tree", "cover_tree", "brute")))
             mean(kl[is.infinite(kl) == 0])
             ''')
      r_ret_str = str(r_ret)
      statistic = float(r_ret_str[4:])  # strip the leading "[1] " from R's printed value
      return d_close(statistic, thres), statistic
Example no. 21
def plot_mutrate(wt, ga):

    ratemat = np.zeros(( 7, 2 ))

    for i in range(7):
        ratemat[i,0] = mutation_rate(wt[:,:,i])
        ratemat[i,1] = mutation_rate(ga[:,:,i])

    d = numpy2ri(ratemat)
    r.assign('data', d)

    r(' source("src/R/figure_mutrate.R") ')
Example no. 22
    def testAgainstQValue(self):

        R.assign("pvalues", self.pvalues)
        qvalue = R('''qvalue( pvalues )''')
        r_qvalues = qvalue[2]
        r_pi0 = qvalue[1][0]

        new = Stats.doFDRPython(self.pvalues)
        self.assertTrue(getRelativeError(r_pi0, new.mPi0) < self.max_error)

        for a, b in zip(r_qvalues, new.mQValues):
            self.assertAlmostEqual(a, b, places=self.nplaces)
Example no. 23
def determineNumberOfFactorsToExtract(data):
	# Given some data, determine the number of factors you should extract.
	# Creates a graph and displays it so you can choose
	# Stolen from
	# http://www.statmethods.net/advstats/factor.html
	r('library(nFactors)')
	r.assign('data', data)
	r('ev = eigen(cor(data))')
	r('ap = parallel(subject=nrow(data),var=ncol(data),rep=100,cent=0.5)')
	r('nS = nScree(x=ev$values, aparallel=ap$eigen$qevpea)')
	r('plotnScree(nS)')
	return
Example no. 24
def misha2csv(misha=None, binning=DEFAULT_BINNING, output=None):

    from rpy2.robjects import r

    r_library_expression = """
    library("shaman");
    library("misha")
    """

    if misha is None:
        r_import_expression = """
        gsetroot(shaman_get_test_track_db());
        contact_map <- gextract("hic_obs", gintervals.2d(2, 175e06, 
        178e06, 2, 175e06, 178e06), colnames="score")
        """
    else:
        r.assign("path", misha)
        r_import_expression = """
        contact_map <- gextract("hic_obs", gintervals.2d(2, 0, 
        178e06, 2, 175e06, 178e06), colnames="score")
        """

    r(r_library_expression)
    r(r_import_expression)
    r("write.table(contact_map, 'exported_map.csv')")

    matrix = np.genfromtxt("exported_map.csv", dtype=None, skip_header=True)

    (_, _, start1, end1, _, start2, end2, contacts, _) = zip(*matrix)

    pos1 = (np.array(start1) + np.array(end1)) // 2
    pos2 = (np.array(start2) + np.array(end2)) // 2

    positions1 = np.array(pos1) // binning
    positions2 = np.array(pos2) // binning

    minimum = min(np.amin(positions1), np.amin(positions2))
    positions1 -= minimum
    positions2 -= minimum

    n = int(max(np.amax(positions1), np.amax(positions2))) + 1
    assert len(positions1) == len(
        positions2), "Mismatch between lengths {} and {}".format(
            len(positions1), len(positions2))
    sparse_matrix = sparse.coo_matrix((contacts, (positions1, positions2)),
                                      shape=(n, n))

    dense_matrix = np.array(sparse_matrix.todense(), dtype=np.int32)
    if output is not None:
        np.savetxt(output, dense_matrix, fmt="%i")

    return dense_matrix
Example no. 25
def do_multivariate_cox(time, censor, var, additional_vars, float_vars=False):
  df = pd.DataFrame({
    'time': time,
    'censor': censor,
    'var': var})
  df = df.join(additional_vars, how='inner')

  surv = importr('survival')
  r.assign('time',robjects.IntVector(np.array(df['time'])))
  r.assign('censor', robjects.IntVector(np.array(df['censor'])))
  r.assign('var', robjects.FloatVector(np.array(df['var'])))

  for i in additional_vars:
    if float_vars:
      r.assign(i, robjects.FloatVector(np.array(df[i])))
    else:
      r.assign(i, robjects.IntVector(np.array(df[i])))

  additional_vars_str = ' + '.join(additional_vars.columns)

  formula = 'Surv(time, censor) ~ var + ' + additional_vars_str
  try:
    coxuh_output = r('summary( coxph(formula = ' + formula + ', model=FALSE, x=FALSE, y=FALSE))')
  except ri.RRuntimeError as e:
    print(e)
    print(var.describe())
    print(additional_vars.columns)
    return {}

  coef_ind = list(coxuh_output.names).index('coefficients')
  coeffs = coxuh_output[coef_ind]

  patient_count_ind = list(coxuh_output.names).index('n')
  patient_count = coxuh_output[patient_count_ind][0]

  var_zscore = get_zscore('var', coeffs)
  var_pvalue = get_pvalue('var', coeffs)
  hazards_dict = get_hazards('var', coxuh_output)

  cox_dict = {
      'var-n': patient_count,
      'var-z': var_zscore,
      'var-p': var_pvalue,
      }
  cox_dict.update(hazards_dict)

  for i in additional_vars.columns:
    cox_dict[i + '-z'] = get_zscore(i, coeffs)
    cox_dict[i + '-p'] = get_pvalue(i, coeffs)
    cox_dict.update(get_hazards(i, coxuh_output))

  return cox_dict
Example no. 26
 def hell_s(self, thres=1):
     r.assign('X', self.data_a)
     r_ret = r('''
             X = as.numeric(t(X))
             Y = r{}({}, {})
             min2 = min(c(min(X),min(Y)))
             max2 = max(c(max(X),max(Y)))
             library(statip)
             hellinger(X, Y, min2, max2)
             '''.format(self.pyr_dist("r"), len(self.data_a),\
                     str(self.dist_args)[1:-1]))
      r_ret_str = str(r_ret)
      statistic = float(r_ret_str[4:])  # strip the leading "[1] " from R's printed value
      return d_close(statistic, thres), statistic
Example no. 27
 def _get_parameter(self):
     """ estimate the parameter (theta) of copula
     """     
     r.assign('tau.',self.tau)
     
     if self.family == 'clayton':
         self.theta = r('iTau(claytonCopula(), tau.)')[0]
         
     elif self.family == 'frank':
         self.theta = r('iTau(frankCopula(), tau.)')[0]
         
     elif self.family == 'gumbel':
         self.theta = r('iTau(gumbelCopula(), tau.)')[0]
Example no. 28
    def generate(self, S: int) -> np.ndarray:
        from rpy2.robjects import pandas2ri, r as R
        pandas2ri.activate()
        R.assign('Data', self.Data)
        R.assign('N', self.Data.shape[1])  # assumed: N = number of columns (series) in Data
        R("""
library(rmgarch)
xspec = ugarchspec(mean.model = list(armaOrder = c(1, 1)), variance.model = list(garchOrder = c(1,1), model = 'sGARCH'), distribution.model = 'norm')
uspec = multispec(replicate(N, xspec))
spec = dccspec(uspec = uspec, dccOrder = c(1, 1), distribution = 'mvnorm')
speca = dccspec(uspec = uspec, dccOrder = c(1, 1), model='aDCC', distribution = 'mvnorm')
fit_adcc = dccfit(speca, data = Data)
""")
Example no. 29
 def hell_s(self, thres=1):
     r.assign('X', self.data_a)
     r.assign('Y', self.data_b)
     r_ret = r('''
             X = as.numeric(t(X))
             Y = as.numeric(t(Y))
             min2 = min(c(min(X),min(Y)))
             max2 = max(c(max(X),max(Y)))
             library(statip)
             hellinger(X, Y, min2, max2)
             ''')
      r_ret_str = str(r_ret)
      statistic = float(r_ret_str[4:])  # strip the leading "[1] " from R's printed value
      return d_close(statistic, thres), statistic
Example no. 30
File: timeser.py Project: GSng/Uber
def create_model(input_json):
    global hourly_volume
    
    #Loads JSON file
    print 'Loading Data...'
    json = loads(input_json)
    
    #Converts to Pandas Time Series Dataframe which can be converted to be used by R
    df = pd.DataFrame(json)
    df.columns = ['time']
    df['time'] = df['time'].apply(dateutil.parser.parse)
    df.set_index('time', inplace=True)
    df['t'] = 1
    
    #Resamples Dataframe into hourly (for weekly model) and daily (for monthly model) buckets
    hourly_volume = df.resample('1H', how=np.count_nonzero)
    daily_volume =  df.resample('1D', how=np.count_nonzero)
    
    print 'Creating Model...'
	
    #Converts Pandas Dataframe to R Dataframe
    demand_data_daily = com.convert_to_r_dataframe(daily_volume)
    demand_data_hourly = com.convert_to_r_dataframe(hourly_volume)
    
    #Brings Dataframes into R workspace
    r.assign('train_data_hourly',demand_data_hourly)
    r.assign('train_data_daily',demand_data_daily)
	#Assigns values to required input variables in R
    r('start_index = ' +str(get_friday_index(hourly_volume)))
    r('month_index = ' +str(get_first_of_month(hourly_volume)))
    
    #Reorganizes hourly dataframe to seasonal time series w/ 168 hr weekly intervals starting at the 1st Fri    
    r('train_data_ts <- ts(train_data_hourly[,1],start=c(1,(168-start_index+2)),frequency=168)')
    #Adds 0.01 as model input data must be non-zero
    r('train_data_ts = train_data_ts+ 0.01')
    #R creates the hourly model; we set beta=0 as we assume no global trend (HOLT-WINTERS MODEL)
    r('hr_model <- HoltWinters(train_data_ts,beta=0,seasonal="m",start.periods=(168+start_index-1))')
    
    #R creates a monthly model IFF there is enough data (min 8 weeks)
    r('dy_model = NULL')
    #1st Fri of hourly dataset translated for daily dataset
    r('start_index = (start_index-1)/24+1')
    if (r('length(train_data_daily[,1])>(28*2+start_index-1)')[0]):
        #if the first Friday of the month precedes the start date of the dataset, set it to the prior month's first Friday
        r('if(month_index<1){month_index = 28-month_index }')
        #Reorganizes daily dataframe to seasonal time series
        r('train_data_ts <- ts(train_data_daily[,1],start=c(1,month_index),frequency=28)')
		#R creates monthly model, again we assume no global trend
        r('dy_model <- HoltWinters(train_data_ts,seasonal="m",start.periods=(28+start_index-1))')
    
        print 'Model Created!'
Example no. 31
 def getExpertsPrediction(self, verbose, tomo, day_after):
     long_sum = 0.0
     total_sum = 0.0
     test_file = 'experts.test'
     spec_file = 'spec.spec'
     
     # Create "test set" from r DB
     if verbose: print "Generating test-set file... \n"
     test_date=self.date
     r.assign('testDate', test_date.strftime('%Y-%m-%d'))
     r('testDB<-DB[testDate]')
     r('testDB<-subset(testDB, select = -c(ticker) )')
     r.assign('remoteFilename', test_file)
     r('write.table(testDB, file=remoteFilename,quote=FALSE,sep=",",eol=";\n",row.names=FALSE,col.names=FALSE)')
     for expert in self.experts:
         classifier = expert.module(test_file, spec_file)
         score = classifier.get_scores()[0][0]
         expert.prediction = score
         if score>0: long_sum+=expert.weight
         total_sum += expert.weight
     fraction_long = long_sum/float(total_sum)
     fraction_short = 1-fraction_long
     # Delete test file
     os.remove(test_file)
     # next close
     r.assign('remoteTomorrow',tomo.strftime('%Y-%m-%d'))
     next_close = float(r('DB[remoteTomorrow]$close')[0])
     r.assign('remoteDayAft',day_after.strftime('%Y-%m-%d'))
     close_after = float(r('DB[remoteDayAft]$close')[0])
     return fraction_long-fraction_short, next_close, close_after
Example no. 32
    def getExpertsPrediction(self, verbose, tomo, day_after):
        long_sum = 0.0
        total_sum = 0.0
        test_file = 'experts.test'
        spec_file = 'spec.spec'

        # Create "test set" from r DB
        if verbose: print "Generating test-set file... \n"
        test_date = self.date
        r.assign('testDate', test_date.strftime('%Y-%m-%d'))
        r('testDB<-DB[testDate]')
        r('testDB<-subset(testDB, select = -c(ticker) )')
        r.assign('remoteFilename', test_file)
        r('write.table(testDB, file=remoteFilename,quote=FALSE,sep=",",eol=";\n",row.names=FALSE,col.names=FALSE)'
          )
        for expert in self.experts:
            classifier = expert.module(test_file, spec_file)
            score = classifier.get_scores()[0][0]
            expert.prediction = score
            if score > 0: long_sum += expert.weight
            total_sum += expert.weight
        fraction_long = long_sum / float(total_sum)
        fraction_short = 1 - fraction_long
        # Delete test file
        os.remove(test_file)
        # next close
        r.assign('remoteTomorrow', tomo.strftime('%Y-%m-%d'))
        next_close = float(r('DB[remoteTomorrow]$close')[0])
        r.assign('remoteDayAft', day_after.strftime('%Y-%m-%d'))
        close_after = float(r('DB[remoteDayAft]$close')[0])
        return fraction_long - fraction_short, next_close, close_after
Example no. 33
def MHtest(data):
    """
    perform MH test in R (faster)
    """ 

    data = numpy2ri(data)
    r.assign('D', data)
    r(' result <- mantelhaen.test(D) ')
    r(' pval <- result$p.value ')
    p = np.array(r.pval)
    r(' odds <- result$estimate ')
    OR = np.array(r.odds)

    return round(p[0], 10), OR[0]
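A hedged usage sketch (assuming the module's numpy2ri/rpy2 imports above): R's mantelhaen.test() expects a 2 x 2 x K array of counts, one 2x2 contingency table per stratum; the counts below are illustrative.

import numpy as np

tables = np.zeros((2, 2, 2))          # dimensions: row, column, stratum
tables[:, :, 0] = [[10, 4], [3, 11]]
tables[:, :, 1] = [[8, 5], [4, 9]]
p, odds_ratio = MHtest(tables)
print(p, odds_ratio)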
Example no. 34
def plot_average_coverage(wt, ga):

    wt = np.sum(wt, 1)
    ga = np.sum(ga, 1)

    data_wt = numpy2ri(wt)
    data_ga = numpy2ri(ga)

    r.assign('wt', data_wt)
    r.assign('ga', data_ga)

    r(' wt <- as.matrix(wt) ')
    r(' ga <- as.matrix(ga) ')
    r(' source("src/R/figure_coverage.R") ')
Example no. 35
File: SSE.py Project: chaneyn/SSE
def Add_Semivariogram_Info(vg_clim,rtime):
 #Add semivariogram information to the climatology
 if vg_clim[rtime.month]['svg'] != 0:
  vg = vg_clim[rtime.month]['svg']
  vg_clim[rtime.month]['count'] = vg_clim[rtime.month]['count'] + 1.0
  r.assign("vold",vg)
  r("vnew<-rbind(vold,vnew)")
  #w2 = 1/vg_clim[rtime.month]['count']
  #w1 = 1-w2
  #r('vnew["gamma"] = ' + str(w1) + '*vold["gamma"] + ' + str(w2) + '*vnew["gamma"]')
 #r("vold<-vnew")
 vg = r("vnew")
 vg_clim[rtime.month]['svg'] = vg
 return vg_clim
Example no. 36
    def testHochberg(self):

        # code for checking
        R.assign("p", self.pvalues)
        R('''
        lp = length(p)
        n = length(p)
        i <- lp:1L
        o <- order(p, decreasing = TRUE)
        ro <- order(o)
        pmin(1, cummin((n - i + 1L) * p[o]))[ro]
        ''')

        self.check("hochberg")
def do_metagene_cox(time, censor, split, metagene):
    df = pd.DataFrame({'time': time, 'censor': censor, 'split': split})
    df = df.join(metagene, how='inner')

    surv = importr('survival')
    r.assign('time', robjects.IntVector(np.array(df['time'])))
    r.assign('censor', robjects.IntVector(np.array(df['censor'])))
    r.assign('split', robjects.FloatVector(np.array(df['split'])))
    r.assign('metagene', robjects.FloatVector(np.array(df['metagene'])))

    coxuh_output = r(
        'summary( coxph(formula = Surv(time, censor) ~ split + metagene, model=FALSE, x=FALSE, y=FALSE))'
    )

    coef_ind = list(coxuh_output.names).index('coefficients')
    coeffs = coxuh_output[coef_ind]

    patient_count_ind = list(coxuh_output.names).index('n')
    patient_count = coxuh_output[patient_count_ind][0]

    split_zscore = get_zscore('split', coeffs)
    split_pvalue = get_pvalue('split', coeffs)
    metagene_zscore = get_zscore('metagene', coeffs)
    metagene_pvalue = get_pvalue('metagene', coeffs)

    cox_dict = {
        'n': patient_count,
        'z': split_zscore,
        'p': split_pvalue,
        'metagene-z': metagene_zscore,
        'metagene-p': metagene_pvalue,
    }
    return cox_dict
Example no. 38
File: SSE.py Project: chaneyn/SSE
def Add_Semivariogram_Info(vg_clim, rtime):
    #Add semivariogram information to the climatology
    if vg_clim[rtime.month]['svg'] != 0:
        vg = vg_clim[rtime.month]['svg']
        vg_clim[rtime.month]['count'] = vg_clim[rtime.month]['count'] + 1.0
        r.assign("vold", vg)
        r("vnew<-rbind(vold,vnew)")
        #w2 = 1/vg_clim[rtime.month]['count']
        #w1 = 1-w2
        #r('vnew["gamma"] = ' + str(w1) + '*vold["gamma"] + ' + str(w2) + '*vnew["gamma"]')
    #r("vold<-vnew")
    vg = r("vnew")
    vg_clim[rtime.month]['svg'] = vg
    return vg_clim
Example no. 39
    def testHochberg(self):

        # code for checking
        R.assign("p", self.pvalues)
        R('''
        lp = length(p)
        n = length(p)
        i <- lp:1L
        o <- order(p, decreasing = TRUE)
        ro <- order(o)
        pmin(1, cummin((n - i + 1L) * p[o]))[ro]
        ''')

        self.check("hochberg")
Example no. 40
def test_1():
    r("require('coda')")

    random = np.random.RandomState(1)
    for i in range(10):
        x = generate_AR1(phi=0.95,
                         sigma=1,
                         n_steps=1000,
                         c=0,
                         y0=0,
                         random_state=random)
        r.assign('x', x)
        tau = r('nrow(x)/effectiveSize(x)')[0]
        np.testing.assert_approx_equal(tau, integrated_autocorr2(x))
Example no. 41
 def _r_tobit(self, data, xvars, rbar):
     """ Estimate tobit with function from r """
     r.assign('data', com.convert_to_r_dataframe(data))
     rhs = '+'.join(xvars)
     model = r("vglm(OverallRank ~ "+ rhs +", \
                       family=tobit(Upper=" + str(rbar) + ", Lower=1), \
                       data=data, crit='coeff')")
     if self.opts['verbose']:
         print(r.summary(model))
     out = r.coef(model, matrix=True)
     out = np.array(out)
     index = deepcopy(xvars)
     index.insert(0, 'const')
     beta = pd.Series(out[:, 0], index=index)
     return {'beta': beta, 'sigma': out[0, 1]}
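Note that vglm() and the tobit() family used in the formula come from R's VGAM package, so the embedded R session is assumed to have loaded it before _r_tobit() runs, e.g.:

from rpy2.robjects import r

r("suppressPackageStartupMessages(library(VGAM))")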
Example no. 42
def saveData(synthscRNAseq, cellTypesRecord, gc, projMatrix, constMatrix, cellSizeFactors, synthPseudotimes, copyFolder, targetDirectory, R_Directory):
	
	os.chdir(copyFolder)

	itemsToSave = [synthscRNAseq, cellTypesRecord, gc, projMatrix, constMatrix, cellSizeFactors, synthPseudotimes]
	itemNames = ["synthscRNAseq", "cellTypesRecord", "gc", "projMatrix", "constMatrix", "cellSizeFactors", "synthPseudotimes"]

	for i in range(len(itemsToSave)):
		np.save(itemNames[i], itemsToSave[i])
		np.save(targetDirectory+itemNames[i], itemsToSave[i])
		ro = numpy2ri(itemsToSave[i])
		r.assign(itemNames[i], ro)
		rSaveString = "save(" + itemNames[i] + ", file='"+ itemNames[i] +".gzip', compress=TRUE)"
		r(rSaveString)
		rSaveString = "save(" + itemNames[i] + ", file='" + R_Directory + ".gzip', compress=TRUE)"
		r(rSaveString)
def main(args):
    from rpy2.robjects import r
    raster = importr('raster')
    rgdal = importr('rgdal')

    # import the previously created watershed area into R
    r('frac_sum <- function(x, ...){sum(x, na.rm=TRUE)/225}')

    print 'Loading Catchment Raster'
    catch = raster.raster(args.catchment)
    
    print 'Creating Fraction File'
    fraction = raster.aggregate(catch, fact=15, fun=r('frac_sum'))
    r.assign('fraction', fraction)
    print '\tSaving Fraction File'
    raster.writeRaster(fraction, filename=os.path.join(args.outdir, 'fraction.asc'), format='ascii', overwrite=True, NAflag=0)
Example no. 44
 def smkl_s(self, thres=float("inf")):
     r.assign('X', self.data_a)
     r.assign('Y', self.data_b)
     try:
         r_ret = r('''
                 X = as.numeric(t(X))
                 Y = as.numeric(t(Y))
                 library(FNN)
                 klxy = na.omit(KL.divergence(X, Y, k = 10, algorithm=c("kd_tree", "cover_tree", "brute")))
                 klyx = na.omit(KL.divergence(Y, X, k = 10, algorithm=c("kd_tree", "cover_tree", "brute")))
                 mean(klxy[is.infinite(klxy) == 0]) + mean(klyx[is.infinite(klyx) == 0])
                 ''')
         r_ret_str = str(r_ret)
         statistics = float(r_ret_str[4:])
     except:
         statistics = np.nan
     return d_close(statistics, thres), statistics
Example no. 45
def r_MLtheta(y, mu, max_iter=10):
    '''Computes ML estimate for overdispersion theta in R, given predicted and observed counts.'''

    robjects.numpy2ri.activate()
    r.assign("y", y)
    r.assign("mu", mu)
    robjects.numpy2ri.deactivate()

    r_define_warning_handler()

    #convert mu and y to correct format and compute
    r("mu <- as.matrix(mu)")
    r("y <- as.matrix(y)")
    r("res = withWarnings(as.numeric(x = theta.ml(y = y, mu = mu, limit = %u)))"
      % (max_iter))

    return r_extract_results()
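The helpers r_define_warning_handler() and r_extract_results() are defined elsewhere in the module; as a hedged sketch (an assumption, not the project's actual code), the former presumably installs something like the standard withCallingHandlers idiom below, and theta.ml itself comes from R's MASS package:

from rpy2.robjects import r

def r_define_warning_handler():
    # Sketch: define an R withWarnings() that captures warnings alongside the value
    r("""
    library(MASS)  # provides theta.ml
    withWarnings <- function(expr) {
        warnings <- list()
        value <- withCallingHandlers(expr,
            warning = function(w) {
                warnings[[length(warnings) + 1]] <<- conditionMessage(w)
                invokeRestart("muffleWarning")
            })
        list(value = value, warnings = warnings)
    }
    """)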
Example no. 46
def save_counts(
    inFile,
    tag="taul",
    pcaFile=None,
    outFile=None,
    logscale=True,
    model=None,
    pathOut="/mnt/fhgfs_ribdata/user_worktmp/dominik.otto/PCa-2016",
    counts_df=None,
    tenplate=None,
):
    if outFile is None:
        outFile = inFile.replace("_params.hdf5", "_tumor_counts.csv")
    if counts_df is None:
        xt_df = get_counts(
            inFile,
            pcaFile=pcaFile,
            logscale=logscale,
            tumor_free=False,
            add_dev=False,
            xt_df=tenplate,
        )
    else:
        xt_df = counts_df
    xt_df.to_csv(outFile)
    if model is not None:
        # save as R data per cohort
        assembly = model.assembly
        pheno = model.pheno
        cohorts = pheno.CohortAbb.unique()
        for cohort in tqdm(cohorts, desc="cohorts"):
            print(f"Saving {cohort} ...")
            samp_names = pheno.index[pheno.CohortAbb == cohort]
            Counts = xt_df.loc[:, samp_names]
            outFile = (
                f"{pathOut}/"
                f"{tag}-normalized-none-{cohort}-{assembly}-kalGene-counts.csv"
            )
            print(f"... as {outFile}")
            Counts.to_csv(outFile)
            outFile = (f"{pathOut}/"
                       f"{tag}-normalized-none-{cohort}-{assembly}-"
                       f"kalGene-counts.RData")
            print(f"... as {outFile}")
            r.assign("Counts", Counts)
            r(f"save(Counts, file='{outFile}')")
Example no. 47
 def hell_s(self, thres=1):
     r.assign('X', self.data_a)
     r.assign('Y', self.data_b)
     try:
         r_ret = r('''
                 X = as.numeric(t(X))
                 Y = as.numeric(t(Y))
                 min2 = min(c(min(X),min(Y)))
                 max2 = max(c(max(X),max(Y)))
                 library(statip)
                 hellinger(X, Y, min2, max2)
                 ''')
         r_ret_str = str(r_ret)
         statistics = float(r_ret_str[4:])
     except:
         statistics = np.inf
     return d_close(statistics, thres), statistics
    def base(self, baset1=-50, baset2=300, binwidth=0.1):
        self.baset1 = np.max([self.GTI1, baset1])
        self.baset2 = np.min([self.GTI2, baset2])
        self.binwidth = binwidth
        self.tbins = np.arange(self.baset1, self.baset2 + self.binwidth,
                               self.binwidth)
        assert self.baset1 < self.baset2, self.bnname + ': Inappropriate base times!'
        if not os.path.exists(self.baseresultdir):
            os.makedirs(self.baseresultdir)
            expected_pvalue = norm_pvalue()
            f = h5py.File(self.baseresultdir + '/base.h5', mode='w')
            for i in range(14):  # 14 detectors
                grp = f.create_group(Det[i])  # create a group for this detector
                ttefile=glob(self.datadir+'/'+'glg_tte_'+Det[i]+'_'+\
                                        self.bnname+'_v*.fit')  # find the filename for the corresponding detector
                hdu = fits.open(ttefile[0])  # open the corresponding detector file
                trigtime = hdu['Primary'].header['TRIGTIME']
                data = hdu['EVENTS'].data
                timedata = data.field(0) - trigtime
                chdata = data.field(1)
                for ch in range(128):  # this is the per-energy-channel background subtraction
                    time_selected = timedata[chdata == ch]
                    histvalue, histbin = np.histogram(time_selected,
                                                      bins=self.tbins)
                    rate = histvalue / binwidth
                    r.assign('rrate', rate)
                    r("y=matrix(rrate,nrow=1)")
                    fillPeak_hwi = str(int(5 / binwidth))
                    fillPeak_int = str(int(len(rate) / 10))
                    r("rbase=baseline(y,lam = 6, hwi=" + fillPeak_hwi +
                      ", it=10,\
								 int =" + fillPeak_int + ", method='fillPeaks')")
                    r("bs=getBaseline(rbase)")
                    r("cs=getCorrected(rbase)")
                    bs = r('bs')[0]
                    cs = r('cs')[0]
                    corrections_index = (bs < 0)
                    bs[corrections_index] = 0  # force baseline values below 0 to 0
                    cs[corrections_index] = rate[
                        corrections_index]  # the background-subtracted part equals the original part
                    f['/' + Det[i] + '/ch' + str(ch)] = np.array(
                        [rate, bs, cs])  # save the background of every energy channel of every detector separately
            f.flush()
            f.close()
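Note that baseline(), getBaseline() and getCorrected() used above come from R's "baseline" package; the session is assumed to have loaded it beforehand, e.g.:

from rpy2.robjects import r

r("library(baseline)")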
Example no. 49
def plot_tediff_supplement(r1_hela, r1_human, r2_hela, r2_human):

    r1hela = numpy2ri(r1_hela)
    r.assign('r1hela', r1hela)
    r1human = numpy2ri(r1_human)
    r.assign('r1human', r1human)
    r2hela = numpy2ri(r2_hela)
    r.assign('r2hela', r2hela)
    r2human = numpy2ri(r2_human)
    r.assign('r2human', r2human)

    r(' source("src/R/figure_supplement_nopt.R") ')
Example no. 50
def do_cox(time, censor, variable, melk=[]):
    surv = importr("survival")

    time = np.array(time, dtype=np.float)
    censor = np.array(censor, dtype=np.float)
    variable = np.array(variable, dtype=np.float)
    if len(melk):
        melk = np.array(melk, dtype=np.float)
    # remove missing data
    skip_cols = []
    for i in range(len(variable)):
        if np.isnan(variable[i]):
            skip_cols.append(i)
        elif np.isnan(time[i]):
            skip_cols.append(i)
        elif np.isnan(censor[i]):
            skip_cols.append(i)
        elif len(melk) and np.isnan(melk[i]):
            skip_cols.append(i)

    variable = np.delete(variable, skip_cols)
    time = np.delete(time, skip_cols)
    censor = np.delete(censor, skip_cols)
    if len(melk):
        melk = np.delete(melk, skip_cols)
    r.assign('time', robjects.FloatVector(time))
    r.assign('censor', robjects.IntVector(censor))
    r.assign('variable', robjects.FloatVector(variable))
    if len(melk):
        r.assign('melk', robjects.FloatVector(melk))

    if len(melk):
        coxuh_output = r(
            'summary(coxph(formula = Surv(time, censor) ~ variable + melk))')
    else:
        coxuh_output = r(
            'summary(coxph(formula = Surv(time, censor) ~ variable))')

    coef_ind = list(coxuh_output.names).index('coefficients')
    coeffs = coxuh_output[coef_ind]

    patient_count_ind = list(coxuh_output.names).index('n')
    patient_count = coxuh_output[patient_count_ind][0]

    cox_dict = {
        'n': patient_count,
        'z': coeffs.rx('variable', 'z')[0],
        'p': coeffs.rx('variable', 'Pr(>|z|)')[0]
    }
    if len(melk):
        cox_dict['melk-z'] = coeffs.rx('melk', 'z')[0]
        cox_dict['melk-p'] = coeffs.rx('melk', 'Pr(>|z|)')[0]
    return cox_dict
def main(args):
    from rpy2.robjects import r
    raster = importr('raster')
    rgdal = importr('rgdal')

    print 'Loading Catchment Raster'
    fraction = raster.raster(args.fraction)
    r.assign('fraction', fraction)

    # Calculate Velocity Raster
    print 'Calculating Velocity Layer'
    threshold = 30
    velocity = fraction
    r.assign('velocity', velocity)
    r('velocity[which(velocity[]>0)] <- 2')

    print '\tLoading Vegetation Layer'
    d = os.path.dirname(args.vegetation)
    f = os.path.splitext(os.path.basename(args.vegetation))[0]
    veg_shape = rgdal.readOGR(d, f, stringsAsFactors=False)
    r.assign('veg_shape', veg_shape)
    lakes = r('veg_shape[veg_shape$GRIDCODE==20,]')
    lakes_raster = raster.rasterize(lakes, fraction, getCover=True)

    print '\tTransforming lakes layer into raster coordinates'
    r.assign('lakes_raster', lakes_raster)
    r('lakes_raster[which(is.na(fraction[]))] <- 0')
    r.assign('threshold', threshold)
    r('velocity[which(lakes_raster[]>threshold)] <- 0.3')
    velocity = r('velocity')

    print '\tSaving Velocity Layer'
    raster.writeRaster(velocity, filename=os.path.join(args.outdir, 'velocity.asc'), format='ascii', overwrite=True, NAflag=0)

    # Create Diffusion Raster #######
    print '\tCreating Diffusion File'
    r('diffusion <- velocity')
    r('diffusion[which(diffusion[]==2)] <- 2000')
    r('diffusion[which(diffusion[]==0.3)] <- 1300')
    print '\tSaving Diffusion File'
    diffusion = r('diffusion')
    raster.writeRaster(diffusion, filename=os.path.join(args.outdir, 'diffusion.asc'), format='ascii', overwrite=True, NAflag=0)
    print '\tDone saving diffusion file'
Example no. 52
 def __init__(self, X, Y, family):
     """ initialise the class with X and Y
     Input:
         X:        one dimensional numpy array
         Y:        one dimensional numpy array
         family:   clayton or frank or gumbel
         
         Note: the size of X and Y should be same
     """
     # check dimension of input arrays
     if not ((X.ndim==1) and (Y.ndim==1)):
         raise ValueError('The dimension of array should be one.')
     
     # input array should have same size
     if X.size != Y.size:
         raise ValueError('The size of both array should be same.')
     
     # check if the name of copula family correct
     copula_family = ['clayton', 'frank', 'gumbel']
     if family not in copula_family:
         raise ValueError('The family should be clayton or frank or gumbel')
     
     self.X = X
     self.Y = Y
     self.family = family
     
      # estimate Kendall's rank correlation
     xy = np.vstack([X,Y])
     r.assign('xy',xy.T)
     tau = r('tau. <- cor(xy, method="kendall")[1,2]')[0]
     self.xy = xy
     self.tau = tau          
     
     # estimate pearson R and spearman R
     self.pr = r('cor(xy, method="pearson")[1,2]')[0]
     self.sr = r('cor(xy, method="spearman")[1,2]')[0]
             
     # estimate the parameter of copula
     self._get_parameter()
     
     # set U and V to none
     self.U = None
     self.V = None
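A hedged usage sketch, assuming the surrounding class is importable as Copula (the name is an assumption), that R's copula package is attached for _get_parameter(), and that numpy arrays are transferred to R as in the constructor above:

import numpy as np

x = np.random.normal(size=500)
y = 0.6 * x + np.random.normal(size=500)
cop = Copula(x, y, family='clayton')
print(cop.tau, cop.theta)  # Kendall's tau and the implied Clayton parameter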
Example no. 53
    def render(self, dataframe, path):

        R.library('ggplot2')

        # add all indices as columns
        dataframe.reset_index(inplace=True)

        rframe = pandas.rpy.common.convert_to_r_dataframe(dataframe)

        # for the issue below, see:
        # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute
        unAsIs = R('''function (x) {
                         if(typeof(x) %in% c("integer","double")) {
                             class(x) <- "numeric"
                             return (x)}
                         else if (typeof(x) == "character") {
                             class(x) <- "character"
                             return (x) }
                         else {
                             return(x) } }''')

        rframe = R["as.data.frame"](R.lapply(rframe, unAsIs))
        R.assign("rframe", rframe)

        # start plot
        R('''gp = ggplot(rframe)''')

        # add aesthetics and geometries
        try:
            pp = R('''gp + %s ''' % self.statement)
        except ValueError as msg:
            raise ValueError(
                "could not interprete R statement: "
                "gp + %s; msg=%s" % (self.statement, msg))

        figname = re.sub('/', '_', path2str(path))
        r = ResultBlock('#$ggplot %s$#' % figname,
                        title=path2str(path))
        r.rggplot = pp
        r.figname = figname

        return ResultBlocks(r)
Example no. 54
def process(outf, dti_f, bval_f, python=False):
    """
    Take a list of lists of files DTI and b-val files, returns a
    gzip R file with all B0 data arrays stored on it.
    """
    if python:
        import collections
        b0s = collections.OrderedDict()

    for idx, scan in enumerate(bval_f):
        print scan
        basename = os.path.basename(scan)
        print basename
        bval = np.loadtxt(scan)
        bval[np.where(bval==np.min(bval))] = 0
        im = nb.load(dti_f[idx])
        b0_loc = np.where(bval==np.min(bval))[0][0]
        dti = im.get_data()[:,:,:,b0_loc]
        if python:
            b0s[basename] = np.ravel(dti)
        else:
            ro = numpy2ri(np.ravel(dti+1))
            rr = robj.Matrix(ro)
            if idx == 0:
                myl = r.list(basename=rr)
            else:
                myl = r.c(myl, r.list(basename=rr))
    if python:
        import pickle
        # write python dict to a file
        #mydict = {'a': 1, 'b': 2, 'c': 3}
        output = open(outf, 'wb')
        pickle.dump(b0s, output)
        output.close()

        # read python dict back from the file
        # pkl_file = open('myfile.pkl', 'rb')
        # mydict2 = pickle.load(pkl_file)
        # pkl_file.close()
    else:
        r.assign('bar', myl)
        r("save(bar, file='"+outf+"', compress=TRUE)")
Example no. 55
def mtcorrect(p_value_dict, **kwargs):
    """ Apply MT correction.  This is a wrapper for R's p.adjust function.

    Arguments:
     p_value_dict: a dict with keys = probe names, and values of p-values

    kwargs:
     method: MT correction method, from R.  See mtcorrect_methods.  Default is 'none'
      
    Returns:
     adjusted_p
     
 
    """
    continue_flag = True

    method = test_kwarg('method', kwargs, mtcorrect_methods)
    
    try:
        from rpy2.robjects import r
        from rpy2 import robjects
        from rpy2.robjects import numpy2ri
        numpy2ri.activate()
    except ImportError:
        print "ImportError: networkstatistics.mtcorrect() requires a functional rpy2 and R, exiting..."
        continue_flag = False

    if continue_flag:
        row_names = [x for x in p_value_dict.keys()]
        p_values_list = [p_value_dict[id] for id in row_names]
        # need to create an r object first
        p_values_list_r = robjects.FloatVector(p_values_list)
        # need to assign the r object into the r namespace
        r.assign('p_values_list_r', p_values_list_r)
        method_r = mtcorrect_py_2_r_names[method]
        r('corrected_data = p.adjust(p_values_list_r, method = ' + str(method_r) + ')')
        adjusted_p = robjects.numpy2ri.ri2numpy(r('corrected_data'))
        adjusted_p = adjusted_p.tolist()
        adjusted_p = {id: adjusted_p[i] for i, id in enumerate(row_names)}
        return adjusted_p
    else:
        return {}
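A hedged usage sketch (it assumes 'BH' is listed in mtcorrect_methods and that mtcorrect_py_2_r_names maps it onto a quoted R method name, so the generated p.adjust() call is valid R):

pvals = {'probe_a': 0.001, 'probe_b': 0.02, 'probe_c': 0.8}
adjusted = mtcorrect(pvals, method='BH')
print(adjusted)  # e.g. {'probe_a': 0.003, 'probe_b': 0.03, 'probe_c': 0.8}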
Example no. 56
def stat(year):
    print "Calcul des statistiques individuelles"
    
    simul = "C:/til/output/simul.h5"
    simul = HDFStore(simul)
    
    df = simul['entities/register']  
    df = df.loc[df['period']==year]
    # export en R

    not_bool = df.dtypes[df.dtypes != bool]
    df = df.ix[:,not_bool.index]
    r_dataframe = com.convert_to_r_dataframe(df)
    name = 'result_sim'
    r.assign(name, r_dataframe)
    file_dir = "C:/Myliam2/output/" + name+ ".gzip"
    phrase = "save("+name+", file='" +file_dir+"', compress=TRUE)"
    r(phrase) 

    simul.close()
Example no. 57
 def createClassifiers(self, verbose):
     test_date=self.date
     for window in self.training_periods:
         training_start = test_date - datetime.timedelta(days = int(1.7*window))
         day_before = test_date - datetime.timedelta(days=1)
         filename = training_start.strftime('%Y%m%d') + test_date.strftime('%Y%m%d') + self.ticker
         
         # Generate features:
         # (make CSV from the R database [mustn't forget to drop the close column]) 
         if verbose:
             print('Generating feature csvs from R xts..\n')
             print('Current window = {}\n\n'.format(filename))
         r.assign('windowStart', training_start.strftime('%Y-%m-%d'))
         r.assign('windowEnd', day_before.strftime('%Y-%m-%d'))
         r.assign('testDate', test_date.strftime('%Y-%m-%d'))
         r('windowDB<-DB[paste(windowStart,windowEnd,sep="/")]')
         r('windowDB<-subset(windowDB, select = -c(ticker) )')
         r('testDB<-DB[testDate]')
         r('testDB<-subset(testDB, select = -c(ticker) )')
         r.assign('remoteFilename', filename)
         r('write.table(windowDB, file=paste(remoteFilename, "train", sep="."),quote=FALSE,sep=",",eol=";\n",row.names=FALSE,col.names=FALSE)')
         r('write.table(testDB, file=paste(remoteFilename, "test", sep="."),quote=FALSE,sep=",",eol=";\n",row.names=FALSE,col.names=FALSE)')
     
         # Run bash script to invoke learner and generate classifier
         if verbose: print("Creating classifier using Jboost with runADTree.sh bash script..\n\n")
         command_line = '{}/runADTree.sh '.format(os.getcwd()) + os.getcwd() + ' ' + filename
         process = subprocess.Popen([command_line], shell=True)
         retcode = process.wait()
         
         # import classifier from file made on the fly
         if verbose: print("Importing classifier...\n")
         test_file = filename + '.test'
         spec_file = 'spec.spec'
         classifier_name = '{}predict'.format(filename)
         m = __import__(classifier_name, globals(), locals(), ['ADTree'])
         
         # Add new expert to list of experts
         self.experts.append(Expert(getattr(m, 'ATree'), self.trading_days_seen, self.new_exp_freq, (len(self.experts)<=len(self.training_periods))))
         # remove oldest expert
         if len(self.experts)>=self.max_experts: self.experts.pop(0)
         
         # delete all files generated above
         os.remove(filename + '.info')
         os.remove(filename + '.log')
         os.remove(filename + '.output.tree')
         os.remove(filename + '.train')
         os.remove(filename + '.test')
         os.remove(filename + '.test.boosting.info')
         os.remove(filename + '.train.boosting.info')
         os.remove(filename + 'predict.py')
         os.remove(filename + 'predict.pyc')
Example no. 58
def calculate(subset, w0, w1, w2, sub):
	mat_im = subset[0]
	mat_spat = subset[1]
	mat_temp = subset[2]
	# print subset0[:3][:3]
	# print subset1[:3][:3]
	# print subset2[:3][:3]

	l = len(w1)
	dim = len(subset[0])
	new_mat = zeros((dim, dim))
	# print new_mat[:3][:3]
	for i in range(l):
		for y in xrange(dim):
			for x in xrange(dim):
				new_mat[y][x] = w0[i] * mat_im[y][x] + w1[i] * mat_spat[y][x] + w2[i] * mat_temp[y][x]

		mat = array(new_mat, dtype="float64") # <- convert to double precision numeric since R doesn't have unsigned ints
		ro = rpyn.numpy2ri(mat)
		r.assign("bar", ro)
		r("saveRDS(bar, file='./matrix_combined/S"+ str(sub) +"/combined_matrix_"+ str(w0[i]) +"_"+ str(w1[i]) +"_"+ str(w2[i]) +".rds')")