Example #1
0
    def anova(self, design, formula, heteroscedasticity_threshold = 0.05):
        """Fit an ANOVA model in R and collect one value per model term.

        The data comes from ``self.prune_data(self.complete_design_data)``.
        When ``self.test_heteroscedasticity`` reports a problem the model is
        obtained from ``self.transform_lm``; otherwise (or when that helper
        returns ``None``) a plain ``aov`` fit is used.

        Args:
            design: Not used by this method.
            formula: Model formula as a string accepted by R.
            heteroscedasticity_threshold: Threshold forwarded to
                ``self.test_heteroscedasticity``.

        Returns:
            ``(regression, prf_values)`` where ``prf_values`` maps each
            stripped row name of the first summary table (except
            ``"Residuals"``) to the value in its fifth column (presumably
            Pr(>F) -- confirm against R's ``summary.aov``). Both entries are
            ``None`` when any step fails.
        """
        # TODO: Deal better with this, catch the actual R exceptions
        try:
            info("Anova Formula in Python: " + str(formula))
            r_formula = Formula(formula)
            info("Anova Formula in R: " + str(r_formula))

            aov_data = self.prune_data(self.complete_design_data)

            regression = None
            if self.test_heteroscedasticity(aov_data, formula, heteroscedasticity_threshold):
                regression = self.transform_lm(aov_data, formula)

            # Plain aov is both the homoscedastic path and the fallback for
            # a transformation that produced no model.
            if regression is None:
                regression = self.stats.aov(r_formula, aov_data)

            summary_regression = self.stats.summary_aov(regression)
            info("Regression Step:" + str(summary_regression))

            anova_table = summary_regression[0]
            prf_values = {
                term.strip(): value
                for term, value in zip(self.base.rownames(anova_table), anova_table[4])
                if term.strip() != "Residuals"
            }
        except Exception as error:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate.
            info("Regression Step Failed!")
            info("Regression Step Error: " + str(error))
            regression = None
            prf_values = None

        return regression, prf_values
Example #2
0
    def build_drf_model(self, x_old, y):
        """Fit a dose-response model through ``self.gps.hi_est``.

        Args:
            x_old: 2-D array whose last column is the treatment value ("T");
                the remaining columns are covariates.
            y: Outcomes, one per row of ``x_old`` (column "Y").

        Returns:
            ``(distribution, model)``: ``norm`` parameterised with the mean
            and standard deviation of the treatment model's fitted values
            (``result[1]``), and the outcome model (``result[2]``).
        """
        from rpy2.robjects.vectors import StrVector, FactorVector, FloatVector, IntVector
        from rpy2.robjects import Formula, pandas2ri

        covariates = x_old[:, :-1]
        treatment = x_old[:, -1]

        # Layout of the stacked table: covariates..., T, Y
        stacked = np.concatenate(
            [covariates,
             np.reshape(treatment, (-1, 1)),
             np.reshape(y, (-1, 1))],
            axis=-1)
        covariate_count = stacked.shape[-1] - 2
        column_names = np.arange(0, covariate_count).tolist() + ["T", "Y"]
        data_frame = pandas2ri.py2ri(
            Baseline.to_data_frame(stacked, column_names=column_names))

        # Every column except the trailing T and Y enters the treatment model.
        covariate_terms = '+'.join(data_frame.names[:-2])
        treatment_grid = [float(tt) for tt in np.linspace(0, 1, 256)]

        # link_function is not used with treat_mod = "Normal".
        result = self.gps.hi_est(
            Y="Y",
            treat="T",
            treat_formula=Formula('T ~ ' + covariate_terms),
            outcome_formula=Formula('Y ~ T + I(T^2) + gps + T * gps'),
            data=data_frame,
            grid_val=FloatVector(treatment_grid),
            treat_mod="Normal",
            link_function="log"
        )

        treatment_model = result[1]
        model = result[2]
        fitted_values = treatment_model.rx2('fitted.values')
        distribution = norm(np.mean(fitted_values), np.std(fitted_values))
        return distribution, model
Example #3
0
def fetch_stats_totals(des, qn_f, r):
    """Compute the overall (level 0) statistics row for one question.

    Args:
        des: Survey design object handed to the R survey helpers.
        qn_f: Formula string describing the question/response indicator.
        r: Response label, copied into the ``response`` column.

    Returns:
        A one-row DataFrame (rounded to ``DECIMALS`` and passed through
        ``u.fill_none``) with columns level, response, mean, se, ci_l, ci_u,
        count and sample_size. The four estimate columns are ``None`` when
        the confidence-interval object is falsy.
    """
    total_ci = svyciprop_xlogit(Formula(qn_f), des, multicore=False)

    logger.info('fetching stats totals', r=r, q=qn_f)
    # Unweighted counts per value of the indicator.
    counts = pandas2ri.ri2py(
        rsvy.svyby(Formula(qn_f), Formula(qn_f), des,
                   rsvy.unwtd_count, na_rm=True,
                   na_rm_by=True, na_rm_all=True, multicore=False))
    counts.columns = ['eql', 'ct', 'se_ignore']
    ct = counts.ct[counts.eql == 1].sum()
    ss = counts.ct.sum()

    if total_ci:
        mean = u.guard_nan(rbase.as_numeric(total_ci)[0])
        se = u.guard_nan(rsvy.SE(total_ci)[0])
        ci_l = u.guard_nan(rbase.attr(total_ci, 'ci')[0])
        ci_u = u.guard_nan(rbase.attr(total_ci, 'ci')[1])
    else:
        mean = se = ci_l = ci_u = None

    res = {
        'level': 0,
        'response': r,
        'mean': mean,
        'se': se,
        'ci_l': ci_l,
        'ci_u': ci_u,
        'count': ct,
        'sample_size': ss,
    }
    logger.info('finished computation lvl1', res=res,
                total_ci=total_ci, ct=ct, ss=ss)
    # round as appropriate
    rounded = pd.DataFrame([res]).round(DECIMALS)
    return u.fill_none(rounded)
Example #4
0
    def opt_federov(self, design_formula, trials, data, max_iterations = 1000000, nullify = 0):
        """Run ``optFederov`` on a coded copy of ``data``.

        A coding formula ``<p>e ~ (<p> - c) / c`` is generated for each key
        ``p`` of ``self.parameter_ranges``, where
        ``c = (self.parameter_ranges[p][1] - 1.0) / 2.0``. The data is coded
        with ``self.rsm.coded_data`` before being passed to the design search.

        Args:
            design_formula: Formula string for the ``frml`` argument.
            trials: Value for ``nTrials``.
            data: R data object holding the search space.
            max_iterations: Value for ``maxIteration``.
            nullify: Value for ``nullify``.

        Returns:
            Whatever ``self.algdesign.optFederov`` returns.
        """
        info("Starting \"optFederov\" run")
        info("Using Search Space:")
        info(str(self.utils.str(data)))

        coding = {}
        for name in self.parameter_ranges.keys():
            center = (self.parameter_ranges[name][1] - 1.0) / 2.0
            coded_name = "{0}e".format(name)
            coding[coded_name] = Formula(
                "{0}e ~ ({0} - {1}) / {1}".format(name, center))

        info("Encoding formulas: " + str(self.utils.str(ListVector(coding))))
        info("Data Dimensions: " + str(self.base.dim(data)))

        coded_data = self.rsm.coded_data(data, formulas = ListVector(coding))

        info("Coded data: " + str(self.utils.str(coded_data)))

        search_options = dict(frml         = Formula(design_formula),
                              data         = coded_data,
                              nTrials      = trials,
                              nullify      = nullify,
                              nRepeats     = 10,
                              maxIteration = max_iterations)
        return self.algdesign.optFederov(**search_options)
Example #5
0
    def __init__(self,
                 formula_str,
                 df,
                 factors=None,
                 resid_formula_str=None,
                 **lmer_opts):
        """Prepare the R data frame and formulas for a model fit.

        Args:
            formula_str: Model formula; the text before ``~`` names the
                predicted variable.
            df: Record array or pandas DataFrame (converted with
                ``to_records``). A zero-filled column is appended when the
                predicted variable is not already present.
            factors: Optional mapping of column name to factor levels.
                ``None`` as a value means "use default levels". String
                columns not listed are added automatically. The mapping
                passed in is not modified.
            resid_formula_str: Optional second formula, stored as
                ``self._rformula_resid``.
            **lmer_opts: Stored unchanged in ``self._lmer_opts``.
        """
        # get the pred_var
        pred_var = formula_str.split('~')[0].strip()

        # convert df to a recarray if it's a dataframe
        if isinstance(df, pd.DataFrame):
            df = df.to_records()

        # add column if necessary
        if pred_var not in df.dtype.names:
            df = append_fields(df, pred_var, [0.0] * len(df), usemask=False)

        # Work on a private copy: entries are added and rewritten below and
        # the caller's mapping must not change as a side effect.
        factors = {} if factors is None else dict(factors)

        # add in missingarg for any potential factor not provided
        # NOTE(review): df[k][0] assumes at least one row -- confirm callers
        # never pass an empty table.
        for k in df.dtype.names:
            if isinstance(df[k][0], str) and k not in factors:
                factors[k] = MissingArg

        for f in factors:
            levels = factors[f]
            if levels is None:
                factors[f] = MissingArg
            # checking for both types of R Vectors for rpy2 variations
            elif (not isinstance(levels, Vector)
                  and not levels == MissingArg):
                factors[f] = Vector(levels)

        # convert the recarray to a DataFrame (releveling if desired)
        self._rdf = DataFrame({
            k: (FactorVector(df[k], levels=factors[k]) if
                (k in factors) or isinstance(df[k][0], str) else df[k])
            for k in df.dtype.names
        })

        # get the column index of the predicted variable
        self._col_ind = list(self._rdf.colnames).index(pred_var)

        # make a formula obj
        self._rformula = Formula(formula_str)

        # make one for resid if necessary
        self._rformula_resid = (Formula(resid_formula_str)
                                if resid_formula_str else None)

        # save the args
        self._lmer_opts = lmer_opts

        # model is null to start
        self._ms = None
0
def _limma(data: pd.DataFrame, design: pd.DataFrame, alpha: float = 0.05,
           adjust_method: str = 'fdr_bh') -> pd.DataFrame:
    """Wrap limma to perform single sample DE analysis.

    Args:
        data: Expression table; its index supplies the gene list.
        design: Table with a ``Target`` column naming each sample's group.
        alpha: Passed to ``multipletests``.
        adjust_method: Multiple-testing method passed to ``multipletests``.

    Returns:
        The limma ``topTreat`` table as a DataFrame with ``adj.P.Val``
        recomputed; ``logFC`` is set to 0 where ``adj.P.Val > 0.05`` or
        ``|logFC| < 1.3``.
    """
    # Import R libraries
    base = importr('base')
    stats = importr('stats')

    try:
        limma = importr('limma')
    except RRuntimeError as e:
        click.echo(e)
        click.echo("Please check if limma package is installed in R. \n If not, follow the instructions from LINK "
                   "HERE.")
        sys.exit(1)

    # Convert data and design pandas dataframes to R dataframes
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_data = ro.conversion.py2rpy(data)
        r_design = ro.conversion.py2rpy(design)

    # Use the genes index column from data as a R String Vector
    genes = ro.StrVector([str(index) for index in data.index.tolist()])

    # Create a model matrix using design's Target column using the R formula
    # "~0 + f" to get all the unique factors as columns
    f = base.factor(r_design.rx2('Target'), levels=base.unique(r_design.rx2('Target')))
    form = Formula('~0 + f')
    form.environment['f'] = f
    r_design = stats.model_matrix(form)
    r_design.colnames = base.levels(f)

    # Fit the data to the design using lmFit from limma
    fit = limma.lmFit(r_data, r_design)

    # Make a contrasts matrix with the 1st and the last unique values
    contrast_matrix = limma.makeContrasts(f"{r_design.colnames[0]}-{r_design.colnames[-1]}", levels=r_design)

    # Fit the contrasts matrix to the lmFit data & calculate the bayesian fit
    fit2 = limma.contrasts_fit(fit, contrast_matrix)
    fit2 = limma.eBayes(fit2)

    # topTreat the bayesian fit using the contrasts and add the genelist.
    # np.inf (lower case) is the spelling that survives NumPy 2.0.
    r_output = limma.topTreat(fit2, coef=1, genelist=genes, number=np.inf)

    # Convert R dataframe to Pandas
    with localconverter(ro.default_converter + pandas2ri.converter):
        output = ro.conversion.rpy2py(r_output)

    # Adjust P value with the provided adjusted method
    output['adj.P.Val'] = multipletests(output['P.Value'], alpha=alpha, method=adjust_method)[1]

    # Single .loc[rows, column] assignments: the chained form
    # output['logFC'].loc[...] = 0 may write to a temporary copy.
    # NOTE(review): the cut-off is the literal 0.05, not `alpha` -- confirm
    # whether the parameter was meant to drive it.
    output.loc[output['adj.P.Val'] > 0.05, 'logFC'] = 0
    output.loc[np.abs(output['logFC']) < 1.3, 'logFC'] = 0

    return output
Example #7
0
def dirichletreg_df(prop_df,
                    covar_df,
                    formula,
                    onevsrest_category=None,
                    return_reg_input=False):
    """Fit a Dirichlet regression with R's ``DirichletReg`` and tabulate it.

    Args:
        prop_df: DataFrame of compositional responses; exposed to the
            formula as ``y`` after ``DR_data`` conversion.
        covar_df: DataFrame of covariates, concatenated column-wise with
            ``prop_df`` to form the regression input.
        formula: R formula string.
        onevsrest_category: When given, must be a column of ``prop_df``; the
            model is fitted with ``model='alternative'`` and ``sub.comp`` set
            to that column's 1-based position.
        return_reg_input: When true, also return the combined input frame.

    Returns:
        ``coef_df`` (columns coefficient, se, zval, pval, compartment,
        variable, significance, plus coef_type in one-vs-rest mode), or
        ``(dr_df, coef_df)`` when ``return_reg_input`` is true.
    """
    from rpy2.robjects import r, Formula
    from rpy2.robjects.packages import importr
    from rpy2.rinterface_lib.callbacks import logger as rpy2_logger

    dr = importr('DirichletReg')
    dr_df = pd.concat([prop_df, covar_df], axis=1)

    f = Formula(formula)

    # Silence R warnings only while DR_data runs, then put the logger back
    # to whatever level it had before (even if the conversion fails).
    previous_level = rpy2_logger.level
    rpy2_logger.setLevel(logging.ERROR)
    try:
        f.environment['y'] = dr.DR_data(py2r(prop_df))
    finally:
        rpy2_logger.setLevel(previous_level)

    if onevsrest_category is None:
        fit = dr.DirichReg(f, py2r(dr_df))
    else:
        assert onevsrest_category in prop_df.columns
        # R positions are 1-based
        cat_index = prop_df.columns.tolist().index(onevsrest_category) + 1
        fit = dr.DirichReg(f,
                           py2r(dr_df),
                           model='alternative',
                           **{'sub.comp': cat_index})

    # summary() prints to the R console; divert that output and always undo
    # the diversion, including when summary() raises.
    r.sink(file='/dev/null')
    try:
        summary = r.summary(fit)
    finally:
        r.sink()
        if r('sink.number')()[0] > 0:
            r.sink()

    if onevsrest_category is None:
        varnames = summary.rx2('varnames')
    else:
        varnames = [onevsrest_category] * 2

    coef_mat = summary.rx2('coef.mat')
    rows = r2py(r('rownames')(coef_mat))
    coef_df = r2py(r('as.data.frame')(coef_mat)).reset_index(drop=True)
    coef_df.columns = ['coefficient', 'se', 'zval', 'pval']

    coef_df['compartment'] = np.repeat(varnames, r2py(summary.rx2('n.vars')))
    coef_df['variable'] = rows
    coef_df['significance'] = bin_pval(coef_df.pval)

    if onevsrest_category is not None:
        coef_df['coef_type'] = np.repeat(['mean', 'precision'],
                                         r2py(summary.rx2('n.vars')))

    if return_reg_input:
        return dr_df, coef_df
    return coef_df
Example #8
0
def des_from_survey_db(tbl, db, host, port, denovo=False, fpc=False,design='cluster'):
    """Build an R ``svydesign`` object backed by a MonetDB table.

    Strata come from ``~yr+sitecode`` when ``denovo`` is true and from
    ``~strata`` otherwise; the finite population correction formula
    ``~fpc`` is supplied only when ``fpc`` is true (R ``NULL`` otherwise).
    The ``design`` argument is accepted but not used here.
    """
    strata_expr = '~yr+sitecode' if denovo else '~strata'
    design_args = dict(
        id=Formula('~psu'),
        weight=Formula('~weight'),
        strata=Formula(strata_expr),
        nest=True,
        fpc=(Formula('~fpc') if fpc else ro.NULL),
        data=tbl,
        dbname=db,
        host=host,
        port=port,
        dbtype='MonetDB.R',
    )
    return rsvy.svydesign(**design_args)
Example #9
0
 def fit(self, train_data, labels, formula = "class ~ .", feature_names = ""):
     """Train ``self.r_model`` on the given data and return the fitted model.

     Args:
         train_data: Anything ``pd.DataFrame`` accepts; a ``class`` column
             holding ``labels`` is added to the resulting frame.
         labels: Target values, one per row.
         formula: Formula string, or an already-built formula object (used
             as is). Ignored when ``feature_names`` is non-empty.
         feature_names: Optional sequence of column names; when non-empty
             the formula becomes ``class ~`` followed by the joined names.

     Returns:
         The fitted model, also stored in ``self.trained``.
     """
     train_data = pd.DataFrame(train_data)
     train_data["class"] = labels
     with localconverter(ro.default_converter + pandas2ri.converter):
         train_R = ro.conversion.py2rpy(train_data)
     if isinstance(formula, str):
         # len() rather than truthiness so array-like name collections
         # (numpy arrays, pandas Index) do not raise on the check.
         if feature_names is not None and len(feature_names) > 0:
             formula = Formula("class ~" + "+ ".join(feature_names))
         else:
             formula = Formula(formula)
     self.trained = self.r_model(formula, data = train_R, scale = True, type = "eps-regression", kernel = "radial")
     return self.trained
Example #10
0
 def fit(self, train_data, labels, formula = "class ~ .", feature_names = "", test_data = None, kernel = "gaussian"):
     """Store the training data, formula and kernel for later use.

     No model is fitted here: the converted training frame is kept in
     ``self.train_R``, the formula in ``self.formula`` and the kernel name
     in ``self.kernel``.

     Args:
         train_data: Anything ``pd.DataFrame`` accepts; a ``class`` column
             holding ``labels`` is added to the resulting frame.
         labels: Target values, one per row.
         formula: Formula string, or an already-built formula object (used
             as is). Ignored when ``feature_names`` is non-empty.
         feature_names: Optional sequence of column names; when non-empty
             the formula becomes ``class ~`` followed by the joined names.
         test_data: Not used by this method.
         kernel: Kernel name to remember.

     Returns:
         ``self.r_model`` (the callable itself, not a fitted model).
     """
     train_data = pd.DataFrame(train_data)
     train_data["class"] = labels
     with localconverter(ro.default_converter + pandas2ri.converter):
         self.train_R = ro.conversion.py2rpy(train_data)
     if isinstance(formula, str):
         # len() rather than truthiness so array-like name collections
         # (numpy arrays, pandas Index) do not raise on the check.
         if feature_names is not None and len(feature_names) > 0:
             formula = Formula("class ~" + "+ ".join(feature_names))
         else:
             formula = Formula(formula)
     self.formula = formula
     self.kernel = kernel
     return self.r_model
Example #11
0
    def from_rdf(cls, spss_file, rdf):
        """Alternate constructor: build an instance from an R data frame.

        Column definitions and variable annotations are parsed from
        ``spss_file``; the survey design uses ``~psu`` as id, ``~weight`` as
        weight and ``~stratum`` as strata with nested PSUs.
        """
        logging.info('loading column definitions')
        # Parsed for parity with the loading pipeline; the result is not
        # used further in this constructor.
        _column_defs = parse_fwfcols_spss(spss_file)

        logging.info('loading variable annotations')
        annotations = parse_surveyvars_spss(spss_file)

        logging.info('creating survey design from data and annotations')
        design_spec = dict(
            id=Formula('~psu'),
            weight=Formula('~weight'),
            strata=Formula('~stratum'),
            data=rdf,
            nest=True,
        )
        survey_design = rsvy.svydesign(**design_spec)
        return cls(des=survey_design, vars=annotations, rdf=rdf)
Example #12
0
def Xctree(RESPONSE__,
           datatrain__,
           datatest__=None,
           VERBOSE=False,
           TREE_EXPORT=True):
    """Fit a conditional inference tree of ``RESPONSE__`` on all other columns.

    Returns an 8-tuple: the fitted tree, then prediction frame, accuracy and
    confusion data for the training set, the same three for the test set
    (all ``None`` when no test data is given), and the ``visTree`` result.
    The tree is exported with ``tree_export`` when ``visTree`` returned
    something and ``TREE_EXPORT`` is true.
    """
    fitted_tree = ctree(Formula(RESPONSE__+' ~ .'), data=datatrain__)

    train_pred, train_acc, train_cf = getresponseframe(
        datatrain__, fitted_tree, RESPONSE__, olddata=True)

    if datatest__ is None:
        test_pred = test_acc = test_cf = None
    else:
        test_pred, test_acc, test_cf = getresponseframe(
            datatest__, fitted_tree, RESPONSE__)

    tree_view = visTree(fitted_tree, train_pred,
                        PLOT=False,
                        VERBOSE=VERBOSE,
                        ACC=train_acc,
                        ACCx=test_acc,
                        RESP_=RESPONSE__)

    if tree_view is not None and TREE_EXPORT:
        tree_export(tree_view, TYPE='polyline', EXEC=True)

    return (fitted_tree, train_pred, train_acc, train_cf,
            test_pred, test_acc, test_cf, tree_view)
Example #13
0
    def __init__(self,
                 count_matrix,
                 design_matrix,
                 design_formula,
                 gene_column='id'):
        """Store counts, design and formula for a later DESeq2 run.

        ``count_matrix`` must be a DataFrame containing ``gene_column``; that
        column is kept as ``self.gene_id`` and removed from the stored
        counts. ``design_matrix`` is converted to an R object and
        ``design_formula`` is wrapped in a ``Formula``.
        """
        try:
            assert gene_column in count_matrix.columns, 'Wrong gene id column name'
        except AttributeError:
            sys.exit('Wrong Pandas dataframe?')

        self.gene_column = gene_column
        self.gene_id = count_matrix[self.gene_column]

        # Result slots, populated by later processing steps.
        self.dds = None
        self.deseq_result = None
        self.comparison = None
        self.normalized_count_matrix = None

        count_matrix = count_matrix.drop(gene_column, axis=1)

        n_count_columns = count_matrix.shape[1]
        n_design_rows = design_matrix.shape[0]
        print(f'Number of columns in counts data {n_count_columns} | '
              f'Number of rows in design matrix {n_design_rows}')

        # Load dataframe into R environment
        # Important: Change to r.data() if you use numpys and rpy2 latests versions
        r_counts = rpy2.robjects.conversion.py2rpy(count_matrix)
        # Assign columns to NULL
        r_counts.names = rpy2.rinterface.NULL

        # NOTE(review): the converted R object above is discarded; the
        # attribute ends up holding the pandas frame -- confirm this is
        # intended.
        self.count_matrix = count_matrix

        self.design_matrix = rpy2.robjects.conversion.py2rpy(design_matrix)
        self.design_formula = Formula(design_formula)
Example #14
0
    def predict_best_values(self, regression, size, fixed_variables,
                            ordered_prf_keys, prf_values,
                            heteroscedasticity_threshold = 0.05):
        """Refit the model on selected terms and return the best-predicted sample.

        The terms come from ``self.get_ordered_fixed_terms``; with no terms
        the model is updated with ``. ~ .`` (unchanged). A sample of ``size``
        points is generated with ``self.generate_valid_sample`` and the
        first row attaining the minimum prediction is returned.

        Args:
            regression: Fitted R model to update.
            size: Number of candidate points to sample.
            fixed_variables: Forwarded to ``self.generate_valid_sample``.
            ordered_prf_keys: Forwarded to ``self.get_ordered_fixed_terms``.
            prf_values: Forwarded to ``self.get_ordered_fixed_terms``.
            heteroscedasticity_threshold: Not used by this method.

        Returns:
            The first row of the sampled data whose prediction equals the
            minimum prediction.
        """
        unique_variables = self.get_ordered_fixed_terms(ordered_prf_keys, prf_values)
        info("Predicting Best Values for: " + str(unique_variables))

        # Truthiness covers every empty sequence, not only the literal [].
        if not unique_variables:
            model = ". ~ ."
        else:
            model = ". ~ " + " + ".join(unique_variables)

        info("Using Model: " + str(model))
        regression = self.stats.update(regression, Formula(model))

        summary_regression = self.stats.summary_aov(regression)
        info("Prediction Regression Step:" + str(summary_regression))

        #TODO only look at the target variables
        data = self.generate_valid_sample(size, fixed_variables)

        predicted = self.stats.predict(regression, data)

        # Keep the rows whose prediction equals the minimum, then the first.
        best_rows = data.rx(predicted.ro == self.base.min(predicted), True)
        return best_rows.rx(1, True)
Example #15
0
    def _gam_fit_predict(cls, x, y, weights=None, pred_x=None):
        """Fit ``y ~ s(x)`` with R's ``gam`` package and predict.

        Only observations with a positive weight enter the fit. Predictions
        are made at ``pred_x`` (defaults to ``x``).

        Returns:
            ``(y_pred, [deviance, df])`` where ``df`` is the first entry of
            the model's ``df.residual`` component.
        """
        # Defaults: unit weights, predict at the training locations.
        if weights is None:
            weights = np.repeat(1.0, len(x))
        if pred_x is None:
            pred_x = x

        # Training frame restricted to positively weighted points.
        keep = np.where(weights > 0)[0]
        xy = np.array([x, y]).T
        train_frame = pd.DataFrame(xy[keep, :], columns=['x', 'y'])
        r_train = pandas2ri.py2ri(train_frame)

        # Fit the model
        gam_pkg = importr('gam')
        model = gam_pkg.gam(Formula('y~s(x)'),
                            data=r_train,
                            weights=pd.Series(weights[keep]))

        # Predictions
        r_new = pandas2ri.py2ri(pd.DataFrame(pred_x, columns=['x']))
        y_pred = np.array(robjects.r.predict(model, newdata=r_new))

        deviance = np.array(robjects.r.deviance(model))
        components = dict(zip(model.names, list(model)))
        residual_df = components['df.residual'][0]

        return y_pred, [deviance, residual_df]
Example #16
0
def my_evaluate(individual):
    """Score a candidate labelling against four global target measures.

    ``individual`` is written into the global ``dataFrame`` as the ``label``
    column, the frame is published to R's global environment, and the
    absolute gaps to the global targets are returned in the order
    (balance C2, linearity L2, neighborhood N1, neighborhood N2).
    """
    dataFrame['label'] = individual
    robjects.globalenv['dataFrame'] = dataFrame
    fmla = Formula('label ~ .')

    def _gap(measure_fn, measure, target):
        # First value of the requested measure vs. its global target.
        scores = measure_fn(fmla, dataFrame, measures=measure, summary="return")
        return abs(target - scores.rx(1)[0][0])

    linearity_gap = _gap(stringr_c.linearity_formula, "L2", globalLinear)
    n1_gap = _gap(stringr_c.neighborhood_formula, "N1", globalN1)
    n2_gap = _gap(stringr_c.neighborhood_formula, "N2", globalN2)
    balance_gap = _gap(stringr_c.balance_formula, "C2", globalBalance)

    return balance_gap, linearity_gap, n1_gap, n2_gap
Example #17
0
    def __init__(self, df, design):
        """Load the counts, then build the design, run DESeq2 and fetch results.

        :param df: A data frame formed by merging files, or a list of files.
            A DataFrame is copied; a list is loaded through
            ``Df(df, 'Count')``.

        :param design: Number of samples in the first treatment group.

                    treatment
        sampleA1        A
        sampleA2        A
        sampleB1        B
        sampleB2        B

        :raises TypeError: if ``df`` is neither a DataFrame nor a list.
        """
        # isinstance also accepts DataFrame/list subclasses; anything else
        # is rejected here rather than failing later on a missing self.df.
        if isinstance(df, pd.DataFrame):
            self.df = df.copy()
        elif isinstance(df, list):
            self.df = Df(df, 'Count').df
        else:
            raise TypeError(
                'df must be a pandas DataFrame or a list of files, got '
                + type(df).__name__)
        self.design = design
        self.design_formula = Formula('~ treatment')
        self.design_matrix = None
        self.dds = None
        self.normalized_count_matrix = None
        self.result = None
        # The pipeline runs eagerly at construction time.
        self.design_design_matrix()
        self.run_deseq2()
        self.get_result()
Example #18
0
 def generate_slices(self, qn, response, vars=None, filt=None):
     """List the slice specifications to compute for one question/response.

     Args:
         qn: Question identifier, copied into each result as ``'q'``.
         response: Response identifier, copied into each result as ``'r'``.
         vars: Breakout variable names (``None`` means no breakouts).
         filt: Filter mapping applied to the data before cross-tabulation
             (``None`` means no filter); copied into each result as ``'f'``.

     Returns:
         A list of dicts with keys ``q``, ``r``, ``f`` and ``s``. The first
         entry has an empty slice selector ``s``; the others carry one
         selector per non-empty prefix combination of ``vars`` levels.
     """
     # None defaults instead of shared mutable defaults: the filter object
     # is handed out in the results and must not be shared between calls.
     vars = [] if vars is None else vars
     filt = {} if filt is None else filt

     # create the overall filter
     filt_fmla = u.fmla_for_filt(filt)
     # subset the rdf as necessary
     subs = subset_des_wexpr(self.rdf,
                             filt_fmla) if len(filt) > 0 else self.rdf
     # create a formula for generating the cross-tabs/breakouts across
     #   the selected vars
     lvl_f = Formula('~%s' % ' + '.join(vars)) if len(vars) > 0 else None
     # generate the crosstab/breakouts for the selected vars,
     #   turn them into R selector expressions and concatenate
     #   each non-empty selector with the R selector for the outer filter
     calls = thread_first(
         rstats.xtabs(lvl_f, subs), rbase.as_data_frame, pandas2ri.ri2py,
         (pd.DataFrame.query, "Freq > 0"),
         (pd.DataFrame.get, vars),
         lambda df: df.apply(
             lambda z: thread_last(
                 z.to_dict(),
                 lambda y: [(v, y[v]) for v in vars],
                 list,
                 # every prefix of the (var, level) pairs becomes a slice
                 lambda x: [tuple(x[:i + 1]) for i in range(len(x))],
             ),
             axis=1),
         (pd.DataFrame.to_records, False), list, concat, set, map(dict),
         list) if len(vars) > 0 else []
     # add the base case with empty slice filter
     #   and dicts of qn/resp fmla, slice selector fmla, filt fmla
     res = [{
         'q': qn,
         'r': response,
         'f': filt,
         's': s
     } for s in [{}, *calls]]
     return res
Example #19
0
def gam_predict(location_csv, prediction_file, num_arrived, k_value):
    """Fit a binomial GAM on the first rows and predict for every row.

    Args:
        location_csv: Header-less CSV path with the two location columns
            (named ``x`` and ``y`` after loading).
        prediction_file: Header-less CSV path with the ``shelter`` column.
        num_arrived: Number of leading rows used to fit the model.
        k_value: Basis dimension inserted into ``s(x,y,k=...)``.

    Returns:
        A list with the ``type="response"`` predictions for all rows.
    """
    # Imported unconditionally inside the function. Binding these names in
    # only one branch makes them function locals everywhere, so skipping
    # that branch raised UnboundLocalError on first use. Repeated imports
    # are served from Python's module cache.
    from rpy2.robjects import Formula
    from rpy2.robjects.packages import importr
    base = importr('base')
    utils = importr('utils')
    mgcv = importr('mgcv')

    def _load_population(**read_kwargs):
        # Combine prediction and location columns as shelter, x, y.
        loc = utils.read_csv(location_csv, header=False, **read_kwargs)
        pred = utils.read_csv(prediction_file, header=False, **read_kwargs)
        frame = base.cbind(pred, loc)
        frame.colnames = ["shelter", "x", "y"]
        return frame

    # GAM fitted on the rows that have arrived so far
    pop = _load_population(nrows=num_arrived)
    formula = Formula('shelter~s(x,y,k={})'.format(k_value))
    m = mgcv.gam(formula, family="binomial", method="REML", data=pop)

    # Predict for everyone
    newd = _load_population()
    result = mgcv.predict_gam(m, newd, type="response", se_fit=False)

    return list(result)
Example #20
0
    def ddx(self, contrasts=None, formula=None):
        """Run differential expression on the transcriptome counts.

        Args:
            contrasts: Metadata columns forming the design matrix; defaults
                to ``self.contrasts``.
            formula: Design formula string; defaults to ``~`` followed by
                the contrasts joined with ``+``.

        Returns:
            The ``DE_Genes`` result as a DataFrame indexed by ``gene``.
        """
        if contrasts is None:
            contrasts = self.contrasts
        if formula is None:
            # Built from the contrasts actually in use so the formula always
            # matches the columns of the design matrix below.
            formula = "~" + "+".join(contrasts)

        df = self.data["Transcriptome Profiling"]['counts'].astype(int)
        # Align the metadata rows with the count columns.
        design = self.metadata[contrasts].reindex(df.columns).reset_index()
        formula = Formula(formula)

        DEG = pandas2ri.ri2py_dataframe(
            DE_Genes(counts_df=pandas2ri.py2ri(df),
                     design_matrix=pandas2ri.py2ri(design),
                     design_formula=formula)).set_index("gene")

        # NOTE: a characteristic-direction (geode.chdir) analysis was
        # sketched here previously but is not part of the result.
        return DEG
Example #21
0
    def __init__(self,
                 count_matrix,
                 design_matrix,
                 design_formula,
                 gene_column='id'):
        """Convert the inputs to R objects and create the DESeq data set.

        ``count_matrix`` must be a DataFrame containing ``gene_column``; it
        is indexed by that column before conversion. The remaining column
        names are kept as ``self.samplenames``.
        """
        try:
            assert gene_column in count_matrix.columns, 'Wrong gene id column name'
            # Lookup kept inside the guard so a non-DataFrame input is
            # reported through the same exit path.
            count_matrix[gene_column]
        except AttributeError:
            sys.exit('Wrong Pandas dataframe?')

        # Result slots, populated by later processing steps.
        self.dds = None
        self.result = None
        self.deseq_result = None
        self.resLFC = None
        self.comparison = None
        self.normalized_count_df = None

        self.gene_column = gene_column
        self.gene_id = count_matrix[self.gene_column]

        is_sample_column = count_matrix.columns != self.gene_column
        self.samplenames = count_matrix.columns[is_sample_column]

        conversion_rules = robjects.default_converter + pandas2ri.converter
        with localconverter(conversion_rules):
            indexed_counts = count_matrix.set_index(self.gene_column)
            self.count_matrix = robjects.conversion.py2rpy(indexed_counts)
            self.design_matrix = robjects.conversion.py2rpy(design_matrix)

        self.design_formula = Formula(design_formula)
        self.dds = deseq.DESeqDataSetFromMatrix(
            countData=self.count_matrix,
            colData=self.design_matrix,
            design=self.design_formula)
Example #22
0
def deseq2_basic(data_frame,
                 numerator=2,
                 denominator=1,
                 category_field='Category',
                 sample_field='Sample',
                 batch_field=None,
                 expression_name_field='Name',
                 counts_field='NumReads'):
    """Run a basic DESeq2 analysis from a long-format data frame.

    The frame is pivoted to an expression-by-sample count matrix; the
    design is ``~ category`` or ``~ batch + category`` (field names
    back-quoted). Category values are recoded with ``_trans``.

    Returns:
        ``(dds0, dds1, res, mat, meta)``: the data set before and after
        ``DESeq``, the results table indexed by expression name, the count
        matrix and the per-sample metadata.
    """
    # from a dataframe
    # https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2
    has_batch = batch_field is not None

    terms = [batch_field, category_field] if has_batch else [category_field]
    design = Formula('~ ' + ' + '.join('`' + term + '`' for term in terms))

    mat = data_frame.pivot(columns=sample_field,
                           index=expression_name_field,
                           values=counts_field)

    meta_fields = [sample_field, category_field]
    if has_batch:
        meta_fields.append(batch_field)
    # One metadata row per sample, ordered like the matrix columns.
    per_sample = data_frame[meta_fields].groupby(sample_field).first()
    meta = per_sample.loc[mat.columns]

    recoded = meta[category_field].apply(
        lambda x: _trans(x, numerator, denominator))
    metaarr = {category_field: robjects.IntVector(recoded)}
    if has_batch:
        metaarr[batch_field] = robjects.IntVector(meta[batch_field])

    dds0 = deseq.DESeqDataSetFromMatrix(countData=mat.astype(int),
                                        colData=robjects.DataFrame(metaarr),
                                        design=design)
    dds1 = deseq.DESeq(dds0)

    res = rpy2.robjects.pandas2ri.ri2py(as_df(deseq.results(dds1)))
    res.index = mat.index
    res.index.name = expression_name_field
    return (dds0, dds1, res, mat, meta)
Example #23
0
def fit_glmgp(y, coldata, design="~ log10_umi"):
    """Fit ``glmGamPoi::glm_gp`` and extract theta and two coefficients.

    Note: calling this activates the global pandas2ri and numpy2ri
    converters.

    Returns:
        Dict with ``theta`` (element 0 of the element-wise minimum of
        ``1 / overdispersions[0]`` and ``mean(Mu, axis=1) / 1e-4``),
        ``Intercept`` (``Beta[0][0]``) and ``log10_umi`` (``Beta[0][1]``).
    """
    import rpy2
    import rpy2.robjects.numpy2ri
    from rpy2.robjects import Formula
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr

    pandas2ri.activate()
    rpy2.robjects.numpy2ri.activate()

    glmgp = importr("glmGamPoi")
    fit = glmgp.glm_gp(data=np.asmatrix(y),
                       design=Formula(design),
                       col_data=coldata,
                       size_factors=False)

    def _component(name):
        # Look a fit component up by its position in the names vector.
        return fit[fit.names.index(name)]

    overdispersions = _component("overdispersions")
    mu = _component("Mu")
    beta = _component("Beta")[0]

    theta_candidates = np.vstack((1 / overdispersions[0],
                                  np.mean(mu, axis=1) / 1e-4))
    return {
        "theta": theta_candidates.min(axis=0)[0],
        "Intercept": beta[0],
        "log10_umi": beta[1],
    }
Example #24
0
    def __init__(self, data=None, name='regression', formula=None, var_transform=False, *args, **kwargs):
        """Set up a regression wrapper around R's ``lm``.

        Args:
            data: Input table; forwarded to the parent initialiser and
                deep-copied into ``self._copy_data``.
            name: Forwarded to the parent initialiser.
            formula: R formula string. The untouched value stays available
                as ``self._origin_formula``.
            var_transform: When true, the copied data's columns are renamed
                to ``var_0``, ``var_1``, ... and the same renaming is applied
                to the formula (useful when column names are not valid R
                identifiers, e.g. non-ASCII names).
        """
        super().__init__(data=data, name=name, *args, **kwargs)

        # Create the REnv instance
        self._renv = REnv()
        # Working formula and the original, untransformed formula
        self._formula = formula
        self._origin_formula = self._formula
        self._copy_data = deepcopy(data)

        # Rename variables, in particular those with non-ASCII names
        self._var_transform = var_transform
        if self._var_transform:
            columns = list(self._copy_data.columns)
            aliases = ['_'.join(['var', str(i)]) for i in range(len(columns))]
            self._variables_mapping = list(zip(columns, aliases))
            self._variables_mapping_dict = dict(self._variables_mapping)
            self._variables_mapping_dict_reversed = dict(zip(aliases, columns))
            self._generated_variables = list(aliases)
            self._copy_data.columns = self._generated_variables

            mapping = self._variables_mapping_dict
            if mapping:
                # One pass over the formula with every name escaped:
                #  - names are matched literally, not as regex patterns;
                #  - longer names win over names they contain;
                #  - text already replaced is never scanned again;
                #  - the lookarounds keep a name from matching inside a
                #    longer identifier (e.g. "x" inside "exp").
                alternatives = '|'.join(
                    re.escape(column)
                    for column in sorted(mapping, key=len, reverse=True))
                pattern = re.compile(
                    r'(?<![\w.])(?:' + alternatives + r')(?![\w.])')
                self._formula = pattern.sub(
                    lambda match: mapping[match.group(0)], self._formula)

        self._formula = Formula(self._formula)

        self._lm = importr('stats').lm
        self._summary = importr('base').summary
Example #25
0
    def run_deseq(self, **kwargs):
        """
        Run DESeq on ``self.dds`` and record the available result names.

        Args:
            **kwargs: Any keyword arguments for DESeq. A ``reduced`` entry
                is wrapped in a ``Formula`` before the call.

        Signature from the DESeq2 manual, for reference:

        DESeq(
            object,
            test = c("Wald", "LRT"),
            fitType = c("parametric", "local", "mean", "glmGamPoi"),
            sfType = c("ratio", "poscounts", "iterate"),
            betaPrior,
            full = design(object),
            reduced,
            quiet = FALSE,
            minReplicatesForReplace = 7,
            modelMatrixType,
            useT = FALSE,
            minmu = if (fitType == "glmGamPoi") 1e-06 else 0.5,
            parallel = FALSE,
            BPPARAM = bpparam()
        )
        """
        if 'reduced' in kwargs:
            kwargs['reduced'] = Formula(kwargs['reduced'])

        fitted = deseq.DESeq(self.dds, **kwargs)
        self.dds = fitted
        self.comparison = list(deseq.resultsNames(fitted))
Example #26
0
 def __init__(self,
              count_matrix,
              design_matrix,
              design_formula,
              gene_column='gene_id'):
     """Convert the inputs to R objects and create the DESeq data set.

     ``gene_column`` must be the name of the first column of
     ``count_matrix``. Counts (without that column) are cast to ``int`` and
     the design matrix to ``bool`` before conversion.
     """
     print("you need to have R installed with the DESeq2 library installed")
     try:
         assert gene_column == count_matrix.columns[0], \
             "no $gene_column name in 1st column's name"
         # Lookup kept inside the guard so a non-DataFrame input is
         # reported through the same exit path.
         count_matrix[gene_column]
     except AttributeError:
         sys.exit('Wrong Pandas dataframe?')
     print(rpy2.__version__)

     # Result slots, populated by later processing steps.
     self.deseq_result = None
     self.resLFC = None
     self.comparison = None
     self.normalized_count_matrix = None

     self.gene_column = gene_column
     self.gene_id = count_matrix[self.gene_column]

     conversion_rules = ro.default_converter + pandas2ri.converter
     with localconverter(conversion_rules):
         integer_counts = count_matrix.drop(gene_column, axis=1).astype(int)
         self.count_matrix = pandas2ri.py2rpy(integer_counts)
         self.design_matrix = pandas2ri.py2rpy(design_matrix.astype(bool))

     self.design_formula = Formula(design_formula)
     self.dds = deseq.DESeqDataSetFromMatrix(
         countData=self.count_matrix,
         colData=self.design_matrix,
         design=self.design_formula)
Example #27
0
    def __init__(self, arch_lags, garch_lags, ar_lags=0, ma_lags=0):
        """Record the ARMA-GARCH orders and prepare the R formula.

        Coefficient names follow the pattern ``ar1..``, ``ma1..``,
        ``alpha1..`` (ARCH) and ``beta1..`` (GARCH). The formula has the
        form ``~arma(ar,ma)+garch(p,q)`` with ``p = arch_lags`` and
        ``q = garch_lags``.
        """
        # import R packages
        self.r_base = importr("base")
        self.r_fGarch = importr("fGarch")
        self.r_stats = importr("stats")

        # model specs
        self.arch_lags = arch_lags
        self.garch_lags = garch_lags
        self.ar_lags = ar_lags
        self.ma_lags = ma_lags

        # assign coefficient names: omega, ar1,...,ma1,...,alpha1,...,beta1,...
        self.ar_names = [f"ar{lag}" for lag in range(1, ar_lags + 1)]
        self.ma_names = [f"ma{lag}" for lag in range(1, ma_lags + 1)]
        self.arch_names = [f"alpha{lag}" for lag in range(1, arch_lags + 1)]
        self.garch_names = [f"beta{lag}" for lag in range(1, garch_lags + 1)]

        # R formula of the equation
        mean_part = f"arma({self.ar_lags:1d},{self.ma_lags:1d})"
        variance_part = f"garch({self.arch_lags:1d},{self.garch_lags:1d})"
        self.formula = Formula("~" + mean_part + "+" + variance_part)
Example #28
0
def my_evaluate(individual):
    """Score a candidate labelling against the target complexity values.

    ``individual`` is written into the ``label`` column of the module-level
    ``dataFrame`` (mutating it), which is then published to R's global
    environment. For every metric id found in ``metricasList`` the matching
    measure is computed for the formula ``label ~ .`` and the absolute
    distance to its module-level target value is collected.

    Metric ids and the measure each one requests:
        "1" balance C2, "2" linearity L2, "3" neighborhood N1,
        "4" network ClsCoef, "5" dimensionality T2, "6" overlapping F1.

    Args:
        individual: label values assigned to ``dataFrame['label']``.

    Returns:
        tuple: one distance per selected metric, in metric-id order. A
        tuple is returned for any number of selected metrics; the
        if/elif chain this replaces only handled 2, 3 or 4 metrics and
        returned ``None`` for every other count.
    """
    distances = []
    dataFrame['label'] = individual
    robjects.globalenv['dataFrame'] = dataFrame
    fmla = Formula('label ~ .')

    def first_result(measure_fn, measure):
        # Run one measure on the labelled data and keep its first element.
        result = measure_fn(fmla, dataFrame, measures=measure,
                            summary="return")
        return result.rx(1)

    if "1" in metricasList:
        # -- imbalance C2
        imbalance = first_result(stringr_c.balance_formula, "C2")
        distances.append(abs(globalBalance - imbalance[0][0]))
    if "2" in metricasList:
        # -- linearity L2
        linearity = first_result(stringr_c.linearity_formula, "L2")
        distances.append(abs(globalLinear - linearity[0][0]))
    if "3" in metricasList:
        # -- neighborhood N1 (the measure requested and the target compared
        # against are both N1)
        n1 = first_result(stringr_c.neighborhood_formula, "N1")
        distances.append(abs(globalN1 - n1[0][0]))
    if "4" in metricasList:
        # -- network ClsCoef
        cls_coef = first_result(stringr_c.network_formula, "ClsCoef")
        distances.append(abs(globalClsCoef - cls_coef[0][0]))
    if "5" in metricasList:
        # -- dimensionality T2
        t2 = first_result(stringr_c.dimensionality_formula, "T2")
        # NOTE(review): indexed with [0] where every other measure uses
        # [0][0]; kept as-is -- confirm this matches the shape T2 returns.
        distances.append(abs(globalt2 - t2[0]))
    if "6" in metricasList:
        # -- feature-based F1
        f1 = first_result(stringr_c.overlapping_formula, "F1")
        distances.append(abs(globalf1 - f1[0][0]))

    # One entry per selected metric, whatever the count.
    return tuple(distances)
Example #29
0
def spectra_difference(counts_table, group_label, test=False):
    """Fit the Poisson GLM ``count ~ direction + <group_label>`` in R.

    Args:
        counts_table: table exposing ``header`` and ``get_columns``; its
            header must include ``count``, ``direction`` and
            ``group_label``.
        group_label: column name for the category.
        test: when true, print the formula string.

    Returns:
        tuple ``(total_re, dev, df, collated, formula)``:
        ``dev`` and ``df`` are the fit's ``deviance`` and ``df.residual``;
        ``total_re`` is ``dev`` passed through ``DevianceToRelativeEntropy``
        (built from the total of the ``count`` column); ``collated`` is the
        model data with added ``fitted`` and ``ret`` columns, sorted by
        ``count`` then ``direction``; ``formula`` is the formula string.
    """
    # we compare direction between group
    required = ['count', 'direction', group_label]
    assert set(required).issubset(counts_table.header)

    formula = "count ~ direction + %s" % group_label
    r_formula = Formula(formula)
    if test:
        print(formula)

    frame = as_dataframe(counts_table.get_columns(required))
    fit = R.glm(r_formula, data=frame, family="poisson")
    fit_parts = dict(fit.items())
    deviance = fit_parts['deviance'][0]
    residual_df = fit_parts['df.residual'][0]

    collated = convert_rdf_to_pandasdf(fit_parts['data'])
    collated['fitted'] = list(fit_parts['fitted.values'])
    to_relative_entropy = DevianceToRelativeEntropy(collated['count'].sum())
    calc_ret = CalcRet(to_relative_entropy)
    total_re = to_relative_entropy(deviance)

    collated['ret'] = calc_ret(collated['count'], collated['fitted'])
    # Keep only the model columns plus the two derived ones, then order
    # rows by every required column except the group label.
    output_columns = required + ['fitted', 'ret']
    collated = collated.reindex(output_columns, axis=1)
    collated = collated.sort_values(by=required[:-1])
    return total_re, deviance, residual_df, collated, formula
Example #30
0
    def __init__(self,
                 count_matrix,
                 design_matrix,
                 design_formula,
                 feature_column='id',
                 var_column='condition',
                 exons=None,
                 genes=None,
                 threads=1):
        """Store the inputs of a DEXSeq run, converted to R objects.

        Args:
            count_matrix: table exposing ``columns`` and ``drop``; must
                contain ``feature_column``.
            design_matrix: table exposing ``columns``; must contain
                ``var_column``.
            design_formula: formula string wrapped in an R ``Formula``.
            feature_column: name of the id column in ``count_matrix``.
            var_column: name of the variable column in ``design_matrix``.
            exons: stored unchanged on ``self.exons``.
            genes: stored unchanged on ``self.genes``.
            threads: worker count handed to ``bp.MulticoreParam``.

        Raises:
            AssertionError: if either required column is missing.
            SystemExit: if an input has no ``columns`` attribute.
        """
        try:
            assert feature_column in count_matrix.columns, 'Wrong gene id column name'
            assert var_column in design_matrix.columns, 'Wrong var column for DEXSeq'
        except AttributeError:
            # An input without ``.columns`` is not a usable data frame.
            sys.exit('Wrong Pandas dataframe?')

        # Result slots; nothing in this constructor fills them.
        self.dxd = self.dxd_res = self.dexseq_result = None
        self.comparison = self.normalized_count_matrix = None

        # Plain configuration.
        self.feature_column = feature_column
        self.var_column = var_column
        self.exons = exons
        self.genes = genes

        # Keep the ids aside; the R-side counts exclude the id column.
        self.gene_id = count_matrix[feature_column]
        counts_only = count_matrix.drop(feature_column, axis=1)
        # NOTE(review): ``py2ri`` is the rpy2 2.x spelling (rpy2 3.x names
        # it ``py2rpy``) -- confirm the rpy2 version this targets.
        self.count_matrix = pandas2ri.py2ri(counts_only)
        self.design_matrix = pandas2ri.py2ri(design_matrix)
        self.design_formula = Formula(design_formula)
        self.BPPARAM = bp.MulticoreParam(workers=threads)
Example #31
0
# Convert the analytical set to an R object and inspect it.
# print is written in single-argument call form throughout so the script
# behaves the same on Python 2 and parses on Python 3.
r_analytical_set = pandas2ri.py2ri(analytical_set)
print(r_analytical_set)
print(type(r_analytical_set))

# get summary
print(R.table(r_analytical_set.rx('pass')))
# NOTE(review): 846/5062 is hard-coded; presumably the pass count over the
# total taken from the table above -- confirm against the data.
R('p <- 846/5062')
R('odds <- p/(1 - p)')
R('logit <- log(p/(1 - p))')
R('invlogit <- function(x){ exp(x)/(1 + exp(x)) }')
R('invlogit(logit)')

#formula = 'pass~n'

# Build the formula as an R object and bind both of its variables in the
# formula's own environment.
from rpy2.robjects import Formula
formula = Formula('pass~n')
formula.getenvironment()['pass'] = r_analytical_set.rx2('pass')
formula.getenvironment()['n'] = r_analytical_set.rx2('n')

#fit = R.glm(formula=formula, data=r_analytical_set,   family=R('binomial(link="logit")'))
# Fit a binomial GLM with a logit link through the imported stats package.
import rpy2.robjects.packages as rpacks
stats = rpacks.importr("stats")
fit = stats.glm(formula = formula,
                family = stats.binomial(link = "logit"),
                data=r_analytical_set)

s = R.summary(fit)
print(fit)
print(R.summary(fit))

R.plot(formula,
Example #32
0
def getSimpleFormula(x, y):
    """Return the R formula ``y ~ x`` with both variables bound.

    Args:
        x: value stored under the name ``x`` in the formula's environment.
        y: value stored under the name ``y`` in the formula's environment.

    Returns:
        The ``Formula`` object, ready to be passed to an R function.
    """
    simple = Formula("y ~ x")
    # Bind x first, then y, inside the formula's environment.
    for name, value in (("x", x), ("y", y)):
        simple.environment[name] = value
    return simple
Example #33
0
xyplot = lattice.xyplot
#-- setupxyplot-end
# NOTE(review): the "#-- <name>-begin/end" lines look like snippet
# delimiters read by external tooling; explanatory comments are kept
# outside the delimited regions on purpose -- confirm before moving them.

# Random example data built with R's rnorm.
# NOTE(review): "+" on rpy2 vectors presumably concatenates rather than
# adds, giving 300 + 100 values to match the 400-entry 'mean' column --
# confirm against the rpy2 vector documentation.
#-- dataset-begin
rnorm = stats.rnorm
dataf_rnorm = robjects.DataFrame({'value': rnorm(300, mean=0) + rnorm(100, mean=3),
                                  'other_value': rnorm(300, mean=0) + rnorm(100, mean=3),
                                  'mean': IntVector([0, ]*300 + [3, ] * 100)})
#-- dataset-end

# Figure 1: open a 612x612 PNG device, then plot mtcars 'mpg' against 'wt'
# with lattice.xyplot; both variables are bound in the formula's environment.
grdevices.png('../../_static/graphics_lattice_xyplot_1.png',
              width = 612, height = 612, antialias="subpixel", type="cairo")
#-- xyplot1-begin
datasets = importr('datasets')
mtcars = datasets.mtcars
formula = Formula('mpg ~ wt')
formula.getenvironment()['mpg'] = mtcars.rx2('mpg')
formula.getenvironment()['wt'] = mtcars.rx2('wt')

p = lattice.xyplot(formula)
rprint(p)
#-- xyplot1-end
# Close the device opened for figure 1.
grdevices.dev_off()

# Figure 2: same formula on a new PNG device, grouped by the mtcars 'cyl'
# column.
grdevices.png('../../_static/graphics_lattice_xyplot_2.png',
    width = 612, height = 612, antialias="subpixel", type="cairo")
#-- xyplot2-begin
p = lattice.xyplot(formula, groups = mtcars.rx2('cyl'))
rprint(p)
#-- xyplot2-end
# Close the device opened for figure 2.
grdevices.dev_off()