Example #1
    def get_pvalue(sorted_scores, stat, n):
        # approximate the gpd tail
        n_exceed = 250
        is_gpd_fitted = False
        while n_exceed >= 10:
            exceedances = sorted_scores[:n_exceed]
            # check if the n_exceed largest permutation values follow GPD
            #   with Anderson-Darling goodness-of-fit test
            try:
                ad = eva.gpdAd(FloatVector(exceedances))
                ad_pval = ad.rx2('p.value')[0]
            except Exception:
                n_exceed -= 10
                continue
            # H0 = exceedances come from a GPD
            if ad_pval > 0.05:
                is_gpd_fitted = True
                break
            n_exceed -= 10
        if not is_gpd_fitted:
            #print('GPD good fit is never reached - use ECDF instead...')
            return (None)
        # compute the exceedance threshold t
        t = float((sorted_scores[n_exceed] + sorted_scores[n_exceed - 1]) / 2)
        # estimate shape and scale params with maximum likelihood
        gpd_fit = eva.gpdFit(FloatVector(sorted_scores),
                             threshold=t,
                             method='mle')
        scale, shape = gpd_fit.rx2('par.ests')[0], gpd_fit.rx2('par.ests')[1]

        # compute the GPD p-value for the observed statistic
        f_gpd = genpareto.cdf(x=stat - t, c=shape, scale=scale)
        return (n_exceed / n * (1 - f_gpd))
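A minimal usage sketch (not from the original project), assuming get_pvalue is reachable at module scope together with the globals it relies on (eva from importr('eva'), FloatVector from rpy2, genpareto from scipy.stats):

import numpy as np
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
from scipy.stats import genpareto

eva = importr('eva')  # R package providing gpdAd and gpdFit

# permutation scores sorted in descending order, plus an observed statistic
perm_scores = np.random.normal(size=10000)
sorted_scores = np.sort(perm_scores)[::-1].tolist()
observed_stat = 4.1

pval = get_pvalue(sorted_scores, observed_stat, n=len(sorted_scores))
print(pval)  # None means no acceptable GPD fit was found (fall back to the ECDF)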
Example #2
def sampleSizeRest():
    # Get the parsed contents of the form data
    data = request.json
    #print(json)

    k = data["k"].split(',')
    prev = data["prev"]
    N = data["N"]
    unique_id = data["unique_id"]
    fixed_flag = data["fixed_flag"]
    sens = data["sens"].split(',')
    spec = data["spec"].split(',')

    start = time.time()
    print "Starting Benchmark"

    if fixed_flag == "Specificity":
        jsonrtn = (wrapper.saveAllSensGraphs(IntVector(k), FloatVector(sens),
                                             FloatVector(spec), float(prev),
                                             IntVector(N), unique_id))
    else:
        jsonrtn = (wrapper.saveAllSpecGraphs(IntVector(k), FloatVector(sens),
                                             FloatVector(spec), float(prev),
                                             IntVector(N), unique_id))

    #end=time.time()
    #print "Seconds"
    #print end - start

    jsonlist = list(jsonrtn)

    #2
    jsonstring = ''.join(jsonlist)
    print(jsonstring)
    return jsonstring
Example #3
def adjust_pvalue(data):
    stats = importr('stats')
    p_adjustBH = stats.p_adjust(FloatVector(data.pval.tolist()), method='BH')
    data["BH"] = p_adjustBH
    p_adjustBonferroni = stats.p_adjust(FloatVector(data.pval.tolist()),
                                        method='bonferroni')
    data["Bonferroni"] = p_adjustBonferroni
    return data
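A small usage sketch with hypothetical data, assuming a pandas DataFrame with a 'pval' column and the rpy2 imports (importr, FloatVector) the function relies on at module level:

import pandas as pd
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

df = pd.DataFrame({'gene': ['A', 'B', 'C', 'D'],
                   'pval': [0.001, 0.02, 0.3, 0.8]})
df = adjust_pvalue(df)  # adds 'BH' and 'Bonferroni' columns
print(df[['gene', 'pval', 'BH', 'Bonferroni']])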
Example #4
def xicor(x, y, return_pval=False):
    x, y = remove_missing_values(x, y)
    if return_pval:
        xi, sd, pval = XICOR.xicor(FloatVector(x), FloatVector(y), pvalue=True)
        return xi[0], pval[0]
    xi = XICOR.xicor(FloatVector(x), FloatVector(y), pvalue=False)
    return xi[0]
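A hypothetical call, assuming XICOR = importr('XICOR') and the remove_missing_values helper exist at module scope as in the original project:

x = [1.0, 2.0, 3.0, 4.0, 5.0]
y = [1.1, 1.9, 3.2, 3.9, 5.3]
xi, p = xicor(x, y, return_pval=True)  # Chatterjee's xi correlation and its p-value
print(xi, p)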
Example #5
 def fit(self, x, t, y, refit=False):
     if self.method_name == "lasso":
         print("fit lasso")
         self.model = self.rleaner.rlasso(x, IntVector(t), FloatVector(y))
     else:
         # Takes much longer to fit
         print("fit boost")
         self.model = self.rleaner.rboost(x, IntVector(t), FloatVector(y))
Example #6
 def getPv(self, qminrho, MuQ, VarQ, KerQ, lam, VarRemain, Df, tau,
           rho_list, T):
     from rpy2.robjects.vectors import FloatVector
     RV = self.r_davies(FloatVector(qminrho), MuQ, VarQ, KerQ,
                        FloatVector(lam), VarRemain, Df, FloatVector(tau),
                        FloatVector(rho_list), T)
     RV = [sp.array(RV[i]) for i in range(len(RV))]
     RV = RV[0].flatten()[0]
     return RV
Example #7
    def fit(self, model, testIndices):
        """
        """

        # ------------------------------ Function --------------------------- #
        def errorFit(parameters):
            def fitData(n, testIndices):
                for i in xrange(n):
                    if i not in testIndices:
                        yield i

            # Instantiate the surrogate model
            cModel = modena.libmodena.modena_model_t(
                model=model, parameters=list(parameters))

            return FloatVector(
                list(
                    model.error(cModel,
                                idxGenerator=fitData(model.nSamples,
                                                     testIndices),
                                checkBounds=False)))

        # ------------------------------------------------------------------- #

        new_parameters = model.parameters
        if not len(new_parameters):
            new_parameters = [None] * len(model.surrogateFunction.parameters)
            for k, v in model.surrogateFunction.parameters.iteritems():
                new_parameters[v.argPos] = (v.min + v.max) / 2

        # make objects usable in R
        R_par = FloatVector(new_parameters)
        R_res = rinterface.rternalize(errorFit)

        max_parameters = [None] * len(new_parameters)
        min_parameters = [None] * len(new_parameters)
        for k, v in model.surrogateFunction.parameters.iteritems():
            min_parameters[v.argPos] = v.min
            max_parameters[v.argPos] = v.max

        # perform fitting (nonlinear MSSQ)
        nlfb = nlmrt.nlfb(start=R_par,
                          resfn=R_res,
                          jacfn=rinterface.NULL,
                          trace=rinterface.FALSE,
                          lower=FloatVector(min_parameters),
                          upper=FloatVector(max_parameters),
                          maskidx=rinterface.NULL)

        # optimised coefficients and sum of squares
        nlfb_coeffs = nlfb[nlfb.names.index('coefficients')]
        nlfb_ssqres = nlfb[nlfb.names.index('ssquares')]
        new_parameters = list(nlfb_coeffs)

        return new_parameters
Example #8
 def _comparisons_dataframe(self):
     # col = ('Label.1', 'Label.2', 'win1', 'win2')
     # data = zip(col, [*self.comparison_items, *self.comparison_wins])
     # return DataFrame(OrdDict([data]))
     column_comp1 = ('Label.1',
                     FactorVector(self.comparison_items[0],
                                  levels=StrVector(self.items)))
     column_comp2 = ('Label.2',
                     FactorVector(self.comparison_items[1],
                                  levels=StrVector(self.items)))
     column_win1 = ('win1', FloatVector(self.comparison_wins[0]))
     column_win2 = ('win2', FloatVector(self.comparison_wins[1]))
     return DataFrame(
         OrdDict([column_comp1, column_comp2, column_win1, column_win2]))
Example #9
    def fit_generator_for_model(self, model, train_generator, train_steps,
                                val_generator, val_steps, num_epochs):
        from rpy2.robjects.vectors import StrVector, FactorVector, FloatVector, IntVector
        all_outputs = []
        for _ in range(train_steps):
            generator_output = next(train_generator)
            x, y = generator_output[0], generator_output[1]
            all_outputs.append((self.preprocess(x), x[1], y))
        x, t, y = map(partial(np.concatenate, axis=0), zip(*all_outputs))

        self.model = self.grf.causal_forest(
            x,
            FloatVector([float(yy) for yy in y]),
            FloatVector([float(tt) for tt in t]),
            seed=909)
Example #10
def _translate_control(control):
    """
    Transforms a python dict to a valid R object
    Args:
      control: python dict

    Returns: R object of type ListVector

    """
    ctrl = {}
    for key, lst in control.items():
        if isinstance(lst, list):
            # check bools before ints: bool is a subclass of int in Python,
            # so the int branch would otherwise also match boolean lists
            if all(isinstance(n, bool) for n in lst):
                entry = BoolVector(control[key])
            elif all(isinstance(n, int) for n in lst):
                entry = IntVector(control[key])
            elif all(isinstance(n, float) for n in lst):
                entry = FloatVector(control[key])
            elif all(isinstance(n, str) for n in lst):
                entry = StrVector(control[key])
            else:
                entry = None
            if entry is not None:
                ctrl[key] = entry
        else:
            ctrl[key] = lst
    return ListVector(ctrl)
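A short usage sketch with a hypothetical control dict, assuming the vector classes are imported from rpy2.robjects.vectors as the snippet expects:

from rpy2.robjects.vectors import (IntVector, BoolVector, FloatVector,
                                   StrVector, ListVector)

control = {'maxiter': [100, 200],       # -> IntVector
           'trace': [True, False],      # -> BoolVector
           'tol': [1e-6, 1e-8],         # -> FloatVector
           'method': 'Nelder-Mead'}     # scalar, passed through unchanged
r_control = _translate_control(control)
print(r_control)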
Example #11
def lmer_feat(mer, Dw):
    mer = r['refit'](mer, FloatVector(Dw))
    df = r['data.frame'](r_coef(r['summary'](mer)))
    rows = list(r['row.names'](df))
    new_tvals = np.rec.fromarrays([[tv] for tv in tuple(df.rx2('t.value'))],
                                  names=','.join(rows))
    return new_tvals
Example #12
def df2mtr(df):
    '''
    Convert a pandas dataframe to an R matrix. Category dtype is cast as a
    FactorVector, taking missing values into account
    (the original py2ri function of rpy2 can't handle this properly so far).

    Args:
        df: pandas dataframe of shape (# samples, # features)
            with numeric dtype

    Returns:
        mtr: R matrix of shape (# samples, # features)
    '''
    # check arguments
    assert isinstance(df,
                      pd.DataFrame), 'Argument df needs to be a pd.DataFrame.'

    # select only numeric columns
    df = df.select_dtypes('number')

    # create and return r matrix
    values = FloatVector(df.values.flatten())
    dimnames = ListVector(
        rlc.OrdDict([('index', StrVector(tuple(df.index))),
                     ('columns', StrVector(tuple(df.columns)))]))

    return robjects.r.matrix(values,
                             nrow=len(df.index),
                             ncol=len(df.columns),
                             dimnames=dimnames,
                             byrow=True)
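A quick round-trip sketch with toy data, assuming the module-level imports the snippet relies on (robjects, rlc, FloatVector, StrVector, ListVector, pandas as pd):

import pandas as pd
import rpy2.robjects as robjects
import rpy2.rlike.container as rlc
from rpy2.robjects.vectors import FloatVector, StrVector, ListVector

df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]}, index=['s1', 's2'])
mtr = df2mtr(df)  # 2 x 2 R matrix with dimnames taken from the dataframe
print(robjects.r['dim'](mtr))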
Example #13
def cpt_gamma(x, penalty='MBIC', minseglen=2, shape=100):
    """changepoint detection with Gamma distribution as test statistic

        positive value is required
        negative value is set to a very large RTT, 1e3.

        Args:
            x (list of numeric type): timeseries to be handled
            penalty (string): possible choices "None", "SIC", "BIC", "MBIC", "AIC", "Hannan-Quinn"

        Returns:
            list of int: beginning of new segment in python index, that is starting from 0;
            the actually return from R changepoint detection is the last index of a segment.
            since the R indexing starts from 1, the return naturally become the beginning of segment.
        """
    try:
        base = np.min([i for i in x if i > 0])
    except ValueError:  # if there is no positive number in x, set base to 0
        base = 0
    x = [(i - base + 0.1) if i > 0 else 1e3 for i in x]
    return [
        int(i) for i in changepoint.cpts(
            changepoint.cpt_meanvar(FloatVector(x),
                                    test_stat='Gamma',
                                    method='PELT',
                                    penalty=penalty,
                                    minseglen=minseglen,
                                    shape=shape))
    ]
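A minimal usage sketch on a synthetic RTT series, assuming changepoint = importr('changepoint') and FloatVector are available at module scope as the snippet requires:

import numpy as np
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

changepoint = importr('changepoint')
rtt = np.concatenate([np.random.gamma(2.0, 10.0, 200),
                      np.random.gamma(2.0, 40.0, 200)]).tolist()
print(cpt_gamma(rtt, penalty='MBIC'))  # e.g. [200] if the variance shift is detected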
Example #14
    def CigNet_prediction(self):
        new_table_data = self.scaler.transform(self.real_table)
        real_table = pd.DataFrame(new_table_data,
                                  index=self.real_table.index,
                                  columns=self.real_table.columns)

        result = predict_decision(self.predictor, real_table)
        result2 = predict_proba(self.predictor, real_table)
        result = np.concatenate([result, result2], axis=1)
        result_df = pd.DataFrame(result, index=real_table.index)
        result_df.columns = ['distance', 'non-driver_prob', 'driver_prob']

        stats = importr('stats')
        for l in [
                -2, -1.75, -1.5, -1.25, -1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 1
        ]:
            result_df['p_value'] = 1 - norm.cdf(result_df['distance'], loc=l)
            result_df['q_value'] = stats.p_adjust(FloatVector(
                result_df["p_value"].tolist()),
                                                  method='BH')
            if result_df[result_df['q_value'] <
                         0.05].shape[0] * 1. / result_df.shape[0] < 0.65:
                break
        candidate_list = self.CellNet['from'].unique()
        result_df = result_df[result_df.index.isin(candidate_list)]
        result_df = result_df.sort_values(by='distance', ascending=False)
        result_df['Rank'] = result_df['distance'].rank(axis=0, ascending=False)
        return result_df
Example #15
    def fdr_boot(self):
        """Calculate the False Discovery Rate on the bootstrap ratios.

        Makes use of the fdrtool package in R, which estimates the
        signal and null distributions across your features.
        """
        # get the boot ratios
        brs = self.boot_ratio
        names = brs.dtype.names
        qvals = []
        for n in names:
            # get R vector of bootstrap ratios
            br = brs[n].flatten()
            good_ind = ~np.isnan(br)
            qv = np.ones_like(br)
            br = FloatVector(br[good_ind])

            # calc the fdr
            results = fdrtool.fdrtool(br,
                                      statistic='normal',
                                      plot=False,
                                      verbose=False)

            # append the qvals
            qv[good_ind] = np.array(results.rx('qval'))

            qvals.append(qv.reshape(self._feat_shape))

        # convert to recarray
        qvals = np.rec.fromarrays(qvals, names=','.join(names))

        # grab the qs
        return qvals
Example #16
def call_peaks(genome,
               unit_length=200,
               small_length=1000,
               medium_length=5000,
               large_length=10000):

    peaks_out = []
    for contig in genome:

        total_reads = sum(genome[contig])
        contig_length = len(genome[contig])

        if total_reads == 0:
            continue

        window_counts = window_read_counts(genome[contig], unit_length)
        window_sum = window_counts.sum(axis=1)

        small_bin_counts = calculate_bins(window_counts, small_length,
                                          unit_length)
        medium_bin_counts = calculate_bins(window_counts, medium_length,
                                           unit_length)
        large_bin_counts = calculate_bins(window_counts, large_length,
                                          unit_length)

        local_bin_sums = np.hstack(
            (small_bin_counts.sum(axis=1), medium_bin_counts.sum(axis=1),
             large_bin_counts.sum(axis=1)))

        local_lambdas = (local_bin_sums / np.array(
            [small_length, medium_length, large_length])) * unit_length
        lambda_bg = np.ones(
            window_sum.shape) * (total_reads / contig_length) * unit_length

        all_lambdas = np.hstack((lambda_bg, local_lambdas))
        max_lambdas = np.amax(all_lambdas, axis=1)

        p_vals = 1 - poisson.cdf(window_sum.astype(int),
                                 mu=max_lambdas.astype(float))
        p_vals = np.transpose(p_vals.astype(np.longdouble))[0]

        qvalue = importr('qvalue')
        q_vals = np.array(qvalue.qvalue(FloatVector(p_vals))[2])
        q_vals = np.hstack(
            (np.transpose(np.matrix(list(range(1,
                                               len(q_vals) + 1)))),
             np.transpose(np.matrix(q_vals))))
        qv_df = pd.DataFrame(q_vals)
        qv_df.columns = ['Position', 'qvalue']

        peak_indices = np.array(
            qv_df.query('qvalue < 0.01')['Position'].tolist()).astype(int)

        peaks = indices_to_peaks(peak_indices)
        peaks = correct_peaks(peaks, unit_length)

        for peak in peaks:
            peaks_out.append([contig, peak[0], peak[1]])

    return peaks_out
Example #17
    def build_drf_model(self, x_old, y):
        from rpy2.robjects.vectors import StrVector, FactorVector, FloatVector, IntVector
        from rpy2.robjects import Formula, pandas2ri

        x, ts = x_old[:, :-1], x_old[:, -1]

        tmp = np.concatenate(
            [x, np.reshape(ts, (-1, 1)),
             np.reshape(y, (-1, 1))], axis=-1)
        data_frame = pandas2ri.py2ri(
            Baseline.to_data_frame(
                tmp,
                column_names=np.arange(0, tmp.shape[-1] - 2).tolist() +
                ["T", "Y"]))

        result = self.gps.hi_est(
            Y="Y",
            treat="T",
            treat_formula=Formula('T ~ ' + '+'.join(data_frame.names[:-2])),
            outcome_formula=Formula('Y ~ T + I(T^2) + gps + T * gps'),
            data=data_frame,
            grid_val=FloatVector([float(tt) for tt in np.linspace(0, 1, 256)]),
            treat_mod="Normal",
            link_function="log"
        )  # link_function is not used with treat_mod = "Normal".

        treatment_model, model = result[1], result[2]
        fitted_values = treatment_model.rx2('fitted.values')
        distribution = norm(np.mean(fitted_values), np.std(fitted_values))
        return distribution, model
Example #18
def FDR_adjust_pvalues(pvalue_list, N=None, method='BH'):
    """ Adjust a list of p-values for false discovery rate using R's stats::p.adjust function.

    N and method are passed to R_stats.p_adjust: 
     - N is the number of comparisons (if left unspecified, defaults to len(pvalue_list), I think)
     - method is the name of the adjustment method to use (inherited from R)

    Note that this MUST be done after all the p-values are already collected, on the full list of p-values at once:
     trying to do it on single p-values, even with adjusted N, will give different results!
    """
    if method not in R_stats.p_adjust_methods:
        raise ValueError("Unknown method %s - method must be one of (%s)!" %
                         (method, ', '.join(R_stats.p_adjust_methods)))
    if N is None:
        return R_stats.p_adjust(FloatVector(pvalue_list), method=method)
    else:
        return R_stats.p_adjust(FloatVector(pvalue_list), method=method, n=N)
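A hypothetical call, assuming R_stats = importr('stats') and FloatVector are bound at module level as the function implies:

from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

R_stats = importr('stats')
pvals = [0.001, 0.02, 0.04, 0.3, 0.7]
adjusted = FDR_adjust_pvalues(pvals, method='BH')
print(list(adjusted))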
Example #19
def gray_plot(data, min=0, max=1, name=""):
    reshape = importr('reshape')
    gg = ggplot2.ggplot(reshape.melt(data, id_var=['x', 'y']))
    pg = (gg + ggplot2.aes_string(x='L1', y='L2') +
          ggplot2.geom_tile(ggplot2.aes_string(fill='value')) +
          ggplot2.scale_fill_gradient(low="black", high="white",
                                      limits=FloatVector((min, max))) +
          ggplot2.coord_equal() + ggplot2.scale_x_continuous(name))
    return pg
Example #20
def Parameter_Stability_plot(sample, alpha):  #Parameter stability plot function
    #Defining the threshold array
    step = np.quantile(sample, .995) / 45
    threshold = np.arange(0, np.quantile(sample, .999), step=step)

    #Transforming the sample into an R array
    rdata = FloatVector(sample)

    #Initialization of the main arrays
    stdshape = []  #standard deviation of the shape parameter
    shape = []  #shape parameter
    scale = []  #scale parameter
    mod_scale = []  #modified scale parameter
    CI_shape = []  #confidence interval of the shape parameter
    CI_mod_scale = []  #confidence interval of the modified scale
    z = norm.ppf(1 - (alpha / 2))

    #Getting parameters and CIs for both plots
    for u in threshold:
        #fitting the distribution using the POT package with the MLE method
        fit = POT.fitgpd(rdata, u, est='mle')
        #adding the shape and scale parameters to the respective arrays
        shape.append(fit[0][1])
        scale.append(fit[0][0])
        #adding the shape standard deviation to the respective array
        stdshape.append(fit[1][1])
        #confidence interval of the shape parameter for plotting
        CI_shape.append(fit[1][1] * z)
        #modified scale parameter
        mod_scale.append(fit[0][0] - (fit[0][1] * u))
        #solving the Delta method to get the variance of the modified scale parameter
        Var_mod_scale = (fit[3][0] - (u * fit[3][2]) - u * (fit[3][1] - (fit[3][3] * u)))
        #confidence interval of the modified scale parameter
        CI_mod_scale.append((Var_mod_scale ** 0.5) * z)

    #Plotting shape parameter against u values
    plt.figure(2)
    plt.errorbar(threshold, shape, yerr=CI_shape, fmt='o')
    plt.xlabel('u')
    plt.ylabel('Shape Parameter')
    plt.title('Shape Parameter Stability Plot')

    #Plotting modified scale parameter against u values
    plt.figure(3)
    plt.errorbar(threshold, mod_scale, yerr=CI_mod_scale, fmt='o')
    plt.xlabel('u')
    plt.ylabel('Modified Scale Parameter')
    plt.title('Modified Scale Parameter Stability Plot')

    plt.show()
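A usage sketch with synthetic data, assuming POT = importr('POT'), numpy as np, scipy.stats.norm and matplotlib.pyplot as plt are available at module scope as the function requires:

import numpy as np

sample = np.random.exponential(scale=2.0, size=2000)
Parameter_Stability_plot(sample, alpha=0.05)  # draws the two stability plots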
Example #21
def fdr(p_value_list):
    p_adjust = stats.p_adjust(FloatVector(p_value_list), method='BH')
    q_adjust = qval.qvalue(p_adjust)
    p_adjust_value = [k for i, k in p_adjust.items()]
    for i in q_adjust.items():
        if i[0] == "qvalues":
            q_adjust_value = i[1]

    return p_adjust_value, q_adjust_value
Example #22
def get_content_group_enrichment2(control_frequencies, interest_frequencies,
                                  interest_frequencies_5p,
                                  interest_frequencies_3p, dic_p_val,
                                  set_number, name, reg_dic):
    """
    :param control_frequencies: (dictionary of floats) a dictionary containing the amino acid nature frequencies of the
    control sets
    :param interest_frequencies: (dictionary of floats) a dictionary frequency of each amino acid nature in the user set
    of exons
    :param dic_p_val: (dictionary of floats a dictionary containing the p_values
    :param set_number: (int) the number of set to create
    :param name: (string) : the name of the first column
    :param reg_dic: (dictionary of list of string) : dictionary having keys corresponding to the group of interest and
    a list associated to those keys corresponding to the amino acid aggregated in those groups
    :return: (list of list of strings) the content of the nature sheet ! Each sublist correspond to a row in the
    nature sheet of the enrichment_report.xlsx file
    """
    dic_padjust = {}
    content = [[
        name, "frequencies_of_the_interest_set", "frequencies_interest_set_5p",
        "frequencies_interest_set_3p",
        "average_frequencies_of_the_" + str(set_number) + "_sets",
        "IC_90_of_the_" + str(set_number) + "_sets", "p_values_like", "FDR",
        "regulation_(p<=0.05)", "regulation(fdr<=0.05)", "nb_nt_group",
        "prop_nt_group", "ponderate_nt_group"
    ]]
    ic_90 = calculate_ic_90(control_frequencies)
    p_vals = list()
    for nature in dic_p_val.keys():
        p_vals.append(dic_p_val[nature])
    rstats = importr('stats')
    p_adjust = rstats.p_adjust(FloatVector(p_vals), method="BH")
    i = 0
    for nature in dic_p_val.keys():
        info_count, info_prop, count_pond = get_group_nt_info(reg_dic[nature])
        regulation, regulation_fdr = check_regulation(
            interest_frequencies[nature], ic_90[nature], dic_p_val[nature],
            p_adjust[i])
        content.append([
            str(nature),
            str(interest_frequencies[nature]),
            str(interest_frequencies_5p[nature]),
            str(interest_frequencies_3p[nature]),
            str(np.mean(control_frequencies[nature])),
            str(ic_90[nature]),
            str(dic_p_val[nature]),
            str(p_adjust[i]),
            str(regulation),
            str(regulation_fdr),
            str(info_count),
            str(info_prop),
            str(count_pond)
        ])
        dic_padjust[nature] = p_adjust[i]
        i += 1

    return content, dic_padjust
Example #23
def nbinom_cdf_fromfit(q, fit_dict):

    pnbinom = robj.r('pnbinom')

    if np.isscalar(q):
        return np.array(pnbinom(q=q, size=fit_dict['estimate'][0], mu=fit_dict['estimate'][1]))
    else:
        return np.array(pnbinom(q=FloatVector(q), size=fit_dict['estimate'][0], mu=fit_dict['estimate'][1]))
Example #24
def nbinom_pdf_fromfit(x, fit_dict):

    dnbinom = robj.r('dnbinom')

    if np.isscalar(x):
        return np.array(dnbinom(x=x, size=fit_dict['estimate'][0], mu=fit_dict['estimate'][1]))
    else:
        return np.array(dnbinom(x=FloatVector(x), size=fit_dict['estimate'][0], mu=fit_dict['estimate'][1]))
Example #25
def nbinom_cdf(q, size, mu):

    pnbinom = robj.r('pnbinom')

    if np.isscalar(q):
        return np.array(pnbinom(q=q, size=size, mu=mu))
    else:
        return np.array(pnbinom(q=FloatVector(q), size=size, mu=mu))
Example #26
def nbinom_pdf(x, size, mu):

    dnbinom = robj.r('dnbinom')

    if np.isscalar(x):
        return np.array(dnbinom(x=x, size=size, mu=mu))
    else:
        return np.array(dnbinom(x=FloatVector(x), size=size, mu=mu))
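A brief usage sketch with hypothetical parameters, assuming robj (rpy2.robjects), numpy as np and FloatVector are imported at module level; fit_dict['estimate'] is assumed to hold (size, mu) in that order, matching how the *_fromfit functions index it:

import numpy as np
import rpy2.robjects as robj
from rpy2.robjects.vectors import FloatVector

print(nbinom_cdf(q=[0, 5, 10], size=2.0, mu=4.0))
print(nbinom_pdf(x=3, size=2.0, mu=4.0))

fit_dict = {'estimate': (2.0, 4.0)}  # (size, mu) - assumed layout
print(nbinom_cdf_fromfit([0, 5, 10], fit_dict))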
Example #27
def MWU_vs_groups(data, groups, genes, dview, BH=True, log=True):
    """
    Performs MWU test for differential expression in cluster C compared to each other cluster. Returned is the maximal p-value.
    ----------
    data: pd.DataFrame of m cells x n genes.
    groups: pd.Series of cluster identity in m cells.
    genes: list of selected genes.
    dview: ipyparallel dview object.
    BH: whether to perform Benjamini-Hochberg correction. Default: True
    log: whether to return -log10 transformed pvalues.
    -----------
    returns p-values of genes in [genes] for all clusters in [groups].
    """

    #########################

    def MWU_vs_groups_helper(data, groups, gene):

        output = pd.DataFrame(index=[gene], columns=return_unique(groups))

        for gr1 in return_unique(groups):
            d1 = data.ix[groups[groups == gr1].index]
            pvals = []

            for gr2 in [gr2 for gr2 in return_unique(groups) if gr2 != gr1]:
                d2 = data.ix[groups[groups == gr2].index]

                try:
                    pval_tmp = mwu(d1, d2, alternative='greater')[1]
                except:
                    pval_tmp = 1.0

                pvals.append(pval_tmp)

            output.ix[gene, gr1] = np.max(pvals)

        return output.astype(float)

    #########################

    l = len(genes)

    output_tmp = dview.map_sync(MWU_vs_groups_helper,
                                [data.ix[g] for g in genes], [groups] * l,
                                genes)

    output = pd.concat(output_tmp, axis=0)

    if BH == True:
        for col in output.columns:
            output[col] = stats.p_adjust(FloatVector(output[col]), method='BH')

    if log == True:
        output = -np.log10(output.astype(float))

    return output
Example #28
def identifyPPIs_chimericAdj(sorted_x_positive1_9, dicIntCount_positive1_9,
                             dicProteinCount_positive1_9, coEff, pCutOff,
                             oddsCutoff):
    factor = sum([x[1]
                  for x in sorted_x_positive1_9]) / len(sorted_x_positive1_9)
    chimTotal = sum([x[1] for x in sorted_x_positive1_9])
    pvalueList = []
    sorted_x_1_select = []
    selectList_1 = []
    posRCList_1 = []
    orList_1 = []
    chiList = []
    for ha in sorted_x_positive1_9:
        [gene1, gene2] = ha[0].split(';')
        a = dicIntCount_positive1_9[ha[0]]
        b = dicProteinCount_positive1_9[gene1] / 2 - a
        c = dicProteinCount_positive1_9[gene2] / 2 - a
        d = chimTotal - a - b - c
        b, c = max(0, b), max(0, c)
        oddsRatio = (a + 1) * (d + 1) / (b + 1) / (c + 1)
        chi2, p, dof, ex = stats.chi2_contingency([[a + 1, b + 1],
                                                   [c + 1, d + 1]])
        orList_1.append(oddsRatio)
        pvalueList.append(p)
        sorted_x_1_select.append(ha)
        selectList_1.append(ha[0])
        posRCList_1.append(a)
        chiList.append(chi2)

    stats1 = importr('stats')
    pvalueList_adj_1 = stats1.p_adjust(FloatVector(pvalueList), method='BH')

    lolCount = 0
    count = 0
    list1 = []
    rcList1 = []
    pvalueSig_1 = []
    orSig_1 = []
    chiSig = []
    for i in range(len(selectList_1)):
        ha = selectList_1[i]
        count += 1
        gene1, gene2 = ha.split(';')
        pAdj = pvalueList_adj_1[i]
        rcc = posRCList_1[i]
        orr = orList_1[i]
        chichi = chiList[i]
        if pAdj <= pCutOff and rcc > coEff * factor and 'MTRNR' not in gene1 and 'MTRNR' not in gene2 and orr > oddsCutoff:
            lolCount += 1
            list1.append(ha)
            pvalueSig_1.append(pAdj)
            orSig_1.append(orList_1[i])
            rcList1.append(rcc)
            chiSig.append(chichi)
    print(len(set(list1)))
    return list1, rcList1, orSig_1, chiSig, pvalueSig_1
Example #29
def calcOralEquiv(casrn,conc,q=0.5,species='Rat',units_in='uM',
                  units_out='mg',rest_clear=False):
    if type(q) == list:
        q = FloatVector(q)
    if type(conc) == list or type(conc) == pd.Series:
        conc = FloatVector(conc)

    kwargs = {'conc': conc,
              'chem.cas': casrn,
              'which.quantile': q,
              'species': species,
              'input.units': units_in,
              'output.units': units_out,
              'restrictive.clearance': rest_clear,
              'suppress.messages': True
              }
    X = httk.calc_mc_oral_equiv(**kwargs)
    #return pandas2ri.ri2py_listvector(X)
    return X
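A hypothetical call (CAS number and concentrations chosen only for illustration), assuming httk = importr('httk') plus the module-level imports the function uses (FloatVector, pandas as pd):

from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

httk = importr('httk')
# oral-equivalent doses for plasma concentrations of 1 and 10 uM
oed = calcOralEquiv('80-05-7', conc=[1.0, 10.0], q=[0.5, 0.95], species='Human')
print(list(oed))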
Example #30
    def fit_generator_for_model(self, model, train_generator, train_steps,
                                val_generator, val_steps, num_epochs):
        from rpy2.robjects.vectors import StrVector, IntVector, FactorVector, FloatVector
        x, y = self.collect_generator(train_generator, train_steps)

        self.model = self.bart.bartMachine(X=Baseline.to_data_frame(x),
                                           y=FloatVector([yy for yy in y]),
                                           mem_cache_for_speed=False,
                                           seed=909,
                                           run_in_sample=False)