def get_federov_data(self, factors):
    low_level_limits = IntVector([self.parameter_ranges[f][0] for f in factors])
    high_level_limits = IntVector([self.parameter_ranges[f][1] - 1 for f in factors])
    factor_centers = IntVector([0 for f in factors])
    factor_levels = IntVector([self.parameter_ranges[f][1] for f in factors])
    factor_round = IntVector([0 for f in factors])
    is_factor = BoolVector([False for f in factors])
    mix = BoolVector([False for f in factors])

    opt_federov_data = {
        "var": StrVector(factors),
        "low": low_level_limits,
        "high": high_level_limits,
        "center": factor_centers,
        "nLevels": factor_levels,
        "round": factor_round,
        "factor": is_factor,
        "mix": mix
    }

    opt_federov_dataframe = DataFrame(opt_federov_data)
    opt_federov_dataframe = opt_federov_dataframe.rx(
        StrVector(["var", "low", "high", "center", "nLevels",
                   "round", "factor", "mix"]))
    return opt_federov_dataframe
def generate_valid_sample(self, sample_size):
    search_space_dataframe = {}

    for n in self.axis_names:
        search_space_dataframe[n] = []

    search_space = {}
    evaluated = 0

    info("Generating valid search space of size {0} (does not spend evaluations)"
         .format(sample_size))

    while len(search_space) < sample_size:
        candidate_point = self.getRandomCoord()
        candidate_point_key = str(candidate_point)
        evaluated += 1

        if candidate_point_key not in search_space:
            perf_params = self.coordToPerfParams(candidate_point)
            is_valid = eval(self.constraint, copy.copy(perf_params),
                            dict(self.input_params))

            if is_valid:
                search_space[candidate_point_key] = candidate_point

                for n in perf_params:
                    candidate_value = self.parameter_values[n].index(perf_params[n])
                    search_space_dataframe[n].append(candidate_value)

                # max(1, ...) guards against a modulo-by-zero when sample_size < 10.
                if len(search_space) % max(1, sample_size // 10) == 0:
                    info("Valid coordinates: " + str(len(search_space)) +
                         "/" + str(sample_size))
                    info("Tested coordinates: " + str(evaluated))

        if evaluated % 1000000 == 0:
            info("Tested coordinates: " + str(evaluated))

    info("Valid/Tested configurations: " + str(len(search_space)) +
         "/" + str(evaluated))

    for k in search_space_dataframe:
        search_space_dataframe[k] = IntVector(search_space_dataframe[k])

    search_space_dataframe_r = DataFrame(search_space_dataframe)
    search_space_dataframe_r = search_space_dataframe_r.rx(StrVector(self.axis_names))

    info("Generated Search Space:")
    info(str(self.base.summary_default(search_space_dataframe_r)))

    coded_search_space_dataframe_r = self.encode_data(search_space_dataframe_r)
    return coded_search_space_dataframe_r
def measure_design(self, design, encoded_design, step_number):
    info("Measuring design of size " + str(len(design[0])))

    design_names = [str(n) for n in self.base.names(design)]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))

    for line in range(1, len(design[0]) + 1):
        if type(design.rx(line, True)[0]) is int:
            design_line = [v for v in design.rx(line, True)]
        else:
            design_line = [int(round(float(v[0]))) for v in design.rx(line, True)]

        candidate = [0] * len(initial_factors)

        for k, v in self.model["fixed_factors"].items():
            candidate[initial_factors.index(k)] = int(round(float(v)))

        for i in range(len(design_names)):
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        measurement = self.getPerfCosts([candidate])

        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(robjects.NA_Real)

    design = self.base.cbind(
        design, DataFrame({self.model["response"]: FloatVector(measurements)}))
    encoded_design = self.base.cbind(
        encoded_design, DataFrame({self.model["response"]: FloatVector(measurements)}))

    info("Complete design, with measurements:")
    info(str(design))

    design = design.rx(self.stats.complete_cases(design), True)
    design = design.rx(self.base.is_finite(self.base.rowSums(design)), True)

    encoded_design = encoded_design.rx(self.stats.complete_cases(encoded_design), True)
    encoded_design = encoded_design.rx(
        self.base.is_finite(self.base.rowSums(encoded_design)), True)

    info("Clean design, with measurements:")
    info(str(design))
    info("Clean encoded design, with measurements:")
    info(str(encoded_design))

    self.utils.write_csv(encoded_design, "design_step_{0}.csv".format(step_number))
    self.utils.write_csv(design, "decoded_design_step_{0}.csv".format(step_number))

    # "is None" instead of "== None": equality on an rpy2 DataFrame triggers
    # R's element-wise comparison, which is not a valid truth value here.
    if self.complete_design_data is None:
        self.complete_design_data = design
    else:
        self.complete_design_data = self.dplyr.bind_rows(
            self.complete_design_data, design)

    return design
def getNonParametricPValue(labels, values, random_seed=0, printResults=True):
    '''Markers localization p-value calculation:
    Poisson pseudo-maximum likelihood estimation (PPML) by
    J.M.C. Santos Silva & Silvana Tenreyro, 2006.
    Implemented in R in "gravity: Estimation Methods for Gravity Models" at:
    https://rdrr.io/cran/gravity/man/ppml.html
    '''
    np.random.seed(random_seed)
    #np.random.shuffle(labels)

    dataf = DataFrame({'label': IntVector(tuple(labels)),
                       'distance': FloatVector(tuple(values))})
    fit = R('function(x) ppml(dependent_variable="label", distance="distance", '
            'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)

    # Deviance is -2.*log_likelihood
    altDeviance = list(fit[9].items())[0][1]
    nullDeviance = list(fit[11].items())[0][1]
    p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

    if printResults:
        print('Non-parametric method:', '\n\t',
              #' Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
              #'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
              #'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
              'p-value:\t', '%.1e' % p_value, '\n')

    return p_value
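# A minimal usage sketch (not part of the original source; assumes numpy as np
# and the R "gravity" package installed, as the function above requires).
# Binary labels and synthetic distances stand in for real marker data.
labels = [0] * 50 + [1] * 50
values = list(np.random.normal(5.0, 1.0, 50)) + list(np.random.normal(6.0, 1.0, 50))
p = getNonParametricPValue(labels, values, random_seed=42)
# The test compares null vs. alternative PPML deviance on 1 degree of freedom.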
def get_hourly_ffmc_on_diurnal_curve(ffmc_solar_noon: float, target_hour: float,
                                     temperature: float, relative_humidity: float,
                                     wind_speed: float, precip: float):
    """ Computes hourly FFMC based on noon FFMC using the diurnal curve for
    approximation. Delegates the calculation to the cffdrs R package.

    ffmc_solar_noon is the forecasted or actual FFMC value for solar noon of
    the date in question.
    target_hour is the hour of the day (on a 24 hour clock) for which hourly
    FFMC should be calculated.
    The weather variables (temperature, rh, wind_speed, precip) are the
    forecasted or actual weather values for solar noon.

    # Args: weatherstream:   Input weather stream data.frame which includes
    #                        temperature, relative humidity, wind speed,
    #                        precipitation, hourly value, and bui. More specific
    #                        info can be found in the hffmc.Rd help file.
    #       ffmc_old:        ffmc from previous timestep
    #       time.step:       The time (hours) between previous FFMC and current
    #                        time.
    #       calc.step:       Whether time step between 2 obs is calculated
    #                        (optional)
    #       batch:           Single step or iterative (default=TRUE)
    #       hourlyFWI:       Can calculate hourly ISI & FWI as well
    #                        (TRUE/FALSE, default=FALSE)
    #
    # Returns: A single or multiple hourly ffmc value(s)
    #
    # From hffmc.Rd:
    # {weatherstream}{
    # A dataframe containing input variables of hourly weather observations.
    # It is important that variable names have to be the same as in the
    # following list, but they are case insensitive. The order in which the
    # input variables are entered is not important.
    #
    #     temp (required)  Temperature (centigrade)
    #     rh   (required)  Relative humidity (%)
    #     ws   (required)  10-m height wind speed (km/h)
    #     prec (required)  1-hour rainfall (mm)
    #     hr   (optional)  Hourly value to calculate sub-hourly ffmc
    #     bui  (optional)  Daily BUI value for the computation of hourly FWI.
    #                      It is required when hourlyFWI=TRUE
    """
    time_offset = target_hour - 13  # solar noon

    # Build the weather_data dictionary to be passed as weatherstream.
    weather_data = {
        'hr': 13.0,
        'temp': temperature,
        'rh': relative_humidity,
        'ws': wind_speed,
        # The precip received is based on the previous 24 hours, but the R
        # function requires 1-hour rainfall. We don't have hourly data, so the
        # best we can do is take the mean amount of precip for the past 24
        # hours. This is a liberal approximation with a lot of hand-waving.
        'prec': precip / 24
    }
    weather_data = DataFrame(weather_data)
    # pylint: disable=protected-access, no-member
    result = CFFDRS.instance().cffdrs.hffmc(weatherstream=weather_data,
                                            ffmc_old=ffmc_solar_noon,
                                            time_step=time_offset)
    return result[0]
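# A minimal usage sketch (not from the original source; assumes the cffdrs R
# package is installed and the CFFDRS singleton wrapper above is available).
# Values are illustrative: noon FFMC of 85, asking for the 16:00 estimate.
hourly_ffmc = get_hourly_ffmc_on_diurnal_curve(
    ffmc_solar_noon=85.0,
    target_hour=16.0,
    temperature=22.5,        # degrees Celsius at solar noon
    relative_humidity=40.0,  # percent
    wind_speed=15.0,         # km/h
    precip=2.4)              # mm over the previous 24 hours; divided by 24 internally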
def qcrop2(xlist, ylist, labels=None, nq=4.):
    if labels is None:
        # list() so that labels[i] below also works on Python 3, where map()
        # returns an iterator.
        labels = list(map(str, range(len(xlist))))
    x = []
    y = []
    xcrop = []
    ycrop = []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        cropx, cropy = zip(*[(nan, nan)
                             if vy > ymax or vy < ymin or vx < xmin or vx > xmax
                             else (vx, vy)
                             for vx, vy in zip(onex, oney)])
        xcrop += cropx
        ycrop += cropy
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)
    df = DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet), levels=StrVector(labels))
    })
    return df
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    x = []
    y = []
    ya = []
    for triad in stats:
        for r in stats[triad]:
            paralinear_dists = get_paralinear_distances(r[0]['gene'], **kw)
            ns_EN = sum(r[0]['EN'][t] for t in pair)
            s_EN = sum(r[1]['EN'][t] for t in pair)
            para = paralinear_dists[pair]
            if para:
                x.append(ns_EN)
                y.append(para)
                ya.append(s_EN)

    print 'paralinear stats'
    print_stats(x, y)
    print 'GTR stats'
    print_stats(x, ya)

    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    globalenv['df'] = df
    cmd = 'gg <- ggplot(df, aes(x, y)) + geom_point(alpha=0.2) + ' + \
          'geom_abline(intercept=0, slope=1, color="white") + ' + \
          'xlab(bquote(.("' + ' to '.join(pair) + '") ~ d[ENS])) + ' + \
          'ylab(bquote(.("' + ' to '.join(pair) + '") ~ d[para])) + ' + \
          'coord_cartesian(xlim=c(0,1), ylim=c(0,1))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
    return
def call_r(df):
    '''
    Arguments:
        df: A string replicating a CSV file. The observations for the
            dependent variable MUST be in the FIRST COLUMN.

    Returns: an rpy2 R object (FloatVector) which stores the coefficients
    of the linear regression
    '''
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
    from io import StringIO
    from rpy2.robjects import DataFrame
    from rpy2.robjects import FloatVector
    import rpy2.rinterface as ri

    ri.initr()
    file_like_obj = StringIO(df)
    constructor_dict = parser(file_like_obj)
    rpy2_dataframe = DataFrame(constructor_dict)
    # Raw string so the backslash in the Windows path is not treated as an
    # escape sequence; also avoid shadowing the builtin name `str`.
    with open(r'regression_app\linear_modeler_function.R') as f:
        r_source = f.read()
    mod = SignatureTranslatedAnonymousPackage(r_source, 'mod')
    a = mod.linear_modeler(rpy2_dataframe)
    del mod
    return a
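# A minimal usage sketch (not from the original source). call_r() leans on two
# project-specific pieces assumed here: the parser() helper that turns the CSV
# text into a dict of column vectors, and the R script
# regression_app\linear_modeler_function.R exposing linear_modeler().
csv_text = "y,x1\n1.0,2.0\n2.1,3.9\n2.9,6.1\n4.2,8.0\n"
coefficients = call_r(csv_text)  # dependent variable in the first column
print(list(coefficients))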
def Run(self): self.transit_message("Starting Corrplot") start_time = time.time() # assume first non-comment line is header; samples are headers = None data, means = [], [] if self.filetype == "gene_means": for line in open(self.gene_means): w = line.rstrip().split('\t') if line[0] == '#': headers = w[3:] continue # last comment line has names of samples data.append(w) cnts = [float(x) for x in w[3:]] means.append(cnts) elif self.filetype == "anova" or self.filetype == "zinb": n = -1 # number of conditions for line in open(self.gene_means): w = line.rstrip().split('\t') if line[0] == '#' or ( 'pval' in line and 'padj' in line ): # check for 'pval' for backwards compatibility headers = w continue # keep last comment line as headers if n == -1: # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions) # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions) if self.filetype == "anova": n = int((len(w) - 6) / 2) elif self.filetype == "zinb": n = int((len(headers) - 6) / 4) headers = headers[3:3 + n] headers = [x.replace("Mean_", "") for x in headers] vals = [float(x) for x in w[3:3 + n]] # take just the columns of means qval = float(w[-2]) if qval < 0.05: data.append(w) means.append(vals) else: print("filetype not recognized: %s" % self.filetype) sys.exit(-1) print("correlations based on %s genes" % len(means)) genenames = ["%s/%s" % (w[0], w[1]) for w in data] hash = {} headers = [h.replace("Mean_", "") for h in headers] for i, col in enumerate(headers): hash[col] = FloatVector([x[i] for x in means]) df = DataFrame(hash) # can't figure out how to set rownames corrplotFunc = self.make_corrplotFunc() corrplotFunc( df, StrVector(headers), StrVector(genenames), self.outfile ) # pass headers to put cols in order, since df comes from dict self.finish() self.transit_message("Finished Corrplot")
def _align_var(breaks_r, pop_col, n, verbose=False):
    prev_b = -1
    i = 1
    align = dict()
    _vector = list()
    align_t = [1]
    for e, b in enumerate(breaks_r):
        if prev_b + 1 == b and b < n + 1:
            try:
                assert (min(align_t) != max(align_t))
                # align[pop_col + '.' + str(i)] = IntVector(
                align[pop_col] = IntVector((min(align_t), max(align_t)))
                # _vector.extend([min(align_t), max(align_t)])
            except AssertionError:
                if verbose:
                    print("can't align {} at {} for {}".format(align_t, e, b))
            i += 1
            align_t = [e + 2]
        else:
            align_t.append(e + 2)
        prev_b = b
    if len(align) == 0:
        align[pop_col] = IntVector((1, len(breaks_r)))
    # else:
    #     align[pop_col] = IntVector(_vector)
    align_r = DataFrame(align)
    return align_r
def qlim(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    rq = quantreg.rq('y ~ x', df, tau=FloatVector((0.25, 0.5, 0.75)))
    print rq.rx2('coefficients')
    fv = array(rq.rx2('fitted.values'))
    return min(fv[:, 1]) - 2*max(fv[:, 1] - fv[:, 0]), \
           2*max(fv[:, 2] - fv[:, 1]) + max(fv[:, 1])
def through_the_origin(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    s = r.summary(r.lm('y ~ 0 + x', df))
    return {
        'coefficient': s.rx2('coefficients')[0],
        'stderr': s.rx2('coefficients')[1],
        'r.squared': s.rx2('r.squared')[0]
    }
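# A minimal usage sketch (not from the original source; assumes `r` is
# rpy2.robjects.r, as in the function above). Fits y = b*x with no intercept;
# the true slope here is about 2.
fit = through_the_origin([1.0, 2.0, 3.0, 4.0], [2.1, 3.9, 6.2, 7.8])
print(fit['coefficient'], fit['stderr'], fit['r.squared'])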
def plot_bar(stats, output_file=None, **kw):
    names = [r['name'] for r in stats.values()[0][0]]
    with_rates = [r['with_rate'] for r in stats.values()[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]
    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)
    for d in by_dir:
        by_dir[d] = zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]])
    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print dataset
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            print names[j], sum(1 for _g in g if _g > 0.05) / len(g)
            alpha = max(alpha, get_alpha(g))
        print 'Samples', len(g)
    labels = 'expression(' + ','.join(names) + ')'
    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    #     'geom_jitter(alpha=0.2, size=1) + ' + \
    #     'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    #     'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    #     'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    #     'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
          'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
          'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",' + str(alpha) + ')) + ' + \
          'scale_x_discrete(labels=' + labels + ') + ' + \
          'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
          'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
def python_to_r_object(cls, item):
    """
    Convert a Python object to an R object (class method).
    :param item: Python object; convertible types are list, tuple, pd.Series,
                 np.ndarray, and pd.DataFrame
    :return: R object
    """
    numpy2ri.activate()
    if isinstance(item, (list, tuple, pd.Series)):
        return np.array(item)
    elif isinstance(item, pd.DataFrame):
        data_dict = {col_names: np.array(item[col_names])
                     for col_names in item.columns}
        rdataframe = DataFrame(data_dict)
        rdataframe.rownames = np.array(item.index)
        return rdataframe
    elif isinstance(item, (np.ndarray, bool, int, float, str)):
        return item
    else:
        print('Unsupported type: ', type(item))
        raise Exception
def tupls2RDataframe(data, titles):
    cols = [[] for _ in titles]
    for datum in data:
        for i, e in enumerate(datum):
            cols[i].append(e)
    col_d = {}
    for i, t in enumerate(titles):
        col_d[t] = StrVector(tuple(cols[i]))
        col_d[t] = FactorVector(col_d[t])
    dataf = DataFrame(col_d)
    return dataf
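# A minimal usage sketch (not from the original source): each tuple is a row,
# each title a column, and every column is coerced to an R factor.
rows = [('a', 'x'), ('b', 'y'), ('a', 'y')]
rdf = tupls2RDataframe(rows, titles=['group', 'cls'])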
def _convert_python_to_R(data: typing.Union[dict, pd.DataFrame]):
    """
    Converts a python object to an R object brms can handle:

        * python dict      -> R list
        * python dataframe -> R dataframe
    """
    with localconverter(default_converter + pandas2ri.converter +
                        numpy2ri.converter) as cv:
        if isinstance(data, pd.DataFrame):
            return DataFrame(data)
        elif isinstance(data, dict):
            return ListVector(data)
        else:
            raise ValueError("Data should be either a pandas dataframe or a dictionary")
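# A minimal usage sketch (not from the original source; assumes pandas
# imported as pd, as above): a dict becomes an R named list, a DataFrame an
# R data.frame that brms can consume.
r_list = _convert_python_to_R({'mu': 0.0, 'sigma': 1.0})
r_df = _convert_python_to_R(pd.DataFrame({'y': [1.2, 0.7, 1.9], 'x': [0, 1, 2]}))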
def create_variable_in_R(self, variable_name, value):
    if isinstance(value, (list, np.ndarray, pd.Series)):
        self._R.globalenv[variable_name] = RActor.python_type_to_R_type(value)
    elif isinstance(value, pd.DataFrame):
        indexes = value.index
        rownames = ''.join(["c('", "','".join(list(indexes)), "')"])
        value_dict = value.to_dict('list')
        for key in value_dict:
            value_dict[key] = RActor.python_type_to_R_type(value_dict[key])
        self._R.globalenv[variable_name] = DataFrame(value_dict)
        self._R.r(''.join(['rownames(', variable_name, ') <- ', rownames]))
    else:
        self._R.globalenv[variable_name] = value
def _comparisons_dataframe(self):
    # col = ('Label.1', 'Label.2', 'win1', 'win2')
    # data = zip(col, [*self.comparison_items, *self.comparison_wins])
    # return DataFrame(OrdDict([data]))
    column_comp1 = ('Label.1', FactorVector(self.comparison_items[0],
                                            levels=StrVector(self.items)))
    column_comp2 = ('Label.2', FactorVector(self.comparison_items[1],
                                            levels=StrVector(self.items)))
    column_win1 = ('win1', FloatVector(self.comparison_wins[0]))
    column_win2 = ('win2', FloatVector(self.comparison_wins[1]))
    return DataFrame(OrdDict([column_comp1, column_comp2,
                              column_win1, column_win2]))
def Run(self):
    if self.filetype != "anova" and self.filetype != "zinb":
        print("filetype not recognized: %s" % self.filetype)
        sys.exit(-1)

    headers = None
    data, hits = [], []
    n = -1  # number of conditions

    for line in open(self.infile):
        w = line.rstrip().split('\t')
        if line[0] == '#' or ('pval' in line and 'padj' in line):
            # check for 'pval' for backwards compatibility
            headers = w
            continue  # keep last comment line as headers
        # assume first non-comment line is header
        if n == -1:
            # ANOVA header line has names of conditions, organized as
            # 3+2*n+3 (2 groups (means, LFCs) X n conditions).
            # ZINB header line has names of conditions, organized as
            # 3+4*n+3 (4 groups X n conditions).
            if self.filetype == "anova":
                n = int((len(w) - 6) / 2)
            elif self.filetype == "zinb":
                n = int((len(headers) - 6) / 4)
            headers = headers[3:3 + n]
            headers = [x.replace("Mean_", "") for x in headers]
        else:
            lfcs = [float(x) for x in w[3 + n:3 + n + n]]  # take just the columns of LFCs
            qval = float(w[-2])
            data.append((w, lfcs, qval))

    data.sort(key=lambda x: x[-1])
    hits, LFCs = [], []
    for k, (w, lfcs, qval) in enumerate(data):
        if (self.topk == -1 and qval < self.qval) or \
           (self.topk != -1 and k < self.topk):
            hits.append(w)
            LFCs.append(lfcs)

    print("heatmap based on %s genes" % len(hits))
    genenames = ["%s/%s" % (w[0], w[1]) for w in hits]
    hash = {}
    headers = [h.replace("Mean_", "") for h in headers]
    for i, col in enumerate(headers):
        hash[col] = FloatVector([x[i] for x in LFCs])
    df = DataFrame(hash)
    heatmapFunc = self.make_heatmapFunc()
    heatmapFunc(df, StrVector(genenames), self.outfile)
def loadfiles(self):
    """ Load files into R environment """
    rcount = 0
    asmatrix = robjects.r['as.matrix']
    diag = robjects.r['diag']
    names = robjects.r['names']

    ## Set the default parameters for reading from csv
    param = {'header': True, 'as_is': True, 'row.names': ri.NULL}

    ## Override the defaults with any user-supplied parameters
    for p in param.keys():
        if p in self.param:
            if self.param[p] is not None:
                param[p] = self.param[p]

    for f, s in zip(self.filelist, self.seplist):
        try:
            dataf = DataFrame.from_csvfile(f, sep=str(s),
                                           header=param['header'],
                                           as_is=param['as_is'],
                                           row_names=param['row.names'])
            dataf = asmatrix(dataf)

            # Should the diagonal be set to 0?
            # Do it for all the inputs, just to be sure.
            zcount = 0
            for i in xrange(dataf.ncol):
                if (dataf.rx(i + 1, i + 1)[0] - 0.0 >= 1e-8):
                    zcount += 1
                    dataf.rx[i + 1, i + 1] = 0
            if zcount:
                self.e += f

            self.mylist.append(dataf)
            rcount += 1
        except IOError, e:
            self.error += e
        except RRuntimeError, e:
            self.error += e
def measure_design(self, design, response, fixed_factors):
    info("Measuring design of size " + str(len(design)))

    design_names = [str(n) for n in self.base.names(design)]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))
    info("Initial Factors: " + str(initial_factors))

    for line in range(len(design[0])):
        design_line = [int(v[0]) for v in design.rx(line + 1, True)]

        candidate = [0] * len(initial_factors)

        for k, v in fixed_factors.items():
            candidate[initial_factors.index(k)] = int(v)

        for i in range(len(design_names)):
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        info("Initial Design Line: " + str(design_line))
        info("Fixed Factors: " + str(fixed_factors))
        info("Testing candidate " + str(line + 1) + ": " + str(candidate))

        measurement = self.getPerfCosts([candidate])

        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(float('inf'))

    info("Measurements: " + str(measurements))

    design = self.base.cbind(
        design, DataFrame({response[0]: FloatVector(measurements)}))
    design = design.rx(self.base.is_finite(design.rx2(response[0])), True)

    info(str(design))
    return design
def plot_lrt_histograms(stats, output_file, **kw):
    gtr = []
    general = []
    gtrplusgamma = []
    for triad in stats:
        for r in stats[triad]:
            for o, l in zip((0, 2, 4), (general, gtr, gtrplusgamma)):
                p = lrt_p(r[1 + o]['ll'], r[o]['ll'], r[1 + o]['df'], r[o]['df'])
                l.append(p)

    intercepts = []
    for n, v in zip(('General', 'GTR+Gamma', 'GTR'), (general, gtrplusgamma, gtr)):
        i = sum(1 for p in v if p <= 0.05) / len(v)
        intercepts.append(str(np.round(i, 2)))
        print n, min(v), max(v), i, len(v)
    intercepts = 'c(' + ','.join(intercepts) + ')'

    n = len(gtr)
    globalenv['df'] = DataFrame({
        'Model': StrVector(['general'] * n + ['gtrplusgamma'] * n + ['gtr'] * n),
        'pvalue': FloatVector(general + gtrplusgamma + gtr)
    })
    cmd = 'gg <- ggplot(df, aes(pvalue, group=Model, linetype=Model)) + ' + \
          'stat_ecdf(geom="line") + ' + \
          'xlab("LRT p-value") + ylab("Empirical CDF") + ' + \
          'theme(legend.position = c(0.85, 0.15)) + ' + \
          'scale_x_continuous(breaks=c(0.05,seq(0.25,1,by=0.25)), limits=c(0,1)) + ' + \
          'scale_y_continuous(breaks=c(' + intercepts + \
          ', seq(0,1,by=0.25)), limits=c(0,1)) + ' + \
          'scale_linetype_discrete(labels=expression(General, GTR, GTR+Gamma))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
def is_strong_iv(df, idx):
    df = df.loc[df['index'].isin(idx)]
    Y = df["treated"]
    X0 = df["0"]
    X1 = df["1"]
    X2 = df["2"]
    X3 = df["3"]
    X4 = df["4"]
    X5 = df["5"]
    X6 = df["6"]
    X7 = df["7"]
    X8 = df["8"]
    X9 = df["9"]
    IV = df["iv"]
    formula = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + iv')
    formula2 = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9')
    dataf = DataFrame({'treated': robjects.IntVector(Y),
                       'x0': robjects.IntVector(X0),
                       'x1': robjects.IntVector(X1),
                       'x2': robjects.IntVector(X2),
                       'x3': robjects.IntVector(X3),
                       'x4': robjects.IntVector(X4),
                       'x5': robjects.IntVector(X5),
                       'x6': robjects.IntVector(X6),
                       'x7': robjects.IntVector(X7),
                       'x8': robjects.IntVector(X8),
                       'x9': robjects.IntVector(X9),
                       'iv': robjects.IntVector(IV)})
    #print(dataf)
    fit = robjects.r.lm(formula=formula, data=dataf)
    fit2 = robjects.r.lm(formula=formula2, data=dataf)
    r_frame = robjects.r.anova(fit, fit2)
    py_frame = pandas2ri.ri2py_dataframe(r_frame)
    # Rule of thumb: a first-stage F statistic of at least 10 indicates a
    # strong instrument.
    return py_frame.iloc[1, 4] >= 10
def measure_design(self, design):
    info("Measuring design of size " + str(len(design[0])))

    design_names = [str(n) for n in self.base.names(design)]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))

    for line in range(1, len(design[0]) + 1):
        if type(design.rx(line, True)[0]) is int:
            design_line = [v for v in design.rx(line, True)]
        else:
            design_line = [int(v[0]) for v in design.rx(line, True)]

        candidate = [0] * len(initial_factors)

        for k, v in self.model["fixed_factors"].items():
            candidate[initial_factors.index(k)] = int(v)

        for i in range(len(design_names)):
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        measurement = self.getPerfCosts([candidate])

        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(float('inf'))

    design = self.base.cbind(
        design, DataFrame({self.model["response"]: FloatVector(measurements)}))
    design = design.rx(
        self.base.is_finite(design.rx2(self.model["response"])), True)

    info("Complete design, with measurements:")
    info(str(design))
    return design
def total_concentration(df):
    Y = df["treated"]
    X0 = df["0"]
    X1 = df["1"]
    X2 = df["2"]
    X3 = df["3"]
    X4 = df["4"]
    X5 = df["5"]
    X6 = df["6"]
    X7 = df["7"]
    X8 = df["8"]
    X9 = df["9"]
    IV = df["iv"]
    formula = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + iv')
    formula2 = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9')
    dataf = DataFrame({'treated': robjects.IntVector(Y),
                       'x0': robjects.IntVector(X0),
                       'x1': robjects.IntVector(X1),
                       'x2': robjects.IntVector(X2),
                       'x3': robjects.IntVector(X3),
                       'x4': robjects.IntVector(X4),
                       'x5': robjects.IntVector(X5),
                       'x6': robjects.IntVector(X6),
                       'x7': robjects.IntVector(X7),
                       'x8': robjects.IntVector(X8),
                       'x9': robjects.IntVector(X9),
                       'iv': robjects.IntVector(IV)})
    fit = robjects.r.lm(formula=formula, data=dataf)
    fit2 = robjects.r.lm(formula=formula2, data=dataf)
    r_frame = robjects.r.anova(fit, fit2)
    py_frame = pandas2ri.ri2py_dataframe(r_frame)
    print("total concentration parameter: " + str(py_frame.iloc[1, 4]))
def fit(self, training_data, target):
    """
    :param training_data: a pandas dataframe.
    :param target: a string referring to the target variable to be predicted
    :return: an rpart model
    """
    self.outcome = target

    # Convert to the proper format for the R functions.
    train_data = DataFrame(training_data)
    formula = target + " ~ ."

    # TODO: hyperparameter incorporation
    # train the model
    self.model = rparty.rpart(
        formula=formula,
        data=train_data,
        method="class"
        # control = rparty.rpart_control(minsplit = ?, cp = ?)
    )
    return self.model
search_space_database = dataset.connect(
    "sqlite:///search_space_{0}.db".format(self.seed_space_size))

for experiment in search_space_database['experiments']:
    search_space.append(eval(experiment["value"]))

info("Starting DOPT-anova")

r_search_space = {}
for i in range(len(search_space[0])):
    r_row = [self.dim_uplimits[i] - 1, 0]
    for col in search_space:
        r_row.append(col[i])
    r_search_space[initial_factors[i]] = IntVector(r_row)

data = DataFrame(r_search_space)
data = data.rx(StrVector(initial_factors))

self.dopt_anova(initial_factors, initial_inverse_factors, data)

sys.exit()

# Unreachable after sys.exit(); kept from the original random-search path.
perf_cost, mean_perf_cost = self.MAXFLOAT, self.MAXFLOAT
params = self.coordToPerfParams(coord)

end_time = time.time()
# Elapsed time; the original subtracted in the wrong order (start - end).
search_time = end_time - start_time

speedup = float(eval_cost[0]) / float(best_perf_cost)
search_time = time.time() - start_time

info('----- end random search -----')
def run_zinb(self, data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap,
             conditions, covariates, interactions):
    """
        Runs Zinb for each gene across conditions and returns p and q values
        ([[Wigdata]], [Gene], [Number], [Number], {Rv: [SiteIndex]}, [Condition], [Covar], [Interaction]) -> Tuple([Number], [Number], [Status])
        Wigdata :: [Number]
        Gene :: {start, end, rv, gene, strand}
        SiteIndex: Integer
        Condition :: String
        Covar :: String
        Interaction :: String
        Status :: String
    """
    count = 0
    self.progress_range(len(genes))
    pvals, Rvs, status = [], [], []
    r_zinb_signif = self.def_r_zinb_signif()
    if (self.winz):
        self.transit_message("Winsorizing and running analysis...")

    self.transit_message("Condition: %s" % self.condition)

    comp1a = "1+cond"
    comp1b = "1+cond"

    # include cond in mod0 only if testing interactions
    comp0a = "1" if len(self.interactions) == 0 else "1+cond"
    comp0b = "1" if len(self.interactions) == 0 else "1+cond"
    for I in self.interactions:
        comp1a += "*" + I
        comp1b += "*" + I
        comp0a += "+" + I
        comp0b += "+" + I
    for C in self.covars:
        comp1a += "+" + C
        comp1b += "+" + C
        comp0a += "+" + C
        comp0b += "+" + C

    zinbMod1 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (comp1a, comp1b)
    zinbMod0 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (comp0a, comp0b)

    nbMod1 = "cnt~%s" % (comp1a)
    nbMod0 = "cnt~%s" % (comp0a)

    toRFloatOrStrVec = lambda xs: FloatVector(
        [float(x) for x in xs]) if self.is_number(xs[0]) else StrVector(xs)

    for gene in genes:
        count += 1
        Rv = gene["rv"]

        ## Single gene case for debugging
        if (GENE):
            Rv = None
            if GENE in RvSiteindexesMap:
                Rv = GENE
            else:
                for g in genes:
                    if (g['gene'] == GENE):
                        Rv = g["rv"]
                        break
            if not Rv:
                self.transit_error("Cannot find gene: {0}".format(GENE))
                sys.exit(0)

        if (DEBUG):
            self.transit_message(
                "======================================================================")
            self.transit_message(gene["rv"] + " " + gene["gene"])

        if (len(RvSiteindexesMap[Rv]) <= 1):
            status.append("TA sites <= 1, not analyzed")
            pvals.append(1)
        else:
            # For winsorization
            # norm_data = self.winsorize((map(
            #     lambda wigData: wigData[RvSiteindexesMap[Rv]], data))) if self.winz else list(map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
            norm_data = list(
                map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
            ([readCounts, condition, covarsData, interactionsData,
              NZmean, logitZPerc]) = self.melt_data(
                  norm_data, conditions, covariates, interactions,
                  NZMeanByRep, LogZPercByRep)

            if (numpy.sum(readCounts) == 0):
                status.append(
                    "pan-essential (no counts in all conditions) - not analyzed")
                pvals.append(1)
            else:
                df_args = {
                    'cnt': IntVector(readCounts),
                    'cond': toRFloatOrStrVec(condition),
                    'NZmean': FloatVector(NZmean),
                    'logitZperc': FloatVector(logitZPerc)
                }
                ## Add columns for covariates and interactions if they exist.
                df_args.update(list(map(
                    lambda t_ic: (t_ic[1], toRFloatOrStrVec(covarsData[t_ic[0]])),
                    enumerate(self.covars))))
                df_args.update(list(map(
                    lambda t_ic: (t_ic[1], toRFloatOrStrVec(interactionsData[t_ic[0]])),
                    enumerate(self.interactions))))

                melted = DataFrame(df_args)
                # r_args = [IntVector(readCounts), StrVector(condition), melted, map(lambda x: StrVector(x), covars), FloatVector(NZmean), FloatVector(logitZPerc)] + [True]
                debugFlag = True if DEBUG or GENE else False
                pval, msg = r_zinb_signif(melted, zinbMod1, zinbMod0,
                                          nbMod1, nbMod0, debugFlag)
                status.append(msg)
                pvals.append(float(pval))

        if (DEBUG or GENE):
            self.transit_message(
                "Pval for Gene {0}: {1}, status: {2}".format(
                    Rv, pvals[-1], status[-1]))
        if (GENE):
            self.transit_message("Ran for single gene. Exiting...")
            sys.exit(0)

        Rvs.append(Rv)

        # Update progress
        text = "Running ZINB Method... %5.1f%%" % (100.0 * count / len(genes))
        self.progress_update(text, count)

    pvals = numpy.array(pvals)
    mask = numpy.isfinite(pvals)
    qvals = numpy.full(pvals.shape, numpy.nan)
    qvals[mask] = statsmodels.stats.multitest.fdrcorrection(pvals)[1]  # BH, alpha=0.05

    p, q, statusMap = {}, {}, {}
    for i, rv in enumerate(Rvs):
        p[rv], q[rv], statusMap[rv] = pvals[i], qvals[i], status[i]
    return (p, q, statusMap)
def fit(
    latlon: NDArray[(2, Any), float],
    z: NDArray[(Any, ), float],
    nx: int,
    ny: int,
    extrap: bool,
) -> Tuple[NDArray[(Any, Any), float],
           NDArray[(Any, ), float],
           NDArray[(Any, ), float]]:
    """Encapsulates the functionality of R's spatialProcess in a Python function.
    Args:
        latlon: grid of pairwise coordinates of observations
        z: observations
        nx: number of grid cells on interpolated grid x
        ny: number of grid cells on interpolated grid y
        extrap: whether to extrapolate the fitted surface beyond the observations
        distance: distance metric to use (note: only 'geo' supported currently)
        variogram_model: choice of variogram model (note: only 'exponential'
            supported)
    Returns:
        z: kriged field
        x, y: locations of kriged data
    """
    if not isinstance(latlon, NDArray[(2, Any), float]):
        raise TypeError(
            f"Incorrect grid shape, size, or dtype. Must be {NDArray[(2, Any), float]}"
        )
    if not isinstance(z, NDArray[(Any, ), float]):
        raise TypeError(
            f"Incorrect grid shape, size, or dtype. Must be {NDArray[(Any, ), float]}"
        )
    if not isinstance(nx, int) or not isinstance(ny, int):
        raise TypeError("Provide integer grid size")
    if latlon.shape[1] != z.size:
        raise ValueError("Different number of grid coordinates than observations")

    # convert regular numeric data
    latlon, z = latlon.tolist(), z.tolist()

    # convert latlon list into two R FloatVectors:
    # list of FloatVector -> OrderedDict -> R DataFrame -> numeric R data matrix
    r_lists = list(map(FloatVector, latlon))
    coords = OrderedDict(zip(map(str, range(len(r_lists))), r_lists))
    r_dataFrame = DataFrame(coords)
    r_latlon = robjects.r["data.matrix"](r_dataFrame)

    # convert observations
    r_z = FloatVector(z)

    # use separate simple r-script in path below
    rstring = resource_string("climpyrical",
                              "tests/data/spatial_process_r.R").decode("utf-8")
    rfunc = robjects.r(rstring)
    r_surface = rfunc(r_latlon, r_z, nx, ny, extrap)

    # extract data from R's interpolation
    surface_dict = dict(zip(r_surface.names, list(r_surface)))
    # z = np.array(list(r_surface[1]))
    z = np.array(surface_dict["z"]).reshape(nx, ny)
    x = np.array(surface_dict["x"])
    y = np.array(surface_dict["y"])
    # cov = dict(zip(surface_dict["cov"].names, list(surface_dict["cov"])))
    # cov = surface_dict["cov"]

    return z, x, y
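# A minimal usage sketch (not from the original source; assumes the nptyping
# NDArray types used in the signature and the spatial_process_r.R resource
# shipped with climpyrical). Five observation points kriged onto a 10x10 grid,
# no extrapolation.
latlon_obs = np.array([[49.1, 49.5, 50.0, 50.2, 49.8],
                       [-123.1, -122.8, -123.3, -122.5, -123.0]])
z_obs = np.array([1.0, 1.4, 0.9, 1.2, 1.1])
z_grid, x_grid, y_grid = fit(latlon_obs, z_obs, nx=10, ny=10, extrap=False)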
try:
    utils = importr('dplyr')
    utils = importr('gravity')
except Exception as exception:
    print(exception)
    import rpy2.robjects.packages as rpackages
    # Install the missing packages first; importing 'gravity' at this point
    # would fail again before installation, so only CRAN's 'utils' helpers
    # are loaded here.
    utils = rpackages.importr('utils')
    utils.chooseCRANmirror(ind=1)
    utils.install_packages('dplyr')
    utils.install_packages('gravity')
finally:
    from rpy2.robjects.packages import importr
    utils = importr('dplyr')
    utils = importr('gravity')

dataf = DataFrame({'label': IntVector(tuple(labels)),
                   'distance': FloatVector(tuple(data.T[0]))})
fit = R('function(x) ppml(dependent_variable="label", distance="distance", '
        'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)
#print(fit)

# Deviance is -2.*log_likelihood
altDeviance = list(fit[9].items())[0][1]
nullDeviance = list(fit[11].items())[0][1]
p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

print('Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.')
print('Implemented in R in "gravity: Estimation Methods for Gravity Models" at:')
print('https://rdrr.io/cran/gravity/man/ppml.html')
print('Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
      'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
      'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
      'p-value:\t', '%.1e' % p_value, '\n')
# R fitdistr for Beta distribution: which starting parameters?
from rpy2.robjects import DataFrame
from rpy2.robjects.vectors import FloatVector

# Wrap the scalars in length-1 vectors so the DataFrame constructor accepts
# them; the stray closing parenthesis in the original call has been dropped.
starter = DataFrame({'shape1': FloatVector([0.5]), 'shape2': FloatVector([0.5])})
x = MASS.fitdistr(myValues, "beta", start=starter)
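# A hedged alternative sketch: R's fitdistr() conventionally takes `start` as
# a named list rather than a one-row data.frame, so an rpy2 ListVector may
# match the R API more closely. MASS and myValues are assumed from the snippet
# above.
from rpy2.robjects.vectors import ListVector

starter_list = ListVector({'shape1': 0.5, 'shape2': 0.5})
fit_beta = MASS.fitdistr(myValues, "beta", start=starter_list)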