Example #1
    def get_federov_data(self, factors):
        low_level_limits  = IntVector([self.parameter_ranges[f][0] for f in factors])
        high_level_limits = IntVector([self.parameter_ranges[f][1] - 1 for f in factors])
        factor_centers    = IntVector([0 for f in factors])
        factor_levels     = IntVector([self.parameter_ranges[f][1] for f in factors])
        factor_round      = IntVector([0 for f in factors])
        is_factor         = BoolVector([False for f in factors])
        mix               = BoolVector([False for f in factors])

        opt_federov_data = {
                             "var": StrVector(factors),
                             "low": low_level_limits,
                             "high": high_level_limits,
                             "center": factor_centers,
                             "nLevels": factor_levels,
                             "round": factor_round,
                             "factor": is_factor,
                             "mix": mix
                           }

        opt_federov_dataframe = DataFrame(opt_federov_data)
        opt_federov_dataframe = opt_federov_dataframe.rx(StrVector(["var",
                                                                   "low",
                                                                   "high",
                                                                   "center",
                                                                   "nLevels",
                                                                   "round",
                                                                   "factor",
                                                                   "mix"]))
        return opt_federov_dataframe
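The same build-then-reorder pattern works standalone; a minimal sketch with illustrative column names and values, where rx() pins down the column order R sees:

from rpy2.robjects import DataFrame, IntVector, StrVector

# Typed R vectors become the columns; rx() then selects/orders them by name.
df = DataFrame({"var": StrVector(["x", "y"]),
                "low": IntVector([0, 0]),
                "high": IntVector([7, 15])})
df = df.rx(StrVector(["var", "low", "high"]))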
Example #2
    def generate_valid_sample(self, sample_size):
        search_space_dataframe = {}

        for n in self.axis_names:
            search_space_dataframe[n] = []

        search_space = {}
        evaluated = 0

        info(
            "Generating valid search space of size {0} (does not spend evaluations)"
            .format(sample_size))

        while len(search_space) < sample_size:
            candidate_point = self.getRandomCoord()
            candidate_point_key = str(candidate_point)
            evaluated += 1

            if candidate_point_key not in search_space:
                perf_params = self.coordToPerfParams(candidate_point)

                is_valid = eval(self.constraint, copy.copy(perf_params),
                                dict(self.input_params))

                if is_valid:
                    search_space[candidate_point_key] = candidate_point

                    for n in perf_params:
                        candidate_value = self.parameter_values[n].index(
                            perf_params[n])
                        search_space_dataframe[n].append(candidate_value)

                    # report progress roughly every 10%; guard against sample_size < 10
                    if len(search_space) % max(1, sample_size // 10) == 0:
                        info("Valid coordinates: " + str(len(search_space)) +
                             "/" + str(sample_size))
                        info("Tested coordinates: " + str(evaluated))

                if evaluated % 1000000 == 0:
                    info("Tested coordinates: " + str(evaluated))

        info("Valid/Tested configurations: " + str(len(search_space)) + "/" +
             str(evaluated))

        for k in search_space_dataframe:
            search_space_dataframe[k] = IntVector(search_space_dataframe[k])

        search_space_dataframe_r = DataFrame(search_space_dataframe)
        search_space_dataframe_r = search_space_dataframe_r.rx(
            StrVector(self.axis_names))

        info("Generated Search Space:")
        info(str(self.base.summary_default(search_space_dataframe_r)))

        coded_search_space_dataframe_r = self.encode_data(
            search_space_dataframe_r)

        return coded_search_space_dataframe_r
Example #3
    def measure_design(self, design, encoded_design, step_number):
        info("Measuring design of size " + str(len(design[0])))

        design_names    = [str(n) for n in self.base.names(design)]
        initial_factors = self.params["axis_names"]
        measurements    = []

        info("Current Design Names: " + str(design_names))

        for line in range(1, len(design[0]) + 1):
            if isinstance(design.rx(line, True)[0], int):
                design_line = [v for v in design.rx(line, True)]
            else:
                design_line = [int(round(float(v[0]))) for v in design.rx(line, True)]

            candidate = [0] * len(initial_factors)

            for k, v in self.model["fixed_factors"].items():
                candidate[initial_factors.index(k)] = int(round(float(v)))

            for i in range(len(design_names)):
                candidate[initial_factors.index(design_names[i])] = design_line[i]

            measurement = self.getPerfCosts([candidate])
            if measurement != {}:
                measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
            else:
                measurements.append(robjects.NA_Real)

        design = self.base.cbind(design, DataFrame({self.model["response"]: FloatVector(measurements)}))
        encoded_design = self.base.cbind(encoded_design, DataFrame({self.model["response"]: FloatVector(measurements)}))

        info("Complete design, with measurements:")
        info(str(design))

        design = design.rx(self.stats.complete_cases(design), True)
        design = design.rx(self.base.is_finite(self.base.rowSums(design)), True)

        encoded_design = encoded_design.rx(self.stats.complete_cases(encoded_design), True)
        encoded_design = encoded_design.rx(self.base.is_finite(self.base.rowSums(encoded_design)), True)

        info("Clean design, with measurements:")
        info(str(design))

        info("Clean encoded design, with measurements:")
        info(str(encoded_design))

        self.utils.write_csv(encoded_design, "design_step_{0}.csv".format(step_number))
        self.utils.write_csv(design, "decoded_design_step_{0}.csv".format(step_number))

        if self.complete_design_data is None:
            self.complete_design_data = design
        else:
            self.complete_design_data = self.dplyr.bind_rows(self.complete_design_data, design)

        return design
Example #4
def getNonParametricPValue(labels, values, random_seed=0, printResults=True):
    '''Markers localization p-value calculation:
    Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.
    Implemented in R in "gravity: Estimation Methods for Gravity Models" at:
    https://rdrr.io/cran/gravity/man/ppml.html
    '''

    np.random.seed(random_seed)
    #np.random.shuffle(labels)

    dataf = DataFrame({
        'label': IntVector(tuple(labels)),
        'distance': FloatVector(tuple(values))
    })
    fit = R(
        'function(x) ppml(dependent_variable="label", distance="distance", additional_regressors=NULL, robust=TRUE, data=x)'
    )(dataf)

    # Deviance is -2.*log_likelihood
    altDeviance = list(fit[9].items())[0][1]
    nullDeviance = list(fit[11].items())[0][1]
    p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

    if printResults:
        print(
            'Non-parametric method:',
            '\n\t',
            #'  Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
            #'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
            #'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
            'p-value:\t',
            '%.1e' % p_value,
            '\n')

    return p_value
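A hypothetical call, with made-up labels and distances just to show the expected input shapes (it assumes the gravity R package is installed):

# labels: binary group membership; values: matching distances (illustrative data)
p = getNonParametricPValue([0, 1, 0, 1, 1, 0], [1.2, 3.4, 0.9, 2.8, 3.1, 1.0])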
Example #5
def get_hourly_ffmc_on_diurnal_curve(ffmc_solar_noon: float,
                                     target_hour: float, temperature: float,
                                     relative_humidity: float,
                                     wind_speed: float, precip: float):
    """ Computes hourly FFMC based on noon FFMC using diurnal curve for approximation.
    Delegates the calculation to cffdrs R package.

    ffmc_solar_noon is the forecasted or actual FFMC value for solar noon of the date in question.
    target_hour is the hour of the day (on 24 hour clock) for which hourly FFMC should be calculated
    the weather variables (temperature, rh, wind_speed, precip) is the forecasted or actual weather
    values for solar noon.

    # Args: weatherstream:   Input weather stream data.frame which includes
    #                        temperature, relative humidity, wind speed,
    #                        precipitation, hourly value, and bui. More specific
    #                        info can be found in the hffmc.Rd help file.
    #            ffmc_old:   ffmc from previous timestep
    #           time.step:   The time (hours) between previous FFMC and current
    #                        time.
    #           calc.step:   Whether time step between 2 obs is calculated
    #                        (optional)
    #               batch:   Single step or iterative (default=TRUE)
    #           hourlyFWI:   Can calculate hourly ISI & FWI as well
    #                        (TRUE/FALSE, default=FALSE)
    #
    # Returns: A single or multiple hourly ffmc value(s)
    #
    # From hffmc.Rd:
    # {weatherstream}{
    # A dataframe containing input variables of hourly weather observations.
    # It is important that variable names have to be the same as in the following list, but they
    # are case insensitive. The order in which the input variables are entered is not important.
    #
    #     temp (required)  Temperature (centigrade)
    #     rh   (required)  Relative humidity (%)
    #     ws   (required)  10-m height wind speed (km/h)
    #     prec (required)  1-hour rainfall (mm)
    #     hr   (optional)  Hourly value to calculate sub-hourly ffmc
    #     bui  (optional)  Daily BUI value for the computation of hourly FWI. It is
    # required when hourlyFWI=TRUE
    """
    time_offset = target_hour - 13  # solar noon
    # build weather_data dictionary to be passed as weatherstream
    weather_data = {
        'hr': 13.0,
        'temp': temperature,
        'rh': relative_humidity,
        'ws': wind_speed,
        # The precip we receive covers the previous 24 hours, but the R function
        # expects 1-hour rainfall. We don't have hourly data, so the best we can
        # do is take the mean precip over the past 24 hours. This is a liberal
        # approximation with a lot of hand-waving.
        'prec': precip / 24
    }
    weather_data = DataFrame(weather_data)
    # pylint: disable=protected-access, no-member
    result = CFFDRS.instance().cffdrs.hffmc(weatherstream=weather_data,
                                            ffmc_old=ffmc_solar_noon,
                                            time_step=time_offset)
    return result[0]
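A hypothetical call with illustrative solar-noon weather values (units per the docstring: degrees Celsius, %, km/h, and mm over the previous 24 hours):

hourly_ffmc = get_hourly_ffmc_on_diurnal_curve(ffmc_solar_noon=85.0,
                                               target_hour=17.0,
                                               temperature=22.5,
                                               relative_humidity=40.0,
                                               wind_speed=10.0,
                                               precip=0.5)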
Example #6
def qcrop2(xlist, ylist, labels=None, nq=4.):
    if labels is None:
        labels = [str(i) for i in range(len(xlist))]
    x = []
    y = []
    xcrop = []
    ycrop = []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        cropx, cropy = zip(*[(nan, nan)
                             if vy > ymax or vy < ymin or vx < xmin or vx > xmax
                             else (vx, vy)
                             for vx, vy in zip(onex, oney)])
        xcrop += cropx
        ycrop += cropy
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)

    df = DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet), levels=StrVector(labels))
    })
    return df
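qlim1 is not shown in this example; a plausible stand-in, consistent with how qcrop2 uses its (min, max) return value, could clip at the median plus or minus nq interquartile ranges. This is a hypothetical helper, not the original code:

import numpy as np

def qlim1(values, nq=4.):
    # Hypothetical: limits at median +/- nq * IQR.
    q1, q2, q3 = np.percentile(np.asarray(values, dtype=float), [25, 50, 75])
    return q2 - nq * (q3 - q1), q2 + nq * (q3 - q1)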
Example #7
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    x = []
    y = []
    ya = []
    for triad in stats:
        for r in stats[triad]:
            paralinear_dists = get_paralinear_distances(r[0]['gene'], **kw)
            ns_EN = sum(r[0]['EN'][t] for t in pair)
            s_EN = sum(r[1]['EN'][t] for t in pair)
            para = paralinear_dists[pair]
            if para:
                x.append(ns_EN)
                y.append(para)
                ya.append(s_EN)
    
    print 'paralinear stats'
    print_stats(x, y)
    print 'GTR stats'
    print_stats(x, ya)
  
    df = DataFrame({'x':FloatVector(x), 'y':FloatVector(y)})
    globalenv['df'] = df
    cmd = 'gg <- ggplot(df, aes(x, y)) + geom_point(alpha=0.2) + ' + \
            'geom_abline(intercept=0, slope=1, color="white") + ' + \
            'xlab(bquote(.("'+' to '.join(pair)+'") ~ d[ENS])) + ' + \
            'ylab(bquote(.("'+' to '.join(pair)+'") ~ d[para])) + ' + \
            'coord_cartesian(xlim=c(0,1), ylim=c(0,1))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
    return
Example #8
def call_r(df):
    '''
	Arguments:
	df: A string replicating a CSV file. The observations for the dependent
		variable MUST be in the FIRST COLUMN

	Returns: an rpy2 Robject float vector which stores the coefficients of the
	linear regression
	'''
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
    from io import StringIO
    from rpy2.robjects import DataFrame
    from rpy2.robjects import FloatVector
    import rpy2.rinterface as ri

    ri.initr()
    file_like_obj = StringIO(df)
    constructor_dict = parser(file_like_obj)
    rpy2_dataframe = DataFrame(constructor_dict)
    with open(r'regression_app\linear_modeler_function.R') as f:
        r_source = f.read()
    mod = SignatureTranslatedAnonymousPackage(r_source, 'mod')
    a = mod.linear_modeler(rpy2_dataframe)
    del mod
    return a
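A hypothetical call; it assumes the undefined parser() turns the file-like object into a dict of column vectors, and that linear_modeler is defined in the R script:

# The dependent variable must be in the first column, per the docstring.
csv_text = "y,x1,x2\n1.0,2.0,3.0\n2.1,2.5,3.5\n2.9,3.1,4.2\n"
coefficients = call_r(csv_text)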
Example #9
    def Run(self):

        self.transit_message("Starting Corrplot")
        start_time = time.time()

        # assume first non-comment line is header; samples are
        headers = None
        data, means = [], []

        if self.filetype == "gene_means":
            for line in open(self.gene_means):
                w = line.rstrip().split('\t')
                if line[0] == '#':
                    headers = w[3:]
                    continue  # last comment line has names of samples
                data.append(w)
                cnts = [float(x) for x in w[3:]]
                means.append(cnts)
        elif self.filetype == "anova" or self.filetype == "zinb":
            n = -1  # number of conditions
            for line in open(self.gene_means):
                w = line.rstrip().split('\t')
                if line[0] == '#' or (
                        'pval' in line and 'padj' in line
                ):  # check for 'pval' for backwards compatibility
                    headers = w
                    continue  # keep last comment line as headers
                if n == -1:
                    # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
                    # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
                    if self.filetype == "anova": n = int((len(w) - 6) / 2)
                    elif self.filetype == "zinb":
                        n = int((len(headers) - 6) / 4)
                    headers = headers[3:3 + n]
                    headers = [x.replace("Mean_", "") for x in headers]
                vals = [float(x)
                        for x in w[3:3 + n]]  # take just the columns of means
                qval = float(w[-2])
                if qval < 0.05:
                    data.append(w)
                    means.append(vals)
        else:
            print("filetype not recognized: %s" % self.filetype)
            sys.exit(-1)
        print("correlations based on %s genes" % len(means))

        genenames = ["%s/%s" % (w[0], w[1]) for w in data]
        col_data = {}
        headers = [h.replace("Mean_", "") for h in headers]
        for i, col in enumerate(headers):
            col_data[col] = FloatVector([x[i] for x in means])
        df = DataFrame(col_data)  # can't figure out how to set rownames

        corrplotFunc = self.make_corrplotFunc()
        corrplotFunc(
            df, StrVector(headers), StrVector(genenames), self.outfile
        )  # pass headers to put cols in order, since df comes from dict

        self.finish()
        self.transit_message("Finished Corrplot")
Example #10
def _align_var(breaks_r, pop_col, n, verbose=False):
    prev_b = -1
    i = 1
    align = dict()
    _vector = list()
    align_t = [1]
    for e, b in enumerate(breaks_r):
        if prev_b + 1 == b and b < n + 1:
            try:
                assert min(align_t) != max(align_t)
                # align[pop_col + '.' + str(i)] = IntVector(
                align[pop_col] = IntVector((min(align_t), max(align_t)))
                # _vector.extend([min(align_t), max(align_t)])
            except AssertionError:
                if verbose:
                    print("can't align {} at {} for {}".format(align_t, e, b))
            i += 1
            align_t = [e + 2]
        else:
            align_t.append(e + 2)
        prev_b = b
    if len(align) == 0:
        align[pop_col] = IntVector((1, len(breaks_r)))
    # else:
    # align[pop_col] = IntVector(_vector)
    align_r = DataFrame(align)
    return align_r
Example #11
def qlim(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    rq = quantreg.rq('y ~ x', df, tau=FloatVector((0.25, 0.5, 0.75)))
    print rq.rx2('coefficients')
    fv = array(rq.rx2('fitted.values'))
    return min(fv[:,1]) - 2*max(fv[:,1] - fv[:,0]), \
            2*max(fv[:,2] - fv[:,1]) + max(fv[:,1])
Example #12
def through_the_origin(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    s = r.summary(r.lm('y ~ 0 + x', df))
    return {
        'coefficient': s.rx2('coefficients')[0],
        'stderr': s.rx2('coefficients')[1],
        'r.squared': s.rx2('r.squared')[0]
    }
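A quick illustrative call, fitting y = b*x with no intercept (it relies on the module-level r object from rpy2.robjects that the function uses):

result = through_the_origin([1.0, 2.0, 3.0], [2.1, 3.9, 6.2])
print(result['coefficient'], result['stderr'], result['r.squared'])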
Example #13
def plot_bar(stats, output_file=None, **kw):
    names = [r['name'] for r in stats.values()[0][0]]
    with_rates = [r['with_rate'] for r in stats.values()[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]

    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)

    for d in by_dir:
        by_dir[d] = zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]])

    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print dataset
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            print names[j], sum(1 for _g in g if _g > 0.05) / len(g)
            alpha = max(alpha, get_alpha(g))
        print 'Samples', len(g)
    labels = 'expression(' + ','.join(names) + ')'

    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    #            'geom_jitter(alpha=0.2, size=1) + ' + \
    #            'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    #            'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    #            'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    #            'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
            'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
            'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",'+str(alpha)+')) + ' + \
            'scale_x_discrete(labels=' + labels + ') + ' + \
            'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
            'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
Example #14
    def python_to_r_object(cls, item):
        """ Convert a Python object into the corresponding R object (class method).

        :param item: Python object; convertible types are list, tuple, pd.Series, np.ndarray, pd.DataFrame
        :return: R object
        """
        numpy2ri.activate()
        if isinstance(item, (list, tuple, pd.Series)):
            return np.array(item)
        elif isinstance(item, pd.DataFrame):
            data_dict = {col_name: np.array(item[col_name]) for col_name in item.columns}
            rdataframe = DataFrame(data_dict)
            rdataframe.rownames = np.array(item.index)
            return rdataframe
        elif isinstance(item, (np.ndarray, bool, int, float, str)):
            return item
        else:
            raise TypeError('Unsupported type: {}'.format(type(item)))
Example #15
def tupls2RDataframe(data, titles):
    cols = [[] for _ in titles]
    for datum in data:
        for i, e in enumerate(datum):
            cols[i].append(e)
    col_d = {}
    for i, t in enumerate(titles):
        col_d[t] = StrVector(tuple(cols[i]))
        col_d[t] = FactorVector(col_d[t])
    dataf = DataFrame(col_d)
    return dataf
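An illustrative call; every column comes back as an R factor:

dataf = tupls2RDataframe([("a", "x"), ("b", "y"), ("a", "y")],
                         ["first", "second"])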
Example #16
def _convert_python_to_R(data: typing.Union[dict, pd.DataFrame]):
    """
    Converts a python object to an R object brms can handle:
    * python dict      ->   R list
    * python dataframe ->   R dataframe
    """
    with localconverter(default_converter + pandas2ri.converter + numpy2ri.converter) as cv:
        if isinstance(data, pd.DataFrame):
            return DataFrame(data)
        elif isinstance(data, dict):
            return ListVector(data)
        else:
            raise ValueError("Data should be either a pandas dataframe or a dictionary")
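Illustrative conversions (key and column names are made up):

import pandas as pd

r_df   = _convert_python_to_R(pd.DataFrame({"y": [1.0, 2.0]}))
r_list = _convert_python_to_R({"n": 10, "mu": 0.5})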
Example #17
    def create_variable_in_R(self, variable_name, value):
        if isinstance(value, (list, np.ndarray, pd.Series)):
            self._R.globalenv[variable_name] = RActor.python_type_to_R_type(value)
        elif isinstance(value, pd.DataFrame):
            indexes = value.index
            rownames = ''.join(["c('", "','".join(list(indexes)), "')"])
            value_dict = value.to_dict('list')
            for key in value_dict:
                value_dict[key] = RActor.python_type_to_R_type(value_dict[key])
            self._R.globalenv[variable_name] = DataFrame(value_dict)
            self._R.r(''.join(['rownames(', variable_name, ') <- ', rownames]))
        else:
            self._R.globalenv[variable_name] = value
Example #19
    def _comparisons_dataframe(self):
        # col = ('Label.1', 'Label.2', 'win1', 'win2')
        # data = zip(col, [*self.comparison_items, *self.comparison_wins])
        # return DataFrame(OrdDict([data]))
        column_comp1 = ('Label.1',
                        FactorVector(self.comparison_items[0],
                                     levels=StrVector(self.items)))
        column_comp2 = ('Label.2',
                        FactorVector(self.comparison_items[1],
                                     levels=StrVector(self.items)))
        column_win1 = ('win1', FloatVector(self.comparison_wins[0]))
        column_win2 = ('win2', FloatVector(self.comparison_wins[1]))
        return DataFrame(
            OrdDict([column_comp1, column_comp2, column_win1, column_win2]))
Example #20
    def Run(self):

        if self.filetype != "anova" and self.filetype != "zinb":
            print("filetype not recognized: %s" % self.filetype)
            sys.exit(-1)

        headers = None
        data, hits = [], []
        n = -1  # number of conditions

        for line in open(self.infile):
            w = line.rstrip().split('\t')
            if line[0] == '#' or (
                    'pval' in line and 'padj'
                    in line):  # check for 'pval' for backwards compatibility
                headers = w
                continue  # keep last comment line as headers
            # assume first non-comment line is header
            if n == -1:
                # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
                # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
                if self.filetype == "anova": n = int((len(w) - 6) / 2)
                elif self.filetype == "zinb": n = int((len(headers) - 6) / 4)
                headers = headers[3:3 + n]
                headers = [x.replace("Mean_", "") for x in headers]
            else:
                lfcs = [float(x) for x in w[3 + n:3 + n + n]]  # take just the columns of LFCs
                qval = float(w[-2])
                data.append((w, lfcs, qval))

        data.sort(key=lambda x: x[-1])
        hits, LFCs = [], []
        for k, (w, lfcs, qval) in enumerate(data):
            if (self.topk == -1 and qval < self.qval) or (self.topk != -1
                                                          and k < self.topk):
                hits.append(w)
                LFCs.append(lfcs)

        print("heatmap based on %s genes" % len(hits))
        genenames = ["%s/%s" % (w[0], w[1]) for w in hits]
        col_data = {}
        headers = [h.replace("Mean_", "") for h in headers]
        for i, col in enumerate(headers):
            col_data[col] = FloatVector([x[i] for x in LFCs])
        df = DataFrame(col_data)
        heatmapFunc = self.make_heatmapFunc()
        heatmapFunc(df, StrVector(genenames), self.outfile)
Example #21
    def loadfiles(self):
        """
        Load files into R environment
        """
        rcount = 0
        asmatrix = robjects.r['as.matrix']
        diag = robjects.r['diag']
        names = robjects.r['names']
        
        ## Set the default parameter for reading from csv
        param = {'header': True, 'as_is': True, 'row.names': ri.NULL}
        
        ## Check the correct parameter and set the default        
        for p in param.keys():
            if p in self.param:
                if self.param[p] is not None:
                    param[p] = self.param[p]

        for f, s in zip(self.filelist, self.seplist):
            try:
                dataf = DataFrame.from_csvfile(f,
                                               sep=str(s),
                                               header=param['header'],
                                               as_is=param['as_is'],
                                               row_names=param['row.names'])

                dataf = asmatrix(dataf)

                # Should the diagonal be set to 0?
                # Do it for all the inputs, just to be sure
                zcount = 0
                for i in xrange(dataf.ncol):
                    if (dataf.rx(i+1,i+1)[0] - 0.0 >= 1e-8):
                        zcount += 1
                        dataf.rx[i+1,i+1] = 0

                if zcount:
                    self.e += f
                    
                self.mylist.append(dataf)
                rcount += 1
            except IOError, e:
                self.error += e
            
            except RRuntimeError, e:
                self.error += e
Example #22
    def measure_design(self, design, response, fixed_factors):
        info("Measuring design of size " + str(len(design)))

        design_names = [str(n) for n in self.base.names(design)]
        initial_factors = self.params["axis_names"]
        measurements = []

        info("Current Design Names: " + str(design_names))
        info("Initial Factors: " + str(initial_factors))

        for line in range(len(design[0])):
            design_line = [int(v[0]) for v in design.rx(line + 1, True)]

            candidate = [0] * len(initial_factors)

            for k, v in fixed_factors.items():
                candidate[initial_factors.index(k)] = int(v)

            for i in range(len(design_names)):
                candidate[initial_factors.index(design_names[i])] = design_line[i]

            info("Initial Design Line: " + str(design_line))
            info("Fixed Factors: " + str(fixed_factors))
            info("Testing candidate " + str(line + 1) + ": " + str(candidate))

            measurement = self.getPerfCosts([candidate])
            if measurement != {}:
                measurements.append(
                    float(numpy.mean(measurement[str(candidate)][0])))
            else:
                measurements.append(float('inf'))

        info("Measurements: " + str(measurements))

        design = self.base.cbind(
            design, DataFrame({response[0]: FloatVector(measurements)}))

        design = design.rx(self.base.is_finite(design.rx2(response[0])), True)

        info(str(design))

        return design
Example #23
def plot_lrt_histograms(stats, output_file, **kw):
    gtr = []
    general = []
    gtrplusgamma = []
    for triad in stats:
        for r in stats[triad]:
            for o, l in zip((0, 2, 4), (general, gtr, gtrplusgamma)):
                p = lrt_p(r[1 + o]['ll'], r[o]['ll'], r[1 + o]['df'],
                          r[o]['df'])
                l.append(p)

    intercepts = []
    for n, v in zip(('General', 'GTR+Gamma', 'GTR'),
                    (general, gtrplusgamma, gtr)):
        i = sum(1 for p in v if p <= 0.05) / len(v)
        intercepts.append(str(np.round(i, 2)))
        print n, min(v), max(v), i, len(v)
    intercepts = 'c(' + ','.join(intercepts) + ')'

    n = len(gtr)
    globalenv['df'] = DataFrame({
        'Model':
        StrVector(['general'] * n + ['gtrplusgamma'] * n + ['gtr'] * n),
        'pvalue':
        FloatVector(general + gtrplusgamma + gtr)
    })
    cmd = 'gg <- ggplot(df, aes(pvalue, group=Model, linetype=Model)) + ' + \
        'stat_ecdf(geom="line") + ' + \
        'xlab("LRT p-value") + ylab("Empirical CDF") + ' + \
        'theme(legend.position = c(0.85, 0.15)) + ' + \
        'scale_x_continuous(breaks=c(0.05,seq(0.25,1,by=0.25)), limits=c(0,1))+'+\
        'scale_y_continuous(breaks=c(' + intercepts + \
        ', seq(0,1,by=0.25)), limits=c(0,1)) + ' + \
        'scale_linetype_discrete(labels=expression(General, GTR, GTR+Gamma))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
Example #24
def is_strong_iv(df, idx):
    df = df.loc[df['index'].isin(idx)]
    Y = df["treated"]
    X0 = df["0"]
    X1 = df["1"]
    X2 = df["2"]
    X3 = df["3"]
    X4 = df["4"]
    X5 = df["5"]
    X6 = df["6"]
    X7 = df["7"]
    X8 = df["8"]
    X9 = df["9"]
    IV = df["iv"]
    formula = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + iv')
    formula2 = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9')
    dataf = DataFrame({'treated': robjects.IntVector(Y),
                       'x0': robjects.IntVector(X0),
                       'x1': robjects.IntVector(X1),
                       'x2': robjects.IntVector(X2),
                       'x3': robjects.IntVector(X3),
                       'x4': robjects.IntVector(X4),
                       'x5': robjects.IntVector(X5),
                       'x6': robjects.IntVector(X6),
                       'x7': robjects.IntVector(X7),
                       'x8': robjects.IntVector(X8),
                       'x9': robjects.IntVector(X9),
                       'iv': robjects.IntVector(IV)})

    #print(dataf)
    fit = robjects.r.lm(formula=formula, data=dataf)
    fit2 = robjects.r.lm(formula=formula2, data=dataf)
    r_frame = robjects.r.anova(fit, fit2)
    py_frame = pandas2ri.ri2py_dataframe(r_frame)

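    # An F statistic of at least 10 is the usual rule of thumb for a strong instrument.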
    return py_frame.iloc[1, 4] >= 10
Example #25
    def measure_design(self, design):
        info("Measuring design of size " + str(len(design[0])))

        design_names    = [str(n) for n in self.base.names(design)]
        initial_factors = self.params["axis_names"]
        measurements    = []

        info("Current Design Names: " + str(design_names))

        for line in range(1, len(design[0]) + 1):
            if isinstance(design.rx(line, True)[0], int):
                design_line = [v for v in design.rx(line, True)]
            else:
                design_line = [int(v[0]) for v in design.rx(line, True)]

            candidate = [0] * len(initial_factors)

            for k, v in self.model["fixed_factors"].items():
                candidate[initial_factors.index(k)] = int(v)

            for i in range(len(design_names)):
                candidate[initial_factors.index(design_names[i])] = design_line[i]

            measurement = self.getPerfCosts([candidate])
            if measurement != {}:
                measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
            else:
                measurements.append(float('inf'))

        design = self.base.cbind(design, DataFrame({self.model["response"]: FloatVector(measurements)}))
        design = design.rx(self.base.is_finite(design.rx2(self.model["response"])), True)

        info("Complete design, with measurements:")
        info(str(design))

        return design
Example #26
def total_concentration(df):
    Y = df["treated"]
    X0 = df["0"]
    X1 = df["1"]
    X2 = df["2"]
    X3 = df["3"]
    X4 = df["4"]
    X5 = df["5"]
    X6 = df["6"]
    X7 = df["7"]
    X8 = df["8"]
    X9 = df["9"]
    IV = df["iv"]
    formula = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + iv')
    formula2 = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9')
    dataf = DataFrame({'treated': robjects.IntVector(Y),
                       'x0': robjects.IntVector(X0),
                       'x1': robjects.IntVector(X1),
                       'x2': robjects.IntVector(X2),
                       'x3': robjects.IntVector(X3),
                       'x4': robjects.IntVector(X4),
                       'x5': robjects.IntVector(X5),
                       'x6': robjects.IntVector(X6),
                       'x7': robjects.IntVector(X7),
                       'x8': robjects.IntVector(X8),
                       'x9': robjects.IntVector(X9),
                       'iv': robjects.IntVector(IV)})

    fit = robjects.r.lm(formula=formula, data=dataf)
    fit2 = robjects.r.lm(formula=formula2, data=dataf)
    r_frame = robjects.r.anova(fit, fit2)
    py_frame = pandas2ri.ri2py_dataframe(r_frame)
    print("total concentration parameter: " + str(py_frame.iloc[1, 4]))
Example #27
    def fit(self, training_data, target):
        """
        :param training_data: a pandas dataframe.
        :param target: a string referring to the target variable to be predicted
        :return: An rpart model
        """

        self.outcome = target

        #Converting to proper format for R functions
        train_data = DataFrame(training_data)
        formula = target + " ~ ."

        #TODO: HyperParameter Incorporation

        #train the model
        self.model = rparty.rpart(
            formula=formula,
            data=train_data,
            method="class"
            #control = rparty.rpart_control(minsplit = ?, cp = ?)
        )

        return self.model
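A hypothetical call, assuming clf is an instance of the surrounding class and train_df is a pandas frame with a categorical 'species' column:

model = clf.fit(train_df, target="species")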
Example #28
            search_space_database = dataset.connect(
                "sqlite:///search_space_{0}.db".format(self.seed_space_size))
            for experiment in search_space_database['experiments']:
                search_space.append(eval(experiment["value"]))

        info("Starting DOPT-anova")

        r_search_space = {}
        for i in range(len(search_space[0])):
            r_row = [self.dim_uplimits[i] - 1, 0]
            for col in search_space:
                r_row.append(col[i])

            r_search_space[initial_factors[i]] = IntVector(r_row)

        data = DataFrame(r_search_space)
        data = data.rx(StrVector(initial_factors))

        self.dopt_anova(initial_factors, initial_inverse_factors, data)

        sys.exit()

        perf_cost, mean_perf_cost = self.MAXFLOAT, self.MAXFLOAT

        params = self.coordToPerfParams(coord)
        end_time = time.time()
        search_time = end_time - start_time
        speedup = float(eval_cost[0]) / float(best_perf_cost)
        search_time = time.time() - start_time

        info('----- end random search -----')
Example #29
    def run_zinb(self, data, genes, NZMeanByRep, LogZPercByRep,
                 RvSiteindexesMap, conditions, covariates, interactions):
        """
            Runs Zinb for each gene across conditions and returns p and q values
            ([[Wigdata]], [Gene], [Number], [Number], {Rv: [SiteIndex]}, [Condition], [Covar], [Interaction]) -> Tuple([Number], [Number], [Status])
            Wigdata :: [Number]
            Gene :: {start, end, rv, gene, strand}
            SiteIndex: Integer
            Condition :: String
            Covar :: String
            Interaction :: String
            Status :: String
        """

        count = 0
        self.progress_range(len(genes))
        pvals, Rvs, status = [], [], []
        r_zinb_signif = self.def_r_zinb_signif()
        if (self.winz):
            self.transit_message("Winsorizing and running analysis...")

        self.transit_message("Condition: %s" % self.condition)

        comp1a = "1+cond"
        comp1b = "1+cond"

        # include cond in mod0 only if testing interactions
        comp0a = "1" if len(self.interactions) == 0 else "1+cond"
        comp0b = "1" if len(self.interactions) == 0 else "1+cond"
        for I in self.interactions:
            comp1a += "*" + I
            comp1b += "*" + I
            comp0a += "+" + I
            comp0b += "+" + I
        for C in self.covars:
            comp1a += "+" + C
            comp1b += "+" + C
            comp0a += "+" + C
            comp0b += "+" + C
        zinbMod1 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (
            comp1a, comp1b)
        zinbMod0 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (
            comp0a, comp0b)

        nbMod1 = "cnt~%s" % (comp1a)
        nbMod0 = "cnt~%s" % (comp0a)
        toRFloatOrStrVec = lambda xs: FloatVector(
            [float(x) for x in xs]) if self.is_number(xs[0]) else StrVector(xs)

        for gene in genes:
            count += 1
            Rv = gene["rv"]
            ## Single gene case for debugging
            if (GENE):
                Rv = None
                if GENE in RvSiteindexesMap:
                    Rv = GENE
                else:
                    for g in genes:
                        if (g['gene'] == GENE):
                            Rv = g["rv"]
                            break
                if not Rv:
                    self.transit_error("Cannot find gene: {0}".format(GENE))
                    sys.exit(0)

            if (DEBUG):
                self.transit_message(
                    "======================================================================"
                )
                self.transit_message(gene["rv"] + " " + gene["gene"])

            if (len(RvSiteindexesMap[Rv]) <= 1):
                status.append("TA sites <= 1, not analyzed")
                pvals.append(1)
            else:
                # For winsorization
                # norm_data = self.winsorize((map(
                #     lambda wigData: wigData[RvSiteindexesMap[Rv]], data))) if self.winz else list(map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
                norm_data = list(
                    map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
                ([
                    readCounts, condition, covarsData, interactionsData,
                    NZmean, logitZPerc
                ]) = self.melt_data(norm_data, conditions, covariates,
                                    interactions, NZMeanByRep, LogZPercByRep)
                if (numpy.sum(readCounts) == 0):
                    status.append(
                        "pan-essential (no counts in all conditions) - not analyzed"
                    )
                    pvals.append(1)
                else:
                    df_args = {
                        'cnt': IntVector(readCounts),
                        'cond': toRFloatOrStrVec(condition),
                        'NZmean': FloatVector(NZmean),
                        'logitZperc': FloatVector(logitZPerc)
                    }
                    ## Add columns for covariates and interactions if they exist.
                    df_args.update({name: toRFloatOrStrVec(covarsData[i])
                                    for i, name in enumerate(self.covars)})
                    df_args.update({name: toRFloatOrStrVec(interactionsData[i])
                                    for i, name in enumerate(self.interactions)})

                    melted = DataFrame(df_args)
                    # r_args = [IntVector(readCounts), StrVector(condition), melted, map(lambda x: StrVector(x), covars), FloatVector(NZmean), FloatVector(logitZPerc)] + [True]
                    debugFlag = True if DEBUG or GENE else False
                    pval, msg = r_zinb_signif(melted, zinbMod1, zinbMod0,
                                              nbMod1, nbMod0, debugFlag)
                    status.append(msg)
                    pvals.append(float(pval))
                if (DEBUG or GENE):
                    self.transit_message(
                        "Pval for Gene {0}: {1}, status: {2}".format(
                            Rv, pvals[-1], status[-1]))
                if (GENE):
                    self.transit_message("Ran for single gene. Exiting...")
                    sys.exit(0)
            Rvs.append(Rv)
            # Update progress
            text = "Running ZINB Method... %5.1f%%" % (100.0 * count /
                                                       len(genes))
            self.progress_update(text, count)

        pvals = numpy.array(pvals)
        mask = numpy.isfinite(pvals)
        qvals = numpy.full(pvals.shape, numpy.nan)
        qvals[mask] = statsmodels.stats.multitest.fdrcorrection(pvals)[
            1]  # BH, alpha=0.05

        p, q, statusMap = {}, {}, {}
        for i, rv in enumerate(Rvs):
            p[rv], q[rv], statusMap[rv] = pvals[i], qvals[i], status[i]
        return (p, q, statusMap)
Example #30
def fit(
    latlon: NDArray[(2, Any), float],
    z: NDArray[(Any, ), float],
    nx: int,
    ny: int,
    extrap: bool,
) -> Tuple[NDArray[(Any, Any), float], NDArray[(Any, ), float], NDArray[(
        Any, ), float]]:
    """Encapsulates the functionality of R's spatialProcess into a Python
    Args:
        latlon: grid of pairwise coordinates of observations
        z: observations
        nx: number of grid cells on interpolated grid x
        nx: number of grid cells on interpolated grid y
        xy: dimensions of interpolated grid output
        distance: distance metric to use (note, only 'geo' supported currently)
        variogram_model: choice of variogram model
          (note, only 'exoponential' supported)
    Returns:
        z: kriged field
        x, y: locations of kriged data

    """

    if not isinstance(latlon, NDArray[(2, Any), float]):
        raise TypeError(
            f"Incorrect grid shape, size, or dtype. Must be {NDArray[(2, Any), float]}"
        )

    if not isinstance(z, NDArray[(Any, ), float]):
        raise TypeError(
            f"Incorrect grid shape, size, or dtype. Must be {NDArray[(Any, ), float]}"
        )

    if not isinstance(nx, int) or not isinstance(ny, int):
        raise TypeError("Provide integer grid size")

    if latlon.shape[1] != z.size:
        raise ValueError(
            "Different number of grid coordinates than observations")

    latlon, z = latlon.tolist(), z.tolist()

    # convert regular numeric data

    # convert latlon list into two R FloatVectors
    # list of FloatVector -> OrderedDict -> R DataFrame
    # -> numeric R data matrix
    r_lists = list(map(FloatVector, latlon))
    coords = OrderedDict(zip(map(str, range(len(r_lists))), r_lists))
    r_dataFrame = DataFrame(coords)
    r_latlon = robjects.r["data.matrix"](r_dataFrame)

    # convert observations
    r_z = FloatVector(z)

    # use separate simple r-script in path below
    rstring = resource_string("climpyrical",
                              "tests/data/spatial_process_r.R").decode("utf-8")

    rfunc = robjects.r(rstring)
    r_surface = rfunc(r_latlon, r_z, nx, ny, extrap)

    # extract data from R's interpolation
    surface_dict = dict(zip(r_surface.names, list(r_surface)))
    # z = np.array(list(r_surface[1]))
    z = np.array(surface_dict["z"]).reshape(nx, ny)
    x = np.array(surface_dict["x"])
    y = np.array(surface_dict["y"])
    # cov = dict(zip(surface_dict["cov"].names, list(surface_dict["cov"])))
    # cov = surface_dict["cov"]

    return z, x, y
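A hypothetical call with three made-up stations; note the 2 x N coordinate layout the type checks enforce:

import numpy as np

latlon = np.array([[49.1, 50.2, 51.3], [-123.1, -122.4, -121.9]])
z = np.array([10.0, 12.5, 11.1])
field, x, y = fit(latlon, z, nx=50, ny=50, extrap=False)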
Example #31
                    utils = importr('dplyr')
                    utils = importr('gravity')
                except Exception as exception:
                    print(exception)
                    import rpy2.robjects.packages as rpackages
                    utils = rpackages.importr('utils')
                    utils = rpackages.importr('gravity')
                    utils.chooseCRANmirror(ind=1)
                    utils.install_packages('dplyr')
                    utils.install_packages('gravity')
                finally:
                    from rpy2.robjects.packages import importr
                    utils = importr('dplyr')
                    utils = importr('gravity')

                dataf = DataFrame({'label': IntVector(tuple(labels)), 'distance': FloatVector(tuple(data.T[0]))})
                fit = R('function(x) ppml(dependent_variable="label", distance="distance", additional_regressors=NULL, robust=TRUE, data=x)')(dataf)
                #print(fit)

                # Deviance is -2.*log_likelihood
                altDeviance = list(fit[9].items())[0][1]
                nullDeviance = list(fit[11].items())[0][1]
                p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)
                print('Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.')
                print('Implemented in R in "gravity: Estimation Methods for Gravity Models" at:')
                print('https://rdrr.io/cran/gravity/man/ppml.html')
                print('Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t', 
                      'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
                      'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
                      'p-value:\t', '%.1e' % p_value, '\n')
Example #32
# R fitdistr for Beta distribution: which starting parameters?
from rpy2.robjects import DataFrame
starter = DataFrame({'shape1': 0.5, 'shape2': 0.5})
x = MASS.fitdistr(myValues, "beta", start=starter)
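For what it's worth, MASS::fitdistr documents start as a named list of initial values rather than a data frame; a hedged sketch of the corrected call (the 0.5/0.5 starting guesses and the sample data are illustrative):

from rpy2.robjects import FloatVector, ListVector
from rpy2.robjects.packages import importr

MASS = importr('MASS')
my_values = FloatVector([0.2, 0.4, 0.6, 0.8])        # beta-distributed data must lie in (0, 1)
start = ListVector({'shape1': 0.5, 'shape2': 0.5})   # named list, as fitdistr expects
fit = MASS.fitdistr(my_values, "beta", start=start)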