Esempio n. 1
0
def create_initial_sample(n_obs,
                          dim,
                          type='lhs',
                          lower_bound=None,
                          upper_bound=None):
    """
    Build an initial sample by delegating to ``flacco.createInitialSample``.

    Args:
      n_obs: number of observations to draw
      dim: number of dimensions of each observation
      type: sampling strategy forwarded as ``init_sample.type``
        (Default value = 'lhs')
      lower_bound: list of size dim with the lower bounds of the sample
        (Default value = None, meaning 0 in every dimension)
      upper_bound: list of size dim with the upper bounds of the sample
        (Default value = None, meaning 1 in every dimension)

    Returns: the result of ``flacco.createInitialSample`` wrapped in a numpy
      array (documented upstream as shape n_obs x dim)

    """
    lower = [0] * dim if lower_bound is None else lower_bound
    upper = [1] * dim if upper_bound is None else upper_bound

    # NOTE(review): both bounds are sent to R as integer vectors -- confirm
    # that non-integer bounds are not expected by callers.
    control_entries = {
        'init_sample.type': type,
        'init_sample.lower': IntVector(lower),
        'init_sample.upper': IntVector(upper)
    }

    r_sample = flacco.createInitialSample(n_obs, dim,
                                          ListVector(control_entries))
    return np.array(r_sample)
def sampleSizeRest():
    """Handle a sample-size request and return the generated JSON string.

    Reads the JSON payload of the current ``request`` (keys ``k``, ``prev``,
    ``N``, ``unique_id``, ``fixed_flag``, ``sens`` and ``spec``), forwards the
    values to the R ``wrapper`` and joins the pieces it returns into a single
    string.

    ``k``, ``sens`` and ``spec`` arrive as comma-separated strings and are
    split before being converted to R vectors.

    Returns:
        The concatenation of the elements returned by the R wrapper.

    Raises:
        KeyError: if one of the expected keys is missing from the payload.
    """
    # Get the parsed contents of the form data
    data = request.json

    k = data["k"].split(',')
    prev = data["prev"]
    N = data["N"]
    unique_id = data["unique_id"]
    fixed_flag = data["fixed_flag"]
    sens = data["sens"].split(',')
    spec = data["spec"].split(',')

    # print(...) with one argument behaves the same on Python 2 and Python 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print("Starting Benchmark")

    # A fixed specificity selects the sensitivity graphs; anything else
    # selects the specificity graphs. Both take the same arguments.
    if fixed_flag == "Specificity":
        save_graphs = wrapper.saveAllSensGraphs
    else:
        save_graphs = wrapper.saveAllSpecGraphs

    jsonrtn = save_graphs(IntVector(k), FloatVector(sens),
                          FloatVector(spec), float(prev),
                          IntVector(N), unique_id)

    jsonstring = ''.join(jsonrtn)
    print(jsonstring)
    return jsonstring
Esempio n. 3
0
def py2rpy_pandasseries(obj):
    """Convert a pandas-Series-like object into an R vector.

    The conversion is selected from ``obj.dtype``:

    - dtype name ``'O'``: a warning is issued and a ``StrVector`` is built;
    - dtype name ``'category'``: converted by ``py2rpy_categoryseries`` and
      wrapped in a ``FactorVector``;
    - any datetime64 dtype: the date/time components of every element are
      passed to ``ISOdatetime`` and the result is wrapped in ``POSIXct``;
    - ``dt_O_type``: all non-None values must share a single Python type,
      which selects the R vector constructor;
    - anything else: the converter registered for ``numpy.ndarray``.

    Finally the index is attached as the ``names`` slot (1-D input) or as
    the ``dimnames`` slot (otherwise).

    Args:
        obj: object exposing ``dtype``, ``name``, ``index``, ``values``,
            ``shape`` and ``ndim`` (presumably a ``pandas.Series``; the
            callers are not visible here).

    Returns:
        The R object built by the selected branch, with names assigned.

    Raises:
        ValueError: in the ``dt_O_type`` branch, when two non-None values
            have different types.
        KeyError: in the ``dt_O_type`` branch, when the common type is not
            one of int, bool, str or bytes.
    """
    # NOTE(review): numpy reports the object dtype's name as 'object', not
    # 'O' -- confirm whether this first branch is ever taken.
    if obj.dtype.name == 'O':
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2rpy_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series
        # An empty time zone name is used when the series has no tz set.
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        # One R vector per date/time component, in the positional order
        # passed to ISOdatetime below. Seconds carry the microseconds as a
        # fractional part, hence the float vector.
        d = [
            IntVector([x.year for x in obj]),
            IntVector([x.month for x in obj]),
            IntVector([x.day for x in obj]),
            IntVector([x.hour for x in obj]),
            IntVector([x.minute for x in obj]),
            FloatSexpVector([x.second + x.microsecond * 1e-6 for x in obj])
        ]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    elif (obj.dtype == dt_O_type):
        # Find the single type shared by all non-None values; it stays None
        # when every value is None.
        homogeneous_type = None
        for x in obj.values:
            if x is None:
                continue
            if homogeneous_type is None:
                homogeneous_type = type(x)
                continue
            if type(x) is not homogeneous_type:
                raise ValueError('Series can only be of one type, or None.')
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        # Dispatch on the common type; the None key handles the all-None
        # case. Any type missing from this table raises KeyError.
        res = {
            int: IntVector,
            bool: BoolVector,
            None: BoolVector,
            str: StrVector,
            bytes: numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        }[homogeneous_type](obj)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        # current conversion as performed by numpy

        res = func(obj)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        # Index labels are stringified before being stored as R names.
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2rpy(obj.index)))
    return res
Esempio n. 4
0
def _align_var(breaks_r, pop_col, n, verbose=False):
    """Build a one-column R ``DataFrame`` of alignment bounds from breaks.

    The breaks are scanned in order while a run of 1-based positions
    (``e + 2`` for the break at enumeration index ``e``, seeded with ``1``)
    is accumulated. Whenever a break value directly follows the previous one
    (``prev + 1 == b``) and is below ``n + 1``, the current run is closed:
    if it spans more than one distinct position, its (min, max) pair is
    stored under ``pop_col``, overwriting any previous pair. If no pair was
    stored at all, ``(1, len(breaks_r))`` is used instead.

    Args:
        breaks_r: iterable (with ``len``) of integer break values.
        pop_col: column name used for the resulting data frame.
        n: upper limit; only breaks strictly below ``n + 1`` close a run.
        verbose: when true, report runs that could not be aligned.

    Returns:
        ``DataFrame`` with the single column ``pop_col`` holding an
        ``IntVector`` of two bounds.
    """
    align = dict()
    prev_b = -1
    align_t = [1]
    for e, b in enumerate(breaks_r):
        if prev_b + 1 == b and b < n + 1:
            lowest, highest = min(align_t), max(align_t)
            # Explicit test instead of assert-inside-bare-except: asserts
            # are stripped under -O, and a bare except would also hide
            # unrelated errors.
            if lowest != highest:
                align[pop_col] = IntVector((lowest, highest))
            elif verbose:
                print("can't allign {} at {} for {}".format(align_t, e, b))
            # Start a new run at the current position.
            align_t = [e + 2]
        else:
            align_t.append(e + 2)
        prev_b = b
    if len(align) == 0:
        # Nothing could be aligned: fall back to the full range of breaks.
        align[pop_col] = IntVector((1, len(breaks_r)))
    return DataFrame(align)
Esempio n. 5
0
 def fit(self, x, t, y, refit=False):
     """Fit the R learner selected by ``self.method_name``.

     ``"lasso"`` uses ``self.rleaner.rlasso``; any other value uses
     ``self.rleaner.rboost`` (which takes much longer to fit). ``t`` is sent
     as an integer vector and ``y`` as a float vector; the fitted model is
     stored in ``self.model``. ``refit`` is accepted but not used here.
     """
     use_lasso = self.method_name == "lasso"
     print("fit lasso" if use_lasso else "fit boost")
     learner = self.rleaner.rlasso if use_lasso else self.rleaner.rboost
     self.model = learner(x, IntVector(t), FloatVector(y))
Esempio n. 6
0
 def testFunction_select(self):
     """dplyr.select leaves one column when keeping 'y' or dropping 'x'."""
     columns = {'x': IntVector((1, 2)), 'y': IntVector((3, 4))}
     frame = self.DataFrame(columns)
     # Keeping 'y' and excluding 'x' must both yield a single column.
     for selector in ('y', '-x'):
         selected = dplyr.select(frame, selector)
         self.assertEqual(1, selected.collect().ncol)
Esempio n. 7
0
 def testFunction_group_by_summarize_arrange(self):
     """Group by 'x', count rows per group, then sort by that count."""
     columns = {'x': IntVector((1, 2, 1)), 'y': IntVector((3, 4, 5))}
     frame = self.DataFrame(columns)
     grouped = dplyr.group_by(frame, 'x')
     counted = dplyr.summarize(grouped, count='n()')
     ordered = dplyr.arrange(counted, 'count')
     # Two distinct 'x' values -> two rows, with counts 1 and 2 once sorted.
     self.assertEqual(2, ordered.collect().nrow)
     self.assertSequenceEqual([1, 2], ordered.collect().rx2('count'))
Esempio n. 8
0
def export_smpl_split_to_r(smpls):
    """Convert (train, test) index pairs into two R lists of int vectors.

    Every index collection is shifted by one before conversion (R indexing
    starts at 1). The ``+ 1`` is applied to the collection as a whole, so
    the entries are presumably numpy arrays -- confirm against callers.

    Args:
        smpls: sized iterable of ``(train, test)`` pairs.

    Returns:
        Tuple ``(all_train, all_test)`` of ``ListVector`` objects, each
        holding one ``IntVector`` per pair.
    """
    n_folds = len(smpls)
    r_train = ListVector.from_length(n_folds)
    r_test = ListVector.from_length(n_folds)

    for fold, (train_idx, test_idx) in enumerate(smpls):
        r_train[fold] = IntVector(train_idx + 1)
        r_test[fold] = IntVector(test_idx + 1)

    return r_train, r_test
Esempio n. 9
0
 def testMethod_select(self):
     """DataFrame.select leaves one column when keeping 'y' or dropping 'x'."""
     # Self-assignment kept as in the original; it re-binds the attribute
     # on the instance.
     self.DataFrame = self.DataFrame
     columns = {'x': IntVector((1, 2)), 'y': IntVector((3, 4))}
     frame = self.DataFrame(columns)
     for selector in ('y', '-x'):
         selected = frame.select(selector)
         self.assertEqual(1, selected.collect().ncol)
Esempio n. 10
0
def spMatrixToR(x):
    """Convert a sparse matrix into an R ``Matrix::sparseMatrix``.

    The input is converted to COO form and its row indices, column indices,
    values and shape are handed to ``sparseMatrix`` with ``index1=False``
    (the indices are passed as-is, i.e. zero-based).

    The numpy conversion is activated only for the duration of the call and
    is deactivated again even when the R call raises, so a failure does not
    leave the global conversion switched on.

    Args:
        x: sparse matrix exposing ``tocoo()`` (e.g. a scipy sparse matrix).

    Returns:
        The R sparse matrix object.
    """
    matrix_pkg = rpackages.importr('Matrix')
    coo_matrix = x.tocoo()
    numpy2ri.activate()
    try:
        return matrix_pkg.sparseMatrix(i=IntVector(coo_matrix.row),
                                       j=IntVector(coo_matrix.col),
                                       x=FloatVector(coo_matrix.data),
                                       dims=IntVector(coo_matrix.shape),
                                       index1=False)
    finally:
        numpy2ri.deactivate()
Esempio n. 11
0
 def testMethod_group_by_summarize_arrange(self):
     """Chain group_by / summarize / arrange as DataFrame methods."""
     # Self-assignment kept as in the original; it re-binds the attribute
     # on the instance.
     self.DataFrame = self.DataFrame
     columns = {'x': IntVector((1, 2, 1)), 'y': IntVector((3, 4, 5))}
     frame = self.DataFrame(columns)
     ordered = frame.group_by('x').summarize(count='n()').arrange('count')
     # Two distinct 'x' values -> two rows, with counts 1 and 2 once sorted.
     self.assertEqual(2, ordered.collect().nrow)
     self.assertSequenceEqual([1, 2], ordered.collect().rx2('count'))
Esempio n. 12
0
    def _extract_mapping(self, cimpl_obj, cis_sites):
        """Map insertion ids to the ids of the CIS sites they belong to.

        The CIS sites are converted to an R data frame, passed to cimpl's
        ``getCISMatrix`` together with ``cimpl_obj``, and the returned
        matrix is reshaped into an insertion -> CIS-ids mapping.

        Args:
            cimpl_obj: R object forwarded to ``getCISMatrix``.
            cis_sites: CIS sites accepted by ``CisSite.to_frame``.

        Returns:
            dict mapping each insertion id to the set of its CIS ids.
        """
        # Convert CIS sites to frame format.
        cis_frame = CisSite.to_frame(cis_sites)

        # Convert to R representation for cimpl.
        chr_with_prefix = add_prefix(cis_frame['chromosome'], prefix='chr')

        r_base = importr('base')
        cis_frame_r = RDataFrame({
            'id':
            r_base.I(StrVector(cis_frame['id'])),
            'chromosome':
            r_base.I(StrVector(chr_with_prefix)),
            'scale':
            StrVector(cis_frame['scale']),
            'start':
            IntVector(cis_frame['start']),
            'end':
            IntVector(cis_frame['end'])
        })
        cis_frame_r.rownames = StrVector(cis_frame['id'])

        # Retrieve cis matrix from cimpl.
        cis_matrix_r = self._cimpl.getCISMatrix(cimpl_obj, cis_frame_r)
        cis_matrix = dataframe_to_pandas(cis_matrix_r)

        # Extract scale information from cis matrix (columns named 'X...').
        scale_cols = [c for c in cis_matrix.columns if c.startswith('X')]
        cis_matrix_scales = cis_matrix[['id'] + scale_cols]

        # Melt matrix into long format.
        mapping = pd.melt(cis_matrix_scales, id_vars=['id'])
        mapping = mapping[['id', 'value']]
        mapping = mapping.rename(columns={
            'id': 'insertion_id',
            'value': 'cis_id'
        })

        # Drop empty rows first, as these entries are empty cells in the
        # matrix, then split the cis_id column into individual entries (for
        # entries with multiple ids).
        # `.loc` replaces `.ix`, which was removed in pandas 1.0; for a
        # boolean mask both select the same rows.
        mapping = mapping.loc[mapping['cis_id'] != '']
        mapping = expand_column(mapping, col='cis_id', delimiter='|')

        mapping_dict = {
            ins_id: set(grp['cis_id'])
            for ins_id, grp in mapping.groupby('insertion_id')
        }

        return mapping_dict
Esempio n. 13
0
def cpt_poisson(x, penalty="MBIC", minseglen=2):
    """Changepoint detection with the Poisson test statistic.

    The series is rounded to integers, the smallest strictly positive value
    is subtracted as baseline from every positive value, and every value
    that is not strictly positive is replaced by a very large RTT, 1e3.

    Args:
        x (list of numeric type): timeseries to be handled
        penalty (string): possible choices "None", "SIC", "BIC", "MBIC",
            "AIC", "Hannan-Quinn"
        minseglen (int): minimum segment length forwarded to the R
            changepoint call

    Returns:
        list of int: beginning of each new segment as a python index, that
        is starting from 0. R reports the last index of a segment, and since
        R indexing starts from 1 the reported value naturally becomes the
        beginning of the next segment.
    """
    rounded = np.rint(x)
    positives = [value for value in rounded if value > 0]
    try:
        baseline = np.min(positives)
    except ValueError:  # no positive number in x, so use a baseline of 0
        baseline = 0
    adjusted = [value - baseline if value > 0 else 1e3 for value in rounded]
    fitted = changepoint.cpt_meanvar(IntVector(adjusted),
                                     test_stat='Poisson',
                                     method='PELT',
                                     penalty=penalty,
                                     minseglen=minseglen)
    return [int(point) for point in changepoint.cpts(fitted)]
Esempio n. 14
0
def RiverSmooth(dem,
                direction,
                river_summary,
                river_segments,
                mask=None,
                bank_epsilon=0.01,
                river_epsilon=0.0,
                d4: tuple = (1, 2, 3, 4),
                printflag=False):
    """Call the R function ``pf.RiverSmooth`` and post-process its result.

    A missing ``mask`` is sent to R as ``RNone`` and ``d4`` is sent as an
    integer vector; every other argument is forwarded unchanged by name.

    Returns:
        The output of ``_pfprocess`` for the entries "dem.adj", "processed"
        and "summary" of the R result.
    """
    r_args = dict(dem=dem,
                  direction=direction,
                  mask=RNone if mask is None else mask,
                  river_summary=river_summary,
                  river_segments=river_segments,
                  bank_epsilon=bank_epsilon,
                  river_epsilon=river_epsilon,
                  d4=IntVector(d4),
                  printflag=printflag)
    smoothed = pf.RiverSmooth(**r_args)
    return _pfprocess(smoothed, ["dem.adj", "processed", "summary"])
Esempio n. 15
0
def _translate_control(control):
    """
    Transforms a python dict to a valid R object.

    List values are converted to the matching R vector type (bool, int,
    float or str). Lists that fit none of these types are dropped from the
    result. Non-list values are passed through unchanged.

    Args:
      control: python dict

    Returns: R object of type ListVector

    """
    ctrl = {}
    for key, lst in control.items():
        if not isinstance(lst, list):
            ctrl[key] = lst
            continue
        # bool must be tested before int: bool is a subclass of int, so an
        # int-first test would turn every list of booleans into an
        # IntVector. The `lst and` guard keeps empty lists on the int
        # branch, as before.
        if lst and all(isinstance(n, bool) for n in lst):
            entry = BoolVector(lst)
        elif all(isinstance(n, int) for n in lst):
            entry = IntVector(lst)
        elif all(isinstance(n, float) for n in lst):
            entry = FloatVector(lst)
        elif all(isinstance(n, str) for n in lst):
            entry = StrVector(lst)
        else:
            # Mixed or unsupported element types: the key is skipped.
            entry = None
        if entry is not None:
            ctrl[key] = entry
    return ListVector(ctrl)
Esempio n. 16
0
 def testMethod_mutate(self):
     """mutate adds a column 'y' whose values are those of 'x' plus 3."""
     # Self-assignment kept as in the original; it re-binds the attribute
     # on the instance.
     self.DataFrame = self.DataFrame
     frame = self.DataFrame({'x': IntVector((1, 2))})
     mutated = frame.mutate(y='x + 3')
     self.assertEqual(2, mutated.ncol)
     expected = [value + 3 for value in frame.collect().rx2('x')]
     actual = mutated.collect().rx2('y')
     self.assertSequenceEqual(expected, actual)
Esempio n. 17
0
    def _create_R_dataframe(self, job_ads, include_columns):
        """Converts job ads to R dataframe.

        Every requested column is read from each ad and passed through
        ``self._remove_diacritics``. The "relevant" column becomes an
        integer vector; every other column becomes a string vector wrapped
        in R's ``I()``.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances.
        include_columns : list[str]
            Defines which columns are included in the dataframe.

        Returns
        ----------
        dataf : :class:`robjects.DataFrame`
            :class:`robjects.DataFrame` representing job ads.

        Raises
        ----------
        Exception
            If ``job_ads`` is empty.
        """
        if not len(job_ads):
            raise Exception("No job ads to convert to R dataframe.")

        # Restructure the ads into {column: R vector of row values}.
        r_columns = {}
        for column in include_columns:
            cleaned = [self._remove_diacritics(ad[column]) for ad in job_ads]
            if column == "relevant":
                r_columns[column] = IntVector(cleaned)
            else:
                r_columns[column] = self._base.I(StrVector(cleaned))

        return robjects.DataFrame(r_columns)
Esempio n. 18
0
def calc_gini(probes):
    """Compute a Gini-style coefficient over node occurrence counts.

    Each probe contributes two nodes, ``probe[1]`` and ``probe[3]``. The
    per-node occurrence counts are fed to R's ``stats::ecdf``; for every
    distinct count ``c`` the term ``F(c) * (1 - F(c))`` is summed, and the
    sum is divided by the mean count.

    Args:
        probes: iterable of indexable records with node identifiers at
            positions 1 and 3.

    Returns:
        The coefficient, or ``1.0`` when there are no probes.
    """
    # Count node occurrences.
    nodes = dict()
    for probe in probes:
        for node in (probe[1], probe[3]):
            nodes[node] = nodes.get(node, 0) + 1

    # values() works on Python 2 and 3 alike (iterkeys() is Python 2 only).
    node_selection = list(nodes.values())
    if not node_selection:
        # Nothing to measure; return before touching R at all.
        return 1.0

    # Calculate Gini coefficient.
    r_stats = importr('stats')
    Fn = r_stats.ecdf(IntVector(node_selection))
    total = 0
    for nr in set(node_selection):
        cdf_x = Fn(nr)[0]
        total += cdf_x * (1 - cdf_x)
    return total / mean(node_selection)
Esempio n. 19
0
 def as_data_frame(self, channels, unit):
     """
     Preferred use of the ATF class. Returns the result of the R function
     ``adaptATF`` applied to the wrapped ATF object, for further computation.

     ``channels`` is sent as an integer vector; a ``unit`` of None is sent
     as R ``NULL``.
     """
     if unit is None:
         unit = NULL
     r_channels = IntVector(channels)
     adapt = r["adaptATF"]
     return adapt(self._atf, channels=r_channels, unit=unit)
Esempio n. 20
0
def pd_ts2r_ts(pd_ts):
    """Pandas timeseries (pd_ts) to R timeseries (r_ts) conversion.

    Start and end are taken from the first and last index entries as
    (year, month, day). The R frequency is looked up from the index's
    ``freqstr``; only 'D', 'M' and 'Y' are known, and anything else raises
    ``KeyError``.
    """
    from rpy2.robjects.vectors import IntVector, FloatVector
    rstats = rpackages.importr('stats')

    first = pd_ts.index[0]
    r_start = IntVector((first.year, first.month, first.day))
    last = pd_ts.index[-1]
    r_end = IntVector((last.year, last.month, last.day))

    # Conversion table from pandas.Series frequencies to R ts frequencies.
    r_frequency_by_freqstr = {
        'D': 365,  # is this correct, how about leap-years?
        'M': 12,
        'Y': 1,
    }
    r_freq = r_frequency_by_freqstr[pd_ts.index.freqstr]

    return rstats.ts(FloatVector(pd_ts.values),
                     start=r_start,
                     end=r_end,
                     frequency=r_freq)
Esempio n. 21
0
def py2ri_pandasseries(obj):
    """Convert a pandas-Series-like object into an R vector.

    Categorical data becomes a ``FactorVector``, ``dt_datetime64ns_type``
    data becomes a ``POSIXct`` built through ``ISOdatetime``, and anything
    else goes through the converter registered for ``numpy.ndarray``. The
    index is then stored as the ``names`` slot (1-D) or ``dimnames`` slot.
    """
    if obj.dtype.name == 'category':
        res = FactorVector(py2ri_categoryseries(obj))
    elif obj.dtype == dt_datetime64ns_type:
        # time series: one integer vector per date/time component, in the
        # positional order passed to ISOdatetime
        components = ('year', 'month', 'day', 'hour', 'minute', 'second')
        d = [
            IntVector([getattr(stamp, component) for stamp in obj])
            for component in components
        ]
        #FIXME: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(ISOdatetime(*d))
    else:
        # converted as a numpy array, as currently performed by numpy
        to_r = numpy2ri.converter.py2ri.registry[numpy.ndarray]
        res = to_r(obj)
        if len(obj.shape) == 1 and obj.dtype != dt_O_type:
            # force into an R vector
            res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        labels = tuple(str(label) for label in obj.index)
        res.do_slot_assign('names', StrVector(labels))
    else:
        res.do_slot_assign('dimnames', SexpVector(conversion.py2ri(obj.index)))
    return res
Esempio n. 22
0
def py2ri_pandasseries(obj):
    """Convert a pandas-Series-like object into an R vector.

    ``'<M8[ns]'`` data becomes a ``POSIXct`` built through ``ISOdatetime``;
    anything else has its ``values`` converted by ``numpy2ri.numpy2ri``. The
    index is then stored as the ``names`` slot (1-D) or ``dimnames`` slot.
    """
    if obj.dtype == '<M8[ns]':
        # time series: one integer vector per date/time component, in the
        # positional order passed to ISOdatetime
        components = ('year', 'month', 'day', 'hour', 'minute', 'second')
        d = [
            IntVector([getattr(stamp, component) for stamp in obj])
            for component in components
        ]
        #FIXME: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(ISOdatetime(*d))
    else:
        # converted as a numpy array
        res = numpy2ri.numpy2ri(obj.values)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        labels = tuple(str(label) for label in obj.index)
        res.do_slot_assign('names', StrVector(labels))
    else:
        res.do_slot_assign('dimnames', SexpVector(conversion.py2ri(obj.index)))
    return res
Esempio n. 23
0
def create_feature_object(x, y, minimize=True, lower=0, upper=1, blocks=None):
    """
    Creates a FeatureObject which will be used as input for all the feature computations.

    Args:
      x: numpy 2D array containing the initial sample
      y: list containing the objective values of the initial sample
      minimize: logical variable defining whether the objective is to minimize or not (Default value = True)
      lower:  python list or integer defining the lower limits per dimension (Default value = 0)
      upper:  python list or integer defining the upper limits per dimension (Default value = 1)
      blocks: number of blocks per dimension, as a list or a single value;
        omitted from the R call when None (Default value = None)

    Returns: rpy2.robject

    """
    # The numpy conversion is only needed to build the R matrix; it is
    # switched off again even if the conversion raises, so a failure does
    # not leave the global converter activated.
    numpy2ri.activate()
    try:
        x = R.r.matrix(x, nrow=len(x))
    finally:
        numpy2ri.deactivate()
    y = FloatVector(y)

    call_args = dict(X=x,
                     y=y,
                     minimize=minimize,
                     lower=lower,
                     upper=upper)
    if blocks is not None:
        # A single value is wrapped so R always receives an integer vector.
        call_args['blocks'] = IntVector(
            blocks if isinstance(blocks, list) else [blocks])
    call_args['force'] = False

    return flacco.createFeatureObject(**call_args)
Esempio n. 24
0
    def as_data_frame(self, sweep, channels, unit):
        """
        Preferred use of the ABF class. Returns the result of R's
        ``as.data.frame`` applied to the wrapped ABF object, for further
        computation.

        ``sweep`` and ``unit`` values of None are sent as R ``NULL``;
        ``channels`` is sent as an integer vector.
        """
        if sweep is None:
            sweep = NULL
        if unit is None:
            unit = NULL
        r_channels = IntVector(channels)

        # the only supported `type` argument to `as.data.frame` is "one"
        # this is intended, we never need other types in our GUI
        to_data_frame = r["as.data.frame"]
        return to_data_frame(self._abf,
                             sweep=sweep,
                             type="one",
                             channels=r_channels,
                             unit=unit)
Esempio n. 25
0
def drainageArea(direction,
                 mask=None,
                 d4: tuple = (1, 2, 3, 4),
                 printflag=False):
    """Call the R function ``pf.drainageArea`` and post-process its result.

    A missing ``mask`` is sent to R as ``RNone`` and ``d4`` is sent as an
    integer vector.

    Returns:
        The output of ``_pfprocess`` for the "drainarea" entry of the R
        result.
    """
    r_args = dict(direction=direction,
                  mask=RNone if mask is None else mask,
                  d4=IntVector(d4),
                  printflag=printflag)
    area = pf.drainageArea(**r_args)
    return _pfprocess(area, ["drainarea"])
Esempio n. 26
0
    def fit(self, x: np.array, t: np.array, y: np.array) -> None:
        """Fits the forest using factual data.

        ``y`` is sent to R as a float vector and ``t`` as an integer vector.
        The seed is derived from ``self.random_state``, and ``self.kwargs``
        is forwarded to ``causal_forest`` unchanged. The fitted forest is
        stored in ``self.forest``.
        """
        from rpy2.robjects.vectors import FloatVector, IntVector

        seed = int_from_random_state(self.random_state)
        train_forest = self.grf.causal_forest
        outcome = FloatVector(y)
        treatment = IntVector(t)

        self.forest = train_forest(x,
                                   outcome,
                                   treatment,
                                   seed=seed,
                                   num_trees=self.num_trees,
                                   **self.kwargs)
Esempio n. 27
0
def _get_breaks(census_col, verbose=False):
    """Compute census breaks.

    A column's variable name is the part before its first underscore.
    Whenever that name changes at position ``e``, the value ``e - 1`` is
    recorded as a break, provided it is greater than zero. The last recorded
    break is discarded before the result is converted.

    Args:
        census_col: iterable of column-name strings.
        verbose: when true, print every break as it is recorded.

    Returns:
        ``IntVector`` holding the breaks.
    """
    recorded = list()
    current_name = None
    for position, column in enumerate(census_col):
        name = column.split('_')[0]
        if name == current_name:
            continue
        current_name = name
        candidate = position - 1
        if candidate > 0:
            if verbose:
                print("adding {} as break = {}".format(name, candidate))
            recorded.append(candidate)
    return IntVector(recorded[:-1])
Esempio n. 28
0
def fit_forecast_model(y, freq, model, **kwargs):
    """Fit a model from the R _forecast_ package on a time series.

    Wrapper of the following flow:
        - Load _forecast_ package.
        - Transform data into a ts object.
        - Fit the model.

    Note that this activates the pandas2ri conversion and leaves it active.

    Parameters
    ----------
    y: iterable of numbers
        Observations of the series; sent to R as a float vector.
    freq: int or iterable
        Frequency of the time series.
        Can be multiple seasonalities. (Last seasonality
        considered as frequency.)
    model: str
        Name of a model included in the
        _forecast_ package, e.g. 'auto.arima'. It is inserted verbatim
        into the R source that is evaluated.
    kwargs:
        Arguments of the model function.

    Returns
    -------
    rpy2 object
        Fitted model
    """
    pandas2ri.activate()

    freq = deepcopy(freq)
    # A single integer is passed as-is; several seasonalities are sent as
    # an R integer vector.
    if not isinstance(freq, int):
        freq = IntVector(freq)

    r_source = """
     function(y, freq, ...){
         suppressMessages(library(forecast))
         y_ts <- msts(y, seasonal.periods=freq)
         fitted_model<-%s(y_ts, ...)
         fitted_model
     }
    """ % model

    fit_in_r = robjects.r(r_source)

    return fit_in_r(FloatVector(y), freq, **kwargs)
Esempio n. 29
0
def InitQueue(dem, initmask=None, domainmask=None, d4: tuple = (1, 2, 3, 4)):
    """Call the R function ``pf.InitQueue`` and post-process its result.

    Missing ``initmask`` / ``domainmask`` values are sent to R as ``RNone``
    and ``d4`` is sent as an integer vector.

    R reference:
    https://github.com/lecondon/PriorityFlow/blob/master/Rpkg/R/Init_Queue.R#L18

    Returns:
        The output of ``_pfprocess`` for the entries "mask", "queue",
        "marked", "basins" and "direction" of the R result.
    """
    r_args = dict(dem=dem,
                  initmask=RNone if initmask is None else initmask,
                  domainmask=RNone if domainmask is None else domainmask,
                  d4=IntVector(d4))
    queue = pf.InitQueue(**r_args)

    wanted = ["mask", "queue", "marked", "basins", "direction"]
    return _pfprocess(queue, wanted)
Esempio n. 30
0
def run_boruta(data, target, names, name, outdir):
    #uruchomienie algorytmu Boruta na data i target
    grdevices = importr('grDevices')
    boruta = importr('Boruta')
    r = robjects.r
    base = importr('base')

    data2 = {}
    for i in xrange(len(names)):
        data2[names[i]] = FloatVector((data[:,i]))

    x = robjects.DataFrame(data2)
    y = IntVector((target))
    print "running Boruta"
    result = boruta.Boruta(x, y)
    print result
    print boruta.attStats(result)
    f = file(outdir+"boruta.data", 'w')
    pickle.dump(result, f)
    f.close()