Esempio n. 1
0
 def testRepr(self):
     """Check that repr() of a converted pandas DataFrame can be produced.

     A DataFrame with boolean, integer, float, byte-string and unicode
     columns is converted with ``rpyp.pandas2ri``; the second line of its
     repr is compared against the expected summary of column types.
     """
     # NOTE: this should go to testVector, with other tests for repr()
     columns = OrderedDict()
     columns['b'] = numpy.array([True, False, True], dtype=numpy.bool_)
     columns['i'] = numpy.array([1, 2, 3], dtype="i")
     columns['f'] = numpy.array([1, 2, 3], dtype="f")
     columns['s'] = numpy.array(["a", "b", "c"], dtype="S")
     columns['u'] = numpy.array([u"a", u"b", u"c"], dtype="U")

     pd_df = pandas.core.frame.DataFrame(columns)
     rp_df = rpyp.pandas2ri(pd_df)

     # Building the repr used to fail with a TypeError.
     repr_lines = repr(rp_df).split('\n')
     expected = '[Array, Array, Array, FactorV..., FactorV...]'
     self.assertEqual(expected, repr_lines[1].strip())
Esempio n. 2
0
    def generateLocalMetrics(self, resource_url):
        """Compute complexity metrics for a resource and add them to the datacard.

        The resource at ``resource_url`` is downloaded and parsed as ARFF
        and/or CSV (depending on ``self.is_arff`` / ``self.is_csv``) into a
        pandas DataFrame. The frame is handed to R, ``complexity`` is called
        on it, and every returned metric is stored through
        ``self.add_to_datacard``.

        Errors raised while downloading or parsing are printed, not
        re-raised; processing continues with whatever frame had been built
        when the error occurred.

        The last column of the dataset is assumed to be the dependent
        attribute, because nothing else is known about the input data.
        """
        frame = pd.DataFrame()
        try:
            reply = requests.get(resource_url)
            text_stream = io.StringIO(reply.content.decode('utf-8'))
            if self.is_arff(resource_url):
                data, meta = arff.loadarff(text_stream)
                print('*** arff data: ', meta)
                frame = pd.DataFrame(data)
                # Keep only the non-numeric columns and strip single quotes
                # from their values.
                non_numeric = frame.select_dtypes(exclude=['number'])
                frame = non_numeric.applymap(
                    lambda cell: cell.replace("'", ""))
            if self.is_csv(resource_url):
                frame = pd.read_csv(text_stream)  # not tested yet
        except Exception as e:
            print('Exception caught: ', e)

        print('*** Numeric df: ', frame.dtypes)

        # Convert to an R data frame.
        # Latest rpy2 docs recommend pandas2ri.py2ri, but the latest rpy2
        # cannot be used because CKAN forces Python 2. The call below works
        # for rpy2 2.4.0 and has not been tested on other versions.
        pandas2ri.activate()
        r_frame = pandas2ri.pandas2ri(frame)

        # Call R functions on the data frame.
        ro.r('require(ECoL)')
        ro.globalenv['dat'] = r_frame
        # The conversion from pandas yields StrVector columns; the APIs
        # invoked below need them to be FactorVectors.
        ro.r('''dat[,1:ncol(dat)]=lapply(1:ncol(dat),function(x) {
        tryCatch({
        as.factor(dat[[x]])
        },warning = function(w) {
        dat[[x]]}
        )} )''')

        # Every column but the last is input; the last column is the output.
        metrics = ro.r('complexity(dat[-ncol(dat)], dat[ncol(dat)])')
        print('*** Complexity output: ', pandas2ri.ri2pandas(metrics),
              ' type: ', dir(metrics))

        for name in metrics.names:
            value = metrics.rx2(name)[0]
            print('** Datacard metric: ', name, ' -> ', value)
            # Metric names have the form "<group>.<metric>".
            group, metric = name.split('.', 1)
            self.add_to_datacard(group, metric, value)
Esempio n. 3
0
 def testDataFrame(self):
     """Check that a converted DataFrame keeps its row and column counts.

     The frame mixes boolean, integer, float, byte-string, unicode and
     datetime columns before being converted with ``rpyp.pandas2ri``.
     """
     columns = OrderedDict()
     columns['b'] = numpy.array([True, False, True], dtype=numpy.bool_)
     columns['i'] = numpy.array([1, 2, 3], dtype="i")
     columns['f'] = numpy.array([1, 2, 3], dtype="f")
     columns['s'] = numpy.array(["a", "b", "c"], dtype="S")
     columns['u'] = numpy.array([u"a", u"b", u"c"], dtype="U")
     columns['dates'] = [datetime(2012, 5, 2),
                         datetime(2012, 6, 3),
                         datetime(2012, 7, 1)]

     pd_df = pandas.core.frame.DataFrame(columns)
     rp_df = rpyp.pandas2ri(pd_df)

     n_rows, n_cols = pd_df.shape[0], pd_df.shape[1]
     self.assertEqual(n_rows, rp_df.nrow)
     self.assertEqual(n_cols, rp_df.ncol)
 def testRepr(self):
     """Verify the repr() of an R data frame built from a pandas DataFrame.

     The second line of the repr must list the converted column types.
     """
     # NOTE: this should go to testVector, with other tests for repr()
     names = ('b', 'i', 'f', 's', 'u')
     arrays = (
         numpy.array([True, False, True], dtype=numpy.bool_),
         numpy.array([1, 2, 3], dtype="i"),
         numpy.array([1, 2, 3], dtype="f"),
         numpy.array(["a", "b", "c"], dtype="S"),
         numpy.array([u"a", u"b", u"c"], dtype="U"),
     )
     od = OrderedDict(zip(names, arrays))
     pd_df = pandas.core.frame.DataFrame(od)
     rp_df = rpyp.pandas2ri(pd_df)

     text = repr(rp_df)  # used to fail with a TypeError
     second_line = text.split('\n')[1]
     self.assertEqual('[Array, Array, Array, FactorV..., FactorV...]',
                      second_line.strip())
Esempio n. 5
0
    def predict(self, X):
        """Predict responses for ``X`` using the fitted linear model and GAM.

        The stored linear weights are applied to ``X`` channel by channel;
        the per-channel linear predictions are then passed to the fitted
        GAM through mgcv's ``predict_gam``.

        Parameters
        ----------
        X : array
            Input matrix indexed as ``X[:, columns]``.

        Returns
        -------
        numpy.ndarray
            The GAM predictions (``type="response"``).

        Raises
        ------
        ValueError
            If no model has been fitted yet.
        """
        if self._model is None:
            raise ValueError('Fit model first!')

        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        mgcv = importr('mgcv')

        import pandas as pd

        n_channels = self.n_channels
        lin_model = self._linear_model

        n_samples = X.shape[0]
        w = np.copy(lin_model.get_weights())

        # Floor division keeps the count an integer; true division yields a
        # float on Python 3, which np.reshape rejects as a shape entry.
        n_per_channel = w.shape[0] // n_channels
        chan_ind = np.reshape(np.arange(w.shape[0]),
                              (n_per_channel, n_channels),
                              order='F')

        Yw_pred = np.zeros((n_samples, n_channels))
        for j in range(n_channels):
            # predictions for channel j
            cols = chan_ind[:, j]
            Yw_pred[:, j] = np.dot(X[:, cols], w[cols])

        # The first column is an all-zero placeholder for the response so
        # the frame has the same layout as self.columns.
        y = np.zeros((n_samples, ))
        YX = np.hstack((np.atleast_2d(y).T, Yw_pred))
        df = pd.DataFrame(YX, columns=self.columns)

        # Fall back to the other conversion entry point when py2ri is not
        # usable. Exception (not BaseException) lets KeyboardInterrupt and
        # SystemExit propagate.
        try:
            df_r = pandas2ri.py2ri(df)
        except Exception:
            df_r = pandas2ri.pandas2ri(df)

        # NOTE(review): ``se`` is passed as the string "False", not a
        # boolean - kept as in the original; confirm this is intended.
        y_pred = mgcv.predict_gam(self._model,
                                  df_r,
                                  type="response",
                                  se="False")

        return np.asarray(y_pred)
 def testDataFrame(self):
     """Convert a mixed-type DataFrame and compare its dimensions.

     The number of rows and columns reported by the converted object must
     match the shape of the original pandas DataFrame.
     """
     names = ('b', 'i', 'f', 's', 'u', 'dates')
     values = (
         numpy.array([True, False, True], dtype=numpy.bool_),
         numpy.array([1, 2, 3], dtype="i"),
         numpy.array([1, 2, 3], dtype="f"),
         numpy.array(["a", "b", "c"], dtype="S"),
         numpy.array([u"a", u"b", u"c"], dtype="U"),
         [datetime(2012, 5, 2), datetime(2012, 6, 3), datetime(2012, 7, 1)],
     )
     od = OrderedDict(zip(names, values))
     pd_df = pandas.core.frame.DataFrame(od)
     rp_df = rpyp.pandas2ri(pd_df)

     expected_rows = pd_df.shape[0]
     expected_cols = pd_df.shape[1]
     self.assertEqual(expected_rows, rp_df.nrow)
     self.assertEqual(expected_cols, rp_df.ncol)
Esempio n. 7
0
    def _fit_regression(self, dataset, target, **kwargs):
        """Fit an R ``lmer`` model for a continuous target.

        The pandas frame for ``target['name']`` is published in R's global
        environment as ``df_<name>``. The formula regresses the target on
        ``.``; each entry of ``self.confound_names`` is added as a
        ``(1|name)`` term and subtracted from the ``.`` expansion.

        Parameters
        ----------
        dataset
            Passed through to ``self._get_pd_df``.
        target : mapping
            Must provide the key ``'name'``.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        regression_base.ModelResult
            The trained model and its parameters, with
            ``response_type='continuous'``.
        """
        r_df_name = 'df_' + target['name']
        r_model_name = 'model_' + target['name']

        df = self._get_pd_df(dataset, target['name'])
        # TODO - handle this feature gracefully
        if '...' in set(df.columns):
            df.drop('...', axis=1, inplace=True)

        rpy2.robjects.globalenv[r_df_name] = pandas2ri.pandas2ri(df)

        grouping_terms = ''.join(
            ' + (1|%s)' % varname for varname in self.confound_names)
        removed_terms = ''.join(
            ' - %s' % varname for varname in self.confound_names)
        formula = '%s ~ . %s %s' % (target['name'], grouping_terms,
                                    removed_terms)

        # Build the command once; it is both logged and executed.
        command = "lmer(%s, data=%s, REML=FALSE)" % (formula, r_df_name)

        # A single parenthesised argument prints the same text on Python 2
        # and Python 3; the former print statement was Python 2 only. The
        # two spaces reproduce the statement's separator.
        print("MIXED: fitting  " + command)
        model, params = self._train_r_model(command, r_model_name)

        return regression_base.ModelResult(model=model,
                                           weights=params,
                                           response_type='continuous')
Esempio n. 8
0
    def _fit_classifier(self, dataset, target, level=None, **kwargs):
        """Fit an R ``glmer`` logit model for a categorical target.

        The pandas frame for ``target['name']`` (and ``level``) is published
        in R's global environment as ``df_<name>_<level>``. The response is
        ``<name>.<level>`` when ``level`` is truthy, otherwise ``<name>``.
        Each entry of ``self.confound_names`` is added as a ``(1|name)``
        term and subtracted from the ``.`` expansion.

        Returns a ``regression_base.ModelResult`` with
        ``response_type='categorical'``.
        """
        name = target['name']
        r_df_name = 'df_%s_%s' % (name, level)
        r_model_name = 'model_%s_%s' % (name, level)

        df = self._get_pd_df(dataset, name, level)
        # TODO - handle this feature gracefully
        if '...' in set(df.columns):
            df.drop('...', axis=1, inplace=True)

        rpy2.robjects.globalenv[r_df_name] = pandas2ri.pandas2ri(df)

        if level:
            response = '%s.%s' % (name, level)
        else:
            response = name

        grouping_terms = ''.join(
            ' + (1|%s)' % varname for varname in self.confound_names)
        removed_terms = ''.join(
            ' - %s' % varname for varname in self.confound_names)
        formula = '%s ~ . %s %s' % (response, grouping_terms, removed_terms)

        cmd = "glmer(%s, family=binomial(link='logit'), data=%s, REML=FALSE)"
        model, params = self._train_r_model(cmd % (formula, r_df_name),
                                            r_model_name)

        return regression_base.ModelResult(model=model,
                                           weights=params,
                                           response_type='categorical')
Esempio n. 9
0
    def fit(self, X, y):
        """Fit a linear model on ``X`` and a GAM on its per-channel output.

        Estimating a GAM using all dimensions does not work well, so the
        linear weights are estimated first with a linear model, and a GAM
        (mgcv ``gam``) is then fitted to the linear predictions of each
        channel.

        ``self.linear_model`` selects the linear stage: ``None`` or the
        string ``'ARD'`` uses ``linear_models.ARD``; the string ``'RIDGE'``
        uses ``linear_models.Ridge`` (strings are case-insensitive).

        On success the fitted GAM is stored in ``self._model`` and the
        linear model in ``self._linear_model``.

        Raises
        ------
        ValueError
            If ``self.linear_model`` is neither ``None`` nor one of the
            supported names. Previously this surfaced later as an
            UnboundLocalError.
        """
        from lnpy import linear as linear_models

        n_channels = self.n_channels

        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        import rpy2.robjects as ro
        pandas2ri.activate()
        import pandas as pd

        # pandas.rpy is optional; when it cannot be imported the rpy2
        # converters are used instead. Exception (not BaseException) lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            import pandas.rpy.common as com
            com_available = True
        except Exception:
            com_available = False

        mgcv = importr('mgcv')

        # Select the linear stage.
        if self.linear_model is None:
            lin_model = linear_models.ARD(verbose=False)
        elif isinstance(self.linear_model, string_types):
            model_name = self.linear_model.upper()
            if model_name == 'ARD':
                lin_model = linear_models.ARD(verbose=False)
            elif model_name == 'RIDGE':
                lin_model = linear_models.Ridge(verbose=False)
            else:
                raise ValueError(
                    "Unsupported linear model: %r" % (self.linear_model,))
        else:
            raise ValueError(
                "Unsupported linear model: %r" % (self.linear_model,))

        lin_model.fit(X, y)

        n_samples = X.shape[0]
        w = np.copy(lin_model.get_weights())

        # Floor division keeps the count an integer; true division yields a
        # float on Python 3, which np.reshape rejects as a shape entry.
        n_per_channel = w.shape[0] // n_channels
        chan_ind = np.reshape(np.arange(w.shape[0]),
                              (n_per_channel, n_channels),
                              order='F')

        Yw_pred = np.zeros((n_samples, n_channels))
        for j in range(n_channels):
            # predictions for channel j
            cols = chan_ind[:, j]
            Yw_pred[:, j] = np.dot(X[:, cols], w[cols])

        # fit GAM: response in the first column, channel predictions after
        YX = np.hstack((np.atleast_2d(y).T, Yw_pred))
        df = pd.DataFrame(YX, columns=self.columns)
        if com_available:
            df_r = com.convert_to_r_dataframe(df)
        else:
            try:
                df_r = pandas2ri.py2ri(df)
            except Exception:
                df_r = pandas2ri.pandas2ri(df)

        # The model formula is evaluated in R from self.model_string.
        gam_model = mgcv.gam(ro.r(self.model_string),
                             data=df_r,
                             family='gaussian()',
                             optimizer='perf')

        self._model = gam_model
        self._linear_model = lin_model
 def testSeries(self):
     """Convert a pandas Series and check the type of the result."""
     labels = ['a', 'b', 'c', 'd', 'e']
     values = numpy.random.randn(5)
     pd_series = pandas.core.series.Series(values, index=labels)
     converted = rpyp.pandas2ri(pd_series)
     self.assertEqual(rinterface.SexpVector, type(converted))
Esempio n. 11
0
 def testSeries(self):
     """Check that a labelled pandas Series converts to an SexpVector."""
     series = pandas.core.series.Series(
         numpy.random.randn(5),
         index=['a', 'b', 'c', 'd', 'e'])
     result_type = type(rpyp.pandas2ri(series))
     self.assertEqual(rinterface.SexpVector, result_type)