def testRepr(self):
    """Check the second line of repr() for a converted pandas DataFrame."""
    # NOTE: this should go to testVector, with other tests for repr()
    columns = OrderedDict()
    columns['b'] = numpy.array([True, False, True], dtype=numpy.bool_)
    columns['i'] = numpy.array([1, 2, 3], dtype="i")
    columns['f'] = numpy.array([1, 2, 3], dtype="f")
    columns['s'] = numpy.array(["a", "b", "c"], dtype="S")
    columns['u'] = numpy.array([u"a", u"b", u"c"], dtype="U")

    pandas_frame = pandas.core.frame.DataFrame(columns)
    r_frame = rpyp.pandas2ri(pandas_frame)

    # Calling repr() on the converted frame used to fail with a TypeError.
    repr_lines = repr(r_frame).split('\n')
    expected = '[Array, Array, Array, FactorV..., FactorV...]'
    self.assertEqual(expected, repr_lines[1].strip())
def generateLocalMetrics(self, resource_url):
    """Download a dataset, run R's ``complexity()`` on it and record the metrics.

    The resource at ``resource_url`` is fetched and loaded into a pandas
    DataFrame (ARFF and/or CSV, as decided by ``self.is_arff`` and
    ``self.is_csv``).  The frame is converted to an R data frame, bound to
    ``dat`` in the R global environment, and ``complexity()`` is evaluated
    with the last column as the dependent attribute.  Every named entry of
    the result is forwarded to ``self.add_to_datacard``.

    Any exception raised while downloading or parsing is printed, and
    processing continues with whatever frame was built up to that point
    (an empty frame if nothing was loaded).
    """
    frame = pd.DataFrame()
    try:
        reply = requests.get(resource_url)
        text_buffer = io.StringIO(reply.content.decode('utf-8'))
        if self.is_arff(resource_url):
            data, meta = arff.loadarff(text_buffer)
            print('*** arff data: ', meta)
            frame = pd.DataFrame(data)
            # Keep only the non-numeric columns and strip single quotes
            # from their values.
            textual = frame.select_dtypes(exclude=['number'])
            frame = textual.applymap(lambda cell: cell.replace("'", ""))
        if self.is_csv(resource_url):
            # not tested yet
            frame = pd.read_csv(text_buffer)
    except Exception as e:
        print('Exception caught: ', e)
    print('*** Numeric df: ', frame.dtypes)

    # Convert to an R data frame.
    # The latest rpy2 docs recommend pandas2ri.py2ri, but a newer rpy2 cannot
    # be used because CKAN forces Python 2.  The call below works for
    # rpy2 2.4.0; it has not been tested on other versions.
    pandas2ri.activate()
    r_frame = pandas2ri.pandas2ri(frame)

    # Call R functions on the data frame.
    ro.r('require(ECoL)')
    ro.globalenv['dat'] = r_frame
    # Conversion from pandas yields StrVector columns; the R APIs invoked
    # below need FactorVectors, so coerce each column (falling back to the
    # untouched column when as.factor() raises a warning).
    ro.r(
        'dat[,1:ncol(dat)]=lapply(1:ncol(dat),function(x) { '
        'tryCatch({ as.factor(dat[[x]]) },warning = function(w) { dat[[x]]} )} )'
    )

    # Lacking knowledge of the input data, the last column of the dataset is
    # assumed to be the dependent attribute.
    result = ro.r('complexity(dat[-ncol(dat)], dat[ncol(dat)])')
    print('*** Complexity output: ', pandas2ri.ri2pandas(result),
          ' type: ', dir(result))

    for name in result.names:
        value = result.rx2(name)[0]
        print('** Datacard metric: ', name, ' -> ', value)
        # Names are split on the first "." into (group, metric).
        group, metric = name.split('.', 1)
        self.add_to_datacard(group, metric, value)
def testDataFrame(self):
    """A converted DataFrame keeps the row and column counts of the original."""
    columns = OrderedDict()
    columns['b'] = numpy.array([True, False, True], dtype=numpy.bool_)
    columns['i'] = numpy.array([1, 2, 3], dtype="i")
    columns['f'] = numpy.array([1, 2, 3], dtype="f")
    columns['s'] = numpy.array(["a", "b", "c"], dtype="S")
    columns['u'] = numpy.array([u"a", u"b", u"c"], dtype="U")
    columns['dates'] = [
        datetime(2012, 5, 2),
        datetime(2012, 6, 3),
        datetime(2012, 7, 1),
    ]

    pandas_frame = pandas.core.frame.DataFrame(columns)
    r_frame = rpyp.pandas2ri(pandas_frame)

    self.assertEqual(pandas_frame.shape[0], r_frame.nrow)
    self.assertEqual(pandas_frame.shape[1], r_frame.ncol)
def predict(self, X):
    """Predict responses for ``X`` using the fitted GAM.

    For every channel, a linear prediction is computed from the stored
    linear model's weights.  The per-channel predictions, preceded by an
    all-zero placeholder response column, are put into a pandas DataFrame
    with columns ``self.columns``, converted to an R data frame and passed
    to mgcv's ``predict.gam``.

    Parameters
    ----------
    X : array
        Input data; ``X.shape[0]`` is the number of samples and the columns
        are indexed by weight position.

    Returns
    -------
    numpy.ndarray
        ``np.asarray`` of the value returned by ``predict.gam``.

    Raises
    ------
    ValueError
        If ``self._model`` is None, i.e. no model has been fitted.
    """
    if self._model is None:
        raise ValueError('Fit model first!')

    # Imported lazily so the R stack is only required when predicting.
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    import pandas as pd

    mgcv = importr('mgcv')

    n_channels = self.n_channels
    N = X.shape[0]
    w = np.copy(self._linear_model.get_weights())

    # Floor division: np.reshape requires an integer shape, and "/" yields
    # a float on Python 3.
    n_per_channel = w.shape[0] // n_channels
    chan_ind = np.reshape(np.arange(w.shape[0]),
                          (n_per_channel, n_channels), order='F')

    Yw_pred = np.zeros((N, n_channels))
    for j in range(n_channels):
        # predictions for channel j
        cols = chan_ind[:, j]
        Yw_pred[:, j] = np.dot(X[:, cols], w[cols])

    # The response column is only a placeholder so that the frame has the
    # same layout as self.columns.
    y = np.zeros((N, ))
    YX = np.hstack((np.atleast_2d(y).T, Yw_pred))
    df = pd.DataFrame(YX, columns=self.columns)

    # Try the py2ri converter first and fall back to pandas2ri.pandas2ri
    # when it is unavailable or fails.  Only Exception is caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        df_r = pandas2ri.py2ri(df)
    except Exception:
        df_r = pandas2ri.pandas2ri(df)

    # NOTE(review): ``se`` is passed as the string "False", not a boolean;
    # left unchanged - confirm this is what predict.gam should receive.
    y_pred = mgcv.predict_gam(self._model, df_r, type="response", se="False")
    return np.asarray(y_pred)
def testDataFrame(self):
    """Row and column counts must survive the pandas -> R conversion."""
    sample_dates = [datetime(2012, 5, 2),
                    datetime(2012, 6, 3),
                    datetime(2012, 7, 1)]
    column_pairs = [
        ('b', numpy.array([True, False, True], dtype=numpy.bool_)),
        ('i', numpy.array([1, 2, 3], dtype="i")),
        ('f', numpy.array([1, 2, 3], dtype="f")),
        ('s', numpy.array(["a", "b", "c"], dtype="S")),
        ('u', numpy.array([u"a", u"b", u"c"], dtype="U")),
        ('dates', sample_dates),
    ]
    source_frame = pandas.core.frame.DataFrame(OrderedDict(column_pairs))
    converted = rpyp.pandas2ri(source_frame)

    self.assertEqual(source_frame.shape[0], converted.nrow)
    self.assertEqual(source_frame.shape[1], converted.ncol)
def _fit_regression(self, dataset, target, **kwargs):
    """Fit a mixed-effects regression for ``target`` with R's ``lmer``.

    The pandas frame returned by ``self._get_pd_df`` is bound in the R
    global environment as ``df_<name>``.  The formula regresses the target
    on all columns (``.``), adds a random intercept ``(1|c)`` for every
    name in ``self.confound_names`` and subtracts the same names from the
    fixed effects.  The resulting command is handed to
    ``self._train_r_model`` together with the model name ``model_<name>``.

    Parameters
    ----------
    dataset
        Passed through to ``self._get_pd_df``.
    target : mapping
        Must provide ``target['name']``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    regression_base.ModelResult
        Built with ``response_type='continuous'``.
    """
    target_name = target['name']
    r_df_name = 'df_' + target_name
    r_model_name = 'model_' + target_name

    df = self._get_pd_df(dataset, target_name)
    # TODO - handle this feature gracefully
    if '...' in set(df.columns):
        df.drop('...', axis=1, inplace=True)
    rpy2.robjects.globalenv[r_df_name] = pandas2ri.pandas2ri(df)

    random_effects = ''.join(
        ' + (1|%s)' % varname for varname in self.confound_names)
    excluded_fixed = ''.join(
        ' - %s' % varname for varname in self.confound_names)
    formula = '%s ~ . %s %s' % (target_name, random_effects, excluded_fixed)

    # Build the R command once and reuse it for logging and training.
    command = "lmer(%s, data=%s, REML=FALSE)" % (formula, r_df_name)

    # Single-argument print: same output as the former Python 2 print
    # statement (note the two spaces), and valid syntax on Python 3 too.
    print("MIXED: fitting  %s" % command)

    model, params = self._train_r_model(command, r_model_name)
    return regression_base.ModelResult(model=model,
                                       weights=params,
                                       response_type='continuous')
def _fit_classifier(self, dataset, target, level=None, **kwargs):
    """Fit a logistic mixed-effects classifier for ``target`` with R's ``glmer``.

    The pandas frame from ``self._get_pd_df`` is bound in the R global
    environment as ``df_<name>_<level>``.  The response is
    ``<name>.<level>`` when ``level`` is truthy and ``<name>`` otherwise;
    every name in ``self.confound_names`` is added as a random intercept
    and subtracted from the fixed effects.  Returns a
    ``regression_base.ModelResult`` with ``response_type='categorical'``.
    """
    suffix = (target['name'], level)
    r_df_name = 'df_%s_%s' % suffix
    r_model_name = 'model_%s_%s' % suffix

    df = self._get_pd_df(dataset, target['name'], level)
    # TODO - handle this feature gracefully
    if '...' in set(df.columns):
        df.drop('...', axis=1, inplace=True)
    rpy2.robjects.globalenv[r_df_name] = pandas2ri.pandas2ri(df)

    cmd = "glmer(%s, family=binomial(link='logit'), data=%s, REML=FALSE)"

    if level:
        response = '%s.%s' % (target['name'], level)
    else:
        response = target['name']
    random_effects = ''.join(
        [' + (1|%s)' % varname for varname in self.confound_names])
    excluded_fixed = ''.join(
        [' - %s' % varname for varname in self.confound_names])
    formula = '%s ~ . %s %s' % (response, random_effects, excluded_fixed)

    model, params = self._train_r_model(cmd % (formula, r_df_name),
                                        r_model_name)
    return regression_base.ModelResult(model=model,
                                       weights=params,
                                       response_type='categorical')
def fit(self, X, y):
    """Fit a linear model, then a GAM on its per-channel predictions.

    Estimating a GAM using all dimensions is not working well, so the
    linear weights are estimated with a linear model first and a GAM
    (mgcv ``gam``) is then fitted to the linear predictions of each
    channel.

    ``self.linear_model`` selects the linear model: ``None`` or ``'ARD'``
    uses ``lnpy.linear.ARD``, ``'RIDGE'`` uses ``lnpy.linear.Ridge``
    (names are case-insensitive).

    On success the fitted GAM is stored in ``self._model`` and the fitted
    linear model in ``self._linear_model``.

    Parameters
    ----------
    X : array
        Input data; ``X.shape[0]`` is the number of samples.
    y : array
        Responses; used as the first column of the data frame given to R.

    Raises
    ------
    ValueError
        If ``self.linear_model`` is a string other than 'ARD' / 'RIDGE'.
    TypeError
        If ``self.linear_model`` is neither ``None`` nor a string.
    """
    from lnpy import linear as linear_models
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro
    import pandas as pd

    pandas2ri.activate()

    # pandas.rpy is only available in some pandas versions; fall back to
    # the rpy2 converters when it cannot be imported.
    try:
        import pandas.rpy.common as com
    except Exception:
        com = None

    mgcv = importr('mgcv')

    # Choose the linear model.  Unknown selections now fail with an
    # explicit error instead of leaving ``lin_model`` unbound.
    selection = self.linear_model
    if selection is None:
        lin_model = linear_models.ARD(verbose=False)
    elif isinstance(selection, string_types):
        kind = selection.upper()
        if kind == 'ARD':
            lin_model = linear_models.ARD(verbose=False)
        elif kind == 'RIDGE':
            lin_model = linear_models.Ridge(verbose=False)
        else:
            raise ValueError(
                "Unknown linear model %r; expected 'ARD' or 'RIDGE'"
                % (selection,))
    else:
        raise TypeError(
            "linear_model must be None or a string, got %r"
            % (type(selection),))
    lin_model.fit(X, y)

    n_channels = self.n_channels
    N = X.shape[0]
    w = np.copy(lin_model.get_weights())

    # Floor division: np.reshape requires an integer shape, and "/" yields
    # a float on Python 3.
    n_per_channel = w.shape[0] // n_channels
    chan_ind = np.reshape(np.arange(w.shape[0]),
                          (n_per_channel, n_channels), order='F')

    Yw_pred = np.zeros((N, n_channels))
    for j in range(n_channels):
        # predictions for channel j
        cols = chan_ind[:, j]
        Yw_pred[:, j] = np.dot(X[:, cols], w[cols])

    # fit GAM
    YX = np.hstack((np.atleast_2d(y).T, Yw_pred))
    df = pd.DataFrame(YX, columns=self.columns)

    if com is not None:
        df_r = com.convert_to_r_dataframe(df)
    else:
        # Only Exception is caught so KeyboardInterrupt / SystemExit
        # still propagate.
        try:
            df_r = pandas2ri.py2ri(df)
        except Exception:
            df_r = pandas2ri.pandas2ri(df)

    gam_model = mgcv.gam(ro.r(self.model_string), data=df_r,
                         family='gaussian()', optimizer='perf')

    self._model = gam_model
    self._linear_model = lin_model
def testSeries(self):
    """Converting a pandas Series yields an object of type rinterface.SexpVector."""
    labels = ['a', 'b', 'c', 'd', 'e']
    values = numpy.random.randn(5)
    series = pandas.core.series.Series(values, index=labels)

    converted = rpyp.pandas2ri(series)

    self.assertEqual(rinterface.SexpVector, type(converted))