def test_stata_writer_pandas():
    """Round-trip the macrodata frame through StataWriter and genfromdta.

    The frame is written with int64 ``year``/``quarter`` columns; writing
    must not emit warnings.  Depending on the integer width that comes back
    from the reader, the result is compared either against the int64 frame
    (including the index column) or against an int32 copy (index skipped).
    """
    buf = BytesIO()
    dta = macrodata.load_pandas().data
    dta4 = dta.copy()
    for col in ('year', 'quarter'):
        dta[col] = dta[col].astype(np.int64)
        dta4[col] = dta4[col].astype(np.int32)

    # dta is int64 'i8' given to Stata writer
    with pytest.warns(FutureWarning):
        writer = StataWriter(buf, dta)
    with warnings.catch_warnings(record=True) as w:
        writer.write_file()
        assert len(w) == 0
    buf.seek(0)

    with pytest.warns(FutureWarning):
        dta2 = genfromdta(buf)
    dta5 = DataFrame.from_records(dta2)

    # dta2 is int32 'i4' returned from Stata reader.
    # Use positional access (.iloc) because ``dtypes`` is labelled by column
    # name, and compare dtypes by equality rather than object identity.
    if dta5.dtypes.iloc[1] == np.dtype('int64'):
        assert_frame_equal(dta.reset_index(), dta5)
    else:
        # do not check index because it has different size, int32 versus int64
        assert_frame_equal(dta4, dta5[dta5.columns[1:]])
def test_datetime_roundtrip():
    """Write datetime data with StataWriter and read it back unchanged.

    The round trip is exercised twice: once with a structured array and
    once with the DataFrame built from that array.
    """
    records = np.array(
        [
            (1, datetime(2010, 1, 1), 2),
            (2, datetime(2010, 2, 1), 3),
            (4, datetime(2010, 3, 1), 5),
        ],
        dtype=[('var1', float), ('var2', object), ('var3', float)],
    )

    # structured array in, structured array out
    stream = BytesIO()
    with pytest.warns(FutureWarning):
        array_writer = StataWriter(stream, records, {"var2": "tm"})
    array_writer.write_file()
    stream.seek(0)
    with pytest.warns(FutureWarning):
        restored_records = genfromdta(stream)
    assert_equal(records, restored_records)

    # DataFrame in, DataFrame out (the reader adds an 'index' column)
    frame = DataFrame.from_records(records)
    stream = BytesIO()
    with pytest.warns(FutureWarning):
        frame_writer = StataWriter(stream, frame, {"var2": "tm"})
    frame_writer.write_file()
    stream.seek(0)
    with pytest.warns(FutureWarning):
        restored_frame = genfromdta(stream, pandas=True)
    assert_frame_equal(frame, restored_frame.drop('index', axis=1))
def test_sort(self):
    """Check ``grouping.sort`` for DataFrame, 2d array, Series and 1d array."""
    grouping = self.grouping
    frame = self.data

    # data frame
    frame_sorted = None
    result, result_index = grouping.sort(frame)
    frame_sorted = frame.sort_index()
    assert_frame_equal(result, frame_sorted)
    np.testing.assert_(isinstance(result, pd.DataFrame))
    np.testing.assert_(not result_index.equals(grouping.index))

    # make sure it copied
    if hasattr(result, 'equals'):  # newer pandas
        np.testing.assert_(not result.equals(frame))

    # 2d arrays
    result, result_index = grouping.sort(frame.values)
    np.testing.assert_array_equal(result, frame_sorted.values)
    np.testing.assert_(isinstance(result, np.ndarray))

    # 1d series
    first_column = frame[frame.columns[0]]
    result, result_index = grouping.sort(first_column)
    series_sorted = first_column.sort_index()
    assert_series_equal(result, series_sorted)
    np.testing.assert_(isinstance(result, pd.Series))
    if hasattr(result, 'equals'):
        np.testing.assert_(not result.equals(first_column))

    # 1d array
    raw = first_column.values
    result, result_index = grouping.sort(raw)
    raw_sorted = first_column.sort_index().values
    np.testing.assert_array_equal(result, raw_sorted)
    np.testing.assert_(isinstance(result, np.ndarray))
def test_repeated_measures_aggregate_compare_with_ezANOVA():
    """Aggregated AnovaRM on doubled data should reproduce R's `ezANOVA`.

    The reference values come from R's `ezANOVA` (library ez).
    """
    effects = pd.Index(['A', 'B', 'D', 'A:B', 'A:D', 'B:D', 'A:B:D'])
    f_values = [
        8.7650709, 8.4985785, 20.5076546, 0.8457797,
        21.7593382, 6.2416695, 5.4253359,
    ]
    p_values = [
        0.021087505, 0.003833921, 0.002704428, 0.450021759,
        0.002301792, 0.011536846, 0.018010647,
    ]
    reference = pd.DataFrame(
        {
            'F Value': f_values,
            'Num DF': [1, 2, 1, 2, 1, 2, 2],
            'Den DF': [7, 14, 7, 14, 7, 14, 14],
            'Pr > F': p_values,
        },
        index=effects,
    )
    # fix the column order explicitly
    reference = reference[['F Value', 'Num DF', 'Den DF', 'Pr > F']]

    doubled = pd.concat([data, data], axis=0)
    model = AnovaRM(doubled, 'DV', 'id', within=['A', 'B', 'D'],
                    aggregate_func=np.mean)
    table = model.fit().anova_table

    assert_frame_equal(reference, table, check_dtype=False)
def test_should_calculate_addChangePct():
    """
    Adds the close percentage to the DataFrame : close_pc
    Adds the cumulative returns the DataFrame : close_cpc

    Excellent video to understand cumulative returns :
    https://www.youtube.com/watch?v=fWHQwqT3lNY
    """
    # GIVEN a series of values
    closes_list = [0.0003, 0.0004, 0.0010, 0.0020, 0.0009]
    dates = [
        '2021-10-10 14:30:00',
        '2021-10-10 14:31:00',
        '2021-10-10 14:32:00',
        '2021-10-10 14:33:00',
        '2021-10-10 14:34:00',
    ]
    # Year-month-day order; the previous "%Y-%d-%m" pattern had day and month
    # swapped (harmless here only because both are 10).
    date_format = "%Y-%m-%d %H:%M:%S"

    df = pd.DataFrame({'date': list(dates), 'close': closes_list})
    df['date'] = pd.to_datetime(df['date'], format=date_format)
    # NOTE: the former ``df.set_index(['date'])`` call discarded its result
    # (set_index returns a new frame), so it was a no-op and has been removed.
    # The frame keeps its default integer index.

    ta = TechnicalAnalysis(df)

    # WHEN calculate the percentage evolution and cumulative returns percentage
    ta.addChangePct()

    # THEN percentage evolution and cumulative returns percentage should be
    # added to dataframe
    actual = ta.getDataFrame()

    # First entry compares the first close with itself, the others compare
    # each close with its predecessor.
    close_pc = [calculate_percentage_evol(closes_list[0], closes_list[0])]
    for previous, current in zip(closes_list, closes_list[1:]):
        close_pc.append(calculate_percentage_evol(previous, current))

    # Cumulative return compounds each period's change onto the running total.
    close_cpc = [0.000000]
    for change in close_pc[1:]:
        close_cpc.append((1 + change) * (1 + close_cpc[-1]) - 1)

    expected = pd.DataFrame({
        'date': list(dates),
        'close': closes_list,
        'close_pc': close_pc,
        'close_cpc': close_cpc,
    })
    # Reuse the already parsed input column, as the original test did.
    expected['date'] = pd.to_datetime(df['date'], format=date_format)

    assert_frame_equal(actual, expected)
def test_cohn(self):
    """Compare selected Cohn-number columns with the expected frame."""
    columns = ['nuncen_above', 'nobs_below', 'ncen_equal', 'prob_exceedance']
    computed = ros.cohn_numbers(self.df, self.rescol, self.cencol)

    # Use round in place of the deprecated check_less_precise arg
    actual = np.round(computed[columns], 3)
    desired = np.round(self.expected_cohn[columns], 3)
    assert_frame_equal(actual, desired)
def test_pandas(self):
    """Check ``_ensure_2d`` output for a DataFrame and for a Series."""
    frame = self.df

    # DataFrame input: frame is returned along with its column labels
    from_frame = tools._ensure_2d(frame, False)
    assert_frame_equal(from_frame[0], frame)
    assert_array_equal(from_frame[1], frame.columns)

    # Series input: compared with the single-column frame and its label
    from_series = tools._ensure_2d(self.series, False)
    first_column_frame = frame.iloc[:, [0]]
    assert_frame_equal(from_series[0], first_column_frame)
    assert_equal(from_series[1], frame.columns[0])
def test_add_constant_dataframe(self):
    """``add_constant`` on a mixed-dtype frame yields a leading 'const'."""
    rows = [[1.0, 'a', 4], [2.0, 'bc', 9], [3.0, 'def', 16]]
    frame = pd.DataFrame(rows)
    result = tools.add_constant(frame)

    # the constant column itself
    const_column = pd.Series([1.0, 1.0, 1.0], name='const')
    assert_series_equal(const_column, result['const'])

    # the whole frame, with the constant inserted in first position
    with_const = frame.copy()
    with_const.insert(0, 'const', np.ones(3))
    assert_frame_equal(with_const, result)
def test_add_constant_dataframe(self):
    """``add_constant`` on a mixed-dtype frame yields a leading "const"."""
    rows = [[1.0, "a", 4], [2.0, "bc", 9], [3.0, "def", 16]]
    frame = pd.DataFrame(rows)
    result = tools.add_constant(frame)

    # the constant column itself
    const_column = pd.Series([1.0, 1.0, 1.0], name="const")
    assert_series_equal(const_column, result["const"])

    # the whole frame, with the constant inserted in first position
    with_const = frame.copy()
    with_const.insert(0, "const", np.ones(3))
    assert_frame_equal(with_const, result)
def check_predict_types(results):
    """
    Check that the `predict` method of the given results object produces
    the correct output type.

    The first two rows of ``results.model.exog`` are used as the prediction
    input, passed as an ndarray, as nested lists, and as a single row.

    Parameters
    ----------
    results : Results
        Results object exposing ``model.exog`` and ``predict``;
        ``fittedvalues`` is also read unless the unwrapped object is a
        ``GLMResults`` or ``DiscreteResults`` instance.

    Raises
    ------
    AssertionError
        If a prediction differs from the fitted values, or if the
        pandas-wrapped prediction differs from the expected Series/DataFrame.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source; confirm that the pandas-wrapping checks belong to the ``else``
    # branch rather than to the function top level.
    res = results
    # squeeze to make 1d for single regressor test case
    p_exog = np.squeeze(np.asarray(res.model.exog[:2]))

    # ignore wrapper for isinstance check
    from statsmodels.genmod.generalized_linear_model import GLMResults
    from statsmodels.discrete.discrete_model import DiscreteResults
    from statsmodels.compat.pandas import assert_frame_equal, assert_series_equal

    # possibly unwrap -- GEE has no wrapper
    results = getattr(results, '_results', results)

    if isinstance(results, (GLMResults, DiscreteResults)):
        # SMOKE test only TODO: mark this somehow
        # (only checks that the calls run; return values are not inspected)
        res.predict(p_exog)
        res.predict(p_exog.tolist())
        res.predict(p_exog[0].tolist())
    else:
        fitted = res.fittedvalues[:2]
        assert_allclose(fitted, res.predict(p_exog), rtol=1e-12)
        # this needs reshape to column-vector:
        assert_allclose(fitted, res.predict(np.squeeze(p_exog).tolist()),
                        rtol=1e-12)
        # only one prediction:
        assert_allclose(fitted[:1], res.predict(p_exog[0].tolist()),
                        rtol=1e-12)
        assert_allclose(fitted[:1], res.predict(p_exog[0]),
                        rtol=1e-12)

        # Check that pandas wrapping works as expected
        exog_index = range(len(p_exog))
        predicted = res.predict(p_exog)

        # wrap the input in the pandas container matching its dimension
        cls = pd.Series if p_exog.ndim == 1 else pd.DataFrame
        predicted_pandas = res.predict(cls(p_exog, index=exog_index))

        # predicted.ndim may not match p_exog.ndim because it may be squeezed
        # if p_exog has only one column
        cls = pd.Series if predicted.ndim == 1 else pd.DataFrame
        predicted_expected = cls(predicted, index=exog_index)
        if isinstance(predicted_expected, pd.Series):
            assert_series_equal(predicted_expected, predicted_pandas)
        else:
            assert_frame_equal(predicted_expected, predicted_pandas)
def test_repeated_measures_aggregation_one_subject_duplicated():
    """Aggregating with the mean must absorb one duplicated subject.

    The rows of subject ``'1'`` are appended a second time; with
    ``aggregate_func=np.mean`` the ANOVA table must equal the table fitted
    on the original data.
    """
    df1 = AnovaRM(data, 'DV', 'id', within=['A', 'B', 'D']).fit()

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` stacks
    # the rows the same way.
    subject_one = data.loc[data['id'] == '1', :]
    duplicated = pd.concat([data, subject_one]).reset_index()
    df2 = AnovaRM(duplicated, 'DV', 'id', within=['A', 'B', 'D'],
                  aggregate_func=np.mean).fit()

    assert_frame_equal(df1.anova_table, df2.anova_table)
def test_repeated_measures_aggregation():
    """Aggregating fully duplicated data must reproduce the original table.

    The data set is stacked on itself; with ``aggregate_func=np.mean`` the
    ANOVA table must equal the table fitted on the original data.
    """
    df1 = AnovaRM(data, 'DV', 'id', within=['A', 'B', 'D']).fit()

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` stacks
    # the rows the same way.
    double_data = pd.concat([data, data], axis=0)
    df2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'],
                  aggregate_func=np.mean).fit()

    assert_frame_equal(df1.anova_table, df2.anova_table)
def test_categorical_dataframe(string_var):
    """``tools.categorical`` on frames must match ``pd.get_dummies``."""
    # single-column frame built from the fixture
    single = pd.DataFrame(string_var)
    encoded = tools.categorical(single, "string_var", drop=True)
    reference = pd.get_dummies(pd.Categorical(string_var))
    assert_frame_equal(encoded, reference)

    # two-column frame: only the selected column is encoded
    double = pd.DataFrame({"apple": string_var, "ban": string_var})
    encoded = tools.categorical(double, "apple", drop=True)
    assert_frame_equal(encoded, reference)
def test_noop(self):
    """``handle_missing`` with ``missing='none'`` returns data unchanged."""
    frame = make_dataframe()
    frame.values[[2, 5, 10], [2, 3, 1]] = np.nan

    endog_name = frame.columns[0]
    exog_names = frame.columns[1:]
    y, X = frame[endog_name], frame[exog_names]
    handled, _ = sm_data.handle_missing(y, X, missing='none')

    # expectations are taken from the untouched frame
    expected_endog, expected_exog = frame[endog_name], frame[exog_names]
    assert_frame_equal(handled['exog'], expected_exog)
    assert_series_equal(handled['endog'], expected_endog)
def test_categorical_dataframe(string_var):
    """``tools.categorical`` on frames must match ``pd.get_dummies``."""
    # single-column frame built from the fixture
    single = pd.DataFrame(string_var)
    encoded = tools.categorical(single, 'string_var', drop=True)
    reference = pd.get_dummies(pd.Categorical(string_var))
    assert_frame_equal(encoded, reference)

    # two-column frame: only the selected column is encoded
    double = pd.DataFrame({'apple': string_var, 'ban': string_var})
    encoded = tools.categorical(double, 'apple', drop=True)
    assert_frame_equal(encoded, reference)
def test_unobserved_components_time_varying(revisions, updates):
    """Compare `news` from a time-varying and a time-invariant UC model.

    This is primarily a test that the `news` method works with a
    time-varying setup (i.e. time-varying state space matrices). It tests a
    time-varying UnobservedComponents model where the time-varying component
    has been set to zeros against a time-invariant version of the model.
    """
    # Construct previous and updated datasets
    inflation = dta['infl'].copy()
    previous_end = '2009Q2' if updates else '2009Q3'
    endog1 = inflation.loc[:previous_end].copy()
    endog2 = inflation.loc[:'2009Q3'].copy()

    # Without updates and without NaN values, we need to specify that
    # the type of the comparison object that we're passing is "updated"
    comparison_type = None if updates else 'updated'

    if revisions:
        endog1.iloc[-1] = 0.

    exog1 = np.ones_like(endog1)
    exog2 = np.ones_like(endog2)

    # Compute the news from a model with a trend/exog term (so the model is
    # time-varying), but with the coefficient set to zero (so that it will be
    # equivalent to the time-invariant model)
    varying_model = structural.UnobservedComponents(endog1, 'llevel',
                                                    exog=exog1)
    varying_results = varying_model.smooth([0.5, 0.2, 0.0])
    news1 = varying_results.news(endog2, exog=exog2, start='2008Q1',
                                 end='2009Q3',
                                 comparison_type=comparison_type)

    # Compute the news from a model without a trend term
    invariant_model = structural.UnobservedComponents(endog1, 'llevel')
    invariant_results = invariant_model.smooth([0.5, 0.2])
    news2 = invariant_results.news(endog2, start='2008Q1', end='2009Q3',
                                   comparison_type=comparison_type)

    compared_attributes = (
        'total_impacts', 'update_impacts', 'revision_impacts', 'news',
        'weights', 'update_forecasts', 'update_realized',
        'prev_impacted_forecasts', 'post_impacted_forecasts',
        'revisions_iloc', 'revisions_ix', 'updates_iloc', 'updates_ix',
    )
    for name in compared_attributes:
        actual = getattr(news1, name)
        desired = getattr(news2, name)
        # pick the comparison helper from the type of the reference value
        if isinstance(desired, pd.Series):
            compare = assert_series_equal
        else:
            compare = assert_frame_equal
        compare(actual, desired)
def test_dynamic_factor_time_varying(revisions, updates):
    """Compare `news` from a time-varying and a time-invariant DFM.

    This is primarily a test that the `news` method works with a
    time-varying setup (i.e. time-varying state space matrices). It tests a
    time-varying DynamicFactor model where the time-varying component has
    been set to zeros against a time-invariant version of the model.
    """
    # Construct previous and updated datasets
    observed = dta[['realgdp', 'unemp']].copy()
    observed['realgdp'] = np.log(observed['realgdp']).diff() * 400
    observed = observed.iloc[1:]

    previous_end = '2009Q2' if updates else '2009Q3'
    endog1 = observed.loc[:previous_end].copy()
    endog2 = observed.loc[:'2009Q3'].copy()

    # Without updates and without NaN values, we need to specify that
    # the type of the comparison object that we're passing is "updated"
    comparison_type = None if updates else 'updated'

    if revisions:
        # TODO: add test for only one of the variables revising?
        endog1.iloc[-1] = 0.

    exog1 = np.ones_like(endog1['realgdp'])
    exog2 = np.ones_like(endog2['realgdp'])
    params1 = np.r_[0.9, 0.2, 0.0, 0.0, 1.2, 1.1, 0.5, 0.2]
    params2 = np.r_[0.9, 0.2, 1.2, 1.1, 0.5, 0.2]

    # Compute the news from a model with an exog term (so the model is
    # time-varying), but with the coefficient set to zero (so that it will be
    # equivalent to the time-invariant model)
    varying_model = dynamic_factor.DynamicFactor(endog1, exog=exog1,
                                                 k_factors=1, factor_order=2)
    varying_results = varying_model.smooth(params1)
    news1 = varying_results.news(endog2, exog=exog2, start='2008Q1',
                                 end='2009Q3',
                                 comparison_type=comparison_type)

    # Compute the news from a model without a trend term
    invariant_model = dynamic_factor.DynamicFactor(endog1, k_factors=1,
                                                   factor_order=2)
    invariant_results = invariant_model.smooth(params2)
    news2 = invariant_results.news(endog2, start='2008Q1', end='2009Q3',
                                   comparison_type=comparison_type)

    compared_attributes = (
        'total_impacts', 'update_impacts', 'revision_impacts', 'news',
        'weights', 'update_forecasts', 'update_realized',
        'prev_impacted_forecasts', 'post_impacted_forecasts',
        'revisions_iloc', 'revisions_ix', 'updates_iloc', 'updates_ix',
    )
    for name in compared_attributes:
        actual = getattr(news1, name)
        desired = getattr(news2, name)
        # pick the comparison helper from the type of the reference value
        if isinstance(desired, pd.Series):
            compare = assert_series_equal
        else:
            compare = assert_frame_equal
        compare(actual, desired)
def test_array_pandas(self):
    """``missing='drop'`` with an ndarray endog and a DataFrame exog."""
    frame = make_dataframe()
    frame.values[[2, 5, 10], [2, 3, 1]] = np.nan

    endog_name = frame.columns[0]
    exog_names = frame.columns[1:]
    y = frame[endog_name].values
    X = frame[exog_names]
    handled, _ = sm_data.handle_missing(y, X, missing='drop')

    # expectation: the same split applied to the frame without NaN rows
    complete = frame.dropna()
    expected_endog = complete[endog_name].values
    expected_exog = complete[exog_names]
    assert_frame_equal(handled['exog'], expected_exog)
    np.testing.assert_array_equal(handled['endog'], expected_endog)
def test_attach(self):
    """Check ``wrap_output`` for 'columns', 'rows' and 'cov' inputs."""
    wrapper = self.data
    # this makes sure what the wrappers need work but not the wrapped
    # results themselves
    wrapped_columns = wrapper.wrap_output(self.col_input, 'columns')
    assert_series_equal(wrapped_columns, self.col_result)

    wrapped_rows = wrapper.wrap_output(self.row_input, 'rows')
    assert_series_equal(wrapped_rows, self.row_result)

    wrapped_cov = wrapper.wrap_output(self.cov_input, 'cov')
    assert_frame_equal(wrapped_cov, self.cov_result)
def test_genfromdta_pandas():
    """Reading macrodata.dta as pandas must match the bundled dataset."""
    reference = macrodata.load_pandas().data
    here = os.path.dirname(os.path.abspath(__file__))
    dta_path = here + '/../../datasets/macrodata/macrodata.dta'

    with pytest.warns(FutureWarning):
        loaded = genfromdta(dta_path, pandas=True)

    # compare as floats on both sides
    loaded = loaded.astype(float)
    assert_frame_equal(loaded, reference.astype(float))
def test_repeated_measures_aggregation():
    """Mean-aggregated doubled data must give the original ANOVA table."""
    within_factors = ['A', 'B', 'D']
    baseline = AnovaRM(data, 'DV', 'id', within=list(within_factors)).fit()

    # stack the data set on itself, then let AnovaRM average the duplicates
    stacked = pd.concat([data, data], axis=0)
    aggregated = AnovaRM(
        stacked,
        'DV',
        'id',
        within=list(within_factors),
        aggregate_func=np.mean,
    ).fit()

    assert_frame_equal(baseline.anova_table, aggregated.anova_table)
def test_cohn(self):
    """Compare selected Cohn-number columns with the expected frame.

    Both frames are rounded to three decimals before comparison.
    """
    cols = [
        'nuncen_above', 'nobs_below',
        'ncen_equal', 'prob_exceedance'
    ]
    cohn = ros.cohn_numbers(self.df, self.rescol, self.cencol)
    # ``check_less_precise`` was removed from assert_frame_equal in
    # pandas 2.0; rounding both sides gives the intended coarse comparison.
    assert_frame_equal(
        np.round(cohn[cols], 3),
        np.round(self.expected_cohn[cols], 3),
    )
def test_drop(self):
    """``handle_data(..., 'drop')`` removes every row that contains NaN."""
    # rows that are complete in both endog and exog
    stacked = np.c_[self.y, self.X]
    complete_rows = ~np.isnan(stacked).any(axis=1)
    y_kept = self.y.loc[complete_rows]
    X_kept = self.X.loc[complete_rows]

    handled = sm_data.handle_data(self.y, self.X, 'drop')

    np.testing.assert_array_equal(handled.endog, y_kept.values)
    assert_series_equal(handled.orig_endog, self.y.loc[complete_rows])
    np.testing.assert_array_equal(handled.exog, X_kept.values)
    assert_frame_equal(handled.orig_exog, self.X.loc[complete_rows])
def test_categorical_series(string_var):
    """Check ``tools.categorical`` on a Series with and without ``drop``."""
    # drop=True: only the dummy columns are returned
    encoded = tools.categorical(string_var, drop=True)
    reference = pd.get_dummies(pd.Categorical(string_var))
    assert_frame_equal(encoded, reference)

    # drop=False: dummies first, the original series as the last column
    encoded = tools.categorical(string_var, drop=False)
    reference.columns = list(reference.columns)
    assert_frame_equal(encoded.iloc[:, :5], reference)
    assert_series_equal(encoded.iloc[:, 5], string_var)

    # dictnames=True: position -> category label mapping
    _, dictnames = tools.categorical(string_var, drop=False, dictnames=True)
    categories = pd.Categorical(string_var).categories
    for position, label in enumerate(categories):
        assert position in dictnames
        assert dictnames[position] == label
def test_pandas_freq_decorator():
    """Check that ``pandas_wrapper`` re-wraps array results as DataFrames.

    The input is a 30x4 float frame with columns ``A``-``D`` and a string
    index, the shape the former ``pd.util.testing.makeDataFrame`` helper
    produced.  That helper no longer exists in current pandas, so the frame
    is built inline.
    """
    n_rows = 30
    row_labels = ['row%02d' % i for i in range(n_rows)]
    x = pd.DataFrame(
        np.random.standard_normal((n_rows, 4)),
        index=pd.Index(row_labels, dtype=object),
        columns=list('ABCD'),
    )

    # in x, get a function back that returns an x with the same columns
    func = pandas_wrapper(dummy_func)
    np.testing.assert_equal(func(x.values), x)

    func = pandas_wrapper(dummy_func_array)
    assert_frame_equal(func(x), x)

    # explicit names replace the original column labels
    expected = x.rename(columns=dict(zip('ABCD', 'EFGH')))
    func = pandas_wrapper(dummy_func_array, names=list('EFGH'))
    assert_frame_equal(func(x), expected)
def test_dataframe_forward(self):
    """``lagmat`` with ``trim='forward'`` on a DataFrame, all ``original``."""
    frame = self.macro_df
    nobs = frame.shape[0]

    # build the expected matrix: original block, then one block per lag
    labels = list(frame.columns)
    block = np.zeros((nobs + 3, 16))
    block[:nobs, :4] = frame.values
    for lag in range(1, 4):
        labels.extend([name + '.L.' + str(lag) for name in frame])
        first_col = 4 * lag
        block[lag:nobs + lag, first_col:first_col + 4] = frame.values
    expected = pd.DataFrame(block[:nobs], columns=labels, index=frame.index)
    expected_lags = expected.iloc[:, 4:]

    both = sm.tsa.lagmat(self.macro_df, 3, trim='forward',
                         original='in', use_pandas=True)
    assert_frame_equal(both, expected)

    lags = sm.tsa.lagmat(self.macro_df, 3, trim='forward',
                         original='ex', use_pandas=True)
    assert_frame_equal(lags, expected_lags)

    lags, lead = sm.tsa.lagmat(self.macro_df, 3, trim='forward',
                               original='sep', use_pandas=True)
    assert_frame_equal(lags, expected_lags)
    assert_frame_equal(lead, expected.iloc[:, :4])
def test_series_both(self):
    """``lagmat`` with ``trim='both'`` on a Series, all ``original`` modes."""
    series = self.series
    column_names = ['cpi', 'cpi.L.1', 'cpi.L.2', 'cpi.L.3']

    # expected frame: the series plus its first three lags, first 3 rows cut
    desired = pd.DataFrame(index=series.index, columns=column_names)
    desired['cpi'] = series
    for lag in range(1, 4):
        desired['cpi.L.' + str(int(lag))] = series.shift(lag)
    desired = desired.iloc[3:]

    both = sm.tsa.lagmat(self.series, 3, trim='both',
                         original='in', use_pandas=True)
    assert_frame_equal(both, desired)

    lags = sm.tsa.lagmat(self.series, 3, trim='both',
                         original='ex', use_pandas=True)
    assert_frame_equal(lags, desired.iloc[:, 1:])

    lags, lead = sm.tsa.lagmat(self.series, 3, trim='both',
                               original='sep', use_pandas=True)
    assert_frame_equal(lead, desired.iloc[:, :1])
    assert_frame_equal(lags, desired.iloc[:, 1:])
def test_series_both(self):
    """``lagmat`` with ``trim="both"`` on a Series, all ``original`` modes."""
    series = self.series
    column_names = ["cpi", "cpi.L.1", "cpi.L.2", "cpi.L.3"]

    # expected frame: the series plus its first three lags, first 3 rows cut
    desired = pd.DataFrame(index=series.index, columns=column_names)
    desired["cpi"] = series
    for lag in range(1, 4):
        desired["cpi.L." + str(int(lag))] = series.shift(lag)
    desired = desired.iloc[3:]

    both = stattools.lagmat(self.series, 3, trim="both",
                            original="in", use_pandas=True)
    assert_frame_equal(both, desired)

    lags = stattools.lagmat(self.series, 3, trim="both",
                            original="ex", use_pandas=True)
    assert_frame_equal(lags, desired.iloc[:, 1:])

    lags, lead = stattools.lagmat(self.series, 3, trim="both",
                                  original="sep", use_pandas=True)
    assert_frame_equal(lead, desired.iloc[:, :1])
    assert_frame_equal(lags, desired.iloc[:, 1:])
def test_dataframe(self):
    """``add_trend`` on a DataFrame: default, prepend, "t" and "ctt"."""
    # default trend, appended
    frame = pd.DataFrame(self.arr_2d)
    result = tools.add_trend(frame)
    desired = frame.copy()
    desired["const"] = self.c
    assert_frame_equal(desired, result)

    # default trend, prepended
    result = tools.add_trend(frame, prepend=True)
    desired = frame.copy()
    desired.insert(0, "const", self.c)
    assert_frame_equal(desired, result)

    # linear trend only
    frame = pd.DataFrame(self.arr_2d)
    result = tools.add_trend(frame, trend="t")
    desired = frame.copy()
    desired["trend"] = self.t
    assert_frame_equal(desired, result)

    # constant, linear and quadratic trend
    frame = pd.DataFrame(self.arr_2d)
    result = tools.add_trend(frame, trend="ctt")
    desired = frame.copy()
    desired["const"] = self.c
    desired["trend"] = self.t
    desired["trend_squared"] = self.t**2
    assert_frame_equal(desired, result)
def test_dataframe(self):
    """``add_trend`` on a DataFrame: default, prepend, 't' and 'ctt'."""
    # default trend, appended
    frame = pd.DataFrame(self.arr_2d)
    result = tools.add_trend(frame)
    desired = frame.copy()
    desired['const'] = self.c
    assert_frame_equal(desired, result)

    # default trend, prepended
    result = tools.add_trend(frame, prepend=True)
    desired = frame.copy()
    desired.insert(0, 'const', self.c)
    assert_frame_equal(desired, result)

    # linear trend only
    frame = pd.DataFrame(self.arr_2d)
    result = tools.add_trend(frame, trend='t')
    desired = frame.copy()
    desired['trend'] = self.t
    assert_frame_equal(desired, result)

    # constant, linear and quadratic trend
    frame = pd.DataFrame(self.arr_2d)
    result = tools.add_trend(frame, trend='ctt')
    desired = frame.copy()
    desired['const'] = self.c
    desired['trend'] = self.t
    desired['trend_squared'] = self.t ** 2
    assert_frame_equal(desired, result)