def test_align_multiindex(self): # GH#10665 # same test cases as test_align_multiindex in test_series.py midx = pd.MultiIndex.from_product( [range(2), range(3), range(2)], names=("a", "b", "c")) idx = Index(range(2), name="b") df1 = DataFrame(np.arange(12, dtype="int64"), index=midx) df2 = DataFrame(np.arange(2, dtype="int64"), index=idx) # these must be the same results (but flipped) res1l, res1r = df1.align(df2, join="left") res2l, res2r = df2.align(df1, join="right") expl = df1 tm.assert_frame_equal(expl, res1l) tm.assert_frame_equal(expl, res2r) expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) tm.assert_frame_equal(expr, res1r) tm.assert_frame_equal(expr, res2l) res1l, res1r = df1.align(df2, join="right") res2l, res2r = df2.align(df1, join="left") exp_idx = pd.MultiIndex.from_product( [range(2), range(2), range(2)], names=("a", "b", "c")) expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) tm.assert_frame_equal(expl, res1l) tm.assert_frame_equal(expl, res2r) expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx) tm.assert_frame_equal(expr, res1r) tm.assert_frame_equal(expr, res2l)
def test_frame_align_aware(self):
    """Aligning tz-aware frames preserves each timezone; aligning frames
    with different timezones converts both results to UTC."""
    idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
    idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
    df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
    df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
    new1, new2 = df1.align(df2)
    assert df1.index.tz == new1.index.tz
    assert df2.index.tz == new2.index.tz

    # different timezones convert to UTC
    # frame with frame
    df1_central = df1.tz_convert("US/Central")
    new1, new2 = df1.align(df1_central)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # frame with Series
    new1, new2 = df1.align(df1_central[0], axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # BUG FIX: the result of this align was previously discarded, so the
    # asserts below re-checked the stale new1/new2 from the case above.
    new1, new2 = df1[0].align(df1_central, axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC
def test_frame_align_aware(self):
    """Aligning tz-aware frames preserves each timezone; aligning frames
    with different timezones converts both results to UTC."""
    idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
    idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
    df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
    df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
    new1, new2 = df1.align(df2)
    assert df1.index.tz == new1.index.tz
    assert df2.index.tz == new2.index.tz

    # different timezones convert to UTC
    # frame with frame
    df1_central = df1.tz_convert('US/Central')
    new1, new2 = df1.align(df1_central)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # frame with Series
    new1, new2 = df1.align(df1_central[0], axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # BUG FIX: the result of this align was previously discarded, so the
    # asserts below re-checked the stale new1/new2 from the case above.
    new1, new2 = df1[0].align(df1_central, axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC
def to_categories(train: pd.DataFrame, test: pd.DataFrame,
                  vizualize=False) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Turns both dataframes into categories or labels depending on categories
    contained by each column.

    Binary (<= 2 unique values) object columns of ``train`` are label-encoded
    in place (the same fitted encoder is applied to ``test``); every remaining
    object column is one-hot encoded via ``pd.get_dummies``.  The two frames
    are then restricted to their common columns, with the 'TARGET' column of
    ``train`` preserved through the alignment.

    :param train: training frame; must contain a 'TARGET' column
    :param test: test frame with the same raw feature columns
    :param vizualize: Print heads of both frames for debugging purposes
    :return: the (train, test) pair with identical feature columns
             (plus 'TARGET' on train)
    """
    encoder = LabelEncoder()
    for column in train:
        # only label-encode object columns that are effectively binary
        if train[column].dtype == 'object' and len(
                list(train[column].unique())) <= 2:
            train[column] = encoder.fit_transform(train[column])
            # NOTE(review): transform raises on labels unseen during fit —
            # assumes test has no categories absent from train; confirm.
            test[column] = encoder.transform(test[column])
    # one-hot encode every remaining object column
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)
    # keep only columns present in BOTH frames, then restore the target
    train_labels = train['TARGET']
    train, test = train.align(test, join='inner', axis=1)
    train["TARGET"] = train_labels
    if vizualize:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print("Train")
            print(train.head(), "\n")
            print("Test")
            print(test.head(), "\n")
    return train, test
def prepare_val_features_for_predict(train_features: pd.DataFrame,
                                     val_features: pd.DataFrame):
    """Reindex validation features onto the training feature columns.

    Columns missing from the validation set are added and zero-filled;
    columns absent from the training set are dropped.
    """
    _, aligned_val = train_features.align(val_features, join="left", axis=1)
    return aligned_val.fillna(0)
def test_align_aware(self): idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) new1, new2 = df1.align(df2) self.assertEqual(df1.index.tz, new1.index.tz) self.assertEqual(df2.index.tz, new2.index.tz)
def test_align_aware(self): idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) new1, new2 = df1.align(df2) self.assertEqual(df1.index.tz, new1.index.tz) self.assertEqual(df2.index.tz, new2.index.tz)
class LevelAlign:
    """Benchmark aligning/reindexing a frame against one MultiIndex level."""

    def setup(self):
        # 10 x 100 x 100 = 100,000 rows spread over three levels
        outer_codes = np.arange(10).repeat(10000)
        middle_codes = np.tile(np.arange(100).repeat(100), 10)
        inner_codes = np.tile(np.tile(np.arange(100), 100), 10)
        self.index = MultiIndex(
            levels=[np.arange(10), np.arange(100), np.arange(100)],
            codes=[outer_codes, middle_codes, inner_codes],
        )
        self.df = DataFrame(np.random.randn(len(self.index), 4),
                            index=self.index)
        # frame indexed by the middle level only
        self.df_level = DataFrame(np.random.randn(100, 4),
                                  index=self.index.levels[1])

    def time_align_level(self):
        self.df.align(self.df_level, level=1, copy=False)

    def time_reindex_level(self):
        self.df_level.reindex(self.index, level=1)
class LevelAlign(object):
    """Benchmark aligning/reindexing a frame against one MultiIndex level."""

    def setup(self):
        # codes build a 10 x 100 x 100 lattice (100,000 rows)
        codes = [
            np.arange(10).repeat(10000),
            np.tile(np.arange(100).repeat(100), 10),
            np.tile(np.tile(np.arange(100), 100), 10),
        ]
        levels = [np.arange(10), np.arange(100), np.arange(100)]
        self.index = MultiIndex(levels=levels, codes=codes)
        n_rows = len(self.index)
        self.df = DataFrame(np.random.randn(n_rows, 4), index=self.index)
        # small frame keyed by the middle level only
        self.df_level = DataFrame(np.random.randn(100, 4),
                                  index=self.index.levels[1])

    def time_align_level(self):
        self.df.align(self.df_level, level=1, copy=False)

    def time_reindex_level(self):
        self.df_level.reindex(self.index, level=1)
def __init__(self, X: DataFrame, y):
    # Split a feature frame into labelled (train) and unlabelled (test)
    # partitions based on which rows of X have an entry in y.
    #
    # X : DataFrame — 2-D feature matrix.
    # y : Series or SparseSeries — labels indexed by a subset of X's index.
    if isinstance(y, SparseSeries):
        # densify legacy sparse labels
        # NOTE(review): SparseSeries and DataFrame.ix were removed in
        # pandas 1.0 — this class targets an older pandas.
        y = y.to_dense()
    assert isinstance(y, Series)
    assert X.ndim == 2
    assert y.ndim == 1
    # rows present in both X and y become the training partition
    self._X_tr, self._y_tr = X.align(y, axis=0, join="inner")
    # rows of X with no label become the prediction/test partition
    self._X_te = X.ix[~X.index.isin(y.index),:]
    self.name = y.name
    self.n_predictors = X.shape[1]
    # NOTE(review): counts ALL rows of X, not just the training rows —
    # confirm this is the intended definition of n_samples.
    self.n_samples = X.shape[0]
def test_multiindex_align_to_series_with_common_index_level(self): # GH-46001 foo_index = Index([1, 2, 3], name="foo") bar_index = Index([1, 2], name="bar") series = Series([1, 2], index=bar_index, name="foo_series") df = DataFrame( {"col": np.arange(6)}, index=pd.MultiIndex.from_product([foo_index, bar_index]), ) expected_r = Series([1, 2] * 3, index=df.index, name="foo_series") result_l, result_r = df.align(series, axis=0) tm.assert_frame_equal(result_l, df) tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols( self): # GH-46001 foo_index = Index([1, 2, 3], name="foo") bar_index = Index([1, 2], name="bar") series = Series([1, 2], index=bar_index, name="foo_series") df = DataFrame( np.arange(18).reshape(6, 3), index=pd.MultiIndex.from_product([foo_index, bar_index]), ) df.columns = ["cfoo", "cbar", "cfoo"] expected = Series([1, 2] * 3, index=df.index, name="foo_series") result_left, result_right = df.align(series, axis=0) tm.assert_series_equal(result_right, expected) tm.assert_index_equal(result_left.columns, df.columns)
def align_data(training_data: pd.DataFrame, test_data: pd.DataFrame,
               preserve=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Align the data between a training_data set and a test_data set while
    preserving all columns specified by preserve in the training_data.

    Parameters
    ------------------
    training_data (pd.DataFrame)
        data frame of the training features. Must include the columns
        specified by the value of preserve
    test_data (pd.DataFrame)
        corresponding data frame of the test data
    preserve (List[str], optional)
        list of columns from training_data to be preserved and restored
        after the data frames are aligned; defaults to ['TARGET']

    Return
    -------------------
    aligned_training_data (pd.DataFrame)
        training_data aligned with test_data and columns specified by preserve
    aligned_test_data (pd.DataFrame)
        test_data aligned with training_data
    """
    # BUG FIX: mutable default argument (['TARGET']) replaced by the
    # None-sentinel idiom; default behavior is unchanged.
    if preserve is None:
        preserve = ['TARGET']
    print(
        f"Aligning training and test data before combining for feature engineering:"
    )
    # stash the preserved columns, restrict both frames to their common
    # columns, then restore the preserved columns on the training side
    preserved_features = training_data[preserve]
    aligned_training_data, aligned_test_data = training_data.align(
        test_data, join='inner', axis=1)
    aligned_training_data[preserve] = preserved_features
    print(f" Aligned data has {aligned_training_data.shape[1]} columns")
    return aligned_training_data, aligned_test_data
def test_align_categorical(self, l_ordered, r_ordered, expected): # GH-28397 df_1 = DataFrame({ "A": np.arange(6, dtype="int64"), "B": Series(list("aabbca")).astype( pd.CategoricalDtype(list("cab"), ordered=l_ordered)), }).set_index("B") df_2 = DataFrame({ "A": np.arange(5, dtype="int64"), "B": Series(list("babca")).astype( pd.CategoricalDtype(list("cab"), ordered=r_ordered)), }).set_index("B") aligned_1, aligned_2 = df_1.align(df_2) assert isinstance(aligned_1.index, expected) assert isinstance(aligned_2.index, expected) tm.assert_index_equal(aligned_1.index, aligned_2.index)
def test_align_series_combinations(self): df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) s = Series([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) exp1 = DataFrame( { "a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5] }, index=list("ABCDE"), ) exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") tm.assert_frame_equal(res1, exp1) tm.assert_series_equal(res2, exp2) # series + frame res1, res2 = s.align(df) tm.assert_series_equal(res1, exp2) tm.assert_frame_equal(res2, exp1)
def classify(
    features: pd.DataFrame,
    y: qiime2.CategoricalMetadataColumn,
    c: np.ndarray = None,
    weights: np.ndarray = None,
    # taxa: skbio.TreeNode = None,
    # PATH parameters :
    path: bool = True,
    path_numerical_method: str = "not specified",
    path_n_active: int = 0,
    path_nlam_log: int = 40,
    path_lamin_log: float = 1e-2,
    # CV parameters :
    cv: bool = True,
    cv_numerical_method: str = "not specified",
    cv_seed: int = 1,
    cv_one_se: bool = True,
    cv_subsets: int = 5,
    cv_nlam: int = 100,
    cv_lamin: float = 1e-3,
    cv_logscale: bool = True,
    # StabSel parameters :
    stabsel: bool = True,
    stabsel_numerical_method: str = "not specified",
    stabsel_seed: int = None,  # do something here ! for now it can be a bool !
    stabsel_lam: float = -1.0,  # if negative, then it means 'theoretical'
    stabsel_true_lam: bool = True,
    stabsel_method: str = "first",
    stabsel_b: int = 50,
    stabsel_q: int = 10,
    stabsel_percent_ns: float = 0.5,
    stabsel_lamin: float = 1e-2,
    stabsel_threshold: float = 0.7,
    stabsel_threshold_label: float = 0.4,  # might unneeded here, but needed for visualisation
    # LAMfixed parameters :
    lamfixed: bool = True,
    lamfixed_numerical_method: str = "not specified",
    lamfixed_lam: float = -1.0,  # if negative, then it means 'theoretical'
    lamfixed_true_lam: bool = True,
    # Formulation parameters
    huber: bool = False,
    rho: float = 0.0,
    intercept: bool = True,
) -> classo_problem:
    """Build, configure and solve a c-lasso binary classification problem.

    The metadata column ``y`` is reduced to its non-missing entries, and the
    label vector is binarised against its first value (first value -> +1,
    anything else -> -1).  ``features`` is row-aligned (inner join) to ``y``;
    rows whose label is missing are reported and excluded from training.
    Each of the four model-selection strategies (PATH, CV, StabSel, LAMfixed)
    is switched on/off and parameterised from the corresponding keyword
    group, the problem is solved, and the solved ``classo_problem`` is
    returned with the full (non-missing) label vector attached on
    ``problem.data`` for downstream use.
    """
    # keep only metadata rows that actually carry a label
    complete_y = y.to_series()
    complete_y = complete_y[~complete_y.isna()]
    # the first observed category defines the +1 class
    first_cell = complete_y[0]
    #print(sum(complete_y==complete_y[0]), len(complete_y))
    # row-align features with the label series (inner join on sample ids)
    features, pdY = features.align(y.to_series(), join="inner", axis=0)
    missing = pdY.isna()
    training_labels = list(pdY[~missing].index)
    label_missing = list(pdY.index[missing])
    if label_missing:
        print("{} are missing in y ".format(label_missing))
    # drop unlabelled rows from both the target and the design matrix
    Y = pdY[~missing].to_numpy()
    X = features.values[~missing, :]
    verfify_binary(Y)
    # encode labels as +1 / -1 relative to the first category
    Y = Y == first_cell
    Y = 2 * Y - 1
    problem = classo_problem(X, Y, C=c, label=list(features.columns))
    # --- formulation ---
    problem.formulation.classification = True
    problem.formulation.concomitant = False
    problem.formulation.huber = huber
    #print(rho)
    problem.formulation.rho_classification = rho
    problem.formulation.intercept = intercept
    d = X.shape[1]
    if weights is not None:
        # pad (with ones) or truncate the weight vector to the number of features
        if len(weights) < d:
            problem.formulation.w = np.concatenate(
                [weights, np.ones(d - len(weights))], axis=0)
        else:
            problem.formulation.w = weights[:d]
    # --- PATH model selection ---
    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        param.logscale = True
        param.Nlam = path_nlam_log
        param.lamin = path_lamin_log
    # --- cross-validation model selection ---
    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        param.lamin = cv_lamin
        param.Nlam = cv_nlam
        param.logscale = cv_logscale
    # --- stability selection ---
    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        # a non-positive lambda means "use the theoretical value"
        if stabsel_lam > 0.0:
            param.lam = stabsel_lam
        else:
            param.lam = "theoretical"
    # --- fixed-lambda model selection ---
    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        # a non-positive lambda means "use the theoretical value"
        if lamfixed_lam > 0.0:
            param.lam = lamfixed_lam
        else:
            param.lam = "theoretical"
    problem.solve()
    # attach the complete (non-missing) label vector, encoded +1/-1, for
    # downstream consumers
    cy = complete_y.values
    problem.data.complete_y = 2 * (cy == cy[0]) - 1
    problem.data.complete_labels = list(complete_y.index)
    problem.data.training_labels = training_labels
    return problem
# In[10]: s2 = s[1:] s2 # In[12]: s1.align(s2) # In[13]: df.align(df2, join ='inner') # In[ ]: # In[ ]: #filter and column selection in single statement #frame.loc[frame['PRIMARY_DESIGN_VALUE']>20,['ARRANGEMENT_ID', 'PRIMARY_DESIGN_VALUE']] # In[ ]:
def test_align_broadcast_axis(self):
    # GH 13194: with broadcast_axis=1 a Series is broadcast across the
    # frame's columns during align, so both results come back as frames.
    # NOTE(review): `broadcast_axis` and `pd.np` were removed in pandas 2.0 —
    # this test targets an older pandas version.

    # First four tests for DataFrame.align(Index)
    # For 'right' join
    df = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB'))
    ts = Series([5., 6., 7.])
    result = df.align(ts, join='right', axis=0, broadcast_axis=1)
    # frame reindexed to the series' longer index -> NaN-filled extra row
    expected1 = DataFrame(np.array([[1., 2.], [3., 4.],
                                    [pd.np.nan, pd.np.nan]]),
                          columns=list('AB'))
    # series values broadcast into both frame columns
    expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                          columns=list('AB'))
    assert_frame_equal(result[0], expected1)
    assert_frame_equal(result[1], expected2)

    # For 'right' join on different index
    result = df.align(ts, join='right', axis=1, broadcast_axis=1)
    expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                          columns=list('AB'))
    expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                          columns=list('AB'))
    assert_frame_equal(result[0], expected1)
    assert_frame_equal(result[1], expected2)

    # For 'left' join
    result = df.align(ts, join='left', axis=0, broadcast_axis=1)
    expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                          columns=list('AB'))
    # broadcast series truncated to the frame's shorter index
    expected2 = DataFrame(np.array([[5., 5.], [6., 6.]]),
                          columns=list('AB'))
    assert_frame_equal(result[0], expected1)
    assert_frame_equal(result[1], expected2)

    # For 'left' join on different axis
    result = df.align(ts, join='left', axis=1, broadcast_axis=1)
    expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                          columns=list('AB'))
    expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                          columns=list('AB'))
    assert_frame_equal(result[0], expected1)
    assert_frame_equal(result[1], expected2)

    # Series.align(DataFrame) tests, 'outer' join
    result = ts.align(df, join='outer', axis=0, broadcast_axis=1)
    expected1 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                          columns=list('AB'))
    expected2 = DataFrame(np.array([[1., 2.], [3., 4.],
                                    [pd.np.nan, pd.np.nan]]),
                          columns=list('AB'))
    assert_frame_equal(result[0], expected1)
    assert_frame_equal(result[1], expected2)

    # Series.align(DataFrame) tests, 'inner' join
    result = ts.align(df, join='inner', axis=0, broadcast_axis=1)
    expected1 = DataFrame(np.array([[5., 5.], [6., 6.]]),
                          columns=list('AB'))
    expected2 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                          columns=list('AB'))
    assert_frame_equal(result[0], expected1)
    assert_frame_equal(result[1], expected2)
def test_missing_axis_specification_exception(self):
    # Aligning a frame with a series without saying which axis must raise.
    frame = DataFrame(np.arange(50).reshape((10, 5)))
    ser = Series(np.arange(5))
    with pytest.raises(ValueError, match=r"axis=0 or 1"):
        frame.align(ser)
def analyze_all(records, **conf):
    '''Analyze all codes.

    records: a list of (code, dataframe) pairs.

    Builds a frame of per-code change rates (via change_rate on the 'Close'
    column), ranks codes by how often their recent rate falls in the
    (0.5, 1.5) band over 30 vs 60 days, and prints the codes that rank high
    on the short window but not on the long one.

    NOTE: Python 2 code (print statements, .iteritems()).
    NOTE(review): **conf is accepted but never used — confirm intent.
    '''
    # per-code change rates over the last 500 closes
    rates = DataFrame()
    for code, values in records:
        try:
            rates[code] = change_rate(values, 'Close', 1, 500)
        except Exception as e:
            print 'Error in %s: %s' % (code, str(e))
            raise e

    # test
    # for code, values in records:
    #     mean = values['Close'].tail(300).aggregate('mean')
    #     last = values['Close'].tail(1).aggregate('min')
    #     if code in kospi_code.kospi200map and last < mean * (1 - 0.1):
    #         print '<div><a href="http://finance.naver.com/item/main.nhn?code=' \
    #             + code + '">' + kospi200map[code] + '</a></div>'

    # Up-Down
    # print '=== Up-downs ==='
    # counts = DataFrame()
    # for code, rate in rates.iteritems():
    #     counts[code] = rate.gt(0).value_counts()
    # print 'Most ups:'
    # print counts.transpose()[True].nlargest(10)
    # print 'Least ups:'
    # print counts.transpose()[True].nsmallest(10)
    # print counts.transpose()[False].nlargest(10)

    # Hike
    print '=== Hike ranking ==='
    # NOTE(review): hike_counts is created but never used — dead variable.
    hike_counts = DataFrame()
    h1 = DataFrame()
    h2 = DataFrame()
    # h3 = DataFrame()
    # h4 = DataFrame()
    # h5 = DataFrame()
    # count, per code, how many recent rates fall inside (0.5, 1.5)
    for code, rate in rates.iteritems():
        h1[code] = rate.tail(30).apply(
            lambda x: x > 0.5 and x < 1.5).value_counts()
        h2[code] = rate.tail(60).apply(
            lambda x: x > 0.5 and x < 1.5).value_counts()
        # h3[code] = rate.tail(300).apply(lambda x: x > 0.5).value_counts()
        # h4[code] = rate.tail(400).apply(lambda x: x > 1.0).value_counts()
        # h5[code] = rate.tail(500).apply(lambda x: x > 1.0).value_counts()
    # top-30 codes on the 30-day window ...
    h1 = h1.transpose()[True].nlargest(30)
    # ... minus the 30 highest on the 60-day window
    h2 = h2.transpose()[True].nsmallest(h2.size - 30)
    # h3 = h3.transpose()[True]
    # h3 = h3.nsmallest(h3.size - 50)
    # h4 = h4.transpose()[True].nlargest(50)
    # h5 = h5.transpose()[True].nlargest(50)
    # h1, _ = h1.align(h2, axis = 0, join = 'inner')
    # keep only codes present in both rankings
    h1, _ = h1.align(h2, axis=0, join='inner')
    # h1, _ = h1.align(h4, axis = 0, join = 'inner')
    # h1, _ = h1.align(h5, axis = 0, join = 'inner')
    for code, value in h1.iteritems():
        print code, value