def test_mixed_concat():
    """Concatenating Modin frames together with a plain pandas frame matches pandas."""
    first, second = generate_dfs()
    third = first.copy()
    frames = [from_pandas(first, 2), from_pandas(second, 2), third]
    expected = pandas.concat([first, second, third])
    assert ray_df_equals_pandas(pd.concat(frames), expected)
def test_ray_concat_on_index():
    """Row-wise concat behaves identically for axis="index", axis="rows" and axis=0."""
    df, df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(df), from_pandas(df2)
    for axis in ("index", "rows", 0):
        assert modin_df_equals_pandas(
            pd.concat([modin_df, modin_df2], axis=axis),
            pandas.concat([df, df2], axis=axis),
        )
def test_mixed_inner_concat():
    """Inner-join concat over a mix of Modin and pandas frames matches pandas."""
    left, right = generate_dfs()
    extra = left.copy()
    modin_inputs = [from_pandas(left), from_pandas(right), extra]
    pandas_result = pandas.concat([left, right, extra], join="inner")
    assert modin_df_equals_pandas(pd.concat(modin_inputs, join="inner"), pandas_result)
def test_concat_series_only():
    """Concat of two Series (no DataFrames involved) matches pandas."""
    values = list(range(1000))
    modin_series = pd.Series(values)
    pandas_series = pandas.Series(values)
    df_equals(
        pd.concat([modin_series, modin_series]),
        pandas.concat([pandas_series, pandas_series]),
    )
def test_concat_with_empty_frame():
    """Concat of an empty DataFrame with a Series row matches pandas."""
    modin_parts = [pd.DataFrame(), pd.Series({0: "a", 1: "b"})]
    pandas_parts = [pandas.DataFrame(), pandas.Series({0: "a", 1: "b"})]
    df_equals(pd.concat(modin_parts), pandas.concat(pandas_parts))
def merge(self):
    """Combine all dose-file frames into a single DataFrame.

    The per-file frames are joined column-wise (axis=1) on their shared
    index, then a ``dose`` column is added holding the row-wise sum of
    every COPC column.

    Returns:
        The combined DataFrame with the extra ``dose`` column.
    """
    total = pd.concat([dosefile.df for dosefile in self._dosefiles], axis=1)
    # Each dose file contributes one column named after its COPC.
    copc_keys = [dosefile.copc for dosefile in self._dosefiles]
    total['dose'] = total[copc_keys].sum(axis=1)
    return total
def main():
    """Concatenate the 8 preprocessed HDF5 shards into a single HDF5 file."""
    tqdm.pandas()
    modelpath = os.path.join(content.DATAPATH, "MODEL")
    shards = []
    for i in range(8):
        shard_path = os.path.join(modelpath, f"data_preprocessed_{i}.h5")
        shards.append(pd.read_hdf(shard_path, 'X', mode='r'))
    combined = pd.concat(shards)
    filepath = os.path.join(modelpath, "data_preprocessed.h5")
    combined.to_hdf(filepath, key='X', mode='w')
def test_concat_non_subscriptable_keys():
    """`keys` may be any iterable (e.g. dict_keys), not only a subscriptable list."""
    frame_data = np.random.randint(0, 100, size=(2 ** 10, 2 ** 6))
    modin_frame = pd.DataFrame(frame_data).add_prefix("col")
    pandas_frame = pandas.DataFrame(frame_data).add_prefix("col")
    modin_dict = {"c": modin_frame.copy(), "b": modin_frame.copy()}
    pandas_dict = {"c": pandas_frame.copy(), "b": pandas_frame.copy()}
    modin_df_equals_pandas(
        pd.concat(modin_dict.values(), keys=modin_dict.keys()),
        pandas.concat(pandas_dict.values(), keys=pandas_dict.keys()),
    )
def test_read_and_concat(self):
    """Reading the same CSV twice and concatenating matches between pandas and Modin."""
    csv_file = os.path.join(self.root, "modin/pandas/test/data", "test_usecols.csv")
    ref = pandas.concat([pandas.read_csv(csv_file), pandas.read_csv(csv_file)])
    exp = pd.concat([pandas.read_csv(csv_file), pandas.read_csv(csv_file)])
    df_equals(ref, exp)
def test_concat_with_empty_frame():
    """Concat involving empty frames behaves like pandas on both axes."""
    df_equals(
        pd.concat([pd.DataFrame(), pd.Series({0: "a", 1: "b"})]),
        pandas.concat([pandas.DataFrame(), pandas.Series({0: "a", 1: "b"})]),
    )
    # Frames with columns but no... rows? They are empty except for the index.
    md_empty1, pd_empty1 = create_test_dfs(index=[1, 2, 3])
    md_empty2, pd_empty2 = create_test_dfs(index=[2, 3, 4])
    for axis in (0, 1):
        df_equals(
            pd.concat([md_empty1, md_empty2], axis=axis),
            pandas.concat([pd_empty1, pd_empty2], axis=axis),
        )
def test_sort_order(sort, join, axis):
    """Column order after concat must agree with pandas for each sort/join combo."""
    left = pandas.DataFrame({"c": [3], "d": [4]}, columns=["d", "c"])
    right = pandas.DataFrame({"a": [1], "b": [2]}, columns=["b", "a"])
    expected = pandas.concat([left, right], join=join, sort=sort)
    actual = pd.concat([from_pandas(left), from_pandas(right)], join=join, sort=sort)
    df_equals(expected, actual)
    # df_equals may not check column ordering strictly enough; assert it explicitly.
    assert list(expected.columns) == list(actual.columns)
def test_concat_dictionary(axis):
    """Passing a dict of frames uses its keys as the outer index level."""
    pandas_df, pandas_df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2)
    modin_result = pd.concat({"A": modin_df, "B": modin_df2}, axis=axis)
    pandas_result = pandas.concat({"A": pandas_df, "B": pandas_df2}, axis=axis)
    df_equals(modin_result, pandas_result)
def test_mixed_columns(columns):
    """Group-by over a mix of column labels and Series taken from the frame."""

    def pick(frame):
        # Resolve each (lookup, name) pair to either the column Series or its label.
        return [frame[name] if lookup else name for (lookup, name) in columns]

    data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}
    pandas_frame = pandas.concat([pandas.DataFrame(data)])
    ref = pandas_frame.groupby(pick(pandas_frame)).size()
    modin_frame = pd.concat([pd.DataFrame(data)])
    exp = modin_frame.groupby(pick(modin_frame)).size()
    df_equals(ref, exp)
def q07(lineitem, supplier, orders, customer, nation):
    """ This version is faster than q07_old. Keeping the old one for reference

    TPC-H Query 7 (volume shipping): revenue by supplier nation, customer
    nation and year for trade between FRANCE and GERMANY, 1995-1996.
    Prints the result table and the elapsed wall-clock time.
    """
    t1 = time.time()
    # Restrict lineitem to the 1995-1996 shipping window and precompute
    # the year and discounted volume used by the aggregation.
    # NOTE(review): assigning columns on a filtered slice — pandas would warn
    # (SettingWithCopyWarning); kept as in the original.
    lineitem_filtered = lineitem[(lineitem["L_SHIPDATE"] >= pd.Timestamp("1995-01-01")) & (lineitem["L_SHIPDATE"] < pd.Timestamp("1997-01-01"))]
    lineitem_filtered["L_YEAR"] = lineitem_filtered["L_SHIPDATE"].apply(lambda x: x.year)
    lineitem_filtered["VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (1.0 - lineitem_filtered["L_DISCOUNT"])
    lineitem_filtered = lineitem_filtered.loc[:, ["L_ORDERKEY", "L_SUPPKEY", "L_YEAR", "VOLUME"]]
    # Project the other tables down to only the join keys needed below.
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    orders_filtered = orders.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
    customer_filtered = customer.loc[:, ["C_CUSTKEY", "C_NATIONKEY"]]
    n1 = nation[(nation["N_NAME"] == "FRANCE")].loc[:, ["N_NATIONKEY", "N_NAME"]]
    n2 = nation[(nation["N_NAME"] == "GERMANY")].loc[:, ["N_NATIONKEY", "N_NAME"]]

    # ----- do nation 1 -----
    # Direction 1: customers in FRANCE (n1), suppliers in GERMANY (n2).
    N1_C = customer_filtered.merge(n1, left_on='C_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N1_C = N1_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N1_C_O = N1_C.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='inner')
    N1_C_O = N1_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])
    # NOTE: this is faster than first merging lineitem with N1_C_O
    N2_S = supplier_filtered.merge(n2, left_on='S_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N2_S = N2_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N2_S_L = N2_S.merge(lineitem_filtered, left_on='S_SUPPKEY', right_on='L_SUPPKEY', how='inner')
    N2_S_L = N2_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])
    total1 = N1_C_O.merge(N2_S_L, left_on='O_ORDERKEY', right_on='L_ORDERKEY', how='inner')
    total1 = total1.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # ----- do nation 2 ----- (same as nation 1 section but with nation 2)
    # Direction 2: customers in GERMANY (n2), suppliers in FRANCE (n1).
    N2_C = customer_filtered.merge(n2, left_on='C_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N2_C = N2_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N2_C_O = N2_C.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='inner')
    N2_C_O = N2_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])
    N1_S = supplier_filtered.merge(n1, left_on='S_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N1_S = N1_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N1_S_L = N1_S.merge(lineitem_filtered, left_on='S_SUPPKEY', right_on='L_SUPPKEY', how='inner')
    N1_S_L = N1_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])
    total2 = N2_C_O.merge(N1_S_L, left_on='O_ORDERKEY', right_on='L_ORDERKEY', how='inner')
    total2 = total2.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # concat results
    # Union both trade directions, then aggregate revenue per
    # (supplier nation, customer nation, year) and sort for stable output.
    total = pd.concat([total1, total2])
    total = total.groupby(["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index = False).agg(REVENUE=pd.NamedAgg(column="VOLUME", aggfunc="sum"))
    total = total.sort_values(by=["SUPP_NATION","CUST_NATION","L_YEAR"], ascending=[True,True,True,])
    print(total)
    print("Q07 Execution time (s): ", time.time() - t1)
def test_concat_multiindex(axis, names):
    """Concat with `keys` over MultiIndex axes, with and without level names."""
    pd_df1, pd_df2 = generate_multiindex_dfs(axis=axis)
    md_df1, md_df2 = map(from_pandas, [pd_df1, pd_df2])
    keys = ["first", "second"]
    # One extra level name accounts for the level `keys` adds on top.
    names = [str(i) for i in np.arange(pd_df1.axes[axis].nlevels + 1)] if names else None
    df_equals(
        pd.concat([md_df1, md_df2], keys=keys, axis=axis, names=names),
        pandas.concat([pd_df1, pd_df2], keys=keys, axis=axis, names=names),
    )
def test_concat_on_column():
    """Column-wise concat (axis=1 / "columns") and ignore_index dtypes match pandas."""
    df, df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(df), from_pandas(df2)
    for axis in (1, "columns"):
        df_equals(
            pd.concat([modin_df, modin_df2], axis=axis),
            pandas.concat([df, df2], axis=axis),
        )
    modin_result = pd.concat(
        [pd.Series(np.ones(10)), pd.Series(np.ones(10))], axis=1, ignore_index=True
    )
    pandas_result = pandas.concat(
        [pandas.Series(np.ones(10)), pandas.Series(np.ones(10))],
        axis=1,
        ignore_index=True,
    )
    df_equals(modin_result, pandas_result)
    # Also verify the resulting dtypes, which df_equals may not pin down.
    assert modin_result.dtypes.equals(pandas_result.dtypes)
def extract_feature_ans_dif(data, feature, ans_feature):
    """Compute time gaps between a user's impressions and their most recent
    answer on each of the question's topics.

    For every (uid, topic) pair the latest answer time is found; each target
    row is then joined against its question's first six topics and the
    min/mean/max of the resulting time differences are returned.

    NOTE(review): reconstructed from a whitespace-mangled source — the exact
    extent of the for-loop body is inferred (merge + diff + name bookkeeping
    inside the loop; aggregates after it). Confirm against the original file.
    """
    target = data.copy()
    print_time('extract feature ans dif')
    # Attach each question's comma-separated topic list to both tables.
    que = pd.read_csv('../datasets/question_info.csv', usecols=['qid', 'topic_id'])
    ans_feature = pd.merge(ans_feature, que, how='left', on='qid').fillna(0)
    target = pd.merge(target, que, how='left', on='qid').fillna(0)
    # Fractional-day timestamps: day + hour/24 (0.04166 ~= 1/24).
    ans_feature[
        'a_time'] = ans_feature['a_day'] + 0.04166 * ans_feature['a_hour']
    target['i_time'] = target['day'] + 0.04166 * target['hour']
    # Explode the answer table to one row per (answer, topic).
    total_extend = ans_feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(ans_feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)
    # Most recent answer time per (uid, topic).
    t = total_extend.groupby(['uid', 'topic'])['a_time'].agg([
        'max'
    ]).reset_index().rename(columns={'max': 'uid_topic_ans_recent_time'})
    # Split the target question's topics into positional columns 0..5.
    topic_df = target['topic_id'].str.split(',', expand=True)
    topic_df = topic_df.fillna(0)
    target = pd.concat([target, topic_df], axis=1)
    fea_name = 'uid_topic_ans_recent_time'
    tmp_name = []
    result_list = []
    for field in [0, 1, 2, 3, 4, 5]:
        # Join the recent-answer time for this topic slot; missing pairs
        # get the sentinel 1000 so their diff dominates min/max.
        target = pd.merge(target, t, how='left', left_on=['uid', field],
                          right_on=['uid', 'topic'
                                    ]).rename(columns={
                                        fea_name: fea_name + str(field)
                                    }).fillna(1000)
        target['s' + str(field)] = target['i_time'] - target[fea_name + str(field)]
        tmp_name.append('s' + str(field))
    # Aggregate the six per-slot gaps into the returned features.
    target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
    target[fea_name + '_min'] = target[tmp_name].min(axis=1)
    target[fea_name + '_max'] = target[tmp_name].max(axis=1)
    result_list.append(fea_name + '_min')
    result_list.append(fea_name + '_mean')
    result_list.append(fea_name + '_max')
    return target[result_list]
def main():
    """Load every CSV under data/ into one frame and report a mean plus timing."""
    directory_path = Path('data/')
    pattern = '*.csv'
    start = datetime.now()
    # Lazily yield per-file frames; concat consumes the generator directly.
    frames = (file_to_dataframe(file_path) for file_path in directory_path.glob(pattern))
    df = pd.concat(frames, sort=False, copy=False)
    end = datetime.now()
    print(f'Units mean: "{df["KWMENG_C"].mean()}"')
    print(f'Elapsed "{end - start}" seconds')
def missing_values_table(df):
    """Summarize missing values per column of ``df``.

    Returns a DataFrame indexed by column name with two columns,
    "Missing Values" (count) and "% of Total Values" (rounded to 1 decimal),
    restricted to columns that have at least one missing value and sorted by
    percentage descending. Also prints a one-line summary.
    """
    # Count NaNs once and derive the percentage from it
    # (the original scanned the frame twice with df.isnull().sum()).
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: "Missing Values", 1: "% of Total Values"}
    )
    # Keep only columns with missing data, worst offenders first.
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:, 1] != 0]
        .sort_values("% of Total Values", ascending=False)
        .round(1)
    )
    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table_ren_columns.shape[0]} columns that have missing values."
    )
    return mis_val_table_ren_columns
def test_mixed_columns_not_from_df(columns):
    """
    Unlike the previous test, in this case the Series is not just a column from
    the original DataFrame (it is df[name] + 1), so no fasttrack can be used.
    """

    def pick(frame):
        return [(frame[name] + 1) if lookup else name for (lookup, name) in columns]

    data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}
    pandas_frame = pandas.concat([pandas.DataFrame(data)])
    ref = pandas_frame.groupby(pick(pandas_frame)).size()
    modin_frame = pd.concat([pd.DataFrame(data)])
    exp = modin_frame.groupby(pick(modin_frame)).size()
    df_equals(ref, exp)
def arrange_features(features_list, total_col, feature_type):
    """Build a labelled feature DataFrame from raw feature arrays.

    For 'mfcc', the three MFCC-derived arrays are concatenated column-wise;
    for anything else a single formant-feature frame is returned.
    """
    if feature_type == 'mfcc':
        parts = []
        # (dict key, column-name prefix) for each MFCC-derived array.
        for key, prefix in (
            ('mfcc', 'mfcc_features_'),
            ('mfcc_delta', 'mfcc_delta'),
            ('mfcc_delta2', 'mfcc_dd_fetures'),
        ):
            frame = modin.DataFrame(features_list[key])
            frame.columns = col_generator(prefix, total_col)
            parts.append(frame)
        return modin.concat(parts, axis=1)
    features = modin.DataFrame(features_list)
    features.columns = col_generator('formant_features_', total_col)
    return features
def test_ray_concat():
    """Basic row-wise concat of two Modin frames matches pandas."""
    left, right = generate_dfs()
    modin_result = pd.concat([from_pandas(left, 2), from_pandas(right, 2)])
    assert ray_df_equals_pandas(modin_result, pandas.concat([left, right]))
def test_invalid_axis_errors():
    """An out-of-range axis must raise ValueError, exactly as pandas does."""
    df, df2 = generate_dfs()
    frames = [from_pandas(df, 2), from_pandas(df2, 2)]
    with pytest.raises(ValueError):
        pd.concat(frames, axis=2)
# %% import os os.environ["MODIN_ENGINE"] = "ray" # Modin will use Ray import ray #ray.init(memory=5242880000) import modin.pandas as ray_pd %time mray_df = ray_pd.read_csv("data/kiva_loans.csv") # %% %time df1 = pandas_df.append(pandas_df) %time df2 = mdask_df.append(mdask_df) %time df3 = mray_df.append(mray_df) # %% %time df1 = pandas_pd.concat([pandas_df for _ in range(5)]) %time df2 = dask_pd.concat([mdask_df for _ in range(5)]) %time df3 = ray_pd.concat([mray_df for _ in range(5)]) # %% %time pandas_pd.isnull(pandas_df["use"]) %time dask_pd.isnull(mdask_df["use"]) %time ray_pd.isnull(mray_df["use"]) # %% %time pandas_df.fillna(value=0) %time mdask_df.fillna(value=0) %time mray_df.fillna(value=0) # %%
def test_df_concat():
    """pd.concat accepts plain pandas frames directly."""
    left, right = generate_dfs()
    assert ray_df_equals_pandas(pd.concat([left, right]), pandas.concat([left, right]))
def test_df_concat():
    """pd.concat accepts plain pandas frames directly."""
    left, right = generate_dfs()
    df_equals(pd.concat([left, right]), pandas.concat([left, right]))
# NB-SVM-style baseline (log-count-ratio features + logistic regression) for a
# multi-label text classification task. vec, trn_term_doc, test, train, subm,
# COMMENT and label_cols are defined earlier in the file (not shown here).
test_term_doc = vec.transform(test[COMMENT])
# Bare tuple expression — a notebook-cell echo of the two matrices; no effect as a script.
trn_term_doc, test_term_doc


def pr(y_i, y):
    # Per-feature count sums for class y_i with +1 Laplace smoothing.
    # Reads the module-level sparse matrix `x` (assigned below, before any call).
    p = x[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)


x = trn_term_doc
test_x = test_term_doc


def get_mdl(y):
    """Fit one-vs-rest logistic regression on NB-scaled features for label y."""
    y = y.values
    # r: log ratio of smoothed positive vs negative feature likelihoods.
    r = np.log(pr(1, y) / pr(0, y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r


# One column of predicted probabilities per label.
preds = np.zeros((len(test), len(label_cols)))
for i, j in enumerate(label_cols):
    print("fit", j)
    m, r = get_mdl(train[j])
    # Apply the same NB scaling to the test matrix before predicting.
    preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1]

submid = pd.DataFrame({"id": subm["id"]})
submission = pd.concat(
    [submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
submission.to_csv("submission.csv", index=False)
def test_ray_concat():
    """Basic row-wise concat of two Modin frames matches pandas."""
    left, right = generate_dfs()
    modin_result = pd.concat([from_pandas(left), from_pandas(right)])
    assert modin_df_equals_pandas(modin_result, pandas.concat([left, right]))
def test_invalid_axis_errors():
    """An out-of-range axis must raise ValueError, exactly as pandas does."""
    df, df2 = generate_dfs()
    frames = [from_pandas(df), from_pandas(df2)]
    with pytest.raises(ValueError):
        pd.concat(frames, axis=2)
# Preprocessing script for a house-price regression dataset (reads train.csv /
# test.csv with MSSubClass..SaleCondition feature columns and a SalePrice target).
import matplotlib

matplotlib.use("PS")
import modin.pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head()
# Stack the feature columns of train and test so transforms apply to both.
all_data = pd.concat(
    (
        train.loc[:, "MSSubClass":"SaleCondition"],
        test.loc[:, "MSSubClass":"SaleCondition"],
    )
)
matplotlib.rcParams["figure.figsize"] = (12.0, 6.0)
# Side-by-side histograms of the raw and log1p-transformed target.
prices = pd.DataFrame(
    {"price": train["SalePrice"], "log(price + 1)": np.log1p(train["SalePrice"])}
)
prices.hist()
# Use the log1p target from here on.
train["SalePrice"] = np.log1p(train["SalePrice"])
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(
    lambda x: skew(x.dropna())
)  # compute skewness
# log1p-transform numeric features whose skewness exceeds 0.75.
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])