Example #1
def test_mixed_concat():
    df, df2 = generate_dfs()
    df3 = df.copy()

    mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]

    assert ray_df_equals_pandas(pd.concat(mixed_dfs),
                                pandas.concat([df, df2, df3]))
Example #2
def test_ray_concat_on_index():
    df, df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(df), from_pandas(df2)

    assert modin_df_equals_pandas(
        pd.concat([modin_df, modin_df2], axis="index"),
        pandas.concat([df, df2], axis="index"),
    )

    assert modin_df_equals_pandas(
        pd.concat([modin_df, modin_df2], axis="rows"),
        pandas.concat([df, df2], axis="rows"),
    )

    assert modin_df_equals_pandas(
        pd.concat([modin_df, modin_df2], axis=0), pandas.concat([df, df2], axis=0)
    )
Example #3
def test_mixed_inner_concat():
    df, df2 = generate_dfs()
    df3 = df.copy()

    mixed_dfs = [from_pandas(df), from_pandas(df2), df3]

    assert modin_df_equals_pandas(pd.concat(mixed_dfs, join="inner"),
                                  pandas.concat([df, df2, df3], join="inner"))
Example #4
def test_concat_series_only():
    modin_series = pd.Series(list(range(1000)))
    pandas_series = pandas.Series(list(range(1000)))

    df_equals(
        pd.concat([modin_series, modin_series]),
        pandas.concat([pandas_series, pandas_series]),
    )
Example #5
def test_concat_with_empty_frame():
    modin_empty_df = pd.DataFrame()
    pandas_empty_df = pandas.DataFrame()
    modin_row = pd.Series({0: "a", 1: "b"})
    pandas_row = pandas.Series({0: "a", 1: "b"})
    df_equals(
        pd.concat([modin_empty_df, modin_row]),
        pandas.concat([pandas_empty_df, pandas_row]),
    )
Example #6
def merge(self):
    """Combine all the dataframes into a single one by joining on their axes."""
    # Equivalent reduce-based formulation, kept for reference:
    #   total = reduce(lambda a, b: pd.concat([a, b], axis=1), [i.df for i in self._dosefiles])
    total = pd.concat([i.df for i in self._dosefiles], axis=1)
    keys = [i.copc for i in self._dosefiles]
    total['dose'] = total[keys].sum(axis=1)
    return total
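A sketch of the shape this method assumes: each entry of `self._dosefiles` carries a one-column frame (`df`) named after its contaminant of concern (`copc`). The stand-in objects below are hypothetical:

import pandas as pd
from types import SimpleNamespace

# Hypothetical dose-file records; the real objects presumably come from parsed files
f1 = SimpleNamespace(copc="Sr90", df=pd.DataFrame({"Sr90": [0.1, 0.2]}))
f2 = SimpleNamespace(copc="Cs137", df=pd.DataFrame({"Cs137": [0.3, 0.4]}))
total = pd.concat([f.df for f in (f1, f2)], axis=1)
total["dose"] = total[["Sr90", "Cs137"]].sum(axis=1)  # row-wise total dose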
Example #7
def main():
    tqdm.pandas()
    modelpath = os.path.join(content.DATAPATH, "MODEL")

    df = pd.concat([pd.read_hdf(os.path.join(modelpath, f"data_preprocessed_{i}.h5"),
                                'X', mode='r') for i in range(8)])

    filepath = os.path.join(modelpath, "data_preprocessed.h5")

    df.to_hdf(filepath, key='X', mode='w')
Example #8
def test_concat_non_subscriptable_keys():
    frame_data = np.random.randint(0, 100, size=(2 ** 10, 2 ** 6))
    df = pd.DataFrame(frame_data).add_prefix("col")
    pdf = pandas.DataFrame(frame_data).add_prefix("col")

    modin_dict = {"c": df.copy(), "b": df.copy()}
    pandas_dict = {"c": pdf.copy(), "b": pdf.copy()}
    modin_result = pd.concat(modin_dict.values(), keys=modin_dict.keys())
    pandas_result = pandas.concat(pandas_dict.values(), keys=pandas_dict.keys())
    modin_df_equals_pandas(modin_result, pandas_result)
Example #9
    def test_read_and_concat(self):
        csv_file = os.path.join(self.root, "modin/pandas/test/data", "test_usecols.csv")
        ref1 = pandas.read_csv(csv_file)
        ref2 = pandas.read_csv(csv_file)
        ref = pandas.concat([ref1, ref2])

        exp1 = pandas.read_csv(csv_file)
        exp2 = pandas.read_csv(csv_file)
        exp = pd.concat([exp1, exp2])

        df_equals(ref, exp)
Example #10
def test_concat_with_empty_frame():
    modin_empty_df = pd.DataFrame()
    pandas_empty_df = pandas.DataFrame()
    modin_row = pd.Series({0: "a", 1: "b"})
    pandas_row = pandas.Series({0: "a", 1: "b"})
    df_equals(
        pd.concat([modin_empty_df, modin_row]),
        pandas.concat([pandas_empty_df, pandas_row]),
    )

    md_empty1, pd_empty1 = create_test_dfs(index=[1, 2, 3])
    md_empty2, pd_empty2 = create_test_dfs(index=[2, 3, 4])

    df_equals(
        pd.concat([md_empty1, md_empty2], axis=0),
        pandas.concat([pd_empty1, pd_empty2], axis=0),
    )
    df_equals(
        pd.concat([md_empty1, md_empty2], axis=1),
        pandas.concat([pd_empty1, pd_empty2], axis=1),
    )
Example #11
def test_sort_order(sort, join, axis):
    pandas_df = pandas.DataFrame({"c": [3], "d": [4]}, columns=["d", "c"])
    pandas_df2 = pandas.DataFrame({"a": [1], "b": [2]}, columns=["b", "a"])
    modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2)
    pandas_concat = pandas.concat([pandas_df, pandas_df2],
                                  join=join,
                                  sort=sort)
    modin_concat = pd.concat([modin_df, modin_df2], join=join, sort=sort)
    df_equals(
        pandas_concat,
        modin_concat,
    )
    assert list(pandas_concat.columns) == list(modin_concat.columns)
Example #12
def test_concat_dictionary(axis):
    pandas_df, pandas_df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2)

    df_equals(
        pd.concat({
            "A": modin_df,
            "B": modin_df2
        }, axis=axis),
        pandas.concat({
            "A": pandas_df,
            "B": pandas_df2
        }, axis=axis),
    )
Example #13
def test_mixed_columns(columns):
    def get_columns(df):
        return [df[name] if lookup else name for (lookup, name) in columns]

    data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}

    df1 = pandas.DataFrame(data)
    df1 = pandas.concat([df1])
    ref = df1.groupby(get_columns(df1)).size()

    df2 = pd.DataFrame(data)
    df2 = pd.concat([df2])
    exp = df2.groupby(get_columns(df2)).size()
    df_equals(ref, exp)
Example #14
def q07(lineitem, supplier, orders, customer, nation):
    """This version is faster than q07_old; the old one is kept for reference."""
    t1 = time.time()

    # copy() so the column assignments below operate on a frame, not a view
    lineitem_filtered = lineitem[
        (lineitem["L_SHIPDATE"] >= pd.Timestamp("1995-01-01"))
        & (lineitem["L_SHIPDATE"] < pd.Timestamp("1997-01-01"))
    ].copy()
    lineitem_filtered["L_YEAR"] = lineitem_filtered["L_SHIPDATE"].apply(lambda x: x.year)
    lineitem_filtered["VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (1.0 - lineitem_filtered["L_DISCOUNT"])
    lineitem_filtered = lineitem_filtered.loc[:, ["L_ORDERKEY", "L_SUPPKEY", "L_YEAR", "VOLUME"]]
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    orders_filtered = orders.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
    customer_filtered = customer.loc[:, ["C_CUSTKEY", "C_NATIONKEY"]]
    n1 = nation[(nation["N_NAME"] == "FRANCE")].loc[:, ["N_NATIONKEY", "N_NAME"]]
    n2 = nation[(nation["N_NAME"] == "GERMANY")].loc[:, ["N_NATIONKEY", "N_NAME"]]

    # ----- do nation 1 -----
    N1_C = customer_filtered.merge(n1, left_on='C_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N1_C = N1_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N1_C_O = N1_C.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='inner')
    N1_C_O = N1_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    # NOTE: this is faster than first merging lineitem with N1_C_O
    N2_S = supplier_filtered.merge(n2, left_on='S_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N2_S = N2_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N2_S_L = N2_S.merge(lineitem_filtered, left_on='S_SUPPKEY', right_on='L_SUPPKEY', how='inner')
    N2_S_L = N2_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    total1 = N1_C_O.merge(N2_S_L, left_on='O_ORDERKEY', right_on='L_ORDERKEY', how='inner')
    total1 = total1.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # ----- do nation 2 ----- (same as nation 1 section but with nation 2)
    N2_C = customer_filtered.merge(n2, left_on='C_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N2_C = N2_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N2_C_O = N2_C.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='inner')
    N2_C_O = N2_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    N1_S = supplier_filtered.merge(n1, left_on='S_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N1_S = N1_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N1_S_L = N1_S.merge(lineitem_filtered, left_on='S_SUPPKEY', right_on='L_SUPPKEY', how='inner')
    N1_S_L = N1_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    total2 = N2_C_O.merge(N1_S_L, left_on='O_ORDERKEY', right_on='L_ORDERKEY', how='inner')
    total2 = total2.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # concat results
    total = pd.concat([total1, total2])

    total = total.groupby(["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index=False).agg(
        REVENUE=pd.NamedAgg(column="VOLUME", aggfunc="sum"))
    total = total.sort_values(by=["SUPP_NATION", "CUST_NATION", "L_YEAR"],
                              ascending=[True, True, True])
    print(total)
    print("Q07 Execution time (s): ", time.time() - t1)
Example #15
def test_concat_multiindex(axis, names):
    pd_df1, pd_df2 = generate_multiindex_dfs(axis=axis)
    md_df1, md_df2 = map(from_pandas, [pd_df1, pd_df2])

    keys = ["first", "second"]
    if names:
        names = [str(i) for i in np.arange(pd_df1.axes[axis].nlevels + 1)]
    else:
        names = None

    df_equals(
        pd.concat([md_df1, md_df2], keys=keys, axis=axis, names=names),
        pandas.concat([pd_df1, pd_df2], keys=keys, axis=axis, names=names),
    )
Example #16
def test_concat_on_column():
    df, df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(df), from_pandas(df2)

    df_equals(pd.concat([modin_df, modin_df2], axis=1),
              pandas.concat([df, df2], axis=1))

    df_equals(
        pd.concat([modin_df, modin_df2], axis="columns"),
        pandas.concat([df, df2], axis="columns"),
    )

    modin_result = pd.concat([pd.Series(np.ones(10)),
                              pd.Series(np.ones(10))],
                             axis=1,
                             ignore_index=True)
    pandas_result = pandas.concat(
        [pandas.Series(np.ones(10)),
         pandas.Series(np.ones(10))],
        axis=1,
        ignore_index=True,
    )
    df_equals(modin_result, pandas_result)
    assert modin_result.dtypes.equals(pandas_result.dtypes)
Example #17
def extract_feature_ans_dif(data, feature, ans_feature):
    target = data.copy()
    print_time('extract feature ans dif')

    que = pd.read_csv('../datasets/question_info.csv',
                      usecols=['qid', 'topic_id'])
    ans_feature = pd.merge(ans_feature, que, how='left', on='qid').fillna(0)
    target = pd.merge(target, que, how='left', on='qid').fillna(0)
    # 0.04166 ~= 1/24: fold the hour into a fractional day
    ans_feature['a_time'] = ans_feature['a_day'] + 0.04166 * ans_feature['a_hour']
    target['i_time'] = target['day'] + 0.04166 * target['hour']

    total_extend = ans_feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(ans_feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)

    t = total_extend.groupby(['uid', 'topic'])['a_time'].agg([
        'max'
    ]).reset_index().rename(columns={'max': 'uid_topic_ans_recent_time'})

    topic_df = target['topic_id'].str.split(',', expand=True)
    topic_df = topic_df.fillna(0)
    target = pd.concat([target, topic_df], axis=1)
    fea_name = 'uid_topic_ans_recent_time'
    tmp_name = []
    result_list = []
    for field in [0, 1, 2, 3, 4, 5]:
        target = pd.merge(target, t, how='left',
                          left_on=['uid', field],
                          right_on=['uid', 'topic'])
        target = target.rename(columns={fea_name: fea_name + str(field)}).fillna(1000)
        target['s' + str(field)] = target['i_time'] - target[fea_name + str(field)]
        tmp_name.append('s' + str(field))

    # Aggregate the per-topic recency gaps once, after all six merges
    target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
    target[fea_name + '_min'] = target[tmp_name].min(axis=1)
    target[fea_name + '_max'] = target[tmp_name].max(axis=1)
    result_list.append(fea_name + '_min')
    result_list.append(fea_name + '_mean')
    result_list.append(fea_name + '_max')

    return target[result_list]
Example #18
def main():
    directory_path = Path('data/')
    pattern = '*.csv'

    start = datetime.now()

    df = pd.concat(
        (file_to_dataframe(file_path) for file_path in directory_path.glob(pattern)),
        sort=False,
        copy=False,
    )

    end = datetime.now()

    print(f'Units mean: "{df["KWMENG_C"].mean()}"')
    print(f'Elapsed "{end - start}" seconds')
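`file_to_dataframe` is not shown; a minimal sketch, assuming each matched file is a plain CSV:

import modin.pandas as pd

def file_to_dataframe(file_path):
    # Read one CSV; dtype and date-parsing options would go here in a real loader
    return pd.read_csv(file_path)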
Example #19
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(columns={
        0: "Missing Values",
        1: "% of Total Values"
    })
    mis_val_table_ren_columns = (mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
            "% of Total Values", ascending=False).round(1))
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns
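A quick usage sketch for the function above, on a made-up frame with gaps:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, np.nan, 6], "c": [1, 2, 3]})
print(missing_values_table(df))
# Reports 3 columns, 2 with missing values; "b" (66.7%) sorts above "a" (33.3%)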
Example #20
def test_mixed_columns_not_from_df(columns):
    """
    Unlike the previous test, in this case the Series is not just a column from
    the original DataFrame, so you can't use a fasttrack.
    """
    def get_columns(df):
        return [(df[name] + 1) if lookup else name
                for (lookup, name) in columns]

    data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}

    df1 = pandas.DataFrame(data)
    df1 = pandas.concat([df1])
    ref = df1.groupby(get_columns(df1)).size()

    df2 = pd.DataFrame(data)
    df2 = pd.concat([df2])
    exp = df2.groupby(get_columns(df2)).size()
    df_equals(ref, exp)
Example #21
def arrange_features(features_list, total_col, feature_type):

    features = []

    if feature_type == 'mfcc':

        mfcc_features = modin.DataFrame(features_list['mfcc'])
        mfcc_features.columns = col_generator('mfcc_features_', total_col)

        mfcc_delta_features = modin.DataFrame(features_list['mfcc_delta'])
        mfcc_delta_features.columns = col_generator('mfcc_delta_', total_col)

        mfcc_dd_features = modin.DataFrame(features_list['mfcc_delta2'])
        mfcc_dd_features.columns = col_generator('mfcc_dd_features_', total_col)

        features.extend([mfcc_features, mfcc_delta_features, mfcc_dd_features])
        features = modin.concat(features, axis=1)

    else:
        features = modin.DataFrame(features_list)
        features.columns = col_generator('formant_features_', total_col)

    return features
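`col_generator` is assumed to live elsewhere in the module; a plausible one-liner that numbers columns under a common prefix:

def col_generator(prefix, total_col):
    # col_generator('mfcc_features_', 3) -> ['mfcc_features_0', 'mfcc_features_1', 'mfcc_features_2']
    return [f"{prefix}{i}" for i in range(total_col)]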
Example #22
def test_ray_concat():
    df, df2 = generate_dfs()
    ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)

    assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2]),
                                pandas.concat([df, df2]))
Example #23
def test_invalid_axis_errors():
    df, df2 = generate_dfs()
    ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)

    with pytest.raises(ValueError):
        pd.concat([ray_df, ray_df2], axis=2)
Example #24
# %%
import os
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import ray
#ray.init(memory=5242880000)
import modin.pandas as ray_pd
%time mray_df = ray_pd.read_csv("data/kiva_loans.csv")

# %%
%time df1 = pandas_df.append(pandas_df)
%time df2 = mdask_df.append(mdask_df)
%time df3 = mray_df.append(mray_df)

# %%
%time df1 = pandas_pd.concat([pandas_df for _ in range(5)])
%time df2 = dask_pd.concat([mdask_df for _ in range(5)])
%time df3 = ray_pd.concat([mray_df for _ in range(5)])

# %%
%time pandas_pd.isnull(pandas_df["use"])
%time dask_pd.isnull(mdask_df["use"])
%time ray_pd.isnull(mray_df["use"])

# %%
%time pandas_df.fillna(value=0)
%time mdask_df.fillna(value=0)
%time mray_df.fillna(value=0)

# %%
Example #25
def test_df_concat():
    df, df2 = generate_dfs()

    assert ray_df_equals_pandas(pd.concat([df, df2]),
                                pandas.concat([df, df2]))
Example #26
def test_df_concat():
    df, df2 = generate_dfs()

    df_equals(pd.concat([df, df2]), pandas.concat([df, df2]))
Example #27
test_term_doc = vec.transform(test[COMMENT])
trn_term_doc, test_term_doc


def pr(y_i, y):
    p = x[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)


x = trn_term_doc
test_x = test_term_doc


def get_mdl(y):
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r


preds = np.zeros((len(test), len(label_cols)))
for i, j in enumerate(label_cols):
    print("fit", j)
    m, r = get_mdl(train[j])
    preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1]
submid = pd.DataFrame({"id": subm["id"]})
submission = pd.concat(
    [submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
submission.to_csv("submission.csv", index=False)
Example #28
def test_ray_concat():
    df, df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(df), from_pandas(df2)

    assert modin_df_equals_pandas(pd.concat([modin_df, modin_df2]),
                                  pandas.concat([df, df2]))
Example #29
def test_invalid_axis_errors():
    df, df2 = generate_dfs()
    modin_df, modin_df2 = from_pandas(df), from_pandas(df2)

    with pytest.raises(ValueError):
        pd.concat([modin_df, modin_df2], axis=2)
Example #30
import matplotlib

matplotlib.use("PS")
import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head()
all_data = pd.concat(
    (
        train.loc[:, "MSSubClass":"SaleCondition"],
        test.loc[:, "MSSubClass":"SaleCondition"],
    )
)
matplotlib.rcParams["figure.figsize"] = (12.0, 6.0)
prices = pd.DataFrame(
    {"price": train["SalePrice"], "log(price + 1)": np.log1p(train["SalePrice"])}
)
prices.hist()
train["SalePrice"] = np.log1p(train["SalePrice"])
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(
    lambda x: skew(x.dropna())
)  # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
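The skew correction relies on np.log1p, i.e. x -> log(1 + x), which keeps zeros at zero and compresses long right tails; a tiny sanity check:

import numpy as np

assert np.log1p(0.0) == 0.0                    # zeros stay zero
assert np.isclose(np.log1p(1.0), np.log(2.0))  # log1p(x) == log(1 + x)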