Code Example #1
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)
    # merge the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']
    del conbined_data['ID']

    num_columns = conbined_data.select_dtypes(exclude=['object']).columns
    missing_rates = [0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0]
    conbined_data = kmeans_impute_datas(conbined_data, num_columns,
                                        missing_rates)

    conbined_data['ID'] = ids
    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['y'] = train_y.values

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
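
The function above depends on a `kmeans_impute_datas` helper that is not shown in this section. A minimal sketch of what such a KMeans-based imputer might look like, assuming missing values are stored as NaN and that rows are clustered on the fully observed numeric columns:

import numpy as np
from sklearn.cluster import KMeans

def kmeans_impute_datas(df, num_columns, missing_rates, n_clusters=8):
    # Hypothetical sketch: for every threshold in missing_rates, fill NaNs in
    # numeric columns whose missing rate is at or below that threshold with the
    # mean value observed inside the row's KMeans cluster.
    df = df.copy()
    complete_cols = [c for c in num_columns if df[c].notnull().all()]
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(df[complete_cols].values)
    for rate in sorted(missing_rates, reverse=True):
        for c in num_columns:
            miss = df[c].isnull().values
            if 0 < miss.mean() <= rate:
                col = df[c].values.astype(float)
                for k in range(n_clusters):
                    in_k = labels == k
                    observed = col[in_k & ~miss]
                    fill = observed.mean() if observed.size else np.nanmean(col)
                    col[in_k & miss] = fill
                df[c] = col
    return df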
Code Example #2
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)
    # merge the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']

    remove_features = [
        'X33', 'X39', 'X42', 'X95', 'X105', 'X124', 'X190', 'X204', 'X207',
        'X210', 'X236', 'X252', 'X257', 'X259', 'X260', 'X270', 'X278', 'X280',
        'X288', 'X295', 'X296', 'X339', 'X372', 'label_encoder_X4_median_y',
        'label_encoder_X4_mean_y'
    ]
    conbined_data.drop(remove_features, axis=1, inplace=True)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['y'] = train_y.values
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
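
The remove_features list above is hard-coded. One plausible way such a list could be produced (a hypothetical sketch, not the author's actual selection procedure) is to drop columns whose importance is negligible in a quick tree model:

from sklearn.ensemble import RandomForestRegressor

def low_importance_features(train_df, target, threshold=1e-4):
    # Hypothetical helper: fit a small forest on the numeric columns and
    # return the features whose importance falls below the threshold.
    X = train_df.select_dtypes(exclude=['object']).drop(['ID'], axis=1, errors='ignore')
    model = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=0)
    model.fit(X, target)
    return [c for c, imp in zip(X.columns, model.feature_importances_) if imp < threshold]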
Code Example #3
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    if (not os.path.exists(Configure.decomposition_features_train_path)) or \
            (not os.path.exists(Configure.decomposition_features_test_path)):
        # merge the train and test sets
        conbined_data = pd.concat([train.drop(['y'], axis=1), test])
        ids = conbined_data['ID']
        conbined_data.drop(['ID'], axis=1, inplace=True)

        results_df = pd.DataFrame({'ID': ids})
        random_state = 100
        n_jobs = -1

        # PCA
        n_comp = 50
        pca = PCA(n_components=n_comp, random_state=random_state)
        pca_df = pca.fit_transform(conbined_data)
        for i in range(0, n_comp):
            results_df['pca_' + str(i)] = pca_df[:, i]

        # IncrementalPCA
        n_comp = 70
        ipca_df = IncrementalPCA(
            n_components=n_comp,
            batch_size=conbined_data.shape[0]).fit_transform(conbined_data)
        for i in range(0, n_comp):
            results_df['ipca_' + str(i)] = ipca_df[:, i]

        # KernelPCA
        n_comp = 60
        kpca_df = KernelPCA(n_components=n_comp,
                            kernel='linear',
                            random_state=random_state,
                            n_jobs=n_jobs).fit_transform(conbined_data)
        for i in range(0, n_comp):
            results_df['kernel_pca_' + str(i)] = kpca_df[:, i]

        decomposition_features_train = results_df.iloc[:train.shape[0], :]
        decomposition_features_test = results_df.iloc[train.shape[0]:, :]

        with open(Configure.decomposition_features_train_path, "wb") as f:
            cPickle.dump(decomposition_features_train, f, -1)
        with open(Configure.decomposition_features_test_path, "wb") as f:
            cPickle.dump(decomposition_features_test, f, -1)
    else:
        with open(Configure.decomposition_features_train_path, "rb") as f:
            decomposition_features_train = cPickle.load(f)
        with open(Configure.decomposition_features_test_path, "rb") as f:
            decomposition_features_test = cPickle.load(f)

    # merge
    train = pd.merge(train, decomposition_features_train, how='left', on='ID')
    test = pd.merge(test, decomposition_features_test, how='left', on='ID')

    print("Save data...")
    print 'train:', train.shape, ', test:', test.shape
    data_util.save_dataset(train, test)
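
Every snippet in this section assumes a `Configure` module that holds file paths and a `data_util` module that persists the train/test frames. Neither is shown here; a plausible minimal version (the paths below are placeholders, not the project's real ones) could look like:

# configure.py (hypothetical)
class Configure(object):
    decomposition_features_train_path = './cache/decomposition_features_train.pkl'
    decomposition_features_test_path = './cache/decomposition_features_test.pkl'

# data_util.py (hypothetical pickle-based persistence)
import cPickle

def load_dataset(train_path='./cache/train.pkl', test_path='./cache/test.pkl'):
    with open(train_path, "rb") as f:
        train = cPickle.load(f)
    with open(test_path, "rb") as f:
        test = cPickle.load(f)
    return train, test

def save_dataset(train, test, train_path='./cache/train.pkl', test_path='./cache/test.pkl'):
    with open(train_path, "wb") as f:
        cPickle.dump(train, f, -1)
    with open(test_path, "wb") as f:
        cPickle.dump(test, f, -1)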
Code Example #4
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    # drop features in train that take only a single value
    removed_features = []
    for c in train.columns:
        if len(set(train[c])) == 1:
            removed_features.append(c)

    train.drop(removed_features, axis=1, inplace=True)
    test.drop(removed_features, axis=1, inplace=True)

    # # remove outlier rows from train
    # train = train[train.y < 250]
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
Code Example #5
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)
    # merge the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']

    str_columns = conbined_data.select_dtypes(include=['object']).columns

    label_encoder_df = pd.DataFrame({'ID': ids})
    print 'perform label encoder...'
    for c in str_columns:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(conbined_data[c].values))
        label_encoder_df['label_encoder_' + c] = lbl.transform(list(conbined_data[c].values))

    # print 'perform dummy encoder...'
    # dummy_encoder_df = pd.DataFrame({'ID': ids})
    # for c in str_columns:
    #     dummies_df = pd.get_dummies(conbined_data[c], prefix=c)
    #     dummy_encoder_df = pd.concat([dummy_encoder_df, dummies_df], axis=1)

    # merge the encoded columns back in
    del label_encoder_df['ID']
    conbined_data = pd.concat([conbined_data, label_encoder_df], axis=1)
    # del dummy_encoder_df['ID']
    # conbined_data = pd.concat([conbined_data, dummy_encoder_df], axis=1)

    # drop the original categorical features
    conbined_data.drop(str_columns, axis=1, inplace=True)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['y'] = train_y.values
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
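
For reference, `preprocessing.LabelEncoder` simply maps each distinct string to an integer index; on a toy column it behaves as follows:

from sklearn import preprocessing

lbl = preprocessing.LabelEncoder()
lbl.fit(['aa', 'ab', 'aa', 'ac'])
print list(lbl.classes_)                 # ['aa', 'ab', 'ac']
print lbl.transform(['aa', 'ab', 'ac'])  # [0 1 2]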
Code Example #6
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)
    # merge the train and test sets
    conbined_data = pd.concat([train, test])

    dis_features = [c for c in conbined_data.columns if 'pca' in c]
    for c in dis_features:
        mingap = (conbined_data[c].max() - conbined_data[c].min()) / 4000.0
        conbined_data[c] = conbined_data[c].values // mingap

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['y'] = train_y.values
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
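
The loop above spreads each 'pca' feature over roughly 4000 integer bins via floor division by the bin width. A small worked example of the same binning (note that a constant column would make mingap zero and divide by zero, so such columns should be excluded first):

import pandas as pd

s = pd.Series([0.0, 1.0, 2.5, 4000.0])
mingap = (s.max() - s.min()) / 4000.0   # bin width = 1.0 here
print s.values // mingap                # bins 0, 1, 2 and 4000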
Code Example #7
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    groupby_features_train = pd.DataFrame({'ID': train['ID']})
    groupby_features_test = pd.DataFrame({'ID': test['ID']})

    if (not os.path.exists(Configure.groupby_features_train_path)) or \
            (not os.path.exists(Configure.groupby_features_test_path)):
        groupby_features = []
        for c in train.columns:
            if ('label_encoder' in c) or ('tsne_cluster' in c):
                groupby_features.append(c)

        for c in groupby_features:
            print '>>>> perform groupby features, feature : {}...'.format(c)
            groupby_df = train[[
                c, 'y'
            ]].groupby(c).aggregate('mean')['y'].reset_index()

            def label_encoder_mean_map(data):
                values = groupby_df.loc[groupby_df[c] == data, 'y'].values
                if len(values) == 0:
                    return sum(groupby_df.y) / groupby_df.shape[0]
                return values[0]

            groupby_features_train[c + '_mean_y'] = train[c].map(
                label_encoder_mean_map)
            groupby_features_test[c + '_mean_y'] = test[c].map(
                label_encoder_mean_map)

            groupby_df = train[[
                c, 'y'
            ]].groupby(c).aggregate('median')['y'].reset_index()

            def label_encoder_median_map(data):
                values = groupby_df.loc[groupby_df[c] == data, 'y'].values
                if len(values) == 0:
                    return sum(groupby_df.y) / groupby_df.shape[0]
                return values[0]

            groupby_features_train[c + '_median_y'] = train[c].map(
                label_encoder_median_map)
            groupby_features_test[c + '_median_y'] = test[c].map(
                label_encoder_median_map)

            groupby_df = train[[
                c, 'y'
            ]].groupby(c).aggregate('std')['y'].reset_index()
            groupby_df.fillna(0, inplace=True)

            def label_encoder_std_map(data):
                values = groupby_df.loc[groupby_df[c] == data, 'y'].values
                if len(values) == 0:
                    return sum(groupby_df.y) / groupby_df.shape[0]
                return values[0]

            groupby_features_train[c + '_std_y'] = train[c].map(
                label_encoder_std_map)
            groupby_features_test[c + '_std_y'] = test[c].map(
                label_encoder_std_map)

        with open(Configure.groupby_features_train_path, "wb") as f:
            cPickle.dump(groupby_features_train, f, -1)
        with open(Configure.groupby_features_test_path, "wb") as f:
            cPickle.dump(groupby_features_test, f, -1)
    else:
        with open(Configure.groupby_features_train_path, "rb") as f:
            groupby_features_train = cPickle.load(f)
        with open(Configure.groupby_features_test_path, "rb") as f:
            groupby_features_test = cPickle.load(f)

    # merge
    train = pd.merge(train, groupby_features_train, how='left', on='ID')
    test = pd.merge(test, groupby_features_test, how='left', on='ID')

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
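
The three map functions above scan groupby_df once per row; an equivalent and much faster formulation maps each column through the per-category aggregates and falls back to their mean for unseen levels. A sketch of the same mean-encoding step (not the author's code):

def target_encode(train_col, test_col, target, agg='mean'):
    # Aggregate y per category seen in train, then map both columns through
    # it; unseen categories get the mean of the per-category aggregates,
    # matching the fallback used above.
    stats = target.groupby(train_col).agg(agg)
    fallback = stats.mean()
    return (train_col.map(stats).fillna(fallback),
            test_col.map(stats).fillna(fallback))

Usage would mirror the loop above, e.g. target_encode(train[c], test[c], train['y'], 'mean') for the *_mean_y pair.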
Code Example #8
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)
    # merge the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']

    perform_clusters = [7]
    for n_clusters in perform_clusters:
        print '>>>> perform kmeans cluster, n_clusters = {}...'.format(n_clusters)
        feature_train_path = Configure.kmeans_feature_distance_train_path.format(n_clusters)
        feature_test_path = Configure.kmeans_feature_distance_test_path.format(n_clusters)
        results_df = pd.DataFrame({'ID': ids})
        if (not os.path.exists(feature_train_path)) or \
                (not os.path.exists(feature_test_path)):
            cls = KMeans(n_clusters=n_clusters, n_jobs=-1)
            kmeans_labels = cls.fit_predict(conbined_data.values)
            conbined_data['cluster_label'] = kmeans_labels
            results_df['cluster_{}_cluster_label'.format(n_clusters)] = kmeans_labels

            # compute distances to the cluster centers
            cluster_centers = cls.cluster_centers_

            def calc_intra_center_distance(data):
                center = cluster_centers[int(data['cluster_label']), :]
                raw_data = data.drop(['cluster_label'])
                return np.linalg.norm(center - raw_data)

            def calc_extra_center_distance(data):
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))

                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    dis = np.linalg.norm(center - raw_data)
                    distance += dis
                return distance

            def calc_extra_center_distance2(data):
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))

                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    dis = np.linalg.norm(center - raw_data)
                    distance += dis
                return distance / len(ci)

            results_df['intra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_intra_center_distance, axis=1)
            results_df['average_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance2, axis=1)
            results_df['sum_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance, axis=1)

            del conbined_data['cluster_label']

            center_distance_train = results_df.iloc[:train.shape[0], :]
            center_distance_test = results_df.iloc[train.shape[0]:, :]

            with open(feature_train_path, "wb") as f:
                cPickle.dump(center_distance_train, f, -1)
            with open(feature_test_path, "wb") as f:
                cPickle.dump(center_distance_test, f, -1)
        else:
            with open(feature_train_path, "rb") as f:
                center_distance_train = cPickle.load(f)
            with open(feature_test_path, "rb") as f:
                center_distance_test = cPickle.load(f)

        # merge
        train = pd.merge(train, center_distance_train, how='left', on='ID')
        test = pd.merge(test, center_distance_test, how='left', on='ID')

    train['y'] = train_y.values
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
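
Note that scikit-learn's `KMeans.transform` already returns the distance from every row to every cluster center, so the three row-wise apply passes above can be replaced with a vectorized equivalent (a sketch using the same `cls` and `kmeans_labels`, computed before the 'cluster_label' column is appended):

import numpy as np

dist = cls.transform(conbined_data.values)             # shape: (n_samples, n_clusters)
intra = dist[np.arange(dist.shape[0]), kmeans_labels]  # distance to the row's own center
extra_sum = dist.sum(axis=1) - intra                   # summed distance to the other centers
extra_avg = extra_sum / (n_clusters - 1)               # average distance to the other centers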
Code Example #9
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y_train.append(label)
        x_train.append(specgram)
x_train = np.array(x_train)
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
y_train = label_transform(y_train, relabel=relabel, get_dummies=True)

label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)
del labels, fnames
gc.collect()

print('x_train:', x_train.shape, ', y_train:', y_train.shape)
print("Save train data...")
data_util.save_dataset(x_train, y_train)

del x_train, y_train
gc.collect()

batch = 16
test_fname = []
x_test = []
fpaths = glob(os.path.join(Configure.test_data_path, '*wav'))
i = 0
for path in fpaths:
    #     if i == 0:
    #         imgs = []
    #         fnames = []
    #     i += 1
    rate, samples = wavfile.read(path)
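
The audio snippet above is excerpted mid-pipeline and assumes a `log_specgram` helper. A common definition, sketched here with the window and step sizes given in milliseconds (an assumption, since the original helper is not shown), is:

import numpy as np
from scipy import signal

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    # Log-scaled spectrogram; frame length and hop are derived from the
    # millisecond window/step parameters.
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio, fs=sample_rate, window='hann',
                                            nperseg=nperseg, noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)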
Code Example #10
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    print 'perform tsne...'
    if not os.path.exists(Configure.tsne_transformed_data_path):
        features = [
            'X118', 'X127', 'X47', 'X315', 'X311', 'X179', 'X314', 'X261'
        ]
        tsne = TSNE(random_state=2000, perplexity=80)
        tsne_transformed = tsne.fit_transform(
            pd.concat([train[features], test[features]]))
        tsne_transformed = pd.DataFrame(tsne_transformed)
        tsne_transformed.columns = ['tsne_transform_x', 'tsne_transform_y']

        with open(Configure.tsne_transformed_data_path, "wb") as f:
            cPickle.dump(tsne_transformed, f, -1)
    else:
        with open(Configure.tsne_transformed_data_path, "rb") as f:
            tsne_transformed = cPickle.load(f)

    ids = pd.concat([train.drop(['y'], axis=1), test])['ID']

    perform_clusters = [7]
    for n_clusters in perform_clusters:
        print '>>>> perform kmeans cluster, n_clusters = {}...'.format(
            n_clusters)
        feature_train_path = Configure.tsne_feature_train_path.format(
            n_clusters)
        feature_test_path = Configure.tsne_feature_test_path.format(n_clusters)
        if (not os.path.exists(feature_train_path)) or \
                (not os.path.exists(feature_test_path)):

            results_df = pd.DataFrame({'ID': ids})
            conbined_data = tsne_transformed.copy()

            cls = KMeans(n_clusters=n_clusters, n_jobs=-1)
            kmeans_labels = cls.fit_predict(conbined_data.values)
            conbined_data['cluster_label'] = kmeans_labels
            results_df['tsne_cluster_{}_cluster_label'.format(
                n_clusters)] = kmeans_labels

            # compute distances to the cluster centers
            cluster_centers = cls.cluster_centers_

            def calc_intra_center_distance(data):
                center = cluster_centers[int(data['cluster_label']), :]
                raw_data = data.drop(['cluster_label'])
                return np.linalg.norm(center - raw_data)

            def calc_extra_center_distance(data):
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))

                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    dis = np.linalg.norm(center - raw_data)
                    distance += dis
                return distance

            def calc_extra_center_distance2(data):
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))

                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    dis = np.linalg.norm(center - raw_data)
                    distance += dis
                return distance / len(ci)

            results_df['tsne_intra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_intra_center_distance, axis=1)
            results_df['tsne_average_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance2, axis=1)
            results_df['tsne_sum_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance, axis=1)

            results_df.drop(
                ['tsne_cluster_{}_cluster_label'.format(n_clusters)],
                axis=1,
                inplace=True)

            tsne_df_train = results_df.iloc[:train.shape[0], :]
            tsne_df_test = results_df.iloc[train.shape[0]:, :]
            tsne_df_train['ID'] = train['ID']
            tsne_df_test['ID'] = test['ID']

            with open(feature_train_path, "wb") as f:
                cPickle.dump(tsne_df_train, f, -1)
            with open(feature_test_path, "wb") as f:
                cPickle.dump(tsne_df_test, f, -1)
        else:
            with open(feature_train_path, "rb") as f:
                tsne_df_train = cPickle.load(f)
            with open(feature_test_path, "rb") as f:
                tsne_df_test = cPickle.load(f)

        train = pd.merge(train, tsne_df_train, how='left', on='ID')
        test = pd.merge(test, tsne_df_test, how='left', on='ID')

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
Code Example #11
def main():
    print 'load data...'
    priors, train, orders, products, aisles, departments, sample_submission, order_streaks = data_util.load_data()

    groupby_features_train = pd.DataFrame()
    groupby_features_test = pd.DataFrame()

    if (not os.path.exists(Configure.groupby_features_train_path)) or \
            (not os.path.exists(Configure.groupby_features_test_path)):

        # # Product part

        # Products information ----------------------------------------------------------------
        # add order information to priors set
        priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')

        # create new variables
        ## _user_buy_product_times: how many times the user has bought this product so far (running count)
        priors_orders_detail.loc[:,'_user_buy_product_times'] = priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1
        # _prod_tot_cnts: total number of times the product was purchased (a proxy for how well it is liked)
        # _reorder_tot_cnts_of_this_prod: total number of times the product was reordered
        ### the next two are hard to interpret; consider changing them ++++++++++++++++++++++++++
        # _prod_order_once: number of times the product was bought for the first time
        # _prod_order_more_than_once: number of times the product was bought more than once
        agg_dict = {'user_id':{'_prod_tot_cnts':'count'}, 
                    'reordered':{'_prod_reorder_tot_cnts':'sum'}, 
                    '_user_buy_product_times': {'_prod_buy_first_time_total_cnt':lambda x: sum(x==1),
                                                '_prod_buy_second_time_total_cnt':lambda x: sum(x==2)}}
        prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)

        # _prod_reorder_prob: this metric is not easy to interpret
        # _prod_reorder_ratio: product reorder rate
        prd['_prod_reorder_prob'] = prd._prod_buy_second_time_total_cnt / prd._prod_buy_first_time_total_cnt
        prd['_prod_reorder_ratio'] = prd._prod_reorder_tot_cnts / prd._prod_tot_cnts
        prd['_prod_reorder_times'] = 1 + prd._prod_reorder_tot_cnts / prd._prod_buy_first_time_total_cnt

        # # User Part

        # _user_total_orders: total number of orders per user
        # consider adding other summary statistics here ++++++++++++++++++++++++++
        # _user_sum_days_since_prior_order: days since prior order (sum); this can only be computed from the orders table, since priors_orders_detail is not unique at the order level
        # _user_mean_days_since_prior_order: days since prior order (mean)
        agg_dict_2 = {'order_number':{'_user_total_orders':'max'},
                      'days_since_prior_order':{'_user_sum_days_since_prior_order':'sum', 
                                                '_user_mean_days_since_prior_order': 'mean'}}
        users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

        # _user_reorder_ratio: total reorders / total purchases after the first order
        # _user_total_products: total number of products the user has purchased
        # _user_distinct_products: number of distinct products the user has purchased
        # agg_dict_3 = {'reordered':
        #               {'_user_reorder_ratio': 
        #                lambda x: sum(priors_orders_detail.ix[x.index,'reordered']==1)/
        #                          sum(priors_orders_detail.ix[x.index,'order_number'] > 1)},
        #               'product_id':{'_user_total_products':'count', 
        #                             '_user_distinct_products': lambda x: x.nunique()}}
        # us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)

        us = pd.concat([
        priors_orders_detail.groupby('user_id')['product_id'].count().rename('_user_total_products'),
        priors_orders_detail.groupby('user_id')['product_id'].nunique().rename('_user_distinct_products'),
        (priors_orders_detail.groupby('user_id')['reordered'].sum() /
        priors_orders_detail[priors_orders_detail['order_number'] > 1].groupby('user_id')['order_number'].count()).rename('_user_reorder_ratio')
        ], axis=1).reset_index()
        users = users.merge(us, how='inner')

        # average number of products per order
        # max / min number of products per order could be added ++++++++++++++
        users['_user_average_basket'] = users._user_total_products / users._user_total_orders

        us = orders[orders.eval_set != "prior"][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
        us.rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'}, inplace=True)

        users = users.merge(us, how='inner')


        # # Database Part

        # many more variables could be added here
        # _up_order_count: number of times the user bought this product
        # _up_first_order_number: order number of the user's first purchase of this product
        # _up_last_order_number: order number of the user's last purchase of this product
        # _up_average_cart_position: average add-to-cart position of this product
        agg_dict_4 = {'order_number':{'_up_order_count': 'count', 
                                      '_up_first_order_number': 'min', 
                                      '_up_last_order_number':'max'}, 
                      'add_to_cart_order':{'_up_average_cart_position': 'mean'}}

        data = ka_add_groupby_features_1_vs_n(df=priors_orders_detail,
                                              group_columns_list=['user_id', 'product_id'],
                                              agg_dict=agg_dict_4)

        data = data.merge(prd, how='inner', on='product_id').merge(users, how='inner', on='user_id')
        # purchase count of this product / total number of orders
        # latest order number minus the order number of the last purchase of this product
        # purchase count of this product / number of orders from the first purchase of this product onward
        data['_up_order_rate'] = data._up_order_count / data._user_total_orders
        data['_up_order_since_last_order'] = data._user_total_orders - data._up_last_order_number
        data['_up_order_rate_since_first_order'] = data._up_order_count / (data._user_total_orders - data._up_first_order_number + 1)

        # add user_id to train set
        train = train.merge(right=orders[['order_id', 'user_id']], how='left', on='order_id')
        data = data.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id', 'product_id'], how='left')
        data = pd.merge(data, products[['product_id', 'aisle_id', 'department_id']], how='left', on='product_id')
        transform_categorical_data(data, ['aisle_id', 'department_id'])
        data = data.merge(order_streaks[['user_id', 'product_id', 'order_streak']], on=['user_id', 'product_id'], how='left')


        # release Memory
        # del train, prd, users
        # gc.collect()
        # release Memory
        #del priors_orders_detail
        del orders, order_streaks
        gc.collect()

        starting_size = sys.getsizeof(data)
        i = 0
        for c, dtype in zip(data.columns, data.dtypes):
            if 'int' in str(dtype):
                if min(data[c]) >= 0:
                    max_int = max(data[c])
                    if max_int <= 255:
                        data[c] = data[c].astype(np.uint8)
                    elif max_int <= 65535:
                        data[c] = data[c].astype(np.uint16)
                    elif max_int <= 4294967295:
                        data[c] = data[c].astype(np.uint32)
                    i += 1
        print("Number of colums adjusted: {}\n".format(i))
        ## Changing known reordered col to a smaller int size
        data['reordered'] = np.nan_to_num(data['reordered']).astype(np.uint8)
        data.loc[data['reordered'] == 0, 'reordered'] = np.nan
        print("Reduced size {:.2%}".format(float(sys.getsizeof(data))/float(starting_size)))


        # # Create Train / Test
        train = data.loc[data.eval_set == "train",:]
        #train.drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis=1, inplace=True)
        #train.loc[:, 'reordered'] = train.reordered.fillna(0)

        test = data.loc[data.eval_set == "test",:]
        #test.drop(['eval_set', 'user_id', 'product_id', 'order_id', 'reordered'], axis=1, inplace=True)
        #groupby_features_train = train
        #groupby_features_test = test

        # with open(Configure.groupby_features_train_path, "wb") as f:
        #     cPickle.dump(groupby_features_train, f, -1)
        # with open(Configure.groupby_features_test_path, "wb") as f:
        #     cPickle.dump(groupby_features_test, f, -1)

        print 'train:', train.shape, ', test:', test.shape
        print("Save data...")
        data_util.save_dataset(train, test)
        
    else:
        with open(Configure.groupby_features_train_path, "rb") as f:
            groupby_features_train = cPickle.load(f)
        with open(Configure.groupby_features_test_path, "rb") as f:
            groupby_features_test = cPickle.load(f)
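
The feature construction above leans on `ka_add_groupby_features_1_vs_n`, which is not defined in this section. Its nested agg-dict convention (source column -> {new feature name: aggregation}) can be implemented roughly as follows; this is a sketch, not necessarily the original helper:

import pandas as pd

def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict):
    # Group df by group_columns_list and compute every named aggregation.
    # agg_dict maps a source column to {new_feature_name: aggregation}.
    grouped = df.groupby(group_columns_list)
    pieces = []
    for source_col, spec in agg_dict.items():
        for new_name, func in spec.items():
            pieces.append(grouped[source_col].agg(func).rename(new_name))
    return pd.concat(pieces, axis=1).reset_index()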