def test_user_id_are_same(self):
    # Sampling the same number of users must yield the same user-id set every time.
    dt1 = DataSet(100)
    dt2 = DataSet(100)
    dt3 = DataSet(100)
    self.assertEqual(set(dt1.users.index), set(dt2.users.index))
    self.assertEqual(set(dt2.users.index), set(dt3.users.index))
    self.assertEqual(set(dt1.users.index), set(dt3.users.index))
def test_user_product_two_real(self):
    # The plain and the numba-jit implementations must produce identical matrices.
    dt = DataSet(4000)
    st_time = time.time()
    a = dt.set_user_x_products()
    print(time.time() - st_time)
    st_time = time.time()
    b = dt.set_user_x_products_jit()
    print(time.time() - st_time)
    self.assertEqual(a.shape, b.shape)
    self.assertTrue(np.all(a == b))
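For context, a hedged sketch of the kind of numba kernel `set_user_x_products_jit` could wrap. The method's real body is not shown in this excerpt, so the function name `count_user_x_products`, its arguments, and the count-matrix layout are all assumptions:

import numpy as np
from numba import jit

@jit(nopython=True)
def count_user_x_products(user_idx, product_idx, n_users, n_products):
    # Accumulate how many times each (user, product) pair appears in the priors.
    out = np.zeros((n_users, n_products), dtype=np.int64)
    for i in range(user_idx.shape[0]):
        out[user_idx[i], product_idx[i]] += 1
    return out

# Toy usage: three events over 2 users x 2 products.
m = count_user_x_products(np.array([0, 0, 1]), np.array([1, 1, 0]), 2, 2)
assert m[0, 1] == 2 and m[1, 0] == 1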
def setUp(self):
    # Building a DataSet is slow, so cache one on disk and reuse it across runs.
    if not os.path.isfile("./pckl/dt_test.p"):
        dt = DataSet(1000)
        with open("./pckl/dt_test.p", "wb") as f:
            pickle.dump(dt, f)
    else:
        with open("./pckl/dt_test.p", "rb") as f:
            dt = pickle.load(f)
    self.dt = dt
def test_init_from_array(self):
    users_id = [144288, 145552, 152713, 153941, 158231]
    dt = DataSet(ARR_ORDERS_ID=users_id)
    self.assertEqual(set(dt.orders.user_id), set(users_id))
    self.assertEqual(set(dt.users.index), set(users_id))
    self.assertEqual(set(dt.priors.user_id), set(users_id))
import lightgbm as lgb
from my_classes import DataSet, CrossVal
import pandas as pd
import pickle
import os

user_by_cluster = pd.read_csv('../tmp/user_by_cluster.csv', index_col='cluster')
dt = DataSet(ARR_ORDERS_ID=user_by_cluster.user_id.values)

f_to_use = [
    'user_total_orders', 'user_total_items', 'total_distinct_items',
    'user_average_days_between_orders', 'user_average_basket',
    'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
    'aisle_id', 'department_id', 'product_orders', 'product_reorders',
    'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
    'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
    'UP_delta_hour_vs_last']


def lgb_predict(X_train, y_train, X_test):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'num_leaves': 96,
        'max_depth': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'verbose': -1
        # 'scale_pos_weight': 5
    }
    ROUNDS = 100
    d_train = lgb.Dataset(X_train, label=y_train,
                          categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
    bst = lgb.train(params, d_train, ROUNDS)
    y_pred_prob = bst.predict(X_test)
    return y_pred_prob


# Train and cross-validate one model per user cluster.
for cluster in user_by_cluster.index.unique():
    print('\n\nCluster N', cluster)
    user_array = user_by_cluster.loc[cluster, 'user_id'].values
    dt = DataSet(ARR_ORDERS_ID=user_array)
    cv = CrossVal('lgb_pred_by_clusters_' + str(cluster))
    res = cv.cross_val_predict(lgb_predict, dt, f_to_use)
    print(cv.res)
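CrossVal is project-specific, but downstream the probabilities returned by `lgb_predict` have to be cut into a 0/1 basket somewhere. A minimal sketch, with `probs_to_labels` a hypothetical helper and 0.2 an assumed threshold rather than a value taken from this repo:

import numpy as np

def probs_to_labels(y_pred_prob, threshold=0.2):
    # Products whose reorder probability clears the threshold go into the basket.
    return (np.asarray(y_pred_prob) > threshold).astype(np.int8)

assert probs_to_labels([0.05, 0.31, 0.18]).tolist() == [0, 1, 0]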
def test_init_from_nb_of_sample(self):
    sample_nb = 93
    dt = DataSet(sample_nb)
    self.assertEqual(dt.users.shape[0], sample_nb)
    self.assertEqual(len(dt.orders.user_id.unique()), sample_nb)
import pandas as pd
from numba import jit, guvectorize, int64
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import Pipeline
from my_classes import DataSet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from nltk.cluster.gaac import GAAClusterer

N_COMPONENTS = 200
N_TAKE_TOP = 15
N_CLUSTERS = 12
LAST_N_PRIORS = 5

dt = DataSet(20000)


class PipClassSVDTakeTop:
    """Pipeline step that keeps only the first n_take_top SVD components."""

    def __init__(self, n_take_top):
        self.n_take_top = n_take_top

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[:, :self.n_take_top]


pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     # The source chunk breaks off here; the two steps below are an
                     # assumption reconstructed from the imports and constants above.
                     ('svd', TruncatedSVD(n_components=N_COMPONENTS)),
                     ('take_top', PipClassSVDTakeTop(N_TAKE_TOP))])
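A small self-contained check of what `PipClassSVDTakeTop` does: since TruncatedSVD orders components by explained variance, keeping the "top" components is just a column slice. The 4x3 toy matrix below is illustrative only:

import numpy as np

X_svd = np.arange(12, dtype=float).reshape(4, 3)  # stand-in for SVD output: 4 samples x 3 components
top2 = PipClassSVDTakeTop(n_take_top=2).fit(X_svd).transform(X_svd)
assert top2.shape == (4, 2)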