Example #1
    def test_user_id_are_same(self):
        # Three DataSet instances built with the same sample size should
        # select exactly the same set of user ids.
        dt1 = DataSet(100)
        dt2 = DataSet(100)
        dt3 = DataSet(100)

        self.assertEqual(set(dt1.users.index), set(dt2.users.index))
        self.assertEqual(set(dt2.users.index), set(dt3.users.index))
        self.assertEqual(set(dt1.users.index), set(dt3.users.index))
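For context, these test methods sit inside the usual unittest scaffolding; a minimal sketch of the surrounding boilerplate (the class name TestDataSet is an assumption; the DataSet import mirrors Example #6):

import unittest

from my_classes import DataSet


class TestDataSet(unittest.TestCase):
    # Test methods such as test_user_id_are_same go here.
    ...


if __name__ == '__main__':
    unittest.main()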
Example #2
    def test_user_product_two_real(self):
        # The plain and numba-jitted implementations must produce identical
        # results; the printed timings let us compare their speed by eye.
        dt = DataSet(4000)

        st_time = time.time()
        a = dt.set_user_x_products()
        print(time.time() - st_time)

        st_time = time.time()
        b = dt.set_user_x_products_jit()
        print(time.time() - st_time)

        self.assertEqual(a.shape, b.shape)
        self.assertTrue(np.array_equal(a, b))
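As an aside, time.perf_counter() is the recommended clock for micro-benchmarks like the one above; a minimal sketch of the same timing pattern as a reusable helper (bench is a hypothetical name, not from the source):

import time


def bench(fn):
    # Time a single call and return (elapsed_seconds, result).
    start = time.perf_counter()
    result = fn()
    return time.perf_counter() - start, result

Usage such as elapsed, a = bench(dt.set_user_x_products) would replace the manual start/stop bookkeeping.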
Example #3
    def setUp(self):
        # Build the test DataSet once, then cache it as a pickle so later
        # test runs can skip the expensive construction.
        if not os.path.isfile("./pckl/dt_test.p"):
            dt = DataSet(1000)
            with open("./pckl/dt_test.p", "wb") as f:
                pickle.dump(dt, f)
        else:
            with open("./pckl/dt_test.p", "rb") as f:
                dt = pickle.load(f)
        self.dt = dt
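The load-or-build pattern in setUp generalizes beyond tests; a minimal standard-library sketch, with load_or_build as a hypothetical helper name:

import os
import pickle


def load_or_build(path, builder):
    # Return the object cached at `path`, building and pickling it on first use.
    if os.path.isfile(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    obj = builder()
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    return obj

With this helper, the setUp above reduces to self.dt = load_or_build('./pckl/dt_test.p', lambda: DataSet(1000)).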
Example #4
    def test_init_from_array(self):
        # Building a DataSet from an explicit list of user ids should keep
        # exactly those users in orders, users and priors.
        users_id = [144288, 145552, 152713, 153941, 158231]

        dt = DataSet(ARR_ORDERS_ID=users_id)
        self.assertEqual(set(dt.orders.user_id), set(users_id))
        self.assertEqual(set(dt.users.index), set(users_id))
        self.assertEqual(set(dt.priors.user_id), set(users_id))
Example #5
def lgb_predict(X_train, y_train, X_test):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'num_leaves': 96,
        'max_depth': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'verbose': -1
        # 'scale_pos_weight': 5
    }
    ROUNDS = 100

    d_train = lgb.Dataset(X_train,
                          label=y_train,
                          categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
    bst = lgb.train(params, d_train, ROUNDS)
    y_pred_prob = bst.predict(X_test)
    return y_pred_prob

user_by_cluster = pd.read_csv('../tmp/user_by_cluster.csv', index_col='cluster')

# Train and evaluate one model per user cluster.
for cluster in user_by_cluster.index.unique():
    print('\n\nCluster N', cluster)
    user_array = user_by_cluster.loc[cluster, 'user_id'].values
    dt = DataSet(ARR_ORDERS_ID=user_array)
    cv = CrossVal('lgb_pred_by_clusters_' + str(cluster))
    res = cv.cross_val_predict(lgb_predict, dt, f_to_use)
    print(cv.res)
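lgb_predict returns raw probabilities; a minimal sketch of thresholding them into binary labels (the 0.5 cutoff is an assumption, not taken from the source):

import numpy as np

y_pred_prob = np.array([0.12, 0.81, 0.47])  # e.g. the output of lgb_predict
y_pred = (y_pred_prob > 0.5).astype(int)    # assumed threshold -> array([0, 1, 0])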
Example #6
import lightgbm as lgb
from my_classes import DataSet, CrossVal
import pandas as pd
import pickle
import os

user_by_cluster = pd.read_csv('../tmp/user_by_cluster.csv',
                              index_col='cluster')
dt = DataSet(ARR_ORDERS_ID=user_by_cluster.user_id.values)

f_to_use = [
    'user_total_orders', 'user_total_items', 'total_distinct_items',
    'user_average_days_between_orders', 'user_average_basket',
    'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
    'aisle_id', 'department_id', 'product_orders', 'product_reorders',
    'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
    'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
    'UP_delta_hour_vs_last'
]


def lgb_predict(X_train, y_train, X_test):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'num_leaves': 96,
        'max_depth': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'verbose': -1
    }
    ROUNDS = 100

    # The remainder mirrors the same function shown in full in Example #5.
    d_train = lgb.Dataset(X_train,
                          label=y_train,
                          categorical_feature=['aisle_id', 'department_id'])
    bst = lgb.train(params, d_train, ROUNDS)
    return bst.predict(X_test)
Example #7
    def test_init_from_nb_of_sample(self):
        # Requesting N sample users should yield exactly N distinct users.
        sample_nb = 93
        dt = DataSet(sample_nb)
        self.assertEqual(dt.users.shape[0], sample_nb)
        self.assertEqual(len(dt.orders.user_id.unique()), sample_nb)
Example #8
import pandas as pd
from numba import jit, guvectorize, int64
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import Pipeline

from my_classes import DataSet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from nltk.cluster.gaac import GAAClusterer

N_COMPONENTS = 200
N_TAKE_TOP = 15
N_CLUSTERS = 12
LAST_N_PRIORS = 5

dt = DataSet(20000)


class PipClassSVDTakeTop:
    # Minimal sklearn-style transformer that keeps only the first
    # n_take_top columns, i.e. the top SVD components.
    def __init__(self, n_take_top):
        self.n_take_top = n_take_top

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[:, :self.n_take_top]


pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
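                     # NOTE: the original listing is truncated at this point; the
                     # steps below are an assumed continuation inferred from the
                     # imports and constants above (TruncatedSVD, N_COMPONENTS,
                     # PipClassSVDTakeTop, N_TAKE_TOP), not the author's code.
                     ('svd', TruncatedSVD(n_components=N_COMPONENTS)),
                     ('take_top', PipClassSVDTakeTop(N_TAKE_TOP))])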