def main():
    """Preprocess the raw item metadata and persist the enriched table.

    Reads RAW/item_metadata.csv, expands the 'properties' column via
    featurize(), derives rating/stars counts plus the per-item feature
    count, then writes the result to PREP/item_metadata.csv.
    """
    metadata = pd.read_csv(RAW + 'item_metadata.csv')
    metadata = featurize(metadata, 'properties')

    # derived per-item statistics
    metadata['rating'] = count_rating(metadata)
    metadata['stars'] = count_stars(metadata)
    # featurize() leaves the parsed property list in the 'list' column
    metadata['features'] = metadata['list'].apply(len)

    ensure_dir(PREP)
    metadata.to_csv(PREP + 'item_metadata.csv')
def main():
    """Build and write the final submission file for solution KEY.

    Loads the raw solution CSV — whose list columns were serialised with
    numpy-style whitespace separators (e.g. '[1 2 3\n 4]') — parses them
    back into Python lists, converts the recommendation lists to the
    space-joined submission format, maps internal integer user/session ids
    back to their original string ids, and, after validating against the
    example submission, writes OUT/sub_<KEY>.csv.
    """
    import ast
    import re

    def _parse_array(text):
        # Collapse any whitespace run to a single comma, then parse safely.
        # Replaces the original eval() on file contents (unsafe) and its
        # finite chain of ',,'->',' replacements (only handled bounded
        # whitespace runs).
        text = re.sub(r'\s+', ',', text.strip())
        text = text.replace('[,', '[').replace(',]', ']')
        return ast.literal_eval(text)

    keep = [
        'user_id', 'session_id', 'timestamp', 'step', 'item_recommendations'
    ]

    solution = load_csv(BASE_PATH + SET + 'solution_' + KEY +
                        '.csv').sort_values(['session_id'])
    solution['recommendations'] = solution['recommendations'].apply(_parse_array)
    solution['confidences'] = solution['confidences'].apply(_parse_array)

    example = load_csv(BASE_PATH + RAW + 'submission_popular.csv')
    umap, smap = load_maps(BASE_PATH + PREPROCESS)

    # submission format: space-joined item ids per session
    solution['item_recommendations'] = solution['recommendations'].apply(
        lambda items: ' '.join(map(str, items)))
    del solution['recommendations']

    # map internal integer ids back to the original string ids
    solution['user_id'] = solution['user_id'].apply(lambda x: umap[x])
    solution['session_id'] = solution['session_id'].apply(lambda x: smap[x])

    solution = solution[keep]
    if check(solution, example):
        ensure_dir(BASE_PATH + OUT)
        solution.to_csv(BASE_PATH + OUT + 'sub_' + KEY + '.csv')
def clean_and_map(data):
    """Sort, repair and id-map the joined event log; cache it as feather.

    Orders events by user/time/step, fixes broken and duplicated sessions,
    aborts the process when the session check fails, derives a numeric
    item_id from the reference column and maps string columns to ids.
    The cleaned frame is written to PATH_PROCESSED/joined_tmp.fthr and
    returned.
    """
    ordered = data.sort_values(['user_id', 'timestamp', 'step'])
    ordered = ordered.reset_index(drop=True)

    # repair broken / duplicated session boundaries
    ordered = fix_sessions(ordered)
    ordered = fix_sessions_duplicate(ordered)

    if not check_sessions(ordered):
        print('check data')
        exit()

    # numeric item id; non-numeric references (e.g. search terms) become -1
    ordered['item_id'] = (
        pd.to_numeric(ordered.reference, errors='coerce').fillna(-1).astype(int)
    )
    ordered = map_strings(ordered)
    #check_sessions(data)

    ensure_dir(PATH_PROCESSED)
    ordered.to_feather(PATH_PROCESSED + 'joined_tmp.fthr')
    return ordered
def main():
    """Cross-validated LightGBM lambdarank training with on-disk dataset caching.

    For each CV split, the train/valid folds are dumped to CSV, wrapped in
    LightGBM Datasets and cached as binary files under SET/tmp/ (keyed by
    split number and shuffle flag). A dart lambdarank model is trained per
    split with early stopping, and its predictions on the test partition are
    collected in `score`. Finally the per-split scores are averaged — both
    raw ('prob_direct') and per-split min-max normalised ('prob_norm') —
    ranked per session, written as two solution CSVs and evaluated.

    Memory is kept low by repeatedly reloading the frame via create_set()
    and del/gc.collect() between phases.
    """
    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    len_test = (len(train[train.train == 0]))
    train.query('train == 1', inplace=True)
    # fold indices are computed once, then the frame is dropped to free memory
    indices = train_test_cv(train, train.label, splits=SPLITS, shuffle=SHUFFLE)
    del train
    gc.collect()
    # score[i] holds split i's predictions for every test example
    score = np.zeros((SPLITS, len_test))
    i = 0
    for train_idx, val_idx in indices:
        # cache key encodes the split number and the shuffle setting
        dataset_file_key = DSKEY + 'SPLIT' + str(i) + 'of' + str(
            SPLITS) + 'SHFL' + str(SHUFFLE)
        ensure_dir(BASE_PATH + SET + 'tmp/')
        file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
        if not Path(file).is_file():
            # build and cache the binary lgbm datasets for this split
            tstart = time.time()
            train = create_set(base_path=BASE_PATH + SET,
                               conf=CONF,
                               key=DSKEY,
                               redo=False)
            print('loaded in {}'.format((time.time() - tstart)))
            tstart = time.time()
            train.query('train == 1', inplace=True)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.csv'
            #dump_svmlight_file( X_train[KEEP_FEATURES], y_train, file, zero_based=True, multilabel=False )
            train.loc[train_idx, ['label'] + FEATURES].to_csv(file,
                                                              index=False,
                                                              header=False)
            #num_cols = len(X_train.columns)
            # per-session group sizes — required by lambdarank
            q_train = train.loc[train_idx].groupby('session_id').size().values
            d_train = lgbm.Dataset(file)  #, categorical_feature=CAT_FEATURES )
            d_train.set_group(q_train)
            d_train.set_label(train.loc[train_idx, 'label'])
            d_train.set_feature_name(FEATURES)
            #del q_train
            gc.collect()
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.csv'
            #dump_svmlight_file( X_valid[KEEP_FEATURES], y_valid, file, zero_based=True, multilabel=False )
            train.loc[val_idx, ['label'] + FEATURES].to_csv(file,
                                                            index=False,
                                                            header=False)
            q_valid = train.loc[val_idx].groupby('session_id').size().values
            y_val = train.loc[val_idx, 'label']
            del train
            gc.collect()
            d_valid = d_train.create_valid(
                file)  #, categorical_feature=CAT_FEATURES )
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.lgbm'
            d_valid.set_group(q_valid)
            d_valid.set_label(y_val)
            d_valid.set_feature_name(FEATURES)
            d_valid.save_binary(file)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
            d_train.save_binary(file)
            #del q_valid, d_valid, d_train
            gc.collect()
            # reload from the binary cache so training uses the same code
            # path as the cached branch below
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
            d_train = lgbm.Dataset(file)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.lgbm'
            d_valid = lgbm.Dataset(file)
        else:
            # cached binary datasets exist — load them directly
            tstart = time.time()
            print('load binary lgbm sets')
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
            d_train = lgbm.Dataset(file)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.lgbm'
            d_valid = lgbm.Dataset(file)
        print('loaded sets in {}'.format((time.time() - tstart)))
        watchlist = [d_train, d_valid]
        # dart lambdarank optimising ndcg@30
        params = {}
        params['boosting'] = 'dart'
        params['learning_rate'] = 0.1
        params['application'] = 'lambdarank'
        params['metric'] = 'ndcg'
        params['eval_at'] = '30'
        #params['max_depth'] = -1
        #params['num_leaves'] = 64
        #params['max_bin'] = 512
        params['feature_fraction'] = 0.5
        params['bagging_fraction'] = 0.5
        #params['min_data_in_leaf'] = 20
        #params['verbosity'] = 0
        evals_result = {}
        model = lgbm.train(params,
                           train_set=d_train,
                           num_boost_round=MAX_EPOCHS,
                           valid_sets=watchlist,
                           early_stopping_rounds=STOPPING,
                           evals_result=evals_result,
                           verbose_eval=10)
        ensure_dir(BASE_PATH + SET + 'lgbm/')
        model.save_model(BASE_PATH + SET + 'lgbm/' + ALGKEY + '.' + str(i) +
                         '.txt',
                         num_iteration=model.best_iteration)
        del params, watchlist, d_train, d_valid, evals_result
        gc.collect()
        # reload only the test partition and score it with this split's model
        test = create_set(base_path=BASE_PATH + SET,
                          conf=CONF,
                          key=DSKEY,
                          redo=False)
        test.query('train == 0', inplace=True)
        X_test = test[FEATURES].values.astype(np.float32)
        del test
        gc.collect()
        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        score[i] = y_test
        i += 1
        del y_test, model, X_test
        gc.collect()
    test = create_set(base_path=BASE_PATH + SET,
                      conf=CONF,
                      key=DSKEY,
                      redo=False)
    test.query('train == 0', inplace=True)
    # average the split scores: raw and per-split min-max normalised
    test['prob_norm'] = 0
    test['prob_direct'] = 0
    for i in range(SPLITS):
        test['prob_direct_' + str(i)] = score[i]
        # NOTE(review): 'prob_norm' + str(i) lacks the underscore that
        # 'prob_direct_' has — written and read consistently below, so
        # behaviour is unaffected; just a naming inconsistency.
        test['prob_norm' + str(i)] = (test['prob_direct_' + str(i)] -
                                      test['prob_direct_' + str(i)].min()) / (
                                          test['prob_direct_' + str(i)].max() -
                                          test['prob_direct_' + str(i)].min())
        test['prob_direct'] += test['prob_direct_' + str(i)]
        test['prob_norm'] += test['prob_norm' + str(i)]
    test['prob_norm'] = test['prob_norm'] / SPLITS
    test['prob_direct'] = test['prob_direct'] / SPLITS
    #truth = pd.read_csv( self.folder + 'truth.csv' )
    #truth['label2'] = 1
    #test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    #test['label'] = test['label2'].fillna(0)
    #del test['label2']
    # rank impressions per session by the normalised score
    test = test.sort_values(['session_id', 'prob_norm'], ascending=False)
    #test.to_csv( BASE_PATH + SET + 'test_debugcv.csv' )
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_norm.apply(list)
    # NOTE(review): result of reset_index is discarded (no inplace, no
    # reassignment); the merge below joins on the 'session_id' index level.
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_norm.csv')
    # second solution file: ranked by the raw (unnormalised) average score
    test = test.sort_values(['session_id', 'prob_direct'], ascending=False)
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_direct.apply(
        list)
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_direct.csv')
    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
def main():
    """Cross-validated in-memory LightGBM training with a feather test cache.

    The test partition is cached once as a feather file under SET/tmp/.
    For each CV split, train/valid folds are built as in-memory LightGBM
    Datasets (grouped per session when LTR, plain binary otherwise), a dart
    model is trained with early stopping, and its test predictions are
    collected in `score`. The per-split scores are then averaged — raw
    ('prob_direct') and per-split min-max normalised ('prob_norm') — ranked
    per session, written as two solution CSVs and evaluated.
    """
    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    test = train.query('train == 0')
    test_file_key = DSKEY
    ensure_dir(BASE_PATH + SET + 'tmp/')
    # cache the test partition once; later iterations reload it from disk
    test_file = BASE_PATH + SET + 'tmp/' + test_file_key + '_test.fthr'
    if not Path(test_file).is_file():
        test = test.reset_index(drop=True)
        test.to_feather(test_file)
    test_len = len(test)
    del test
    gc.collect()
    train.query('train == 1', inplace=True)
    # session_id is kept alongside the features for per-session grouping
    X = train[FEATURES + ['session_id']]
    y = train['label']
    del train
    gc.collect()
    # score[i] holds split i's predictions for every test example
    score = np.zeros((SPLITS, test_len))
    i = 0
    for train_idx, val_idx in train_test_cv(X,
                                            y,
                                            splits=SPLITS,
                                            shuffle=SHUFFLE):
        X_train = X.loc[train_idx]
        X_valid = X.loc[val_idx]
        y_train = y.loc[train_idx]
        y_valid = y.loc[val_idx]
        if LTR:
            # learning-to-rank: group sizes per session for lambdarank
            q_train = X_train.groupby(['session_id'
                                       ]).size().values.astype(np.float32)
            q_valid = X_valid.groupby(['session_id'
                                       ]).size().values.astype(np.float32)
            xtrain = X_train[FEATURES].values.astype(np.float32)
            ytrain = y_train.values.astype(np.float32)
            del X_train, y_train
            gc.collect()
            d_train = lgbm.Dataset(
                xtrain, label=ytrain, group=q_train,
                feature_name=FEATURES)  #, categorical_feature=CAT_FEATURES )
            del q_train
            gc.collect()
            xval = X_valid[FEATURES].values.astype(np.float32)
            yval = y_valid.values.astype(np.float32)
            del X_valid, y_valid
            gc.collect()
            d_valid = lgbm.Dataset(
                xval, label=yval, group=q_valid,
                feature_name=FEATURES)  #, categorical_feature=CAT_FEATURES )
            del q_valid
            gc.collect()
        else:
            # plain pointwise binary classification datasets
            d_train = lgbm.Dataset(
                X_train[FEATURES], label=y_train, feature_name=FEATURES
            )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
            d_valid = lgbm.Dataset(
                X_valid[FEATURES], label=y_valid, feature_name=FEATURES
            )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
        watchlist = [d_train, d_valid]
        # dart booster; objective/metric depend on the LTR flag
        params = {}
        params['boosting'] = 'dart'
        params['learning_rate'] = 0.1
        if LTR:
            params['application'] = 'lambdarank'
            params['metric'] = 'ndcg'
            params['eval_at'] = '30'
        else:
            params['application'] = 'binary'
            params['metric'] = 'binary_logloss'
        #params['max_depth'] = -1
        #params['num_leaves'] = 64
        #params['max_bin'] = 512
        params['feature_fraction'] = 0.5
        params['bagging_fraction'] = 0.5
        #params['min_data_in_leaf'] = 20
        #params['verbosity'] = 0
        evals_result = {}
        model = lgbm.train(params,
                           train_set=d_train,
                           num_boost_round=MAX_EPOCHS,
                           valid_sets=watchlist,
                           early_stopping_rounds=STOPPING,
                           evals_result=evals_result,
                           verbose_eval=10)
        ensure_dir(BASE_PATH + SET + 'lgbm/')
        model.save_model(
            BASE_PATH + SET + 'lgbm/' + ALGKEY + '.' + str(i) + '.txt',
            num_iteration=model.best_iteration,
        )
        del params, watchlist, d_train, d_valid, evals_result
        gc.collect()
        # score the cached test partition with this split's model
        test = load_feather(test_file)
        X_test = test[FEATURES].values.astype(np.float32)
        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        score[i] = y_test
        i += 1
        del y_test, model, X_test, test
        gc.collect()
    test = load_feather(test_file)
    # average the split scores: raw and per-split min-max normalised
    test['prob_norm'] = 0
    test['prob_direct'] = 0
    for i in range(SPLITS):
        test['prob_direct_' + str(i)] = score[i]
        # NOTE(review): 'prob_norm' + str(i) lacks the underscore that
        # 'prob_direct_' has — written and read consistently below, so
        # behaviour is unaffected; just a naming inconsistency.
        test['prob_norm' + str(i)] = (test['prob_direct_' + str(i)] -
                                      test['prob_direct_' + str(i)].min()) / (
                                          test['prob_direct_' + str(i)].max() -
                                          test['prob_direct_' + str(i)].min())
        test['prob_direct'] += test['prob_direct_' + str(i)]
        test['prob_norm'] += test['prob_norm' + str(i)]
    test['prob_norm'] = test['prob_norm'] / SPLITS
    test['prob_direct'] = test['prob_direct'] / SPLITS
    #truth = pd.read_csv( self.folder + 'truth.csv' )
    #truth['label2'] = 1
    #test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    #test['label'] = test['label2'].fillna(0)
    #del test['label2']
    # rank impressions per session by the normalised score
    test = test.sort_values(['session_id', 'prob_norm'], ascending=False)
    #test.to_csv( BASE_PATH + SET + 'test_debugcv.csv' )
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_norm.apply(list)
    # NOTE(review): result of reset_index is discarded; the merge below
    # joins on the 'session_id' index level.
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_norm.csv')
    # second solution file: ranked by the raw (unnormalised) average score
    test = test.sort_values(['session_id', 'prob_direct'], ascending=False)
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_direct.apply(
        list)
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_direct.csv')
    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
def crawl(folder, url):
    """Crawl item pages and checkpoint the scraped results as pickles.

    Skips items already processed in `folder`, retries transient fetch
    errors with exponential backoff (giving up after RETRY consecutive
    failures), dumps every DUMP_AFTER results to
    '<folder>/from_<first_item>.pkl', and always writes a final checkpoint
    on exit — including after a fatal error, whose traceback is printed.

    Args:
        folder: output directory for the pickle checkpoints.
        url: base URL template passed through to get_item().
    """

    def _checkpoint(result_map, first):
        # Single dump location; `with` closes the handle (the original
        # leaked file objects via pickle.dump(..., open(...))).
        ensure_dir(folder)
        with open(folder + 'from_' + str(first) + '.pkl', 'wb') as fh:
            pickle.dump(result_map, fh)

    items = get_items(BASE_PATH + RAW_FOLDER)
    items_done = set(get_processed(folder))  # set: O(1) membership tests
    items = [x for x in items if x not in items_done]
    if not items:
        # nothing left to crawl; items[0] below would raise IndexError
        return

    result_map = {}
    first = items[0]
    togo = len(items)
    tstart = time.time()
    try:
        retries = RETRY
        dump = DUMP_AFTER
        i = 0
        while i < len(items):
            try:
                item = int(items[i])
                result_map[item] = get_item(url, item)
                i += 1
                retries = RETRY  # reset backoff after a success
                togo -= 1
                dump -= 1
                if dump == 0:
                    # periodic checkpoint, then start a fresh batch
                    _checkpoint(result_map, first)
                    dump = DUMP_AFTER
                    first = item
                    result_map = {}
                if togo % 100 == 0:
                    # progress + ETA estimate from the average time per item
                    spent = time.time() - tstart
                    done = i + 1
                    eta = datetime.timedelta(seconds=spent / done * togo)
                    spent = datetime.timedelta(seconds=spent)
                    print('done {} of {} in {}, {} left'.format(
                        done, len(items), spent, eta))
            except Exception:
                # transient fetch failure: exponential backoff, re-raise
                # once the retry budget is exhausted
                retries -= 1
                print('retries ', retries)
                if retries <= 0:
                    raise
                time.sleep(pow(2, RETRY - retries + 1))
    except Exception:
        _, _, exc_traceback = sys.exc_info()
        print("*** print_tb:")
        traceback.print_tb(exc_traceback)
    finally:
        # one final checkpoint on every exit path (the original dumped the
        # same batch twice when an exception occurred)
        _checkpoint(result_map, first)
def create_latent_factors(full, size=32, actions=None, values=None, key='all',
                          method='bpr'):
    """Factorise the session x item interaction matrix into latent features.

    Filters `full` to the given action types, weights each interaction with
    the matching entry of `values` (keeping the strongest signal per
    session/item pair), builds a sparse item-by-session matrix and
    factorises it. Item and session factors are written to
    DATA_FOLDER/latent/ as '<method>c_<key>_{item,session}_features.<size>.csv'.

    Args:
        full: event log with action_type/reference/session_id/exclude columns.
        size: dimensionality of the latent factors.
        actions: action types to keep; must be provided (parallel to values).
        values: interaction weight per action type, aligned with `actions`.
        key: tag used in the output file names.
        method: 'bpr' or 'als' (implicit) or 'nmf' (sklearn decomposition).
    """
    start = time.time()
    full = full[full.action_type.isin(actions)]
    if IMPRESSION in actions:
        full = extend_clicks(full)
    full = full.drop_duplicates(['session_id', 'reference', 'action_type'],
                                keep='last')
    full = full[~full.reference.isnull() & (full.exclude == 0)]

    # dense 0..n-1 indices for items and sessions
    items = full['reference'].unique()
    item_map = pd.Series(index=items, data=range(len(items)))
    full['item_idx'] = full['reference'].map(item_map)
    sessions = full['session_id'].unique()
    session_map = pd.Series(index=sessions, data=range(len(sessions)))
    full['session_idx'] = full['session_id'].map(session_map)

    # per-action interaction weight; keep the max per (session, item) pair
    full['value'] = 1
    for i, action in enumerate(actions):
        # fix: .loc replaces the .ix indexer, which was removed in pandas 1.0
        full.loc[full.action_type == action, 'value'] = values[i]
    full['value'] = full.groupby(['session_id', 'reference']).value.transform(max)
    full = full.drop_duplicates(['session_id', 'reference', 'action_type'],
                                keep='last')

    # sparse item x session matrix of interaction weights
    SPM = sparse.csr_matrix(
        (full['value'].tolist(), (full.item_idx, full.session_idx)),
        shape=(full.item_idx.nunique(), full.session_idx.nunique()))
    print('created user features in ', (time.time() - start))
    start = time.time()

    # train the model on a sparse matrix of item/session/confidence weights
    if method == 'bpr':
        # size-1 factors — presumably because implicit's BPR appends an
        # extra bias dimension so the total stays `size`; TODO confirm
        model = implicit.bpr.BayesianPersonalizedRanking(factors=size - 1,
                                                         iterations=200,
                                                         use_gpu=False)
        model.fit(SPM)
        If = model.item_factors
        Sf = model.user_factors
    elif method == 'als':
        model = implicit.als.AlternatingLeastSquares(
            factors=size,
            iterations=200,
            use_gpu=False,
            calculate_training_loss=True)
        model.fit(SPM)
        If = model.item_factors
        Sf = model.user_factors
    elif method == 'nmf':
        nmf = dc.NMF(n_components=size,
                     init='random',
                     random_state=0,
                     max_iter=500,
                     verbose=1)
        If = nmf.fit_transform(SPM)
        Sf = nmf.components_.T

    IF = ['if_' + str(i) for i in range(size)]
    SF = ['sf_' + str(i) for i in range(size)]
    Sf = pd.DataFrame(Sf, index=full.session_idx.unique())
    Sf.columns = SF
    Sf['session_id'] = session_map.index
    If = pd.DataFrame(If, index=full.item_idx.unique())
    If.columns = IF
    If['item_id'] = item_map.index

    item_emb = If.sort_values('item_id')[IF].values
    session_emb = Sf.sort_values('session_id')[SF].values

    ensure_dir(DATA_FOLDER + 'latent/')
    If.to_csv(DATA_FOLDER + 'latent/' + method + 'c_' + key +
              '_item_features.' + str(size) + '.csv',
              index=False)
    Sf.to_csv(DATA_FOLDER + 'latent/' + method + 'c_' + key +
              '_session_features.' + str(size) + '.csv',
              index=False)
    print('created latent features in ', (time.time() - start))

    # debug output: reconstruct each observed interaction from the factors.
    # NOTE(review): item_emb/session_emb are sorted by id but indexed with
    # item_idx/session_idx here — only valid when id order matches index
    # order; verify before relying on 'reconst' for anything but eyeballing.
    res = []
    for row in full.itertuples():
        session = session_emb[int(row.session_idx)]
        item = item_emb[row.item_idx]
        res.append(np.dot(item, session.T))
    full['reconst'] = res
    print(full[['item_idx', 'session_idx', 'value', 'reconst']])
def create_features(base_path,
                    log,
                    examples,
                    min_occurences=None,
                    hidden=False,
                    train_only=False,
                    fillna_mean=False):
    """Attach price-based aggregate features to the examples frame.

    Price statistics come from two sources: the expanded click log
    (aggregated per impression/item, city, platform, city+platform and
    per-session impression list via price_by_group_imp) and the full action
    log (per item, platform, city, city+platform via price_by_group_action).

    Args:
        base_path: working directory; a tmp/city_price.csv dump is written here.
        log: full interaction log (one row per action).
        examples: candidate examples frame to be augmented.
        min_occurences: minimum group size, passed through to the helpers.
        hidden: when False, rows with log.hidden >= 1 are masked out.
        train_only: restrict all aggregations to rows with train == 1.
        fillna_mean: passed through — presumably fills missing aggregates
            with the group mean; confirm in the price_by_group_* helpers.

    Returns:
        (examples, new_cols): the augmented frame and the array of column
        names added by this call.
    """
    tstart = time.time()
    print('create_features price')
    # snapshot of columns so newly added ones can be diffed at the end
    cols_pre = examples.columns.values
    # start from all-True masks, then restrict by train / hidden flags
    mask_log = log.hidden > -1
    mask_examples = examples.train > -1
    if train_only:
        mask_log = mask_log & (log.train == 1)
        mask_examples = mask_examples & (examples.train == 1)
    if not hidden:
        mask_log = mask_log & (log.hidden < 1)
    # one row per clicked (session, impression) with its displayed price
    clicks = log[log.action_type == CLICK][[
        'train', 'session_id', 'prices', 'impressions', 'city', 'platform'
    ]].copy()
    clicks = expand(clicks, ['impressions', 'prices'])
    clicks = clicks.drop_duplicates(['session_id', 'impressions'],
                                    keep='last')
    mask_clicks = clicks.train > -1 if not train_only else clicks.train == 1
    ensure_dir(base_path + 'tmp/')
    # debug dump of mean click price per city
    clicks.groupby('city').prices.mean().to_csv(base_path + 'tmp/' +
                                                'city_price.csv')
    # click-log price aggregates over several groupings
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['impressions'],
                                  key='item',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['city'],
                                  key='city',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['platform'],
                                  key='platform',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['city', 'platform'],
                                  key='city_platform',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    # per-session aggregate over the example list itself
    examples = price_by_group_imp(examples,
                                  examples,
                                  mask_examples,
                                  group=['session_id'],
                                  key='list',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    del clicks
    gc.collect()
    # clickprice = pd.DataFrame()
    # clickprice['prices_click'] = examples[(examples.train==1) & (examples.label==1)].groupby( 'session_id' ).prices.min()
    # examples = examples.merge( clickprice, left_on='session_id', right_index=True, how='left' )
    # del clickprice
    print(sum(mask_log))
    # action-log price aggregates; the item grouping maps the log's item_id
    # onto the examples' impressions column via group_examples
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['item_id'],
                                     group_examples=['impressions'],
                                     key='item',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['platform'],
                                     key='platform',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    #examples = price_by_group_action(log, examples, mask_log, group=['device'], key='device', hidden=hidden, min_occurences=min_occurences, fillna_mean=fillna_mean)
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['city'],
                                     key='city',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['city', 'platform'],
                                     key='city_platform',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    # del examples['prices_click']
    new_cols = np.setdiff1d(examples.columns.values, cols_pre)
    print('create_features price in {}s'.format((time.time() - tstart)))
    return examples, new_cols
def main():
    """Train a single LightGBM model (ranking or binary) and score the test set.

    Loads the dataset, optionally restricts features by importance (FS_IMP)
    or a feature-selection routine (FS), splits off a validation set, builds
    LightGBM datasets (session-grouped when LTR), trains a dart booster with
    early stopping, saves the model, predicts the test partition, optionally
    writes stacking probability files (STACK), and finally ranks impressions
    per session into a solution CSV which is evaluated.
    """
    tstart = time.time()
    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    #train = resolve_na(train)
    print('loaded in {}'.format((time.time() - tstart)))
    tstart = time.time()
    #test = train.query('train == 0')
    train.query('train == 1', inplace=True)
    print('split in {}'.format((time.time() - tstart)))
    tstart = time.time()
    # optionally restrict to the top features by exported importance
    if FS_IMP is not None:
        FEATURES_IMP = get_features_by_importance(FS_IMP)
    else:
        FEATURES_IMP = FEATURES
    # warn about duplicated entries in the feature list
    print([item for item, count in Counter(FEATURES).items() if count > 1])
    y = train['label']
    # session_id is kept alongside the features for per-session grouping
    X = train[FEATURES_IMP + ['session_id']]
    #input("Press Enter to continue...")
    print('FEATURES in in {}'.format((time.time() - tstart)))
    tstart = time.time()
    if STACK:
        # keys + impressions retained so train probabilities can be exported
        # for stacking after training
        train_stack = train[[
            'user_id', 'session_id', 'step', 'timestamp', 'impressions'
        ]].copy()
    del train
    gc.collect()
    print('gc collect in in {}'.format((time.time() - tstart)))
    tstart = time.time()
    # optional wrapper feature selection
    if FS != None:
        check_cols(X)
        keep = feature_selection(X[FEATURES_IMP], y, FS)
        KEEP_FEATURES = [FEATURES_IMP[i] for i in keep]
    else:
        KEEP_FEATURES = FEATURES_IMP
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=VALID,
                                                          shuffle=SHUFFLE)
    print('split in in {}'.format((time.time() - tstart)))
    tstart = time.time()
    if LTR:
        # learning-to-rank: group sizes per session for lambdarank
        q_train = X_train.groupby(['session_id'
                                   ]).size().values.astype(np.float32)
        q_valid = X_valid.groupby(['session_id'
                                   ]).size().values.astype(np.float32)
        xtrain = X_train[KEEP_FEATURES].values.astype(np.float32)
        ytrain = y_train.values.astype(np.float32)
        del X_train, y_train
        gc.collect()
        d_train = lgbm.Dataset(
            xtrain, label=ytrain, group=q_train,
            feature_name=KEEP_FEATURES)  #, categorical_feature=CAT_FEATURES )
        del q_train
        gc.collect()
        xval = X_valid[KEEP_FEATURES].values.astype(np.float32)
        yval = y_valid.values.astype(np.float32)
        del X_valid, y_valid
        gc.collect()
        d_valid = lgbm.Dataset(
            xval, label=yval, group=q_valid,
            feature_name=KEEP_FEATURES)  #, categorical_feature=CAT_FEATURES )
        del q_valid
        gc.collect()
    else:
        # plain pointwise binary classification datasets
        xtrain = X_train[KEEP_FEATURES].values.astype(np.float32)
        ytrain = y_train.values.astype(np.float32)
        del X_train, y_train
        gc.collect()
        d_train = lgbm.Dataset(
            xtrain, label=ytrain, feature_name=KEEP_FEATURES
        )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
        del xtrain, ytrain
        gc.collect()
        xval = X_valid[KEEP_FEATURES].values.astype(np.float32)
        yval = y_valid.values.astype(np.float32)
        del X_valid, y_valid
        gc.collect()
        d_valid = lgbm.Dataset(
            xval, label=yval, feature_name=KEEP_FEATURES
        )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
        del xval, yval
        gc.collect()
    print('create sets in {}'.format((time.time() - tstart)))
    tstart = time.time()
    watchlist = [d_train, d_valid]
    # dart booster; objective/metric depend on the LTR flag
    params = {}
    params['boosting'] = 'dart'
    params['learning_rate'] = 0.1
    if LTR:
        params['application'] = 'lambdarank'
        params['metric'] = 'ndcg'
        params['eval_at'] = '30'
        #params['group_column'] = 'name:session_id'
    else:
        params['application'] = 'binary'
        params['metric'] = 'binary_logloss'
    # params['max_depth'] = 34
    # params['num_leaves'] = 234
    # params['max_bin'] = 485
    # params['feature_fraction'] = 0.202505
    # params['bagging_fraction'] = 0.823505
    # params['min_data_in_leaf'] = 15
    params['feature_fraction'] = 0.5
    params['bagging_fraction'] = 0.5
    #params['bagging_freq'] = 5
    #params['verbosity'] = 0
    evals_result = {}
    model = lgbm.train(params,
                       train_set=d_train,
                       num_boost_round=10000,
                       valid_sets=watchlist,
                       early_stopping_rounds=STOPPING,
                       evals_result=evals_result,
                       verbose_eval=10)  #, feval=mrr )
    print('train in in {}'.format((time.time() - tstart)))
    tstart = time.time()
    # ax = lgbm.plot_metric(evals_result, metric='auc')
    # plt.show()
    # only export importances for a full run (no feature selection active)
    export_importance(model,
                      KEEP_FEATURES,
                      export=FS is None and FS_IMP is None)
    ensure_dir(BASE_PATH + SET + 'lgbm/')
    model.save_model(BASE_PATH + SET + 'lgbm/' + ALGKEY + '.txt',
                     num_iteration=model.best_iteration)
    # reload the full set and score the test partition
    test = create_set(base_path=BASE_PATH + SET,
                      conf=CONF,
                      key=DSKEY,
                      redo=False)
    test.query('train == 0', inplace=True)
    X_test = test[KEEP_FEATURES]
    y_test = model.predict(X_test, num_iteration=model.best_iteration)
    print('predict in {}'.format((time.time() - tstart)))
    tstart = time.time()
    test['prob'] = y_test
    if STACK:
        # export test and train probabilities for a stacking ensemble
        test[[
            'user_id', 'session_id', 'step', 'timestamp', 'impressions',
            'prob'
        ]].to_csv(BASE_PATH + '/' + SET + '/stacking/teprobs_' + ALGKEY +
                  '.csv')
        y_pred = model.predict(X[KEEP_FEATURES])
        train_stack['prob'] = y_pred
        train_stack[[
            'user_id', 'session_id', 'step', 'timestamp', 'impressions',
            'prob'
        ]].to_csv(BASE_PATH + '/' + SET + '/stacking/trprobs_' + ALGKEY +
                  '.csv')
    # truth = pd.read_csv( BASE_PATH + SET + 'truth.csv' )
    # truth['label2'] = 1
    # test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    # test['label'] = test['label2'].fillna(0)
    # del test['label2']
    # rank impressions per session by predicted probability
    test = test.sort_values(['session_id', 'prob'], ascending=False)
    # test.to_csv( BASE_PATH + SET + '/test_examine.csv' )
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob.apply(list)
    # NOTE(review): result of reset_index is discarded; the merge below
    # joins on the 'session_id' index level.
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY + '.csv')
    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
def create_latent_factors(full, size=32, actions=ACTIONS_CLICK, key=KEY_CLICK):
    """Learn Doc2Vec session and item embeddings from per-session item sequences.

    Filters the log to the given action types, builds one TaggedDocument per
    session (words = item references, tag = session id), trains a Doc2Vec
    model for ITERATIONS single-epoch passes, and writes session and item
    vectors to DATA_FOLDER/latent/ as
    'd2v_<key>_{session,item}_features.<size>.csv'.

    Args:
        full: event log with action_type/reference/session_id/exclude columns.
        size: embedding dimensionality.
        actions: action types to keep.
        key: tag used in the output file names.
    """
    start = time.time()
    full = full[full.action_type.isin(actions)]
    full = full.drop_duplicates( ['session_id','reference','action_type'], keep='last' )
    full = full[~full.reference.isnull() & (full.exclude == 0)]
    items = set( full.reference.unique() )
    print( len(items) )
    # one ordered reference sequence per session
    lists = pd.DataFrame()
    lists['session_id'] = full.groupby('session_id').session_id.min()
    lists['sequence'] = full.groupby('session_id').reference.apply( list )
    del full
    # gensim documents: words are item ids, the single tag is the session id
    sequences = []
    for row in lists.itertuples():
        props = [str(i) for i in row.sequence]
        sequences.append( TaggedDocument(words=props, tags=[str(row.session_id)]) )
    print( 'created sequences in ',(time.time() - start) )
    start = time.time()
    print('ITEM2VEC FEATURES')
    start = time.time()
    model = Doc2Vec(vector_size=size, window=5, min_count=1, workers=4)
    model.build_vocab(sequences)
    print('vocab build')
    # manual epoch loop to log per-epoch timing
    for i in range(ITERATIONS):
        model.train(sequences, epochs=1, total_examples=model.corpus_count)
        print('trained {} in {}'.format( i, ( time.time() - start ) ))
    # session vectors, one row per session id
    # NOTE(review): model[str(item)] presumably resolves the session tag to
    # its document vector — this is gensim-version-dependent (modern gensim
    # uses model.dv[tag]); verify against the pinned gensim version.
    d = {}
    for item in lists.session_id.values:
        d[str(item)] = model[str(item)]
    frame = pd.DataFrame( d )
    frame = frame.T
    frame.columns = ['sf_'+str(i) for i in range(size)]
    frame['session_id'] = pd.to_numeric( frame.index ).astype(np.int32)
    ensure_dir( DATA_FOLDER + 'latent/' )
    frame.to_csv( DATA_FOLDER + 'latent/' + 'd2v_'+key+'_session_features.'+str(size)+'.csv', index=False)
    # item vectors come from the word-vector side of the model
    d = {}
    for item in items:
        d[str(item)] = model.wv[str(item)]
    frame = pd.DataFrame( d )
    frame = frame.T
    frame.columns = ['if_'+str(i) for i in range(size)]
    frame['item_id'] = pd.to_numeric( frame.index ).astype(np.int32)
    frame.to_csv( DATA_FOLDER + 'latent/' + 'd2v_'+key+'_item_features.'+str(size)+'.csv', index=False)
    print('created latent features in ',(time.time() - start))