import numpy as np
from sklearn.linear_model import LinearRegression

# read_csv, KFolds, scaling, compute_prediction and compute_error are project
# helpers assumed to be defined (or imported) elsewhere in this repository.


def main(path, delimiter, verbose):
    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)

    # selecting headers of interest
    headers = ['id',
               # 'hash_email',
               # 'hash_email_conversion',
               # 'hash_userid',
               'rank',
               'occurrences',
               'lifetime',
               'nb_days',
               'nb_idtags',
               'nb_idtags_site',
               'nb_idtags_media',
               # 'click_rate',
               'nb_purchases',
               'last_time',
               'nb_ips']
    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')

    # K-fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []
    for k in range(nb_folds):
        train, test = fold.get_fold(k)
        train = train.sort_values(by='id')
        test = test.sort_values(by='id')

        # dropping columns that are not useful
        for drop in headers_to_drop:
            train = train.drop(columns=drop)
            test = test.drop(columns=drop)

        # train set: fit the scaling statistics on the training fold
        train, mean, std = scaling(train, headers_to_scale)
        train = train.reset_index(drop=True)
        X = train[headers_to_scale].values
        Y = train['rank'] == 1

        # training the linear regression model
        linreg = LinearRegression(fit_intercept=True)
        linreg.fit(X, Y)

        # computing score on train set
        Y_score_train = linreg.predict(X)
        Y_predicted_train = compute_prediction(train, Y_score_train, verbose)
        missranked_train, wellranked_train, total_train = compute_error(Y, Y_predicted_train)
        missranked_scores_train.append(missranked_train / total_train)

        # test set: reuse the train-fold mean and std
        test = scaling(test, headers_to_scale, mean, std)
        test = test.reset_index(drop=True)
        X_test = test[headers_to_scale].values
        Y_test = test['rank'].values == 1

        # computing score on test set
        Y_score_test = linreg.predict(X_test)
        Y_predicted_test = compute_prediction(test, Y_score_test, verbose)
        missranked_test, wellranked_test, total_test = compute_error(Y_test, Y_predicted_test)
        missranked_scores_test.append(missranked_test / total_test)

        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print(' missranked =', round(missranked_train / total_train, 3))
            print(' wellranked =', round(wellranked_train / total_train, 3))
            print('test set:')
            print(' missranked =', round(missranked_test / total_test, 3))
            print(' wellranked =', round(wellranked_test / total_test, 3))

    # printing final result
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))
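
# For reference, a minimal sketch of the `scaling` helper assumed above,
# inferred only from its call sites (fit mean/std on the train fold, reuse
# them on the test fold). The actual implementation in this project may
# differ; treat this as an illustration rather than the real helper.
def scaling(df, headers, mean=None, std=None):
    """Standardize the given columns; fit the statistics when none are given."""
    df = df.copy()
    if mean is None or std is None:
        mean = df[headers].mean()
        std = df[headers].std().replace(0, 1)  # guard against constant columns
        df[headers] = (df[headers] - mean) / std
        return df, mean, std
    df[headers] = (df[headers] - mean) / std
    return df
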
from sklearn.ensemble import GradientBoostingClassifier


def main(train_path, test_path):
    delimiter = ';'
    verbose = True
    data_frame, all_headers = read_csv(train_path, delimiter, verbose)

    delimiter_test = ','
    data_frame_test, all_headers_test = read_csv(test_path, delimiter_test, verbose)

    headers = [
        'id', 'rank', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time', 'nb_ips'
    ]
    headers_test = [
        'id', 'similarity', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time', 'nb_ips'
    ]
    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_drop_test = list(set(all_headers_test) - set(headers_test))
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')
    headers_to_scale_test = headers_test[:]
    headers_to_scale_test.remove('id')
    headers_to_scale_test.remove('similarity')

    train = data_frame.copy()
    train = train.sort_values(by='id')
    test = data_frame_test.copy()
    test = test.sort_values(by='id')
    for drop in headers_to_drop:
        train = train.drop(columns=drop)
    for drop in headers_to_drop_test:
        test = test.drop(columns=drop)

    train = train.reset_index(drop=True)
    X = train[headers_to_scale].values
    Y = train['rank'] == 1

    # baseline: pick one row at random within each 'id' group, and mark the
    # row(s) with the maximum similarity of the group in 'arg_max'
    similarity = np.array(test.loc[:, 'similarity'])
    baseline = np.zeros(similarity.shape)
    test = test.reset_index(drop=True)
    test['arg_max'] = 0
    old_id = test.loc[0, 'id']
    rows = []
    for row in range(test.shape[0]):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
        else:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)
        if row == test.shape[0] - 1:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

    X_test = test[headers_to_scale].values
    Y_test = test['arg_max']

    # training model
    regularization = 1e10  # defined but not used below
    xgb_reg = GradientBoostingClassifier(loss='exponential',
                                         n_estimators=50,
                                         criterion='friedman_mse',
                                         max_depth=5,
                                         verbose=verbose)
    xgb_reg.fit(X, Y)

    # computing score on train set
    Y_score_train = xgb_reg.predict_proba(X)[:, 1]
    Y_predicted_train = compute_prediction(train, Y_score_train)  # , verbose)
    missranked_train, wellranked_train, total_train = compute_error(Y, Y_predicted_train)

    # printing intermediate results
    print('train set:')
    print(' missranked =', round(missranked_train / total_train, 3))
    print(' wellranked =', round(wellranked_train / total_train, 3))

    # computing score on test set
    Y_score_test = xgb_reg.predict_proba(X_test)[:, 1]
    Y_predicted_test = compute_prediction(test, Y_score_test)  # , verbose)
    missranked_test, wellranked_test, total_test = compute_error(Y_test, Y_predicted_test)
    missranked_test_baseline, wellranked_test_baseline, total_test_baseline = compute_error(Y_test, baseline)

    # printing intermediate results
    print('test set:')
    print(' missranked =', round(missranked_test / total_test, 3))
    print(' wellranked =', round(wellranked_test / total_test, 3))
    print('baseline prediction:')
    print(' missranked =', round(missranked_test_baseline / total_test_baseline, 3))
    print(' wellranked =', round(wellranked_test_baseline / total_test_baseline, 3))

    # the two metrics from the PDF
    # df_2 = data_frame_test.copy()
    old_id = test.loc[0, 'id']
    rows = []
    score_1 = 0
    score_2 = 0
    score_1_baseline = 0
    score_2_baseline = 0
    N = 0
    for row in range(len(Y_predicted_test)):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        else:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            # print(str(N)+' : commande '+str(old_id)+' nb_lignes = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))
            old_id = current_id
            rows = []
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        if row == len(Y_predicted_test) - 1:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            # print(str(N)+' : commande '+str(old_id)+' nb_lignes = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))

    score_1 /= N
    score_2 /= N
    score_1_baseline /= N
    score_2_baseline /= N
    print('score_1 = ' + str(score_1))
    print('score_2 = ' + str(score_2))
    print('score_1_baseline = ' + str(score_1_baseline))
    print('score_2_baseline = ' + str(score_2_baseline))
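
# The two drivers above also rely on `compute_prediction` and `compute_error`,
# which are not shown in this file. The sketches below are assumptions inferred
# only from the call sites (one positive prediction per 'id' group, row-wise
# agreement counts); the real helpers in this project may differ.
def compute_prediction(df, scores, verbose=False):
    """Assumed behavior: flag the highest-scoring row of each 'id' group with 1."""
    prediction = np.zeros(len(df), dtype=int)
    scores = np.asarray(scores)
    for _, group in df.groupby('id'):
        best_row = group.index[np.argmax(scores[group.index])]
        prediction[best_row] = 1
    if verbose:
        print('predicted', int(prediction.sum()), 'positives over', len(df), 'rows')
    return prediction


def compute_error(Y, Y_predicted):
    """Assumed behavior: count row-wise (dis)agreements between labels and predictions."""
    Y = np.asarray(Y).astype(int)
    Y_predicted = np.asarray(Y_predicted).astype(int)
    total = len(Y)
    missranked = int(np.sum(Y != Y_predicted))
    wellranked = total - missranked
    return missranked, wellranked, total


# Hypothetical entry point; the paths below are placeholders, not files
# shipped with this project.
if __name__ == '__main__':
    main('train.csv', 'test.csv')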