def main(path, delimiter, verbose):
    """Cross-validate a linear regression on the CSV file at ``path``.

    The data is loaded with ``read_csv`` and split into 10 folds by
    ``KFolds``. For every fold the columns outside the headers of interest
    are dropped, the feature columns are scaled with ``scaling``, a
    ``LinearRegression`` is fitted on the boolean target ``rank == 1`` and
    the proportion of miss-ranked rows is computed with
    ``compute_prediction`` / ``compute_error`` on both the train and the
    test split.

    Args:
        path: Location of the CSV file, forwarded to ``read_csv``.
        delimiter: Field delimiter, forwarded to ``read_csv``.
        verbose: When truthy, print the per-fold rates and the mean
            miss-ranked rate over all folds. Also forwarded to
            ``read_csv`` and ``compute_prediction``.

    Returns:
        None. Results are only reported through ``print``.
    """
    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)

    # selecting headers of interest
    headers = ['id',
               #'hash_email',
               #'hash_email_conversion',
               #'hash_userid',
               'rank',
               'occurrences',
               'lifetime',
               'nb_days',
               'nb_idtags',
               'nb_idtags_site',
               'nb_idtags_media',
               #'click_rate',
               'nb_purchases',
               'last_time',
               'nb_ips']
    headers_to_drop = list(set(all_headers) - set(headers))
    # 'id' and 'rank' are kept in the frames but are not features.
    headers_to_scale = [h for h in headers if h not in ('id', 'rank')]

    # K-Fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []

    for k in range(nb_folds):
        train, test = fold.get_fold(k)

        # Sort by id, then drop the unused columns in a single call.
        # ``axis`` is passed by keyword: the positional form
        # ``drop(label, 1)`` is rejected by recent pandas releases.
        train = train.sort_values(by='id').drop(headers_to_drop, axis=1)
        test = test.sort_values(by='id').drop(headers_to_drop, axis=1)

        # train set: called without statistics, scaling() also returns the
        # mean and std that are reused below for the test set.
        train, mean, std = scaling(train, headers_to_scale)
        train = train.reset_index(drop=True)
        X = train[headers_to_scale].values
        Y = train['rank'] == 1

        # training the linear regression model
        # NOTE(review): assumes this is scikit-learn's LinearRegression; its
        # ``normalize`` argument (previously passed as False) no longer
        # exists in recent releases, and omitting it keeps the same
        # non-normalising behaviour.
        linreg = LinearRegression(fit_intercept=True)
        linreg.fit(X, Y)

        # computing score on train set
        Y_score_train = linreg.predict(X)
        Y_predicted_train = compute_prediction(train, Y_score_train, verbose)
        missranked_train, wellranked_train, total_train = compute_error(
            Y, Y_predicted_train)
        missranked_rate_train = missranked_train / total_train
        missranked_scores_train.append(missranked_rate_train)

        # test set, scaled with the statistics of the train set
        test = scaling(test, headers_to_scale, mean, std)
        test = test.reset_index(drop=True)
        X_test = test[headers_to_scale].values
        Y_test = test['rank'].values == 1

        # computing score on test set
        Y_score_test = linreg.predict(X_test)
        Y_predicted_test = compute_prediction(test, Y_score_test, verbose)
        missranked_test, wellranked_test, total_test = compute_error(
            Y_test, Y_predicted_test)
        missranked_rate_test = missranked_test / total_test
        missranked_scores_test.append(missranked_rate_test)

        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print(' missranked =', round(missranked_rate_train, 3))
            print(' wellranked =', round(wellranked_train / total_train, 3))
            print('test set:')
            print(' missranked =', round(missranked_rate_test, 3))
            print(' wellranked =', round(wellranked_test / total_test, 3))

    # printing final result
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))
def _rank_svm_missranked_score(model, features, targets, score, threshold):
    """Return ``1 - <model score>`` for the scoring method named by ``score``.

    ``threshold`` is only forwarded to ``scoreThresholdId``.

    Raises:
        ValueError: if ``score`` is not one of the supported method names.
    """
    if score == 'inversion':
        return 1 - model.scoreInversion(features, targets)
    if score == 'thresholdId':
        return 1 - model.scoreThresholdId(features, targets, threshold)
    if score == 'id':
        return 1 - model.scoreId(features, targets)
    raise ValueError('Not a valid score method')


def main(path, delimiter, score, threshold, verbose):
    """Cross-validate a RankSVM on the CSV file at ``path``.

    The data is loaded with ``read_csv`` and split into 10 folds by
    ``KFolds``. For every fold the columns outside the headers of interest
    are dropped, a ``RankSVM`` is fitted on the train split and the
    miss-ranked score (``1 -`` the model's score) is computed on both the
    train and the test split.

    Args:
        path: Location of the CSV file, forwarded to ``read_csv``.
        delimiter: Field delimiter, forwarded to ``read_csv``.
        score: Scoring method: ``'inversion'``, ``'thresholdId'`` or
            ``'id'``. Any other value prints ``'Not a valid score method'``
            and returns immediately, before any data is loaded.
        threshold: Extra argument passed to ``scoreThresholdId``; only used
            when ``score == 'thresholdId'``.
        verbose: When truthy, print the per-fold scores and the mean
            miss-ranked score over all folds. Also forwarded to
            ``read_csv``.

    Returns:
        None. Results are only reported through ``print``.
    """
    # Validate the scoring method first, so a bad value is reported before
    # the CSV is loaded and a model is fitted on the first fold.
    if score not in ('inversion', 'thresholdId', 'id'):
        print('Not a valid score method')
        return

    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)

    # selecting headers of interest
    headers = ['id',
               #'hash_email',
               #'hash_email_conversion',
               #'hash_userid',
               'rank',
               'occurrences',
               'lifetime',
               'nb_days',
               'nb_idtags',
               'nb_idtags_site',
               'nb_idtags_media',
               #'click_rate',
               'nb_purchases',
               'last_time',
               'nb_ips']
    headers_to_drop = list(set(all_headers) - set(headers))
    # 'id' and 'rank' are kept in the frames but are not features.
    headers_to_scale = [h for h in headers if h not in ('id', 'rank')]

    # K-Fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []

    for k in range(nb_folds):
        # recover the train and the test set
        train, test = fold.get_fold(k)

        # Sort by id, then drop the unused columns in a single call.
        # ``axis`` is passed by keyword: the positional form
        # ``drop(label, 1)`` is rejected by recent pandas releases.
        train = train.sort_values(by='id').drop(headers_to_drop, axis=1)
        test = test.sort_values(by='id').drop(headers_to_drop, axis=1)

        # split the feature and the [rank, id]
        X_train = train[headers_to_scale].values
        Y_train = train[['rank', 'id']].values
        X_test = test[headers_to_scale].values
        Y_test = test[['rank', 'id']].values

        # Create our model and fit it with the train set
        rank_svm = RankSVM()
        rank_svm = rank_svm.fit(X_train, Y_train)

        # Compute the missranked score for the train set
        missranked_score_train = _rank_svm_missranked_score(
            rank_svm, X_train, Y_train, score, threshold)
        missranked_scores_train.append(missranked_score_train)

        # Compute the missranked score for the test set
        missranked_score_test = _rank_svm_missranked_score(
            rank_svm, X_test, Y_test, score, threshold)
        missranked_scores_test.append(missranked_score_test)

        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print(' missranked =', round(missranked_score_train, 3))
            print(' wellranked =', round(1 - missranked_score_train, 3))
            print('test set:')
            print(' missranked =', round(missranked_score_test, 3))
            print(' wellranked =', round(1 - missranked_score_test, 3))

    # print the final results
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))