Example 1
import numpy as np
import pandas as pd
from utils.kfolds import KFolds, read_csv

# loading the dataframe
path = './data/dataset_augmented.csv'
delimiter = ';'

data_frame, all_headers = read_csv(path, delimiter, verbose=True)

# selecting headers of interest
headers = [  #'id',
    #'hash_email',
    #'hash_email_conversion',
    #'hash_userid',
    #'rank',
    'occurrences',
    'lifetime',  # too biased?
    'nb_days',
    'nb_idtags',
    'nb_idtags_site',
    'nb_idtags_media',
    #'click_rate',
    'nb_purchases',
    'last_time',
    'nb_ips'
]

# standardizing the features before computing the covariance matrix
values = data_frame[headers].values
values = (values - np.mean(values, axis=0)) / np.std(values, axis=0)
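
# A minimal sketch of the covariance step announced above, assuming the
# standardized `values` array; np.cov with rowvar=False treats each column
# (feature) as a variable.
cov_matrix = np.cov(values, rowvar=False)
eig_values, eig_vectors = np.linalg.eigh(cov_matrix)  # eigendecomposition of the symmetric covariance matrix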
Example 2
import numpy as np
from sklearn.linear_model import LinearRegression
from utils.kfolds import KFolds, read_csv
# scaling, compute_prediction and compute_error are assumed to be helper
# functions defined alongside this script; they are not shown here.


def main(path, delimiter, verbose):
	# loading the dataframe
	data_frame, all_headers = read_csv(path, delimiter, verbose)

	# selecting headers of interest
	headers = ['id',
		#'hash_email', 
		#'hash_email_conversion', 
		#'hash_userid', 
		'rank', 
		'occurrences', 
		'lifetime', 
		'nb_days', 
		'nb_idtags', 
		'nb_idtags_site', 
		'nb_idtags_media', 
		#'click_rate', 
		'nb_purchases', 
		'last_time', 
		'nb_ips']
	headers_to_drop = list(set(all_headers) - set(headers))
	headers_to_scale = headers[:]
	headers_to_scale.remove('id')
	headers_to_scale.remove('rank')

	# K-Fold cross-validation
	nb_folds = 10
	fold = KFolds(data_frame, nb_folds)
	missranked_scores_train = []
	missranked_scores_test = []

	for k in range(nb_folds):
		train, test = fold.get_fold(k)
		train = train.sort_values(by='id')
		test = test.sort_values(by='id')

		# dropping columns that are not useful
		for drop in headers_to_drop:
			train = train.drop(columns=drop)
			test = test.drop(columns=drop)

		# train set
		train, mean, std = scaling(train, headers_to_scale)
		train = train.reset_index(drop=True)
		X = train[headers_to_scale].values
		Y = train['rank'] == 1

		# training a linear regression model (its output is used as a ranking score)
		linreg = LinearRegression(fit_intercept=True)
		linreg.fit(X, Y)

		# computing score on train set
		Y_score_train = linreg.predict(X)
		Y_predicted_train = compute_prediction(train, Y_score_train, verbose)
		missranked_train, wellranked_train, total_train = compute_error(Y, Y_predicted_train)
		missranked_scores_train.append(missranked_train/total_train)

		# test set
		test = scaling(test, headers_to_scale, mean, std)
		test = test.reset_index(drop=True)
		X_test = test[headers_to_scale].values
		Y_test = test['rank'].values == 1

		# computing score on test set
		Y_score_test = linreg.predict(X_test)
		Y_predicted_test = compute_prediction(test, Y_score_test, verbose)
		missranked_test, wellranked_test, total_test = compute_error(Y_test, Y_predicted_test)
		missranked_scores_test.append(missranked_test/total_test)

		# printing intermediate results
		if verbose:
			print('\n**** fold ', k, '****')
			print('train set:')
			print('   missranked =', round(missranked_train/total_train, 3))
			print('   wellranked =', round(wellranked_train/total_train, 3))
			print('test set:')
			print('   missranked =', round(missranked_test/total_test, 3))
			print('   wellranked =', round(wellranked_test/total_test, 3))

	# printing final result
	if verbose:
		print('\n******** MEAN over all folds ********')
		print('Train missranked = ', np.mean(missranked_scores_train))
		print(' Test missranked = ', np.mean(missranked_scores_test))
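
# A hedged usage sketch for this entry point, reusing the dataset path and
# delimiter from Example 1 (the argument values are illustrative):
if __name__ == '__main__':
	main('./data/dataset_augmented.csv', ';', verbose=True)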
Example 3
def main(train_path, test_path):
    delimiter = ';'
    verbose = True
    data_frame, all_headers = read_csv(train_path, delimiter, verbose)

    delimiter_test = ','
    verbose = True
    data_frame_test, all_headers_test = read_csv(test_path, delimiter_test,
                                                 verbose)

    headers = [
        'id', 'rank', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time',
        'nb_ips'
    ]

    headers_test = [
        'id', 'similarity', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time',
        'nb_ips'
    ]

    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_drop_test = list(set(all_headers_test) - set(headers_test))

    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')

    headers_to_scale_test = headers_test[:]
    headers_to_scale_test.remove('id')
    headers_to_scale_test.remove('similarity')

    train = data_frame.copy()
    train = train.sort_values(by='id')

    test = data_frame_test.copy()
    test = test.sort_values(by='id')

    for drop in headers_to_drop:
        train = train.drop(columns=drop)

    for drop in headers_to_drop_test:
        test = test.drop(columns=drop)

    train = train.reset_index(drop=True)
    X_train = train[headers_to_scale].values
    Y_train = train[['rank', 'id']].values

    similarity = np.array(test.loc[:, 'similarity'])
    baseline = np.zeros(similarity.shape)

    test = test.reset_index(drop=True)
    test['arg_max'] = 0

    old_id = test.loc[0, 'id']
    rows = []
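    # For each block of rows sharing the same 'id', the loop below flags one
    # random row per block as the baseline prediction and marks the row(s)
    # with the highest similarity in 'arg_max' (1 = best match, 2 = other rows).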

    for row in range(test.shape[0]):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
        else:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = 2 - (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

        if row == test.shape[0] - 1:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = 2 - (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

    X_test = test[headers_to_scale].values
    Y_test = test[['arg_max', 'id']].values

    # training model
    rank_svm = RankSVM()
    rank_svm = rank_svm.fit(X_train, Y_train)

    # =============================================================================
    #     # The following part is commented in order to decrease the execution time
    #     # computing score on train set
    #     missranked_score_train = 1 - rank_svm.scoreId(X_train, Y_train)
    #
    #     # printing intermediate results
    #     print('train set:')
    #     print('   missranked =', round(missranked_score_train, 3))
    #     print('   wellranked =', round(1 - missranked_score_train, 3))
    # =============================================================================
    print('Training finished')

    # computing score on test set
    Y_predicted_test = rank_svm.predictId(X_test, Y_test)
    missranked_test, wellranked_test, total_test = compute_error(
        Y_test[:, 0], Y_predicted_test)

    # testing baseline
    missranked_test_baseline, wellranked_test_baseline, total_test_baseline = compute_error(
        Y_test[:, 0], baseline)

    # printing intermediate results
    print('test set:')
    print('   missranked =', round(missranked_test / total_test, 3))
    print('   wellranked =', round(wellranked_test / total_test, 3))
    print('baseline prediction:')
    print('   missranked =',
          round(missranked_test_baseline / total_test_baseline, 3))
    print('   wellranked =',
          round(wellranked_test_baseline / total_test_baseline, 3))

    # The two metrics from the PDF

    #df_2 = data_frame_test.copy()

    old_id = test.loc[0, 'id']
    rows = []
    score_1 = 0
    score_2 = 0
    score_1_baseline = 0
    score_2_baseline = 0
    N = 0
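    # score_1 sums, per 'id' group, the gap between the group's best similarity
    # and the similarity of the predicted row; score_2 sums their product.
    # Both are divided below by N, the number of rows flagged as predictions.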

    for row in range(len(Y_predicted_test)):
        current_id = test.loc[row, 'id']

        if current_id == old_id:
            rows.append(row)

            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1

            if baseline[row] == 1:
                prediction_baseline = row
        else:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N)+' : order '+str(old_id)+' nb_rows = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))

            old_id = current_id
            rows = []
            rows.append(row)

            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1

            if baseline[row] == 1:
                prediction_baseline = row

        if row == len(Y_predicted_test) - 1:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N)+' : order '+str(old_id)+' nb_rows = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))

    score_1 /= N
    score_2 /= N
    score_1_baseline /= N
    score_2_baseline /= N

    print('score_1 = ' + str(score_1))
    print('score_2 = ' + str(score_2))
    print('score_1_baseline = ' + str(score_1_baseline))
    print('score_2_baseline = ' + str(score_2_baseline))
Example 4
def main(path, delimiter, score, threshold, verbose):
    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)
    
    # selecting headers of interest
    headers = ['id',
        #'hash_email', 
        #'hash_email_conversion', 
        #'hash_userid', 
        'rank', 
        'occurrences', 
        'lifetime', 
        'nb_days', 
        'nb_idtags', 
        'nb_idtags_site', 
        'nb_idtags_media', 
        #'click_rate', 
        'nb_purchases', 
        'last_time', 
        'nb_ips']
    
    headers_to_drop = list(set(all_headers) - set(headers))
    
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')
    
    # K-Fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []
    
    for k in range(nb_folds):
        # recover the train and the test set
        train, test = fold.get_fold(k)
        train = train.sort_values(by='id')
        test = test.sort_values(by='id')
    
        # dropping columns that are not useful
        for drop in headers_to_drop:
            train = train.drop(columns=drop)
            test = test.drop(columns=drop)
        
        # split the feature and the [rank, id]
        X_train = train[headers_to_scale].values
        Y_train = train[['rank', 'id']].values
        
        X_test = test[headers_to_scale].values
        Y_test = test[['rank', 'id']].values
        
        # Create our model
        rank_svm = RankSVM()
        
        # Fit our model with the train set
        rank_svm = rank_svm.fit(X_train, Y_train)
        
        if score == 'inversion':
            # Compute the missranked score for the train set
            missranked_score_train = 1 - rank_svm.scoreInversion(X_train, Y_train)
            missranked_scores_train.append(missranked_score_train)
            
            # Compute the missranked score for the test set
            missranked_score_test = 1 - rank_svm.scoreInversion(X_test, Y_test)
            missranked_scores_test.append(missranked_score_test)
        elif score == 'thresholdId':
            # Compute the missranked score for the train set
            missranked_score_train = 1 - rank_svm.scoreThresholdId(X_train, Y_train, threshold)
            missranked_scores_train.append(missranked_score_train)
            
            # Compute the missranked score for the test set
            missranked_score_test = 1 - rank_svm.scoreThresholdId(X_test, Y_test, threshold)
            missranked_scores_test.append(missranked_score_test)
        elif score == 'id':
            # Compute the missranked score for the train set
            missranked_score_train = 1 - rank_svm.scoreId(X_train, Y_train)
            missranked_scores_train.append(missranked_score_train)
            
            # Compute the missranked score for the test set
            missranked_score_test = 1 - rank_svm.scoreId(X_test, Y_test)
            missranked_scores_test.append(missranked_score_test)
        else:
            print('Not a valid score method')
            return
        
        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print('   missranked =', round(missranked_score_train, 3))
            print('   wellranked =', round(1 - missranked_score_train, 3))
            print('test set:')
            print('   missranked =', round(missranked_score_test, 3))
            print('   wellranked =', round(1 - missranked_score_test, 3))

    # print the final results
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))
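
# A hedged usage sketch for this entry point: the path and delimiter mirror
# Example 1, the score method is one of the options handled above, and the
# threshold value is purely illustrative.
if __name__ == '__main__':
    main('./data/dataset_augmented.csv', ';', score='thresholdId', threshold=0.5, verbose=True)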
Example 5
def main(train_path, test_path):

    delimiter = ';'
    verbose = True
    data_frame, all_headers = read_csv(train_path, delimiter, verbose)

    delimiter_test = ','
    verbose = True
    data_frame_test, all_headers_test = read_csv(test_path, delimiter_test,
                                                 verbose)

    headers = [
        'id', 'rank', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time',
        'nb_ips'
    ]

    headers_test = [
        'id', 'similarity', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time',
        'nb_ips'
    ]

    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_drop_test = list(set(all_headers_test) - set(headers_test))

    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')

    headers_to_scale_test = headers_test[:]
    headers_to_scale_test.remove('id')
    headers_to_scale_test.remove('similarity')

    train = data_frame.copy()
    train = train.sort_values(by='id')

    test = data_frame_test.copy()
    test = test.sort_values(by='id')

    for drop in headers_to_drop:
        train = train.drop(columns=drop)

    for drop in headers_to_drop_test:
        test = test.drop(columns=drop)

    train = train.reset_index(drop=True)
    X = train[headers_to_scale].values
    Y = train['rank'] == 1

    similarity = np.array(test.loc[:, 'similarity'])
    baseline = np.zeros(similarity.shape)

    test = test.reset_index(drop=True)
    test['arg_max'] = 0

    old_id = test.loc[0, 'id']
    rows = []
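    # Same per-'id' grouping as in Example 3, except that 'arg_max' is stored
    # as a boolean here (True for the row(s) with the highest similarity);
    # one random row per group is flagged in 'baseline'.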

    for row in range(test.shape[0]):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
        else:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

        if row == test.shape[0] - 1:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

    X_test = test[headers_to_scale].values
    Y_test = test['arg_max']

    # training the model (sklearn's GradientBoostingClassifier, despite the xgb_reg name)
    regularization = 1e10  # note: defined but not used by the classifier below
    xgb_reg = GradientBoostingClassifier(loss='exponential',
                                         n_estimators=50,
                                         criterion='friedman_mse',
                                         max_depth=5,
                                         verbose=verbose)
    xgb_reg.fit(X, Y)

    # computing score on train set
    Y_score_train = xgb_reg.predict_proba(X)[:, 1]
    Y_predicted_train = compute_prediction(train, Y_score_train)  #, verbose)
    missranked_train, wellranked_train, total_train = compute_error(
        Y, Y_predicted_train)

    # printing intermediate results
    print('train set:')
    print('   missranked =', round(missranked_train / total_train, 3))
    print('   wellranked =', round(wellranked_train / total_train, 3))

    #
    # computing score on test set
    Y_score_test = xgb_reg.predict_proba(X_test)[:, 1]
    Y_predicted_test = compute_prediction(test, Y_score_test)  #, verbose)
    missranked_test, wellranked_test, total_test = compute_error(
        Y_test, Y_predicted_test)
    missranked_test_baseline, wellranked_test_baseline, total_test_baseline = compute_error(
        Y_test, baseline)

    # printing intermediate results
    print('test set:')
    print('   missranked =', round(missranked_test / total_test, 3))
    print('   wellranked =', round(wellranked_test / total_test, 3))
    print('baseline prediction:')
    print('   missranked =',
          round(missranked_test_baseline / total_test_baseline, 3))
    print('   wellranked =',
          round(wellranked_test_baseline / total_test_baseline, 3))

    # The two metrics from the PDF

    #df_2 = data_frame_test.copy()

    old_id = test.loc[0, 'id']
    rows = []
    score_1 = 0
    score_2 = 0
    score_1_baseline = 0
    score_2_baseline = 0
    N = 0
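    # Same two similarity-based metrics as in Example 3: score_1 sums the gap
    # to the group's best similarity, score_2 sums the product with it; both
    # are also computed for the random baseline and averaged over N predictions.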

    for row in range(len(Y_predicted_test)):
        current_id = test.loc[row, 'id']

        if current_id == old_id:
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        else:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N)+' : order '+str(old_id)+' nb_rows = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))
            old_id = current_id
            rows = []
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row

        if row == len(Y_predicted_test) - 1:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N)+' : order '+str(old_id)+' nb_rows = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))

    score_1 /= N
    score_2 /= N
    score_1_baseline /= N
    score_2_baseline /= N

    print('score_1 = ' + str(score_1))
    print('score_2 = ' + str(score_2))
    print('score_1_baseline = ' + str(score_1_baseline))
    print('score_2_baseline = ' + str(score_2_baseline))