Example 1
import numpy as np
from sklearn.linear_model import LinearRegression

# read_csv, KFolds, scaling, compute_prediction and compute_error are
# project-local helpers (see the sketch after this example).

def main(path, delimiter, verbose):
	# loading the dataframe
	data_frame, all_headers = read_csv(path, delimiter, verbose)

	# selecting headers of interest
	headers = ['id',
		#'hash_email', 
		#'hash_email_conversion', 
		#'hash_userid', 
		'rank', 
		'occurrences', 
		'lifetime', 
		'nb_days', 
		'nb_idtags', 
		'nb_idtags_site', 
		'nb_idtags_media', 
		#'click_rate', 
		'nb_purchases', 
		'last_time', 
		'nb_ips']
	headers_to_drop = list(set(all_headers) - set(headers))
	headers_to_scale = headers[:]
	headers_to_scale.remove('id')
	headers_to_scale.remove('rank')

	# K-Fold cross-validation
	nb_folds = 10
	fold = KFolds(data_frame, nb_folds)
	missranked_scores_train = []
	missranked_scores_test = []

	for k in range(nb_folds):
		train, test = fold.get_fold(k)
		train = train.sort_values(by='id')
		test = test.sort_values(by='id')

		# dropping unused columns
		for drop in headers_to_drop:
			train = train.drop(drop, axis=1)
			test = test.drop(drop, axis=1)

		# train set
		train, mean, std = scaling(train, headers_to_scale)
		train = train.reset_index(drop=True)
		X = train[headers_to_scale].values
		Y = train['rank']==1

		# training linear regression model
		linreg = LinearRegression(fit_intercept=True)
		linreg.fit(X, Y)

		# computing score on train set
		Y_score_train = linreg.predict(X)
		Y_predicted_train = compute_prediction(train, Y_score_train, verbose)
		missranked_train, wellranked_train, total_train = compute_error(Y, Y_predicted_train)
		missranked_scores_train.append(missranked_train/total_train)

		# test set
		test = scaling(test, headers_to_scale, mean, std)
		test = test.reset_index(drop=True)
		X_test = test[headers_to_scale].values
		Y_test = test['rank'].values==1

		# computing score on test set
		Y_score_test = linreg.predict(X_test)
		Y_predicted_test = compute_prediction(test, Y_score_test, verbose)
		missranked_test, wellranked_test, total_test = compute_error(Y_test, Y_predicted_test)
		missranked_scores_test.append(missranked_test/total_test)

		# printing intermediate results
		if verbose:
			print('\n**** fold ', k, '****')
			print('train set:')
			print('   missranked =', round(missranked_train/total_train, 3))
			print('   wellranked =', round(wellranked_train/total_train, 3))
			print('test set:')
			print('   missranked =', round(missranked_test/total_test, 3))
			print('   wellranked =', round(wellranked_test/total_test, 3))

	# printing final result
	if verbose:
		print('\n******** MEAN over all folds ********')
		print('Train missranked = ', np.mean(missranked_scores_train))
		print(' Test missranked = ', np.mean(missranked_scores_test))
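
Example 1 relies on several project-local helpers (read_csv, KFolds, scaling, compute_prediction, compute_error) whose definitions are not shown. Below is a minimal, hypothetical sketch of what they might look like, inferred only from how they are called above; in particular, compute_prediction is a guess that the highest-scoring row within each id group is predicted as rank 1. The actual implementations may differ.

# Hypothetical sketches of the project-local helpers, inferred from how
# they are called in main(); the real implementations may differ.
import numpy as np
import pandas as pd


def read_csv(path, delimiter, verbose=False):
    """Loads the CSV and returns (data_frame, list_of_column_names)."""
    df = pd.read_csv(path, sep=delimiter)
    if verbose:
        print('loaded', len(df), 'rows from', path)
    return df, list(df.columns)


class KFolds:
    """Shuffles the rows once and exposes get_fold(k) -> (train, test)."""
    def __init__(self, data_frame, nb_folds):
        self.df = data_frame.sample(frac=1, random_state=0)
        self.parts = np.array_split(np.arange(len(self.df)), nb_folds)

    def get_fold(self, k):
        test = self.df.iloc[self.parts[k]]
        train = self.df.iloc[np.concatenate(self.parts[:k] + self.parts[k + 1:])]
        return train, test


def scaling(df, headers, mean=None, std=None):
    """Standardises the given columns. Returns (df, mean, std) when fitting,
    or just the scaled frame when mean/std are passed in (the test-set case)."""
    fitting = mean is None
    if fitting:
        mean, std = df[headers].mean(), df[headers].std()
    df = df.copy()
    df[headers] = (df[headers] - mean) / std
    return (df, mean, std) if fitting else df


def compute_prediction(df, scores, verbose=False):
    """Guess at the original logic: within each 'id' group, the row with the
    highest regression score is predicted as the rank-1 record."""
    scored = df[['id']].copy()
    scored['score'] = scores
    best = scored.groupby('id')['score'].transform('max')
    return (scored['score'] == best).values


def compute_error(Y_true, Y_predicted):
    """Returns (missranked, wellranked, total) counts."""
    Y_true = np.asarray(Y_true)
    Y_predicted = np.asarray(Y_predicted)
    wellranked = int((Y_true == Y_predicted).sum())
    return len(Y_true) - wellranked, wellranked, len(Y_true)
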
Example 2
import numpy as np

# read_csv, KFolds and RankSVM are project-local helpers (a hypothetical
# RankSVM sketch follows this example).

def main(path, delimiter, score, threshold, verbose):
    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)
    
    # selecting headers of interest
    headers = ['id',
        #'hash_email', 
        #'hash_email_conversion', 
        #'hash_userid', 
        'rank', 
        'occurrences', 
        'lifetime', 
        'nb_days', 
        'nb_idtags', 
        'nb_idtags_site', 
        'nb_idtags_media', 
        #'click_rate', 
        'nb_purchases', 
        'last_time', 
        'nb_ips']
    
    headers_to_drop = list(set(all_headers) - set(headers))
    
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')
    
    # K-Fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []
    
    for k in range(nb_folds):
        # retrieve the train and test sets for this fold
        train, test = fold.get_fold(k)
        train = train.sort_values(by='id')
        test = test.sort_values(by='id')
    
        # dropping unused columns
        for drop in headers_to_drop:
            train = train.drop(drop, axis=1)
            test = test.drop(drop, axis=1)
        
        # split the features from the [rank, id] columns
        X_train = train[headers_to_scale].values
        Y_train = train[['rank', 'id']].values
        
        X_test = test[headers_to_scale].values
        Y_test = test[['rank', 'id']].values
        
        # Create our model
        rank_svm = RankSVM()
        
        # Fit the model on the train set
        rank_svm = rank_svm.fit(X_train, Y_train)
        
        if score == 'inversion':
            # Compute the missranked score for the train set
            missranked_score_train = 1 - rank_svm.scoreInversion(X_train, Y_train)
            missranked_scores_train.append(missranked_score_train)
            
            # Compute the missranked score for the test set
            missranked_score_test = 1 - rank_svm.scoreInversion(X_test, Y_test)
            missranked_scores_test.append(missranked_score_test)
        elif score == 'thresholdId':
            # Compute the missranked score for the train set
            missranked_score_train = 1 - rank_svm.scoreThresholdId(X_train, Y_train, threshold)
            missranked_scores_train.append(missranked_score_train)
            
            # Compute the missranked score for the test set
            missranked_score_test = 1 - rank_svm.scoreThresholdId(X_test, Y_test, threshold)
            missranked_scores_test.append(missranked_score_test)
        elif score == 'id':
            # Compute the missranked score for the train set
            missranked_score_train = 1 - rank_svm.scoreId(X_train, Y_train)
            missranked_scores_train.append(missranked_score_train)
            
            # Compute the missranked score for the test set
            missranked_score_test = 1 - rank_svm.scoreId(X_test, Y_test)
            missranked_scores_test.append(missranked_score_test)
        else:
            print('Not a valid score method')
            return
        
        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print('   missranked =', round(missranked_score_train, 3))
            print('   wellranked =', round(1 - missranked_score_train, 3))
            print('test set:')
            print('   missranked =', round(missranked_score_test, 3))
            print('   wellranked =', round(1 - missranked_score_test, 3))

    # print the final results
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))
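
The RankSVM class used in Example 2 is also defined elsewhere. As a rough sketch only, the following hypothetical implementation follows the common pairwise formulation (difference vectors between rows that share an id, fitted with a linear SVM) and implements scoreInversion as the fraction of correctly ordered within-id pairs; scoreThresholdId and scoreId are omitted, and the real class may differ.

# Hypothetical RankSVM sketch: pairwise-difference transform + linear SVM.
# Assumes a lower rank value means a better position (rank 1 is best),
# consistent with the 'rank' == 1 target used in Example 1.
import itertools

import numpy as np
from sklearn.svm import LinearSVC


class RankSVM:
    def __init__(self):
        self.svm = LinearSVC()

    def _pairwise(self, X, Y):
        """Builds difference vectors for every pair of rows sharing an id.
        Y is an array of [rank, id] rows, as built in main()."""
        X_diff, y_diff = [], []
        ranks, ids = Y[:, 0], Y[:, 1]
        for i, j in itertools.combinations(range(len(Y)), 2):
            if ids[i] != ids[j] or ranks[i] == ranks[j]:
                continue
            X_diff.append(X[i] - X[j])
            # +1 when row i outranks row j (lower rank value is better)
            y_diff.append(1 if ranks[i] < ranks[j] else -1)
        return np.asarray(X_diff), np.asarray(y_diff)

    def fit(self, X, Y):
        X_diff, y_diff = self._pairwise(X, Y)
        self.svm.fit(X_diff, y_diff)
        return self

    def scoreInversion(self, X, Y):
        """Fraction of within-id pairs whose order the model reproduces."""
        X_diff, y_diff = self._pairwise(X, Y)
        margins = X_diff @ self.svm.coef_.ravel()
        return float(np.mean(np.sign(margins) == y_diff))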