Example 1
# Imports for the snippet below; read_csv, KFolds, scaling, compute_prediction
# and compute_error are project helpers assumed to be defined elsewhere.
import numpy as np
from sklearn.linear_model import LinearRegression

def main(path, delimiter, verbose):
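	"""Run 10-fold cross-validation of a linear-regression scorer on the CSV
	at `path` and print the mean miss-ranked rate on the train and test folds."""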
	# loading the dataframe
	data_frame, all_headers = read_csv(path, delimiter, verbose)

	# selecting headers of interest
	headers = ['id',
		#'hash_email', 
		#'hash_email_conversion', 
		#'hash_userid', 
		'rank', 
		'occurrences', 
		'lifetime', 
		'nb_days', 
		'nb_idtags', 
		'nb_idtags_site', 
		'nb_idtags_media', 
		#'click_rate', 
		'nb_purchases', 
		'last_time', 
		'nb_ips']
	headers_to_drop = list(set(all_headers) - set(headers))
	headers_to_scale = headers[:]
	headers_to_scale.remove('id')
	headers_to_scale.remove('rank')

	# K-Fold cross-validation
	nb_folds = 10
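	# KFolds is a project helper (assumed): it partitions data_frame into
	# nb_folds train/test splits retrievable via get_fold(k).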
	fold = KFolds(data_frame, nb_folds)
	missranked_scores_train = []
	missranked_scores_test = []

	for k in range(nb_folds):
		train, test = fold.get_fold(k)
		train = train.sort_values(by='id')
		test = test.sort_values(by='id')

		# dropping unneeded columns
		for drop in headers_to_drop:
			train = train.drop(columns=drop)
			test = test.drop(columns=drop)

		# train set
		train, mean, std = scaling(train, headers_to_scale)
		train = train.reset_index(drop=True)
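		# features: the scaled numeric columns; target: whether the row's
		# rank equals 1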
		X = train[headers_to_scale].values
		Y = train['rank'] == 1

		# training a linear regression model as the scorer
		linreg = LinearRegression(fit_intercept=True)
		linreg.fit(X, Y)

		# computing score on train set
		Y_score_train = linreg.predict(X)
		Y_predicted_train = compute_prediction(train, Y_score_train, verbose)
		missranked_train, wellranked_train, total_train = compute_error(Y, Y_predicted_train)
		missranked_scores_train.append(missranked_train/total_train)

		# test set
		test = scaling(test, headers_to_scale, mean, std)
		test = test.reset_index(drop=True)
		X_test = test[headers_to_scale].values
		Y_test = test['rank'].values == 1

		# computing score on test set
		Y_score_test = linreg.predict(X_test)
		Y_predicted_test = compute_prediction(test, Y_score_test, verbose)
		missranked_test, wellranked_test, total_test = compute_error(Y_test, Y_predicted_test)
		missranked_scores_test.append(missranked_test/total_test)

		# printing intermediate results
		if verbose:
			print('\n**** fold ', k, '****')
			print('train set:')
			print('   missranked =', round(missranked_train/total_train, 3))
			print('   wellranked =', round(wellranked_train/total_train, 3))
			print('test set:')
			print('   missranked =', round(missranked_test/total_test, 3))
			print('   wellranked =', round(wellranked_test/total_test, 3))

	# printing final result
	if verbose:
		print('\n******** MEAN over all folds ********')
		print('Train missranked = ', np.mean(missranked_scores_train))
		print(' Test missranked = ', np.mean(missranked_scores_test))
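
A minimal invocation sketch (the CSV path and delimiter are placeholders; read_csv, KFolds, scaling, compute_prediction and compute_error must come from the surrounding project):

if __name__ == '__main__':
	# 'profiles.csv' is a placeholder path
	main('profiles.csv', ';', verbose=True)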
Example 2
# Imports for the snippet below; read_csv, compute_prediction and
# compute_error are project helpers assumed to be defined elsewhere.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

def main(train_path, test_path):
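    """Train a gradient-boosting classifier on the train CSV, then evaluate
    it against a random one-row-per-id baseline on the test CSV."""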

    delimiter = ';'
    verbose = True
    data_frame, all_headers = read_csv(train_path, delimiter, verbose)

    delimiter_test = ','
    verbose = True
    data_frame_test, all_headers_test = read_csv(test_path, delimiter_test,
                                                 verbose)

    headers = [
        'id', 'rank', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time',
        'nb_ips'
    ]

    headers_test = [
        'id', 'similarity', 'occurrences', 'lifetime', 'nb_days', 'nb_idtags',
        'nb_idtags_site', 'nb_idtags_media', 'nb_purchases', 'last_time',
        'nb_ips'
    ]

    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_drop_test = list(set(all_headers_test) - set(headers_test))

    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')

    headers_to_scale_test = headers_test[:]
    headers_to_scale_test.remove('id')
    headers_to_scale_test.remove('similarity')

    train = data_frame.copy()
    train = train.sort_values(by='id')

    test = data_frame_test.copy()
    test = test.sort_values(by='id')

    for drop in headers_to_drop:
        train = train.drop(columns=drop)

    for drop in headers_to_drop_test:
        test = test.drop(columns=drop)

    train = train.reset_index(drop=True)
    X = train[headers_to_scale].values
    Y = train['rank'] == 1

    similarity = np.array(test.loc[:, 'similarity'])
    baseline = np.zeros(similarity.shape)

    test = test.reset_index(drop=True)
    test['arg_max'] = 0
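
    # The loop below walks the test frame grouped by consecutive id values:
    # it marks one random row per group in `baseline` (a random-pick baseline)
    # and flags each group's maximum-similarity row in 'arg_max' as the target.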

    old_id = test.loc[0, 'id']
    rows = []

    for row in range(test.shape[0]):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
        else:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m).astype(int)
            rows = []
            rows.append(row)

        if row == test.shape[0] - 1:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m).astype(int)

    X_test = test[headers_to_scale].values
    Y_test = test['arg_max']

    # training a gradient-boosting classifier
    xgb_reg = GradientBoostingClassifier(loss='exponential',
                                         n_estimators=50,
                                         criterion='friedman_mse',
                                         max_depth=5,
                                         verbose=verbose)
    xgb_reg.fit(X, Y)

    # computing score on train set
    Y_score_train = xgb_reg.predict_proba(X)[:, 1]
    Y_predicted_train = compute_prediction(train, Y_score_train)
    missranked_train, wellranked_train, total_train = compute_error(
        Y, Y_predicted_train)

    # printing intermediate results
    print('train set:')
    print('   missranked =', round(missranked_train / total_train, 3))
    print('   wellranked =', round(wellranked_train / total_train, 3))

    # computing score on test set
    Y_score_test = xgb_reg.predict_proba(X_test)[:, 1]
    Y_predicted_test = compute_prediction(test, Y_score_test)
    missranked_test, wellranked_test, total_test = compute_error(
        Y_test, Y_predicted_test)
    missranked_test_baseline, wellranked_test_baseline, total_test_baseline = compute_error(
        Y_test, baseline)

    # printing intermediate results
    print('test set:')
    print('   missranked =', round(missranked_test / total_test, 3))
    print('   wellranked =', round(wellranked_test / total_test, 3))
    print('baseline prediction:')
    print('   missranked =',
          round(missranked_test_baseline / total_test_baseline, 3))
    print('   wellranked =',
          round(wellranked_test_baseline / total_test_baseline, 3))

    # The two metrics from the PDF

    old_id = test.loc[0, 'id']
    rows = []
    score_1 = 0
    score_2 = 0
    score_1_baseline = 0
    score_2_baseline = 0
    N = 0
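    # Accumulated over orders: score_1 adds the gap between the best available
    # similarity and the similarity of the chosen row (lower is better);
    # score_2 adds the product of the two (higher is better). N counts the
    # orders for which the model emitted a positive prediction.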

    for row in range(len(Y_predicted_test)):
        current_id = test.loc[row, 'id']

        if current_id == old_id:
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        else:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N)+' : order '+str(old_id)+' nb_rows = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))
            old_id = current_id
            rows = []
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row

        if row == len(Y_predicted_test) - 1:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N)+' : order '+str(old_id)+' nb_rows = '+str(len(rows))+' ; sim_max = '+str(m)+' ; sim_predicted = '+str(test.loc[prediction, 'similarity'])+' ; sim_baseline = '+str(test.loc[prediction_baseline, 'similarity']))

    score_1 /= N
    score_2 /= N
    score_1_baseline /= N
    score_2_baseline /= N

    print('score_1 = ' + str(score_1))
    print('score_2 = ' + str(score_2))
    print('score_1_baseline = ' + str(score_1_baseline))
    print('score_2_baseline = ' + str(score_2_baseline))
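
As above, a minimal invocation sketch (both CSV paths are placeholders):

if __name__ == '__main__':
    # placeholder paths for the train and test CSV files
    main('train.csv', 'test.csv')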