Example #1
 def predict(self, X, ids, weight, feature_names=None):
     self.feature_names = feature_names
     query_indptr, query_ids = self._build_query_indptr(ids)
     # We won't be using this, but Queries won't instantiate without it.
     y = np.zeros(X.shape[0])
     q = Queries(X, y, query_indptr, query_ids=query_ids)
     y_pred = self.model.predict(q, n_jobs=self.params['n_jobs'])
     return y_pred
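
# The _build_query_indptr helper used above is not shown in these snippets.
# A minimal, hypothetical sketch of what such a method might do, assuming
# that `ids` holds the query id of every row and that rows belonging to the
# same query are contiguous (the name exists in the source, but this body is
# inferred, not copied from the original project):
import numpy as np

def _build_query_indptr(self, ids):
    # Method of the same wrapper class as predict() above.
    ids = np.asarray(ids)
    # Query boundaries are the positions where the query id changes.
    boundaries = np.flatnonzero(ids[1:] != ids[:-1]) + 1
    query_indptr = np.concatenate(([0], boundaries, [len(ids)]))
    # One representative id per query (the id of its first row).
    query_ids = ids[query_indptr[:-1]]
    return query_indptr, query_ids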
Example #2
 def _build_queries(self, X, y, ids, w):
     query_indptr, query_ids = self._build_query_indptr(ids)
     q = Queries(X, y, query_indptr, query_ids=query_ids)
     # Collapse the per-row weights into one weight per query. Two heuristics
     # appear here: the mean weight over a query's rows (kept below, commented
     # out) and simply the weight of the query's first row, which is what is
     # actually used.
     # wn = [
     #     np.mean(w[query_indptr[i]:query_indptr[i + 1]])
     #     for i in range(len(query_indptr) - 1)
     # ]
     wn = [w[i] for i in query_indptr[:-1]]
     return q, np.ascontiguousarray(wn, dtype='float64')
Example #4
def output_prediction(test_data, head=True):
    test_queries = Queries.load_from_text(
        '../data/svmlight_true_test_chunk_temp1.txt', purge=None)
    metric = NormalizedDiscountedCumulativeGain(38, queries=test_queries)
    # Make the prediction
    predict_scores = model.predict(test_queries, n_jobs=-1)
    # Extract the srch_id and prop_id into a dataframe
    result_unordered = test_data.loc[:, ['srch_id', 'prop_id']]
    result_unordered['scores'] = predict_scores
    result_ordered = result_unordered.sort_values(['srch_id', 'scores'],
                                                  ascending=[True, False])
    # Write the submission into file
    result_ordered.loc[:, ['srch_id', 'prop_id']].to_csv(
        '../data/predict_test_data.csv', index=False, mode='a', header=head)
    print "Prediction has been written inte the file..."
    # Tail of a truncated dump_svmlight_file(...) call; the beginning of the
    # call is missing from this snippet. The surviving arguments were:
    #     ... 'booking_bool']].apply(relevance, axis=1),
    #     '../data/svmlight_test_avg_mean_std_competitors_m2.txt',
    #     query_id=test_data.srch_id)

    # Set this for each training run.
    model_name = 'model_012'
    # Parameters for file recording
    # LOG = model_log_folder + model_name + '.log'
    # MODELLER_DIR = modeller_folder + model_name + '/'

    # Turn on logging.
    # logging.basicConfig(filename=LOG, format='%(asctime)s : %(message)s',
    #                     level=logging.INFO)

    # Load the query datasets.
    train_queries = Queries.load_from_text(
        '../data/svmlight_training_avg_mean_std_competitors_m2.txt')
    with open('../data/train_queries2.pkl', 'wb') as pickle_output_train:
        pickle.dump(train_queries, pickle_output_train)

    valid_queries = Queries.load_from_text(
        '../data/svmlight_validation_avg_mean_std_competitors_m2.txt')
    with open('../data/valid_queries2.pkl', 'wb') as pickle_output_valid:
        pickle.dump(valid_queries, pickle_output_valid)

    test_queries = Queries.load_from_text(
        '../data/svmlight_test_avg_mean_std_competitors_m2.txt')
    with open('../data/test_queries2.pkl', 'wb') as pickle_output_test:
        pickle.dump(test_queries, pickle_output_test)
    # pickle_output_test.close()

logging.info('===============================================================')
import logging

from rankpy.queries import Queries
from rankpy.queries import find_constant_features

from rankpy.models import LambdaRandomForest

from rankpy.gridsearch import gridsearch
from rankpy.gridsearch import train_test_split

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : '
                    '%(message)s',
                    level=logging.INFO)

# Load the query datasets.
training_queries = Queries.load_from_text('data/MQ2007/Fold1/train.txt')
validation_queries = Queries.load_from_text('data/MQ2007/Fold1/vali.txt')
test_queries = Queries.load_from_text('data/MQ2007/Fold1/test.txt')

logging.info('=' * 80)

# Save them to binary format ...
training_queries.save('data/MQ2007/Fold1/training')
validation_queries.save('data/MQ2007/Fold1/validation')
test_queries.save('data/MQ2007/Fold1/test')

# ... because loading them will then be faster.
training_queries = Queries.load('data/MQ2007/Fold1/training')
validation_queries = Queries.load('data/MQ2007/Fold1/validation')
test_queries = Queries.load('data/MQ2007/Fold1/test')

logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)
logging.info('dump train file')

# listwise_sampling()

logging.info('================================================================================')
logging.info('dump test file')

# listwise_sampling_test()

logging.info('================================================================================')
logging.info('load query database')

train_queries = Queries.load_from_text('../data/svmlight_train.txt')
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info('================================================================================')
logging.info('train LambdaMart')

# Prepare metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=10000, max_depth=5, shrinkage=0.08,
                   estopping=100, n_jobs=-1, n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('================================================================================')
logging.info('test')
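
# The snippet ends at the test step. A plausible continuation, sketched under
# the assumption that rankpy metrics provide evaluate_queries(queries, scores)
# (the other calls appear verbatim elsewhere in these examples):
logging.info('%s on the test queries: %.8f'
             % (metric, metric.evaluate_queries(test_queries,
                                                model.predict(test_queries,
                                                              n_jobs=-1))))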
Example #8
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART
from sklearn.grid_search import ParameterGrid

from ndcg import NDCG
from maprec import MAP

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)
dcg_folds_scores = []
map_folds_scores = []
# load the data for each fold
for i in range(1, 6):
    # Load the training, validation and test sets for the current fold.
    training_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) +
                                      '/validation')
    test_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/test')

    logging.info(
        '================================================================================'
    )
    # Print basic info about query datasets.
    logging.info('Train queries: %s' % training_queries)
    logging.info('Valid queries: %s' % validation_queries)
    logging.info('Test queries: %s' % test_queries)

    logging.info(
        '================================================================================'
    )
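
    # Example #8 is cut off here. Inside the per-fold loop it presumably goes
    # on to fit a ranker and score the test fold. A hedged sketch using only
    # the rankpy API seen in the other examples; it assumes
    # NormalizedDiscountedCumulativeGain is imported from rankpy.metrics and
    # that metrics provide evaluate_queries(queries, scores). The NDCG/MAP
    # helpers imported above are not documented here, and the hyperparameters
    # below are placeholders.
    metric = NormalizedDiscountedCumulativeGain(10, queries=[training_queries,
                                                             validation_queries,
                                                             test_queries])
    model = LambdaMART(n_estimators=1000, max_depth=5, shrinkage=0.1,
                       estopping=50, n_jobs=-1)
    model.fit(metric, training_queries, validation=validation_queries)
    dcg_folds_scores.append(metric.evaluate_queries(
        test_queries, model.predict(test_queries, n_jobs=-1)))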
Example #9
File: prep.py Project: ncanna/IRDM
import numpy as np
import logging
import rankpy
from rankpy.queries import Queries

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

for i in range(1, 6):
    # Load the query datasets.
    training_queries = Queries.load_from_text('MSLR-WEB10K/Fold' + str(i) +
                                              '/train.txt')
    validation_queries = Queries.load_from_text('MSLR-WEB10K/Fold' + str(i) +
                                                '/vali.txt')
    test_queries = Queries.load_from_text('MSLR-WEB10K/Fold' + str(i) +
                                          '/test.txt')

    logging.info(
        '================================================================================'
    )

    # Save them to binary format ...
    training_queries.save('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries.save('MSLR-WEB10K/Fold' + str(i) + '/validation')
    test_queries.save('MSLR-WEB10K/Fold' + str(i) + '/test')

    # ... because loading them will then be faster.
    training_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/training')
    validation_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) +
                                      '/validation')
    test_queries = Queries.load('MSLR-WEB10K/Fold' + str(i) + '/test')
Example #10
import logging, argparse

from rankpy.queries import Queries
from rankpy.models import LambdaMART
from rankpy.metrics import *

parser = argparse.ArgumentParser(description="Rank py.")
parser.add_argument("metric",help="metric",type=int)
parser.add_argument("iter",help="iterations",type=int)
args = parser.parse_args()

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets.
train_queries = Queries.load_from_text('data/train.txt')
valid_queries = Queries.load_from_text('data/vali.txt')
test_queries = Queries.load_from_text('data/test.txt')

logging.info('================================================================================')

# Save them to binary format ...
train_queries.save('data/fold2_train')
valid_queries.save('data/fold2_vali')
test_queries.save('data/fold2_test')

# ... because loading them will then be faster.
train_queries = Queries.load('data/fold2_train')
valid_queries = Queries.load('data/fold2_vali')
test_queries = Queries.load('data/fold2_test')
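
# Example #10 parses a metric cutoff and an iteration count from the command
# line, but the rest of the script is not shown. A hypothetical continuation
# that wires those arguments into the rankpy calls used in the other examples
# (max_depth, shrinkage and estopping are placeholder values):
metric = NormalizedDiscountedCumulativeGain(args.metric,
                                            queries=[train_queries,
                                                     valid_queries,
                                                     test_queries])
model = LambdaMART(n_estimators=args.iter, max_depth=5, shrinkage=0.1,
                   estopping=100, n_jobs=-1)
model.fit(metric, train_queries, validation=valid_queries)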
dump_svmlight_file(validation_data.iloc[:, 1:-2].values,
                   validation_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_validation.txt',
                   query_id=validation_data.srch_id)


# In[52]:

dump_svmlight_file(test_data.iloc[:, 1:-2].values,
                   test_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_test.txt',
                   query_id=test_data.srch_id)


# In[54]:

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets.
train_queries = Queries.load_from_text('../data/svmlight_training.txt')
valid_queries = Queries.load_from_text('../data/svmlight_validation.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info('================================================================================')

# Save them to binary format ...
#train_queries.save('../data/train_bin')
#valid_queries.save('../data/validation_bin')
#test_queries.save('../data/test_bin')

# ... because loading them will then be faster.
#train_queries = Queries.load('../data/train_bin')
#valid_queries = Queries.load('../data/validation_bin')
#test_queries = Queries.load('../data/test_bin')
# Generate the SVMLight format file
dump_svmlight_file(training_data_new[col_names].values,
                   training_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_training_avg_mean_std_competitors_m2.txt',
                   query_id=training_data_new.srch_id)
dump_svmlight_file(validation_data_new[col_names].values,
                   validation_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_validation_avg_mean_std_competitors_m2.txt',
                   query_id=validation_data_new.srch_id)
dump_svmlight_file(test_data_new[col_names].values,
                   test_data.iloc[:, -2:].apply(relevance, axis=1),
                   '../data/svmlight_test_avg_mean_std_competitors_m2.txt',
                   query_id=test_data_new.srch_id)

# Set this for each training run.
model_name = 'model_012'
# Parameters for file recording
LOG = model_log_folder + model_name + '.log'
MODELLER_DIR = modeller_folder + model_name + '/'

# Turn on logging.
logging.basicConfig(filename=LOG, format='%(asctime)s : %(message)s',
                    level=logging.INFO)

# Load the query datasets.
train_queries = Queries.load_from_text(
    '../data/svmlight_training_avg_mean_std_competitors_m2.txt')
# Pickle files must be opened in binary mode.
with open('../data/train_queries2.pkl', 'wb') as pickle_output_train:
    pickle.dump(train_queries, pickle_output_train)

valid_queries = Queries.load_from_text(
    '../data/svmlight_validation_avg_mean_std_competitors_m2.txt')
with open('../data/valid_queries2.pkl', 'wb') as pickle_output_valid:
    pickle.dump(valid_queries, pickle_output_valid)

test_queries = Queries.load_from_text(
    '../data/svmlight_test_avg_mean_std_competitors_m2.txt')
with open('../data/test_queries2.pkl', 'wb') as pickle_output_test:
    pickle.dump(test_queries, pickle_output_test)

logging.info('================================================================================')

# Save them to binary format ...
Example #13
import logging

from rankpy.queries import Queries
from rankpy.queries import find_constant_features

from rankpy.models import LambdaMART

from rankpy.gridsearch import gridsearch
from rankpy.gridsearch import train_test_split

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : '
                    '%(message)s', level=logging.INFO)

# Load the query datasets.
training_queries = Queries.load_from_text('data/MQ2007/Fold1/train.txt')
validation_queries = Queries.load_from_text('data/MQ2007/Fold1/vali.txt')
test_queries = Queries.load_from_text('data/MQ2007/Fold1/test.txt')

logging.info('=' * 80)

# Save them to binary format ...
training_queries.save('data/MQ2007/Fold1/training')
validation_queries.save('data/MQ2007/Fold1/validation')
test_queries.save('data/MQ2007/Fold1/test')

# ... because loading them will then be faster.
training_queries = Queries.load('data/MQ2007/Fold1/training')
validation_queries = Queries.load('data/MQ2007/Fold1/validation')
test_queries = Queries.load('data/MQ2007/Fold1/test')
Example #14
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

import logging

from rankpy.queries import Queries
from rankpy.queries import find_constant_features

from rankpy.models import LambdaMART

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets.
training_queries = Queries.load_from_text('train.svmlight')
validation_queries = Queries.load_from_text('vali.svmlight')
test_queries = Queries.load_from_text('test.svmlight')

logging.info(
    '================================================================================'
)

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info(
    '================================================================================'
)
Example #15
import numpy as np
import pandas as pd

import logging

from rankpy.queries import Queries
from rankpy.queries import find_constant_features

from rankpy.models import LambdaMART


# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets.
training_queries = Queries.load_from_text('train.svmlight')
validation_queries = Queries.load_from_text('vali.svmlight')
test_queries = Queries.load_from_text('test.svmlight')

logging.info('================================================================================')

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('================================================================================')

# Set this to True in order to remove queries containing all documents
# of the same relevance score -- these are useless for LambdaMART.
remove_useless_queries = False
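
# The usual continuation in scripts like this (and the reason
# find_constant_features is imported above) is to drop constant features and,
# optionally, purge the useless queries. A sketch, assuming rankpy's Queries
# expose adjust(remove_features=..., purge=...) as in its example scripts:
cfs = find_constant_features([training_queries, validation_queries,
                              test_queries])
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)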
Example #16
import logging, argparse

from rankpy.queries import Queries
from rankpy.models import LambdaMART
from rankpy.metrics import *

parser = argparse.ArgumentParser(description="Rank py.")
parser.add_argument("metric", help="metric", type=int)
parser.add_argument("iter", help="iterations", type=int)
args = parser.parse_args()

# Turn on logging.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Load the query datasets.
train_queries = Queries.load_from_text('data/train.txt')
valid_queries = Queries.load_from_text('data/vali.txt')
test_queries = Queries.load_from_text('data/test.txt')

logging.info(
    '================================================================================'
)

# Save them to binary format ...
train_queries.save('data/fold2_train')
valid_queries.save('data/fold2_vali')
test_queries.save('data/fold2_test')

# ... because loading them will then be faster.
train_queries = Queries.load('data/fold2_train')
valid_queries = Queries.load('data/fold2_vali')