from pyclick.utils.Utils import Utils
from pyclick.utils.YandexRelPredChallengeParser import YandexRelPredChallengeParser


def get_train_test(search_sessions_path, search_sessions_num):
    """
    Reads the given number of search sessions from the given path and splits
    them into train and test sets with the ratio 3/1. Queries that are not
    present in the train set are removed from the test set.
    Returns the train/test sessions and queries.

    :param search_sessions_path: The path to the file with search sessions
        in the format of the Yandex Relevance Prediction Challenge
        (http://imat-relpred.yandex.ru/en).
    :param search_sessions_num: The number of search sessions to consider.

    :returns: Train sessions, train queries (distinct),
        test sessions, test queries (distinct).
    """
    search_sessions = YandexRelPredChallengeParser().parse(
        search_sessions_path, search_sessions_num)

    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = Utils.get_unique_queries(train_sessions)

    test_sessions = Utils.filter_sessions(search_sessions[train_test_split:],
                                          train_queries)
    test_queries = Utils.get_unique_queries(test_sessions)

    return train_sessions, train_queries, test_sessions, test_queries
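
# A minimal usage sketch of get_train_test (not from the source): the dataset
# path and session count below are assumptions for illustration, and SDBN is
# used only as an example of a downstream PyClick model.
if __name__ == "__main__":
    from pyclick.click_models.SDBN import SDBN

    train_sessions, train_queries, test_sessions, test_queries = get_train_test(
        "YandexRelPredChallenge.txt",  # hypothetical dataset path
        10000)                         # consider the first 10,000 sessions

    click_model = SDBN()
    click_model.train(train_sessions)
    print("Trained on %d sessions, held out %d."
          % (len(train_sessions), len(test_sessions)))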
import importlib
import pickle
from pathlib import Path

# `config`, `pc_utils` and the `modules` mapping (click-model name -> module
# path) are assumed to be defined elsewhere in this project.


def fit_click_model(search_sessions, cm):
    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = pc_utils.get_unique_queries(train_sessions)
    test_sessions = pc_utils.filter_sessions(
        search_sessions[train_test_split:], train_queries)
    test_queries = pc_utils.get_unique_queries(test_sessions)

    module = importlib.import_module(modules[cm])
    # Instantiate the model class; `getattr` alone returns the class object,
    # not an instance.
    click_model = getattr(module, cm)()
    click_model.train(train_sessions)

    # Pickle requires the file to be opened in binary write mode ('wb').
    out_path = Path(config.DATASET_OUTPUT_FOLDER) / 'click_models' / (cm + '.pkl')
    with open(out_path, 'wb') as out:
        pickle.dump(click_model, out)
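
# A sketch of the inverse operation (not in the original): load a fitted model
# back from the path written above. The helper name and the 'SDBN' example
# argument are assumptions for illustration.
def load_click_model(cm):
    in_path = Path(config.DATASET_OUTPUT_FOLDER) / 'click_models' / (cm + '.pkl')
    with open(in_path, 'rb') as fin:  # binary read mode for pickle
        return pickle.load(fin)

# e.g. click_model = load_click_model('SDBN')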
import logging

from evaluate_model import PyClickModelAdapter, QueryDocumentRanker
from pyclick.utils.Utils import Utils

# `load_from_csv`, `map_to_pyclick_format`, `expand_content_ids`, `train_model`,
# `evaluate_fit` and `sdbn_click_model` are defined elsewhere in this file and
# assumed available here. The helper name below is reconstructed.


def print_ranked_params(model, query):
    """Print the SDBN attractiveness/satisfaction parameters per ranked document."""
    ranker = QueryDocumentRanker(PyClickModelAdapter(model))
    df = expand_content_ids(ranker.rank(query).to_frame()).sort_values(0)
    for idx, row in df.iterrows():
        a = sdbn_click_model.params[sdbn_click_model.param_names.attr].get(query, idx)
        s = sdbn_click_model.params[sdbn_click_model.param_names.sat].get(query, idx)
        # Reach into the private container for the observation count (the
        # denominator of the attractiveness estimate).
        n = sdbn_click_model.params[sdbn_click_model.param_names.attr]._container[query][idx]._denominator
        print(f'a={a}, s={s}, n={n}: {idx} ({row["title"]})')


if __name__ == "__main__":
    logging.basicConfig(filename='estimate_with_pyclick.log', level=logging.INFO)

    training, test = load_from_csv()
    train_sessions = map_to_pyclick_format(training)
    test_sessions = map_to_pyclick_format(test)

    train_queries = Utils.get_unique_queries(train_sessions)
    # PyClick normally filters out any test sessions whose queries aren't in
    # the training set. I shouldn't need to do this, because my train/test
    # split shouldn't let that happen.
    assert len(test_sessions) == len(Utils.filter_sessions(test_sessions, train_queries))
    test_queries = Utils.get_unique_queries(test_sessions)

    print('SDBN')
    train_model(sdbn_click_model, train_sessions, train_queries)
    evaluate_fit(sdbn_click_model, test_sessions, test_queries)
    with open('sdbn_model.json', 'w') as f:
        f.write(sdbn_click_model.to_json())
import time

from pyclick.click_models.SDBN import SDBN
from pyclick.search_session.SearchResult import SearchResult
from pyclick.utils.Utils import Utils

# Tail of get_sessions(): record the final clicked URL as a clicked result.
# The loop header and session construction above are not part of this fragment.
        result = SearchResult(search.final_click_url, 1)
        session.web_results.append(result)
        sessions.append(session)
    return sessions


if __name__ == "__main__":
    click_model = SDBN()
    search_sessions = get_sessions()

    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = Utils.get_unique_queries(train_sessions)
    test_sessions = Utils.filter_sessions(search_sessions[train_test_split:],
                                          train_queries)
    test_queries = Utils.get_unique_queries(test_sessions)

    print("===============================")
    print("Training on %d search sessions (%d unique queries)."
          % (len(train_sessions), len(train_queries)))
    print("===============================")

    start = time.time()
    click_model.train(train_sessions)
    end = time.time()
    print("\tTrained %s click model in %i secs:\n%r"
          % (click_model.__class__.__name__, end - start, click_model))
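
    # A possible follow-up (not in the original script): evaluate the fitted
    # model on the held-out sessions with PyClick's standard metrics.
    from pyclick.click_models.Evaluation import LogLikelihood, Perplexity

    log_likelihood = LogLikelihood().evaluate(click_model, test_sessions)
    perplexity = Perplexity().evaluate(click_model, test_sessions)[0]
    print("\tLog-likelihood: %f; perplexity: %f" % (log_likelihood, perplexity))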
import sys

# TCM, SearchTask and YandexPersonalizedChallengeParser are assumed to come
# from this project's PyClick fork (the Task-Centric Model is not part of the
# upstream PyClick distribution); Utils is part of upstream PyClick.

if len(sys.argv) < 3:
    print("USAGE: %s <dataset> <sessions_max>" % sys.argv[0])
    print("\tdataset - the path to the dataset from Yandex Relevance Prediction Challenge")
    print("\tsessions_max - the maximum number of one-query search sessions to consider")
    print("")
    sys.exit(1)

click_model = TCM()
search_sessions_path = sys.argv[1]
search_sessions_num = int(sys.argv[2])

# search_sessions = YandexRelPredChallengeParser().parse(search_sessions_path, search_sessions_num)
search_sessions = YandexPersonalizedChallengeParser().parse(
    search_sessions_path, search_sessions_num)

train_test_split = int(len(search_sessions) * 0.75)
train_sessions = search_sessions[:train_test_split]
train_queries = Utils.get_unique_queries(train_sessions)
train_tasks = SearchTask.get_search_tasks(train_sessions)
# test_sessions = Utils.filter_sessions(search_sessions[train_test_split:], train_queries)
# test_queries = Utils.get_unique_queries(test_sessions)
# test_tasks = SearchTask.get_search_tasks(test_sessions)

print("-------------------------------")
print("Training on %d search tasks (%d search sessions, %d unique queries)."
      % (len(train_tasks), len(train_sessions), len(train_queries)))
print("-------------------------------")

# print("data ", train_tasks[:5])
click_model.train(train_tasks)
print("\tTrained %s click model:\n%r" % (click_model.__class__.__name__, click_model))
    with open('./data/search_sessions.pkl', 'wb') as fout:
        pickle.dump(search_sessions, fout, protocol=-1)
else:
    # Pickle requires binary read mode ('rb').
    with open('./data/search_sessions.pkl', 'rb') as fin:
        search_sessions = pickle.load(fin)

# Because we are interested mostly in getting parameter estimates
# for `search_sessions` queries, they are also used as the hold-out set.
holdout_search_sessions = list(search_sessions)

n_search_sessions = 1000000

# Reads `n_search_sessions` sessions from the Yandex dataset ...
additional_search_sessions = Utils.filter_sessions(
    YandexRelPredChallengeParser().parse(search_sessions_path,
                                         sessions_max=n_search_sessions),
    Utils.get_unique_queries(search_sessions),
    operation='remove')

# ... and merges them with `search_sessions` while taking care of not
# including any session twice.
search_sessions.extend(additional_search_sessions)

queries = Utils.get_unique_queries(search_sessions)

print("---------------------------------------------------------")
print("Training on %d search sessions (%d unique queries)." % (len(search_sessions), len(queries)))
print("---------------------------------------------------------")

for model_name, model_filename in models_to_train:
    # Make sure the output file for model parameters
    # does not exist (overwrite protection).
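    # A plausible completion of the overwrite-protection check (not from the
    # source; the original loop body is truncated here). Assumes `os` is
    # imported at the top of the file.
    if os.path.exists(model_filename):
        raise IOError("Refusing to overwrite existing model file: %s" % model_filename)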
import pickle
from operator import add

import matplotlib.pyplot as mp
import numpy as np
import scipy.stats as st

from pyclick.click_models.UBM import UBM
from pyclick.utils.Utils import Utils
from pyclick.utils.YandexRelPredChallengeParser import YandexRelPredChallengeParser

# `cm` (the click-metrics helper providing dcg/rbp/err, pairs() and
# make_groups()) is assumed to be imported elsewhere in the original file.

click_model = UBM()
search_sessions_path = "YandexRelPredChallenge.txt"
search_sessions_num = 10000

search_sessions = YandexRelPredChallengeParser().parse(search_sessions_path, search_sessions_num)

train_test_split = int(len(search_sessions) * 0.75)
train_sessions = search_sessions[:train_test_split]
train_queries = Utils.get_unique_queries(train_sessions)
test_sessions = Utils.filter_sessions(search_sessions[train_test_split:], train_queries)
test_queries = Utils.get_unique_queries(test_sessions)

click_model.train(train_sessions)

measures = {'dcg': cm.dcg, 'rbp': cm.rbp, 'err': cm.err}
ep_pairs = cm.pairs()
for m in measures:
    print("making groups")
    groups = cm.make_groups(measures[m], ep_pairs)
    print('ubm with %s' % m)
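
# A short illustration (not in the original): once trained, PyClick models can
# produce per-rank click probabilities for a held-out session.
session = test_sessions[0]
full_probs = click_model.get_full_click_probs(session)
cond_probs = click_model.get_conditional_click_probs(session)
print('P(click) per rank: %r' % full_probs)
print('P(click | observed clicks) per rank: %r' % cond_probs)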
import sys

# TCM and SearchTask are assumed to come from this project's PyClick fork (the
# Task-Centric Model is not part of the upstream PyClick distribution); Utils
# and YandexRelPredChallengeParser are part of upstream PyClick.

if len(sys.argv) < 3:
    print("USAGE: %s <dataset> <sessions_max>" % sys.argv[0])
    print("\tdataset - the path to the dataset from Yandex Relevance Prediction Challenge")
    print("\tsessions_max - the maximum number of one-query search sessions to consider")
    print("")
    sys.exit(1)

click_model = TCM()
search_sessions_path = sys.argv[1]
search_sessions_num = int(sys.argv[2])

search_sessions = YandexRelPredChallengeParser().parse(search_sessions_path, search_sessions_num)

train_test_split = int(len(search_sessions) * 0.75)
train_sessions = search_sessions[:train_test_split]
train_queries = Utils.get_unique_queries(train_sessions)
train_tasks = SearchTask.get_search_tasks(train_sessions)
# test_sessions = Utils.filter_sessions(search_sessions[train_test_split:], train_queries)
# test_queries = Utils.get_unique_queries(test_sessions)
# test_tasks = SearchTask.get_search_tasks(test_sessions)

print("-------------------------------")
print("Training on %d search tasks (%d search sessions, %d unique queries)."
      % (len(train_tasks), len(train_sessions), len(train_queries)))
print("-------------------------------")

click_model.train(train_tasks)
print("\tTrained %s click model:\n%r" % (click_model.__class__.__name__, click_model))
# print("-------------------------------")