Example 1
def get_train_test(search_sessions_path, search_sessions_num):
    """
    Reads the given number of search sessions from the given path
    and splits them into train and test sets with the ratio 3/1.
    Queries that are not present in the train set are removed from the test set.
    Returns the train/test sessions and queries.

    :param search_sessions_path: The path to the file with search sessions
        in the format of Yandex Relevance Prediction Challenge
        (http://imat-relpred.yandex.ru/en).
    :param search_sessions_num: The number of search sessions to consider.
    :returns: Returns train sessions, train queries (distinct), test sessions, test queries (distinct).
    """
    search_sessions = YandexRelPredChallengeParser().parse(
        search_sessions_path, search_sessions_num)

    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = Utils.get_unique_queries(train_sessions)

    test_sessions = Utils.filter_sessions(search_sessions[train_test_split:],
                                          train_queries)
    test_queries = Utils.get_unique_queries(test_sessions)

    return train_sessions, train_queries, test_sessions, test_queries
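
For reference, a minimal usage sketch of get_train_test (the dataset path and session count are placeholders, borrowed from a later example):

train_sessions, train_queries, test_sessions, test_queries = get_train_test(
    "YandexRelPredChallenge.txt", 10000)
print("Train: %d sessions, %d unique queries" % (len(train_sessions), len(train_queries)))
print("Test: %d sessions, %d unique queries" % (len(test_sessions), len(test_queries)))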
Example 2
def fit_click_model(search_sessions, cm):
    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = pc_utils.get_unique_queries(train_sessions)

    test_sessions = pc_utils.filter_sessions(
        search_sessions[train_test_split:], train_queries)
    test_queries = pc_utils.get_unique_queries(test_sessions)

    module = importlib.import_module(modules[cm])
    # `cm` names the click model class (e.g. 'SDBN'); instantiate it before training.
    click_model = getattr(module, cm)()

    click_model.train(train_sessions)

    # pickle.dump needs a file opened for writing in binary mode.
    with open(
            Path(config.DATASET_OUTPUT_FOLDER + 'click_models/' + cm +
                 '.pkl'), 'wb') as out:
        pickle.dump(click_model, out)
    ranker = QueryDocumentRanker(PyClickModelAdapter(model))
    df = expand_content_ids(ranker.rank(query).to_frame()).sort_values(0)
    for idx, row in df.iterrows():
        # SDBN per-(query, result) parameters: attractiveness, satisfaction,
        # and the raw denominator (observation count) behind the attractiveness estimate.
        a = sdbn_click_model.params[sdbn_click_model.param_names.attr].get(query, idx)
        s = sdbn_click_model.params[sdbn_click_model.param_names.sat].get(query, idx)
        n = sdbn_click_model.params[sdbn_click_model.param_names.attr]._container[query][idx]._denominator
        print(f'a={a}, s={s}, n={n}: {idx} ({row["title"]})')


if __name__ == "__main__":
    logging.basicConfig(filename='estimate_with_pyclick.log', level=logging.INFO)

    training, test = load_from_csv()
    train_sessions = map_to_pyclick_format(training)
    test_sessions = map_to_pyclick_format(test)
    train_queries = Utils.get_unique_queries(train_sessions)

    # PyClick's examples normally filter out test sessions whose queries don't appear
    # in the training set. That shouldn't be needed here, because this train/test split
    # shouldn't let that happen; the assertion below checks it.
    assert len(test_sessions) == len(Utils.filter_sessions(test_sessions, train_queries))

    test_queries = Utils.get_unique_queries(test_sessions)

    print('SDBN')
    train_model(sdbn_click_model, train_sessions, train_queries)
    evaluate_fit(sdbn_click_model, test_sessions, test_queries)

    with open('sdbn_model.json', 'w') as f:
        f.write(sdbn_click_model.to_json())

    from evaluate_model import PyClickModelAdapter, QueryDocumentRanker
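
The train_model and evaluate_fit helpers are not shown above. A possible sketch of evaluate_fit, assuming PyClick's LogLikelihood and Perplexity evaluators from pyclick.click_models.Evaluation (their evaluate signatures and return values may differ between PyClick versions):

from pyclick.click_models.Evaluation import LogLikelihood, Perplexity


def evaluate_fit(click_model, test_sessions, test_queries):
    # Report held-out log-likelihood and perplexity for a trained click model.
    print("Evaluating on %d sessions (%d unique queries)" %
          (len(test_sessions), len(test_queries)))

    log_likelihood = LogLikelihood().evaluate(click_model, test_sessions)
    print("\tlog-likelihood: %f" % log_likelihood)

    # Perplexity.evaluate is assumed to return an overall value plus per-rank values.
    perplexity, perplexity_at_rank = Perplexity().evaluate(click_model, test_sessions)
    print("\tperplexity: %f" % perplexity)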
Example 4
        result = SearchResult(search.final_click_url, 1)
        session.web_results.append(result)

        sessions.append(session)

    return sessions


if __name__ == "__main__":
    click_model = SDBN()

    search_sessions = get_sessions()
    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = Utils.get_unique_queries(train_sessions)

    test_sessions = Utils.filter_sessions(search_sessions[train_test_split:],
                                          train_queries)
    test_queries = Utils.get_unique_queries(test_sessions)

    print("===============================")
    print("Training on %d search sessions (%d unique queries)." %
          (len(train_sessions), len(train_queries)))
    print("===============================")

    start = time.time()
    click_model.train(train_sessions)
    end = time.time()
    print("\tTrained %s click model in %i secs:\n%r" %
          (click_model.__class__.__name__, end - start, click_model))
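
The head of get_sessions is not shown in Example 4. A hedged sketch of how the session list might be assembled, assuming PyClick's TaskCentricSearchSession(task, query) and SearchResult(result_id, click) constructors; unlike the original, which takes no arguments, the sketch receives the search records as a parameter, and the searches iterable and its fields are hypothetical placeholders:

from pyclick.search_session.SearchResult import SearchResult
from pyclick.search_session.TaskCentricSearchSession import TaskCentricSearchSession


def get_sessions(searches):
    # `searches` and its fields (session_id, query, final_click_url) are placeholders;
    # only the last few lines mirror the original snippet, which records just the
    # finally clicked URL as a single clicked result.
    sessions = []
    for search in searches:
        session = TaskCentricSearchSession(search.session_id, search.query)

        result = SearchResult(search.final_click_url, 1)
        session.web_results.append(result)

        sessions.append(session)

    return sessions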
Example 5
        print "\tdataset - the path to the dataset from Yandex Relevance Prediction Challenge"
        print "\tsessions_max - the maximum number of one-query search sessions to consider"
        print ""
        sys.exit(1)

    click_model = TCM()
    search_sessions_path = sys.argv[1]
    search_sessions_num = int(sys.argv[2])

    # search_sessions = YandexRelPredChallengeParser().parse(search_sessions_path, search_sessions_num)
    search_sessions = YandexPersonalizedChallengeParser().parse(
        search_sessions_path, search_sessions_num)

    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = Utils.get_unique_queries(train_sessions)
    train_tasks = SearchTask.get_search_tasks(train_sessions)

    # test_sessions = Utils.filter_sessions(search_sessions[train_test_split:], train_queries)
    # test_queries = Utils.get_unique_queries(test_sessions)
    # test_tasks = SearchTask.get_search_tasks(test_sessions)

    print "-------------------------------"
    print "Training on %d search tasks (%d search sessions, %d unique queries)." % \
          (len(train_tasks), len(train_sessions), len(train_queries))
    print "-------------------------------"
    # print "data ", train_tasks[:5]
    click_model.train(train_tasks)
    print "\tTrained %s click model:\n%r" % (click_model.__class__.__name__,
                                             click_model)
Example 6
        with open('./data/search_sessions.pkl', 'wb') as fout:
            pickle.dump(search_sessions, fout, protocol=-1)
    else:
        # The sessions were pickled with a binary protocol, so read in binary mode.
        with open('./data/search_sessions.pkl', 'rb') as fin:
            search_sessions = pickle.load(fin)

        # Because we are interested mostly in getting parameter estimates
        # for `search_sessions` queries, they are also used as 'hold-out set'.
        holdout_search_sessions = list(search_sessions)

        n_search_sessions = 1000000

        # Reads `n_search_sessions` sessions from the Yandex dataset ...
        additional_search_sessions = Utils.filter_sessions(
            YandexRelPredChallengeParser().parse(search_sessions_path,
                                                 sessions_max=n_search_sessions),
            Utils.get_unique_queries(search_sessions),
            operation='remove')

        # ... and merges them with `search_sessions` while taking care of not
        # including any session twice.
        search_sessions.extend(additional_search_sessions)

    queries = Utils.get_unique_queries(search_sessions)

    print "---------------------------------------------------------"
    print "Training on %d search sessions (%d unique queries)." % (len(search_sessions), len(queries))
    print "---------------------------------------------------------"

    for model_name, model_filename in models_to_train:
        # Makes sure the output file for model parameters
        # does not exist (overwrite protection).
Example 7
import numpy as np
from operator import add
import pickle
import scipy.stats as st
import matplotlib.pyplot as mp


click_model = UBM()
search_sessions_path = "YandexRelPredChallenge.txt"
search_sessions_num = 10000

search_sessions = YandexRelPredChallengeParser().parse(search_sessions_path, search_sessions_num)

train_test_split = int(len(search_sessions) * 0.75)
train_sessions = search_sessions[:train_test_split]
train_queries = Utils.get_unique_queries(train_sessions)

test_sessions = Utils.filter_sessions(search_sessions[train_test_split:], train_queries)
test_queries = Utils.get_unique_queries(test_sessions)
click_model.train(train_sessions)


measures = {'dcg': cm.dcg, 'rbp': cm.rbp, 'err': cm.err}
ep_pairs = cm.pairs()
for m in measures:
    print("making groups")
    groups = cm.make_groups(measures[m], ep_pairs)

    print('ubm with %s' % m)
Example 8
    if len(sys.argv) < 3:
        print "USAGE: %s <dataset> <sessions_max>" % sys.argv[0]
        print "\tdataset - the path to the dataset from Yandex Relevance Prediction Challenge"
        print "\tsessions_max - the maximum number of one-query search sessions to consider"
        print ""
        sys.exit(1)

    click_model = TCM()
    search_sessions_path = sys.argv[1]
    search_sessions_num = int(sys.argv[2])

    search_sessions = YandexRelPredChallengeParser().parse(search_sessions_path, search_sessions_num)

    train_test_split = int(len(search_sessions) * 0.75)
    train_sessions = search_sessions[:train_test_split]
    train_queries = Utils.get_unique_queries(train_sessions)
    train_tasks = SearchTask.get_search_tasks(train_sessions)

    # test_sessions = Utils.filter_sessions(search_sessions[train_test_split:], train_queries)
    # test_queries = Utils.get_unique_queries(test_sessions)
    # test_tasks = SearchTask.get_search_tasks(test_sessions)

    print "-------------------------------"
    print "Training on %d search tasks (%d search sessions, %d unique queries)." % \
          (len(train_tasks), len(train_sessions), len(train_queries))
    print "-------------------------------"

    click_model.train(train_tasks)
    print "\tTrained %s click model:\n%r" % (click_model.__class__.__name__, click_model)

    # print "-------------------------------"