# Esempio n. 1 (Example 1)
# 0
    # Reward shaping mode for the MDP ranker run ("positive" rewards only).
    reward_method = "positive"
    # Click simulation models; one worker process is spawned per model.
    click_models = ["informational", "perfect"]
    # click_models = ["informational"]
    # dataset_fold = "../datasets/2007_mq_dataset"
    dataset_fold = "../datasets/MSLR10K"
    output_fold = "results/mslr10k/MDP_with_SGD_optimizer/MDP_001_positive_naive_gamma01"
    # learning_rate, eta, gamma, FEATURE_SIZE and job() are defined earlier
    # in the file, outside this excerpt.
    print("reward:", reward_method, "lr:", learning_rate, "eta:", eta,
          output_fold, "gamma", gamma)
    # for 5 folds
    for f in range(1, 6):
        # training_path = "{}/set1.train.txt".format(dataset_fold)
        # test_path = "{}/set1.test.txt".format(dataset_fold)
        training_path = "{}/Fold{}/train.txt".format(dataset_fold, f)
        test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
        train_set = LetorDataset(training_path,
                                 FEATURE_SIZE,
                                 query_level_norm=True)
        test_set = LetorDataset(test_path, FEATURE_SIZE, query_level_norm=True)
        # %%
        # NOTE(review): `processors` is re-created on every fold, so the join
        # loop below only waits on the last fold's processes; workers from
        # earlier folds are never joined — confirm this is intended.
        processors = []
        # for 3 click_models
        for click_model in click_models:
            p = mp.Process(target=job,
                           args=(click_model, learning_rate, eta, gamma,
                                 reward_method, f, train_set, test_set,
                                 FEATURE_SIZE, output_fold))
            p.start()
            processors.append(p)
    for p in processors:
        p.join()
from clickModel.FBNCM import FBNCM
from utils import read_file as rf
from utils import utility
from clickModel.SDBN import SDBN
import multiprocessing as mp
from dataset import LetorDataset


def job(click_log_path, output_path, simulator, dataset):
    """Convert one click log into FBNCM training TFRecords.

    Reads the click log at *click_log_path*, builds an FBNCM click model
    over *dataset*, initializes its input representation from the log, and
    writes the resulting training records to *output_path* using
    *simulator* to label them.
    """
    log_entries = rf.read_click_log(click_log_path)
    fbncm = FBNCM(64, 700, 700, dataset)
    fbncm.initial_representation(log_entries)
    fbncm.save_training_tfrecord(log_entries, output_path, simulator)


if __name__ == "__main__":

    # Click simulators whose logs should be converted to TFRecords.
    # simulators = ["SDBN", "DCTR", "UBM", "Mixed"]
    simulators = ["SDBN_reverse"]
    dataset_path = "../datasets/ltrc_yahoo/set1.train.txt"
    print("loading training set.......")
    # Yahoo! LTR set1 uses 700 features.
    dataset = LetorDataset(dataset_path, 700)
    # Convert logs 2..15 for each simulator, sequentially.
    # (Fixed: the original allocated an unused `pool = []` per iteration —
    # no processes were ever pooled; job() runs synchronously.)
    for r in range(2, 16):
        for simulator in simulators:
            click_log_path = "../click_logs/{}/train_set{}.txt".format(
                simulator, r)
            output_path = "../click_logs/{}/train_set{}_FBNCM.tfrecord".format(
                simulator, r)
            job(click_log_path, output_path, simulator, dataset)
# Esempio n. 3 (Example 3)
# 0
            # Append this document's click value (presumably a 0/1 flag —
            # c comes from outside this excerpt) to the output line.
            line += str(int(c)) + " "
        line += "\n"
        f.write(line)
        # if index % 10000 == 0:
        # print("write %d/%d queries" % (index, num_queries))
    # NOTE(review): f is closed manually; a with-block (or try/finally)
    # would guarantee the close on error paths.
    f.close()
    print(cm.name, "unseen_set finished!")


# %%
if __name__ == "__main__":
    # %%
    train_path = "../datasets/ltrc_yahoo/set1.train.txt"
    test_path = "../datasets/ltrc_yahoo/set1.test.txt"
    print("loading training set.......")
    # Yahoo! LTR set1 uses 700 features.
    train_set = LetorDataset(train_path, 700)
    print("loading testing set.......")
    test_set = LetorDataset(test_path, 700)
    # %%
    # Per-rank probabilities fed to the click simulators (pc presumably
    # click probabilities, ps stop/satisfaction probabilities — confirm
    # against the DCTR/CM constructors defined elsewhere).
    # pc = [0.4, 0.6, 0.7, 0.8, 0.9]
    # ps = [0.1, 0.2, 0.3, 0.4, 0.5]
    pc = [0.05, 0.3, 0.5, 0.7, 0.95]
    ps = [0.2, 0.3, 0.5, 0.7, 0.9]
    # Spawn one process per click model for each of 15 dataset ids.
    # NOTE(review): `id` shadows the builtin; rename when touching this code.
    for id in range(1, 16):
        p1 = mp.Process(target=generate_dataset,
                        args=(train_set, test_set, DCTR(pc),
                              "../feature_click_datasets/DCTR/", id))
        p2 = mp.Process(target=generate_dataset,
                        args=(train_set, test_set, CM(pc),
                              "../feature_click_datasets/CM/", id))
        # NOTE(review): the excerpt is truncated here — p3's argument list
        # and any start()/join() calls continue beyond this view.
        p3 = mp.Process(target=generate_dataset,
# Esempio n. 4 (Example 4)
# 0
from dataset import LetorDataset
import numpy as np
from clickModel.LSTMv2 import LSTMv2
from utils import read_file as rf
from clickModel.DCTR import DCTR

# Quick sanity check: fit an LSTMv2 click model on a tiny click log and
# report its MSE against a DCTR simulator on 100 random test sessions.
print("loading training set.......")
dataset = LetorDataset("../datasets/ltrc_yahoo/test_set.txt", 700)

# Training and held-out click logs.
train_log = rf.read_click_log("../datasets/ltrc_yahoo/test_click_log.txt")
held_out_log = rf.read_click_log(
    "../datasets/ltrc_yahoo/test_click_log_test.txt")

# Per-rank probabilities for the DCTR simulator (stop_probs is unused here).
click_probs = [0.05, 0.3, 0.5, 0.7, 0.95]
stop_probs = [0.2, 0.3, 0.5, 0.7, 0.9]
simulator = DCTR(click_probs)
print(train_log.shape)
print(held_out_log.shape)
#
model = LSTMv2(700, 1024, dataset)
model.train(train_log)
# Evaluate on 100 sessions sampled with replacement from the held-out log.
sample = held_out_log[np.random.choice(held_out_log.shape[0], 100)]
print(model.get_MSE(sample, dataset, simulator))
# Esempio n. 5 (Example 5)
# 0
    # Discount factor for the MDP ranker (0.0 = use immediate reward only).
    gamma = 0.0
    # "both" presumably combines two reward signals — confirm against the
    # job() implementation defined elsewhere in the file.
    reward_method = "both"
    click_models = ["informational", "perfect"]
    # click_models = ["informational"]
    # dataset_fold = "../datasets/2007_mq_dataset"
    dataset_fold = "../datasets/MSLR10K"
    output_fold = "results/mslr10k/MDP_with_SGD_optimizer/MDP_001_both_one_at_time"
    # learning_rate, eta, FEATURE_SIZE and job() are defined earlier in the
    # file, outside this excerpt.
    print("reward:", reward_method, "lr:", learning_rate, "eta:", eta,
          output_fold, "gamma", gamma)
    # for 5 folds
    for f in range(1, 6):
        # training_path = "{}/set1.train.txt".format(dataset_fold)
        # test_path = "{}/set1.test.txt".format(dataset_fold)
        training_path = "{}/Fold{}/train.txt".format(dataset_fold, f)
        test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
        # Parsed datasets are cached on disk to speed up repeated runs.
        train_set = LetorDataset(training_path,
                                 FEATURE_SIZE,
                                 query_level_norm=True,
                                 cache_root="../datasets/cache")
        test_set = LetorDataset(test_path,
                                FEATURE_SIZE,
                                query_level_norm=True,
                                cache_root="../datasets/cache")

        # One worker process per click model per fold.
        # NOTE(review): the excerpt ends right after start(); the started
        # processes are never join()ed in the visible portion — confirm the
        # full file waits on them.
        for click_model in click_models:
            p = mp.Process(target=job,
                           args=(click_model, learning_rate, eta, gamma,
                                 reward_method, f, train_set, test_set,
                                 FEATURE_SIZE, output_fold))
            p.start()
# Esempio n. 6 (Example 6)
# 0
# read_intent_qrel, get_qrel_avg_num_of_rel, get_dataset_avg_num_of_rel and
# dic0 are defined earlier in the file, outside this excerpt.
# Load the qrels for each intent (1..5) and report the average number of
# relevant documents per query for each.
print(get_qrel_avg_num_of_rel(dic0))
dic1 = read_intent_qrel("1.txt")
print(get_qrel_avg_num_of_rel(dic1))
dic2 = read_intent_qrel("2.txt")
print(get_qrel_avg_num_of_rel(dic2))
dic3 = read_intent_qrel("3.txt")
print(get_qrel_avg_num_of_rel(dic3))
dic4 = read_intent_qrel("4.txt")
print(get_qrel_avg_num_of_rel(dic4))
dic5 = read_intent_qrel("5.txt")
print(get_qrel_avg_num_of_rel(dic5))

# ClueWeb09 LTR dataset with 91 features; labels binarized at load time.
dataset_fold = "../datasets/clueweb09/ClueWeb09-TREC-LTR.txt"
# dataset_fold = "../datasets/clueweb09/clueweb09_intent_change.txt"
train_set = LetorDataset(dataset_fold,
                         91,
                         query_level_norm=True,
                         binary_label=True)

# Relabel the dataset with each intent's qrels in turn, reporting the average
# number of relevant documents after each relabeling (each call presumably
# overwrites the previous labels — confirm against update_relevance_label).
# NOTE(review): dic5 is loaded above but never applied here — confirm whether
# that is intentional.
train_set.update_relevance_label(dic0)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic1)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic2)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic3)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic4)
print(get_dataset_avg_num_of_rel(train_set))

# Write 5-fold cross-validation splits of the relabeled dataset.
train_set.write_cross_validation_datasets("datasets/Intent_change_Lin", 5)