reward_method = "positive" click_models = ["informational", "perfect"] # click_models = ["informational"] # dataset_fold = "../datasets/2007_mq_dataset" dataset_fold = "../datasets/MSLR10K" output_fold = "results/mslr10k/MDP_with_SGD_optimizer/MDP_001_positive_naive_gamma01" print("reward:", reward_method, "lr:", learning_rate, "eta:", eta, output_fold, "gamma", gamma) # for 5 folds for f in range(1, 6): # training_path = "{}/set1.train.txt".format(dataset_fold) # test_path = "{}/set1.test.txt".format(dataset_fold) training_path = "{}/Fold{}/train.txt".format(dataset_fold, f) test_path = "{}/Fold{}/test.txt".format(dataset_fold, f) train_set = LetorDataset(training_path, FEATURE_SIZE, query_level_norm=True) test_set = LetorDataset(test_path, FEATURE_SIZE, query_level_norm=True) # %% processors = [] # for 3 click_models for click_model in click_models: p = mp.Process(target=job, args=(click_model, learning_rate, eta, gamma, reward_method, f, train_set, test_set, FEATURE_SIZE, output_fold)) p.start() processors.append(p) for p in processors: p.join()
from clickModel.FBNCM import FBNCM
from utils import read_file as rf
from utils import utility
from clickModel.SDBN import SDBN
import multiprocessing as mp
from dataset import LetorDataset


def job(click_log_path, output_path, simulator, dataset):
    model = FBNCM(64, 700, 700, dataset)
    click_log = rf.read_click_log(click_log_path)
    model.initial_representation(click_log)
    model.save_training_tfrecord(click_log, output_path, simulator)


if __name__ == "__main__":
    # simulators = ["SDBN", "DCTR", "UBM", "Mixed"]
    simulators = ["SDBN_reverse"]
    dataset_path = "../datasets/ltrc_yahoo/set1.train.txt"
    print("loading training set.......")
    dataset = LetorDataset(dataset_path, 700)

    for r in range(2, 16):
        pool = []  # unused; the loop below runs each job sequentially
        for simulator in simulators:
            click_log_path = "../click_logs/{}/train_set{}.txt".format(simulator, r)
            output_path = "../click_logs/{}/train_set{}_FBNCM.tfrecord".format(simulator, r)
            job(click_log_path, output_path, simulator, dataset)
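# Hedged sketch: `mp` is imported and `pool` is initialised above but never
# used, which suggests a parallel variant was intended. Assuming the same
# `job`, `simulators`, and `dataset` as above, it could look like this
# (one process per simulator, joined per repetition):
def run_parallel(simulators, dataset):
    for r in range(2, 16):
        pool = []
        for simulator in simulators:
            click_log_path = "../click_logs/{}/train_set{}.txt".format(simulator, r)
            output_path = "../click_logs/{}/train_set{}_FBNCM.tfrecord".format(simulator, r)
            p = mp.Process(target=job,
                           args=(click_log_path, output_path, simulator, dataset))
            p.start()
            pool.append(p)
        for p in pool:
            p.join()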
line += str(int(c)) + " " line += "\n" f.write(line) # if index % 10000 == 0: # print("write %d/%d queries" % (index, num_queries)) f.close() print(cm.name, "unseen_set finished!") # %% if __name__ == "__main__": # %% train_path = "../datasets/ltrc_yahoo/set1.train.txt" test_path = "../datasets/ltrc_yahoo/set1.test.txt" print("loading training set.......") train_set = LetorDataset(train_path, 700) print("loading testing set.......") test_set = LetorDataset(test_path, 700) # %% # pc = [0.4, 0.6, 0.7, 0.8, 0.9] # ps = [0.1, 0.2, 0.3, 0.4, 0.5] pc = [0.05, 0.3, 0.5, 0.7, 0.95] ps = [0.2, 0.3, 0.5, 0.7, 0.9] for id in range(1, 16): p1 = mp.Process(target=generate_dataset, args=(train_set, test_set, DCTR(pc), "../feature_click_datasets/DCTR/", id)) p2 = mp.Process(target=generate_dataset, args=(train_set, test_set, CM(pc), "../feature_click_datasets/CM/", id)) p3 = mp.Process(target=generate_dataset,
from dataset import LetorDataset
import numpy as np
from clickModel.LSTMv2 import LSTMv2
from utils import read_file as rf
from clickModel.DCTR import DCTR

train_path = "../datasets/ltrc_yahoo/test_set.txt"
print("loading training set.......")
train_set = LetorDataset(train_path, 700)

click_log_path = "../datasets/ltrc_yahoo/test_click_log.txt"
test_click_log_path = "../datasets/ltrc_yahoo/test_click_log_test.txt"
click_log = rf.read_click_log(click_log_path)
test_click_log = rf.read_click_log(test_click_log_path)

pc = [0.05, 0.3, 0.5, 0.7, 0.95]
ps = [0.2, 0.3, 0.5, 0.7, 0.9]
simulator = DCTR(pc)

print(click_log.shape)
print(test_click_log.shape)

# %%
# this line was commented out in the original, which left click_model
# undefined before the train() call below
click_model = LSTMv2(700, 1024, train_set)
click_model.train(click_log)
print(click_model.get_MSE(
    test_click_log[np.random.choice(test_click_log.shape[0], 100)],
    train_set, simulator))
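# Note (not in the original script): get_MSE above is evaluated on a random
# sample of 100 test sessions, so the printed value varies between runs.
# Seeding NumPy's global RNG first makes the sample, and hence the MSE,
# reproducible:
np.random.seed(42)  # hypothetical seed; any fixed value works
eval_idx = np.random.choice(test_click_log.shape[0], 100)
print(click_model.get_MSE(test_click_log[eval_idx], train_set, simulator))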
gamma = 0.0
reward_method = "both"
click_models = ["informational", "perfect"]
# click_models = ["informational"]

# dataset_fold = "../datasets/2007_mq_dataset"
dataset_fold = "../datasets/MSLR10K"
output_fold = "results/mslr10k/MDP_with_SGD_optimizer/MDP_001_both_one_at_time"

print("reward:", reward_method, "lr:", learning_rate, "eta:", eta, output_fold, "gamma", gamma)

# for 5 folds
for f in range(1, 6):
    # training_path = "{}/set1.train.txt".format(dataset_fold)
    # test_path = "{}/set1.test.txt".format(dataset_fold)
    training_path = "{}/Fold{}/train.txt".format(dataset_fold, f)
    test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
    train_set = LetorDataset(training_path, FEATURE_SIZE,
                             query_level_norm=True,
                             cache_root="../datasets/cache")
    test_set = LetorDataset(test_path, FEATURE_SIZE,
                            query_level_norm=True,
                            cache_root="../datasets/cache")

    for click_model in click_models:
        p = mp.Process(target=job,
                       args=(click_model, learning_rate, eta, gamma,
                             reward_method, f, train_set, test_set,
                             FEATURE_SIZE, output_fold))
        p.start()
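        # NOTE (added): the fragment stops after p.start(); by analogy with
        # the "positive" runner earlier in this section, the workers would
        # presumably be collected in a list and joined once the loop finishes.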
print(get_qrel_avg_num_of_rel(dic0))
dic1 = read_intent_qrel("1.txt")
print(get_qrel_avg_num_of_rel(dic1))
dic2 = read_intent_qrel("2.txt")
print(get_qrel_avg_num_of_rel(dic2))
dic3 = read_intent_qrel("3.txt")
print(get_qrel_avg_num_of_rel(dic3))
dic4 = read_intent_qrel("4.txt")
print(get_qrel_avg_num_of_rel(dic4))
dic5 = read_intent_qrel("5.txt")
print(get_qrel_avg_num_of_rel(dic5))

dataset_fold = "../datasets/clueweb09/ClueWeb09-TREC-LTR.txt"
# dataset_fold = "../datasets/clueweb09/clueweb09_intent_change.txt"
train_set = LetorDataset(dataset_fold, 91,
                         query_level_norm=True,
                         binary_label=True)

train_set.update_relevance_label(dic0)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic1)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic2)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic3)
print(get_dataset_avg_num_of_rel(train_set))
train_set.update_relevance_label(dic4)
print(get_dataset_avg_num_of_rel(train_set))

train_set.write_cross_validation_datasets("datasets/Intent_change_Lin", 5)
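# Hedged sketch: `read_intent_qrel` and `get_qrel_avg_num_of_rel` are defined
# elsewhere in the repo. Assuming the intent files ("1.txt" .. "5.txt") follow
# the four-column TREC qrel format ("qid iteration docid relevance"), minimal
# versions could look like this:
def read_intent_qrel(path):
    """Parse a TREC-style qrel file into {qid: {docid: relevance}}."""
    qrel = {}
    with open(path) as f:
        for line in f:
            qid, _, docid, rel = line.split()
            qrel.setdefault(qid, {})[docid] = int(rel)
    return qrel


def get_qrel_avg_num_of_rel(qrel):
    """Average number of relevant (label > 0) documents per query."""
    counts = [sum(1 for rel in docs.values() if rel > 0) for docs in qrel.values()]
    return sum(counts) / len(counts)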