def queryRanker(self):
    # Extract the high-frequency queries from the training queries
    HighFreqQueries = []
    training_queries = queryClass.load_queries(self.path_train, self.feature_count)
    test_queries = queryClass.load_queries(self.path_test, self.feature_count)
    # Loop through all queries in the training set and keep only the frequent ones
    for index in training_queries.get_qids():
        highQuery = training_queries.get_query(index)
        if len(highQuery.get_labels()) > self.minFreqCount:
            HighFreqQueries.append(highQuery)
    print "found " + str(len(HighFreqQueries)) + " high frequency queries"

    # Build the query-ranker dictionary
    BestRanker = queryRankers()
    user_model = environment.CascadeUserModel(self.clickModel)
    evaluation2 = evaluation.NdcgEval()
    print "Read in training and testing queries"

    # For every high-frequency query, learn rankers and save them to the dictionary.
    # Queries are subsampled with probability self.threshold.
    iteration = 0
    for highQuery in HighFreqQueries:
        iteration += 1
        if random.random() < self.threshold:
            print str(iteration * 100 / len(HighFreqQueries)) + "%"
            for i in xrange(self.rankersPerQuery):
                learner = retrieval_system.ListwiseLearningSystem(
                    self.feature_count,
                    '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
                # Store the initial (random) weight vector for this query
                BestRanker.addInitRank(highQuery.get_qid(), learner.get_solution().w)
                q = highQuery
                # Online learning loop: rank, observe simulated clicks, update
                for t in range(self.iterationCount):
                    l = learner.get_ranked_list(q)
                    c = user_model.get_clicks(l, q.get_labels())
                    s = learner.update_solution(c)
                    e = evaluation2.evaluate_all(s, test_queries)
                # Store the learned weights, last result list, and last evaluation
                BestRanker.add(highQuery.get_qid(), learner.get_solution().w)
                BestRanker.addList(highQuery.get_qid(), l)
                BestRanker.addEval(highQuery.get_qid(), e)

    # Save the query-ranker dictionary to disk and verify it can be read back
    data_path = "QueryData/" + self.dataset + str(self.iterationCount) + ".data"
    with open(data_path, "wb") as out_file:
        pickle.dump(BestRanker, out_file)
    with open(data_path, "rb") as in_file:
        test = pickle.load(in_file)
    print test.query_ranker.values()
def groupRanker(self):
    # Load the precomputed cluster data and the training/test queries
    with open(self.clusterDataPath, "rb") as in_file:
        clusterData = pickle.load(in_file)
    training_queries = queryClass.load_queries(self.path_train, self.feature_count)
    test_queries = queryClass.load_queries(self.path_test, self.feature_count)
    user_model = environment.CascadeUserModel(self.clickModel)
    evaluation2 = evaluation.NdcgEval()
    print "Read in training and testing queries"

    # One learner per cluster; clusters are assumed to be numbered 0..n-1
    learner = [0] * len(clusterData.clusterToRanker.keys())
    for cluster in clusterData.clusterToRanker:
        learner[cluster] = retrieval_system.ListwiseLearningSystem(
            self.feature_count,
            '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')

    # Online learning loop: sample a query, map it to a cluster, and update
    # that cluster's learner with the simulated clicks
    iteration = 0
    for t in range(self.iterationCount):
        q = training_queries[random.choice(training_queries.keys())]
        # Assign the query to the rounded average of its cluster memberships
        cluster = int(float(np.sum(clusterData.queryToCluster[q.get_qid()])) /
                      float(len(clusterData.queryToCluster[q.get_qid()])) + 0.5)
        #cluster = clusterData.queryToCluster[q.get_qid()][0]
        iteration += 1
        if iteration % 200 == 0:
            print str(iteration * 100 / self.iterationCount) + "%"
        l = learner[cluster].get_ranked_list(q)
        c = user_model.get_clicks(l, q.get_labels())
        s = learner[cluster].update_solution(c)
        #e = evaluation2.evaluate_all(s, test_queries)

    # Store the final weight vector of each cluster's learner
    for cluster in clusterData.clusterToRanker:
        clusterData.clusterToRanker[cluster] = [learner[cluster].get_solution().w.tolist()]

    # Save the updated cluster data to disk
    with open("ClusterData/" + self.dataset + ".data", "wb") as out_file:
        pickle.dump(clusterData, out_file)
def __init__(self, queries, feature_count, log_fh, args):
    """Initialize an experiment using the provided arguments."""
    self.log_fh = log_fh
    self.queries = queries
    self.feature_count = feature_count
    self.ties = "first"
    # construct experiment according to provided arguments
    self.result_length = args["result_length"]
    self.num_queries = args["num_queries"]
    self.query_sampling_method = args["query_sampling_method"]
    self.um_class = get_class(args["user_model"])
    self.um_args = args["user_model_args"]
    self.um = self.um_class(self.um_args)
    # set up methods to compare
    parser = argparse.ArgumentParser(description="parse arguments of an "
                                     "evaluation method.",
                                     prog="evaluation method configuration")
    parser.add_argument("-c", "--class_name")
    parser.add_argument("-r", "--ranker")
    parser.add_argument("-a", "--ranker_args")
    parser.add_argument("-i", "--interleave_method")
    self.rankers = {}
    self.live_methods = {}
    self.hist_methods = {}
    self.ndcg = evaluation.NdcgEval()
    # init live methods
    if "live_evaluation_methods" in args:
        for method_id, method in enumerate(args["live_evaluation_methods"]):
            self.live_methods[method] = {}
            method_args_str = args["live_evaluation_methods_args"][method_id]
            method_args = vars(parser.parse_known_args(method_args_str.split())[0])
            class_name = method_args["class_name"]
            self.live_methods[method]["instance"] = get_class(class_name)(method_args_str)
            ranker = method_args["ranker"]
            ranker_args = method_args["ranker_args"]
            self.live_methods[method]["ranker"] = ranker
            self.live_methods[method]["ranker_args"] = ranker_args
            if ranker not in self.rankers:
                self.rankers[ranker] = {}
            if ranker_args not in self.rankers[ranker]:
                self.rankers[ranker][ranker_args] = {}
    # init hist methods
    if "hist_evaluation_methods" in args:
        for method_id, method in enumerate(args["hist_evaluation_methods"]):
            self.hist_methods[method] = {}
            method_args_str = args["hist_evaluation_methods_args"][method_id]
            method_args = vars(parser.parse_known_args(method_args_str.split())[0])
            class_name = method_args["class_name"]
            self.hist_methods[method]["instance"] = get_class(class_name)(method_args_str)
            ranker = method_args["ranker"]
            ranker_args = method_args["ranker_args"]
            self.hist_methods[method]["ranker"] = ranker
            self.hist_methods[method]["ranker_args"] = ranker_args
            if ranker not in self.rankers:
                self.rankers[ranker] = {}
            if ranker_args not in self.rankers[ranker]:
                self.rankers[ranker][ranker_args] = {}
            self.hist_methods[method]["interleave_method"] = \
                get_class(method_args["interleave_method"])()
    # sample source and target ranker pairs, then create deterministic and
    # probabilistic ranker pairs for them
    self.source_pair = [0, 0]
    self.source_pair[0] = self._sample_ranker_without_replacement(
        self.feature_count, [])
    self.source_pair[1] = self._sample_ranker_without_replacement(
        self.feature_count, [self.source_pair[0]])
    self.target_pair = [0, 0]
    self.target_pair[0] = self._sample_ranker_without_replacement(
        self.feature_count, self.source_pair)
    self.target_pair[1] = self._sample_ranker_without_replacement(
        self.feature_count,
        [self.target_pair[0], self.source_pair[0], self.source_pair[1]])
    # init rankers needed by live and/or hist methods
    for ranker in self.rankers:
        for ranker_args in self.rankers[ranker]:
            self.rankers[ranker][ranker_args]["source"] = self._get_ranker_pair(
                ranker, ranker_args, self.source_pair, self.feature_count, self.ties)
            self.rankers[ranker][ranker_args]["target"] = self._get_ranker_pair(
                ranker, ranker_args, self.target_pair, self.feature_count, self.ties)
import datetime

import numpy as np

# Framework modules (import path assumed; adjust to the project layout)
import environment
import evaluation
import query
import retrieval_system

# init data, query samples, dimensions
train_queries = query.load_queries('../../DATA/NP2004/Fold1/train.txt', 64)
test_queries = query.load_queries('../../DATA/NP2004/Fold1/test.txt', 64)
query_samples = 5000  # how many queries we sample
d = 3   # dimensionality of the low-dimensional REMBO search space
k = 10  # number of independent runs
number_of_evaluation = query_samples / k

# init user model and evaluation method
user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()

rem_ndcg_evaluation_train = []
full_ndcg_evaluation_train = []
rem_ndcg_evaluation_test = []
full_ndcg_evaluation_test = []

for m in range(0, k):
    # For each of the k runs a different random embedding matrix A is drawn,
    # as described in the REMBO paper.
    rem_learner = retrieval_system.ListwiseLearningSystemREMBO(
        64, d,
        '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunctionREMBO -s 3 -d 0.1 -a 0.01')
    # Argument string assumed to mirror the REMBO learner's settings with the
    # standard ranking function, as used by the other scripts in this repository.
    full_learner = retrieval_system.ListwiseLearningSystem(
        64,
        '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
import random
import sys

# Framework modules (import path assumed; adjust to the project layout)
import environment
import evaluation
import query
import retrieval_system

# Parameter grid for the sweep
alphas = [0.01, 0.05, 0.1, 0.5, 1.0]
deltas = [0.05, 0.1, 0.5, 1.0, 2.0]
reps = 5
nrqueries = 500
#alphas = [0.75, 1.0, 1.25, 1.5]
#deltas = [1.75, 2.0, 2.25]
#reps = 5
#nrqueries = 500
factor_k1 = 13.3
factor_k3 = .1

user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluator = evaluation.NdcgEval()
training_queries = query.load_queries(sys.argv[1], 64)
test_queries = query.load_queries(sys.argv[2], 64)


def run(alpha, delta):
    results = []
    for _ in range(reps):
        #learner = retrieval_system.ListwiseLearningSystem(64, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 ranker.model.BM25 -d %.2f -a %.2f' % (delta, alpha))
        learner = retrieval_system.ListwiseLearningSystemWithCandidateSelection(
            64,
            '--num_repetitions 10 --num_candidates 6 --history_length 10 --select_candidate select_candidate_repeated -w random -c comparison.ProbabilisticInterleave --ranker ranker.ProbabilisticRankingFunction --ranker_args 3 ranker.model.BM25 -d %s -a %s --anneal 50' % (delta, alpha))
        for _ in range(nrqueries):
            q = training_queries[random.choice(training_queries.keys())]