def compareSystems(vali_queries,classifierPath,basic_ranker_path,clust_data_path,data,click): print "-Loading Data-" clf = pickle.load( open( classifierPath ) ) basic_ranker=pickle.load( open( basic_ranker_path ) ) clusterData=pickle.load(open(clust_data_path)) queryData=pickle.load(open(data)) ranker_tie="random" feature_count=basic_ranker.feature_count ranker_args="3" arg_str="" sample_send="sample_unit_sphere" iterations=100 rankers=[0]*2 rankers[0]=basic_ranker user_model = environment.CascadeUserModel(click) training_queries = query.load_queries(vali_queries, feature_count) compar_interleave=ProbabilisticInterleave(None) first_win=0 print "-Calculating-" for i in range(iterations): if i%(iterations/10)==0: print str(float(i)*100/float(iterations))+"%" q = training_queries.get_query(random.choice(queryData.query_ranker.keys())) test=queryData.query_ranker[q.get_qid()][0] testWeights=str(test) testWeights=testWeights.replace("[", "") testWeights=testWeights.replace("]", "") weights = np.array([float(num) for num in testWeights.split(",")]) print len(weights) ranker_tie="random" ranker_args="3" sample_send="sample_unit_sphere" rankers[1]=rankerClass.ProbabilisticRankingFunction(ranker_args, ranker_tie, feature_count, sample=sample_send, init=testWeights) l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10) c = user_model.get_clicks(l, q.get_labels()) o = compar_interleave.infer_outcome(l, a, c, q) if(o<0): first_win+=1 elif(o==0): coin=random.random() if(coin>0.5): first_win+=1 result_com=float(first_win)/float(iterations) print "Basic ranker win rate:"+ str(result_com)
def compareSystems(vali_queries, classifierPath, basic_ranker_path, clust_data_path, click): print "-Loading Data-" clf = pickle.load(open(classifierPath)) basic_ranker = pickle.load(open(basic_ranker_path)) clusterData = pickle.load(open(clust_data_path)) ranker_tie = "random" feature_count = basic_ranker.feature_count ranker_args = "3" arg_str = "" sample_send = "sample_unit_sphere" iterations = 100 rankers = [0] * 2 rankers[0] = basic_ranker user_model = environment.CascadeUserModel(click) training_queries = query.load_queries(vali_queries, feature_count) compar_interleave = ProbabilisticInterleave(None) second_win = 0 second_win_or_e = 0 generic_win = 0 equal = 0 print "-Calculating-" for i in range(iterations): if i % (iterations / 10) == 0: print str(float(i) * 100 / float(iterations)) + "%" q = training_queries[random.choice(training_queries.keys())] rankers[1] = classifier.getRanker(clf, basic_ranker, q, clusterData) l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10) c = user_model.get_clicks(l, q.get_labels()) o = compar_interleave.infer_outcome(l, a, c, q) if (o > 0): second_win += 1 second_win_or_e += 1 elif (o == 0): equal += 1 coin = random.random() if (coin > 0.5): second_win_or_e += 1 else: generic_win += 1 result_com = float(second_win_or_e) / float(iterations) result_win = float(second_win) / float(iterations) result_win_generic = float(generic_win) / float(iterations) print "Our ranker win rate (with random choice if result was equal):" + str( result_com) print "Our ranker win rate:" + str(result_win) print "Generic ranker win rate:" + str(result_win_generic) print "Number win ours:" + str(second_win) print "Number win generic:" + str(generic_win) print "Number equal:" + str(equal) print "Total number iterations:" + str(iterations)
def queryRanker(self): #Extract the high frequency queries from the training_queries HighFreqQueries = [] training_queries = queryClass.load_queries(self.path_train, self.feature_count) test_queries = queryClass.load_queries(self.path_test, self.feature_count) #loop through all queries in the training set for index in training_queries.get_qids(): highQuery = training_queries.get_query(index) #only keep the frequent queries if(len(highQuery.__labels__) > self.minFreqCount): HighFreqQueries.append(highQuery) print "found "+ str(len(HighFreqQueries)) + " high frequency queries" #build the query-ranker dictionary BestRanker = queryRankers() user_model = environment.CascadeUserModel(self.clickModel) evaluation2 = evaluation.NdcgEval() #test_queries = query.load_queries(sys.argv[2], feature_count) print "Read in training and testing queries" #for every query learn the best ranker and save it to the dictionary iter=0 for highQuery in HighFreqQueries: ran=random.random() iter=iter+1 if ran<self.threshold: print str(iter*100/len(HighFreqQueries))+"%" for i in xrange(self.rankersPerQuery): learner = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01') BestRanker.addInitRank(highQuery.get_qid(),learner.get_solution().w) q = highQuery for t in range(self.iterationCount): l = learner.get_ranked_list(q) c = user_model.get_clicks(l, q.get_labels()) s = learner.update_solution(c) e = evaluation2.evaluate_all(s, test_queries) BestRanker.add(highQuery.get_qid(),learner.get_solution().w) BestRanker.addList(highQuery.get_qid(),l) BestRanker.addEval(highQuery.get_qid(),e) #save the dictionary to a file ('bestRanker.p') paths=self.path_train.split('/') name=paths[1] #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) ) pickle.dump(BestRanker, open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "wb" ) ) test = pickle.load( open( 
"QueryData/"+self.dataset+str(self.iterationCount)+".data", "rb" ) ) print test.query_ranker.values()
def groupRanker(self): #Extract the high frequency queries from the training_queries clusterData=pickle.load(open( self.clusterDataPath, "rb" ) ) queryData= self.queryData HighFreqQueries = [] training_queries = queryClass.load_queries(self.path_train, self.feature_count) test_queries = queryClass.load_queries(self.path_test, self.feature_count) #loop through all queries in the training set #build the query-ranker dictionary BestRanker = queryRankers() user_model = environment.CascadeUserModel(self.clickModel) evaluation2 = evaluation.NdcgEval() #test_queries = query.load_queries(sys.argv[2], feature_count) print "Read in training and testing queries" #for every query learn the best ranker and save it to the dictionary iter=0 learner=[0]*len(clusterData.clusterToRanker.keys()) for cluster in clusterData.clusterToRanker: learner[cluster] = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01') for t in range(self.iterationCount): q = training_queries[random.choice(training_queries.keys())] temp=(float(np.sum(clusterData.queryToCluster[q.get_qid()])))/(float(len(clusterData.queryToCluster[q.get_qid()]))) temp=int(temp+0.5) cluster=temp #cluster=clusterData.queryToCluster[q.get_qid()][0] iter=iter+1 if iter%200==0: print str(iter*100/self.iterationCount)+"%" l = learner[cluster].get_ranked_list(q) c = user_model.get_clicks(l, q.get_labels()) s = learner[cluster].update_solution(c) #e = evaluation2.evaluate_all(s, test_queries) for cluster in clusterData.clusterToRanker: clusterData.clusterToRanker[cluster]=[learner[cluster].get_solution().w.tolist()] #save the dictionary to a file ('bestRanker.p') paths=self.path_train.split('/') name=paths[1] #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) ) pickle.dump(clusterData, open( "ClusterData/"+self.dataset+".data", "wb" ) )
def perform(): # init test_num_features = 64 queries = query.load_queries('../../data/NP2004/Fold1/test.txt', 64) bi = comparison.BalancedInterleave() user_model = environment.CascadeUserModel( '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0') # make rankers rankers = [] for i in range(0, 5): rankers.append( ranker.ProbabilisticRankingFunction( '3', 'random', 64, init=parseRanker('../../data/features64/ranker-0' + str(i) + '.txt'), sample='sample_unit_sphere')) # main loop for N in [100, 1000, 10000]: pref_matrix = [[0 for x in xrange(5)] for x in xrange(5)] for iter in range(0, N): q = queries[random.choice(queries.keys())] for i in range(0, 5): for j in range(0, 5): if i != j: list, context = bi.interleave(rankers[i], rankers[j], q, 10) clicks = user_model.get_clicks(list, q.get_labels()) result = bi.infer_outcome(list, context, clicks, q) if result < 0: pref_matrix[i][j] += 1 else: pref_matrix[i][j] = 0.50 pref_matrix = generateProbabilityMatrix(pref_matrix, N) printMatrix(pref_matrix) print 'Best ranker is ' + '0' + str( getBestRanker(pref_matrix)) + ' (N = ' + str(N) + ').' print 'done!'
def __init__(self, data_path, features):
    """Set up the interleaving evaluation harness.

    data_path -- path to the query file to load
    features  -- number of features per document vector
    """
    # The three members are independent; load the queries first since it is
    # the only step that touches disk.
    self.queries = query.load_queries(data_path, features)
    self.user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
    self.bi = comparison.BalancedInterleave()
# REMBO-vs-full-space online learning experiment (script top level): trains a
# low-dimensional (d=3) REMBO learner and a full 64-dimensional learner over
# k=10 restarts, collecting NDCG on train/test queries.
# NOTE(review): this chunk is truncated mid-statement -- the final
# "full_learner = retrieval_system.ListwiseLearningSystem(" is cut off before
# its argument list closes, and the collapsed line is not valid Python as-is
# (e.g. "import retrieval_system import time"). Code left byte-identical;
# reconstruct only once the missing remainder is visible.
import environment, evaluation, query, retrieval_system import time import datetime import numpy as np # init data, query_samples, d's train_queries = query.load_queries('../../DATA/NP2004/Fold1/train.txt', 64) test_queries = query.load_queries('../../DATA/NP2004/Fold1/test.txt', 64) query_samples = 5000 # how many queries we sample d = 3 k = 10 number_of_evaluation = query_samples / k # init user model, evaluation methods user_model = environment.CascadeUserModel( '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0') evaluation = evaluation.NdcgEval() rem_ndcg_evaluation_train = [] full_ndcg_evaluation_train = [] rem_ndcg_evaluation_test = [] full_ndcg_evaluation_test = [] for m in range(0, k): # for each k, we have different A matrix # as mentioned on the REMBO paper rem_learner = retrieval_system.ListwiseLearningSystemREMBO( 64, d, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunctionREMBO -s 3 -d 0.1 -a 0.01' ) full_learner = retrieval_system.ListwiseLearningSystem(
def compareSystemsHist(vali_queries,classifierPath,basic_ranker_path,clust_data_path,data,click): print "-Loading Data-" clf = pickle.load( open( classifierPath ) ) basic_ranker=pickle.load( open( basic_ranker_path ) ) clusterData=pickle.load(open(clust_data_path)) queryData=pickle.load(open(data)) ranker_tie="random" feature_count=basic_ranker.feature_count ranker_args="3" arg_str="" sample_send="sample_unit_sphere" iterations=100 rankers=[0]*2 rankers[0]=basic_ranker user_model = environment.CascadeUserModel(click) training_queries = query.load_queries(vali_queries, feature_count) compar_interleave=ProbabilisticInterleave(None) print "-Calculating-" ii=0 results=[] for qid in queryData.query_ranker.keys(): print str(float(ii)*100/float(len(queryData.query_ranker.keys())))+"%" ii+=1 q=training_queries.get_query(qid) for val in queryData.query_ranker[qid]: test=val #test=queryData.query_ranker[q][0] testWeights=str(test.tolist()) testWeights=testWeights.replace("[", "") testWeights=testWeights.replace("]", "") #weights = np.array([float(num) for num in testWeights.split(",")]) #print len(weights) ranker_tie="random" ranker_args="3" sample_send="sample_unit_sphere" rankers[1]=rankerClass.ProbabilisticRankingFunction(ranker_args, ranker_tie, feature_count, sample=sample_send, init=testWeights) second_win=0 for i in range(iterations): #q = training_queries.get_query(random.choice(training_queries.keys())) l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10) c = user_model.get_clicks(l, q.get_labels()) o = compar_interleave.infer_outcome(l, a, c, q) if(o>0): second_win+=1 elif(o==0): coin=random.random() if(coin>0.5): second_win+=1 result_com=float(second_win)/float(iterations) results.append(result_com) g=P.hist(results, bins = 20,range=[0,1]) P.xlabel("The win rate of the ranker",fontsize=20) P.ylabel("Number of rankers",fontsize=20) P.show(g)