Example no. 1
    def queryRanker(self):
        #Extract the high frequency queries from the training_queries
        HighFreqQueries = []
        training_queries = queryClass.load_queries(self.path_train, self.feature_count)
        test_queries = queryClass.load_queries(self.path_test, self.feature_count)
        #loop through all queries in the training set
        for index in training_queries.get_qids():
            highQuery = training_queries.get_query(index)
            #only keep the frequent queries 
            if len(highQuery.__labels__) > self.minFreqCount:
                HighFreqQueries.append(highQuery)
        print "found "+ str(len(HighFreqQueries)) + " high frequency queries"

        #build the query-ranker dictionary
        BestRanker = queryRankers()

        user_model = environment.CascadeUserModel(self.clickModel)
        evaluation2 = evaluation.NdcgEval()
        #test_queries = query.load_queries(sys.argv[2], feature_count)
        print "Read in training and testing queries"
        #for every query learn the best ranker and save it to the dictionary
        iter=0
        for highQuery in HighFreqQueries:
            ran=random.random()
            iter=iter+1
            if ran<self.threshold:
                print str(iter*100/len(HighFreqQueries))+"%"
                for i in xrange(self.rankersPerQuery):
                    learner = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
                    BestRanker.addInitRank(highQuery.get_qid(),learner.get_solution().w)
                    q = highQuery
                    for t in range(self.iterationCount):
                        l = learner.get_ranked_list(q)
                        c = user_model.get_clicks(l, q.get_labels())
                        s = learner.update_solution(c)
                        e = evaluation2.evaluate_all(s, test_queries)
                    
    
                    BestRanker.add(highQuery.get_qid(),learner.get_solution().w)
                    BestRanker.addList(highQuery.get_qid(),l)
                    BestRanker.addEval(highQuery.get_qid(),e)

        #save the dictionary to a file ('bestRanker.p')
        paths=self.path_train.split('/')
        name=paths[1]
        #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) )
        pickle.dump(BestRanker, open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "wb" ) )
        test = pickle.load( open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "rb" ) )
        print test.query_ranker.values()
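
queryRanker() above ends by pickling the queryRankers object, whose query_ranker attribute appears to map each qid to the list of weight vectors learned for that query (that is how add() is used above). A minimal sketch of reading the file back under that assumption; the dataset name, iteration count, and the averaging step are illustrative only:

import pickle
import numpy as np

# Hypothetical dataset name and iteration count; queryRanker() builds the
# path the same way from self.dataset and self.iterationCount.
data_path = "QueryData/" + "NP2004" + str(100) + ".data"
best_ranker = pickle.load(open(data_path, "rb"))

for qid in best_ranker.query_ranker:
    weights = best_ranker.query_ranker[qid]
    # Averaging the per-run weight vectors is just one way to collapse them
    # into a single ranker per query (illustration only, not in the original).
    mean_w = np.mean(np.asarray(weights), axis=0)
    print str(qid) + ": " + str(mean_w[:5])
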
Example no. 2
    def groupRanker(self):
        #Extract the high frequency queries from the training_queries
        clusterData=pickle.load(open( self.clusterDataPath, "rb" ) )
        queryData= self.queryData

        
        HighFreqQueries = []
        training_queries = queryClass.load_queries(self.path_train, self.feature_count)
        test_queries = queryClass.load_queries(self.path_test, self.feature_count)
        #loop through all queries in the training set
        

        #build the query-ranker dictionary
        BestRanker = queryRankers()

        user_model = environment.CascadeUserModel(self.clickModel)
        evaluation2 = evaluation.NdcgEval()
        #test_queries = query.load_queries(sys.argv[2], feature_count)
        print "Read in training and testing queries"
        #for every query learn the best ranker and save it to the dictionary
        iter=0
        learner=[0]*len(clusterData.clusterToRanker.keys())
        for cluster in clusterData.clusterToRanker:
            learner[cluster] = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')  
        for t in range(self.iterationCount):
            q = training_queries[random.choice(training_queries.keys())]
            temp=(float(np.sum(clusterData.queryToCluster[q.get_qid()])))/(float(len(clusterData.queryToCluster[q.get_qid()])))
            temp=int(temp+0.5)
            cluster=temp
            #cluster=clusterData.queryToCluster[q.get_qid()][0]
            
            iter=iter+1
            if iter%200==0:
                print str(iter*100/self.iterationCount)+"%"
            l = learner[cluster].get_ranked_list(q)
            c = user_model.get_clicks(l, q.get_labels())
            s = learner[cluster].update_solution(c)
            #e = evaluation2.evaluate_all(s, test_queries)
        for cluster in clusterData.clusterToRanker:
            clusterData.clusterToRanker[cluster] = [learner[cluster].get_solution().w.tolist()]

        #save the dictionary to a file ('bestRanker.p')
        paths=self.path_train.split('/')
        name=paths[1]
        #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) )
        pickle.dump(clusterData, open( "ClusterData/"+self.dataset+".data", "wb" ) )
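
groupRanker() assigns each sampled query to a cluster by averaging the cluster ids stored for it in clusterData.queryToCluster and rounding to the nearest integer. A small standalone sketch of that selection rule (the query id and assignments are made up):

import numpy as np

# Hypothetical cluster assignments for one query id.
query_to_cluster = {"42": [2, 3, 3]}

assignments = query_to_cluster["42"]
# Same rounding rule as in groupRanker(): mean of the assigned cluster ids,
# shifted by 0.5 and truncated, i.e. rounded to the nearest integer.
cluster = int(float(np.sum(assignments)) / float(len(assignments)) + 0.5)
print cluster  # -> 3, so learner[3] would be updated for this query
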
Example no. 3
 def __init__(self, queries, feature_count, log_fh, args):
     """Initialize an experiment using the provided arguments."""
     self.log_fh = log_fh
     self.queries = queries
     self.feature_count = feature_count
     self.ties = "first"
     # construct experiment according to provided arguments
     self.result_length = args["result_length"]
     self.num_queries = args["num_queries"]
     self.query_sampling_method = args["query_sampling_method"]
     self.um_class = get_class(args["user_model"])
     self.um_args = args["user_model_args"]
     self.um = self.um_class(self.um_args)
     # set up methods to compare
     parser = argparse.ArgumentParser(description="parse arguments of an "
         "evaluation method.", prog="evaluation method configuration")
     parser.add_argument("-c", "--class_name")
     parser.add_argument("-r", "--ranker")
     parser.add_argument("-a", "--ranker_args")
     parser.add_argument("-i", "--interleave_method")
     self.rankers = {}
     self.live_methods = {}
     self.hist_methods = {}
     self.ndcg = evaluation.NdcgEval()
     # init live methods
     if "live_evaluation_methods" in args:
         for method_id, method in enumerate(
                 args["live_evaluation_methods"]):
             self.live_methods[method] = {}
             method_args_str = \
                 args["live_evaluation_methods_args"][method_id]
             method_args = vars(parser.parse_known_args(
                 method_args_str.split())[0])
             class_name = method_args["class_name"]
             self.live_methods[method]["instance"] = \
                 get_class(class_name)(method_args_str)
             ranker = method_args["ranker"]
             ranker_args = method_args["ranker_args"]
             self.live_methods[method]["ranker"] = ranker
             self.live_methods[method]["ranker_args"] = ranker_args
             if not ranker in self.rankers:
                 self.rankers[ranker] = {}
             if not ranker_args in self.rankers[ranker]:
                 self.rankers[ranker][ranker_args] = {}
     # init hist methods
     if "hist_evaluation_methods" in args:
         for method_id, method in enumerate(
                 args["hist_evaluation_methods"]):
             self.hist_methods[method] = {}
             method_args_str = \
                 args["hist_evaluation_methods_args"][method_id]
             method_args = vars(parser.parse_known_args(
                 method_args_str.split())[0])
             class_name = method_args["class_name"]
             self.hist_methods[method]["instance"] = \
                 get_class(class_name)(method_args_str)
             ranker = method_args["ranker"]
             ranker_args = method_args["ranker_args"]
             self.hist_methods[method]["ranker"] = method_args["ranker"]
             self.hist_methods[method]["ranker_args"] = \
                 method_args["ranker_args"]
             if not ranker in self.rankers:
                 self.rankers[ranker] = {}
             if not ranker_args in self.rankers[ranker]:
                 self.rankers[ranker][ranker_args] = {}
             self.hist_methods[method]["interleave_method"] = \
             get_class(method_args["interleave_method"])()
     # sample source and target ranker pair, create deterministic and
     # probabilistic ranker pairs
     self.source_pair = [0, 0]
     self.source_pair[0] = self._sample_ranker_without_replacement(
         self.feature_count, [])
     self.source_pair[1] = self._sample_ranker_without_replacement(
         self.feature_count, [self.source_pair[0]])
     self.target_pair = [0, 0]
     self.target_pair[0] = self._sample_ranker_without_replacement(
         self.feature_count, self.source_pair)
     self.target_pair[1] = self._sample_ranker_without_replacement(
         self.feature_count, [self.target_pair[0], self.source_pair[0],
         self.source_pair[1]])
     # init rankers needed by live and/or hist methods
     for ranker in self.rankers:
         for ranker_args in self.rankers[ranker]:
             self.rankers[ranker][ranker_args]["source"] = \
                 self._get_ranker_pair(ranker, ranker_args,
                 self.source_pair, self.feature_count, self.ties)
             self.rankers[ranker][ranker_args]["target"] = \
                 self._get_ranker_pair(ranker, ranker_args,
                 self.target_pair, self.feature_count, self.ties)
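
Each live or historical evaluation method is configured through a single argument string that the local argparse parser turns into class_name, ranker, ranker_args and interleave_method. A standalone sketch of that parsing step; the configuration string below is made up, while the real ones come from args["live_evaluation_methods_args"] and args["hist_evaluation_methods_args"]:

import argparse

parser = argparse.ArgumentParser(description="parse arguments of an "
    "evaluation method.", prog="evaluation method configuration")
parser.add_argument("-c", "--class_name")
parser.add_argument("-r", "--ranker")
parser.add_argument("-a", "--ranker_args")
parser.add_argument("-i", "--interleave_method")

# Hypothetical method configuration string.
method_args_str = ("-c comparison.ProbabilisticInterleave "
                   "-r ranker.ProbabilisticRankingFunction -a 3 "
                   "-i comparison.TeamDraft")
method_args = vars(parser.parse_known_args(method_args_str.split())[0])
print method_args["class_name"]   # comparison.ProbabilisticInterleave
print method_args["ranker"]       # ranker.ProbabilisticRankingFunction
print method_args["ranker_args"]  # 3
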
Example no. 4
import datetime
import numpy as np

# init data, query_samples, d's
train_queries = query.load_queries('../../DATA/NP2004/Fold1/train.txt', 64)
test_queries = query.load_queries('../../DATA/NP2004/Fold1/test.txt', 64)
query_samples = 5000  # how many queries we sample

d = 3
k = 10
number_of_evaluation = query_samples / k

# init user model, evaluation methods
user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()

rem_ndcg_evaluation_train = []
full_ndcg_evaluation_train = []
rem_ndcg_evaluation_test = []
full_ndcg_evaluation_test = []

for m in range(0, k):
    # for each k, we have different A matrix
    # as mentioned on the REMBO paper
    rem_learner = retrieval_system.ListwiseLearningSystemREMBO(
        64, d,
        '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunctionREMBO -s 3 -d 0.1 -a 0.01'
    )
    full_learner = retrieval_system.ListwiseLearningSystem(
        64,
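
The excerpt of this example is cut off above. Its "different A matrix" comment refers to REMBO-style random embeddings: the learner only updates a d-dimensional vector y, and a fixed random matrix A maps it back into the full 64-dimensional weight space, w = A y. A minimal sketch of that projection, independent of the learning system above (the Gaussian initialisation is an assumption; only the shapes follow the 64 and d = 3 settings used here):

import numpy as np

D, d = 64, 3                 # full and low-dimensional sizes, as above
A = np.random.randn(D, d)    # one random embedding per restart (assumed Gaussian)

y = np.random.randn(d)       # low-dimensional vector the learner actually explores
w = np.dot(A, y)             # back-projection into the full feature space
print w.shape                # (64,), usable as a ranker weight vector
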
Example no. 5
alphas = [0.01, 0.05, 0.1, 0.5, 1.0]
deltas = [0.05, 0.1, 0.5, 1.0, 2.0]
reps = 5
nrqueries = 500

#alphas = [0.75, 1.0, 1.25, 1.5]
#deltas = [1.75, 2.0, 2.25]
#reps = 5
#nrqueries = 500

factor_k1 = 13.3
factor_k3 = .1

user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluator = evaluation.NdcgEval()

training_queries = query.load_queries(sys.argv[1], 64)
test_queries = query.load_queries(sys.argv[2], 64)


def run(alpha, delta):
    results = []
    for _ in range(reps):
        #learner = retrieval_system.ListwiseLearningSystem(64, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 ranker.model.BM25 -d %.2f -a %.2f' % (delta, alpha))
        learner = retrieval_system.ListwiseLearningSystemWithCandidateSelection(
            64,
            '--num_repetitions 10 --num_candidates 6 --history_length 10 --select_candidate select_candidate_repeated -w random -c comparison.ProbabilisticInterleave --ranker ranker.ProbabilisticRankingFunction --ranker_args 3 ranker.model.BM25 -d %s -a %s --anneal 50'
            % (delta, alpha))
        for _ in range(nrqueries):
            q = training_queries[random.choice(training_queries.keys())]
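            # Sketch: the excerpt stops here. Following the interaction loop of
            # Example no. 1, run() plausibly continues by ranking the sampled
            # query, simulating clicks, updating the learner and recording
            # offline NDCG (assumption, not the original code).
            l = learner.get_ranked_list(q)
            c = user_model.get_clicks(l, q.get_labels())
            s = learner.update_solution(c)
        # one offline NDCG score per repetition (placement assumed)
        results.append(evaluator.evaluate_all(s, test_queries))
    return results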