Example #1
def speedTestAndEval():
    rankNet=RankNet(64)
    rankNetMod=RankNetMod(64)

    file='data/Fold2/train.txt'
    queries=load_queries(file,64)
    start=timer()
    rankNet.train_with_queries(queries,4)
    print("- Took %.2f sec to train rankNet " % (timer() - start))

    start=timer()
    rankNetMod.train_with_queries(queries,4)
    print("- Took %.2f sec to train rankNetMod " % (timer() - start))


    print('----Now lets evaluate--------')
    ndcg=NDCG(1)
    testQueries=load_queries('data/Fold2/test.txt',64)
    for name,ranker in zip(['rankNet','rankNetMod'],[rankNet,rankNetMod]):
        r=[]
        for q in testQueries:
            rel=getRankedList(ranker,q)
            r.append(ndcg.run(rel,max_c=np.sum(rel)))
        print('mNDCG for '+name+": ")
        print(np.mean(r))
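
Example #1 relies on an NDCG helper class that is not shown. For reference, a minimal self-contained NDCG@k in the same spirit is sketched below; it assumes rel is the list of relevance labels in ranked order (as getRankedList appears to return) and is not the exact class used above.

import numpy as np


def dcg_at_k(rel, k):
    # Discounted cumulative gain over the top-k relevance labels.
    rel = np.asarray(rel, dtype=float)[:k]
    if rel.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, rel.size + 2))
    return float(np.sum((2 ** rel - 1) / discounts))


def ndcg_at_k(rel, k):
    # NDCG@k: DCG of the given ranking divided by the ideal DCG.
    ideal = dcg_at_k(sorted(rel, reverse=True), k)
    return dcg_at_k(rel, k) / ideal if ideal > 0 else 0.0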
Example #2
def experiment(experiment_type):
    print '- Running', experiment_type
    n_features = 64

    # Implements 5-Folds validation
    kfold_ndcg = []
    for i in xrange(1, 6):
        ranker = LambdaRankHW(n_features, type=experiment_type)
        n_epochs = 5

        def query_ndcg(q):
            scores = ranker.score(q).flatten()
            labels = q.get_labels()
            return ndcg(zip(*sorted(zip(labels, scores), key=itemgetter(1), reverse=True))[0])

        for j in xrange(1, 6):
            if i == j:
                continue

            queries = query.load_queries('HP2003/Fold%d/train.txt' % j, n_features)
            ranker.train_with_queries(queries, n_epochs)
        
        queries = query.load_queries('HP2003/Fold%d/train.txt' % i, n_features)
        kfold_ndcg.append(np.mean([query_ndcg(q) for q in queries]))
        print "- mNDCG: %.3f" % kfold_ndcg[-1]

    print "- Average mNDCG: %.3f" % np.average(kfold_ndcg)
Example #3
def run_exp():
    ### CHANGE ALGORITHM HERE ###
    ## Possible values: POINTWISE, PAIRWISE, LISTWISE
    ALGORITHM = PAIRWISE  # Change here for other algos
    FEATURES = 64
    EPOCHS = 5
    num_folds = 5
    NDCG_AVG = []
    folds = []
    for fold in range(1, num_folds + 1):
        # Ranker for this cross-validation.
        ranker = LambdaRankHW(FEATURES, algorithm=ALGORITHM)

        # Fold NDCG scores
        fold_scores = []

        for cross_fold in range(1, num_folds + 1):
            if fold == cross_fold:
                continue

            # Current fold training queries
            training_queries = query.load_queries(
                "HP2003/Fold%d/train.txt" % cross_fold, FEATURES)
            ranker.train_with_queries(training_queries, EPOCHS)

            fold_scores.append(
                np.mean(
                    [compute_query_NDCG(q, ranker) for q in training_queries]))

        # Load test queries
        test_queries = query.load_queries("HP2003/Fold%d/test.txt" % fold,
                                          FEATURES)

        # Compute and add NDCG on test set
        NDCG_AVG.append(
            np.mean([compute_query_NDCG(q, ranker) for q in test_queries]))

        # Also store NDCG scores on fold to plot them
        folds.append(fold_scores)

    # Save and compute average over all folds
    list_file_name = ALGORITHM + "_NDCGS.npy"
    average_file_name = ALGORITHM + "_average_NDCG.npy"
    all_fold_scores = ALGORITHM + "_allscores.npy"

    np.save(list_file_name, np.array(NDCG_AVG))
    np.save(all_fold_scores, np.array(folds))

    total_average = np.average(NDCG_AVG)
    np.save(average_file_name, total_average)
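
Examples #2 and #3 delegate per-query evaluation to helpers (query_ndcg, compute_query_NDCG) that score the query's documents with the ranker and compare the induced ordering against the labels. A hedged sketch of such a helper, reusing the score/get_labels pattern from Example #2 and an ndcg function over label lists:

from operator import itemgetter


def compute_query_NDCG(q, ranker):
    # Rank the query's documents by the ranker's scores and return the
    # NDCG of the resulting label ordering (sketch, mirroring Example #2).
    scores = ranker.score(q).flatten()
    labels = q.get_labels()
    ranked = sorted(zip(labels, scores), key=itemgetter(1), reverse=True)
    ranked_labels = [label for label, _ in ranked]
    return ndcg(ranked_labels)  # ndcg as used in Example #2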
Example #4
    def queryRanker(self):
        #Extract the high frequency queries from the training_queries
        HighFreqQueries = []
        training_queries = queryClass.load_queries(self.path_train, self.feature_count)
        test_queries = queryClass.load_queries(self.path_test, self.feature_count)
        #loop through all queries in the training set
        for index in training_queries.get_qids():
            highQuery = training_queries.get_query(index)
            #only keep the frequent queries 
            if(len(highQuery.__labels__) > self.minFreqCount):
                HighFreqQueries.append(highQuery)    
        print "found "+ str(len(HighFreqQueries)) + " high frequency queries"

        #build the query-ranker dictionary
        BestRanker = queryRankers()

        user_model = environment.CascadeUserModel(self.clickModel)
        evaluation2 = evaluation.NdcgEval()
        #test_queries = query.load_queries(sys.argv[2], feature_count)
        print "Read in training and testing queries"
        #for every query learn the best ranker and save it to the dictionary
        iter=0
        for highQuery in HighFreqQueries:
            ran=random.random()
            iter=iter+1
            if ran<self.threshold:
                print str(iter*100/len(HighFreqQueries))+"%"
                for i in xrange(self.rankersPerQuery):
                    learner = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
                    BestRanker.addInitRank(highQuery.get_qid(),learner.get_solution().w)
                    q = highQuery
                    for t in range(self.iterationCount):
                        l = learner.get_ranked_list(q)
                        c = user_model.get_clicks(l, q.get_labels())
                        s = learner.update_solution(c)
                        e = evaluation2.evaluate_all(s, test_queries)
                    
    
                    BestRanker.add(highQuery.get_qid(),learner.get_solution().w)
                    BestRanker.addList(highQuery.get_qid(),l)
                    BestRanker.addEval(highQuery.get_qid(),e)

        #save the dictionary to a file ('bestRanker.p')
        paths=self.path_train.split('/')
        name=paths[1]
        #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) )
        pickle.dump(BestRanker, open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "wb" ) )
        test = pickle.load( open( "QueryData/"+self.dataset+str(self.iterationCount)+".data", "rb" ) )
        print test.query_ranker.values()
Example #5
def perform():
	# init
	test_num_features = 64
	queries = query.load_queries('../../data/NP2004/Fold1/test.txt', 64)
	bi = comparison.BalancedInterleave()
	user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')	
	
	# make rankers
	rankers = []
	for i in range(0,5):
		rankers.append(ranker.ProbabilisticRankingFunction('3', 'random', 64, init=parseRanker('../../data/features64/ranker-0'+str(i)+'.txt'),sample='sample_unit_sphere'))

	# main loop
	for N in [100,1000,10000]:
		pref_matrix = [[0 for x in xrange(5)] for x in xrange(5)] 
		for iter in range(0,N):
			q = queries[random.choice(queries.keys())]
			for i in range(0,5):
				for j in range (0,5):
					if i!=j:
						list, context = bi.interleave(rankers[i], rankers[j], q, 10)
						clicks = user_model.get_clicks(list,q.get_labels())
						result = bi.infer_outcome(list,context,clicks,q)
						if result < 0:
							pref_matrix[i][j] += 1
					else:
						pref_matrix[i][j] = 0.50
		pref_matrix = generateProbabilityMatrix(pref_matrix,N)
		printMatrix(pref_matrix)
		print 'Best ranker is ' + '0' + str(getBestRanker(pref_matrix)) + ' (N = ' + str(N) + ').'
	print 'done!'
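
generateProbabilityMatrix and getBestRanker are not defined in Example #5. Assuming pref_matrix[i][j] counts how often ranker i beat ranker j over N interleaved comparisons, a plausible sketch of both helpers:

import numpy as np


def generateProbabilityMatrix(pref_matrix, N):
    # Turn raw win counts into win probabilities: off-diagonal cells are
    # divided by the number of comparisons N, the diagonal stays at 0.5.
    m = np.array(pref_matrix, dtype=float)
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            if i != j:
                m[i][j] /= N
    return m


def getBestRanker(prob_matrix):
    # Pick the ranker with the highest average win probability.
    return int(np.argmax(np.mean(prob_matrix, axis=1)))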
Example #6
    def Train(self):
        
        print "Loading Data"
        clusterData=pickle.load(open( self.clusterDataPath, "rb" ) )
        feature_count=len(clusterData.clusterToRanker[0][0])
        training_queries = queryClass.load_queries(self.testQueries, feature_count)
        ranker=pickle.load( open( self.rankerPath ) )
        
        """
        testWeights=str(clusterData.clusterToRanker[0][0])
        testWeights=testWeights.replace("[", "")
        testWeights=testWeights.replace("]", "")
        weights = np.array([float(num) for num in testWeights.split(",")])
        ranker_tie="random"
        ranker_args="3"
        sample_send="sample_unit_sphere"

        ranker=rankerClass.ProbabilisticRankingFunction(ranker_args,
                                                ranker_tie,
                                                feature_count,
                                                sample=sample_send,
                                                init=testWeights)
        """
        X=[]
        Y=[]
        max=100 #max number of docs in the ranking 

    #print clusterData.queryToCluster.keys()
    #print training_queries.keys()
        print "Loading training objects"
        for qid in clusterData.queryToCluster:
            query = training_queries.get_query(qid)
            ranker.init_ranking(query)
            docIds=ranker.get_ranking()
            iter=0
            for docId in docIds:
                if iter>max:
                    break
                features=query.get_feature_vector(docId)
                X.append(features)
                Y.append(clusterData.queryToCluster[qid][0])
                
                iter=iter+1
            
        #X = [[0, 0], [1, 1]]
        #y = [0, 1]
        X=np.array(X)
        Y=np.array(Y)
        print "Training"
        clf = svm.SVC()
        clf.fit(X, Y) 
       
        if not os.path.exists("Classifier"):
            os.makedirs("Classifier")

        paths=self.clusterDataPath.split('/')
        name=paths[len(paths)-1]
        parts=name.split('.')
        name=parts[0]
        pickle.dump(clf, open( "Classifier/"+name+".data", "wb" ) )
Example #7
def main(scale, schema, runs, warm, end):
    queries = load_queries()
    logger.info("Loaded %d queries", len(queries))
    db = "SNOWFLAKE_SAMPLE_DATA" if schema else "TCPH_SCHEMALESS"
    schema_name = f"TPCH_SF{scale}"
    schema_id = "SCHEMA" if schema else "SCHEMALESS"

    with conn_vw(name = f'TEMPORARY_{schema_id}_SF{scale}', size='MEDIUM') as conn:
        logger.info("Running on database %s", db)
        cur = conn.cursor()
        cur.execute(f"USE DATABASE {db};")

        
        logger.info("Running on schema %s", db)
        cur.execute(f"USE SCHEMA {schema_name};")
        
        logger.info("Disabling result set cache!")
        cur.execute("ALTER SESSION SET USE_CACHED_RESULT = FALSE;")
        cur.close()

        
        filename = f"results_{schema_id}_SF{scale}.csv"
        filepath = os.path.join("results", filename)
        logger.info("Writing results to %s", filepath)
        with open(filepath, "w") as f:
            header = ["Query"]+[f"Run {i+1}" for i in range(runs)] + ["Average", "Standard Deviation"]
            write_row(f, header)
            for q in queries[:end]:
                timings, avg, std = time_query(q, conn, runs=runs, warmups=warm)
                timings = list(timings)
                timings += [avg / 1000.0, std / 1000.0]
                timings = [f"{q.num}"] + [f"{x:.06f}" for x in timings]
                write_row(f, timings)
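
Example #7 depends on write_row and time_query helpers defined elsewhere. A minimal sketch under stated assumptions: write_row emits a comma-separated line, time_query measures each run with time.perf_counter, and the query object is assumed to expose its SQL text as q.sql (an assumption, not shown above).

import statistics
import time


def write_row(f, fields):
    # Write one comma-separated line to the open file handle (sketch).
    f.write(",".join(str(x) for x in fields) + "\n")


def time_query(q, conn, runs=3, warmups=1):
    # Execute the query a few times and return the per-run timings plus
    # their mean and standard deviation (sketch; units are an assumption).
    cur = conn.cursor()
    for _ in range(warmups):  # warm-up runs are not timed
        cur.execute(q.sql)
    timings = []
    for _ in range(runs):
        start = time.perf_counter()
        cur.execute(q.sql)
        timings.append((time.perf_counter() - start) * 1000.0)
    cur.close()
    return timings, statistics.mean(timings), statistics.pstdev(timings)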
Example #8
    def Save(self):

        print "Loading Data"

        training_queries = queryClass.load_queries(self.testQueries,
                                                   self.feature_count)
        ranker = pickle.load(open(self.rankerPath))

        max = 100  #max number of docs in the ranking

        #print clusterData.queryToCluster.keys()
        #print training_queries.keys()
        BestRanker = queryFeatures()
        print "Loading training objects"
        i = 0
        for query in training_queries:
            #print str(i*100/len(training_queries))+"%"
            i = i + 1
            #query = training_queries.get_query(qid)
            ranker.init_ranking(query)
            docIds = ranker.get_ranking()
            iter = 0
            for docId in docIds:
                if iter > max:
                    break
                iter = iter + 1
                features = query.get_feature_vector(docId)
                BestRanker.add(query.get_qid(), features)
                #print features
                #BestRanker.addFeaturesToQid([float(i) for i in features],query.get_qid())

        pickle.dump(BestRanker,
                    open("QueryData/" + self.dataset + ".data", "wb"))
Example #9
    def groupRanker(self):
        #Extract the high frequency queries from the training_queries
        clusterData=pickle.load(open( self.clusterDataPath, "rb" ) )
        queryData= self.queryData

        
        HighFreqQueries = []
        training_queries = queryClass.load_queries(self.path_train, self.feature_count)
        test_queries = queryClass.load_queries(self.path_test, self.feature_count)
        #loop through all queries in the training set
        

        #build the query-ranker dictionary
        BestRanker = queryRankers()

        user_model = environment.CascadeUserModel(self.clickModel)
        evaluation2 = evaluation.NdcgEval()
        #test_queries = query.load_queries(sys.argv[2], feature_count)
        print "Read in training and testing queries"
        #for every query learn the best ranker and save it to the dictionary
        iter=0
        learner=[0]*len(clusterData.clusterToRanker.keys())
        for cluster in clusterData.clusterToRanker:
            learner[cluster] = retrieval_system.ListwiseLearningSystem(self.feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')  
        for t in range(self.iterationCount):
            q = training_queries[random.choice(training_queries.keys())]
            temp=(float(np.sum(clusterData.queryToCluster[q.get_qid()])))/(float(len(clusterData.queryToCluster[q.get_qid()])))
            temp=int(temp+0.5)
            cluster=temp
            #cluster=clusterData.queryToCluster[q.get_qid()][0]
            
            iter=iter+1
            if iter%200==0:
                print str(iter*100/self.iterationCount)+"%"
            l = learner[cluster].get_ranked_list(q)
            c = user_model.get_clicks(l, q.get_labels())
            s = learner[cluster].update_solution(c)
            #e = evaluation2.evaluate_all(s, test_queries)
        for cluster in clusterData.clusterToRanker:
             clusterData.clusterToRanker[cluster]=[learner[cluster].get_solution().w.tolist()]
      
            
        #save the dictionary to a file ('bestRanker.p')
        paths=self.path_train.split('/')
        name=paths[1]
        #pickle.dump(BestRanker, open( "QueryData/"+name+".data", "wb" ) )
        pickle.dump(clusterData, open( "ClusterData/"+self.dataset+".data", "wb" ) )
Example #10
def compareSystems(vali_queries,classifierPath,basic_ranker_path,clust_data_path,data,click):
    
    print "-Loading Data-"
    clf = pickle.load( open( classifierPath ) )
    basic_ranker=pickle.load( open( basic_ranker_path ) )
    clusterData=pickle.load(open(clust_data_path))
    queryData=pickle.load(open(data))
    
    ranker_tie="random"
    feature_count=basic_ranker.feature_count
    ranker_args="3"
    arg_str=""
    sample_send="sample_unit_sphere"
    iterations=100
    
    rankers=[0]*2
    rankers[0]=basic_ranker
    
    
    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave=ProbabilisticInterleave(None)

    first_win=0
    print "-Calculating-"
    
    for i in range(iterations):
        if i%(iterations/10)==0:
            print str(float(i)*100/float(iterations))+"%"
        q = training_queries.get_query(random.choice(queryData.query_ranker.keys()))
        
        test=queryData.query_ranker[q.get_qid()][0]
        testWeights=str(test)
        testWeights=testWeights.replace("[", "")
        testWeights=testWeights.replace("]", "")
        weights = np.array([float(num) for num in testWeights.split(",")])
        print len(weights)
        ranker_tie="random"
        ranker_args="3"
        sample_send="sample_unit_sphere"

        rankers[1]=rankerClass.ProbabilisticRankingFunction(ranker_args,
                                                ranker_tie,
                                                feature_count,
                                                sample=sample_send,
                                                init=testWeights)
        
        
        l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
        c = user_model.get_clicks(l, q.get_labels())
        o = compar_interleave.infer_outcome(l, a, c, q)
        if(o<0):
            first_win+=1
        elif(o==0):
            coin=random.random()
            if(coin>0.5):
                first_win+=1
    result_com=float(first_win)/float(iterations)
    print "Basic ranker win rate:"+ str(result_com)
Example #11
    def OneFoldTest(self,folder,model,epochs):
        testFile=folder+"/test.txt"
        testQueries=load_queries(testFile,64)


        trainFile=folder+"/train.txt"
        trainQueries=load_queries(trainFile,64)


        model.train_with_queries(trainQueries,epochs)


        ndcgs=[]
        for q in testQueries:
            ndcgs.append(self.ndcg.run(getRankedList(model,q)))


        return (np.mean(ndcgs))
Example #12
def compareSystems(vali_queries, classifierPath, basic_ranker_path,
                   clust_data_path, click):

    print "-Loading Data-"
    clf = pickle.load(open(classifierPath))
    basic_ranker = pickle.load(open(basic_ranker_path))
    clusterData = pickle.load(open(clust_data_path))

    ranker_tie = "random"
    feature_count = basic_ranker.feature_count
    ranker_args = "3"
    arg_str = ""
    sample_send = "sample_unit_sphere"
    iterations = 100

    rankers = [0] * 2
    rankers[0] = basic_ranker

    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave = ProbabilisticInterleave(None)

    second_win = 0
    second_win_or_e = 0
    generic_win = 0
    equal = 0
    print "-Calculating-"
    for i in range(iterations):
        if i % (iterations / 10) == 0:
            print str(float(i) * 100 / float(iterations)) + "%"
        q = training_queries[random.choice(training_queries.keys())]
        rankers[1] = classifier.getRanker(clf, basic_ranker, q, clusterData)
        l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
        c = user_model.get_clicks(l, q.get_labels())
        o = compar_interleave.infer_outcome(l, a, c, q)
        if (o > 0):
            second_win += 1
            second_win_or_e += 1
        elif (o == 0):
            equal += 1
            coin = random.random()
            if (coin > 0.5):
                second_win_or_e += 1
        else:
            generic_win += 1

    result_com = float(second_win_or_e) / float(iterations)
    result_win = float(second_win) / float(iterations)
    result_win_generic = float(generic_win) / float(iterations)
    print "Our ranker win rate (with random choice if result was equal):" + str(
        result_com)
    print "Our ranker win rate:" + str(result_win)
    print "Generic ranker win rate:" + str(result_win_generic)
    print "Number win ours:" + str(second_win)
    print "Number win generic:" + str(generic_win)
    print "Number equal:" + str(equal)
    print "Total number iterations:" + str(iterations)
Example #13
def compareSystems(vali_queries,classifierPath,basic_ranker_path,clust_data_path,click):
    
    print "-Loading Data-"
    clf = pickle.load( open( classifierPath ) )
    basic_ranker=pickle.load( open( basic_ranker_path ) )
    clusterData=pickle.load(open(clust_data_path))
    
    ranker_tie="random"
    feature_count=basic_ranker.feature_count
    ranker_args="3"
    arg_str=""
    sample_send="sample_unit_sphere"
    iterations=100
    
    rankers=[0]*2
    rankers[0]=basic_ranker
    
    
    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave=ProbabilisticInterleave(None)

    second_win=0
    second_win_or_e=0
    generic_win=0
    equal = 0
    print "-Calculating-"
    for i in range(iterations):
        if i%(iterations/10)==0:
            print str(float(i)*100/float(iterations))+"%"
        q = training_queries[random.choice(training_queries.keys())]
        rankers[1]=classifier.getRanker(clf, basic_ranker,q,clusterData)
        l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
        c = user_model.get_clicks(l, q.get_labels())
        o = compar_interleave.infer_outcome(l, a, c, q)
        if(o>0):
            second_win+=1
            second_win_or_e+=1
        elif(o==0):
            equal += 1
            coin=random.random()
            if(coin>0.5):
                second_win_or_e+=1
        else:
            generic_win+=1

    result_com=float(second_win_or_e)/float(iterations)
    result_win=float(second_win)/float(iterations)
    result_win_generic=float(generic_win)/float(iterations)
    print "Our ranker win rate (with random choice if result was equal):"+ str(result_com)
    print "Our ranker win rate:"+ str(result_win)
    print "Generic ranker win rate:"+ str(result_win_generic)
    print "Number win ours:" + str(second_win)
    print "Number win generic:" + str(generic_win)
    print "Number equal:" + str(equal)
    print "Total number iterations:" + str(iterations)
Example #14
    def run(self,mainFolder,epochs):
        folders = get_immediate_subdirectories(mainFolder)
        #folders = random.sample(get_immediate_subdirectories(mainFolder),2)
        ndcgs=[[] for i in range(len(self.models))]
        elapsed=np.zeros(len(self.models)) # for timing

        for i,folder in enumerate(folders):
            print("fold "+str(i+1))

            trainFile=mainFolder+folder+"/train.txt"
            train_queries=load_queries(trainFile,64)
            testFile=mainFolder+folder+"/test.txt"
            testQueries=load_queries(testFile,64)

            for i,model in enumerate(self.models):
                now=timer()
                model.train_with_queries(train_queries,epochs)
                elapsed[i]+=timer()-now
                # evaluation
                ndcgs[i]+=self.__evalaute(testQueries,model)
        return ([np.mean(n) for n in ndcgs],elapsed)
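
The per-fold evaluation in Example #14 goes through a private helper (spelled __evalaute in the call above) that is not shown. Judging from Example #11, a minimal sketch would compute one NDCG value per test query:

    def __evalaute(self, testQueries, model):
        # One NDCG value per test query (sketch, mirroring Example #11;
        # self.ndcg and getRankedList as used elsewhere in these examples).
        return [self.ndcg.run(getRankedList(model, q)) for q in testQueries]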
Example #15
 def __init__(self, Path, feature_count, queriesPath):
     self.Path = Path
     self.queriesPath = queriesPath
     self.feature_count = feature_count
     self.listOrEucl = int(raw_input("Euclidean(0) or list(1) distance (answer with 0 or 1)?"))
     if(self.listOrEucl):
           print 'using list distance'
     else:
           print 'using euclidean distance'
     print "Reading in queries"
     self.training_queries = queryClass.load_queries(queriesPath, feature_count)
     print "Calculating distance"
     self.calculate()
Example #16
def experiment(n_epochs, measure_type, num_features, num_folds):

    best_ranker = None
    best_val_score = 0
    store_res = []
    for fold in range(1, num_folds + 1):

        # Load queries from the corresponding fold
        print('Loading train queries')
        train_queries = query.load_queries(
            os.path.normpath('./HP2003/Fold%d/train.txt' % fold), num_features)

        print('Loading val queries')
        val_queries = query.load_queries(
            os.path.normpath('./HP2003/Fold%d/vali.txt' % fold), num_features)

        # Creates the S matrix (as described in paper) using scipy.sparse
        print('Creating the S Matrix')
        S = create_S_matrix({**train_queries, **val_queries})

        # Creates a new ranker
        ranker = LambdaRankHW(num_features, measure_type)

        # Stores the statistics for each epoch
        res = ranker.train_with_queries(train_queries, n_epochs, val_queries,
                                        S)
        final_val_score = res[-1][
            'val_mndcg']  # validation mNDCG after all epochs

        # keep a running maximum of val mNDCG (also the best ranker)
        if final_val_score > best_val_score:
            best_ranker = ranker
            best_val_score = final_val_score

        # Stores the results for the current fold
        store_res.append(res)

    # Return the ranker with the best validation mNDCG alongside the per-fold results
    return store_res, best_ranker
Example #17
 def __init__(self, Path, feature_count, queriesPath):
     self.Path = Path
     self.queriesPath = queriesPath
     self.feature_count = feature_count
     self.listOrEucl = int(
         raw_input(
             "Euclidean(0) or list(1) distance (answer with 0 or 1)?"))
     if (self.listOrEucl):
         print 'using list distance'
     else:
         print 'using euclidean distance'
     print "Reading in queries"
     self.training_queries = queryClass.load_queries(
         queriesPath, feature_count)
     print "Calculating distance"
     self.calculate()
Example #18
def perform():
    # init
    test_num_features = 64
    queries = query.load_queries('../../data/NP2004/Fold1/test.txt', 64)
    bi = comparison.BalancedInterleave()
    user_model = environment.CascadeUserModel(
        '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')

    # make rankers
    rankers = []
    for i in range(0, 5):
        rankers.append(
            ranker.ProbabilisticRankingFunction(
                '3',
                'random',
                64,
                init=parseRanker('../../data/features64/ranker-0' + str(i) +
                                 '.txt'),
                sample='sample_unit_sphere'))

    # main loop
    for N in [100, 1000, 10000]:
        pref_matrix = [[0 for x in xrange(5)] for x in xrange(5)]
        for iter in range(0, N):
            q = queries[random.choice(queries.keys())]
            for i in range(0, 5):
                for j in range(0, 5):
                    if i != j:
                        list, context = bi.interleave(rankers[i], rankers[j],
                                                      q, 10)
                        clicks = user_model.get_clicks(list, q.get_labels())
                        result = bi.infer_outcome(list, context, clicks, q)
                        if result < 0:
                            pref_matrix[i][j] += 1
                    else:
                        pref_matrix[i][j] = 0.50
        pref_matrix = generateProbabilityMatrix(pref_matrix, N)
        printMatrix(pref_matrix)
        print 'Best ranker is ' + '0' + str(
            getBestRanker(pref_matrix)) + ' (N = ' + str(N) + ').'
    print 'done!'
Example #19
reps = 5
nrqueries = 500

#alphas = [0.75, 1.0, 1.25, 1.5]
#deltas = [1.75, 2.0, 2.25]
#reps = 5
#nrqueries = 500

factor_k1 = 13.3
factor_k3 = .1

user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluator = evaluation.NdcgEval()

training_queries = query.load_queries(sys.argv[1], 64)
test_queries = query.load_queries(sys.argv[2], 64)


def run(alpha, delta):
    results = []
    for _ in range(reps):
        #learner = retrieval_system.ListwiseLearningSystem(64, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 ranker.model.BM25 -d %.2f -a %.2f' % (delta, alpha))
        learner = retrieval_system.ListwiseLearningSystemWithCandidateSelection(
            64,
            '--num_repetitions 10 --num_candidates 6 --history_length 10 --select_candidate select_candidate_repeated -w random -c comparison.ProbabilisticInterleave --ranker ranker.ProbabilisticRankingFunction --ranker_args 3 ranker.model.BM25 -d %s -a %s --anneal 50'
            % (delta, alpha))
        for _ in range(nrqueries):
            q = training_queries[random.choice(training_queries.keys())]
            l = learner.get_ranked_list(q)
            c = user_model.get_clicks(l, q.get_labels())
Example #20
metrics = []
scores = {}
for metric in "evaluation.NdcgEval", "evaluation.LetorNdcgEval":
    eval_class = get_class(metric)
    eval_metric = eval_class()
    metrics.append(eval_metric)
    scores[eval_metric.__class__.__name__] = {}
    for cutoff in cutoffs:
        scores[eval_metric.__class__.__name__][cutoff] = []

# load all queries
test_queries = {}
for fold in range(1, 6):
    test_file = "".join((args.test_dir, str(fold)))
    test_file = os.path.join(test_file, args.test_file)
    qs = load_queries(test_file, args.feature_count)
    test_queries[fold] = qs

# process all experiments for all metrics
count_experiments = 0
for experiment in args.experiment_dirs:
    print "%% %s" % experiment
    count_runs = 0
    count_experiments += 1
    # process all folds and run files
    for fold_id in sorted(os.listdir(experiment)):
        fold = os.path.join(experiment, fold_id)
        fold_id = int(fold_id)
        if not os.path.isdir(fold):
            continue
        for filename in sorted(os.listdir(fold)):
Example #21
metrics = []
scores = {}
for metric in  "evaluation.NdcgEval", "evaluation.LetorNdcgEval":
    eval_class = get_class(metric)
    eval_metric = eval_class()
    metrics.append(eval_metric)
    scores[eval_metric.__class__.__name__] = {}
    for cutoff in cutoffs:
        scores[eval_metric.__class__.__name__][cutoff] = []

# load all queries
test_queries = {}
for fold in range(1, 6):
    test_file = "".join((args.test_dir, str(fold)))
    test_file = os.path.join(test_file, args.test_file)
    qs = load_queries(test_file, args.feature_count)
    test_queries[fold] = qs

# process all experiments for all metrics
count_experiments = 0
for experiment in args.experiment_dirs:
    print "%% %s" % experiment
    count_runs = 0
    count_experiments += 1
    # process all folds and run files
    for fold_id in sorted(os.listdir(experiment)):
        fold = os.path.join(experiment, fold_id)
        fold_id = int(fold_id)
        if not os.path.isdir(fold):
            continue
        for filename in sorted(os.listdir(fold)):
Example #22
import sys, random
try:
    import include, pickle
except:
    pass
import retrieval_system, environment, evaluation, query


learner = retrieval_system.ListwiseLearningSystem(64, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()
training_queries = query.load_queries(sys.argv[1], 64)
test_queries = query.load_queries(sys.argv[2], 64)
i=0
for i in range(10):
    q = training_queries[random.choice(training_queries.keys())]
    l = learner.get_ranked_list(q)
    c = user_model.get_clicks(l, q.get_labels())
    s = learner.update_solution(c)
    print i
    i=i+1
    print evaluation.evaluate_all(s, test_queries)

pickle.dump(learner.ranker, open( "QueryData/"+"generalRanker"+".data", "wb" ) )

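The ranker pickled at the end of Example #22 can be read back the same way Examples #6 and #8 load theirs; a minimal usage sketch:

import pickle

# Reload the general ranker trained above (sketch).
generalRanker = pickle.load(open("QueryData/generalRanker.data", "rb"))
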
Example #23
    def __init__(self, args_str=None):
        # parse arguments
        parser = argparse.ArgumentParser(description="""
            Construct and run a learning experiment. Provide either the name
            of a config file from which the experiment configuration is
            read, or provide all arguments listed under Command line. If
            both are provided the  config file is ignored.""",
                                         prog=self.__class__.__name__)

        # option 1: use a config file
        file_group = parser.add_argument_group("FILE")
        file_group.add_argument("-f",
                                "--file",
                                help="Filename of the config "
                                "file from which the experiment details"
                                " should be read.")

        # option 2: specify all experiment details as arguments
        detail_group = parser.add_argument_group("DETAILS")
        detail_group.add_argument(
            "-i",
            "--training_queries",
            help="File from which to load the training queries (svmlight "
            "format).")
        detail_group.add_argument(
            "-j",
            "--test_queries",
            help="File from which to load the test queries (svmlight format).")
        detail_group.add_argument(
            "-c",
            "--feature_count",
            type=int,
            help="The number of features included in the data.")
        detail_group.add_argument(
            "-r",
            "--num_runs",
            type=int,
            help="Number of runs (how many times to repeat the experiment).")
        detail_group.add_argument("-q",
                                  "--num_queries",
                                  type=int,
                                  help="Number of queries in each run.")
        detail_group.add_argument("-u",
                                  "--user_model",
                                  help="Class implementing a user model.")
        detail_group.add_argument(
            "-v",
            "--user_model_args",
            help="Arguments for initializing the user model.")
        # the retrieval system maintains ranking functions, accepts queries and
        # generates result lists, and in return receives user clicks to learn
        # from
        detail_group.add_argument(
            "-s",
            "--system",
            help="Which system to use (e.g., pairwise, listwise).")
        detail_group.add_argument("-a",
                                  "--system_args",
                                  help="Arguments for "
                                  "the system (comparison method, learning "
                                  "algorithm and parameters...).")
        detail_group.add_argument(
            "-o",
            "--output_dir",
            help="(Empty) directory for storing output generated by this"
            " experiment. Subdirectory for different folds will be generated"
            "automatically.")
        detail_group.add_argument("--output_dir_overwrite", default="False")
        detail_group.add_argument(
            "-p",
            "--output_prefix",
            help="Prefix to be added to output filenames, e.g., the name of "
            "the data set, fold, etc. Output files will be stored as "
            "OUTPUT_DIR/PREFIX-RUN_ID.txt.gz")
        detail_group.add_argument("-e",
                                  "--experimenter",
                                  help="Experimenter type.")
        # run the parser
        if args_str:
            args = parser.parse_known_args(args_str.split())[0]
        else:
            args = parser.parse_known_args()[0]

        # determine whether to use config file or detailed args
        self.experiment_args = None
        if args.file:
            config_file = open(args.file)
            self.experiment_args = yaml.load(config_file)
            config_file.close()
            # overwrite with command-line options if given
            for arg, value in vars(args).items():
                if value:
                    self.experiment_args[arg] = value
        else:
            self.experiment_args = vars(args)

        # workaround - check if we have all the arguments needed
        if not ("training_queries" in self.experiment_args
                and "test_queries" in self.experiment_args and "feature_count"
                in self.experiment_args and "num_runs" in self.experiment_args
                and "num_queries" in self.experiment_args
                and "user_model" in self.experiment_args and "user_model_args"
                in self.experiment_args and "system" in self.experiment_args
                and "system_args" in self.experiment_args
                and "output_dir" in self.experiment_args):
            parser.print_help()
            sys.exit("Missing required arguments, please check the program"
                     " arguments or configuration file. %s" %
                     self.experiment_args)

        # set default values for optional arguments
        if not "query_sampling_method" in self.experiment_args:
            self.experiment_args["query_sampling_method"] = "random"
        if not "output_dir_overwrite" in self.experiment_args:
            self.experiment_args["output_dir_overwrite"] = False
        if not "experimenter" in self.experiment_args:
            self.experiment_args[
                "experimenter"] = "experiment.LearningExperiment"
        if not "evaluation" in self.experiment_args:
            self.experiment_args["evaluation"] = "evaluation.NdcgEval"
        if not "processes" in self.experiment_args:
            self.experiment_args["processes"] = 0

        # locate or create directory for the current fold
        if not os.path.exists(self.experiment_args["output_dir"]):
            os.makedirs(self.experiment_args["output_dir"])
        elif not(self.experiment_args["output_dir_overwrite"]) and \
                            os.listdir(self.experiment_args["output_dir"]):
            # make sure the output directory is empty
            raise Exception(
                "Output dir %s is not an empty directory. "
                "Please use a different directory, or move contents out "
                "of the way." % self.experiment_args["output_dir"])

        logging.basicConfig(format='%(asctime)s %(module)s: %(message)s',
                            level=logging.INFO)

        logging.info("Arguments: %s" % self.experiment_args)
        for k, v in sorted(self.experiment_args.iteritems()):
            logging.info("\t%s: %s" % (k, v))
        config_bk = os.path.join(self.experiment_args["output_dir"],
                                 "config_bk.yml")
        logging.info("Backing up configuration to: %s" % config_bk)
        config_bk_file = open(config_bk, "w")
        yaml.dump(self.experiment_args,
                  config_bk_file,
                  default_flow_style=False)
        config_bk_file.close()

        # load training and test queries
        training_file = self.experiment_args["training_queries"]
        test_file = self.experiment_args["test_queries"]
        self.feature_count = self.experiment_args["feature_count"]
        logging.info("Loading training data: %s " % training_file)
        self.training_queries = load_queries(training_file, self.feature_count)
        logging.info("... found %d queries." %
                     self.training_queries.get_size())
        logging.info("Loading test data: %s " % test_file)
        self.test_queries = load_queries(test_file, self.feature_count)
        logging.info("... found %d queries." % self.test_queries.get_size())

        # initialize and run the experiment num_run times
        self.num_runs = self.experiment_args["num_runs"]
        self.output_dir = self.experiment_args["output_dir"]
        self.output_prefix = self.experiment_args["output_prefix"]
        self.experimenter = get_class(self.experiment_args["experimenter"])
Example #24
    def __init__(self, args_str=None):
        # parse arguments
        parser = argparse.ArgumentParser(description="""
            Construct and run a learning experiment. Provide either the name
            of a config file from which the experiment configuration is
            read, or provide all arguments listed under Command line. If
            both are provided the  config file is ignored.""",
            prog=self.__class__.__name__)

        # option 1: use a config file
        file_group = parser.add_argument_group("FILE")
        file_group.add_argument("-f", "--file", help="Filename of the config "
                                "file from which the experiment details"
                                " should be read.")

        # option 2: specify all experiment details as arguments
        detail_group = parser.add_argument_group("DETAILS")
        detail_group.add_argument("-i", "--training_queries",
            help="File from which to load the training queries (svmlight "
            "format).")
        detail_group.add_argument("-j", "--test_queries",
            help="File from which to load the test queries (svmlight format).")
        detail_group.add_argument("-c", "--feature_count", type=int,
            help="The number of features included in the data.")
        detail_group.add_argument("-r", "--num_runs", type=int,
            help="Number of runs (how many times to repeat the experiment).")
        detail_group.add_argument("-q", "--num_queries", type=int,
            help="Number of queries in each run.")
        detail_group.add_argument("-u", "--user_model",
            help="Class implementing a user model.")
        detail_group.add_argument("-v", "--user_model_args",
            help="Arguments for initializing the user model.")
        # the retrieval system maintains ranking functions, accepts queries and
        # generates result lists, and in return receives user clicks to learn
        # from
        detail_group.add_argument("-s", "--system",
            help="Which system to use (e.g., pairwise, listwise).")
        detail_group.add_argument("-a", "--system_args", help="Arguments for "
                                  "the system (comparison method, learning "
                                  "algorithm and parameters...).")
        detail_group.add_argument("-o", "--output_dir",
            help="(Empty) directory for storing output generated by this"
            " experiment. Subdirectory for different folds will be generated"
            "automatically.")
        detail_group.add_argument("--output_dir_overwrite", default="False")
        detail_group.add_argument("-p", "--output_prefix",
            help="Prefix to be added to output filenames, e.g., the name of "
            "the data set, fold, etc. Output files will be stored as "
            "OUTPUT_DIR/PREFIX-RUN_ID.txt.gz")
        detail_group.add_argument("-e", "--experimenter",
            help="Experimenter type.")
        # run the parser
        if args_str:
            args = parser.parse_known_args(args_str.split())[0]
        else:
            args = parser.parse_known_args()[0]

        # determine whether to use config file or detailed args
        self.experiment_args = None
        if args.file:
            config_file = open(args.file)
            self.experiment_args = yaml.load(config_file)
            config_file.close()
            # overwrite with command-line options if given
            for arg, value in vars(args).items():
                if value:
                    self.experiment_args[arg] = value
        else:
            self.experiment_args = vars(args)

        # workaround - check if we have all the arguments needed
        if not ("training_queries" in self.experiment_args and
                "test_queries" in self.experiment_args and
                "feature_count" in self.experiment_args and
                "num_runs" in self.experiment_args and
                "num_queries" in self.experiment_args and
                "user_model" in self.experiment_args and
                "user_model_args" in self.experiment_args and
                "system" in self.experiment_args and
                "system_args" in self.experiment_args and
                "output_dir" in self.experiment_args):
            parser.print_help()
            sys.exit("Missing required arguments, please check the program"
                     " arguments or configuration file. %s" %
                     self.experiment_args)

        # set default values for optional arguments
        if not "query_sampling_method" in self.experiment_args:
            self.experiment_args["query_sampling_method"] = "random"
        if not "output_dir_overwrite" in self.experiment_args:
            self.experiment_args["output_dir_overwrite"] = False
        if not "experimenter" in self.experiment_args:
            self.experiment_args["experimenter"] = "experiment.LearningExperiment"
        if not "evaluation" in self.experiment_args:
            self.experiment_args["evaluation"] = "evaluation.NdcgEval"
        if not "processes" in self.experiment_args:
            self.experiment_args["processes"] = 0

        # locate or create directory for the current fold
        if not os.path.exists(self.experiment_args["output_dir"]):
            os.makedirs(self.experiment_args["output_dir"])
        elif not(self.experiment_args["output_dir_overwrite"]) and \
                            os.listdir(self.experiment_args["output_dir"]):
            # make sure the output directory is empty
            raise Exception("Output dir %s is not an empty directory. "
            "Please use a different directory, or move contents out "
            "of the way." %
             self.experiment_args["output_dir"])

        logging.basicConfig(format='%(asctime)s %(module)s: %(message)s',
                        level=logging.INFO)

        logging.info("Arguments: %s" % self.experiment_args)
        for k, v in sorted(self.experiment_args.iteritems()):
            logging.info("\t%s: %s" % (k, v))
        config_bk = os.path.join(self.experiment_args["output_dir"],
                                 "config_bk.yml")
        logging.info("Backing up configuration to: %s" % config_bk)
        config_bk_file = open(config_bk, "w")
        yaml.dump(self.experiment_args,
                  config_bk_file,
                  default_flow_style=False)
        config_bk_file.close()

        # load training and test queries
        training_file = self.experiment_args["training_queries"]
        test_file = self.experiment_args["test_queries"]
        self.feature_count = self.experiment_args["feature_count"]
        logging.info("Loading training data: %s " % training_file)
        self.training_queries = load_queries(training_file, self.feature_count)
        logging.info("... found %d queries." %
            self.training_queries.get_size())
        logging.info("Loading test data: %s " % test_file)
        self.test_queries = load_queries(test_file, self.feature_count)
        logging.info("... found %d queries." % self.test_queries.get_size())

        # initialize and run the experiment num_run times
        self.num_runs = self.experiment_args["num_runs"]
        self.output_dir = self.experiment_args["output_dir"]
        self.output_prefix = self.experiment_args["output_prefix"]
        self.experimenter = get_class(self.experiment_args["experimenter"])
Example #25
import sys
import include
import evaluation, query, ranker
import numpy as np
from ranker.AbstractRankingFunction import AbstractRankingFunction

evaluation = evaluation.NdcgEval()
bm25ranker = AbstractRankingFunction(["ranker.model.BM25"], 'first', 3, sample="utils.sample_fixed")
queries = query.load_queries(sys.argv[1], 64)
#print evaluation.evaluate_all(bm25ranker, queries)

fh = open(sys.argv[1] + ".out.missing-b0.45.txt", "w")

for k1 in sorted([2.6, 2.5]):
    for b in sorted([0.45]):
#for k1 in np.arange(19.5, 100, 0.5):
#     for b in np.arange(-1, 1.2, 0.1):
         #for k3 in np.arange(100*itt, 100*(itt+1), 10):
         k3 = 0.0
         bm25ranker.update_weights(np.array([k1,k3,b]))
         print >> fh, "k1:%f k3:%f b:%f score:%f" % (k1, k3, b, evaluation.evaluate_all(bm25ranker, queries))
fh.close()
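
Example #25 writes one "k1:... k3:... b:... score:..." line per parameter setting. A small hedged sketch for reading such an output file back and picking the best-scoring setting (the line format is taken from the print statement above):

def best_bm25_setting(path):
    # Parse lines like 'k1:2.6 k3:0.0 b:0.45 score:0.71' and return the
    # parameter dict with the highest score (sketch).
    best = None
    with open(path) as fh:
        for line in fh:
            if not line.strip():
                continue
            fields = {k: float(v)
                      for k, v in (part.split(":") for part in line.split())}
            if best is None or fields["score"] > best["score"]:
                best = fields
    return best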
Example #26
	def __init__(self, data_path, features):
		self.bi = comparison.BalancedInterleave()
		self.user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')	
		self.queries = query.load_queries(data_path, features)
Example #27
for m in range(3):
    print('Method ' + str(m))
    if not os.path.exists('results/method' + str(m)):
        os.makedirs('results/method' + str(m))

    test_scores = []
    for i in range(folds):
        fold = i + 1
        print('Fold ' + str(fold))
        print('Creating model')
        model = LambdaRankHW(
            64, 2,
            'results/' + 'method' + str(m) + '/results' + str(fold) + '.csv')
        print('Model created')
        print('Loading queries')
        train_queries = query.load_queries(
            'HP2003/Fold' + str(fold) + '/train.txt', 64)
        val_queries = query.load_queries(
            'HP2003/Fold' + str(fold) + '/vali.txt', 64)
        test_queries = query.load_queries(
            'HP2003/Fold' + str(fold) + '/test.txt', 64)
        print('Loaded\n')

        test_scores.append(
            model.train_with_queries(train_queries,
                                     val_queries,
                                     test_queries,
                                     epochs=15))
        print('\n')
    method_scores.append(np.mean(test_scores))
print('Final scores: ')
print(method_scores)
Example #28
# Experiment 1
# Find the optimal REM configuration (values of k and d)

import sys, random
import include
import environment, evaluation, query, retrieval_system
import time
import datetime

# init data, query_samples, d's
train_queries = query.load_queries('../../DATA/NP2004/Fold1/train.txt', 64)
test_queries = query.load_queries('../../DATA/NP2004/Fold1/test.txt', 64)
query_samples = 5000 # how many queries we sample

d_array = [2,3,4,5,6]
k_array = [1,2,5,10]

# init user model, evaluation methods
user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()

# calculate using the lower dimensional slice(s) in d_array
rem_ndcg_result = [[0 for i in range(len(d_array))] for j in range(len(k_array))]

for n in range(0, len(k_array)):
    # number of evaluation for each k
    # make sure the number of total evaluation are equal for all k
    number_of_evaluation = query_samples / k_array[n]
    k = k_array[n]

    for idx in range(0,len(d_array)):
Example #29
os.chdir("..")
os.chdir("..")
os.chdir("..")
#feature_count=136
rankerDict = queryRankers()
#feature_count=245
feature_count = 64
learner = retrieval_system.ListwiseLearningSystem(
    feature_count,
    '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01'
)
user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()
training_queries = query.load_queries(sys.argv[1], feature_count)
query_freq = {}
for train in training_queries:

    if (len(train.__labels__) in query_freq):
        query_freq[len(
            train.__labels__)] = query_freq[len(train.__labels__)] + 1
    else:
        query_freq[len(train.__labels__)] = 1
print query_freq
test_queries = query.load_queries(sys.argv[2], feature_count)
for i in range(20):
    q = training_queries[random.choice(training_queries.keys())]
    l = learner.get_ranked_list(q)
    c = user_model.get_clicks(l, q.get_labels())
    s = learner.update_solution(c)
Example #30
import sys
import include
import evaluation, query, ranker
import numpy as np
from ranker.AbstractRankingFunction import AbstractRankingFunction

evaluation = evaluation.NdcgEval()
bm25ranker = AbstractRankingFunction(["ranker.model.BM25"],
                                     'first',
                                     3,
                                     sample="utils.sample_fixed")
queries = query.load_queries(sys.argv[1], 64)
#print evaluation.evaluate_all(bm25ranker, queries)

fh = open(sys.argv[1] + ".out.missing-b0.45.txt", "w")

for k1 in sorted([2.6, 2.5]):
    for b in sorted([0.45]):
        #for k1 in np.arange(19.5, 100, 0.5):
        #     for b in np.arange(-1, 1.2, 0.1):
        #for k3 in np.arange(100*itt, 100*(itt+1), 10):
        k3 = 0.0
        bm25ranker.update_weights(np.array([k1, k3, b]))
        print >> fh, "k1:%f k3:%f b:%f score:%f" % (
            k1, k3, b, evaluation.evaluate_all(bm25ranker, queries))
fh.close()
Example #31
import sys, random
import include
import environment, evaluation, query, retrieval_system

# init data, query_samples, d's
queries = query.load_queries('../../data/NP2004/Fold1/test.txt', 64)
query_samples = 10 # how many queries we sample
d_array = [3,4,5,6]

# init user model, evaluation methods
user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()

# calculate using the full 64 dimensions
full_learner = retrieval_system.ListwiseLearningSystem(64,'-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
full_ndcg_result = []
for i in range(0,query_samples):
    q = queries[random.choice(queries.keys())]
    l = full_learner.get_ranked_list(q)
    c = user_model.get_clicks(l, q.get_labels())
    s = full_learner.update_solution(c)
    full_ndcg_result.append( evaluation.evaluate_all(s, queries) )
full_ranker = full_learner.get_solution()

# calculate using the lower dimensional slice(s) in d_array
rem_ndcg_result = [[0 for i in range(len(d_array))] for j in range(query_samples)]
rem_ranker = []
for idx in range(0,len(d_array)):
    d = d_array[idx]
    rem_learner = retrieval_system.ListwiseLearningSystemREMBO(64,d,'-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunctionREMBO -s 3 -d 0.1 -a 0.01')
    for i in range(0,query_samples):
Example #32
#!/usr/bin/python3

import json, os.path, requests, sys

import config
token = config.load_token('/vagrant/token/twitter.json')

# load queries
import query
from query import i, q, count
queries = query.load_queries('/vagrant/data/twitter_search_queries.json')
query.init_queries_output_file(queries, '/vagrant/data')

# execute each query
for query in queries:
    path = '/vagrant/data/' + query[i] + '.json'

    # load previously saved tweets from the same query
    with open(path, 'r+') as tweets_file:
        tweets = json.load(tweets_file)

    # request
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    params = {
        'q': query[q],
        'result_type': 'recent',
        'count': count,
        'since_id': tweets['since_id']
    }
    headers = {'Authorization': 'Bearer ' + token}
    r = requests.get(url, params=params, headers=headers)
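
Unlike the ranking examples, Example #32 (and the similar Example #33 below) uses a query module whose load_queries reads search queries from JSON. A hedged sketch of what that loader and the imported i/q/count names might look like; the field names, the count value, and the seeded file layout are assumptions, only since_id is taken from the usage above.

import json
import os.path

# Field names / constants as imported from the query module above
# (the concrete values here are assumptions).
i, q, count = "id", "query", 100


def load_queries(path):
    # Load the list of query records from a JSON file (sketch).
    with open(path) as f:
        return json.load(f)


def init_queries_output_file(queries, out_dir):
    # Create one '<id>.json' file per query, seeded with a zero since_id,
    # so later runs can resume from it (sketch).
    for record in queries:
        out_path = os.path.join(out_dir, record[i] + ".json")
        if not os.path.exists(out_path):
            with open(out_path, "w") as f:
                json.dump({"since_id": 0}, f)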
Example #33
import query

from query import i, q, count

# path
access_token_path = './credentials/access_token.json'

# load access token
access_token = auth.load_access_token(access_token_path)

# paths
queries_dir = './tweets'
queries_file = 'queries.json'

# load queries
queries = query.load_queries(queries_dir + '/' + queries_file)
query.init_queries_output_file(queries, queries_dir)

# execute each query
for query in queries:
    path = queries_dir + '/' + query[i] + '.json'

    # load previously saved tweets from the same query
    with open(path, 'r+') as tweets_file:
        tweets = json.load(tweets_file)

    # request
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    params = {
        'q': query[q],
        'result_type': 'recent',
Example #34
# Experiment 2
# Compare REM with original DBGD

import sys, random
import include
import environment, evaluation, query, retrieval_system
import time
import datetime
import numpy as np

# init data, query_samples, d's
train_queries = query.load_queries('../../DATA/NP2004/Fold1/train.txt', 64)
test_queries = query.load_queries('../../DATA/NP2004/Fold1/test.txt', 64)
query_samples = 5000  # how many queries we sample

d = 3
k = 10
number_of_evaluation = query_samples / k

# init user model, evaluation methods
user_model = environment.CascadeUserModel(
    '--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()

rem_ndcg_evaluation_train = []
full_ndcg_evaluation_train = []
rem_ndcg_evaluation_test = []
full_ndcg_evaluation_test = []

for m in range(0, k):
    # for each k, we have different A matrix
Example #35
        # Stores the results for the current fold
        store_res.append(res)

    # Return the ranker with the best validation mNDCG alongside the per-fold results
    return store_res, best_ranker
    #test_queries = query.load_queries(os.path.normpath('./HP2003/Fold%d/test.txt' % fold), num_features)


## Run
if __name__ == '__main__':
    print('Run Experiment.ipynb to see the plots and statistical tests')
    n_epochs = 200
    measure_type = POINTWISE
    num_features = 64
    num_folds = 5

    results, best_ranker = experiment(n_epochs, measure_type, num_features,
                                      num_folds)

    print('Loading all test queries')
    test_queries = {}
    for fold in range(1, 6):
        test_queries = {
            **test_queries,
            **query.load_queries(
                os.path.normpath('./HP2003/Fold%d/test.txt' % fold), num_features)
        }

    print('Running test mndcg')
    test_results = report_test_ndcg(best_ranker, test_queries)
    print('Test mNDCG: %s' %
          (sum(test_results.values()) / len(test_queries)))
Example #36
import sys, random
import retrieval_system, environment, evaluation, query
import os
from queryRankers import *
import pickle

os.chdir("..")
os.chdir("..")
os.chdir("..")
#feature_count=136
rankerDict=queryRankers()
#feature_count=245
feature_count=64
learner = retrieval_system.ListwiseLearningSystem(feature_count, '-w random -c comparison.ProbabilisticInterleave -r ranker.ProbabilisticRankingFunction -s 3 -d 0.1 -a 0.01')
user_model = environment.CascadeUserModel('--p_click 0:0.0,1:1 --p_stop 0:0.0,1:0.0')
evaluation = evaluation.NdcgEval()
training_queries = query.load_queries(sys.argv[1], feature_count)
query_freq={}
for train in training_queries:
    
    if(len(train.__labels__) in query_freq):
        query_freq[len(train.__labels__)]=query_freq[len(train.__labels__)]+1
    else:
        query_freq[len(train.__labels__)]=1    
print query_freq                                 
test_queries = query.load_queries(sys.argv[2], feature_count)
for i in range(20):
    q = training_queries[random.choice(training_queries.keys())]
    l = learner.get_ranked_list(q)
    c = user_model.get_clicks(l, q.get_labels())
    s = learner.update_solution(c)
    print evaluation.evaluate_all(s, test_queries)
Example #37
def compareSystemsHist(vali_queries,classifierPath,basic_ranker_path,clust_data_path,data,click):
    
    print "-Loading Data-"
    clf = pickle.load( open( classifierPath ) )
    basic_ranker=pickle.load( open( basic_ranker_path ) )
    clusterData=pickle.load(open(clust_data_path))
    queryData=pickle.load(open(data))
    
    ranker_tie="random"
    feature_count=basic_ranker.feature_count
    ranker_args="3"
    arg_str=""
    sample_send="sample_unit_sphere"
    iterations=100
    
    rankers=[0]*2
    rankers[0]=basic_ranker
    
    
    user_model = environment.CascadeUserModel(click)
    training_queries = query.load_queries(vali_queries, feature_count)
    compar_interleave=ProbabilisticInterleave(None)

    print "-Calculating-"
    
    ii=0


    results=[]
    for qid in queryData.query_ranker.keys():
        print str(float(ii)*100/float(len(queryData.query_ranker.keys())))+"%"
        ii+=1
        q=training_queries.get_query(qid)
        for val in queryData.query_ranker[qid]:
            test=val
            #test=queryData.query_ranker[q][0]
            testWeights=str(test.tolist())
            testWeights=testWeights.replace("[", "")
            testWeights=testWeights.replace("]", "")
            #weights = np.array([float(num) for num in testWeights.split(",")])
            #print len(weights)
            ranker_tie="random"
            ranker_args="3"
            sample_send="sample_unit_sphere"
    
            rankers[1]=rankerClass.ProbabilisticRankingFunction(ranker_args,
                                                    ranker_tie,
                                                    feature_count,
                                                    sample=sample_send,
                                                    init=testWeights)
           
            second_win=0
            for i in range(iterations):
                #q = training_queries.get_query(random.choice(training_queries.keys()))          
                l, a = compar_interleave.interleave(rankers[0], rankers[1], q, 10)
                c = user_model.get_clicks(l, q.get_labels())
                o = compar_interleave.infer_outcome(l, a, c, q)
                if(o>0):
                    second_win+=1
                elif(o==0):
                    coin=random.random()
                    if(coin>0.5):
                        second_win+=1
            result_com=float(second_win)/float(iterations)
            results.append(result_com)

    g=P.hist(results, bins = 20,range=[0,1])
    P.xlabel("The win rate of the ranker",fontsize=20)
    P.ylabel("Number of rankers",fontsize=20)
    P.show(g)