Example #1
    def Save(self):
        # Assumes module-level imports: pickle, plus the project's
        # queryClass module and queryFeatures class.
        print("Loading Data")

        training_queries = queryClass.load_queries(self.testQueries,
                                                   self.feature_count)
        with open(self.rankerPath, "rb") as f:
            ranker = pickle.load(f)

        max_docs = 100  # maximum number of documents taken from each ranking

        best_ranker = queryFeatures()
        print("Loading training objects")
        for query in training_queries:
            # Rank the documents for this query and store the feature
            # vectors of the top max_docs results under the query id.
            ranker.init_ranking(query)
            doc_ids = ranker.get_ranking()
            for rank, doc_id in enumerate(doc_ids):
                if rank >= max_docs:
                    break
                features = query.get_feature_vector(doc_id)
                best_ranker.add(query.get_qid(), features)

        with open("QueryData/" + self.dataset + ".data", "wb") as f:
            pickle.dump(best_ranker, f)
Example #2
    def Train(self):
        # Assumes module-level imports: os, pickle, numpy as np,
        # from sklearn import svm, and the project's queryClass module.
        print("Loading Data")
        with open(self.clusterDataPath, "rb") as f:
            clusterData = pickle.load(f)
        feature_count = len(clusterData.clusterToRanker[0][0])
        training_queries = queryClass.load_queries(self.testQueries, feature_count)
        with open(self.rankerPath, "rb") as f:
            ranker = pickle.load(f)
        # (An earlier, disabled variant built the ranker directly from the
        # cluster weights via rankerClass.ProbabilisticRankingFunction
        # instead of unpickling it.)

        X = []
        Y = []
        max_docs = 100  # maximum number of documents taken from each ranking

        print("Loading training objects")
        for qid in clusterData.queryToCluster:
            # Each of the top max_docs documents for a query becomes one
            # training sample, labelled with the query's cluster id.
            query = training_queries.get_query(qid)
            ranker.init_ranking(query)
            doc_ids = ranker.get_ranking()
            for rank, doc_id in enumerate(doc_ids):
                if rank >= max_docs:
                    break
                X.append(query.get_feature_vector(doc_id))
                Y.append(clusterData.queryToCluster[qid][0])

        X = np.array(X)
        Y = np.array(Y)
        print("Training")
        clf = svm.SVC()
        clf.fit(X, Y)

        if not os.path.exists("Classifier"):
            os.makedirs("Classifier")

        # Name the classifier file after the cluster-data file,
        # e.g. "path/to/foo.data" -> "Classifier/foo.data".
        name = os.path.splitext(os.path.basename(self.clusterDataPath))[0]
        with open("Classifier/" + name + ".data", "wb") as f:
            pickle.dump(clf, f)
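And a minimal sketch of applying the classifier saved by Train, assuming a feature vector of the same length as feature_count; the file name and the zero vector are hypothetical placeholders:

    import pickle
    import numpy as np

    # Hypothetical file name; Train writes to "Classifier/" + name + ".data".
    with open("Classifier/example.data", "rb") as f:
        clf = pickle.load(f)

    features = np.zeros(64)  # placeholder; must have feature_count entries
    cluster = clf.predict(features.reshape(1, -1))[0]  # predicted cluster id
    print(cluster)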