Example #1
    def __init__(self, Index, alpha=0.7, N=30):
        # initialise the underlying ranking model:
        # Log_plus weighting with a Euclidean-distance vector model
        # (alternative: LanguageModel(Index, 0.2))
        self.weighter = Log_plus(Index)
        self.ranking_model = Euclidian_model(Index, self.weighter)
        self.Index = Index

        self.alpha = alpha
        self.N = N
Example #2
def test_weighter():
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    fname = "data/cacm/cacm.txt"
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
    for i, w in enumerate(weighters):
        print "Test of weighter" + str(i)
        print "getDocWeightsForDoc"
        print w.getDocWeightsForDoc("20")
        print "getDocWeightsForStem"
        print w.getDocWeightsForStem("accelerat")
        print "getDocWeightsForQuery"
        print w.getWeightsForQuery(I.getTfsForDoc("20"))
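
Based on the calls exercised by test_weighter, the weighters appear to share a common interface. The sketch below is only an assumption drawn from those calls; the class name WeighterSketch and the return-type comments are illustrative, not taken from the original code.

# Hypothetical interface sketch, inferred from the calls in test_weighter();
# the class name and the return-type notes are assumptions.
class WeighterSketch(object):
    def __init__(self, index):
        self.index = index

    def getDocWeightsForDoc(self, doc_id):
        # presumably a {stem: weight} dictionary for one document
        raise NotImplementedError

    def getDocWeightsForStem(self, stem):
        # presumably a {doc_id: weight} dictionary for one stem
        raise NotImplementedError

    def getWeightsForQuery(self, query_tfs):
        # presumably a {stem: weight} dictionary for a query given its term frequencies
        raise NotImplementedError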
Example #3
def initModels(I, modelType):
    """Init Models of type modelType or load if already computed"""

    model_file_name = modelType + '.p'

    sys.stdout.write("Creating models...")
    sys.stdout.flush()

    if os.path.isfile(model_file_name):
        models = pickle.load(open(model_file_name, "rb"))

    elif modelType == "Vectoriel":
        weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
        models = [Vectoriel(I, True, w) for w in weighters]
        pickle.dump(models, open(model_file_name, "wb"))

    else:
        raise ValueError("Unknown model type, ABORT THE MISSION: " + modelType)

    sys.stdout.write("Done!\n")

    return models
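
A minimal usage sketch for initModels, assuming an Index built as in Example #2 and a stemmed query dictionary like the one used in Example #5; getRanking returning (doc_id, score) pairs is assumed from Example #7.

# Hedged usage sketch; assumes the CACM files and the getRanking interface
# shown in the other examples.
parser = ParserCACM()
textRepresenter = PorterStemmer()
I = Index(parser, textRepresenter)
I.indexation("data/cacm/cacm.txt")

models = initModels(I, "Vectoriel")
query = {'techniqu': 1, 'accelerat': 1}  # stemmed query terms with frequencies
for m in models:
    print m.getRanking(query)[:10]  # assumed to return (doc_id, score) pairs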
Example #4
    def __init__(self,
                 N,
                 index_file,
                 query_file,
                 relevance_file,
                 model_type="Vectoriel",
                 div_K=None,
                 div_N=None,
                 eval_N=20):
        """ model_type = Vectoriel | Okapi | Language | PageRank | MetaModel """
        self.N = eval_N
        self.Index = initIndex(index_file)

        if model_type == "Vectoriel":
            self.models = [Vectoriel(self.Index, True, Log_plus(self.Index))]
            # alternative: initModels(self.Index, model_type)
        elif model_type == "Euclidian_model":
            self.models = [Euclidian_model(self.Index, Log_plus(self.Index))]

        elif model_type == "Language":
            print "Init of Language model"
            self.models = [LanguageModel(self.Index, 0.2)]

        elif model_type == "Okapi":
            self.models = [Okapi(self.Index)]

        elif model_type == "PageRank":
            self.models = [RankModel(self.Index)]

        elif model_type == "Hits":
            self.models = [HitsModel(self.Index)]

        elif model_type == "KMeans_diversity":
            self.models = [KMeans_diversity(self.Index, div_K, div_N)]

        elif model_type == "Greedy_diversity":
            self.models = [Greedy_diversity(self.Index, div_K, div_N)]

        elif model_type == "Greedy_diversity_euclidian":
            print "alpha, N:", div_K, div_N
            self.models = [
                Greedy_diversity_euclidian(self.Index, alpha=div_K, N=div_N)
            ]

        elif model_type == "MetaModel":
            """Learning a linear combination of 4 models"""
            I = self.Index
            w1 = TF_IDF(I)
            model1 = Vectoriel(I, True, w1)
            w2 = Log_plus(I)
            model2 = Vectoriel(I, True, w2)
            #w3 = Log(I)
            #model3 = Vectoriel(I,True, w3)

            model3 = Okapi(I)

            f1 = FeaturerModel(I, model1)
            f2 = FeaturerModel(I, model2)
            f3 = FeaturerModel(I, model3)
            #f4 = FeaturerModel(I,model4)

            listFeaturers = FeaturerList([f1, f2, f3])  #,f4])
            metamodel = MetaModel(listFeaturers, I, query_file, relevance_file)
            metamodel.train()
            self.models = [metamodel]

        print type(self.models[0])
        self.query_file = query_file
        self.relevance_file = relevance_file
        self.query_parser = GroundTruthParser(self.query_file,
                                              self.relevance_file)
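
The enclosing class is not shown in this example. Assuming it is an evaluation harness (called EvalIRModel below purely for illustration) and assuming CACM-style file paths, instantiation might look like this; the .qry and .rel paths are placeholders.

# Hypothetical instantiation; the class name EvalIRModel and the query/relevance
# file paths are illustrative assumptions.
evaluator = EvalIRModel(N=20,
                        index_file="data/cacm/cacm.txt",
                        query_file="data/cacm/cacm.qry",
                        relevance_file="data/cacm/cacm.rel",
                        model_type="Greedy_diversity_euclidian",
                        div_K=0.7,  # passed as alpha to Greedy_diversity_euclidian
                        div_N=30)   # passed as N to Greedy_diversity_euclidian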
Example #5
    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))

    else:
        parser = ParserCACM()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(fname)
        I.parser = None
        pickle.dump(I, open("Index.p", "wb"))

    sys.stdout.write("Done!\n")
    sys.stdout.flush()

    sys.stdout.write("Creating weighters...")
    sys.stdout.flush()

    if os.path.isfile('Models.p'):
        models = pickle.load(open("Models.p", "rb"))
    else:
        weighters = [Binary(I), Log_plus(I)]  # TF(I), TF_IDF(I), Log(I) also available
        models = [Vectoriel(I, True, w) for w in weighters]
        pickle.dump(models, open("Models.p", "wb"))

    sys.stdout.write("Done!\n")

    queryExample = {'techniqu': 1, 'accelerat': 1}
    query_results = testQuery(queryExample, models)
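
testQuery is not defined in this snippet. A hedged sketch of what it might do, based on the getRanking and getName methods seen in Example #7:

# Hypothetical sketch of testQuery; the real helper is not shown here.
def testQuery(query, models):
    results = {}
    for m in models:
        # getName() and getRanking() are assumed from the IRmodel usage in Example #7
        results[m.getName()] = m.getRanking(query)
    return results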
Example #6
    def __init__(self, index_file, query_file, relevance_file, model_type="Vectoriel"):
        """ model_type = Vectoriel | Okapi | Language | PageRank | Hits | MetaModel | Random """

        self.Index = initIndex(index_file)

        if model_type == "Vectoriel":
            self.models = initModels(self.Index,model_type)
            
        elif model_type == "Language":
            print "Init of Language model"
            self.models = [LanguageModel(self.Index,0.2)]
            
        elif model_type == "Okapi":
            self.models = [Okapi(self.Index)]
            
        elif model_type == "PageRank":
            self.models = [RankModel(self.Index)]
            
        elif model_type == "Hits":
            self.models = [HitsModel(self.Index)]
            
        elif model_type == "MetaModel":
            """Learning a linear combination of 4 models"""
            I = self.Index
            
            w1 = TF_IDF(I)
            model1 = Vectoriel(I,True, w1)
            
            w2 = Log_plus(I)
            model2 = Vectoriel(I,True, w2)
            
            w3 = Binary(I)
            model3 = Vectoriel(I,True, w3)
            
            w4 = TF(I)
            model4 = Vectoriel(I,True, w4)
            
            model5 = Okapi(I)
            
            model6 = LanguageModel(I,0.2)
            
            model7 = RankModel(I, n=5, K=100, d=.85)
            
            f1 = FeaturerModel(I,model1)
            f2 = FeaturerModel(I,model2)
            f3 = FeaturerModel(I,model3)
            f4 = FeaturerModel(I,model4)
            f5 = FeaturerModel(I,model5)
            f6 = FeaturerModel(I,model6)
            f7 = FeaturerModel(I,model7)
            
            
            listFeaturers = FeaturerList([f1,f2,f3,f4,f5,f6,f7])
            metamodel = MetaModel(listFeaturers,I,query_file,relevance_file)
            metamodel.train()
            self.models = [metamodel]
        elif model_type == "Random":
            self.models = [RandomModel(self.Index)]
        else:
            raise ValueError("Unknown model type: " + model_type)
        
        print type(self.models[0])    
        self.query_file = query_file
        self.relevance_file = relevance_file
        self.query_parser = QueryParser(self.query_file, self.relevance_file)  
Example #7
class Greedy_diversity_euclidian(IRmodel):
    
    def __init__(self, Index, alpha=0.7, N=30):
        # initialise the underlying ranking model:
        # Log_plus weighting with a Euclidean-distance vector model
        # (alternative: LanguageModel(Index, 0.2))
        self.weighter = Log_plus(Index)
        self.ranking_model = Euclidian_model(Index, self.weighter)
        self.Index = Index

        self.alpha = alpha
        self.N = N
        
    def getName(self):
        return "Greedy_diversity_euclidian"
        
    def getIndex(self):
        return self.Index
        
    def getRanking(self, query):
        CST = 32.  # normalisation constant for the cosine distance
        # first compute a relevance ranking with the underlying model
        doc_ranking = self.ranking_model.getRanking(query)

        # the most relevant document seeds the diversified list (the "hinge")
        hinge = [doc_ranking[0]]

        K = self.N
        unordered_docs = list(doc_ranking[1:K])
        for rank_idx in range(1, K):

            max_score = -sys.maxint
            max_doc = None

            for doc_id, relev_score in unordered_docs:
                # average similarity between the candidate and the documents
                # already selected in the hinge
                di = self.weighter.getDocWeightsForDoc(str(doc_id))
                sim_phi = 0.0
                for hinge_doc_id, _ in hinge:
                    # cosine distance between the candidate and an already selected doc
                    dh = self.weighter.getDocWeightsForDoc(str(hinge_doc_id))
                    sim_phi += -Cos_Distance(dh, di) / CST
                denom = len(hinge)
                # MMR-style trade-off between relevance and redundancy
                score = (self.alpha * relev_score) - ((1 - self.alpha) * sim_phi / denom)
                if score > max_score:
                    max_score = score
                    max_doc = (doc_id, relev_score)
            assert(max_score != -sys.maxint)

            # the candidate with the best score is next in the diversified ranking
            hinge.append(max_doc)
            # remove it from the remaining candidates
            unordered_docs.remove(max_doc)
        assert(len(unordered_docs) == 0)
        # keep the original ranking untouched beyond the top N
        return hinge + doc_ranking[self.N:]
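
A short usage sketch for the class above, assuming an Index built as in the other examples and a stemmed query dictionary; the CACM path is taken from Example #2.

# Hedged usage sketch for Greedy_diversity_euclidian.
parser = ParserCACM()
textRepresenter = PorterStemmer()
I = Index(parser, textRepresenter)
I.indexation("data/cacm/cacm.txt")

diversifier = Greedy_diversity_euclidian(I, alpha=0.7, N=30)
ranking = diversifier.getRanking({'techniqu': 1, 'accelerat': 1})
print ranking[:10]  # diversified top of the ranking, (doc_id, score) pairs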
Example #8
    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))

    else:
        parser = ParserCACM()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(fname)
        I.parser = None
        pickle.dump(I, open("Index.p", "wb"))

    w1 = TF_IDF(I)
    model1 = Vectoriel(I, True, w1)
    w2 = Log_plus(I)
    model2 = Vectoriel(I, True, w2)
    w3 = Log(I)
    model3 = Vectoriel(I, True, w3)

    model4 = Okapi(I)
    queryExample = {'techniqu': 1, 'accelerat': 1}

    f1 = FeaturerModel(I, model1)
    print "\ndone building f1"
    f2 = FeaturerModel(I, model2)
    print "\ndone building f2"
    f3 = FeaturerModel(I, model3)
    print "\ndone building f3"

    f4 = FeaturerModel(I, model4)
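
The snippet stops after building the featurers. Following the pattern of Example #6, the remaining steps would presumably be the following; query_file, relevance_file and the getRanking call on the trained MetaModel are assumptions.

    # Presumed continuation, mirroring Example #6; query_file, relevance_file and
    # the getRanking call are assumptions.
    listFeaturers = FeaturerList([f1, f2, f3, f4])
    metamodel = MetaModel(listFeaturers, I, query_file, relevance_file)
    metamodel.train()

    print metamodel.getRanking(queryExample)[:10]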