Example #1
def test_weighter():
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    fname = "data/cacm/cacm.txt"
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
    for i, w in enumerate(weighters):
        print("Test of weighter " + str(i))
        print("getDocWeightsForDoc")
        print(w.getDocWeightsForDoc("20"))
        print("getDocWeightsForStem")
        print(w.getDocWeightsForStem("accelerat"))
        print("getWeightsForQuery")
        print(w.getWeightsForQuery(I.getTfsForDoc("20")))
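For context, every weighter exercised above exposes the same three lookups. The base class below is a minimal sketch inferred from those calls, not the project's actual definition; the signatures and the dict-shaped return values are assumptions.

class Weighter(object):
    """Hypothetical base interface inferred from test_weighter()."""

    def __init__(self, index):
        self.Index = index

    def getDocWeightsForDoc(self, doc_id):
        """Return {stem: weight} for one document (assumed shape)."""
        raise NotImplementedError

    def getDocWeightsForStem(self, stem):
        """Return {doc_id: weight} for one stem (assumed shape)."""
        raise NotImplementedError

    def getWeightsForQuery(self, query_tfs):
        """Return {stem: weight} for a query's term-frequency dict (assumed shape)."""
        raise NotImplementedError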
Example #2
import os
import pickle
import sys


def initModels(I, modelType):
    """Init models of type modelType, or load them if already computed."""

    model_file_name = modelType + '.p'

    sys.stdout.write("Creating models...")
    sys.stdout.flush()

    if os.path.isfile(model_file_name):
        # Reload previously pickled models instead of recomputing them
        with open(model_file_name, "rb") as f:
            models = pickle.load(f)

    elif modelType == "Vectoriel":
        weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
        # Build one vectorial model per weighter on the index instance I
        models = [Vectoriel(I, True, w) for w in weighters]
        with open(model_file_name, "wb") as f:
            pickle.dump(models, f)

    else:
        # Fail loudly instead of returning an undefined `models`
        raise ValueError("Unknown model type: " + modelType)

    sys.stdout.write("Done!\n")

    return models
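A minimal usage sketch, assuming the CACM setup from Example #1 (the import paths for ParserCACM, PorterStemmer and Index are project-specific and not shown here):

parser = ParserCACM()
textRepresenter = PorterStemmer()
I = Index(parser, textRepresenter)
I.indexation("data/cacm/cacm.txt")

# The first call computes and pickles Vectoriel.p; later calls reload it
models = initModels(I, "Vectoriel")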
Example #3
    def __init__(self, index_file, query_file, relevance_file, model_type="Vectoriel"):
        """ model_type = Vectoriel | Okapi | Language | PageRank | MetaModel """

        self.Index = initIndex(index_file)
        
        if model_type == "Vectoriel":
            self.models = initModels(self.Index, model_type)
            
        elif model_type == "Language":
            print "Init of Language model"
            self.models = [LanguageModel(self.Index,0.2)]
            
        elif model_type == "Okapi":
            self.models = [Okapi(self.Index)]
            
        elif model_type == "PageRank":
            self.models = [RankModel(self.Index)]
            
        elif model_type == "Hits":
            self.models = [HitsModel(self.Index)]
            
        elif model_type == "MetaModel":
            """Learning a linear combination of 4 models"""
            I = self.Index
            
            w1 = TF_IDF(I)
            model1 = Vectoriel(I,True, w1)
            
            w2 = Log_plus(I)
            model2 = Vectoriel(I,True, w2)
            
            w3 = Binary(I)
            model3 = Vectoriel(I,True, w3)
            
            w4 = TF(I)
            model4 = Vectoriel(I,True, w4)
            
            model5 = Okapi(I)
            
            model6 = LanguageModel(I,0.2)
            
            model7 = RankModel(I,n=5, K=100,d=.85)
            
            f1 = FeaturerModel(I,model1)
            f2 = FeaturerModel(I,model2)
            f3 = FeaturerModel(I,model3)
            f4 = FeaturerModel(I,model4)
            f5 = FeaturerModel(I,model5)
            f6 = FeaturerModel(I,model6)
            f7 = FeaturerModel(I,model7)
            
            
            listFeaturers = FeaturerList([f1,f2,f3,f4,f5,f6,f7])
            metamodel = MetaModel(listFeaturers,I,query_file,relevance_file)
            metamodel.train()
            self.models = [metamodel]
        elif model_type == "Random":
            self.models = [RandomModel(self.Index)]
        else:
            pass
        
        print type(self.models[0])    
        self.query_file = query_file
        self.relevance_file = relevance_file
        self.query_parser = QueryParser(self.query_file, self.relevance_file)  
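The enclosing class name is omitted from this snippet, so the usage sketch below invents EvalIRModel as a stand-in; the .qry and .rel file paths are equally illustrative assumptions.

# Hypothetical usage; the class name EvalIRModel and the .qry/.rel paths are placeholders
evaluator = EvalIRModel("data/cacm/cacm.txt",
                        "data/cacm/cacm.qry",
                        "data/cacm/cacm.rel",
                        model_type="Okapi")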
Example #4
import numpy as np


class Okapi(IRmodel):
    """Okapi BM25: the classical probabilistic model for Information Retrieval."""
    
    def __init__(self, Index):
        """Set the BM25 parameters and precompute document statistics."""
        self.k1 = 1.  # could also be drawn from np.random.uniform(1, 2)
        self.b = 0.75
        self.Weighter = TF(Index)
        self.Index = Index

        # Collect each document's length and the mean length over the corpus
        self.L = {}
        self.L_moy = 0.0
        for doc_id in self.Index.docFrom.keys():
            self.L[doc_id] = float(self.Index.docFrom[doc_id][2])
            self.L_moy += self.L[doc_id]
        self.L_moy = self.L_moy / self.Weighter.N  # N is the number of documents
        print('Mean document length:', self.L_moy)

        # Precompute the probabilistic IDF of every stem; note that this
        # deliberately replaces the bound method with the resulting dict
        self.idf_probabilistic = self.idf_probabilistic()
        
    def getName(self):
        return "Okapi"
    
    def getIndex(self):
        return self.Index
        
    def idf_probabilistic(self):
        """Probabilistic inverse document frequency.
            TODO: move this into Weighter.__init__() behind a switch parameter
                  such as probabilistic = True | False
        """
        idf = {}
        N = self.Weighter.N
        for stem in self.Index.stems.keys():
            tfs = self.Index.getTfsForStem(stem)
            df_t = float(len(tfs))  # document frequency of the stem
            r = np.log((N - df_t + .5) / (df_t + .5))
            idf[stem] = max(0., r)  # clamp negative IDFs to zero
        return idf
        
    def f(self, q, d):
        """Score measuring how well query q matches document d."""
        score = 0.0
        tfs = self.Weighter.getDocWeightsForDoc(d)
        for t in q:
            num = (self.k1 + 1) * tfs.get(t, 0)
            denom = self.k1 * ((1 - self.b) + self.b * (self.L[d] / self.L_moy)) \
                    + tfs.get(t, 0)
            score += self.idf_probabilistic.get(t, 0.0) * (num / denom)
        return score
    
    def getScores(self, query):
        """Compute every document's score for a given query."""
        scores = {}
        docs = self.L.keys()
        for doc_id in docs:
            scores[int(doc_id)] = self.f(query, doc_id)
        return scores
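For reference, the two methods above implement the classical BM25 formulas. With $N$ the number of documents, $df_t$ the document frequency of term $t$, $\mathrm{tf}(t,d)$ the term frequency in document $d$, $|d|$ the document length and $L_{moy}$ the mean length, idf_probabilistic() and f() compute:

$$\mathrm{idf}(t) = \max\left(0,\ \log\frac{N - df_t + 0.5}{df_t + 0.5}\right)$$

$$\mathrm{score}(q, d) = \sum_{t \in q} \mathrm{idf}(t) \cdot \frac{(k_1 + 1)\,\mathrm{tf}(t, d)}{k_1\left((1 - b) + b\,\frac{|d|}{L_{moy}}\right) + \mathrm{tf}(t, d)}$$

with $k_1 = 1$ and $b = 0.75$ as set in __init__().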