def test_weighter():
    """Smoke-test every weighter on document "20" and stem "accelerat"."""
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    fname = "data/cacm/cacm.txt"
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
    for i, w in enumerate(weighters):
        print "Test of weighter " + str(i)
        print "getDocWeightsForDoc"
        print w.getDocWeightsForDoc("20")
        print "getDocWeightsForStem"
        print w.getDocWeightsForStem("accelerat")
        print "getWeightsForQuery"
        print w.getWeightsForQuery(I.getTfsForDoc("20"))
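# Minimal entry-point sketch (an addition, not in the original file): lets the
# smoke test above run directly with `python <this_file>.py` instead of only
# from an interactive session.
if __name__ == "__main__":
    test_weighter()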
def initModels(I, modelType):
    """Init models of type modelType, or load them from disk if already computed."""
    model_file_name = modelType + '.p'
    sys.stdout.write("Creating models...")
    sys.stdout.flush()
    if os.path.isfile(model_file_name):
        models = pickle.load(open(model_file_name, "rb"))
    elif modelType == "Vectoriel":
        weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
        models = [Vectoriel(I, True, w) for w in weighters]
        pickle.dump(models, open(model_file_name, "wb"))
    else:
        raise ValueError("Unknown model type, ABORT THE MISSION: " + modelType)
    sys.stdout.write("Done!\n")
    return models
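# --- Usage sketch for initModels (an addition for illustration; assumes
# initIndex(), used by the engine below, builds or reloads the Index from the
# corpus path, and that models expose getScores() as the Okapi class does).
def demo_initModels():
    I = initIndex("data/cacm/cacm.txt")
    models = initModels(I, "Vectoriel")  # first call pickles to Vectoriel.p,
                                         # later calls reload that pickle
    query = {"algorithm": 1, "parallel": 1}  # hypothetical bag of stems
    print models[0].getScores(query)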
def __init__(self, index_file, query_file, relevance_file, model_type="Vectoriel"):
    """model_type = Vectoriel | Okapi | Language | PageRank | Hits | MetaModel | Random"""
    self.Index = initIndex(index_file)
    if model_type == "Vectoriel":
        self.models = initModels(self.Index, model_type)
    elif model_type == "Language":
        print "Init of Language model"
        self.models = [LanguageModel(self.Index, 0.2)]
    elif model_type == "Okapi":
        self.models = [Okapi(self.Index)]
    elif model_type == "PageRank":
        self.models = [RankModel(self.Index)]
    elif model_type == "Hits":
        self.models = [HitsModel(self.Index)]
    elif model_type == "MetaModel":
        # Learn a linear combination of seven base models.
        I = self.Index
        model1 = Vectoriel(I, True, TF_IDF(I))
        model2 = Vectoriel(I, True, Log_plus(I))
        model3 = Vectoriel(I, True, Binary(I))
        model4 = Vectoriel(I, True, TF(I))
        model5 = Okapi(I)
        model6 = LanguageModel(I, 0.2)
        model7 = RankModel(I, n=5, K=100, d=.85)
        featurers = [FeaturerModel(I, m) for m in
                     (model1, model2, model3, model4, model5, model6, model7)]
        listFeaturers = FeaturerList(featurers)
        metamodel = MetaModel(listFeaturers, I, query_file, relevance_file)
        metamodel.train()
        self.models = [metamodel]
    elif model_type == "Random":
        self.models = [RandomModel(self.Index)]
    else:
        raise ValueError("Unknown model type: " + model_type)
    print type(self.models[0])
    self.query_file = query_file
    self.relevance_file = relevance_file
    self.query_parser = QueryParser(self.query_file, self.relevance_file)
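# Usage sketch for the engine above (illustrative only; "Engine" stands in for
# the enclosing class name, which is not shown in this excerpt, and the query
# and relevance file paths are assumed CACM conventions):
#
#   engine = Engine("data/cacm/cacm.txt",
#                   "data/cacm/cacm.qry",
#                   "data/cacm/cacm.rel",
#                   model_type="Okapi")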
class Okapi(IRmodel):
    """BM25 / Okapi: classical probabilistic model for Information Retrieval."""

    def __init__(self, Index):
        """Set the parameters and precompute document lengths and idfs."""
        self.k1 = 1.0  # could be tuned, e.g. np.random.uniform(1, 2)
        self.b = 0.75
        self.Weighter = TF(Index)
        self.Index = Index
        # Collect document lengths and their mean over the collection
        self.L = {}
        self.L_moy = 0.0
        for doc_id in self.Index.docFrom.keys():
            self.L[doc_id] = float(self.Index.docFrom[doc_id][2])
            self.L_moy += self.L[doc_id]
        self.L_moy = self.L_moy / self.Weighter.N
        # Sanity check on the mean document length
        print 'L moy : ', self.L_moy
        # Precompute the probabilistic idf of every stem
        self.idf_probabilistic = self.compute_idf_probabilistic()

    def getName(self):
        return "Okapi"

    def getIndex(self):
        return self.Index

    def compute_idf_probabilistic(self):
        """Probabilistic inverse document frequency, clamped at 0.

        TODO: move this into Weighter.__init__() behind a switch such as
        probabilistic = True | False.
        """
        idf = {}
        N = self.Weighter.N
        for stem in self.Index.stems.keys():
            tfs = self.Index.getTfsForStem(stem)
            df_t = float(len(tfs))
            r = np.log((N - df_t + .5) / (df_t + .5))
            idf[stem] = max(0.0, r)
        return idf

    def f(self, q, d):
        """BM25 score measuring how well query q matches document d."""
        score = 0.0
        tfs = self.Weighter.getDocWeightsForDoc(d)
        for t in q:
            num = (self.k1 + 1) * tfs.get(t, 0)
            denom = self.k1 * ((1 - self.b) + self.b * (self.L[d] / self.L_moy)) \
                    + tfs.get(t, 0)
            score += self.idf_probabilistic.get(t, 0.0) * (num / denom)
        return score

    def getScores(self, query):
        """Compute every document's score for a given query."""
        scores = {}
        for doc_id in self.L.keys():
            scores[int(doc_id)] = self.f(query, doc_id)
        return scores
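# --- BM25 recap and usage sketch (an addition for illustration). The score
# computed by Okapi.f() above is the standard BM25 with k1 = 1.0, b = 0.75:
#
#   score(q, d) = sum over t in q of
#       idf(t) * (k1 + 1) * tf(t, d)
#                / (k1 * (1 - b + b * |d| / L_moy) + tf(t, d))
#   idf(t) = max(0, log((N - df(t) + 0.5) / (df(t) + 0.5)))
#
# The demo below assumes PorterStemmer.getTextRepresentation() returns a
# {stem: count} dict, as it is used elsewhere in this project.
def demo_okapi():
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    I = Index(parser, textRepresenter)
    I.indexation("data/cacm/cacm.txt")
    model = Okapi(I)
    query = textRepresenter.getTextRepresentation("parallel algorithms")
    scores = model.getScores(query)
    # Top 10 documents by decreasing BM25 score
    top = sorted(scores.items(), key=lambda kv: -kv[1])[:10]
    print top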