def __init__(self, Index, alpha=0.7, N=30):
    """Set up the base ranking model (Euclidian over a Log_plus
    weighter) and store the diversity parameters.

    Index -- project index object
    alpha -- relevance/diversity trade-off weight
    N     -- number of top documents to greedily re-rank
    """
    weighter = Log_plus(Index)
    self.Index = Index
    self.alpha = alpha
    self.N = N
    self.weighter = weighter
    self.ranking_model = Euclidian_model(Index, weighter)
def test_weighter(): parser = ParserCACM() textRepresenter = PorterStemmer() fname = "data/cacm/cacm.txt" I = Index(parser, textRepresenter) I.indexation(fname) weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)] for i, w in enumerate(weighters): print "Test of weighter" + str(i) print "getDocWeightsForDoc" print w.getDocWeightsForDoc("20") print "getDocWeightsForStem" print w.getDocWeightsForStem("accelerat") print "getDocWeightsForQuery" print w.getWeightsForQuery(I.getTfsForDoc("20"))
def initModels(I, modelType):
    """Init models of type modelType, or load them from the pickle
    cache ('<modelType>.p') if already computed.

    I         -- project Index instance
    modelType -- currently only "Vectoriel" is supported

    Returns a list of model objects.
    Raises ValueError for an unknown modelType (the original printed a
    message and then crashed with a NameError on the return).
    """
    model_file_name = modelType + '.p'
    sys.stdout.write("Creating models...")
    sys.stdout.flush()
    if os.path.isfile(model_file_name):
        # Reuse models cached by a previous run.
        with open(model_file_name, "rb") as f:
            models = pickle.load(f)
    elif modelType == "Vectoriel":
        weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
        # Pass the index *instance* I, consistent with the
        # Vectoriel(I, True, w) calls elsewhere in this file (the
        # original passed the Index class by mistake).
        models = [Vectoriel(I, True, w) for w in weighters]
        with open(model_file_name, "wb") as f:
            pickle.dump(models, f)
    else:
        # Fail fast instead of returning an unbound name.
        raise ValueError("Unknown model type: " + modelType)
    sys.stdout.write("Done!\n")
    return models
def __init__(self, N, index_file, query_file, relevance_file, model_type="Vectoriel", div_K=None, div_N=None, eval_N=20): """ model_type = Vectoriel | Okapi | Language | PageRank | MetaModel """ self.N = eval_N self.Index = initIndex(index_file) if model_type == "Vectoriel": self.models = [Vectoriel(Index, True, Log_plus(self.Index)) ] #initModels(self.Index,model_type) elif model_type == "Euclidian_model": self.models = [Euclidian_model(self.Index, Log_plus(self.Index))] elif model_type == "Language": print "Init of Language model" self.models = [LanguageModel(self.Index, 0.2)] elif model_type == "Okapi": self.models = [Okapi(self.Index)] elif model_type == "PageRank": self.models = [RankModel(self.Index)] elif model_type == "Hits": self.models = [HitsModel(self.Index)] elif model_type == "KMeans_diversity": self.models = [KMeans_diversity(self.Index, div_K, div_N)] elif model_type == "Greedy_diversity": self.models = [Greedy_diversity(self.Index, div_K, div_N)] elif model_type == "Greedy_diversity_euclidian": print "alpha, N:", div_K, div_N self.models = [ Greedy_diversity_euclidian(self.Index, alpha=div_K, N=div_N) ] elif model_type == "MetaModel": """Learning a linear combination of 4 models""" I = self.Index w1 = TF_IDF(I) model1 = Vectoriel(I, True, w1) w2 = Log_plus(I) model2 = Vectoriel(I, True, w2) #w3 = Log(I) #model3 = Vectoriel(I,True, w3) model3 = Okapi(I) f1 = FeaturerModel(I, model1) f2 = FeaturerModel(I, model2) f3 = FeaturerModel(I, model3) #f4 = FeaturerModel(I,model4) listFeaturers = FeaturerList([f1, f2, f3]) #,f4]) metamodel = MetaModel(listFeaturers, I, query_file, relevance_file) metamodel.train() self.models = [metamodel] print type(self.models[0]) self.query_file = query_file self.relevance_file = relevance_file self.query_parser = GroundTruthParser(self.query_file, self.relevance_file)
# Build (or load from cache) the index, then build (or load) the
# vectorial models and run an example query through them.
sys.stdout.write("Indexing database...")
sys.stdout.flush()
if os.path.isfile('Index.p'):
    # Reuse the pickled index from a previous run.
    I = pickle.load(open("Index.p", "rb"))
else:
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    # Drop the parser before pickling — presumably it is not
    # picklable; verify against the Index implementation.
    I.parser = None
    pickle.dump(I, open("Index.p", "wb"))
sys.stdout.write("Done!\n")
sys.stdout.flush()

sys.stdout.write("Creating weighters...")
sys.stdout.flush()
# Check for the same file the models are loaded from / dumped to
# (the original tested for the unrelated 'Vectoriel.p', so the
# Models.p cache was never actually reused).
if os.path.isfile('Models.p'):
    models = pickle.load(open("Models.p", "rb"))
else:
    weighters = [Binary(I), Log_plus(I)]  # , TF(I), TF_IDF(I), Log(I)]
    # Pass the index to the model: every other Vectoriel(...) call in
    # this file takes the index as first argument; the original
    # omitted it here.
    models = [Vectoriel(I, True, w) for w in weighters]
    pickle.dump(models, open("Models.p", "wb"))
sys.stdout.write("Done!\n")

queryExample = {'techniqu': 1, 'accelerat': 1}
query_results = testQuery(queryExample, models)
def __init__(self, index_file, query_file, relevance_file,model_type="Vectoriel"): """ model_type = Vectoriel | Okapi | Language | PageRank | MetaModel """ self.Index = initIndex(index_file) if model_type == "Vectoriel": self.models = initModels(self.Index,model_type) elif model_type == "Language": print "Init of Language model" self.models = [LanguageModel(self.Index,0.2)] elif model_type == "Okapi": self.models = [Okapi(self.Index)] elif model_type == "PageRank": self.models = [RankModel(self.Index)] elif model_type == "Hits": self.models = [HitsModel(self.Index)] elif model_type == "MetaModel": """Learning a linear combination of 4 models""" I = self.Index w1 = TF_IDF(I) model1 = Vectoriel(I,True, w1) w2 = Log_plus(I) model2 = Vectoriel(I,True, w2) w3 = Binary(I) model3 = Vectoriel(I,True, w3) w4 = TF(I) model4 = Vectoriel(I,True, w4) model5 = Okapi(I) model6 = LanguageModel(I,0.2) model7 = RankModel(I,n=5, K=100,d=.85) f1 = FeaturerModel(I,model1) f2 = FeaturerModel(I,model2) f3 = FeaturerModel(I,model3) f4 = FeaturerModel(I,model4) f5 = FeaturerModel(I,model5) f6 = FeaturerModel(I,model6) f7 = FeaturerModel(I,model7) listFeaturers = FeaturerList([f1,f2,f3,f4,f5,f6,f7]) metamodel = MetaModel(listFeaturers,I,query_file,relevance_file) metamodel.train() self.models = [metamodel] elif model_type == "Random": self.models = [RandomModel(self.Index)] else: pass print type(self.models[0]) self.query_file = query_file self.relevance_file = relevance_file self.query_parser = QueryParser(self.query_file, self.relevance_file)
class Greedy_diversity_euclidian(IRmodel):
    """Greedy MMR-style re-ranking: the top-N documents of a base
    euclidian ranking are re-ordered to balance relevance against
    redundancy, measured by a cosine-based distance between document
    weight vectors from a Log_plus weighter.
    """

    def __init__(self, Index, alpha=0.7, N=30):
        """
        Index -- project index object
        alpha -- relevance/diversity trade-off weight
        N     -- number of top documents to greedily re-rank
        """
        self.weighter = Log_plus(Index)
        self.ranking_model = Euclidian_model(Index, self.weighter)
        self.Index = Index
        self.alpha = alpha
        self.N = N

    def getName(self):
        return "Greedy_diversity_euclidian"

    def getIndex(self):
        return self.Index

    def getRanking(self, query):
        """Return a ranking whose top-N documents are greedily
        re-ordered for diversity; the tail keeps the base model's
        order and scores.
        """
        CST = 32.  # normalisation constant for the cosine distance
        # Base relevance ranking; first document is the most relevant
        # and seeds the selection.
        doc_ranking = self.ranking_model.getRanking(query)
        hinge = [doc_ranking[0]]
        K = self.N
        unordered_docs = list(doc_ranking[1:K])
        # Cache the weight vector of every document already selected.
        # BUG FIX: the original compared each candidate only against
        # the FIRST selected document's vector (`du` was computed once
        # and the loop variable over `hinge` was never used); each
        # selected document must contribute its own vector.
        hinge_weights = [self.weighter.getDocWeightsForDoc(str(hinge[0][0]))]
        for rank_idx in range(1, K):
            max_score = -sys.maxint
            max_doc = None
            max_doc_weights = None
            for doc_id, relev_score in unordered_docs:
                di = self.weighter.getDocWeightsForDoc(str(doc_id))
                # Mean (negated, scaled) cosine distance to the
                # documents selected so far.
                sim_phi = 0.0
                for dh in hinge_weights:
                    sim_phi += -Cos_Distance(dh, di) / CST
                denom = len(hinge)
                # Relevance weighted by alpha, redundancy penalty
                # weighted by (1 - alpha).
                score = (self.alpha * relev_score) \
                    - ((1 - self.alpha) * sim_phi / denom)
                if score > max_score:
                    max_score = score
                    max_doc = (doc_id, relev_score)
                    max_doc_weights = di
            assert(max_score != -sys.maxint)
            # The best-scoring document is next in the ranking.
            hinge.append(max_doc)
            hinge_weights.append(max_doc_weights)
            unordered_docs.remove(max_doc)
        assert(len(unordered_docs) == 0)
        # Tail of the ranking keeps the base model's order.
        return hinge + doc_ranking[self.N:]
# Build (or load from cache) the index, then construct four base
# retrieval models and wrap each in a FeaturerModel for meta-model use.
sys.stdout.write("Indexing database...")
sys.stdout.flush()
if os.path.isfile('Index.p'):
    # Reuse the pickled index from a previous run.
    I = pickle.load(open("Index.p", "rb"))
else:
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    I = Index(parser, textRepresenter)
    # NOTE(review): `fname` must be defined earlier in the file.
    I.indexation(fname)
    # Drop the parser before pickling — presumably it is not
    # picklable; verify against the Index implementation.
    I.parser = None
    pickle.dump(I, open("Index.p", "wb"))
# Three vectorial models with different weighting schemes, plus Okapi.
w1 = TF_IDF(I)
model1 = Vectoriel(I, True, w1)
w2 = Log_plus(I)
model2 = Vectoriel(I, True, w2)
w3 = Log(I)
model3 = Vectoriel(I, True, w3)
model4 = Okapi(I)
queryExample = {'techniqu': 1, 'accelerat': 1}
# Wrap each model as a feature extractor; building these is slow,
# hence the progress prints.
f1 = FeaturerModel(I, model1)
print "\ndone building f1"
f2 = FeaturerModel(I, model2)
print "\ndone building f2"
f3 = FeaturerModel(I, model3)
print "\ndone building f3"
f4 = FeaturerModel(I, model4)