def getScores(self, query):
    """query: the query text.
    Returns a dictionary with the score of each document for the query."""
    ps = PorterStemmer()
    # Stem the query terms
    requete = dict(ps.getTextRepresentation(query))
    obj_index = self._index
    idsDoc = []
    dicoScores = dict()
    dicoTfsTermesInQueryColl = dict()
    totalTfsCol = 0
    # Compute the tf of each term over the whole collection
    for t in obj_index.getIndexInv().keys():
        # tf(t, collection) = number of occurrences of t in the collection
        tfTermeCollection = 0
        # dict of the form {doc_id: number of occurrences of t in the doc}
        tfInDocs = obj_index.getTfsForStem(t)
        # Accumulate the total tf over the collection
        for num_doc in tfInDocs.keys():
            tfTermeCollection += tfInDocs[num_doc]
        totalTfsCol += tfTermeCollection
        # If t occurs in the query, keep its collection tf:
        # it is needed later to score the documents
        if t in requete.keys():
            dicoTfsTermesInQueryColl[t] = tfTermeCollection
    # Collect the documents containing at least one query term
    for term in requete.keys():
        tfs_term = obj_index.getTfsForStem(term)
        # Make sure the term is in the inverted index
        if tfs_term != 0:
            for num_doc in tfs_term.keys():
                if num_doc not in idsDoc:
                    idsDoc.append(num_doc)
    # Score the candidate documents for each query term.
    # First gather, for each document, its terms and their tfs.
    for num_doc in idsDoc:
        TotalTfTerms = 0
        dictTfTermesDoc = dict()
        tfsForDoc = obj_index.getTfsForDoc(num_doc)
        score_doc = 1
        # For every term of the inverted index present in document num_doc,
        # add its tf to the document's total tf
        for terme in obj_index.getIndexInv().keys():
            # tfsForDoc has the form {term: tf(t, d)}
            if terme in tfsForDoc.keys():
                tfTermeDoc = tfsForDoc[terme]
                dictTfTermesDoc[terme] = tfTermeDoc
                TotalTfTerms += tfTermeDoc
        # Multiply in the ratio tf(term, doc) / TotalTfTerms for each
        # query term (float() guards against Python 2 integer division)
        for stem, tf in dictTfTermesDoc.items():
            if stem in requete.keys():
                score_doc *= (float(tf) / TotalTfTerms
                              + float(dicoTfsTermesInQueryColl[stem]) / totalTfsCol)
        dicoScores[num_doc] = score_doc
    return dicoScores
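# The score above multiplies, per query term, its frequency ratio in the
# document plus its frequency ratio in the collection -- a Jelinek-Mercer-style
# mixture without an interpolation weight. A minimal self-contained sketch of
# the same idea on plain dicts (the toy corpus and helper names below are
# illustrative, not part of the project):
def mixture_score(query_tf, doc_tf, coll_tf, coll_total):
    doc_total = sum(doc_tf.values())
    score = 1.0
    for term in query_tf:
        if term in doc_tf:
            # ratio in the document + ratio in the whole collection
            score *= (float(doc_tf[term]) / doc_total
                      + float(coll_tf[term]) / coll_total)
    return score

docs = {1: {"inform": 2, "retriev": 1}, 2: {"retriev": 3}}
coll_tf = {"inform": 2, "retriev": 4}
query = {"retriev": 1}
for d, tf in docs.items():
    print(d, mixture_score(query, tf, coll_tf, sum(coll_tf.values())))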
def getWeigtsForQuery(self, query):
    ps = PorterStemmer()
    nb_doc = len(self.index.docs)
    tf_q = ps.getTextRepresentation(query)
    # idf(stem) = log(N / (1 + df(stem))), where df is the number of
    # documents containing the stem (the 1.0 also avoids integer division)
    return {
        stem: np.log(nb_doc / (1.0 + len(self.index.getTfsForStem(stem))))
        for stem in tf_q.keys()
    }
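# A toy check of the idf formula above, assuming nothing beyond numpy;
# the inverted index below is made up (stem -> {doc_id: tf}).
import numpy as np

inv = {"accelerat": {1: 2, 3: 1}, "comput": {1: 1, 2: 4, 3: 2}}
nb_doc = 3
idf = {s: np.log(nb_doc / (1.0 + len(postings))) for s, postings in inv.items()}
print(idf)  # the rarer stem gets the larger idf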
def getFeatures(self, idDoc, query):
    ps = PorterStemmer()
    tf_query = ps.getTextRepresentation(query)
    querykey = tuple(tf_query.items())
    # Cache the (idf sum, query length) features per query
    if querykey not in self.featuresqueryidflen.keys():
        idf_query = weighter.getWeigtsForQuery(query)
        idf_sum = np.sum(list(idf_query.values()))
        len_query = np.sum(list(tf_query.values()))
        self.featuresqueryidflen[querykey] = (idf_sum, len_query)
    return self.featuresqueryidflen[querykey]
def getScores(self, query):
    """query: the query text.
    Returns the document scores under the vector space model."""
    dictScoresDocs = dict()
    # dict mapping each query term to the weights of the documents containing it
    dictWeightDocs = dict()
    ps = PorterStemmer()
    query_stemmed = ps.getTextRepresentation(query)
    for terme in query_stemmed.keys():
        dictWeightDocs[terme] = self._weighter.getWeightsForStem(terme)
    q = self._weighter.getWeightsForQuery(query_stemmed)
    for term, value in dictWeightDocs.items():
        for doc, occ in value.items():
            if doc in dictScoresDocs.keys():
                dictScoresDocs[doc] += q[term] * occ
            elif Utils.isPresent(q, term):
                dictScoresDocs[doc] = q[term] * occ
    if not self._normalized:
        # dot-product score
        return dictScoresDocs
    # cosine similarity
    norme_q = 0
    for terme, value in q.items():
        if terme in query_stemmed:
            norme_q += math.pow(value, 2)
    norme_q = math.sqrt(norme_q)
    norme_d = dict()
    score_d = 0
    for cle in dictScoresDocs.keys():
        dictTerms = self._weighter.getWeightsForDoc(cle)
        for key, value in dictTerms.items():
            if value != 0:
                score_d += math.pow(value, 2)
        norme_d[cle] = math.sqrt(score_d)
        score_d = 0
    dictScoreCos = {
        doc: score / (norme_q * norme_d[doc])
        for doc, score in dictScoresDocs.items()
    }
    return dictScoreCos
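# The two scoring modes above -- raw dot product versus cosine similarity --
# can be sketched independently of the weighter classes. The vectors below
# are illustrative, not taken from the project.
import math

def dot(q, d):
    return sum(w * d.get(t, 0) for t, w in q.items())

def cosine(q, d):
    nq = math.sqrt(sum(w * w for w in q.values()))
    nd = math.sqrt(sum(w * w for w in d.values()))
    return dot(q, d) / (nq * nd)

q = {"inform": 1.0, "retriev": 2.0}
d = {"retriev": 3.0, "index": 1.0}
print(dot(q, d), cosine(q, d))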
def getFeatures(self, idDoc, query):
    ps = PorterStemmer()
    tf_query = ps.getTextRepresentation(query)
    querykey = tuple(tf_query.items())
    # The cached ranking is a list of (doc_id, score) pairs; query is raw text
    if querykey not in self.featuredocquery.keys() or (idDoc not in [
            f[0] for f in self.featuredocquery[querykey]]):
        ranking = self.model.getRanking(query)
        self.featuredocquery[querykey] = ranking
    return [f[1] for f in self.featuredocquery[querykey]
            if f[0] == idDoc][0]
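# The caching trick used in both getFeatures methods -- turning the query's
# tf dict into a tuple of (stem, tf) pairs so it can key a memo dict -- in
# isolation. All names below are illustrative; sorting the items (unlike the
# code above) makes the key independent of dict iteration order.
cache = {}

def ranking_for(query_tf, compute):
    key = tuple(sorted(query_tf.items()))  # dicts are unhashable, tuples are not
    if key not in cache:
        cache[key] = compute(query_tf)
    return cache[key]

print(ranking_for({"retriev": 1}, lambda q: [("doc7", 0.9), ("doc2", 0.4)]))
print(ranking_for({"retriev": 1}, lambda q: None))  # served from the cache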
def elementsFromDoc(self, doc):
    text = doc.getText()
    # Preprocessing: lowercase, then strip punctuation, newlines, digits,
    # single-letter words, and runs of spaces
    text = text.lower()
    text = re.sub(r'(!|#|"|%|\$|\'|&|\)|\(|\+|\*|(^| )(-( |$))+|,|/|\.|;|:|=|<|\?|>|@|[|]|\|_|^|`|{|}|\||~)', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(^| )(\w($| ))+', ' ', text)
    text = re.sub(r' +', ' ', text)
    stemmer = PorterStemmer()
    return stemmer.getTextRepresentation(text)
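# A quick check of the cleaning pipeline alone, without the project's
# stemmer; the sample sentence is made up.
import re

sample = "Hello, World! 3 indexing-methods; see http://example.com (v2)."
sample = sample.lower()
sample = re.sub(r'(!|#|"|%|\$|\'|&|\)|\(|\+|\*|(^| )(-( |$))+|,|/|\.|;|:|=|<|\?|>|@|[|]|\|_|^|`|{|}|\||~)', ' ', sample)
sample = re.sub(r'\n', ' ', sample)
sample = re.sub(r'\d+', '', sample)
sample = re.sub(r'(^| )(\w($| ))+', ' ', sample)
sample = re.sub(r' +', ' ', sample)
print(sample)  # punctuation, digits and single letters are gone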
def indexation(self, collection):
    """Plain indexing."""
    index = dict()
    ind_rev = dict()
    for i in range(1, len(collection) + 1):
        doc = collection[i]
        pS = PorterStemmer()
        index[i] = pS.getTextRepresentation(doc.T)
        for key, item in index[i].items():
            if key not in ind_rev.keys():
                ind_rev.update({key: {i: item}})
            else:
                ind_rev[key][i] = item
    self.setIndex(index)
    self.setIndexInv(ind_rev)
    """Normalized indexing."""
    indexNormalise = dict()
    indexInvNormalise = dict()
    for key, dico in self._index.items():
        # Total number of term occurrences in this document
        somme = 0
        for keyDico, itemDico in dico.items():
            somme += itemDico
        dictInDict = dict()
        for keyDicoBis, itemDicoBis in dico.items():
            if somme != 0:
                valeur = float(itemDicoBis) / somme
            else:
                valeur = itemDicoBis
            dictInDict[keyDicoBis] = valeur
            # Each stem keeps its own posting dict: reusing a single dict
            # object here would let later documents overwrite earlier ones
            if keyDicoBis not in indexInvNormalise:
                indexInvNormalise[keyDicoBis] = dict()
            indexInvNormalise[keyDicoBis][key] = valeur
        indexNormalise[key] = dictInDict
    self.setIndexNormalise(indexNormalise)
    self.setIndexInvNormalise(indexInvNormalise)
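# Toy illustration of the normalization step above: each tf is divided by
# the document's total number of occurrences. The data below is made up.
index = {1: {"inform": 2, "retriev": 2}, 2: {"retriev": 3}}
index_norm = {}
for doc, tfs in index.items():
    total = float(sum(tfs.values()))
    index_norm[doc] = {t: tf / total for t, tf in tfs.items()}
print(index_norm)  # {1: {'inform': 0.5, 'retriev': 0.5}, 2: {'retriev': 1.0}}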
def indexation(self):
    # Create the docs, stems and docFrom structures
    self.docs = CreatDocs()
    self.stems = CreatTerms()
    self.docFrom = CreatdocFrom()
    self.parser = ParserCACM()
    self.texRepresenter = PorterStemmer()
    self.linkin, self.linkout = createlink()
    return self
class GroundTruthParser(object):
    """Class for query reading from file."""

    def __init__(self, query_file, relevance_file):
        self.query = open(query_file, 'r')
        self.textRepresenter = PorterStemmer()
        # Boolean used to close the source files exactly once
        self.already_closed = False
        # Create parser to read query_file
        # WARNING: will only work on the CACM dataset. TODO: find a solution
        self.parser = ParserCLEF08()
        self.parser.initFile(query_file)
        # Build a dictionary (query_id, list of relevant documents)
        self.relevant_docs = {}
        with open(relevance_file, 'r') as f:
            for line in f:
                data = line.split(" ")
                if data[0] == "#" or data[0] == "" or data[0] == "\n":
                    continue
                query_id = int(data[0])
                if query_id not in self.relevant_docs:
                    self.relevant_docs[query_id] = []
                # One list per relevant doc, for later use as a (theme, score) pair
                self.relevant_docs[query_id].append([
                    int(data[1]), int(data[2]), int(data[3][:-1])])

    def nextQuery(self):
        """Return the next Query object, or -1 when the file is exhausted."""
        query_data = self.parser.nextDocument()
        if query_data is None:
            if not self.already_closed:
                self.query.close()
                self.already_closed = True
            return -1
        query_id = query_data.getId()
        query_text = query_data.getText()
        query_tf = self.textRepresenter.getTextRepresentation(query_text)
        relevance = np.array(self.relevant_docs.get(int(query_id)))
        return GroundTruth(query_id, query_text, query_tf, relevance)
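# The relevance file is read as space-separated lines of four integers
# (query id, doc id, theme, score), with '#' lines skipped. A made-up line
# shows the parsing performed above:
line = "1 1410 0 0\n"
data = line.split(" ")
print(int(data[0]), int(data[1]), int(data[2]), int(data[3][:-1]))  # 1 1410 0 0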
def test_weighter():
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    fname = "data/cacm/cacm.txt"
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
    for i, w in enumerate(weighters):
        print("Test of weighter " + str(i))
        print("getDocWeightsForDoc")
        print(w.getDocWeightsForDoc("20"))
        print("getDocWeightsForStem")
        print(w.getDocWeightsForStem("accelerat"))
        print("getWeightsForQuery")
        print(w.getWeightsForQuery(I.getTfsForDoc("20")))
def initIndex(database_file):
    """Init Index or load it if previously computed."""
    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))
    else:
        parser = ParserCLEF08()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(database_file)
        I.parser = None  # the parser is dropped before pickling
        pickle.dump(I, open("Index.p", "wb"))
    sys.stdout.write("Done!\n")
    sys.stdout.flush()
    return I
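# The load-or-build pattern above is generic; a minimal hedged version for
# any expensive object (the path and builder below are placeholders):
import os
import pickle

def load_or_build(path, build):
    """Unpickle `path` if it exists; otherwise build, pickle and return."""
    if os.path.isfile(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    obj = build()
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    return obj
# usage sketch: I = load_or_build("Index.p", some_index_builder)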
        doc_tf = index_file_inverse.read(index_inverse[stem][1])
        index_file_inverse.close()
        return doc_tf

    def getStrDoc(self, id_doc):
        # Retrieve the raw document text from its (file, offset, length) entry
        docFrom = pkl.Unpickler(open(self.docFrom_file, 'rb')).load()
        f = open(docFrom[id_doc][0], 'r')
        f.seek(int(docFrom[id_doc][1]))
        doc = f.read(int(docFrom[id_doc][2]))
        f.close()
        return doc

    def _dict_to_file(self, d):
        return ''.join([self._line_to_file(i, d[i]) for i in d.keys()])

    def _line_to_file(self, i, v):
        return str(i) + '|' + str(v) + ' '


tr = PorterStemmer()
parser = ParserCACM()
index_cacm = Index("cacm", parser, tr, "/home/gozuslayer/DAC/DAC/RI/TP1-6/RI/cacm/")
index_cacm.indexation("/home/gozuslayer/DAC/DAC/RI/TP1-6/RI/cacm/cacm.txt")
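# Self-contained sketch of the offset-based access used by getStrDoc: a
# document is addressed by (source file, byte offset, byte length). The file
# and entry below are made up for the demo.
with open("demo.txt", "w") as f:
    f.write("0123456789ABCDEF")
entry = ("demo.txt", 10, 6)   # file, offset, length
with open(entry[0], "r") as f:
    f.seek(entry[1])
    print(f.read(entry[2]))   # ABCDEF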
                            map(str, query_results[i][0:3]))
    return query_results


if __name__ == "__main__":
    fname = "data/cacm/cacm.txt"
    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))
    else:
        parser = ParserCACM()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(fname)
        I.parser = None
        pickle.dump(I, open("Index.p", "wb"))
    sys.stdout.write("Done!\n")
    sys.stdout.flush()

    sys.stdout.write("Creating weighters...")
    sys.stdout.flush()
    # NOTE: the cache file name is unified to Models.p (the original tested
    # Vectoriel.p but loaded Models.p)
    if os.path.isfile('Models.p'):
        models = pickle.load(open("Models.p", "rb"))
    else:
        weighters = [Binary(I),
if (self.name == "AP"): evaluation = PrecisionMoyenne(irlist) AP.append(evaluation.eval()) if (self.name == "PR"): evaluation = PrecisionRappel(irlist) PR.append(evaluation.eval(11)) Query = QueryParser.nextQuery() if (self.name == "AP"): print AP MAP = sum(AP) / float(len(AP)) return MAP if (self.name == "PR"): PR = np.array(PR) return PR A = PorterStemmer() parser = ParserCACM() Index_CACM = Index("cacm", parser, A, "/home/gozuslayer/Bureau/TP1-6/RI/cacm/") Weighters = Weighter_1(Index_CACM) model = Vectoriel(Weighters, Index_CACM) from modeles import * L = languageModel(Index_CACM, 0.7) A = PorterStemmer() Parser = CACM_QueryParser() Parser.initFile_query("/home/gozuslayer/Bureau/TP1-6/RI/cacm/cacm.qry") Parser.initFile_jugement("/home/gozuslayer/Bureau/TP1-6/RI/cacm/cacm.rel") Eval = EvalIRModel(L, "AP") print Eval.evaluate(Parser, A)
def string2json(self, string):
    '''Convert "{'4': 1}{'7': 10}" to {'4': 1, '7': 10}.'''
    string_list = [s + '}' for s in string.split('}')][:-1]
    json_list = [ast.literal_eval(s) for s in string_list]
    # list() makes the single-entry lookup work on Python 3 as well
    return {list(d.keys())[0]: list(d.values())[0] for d in json_list}


if __name__ == '__main__':
    parser = ParserCACM()
    parser.initFile("cacm/cacm.txt")
    index = Index(name='test', parser=ParserCACM,
                  textRepresenter=PorterStemmer, create_index=False)
    for i in range(20):
        doc = parser.nextDocument()
        doc_id = doc.getId()
        doc_rep = PorterStemmer().getTextRepresentation(doc.getText())
        print('doc_rep', doc_rep)
        print('length', index.getDocsLength(doc_id))
        tfs_for_doc = index.getTfsForDoc(doc_id)
        print('tfs_for_doc', tfs_for_doc)
        tfs_for_stem = index.getTfsForStem('diverg')
        print('tfs_for_stem diverg', tfs_for_stem)
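# Standalone check of the conversion performed by string2json, using the
# example string from its docstring:
import ast

string = "{'4': 1}{'7': 10}"
string_list = [s + '}' for s in string.split('}')][:-1]
json_list = [ast.literal_eval(s) for s in string_list]
print({list(d.keys())[0]: list(d.values())[0] for d in json_list})  # {'4': 1, '7': 10}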
def getWeigtsForQuery(self, query):
    ps = PorterStemmer()
    return ps.getTextRepresentation(query)
def getScores(self, query):
    """Okapi BM25 scores of the documents for the query."""
    ps = PorterStemmer()
    requete = ps.getTextRepresentation(query)  # {word1: count1, word2: count2}
    # Number of documents in the collection
    nb_documents = len(self._index.getIndex())
    k1 = 1.2
    b = 0.75
    indexInv = self._index.getIndexInv()
    index = self._index.getIndex()
    # Total number of term occurrences, used for the average document length
    count = 0
    for key, dico in index.items():
        for keyDico, itemDico in dico.items():
            count += itemDico
    avg = float(count) / len(index)
    dictScores = dict()
    liste_id = []
    for mot, occu in dict(requete).items():
        if mot in indexInv.keys():
            for key, item in indexInv[mot].items():
                idf = log((1 + nb_documents) / (1 + len(indexInv[mot])))
                f_qi_d = item
                # Document length: total tf over the document's terms
                D = sum(self._index.getTfsForDoc(key)[term]
                        for term in self._index.getTfsForDoc(key).keys())
                # Standard BM25 term contribution (the numerator carries k1 + 1)
                score = idf * ((f_qi_d * (k1 + 1)) /
                               (f_qi_d + k1 * (1 - b + b * (D / avg))))
                if key in liste_id:
                    dictScores[key] += score
                else:
                    dictScores[key] = score
                    liste_id.append(key)
    return dictScores
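# A compact, self-contained BM25 on toy postings (stem -> {doc_id: tf});
# all data and names below are illustrative, not project API.
from math import log

def bm25(query, inv, doc_len, k1=1.2, b=0.75):
    N = len(doc_len)
    avg = float(sum(doc_len.values())) / N
    scores = {}
    for term in query:
        postings = inv.get(term, {})
        idf = log((1 + N) / (1.0 + len(postings)))
        for doc, f in postings.items():
            part = idf * (f * (k1 + 1)) / (f + k1 * (1 - b + b * doc_len[doc] / avg))
            scores[doc] = scores.get(doc, 0.0) + part
    return scores

inv = {"retriev": {1: 2, 2: 1}, "model": {2: 3}}
doc_len = {1: 10, 2: 12, 3: 8}
print(bm25(["retriev", "model"], inv, doc_len))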
def getWeigtsForQuery(self, query):
    ps = PorterStemmer()
    # Binary weighting: every query stem gets weight 1
    self.Weights_Query = {}.fromkeys(
        ps.getTextRepresentation(query).keys(), 1)
    return self.Weights_Query