import math
import random

import numpy as np

import td      # project helper module: JSON I/O, text cleanup, stop-word loading
import getdf   # project module providing getTFIdfResquest()

# Helpers such as getOccurrenciesVector, getTermFrenquency, findSimilarite,
# dicoToTab, encode_csv and saveDatas are assumed to be defined elsewhere
# in the project.


def getTfIdfVector():
    """Build the TF-IDF vector of every document by multiplying each term
    frequency by the term's IDF weight, then save the result."""
    tf = td.load_json("data/tf.json")
    idf = td.load_json("data/idf.json")
    tab = []
    for doc in tf:
        vectDoc = {}
        for word in doc:
            vectDoc[word] = doc[word] * idf[word]
        tab.append(vectDoc)
    td.save_json(tab, "data/tfidf.json")
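# A minimal sketch of the weighting applied above, assuming tf.json holds one
# {word: count} dict per document and idf.json holds {word: idf_weight}:
#
#   tfidf(w, d) = tf(w, d) * idf(w)
#
# For example, with tf = [{"cat": 2}] and idf = {"cat": 1.5}, the saved
# tfidf.json would contain [{"cat": 3.0}].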
def search_tf(request):
    """Search using raw TF vectors: clean up the request, build its term
    frequency vector, then rank documents by similarity."""
    req = td.cleanup(request)
    tmp = getOccurrenciesVector(req, motsVide)
    vectRequestTF = getTermFrenquency(tmp)
    descripteurtf = td.load_json("data/tf.json")
    result = findSimilarite(descripteurtf, vectRequestTF)
    return result
def get_sim_random():
    """Assign a random similarity score to every document; for testing only."""
    dic = {}
    doc = td.load_json("data/database.json")
    for i in range(len(doc)):
        dic[i] = random.random()
    return dic
def getIndiceKey():
    """Map every word of the vocabulary (the keys of idf.json) to a column index."""
    dico = td.load_json("./data/idf.json")
    d = {}
    for i, word in enumerate(dico):
        d[word] = i
    return d
def liste_inversee():
    """Build the inverted index: map every word to the list of (1-based)
    document ids that contain it, then save it."""
    dic = {}
    doc = td.load_json("data/tf.json")
    for i in range(len(doc)):
        for j in doc[i]:
            if j not in dic:
                dic[j] = []
            dic[j].append(i + 1)   # document ids are 1-based
    td.save_json(dic, "data/liste_inverse.json")
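# Illustrative shape of the inverted index built above (hypothetical data):
# with tf.json = [{"cat": 2, "dog": 1}, {"dog": 3}], liste_inverse.json
# becomes {"cat": [1], "dog": [1, 2]} -- ids are 1-based, hence the i + 1.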
def generateIDF(filename):
    """Compute the IDF weight of every word from the TF table and save it.
    Note: the filename parameter is currently unused; TF data is always
    read from data/tf.json."""
    tf_doc = td.load_json("data/tf.json")
    nb_doc = len(tf_doc) + 1   # +1 keeps the IDF of ubiquitous words non-zero
    occ = {}
    for doc in tf_doc:
        for word in doc:
            if word in occ:
                occ[word] += 1
            else:
                occ[word] = 1
    idf_tab = {}
    for word in occ:
        idf_tab[word] = math.log(nb_doc / occ[word])
    td.save_json(idf_tab, "data/idf.json")
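# The weight above is the classic idf(w) = log(N / df(w)), with N taken as
# len(documents) + 1 so that a word occurring in every document gets a small
# positive weight instead of log(1) = 0. For example, with 9 documents a word
# found in 2 of them gets idf = log(10 / 2) ~= 1.609.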
def Word2vec_avg_doc(filename):
    """Compute the average word2vec vector of every document and save them."""
    vectors = []
    t = td.load_json(filename)
    for i in range(len(t)):
        # besides tokenisation, drop all digits
        t[i] = "".join(j for j in t[i] if not j.isdigit())
        # preprocess every word of every document in the database
        t[i] = td.cleanup(t[i])
        # model and index2word_set are module-level globals: the loaded
        # word2vec model and the set of words it knows
        doc_avg_vector = mean_vector(t[i].split(), model=model,
                                     num_features=300,
                                     index2word_set=index2word_set)
        vectors.append(doc_avg_vector)
    # save the document vectors once and for all to avoid recomputing them
    np.save('word2vec/doc_vector.npy', vectors)
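# mean_vector is not defined in this file; below is a minimal sketch of what
# it is assumed to do (average the embeddings of a document's in-vocabulary
# words), written against the gensim KeyedVectors indexing API. This is a
# hypothetical helper, not necessarily the project's actual implementation.
def mean_vector(words, model, num_features, index2word_set):
    """Average the embeddings of the words the model knows; return a zero
    vector if none of them are in the vocabulary (hypothetical sketch)."""
    vec = np.zeros(num_features, dtype="float32")
    count = 0
    for word in words:
        if word in index2word_set:
            vec = vec + model[word]
            count += 1
    if count > 0:
        vec = vec / count
    return vec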
def search_tfidf(request):
    """Search using TF-IDF: build the request's TF-IDF vector, restrict the
    candidates to documents listed in the inverted index for the request's
    words, then rank them by similarity."""
    vectRequestIDF = getTFIdfResquest(request)
    mots = vectRequestIDF.keys()
    tab = {}
    for i in mots:
        for doc in liste_inverse[i]:
            tab[doc] = True
    result = findSimilarite(vectRequestIDF, tab)
    return result


def sortResult(dicoOfSimilarite):
    """Sort the similarity scores returned by a search, most similar first."""
    s = sorted(dicoOfSimilarite.items(), key=lambda t: t[1], reverse=True)
    return s


def showResult(sortedDicoOfSimi):
    """Render the sorted (doc_id, similarity) pairs as an HTML result list."""
    html = "<div class='result'><h3>Results list</h3>"
    for doc in sortedDicoOfSimi:
        html += ("<div class='item'><a href='./doc/" + str(doc[0]) + "'>"
                 "Document number " + str(doc[0]) + "</a>"
                 "<p>Similarity: " + str(doc[1]) + "</p></div>")
    html += "</div>"
    return html


# Module-level data, loaded once at import time.
motsVide = td.load_empty_words("data/motsvides.txt")
descripteurs = td.load_json("data/tfidf.json")
idf = td.load_json("data/idf.json")
liste_inverse = td.load_json("data/liste_inverse.json")
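# End-to-end usage sketch of the search pipeline defined above (the request
# string is made up for illustration):
#
#   scores = search_tfidf("information retrieval")
#   html = showResult(sortResult(scores))
#
# sortResult turns the {doc_id: similarity} dict into a list of pairs sorted
# by decreasing similarity, and showResult renders that list as HTML.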
def generate_JSON_DataBase(filenames):
    """Concatenate several JSON article files into a single database file."""
    database = []
    for filename in filenames:
        js = td.load_json(filename)
        database = database + js
    td.save_json(database, "data/database.json")
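# Usage sketch with hypothetical filenames; each input file is assumed to
# hold a JSON list of articles, since the lists are concatenated:
#
#   generate_JSON_DataBase(["data/articles_part1.json",
#                           "data/articles_part2.json"])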
def prepareData():
    """Build the training data: for every known (request, document) relation,
    emit one positive CSV row, plus as many random negative rows."""
    print("starting data preparation...")
    print("loading json files...")
    result = td.load_json("./data/relations.json")
    tfidf = td.load_json("./data/tfidf.json")
    req = td.load_json("./data/request.json")
    print("building the vocabulary index")
    indice_key = getIndiceKey()

    # Method 1 (comparing every request against every document and appending
    # a 1/0 match label to each row) was abandoned: too much data to hold the
    # full matrix in memory, and the document ids were off by one.

    # Method 2: for each request, add the n matching documents plus n random
    # non-matching ones, writing out from time to time to spare memory.
    print("encoding tfidf...")
    tfidf_encode = []
    for i in tfidf:
        tfidf_encode.append(encode_csv(dicoToTab(i, indice_key)))
    print("starting generation...")
    datas = ""
    for numReq in result:
        tfidf_req = getdf.getTFIdfResquest(req[int(numReq)])
        req_encode = encode_csv(dicoToTab(tfidf_req, indice_key))
        n = len(result[numReq])
        docs = {}
        for i in range(0, n - 1):
            doc = int(result[numReq][i]) - 1   # relation ids are 1-based
            docs[doc] = True
            ligne = tfidf_encode[doc] + req_encode + "1\n"   # positive example
            datas = datas + ligne
        # add n random non-matching documents as negative examples
        c = 0
        while c < n:
            rand = random.randrange(0, len(tfidf), 1)
            if rand not in docs:
                c = c + 1
                ligne = tfidf_encode[rand] + req_encode + "0\n"
                datas = datas + ligne
        print("data for request: " + numReq)
    saveDatas(datas)
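# Shape of each generated CSV row, under the assumption that dicoToTab lays a
# sparse {word: weight} dict out over the vocabulary columns and encode_csv
# joins the values with separators:
#
#   doc_tfidf_1, ..., doc_tfidf_V, req_tfidf_1, ..., req_tfidf_V, label
#
# where V is the vocabulary size and label is 1 for a matching
# (request, document) pair and 0 for a randomly drawn non-match.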