Example #1
def getTfIdfVector():
	"""Get the dictionnary which contains articles, then delete 'useless' words listed in the list motsVides
	to count how much times a word is present on a document"""
	tf = td.load_json("data/tf.json")
	idf = td.load_json("data/idf.json")
	tab = []
	for doc in tf:
		vectDoc = {}
		for word in doc:
			vectDoc[word] = doc[word] * idf[word]
		tab.append(vectDoc)
	td.save_json(tab, "data/tfidf.json")
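The weighting applied above is the standard tf-idf product. A minimal illustration with toy values (not taken from the real data), assuming data/tf.json holds one {word: tf} dict per document and data/idf.json a single {word: idf} dict:

tf_example = [{"chat": 0.5, "chien": 0.25}]
idf_example = {"chat": 1.1, "chien": 0.4}
vectDoc = {word: tf_example[0][word] * idf_example[word] for word in tf_example[0]}
# vectDoc == {"chat": 0.5 * 1.1, "chien": 0.25 * 0.4}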
Example #2
def search_tf(request):
	"""" TF comparison with idf :"""
	req = td.cleanup(request)
	tmp = getOccurrenciesVector(req, motsVide)
	vectRequestTF = getTermFrenquency(tmp)
	descripteurtf = td.load_json("data/tf.json")
	result = findSimilarite(descripteurtf, vectRequestTF)
	return result
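findSimilarite is not defined in this excerpt; assuming it computes a cosine similarity between the request vector and each stored descriptor, a minimal sketch over sparse {word: weight} dicts could look like this (the name cosine_similarity is ours, not from the original code):

import math

def cosine_similarity(vect_a, vect_b):
	# Dot product over the words the two sparse vectors share
	dot = sum(vect_a[w] * vect_b[w] for w in vect_a if w in vect_b)
	norm_a = math.sqrt(sum(v * v for v in vect_a.values()))
	norm_b = math.sqrt(sum(v * v for v in vect_b.values()))
	if norm_a == 0 or norm_b == 0:
		return 0.0
	return dot / (norm_a * norm_b)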
Example #3
def get_sim_random():
	"""Give random similarity to documents for a special request, for test only"""
	dic = {}
	doc = td.load_json("data/database.json")
	for i in range(0, len(doc)):
		a = random.random()
		dic[i] = a
	return dic
Example #4
def getIndiceKey():
	"""Map every word of the idf vocabulary to a fixed column index."""
	dico = td.load_json("./data/idf.json")
	d = {}
	for i, word in enumerate(dico.keys()):
		d[word] = i
	return d
Example #5
def liste_inversee():
	"""Build the inverted index: map each word to the 1-based ids of the documents containing it."""
	dic = {}
	doc = td.load_json("data/tf.json")
	for i in range(len(doc)):
		for j in doc[i]:
			if j not in dic:
				dic[j] = []
			dic[j].append(i + 1)
	td.save_json(dic, "data/liste_inverse.json")
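For reference, the file data/liste_inverse.json produced above maps every word to the 1-based ids of the documents that contain it; an illustrative (made-up) entry:

liste_inverse_example = {"chat": [1, 4, 7], "chien": [2, 4]}  # doc id = index in tf.json + 1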
Example #6
def generateIDF(filename):
	"""Compute the idf weight of every word seen in data/tf.json and save it to data/idf.json."""
	tf_doc = td.load_json("data/tf.json")
	nb_doc = len(tf_doc) + 1
	occ = {}
	# Count in how many documents each word appears
	for doc in tf_doc:
		for word in doc:
			if word in occ:
				occ[word] += 1
			else:
				occ[word] = 1
	idf_tab = {}
	for word in occ:
		idf_tab[word] = math.log(nb_doc / occ[word])

	td.save_json(idf_tab, "data/idf.json")
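The loop above computes the classic idf weight idf(w) = log(N / df(w)), where df(w) is the number of documents containing w and N is len(tf_doc) + 1, presumably so that a word present in every document still gets a small non-zero weight. A quick worked example with toy numbers:

import math
nb_doc = 10 + 1               # 10 documents in tf.json
df = 2                        # the word occurs in 2 of them
idf = math.log(nb_doc / df)   # log(11 / 2), about 1.70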
Example #7
def Word2vec_avg_doc(filename):
    doc_vectors = []
    t = td.load_json(filename)
    for i in range(len(t)):
        # In addition to tokenisation we drop all digits
        t[i] = "".join(j for j in t[i] if not j.isdigit())
        # Preprocess every document in our database
        t[i] = td.cleanup(t[i])
        doc_avg_vector = mean_vector(t[i].split(),
                                     model=model,
                                     num_features=300,
                                     index2word_set=index2word_set)
        doc_vectors.append(doc_avg_vector)
    # Save the document vectors once and for all to avoid recomputing them later
    np.save('word2vec/doc_vector.npy', doc_vectors)
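mean_vector is not shown in this excerpt; assuming model behaves like a gensim KeyedVectors object (model[word] returns a 300-dimensional vector) and index2word_set is its vocabulary, a plausible sketch is:

import numpy as np

def mean_vector(words, model, num_features, index2word_set):
    # Average the embeddings of the words that exist in the model's vocabulary
    feature_vec = np.zeros(num_features, dtype="float32")
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec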
Example #8
def search_idf(request):  # the original def line is missing from this excerpt; the name is assumed
	vectRequestIDF = getTFIdfResquest(request)
	mots = vectRequestIDF.keys()
	tab = {}
	for i in mots:
		for doc in liste_inverse[i]:
			tab[doc] = True
	result = findSimilarite(vectRequestIDF, tab)
	return result


def sortResult(dicoOfSimilarite):
	"""Used to sort result given by search to show the most similar at first"""
	s = sorted(dicoOfSimilarite.items(), key=lambda t: t[1], reverse=True)
	return s


def showResult(sortedDicoOfSimi):
	"""Render the sorted similarity results as a simple HTML list."""
	html = "<div class='result'><h3>List of results</h3>"
	for doc in sortedDicoOfSimi:
		html += "<div class='item'><a href='./doc/" + str(doc[0]) + "'>Document number " + str(doc[0]) + "</a><p>Similarity: " + str(doc[1]) + "</p></div>"
	html += "</div>"
	return html


motsVide = td.load_empty_words("data/motsvides.txt")     # stop-word list
descripteurs = td.load_json("data/tfidf.json")           # tf-idf descriptor of each document
idf = td.load_json("data/idf.json")                      # idf weight of each word
liste_inverse = td.load_json("data/liste_inverse.json")  # inverted index: word -> document ids
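A hypothetical end-to-end usage of the helpers above; search_idf is an assumed name for the truncated search function at the top of this example, and the request text and return shapes are illustrative only:

similarities = search_idf("tf idf search example")  # assumed name; returns {doc_id: score}
ranked = sortResult(similarities)                   # [(doc_id, score), ...], best match first
page = showResult(ranked)                           # HTML fragment listing the ranked documents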


Example #9
def generate_JSON_DataBase(filenames):
	"""Concatenate several JSON article files into a single data/database.json."""
	database = []
	for filename in filenames:
		js = td.load_json(filename)
		database = database + js
	td.save_json(database, "data/database.json")
Example #10
def prepareData():
	print("starting data preparation...")
	print("loading json files...")
	result = td.load_json("./data/relations.json")
	tfidf = td.load_json("./data/tfidf.json")
	req = td.load_json("./data/request.json")
	print("building the word index")
	indice_key = getIndiceKey()
	
	
	
	## Too much data to do it that way...
	#for numReq in result:
	#	numReq = int(numReq)
	#	tfidf_req = getdf.getTFIdfResquest(req[numReq])
	#	tab_req = dicoToTab(tfidf_req, indice_key)


		#for i in range(0, len(tfidf)):
		#	tfidf_doc = tfidf[i]
		#	tab_doc = dicoToTab(tfidf_doc, indice_key)
		#	data = tab_doc + tab_req
		#	if is_in(result, i+1): # index shift; ERROR !!!
		#		data.append(1) ## the request matches the doc
		#	else:
		#		data.append(0) ## the request does not match the doc
		#	datas.append(data)

	## Method 2: add the n matching docs + n random non-matching docs, writing out from time to time so memory does not fill up
	
	print("encodage de tfidf...")
	#print(result)
	tfidf_encode = []
	for i in tfidf:
		tfidf_encode.append(encode_csv(dicoToTab(i, indice_key)))
		
	print("lancement de la generation...")
	datas = ""
	for numReq in result:
		#numReq = int(numReq)
		tfidf_req = getdf.getTFIdfResquest(req[int(numReq)]) 
		req_encode = encode_csv(dicoToTab(tfidf_req, indice_key))
		
		n = len(result[numReq])
		docs = {}
		for i in range(0, n-1):
			doc = int(result[numReq][i])-1
			docs[doc] = True
			ligne = tfidf_encode[doc] + req_encode + "1\n"
			datas = datas + ligne
		
		## add n random non-matching documents
		c = 0
		while c < n:
			rand = random.randrange(0, len(tfidf), 1)
			if rand not in docs:
				c = c + 1
				ligne = tfidf_encode[rand] + req_encode + "0\n"
				datas = datas + ligne
				
		print("donnes de la requete: "+numReq)
		##print(datas)
		
	#time.sleep(10)
	#s = encode_csv(datas)
	saveDatas(datas)
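dicoToTab and encode_csv are not defined in this excerpt; below are minimal sketches consistent with how they are used above, assuming dicoToTab densifies a {word: tfidf} dict using the word-to-column map from getIndiceKey(), and noting that encode_csv must end each block with a separator so that tfidf_encode[doc] + req_encode + "1\n" forms one CSV row:

def dicoToTab(dico, indice_key):
	# Dense vector with one column per vocabulary word
	tab = [0.0] * len(indice_key)
	for word, value in dico.items():
		if word in indice_key:
			tab[indice_key[word]] = value
	return tab

def encode_csv(tab):
	# Comma-separated values, with a trailing comma so blocks can be concatenated
	return ",".join(str(v) for v in tab) + ","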