Esempio n. 1
0
def generateTF(filename):
	"""Compute one vector per document and store the list in data/tf.json.

	The documents are loaded from ``filename`` with ``td.loadDoc``. Every
	entry except the one at index 0 is passed through
	``gdf.getOccurrenciesVector`` (together with the module-level ``motsVide``)
	and then through ``gdf.getTermFrenquency``.

	Args:
		filename: Path handed to ``td.loadDoc``.
	"""
	documents = td.loadDoc(filename)
	# Index 0 is deliberately left out, exactly as the loop bounds dictate.
	tf_vectors = [
		gdf.getTermFrenquency(gdf.getOccurrenciesVector(documents[idx], motsVide))
		for idx in range(1, len(documents))
	]
	td.save_json(tf_vectors, "data/tf.json")
Esempio n. 2
0
def liste_inversee():
	"""Build an inverted index from data/tf.json and save it as JSON.

	For every key found in the entry at position ``i`` of data/tf.json, the
	1-based position ``i + 1`` is appended to the list stored under that key.
	The resulting mapping is written to data/liste_inverse.json.
	"""
	index = {}
	tf_docs = td.load_json("data/tf.json")
	# Positions are recorded 1-based, hence the start value of enumerate.
	for doc_id, vect in enumerate(tf_docs, 1):
		for word in vect:
			# setdefault replaces dict.has_key(), which no longer exists in
			# Python 3, and creates the list on first sight of the word.
			index.setdefault(word, []).append(doc_id)
	td.save_json(index, "data/liste_inverse.json")
Esempio n. 3
0
def getTfIdfVector():
	"""Combine the stored TF and IDF values and save them to data/tfidf.json.

	data/tf.json is iterated as a sequence of mappings and data/idf.json is
	used as a single mapping. For each entry of the sequence, every key's
	value is multiplied by the value stored under the same key in the IDF
	mapping. The list of resulting mappings is written to data/tfidf.json.
	"""
	tf_docs = td.load_json("data/tf.json")
	idf_values = td.load_json("data/idf.json")
	tfidf_docs = [
		{term: weights[term] * idf_values[term] for term in weights}
		for weights in tf_docs
	]
	td.save_json(tfidf_docs, "data/tfidf.json")
Esempio n. 4
0
def generateIDF(filename):
	"""Compute an IDF value for every word of data/tf.json and save the result.

	For each word, the number of entries of data/tf.json that contain it is
	counted, and ``log(nb_doc / count)`` is stored under that word in
	data/idf.json, where ``nb_doc`` is the number of entries plus one.

	Args:
		filename: Unused; kept so existing callers keep working.
	"""
	tf_doc = td.load_json("data/tf.json")
	# The "+ 1" is kept from the original code.
	# NOTE(review): presumably smoothing so that a word present in every
	# entry does not get an IDF of exactly 0 - confirm.
	nb_doc = len(tf_doc) + 1
	occ = {}
	for doc in tf_doc:
		for word in doc:
			# dict.get replaces dict.has_key(), which was removed in Python 3.
			occ[word] = occ.get(word, 0) + 1
	# float() forces true division: on Python 2 the plain "/" between two
	# ints floors the ratio before the logarithm is taken.
	idf_tab = {word: math.log(float(nb_doc) / occ[word]) for word in occ}

	td.save_json(idf_tab, "data/idf.json")
Esempio n. 5
0
def main(mode, filename):
	"""Run the processing step selected by ``mode``.

	Supported modes:
		"query": split data/CISI.QRY on ".I", clean every chunk with
			``td.cleanup`` and write the list to data/request.json.
		"tfidf": run generateTF, generateIDF and getTfIdfVector on
			data/database.json.
		"relations": read data/CISI.REL and write data/relations.json, which
			maps the first field of each line to the list of second fields.
		"database": call generate_JSON_DataBase(filename).
		"load-NYT": call loadBaseNYT(filename[0]).
		"load-BaseProf": call loadBaseFileProf(filename[0]).
		"liste_inverse": call liste_inversee().

	Any other mode does nothing.

	Args:
		mode: Name of the step to run (see above).
		filename: Indexable collection of paths; only used by the "database",
			"load-NYT" and "load-BaseProf" modes.
	"""
	if mode == "query":  # Generate the json containing different queries
		# "with" guarantees the handle is released even if reading fails.
		with open("data/CISI.QRY", "r") as f:
			t = f.read().split(".I")
		for i, chunk in enumerate(t):  # Clean target file
			a = chunk.split('\n')
			# The first two lines of every chunk are dropped; each remaining
			# line is prefixed with a space, as the original loop did.
			text = "".join(" " + line for line in a[2:])
			t[i] = td.cleanup(text)
		txt = json.dumps(t)
		with open("data/request.json", "w") as jsonFile:
			jsonFile.write(txt)

	elif mode == "tfidf":
		print("TF Generation")
		generateTF("data/database.json")
		print("IDF Generation")
		generateIDF("data/database.json")
		print("TFIDF Generation")
		getTfIdfVector()

	elif mode == "relations":  # For test
		rel = {}  # Creation of the relation dictionary
		# The original never closed this file; "with" fixes the leak.
		with open("data/CISI.REL", "r") as f:
			for line in f:  # Get a relation and store it to the dictionary
				elements = line.split()
				if len(elements) < 2:
					# Blank or truncated line: nothing to record.
					continue
				# setdefault replaces dict.has_key(), removed in Python 3.
				rel.setdefault(elements[0], []).append(elements[1])
		td.save_json(rel, "data/relations.json")

	elif mode == "database":
		generate_JSON_DataBase(filename)

	elif mode == "load-NYT":
		loadBaseNYT(filename[0])

	elif mode == "load-BaseProf":
		loadBaseFileProf(filename[0])

	elif mode == "liste_inverse":
		liste_inversee()
Esempio n. 6
0
def loadBaseFileProf(filename):
	"""Split a file on ".I " markers and save the chunks as a JSON list.

	For every chunk, the first line is dropped and the remaining lines are
	concatenated, each one prefixed with a single space. Chunks that yield an
	empty text (no line after the first one) are skipped. The resulting list
	is written to ``filename + ".json"``.

	Args:
		filename: Path of the file to read; also the prefix of the output path.
	"""
	# "with" releases the handle even if reading raises.
	with open(filename, "r") as f:
		chunks = f.read().split(".I ")
	documents = []
	for chunk in chunks:  # Clean target file
		lines = chunk.split('\n')
		text = "".join(" " + line for line in lines[1:])
		if text != "":
			# Collect into a fresh list: the original compacted in place
			# without truncating, so stale raw chunks stayed at the tail.
			documents.append(text)
	td.save_json(documents, filename + ".json")
Esempio n. 7
0
def generate_JSON_DataBase(filenames):
	"""Concatenate several JSON files into data/database.json.

	Each file is loaded with ``td.load_json`` and its items are appended, in
	order, to a single list that is then saved to data/database.json.

	Args:
		filenames: Iterable of paths to load.
			NOTE(review): each file is expected to hold a JSON array - confirm.
	"""
	database = []
	for filename in filenames:
		# extend grows the list in place; "database = database + js" copied
		# the whole accumulated list on every iteration.
		database.extend(td.load_json(filename))
	td.save_json(database, "data/database.json")