def create_table_inverted_index():
    """Rebuild the ``inverted_index`` table from the ``webpages`` table.

    Any existing ``inverted_index`` table in ``data.db`` is dropped and
    recreated with the columns ``(keyword TEXT, URL TEXT, frequency REAL)``.
    For every ``(URL, content)`` row of ``webpages`` the content is tokenised
    with ``extractListOfWords``, every token is stemmed with ``stem``, and one
    row is inserted per distinct stem.

    ``frequency`` is the term frequency of the stem on that page: its number
    of occurrences divided by the total number of stems on the page (the same
    normalisation ``countFreq`` uses).
    """
    # Local import so the block does not depend on the file-level import list.
    from collections import Counter

    insert_inverted_index = (
        "INSERT INTO inverted_index (keyword, URL, frequency) VALUES (?, ?, ?);"
    )

    conn = sqlite3.connect('data.db')
    try:
        conn.execute("DROP TABLE IF EXISTS inverted_index")
        conn.execute(
            "CREATE TABLE inverted_index (keyword TEXT, URL TEXT, frequency REAL)")

        for url, content in conn.execute("SELECT URL, content FROM webpages"):
            stems = [stem(w) for w in extractListOfWords(content)]
            total = len(stems)
            if total == 0:
                # Nothing to index for this page; also avoids dividing by zero.
                continue

            # Count each stem once. The words are already stemmed here, so
            # they must not be passed through stem() a second time.
            occurrences = Counter(stems)
            conn.executemany(
                insert_inverted_index,
                ((keyword, url, count / total)
                 for keyword, count in occurrences.items()),
            )

        # A single commit after all pages makes the rebuild atomic.
        conn.commit()
    finally:
        conn.close()

    print("Insertion in inverted_index is done!")
def tf_idf(word):
    """Print the stored term-frequency and IDF rows for the stem of *word*.

    Two queries are run against ``data.db`` and every resulting row is
    printed as a tuple:

    1. ``URL, MAX(frequency)`` from ``inverted_index`` for the stemmed keyword.
    2. ``keyword, idf`` from ``inverse_document_frequency`` for the same
       keyword.

    Args:
        word: The raw (unstemmed) word to look up.
    """
    keyword = stem(word)

    conn = sqlite3.connect('data.db')
    try:
        # The keyword is passed as a bound parameter rather than formatted
        # into the SQL text, so quotes in the word cannot break or alter the
        # statement.
        for row in conn.execute(
                "SELECT URL, MAX(frequency) FROM inverted_index "
                "WHERE keyword = ?",
                (keyword,)):
            print(row)

        for row in conn.execute(
                "SELECT keyword, idf FROM inverse_document_frequency "
                "WHERE keyword = ?",
                (keyword,)):
            print(row)
    finally:
        conn.close()
def countFreq(L, corpus):
    """Return the relative frequency in *corpus* of each stemmed word of *L*.

    Args:
        L: Iterable of words; each one is stemmed with ``stem`` before lookup.
        corpus: Sized iterable of tokens to count. The tokens are compared
            as-is against the stems of *L*.

    Returns:
        A list of ``(stem, frequency)`` tuples in the order of *L*, where
        ``frequency`` is the stem's occurrence count divided by
        ``len(corpus)``, or ``0`` when the corpus is empty.
    """
    stems = [stem(token) for token in L]

    # Feed a generator so every element of the corpus is counted one by one,
    # whatever container type the caller passes in.
    occurrences = Counter(token for token in corpus)

    def relative(term):
        # Guard against an empty corpus instead of dividing by zero.
        return 0 if len(corpus) == 0 else occurrences[term] / len(corpus)

    return [(term, relative(term)) for term in stems]
def tri_tfidf():
    """Prompt for a query and print the ten URLs with the highest tf-idf score.

    The query is split on whitespace and every word is stemmed with ``stem``.
    The score of a URL is the sum, over the query terms found on that page, of
    ``tf(term, page) * idf(term)`` where:

    * ``tf`` is ``inverted_index.frequency`` (term = ``keyword``,
      page = ``URL``);
    * ``idf`` is ``inverse_document_frequency.idf`` for the same ``keyword``.

    Each result row is printed as a ``(URL, tfidf)`` tuple, best score first.
    """
    query = input('Tapez une phrase: ')
    queryWords = [stem(w) for w in query.split()]
    print(queryWords)
    print(type(queryWords))
    queryWords = tuple(queryWords)

    if not queryWords:
        # An empty query cannot match any keyword; nothing to rank.
        return

    # One "?" per term. Formatting the Python tuple into the SQL text produced
    # "('word',)" for a single-word query (a SQL syntax error) and let user
    # input reach the statement unescaped; bound parameters avoid both.
    placeholders = ", ".join("?" * len(queryWords))
    t = (
        "SELECT URL, SUM(prod) AS tfidf "
        "FROM (SELECT ii.keyword, ii.URL, ii.frequency * idf.idf AS prod "
        "FROM inverted_index AS ii, inverse_document_frequency AS idf "
        "WHERE ii.keyword = idf.keyword AND ii.keyword IN ({})) "
        "GROUP BY URL "
        "ORDER BY SUM(prod) DESC LIMIT 10"
    ).format(placeholders)

    conn = sqlite3.connect('data.db')
    try:
        for row in conn.execute(t, queryWords):
            print(row)
    finally:
        # Read-only query: there is nothing to commit, only a handle to release.
        conn.close()
def tri_tfidf_pageRank():
    """Prompt for a query and print the ten best URLs by tf-idf x PageRank.

    The query is split on whitespace and every word is stemmed with ``stem``.
    For each URL the tf-idf score is the sum, over the query terms found on
    that page, of ``inverted_index.frequency * inverse_document_frequency.idf``.
    That score is then multiplied by the page's ``page_rank.pageRank`` value.

    Each result row is printed as a ``(URL, score)`` tuple, best score first.
    """
    phrase = input('Tapez une phrase: ')
    queryWords = tuple(stem(w) for w in phrase.split())

    if not queryWords:
        # An empty query cannot match any keyword; nothing to rank.
        return

    # One "?" per term. Formatting the Python tuple into the SQL text produced
    # "('word',)" for a single-word query (a SQL syntax error) and let user
    # input reach the statement unescaped; bound parameters avoid both.
    placeholders = ", ".join("?" * len(queryWords))
    query = (
        "SELECT pr.URL, t.tfidf * pr.pageRank AS score "
        "FROM page_rank AS pr, (SELECT URL, SUM(prod) AS tfidf "
        "FROM ( SELECT ii.keyword, ii.URL, ii.frequency * idf.idf AS prod "
        "FROM inverted_index AS ii, inverse_document_frequency AS idf "
        "WHERE ii.keyword = idf.keyword AND ii.keyword IN ({})) AS tf_idf "
        "GROUP BY URL) AS t "
        "WHERE pr.URL = t.URL "
        "GROUP BY pr.URL "
        "ORDER BY score DESC LIMIT 10"
    ).format(placeholders)

    conn = sqlite3.connect('data.db')
    try:
        for row in conn.execute(query, queryWords):
            print(row)
    finally:
        # Read-only query: there is nothing to commit, only a handle to release.
        conn.close()
if len(corpus) == 0: f = 0 else: f = cnt[word] / len(corpus) tf.append((word, f)) return tf #pages = df_webpages['content'].values columns = ['URL'] columns.extend(L) result = pd.DataFrame(columns=columns) result['URL'] = None for i, row in df_webpages.iterrows(): page = row['content'] corpus = [stem(i) for i in extractListOfWords(page)] tf = countFreq(L, corpus) temp = [row['URL']] temp.extend([t[1] for t in tf]) temp = pd.Series(temp).to_frame(0).T temp.columns = columns result = result.append(temp) idf = {} for word in L: idf[word] = log(len(result) / sum(result[word] > 0)) idf = pd.DataFrame.from_dict(idf, orient='index') conn.execute("DROP TABLE IF EXISTS inverted_index") result.to_sql('inverted_index', conn, if_exists='replace', index=False)
#!/usr/bin/env python3 import sqlite3 from shared import stem, sortFreqDict from collections import defaultdict import sys conn = sqlite3.connect('data.db') cursor_1 = conn.cursor() cursor_2 = conn.cursor() # Test pour savoir si le programme est lancé avec arguments if len(sys.argv) > 1: queryWords = [stem(w) for w in sys.argv[1:] if stem(w) != ""] else: # on prend l'input de la requête query = input("Saisissez votre requête :") if query == "": # si pas de saisie, on utilise une requête type query = "comment multiplier des matrices" queryWords = [stem(w) for w in query.split() if stem(w) != ""] print() # compute best query solution and output them # 2 méthodes possibles : (tf-idf simple) ou (tf-idf * pagerank) possible_modes = ["tf-idf", "tf-idf * pagerank"] for mode in possible_modes: # création d'un dictionnaire contenant la somme des points par page