def choose_collection():
    """Prompt the user for a collection; return the collection and its index.

    Returns:
        tuple: (collection, index) for the chosen collection.

    Raises:
        ValueError: if the user enters anything other than '1'
            (Wikipedia, choice 2, is not implemented).
    """
    print('CHOIX DE LA COLLECTION:')
    print('1 - CACM')
    print('2 - Wikipedia (non implémenté pour le moment)')
    collection_choice = input('Choisissez une collection: ')
    # BUG FIX: input() returns a str in Python 3, so the original
    # comparison `collection_choice == 1` (int) was always False and
    # every answer raised ValueError. Compare against the string '1';
    # strip() tolerates stray whitespace around the answer.
    if collection_choice.strip() == '1':
        # Import the collection, timing it.
        import_time, collection = time_func(CACMCollection)
        print("\n")
        print("Collection CACM importée en %s secondes" % (import_time))
        # Build the index, timing it and reporting its (shallow) size.
        index_time, index = time_func(Index, collection.documents)
        print("Collection CACM indéxée en %s secondes" % (index_time))
        print("Taille de l'index en mémoire: ~ %s Méga-octets" % (sys.getsizeof(index) / float(10**6)))
        # Explain that the number above underestimates real memory use:
        # the full collection is also kept in RAM for result display.
        print("\n")
        print("En realite, ce script utilise plus de memoire car on garde")
        print("egalement dans la RAM la collection entiere (~40 Mega-octets)")
        print("pour pouvoir afficher le contenu les resultats de sarecherche")
        print("a l'utilisateur (et non juste l'id des documents).")
        print("Cependant, les fonctions de recherche utilisent exclusivement")
        print("les indexes (cf methodes boolean_search et vectorial_search)")
        return collection, index
    else:
        raise ValueError("Input invalide")
def evaluate_search(query, expected_results):
    '''
    Evaluate the search performance of the different models for one query.

    Computes, per model, the search time, precision, recall (rappel),
    and F/E measures — plus R-precision and average precision for the
    ranked vectorial model.

    Args:
        query: the query to evaluate.
        expected_results: the relevant document ids for this query.

    Returns:
        dict: {'model': {'time': x, 'precision': x, 'rappel': x, ...}}
    '''
    # What we return:
    # dict {'modele': {'time': x, 'precision': x, 'rappel': x}}
    evaluation = defaultdict(dict)

    # Boolean model
    bool_time, search_results = time_func(boolean_search, query, index)
    evaluation['bool']['time'] = bool_time
    evaluation['bool']['precision'] = precision(search_results, expected_results)
    evaluation['bool']['rappel'] = rappel(search_results, expected_results)
    # No R-precision here: boolean results are unordered.
    evaluation['bool']['F_measure'] = F_measure(search_results, expected_results)
    evaluation['bool']['E_measure'] = E_measure(search_results, expected_results)

    # Vectorial model, evaluated with log-normalized tf-idf weighting
    # (empirically the weighting that gave the best results).
    vect_time, search_results = time_func(vectorial_search, query, index, "tf_idf_log_normalized")
    search_results = [result.doc_id for result in search_results]
    # BUG FIX: the original stored bool_time here, so the reported
    # vectorial search time was actually the boolean model's time.
    evaluation['vect']['time'] = vect_time
    evaluation['vect']['precision'] = precision(search_results, expected_results)
    evaluation['vect']['rappel'] = rappel(search_results, expected_results)
    evaluation['vect']['R_precision'] = R_precision(search_results, expected_results)
    evaluation['vect']['F_measure'] = F_measure(search_results, expected_results)
    evaluation['vect']['E_measure'] = E_measure(search_results, expected_results)
    evaluation['vect']['average_precision'] = average_precision(search_results, expected_results)
    return evaluation
# NOTE(review): this loop is the tail of a results-printing function
# (presumably print_results_vectorial_search, called from __main__ below)
# whose `def` line falls outside this chunk — indentation is an
# assumption, TODO confirm against the full file.
    for (doc_id, similarity) in search_results[:nb_doc_to_show]:
        # Show each document's content alongside its similarity score.
        print("Document: %sSimilarité: %s \n" % (collection.get_document_by_id(doc_id), similarity))


def print_results_boolean_search(search_results, query, collection):
    """Print the number of hits, then every matching document in full."""
    print('%s resultats pour la recherche "%s"' % (len(search_results), query))
    for doc_id in search_results:
        print("%s" % collection.get_document_by_id(doc_id))


# Run only when the script is invoked directly
if __name__ == '__main__':
    # Collection choice (also builds the index)
    collection, index = choose_collection()
    while True:
        # The option to quit is offered inside the search-type prompt.
        search_type = choose_search_type()
        if search_type == "vectorial":
            weight = choose_weight_type()
            query = choose_query()
            # Run the timed vectorial search, then report and display.
            search_time, search_results = time_func(vectorial_search, query, index, weight)
            print("Temps d'exécution de la recherche: %s secondes" % (search_time))
            print_results_vectorial_search(search_results, query, collection)
        if search_type == "boolean":
            query = choose_query_bool()
            # Run the timed boolean search, then report and display.
            search_time, search_results = time_func(boolean_search, query, index)
            print("Temps d'exécution de la recherche: %s secondes" % (search_time))
            print_results_boolean_search(search_results, query, collection)
import sys
from collections import defaultdict

from collection import CACMCollection
from index import Index
from vectorial_search import vectorial_search
from boolean_search import boolean_search
from evaluation_utils import time_func, get_queries, get_expected_results, average_precision
from evaluation_utils import E_measure, F_measure, average, precision, rappel, R_precision

##############
# INDEXATION #
##############

# Time the import of the collection
import_time, collection = time_func(CACMCollection)

# Time the indexing; index_size is the index's shallow size in megabytes
# (sys.getsizeof does not follow references into contained objects).
indexation_time, index = time_func(Index, collection.documents)
index_size = sys.getsizeof(index) / float(10**6)

##################################
# METHOD TO EVALUATE ONE QUERY   #
##################################


# NOTE(review): the body of this function continues past the end of this
# chunk — only the signature and docstring are visible here.
def evaluate_search(query, expected_results):
    '''
    Evaluate the performance of the given search for the different models.
    Computes search time, precision, R-precision, recall, F and E measures.
    '''