Example #1
0
def choose_collection():
    """
    Ask the user which collection to use, then import and index it.

    Prints a small menu, reads the user's choice, and for the CACM
    collection prints the import time, the indexing time and the size
    reported by ``sys.getsizeof`` for the index.

    Returns:
        A ``(collection, index)`` tuple for the chosen collection.

    Raises:
        ValueError: if the answer is anything other than ``1`` (CACM is the
            only collection handled here; the menu itself announces the
            Wikipedia option as not implemented yet).
    """
    print('CHOIX DE LA COLLECTION:')
    print('1 - CACM')
    print('2 - Wikipedia (non implémenté pour le moment)')
    collection_choice = input('Choisissez une collection: ')

    # input() returns a string on Python 3, so comparing the answer with the
    # integer 1 could never succeed. Normalising through str() accepts both
    # the text "1" (with stray whitespace) and an actual integer 1.
    if str(collection_choice).strip() != "1":
        raise ValueError("Input invalide")

    # Import the collection, timing the operation
    import_time, collection = time_func(CACMCollection)
    print("\n")
    print("Collection CACM importée en %s secondes" % (import_time))

    # Build the index over the collection's documents, timing it as well
    index_time, index = time_func(Index, collection.documents)
    print("Collection CACM indéxée en %s secondes" % (index_time))
    # NOTE: sys.getsizeof is shallow (it does not follow references into
    # contained objects), so this figure is only a rough indication.
    print("Taille de l'index en mémoire: ~ %s Méga-octets"
          % (sys.getsizeof(index) / float(10**6)))

    # Explain to the user why the real memory footprint is larger
    print("\n")
    print("En realite, ce script utilise plus de memoire car on garde")
    print("egalement dans la RAM la collection entiere (~40 Mega-octets)")
    print("pour pouvoir afficher le contenu les resultats de  sarecherche")
    print("a l'utilisateur (et non juste l'id des documents).")
    print("Cependant, les fonctions de recherche utilisent exclusivement")
    print("les indexes (cf methodes boolean_search et vectorial_search)")

    return collection, index
Example #2
0
def evaluate_search(query, expected_results):
    '''
    Evaluate the search performance of each model for one query.

    Runs the boolean and the vectorial search against the module-level
    ``index`` and records, per model, the search time and the quality
    measures computed against ``expected_results``.

    Args:
        query: the query passed unchanged to both search functions.
        expected_results: the reference results the search output is
            compared with by the measure functions.

    Returns:
        A ``defaultdict(dict)`` shaped like
        ``{'bool': {...}, 'vect': {...}}`` where each inner dict holds
        'time', 'precision', 'rappel', 'F_measure' and 'E_measure';
        the 'vect' entry additionally holds 'R_precision' and
        'average_precision'.
    '''

    # What gets returned:
    # dict {'model': {'time': x, 'precision': x, 'rappel': x, ...}}
    evaluation = defaultdict(dict)

    # Boolean model
    bool_time, search_results = time_func(boolean_search, query, index)
    evaluation['bool']['time'] = bool_time
    evaluation['bool']['precision'] = precision(search_results, expected_results)
    evaluation['bool']['rappel'] = rappel(search_results, expected_results)
    # No R precision here because boolean results are not ordered
    evaluation['bool']['F_measure'] = F_measure(search_results, expected_results)
    evaluation['bool']['E_measure'] = E_measure(search_results, expected_results)

    # Vectorial model, evaluated with the normalised log tf-idf weighting
    # (per the original author's tests, the weighting giving the best results)
    vect_time, search_results = time_func(vectorial_search, query, index, "tf_idf_log_normalized")
    # Keep only the document ids so the measures compare like with like
    search_results = [result.doc_id for result in search_results]
    # Fix: report the vectorial search's own timing, not the boolean one
    evaluation['vect']['time'] = vect_time
    evaluation['vect']['precision'] = precision(search_results, expected_results)
    evaluation['vect']['rappel'] = rappel(search_results, expected_results)
    evaluation['vect']['R_precision'] = R_precision(search_results, expected_results)
    evaluation['vect']['F_measure'] = F_measure(search_results, expected_results)
    evaluation['vect']['E_measure'] = E_measure(search_results, expected_results)
    evaluation['vect']['average_precision'] = average_precision(search_results, expected_results)

    return evaluation
Example #3
0
    for (doc_id, similarity) in search_results[:nb_doc_to_show]:
        print("Document: %sSimilarité: %s \n" % (collection.get_document_by_id(doc_id), similarity))


def print_results_boolean_search(search_results, query, collection):
    """
    Print the outcome of a boolean search.

    Writes a header line with the number of results and the query, then one
    line per result showing the document fetched from ``collection`` through
    ``get_document_by_id``.
    """
    hit_count = len(search_results)
    header = '%s resultats pour la recherche "%s"' % (hit_count, query)
    print(header)

    for matched_id in search_results:
        document = collection.get_document_by_id(matched_id)
        print("%s" % document)


# Interactive session: only runs when the script is executed directly
if __name__ == '__main__':
    # Ask which collection to work on; this also yields its index
    collection, index = choose_collection()

    # Endless prompt loop: per the original author's note, the option to
    # quit is offered by the search-type prompt itself.
    while True:
        search_type = choose_search_type()

        if search_type == "vectorial":
            # Vectorial search needs a weighting scheme in addition to the query
            weight = choose_weight_type()
            query = choose_query()
            search_time, search_results = time_func(
                vectorial_search, query, index, weight
            )
            print("Temps d'exécution de la recherche: %s secondes" % search_time)
            print_results_vectorial_search(search_results, query, collection)
        elif search_type == "boolean":
            # Boolean search has its own query prompt
            query = choose_query_bool()
            search_time, search_results = time_func(boolean_search, query, index)
            print("Temps d'exécution de la recherche: %s secondes" % search_time)
            print_results_boolean_search(search_results, query, collection)
Example #4
0
import sys
from collections import defaultdict

from collection import CACMCollection
from index import Index
from vectorial_search import vectorial_search
from boolean_search import boolean_search
from evaluation_utils import time_func, get_queries, get_expected_results, average_precision
from evaluation_utils import E_measure, F_measure, average, precision, rappel, R_precision

##############
# INDEXATION #
##############
# Time the import of the collection
import_time, collection = time_func(CACMCollection)

# Time the indexing, then express the size of the resulting index in
# millions of bytes.
# NOTE: sys.getsizeof is shallow (it does not follow references into
# contained objects), so index_size is only a rough indication.
indexation_time, index = time_func(Index, collection.documents)
index_size = sys.getsizeof(index) / 1e6


##################################
# METHODE POUR EVALUER UNE QUERY #
###################################
def evaluate_search(query, expected_results):
    '''
    Evalue la performance de la recherche donnée pour les differents modeles
    Calcule temps de recherche, precision, R precision, rappel, F et E measure
    '''