Example 1
def binary_search_test(tokens, doc_ids):

    # Generate inverted index for the collection
    print("Generating inverted index")
    start_time = time()
    inverted_index = InvertedIndex.invert_index(tokens)
    end_time = time()
    print("Genartion finished, took: %.2f seconds" % (end_time - start_time))

    print("Binary search AND test starting")
    start_time = time()
    res = binarySearch("network AND computer", inverted_index, doc_ids)
    end_time = time()
    print("AND test finished, returned: {}, took: {} seconds".format(
        res, end_time - start_time))

    print("Binary search OR test starting")
    start_time = time()
    res = binarySearch("network OR computer", inverted_index, doc_ids)
    end_time = time()
    print("OR test finished, returned: {}, took: {} seconds".format(
        res[10], end_time - start_time))

    print("Binary search NOT test starting")
    start_time = time()
    res = binarySearch("NOT computer", inverted_index, doc_ids)
    end_time = time()
    print("NOT test finished, returned: {}, took: {} seconds".format(
        res[10], end_time - start_time))

    return True
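binarySearch is project code whose internals are not shown here. Purely as an illustration of what the AND operator amounts to in a Boolean model, the sketch below intersects two sorted postings lists with the classic merge; the function name and the list representation are assumptions, not the project's actual implementation.

def intersect_postings(a, b):
    # Merge-style intersection of two sorted doc-id lists (AND semantics).
    result = []
    i = j = 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            result.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return result

OR would be the analogous merge keeping every id once, and NOT a complement against the full doc_ids list.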
Example 2
def test_accuracy_recall(doc_list, tokens, return_doc_count):
    inverted_index = InvertedIndex.invert_index(tokens)
    test_queries = parse_query_text()
    test_rels = parse_qrels()
    accuracies = []
    recalls = []
    docs = {}
    common_words = DocumentParser.read_common_words()
    for doc in doc_list:
        docs[doc.id] = len(
            DocumentParser.remove_common_words(doc.tokenize(), common_words))
    for key in tqdm(list(test_rels.keys())):
        cleaned_query = VectorialModel.parse_query(test_queries[key])
        postings = VectorialModel.posting_union(cleaned_query, inverted_index)
        vectors = VectorialModel.doc_vectors_ponderation(
            postings, cleaned_query, inverted_index, docs)
        cleaned_query = VectorialModel.generate_query_vector(
            cleaned_query, inverted_index, len(doc_list))
        cosines = VectorialModel.cosinus(cleaned_query, vectors)
        res = VectorialModel.search_result(cosines, postings)
        res = res[:return_doc_count]
        related_docs = test_rels[key]
        tp = 0
        for doc in res:
            if str(doc) in related_docs:
                tp += 1
        # "accuracy" here is precision over the returned set;
        # recall is measured against the full set of relevant documents.
        accuracy = tp / len(res)
        recall = tp / len(related_docs)
        accuracies.append(accuracy)
        recalls.append(recall)
    accuracies = np.asarray(accuracies)
    recalls = np.asarray(recalls)
    mean_accuracy = np.mean(accuracies)
    min_accuracy = np.min(accuracies)
    max_accuracy = np.max(accuracies)
    print("Accuracy, recall tests for %i documents returned" %
          return_doc_count)
    print("Accuracy results: Mean %.2f, Max %.2f, Min %.2f" %
          (mean_accuracy, max_accuracy, min_accuracy))
    mean_recall = np.mean(recalls)
    min_recall = np.min(recalls)
    max_recall = np.max(recalls)
    print("Recall results: Mean %.2f, Max %.2f, Min %.2f" %
          (mean_recall, max_recall, min_recall))
    beta = mean_accuracy / mean_recall
    alpha = 1 / (beta**2 + 1)
    e_mesure = 1 - 1 / (alpha / mean_accuracy + (1 - alpha) / mean_recall)
    f_mesure = 1 - e_mesure
    print("E-mesure: %.2f, F-Mesure: %.2f" % (e_mesure, f_mesure))
Example 3
def vectorial_search_test(doc_list, tokens):

    # Generate inverted index for the collection
    print("Generating inverted index")
    start_time = time()
    inverted_index = InvertedIndex.invert_index(tokens)
    end_time = time()
    print("Genartion finished, took: %.2f seconds" % (end_time - start_time))

    cleaned_query = VectorialModel.parse_query(
        "computer science applied to networks")
    postings = VectorialModel.posting_union(cleaned_query, inverted_index)

    print("Genarting document vectors")
    start_time = time()
    vectors = VectorialModel.doc_vectors(postings, cleaned_query,
                                         inverted_index)
    end_time = time()
    print("Generation finished, took %.2f seconds" % (end_time - start_time))

    print("Searching using cosine measurement")
    start_time = time()
    # Keep the parsed query separate from its vector form: the parsed
    # query is still needed by doc_vectors_ponderation below.
    query_vector = VectorialModel.generate_query_vector(
        cleaned_query, inverted_index, len(doc_list))
    cosines = VectorialModel.cosinus(query_vector, vectors)
    res = VectorialModel.search_result(cosines, postings)
    end_time = time()
    print("Search finished, took %.2f seconds" % (end_time - start_time))
    print("First ten results: {}".format(res[:10]))

    docs = {}
    common_words = DocumentParser.read_common_words()
    for doc in doc_list:
        docs[doc.id] = len(
            DocumentParser.remove_common_words(doc.tokenize(), common_words))

    print("Genarting ponderated vectors")
    start_time = time()
    ponderated_vectors = VectorialModel.doc_vectors_ponderation(
        postings, cleaned_query, inverted_index, docs)
    end_time = time()
    print("Genartion finished, took %.2f seconds" % (end_time - start_time))

    print("Searching using cosine measurement")
    start_time = time()
    p_cosines = VectorialModel.cosinus(query_vector, ponderated_vectors)
    res = VectorialModel.search_result(p_cosines, postings)
    end_time = time()
    print("Search finished, took %.2f seconds" % (end_time - start_time))
    print("First ten results: {}".format(res[:10]))

    return True
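VectorialModel.cosinus is project code; as a self-contained sketch of the underlying measure only (the names and the dict representation of term weights are hypothetical, not the VectorialModel API), a cosine between a query vector and one document vector could look like this:

from math import sqrt

def cosine(query_vec, doc_vec):
    # Dot product over the query's terms, normalized by both norms.
    dot = sum(w * doc_vec.get(t, 0.0) for t, w in query_vec.items())
    nq = sqrt(sum(w * w for w in query_vec.values()))
    nd = sqrt(sum(w * w for w in doc_vec.values()))
    if nq == 0.0 or nd == 0.0:
        return 0.0
    return dot / (nq * nd)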
Example 4
def test_accuracy_recall(documents, tokens, return_doc_count=10):
    inverted_index = InvertedIndex.invert_index(tokens)
    test_queries = parse_query_text()
    test_rels = parse_qrels()
    accuracies = []
    recalls = []
    docs = {}
    for doc_id in documents.keys():
        docs[doc_id] = len(documents[doc_id].post_tokens)
    for key in tqdm(list(test_rels.keys())):
        cleaned_query = VectorialModel.parse_query(test_queries[key])
        postings = VectorialModel.posting_union(cleaned_query, inverted_index)
        vectors = VectorialModel.doc_vectors_ponderation(postings, cleaned_query, inverted_index, docs)
        query_vector = VectorialModel.generate_query_vector(cleaned_query, inverted_index, len(documents.keys()))
        cosines = VectorialModel.cosinus(query_vector, vectors)
        res = VectorialModel.search_result(cosines, postings)
        res = res[:return_doc_count]
        related_docs = test_rels[key]
        tp = 0
        for doc in res:
            if str(doc) in related_docs:
                tp += 1
        # "accuracy" here is precision over the returned set;
        # recall is measured against the full set of relevant documents.
        accuracy = tp / len(res)
        recall = tp / len(related_docs)
        accuracies.append(accuracy)
        recalls.append(recall)
        # The following lines are only here to check by hand that the
        # returned documents are relevant; note that the break below
        # stops after the first query, so the statistics printed at the
        # end cover that single query.
        print(test_queries[key])
        for doc_id in res[:10]:
            print(documents[str(doc_id)].title)
        break
    accuracies = np.asarray(accuracies)
    recalls = np.asarray(recalls)
    mean_accuracy = np.mean(accuracies)
    min_accuracy = np.min(accuracies)
    max_accuracy = np.max(accuracies)
    print("Accuracy, recall tests for %i documents returned" % return_doc_count)
    print("Accuracy results: Mean %.2f, Max %.2f, Min %.2f" % (mean_accuracy, max_accuracy, min_accuracy))
    mean_recall = np.mean(recalls)
    min_recall = np.min(recalls)
    max_recall = np.max(recalls)
    print("Recall results: Mean %.2f, Max %.2f, Min %.2f" % (mean_recall, max_recall, min_recall))
    beta = max_accuracy / max_recall
    alpha = 1 / (beta**2 + 1)
    e_mesure = 1 - 1/(alpha/max_accuracy + (1 - alpha)/max_recall)
    f_mesure = 1 - e_mesure
    print("E-mesure: %.2f, F-Mesure: %.2f" % (e_mesure, f_mesure))
Example 5
def accuracy_recall_graph(doc_list, tokens):
    inverted_index = InvertedIndex.invert_index(tokens)
    test_queries = parse_query_text()
    test_rels = parse_qrels()
    accuracies = []
    recalls = []
    keys = list(test_rels.keys())
    query = test_queries[keys[0]]
    rels = test_rels[keys[0]]
    cleaned_query = VectorialModel.parse_query(query)
    postings = VectorialModel.posting_union(cleaned_query, inverted_index)
    vectors = VectorialModel.doc_vectors(postings, cleaned_query,
                                         inverted_index)
    cleaned_query = VectorialModel.generate_query_vector(
        cleaned_query, inverted_index, len(doc_list))
    cosines = VectorialModel.cosinus(cleaned_query, vectors)
    res = VectorialModel.search_result(cosines, postings)
    for i in range(1, 51):
        temp_res = res[:i]
        tp = 0
        for doc in temp_res:
            if str(doc) in rels:
                tp += 1
        # Precision is measured over the i documents returned,
        # recall over the full relevant set for the query.
        accuracy = tp / len(temp_res)
        recall = tp / len(rels)
        accuracies.append(accuracy)
        recalls.append(recall)
    print(accuracies)
    print(recalls)
    x = np.linspace(1, 50, 50)
    # Red curve: accuracy (precision), blue curve: recall.
    plt.plot(x, accuracies, color="r", label="accuracy")
    plt.plot(x, recalls, color="b", label="recall")
    plt.legend()
    plt.show()
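The two curves plot precision and recall as functions of the cutoff i: writing res_i for the top-i results and rel for the relevant set,

P@i = \frac{|res_i \cap rel|}{i}, \qquad R@i = \frac{|res_i \cap rel|}{|rel|}

so precision tends to fall and recall to rise as i grows.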
Example 6
    beta = max_accuracy / max_recall
    alpha = 1 / (beta**2 + 1)
    e_mesure = 1 - 1/(alpha/max_accuracy + (1 - alpha)/max_recall)
    f_mesure = 1 - e_mesure
    print("E-mesure: %.2f, F-Mesure: %.2f" % (e_mesure, f_mesure))


if __name__ == "__main__":
    common_words = read_stop_words(common_words_file)
    docs = parse_entry_file(file, common_words)
    tokens, vocab = calculate_tokens_and_vocab(docs)
    print("Number of tokens: %i, Vocabulary size: %i" % (len(tokens), len(vocab)))
    htokens, hvocab = half_way_tokens_and_vocab(docs)
    print("Halfway tokens: %i, Halfway vocab: %i" % (len(htokens), len(hvocab)))
    b = log(len(vocab) / len(hvocab)) / log(len(tokens) / len(htokens))
    k = len(vocab) / pow(len(tokens), b)
    print("k: %.2f, b: %.2f" % (k, b))
    voc_million = k * (10**6)**b
    print("Vocabulary for 1 million tokens: %.2f" % voc_million)
    tokens = get_tokens_dict(docs)
    inverted_index = InvertedIndex.invert_index(tokens)
    frequencies = calculate_term_frequencies(inverted_index)
    log_freqs = [log(freq) for freq in frequencies]
    ranks = [i for i in range(1, len(frequencies) + 1)]
    log_ranks = [log(rank) for rank in ranks]
    frequencies = frequencies[::-1]
    # plt.plot(ranks[:200], frequencies[:200])
    # plt.show()
    # plt.plot(log_ranks[:200], log_freqs[:200])
    # plt.show()
    test_accuracy_recall(docs, tokens, 50)
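The k and b computed in the __main__ block are the Heaps' law coefficients: with T tokens and vocabulary size V for the full collection, and T_h, V_h for its first half, the code solves

V = k \, T^{b}, \qquad b = \frac{\log(V / V_h)}{\log(T / T_h)}, \qquad k = \frac{V}{T^{b}}

and then extrapolates the vocabulary for one million tokens as V_{10^6} = k \cdot (10^6)^{b}.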
Example 7
def main():
    try:
        # -f/--file takes an argument, hence the ':' and '=' suffixes.
        opts, args = getopt.getopt(sys.argv[1:], "hf:", [
            "help",
            "file=",
        ])
    except getopt.GetoptError:
        print("For help, type: --help")
        sys.exit(2)

    HELP = """
        This program tokenizes an input file respecting the following
        format, then calculates its vocabulary:
            - .I : document's id
            - .T : indicates start of document's title
            - .W : indicates start of document's summary
            - .K : indicates the start of the document's keywords

        options:
            -h --help : shows this help message
            -f --file : the path to the input file
    """

    filename = None
    for o, a in opts:
        if o in ("-h", "--help"):
            print(HELP)
            sys.exit(0)
        elif o in ("-f", "--file"):
            i = opts[0].index(o)
            if (len(args) < i):
                print("Please enter a filename")
                sys.exit(1)
            filename = args[i]
    if filename is None:
        print("Please provide a filename")
        sys.exit(1)
    doc_list = DocumentParser.parse_entry_file(filename)
    tokens, token_counts = DocumentParser.tokenize(doc_list,
                                                   "cacm/common_words")
    vocab, vocab_lengths = DocumentParser.calculate_vocabulary(tokens)
    for i in range(len(token_counts)):
        token_counts[i] = math.log10(token_counts[i])
    for i in range(len(vocab_lengths)):
        vocab_lengths[i] = math.log10(vocab_lengths[i])
    token_counts = np.asarray(token_counts)
    vocab_lengths = np.asarray(vocab_lengths)
    # coefs = estimate_coef(token_counts, vocab_lengths)
    # print(coefs)
    # k = 10**coefs[0]
    # b = coefs[1]
    # print("k= " + str(10**coefs[0]) + " b= " + str(coefs[1]))
    # voc_million = k * (10**6)**b
    # print("Vocabulaire 1million tokens: " + str(voc_million))
    # plot_regression_line(token_counts, vocab_lengths, coefs)
    doc_ids = []
    for doc in doc_list:
        doc_ids.append(doc.id)

    inverted_index = InvertedIndex.invert_index(tokens)
    # res = binarySearch("network AND computer", inverted_index, doc_ids)
    # print(res)

    cleaned_q = VectorialModel.parse_query(
        "computer science applied to networks")
    posting = VectorialModel.posting_union(cleaned_q, inverted_index)
    vectors = VectorialModel.doc_vectors(posting, cleaned_q, inverted_index)
    cosines = VectorialModel.cosinus(cleaned_q, vectors)
    vecmod_result = VectorialModel.search_result(cosines, posting)
    print(vecmod_result[:10])

    docs = {}
    common_words = DocumentParser.read_common_words()
    for doc in doc_list:
        docs[doc.id] = len(
            DocumentParser.remove_common_words(doc.tokenize(), common_words))

    ponderated_vectors = VectorialModel.doc_vectors_ponderation(
        posting, cleaned_q, inverted_index, docs)
    p_cosines = VectorialModel.cosinus(cleaned_q, ponderated_vectors)
    p_vecmod_result = VectorialModel.search_result(p_cosines, posting)
    print(p_vecmod_result[:10])
    print(ponderated_vectors[posting.index(2900)])

    # Drop references to the large intermediate structures before exiting.
    tokens = None
    token_counts = None
    vocab_lengths = None
    vocab = None
    gc.collect()
    sys.exit(0)
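Assuming this main lives in a module named main.py (a hypothetical name, as the filename is not shown) and the collection file sits at an illustrative path, the program would be invoked along these lines:

python main.py -f path/to/cacm_collection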