Beispiel #1
0
def test_run(data_type, lower_and_remove_punctuation, remove_stop_words, distance_method):
    """
    Run a single train/evaluate cycle and report the resulting accuracy.

    A FileReader is created over the full labelled dataset with the requested
    preprocessing flags; the same reader then builds the training set and the
    test set from their respective files. A RocchioClassifier is constructed
    from the training set and scored against the test set by calc_accuracy.
    :param data_type: Defines how to store the sentences, expects: 'boolean' / 'tf' / 'tfidf'
    :param lower_and_remove_punctuation: bool, forwarded to FileReader; if true all words are
        lower-cased and punctuation is removed
    :param remove_stop_words: bool, forwarded to FileReader; if true all stop words are removed
    :param distance_method: defines how to calculate distance, expects: 'euclidean' / 'cosine'
    :return: the accuracy value produced by calc_accuracy for this run
    """
    reader = FileReader(
        "./dataset/amazon_cells_labelled_full.txt",
        lower_and_remove_punctuation,
        remove_stop_words,
    )

    # build_set yields a pair; only its first element is used here.
    # The training set is built before the test set, on the same reader.
    training_set, _ignored = reader.build_set(
        data_type, "./dataset/amazon_cells_labelled_train.txt"
    )
    evaluation_set, _ignored = reader.build_set(
        data_type, "./dataset/amazon_cells_labelled_test.txt"
    )

    rocchio = RocchioClassifier(training_set)
    return calc_accuracy(evaluation_set, rocchio, distance_method)
Beispiel #2
0
def retrieve_best_k_related_reviews(K, query):
    """
    Look up the K reviews most related to a query in the full labelled dataset.

    The dataset is read with both preprocessing flags enabled, represented as
    'tfidf', and the query is parsed by the same reader before everything is
    handed to retrieve().
    NOTE(review): the ranking and any display of the results and their scores
    happen inside retrieve(), which is not visible here - confirm there.
    :param K: int, number of related documents desired
    :param query: string, the query to check the reviews against.
    :return: None; the result of retrieve() is not propagated
    """
    corpus_path = "./dataset/amazon_cells_labelled_full.txt"
    reader = FileReader(corpus_path, True, True)

    # The set is built before the query is parsed, on the same reader.
    weighted_reviews, raw_reviews = reader.build_set('tfidf', corpus_path)
    query_representation = reader.parse_query(query)

    retrieve(K, query_representation, weighted_reviews, raw_reviews)