Example #1
    def run_classifier(self, classifier):
        """
        Creates dummy data for testing, builds the input data for the content-based models, creates train and test
        datasets and performs cross-validation.

        Args
            classifier (ContentBasedClassifier): the model on which to run cross-validation

        Returns
            list, ndarray, ndarray: a list of tuples with the true and predicted labels per fold, the test input data
            and the test labels

        """
        input_data, labels = np.arange(1000).reshape((100, 10)), [randint(0, 4) for _ in range(100)]
        dp = ContentBasedPreprocessing()
        input_train, input_test, labels_training, labels_testing = dp.create_train_test_data(input_data=input_data,
                                                                                             labels=labels)
        labels_training = np.asarray(labels_training)
        folds = dp.create_cross_validation_data(input_data=input_train, properties=self.properties)
        fold_idx = list(folds)
        preds = []
        for idx, (train_idx, test_idx) in enumerate(fold_idx):
            print("Running fold #{}/{}".format(idx + 1, len(fold_idx)))
            input_training, input_testing = input_train[train_idx], input_train[test_idx]
            labels_train, labels_test = labels_training[train_idx], labels_training[test_idx]
            classifier.train(self.properties, input_training, labels_train)
            true_labels, predicted_labels = classifier.test(input_testing, labels_test)
            preds.append((true_labels, predicted_labels))
        return preds, input_test, labels_testing
Example #2
    def test_preprocess_text(self):
        """
        Test method for the _preprocess_text function. Given a movie and user id, the movie title, genres and the tags
        given by the user are collected and concatenated into a single text. The text is then preprocessed by removing
        symbols and numbers and splitting it into a list of words.

        Examined test case: the returned list of words is the same as the expected list of words
        """
        logger = utils.config_logger(properties=load_test_properties())
        movies_df = pd.DataFrame(data=[[1, "Toy Story (1995)", "Adventure|Animation|Children|Comedy|Fantasy"]],
                                 columns=["movieId", "title", "genres"])
        tags_df = pd.DataFrame(data=[[1, 1, "funny"]], columns=["userId", "movieId", "tag"])
        movie_id = 1
        user_id = 1
        data_preprocess = ContentBasedPreprocessing()
        text = data_preprocess._preprocess_text(movies_df=movies_df, tags_df=tags_df, movie_id=movie_id,
                                                user_id=user_id, logger=logger)
        expected_text = ["Toy", "Story", "Adventure", "Animation", "Children", "Comedy", "Fantasy", "funny"]
        self.assertEqual(text, expected_text)
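The implementation of _preprocess_text itself is not listed on this page. Below is a minimal sketch of the behavior the test describes (collect title, genres and user tags, strip symbols and numbers, split into words); the function name and the exact regex are assumptions, not the project's code:

import re

def preprocess_text_sketch(movies_df, tags_df, movie_id, user_id):
    # Look up the movie row to get its title and genres.
    movie = movies_df[movies_df["movieId"] == movie_id].iloc[0]
    # Collect the tags this user assigned to this movie.
    tags = tags_df[(tags_df["movieId"] == movie_id) &
                   (tags_df["userId"] == user_id)]["tag"].tolist()
    # Concatenate title, genres and tags into one text.
    text = " ".join([movie["title"], movie["genres"]] + tags)
    # Replace anything that is not a letter (symbols, numbers, the genre
    # separator "|") with a space, then split into a list of words.
    return re.sub(r"[^A-Za-z ]", " ", text).split()

On the test data above this returns ["Toy", "Story", "Adventure", "Animation", "Children", "Comedy", "Fantasy", "funny"], matching the expected list.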
Example #3
    def test_create_cross_validation_data(self):
        """
        Test method for the create_cross_validation_data function. The training dataset is used to generate k folds.

        Examined test cases:

        1. The size of train indices
        2. The size of test indices
        3. The number of the generated folds
        """
        properties = {"cross-validation": 2}
        input_data = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
        dp = ContentBasedPreprocessing()
        folds = dp.create_cross_validation_data(input_data=input_data, properties=properties)
        count = 0
        for idx, (train_index, test_index) in enumerate(folds):
            self.assertEqual(train_index.shape, (2,))
            self.assertEqual(test_index.shape, (2,))
            count += 1
        self.assertEqual(2, count)
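create_cross_validation_data is not shown on this page either; judging from the "cross-validation" property and the yielded (train_index, test_index) pairs, it presumably wraps something like scikit-learn's KFold. A minimal sketch under that assumption:

from sklearn.model_selection import KFold

def create_cross_validation_data_sketch(input_data, properties):
    # The "cross-validation" property holds the number of folds k.
    k_fold = KFold(n_splits=properties["cross-validation"])
    # split() yields one (train_indices, test_indices) pair per fold.
    return k_fold.split(input_data)

With four samples and k=2, each fold gets two train and two test indices, which is exactly what the assertions above check.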
Example #4
    def test_create_train_test_data(self):
        """
        Test method for the create_train_test_data function. The purpose of the function is to split a given dataset
        into training and test datasets, keeping 20% of the data as the test set.

        Examined test cases:

        1. Size of input train data
        2. Size of input test data
        3. Length of training labels list
        4. Length of testing labels list
        """
        input_data, labels = np.arange(10).reshape((5, 2)), range(5)
        dp = ContentBasedPreprocessing()
        input_train, input_test, labels_train, labels_test = dp.create_train_test_data(input_data=input_data,
                                                                                       labels=labels)
        self.assertEqual(input_train.shape, (4, 2))
        self.assertEqual(input_test.shape, (1, 2))
        self.assertEqual(len(labels_train), 4)
        self.assertEqual(len(labels_test), 1)
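Again the tested function is not listed here; an 80/20 split as the docstring describes could be a thin wrapper around scikit-learn's train_test_split (an assumption, as is the sketch's name):

from sklearn.model_selection import train_test_split

def create_train_test_data_sketch(input_data, labels):
    # Hold out 20% of the rows as the test set; the remaining 80% is used
    # for training. Returns input_train, input_test, labels_train,
    # labels_test in that order.
    return train_test_split(input_data, list(labels), test_size=0.2)

With five samples this yields four training rows and one test row, matching the assertions above.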
Example #5
    def test_preprocess_rating(self):
        """
        Test method for the _preprocess_rating function. Depending on the classification type (binary or multi-class),
        the rating values are replaced by 0/1 or by 1, 2, 3, 4, 5 respectively.

        Examined test cases:

            1. Binary classification
                a. rating for dislike
                b. rating for like
            2. Multi-class classification
                a. rating rounded up to the next integer
                b. rating kept as is
                c. rating rounded down by dropping the decimal part
        """
        data_preprocessing = ContentBasedPreprocessing()
        # test cases for binary classification
        properties = {"classification": "binary"}
        # case dislike
        rating = 1.5
        expected_rating = 1
        new_rating = data_preprocessing._preprocess_rating(properties, rating)
        self.assertEqual(new_rating, expected_rating)
        # case like
        rating = 4
        expected_rating = 0
        new_rating = data_preprocessing._preprocess_rating(properties, rating)
        self.assertEqual(new_rating, expected_rating)
        # case dislike
        rating = 2.9
        expected_rating = 1
        new_rating = data_preprocessing._preprocess_rating(properties, rating)
        self.assertEqual(new_rating, expected_rating)

        # test rating for multi-class classification
        properties["classification"] = "multi"
        # round rating
        rating = 1.5
        expected_rating = 2
        new_rating = data_preprocessing._preprocess_rating(properties, rating)
        self.assertEqual(new_rating, expected_rating)
        # rating remains the same
        rating = 3
        expected_rating = 3
        new_rating = data_preprocessing._preprocess_rating(properties, rating)
        self.assertEqual(new_rating, expected_rating)
        # round rating
        rating = 4.22
        expected_rating = 4
        new_rating = data_preprocessing._preprocess_rating(properties, rating)
        self.assertEqual(new_rating, expected_rating)
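The mapping _preprocess_rating applies can be reconstructed from the assertions: for binary classification, low ratings become 1 (dislike) and high ratings become 0 (like); for multi-class, the rating is rounded to the nearest of the five classes. A minimal sketch of that logic (the threshold at exactly 3 and the handling of .5 ties other than 1.5 are untested above, so they are assumptions):

def preprocess_rating_sketch(properties, rating):
    if properties["classification"] == "binary":
        # Per the test cases: 1.5 -> 1, 2.9 -> 1 (dislike), 4 -> 0 (like).
        # The behavior at exactly 3 is an assumption.
        return 0 if rating >= 3 else 1
    # Multi-class: round to the nearest integer class (1.5 -> 2, 3 -> 3,
    # 4.22 -> 4). Note that Python's round() uses banker's rounding for
    # ties, which the tests above do not distinguish from round-half-up.
    return round(rating)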
Example #6
    def test_text_to_glove(self):
        """
        Method to test the functionality of the _text_to_glove function. Given a list of words and a DataFrame of
        word embeddings (words represented as vectors), the method transforms the list into a single vector following
        an aggregation strategy (avg or max).

        Examined test case: the returned and expected vectors are the same
        """
        word_list = ["Toy", "Story", "Adventure", "Animation", "Children", "Comedy", "Fantasy", "funny"]
        data = [["toy", 1, 1, 1, 1, 1],
                ["story", 2, 2, 2, 2, 2],
                ["adventure", 3, 3, 3, 3, 3],
                ["animation", 4, 4, 4, 4, 4],
                ["children", 5, 5, 5, 5, 5],
                ["comedy", 6, 6, 6, 6, 6]]
        glove_df = pd.DataFrame(data=data, columns=None)
        glove_df = glove_df.set_index(0)
        properties = {"aggregation": "avg"}
        expected_vector = np.array([[3.5, 3.5, 3.5, 3.5, 3.5]])
        data_preprocess = ContentBasedPreprocessing()
        text_vector = data_preprocess._text_to_glove(properties=properties, glove_df=glove_df, word_list=word_list)
        self.assertTrue(np.array_equal(text_vector, expected_vector))
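A minimal sketch of the aggregation the docstring describes; that out-of-vocabulary words ("Fantasy" and "funny" have no row in glove_df) are simply skipped is inferred from the expected average of 3.5, not confirmed by the project's code:

import numpy as np

def text_to_glove_sketch(properties, glove_df, word_list):
    # Collect the embedding vector of every word that has one; the
    # embedding index is lowercase, so the words are lowercased first.
    vectors = [glove_df.loc[word.lower()].values
               for word in word_list if word.lower() in glove_df.index]
    vectors = np.array(vectors, dtype=float)
    # Aggregate the word vectors into one text vector (avg or max).
    if properties["aggregation"] == "avg":
        return np.mean(vectors, axis=0).reshape(1, -1)
    return np.max(vectors, axis=0).reshape(1, -1)

For the six in-vocabulary words above, the column-wise average of 1..6 is 3.5 in every dimension, which reproduces expected_vector.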
Example #7
def run_content_based(properties, csvs, logger):
    """
    Processes the data to obtain the input vectors for the content-based methods and then uses them to create the
    models. It splits the data into train and test datasets, applies k-fold cross-validation and finally runs the
    models, writing both the train and test results to files. In the end it calculates the average over the folds
    for the validation and test datasets.

    Args
        properties (dict): datasets, classification, models and output folder
        csvs (dict): the non-processed datasets
        logger (Logger): a Logger object to print info/error messages
    """
    dp = ContentBasedPreprocessing()
    logger.info("Creating input vectors for content-based method")
    dp.preprocess(properties=properties, datasets=csvs, logger=logger)
    input_data = dp.input_data
    ratings = dp.ratings
    logger.info("Split train and test datasets")
    input_train, input_test, ratings_train, ratings_test = dp.create_train_test_data(
        input_data, ratings)
    ratings_test = np.asarray(ratings_test)
    logger.info("Get k-fold indices")
    folds = dp.create_cross_validation_data(input_train, properties)
    folds = list(folds)
    results_folder = "results_{}_{}".format(properties["dataset"],
                                            properties["classification"])
    classifiers = {}
    for model in properties["models"]["content-based"]:
        logger.info("Starting cross-validation for model {}".format(model))
        tic = time.time()
        classifier = init_content_based_model(model)
        classifier.run_cross_validation(classifier, properties, input_train,
                                        ratings_train, folds, results_folder,
                                        logger)
        logger.info(
            "Time needed for classifier {} for train/test is {}".format(
                model, utils.elapsed_str(tic)))
        classifiers[model] = classifier
    logger.info(
        "Calculating average for macro/micro precision, recall and F-measure")
    for model in properties["models"]["content-based"]:
        classifier = classifiers[model]
        classifier.get_fold_avg_result(
            output_folder=properties["output_folder"],
            results_folder=results_folder)
        logger.info("Best classifier with metric {} for model {}".format(
            properties["metric_best_model"], model))
        classifier.find_best_model(properties)
        true_labels, predictions = classifier.test(input_test,
                                                   ratings_test,
                                                   kind=MetricKind.test.value)
        predicted_labels, probabilities = classifier.get_predicted_labels_and_probabilities(
            properties=properties, predictions=predictions)
        classifier.get_results(true_labels,
                               predicted_labels,
                               kind=MetricKind.test.value)
        classifier.write_test_results_to_file(properties["output_folder"],
                                              results_folder)
    print("Done!")
Example #8
def run_test(properties, csvs, logger):
    """
    Method to run the recommendation system using the best models produced by the content-based method.
    Uses the test_recommendation.csv file, where no ratings are available.

    Args
        properties (dict): the loaded configuration file
        csvs (dict): the DataFrames from the input csv files
        logger (Logger): a Logger object to print info/error messages
    """
    # preprocess with test recommendation csv
    logger.info("Testing the recommendation system")
    content_based_results = join(utils.app_dir, properties["output_folder"],
                                 "test_results", "content-based")
    collaborative_results = join(utils.app_dir, properties["output_folder"],
                                 "test_results", "collaborative")
    if not exists(content_based_results):
        mkdir(content_based_results)
    if not exists(collaborative_results):
        mkdir(collaborative_results)
        pearson_dir = join(utils.app_dir, properties["output_folder"],
                           "results_pearson_{}".format(properties["dataset"]))
        for file in listdir(pearson_dir):
            if file.startswith("Predictions"):
                copyfile(join(pearson_dir, file),
                         join(collaborative_results, file))
    content_based_files = listdir(content_based_results)
    if len(content_based_files) != 3:
        dp = ContentBasedPreprocessing()
        logger.info("Creating input vectors for content-based method")
        test_recommendation_df = csvs["test_recommendation"]
        test_recommendation_df.loc[:, "rating"] = 0.0
        csvs["test_recommendation"] = test_recommendation_df
        dp.preprocess(properties=properties,
                      datasets=csvs,
                      logger=logger,
                      kind=PreprocessKind.recommend.value)
        input_data = dp.input_data
        ratings = dp.ratings
        for model in properties["models"]["content-based"]:
            logger.info("Testing model: {}".format(model))
            classifier = init_content_based_model(model)
            directory = join("output", "best_models")
            filename = "best_model_{}_{}.pickle".format(
                model, properties["dataset"])
            classifier.best_model = utils.load_from_pickle(directory=directory,
                                                           file=filename)
            true_labels, predictions = classifier.test(
                input_data, ratings, kind=MetricKind.test.value)
            predicted_labels, probabilities = classifier.get_predicted_labels_and_probabilities(
                properties=properties, predictions=predictions)
            dataset_folder = Datasets.ml_latest_small.value if properties["dataset"] == Datasets.small.value \
                else Datasets.ml_latest.value
            test_csv_path = join(utils.app_dir, properties["datasets_folder"],
                                 dataset_folder, "test_recommendation.csv")
            df = pd.read_csv(test_csv_path)
            df["rating"] = predicted_labels
            df.insert(loc=4, column='probability', value=probabilities)
            logger.info("Writing results to file")
            new_csv = join(content_based_results,
                           "test_recommendation_{}.csv".format(model))
            df.to_csv(new_csv, sep=",")
    qualitative_collaborative(properties=properties,
                              logger=logger,
                              directory=collaborative_results)
    qualitative_content_based(properties=properties,
                              logger=logger,
                              directory=content_based_results)
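utils.load_from_pickle, used above to restore the best model, is not shown on this page. A plausible minimal implementation (the signature is taken from the call site; the body is an assumption based on the .pickle file extension) would be:

import pickle
from os.path import join

def load_from_pickle(directory, file):
    # Deserialize a previously saved model from <directory>/<file>.
    with open(join(directory, file), "rb") as handle:
        return pickle.load(handle)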