Beispiel #1
0
# Script setup: locate the input files, parse the data directory listing and
# build the feature/rating vectors used by the cross-validation loop below.

# CSV inputs handed to load_data further down.
booking_file = '../../data/booking.csv'
users_file = '../../data/user.csv'
# Empty list: no rating thresholds are passed to load_data.
rating_thresholds = []
# NOTE(review): these two lists are not referenced in the visible part of the
# script; presumably positions of "true" and "false" objects — confirm usage.
true_objects_indexes = [0, 1, 2, 3, 4, 5]
false_objects_indexes = [6, 7, 8, 9]

# Each file name is parsed as "<id>-<category>-<rating>.<ext>":
# the id and the rating are converted to int, the category stays a string.
file_names = os.listdir(data_directory)
ids_vector = [int(name.split('-')[0]) for name in file_names]
categories_vector = [name.split('-')[1] for name in file_names]
# The rating is the third dash-separated field of the text before the first '.'.
ratings_vector = [int(name.split('.')[0].split('-')[2]) for name in file_names]
# Plain string concatenation: data_directory is expected to already end with
# a path separator.
name_vector = [data_directory + name for name in file_names]

# NOTE(review): the meaning of the four returned values is only known from
# their names here — confirm against load_data.
ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
    data_directory, booking_file, users_file, rating_thresholds)

# divide_texts is called with n=10 and returns six values; only features,
# new_ratings_vector and new_ids_vector are used in the visible code below.
features, new_ratings_vector, new_categories_vector, new_ids_vector, new_paths_vector, text_indexes = divide_texts(
    name_vector, ratings_vector, categories_vector, ids_vector, n=10)

# From here on the rating/id vectors are the ones returned by divide_texts.
ratings_vector = new_ratings_vector
ids_vector = new_ids_vector

# Score accumulators; they start empty and are not filled in the visible code.
scores_auc = []
scores_rmse = []
for i in range(10):
    cv_results_file = '../results/cv-generated-data-r-10-n-04-z-rf-' + str(
        i) + '.csv'
    selection = BasicFactorization(show_selection_results=False,
                                   selection_algorithm='rf')
    selection.transform(ids=ids_vector,
                        features=features,
                        ratings=ratings_vector,
                        users_ratings=ratings_matrix,
Beispiel #2
0
    def transform(self, results_file='', short_texts_length=15):
        """
        Classify texts for each provider and save predictions.

        If ``results_file`` already exists, the stored predictions are loaded
        with ``self.load_results`` and nothing is recomputed. Otherwise every
        file in ``self.data_directory`` is parsed (names are expected to look
        like ``<id>-<category>-<rating>.<ext>``), passed through
        ``divide_texts`` and evaluated one id at a time: the model is fitted
        on the selected objects of all *other* ids and predicts the objects
        of the held-out id.

        Results are stored in ``self.predicted_ratings_object``,
        ``self.true_ratings_object``, ``self.predicted_ratings_vector``,
        ``self.true_ratings_vector``, ``self.paths_object`` and
        ``self.ids_object`` and then handed to ``self.save_results``.
        Ids are processed in sorted order so the result vectors have a
        reproducible layout.

        :param results_file: path to previously computed predictions; after a
            fresh run the same path is passed to ``self.save_results``
        :param short_texts_length: length of short texts for different
            objects (forwarded to ``divide_texts`` as ``n``)
        :raises ValueError: if ``self.selection`` is not one of ``'none'``,
            ``'kmeans'``, ``'random'`` or ``'silhouette'``
        """
        if path.exists(results_file):
            self.load_results(results_file)
            return

        # Parse every file name once instead of re-splitting it per field.
        file_names = os.listdir(self.data_directory)
        paths = []
        ids_vector = []
        categories_vector = []
        ratings_vector = []
        for name in file_names:
            name_parts = name.split('-')
            paths.append(self.data_directory + '/' + name)
            ids_vector.append(name_parts[0])
            categories_vector.append(name_parts[1])
            ratings_vector.append(int(name_parts[2].split('.')[0]))

        # Only the first five returned values are needed. Slicing keeps this
        # call working whether or not divide_texts also returns a trailing
        # extra value (some callers unpack six values from it).
        divided = divide_texts(
            paths,
            ratings_vector,
            categories_vector,
            ids_vector,
            n=short_texts_length)
        features, ratings_vector, categories_vector, ids_vector, paths = divided[:5]

        # Feature Agglomeration
        if self.feature_agglomeration:
            agglomeration = cluster.FeatureAgglomeration(n_clusters=5)
            agglomeration.fit(features)
            features = agglomeration.transform(features)

        self.unique_ratings = sorted(set(ratings_vector))

        # Object selection
        if self.selection == 'none':
            selected_features = features
            selected_ids_vector = ids_vector
            selected_ratings_vector = ratings_vector
        elif self.selection == 'kmeans':
            selected_features, selected_ids_vector, selected_ratings_vector = self.selection_kmeans(
                ids_vector, ratings_vector, features)
        elif self.selection == 'random':
            selected_features, selected_ids_vector, selected_ratings_vector = self.selection_random(
                ids_vector, ratings_vector, features)
        elif self.selection == 'silhouette':
            selected_features, selected_ids_vector, selected_ratings_vector = self.selection_silhouette(
                ids_vector, ratings_vector, features, categories_vector)
        else:
            # Fail early with a clear message instead of an unbound-variable
            # error once the training loop starts.
            raise ValueError(
                'Unknown selection method: %r' % (self.selection,))

        if self.algorithm == 'knn':
            model = KNeighborsClassifier(n_neighbors=3)
        elif self.algorithm == 'lr':
            model = linear_model.Lasso(alpha=0.1)
        else:
            model = RandomForestClassifier()

        # Group the row indexes of every id in a single pass; each id then
        # has at least one test row by construction.
        test_indexes_by_id = {}
        for index, img_id in enumerate(ids_vector):
            test_indexes_by_id.setdefault(img_id, []).append(index)

        true_ratings_object = {}
        predicted_ratings_object = {}
        predicted_ratings_vector = []
        true_ratings_vector = []
        paths_object = {}
        ids_object = {}

        # Sorted iteration makes the order of the result vectors independent
        # of set/hash ordering.
        for current_id in sorted(test_indexes_by_id):
            # Objects of current_id form the test set; the selected objects
            # of every other id form the train set.
            test_indexes = test_indexes_by_id[current_id]
            train_indexes = [
                index
                for index, img_id in enumerate(selected_ids_vector)
                if img_id != current_id
            ]

            train_X = selected_features[train_indexes, :]
            test_X = features[test_indexes, :]
            train_y = [selected_ratings_vector[j] for j in train_indexes]
            test_y = [ratings_vector[j] for j in test_indexes]

            # The same estimator object is refitted for every held-out id.
            model.fit(train_X, train_y)
            predictions = model.predict(test_X)

            # Save to object
            predicted_ratings_object[current_id] = predictions
            true_ratings_object[current_id] = test_y
            paths_object[current_id] = [
                paths[test_index] for test_index in test_indexes
            ]
            ids_object[current_id] = [
                ids_vector[test_index] for test_index in test_indexes
            ]

            # Save to vector
            predicted_ratings_vector.extend(predictions)
            true_ratings_vector.extend(test_y)

        # Save to class properties
        self.predicted_ratings_object = predicted_ratings_object
        self.true_ratings_object = true_ratings_object
        self.predicted_ratings_vector = predicted_ratings_vector
        self.true_ratings_vector = true_ratings_vector
        self.paths_object = paths_object
        self.ids_object = ids_object

        # Save predictions to a file
        self.save_results(results_file)