booking_file = '../../data/booking.csv' users_file = '../../data/user.csv' rating_thresholds = [] true_objects_indexes = [0, 1, 2, 3, 4, 5] false_objects_indexes = [6, 7, 8, 9] file_names = os.listdir(data_directory) ids_vector = [int(name.split('-')[0]) for name in file_names] categories_vector = [name.split('-')[1] for name in file_names] ratings_vector = [int(name.split('.')[0].split('-')[2]) for name in file_names] name_vector = [data_directory + name for name in file_names] ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data( data_directory, booking_file, users_file, rating_thresholds) features, new_ratings_vector, new_categories_vector, new_ids_vector, new_paths_vector, text_indexes = divide_texts( name_vector, ratings_vector, categories_vector, ids_vector, n=10) ratings_vector = new_ratings_vector ids_vector = new_ids_vector scores_auc = [] scores_rmse = [] for i in range(10): cv_results_file = '../results/cv-generated-data-r-10-n-04-z-rf-' + str( i) + '.csv' selection = BasicFactorization(show_selection_results=False, selection_algorithm='rf') selection.transform(ids=ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix,
def transform(self, results_file='', short_texts_length=15):
    """
    Classify texts for each provider and save predictions.

    Every entry of ``self.data_directory`` is parsed as
    ``<id>-<category>-<rating>.<extension>``. The texts are cut into short
    texts by ``divide_texts``; then, for each distinct id, the model is
    fitted on the selected samples of all *other* ids and used to predict
    the ratings of that id's samples.

    Reads ``self.data_directory``, ``self.feature_agglomeration``,
    ``self.selection`` and ``self.algorithm``. Sets ``self.unique_ratings``,
    ``self.predicted_ratings_object``, ``self.true_ratings_object``,
    ``self.predicted_ratings_vector``, ``self.true_ratings_vector``,
    ``self.paths_object`` and ``self.ids_object``.

    :param results_file: path to previously computed predictions; when it
        exists they are loaded via ``self.load_results`` and nothing is
        recomputed, otherwise new predictions are passed to
        ``self.save_results`` with this path
    :param short_texts_length: length of short texts for different objects
        (forwarded to ``divide_texts`` as ``n``)
    :raises ValueError: if ``self.selection`` is not one of ``'none'``,
        ``'kmeans'``, ``'random'`` or ``'silhouette'``
    """
    if path.exists(results_file):
        self.load_results(results_file)
        return

    # Fail fast: an unknown strategy would otherwise only surface as an
    # UnboundLocalError after the expensive feature extraction below.
    if self.selection not in ('none', 'kmeans', 'random', 'silhouette'):
        raise ValueError(
            'Unknown selection strategy: {!r}'.format(self.selection))

    file_names = os.listdir(self.data_directory)
    paths = [self.data_directory + '/' + name for name in file_names]

    # File names encode "<id>-<category>-<rating>.<extension>".
    name_parts = [name.split('-') for name in file_names]
    ids_vector = [parts[0] for parts in name_parts]
    categories_vector = [parts[1] for parts in name_parts]
    ratings_vector = [int(parts[2].split('.')[0]) for parts in name_parts]

    # NOTE(review): divide_texts is unpacked into five values here; confirm
    # this matches its current return arity.
    features, ratings_vector, categories_vector, ids_vector, paths = divide_texts(
        paths,
        ratings_vector,
        categories_vector,
        ids_vector,
        n=short_texts_length)

    # Feature Agglomeration
    if self.feature_agglomeration:
        agglomeration = cluster.FeatureAgglomeration(n_clusters=5)
        agglomeration.fit(features)
        features = agglomeration.transform(features)

    self.unique_ratings = sorted(set(ratings_vector))
    # De-duplicate in first-seen order so that the order of the accumulated
    # prediction vectors does not depend on set/hash ordering.
    unique_ids = list(dict.fromkeys(ids_vector))

    # Object selection
    if self.selection == 'none':
        selected_features = features
        selected_ids_vector = ids_vector
        selected_ratings_vector = ratings_vector
    elif self.selection == 'kmeans':
        selected_features, selected_ids_vector, selected_ratings_vector = self.selection_kmeans(
            ids_vector, ratings_vector, features)
    elif self.selection == 'random':
        selected_features, selected_ids_vector, selected_ratings_vector = self.selection_random(
            ids_vector, ratings_vector, features)
    elif self.selection == 'silhouette':
        selected_features, selected_ids_vector, selected_ratings_vector = self.selection_silhouette(
            ids_vector, ratings_vector, features, categories_vector)

    # Any algorithm other than 'knn' / 'lr' falls back to a random forest.
    if self.algorithm == 'knn':
        model = KNeighborsClassifier(n_neighbors=3)
    elif self.algorithm == 'lr':
        model = linear_model.Lasso(alpha=0.1)
    else:
        model = RandomForestClassifier()

    true_ratings_object = {}
    predicted_ratings_object = {}
    predicted_ratings_vector = []
    true_ratings_vector = []
    paths_object = {}
    ids_object = {}

    for current_id in unique_ids:
        # Samples of current_id form the test set ...
        test_indexes = [
            index for index, sample_id in enumerate(ids_vector)
            if sample_id == current_id
        ]
        if not test_indexes:
            continue
        # ... and the selected samples of every other id the train set.
        train_indexes = [
            index for index, sample_id in enumerate(selected_ids_vector)
            if sample_id != current_id
        ]

        train_X = selected_features[train_indexes, :]
        test_X = features[test_indexes, :]
        train_y = [selected_ratings_vector[j] for j in train_indexes]
        test_y = [ratings_vector[j] for j in test_indexes]

        # The same estimator instance is re-fitted for every held-out id.
        model.fit(train_X, train_y)
        predictions = model.predict(test_X)

        # Save to object
        predicted_ratings_object[current_id] = predictions
        true_ratings_object[current_id] = test_y
        paths_object[current_id] = [paths[j] for j in test_indexes]
        ids_object[current_id] = [ids_vector[j] for j in test_indexes]

        # Save to vector
        predicted_ratings_vector.extend(predictions)
        true_ratings_vector.extend(test_y)

    # Save to class properties
    self.predicted_ratings_object = predicted_ratings_object
    self.true_ratings_object = true_ratings_object
    self.predicted_ratings_vector = predicted_ratings_vector
    self.true_ratings_vector = true_ratings_vector
    self.paths_object = paths_object
    self.ids_object = ids_object

    # Save predictions to a file
    self.save_results(results_file)