def test(self):
    # Builds a GcmcGraphDataset from a random sparse rating matrix and checks
    # that the stored item features gain an extra row for the default index.
    n_users = 101
    n_items = 233
    n_data = 3007
    am1 = _make_sparse_matrix(n_users, n_items, n_data)
    am2 = 2 * _make_sparse_matrix(n_users, n_items, n_data)
    adjacency_matrix = am1 + am2
    user_ids = adjacency_matrix.tocoo().row
    item_ids = adjacency_matrix.tocoo().col
    ratings = adjacency_matrix.tocoo().data
    item_features = [{i: np.array([i]) for i in range(n_items)}]
    rating_data = GcmcDataset(user_ids, item_ids, ratings, item_features=item_features)
    dataset = GcmcGraphDataset(dataset=rating_data, test_size=0.2)
    # n_items + 1 rows because of the default (unknown-item) index.
    self.assertEqual((n_items + 1, 1), dataset.item_features[0].shape)
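# The test above calls a module-level helper `_make_sparse_matrix` that this
# excerpt does not define. A minimal sketch is below; it is an assumption based
# on how the test uses it (a scipy CSR matrix of the given shape with at most
# `n_data` nonzero entries), not necessarily the repository's actual helper.
import numpy as np
import scipy.sparse as sp


def _make_sparse_matrix(n, m, n_data):
    # Scatter ones at random coordinates; collisions make the true nnz <= n_data.
    x = np.zeros(shape=(n, m), dtype=np.float32)
    x[np.random.choice(range(n), n_data), np.random.choice(range(m), n_data)] = 1.0
    return sp.csr_matrix(x)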
def run(self):
    tf.reset_default_graph()
    df = self.load_data_frame(
        'train_data',
        required_columns={self.user_column_name, self.item_column_name, self.rating_column_name})
    user_features = self.load('user_features')
    item_features = self.load('item_features')

    df.drop_duplicates(subset=[self.user_column_name, self.item_column_name], inplace=True)
    df = sklearn.utils.shuffle(df)
    df = df.head(n=int(self.max_data_size))

    user_ids = df[self.user_column_name].values
    item_ids = df[self.item_column_name].values
    ratings = df[self.rating_column_name].values

    dataset = GcmcDataset(
        user_ids=user_ids,
        item_ids=item_ids,
        ratings=ratings,
        user_features=user_features,
        item_features=item_features)
    graph_dataset = GcmcGraphDataset(
        dataset=dataset,
        test_size=self.test_size,
        min_user_click_count=self.min_user_click_count,
        max_user_click_count=self.max_user_click_count)
    model = GraphConvolutionalMatrixCompletion(graph_dataset=graph_dataset, **self.model_kwargs)

    self.task_log['report'] = [str(self.model_kwargs)] + model.fit(
        try_count=self.try_count, decay_speed=self.decay_speed)
    self.dump(self.task_log['report'], 'report')
    self.dump(model, 'model')
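# `model_kwargs` is forwarded verbatim to GraphConvolutionalMatrixCompletion,
# so its keys must match whatever constructor signature is in use. A plausible
# example, assuming a variant whose data arrives via `graph_dataset` and whose
# remaining hyperparameters mirror the constructor defined further below
# (the concrete values are illustrative, not recommendations):
example_model_kwargs = dict(
    encoder_hidden_size=500,
    encoder_size=75,
    scope_name='gcmc',
    batch_size=1024,
    epoch_size=10,
    dropout_rate=0.7,
    learning_rate=0.01,
    normalization_type='symmetric',
    weight_sharing=True)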
def test_with_information(self):
    user_ids = np.array([1, 1, 2, 2, 2])
    item_ids = np.array([1, 2, 1, 2, 3])
    ratings = np.array([1, 0, 1, 0, 1])
    test_size = 0.0
    user_features = [{1: np.array([10, 11]), 2: np.array([20, 21])}]
    item_features = [{
        1: np.array([10, 11, 12]),
        2: np.array([20, 21, 22]),
        3: np.array([30, 31, 32])
    }]
    dataset = GcmcDataset(
        user_ids=user_ids,
        item_ids=item_ids,
        ratings=ratings,
        user_features=user_features,
        item_features=item_features)
    graph_dataset = GcmcGraphDataset(dataset, test_size)

    data = graph_dataset.train_data()
    self.assertEqual(user_ids.shape, data['user'].shape)
    self.assertEqual(item_ids.shape, data['item'].shape)
    self.assertEqual((ratings.shape[0], 2), data['label'].shape)
    self.assertEqual(ratings.shape, data['rating'].shape)
    self.assertEqual(user_ids.shape, data['user_feature_indices'].shape)
    self.assertEqual(item_ids.shape, data['item_feature_indices'].shape)
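# Why data['label'] has shape (len(ratings), 2) above: the ratings take two
# distinct values (0 and 1), and GCMC treats each rating level as a class, so
# labels come out one-hot over the rating levels. A minimal sketch of that
# encoding (assumed to match what GcmcGraphDataset does internally):
import numpy as np

ratings = np.array([1, 0, 1, 0, 1])
levels = np.unique(ratings)  # [0, 1] -> 2 rating classes
label = (ratings[:, None] == levels[None, :]).astype(np.float32)
assert label.shape == (ratings.shape[0], 2)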
# Imports required by this class. Project-internal helpers (GcmcDataset,
# EarlyStopping, GraphConvolutionalMatrixCompletionGraph, and
# _convert_sparse_matrix_to_sparse_tensor) are assumed to be in scope.
import itertools
from logging import getLogger
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
import scipy.sparse as sp
import tensorflow as tf

import redshells.model.utils

logger = getLogger(__name__)


class GraphConvolutionalMatrixCompletion(object):
    def __init__(self,
                 user_ids: np.ndarray,
                 item_ids: np.ndarray,
                 ratings: np.ndarray,
                 encoder_hidden_size: int,
                 encoder_size: int,
                 scope_name: str,
                 test_size: float,
                 batch_size: int,
                 epoch_size: int,
                 dropout_rate: float,
                 learning_rate: float,
                 normalization_type: str,
                 weight_sharing: bool = True,
                 ignore_item_embedding: bool = False,
                 save_directory_path: str = None,
                 user_features: Optional[List[Dict[Any, np.ndarray]]] = None,
                 item_features: Optional[List[Dict[Any, np.ndarray]]] = None) -> None:
        self.session = tf.Session()
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.ratings = ratings
        self.item_features = item_features
        self.user_features = user_features
        self.encoder_hidden_size = encoder_hidden_size
        self.encoder_size = encoder_size
        self.test_size = test_size
        self.batch_size = batch_size
        self.epoch_size = epoch_size
        self.scope_name = scope_name
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.normalization_type = normalization_type
        self.weight_sharing = weight_sharing
        self.ignore_item_embedding = ignore_item_embedding
        self.save_directory_path = save_directory_path
        self.dataset = GcmcDataset(
            self.user_ids,
            self.item_ids,
            self.ratings,
            self.test_size,
            user_information=self.user_features,
            item_information=self.item_features,
            min_user_click_count=5)
        self.graph = None

    def fit(self, try_count=1, decay_speed=10.) -> List[str]:
        if self.graph is None:
            logger.info('making graph...')
            self.graph = self._make_graph()
            logger.info('done making graph')

        early_stopping = EarlyStopping(
            try_count=try_count,
            decay_speed=decay_speed,
            save_directory=self.save_directory_path,
            learning_rate=self.learning_rate,
            threshold=1e-4)

        test_data = self.dataset.test_data()
        report = []
        with self.session.as_default():
            self.session.run(tf.global_variables_initializer())
            dataset = tf.data.Dataset.from_tensor_slices(self.dataset.train_data())
            dataset = dataset.shuffle(buffer_size=self.batch_size)
            batch = dataset.batch(self.batch_size)
            iterator = batch.make_initializable_iterator()
            next_batch = iterator.get_next()
            rating_adjacency_matrix = self.dataset.train_rating_adjacency_matrix()

            logger.info('start to optimize...')
            for i in range(self.epoch_size):
                self.session.run(iterator.initializer)
                while True:
                    try:
                        train_data = self.session.run(next_batch)
                        # Drop the current mini-batch's edges from the adjacency
                        # matrices so a rating is not used to predict itself.
                        _rating_adjacency_matrix = [
                            self._eliminate(matrix, train_data['user'], train_data['item'])
                            for matrix in rating_adjacency_matrix
                        ]
                        feed_dict = {
                            self.graph.input_learning_rate: early_stopping.learning_rate,
                            self.graph.input_dropout: self.dropout_rate,
                            self.graph.input_user: train_data['user'],
                            self.graph.input_item: train_data['item'],
                            self.graph.input_label: train_data['label'],
                            self.graph.input_rating: train_data['rating'],
                            self.graph.input_user_information: train_data['user_information'],
                            self.graph.input_item_information: train_data['item_information'],
                        }
                        feed_dict.update({
                            g: _convert_sparse_matrix_to_sparse_tensor(m)
                            for g, m in zip(self.graph.input_adjacency_matrix, _rating_adjacency_matrix)
                        })
                        feed_dict.update({
                            g: m.count_nonzero()
                            for g, m in zip(self.graph.input_edge_size, _rating_adjacency_matrix)
                        })
                        _, train_loss, train_rmse = self.session.run(
                            [self.graph.op, self.graph.loss, self.graph.rmse], feed_dict=feed_dict)
                        report.append(
                            f'train: epoch={i + 1}/{self.epoch_size}, loss={train_loss}, rmse={train_rmse}.')
                    except tf.errors.OutOfRangeError:
                        # The epoch is exhausted: evaluate on the test split.
                        logger.info(report[-1])
                        feed_dict = {
                            self.graph.input_dropout: 0.0,
                            self.graph.input_user: test_data['user'],
                            self.graph.input_item: test_data['item'],
                            self.graph.input_label: test_data['label'],
                            self.graph.input_rating: test_data['rating'],
                            self.graph.input_user_information: test_data['user_information'],
                            self.graph.input_item_information: test_data['item_information'],
                        }
                        feed_dict.update({
                            g: _convert_sparse_matrix_to_sparse_tensor(m)
                            for g, m in zip(self.graph.input_adjacency_matrix, rating_adjacency_matrix)
                        })
                        feed_dict.update({
                            g: m.count_nonzero()
                            for g, m in zip(self.graph.input_edge_size, rating_adjacency_matrix)
                        })
                        test_loss, test_rmse = self.session.run(
                            [self.graph.loss, self.graph.rmse], feed_dict=feed_dict)
                        report.append(
                            f'test: epoch={i + 1}/{self.epoch_size}, loss={test_loss}, rmse={test_rmse}.')
                        logger.info(report[-1])
                        break
                if early_stopping.does_stop(test_rmse, self.session):
                    break
        return report

    def predict(self, user_ids: List, item_ids: List, with_user_embedding: bool = True) -> np.ndarray:
        if self.graph is None:
            raise RuntimeError('Please call fit first.')

        rating_adjacency_matrix = self.dataset.train_rating_adjacency_matrix()
        user_indices, item_indices = self.dataset.to_indices(user_ids, item_ids)
        if not with_user_embedding:
            user_indices = np.array([0] * len(user_indices))  # TODO use default user index.
        user_information_indices, item_information_indices = self.dataset.to_information_indices(
            user_ids, item_ids)

        feed_dict = {
            self.graph.input_dropout: 0.0,
            self.graph.input_user: user_indices,
            self.graph.input_item: item_indices,
            self.graph.input_user_information: user_information_indices,
            self.graph.input_item_information: item_information_indices,
        }
        feed_dict.update({
            g: _convert_sparse_matrix_to_sparse_tensor(m)
            for g, m in zip(self.graph.input_adjacency_matrix, rating_adjacency_matrix)
        })
        feed_dict.update({
            g: m.count_nonzero()
            for g, m in zip(self.graph.input_edge_size, rating_adjacency_matrix)
        })
        with self.session.as_default():
            predictions = self.session.run(self.graph.expectation, feed_dict=feed_dict)
        predictions = predictions.flatten()
        # Clip predictions to the observed rating range.
        predictions = np.clip(predictions, self.dataset.rating()[0], self.dataset.rating()[-1])
        return predictions

    def predict_item_scores(self, item_ids: List, with_user_embedding: bool = True) -> pd.DataFrame:
        user_ids = list(self.dataset.user_id_map.id2index.keys())
        _test_users, _test_items = zip(*list(itertools.product(user_ids, item_ids)))
        predicts = self.predict(
            user_ids=_test_users, item_ids=_test_items, with_user_embedding=with_user_embedding)
        results = pd.DataFrame(dict(user=_test_users, item=_test_items, score=predicts))
        results.sort_values('score', ascending=False, inplace=True)
        return results

    def _make_graph(self) -> GraphConvolutionalMatrixCompletionGraph:
        return GraphConvolutionalMatrixCompletionGraph(
            n_rating=len(self.dataset.rating_id_map.id2index),
            n_user=len(self.dataset.user_id_map.id2index) + 1,  # TODO: +1 for the default user index.
            n_item=len(self.dataset.item_id_map.id2index) + 1,  # TODO: +1 for the default item index.
            rating=self.dataset.rating(),
            normalization_type=self.normalization_type,
            encoder_hidden_size=self.encoder_hidden_size,
            encoder_size=self.encoder_size,
            weight_sharing=self.weight_sharing,
            scope_name=self.scope_name,
            user_side_information=self.dataset.user_information,
            item_side_information=self.dataset.item_information,
            ignore_item_embedding=self.ignore_item_embedding)

    @staticmethod
    def _eliminate(matrix: sp.csr_matrix, user_indices, item_indices):
        # Zero out the given (user, item) entries and drop them from the sparsity structure.
        matrix = matrix.copy()
        matrix[user_indices, item_indices] = 0
        matrix.eliminate_zeros()
        return matrix

    def save(self, file_path: str) -> None:
        redshells.model.utils.save_tf_session(self, self.session, file_path)

    @staticmethod
    def load(file_path: str) -> 'GraphConvolutionalMatrixCompletion':
        session = tf.Session()
        model = redshells.model.utils.load_tf_session(
            GraphConvolutionalMatrixCompletion, session, file_path,
            GraphConvolutionalMatrixCompletion._make_graph)  # type: GraphConvolutionalMatrixCompletion
        return model
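# End-to-end usage sketch for the class above, assuming the constructor
# signature shown here (ids/ratings passed directly). The hyperparameter
# values are illustrative only. Note that this variant filters out users with
# fewer than 5 ratings (min_user_click_count=5), so the synthetic data below
# gives each user enough interactions to survive the filter.
import numpy as np

rng = np.random.RandomState(0)
user_ids = rng.randint(0, 100, size=5000)
item_ids = rng.randint(0, 200, size=5000)
ratings = rng.randint(1, 6, size=5000)  # ratings in {1, ..., 5}

model = GraphConvolutionalMatrixCompletion(
    user_ids=user_ids,
    item_ids=item_ids,
    ratings=ratings,
    encoder_hidden_size=500,
    encoder_size=75,
    scope_name='gcmc_example',
    test_size=0.2,
    batch_size=1024,
    epoch_size=5,
    dropout_rate=0.7,
    learning_rate=0.01,
    normalization_type='symmetric')
report = model.fit(try_count=3, decay_speed=10.)
scores = model.predict(user_ids=[1, 2], item_ids=[3, 3])
model.save('gcmc_model.pickle')  # restore later with GraphConvolutionalMatrixCompletion.load(...)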