def test_batching(self):
    """Exercise ``batching`` over three parallel ten-element datasets.

    Three configurations are checked, all with ``infinite=False``:

    * batches of 2, incomplete batches dropped: 5 batches, nothing lost;
    * batches of 3, incomplete batches kept: 4 batches, nothing lost;
    * batches of 3, incomplete batches dropped: 3 batches, the last
      element of every dataset is missing.
    """
    numbers = list(range(1, 11))
    letters = list("abcdefghij")
    numerals = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]

    # (batch size, keep incomplete batches, expected number of batches,
    #  number of leading elements expected to come back)
    scenarios = (
        (2, False, 5, 10),
        (3, True, 4, 10),
        (3, False, 3, 9),
    )

    for size, keep_tail, n_batches, n_kept in scenarios:
        batcher = batching(
            list_of_iterables=[numbers, letters, numerals],
            n=size,
            infinite=False,
            return_incomplete_batches=keep_tail,
        )
        # Transpose the stream of batches into one tuple of batches per
        # dataset; unpacking into three names also checks the batch width.
        first, second, third = zip(*list(batcher))
        per_dataset = (first, second, third)

        for got in per_dataset:
            self.assertEqual(n_batches, len(got))
        for source, got in zip((numbers, letters, numerals), per_dataset):
            self.assertListEqual(source[:n_kept], flatten(got))
def predict(self, x, batch_size):
    """Score (user, item) index pairs batch by batch.

    Args:
        x: Iterable of two-element ``(user_index, item_index)`` pairs. It is
            materialised with ``list`` before being handed to ``batching``.
        batch_size: Number of pairs per batch (passed to ``batching`` as
            ``n``); the final, possibly shorter, batch is requested as well.

    Returns:
        The per-batch results stacked vertically. Each batch contributes
        ``np.sum(..., axis=1, keepdims=True)`` of
        ``u_emb[u] * i_emb[i] + u_bias[u] + i_bias[i]``.

    Raises:
        ValueError: From ``np.vstack`` when no batch was produced
            (presumably when ``x`` is empty -- depends on ``batching``).
    """
    batcher = batching([list(x)], n=batch_size, return_incomplete_batches=True)
    preds = []
    for batch_x in batcher:
        # Only one iterable was batched, so the pairs sit in slot 0.
        pairs = batch_x[0]
        # Split the pairs into parallel index lists, converted once and
        # reused for both the embedding and the bias lookups.
        users, items = (list(ids) for ids in zip(*pairs))
        # NOTE(review): the biases are added before the reduction over
        # axis 1; if they broadcast across the embedding columns they are
        # counted once per column -- confirm this is intended.
        scores = (
            self.u_emb[users] * self.i_emb[items]
            + self.u_bias[users]
            + self.i_bias[items]
        )
        preds.append(np.sum(scores, axis=1, keepdims=True))
    # np.row_stack is a deprecated alias (NumPy 2.0); np.vstack is the
    # function it forwarded to.
    return np.vstack(preds)
def get_batches(self, return_incomplete_batches: bool = False):
    """Yield batches of audio (and targets) converted to torch tensors.

    In scoring mode (``self.scoring`` truthy) only ``self.audios`` is
    batched and ``None`` is appended to each batch in place of the targets.
    Otherwise ``self.audios`` and ``self.targets`` are batched together and
    the targets are converted with ``torch.from_numpy``.

    Args:
        return_incomplete_batches: Forwarded to ``batching``.

    Yields:
        The batch object produced by ``batching``, modified in place: slot 0
        holds the audio tensor with an extra axis inserted at position 1,
        slot 1 holds the target tensor or ``None``.
    """
    if self.scoring:
        sources = [self.audios]
    else:
        sources = [self.audios, self.targets]

    batch_stream = batching(
        list_of_iterables=sources,
        n=self.batch_size,
        return_incomplete_batches=return_incomplete_batches,
    )
    for batch in batch_stream:
        # Insert a new axis at position 1 before handing the data to torch.
        audio = np.expand_dims(np.array(batch[0]), 1)
        batch[0] = torch.from_numpy(audio)
        if not self.scoring:
            batch[1] = torch.from_numpy(batch[1])
        else:
            # Keep the two-slot layout even though there are no targets.
            batch.append(None)
        yield batch
def predict(self, x, batch_size):
    """Run ``self.output`` over (user, item) id pairs batch by batch.

    Args:
        x: Iterable of two-element ``(user_id, item_id)`` pairs. It is
            materialised with ``list`` before being handed to ``batching``.
        batch_size: Number of pairs per batch (passed to ``batching`` as
            ``n``); the final, possibly shorter, batch is requested as well.

    Returns:
        The results of ``self.sess.run`` for every batch, stacked
        vertically.

    Raises:
        ValueError: From ``np.vstack`` when no batch was produced
            (presumably when ``x`` is empty -- depends on ``batching``).
    """
    batcher = batching([list(x)], n=batch_size, return_incomplete_batches=True)
    preds = []
    for batch_x in batcher:
        # Only one iterable was batched, so the pairs sit in slot 0;
        # transpose them into a tuple of user ids and a tuple of item ids.
        u_ids, i_ids = zip(*batch_x[0])
        feed = {
            self.ph_u_ids: u_ids,
            self.ph_i_ids: i_ids,
            # Keep probability fixed at 1.0 for prediction.
            self.ph_keep_prob: 1.0,
        }
        preds.append(self.sess.run(self.output, feed_dict=feed))
    # np.row_stack is a deprecated alias (NumPy 2.0); np.vstack is the
    # function it forwarded to.
    return np.vstack(preds)
def get_batcher(df, b_size=16, train=True):
    """Yield zero-padded batches of ids, encoded rows and targets.

    Args:
        df: Object exposing ``id`` and ``code`` columns plus the six target
            columns listed in ``columns_target`` (looks like a pandas
            DataFrame -- confirm against the caller).
        b_size: Batch size, passed positionally to ``batching``.
        train: When falsy the generator yields nothing.

    Yields:
        ``(ids, batch, targets)`` where ``ids`` is slot 0 of the batch as
        produced by ``batching``, ``batch`` stacks the codes of the batch
        after right-padding each one with zeros to the longest code in that
        batch, and ``targets`` stacks the target rows.
    """
    columns_target = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    if not train:
        return

    pool = [
        df.id.values.tolist(),
        # Each code is wrapped in np.matrix so it exposes a 2-D shape;
        # assumes the result is 1 x length -- TODO confirm.
        df.code.map(np.matrix).values.tolist(),
        df[columns_target].values.tolist(),
    ]
    for element in batching(pool, b_size):
        codes = element[1]
        max_len = max(code.shape[1] for code in codes)
        # Take the first row of every code as a 1-D array and pad it on the
        # right (np.pad's "constant" mode fills with zeros by default).
        padded = [
            np.pad(
                np.array(code)[0],
                (0, max_len - code.shape[1]),
                mode="constant",
            )
            for code in codes
        ]
        # np.row_stack is a deprecated alias (NumPy 2.0) of np.vstack.
        batch = np.vstack(padded)
        targets = np.vstack(element[2])
        yield element[0], batch, targets
def fit(self, x, y, batch_size):
    """Train on ``x``/``y`` one batch at a time.

    Both inputs are materialised with ``list`` and batched together through
    ``batching`` (incomplete final batch included); every resulting pair is
    forwarded to ``self.train_on_batch``.

    Args:
        x: Iterable of inputs.
        y: Iterable of targets, batched in parallel with ``x``.
        batch_size: Number of items per batch (``n`` of ``batching``).
    """
    inputs, targets = list(x), list(y)
    batch_stream = batching(
        [inputs, targets],
        n=batch_size,
        return_incomplete_batches=True,
    )
    for input_chunk, target_chunk in batch_stream:
        self.train_on_batch(input_chunk, target_chunk)
def get_batches_generator(
    df_time,
    df_static,
    batch_size=128,
    min_history=300,
    forecast_horizon=7,
    shuffle=True,
    shuffle_present=True,
    cuda=False,
):
    """Yield torch batches of history features and forecast targets.

    Args:
        df_time: Structured array indexed as ``[row, time step]`` (it is
            sliced with ``[:, 0]`` and ``shape[1]`` is used as the number of
            time steps); must contain ``store_nbr`` and ``item_nbr`` fields.
        df_static: Structured array with one entry per row of ``df_time``;
            must contain ``store_nbr`` and ``item_nbr`` fields.
        batch_size: Rows per batch; incomplete batches are not requested.
        min_history: Smallest value the random "present" index may take.
        forecast_horizon: Number of time steps taken as the target after
            the "present" index.
        shuffle: When true, both arrays are shuffled together with
            ``shuffle_multiple`` before batching. When false the given
            order is kept.
        shuffle_present: When true, the "present" index is drawn uniformly
            (both ends inclusive) from
            ``[min_history, time_steps - forecast_horizon]`` for every
            batch; otherwise it is fixed at
            ``time_steps - forecast_horizon``.
        cuda: When true, every yielded tensor is moved with ``.cuda()``.

    Yields:
        ``(numeric_time_batch, cat_time_batch, cat_static_batch, target)``.
        The two time-dependent batches and the target have axes 0 and 1
        swapped after conversion; the categorical batches are cast to long.

    Raises:
        AssertionError: If the ``store_nbr``/``item_nbr`` keys of the two
            arrays do not line up.
        ValueError: From ``random.randint`` when ``min_history`` exceeds
            ``time_steps - forecast_horizon`` and ``shuffle_present`` is set.
    """
    from src.constants import (
        numeric_feats,
        categorical_feats,
        target_name,
    )

    # Honour the ``shuffle`` flag: shuffling used to happen unconditionally.
    if shuffle:
        logger.info("Shuffling dataframe...")
        df_time, df_static = shuffle_multiple(df_time, df_static)
        logger.info("Shuffle successful!")

    # Assure perfect alignment
    case_static = df_static[["store_nbr", "item_nbr"]]
    case_time = df_time[:, 0][["store_nbr", "item_nbr"]]
    assert (case_static == case_time).all()

    time_steps = df_time.shape[1]
    # Latest index that still leaves ``forecast_horizon`` steps of target.
    latest_present = time_steps - forecast_horizon

    batcher = batching(
        list_of_iterables=[df_time, df_static],
        n=batch_size,
        return_incomplete_batches=False,
    )

    # Restrict the configured feature lists to the fields actually present.
    num_time_feats = np.intersect1d(numeric_feats, df_time.dtype.names)
    cat_time_feats = np.intersect1d(categorical_feats, df_time.dtype.names)
    cat_static_feats = np.intersect1d(categorical_feats, df_static.dtype.names)
    # Numerical static features are not produced (none are defined).

    for batch_time, batch_static in batcher:
        if shuffle_present:
            present = random.randint(min_history, latest_present)
        else:
            present = latest_present

        # Numerical time-dependent features
        numeric_time_batch = batch_time[num_time_feats][:, :present]
        # Categorical time-dependent features
        cat_time_batch = batch_time[cat_time_feats][:, :present]
        # Categorical static features
        cat_static_batch = batch_static[cat_static_feats]
        # Target
        target = batch_time[target_name][:, present:(present + forecast_horizon)]

        # Convert to arrays; time-dependent data gets axes 0 and 1 swapped.
        numeric_time_batch = recarray_to_array(
            numeric_time_batch, np.float32
        ).swapaxes(0, 1)
        cat_time_batch = recarray_to_array(cat_time_batch, np.int32).swapaxes(0, 1)
        cat_static_batch = recarray_to_array(cat_static_batch, np.int32)
        target = target.astype(np.float32).swapaxes(0, 1)

        # Convert to torch tensors
        numeric_time_batch = torch.from_numpy(numeric_time_batch)
        cat_time_batch = torch.from_numpy(cat_time_batch).long()
        cat_static_batch = torch.from_numpy(cat_static_batch).long()
        target = torch.from_numpy(target)

        # Move to cuda if required
        if cuda:
            numeric_time_batch = numeric_time_batch.cuda()
            cat_time_batch = cat_time_batch.cuda()
            cat_static_batch = cat_static_batch.cuda()
            target = target.cuda()

        yield numeric_time_batch, cat_time_batch, cat_static_batch, target