def get_mnist(location="./", batch_size=64, labels_per_class=100):
    from functools import reduce
    from operator import __or__
    import numpy as np
    import torch
    from torch.utils.data.sampler import SubsetRandomSampler
    from torchvision.datasets import MNIST
    import torchvision.transforms as transforms
    from utils import onehot

    # Binarise each image and flatten it to a 784-dim vector.
    flatten_bernoulli = lambda x: transforms.ToTensor()(x).view(-1).bernoulli()

    # `n_labels` and `cuda` are assumed to be module-level globals.
    mnist_train = MNIST(location, train=True, download=True,
                        transform=flatten_bernoulli, target_transform=onehot(n_labels))
    mnist_valid = MNIST(location, train=False, download=True,
                        transform=flatten_bernoulli, target_transform=onehot(n_labels))

    def get_sampler(labels, n=None):
        # Only choose digits in n_labels
        (indices,) = np.where(reduce(__or__, [labels == i for i in np.arange(n_labels)]))

        # Ensure uniform distribution of labels
        np.random.shuffle(indices)
        indices = np.hstack([list(filter(lambda idx: labels[idx] == i, indices))[:n]
                             for i in range(n_labels)])

        indices = torch.from_numpy(indices)
        sampler = SubsetRandomSampler(indices)
        return sampler

    # Dataloaders for MNIST
    labelled = torch.utils.data.DataLoader(
        mnist_train, batch_size=batch_size, num_workers=2, pin_memory=cuda,
        sampler=get_sampler(mnist_train.train_labels.numpy(), labels_per_class))
    unlabelled = torch.utils.data.DataLoader(
        mnist_train, batch_size=batch_size, num_workers=2, pin_memory=cuda,
        sampler=get_sampler(mnist_train.train_labels.numpy()))
    validation = torch.utils.data.DataLoader(
        mnist_valid, batch_size=batch_size, num_workers=2, pin_memory=cuda,
        sampler=get_sampler(mnist_valid.test_labels.numpy()))

    return labelled, unlabelled, validation
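# Hedged sketch (not the original utils module): get_mnist above passes
# onehot(n_labels) as a target_transform, so the assumed shape is a factory
# that returns a per-label encoder mapping an integer class to a one-hot tensor.
import torch

def onehot(n):
    def encode(label):
        y = torch.zeros(n)
        y[label] = 1.0
        return y
    return encode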
def init_train_and_val_classes_and_labels(self):
    '''
    Initializes the validation and training classes and labels.
    These properties are used by various other methods.
    :return: self.val_classes, self.val_labels, self.train_classes, self.train_labels
    '''
    self.val_classes = self.val_batches.classes
    self.val_labels = onehot(self.val_classes)
    self.train_classes = self.train_batches.classes
    self.train_labels = onehot(self.train_classes)
def true_online_gtd(env, episodes, target, behavior, Lambda,
                    gamma=lambda x: 0.95, alpha=0.05, beta=0.0001,
                    diagnose=False, evaluation=None):
    """
    episodes: number of episodes
    target: target policy matrix (|S|*|A|)
    behavior: behavior policy matrix (|S|*|A|)
    Lambda: LAMBDA object determining each lambda for each feature (or state or observation)
    gamma: anonymous function determining the discount for each feature (or state or observation)
    alpha: learning rate for the weight vector of the values
    beta: learning rate for the auxiliary vector for off-policy
    """
    learner = TRUE_ONLINE_GTD_LEARNER(env)
    if evaluation is not None:
        value_trace = np.zeros((episodes, 1))
        value_trace[:] = np.nan
    else:
        value_trace = []
    for epi in range(episodes):
        s_curr, done = env.reset(), False
        x_curr = onehot(s_curr, env.observation_space.n)
        learner.refresh()
        if evaluation is not None:
            value_trace[epi, 0] = evaluation(learner.w_curr, 'expectation')
        else:
            value_trace.append(np.copy(learner.w_curr))
        while not done:
            action = decide(s_curr, behavior)
            rho_curr = importance_sampling_ratio(target, behavior, s_curr, action)
            s_next, r_next, done, _ = env.step(action)
            x_next = onehot(s_next, env.observation_space.n)
            if diagnose:
                print('rho_curr: %.2e, lambda_curr: %.2e, lambda_next: %.2e' %
                      (rho_curr, Lambda.value(x_curr), Lambda.value(x_next)))
            learner.learn(r_next, gamma(x_next), gamma(x_curr), x_next, x_curr,
                          Lambda.value(x_next), Lambda.value(x_curr),
                          rho_curr, alpha, beta)
            learner.next()
            # advance both the state and its feature vector; without updating
            # s_curr the behavior policy would keep acting from the initial state
            s_curr, x_curr = s_next, x_next
    return value_trace
def train(self, data_loader, valid_loader, epochs, learning_rate, dropout_prob=None):
    losses_train = []
    losses_valid = []
    for epoch in range(epochs):
        print("epoch", epoch)

        # Training phase
        epoch_loss_train = 0
        for step, (x, y) in enumerate(data_loader):
            # x: [b, 28, 28] -> [b, 784], y: [b, 1] -> [b, 10]
            x = x.reshape(-1, 28 * 28)
            y = onehot(y, 10)
            nets, pred = self.forward(x, dropout_prob)
            loss = cross_entropy(y, pred)
            epoch_loss_train += loss
            grads = self.backward(nets, y, pred, dropout_prob)
            # SGD parameter update
            # self.params = optimizer.optimize(self.weight_num, self.params, grads, y.shape[0])
            self.params = self.optimizer.optimize(self.weight_num, self.params,
                                                  grads, y.shape[0])
            if step % 100 == 0:
                print("epoch {} training step {} loss {:.4f}".format(epoch, step, loss))
        losses_train.append(epoch_loss_train)
        print(epoch_loss_train)
        data_loader.restart()

        # Validation phase: forward pass only
        epoch_loss_valid = 0
        for step, (x, y) in enumerate(valid_loader):
            x = x.reshape(-1, 28 * 28)
            y = onehot(y, 10)
            nets, pred = self.forward(x, dropout_prob)
            loss = cross_entropy(y, pred)
            epoch_loss_valid += loss
            if step % 100 == 0:
                print("epoch {} validation step {} loss {:.4f}".format(epoch, step, loss))
        losses_valid.append(epoch_loss_valid)
        valid_loader.restart()

    his = {'train_loss': losses_train, 'valid_loss': losses_valid}
    return his
def Q_estimates(self, state, goal=None):
    # Generate Q values for all actions.
    if goal is None:
        goal = self.w
    else:
        goal = utils.onehot(goal, self.n_state)
    return np.matmul(self.M[:, state, :], goal)
def decode_for_classification(X_syn):
    bins = np.linspace(-1e-6, 1, 17, endpoint=True)
    for name, dtype in zip(X_syn.columns, X_syn.dtypes):
        if name in disc_features:
            feature_min = X_syn[name].min()
            feature_max = X_syn[name].max()
            X_syn[name] = (X_syn[name] - feature_min) / (feature_max - feature_min)
            X_syn[name] = pd.cut(X_syn[name], bins=bins, labels=range(16)).astype('int')
            X_syn[name] = X_syn[name].map(
                {key: i for i, key in enumerate(np.unique(X_syn[name]))})

    del X_syn['Education']

    ## Relabel the education number
    X_syn['Education-Num'] = X_syn['Education-Num'] + 1

    ## One-hot encode categorical features
    onehotteds = []
    for col in X_syn.columns:
        feature = X_syn[col]
        if (feature.dtype == 'int' or feature.dtype == 'O') and col not in onehotteds:
            if len(np.unique(feature)) > 2:
                X_syn.pop(col)
                onehotted = onehot(feature)
                X_syn = pd.concat([X_syn, onehotted], axis=1)
                onehotteds.append(col)

    X_syn['Sex'] = X_syn['Sex'].map({'Female': 0, 'Male': 1})
    return X_syn
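# Hedged sketch of the pandas-level onehot used by decode_for_classification
# above; the real utils helper may differ, but pd.get_dummies has the same
# effect (one indicator column per category, prefixed with the feature name):
import pandas as pd

def onehot(feature):
    return pd.get_dummies(feature, prefix=feature.name)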
def propagate_error(self, target):
    """Propagate the error backwards through the network (backpropagation)."""
    if not self.continuous:
        target = onehot(target, self.num_targets)
    for layer in reversed(self.layers):
        layer.propagate_error(target)
def checkOneHot(self):
    v = torch.LongTensor([1, 2, 1, 2, 0])
    v_length = torch.LongTensor([2, 3])
    v_onehot = utils.onehot(v, v_length, 4)
    target = torch.FloatTensor([[[0, 1, 0, 0],
                                 [0, 0, 1, 0],
                                 [0, 0, 0, 0]],
                                [[0, 1, 0, 0],
                                 [0, 0, 1, 0],
                                 [1, 0, 0, 0]]])
    assert target.equal(v_onehot)
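# A sketch of the padded-sequence onehot that checkOneHot above pins down
# (inferred from the test, not the original utils code): the flat label
# tensor `v` is split into sequences of the given lengths, padded to the
# longest sequence, and one-hot encoded; padding rows stay all-zero.
import torch

def onehot(v, v_length, n_classes):
    batch = v_length.numel()
    max_len = int(v_length.max())
    out = torch.zeros(batch, max_len, n_classes)
    offset = 0
    for b, length in enumerate(v_length.tolist()):
        for t in range(length):
            out[b, t, int(v[offset + t])] = 1.0
        offset += length
    return out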
def sq_error(self, sample):
    """Calculate the square error for a given sample."""
    prediction = self.predict(sample.features)
    if self.continuous:
        return (sample.label - prediction) ** 2
    else:
        target = onehot(sample.label, self.num_targets)
        return sum([(target[i] - prediction[i]) ** 2 for i in range(len(target))])
def update_sr(self, current_exp, next_exp):
    # SARSA TD learning rule: update M(s, s', a)
    s = current_exp[0]      # current state
    s_a = current_exp[1]    # chosen action
    s_ = current_exp[2]     # next state
    s_a_1 = next_exp[1]     # action chosen in the next state
    r = current_exp[3]      # reward in the current state
    d = current_exp[4]      # whether the current state is terminal
    I = utils.onehot(s, env.state_size)  # one-hot encoding of the current state
    if d:
        td_error = (I + self.gamma * utils.onehot(s_, env.state_size)
                    - self.M[s_a, s, :])
    else:
        td_error = (I + self.gamma * self.M[s_a_1, s_, :]
                    - self.M[s_a, s, :])
    self.M[s_a, s, :] += self.learning_rate * td_error
    return td_error
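# The SR update above assumes utils.onehot(i, size) returns a length-`size`
# numpy vector with 1.0 at index i. A minimal sketch (not the original utils):
import numpy as np

def onehot(index, size):
    vec = np.zeros(size)
    vec[index] = 1.0
    return vec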
def bprop(self, X, y):
    X = np.array([[float(x)] for x in X])
    self._gradoa = self._os - utils.onehot(self._m, y)
    self._gradb2 = self._gradoa
    self._gradw2 = np.dot(self._gradoa, np.transpose(self._hs)) + 2 * self.wd * self._w2
    self._gradhs = np.dot(np.transpose(self._w2), self._gradoa)
    self._gradha = self._gradhs * np.where(self._ha > 0, 1, 0)
    self._gradb1 = np.array(self._gradha)
    self._gradw1 = np.dot(self._gradha, np.transpose(X)) + 2 * self.wd * self._w1
    self._gradx = np.dot(np.transpose(self._w1), self._gradha)
def _load(self, filenames):
    images, labels = None, []
    for i, filename in enumerate(filenames):
        datafile = utils.unpickle(filename)
        if i == 0:
            images = datafile['data']
        else:
            images = np.append(images, datafile['data'], axis=0)
        labels.extend(datafile['labels'])
    print(images.shape, len(labels))
    return images, utils.onehot(np.asarray(labels), label_size=self.labels_size)
def train_bn(self, data_loader, valid_loader, epochs, learning_rate):
    losses_train = []
    losses_valid = []
    for epoch in range(epochs):
        print("epoch", epoch)
        epoch_loss_train = 0
        # Reset the global running mean and variance
        # Mini-batch training
        for step, (x, y) in enumerate(data_loader):
            # x: [b, 28, 28] -> [b, 784], y: [b, 1] -> [b, 10]
            x = x.reshape(-1, 28 * 28)
            y = onehot(y, 10)
            nets, pred = self.forward_bn(x, bn_mode='train')
            grads = self.backward_bn(nets, y, pred)
            self.optimizer.optimize(self.weight_num, self.params, grads, y.shape[0])
            loss = cross_entropy(y, pred)
            epoch_loss_train += loss
            if step % 100 == 0:
                print("epoch {} step {} loss {:.4f}".format(epoch, step, loss))
        losses_train.append(epoch_loss_train)
        data_loader.restart()
        print(epoch_loss_train)

        # Evaluation on the validation set
        epoch_loss_valid = 0
        for step, (x, y) in enumerate(valid_loader):
            x = x.reshape(-1, 28 * 28)
            y = onehot(y, 10)
            nets, pred = self.forward_bn(x, bn_mode='test')
            loss = cross_entropy(y, pred)
            epoch_loss_valid += loss
            if step % 100 == 0:
                print("epoch {} step {} loss {:.4f}".format(epoch, step, loss))
        losses_valid.append(epoch_loss_valid)
        valid_loader.restart()

    his = {'train_loss': losses_train, 'valid_loss': losses_valid}
    return his
def gen_test(self):
    x_batch, y_batch = self._batch_init()
    i = 0
    for idx in self._idcs_test:
        x_batch[i] = self._test[idx]
        y_batch[i] = onehot(self._test_label[idx], self._num_classes)
        i += 1
        if i >= self._batch_size:
            yield i, x_batch, y_batch
            x_batch, y_batch = self._batch_init()
            i = 0
    if i != 0:
        yield i, x_batch, y_batch
def phi8(x, a):
    f = [[x[0]]]
    for aa in range(nactions):
        if aa == a:
            y = np.array([x[1 + i] for i in range(nactions)])
            f += [y]
        else:
            f += [np.zeros((nactions,))]
    y3 = int(x[1 + a] + 1 > env.max_queue_length)
    f.append([y3])
    f.append(onehot(nactions, a))
    return np.concatenate(f)
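# The phi* feature maps call onehot(nactions, a) with the size first, the
# opposite argument order from the other snippets here. A minimal sketch
# consistent with that call (an assumption, not the original helper):
import numpy as np

def onehot(n, i):
    v = np.zeros(n)
    v[i] = 1.0
    return v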
def goal(self):
    if self.obs_mode == "onehot":
        return utils.onehot(self.goal_pos[0] * self.grid_size + self.goal_pos[1],
                            self.state_size)
    if self.obs_mode == "twohot":
        return self.twohot(self.goal_pos, self.grid_size)
    if self.obs_mode == "geometric":
        return (2 * np.array(self.goal_pos) / (self.grid_size - 1)) - 1
    if self.obs_mode == "visual":
        return env.grid
    if self.obs_mode == "index":
        return self.goal_pos[0] * self.grid_size + self.goal_pos[1]
def __getitem__(self, index):
    if isinstance(index, torch.Tensor):
        index = index.item()
    line = self.ids.iloc[index]
    image = cv2.imread(line['path'])
    image = self.transform(image=image)['image']
    label = np.array([self.mapping[line['label']]])
    label = ToTensor()(image=label)['image']
    label = onehot(label, self.num_classes)
    return {'image': image, 'mask': label}
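# Sketch of the per-sample one-hot assumed by the dataset above: a tensor
# holding a single class index becomes a float vector of length num_classes.
# The names and behavior here are assumptions inferred from the call site.
import torch

def onehot(label, num_classes):
    y = torch.zeros(num_classes)
    y[int(label)] = 1.0
    return y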
def bprop(self, X, y):
    # each column of X is one input
    X = np.array([np.array([float(x) for x in j]) for j in X])
    X = X.transpose()
    self._gradoa = self._os - utils.onehot(self._m, y)
    self._gradb2 = self._gradoa
    # gradw2 is the sum of the gradients over the individual points
    self._gradw2 = np.dot(self._gradoa, np.transpose(self._hs)) + 2 * self.wd * self._w2
    self._gradhs = np.dot(np.transpose(self._w2), self._gradoa)
    self._gradha = self._gradhs * np.where(self._ha > 0, 1, 0)
    self._gradb1 = np.array(self._gradha)
    # gradw1 is likewise summed over the individual points
    self._gradw1 = np.dot(self._gradha, np.transpose(X)) + 2 * self.wd * self._w1
    self._gradx = np.dot(np.transpose(self._w1), self._gradha)
def __getitem__(self, index):
    if isinstance(index, torch.Tensor):
        index = index.item()
    line_1 = self.ids.iloc[index]
    label_1 = np.array([self.mapping[line_1['label']]])
    image_1 = cv2.imread(line_1['path'])
    if self.mixup and np.random.uniform(0, 1) > self.mixup_p:
        while True:
            # draw indexes with self.sampler until the class differs
            idx = next(iter(self.sampler)).item()
            line_2 = self.ids.iloc[idx]
            label_2 = np.array([self.mapping[line_2['label']]])
            if label_1 != label_2:
                break
        image_2 = cv2.imread(line_2['path'])
        image_1 = self.transform(image=image_1)['image']
        image_2 = self.transform(image=image_2)['image']
        label_1 = ToTensor()(image=label_1)['image']
        label_2 = ToTensor()(image=label_2)['image']
        label_1 = onehot(label_1, self.num_classes)
        label_2 = onehot(label_2, self.num_classes)
        # mixup: convex combination of the two images and their one-hot labels
        _lambda = np.random.beta(self.alpha, self.alpha)
        images = _lambda * image_1 + (1 - _lambda) * image_2
        labels = _lambda * label_1 + (1 - _lambda) * label_2
    else:
        images = self.transform(image=image_1)['image']
        label_1 = ToTensor()(image=label_1)['image']
        labels = onehot(label_1, self.num_classes)
    return {'image': images, 'mask': labels}
def state_to_obs(self, state):
    if self.obs_mode == "onehot":
        point = self.state_to_point(state)
        return utils.onehot(point[0] * self.grid_size + point[1], self.state_size)
    if self.obs_mode == "twohot":
        point = self.state_to_point(state)
        return self.twohot(point, self.grid_size)
    if self.obs_mode == "geometric":
        point = self.state_to_point(state)
        return (2 * np.array(point) / (self.grid_size - 1)) - 1
    if self.obs_mode == "visual":
        return self.state_to_grid(state)
    if self.obs_mode == "index":
        return state
def phi2(x, a):
    f = [[x[0]]]
    for aa in range(nactions):
        if aa == a:
            if x[1 + a] > 0:
                y = [float(x[1 + i]) / (x[1 + a] + x[1 + i])
                     for i in range(nactions) if not i == a]
            else:
                y = np.ones((nactions - 1,))
            y2 = [x[1 + nactions + i] - x[1 + nactions + a]
                  for i in range(nactions) if not i == a]
            f += [y, y2]
        else:
            f.append(np.zeros((2 * nactions - 2,)))
    y3 = int(x[1 + a] + 1 > env.max_queue_length)
    f.append([y3])
    f.append(onehot(nactions, a))
    return np.concatenate(f)
def gen_train(self):
    x_batch, y_batch = self._batch_init()
    iteration = 0
    i = 0
    while iteration < self._num_iterations:
        # shuffle all batches
        self._shuffle_train()
        for idx in self._idcs_train:
            # extract data from dict
            x_batch[i], y_batch[i] = random_flip(
                self._train[idx],
                onehot(self._train_label[idx], self._num_classes))
            i += 1
            if i >= self._batch_size:
                yield x_batch, y_batch
                x_batch, y_batch = self._batch_init()
                i = 0
        iteration += 1
def _get_bp_indexes_labranchor(self, soi):
    """
    Get indexes of branch point regions in given sequences.

    :param soi: batch of sequences of interest for introns (intron-3..intron+6)
    :return: array of predicted bp indexes
    """
    encoded = [onehot(str(seq)[self.acc_i - 70:self.acc_i])
               for seq in np.nditer(soi)]
    labr_in = np.stack(encoded, axis=0)
    out = self.labranchor.predict_on_batch(labr_in)
    # for each row, pick the base with max branchpoint probability, and get its index
    max_indexes = np.apply_along_axis(
        lambda x: self.acc_i - 70 + np.argmax(x), axis=1, arr=out)
    # self.write_bp(max_indexes)
    return max_indexes
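# Sketch of a DNA sequence one-hot encoder matching the call above, which
# encodes a 70-base string for LaBranchoR. Column order A, C, G, T and
# all-zero rows for unknown bases are assumptions, not the original helper.
import numpy as np

_BASE_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def onehot(seq):
    out = np.zeros((len(seq), 4))
    for i, base in enumerate(seq.upper()):
        j = _BASE_INDEX.get(base)
        if j is not None:
            out[i, j] = 1.0
    return out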
def predict(self, data_loader, bn=False):
    labels = []
    pred = []
    losses = 0
    for (x, y) in data_loader:
        x = x.reshape(-1, 28 * 28)
        y = onehot(y, 10)
        if bn:
            _, out = self.forward_bn(x, 'test')
        else:
            _, out = self.forward(x)
        loss = cross_entropy(y, out)
        losses += loss
        out = list(np.argmax(out, axis=-1).flatten())
        y = list(np.argmax(y, axis=1).flatten())
        labels += y
        pred += out
    return np.array(pred).astype('int'), np.array(labels).astype('int')
def phi3(x, a):
    f = [[x[0]]]
    for aa in range(nactions):
        if aa == a:
            y = np.array([np.tanh(float(x[1 + a]) / (x[1 + i])) if x[1 + i] > 0 else 1
                          for i in range(nactions) if not i == a])
            if x[1 + a] > 0:
                y2 = np.array([np.tanh(float(x[1 + i]) / (x[1 + a]))
                               for i in range(nactions) if not i == a])
            else:
                y2 = np.ones((nactions - 1,))
            # y2 = [x[1 + nactions + i] - x[1 + nactions + a] for i in range(nactions) if not i == a]
            f += [y, 1 - y, y2, 1 - y2]
        else:
            f.append(np.zeros((4 * (nactions - 1),)))
    y3 = int(x[1 + a] + 1 > env.max_queue_length)
    f.append([y3])
    f.append(onehot(nactions, a))
    return np.concatenate(f)
def phi4(x, a):
    f = [[x[0]]]
    for aa in range(nactions):
        if not aa == a:
            if x[1 + a] == 0:
                f += [np.zeros((2,))]
            else:
                if x[1 + aa] == 0:
                    f += [[1, 0]]
                else:
                    frac = float(x[1 + a] / x[1 + aa])
                    f += [[0, frac]]
        else:
            f += [np.zeros((2,))]
    y3 = int(x[1 + a] + 1 > env.max_queue_length)
    f.append(onehot(nactions, a))
    f.append([y3])
    return np.concatenate(f)
def phi6(x, a):
    f = [[x[0]]]
    for aa in range(nactions):
        if not aa == a:
            if x[1 + a] == 0:
                f += [np.zeros((12,))]
            else:
                if x[1 + aa] == 0:
                    f += [[1], np.zeros((11,))]
                else:
                    frac = float(x[1 + a] / x[1 + aa])
                    y = np.array([int(frac > j)
                                  for j in (0.1, 0.2, 0.25, 1.0 / 3, 0.5,
                                            1, 2, 3, 4, 5, 10)])
                    f += [[0], y]
        else:
            f += [np.zeros((12,))]
    y3 = int(x[1 + a] + 1 > env.max_queue_length)
    f.append(onehot(nactions, a))
    f.append([y3])
    return np.concatenate(f)
def decode_for_classification(X_syn):
    ## Decode features
    for col in X_syn.columns:
        if col not in disc_features:
            if data[col].dtype == 'float' and col != 'Education-Num':
                min_value = maps[col][0]
                max_value = maps[col][1]
                X_syn[col] = X_syn[col] * (max_value - min_value) + min_value
            else:
                X_syn[col] = X_syn[col].map(maps[col][1])
        if col == 'Education-Num':
            X_syn[col] = X_syn[col].map(maps[col][1])

    ## Decode discretized features
    bins = np.linspace(-1e-6, 1, 17, endpoint=True)
    for col in disc_features:
        if col != 'Education-Num':
            discr_feature = pd.cut(data[col], bins=bins, labels=range(16)).astype('int')
            decode_map = {i: u for i, u in enumerate(np.unique(discr_feature))}
            X_syn[col] = X_syn[col].map(decode_map)

    ## One-hot encode categorical features
    from utils import onehot
    onehotteds = []
    for col in X_syn.columns:
        feature = X_syn[col]
        if (feature.dtype == 'int' or feature.dtype == 'O') and col not in onehotteds:
            if len(np.unique(feature)) > 2:
                X_syn.pop(col)
                onehotted = onehot(feature)
                X_syn = pd.concat([X_syn, onehotted], axis=1)
                onehotteds.append(col)

    ## Reorder columns
    X_syn = X_syn[X_test.columns]
    return X_syn
def processing_data(infile, labelfile, outfile, vocab_file, stopwords_file):
    print('Loading stopwords...')
    stopwords = get_stopwords(stopwords_file)

    print('Loading data...')
    data = pd.read_csv(infile)

    print('Saving labels')
    with open(labelfile, 'w') as f:
        for label in data.columns[2:]:
            f.write(label + '\n')

    # Split each sentence into words
    print('Splitting content')
    contents = data['content'].tolist()
    seg_contents = segmentData(contents, stopwords)

    if not os.path.exists(vocab_file):
        print('Creating vocabulary...')
        create_vocab(seg_contents, vocab_file, 50000)

    print('Loading vocabulary...')
    w2i, _ = read_vocab(vocab_file)  # word2id

    print('Tokenize...')
    token_contents = [tokenizer(c, w2i) for c in seg_contents]
    data['content'] = token_contents

    # Convert the labels to one-hot form
    print('One-hot label')
    for col in data.columns[2:]:
        label = data[col].tolist()
        onehot_label = [onehot(l) for l in label]
        data[col] = onehot_label

    print('Saving...')
    data[data.columns[1:]].to_csv(outfile, index=False)
def generate_seqs(images, data_desc, onehot_lab=True):
    idx = []
    runn_idx = 0
    img_seqs = []
    labels = []
    label = None
    tid = 0
    for _, row in data_desc.iterrows():
        if tid != row['trackid']:
            if len(idx) != 0:
                idx = list(map(lambda x: x + runn_idx, idx))
                img_seqs.append(np.array(images[idx]))
                labels.append(label)
                runn_idx = runn_idx + len(idx)
            tid = row['trackid']
            idx = [row['framenr'] - 2]  # TODO
        else:
            idx.append(row['framenr'] - 2)
        label = row['class']
    if onehot_lab:
        labels = onehot(labels, label_dict={'boat': 1, 'nature': 0})
    return img_seqs, labels
softmax = GumbelSoftmax()

# train adversarially
try:
    while epoch < num_epochs:
        train_iter = iter(train_data)
        temperature = max_temperature ** ((epoch + 1) / num_epochs)
        g_lr_scheduler.step()
        d_lr_scheduler.step()
        for n_batch, batch in enumerate(train_iter):
            real_data = batch.text.to(device)
            N = real_data.size(0)
            num_steps = real_data.size(1)

            # 1. Train Discriminator
            # Label smoothing: soften the one-hot targets before the Gumbel softmax
            real_data_onehot = onehot(real_data, num_classes)
            real_data_onehot[real_data_onehot == 1] = 0.7
            real_data_onehot[real_data_onehot == 0] = (1.0 - 0.7) / (num_classes - 1.0)
            real_data_onehot = softmax(real_data_onehot, temperature)

            # Generate fake data and detach
            # (so gradients are not calculated for the generator)
            noise_tensor = sample_noise(N, noise_size, device)
            with torch.no_grad():
                fake_data = generator(z=noise_tensor, num_steps=num_steps,
                                      temperature=temperature).detach()

            # Train D
            d_error = train_discriminator(discriminator, real_data_onehot,
                                          fake_data, d_optimizer)

            # 2. Train Generator every 'gen_train_freq' steps
            if global_step % gen_train_freq == 0:
                for _ in range(gen_steps):
import numpy as np
from confusionmatrix import ConfusionMatrix
from layers import *
from utils import onehot

# Load MNIST data and convert the vector representation to one-hot
data = np.load('mnist.npz')
num_classes = 10
x_train = data['X_train']
targets_train = data['y_train']
targets_train = onehot(targets_train, num_classes)

num_samples, num_inputs = x_train.shape
num_hidden_units = 100
batch_size = 200
num_epochs = 50
learning_rate = 0.001
num_samples = x_train.shape[0]
num_batches = num_samples // batch_size

ffn = FeedforwardNetwork()
ffn.add(LinearLayer(num_inputs, num_hidden_units))
ffn.add(ReluActivationLayer())
ffn.add(LinearLayer(num_hidden_units, num_classes))
ffn.add(SoftmaxActivationLayer())
losslayer = CrossEntropyLoss()
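# Hedged sketch of the batched numpy onehot the script above imports from
# utils (an assumption consistent with the call, not the original module):
# an integer label vector of shape (N,) becomes an (N, num_classes) matrix.
def onehot(t, num_classes):
    out = np.zeros((t.shape[0], num_classes))
    out[np.arange(t.shape[0]), t.astype(int)] = 1
    return out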