def log_softmax_likelihood(yhat_linear, y):
    """
    Log-likelihood of output yhat_linear, given the label y

    yhat_linear, y: ndarray
    """
    return nd.nansum(y * nd.log_softmax(yhat_linear), axis=0, exclude=True)
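# A minimal usage sketch for the function above (the tensors and shapes
# below are illustrative assumptions, not from the source). With a one-hot
# label, the per-sample result is the log-probability the model assigns to
# the true class.
from mxnet import nd

logits = nd.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2, 0.3]])            # (batch, classes) raw scores
one_hot = nd.one_hot(nd.array([0, 2]), 3)       # true classes 0 and 2
print(log_softmax_likelihood(logits, one_hot))  # per-sample log P(y | logits)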
def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1:
        # no positive samples found, return dummy losses
        return nd.zeros((1,)), nd.zeros((1,)), nd.zeros((1,))

    # compute element-wise cross entropy loss and sort, then perform
    # negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < (
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out entries that are neither positive nor hard negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(nd.sum(cls_loss, axis=0, exclude=True) / num_pos_all)

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(nd.sum(box_loss, axis=0, exclude=True) / num_pos_all)
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])

    return sum_losses, cls_losses, box_losses
def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    pos_ct = [ct > 0 for ct in cls_target]
    num_pos = [ct.sum() for ct in pos_ct]
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1 and self._min_hard_negatives < 1:
        # no positive samples and no hard negatives, return dummy losses
        cls_losses = [nd.sum(cp * 0) for cp in cls_pred]
        box_losses = [nd.sum(bp * 0) for bp in box_pred]
        sum_losses = [nd.sum(cp * 0) + nd.sum(bp * 0)
                      for cp, bp in zip(cls_pred, box_pred)]
        return sum_losses, cls_losses, box_losses

    # compute element-wise cross entropy loss and sort, then perform
    # negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < nd.maximum(
            self._min_hard_negatives,
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out entries that are neither positive nor hard negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(
            nd.sum(cls_loss, axis=0, exclude=True) / max(1., num_pos_all))

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(
            nd.sum(box_loss, axis=0, exclude=True) / max(1., num_pos_all))
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])

    return sum_losses, cls_losses, box_losses
def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])

    # synchronize the positive-sample count across different machines
    if self._distributed:
        num_pos_out = nd.zeros(1, mx.cpu())
        num_pos_in = nd.zeros(1, mx.cpu()) + num_pos_all
        # allreduce only supports pushpull
        if 'allreduce' in self._kv_store_type:
            self._kv_store.pushpull(self._num_pos_key, num_pos_in, num_pos_out)
        else:
            self._kv_store.push(self._num_pos_key, num_pos_in)
            self._kv_store.pull(self._num_pos_key, out=num_pos_out)
        num_pos_all = num_pos_out.asscalar()

    if num_pos_all < 1:
        # no positive samples found, return dummy losses
        return nd.zeros((1,)), nd.zeros((1,)), nd.zeros((1,))

    # compute element-wise cross entropy loss and sort, then perform
    # negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < (
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out entries that are neither positive nor hard negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(nd.sum(cls_loss, axis=0, exclude=True) / num_pos_all)

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(nd.sum(box_loss, axis=0, exclude=True) / num_pos_all)
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])

    return sum_losses, cls_losses, box_losses
def forward(self, context):
    shared = self.hidden(context)
    if self.sm_out:
        preds = nd.softmax(
            nd.stack(*[l(shared)[:, 1] for l in self.actions]).T, axis=1)
    elif self.log_sm_out:
        preds = nd.log_softmax(
            nd.stack(*[l(shared)[:, 1] for l in self.actions]).T, axis=1)
    else:
        preds = nd.stack(*[l(shared)[:, 1] for l in self.actions]).T
    return preds
def retrain_enc(self, l2_alpha=0.1):
    docs = self.data.get_documents(key='train')
    with autograd.record():
        ### reconstruction phase ###
        y_onehot_u = self.Enc(docs)
        y_onehot_u_softmax = nd.softmax(y_onehot_u)
        x_reconstruction_u = self.Dec(y_onehot_u_softmax)

        logits = nd.log_softmax(x_reconstruction_u)
        loss_reconstruction = nd.mean(nd.sum(-docs * logits, axis=1))
        loss_reconstruction = loss_reconstruction + \
            l2_alpha * nd.mean(nd.norm(y_onehot_u, ord=1, axis=1))

    loss_reconstruction.backward()
    self.optimizer_enc.step(1)
    return loss_reconstruction.asscalar()
def total_loss(output, params, mus, sigmas, label_one_hot, log_prior):
    # `num_batches` and `log_gaussian` are expected to be defined in the
    # enclosing scope
    log_likelihood_s = nd.sum(
        nd.nansum(label_one_hot * nd.log_softmax(output), axis=0, exclude=True))

    log_prior_pre_sum = []
    for param in params:
        log_prior_pre_sum.append(nd.sum(log_prior(param)))
    log_prior_sum = sum(log_prior_pre_sum)

    log_var_posterior_pre_sum = []
    for i in range(len(params)):
        log_var_posterior_pre_sum.append(
            nd.sum(log_gaussian(params[i], mus[i], sigmas[i])))
    log_var_posterior_sum = sum(log_var_posterior_pre_sum)

    return 1.0 / num_batches * (log_var_posterior_sum - log_prior_sum) \
        - log_likelihood_s
def eval_step(data_tr, data_te, data_type="valid"):
    running_loss = 0.0
    eval_idxlist = list(range(data_tr.shape[0]))
    eval_N = data_tr.shape[0]
    eval_steps = len(range(0, eval_N, args.batch_size))
    n100_list, r20_list, r50_list = [], [], []
    with trange(eval_steps) as t:
        for batch_idx, start_idx in zip(t, range(0, eval_N, args.batch_size)):
            t.set_description(data_type)
            end_idx = min(start_idx + args.batch_size, eval_N)
            X_tr = data_tr[eval_idxlist[start_idx:end_idx]]
            X_te = data_te[eval_idxlist[start_idx:end_idx]]
            X_tr_inp = nd.array(X_tr.toarray()).as_in_context(ctx)
            with autograd.predict_mode():
                if model.__class__.__name__ == "MultiVAE":
                    X_out, mu, logvar = model(X_tr_inp)
                    loss = vae_loss_fn(X_tr_inp, X_out, mu, logvar,
                                       train_step.anneal)
                elif model.__class__.__name__ == "MultiDAE":
                    X_out = model(X_tr_inp)
                    loss = -nd.mean(nd.sum(nd.log_softmax(X_out) * X_tr_inp, -1))
            running_loss += loss.asscalar()
            avg_loss = running_loss / (batch_idx + 1)
            # Exclude examples from training set
            X_out = X_out.asnumpy()
            X_out[X_tr.nonzero()] = -np.inf
            n100 = NDCG_binary_at_k_batch(X_out, X_te, k=100)
            r20 = Recall_at_k_batch(X_out, X_te, k=20)
            r50 = Recall_at_k_batch(X_out, X_te, k=50)
            n100_list.append(n100)
            r20_list.append(r20)
            r50_list.append(r50)
            t.set_postfix(loss=avg_loss)
    n100_list = np.concatenate(n100_list)
    r20_list = np.concatenate(r20_list)
    r50_list = np.concatenate(r50_list)
    return avg_loss, np.mean(n100_list), np.mean(r20_list), np.mean(r50_list)
def get_loss(pred, label, trg_vocab_size, trg_pad, epsilon=0.1):
    labelprob = nd.one_hot(label, trg_vocab_size)
    # Label smoothing
    smoothed_labelprob = (1 - epsilon) * labelprob + epsilon / trg_vocab_size
    logprob = nd.log_softmax(pred)
    loss = -nd.sum(logprob * smoothed_labelprob, axis=-1, keepdims=False)
    # mask PAD
    mask = label != trg_pad
    loss = loss * mask
    # batch_axis = 0
    loss = nd.mean(loss, axis=0, exclude=True)
    return loss
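# Hedged smoke test for get_loss (the toy shapes and PAD id below are
# assumptions): a (batch, time, vocab) logit tensor with PAD positions
# masked out of the smoothed cross entropy.
from mxnet import nd

pred = nd.random.normal(shape=(2, 4, 6))  # (batch, time, vocab) logits
label = nd.array([[1, 4, 2, 0],
                  [3, 5, 0, 0]])          # trailing 0s are PAD
print(get_loss(pred, label, trg_vocab_size=6, trg_pad=0, epsilon=0.1))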
def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1 and self._min_hard_negatives < 1:
        # no positive samples and no hard negatives, return dummy losses
        cls_losses = [nd.sum(cp * 0) for cp in cls_pred]
        box_losses = [nd.sum(bp * 0) for bp in box_pred]
        sum_losses = [nd.sum(cp * 0) + nd.sum(bp * 0)
                      for cp, bp in zip(cls_pred, box_pred)]
        return sum_losses, cls_losses, box_losses

    # compute element-wise cross entropy loss and sort, then perform
    # negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < nd.maximum(
            self._min_hard_negatives,
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out entries that are neither positive nor hard negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(
            nd.sum(cls_loss, axis=0, exclude=True) / max(1., num_pos_all))

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(
            nd.sum(box_loss, axis=0, exclude=True) / max(1., num_pos_all))
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])

    return sum_losses, cls_losses, box_losses
def decode(self, x):
    batch_size = x.shape[0]
    state = self.init_hidden(batch_size, self.ctx)
    outputs_pgm = []
    outputs_param = []
    for i in range(self.seq_length):
        if i == 0:
            xt = x
        else:
            # greedy decoding: feed back the most probable last token
            prob_pre = nd.exp(outputs_pgm[-1])
            it1 = nd.argmax(prob_pre, axis=1)
            xt = self.pgm_embed(it1)

        output, state = self.core(xt.expand_dims(axis=0), state)

        pgm_feat1 = nd.relu(self.logit1(output.squeeze(0)))
        pgm_feat2 = self.logit2(pgm_feat1)
        pgm_score = nd.log_softmax(pgm_feat2, axis=1)

        trans_prob = nd.softmax(pgm_feat2, axis=1).detach()
        param_feat1 = nd.relu(self.regress1(output.squeeze(0)))
        param_feat2 = nd.concat(trans_prob, param_feat1, dim=1)
        param_score = self.regress2(param_feat2)
        param_score = param_score.reshape(batch_size, self.vocab_size + 1,
                                          self.max_param)

        index = nd.argmax(trans_prob, axis=1)
        index = index.expand_dims(axis=1).expand_dims(axis=2).broadcast_to(
            shape=(batch_size, 1, self.max_param)).detach()
        # param_score = nd.pick(param_score, index, 1)

        outputs_pgm.append(pgm_score)
        outputs_param.append(param_score)

    outputs_pgm = [_.expand_dims(axis=1) for _ in outputs_pgm]
    outputs_param = [_.expand_dims(axis=1) for _ in outputs_param]
    pgms = outputs_pgm[0]
    params = outputs_param[0]
    for i in range(1, len(outputs_pgm)):
        pgms = nd.concat(pgms, outputs_pgm[i], dim=1)
        params = nd.concat(params, outputs_param[i], dim=1)
    return [pgms, params]
def get_smoothed_loss(pred, label, num_classes, trg_pad, smooth_alpha=0.1):
    pred = nd.maximum(pred, 1e-10)
    logprob = nd.log_softmax(pred)
    # cross entropy
    ce = -nd.pick(logprob, label)
    per_class_gain = smooth_alpha / (num_classes - 1)
    # loss = (1 - smooth_alpha - per_class_gain) * ce - per_class_gain * sum(logprob)
    loss = (1 - smooth_alpha - per_class_gain) * ce - nd.sum(
        per_class_gain * logprob, axis=-1, keepdims=False)
    # mask PAD
    mask = label != trg_pad
    loss = loss * mask
    loss = nd.sum(loss) / mask.sum()
    return loss
def train_step(model, optimizer, data, epoch):
    running_loss = 0.0
    global update_count
    N = data.shape[0]
    idxlist = list(range(N))
    np.random.shuffle(idxlist)
    training_steps = len(range(0, N, args.batch_size))
    with trange(training_steps) as t:
        for batch_idx, start_idx in zip(t, range(0, N, args.batch_size)):
            t.set_description("epoch: {}".format(epoch + 1))
            end_idx = min(start_idx + args.batch_size, N)
            X_inp = data[idxlist[start_idx:end_idx]]
            X_inp = nd.array(X_inp.toarray()).as_in_context(ctx)
            if args.constant_anneal:
                anneal = args.anneal_cap
            elif args.anneal_epochs is not None:
                anneal = min(
                    args.anneal_cap,
                    args.anneal_cap * (update_count / total_anneal_steps),
                )
            else:
                anneal = min(args.anneal_cap, update_count / total_anneal_steps)
            update_count += 1
            with autograd.record():
                if model.__class__.__name__ == "MultiVAE":
                    X_out, mu, logvar = model(X_inp)
                    loss = vae_loss_fn(X_inp, X_out, mu, logvar, anneal)
                    # stash the latest anneal value for eval_step
                    train_step.anneal = anneal
                elif model.__class__.__name__ == "MultiDAE":
                    X_out = model(X_inp)
                    loss = -nd.mean(nd.sum(nd.log_softmax(X_out) * X_inp, -1))
            loss.backward()
            # NOTE: updates go through the enclosing scope's `trainer`;
            # the `optimizer` argument is unused here
            trainer.step(X_inp.shape[0])
            running_loss += loss.asscalar()
            avg_loss = running_loss / (batch_idx + 1)
            t.set_postfix(loss=avg_loss)
def softmax_cross_entropy(yhat_linear, y):
    return -nd.nansum(y * nd.log_softmax(yhat_linear), axis=0, exclude=True)
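# Sanity check (an assumption of mine, not part of the source): with dense
# one-hot labels this hand-rolled cross entropy should agree with Gluon's
# built-in SoftmaxCrossEntropyLoss up to numerical precision.
from mxnet import nd, gluon

yhat_linear = nd.random.normal(shape=(4, 10))
y = nd.one_hot(nd.array([1, 3, 5, 7]), 10)
manual = softmax_cross_entropy(yhat_linear, y)
builtin = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)(yhat_linear, y)
print(manual, builtin)  # expected to match elementwise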
def unlabeled_train_op_mmd_combine(self, update_enc=True):
    '''
    Trains the MMD model
    '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx
    eps = 1e-10

    # Retrieve data
    docs = self.data.get_documents(key='train')
    if self.args['use_kd']:
        split_on = docs.shape[1] // 2
        docs, bert_logits = docs[:, :split_on], docs[:, split_on:]
        t = self.args['kd_softmax_temp']
        kd_docs = nd.softmax(bert_logits / t) * nd.sum(docs, axis=1, keepdims=True)
        kd_docs = kd_docs * (kd_docs > self.args['kd_min_count'])

    y_true = np.random.dirichlet(
        np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
    y_true = nd.array(y_true, ctx=model_ctx)

    with autograd.record():
        ### reconstruction phase ###
        y_onehot_u = self.Enc(docs)
        y_onehot_u_softmax = nd.softmax(y_onehot_u)
        if self.args['latent_noise'] > 0:
            y_noise = np.random.dirichlet(
                np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
            y_noise = nd.array(y_noise, ctx=model_ctx)
            y_onehot_u_softmax = (1 - self.args['latent_noise']) * y_onehot_u_softmax \
                + self.args['latent_noise'] * y_noise
        x_reconstruction_u = self.Dec(y_onehot_u_softmax)

        if self.args['use_kd']:
            kd_logits = nd.log_softmax(x_reconstruction_u / t)
            logits = nd.log_softmax(x_reconstruction_u)
            kd_loss_reconstruction = nd.mean(nd.sum(-kd_docs * kd_logits, axis=1))
            loss_reconstruction = nd.mean(nd.sum(-docs * logits, axis=1))
            loss_total = self.args['recon_alpha'] * (
                self.args['kd_loss_alpha'] * t * t * kd_loss_reconstruction
                + (1 - self.args['kd_loss_alpha']) * loss_reconstruction)
        else:
            logits = nd.log_softmax(x_reconstruction_u)
            loss_reconstruction = nd.mean(nd.sum(-docs * logits, axis=1))
            loss_total = loss_reconstruction * self.args['recon_alpha']

        ### mmd phase ###
        if self.args['adverse']:
            y_fake = self.Enc(docs)
            y_fake = nd.softmax(y_fake)
            loss_mmd = mmd_loss(y_true, y_fake, ctx_model=model_ctx,
                                t=self.args['kernel_alpha'])
            loss_total = loss_total + loss_mmd
        if self.args['l2_alpha'] > 0:
            loss_total = loss_total + self.args['l2_alpha'] * nd.mean(
                nd.sum(nd.square(y_onehot_u), axis=1))

    loss_total.backward()
    self.optimizer_enc.step(1)
    self.optimizer_dec.step(1)

    latent_max = nd.zeros(self.args['ndim_y'], ctx=model_ctx)
    for max_ind in nd.argmax(y_onehot_u, axis=1):
        latent_max[max_ind] += 1.0
    latent_max /= batch_size
    latent_entropy = nd.mean(
        nd.sum(-y_onehot_u_softmax * nd.log(y_onehot_u_softmax + eps), axis=1))
    latent_v = nd.mean(y_onehot_u_softmax, axis=0)
    dirich_entropy = nd.mean(nd.sum(-y_true * nd.log(y_true + eps), axis=1))

    if self.args['adverse']:
        loss_mmd_return = loss_mmd.asscalar()
    else:
        loss_mmd_return = 0.0
    return (nd.mean(loss_reconstruction).asscalar(), loss_mmd_return,
            latent_max.asnumpy(), latent_entropy.asscalar(),
            latent_v.asnumpy(), dirich_entropy.asscalar())
def test_op(self, num_samples=None, num_epochs=None, reset=True, dataset='test'):
    '''
    Evaluates the model using num_samples.

    Args
    ----
    num_samples: integer, default None
        The number of samples to evaluate on. This is converted to
        evaluating on (num_samples // batch_size) minibatches.
    num_epochs: integer, default None
        The number of epochs to evaluate on. This is used if num_samples
        is not specified. If neither is specified, defaults to 1 epoch.
    reset: bool, default True
        Whether to reset the test data index to 0 before iterating
        through and evaluating on minibatches.
    dataset: string, default 'test'
        Which dataset to evaluate on: 'valid' or 'test'.

    Returns
    -------
    u_loss: float
        The reconstruction loss on the unlabeled data ('NA' if there is
        no unlabeled split).
    l_loss: float
        The classification loss on the labeled data.
    l_acc: float
        The classification accuracy on the labeled data.
    '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx

    if num_samples is None and num_epochs is None:
        # assume full dataset evaluation
        num_epochs = 1

    if reset:
        # Reset data to index zero
        if self.data.data[dataset] is not None:
            self.data.force_reset_data(dataset)
        if self.data.data[dataset + '_with_labels'] is not None:
            self.data.force_reset_data(dataset + '_with_labels')

    # Unlabeled data
    u_loss = 'NA'
    u_eval = []
    if self.data.data[dataset] is not None:
        u_loss = 0
        if num_samples is None:
            num_samps = self.data.data[dataset].shape[0] * num_epochs
        else:
            num_samps = num_samples
        batches = int(np.ceil(num_samps / self.args['batch_size']))
        batch_iter = range(batches)
        if batches > 1:
            batch_iter = tqdm(batch_iter, desc='unlabeled')
        for batch in batch_iter:
            # 1. Retrieve data
            docs = self.data.get_documents(key=dataset)
            if self.args['use_kd']:
                split_on = docs.shape[1] // 2
                docs, bert_logits = docs[:, :split_on], docs[:, split_on:]
                # TODO: below is not used, but also may not be necessary
                t = self.args['kd_softmax_temp']
                kd_docs = nd.softmax(bert_logits / t) * nd.sum(docs, axis=1, keepdims=True)
            # 2. Compute loss
            y_u = self.Enc(docs)
            y_onehot_u_softmax = nd.softmax(y_u)
            x_reconstruction_u = self.Dec(y_onehot_u_softmax)
            logits = nd.log_softmax(x_reconstruction_u)
            loss_recon_unlabel = nd.sum(-docs * logits, axis=1)
            # 3. Convert to numpy
            u_loss += nd.mean(loss_recon_unlabel).asscalar()
        u_loss /= batches

    # Labeled data
    l_loss = 0.0
    l_acc = 0.0
    if self.data.data[dataset + '_with_labels'] is not None:
        l_loss = 0
        if num_samples is None:
            num_samps = self.data.data[dataset + '_with_labels'].shape[0] * num_epochs
        else:
            num_samps = num_samples
        batches = int(np.ceil(num_samps / self.args['batch_size']))
        batch_iter = range(batches)
        if batches > 1:
            batch_iter = tqdm(batch_iter, desc='labeled')
        softmaxCEL = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
        for batch in batch_iter:
            # 1. Retrieve data
            labeled_docs, labels = self.data.get_documents(
                key=dataset + '_with_labels', split_on=self.data.data_dim)
            # 2. Compute loss (encode the labeled documents, not the last
            #    unlabeled batch)
            y_u = self.Enc(labeled_docs)
            y_onehot_u_softmax = nd.softmax(y_u)
            class_pred = nd.argmax(y_onehot_u_softmax, axis=1)
            l_a = labels[list(range(labels.shape[0])), class_pred]
            l_acc += nd.mean(l_a).asscalar()
            labels = labels / nd.sum(labels, axis=1, keepdims=True)
            l_l = softmaxCEL(y_onehot_u_softmax, labels)
            # 3. Convert to numpy
            l_loss += nd.mean(l_l).asscalar()
        l_loss /= batches
        l_acc /= batches

    return u_loss, l_loss, l_acc
def unlabeled_train_op_adv_combine_add(self, update_enc=True):
    '''
    Trains the GAN model
    '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx
    eps = 1e-10

    ##########################
    ### unsupervised phase ###
    ##########################

    # Retrieve data
    docs = self.data.get_documents(key='train')
    class_true = nd.zeros(batch_size, dtype='int32', ctx=model_ctx)
    class_fake = nd.ones(batch_size, dtype='int32', ctx=model_ctx)
    loss_reconstruction = nd.zeros((1,), ctx=model_ctx)

    ### adversarial phase ###
    discriminator_z_confidence_true = nd.zeros(shape=(1,), ctx=model_ctx)
    discriminator_z_confidence_fake = nd.zeros(shape=(1,), ctx=model_ctx)
    discriminator_y_confidence_true = nd.zeros(shape=(1,), ctx=model_ctx)
    discriminator_y_confidence_fake = nd.zeros(shape=(1,), ctx=model_ctx)
    loss_discriminator = nd.zeros(shape=(1,), ctx=model_ctx)
    dirich_entropy = nd.zeros(shape=(1,), ctx=model_ctx)

    ### generator phase ###
    loss_generator = nd.zeros(shape=(1,), ctx=model_ctx)

    ### reconstruction phase ###
    with autograd.record():
        y_u = self.Enc(docs)
        y_onehot_u_softmax = nd.softmax(y_u)
        x_reconstruction_u = self.Dec(y_onehot_u_softmax)

        logits = nd.log_softmax(x_reconstruction_u)
        loss_reconstruction = nd.sum(-docs * logits, axis=1)
        loss_total = loss_reconstruction * self.args['recon_alpha']

        if self.args['adverse']:
            y_true = np.random.dirichlet(
                np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
            y_true = nd.array(y_true, ctx=model_ctx)
            dy_true = self.Dis_y(y_true)
            dy_fake = self.Dis_y(y_onehot_u_softmax)
            discriminator_y_confidence_true = nd.mean(nd.softmax(dy_true)[:, 0])
            discriminator_y_confidence_fake = nd.mean(nd.softmax(dy_fake)[:, 1])
            softmaxCEL = gluon.loss.SoftmaxCrossEntropyLoss()
            loss_discriminator = softmaxCEL(dy_true, class_true) + \
                softmaxCEL(dy_fake, class_fake)
            loss_generator = softmaxCEL(dy_fake, class_true)
            loss_total = loss_total + loss_discriminator + loss_generator
            dirich_entropy = nd.mean(nd.sum(-y_true * nd.log(y_true + eps), axis=1))

    loss_total.backward()
    self.optimizer_enc.step(batch_size)
    self.optimizer_dec.step(batch_size)
    self.optimizer_dis_y.step(batch_size)

    latent_max = nd.zeros(self.args['ndim_y'], ctx=model_ctx)
    for max_ind in nd.argmax(y_onehot_u_softmax, axis=1):
        latent_max[max_ind] += 1.0
    latent_max /= batch_size
    latent_entropy = nd.mean(
        nd.sum(-y_onehot_u_softmax * nd.log(y_onehot_u_softmax + eps), axis=1))
    latent_v = nd.mean(y_onehot_u_softmax, axis=0)

    return (nd.mean(loss_discriminator).asscalar(),
            nd.mean(loss_generator).asscalar(),
            nd.mean(loss_reconstruction).asscalar(),
            nd.mean(discriminator_z_confidence_true).asscalar(),
            nd.mean(discriminator_z_confidence_fake).asscalar(),
            nd.mean(discriminator_y_confidence_true).asscalar(),
            nd.mean(discriminator_y_confidence_fake).asscalar(),
            latent_max.asnumpy(), latent_entropy.asscalar(),
            latent_v.asnumpy(), dirich_entropy.asscalar())
def train(epoch, train_loader, model, loss, optimizer, opt, ctx, train_loss, train_iou):
    """
    One epoch of training for the program executor.
    """
    loss_sum, iou_sum, n = 0.0, 0.0, 0
    for idx, data in enumerate(train_loader):
        start_t = time.time()
        shape, label, param = data
        bsz = shape.shape[0]
        n_step = label.shape[1]
        index = np.array(list(map(lambda x: n_step, label))) - 1

        # add noise during training, making the executor accept
        # continuous output from the program generator
        label = label.reshape(-1, 1).asnumpy()
        pgm_vector = 0.2 * np.random.uniform(0, 1, (bsz * n_step, stop_id))
        pgm_noise = 0.2 * np.random.uniform(0, 1, label.shape)
        pgm_value = 1 - pgm_noise
        pgm_vector = scatter_numpy(pgm_vector, 1, label, pgm_value).reshape(bsz, n_step, stop_id)
        param_noise = nd.random_uniform(0, 1, shape=param.shape)
        param_vector = param + 0.6 * (param_noise - 0.5)

        gt = shape.as_in_context(ctx)
        index = nd.from_numpy(index).astype('int64').as_in_context(ctx)
        pgm_vector = nd.from_numpy(pgm_vector).astype('float32').as_in_context(ctx)
        param_vector = param_vector.as_in_context(ctx)

        with autograd.record():
            pred = model(pgm_vector, param_vector, index)
            scores = nd.log_softmax(pred, axis=1)
            pred0 = scores[:, 0].squeeze() * opt.n_weight
            pred1 = scores[:, 1].squeeze() * opt.p_weight
            # weighted negative log-likelihood over the voxel grid
            l = -nd.where(gt, pred1, pred0).mean((1, 2, 3))
        l.backward()
        optimizer.step(l.shape[0], ignore_stale_grad=True)

        l = l.mean().asscalar()
        pred = nd.softmax(pred, axis=1)
        pred = pred[:, 1, :, :, :]
        s1 = gt.reshape(-1, 32, 32, 32).astype('float32').as_in_context(mx.cpu())
        s2 = pred.squeeze().as_in_context(mx.cpu())
        s2 = (s2 > 0.5)
        batch_iou = BatchIoU(s1, s2)
        iou = batch_iou.mean()
        end_t = time.time()

        loss_sum += l
        n += 1
        iou_sum += iou
        if idx % (opt.info_interval * 10) == 0:
            print("Train: epoch {} batch {}/{}, loss = {:.3f}, iou = {:.3f}, time = {:.3f}"
                  .format(epoch, idx, len(train_loader), l, iou, end_t - start_t))
            sys.stdout.flush()
    train_loss.append(loss_sum / n)
    train_iou.append(iou_sum / n)
def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices.

    Parameters
    ----------
    cls_pred : mxnet.nd.NDArray
        Predicted classes.
    box_pred : mxnet.nd.NDArray
        Predicted bounding-boxes.
    cls_target : mxnet.nd.NDArray
        Ground-truth classes.
    box_target : mxnet.nd.NDArray
        Ground-truth bounding-boxes.

    Returns
    -------
    tuple of NDArrays
        sum_losses : array containing the sum of class prediction and
            bounding-box regression loss.
        cls_losses : array of class prediction loss.
        box_losses : array of box regression L1 loss.
    """
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1 and self._min_hard_negatives < 1:
        # no positive samples and no hard negatives, return dummy losses
        cls_losses = [nd.sum(cp * 0) for cp in cls_pred]
        box_losses = [nd.sum(bp * 0) for bp in box_pred]
        sum_losses = [nd.sum(cp * 0) + nd.sum(bp * 0)
                      for cp, bp in zip(cls_pred, box_pred)]
        return sum_losses, cls_losses, box_losses

    # compute element-wise cross entropy loss and sort, then perform
    # negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(cls_pred, box_pred, cls_target, box_target):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < nd.maximum(
            self._min_hard_negatives,
            pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out entries that are neither positive nor hard negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss,
                            nd.zeros_like(cls_loss))
        cls_losses.append(
            nd.sum(cls_loss, axis=0, exclude=True) / max(1., num_pos_all))

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only applies to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(
            nd.sum(box_loss, axis=0, exclude=True) / max(1., num_pos_all))
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])

    return sum_losses, cls_losses, box_losses
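# Hypothetical smoke test for the forward above, assuming it belongs to a
# GluonCV-style SSDMultiBoxLoss block (the import path and default
# constructor are assumptions). Inputs are lists with one entry per device.
from mxnet import nd
from gluoncv.loss import SSDMultiBoxLoss

loss_fn = SSDMultiBoxLoss()
cls_pred = [nd.random.normal(shape=(2, 8, 21))]      # (batch, anchors, classes)
box_pred = [nd.random.normal(shape=(2, 8, 4))]
cls_target = [nd.array([[0, 1, 0, 0, 2, 0, 0, 0],
                        [0, 0, 3, 0, 0, 0, 0, 0]])]  # >0 marks positives
box_target = [nd.random.normal(shape=(2, 8, 4))]
sum_l, cls_l, box_l = loss_fn(cls_pred, box_pred, cls_target, box_target)
print(sum_l[0], cls_l[0], box_l[0])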
def forward(self, x):
    return nd.log_softmax(self.proj(x), axis=-1)
def forward(self, enc1, enc2):
    x = nd.concat(enc1, enc2)
    x = self.dense(x)
    x = nd.log_softmax(x)
    return x
def vae_loss_fn(inp, out, mu, logvar, anneal):
    # negative log-likelihood of the multinomial reconstruction
    neg_ll = -nd.mean(nd.sum(nd.log_softmax(out) * inp, -1))
    # KL divergence between the approximate posterior and the unit Gaussian
    KLD = -0.5 * nd.mean(nd.sum(1 + logvar - nd.power(mu, 2) - nd.exp(logvar), axis=1))
    return neg_ll + anneal * KLD
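# Illustrative call with toy shapes (all values below are assumptions):
# the annealing factor scales the KL term, as in the Mult-VAE objective.
from mxnet import nd

inp = nd.random.uniform(shape=(4, 100))  # bag-of-items input rows
out = nd.random.normal(shape=(4, 100))   # decoder logits
mu = nd.zeros((4, 32))
logvar = nd.zeros((4, 32))               # unit-Gaussian posterior -> KLD = 0
print(vae_loss_fn(inp, out, mu, logvar, anneal=0.2))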
def _decode_step_CGED(self, step_input, state):
    step_output, state, _ = self.decoder(
        self.encoder.word_embed(step_input), state)
    step_output = self.fc_error(step_output)
    return nd.log_softmax(step_output), state
def forward(self, x, y, sample_prob=None):
    if sample_prob is not None:
        self.sample_prob = sample_prob
    batch_size = x.shape[0]
    state = self.init_hidden(batch_size, self.ctx)
    outputs_pgm = []
    outputs_param = []
    seq = y
    for i in range(seq.shape[1]):
        if i == 0:
            xt = x
        else:
            if i >= 1 and self.sample_prob > 0:
                # scheduled sampling: with probability self.sample_prob,
                # feed back a sampled token instead of the ground truth
                sample_prob = nd.uniform(0, 1, shape=(batch_size,), ctx=self.ctx)
                sample_mask = sample_prob < self.sample_prob
                if sample_mask.sum() == 0:
                    it1 = seq[:, i - 1]
                else:
                    sample_ind = sample_mask != 0
                    it1 = seq[:, i - 1]
                    prob_prev = nd.exp(outputs_pgm[-1])
                    temp = nd.random.multinomial(prob_prev, 1).reshape(-1).astype('int64')
                    it1 = nd.where(sample_ind, temp, it1).astype('float32')
            else:
                # use the last ground-truth token
                it1 = seq[:, i - 1].copy()
            xt = self.pgm_embed(it1)

        output, state = self.core(xt.expand_dims(axis=0), state)

        pgm_feat1 = nd.relu(self.logit1(output.squeeze(0)))
        pgm_feat2 = self.logit2(pgm_feat1)
        pgm_score = nd.log_softmax(pgm_feat2, axis=1)

        trans_prob = nd.softmax(pgm_feat2, axis=1).detach()
        param_feat1 = nd.relu(self.regress1(output.squeeze(0)))
        param_feat2 = nd.concat(trans_prob, param_feat1, dim=1)
        param_score = self.regress2(param_feat2)
        param_score = param_score.reshape(batch_size, self.vocab_size + 1,
                                          self.max_param)

        index = seq[:, i]
        index = index.expand_dims(axis=1).expand_dims(axis=2).broadcast_to(
            shape=(batch_size, 1, self.max_param)).detach()
        param_score = nd.pick(param_score, index, 1)

        outputs_pgm.append(pgm_score)
        outputs_param.append(param_score)

    outputs_pgm = [_.expand_dims(axis=1) for _ in outputs_pgm]
    outputs_param = [_.expand_dims(axis=1) for _ in outputs_param]
    pgms = outputs_pgm[0]
    params = outputs_param[0]
    for i in range(1, len(outputs_pgm)):
        pgms = nd.concat(pgms, outputs_pgm[i], dim=1)
        params = nd.concat(params, outputs_param[i], dim=1)
    return [pgms, params]
def log_softmax(self, x):
    return nd.log_softmax(x, axis=1)
def softmax_cross_entropy(self, yhat_linear, y):
    return -nd.nansum(y * nd.log_softmax(yhat_linear))
def forward(self, edges):
    score_pred = nd.log_softmax(edges.data['preds'])[:, 1:].max(axis=1)
    score_phr = score_pred + edges.src['node_class_logit'] + edges.dst['node_class_logit']
    return {'score_pred': score_pred, 'score_phr': score_phr}
def forward(self, x):
    x = self.embed(x)
    x = x.reshape([x.shape[0], -1])
    x = nd.relu(self.hidden(x))
    out = nd.log_softmax(self.out(x))
    return out
def log_softmax_likelihood(self, yhat_linear, y):
    return nd.nansum(y * nd.log_softmax(yhat_linear), axis=0, exclude=True)
def validate(epoch, val_loader, model, loss, opt, ctx, val_loss, val_iou, gen_shape=False):
    # load pre-fixed randomization
    try:
        rand1 = np.load(opt.rand1)
        rand2 = np.load(opt.rand2)
        rand3 = np.load(opt.rand3)
    except:
        rand1 = np.random.rand(opt.batch_size * opt.seq_length, stop_id).astype(np.float32)
        rand2 = np.random.rand(opt.batch_size * opt.seq_length, 1).astype(np.float32)
        rand3 = np.random.rand(opt.batch_size, opt.seq_length, max_param - 1).astype(np.float32)
        np.save(opt.rand1, rand1)
        np.save(opt.rand2, rand2)
        np.save(opt.rand3, rand3)

    generated_shapes = None
    original_shapes = None
    loss_sum, iou_sum, n = 0.0, 0.0, 0
    for idx, data in enumerate(val_loader):
        start_t = time.time()
        shape, label, param = data
        bsz = shape.shape[0]
        n_step = label.shape[1]
        index = np.array(list(map(lambda x: n_step, label))) - 1

        # add noise, making the executor accept continuous output
        # from the program generator
        label = label.reshape(-1, 1).asnumpy()
        pgm_vector = 0.1 * rand1
        pgm_noise = 0.1 * rand2
        pgm_value = np.ones(label.shape) - pgm_noise
        pgm_vector = scatter_numpy(pgm_vector, 1, label, pgm_value).reshape(bsz, n_step, stop_id)
        param_noise = nd.from_numpy(rand3)
        param_vector = param + 0.6 * (param_noise - 0.5)

        gt = shape.astype('float32').as_in_context(ctx)
        index = nd.from_numpy(index).astype('int64').as_in_context(ctx)
        pgm_vector = nd.from_numpy(pgm_vector).as_in_context(ctx)
        param_vector = param_vector.as_in_context(ctx)

        # prediction
        pred = model(pgm_vector, param_vector, index)
        scores = nd.log_softmax(pred, axis=1)
        pred0 = scores[:, 0].squeeze() * opt.p_weight
        pred1 = scores[:, 1].squeeze() * opt.n_weight
        l = -nd.where(gt, pred1, pred0).mean((1, 2, 3))
        l = l.mean().asscalar()

        pred = nd.softmax(pred, axis=1)
        pred = pred[:, 1, :, :, :]
        s1 = gt.reshape(-1, 32, 32, 32).as_in_context(mx.cpu())
        s2 = pred.squeeze().as_in_context(mx.cpu())
        s2 = (s2 > 0.5)
        batch_iou = BatchIoU(s1, s2)
        iou = batch_iou.mean()

        loss_sum += l
        n += 1
        iou_sum += iou
        if (idx + 1) % 5 == 0 and gen_shape:
            if original_shapes is None:
                original_shapes = s1.expand_dims(axis=0)
                generated_shapes = s2.expand_dims(axis=0)
            else:
                original_shapes = nd.concat(original_shapes, s1.expand_dims(axis=0), dim=0)
                generated_shapes = nd.concat(generated_shapes, s2.expand_dims(axis=0), dim=0)
        end_t = time.time()

        if (idx + 1) % opt.info_interval == 0:
            print("Test: epoch {} batch {}/{}, loss = {:.3f}, iou = {:.3f}, time = {:.3f}"
                  .format(epoch, idx + 1, len(val_loader), l, iou, end_t - start_t))
            sys.stdout.flush()
        if idx + 1 > len(val_loader) / 10:
            break

    val_loss.append(loss_sum / n)
    val_iou.append(iou_sum / n)
    return generated_shapes, original_shapes