def forward(ctx, true_binary, rule_masks, input_logits):
    """Compute the masked negative log-likelihood of the true rule choices.

    The inputs are indexed as 3-D tensors; the softmax is taken over dim 2
    after multiplying by ``rule_masks`` and adding ``cmd_args.prob_fix``.
    Positions whose last mask entry (``rule_masks[:, :, -1]``) equals 1 get
    zero weight in the loss.

    Args:
        ctx: autograd context; the three inputs are saved on it for backward.
        true_binary: indicator tensor selecting the true entry along dim 2.
        rule_masks: mask tensor multiplied into the exponentiated logits.
        input_logits: unnormalised scores.

    Returns:
        A one-element float tensor holding the loss, placed on the same
        device as ``input_logits``.
    """
    ctx.save_for_backward(true_binary, rule_masks, input_logits)

    # Shift by the per-position maximum so exp() cannot overflow.
    max_logit = torch.max(input_logits, 2, keepdim=True)[0]
    shifted_logits = input_logits - max_logit

    exp_pred = torch.exp(shifted_logits) * rule_masks + cmd_args.prob_fix
    norm = torch.sum(exp_pred, 2, keepdim=True)
    prob = torch.div(exp_pred, norm)

    ll = torch.abs(torch.sum(true_binary * prob, 2))
    mask = 1 - rule_masks[:, :, -1]
    logll = mask * torch.log(ll)

    if cmd_args.old_loss:
        # Legacy normalisation: average over the number of unmasked positions.
        loss = -torch.sum(logll) / torch.sum(mask)
    else:
        loss = -torch.sum(logll) / true_binary.size()[1]

    # Follow the input's device instead of the process-wide default GPU.
    return torch.Tensor([loss]).to(input_logits.device)
def forward(self, feat, right, wrong, batch_wrong, fake=None, fake_diff_mask=None):
    """Compute the discriminative ranking loss, optionally with a fake-sample term.

    Scores are dot products between ``feat`` and each candidate embedding
    (computed with ``torch.bmm``). The loss is ``log(1 + sum(exp(wrong - right)))``
    summed over the batch, plus 0.1 times the sum of the embeddings' norms,
    divided by the batch size.

    Args:
        feat: feature tensor, reshaped to ``(-1, self.ninp, 1)``.
        right: correct-candidate embeddings, reshaped to ``(-1, 1, self.ninp)``.
        wrong: negative candidates, batch-multiplied with ``feat``.
        batch_wrong: additional negatives, batch-multiplied with ``feat``.
        fake: optional generated embeddings; enables the margin term when given.
        fake_diff_mask: mask passed to ``torch.masked_select`` to pick which
            fake scores contribute (required when ``fake`` is given).

    Returns:
        ``loss`` when ``fake`` is None, otherwise ``(loss, loss_fake / batch_size)``
        where the second element is a Python float.
    """
    # A tensor has no unambiguous truth value, so test against None explicitly.
    use_fake = fake is not None

    batch_size = feat.size(0)
    feat = feat.view(-1, self.ninp, 1)

    right_dis = torch.bmm(right.view(-1, 1, self.ninp), feat)
    wrong_dis = torch.bmm(wrong, feat)
    batch_wrong_dis = torch.bmm(batch_wrong, feat)

    wrong_score = (
        torch.sum(torch.exp(wrong_dis - right_dis.expand_as(wrong_dis)), 1)
        + torch.sum(torch.exp(batch_wrong_dis - right_dis.expand_as(batch_wrong_dis)), 1)
    )

    loss_dis = torch.sum(torch.log(wrong_score + 1))
    loss_norm = right.norm() + feat.norm() + wrong.norm() + batch_wrong.norm()

    if use_fake:
        fake_dis = torch.bmm(fake.view(-1, 1, self.ninp), feat)
        fake_score = torch.masked_select(torch.exp(fake_dis - right_dis), fake_diff_mask)

        # Only fake samples scoring above the margin are penalised.
        margin_score = F.relu(torch.log(fake_score + 1) - self.margin)
        loss_fake = torch.sum(margin_score)
        loss_dis = loss_dis + loss_fake
        loss_norm = loss_norm + fake.norm()

    loss = (loss_dis + 0.1 * loss_norm) / batch_size
    if use_fake:
        return loss, loss_fake.item() / batch_size
    return loss
def norm_flow(self, params, z, v):
    """Apply one two-step affine flow to ``(z, v)`` and return its log-determinant.

    ``params[0]`` and ``params[1]`` each hold three callables: a hidden layer,
    a shift layer and a gate layer. The first triple updates ``v`` from ``z``;
    the second updates ``z`` from the new ``v``.

    Args:
        params: nested sequence ``params[step][layer]`` of callables.
        z: tensor updated in the second step (the original comments give
            its shape as [PB, Z]).
        v: auxiliary tensor updated in the first step.

    Returns:
        ``(z, v, logdet)`` where ``logdet`` is the sum over dim 1 of the log
        gates from both steps.
    """
    # Step 1: rescale and shift v, conditioned on z.
    h = torch.tanh(params[0][0](z))
    mew_ = params[0][1](h)
    # The +5 offset pushes the sigmoid gate towards 1.
    sig_ = torch.sigmoid(params[0][2](h) + 5.)  # [PB,Z]
    v = v * sig_ + mew_
    logdet = torch.sum(torch.log(sig_), 1)

    # Step 2: rescale and shift z, conditioned on the updated v.
    h = torch.tanh(params[1][0](v))
    mew_ = params[1][1](h)
    sig_ = torch.sigmoid(params[1][2](h) + 5.)  # [PB,Z]
    z = z * sig_ + mew_
    logdet2 = torch.sum(torch.log(sig_), 1)  # [PB]

    logdet = logdet + logdet2
    # [PB,Z], [PB,Z], [PB]
    return z, v, logdet
def _test_jacobian(self, input_dim, hidden_dim):
    """Check an IAF's Jacobian numerically against its analytic log-determinant.

    Builds a finite-difference Jacobian with step ``self.epsilon``, reorders it
    with the permutation reported by ``iaf.arn.get_permutation()``, and asserts
    that (a) the log of its diagonal sums to the analytic value within
    ``self.epsilon``, (b) every diagonal entry is non-zero, and (c) the strict
    lower triangle is entirely zero.

    Args:
        input_dim: dimensionality of the flow's input.
        hidden_dim: hidden size passed to ``InverseAutoregressiveFlow``.
    """
    jacobian = torch.zeros(input_dim, input_dim)
    iaf = InverseAutoregressiveFlow(input_dim, hidden_dim, sigmoid_bias=0.5)

    def nonzero(x):
        # 1 where x != 0, 0 elsewhere.
        return torch.sign(torch.abs(x))

    x = torch.randn(1, input_dim)
    iaf_x = iaf(x)
    analytic_ldt = iaf.log_abs_det_jacobian(x, iaf_x).data.sum()

    for j in range(input_dim):
        # The perturbation depends only on j, so evaluate the flow once per
        # input coordinate instead of once per (j, k) pair.
        epsilon_vector = torch.zeros(1, input_dim)
        epsilon_vector[0, j] = self.epsilon
        iaf_x_eps = iaf(x + epsilon_vector)
        delta = (iaf_x_eps - iaf_x) / self.epsilon
        for k in range(input_dim):
            jacobian[j, k] = float(delta[0, k].data.sum())

    permutation = iaf.arn.get_permutation()
    permuted_jacobian = jacobian.clone()
    for j in range(input_dim):
        for k in range(input_dim):
            permuted_jacobian[j, k] = jacobian[permutation[j], permutation[k]]

    numeric_ldt = torch.sum(torch.log(torch.diag(permuted_jacobian)))
    ldt_discrepancy = np.fabs(analytic_ldt - numeric_ldt)

    diag_sum = torch.sum(torch.diag(nonzero(permuted_jacobian)))
    lower_sum = torch.sum(torch.tril(nonzero(permuted_jacobian), diagonal=-1))

    assert ldt_discrepancy < self.epsilon
    assert diag_sum == float(input_dim)
    assert lower_sum == float(0.0)
def f1_score_macro(y_true, y_pred, per_class=False, threshold=0.5):
    """Compute the macro-averaged F1 score over the class dimension.

    Args:
        y_true: ground-truth tensor of shape [bs, classes, x, y]; cast to
            byte before use, so entries are expected to be 0/1.
        y_pred: prediction tensor of the same shape; binarised with
            ``y_pred > threshold``.
        per_class: when True return the per-class scores instead of the mean.
        threshold: binarisation threshold for ``y_pred``.

    Returns:
        A NumPy array with one F1 value per class when ``per_class`` is True,
        otherwise their NumPy mean.
    """
    # Widen to int64 so the sums cannot overflow the byte range.
    y_true = y_true.byte().long()
    y_pred = (y_pred > threshold).long()

    # Reduce over batch and both spatial axes, leaving one value per class.
    reduce_dims = (0, 2, 3)
    intersect = torch.sum(y_true * y_pred, dim=reduce_dims)
    denominator = torch.sum(y_true, dim=reduce_dims) + torch.sum(y_pred, dim=reduce_dims)

    # Float division; the epsilon guards classes absent from both tensors.
    f1s = (2 * intersect.float()) / (denominator.float() + 1e-6)

    # Explicit host transfer so CUDA inputs work with NumPy.
    f1s = f1s.detach().cpu().numpy()
    if per_class:
        return f1s
    return np.mean(f1s)
def forward(self, input, target):
    """Compute a batch-hard triplet loss from pairwise Euclidean distances.

    For every row of ``input`` the farthest same-label row and the closest
    different-label row are selected, then passed to
    ``F.margin_ranking_loss`` with ``self.margin``.

    Args:
        input: 2-D feature matrix, one row per sample.
        target: 1-D label tensor; rows with equal labels are positives.

    Returns:
        The margin ranking loss, reduced according to ``self.size_average``.
    """
    y_true = target.int().unsqueeze(-1)
    same_id = torch.eq(y_true, y_true.t()).type_as(input)

    pos_mask = same_id
    neg_mask = 1 - same_id

    def _mask_max(input_tensor, mask, axis=None, keepdims=False):
        # Push masked-out entries far below any real distance.
        input_tensor = input_tensor - 1e6 * (1 - mask)
        _max, _idx = torch.max(input_tensor, dim=axis, keepdim=keepdims)
        return _max, _idx

    def _mask_min(input_tensor, mask, axis=None, keepdims=False):
        # Push masked-out entries far above any real distance.
        input_tensor = input_tensor + 1e6 * (1 - mask)
        _min, _idx = torch.min(input_tensor, dim=axis, keepdim=keepdims)
        return _min, _idx

    # dist[i, j] = || input[i, :] - input[j, :] ||_2
    dist_squared = torch.sum(input ** 2, dim=1, keepdim=True) + \
        torch.sum(input.t() ** 2, dim=0, keepdim=True) - \
        2.0 * torch.matmul(input, input.t())
    # Clamp before sqrt so rounding cannot produce a negative argument.
    dist = dist_squared.clamp(min=1e-16).sqrt()

    pos_max, pos_idx = _mask_max(dist, pos_mask, axis=-1)
    neg_min, neg_idx = _mask_min(dist, neg_mask, axis=-1)

    hardest_neg = neg_min.float()
    hardest_pos = pos_max.float()

    # loss(x, y) = max(0, -y * (x1 - x2) + margin)
    # Build the target on the operands' own device, not a global DEVICE.
    y = torch.ones_like(hardest_neg)
    return F.margin_ranking_loss(hardest_neg, hardest_pos, y, self.margin, self.size_average)
def norm_flow(self, params, z, v, logposterior):
    """Apply one gradient-informed two-step flow to ``(z, v)``.

    The first step rescales ``v`` and shifts it by a learned factor times the
    detached gradient of ``logposterior`` with respect to ``z``. The second
    step rescales ``z`` and shifts it by a learned factor times the new ``v``.

    Args:
        params: nested sequence ``params[step][layer]`` of callables
            (hidden, shift and gate layers for each step).
        z: tensor viewable as ``(self.P, self.B, self.z_size)``; it must be
            part of an autograd graph so its gradient can be taken.
        v: auxiliary tensor with the same flattened shape as ``z``.
        logposterior: callable evaluated on the reshaped ``z``.

    Returns:
        ``(z, v, logdet)`` where ``logdet`` sums the log gates of both steps
        over dim 1.
    """
    h = torch.tanh(params[0][0](z))
    mew_ = params[0][1](h)
    # The +5 offset pushes the sigmoid gate towards 1.
    sig_ = torch.sigmoid(params[0][2](h) + 5.)  # [PB,Z]

    z_reshaped = z.view(self.P, self.B, self.z_size)
    gradients = torch.autograd.grad(outputs=logposterior(z_reshaped), inputs=z_reshaped,
                                    grad_outputs=self.grad_outputs,
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]
    # The gradient is used as a constant; nothing backpropagates through it.
    gradients = gradients.detach()
    gradients = gradients.view(-1, self.z_size)

    v = v * sig_ + mew_ * gradients
    logdet = torch.sum(torch.log(sig_), 1)

    h = torch.tanh(params[1][0](v))
    mew_ = params[1][1](h)
    sig_ = torch.sigmoid(params[1][2](h) + 5.)  # [PB,Z]

    z = z * sig_ + mew_ * v
    logdet2 = torch.sum(torch.log(sig_), 1)  # [PB]

    logdet = logdet + logdet2
    # [PB,Z], [PB,Z], [PB]
    return z, v, logdet
def test_regularization(self):
    """Verify the regularization penalty and that only training loss includes it."""
    penalty = self.model.get_regularization_penalty().data
    assert (penalty > 0).all()

    # Recompute the penalty by hand. The config specifies:
    # "regularizer": [
    #   ["weight$", {"type": "l2", "alpha": 10}],
    #   ["bias$", {"type": "l1", "alpha": 5}]
    # ]
    expected_penalty = 0
    for param_name, param in self.model.named_parameters():
        if param_name.endswith("weight"):
            expected_penalty += 10 * torch.sum(torch.pow(param, 2))
        elif param_name.endswith("bias"):
            expected_penalty += 5 * torch.sum(torch.abs(param))

    assert (penalty == expected_penalty.data).all()

    # Calling `model.forward` twice on the same inputs raises a RuntimeError,
    # so draw two batches. The data and config make the whole dataset one batch.
    training_batch = next(self.iterator(self.instances, num_epochs=1))
    validation_batch = next(self.iterator(self.instances, num_epochs=1))

    training_loss = self.trainer._batch_loss(training_batch, for_training=True).data
    validation_loss = self.trainer._batch_loss(validation_batch, for_training=False).data

    # Only the training loss carries the regularization penalty ...
    assert (training_loss != validation_loss).all()

    # ... and it exceeds the validation loss by exactly that penalty.
    assert (training_loss == validation_loss + penalty).all()
def iou(pr, gt, eps=1e-7, threshold=None, activation='sigmoid'):
    """Intersection-over-union (Jaccard) score between predictions and targets.

    Source: https://github.com/catalyst-team/catalyst/

    Args:
        pr (torch.Tensor): predicted elements
        gt (torch.Tensor): elements that are to be predicted
        eps (float): epsilon added to numerator and denominator to avoid
            division by zero
        threshold: if not None, predictions are binarised with ``pr > threshold``
        activation: ``None``/``"none"`` for identity, ``"sigmoid"`` or
            ``"softmax2d"``; applied to ``pr`` before thresholding

    Returns:
        IoU (Jaccard) score as a tensor

    Raises:
        NotImplementedError: for any other ``activation`` value
    """
    if activation is None or activation == "none":
        scores = pr
    elif activation == "sigmoid":
        scores = torch.nn.Sigmoid()(pr)
    elif activation == "softmax2d":
        scores = torch.nn.Softmax2d()(pr)
    else:
        raise NotImplementedError(
            "Activation implemented for sigmoid and softmax2d"
        )

    if threshold is not None:
        scores = (scores > threshold).float()

    overlap = torch.sum(gt * scores)
    combined = torch.sum(gt) + torch.sum(scores) - overlap + eps
    return (overlap + eps) / combined
def forward(self, log_prob, y_true, mask):
    """Return the per-sequence masked mean negative log-likelihood.

    Args:
        log_prob: log-probabilities, indexed as (batch, time, classes).
        y_true: target class indices, indexed as (batch, time).
        mask: per-step weights (cast to float); zero entries are ignored.

    Returns:
        Tensor with one value per batch row: minus the masked sum of the
        target log-probabilities divided by the mask sum of that row.
    """
    weights = mask.float()
    n_batch, n_steps = y_true.size(0), y_true.size(1)

    # Flatten batch and time so one gather picks every target log-probability.
    flat_log_prob = log_prob.view(-1, log_prob.size(2))
    flat_targets = y_true.contiguous().view(-1, 1)
    picked = torch.gather(flat_log_prob, 1, flat_targets)  # batch*time x 1

    picked = picked.view(n_batch, n_steps) * weights  # batch x time
    mean_log_p = torch.sum(picked, dim=1) / torch.sum(weights, dim=1)  # batch
    return -mean_log_p
def get_reinforce_ps_loss(phi, p0, reinforce = False):
    """Return a REINFORCE-style pseudo-loss for a Bernoulli sample.

    The pseudo-loss is ``(sampled_loss - baseline).detach() * log_q``; per the
    original comment its gradient is unbiased for the true gradient.

    Args:
        phi: parameter passed through ``sigmoid`` to get the Bernoulli probability.
        p0: target vector; its length sets the number of Bernoulli draws.
        reinforce: when True, the squared error of a second independent
            sample is used as the baseline; otherwise the baseline is 0.

    Returns:
        The pseudo-loss.
    """
    n_dims = len(p0)
    e_b = sigmoid(phi)

    sampler = Bernoulli(probs = torch.ones(n_dims) * e_b)
    draw = sampler.sample().detach()

    baseline = 0.0
    if reinforce:
        # A second, independent draw provides the baseline loss.
        baseline_draw = sampler.sample().detach()
        baseline = torch.sum((baseline_draw - p0)**2)

    sampled_loss = torch.sum((draw - p0)**2)
    sampled_log_q = get_bernoulli_log_prob(e_b, draw)

    # Only the log-probability factor carries gradient.
    return (sampled_loss - baseline).detach() * sampled_log_q
def _get_state_cost(self, state: NlvrDecoderState) -> torch.Tensor:
    """
    Return the cost of a finished state. Since it is a finished state, the group size will be
    1, and hence we'll return just one cost.

    Raises ``RuntimeError`` when called on an unfinished state.
    """
    if not state.is_finished():
        raise RuntimeError("_get_state_cost() is not defined for unfinished states!")

    first_checklist = state.checklist_state[0]

    # Checklist cost: sum of squared error from where we want to be (the balance
    # already takes the mask into account), scaled by its weight.
    balance = first_checklist.get_balance()
    weighted_checklist_cost = self._checklist_cost_weight * torch.sum((balance) ** 2)

    # Denotation cost: number of agenda items we want to see in the decoded
    # sequence, used when the path is incorrect.
    # Note: If we are penalizing the model for producing non-agenda actions, this is not the
    # upper limit on the checklist cost. That would be the number of terminal actions.
    denotation_cost = torch.sum(first_checklist.checklist_target.float())

    # TODO (pradeep): The denotation based cost below is strict. May be define a cost based on
    # how many worlds the logical form is correct in?
    # label_strings being None happens when we are testing. We do not care about the cost then.
    # TODO (pradeep): Make this cleaner.
    if state.label_strings is None or all(self._check_state_denotations(state)):
        return weighted_checklist_cost
    return weighted_checklist_cost + (1 - self._checklist_cost_weight) * denotation_cost
def forward(self, true_binary, rule_masks, raw_logits):
    """Dispatch to the loss selected by ``cmd_args.loss_type``.

    Supported values are ``'binary'`` (binary cross-entropy on the masked
    softmax, scaled by ``cmd_args.max_decode_steps``), ``'perplexity'``
    (delegates to ``my_perp_loss``) and ``'vanilla'`` (masked negative
    log-likelihood divided by ``true_binary.size()[1]``).

    Raises:
        NotImplementedError: for any other loss type (after printing it).
    """
    loss_type = cmd_args.loss_type

    if loss_type == 'binary':
        masked_exp = torch.exp(raw_logits) * rule_masks
        prob = torch.div(masked_exp, torch.sum(masked_exp, 2, keepdim=True))
        return F.binary_cross_entropy(prob, true_binary) * cmd_args.max_decode_steps

    if loss_type == 'perplexity':
        return my_perp_loss(true_binary, rule_masks, raw_logits)

    if loss_type == 'vanilla':
        # The tiny constant keeps fully masked rows from normalising by zero.
        masked_exp = torch.exp(raw_logits) * rule_masks + 1e-30
        prob = torch.div(masked_exp, torch.sum(masked_exp, 2, keepdim=True))

        likelihood = torch.abs(torch.sum(true_binary * prob, 2))
        # Positions whose last mask entry is 1 get zero weight.
        step_weight = 1 - rule_masks[:, :, -1]
        weighted_log_lik = step_weight * torch.log(likelihood)

        return -torch.sum(weighted_log_lik) / true_binary.size()[1]

    print('unknown loss type %s' % cmd_args.loss_type)
    raise NotImplementedError
def average_without_padding(x, ids, padding_id, cuda=False, eps=1e-8):
    """Average ``x`` over its last dimension, ignoring padded positions.

    Args:
        x: 3-D tensor; the average is taken over dim 2.
        ids: 2-D NumPy array of token ids. After adding a trailing axis and
            permuting with ``(1, 2, 0)`` its first axis lines up with dim 2
            of ``x`` and its second axis with dim 0.
        padding_id: id marking padded positions, which get zero weight.
        cuda: kept for backward compatibility; the mask is now always placed
            on the device of ``x``, so the flag is no longer needed.
        eps: added to the denominator so fully padded rows do not divide by zero.

    Returns:
        Tensor with dim 2 of ``x`` reduced to the masked mean.
    """
    keep = np.not_equal(ids, padding_id).astype(int)[:, :, np.newaxis]
    mask = torch.from_numpy(keep).float().to(x.device)
    mask = mask.permute(1, 2, 0).expand_as(x)

    return torch.sum(x * mask, dim=2) / (torch.sum(mask, dim=2) + eps)
def calculate_variance_term(pred, gt, means, n_objects, delta_v, norm=2):
    """Compute the intra-cluster variance (pull) term.

    pred: bs, height * width, n_filters
    gt: bs, height * width, n_instances
    means: bs, n_instances, n_filters
    n_objects: per-sample count of instances to include
    delta_v: distances below this margin contribute nothing
    norm: order of the norm used for the embedding distance

    Returns the batch mean of, per sample, the summed hinged squared
    distances divided by the number of labelled locations.
    """
    bs, n_loc, n_filters = pred.size()
    n_instances = gt.size(2)
    full_shape = (bs, n_loc, n_instances, n_filters)

    # Broadcast everything to bs, n_loc, n_instances, n_filters.
    means_exp = means.unsqueeze(1).expand(*full_shape)
    pred_exp = pred.unsqueeze(2).expand(*full_shape)
    gt_exp = gt.unsqueeze(3).expand(*full_shape)

    distance = torch.norm((pred_exp - means_exp), norm, 3)
    hinged = torch.clamp(distance - delta_v, min=0.0) ** 2
    # Keep only locations that belong to the instance.
    masked_var = hinged * gt_exp[:, :, :, 0]

    total = 0.0
    for sample_idx in range(bs):
        n_obj = n_objects[sample_idx]
        sample_var = masked_var[sample_idx, :, :n_obj]  # n_loc, n_objects
        sample_gt = gt_exp[sample_idx, :, :n_obj, 0]  # n_loc, n_objects
        total += torch.sum(sample_var) / torch.sum(sample_gt)

    return total / bs
def compute_loss(self, outputs, masks, labels):
    """
    Our implementation of weighted BCE loss.

    Each element is weighted by the frequency of the opposite class:
    positives (label == 1) get ``1 - ones / total`` and negatives
    (label == 0) get ``ones / total``, where ``ones`` is the sum of the labels.
    Elements with any other integer label get weight 0.

    Args:
        outputs: predicted probabilities; flattened and clamped to ``>= 1e-8``.
        masks: per-element multipliers; zero entries are excluded from the sum.
        labels: targets, flattened to match ``outputs``.

    Returns:
        Minus the weighted, masked log-likelihood divided by the number of
        elements in ``outputs``.
    """
    outputs = outputs.view(-1)
    labels = labels.view(-1).to(outputs.dtype)
    masks = masks.view(-1).to(outputs.dtype)

    # Generate the weights. Start from zeros so no element is ever read from
    # uninitialised memory.
    pos_fraction = torch.sum(labels) / labels.nelement()
    label_ids = labels.long()
    weights = torch.zeros_like(outputs)
    weights[label_ids == 1] = 1.0 - pos_fraction
    weights[label_ids == 0] = pos_fraction

    # Generate the log outputs, clamping so log() never sees 0.
    outputs = outputs.clamp(min=1e-8)
    log_outputs = torch.log(outputs)
    neg_log_outputs = torch.log((1.0 - outputs).clamp(min=1e-8))

    log_likelihood = labels * log_outputs + (1.0 - labels) * neg_log_outputs
    loss = -torch.sum(weights * masks * log_likelihood) / outputs.size(0)
    return loss
def sample_from_discretized_mix_logistic_1d(l, nr_mix):
    """Draw one-channel samples from a mixture-of-logistics parameter tensor.

    Args:
        l: parameter tensor in channel-first order; after permuting the
            channels to the last axis, the first ``nr_mix`` entries are mixture
            logits and the next ``2 * nr_mix`` are means and log-scales.
        nr_mix: number of mixture components.

    Returns:
        Samples clamped to [-1, 1], with a singleton channel axis at dim 1.
    """
    def _uniform_noise(size):
        # Uniform noise kept away from 0 and 1 so the logs below stay finite.
        noise = torch.FloatTensor(size)
        if l.is_cuda:
            noise = noise.cuda()
        noise.uniform_(1e-5, 1. - 1e-5)
        return noise

    # Pytorch ordering: move channels last.
    l = l.permute(0, 2, 3, 1)
    ls = [int(y) for y in l.size()]
    xs = ls[:-1] + [1]

    # Unpack parameters.
    logit_probs = l[:, :, :, :nr_mix]
    l = l[:, :, :, nr_mix:].contiguous().view(xs + [nr_mix * 2])  # mean, scale

    # Sample the mixture indicator with the Gumbel-max trick.
    gumbel = -torch.log(-torch.log(_uniform_noise(logit_probs.size())))
    perturbed = logit_probs.data + gumbel
    _, argmax = perturbed.max(dim=3)

    one_hot = to_one_hot(argmax, nr_mix)
    sel = one_hot.view(xs[:-1] + [1, nr_mix])

    # Select the logistic parameters of the chosen component.
    means = torch.sum(l[:, :, :, :, :nr_mix] * sel, dim=4)
    log_scales = torch.clamp(
        torch.sum(l[:, :, :, :, nr_mix:2 * nr_mix] * sel, dim=4), min=-7.)

    # Inverse-CDF sampling from the logistic distribution.
    u = Variable(_uniform_noise(means.size()))
    x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))

    x0 = torch.clamp(torch.clamp(x[:, :, :, 0], min=-1.), max=1.)
    return x0.unsqueeze(1)
def forward(self, input_features, adj):
    """Run the graph VAE on one graph and return reconstruction + KL loss.

    The flattened ``input_features`` go through ``self.vae``; the sigmoid of
    the decoder output is compared with the upper triangle of ``adj[0]``. A
    node matching is computed with ``self.mpm`` and the Hungarian algorithm,
    but it is only printed: the adjacency is currently used unpermuted.

    Args:
        input_features: tensor viewable as ``(-1, max_num_nodes ** 2)``.
        adj: batched adjacency tensor; only the first element is used.

    Returns:
        ``adj_recon_loss + loss_kl``, with the KL term normalised by
        ``max_num_nodes ** 2``.

    Note:
        Several intermediate values are printed on every call; these debug
        prints are preserved from the original.
    """
    n = self.max_num_nodes
    graph_h = input_features.view(-1, n * n)

    # vae
    h_decode, z_mu, z_lsgms = self.vae(graph_h)
    out = torch.sigmoid(h_decode)
    out_tensor = out.cpu().data
    recon_adj_lower = self.recover_adj_lower(out_tensor)
    recon_adj_tensor = self.recover_full_adj_from_lower(recon_adj_lower)

    # set matching features be degree
    out_features = torch.sum(recon_adj_tensor, 1)

    adj_data = adj.cpu().data[0]
    adj_features = torch.sum(adj_data, 1)

    S = self.edge_similarity_matrix(adj_data, recon_adj_tensor, adj_features, out_features,
                                    self.deg_feature_similarity)

    # initialization strategies: uniform assignment. The float literal keeps
    # the value non-zero under integer-division semantics too.
    init_corr = 1.0 / n
    init_assignment = torch.ones(n, n) * init_corr
    assignment = self.mpm(init_assignment, S)

    # matching
    # use negative of the assignment score since the alg finds min cost flow
    row_ind, col_ind = scipy.optimize.linear_sum_assignment(-assignment.numpy())
    print('row: ', row_ind)
    print('col: ', col_ind)

    # The permutation found above is not applied; the adjacency is used as-is.
    adj_permuted = adj_data
    upper_mask = torch.triu(torch.ones(n, n)) == 1
    adj_vectorized = adj_permuted[upper_mask].squeeze_()
    # Follow the decoder output's device instead of forcing CUDA.
    adj_vectorized_var = adj_vectorized.to(out.device)

    adj_recon_loss = self.adj_recon_loss(adj_vectorized_var, out[0])
    print('recon: ', adj_recon_loss)
    print(adj_vectorized_var)
    print(out[0])

    loss_kl = -0.5 * torch.sum(1 + z_lsgms - z_mu.pow(2) - z_lsgms.exp())
    loss_kl /= n * n  # normalize
    print('kl: ', loss_kl)

    loss = adj_recon_loss + loss_kl
    return loss
def forward(self, input1):
    """Project a homogeneous 3-D grid through per-sample transforms to angles.

    ``self.grid3d`` is replicated along a new batch axis and stored on
    ``self.batchgrid3d``. The last axis of ``input1`` is read as three rows of
    four coefficients (slices 0:4, 4:8, 8:), giving ``x``, ``y`` and ``z``,
    which are then converted to normalised spherical angles.

    Args:
        input1: 4-D tensor whose last axis holds the 12 transform
            coefficients and whose other axes broadcast against the
            batched grid.

    Returns:
        Tensor with ``theta`` and ``phi`` concatenated along dim 3.
    """
    batch_size = input1.size(0)

    # Replicate the grid once per sample, on the input's device and dtype.
    grid = self.grid3d.to(device=input1.device, dtype=input1.dtype)
    self.batchgrid3d = grid.unsqueeze(0).repeat(batch_size, *([1] * grid.dim()))

    # keepdim=True keeps the reduced axis so the results stay 4-D and can be
    # concatenated along dim 3 below.
    x = torch.sum(torch.mul(self.batchgrid3d, input1[:, :, :, 0:4]), 3, keepdim=True)
    y = torch.sum(torch.mul(self.batchgrid3d, input1[:, :, :, 4:8]), 3, keepdim=True)
    z = torch.sum(torch.mul(self.batchgrid3d, input1[:, :, :, 8:]), 3, keepdim=True)

    # The small epsilon avoids division by zero at the origin.
    r = torch.sqrt(x ** 2 + y ** 2 + z ** 2) + 1e-5

    theta = torch.acos(z / r) / (np.pi / 2) - 1

    # atan only covers (-pi/2, pi/2); for x < 0 add +/- pi by the sign of y.
    x_negative = x.lt(0).type_as(x)
    y_sign = y.ge(0).type_as(y) - y.lt(0).type_as(y)
    phi = torch.atan(y / (x + 1e-5)) + np.pi * x_negative * y_sign
    phi = phi / np.pi

    output = torch.cat([theta, phi], 3)
    return output
def pixel_acc(pred, label, ignore_index=-1):
    """Return the fraction of non-ignored pixels whose arg-max class is correct.

    Args:
        pred: class scores; the arg-max is taken over dim 1.
        label: ground-truth class indices.
        ignore_index: label value excluded from both numerator and denominator.

    Returns:
        Float tensor with the accuracy; a tiny epsilon in the denominator
        yields 0 when every pixel is ignored.
    """
    predicted_class = torch.max(pred, dim=1)[1]
    valid = (label != ignore_index).long()
    hits = valid * (predicted_class == label).long()

    n_correct = torch.sum(hits).float()
    n_valid = torch.sum(valid).float()
    return n_correct / (n_valid + 1e-10)
def eval_one_batch(self, batch):
    """Compute the teacher-forced loss of one batch without updating the model.

    Each decoder step contributes the negative log-probability of the gold
    token (plus a weighted coverage penalty when ``config.is_coverage`` is
    set), masked by the decoder padding mask. Step losses are summed per
    example, divided by the decoder lengths and averaged over the batch.

    Args:
        batch: batch object understood by ``get_input_from_batch`` and
            ``get_output_from_batch``.

    Returns:
        The batch-average loss as a Python float.
    """
    (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
     extra_zeros, c_t_1, coverage) = get_input_from_batch(batch, use_cuda)
    dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        get_output_from_batch(batch, use_cuda)

    encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
    s_t_1 = self.model.reduce_state(encoder_hidden)

    if config.use_maxpool_init_ctx:
        c_t_1 = max_encoder_output

    step_losses = []
    for di in range(min(max_dec_len, config.max_dec_steps)):
        y_t_1 = dec_batch[:, di]  # Teacher forcing
        final_dist, s_t_1, c_t_1, attn_dist, p_gen, coverage = self.model.decoder(
            y_t_1, s_t_1, encoder_outputs, enc_padding_mask, c_t_1,
            extra_zeros, enc_batch_extend_vocab, coverage)

        target = target_batch[:, di]
        # Squeeze only the gathered axis so a batch of size 1 keeps its batch dim.
        gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze(1)
        step_loss = -torch.log(gold_probs + config.eps)

        if config.is_coverage:
            step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
            step_loss = step_loss + config.cov_loss_wt * step_coverage_loss

        step_mask = dec_padding_mask[:, di]
        step_losses.append(step_loss * step_mask)

    sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
    batch_avg_loss = sum_step_losses / dec_lens_var
    loss = torch.mean(batch_avg_loss)

    # .item() works on the 0-dim result, unlike indexing with [0].
    return loss.item()
def get_receptive_field(self, patch, layer_idx=None):
    """Estimate the input-space receptive field of ``patch`` by backpropagation.

    A random image is passed through the network, the output of ``patch`` is
    summed and backpropagated, and the input pixels of channel 0 with
    gradient magnitude above 1e-6 delimit the receptive field.

    Args:
        patch: object whose ``forward`` selects part of the network output.
        layer_idx: forwarded to ``self.forward``.

    Returns:
        ``((i_nw, j_nw), (rf_w, rf_h))``: the first influenced pixel (row,
        column) and the width and height spanned up to the last one.

    Note:
        The model's frozen state and gradients are restored even if the
        probe raises.
    """
    is_originally_frozen = self._is_frozen

    self.zero_grad()
    self.freeze(False)
    try:
        image_size = self.input_size
        batch_shape = (1, 3, image_size, image_size)
        x = self._make_cuda(torch.autograd.Variable(
            torch.rand(*batch_shape), requires_grad=True))

        z = self.forward(x, layer_idx=layer_idx)
        z_patch = patch.forward(z)
        torch.sum(z_patch).backward()

        # Gradient of the first sample's first channel.
        rf = x.grad.data.cpu().numpy()
        rf = rf[0, 0]

        # np.where returns indices in row-major order, so the first and last
        # entries are taken as the north-west and south-east corners.
        rf = list(zip(*np.where(np.abs(rf) > 1e-6)))
        (i_nw, j_nw), (i_se, j_se) = rf[0], rf[-1]
        rf_w, rf_h = (j_se - j_nw + 1, i_se - i_nw + 1)

        return (i_nw, j_nw), (rf_w, rf_h)
    finally:
        self.zero_grad()
        self.freeze(is_originally_frozen)
def reverse_flow(self, z):
    """Run the coupling layers in reverse order, undoing each affine step.

    For every flow (last to first) the two halves of ``z`` along dim 1 are
    un-shifted and un-scaled, then the columns are reordered with that flow's
    ``'inv_perm'`` index.

    Args:
        z: 2-D tensor whose dim 1 is split in half.

    Returns:
        ``(z, logdet)`` where ``logdet`` accumulates the per-row sum of the
        log gates of every layer.
    """
    batch = z.shape[0]
    half = z.shape[1] // 2
    logdet = 0.

    for idx in reversed(range(self.n_flows)):
        layer = self.flows[str(idx)]
        first, second = z[:, :half], z[:, half:]

        # Undo the update of the second half (conditioned on the first).
        sig1 = torch.sigmoid(layer['f2_sig'](first))
        mu1 = layer['f2_mu'](first)
        second = (second - mu1) / sig1

        # Undo the update of the first half (conditioned on the restored second).
        sig2 = torch.sigmoid(layer['f1_sig'](second))
        mu2 = layer['f1_mu'](second)
        first = (first - mu2) / sig2

        z = torch.cat([first, second], 1)
        z = z[:, layer['inv_perm']]

        logdet += torch.sum(torch.log(sig1.view(batch, -1)), 1)
        logdet += torch.sum(torch.log(sig2.view(batch, -1)), 1)

    return z, logdet
def test_fun_weak(model,loss_fn,dataloader,dataloader_neg,batch_tnf,use_cuda=True,triplet=False,tps_grid_regularity_loss=0):
    """Evaluate the weakly supervised loss over a dataloader and return its mean.

    Three modes are supported:
      * ``dataloader_neg`` given: the model is called on a positive and a
        negative batch and the loss is ``sum(inliers_neg - inliers_pos)``.
      * ``dataloader_neg`` is None and ``triplet`` is falsy: the loss is
        ``loss_fn(*model(batch))``.
      * ``dataloader_neg`` is None and ``triplet`` is truthy: the model
        builds the negative pairs itself (``model(batch, triplet=True)``).

    Args:
        model: network to evaluate; put in eval mode.
        loss_fn: callable taking ``(theta, corr)``.
        dataloader: iterable of positive batches.
        dataloader_neg: optional iterable of negative batches, consumed in
            lockstep with ``dataloader``.
        batch_tnf: transform applied to every batch before the model.
        use_cuda: unused; kept for interface compatibility.
        triplet: selects the triplet mode when no negative loader is given.
        tps_grid_regularity_loss: unused; kept for interface compatibility.

    Returns:
        The average loss per batch as a Python float.
    """
    model.eval()
    test_loss = 0
    neg_iter = iter(dataloader_neg) if dataloader_neg is not None else None

    for batch in dataloader:
        batch = batch_tnf(batch)

        if neg_iter is not None:
            batch_neg = batch_tnf(next(neg_iter))
            theta_pos, corr_pos, theta_neg, corr_neg = model(batch, batch_neg)
            inliers_pos = loss_fn(theta_pos, corr_pos)
            inliers_neg = loss_fn(theta_neg, corr_neg)
            loss = torch.sum(inliers_neg - inliers_pos)
        elif triplet:
            theta_pos, corr_pos, theta_neg, corr_neg = model(batch, triplet=True)
            inliers_pos = loss_fn(theta_pos, corr_pos)
            inliers_neg = loss_fn(theta_neg, corr_neg)
            loss = torch.sum(inliers_neg - inliers_pos)
        else:
            theta, corr = model(batch)
            loss = loss_fn(theta, corr)

        # Take the first element after flattening so both 0-dim and 1-D
        # losses are handled.
        test_loss += loss.detach().cpu().reshape(-1)[0].item()

    test_loss /= len(dataloader)
    print('Test set: Average loss: {:.4f}'.format(test_loss))
    return test_loss
def neg_hartmann6(X: Tensor) -> Tensor:
    r"""Negative Hartmann6 test function.

    Six-dimensional function (typically evaluated on `[0, 1]^6`)

        `H(x) = - sum_{i=1}^4 ALPHA_i exp( - sum_{j=1}^6 A_ij (x_j - P_ij)**2 )`

    H has 6 local minima and a global minimum at

        `z = (0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573)`

    with `H(z) = -3.32237`

    Args:
        X: A Tensor of size `6` or `k x 6` (k batch evaluations).

    Returns:
        `-H(X)`, the negative value of the standard Hartmann6 function.
    """
    is_batched = X.ndimension() > 1
    if not is_batched:
        X = X.unsqueeze(0)

    # P is scaled by 1e-4 before use.
    shifted = X.unsqueeze(1) - 0.0001 * X.new(P)
    exponents = torch.sum(X.new(A) * shifted ** 2, dim=2)
    H = -torch.sum(X.new(ALPHA) * torch.exp(-exponents), dim=1)

    negated = -H
    if is_batched:
        return negated
    return negated.squeeze(0)
def _mmd2(K_XX, K_XY, K_YY, const_diagonal=False, biased=False): m = K_XX.size(0) # assume X, Y are same shape # Get the various sums of kernels that we'll use # Kts drop the diagonal, but we don't need to compute them explicitly if const_diagonal is not False: diag_X = diag_Y = const_diagonal sum_diag_X = sum_diag_Y = m * const_diagonal else: diag_X = torch.diag(K_XX) # (m,) diag_Y = torch.diag(K_YY) # (m,) sum_diag_X = torch.sum(diag_X) sum_diag_Y = torch.sum(diag_Y) Kt_XX_sums = K_XX.sum(dim=1) - diag_X # \tilde{K}_XX * e = K_XX * e - diag_X Kt_YY_sums = K_YY.sum(dim=1) - diag_Y # \tilde{K}_YY * e = K_YY * e - diag_Y K_XY_sums_0 = K_XY.sum(dim=0) # K_{XY}^T * e Kt_XX_sum = Kt_XX_sums.sum() # e^T * \tilde{K}_XX * e Kt_YY_sum = Kt_YY_sums.sum() # e^T * \tilde{K}_YY * e K_XY_sum = K_XY_sums_0.sum() # e^T * K_{XY} * e if biased: mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * m) + (Kt_YY_sum + sum_diag_Y) / (m * m) - 2.0 * K_XY_sum / (m * m)) else: mmd2 = (Kt_XX_sum / (m * (m - 1)) + Kt_YY_sum / (m * (m - 1)) - 2.0 * K_XY_sum / (m * m)) return mmd2
def accGradParameters(self, input, gradOutput, scale=1):
    """Accumulate the gradient with respect to ``self.weight`` into ``self.gradWeight``.

    The computation is done in place on cached scratch buffers
    (``self._weight``, ``self._sum``, ``self._gradOutput``), which are created
    on first use. It reads ``self.output``, ``self._weightNorm`` and
    ``self._inputNorm``, which are presumably filled by a preceding forward
    pass (verify against the rest of the class).

    Args:
        input: 2-D input batch (asserted).
        gradOutput: gradient with respect to the module output.
        scale: not used by this implementation.
    """
    assert input.dim() == 2
    # NOTE: inputSize is computed but never used below.
    inputSize = self.weight.size(1)
    outputSize = self.weight.size(0)

    # Formula implemented below (reconstructed layout of the string that follows):
    #
    #  dy_j           x_i                     w_ji
    # ----- = -------------------  -  y_j -----------
    # dw_ji   || w_j || * || x ||         || w_j ||^2
    """ dy_j x_i w_ji ----- = ------------------- - y_j ----------- dw_ji || w_j || * || x || || w_j ||^2 """
    # Lazily allocate the scratch buffers.
    if self._weight is None:
        self._weight = self.weight.new()
    if self._sum is None:
        self._sum = input.new()

    # Second term: start from a copy of the weights ...
    self._weight.resize_as_(self.weight).copy_(self.weight)
    if self._gradOutput is None:
        self._gradOutput = gradOutput.new()
    # ... weight gradOutput by the output and sum over the batch (dim 0) ...
    self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
    self._gradOutput.mul_(self.output)
    torch.sum(self._gradOutput, 0, out=self._sum, keepdim=True)
    grad = self._sum[0]
    # ... divide by the weight norms and scale each weight row accordingly.
    grad.div_(self._weightNorm.select(1, 0))
    self._weight.mul_(grad.view(outputSize, 1).expand_as(self._weight))

    # First term: self._gradOutput is reused as scratch space for the
    # norm-scaled input, so its previous contents are overwritten here.
    input_ = self._gradOutput
    input_.resize_as_(input).copy_(input)
    input_.div_(self._inputNorm.expand_as(input))
    # _weight = -1 * _weight + 1 * (gradOutput^T @ input_)
    self._weight.addmm_(-1, 1, gradOutput.t(), input_)

    # Common 1 / ||w_j|| factor, then accumulate into the gradient buffer.
    self._weight.div_(self._weightNorm.expand_as(self._weight))
    self.gradWeight.add_(self._weight)
def forward_flow(self, z, xenc):
    """Apply the conditional coupling layers in forward order.

    For every flow the columns of ``z`` are reordered with the flow's
    ``'perm'`` index, then each half is rescaled and shifted by networks that
    see the other half concatenated with ``xenc``.

    Args:
        z: 2-D tensor whose dim 1 is split in half.
        xenc: conditioning tensor concatenated along dim 1 with each half.

    Returns:
        ``(z, logdet)`` where ``logdet`` accumulates the per-row sum of the
        log gates of every layer.
    """
    batch = z.shape[0]
    half = z.shape[1] // 2
    logdet = 0.

    for idx in range(self.n_flows):
        layer = self.flows[str(idx)]

        z = z[:, layer['perm']]
        first, second = z[:, :half], z[:, half:]

        # Update the first half from (second half, conditioning).
        cond_second = torch.cat([second, xenc], 1)
        sig2 = torch.sigmoid(layer['f1_sig'](cond_second))
        mu2 = layer['f1_mu'](cond_second)
        first = first * sig2 + mu2

        # Update the second half from (new first half, conditioning).
        cond_first = torch.cat([first, xenc], 1)
        mu1 = layer['f2_mu'](cond_first)
        sig1 = torch.sigmoid(layer['f2_sig'](cond_first))
        second = second * sig1 + mu1

        z = torch.cat([first, second], 1)

        logdet += torch.sum(torch.log(sig1.view(batch, -1)), 1)
        logdet += torch.sum(torch.log(sig2.view(batch, -1)), 1)

    return z, logdet
def _get_state_cost(self, worlds: List[WikiTablesWorld], state: CoverageState) -> torch.Tensor:
    """
    Return the cost of a finished state: the weighted checklist cost, plus a
    weighted denotation cost when the decoded logical form does not evaluate
    correctly. Raises ``RuntimeError`` for unfinished states.
    """
    if not state.is_finished():
        raise RuntimeError("_get_state_cost() is not defined for unfinished states!")

    batch_index = state.batch_indices[0]
    world = worlds[batch_index]
    checklist = state.checklist_state[0]

    # Our checklist cost is a sum of squared error from where we want to be, making sure we
    # take into account the mask. The balance is clamped at 0 from below to avoid
    # penalizing agenda actions produced multiple times.
    balance = torch.clamp(checklist.get_balance(), min=0.0)
    weighted_checklist_cost = self._checklist_cost_weight * torch.sum((balance) ** 2)

    # Number of agenda items we want to see in the decoded sequence; used as the
    # denotation cost if the path is incorrect.
    denotation_cost = torch.sum(checklist.checklist_target.float())

    # Rebuild the logical form from the action history and execute it.
    action_strings = [state.possible_actions[batch_index][action][0]
                      for action in state.action_history[0]]
    logical_form = world.get_logical_form(action_strings)
    lisp_string = state.extras[batch_index]

    if self._executor.evaluate_logical_form(logical_form, lisp_string):
        return weighted_checklist_cost
    return weighted_checklist_cost + (1 - self._checklist_cost_weight) * denotation_cost
def project_to_2d(X, camera_params):
    """
    Project 3D points to 2D using the Human3.6M camera projection function.
    This is a differentiable and batched reimplementation of the original MATLAB script.

    Arguments:
    X -- 3D points in *camera space* to transform (N, *, 3)
    camera_params -- intrinsic parameters (N, 2+2+3+2=9): focal length (2),
                     principal point (2), radial coefficients (3) and
                     tangential coefficients (2), in that order
    """
    assert X.shape[-1] == 3
    assert len(camera_params.shape) == 2
    assert camera_params.shape[-1] == 9
    assert X.shape[0] == camera_params.shape[0]

    # Insert singleton axes so the parameters broadcast over the point axes.
    while len(camera_params.shape) < len(X.shape):
        camera_params = camera_params.unsqueeze(1)

    focal = camera_params[..., :2]
    center = camera_params[..., 2:4]
    radial_coef = camera_params[..., 4:7]
    tangential_coef = camera_params[..., 7:]

    # Perspective division, clamped to [-1, 1].
    normalized = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)
    last_axis = len(normalized.shape) - 1

    r2 = torch.sum(normalized[..., :2] ** 2, dim=last_axis, keepdim=True)
    r_powers = torch.cat((r2, r2 ** 2, r2 ** 3), dim=last_axis)

    radial = 1 + torch.sum(radial_coef * r_powers, dim=last_axis, keepdim=True)
    tan = torch.sum(tangential_coef * normalized, dim=last_axis, keepdim=True)

    distorted = normalized * (radial + tan) + tangential_coef * r2
    return focal * distorted + center
def train_model(model, criterion, optimizer, lr_scheduler, lr, dset_loaders, dset_sizes, use_gpu, num_epochs, exp_dir='./', resume=''):
    """Train and validate ``model``, checkpointing every epoch.

    Args:
        model: network to train.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer; replaced each training phase by the return
            value of ``lr_scheduler``.
        lr_scheduler: callable ``(optimizer, epoch, lr) -> optimizer``.
        lr: base learning rate forwarded to ``lr_scheduler``.
        dset_loaders: mapping with ``'train'`` and ``'val'`` data loaders.
        dset_sizes: mapping with the number of samples per phase.
        use_gpu: move each batch to CUDA when True.
        num_epochs: epoch index at which training stops (exclusive).
        exp_dir: directory receiving ``best_model.pth.tar`` and
            ``epoch.pth.tar``.
        resume: optional checkpoint path; when it exists, model and optimizer
            state and the start epoch are restored from it.

    Returns:
        The trained model.
    """
    print('dictoinary length' + str(len(dset_loaders)))
    since = time.time()
    best_acc = 0.0

    if os.path.isfile(resume):
        print("=> loading checkpoint '{}'".format(resume))
        # NOTE: torch.load unpickles the file; only resume from trusted checkpoints.
        checkpoint = torch.load(resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        print('load')
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            resume, checkpoint['epoch']))
    else:
        start_epoch = 0
        print("=> no checkpoint found at '{}'".format(resume))

    print(str(start_epoch))

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                optimizer = lr_scheduler(optimizer, epoch, lr)
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dset_loaders[phase]:
                inputs, labels = data
                inputs = inputs.squeeze()

                if use_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()

                # zero the parameter gradients
                optimizer.zero_grad()
                model.zero_grad()

                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics, kept as Python numbers so no graph or device
                # memory is retained and the division below is a float division
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data).item()

            epoch_loss = running_loss / dset_sizes[phase]
            epoch_acc = float(running_corrects) / dset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Save the model whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                # Drop references to the last batch before serialising.
                del outputs
                del labels
                del inputs
                del loss
                del preds
                best_acc = epoch_acc
                torch.save(model, os.path.join(exp_dir, 'best_model.pth.tar'))

        # Overwrite the rolling per-epoch checkpoint.
        epoch_file_name = exp_dir + '/' + 'epoch' + '.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'epoch_acc': epoch_acc,
                'arch': 'alexnet',
                'model': model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, epoch_file_name)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    return model
def _optimize(self, model, optimizer, inputs_tanh_var, pert_tanh_var, targets_oh_var, c_var):
    """
    Run a single optimisation step of the attack.

    :param model: the model to attack
    :type model: nn.Module
    :param optimizer: the optimizer updating the perturbation
    :type optimizer: optim.Adam
    :param inputs_tanh_var: the input images in tanh-space
    :type inputs_tanh_var: Variable
    :param pert_tanh_var: the perturbation in tanh-space; must have
           ``requires_grad`` set so that the step can update it
    :type pert_tanh_var: Variable
    :param targets_oh_var: one-hot encoded targets (attack targets when
           ``self.targeted``, ground-truth labels otherwise)
    :type targets_oh_var: Variable
    :param c_var: per-example constant :math:`c`, of dimension [B]
    :type c_var: Variable
    :return: the batch loss, the squared L2 norms of the perturbations, the
             model outputs on the adversarial examples and the adversarial
             examples themselves, the last three converted with
             ``_var2numpy``
    """
    # Adversarial examples mapped back from tanh-space to image space.
    adv_images = self._from_tanh_space(inputs_tanh_var + pert_tanh_var)
    # Pre-softmax activations of the model on the adversarial examples.
    adv_logits = model(adv_images)
    # Unperturbed inputs in image space.
    clean_images = self._from_tanh_space(inputs_tanh_var)

    # Squared L2 distance per example: flatten everything but the batch axis.
    squared_diff = torch.pow(adv_images - clean_images, 2)
    sq_l2 = torch.sum(squared_diff.view(squared_diff.size(0), -1), 1)

    # Activation of the target / true class (called `real` in Carlini's code).
    target_logit = torch.sum(targets_oh_var * adv_logits, 1)

    # Large finite stand-in for infinity (1e4, as in Carlini's code).
    big = 1e4

    # The masking trick below only yields the max over the non-target classes
    # if the activations are not below -big; check the sufficient condition.
    assert (adv_logits.max(1)[0] >= -big).all(), 'assumption failed'

    # Push the target class down by `big`, then take the max over classes
    # (called `other` in Carlini's code).
    masked_logits = (1 - targets_oh_var) * adv_logits - targets_oh_var * big
    best_other_logit = torch.max(masked_logits, 1)[0]

    # f(x'): hinge on the margin between target and best other class.
    if self.targeted:
        # want: target activation exceeds the others by `confidence`
        margin = best_other_logit - target_logit
    else:
        # want: some other class exceeds the true label by `confidence`
        margin = target_logit - best_other_logit
    hinge = torch.clamp(margin + self.confidence, min=0.0)

    # Total objective of the batch: distortion plus weighted hinge term.
    total_loss = torch.sum(sq_l2 + c_var * hinge)

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # Export the records for bookkeeping by the caller.
    return (
        total_loss.item(),
        _var2numpy(sq_l2),
        _var2numpy(adv_logits),
        _var2numpy(adv_images),
    )
def concat_score(self, hidden, encoder_output):
    """Score ``hidden`` against ``encoder_output`` with the "concat" form.

    ``hidden`` is expanded along dim 0 to the length of ``encoder_output``,
    concatenated with it on dim 2, passed through ``self.attn`` and ``tanh``,
    then weighted by ``self.v`` and summed over dim 2.
    """
    steps = encoder_output.size(0)
    tiled_hidden = hidden.expand(steps, -1, -1)
    joined = torch.cat((tiled_hidden, encoder_output), 2)
    energy = torch.tanh(self.attn(joined))
    return (self.v * energy).sum(dim=2)
def general_score(self, hidden, encoder_output):
    """Score with the "general" form: ``hidden`` times ``self.attn(encoder_output)``, summed over dim 2."""
    projected = self.attn(encoder_output)
    weighted = hidden * projected
    return weighted.sum(dim=2)
def dot_score(self, hidden, encoder_output):
    """Score with the "dot" form: elementwise product summed over dim 2."""
    product = hidden * encoder_output
    return product.sum(dim=2)
def train_model(device, model, dataloaders, dataset_sizes, criterion=None, optimizer=None, scheduler=None, num_epochs=100, checkpoints=10, output_dir='output', status=1, train_acc=0, track_steps=False, seed=414921):
    '''
    Train a PyTorch model and collect per-epoch (and optionally per-step) metrics.

    Args:
        device: device the model and every batch are moved to.
        model: network to train; the best validation weights are loaded back
            into it before returning.
        dataloaders: mapping with ``'train'`` and ``'val'`` iterables yielding
            ``(inputs, labels)``.
        dataset_sizes: mapping phase -> number of samples, used as the divisor
            for epoch loss and accuracy.
        criterion: loss; defaults to ``nn.CrossEntropyLoss()``.
        optimizer: defaults to SGD(lr=0.1, momentum=0.9).
        scheduler: stepped once per epoch after the training phase; defaults
            to ``StepLR(step_size=100, gamma=0.1)``.
        num_epochs: maximum number of epochs.
        checkpoints: a checkpoint and a metrics dump are written every this
            many epochs (never for the first or the last epoch).
        output_dir: directory for checkpoints and metrics; created if missing.
        status: full progress lines are printed every ``status`` epochs.
        train_acc: when > 0, training stops after the epoch in which the
            training accuracy reaches this value.
        track_steps: also record one metrics row per training step.
        seed: accepted for interface compatibility; not used by this function.

    Returns:
        ``(model, metrics_df, step_metrics_df)`` where ``step_metrics_df`` is
        ``None`` when no per-step metrics were recorded.
    '''
    # pylint: disable=no-member
    # (pylint complains about torch.sum() and torch.max())

    # makedirs also copes with nested, not-yet-existing parents.
    os.makedirs(output_dir, exist_ok=True)

    # Configure whatever the caller did not specify.
    if criterion is None:
        criterion = nn.CrossEntropyLoss()
    if optimizer is None:
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    if scheduler is None:
        scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

    model = model.to(device)

    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    metrics = []
    step_metrics = []  # only filled when track_steps=True
    training_step = 0
    acc_reached = False

    for epoch in range(num_epochs):
        verbose_epoch = epoch % status == 0 or epoch == num_epochs - 1
        if verbose_epoch:
            print()
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            epoch_phase_start_time = time.time()
            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                step_start_time = time.time()
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Track autograd history only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()
                        if track_steps:
                            # per-step metrics (WARNING: lots of data)
                            step_metrics.append({
                                'device': str(device),
                                'epoch': epoch,
                                'training_step': training_step,
                                'training_step_loss': loss.item(),
                                'training_step_time': time.time() - step_start_time
                            })
                            training_step += 1

                # Weight the batch loss by batch size so the epoch loss is a
                # per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            # Plain Python numbers, so an empty loader cannot break on
            # ``int.double()``.
            epoch_acc = float(running_corrects) / dataset_sizes[phase]
            phase_time = time.time() - epoch_phase_start_time

            # Remember the weights of the best validation epoch.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

            # Target training accuracy met: announce it and exit after this epoch.
            if (train_acc > 0) and (epoch_acc >= train_acc) and is_train:
                acc_reached = True
                print()
                print(f'Epoch {epoch}/{num_epochs - 1}')
                print('-' * 10)

            if verbose_epoch or acc_reached:
                print(f'{phase} Loss: {round(epoch_loss, 4)} Acc: {round(epoch_acc, 4)}')
            else:
                prog = '-' * int(epoch % status)
                print('\r{}|{}'.format(prog, epoch), end='')

            # Per-epoch metrics. The "average" losses are the epoch averages
            # (not the loss of whichever batch happened to be last), and each
            # time is the duration of its own phase.
            if phase == 'val':
                validation_time = phase_time
                avg_val_loss = epoch_loss
                avg_val_acc = epoch_acc
            else:
                training_time = phase_time
                avg_train_loss = epoch_loss
                avg_train_acc = epoch_acc

        metrics.append({
            'device': str(device),
            'epoch': epoch,
            'average_training_loss': avg_train_loss,
            'average_validation_loss': avg_val_loss,
            'training_acc': avg_train_acc,
            'validaton_acc': avg_val_acc,
            'training_time': training_time,
            'validation_time': validation_time
        })

        # Periodic checkpoint (skipped for the first and the last epoch; the
        # last one is covered by the final checkpoint below).
        if (epoch > 0 and epoch != num_epochs - 1) and \
                ((epoch + 1) % checkpoints == 0 and os.path.isdir(output_dir)):
            checkpoint = os.path.join(output_dir, f'epoch{epoch+1}_checkpoint_model.th')
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
            }, checkpoint)

            json_file = os.path.join(output_dir, f'epoch{epoch+1}_checkpoint_metrics.json')
            with open(json_file, 'w') as fp:
                json.dump(metrics, fp)

        if acc_reached:
            break

    # Final checkpoint and metrics dump.
    if os.path.isdir(output_dir):
        timestamp = time.strftime("%Y-%m-%dT%H%M%S")
        checkpoint = os.path.join(output_dir, f'final_model_{timestamp}.th')
        torch.save({
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
        }, checkpoint)

        metric_path = os.path.join(output_dir, f'final_metrics_{timestamp}.json')
        with open(metric_path, 'w') as fp:
            json.dump(metrics, fp)

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60}m {time_elapsed % 60}s')
    print(f'Best val Acc: {round(best_acc, 4)}')

    # Hand back the model with its best validation weights.
    model.load_state_dict(best_model_wts)

    metrics_df = pd.DataFrame(data=metrics)
    step_metrics_df = pd.DataFrame(data=step_metrics) if step_metrics else None

    return model, metrics_df, step_metrics_df
def active_learning_taylor(func_name,start_rand_idxs=None, bud=None, valid=True,fac_loc_idx=None):
    """Run ``no_select`` rounds of train -> evaluate -> select-more-points.

    The function relies on names defined at module level (``M``, ``num_cls``,
    ``device``, ``x_trn``/``y_trn``, ``x_val``/``y_val``, ``x_tst``/``y_tst``,
    ``N``, ``no_select``, ``no_points``, ``num_epochs``, ``learning_rate``,
    ``data_name`` and others).

    Args:
        func_name: selection strategy, e.g. ``'Taylor Online'``,
            ``'Full OneStep'``, ``'Facloc Regularized'``, ``'Random Greedy'``,
            ``'Facility Location'``, ``'Random'``, ``'Random Perturbation'``
            or ``'FASS'``. Any other name uses the plain Taylor greedy
            selection.
        start_rand_idxs: initial labelled indices (replaced by a facility
            location selection when ``func_name == 'Facility Location'``).
        bud: budget passed to the facility-location selectors.
        valid: accepted for interface compatibility; not used here.
        fac_loc_idx: training indices appended to the validation set for
            ``'Facloc Regularized'``.

    Returns:
        ``(val_accies, test_accies, unlab_accies, idxs)``; for
        ``'Facility Location'`` the warm-start indices are appended as a
        fifth element.
    """
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    torch.backends.cudnn.deterministic = True

    model = TwoLayerNet(M, num_cls, 100)
    if torch.cuda.device_count() > 1:
        print("Using:", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
        cudnn.benchmark = True
    model = model.to(device)

    idxs = start_rand_idxs

    if func_name == 'Facloc Regularized':
        x_val1 = torch.cat([x_val, x_trn[fac_loc_idx]], dim=0)
        y_val1 = torch.cat([y_val, y_trn[fac_loc_idx]], dim=0)

    criterion = nn.CrossEntropyLoss()
    criterion_nored = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    if func_name == 'Full OneStep':
        setf_model = SetFunctionBatch(x_val, y_val, model, criterion, criterion_nored, learning_rate, device)
    elif func_name == 'Facility Location':
        if data_name != 'covertype':
            setf_model = SetFunctionFacLoc(device, train_batch_size_for_greedy)
            idxs = setf_model.lazy_greedy_max(bud, x_trn, model)
        else:
            idxs = run_stochastic_Facloc(x_trn, y_trn, bud)
        facility_loaction_warm_start = copy.deepcopy(idxs)
    elif func_name == 'Facloc Regularized':
        setf_model = SetFunctionTaylor(x_val1, y_val1, model, criterion, criterion_nored, learning_rate, device, num_cls)
    else:
        setf_model = SetFunctionTaylor(x_val, y_val, model, criterion, criterion_nored, learning_rate, device, num_cls)

    remainList = set(range(N))
    idxs = list(idxs)
    remainList = remainList.difference(idxs)

    banners = {
        'Taylor Online': "Starting Online OneStep Run with taylor on loss!",
        'Full OneStep': "Starting Online OneStep Run without taylor!",
        'Facloc Regularized': "Starting Facility Location Regularized Online OneStep Run with taylor!",
        'Random Greedy': "Starting Randomized Greedy Online OneStep Run with taylor!",
        'Facility Location': "Starting Facility Location!",
        'Random': "Starting Random Run!",
        'Random Perturbation': "Starting Online OneStep Run with taylor with random perturbation!",
        'FASS': "Filtered Active Submodular Selection(FASS)!",
    }
    if func_name in banners:
        print(banners[func_name])

    val_accies = np.zeros(no_select)
    test_accies = np.zeros(no_select)
    unlab_accies = np.zeros(no_select)

    def weight_reset(m):
        # Re-seed before every re-initialisation so resets are reproducible.
        torch.manual_seed(42)
        torch.cuda.manual_seed(42)
        np.random.seed(42)
        random.seed(42)
        torch.backends.cudnn.deterministic = True
        if isinstance(m, nn.Linear):
            m.weight.data.normal_(0.0, 0.02)
            m.bias.data.fill_(0)

    model = model.apply(weight_reset).cuda()

    for n in range(no_select):
        loader_tr = DataLoader(CustomDataset_act(x_trn[idxs], y_trn[idxs], transform=None), batch_size=no_points)
        model.train()
        for i in range(num_epochs):
            accFinal = 0.
            for batch_idx in list(loader_tr.batch_sampler):
                # The third element returned by the dataset is deliberately
                # discarded: binding it to ``idxs`` would overwrite the
                # running list of selected training indices used below.
                x, y, _ = loader_tr.dataset[batch_idx]
                x, y = x.cuda(), y.cuda()
                optimizer.zero_grad()
                out = model(x)
                loss = F.cross_entropy(out, y)
                accFinal += torch.sum((torch.max(out, 1)[1] == y).float()).item()
                loss.backward()

                if (i % 50 == 0) and (accFinal < 0.2):  # reset if not converging
                    model = model.apply(weight_reset).cuda()
                    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

                # clamp gradients, just in case
                for p in filter(lambda p: p.grad is not None, model.parameters()):
                    p.grad.data.clamp_(min=-.1, max=.1)

                optimizer.step()

        print(n+1, 'Time', 'SubsetTrn', loss.item())

        # The iteration order of ``remainList`` fixes the row order of
        # ``curr_X_trn``; selections below are positions in that order and are
        # mapped back through ``np.array(list(remainList))``.
        curr_X_trn = x_trn[list(remainList)]
        curr_Y_trn = y_trn[list(remainList)]

        model.eval()
        with torch.no_grad():
            val_out = model(x_val)
            _, val_predict = val_out.max(1)
            val_correct = val_predict.eq(y_val).sum().item()
            val_total = y_val.size(0)
            val_acc = 100 * val_correct / val_total

            inputs, targets = x_tst.to(device), y_tst.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total = targets.size(0)
            correct = predicted.eq(targets).sum().item()
            tst_acc = 100.0 * correct / total

            rem_out = model(curr_X_trn)
            _, rem_predict = rem_out.max(1)
            rem_correct = rem_predict.eq(curr_Y_trn).sum().item()
            rem_total = curr_Y_trn.size(0)
            rem_acc = 100 * rem_correct / rem_total

        val_accies[n] = val_acc
        test_accies[n] = tst_acc
        unlab_accies[n] = rem_acc

        # The selectors receive ``clone_dict``; the weights are restored from
        # ``cached_state_dict`` once selection is done.
        cached_state_dict = copy.deepcopy(model.state_dict())
        clone_dict = copy.deepcopy(model.state_dict())

        if func_name == 'Random Greedy':
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, rem_predict, int(0.9 * no_points), clone_dict)
            new_idxs = list(np.array(list(remainList))[new_idxs])
            remainList = remainList.difference(new_idxs)
            new_idxs.extend(list(np.random.choice(list(remainList), size=int(0.1 * no_points), replace=False)))
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        elif func_name == "FASS":
            fn = nn.Softmax(dim=1)
            soft = fn(rem_out)
            entropy2 = Categorical(probs=soft).entropy()

            if 5 * no_points < entropy2.shape[0]:
                _, indices = entropy2.topk(5 * no_points)
            else:
                # Keep ``indices`` a tensor in both branches; the code below
                # calls ``.cpu()`` on it, which a Python list does not have.
                indices = torch.arange(entropy2.shape[0], device=entropy2.device)

            knn_idxs_flag_val = perform_knnsb_selection(datadir, data_name, curr_X_trn[indices], rem_predict[indices],
                                                        fraction, selUsing='val')
            knn_idxs_flag_val = list(np.array(list(remainList))[indices.cpu().numpy()][knn_idxs_flag_val])

            remainList = remainList.difference(knn_idxs_flag_val)
            idxs.extend(knn_idxs_flag_val)

        elif func_name == 'Random':
            # Draw with a round-specific seed without disturbing the global
            # NumPy random stream.
            state = np.random.get_state()
            np.random.seed(n*n)
            new_idxs = np.random.choice(list(remainList), size=no_points, replace=False)
            np.random.set_state(state)
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        elif func_name == 'Random Perturbation':
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, rem_predict, no_points, clone_dict, None, True)
            new_idxs = np.array(list(remainList))[new_idxs]
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        elif func_name == 'Facility Location':
            if data_name == 'covertype':
                new_idxs = run_stochastic_Facloc(curr_X_trn, rem_predict, bud)
            else:
                new_idxs = setf_model.lazy_greedy_max(bud, curr_X_trn, model)
            new_idxs = np.array(list(remainList))[new_idxs]
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        else:
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, rem_predict, no_points, clone_dict)
            new_idxs = np.array(list(remainList))[new_idxs]
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        model.load_state_dict(cached_state_dict)

    if func_name == 'Facility Location':
        return val_accies, test_accies, unlab_accies, idxs, facility_loaction_warm_start
    else:
        return val_accies, test_accies, unlab_accies, idxs
def forward(self, input):
    """Record a total-variation style loss on ``self`` and pass ``input`` through.

    Stores the differences between neighbouring slices along dim 2
    (``self.x_diff``) and dim 3 (``self.y_diff``), and sets ``self.loss`` to
    ``self.strength`` times the sum of their absolute values. The input is
    returned unchanged.
    """
    dim2_delta = input[:, :, 1:, :] - input[:, :, :-1, :]
    dim3_delta = input[:, :, :, 1:] - input[:, :, :, :-1]
    self.x_diff = dim2_delta
    self.y_diff = dim3_delta

    total_variation = torch.sum(torch.abs(dim2_delta)) + torch.sum(torch.abs(dim3_delta))
    self.loss = self.strength * total_variation
    return input
def main(args):
    """Train and evaluate an entity-classification model with mini-batches.

    Reads from ``args``: ``dataset``, ``validation``, ``gpu``, ``n_hidden``,
    ``n_bases``, ``n_layers``, ``dropout``, ``use_self_loop``, ``fanout``,
    ``batch_size``, ``lr``, ``l2norm``, ``n_epochs`` and ``model_path``.

    Raises:
        ValueError: if ``args.dataset`` is not one of ``aifb``, ``mutag``,
            ``bgs`` or ``am``.
    """
    # load graph data
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    else:
        raise ValueError('Unknown dataset: {!r}'.format(args.dataset))

    g = dataset[0]
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = g.nodes[category].data.pop('train_mask')
    test_mask = g.nodes[category].data.pop('test_mask')
    # squeeze(1) keeps a 1-D index tensor even when a split has one node
    # (a bare squeeze() would collapse it to 0-dim and break len()/slicing).
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze(1)
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze(1)
    labels = g.nodes[category].data.pop('labels')

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # check cuda
    device = 'cpu'
    use_cuda = args.gpu >= 0 and th.cuda.is_available()
    if use_cuda:
        th.cuda.set_device(args.gpu)
        device = 'cuda:%d' % args.gpu

    # create embeddings
    embed_layer = RelGraphEmbed(g, args.n_hidden)
    node_embed = embed_layer()

    # create model
    model = EntityClassify(g,
                           args.n_hidden,
                           num_classes,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop)
    if use_cuda:
        model.cuda()

    # train sampler
    sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
    loader = dgl.dataloading.NodeDataLoader(
        g, {category: train_idx}, sampler,
        batch_size=args.batch_size, shuffle=True, num_workers=0)

    # validation sampler
    # we do not use full neighbor to save computation resources
    val_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
    val_loader = dgl.dataloading.NodeDataLoader(
        g, {category: val_idx}, val_sampler,
        batch_size=args.batch_size, shuffle=True, num_workers=0)

    # optimizer
    all_params = itertools.chain(model.parameters(), embed_layer.parameters())
    optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        # The first epochs are excluded from the timing statistics.
        if epoch > 3:
            t0 = time.time()

        for i, (input_nodes, seeds, blocks) in enumerate(loader):
            blocks = [blk.to(device) for blk in blocks]
            seeds = seeds[category]  # we only predict the nodes with type "category"
            batch_tic = time.time()
            emb = extract_embed(node_embed, input_nodes)
            lbl = labels[seeds]
            if use_cuda:
                emb = {k: e.cuda() for k, e in emb.items()}
                lbl = lbl.cuda()
            logits = model(emb, blocks)[category]
            loss = F.cross_entropy(logits, lbl)
            # Clear gradients for every mini-batch; clearing only once per
            # epoch lets gradients of earlier batches leak into later steps.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_acc = th.sum(logits.argmax(dim=1) == lbl).item() / len(seeds)
            print("Epoch {:05d} | Batch {:03d} | Train Acc: {:.4f} | Train Loss: {:.4f} | Time: {:.4f}".
                  format(epoch, i, train_acc, loss.item(), time.time() - batch_tic))

        if epoch > 3:
            dur.append(time.time() - t0)

        val_loss, val_acc = evaluate(model, val_loader, node_embed, labels, category, device)
        # No timings exist yet during the warm-up epochs; report nan without
        # triggering NumPy's empty-mean warning.
        avg_dur = np.average(dur) if dur else float('nan')
        print("Epoch {:05d} | Valid Acc: {:.4f} | Valid loss: {:.4f} | Time: {:.4f}".
              format(epoch, val_acc, val_loss, avg_dur))
    print()
    if args.model_path is not None:
        th.save(model.state_dict(), args.model_path)

    output = model.inference(
        g, args.batch_size, 'cuda' if use_cuda else 'cpu', 0, node_embed)
    test_pred = output[category][test_idx]
    test_labels = labels[test_idx]
    test_acc = (test_pred.argmax(1) == test_labels).float().mean()
    print("Test Acc: {:.4f}".format(test_acc))
    print()
def forward(self, query, key, value, attention_mask):
    """Fuse the stacked ``value`` entries with sentence-level attention.

    Which projections are applied is driven by the ``"query"``, ``"key"`` and
    ``"value"`` flags of ``self.config.adapter_fusion``. Supported
    combinations are query only, query+key, query+key+value, and none (which
    scores with ``self.dense`` instead).

    Shapes, as implied by the indexing below (assumption - confirm with
    callers): ``query`` is [B, S, H], ``key`` and ``value`` are [B, S, N, H],
    and ``attention_mask`` holds 0 at positions to keep and flattens to
    [B, S].

    Side effect: ``self.T`` is decreased by ``self.reduction`` on every call,
    but never below 1.0.

    Raises:
        ValueError: for a flag combination that has no scoring rule.
    """
    fusion = self.config.adapter_fusion
    use_query = bool(fusion["query"])
    use_key = bool(fusion["key"])
    use_value = bool(fusion["value"])

    # 1.0 where the mask is 0 (position kept). Reshape by the batch size
    # rather than squeeze(), which would also drop the batch axis when B == 1.
    keep = (attention_mask == 0).float().to(key.device).reshape(key.size(0), -1)
    length = torch.sum(keep, dim=1)

    # Masked mean over dim 1 -> one vector per stacked entry.
    # NOTE(review): a row with no kept position divides by zero here.
    key = key * keep[:, :, None, None]
    key_sent = torch.sum(key, dim=1) / length[:, None, None]

    if use_query and (use_key or not use_value):
        query = query * keep[:, :, None]
        query_sent = torch.sum(query, dim=1) / length[:, None]
        query_enc = self.query(query_sent)
        key_enc = self.key(key_sent) if use_key else key_sent
        value_enc = self.value(value) if use_value else value

        scores_t = torch.matmul(key_enc, query_enc[:, :, None]).squeeze(-1)
        probs = nn.Softmax(dim=-1)(scores_t / self.T)
        # Only the singleton axis introduced by the matmul is removed, the
        # same way the dense branch below does it.
        result = torch.squeeze(
            torch.matmul(probs[:, None, None, :], value_enc), dim=2)
    elif not (use_query or use_key or use_value):
        scores = self.dense(key_sent)
        scores_t = scores.transpose(-2, -1)
        probs = nn.Softmax(dim=-1)(scores_t / self.T)
        result = torch.squeeze(torch.matmul(probs.unsqueeze(2), value), dim=2)
    else:
        raise ValueError(
            "Unsupported adapter_fusion configuration: query={}, key={}, value={}".format(
                use_query, use_key, use_value))

    # Anneal the softmax temperature towards 1.0.
    self.T = max(self.T - self.reduction, 1.0)
    return result
def training_loop(args, model, criterion, optimizer, dataset, f, device, experiment):
    """Train with gradient accumulation, validate every epoch, keep the best weights.

    Args:
        args: namespace providing ``num_epochs``, ``accumulation_steps`` and
            ``dataset`` (the latter is used in the weights path).
        model: network to train; the best validation weights are loaded back
            into it before returning.
        criterion: loss callable; its output is reduced with ``.mean()``.
        optimizer: stepped every ``args.accumulation_steps`` batches.
        dataset: mapping with ``'train_dataloader'``, ``'val_dataloader'``,
            ``'train_data'`` and ``'val_data'``; batches are mappings with
            ``'image'`` and ``'target'`` entries.
        f: path of the experiment report that results are appended to.
        device: device every batch is moved to.
        experiment: name used for ``models/<dataset>/<experiment>.pth``.

    Returns:
        ``model`` carrying the weights of the best validation epoch.
    """
    start = time.time()
    best_weights = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(args.num_epochs):
        print(f'Epoch {epoch} began')

        running_loss = 0.0
        running_corrects = 0

        # training phase
        # NOTE(review): when the number of batches is not a multiple of
        # accumulation_steps, the gradients of the trailing batches carry
        # over into the next epoch - confirm this is acceptable.
        for idx, data in enumerate(Bar(dataset['train_dataloader'])):
            inputs = data.get('image').to(device)
            target = data.get('target').to(device)

            # forward pass
            output = model(inputs)
            _, preds = torch.max(output, 1)
            loss = criterion(output, target).mean()

            # Only the back-propagated loss is scaled for accumulation; the
            # logged loss stays unscaled so it is comparable with validation.
            (loss / args.accumulation_steps).backward()
            if (idx + 1) % args.accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == target).item()

        # log training stats
        train_epoch_loss = running_loss / len(dataset['train_data'])
        train_epoch_acc = float(running_corrects) / len(dataset['train_data'])
        print('Epoch [{}/{}], training loss:{:.4f}'.format(epoch+1, args.num_epochs, train_epoch_loss))
        print('Epoch [{}/{}], training accuracy:{:.4f}'.format(epoch+1, args.num_epochs, train_epoch_acc))

        # validation phase
        running_loss = 0.0
        running_corrects = 0
        with torch.no_grad():
            for idx, data in enumerate(Bar(dataset['val_dataloader'])):
                inputs = data.get('image').to(device)
                target = data.get('target').to(device)

                output = model(inputs)
                _, preds = torch.max(output, 1)
                loss = criterion(output, target).mean()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == target).item()

        # log validation stats
        valid_epoch_loss = running_loss / len(dataset['val_data'])
        valid_epoch_acc = float(running_corrects) / len(dataset['val_data'])
        print('Epoch [{}/{}], validation loss:{:.4f}'.format(epoch+1, args.num_epochs, valid_epoch_loss))
        print('Epoch [{}/{}], validation accuracy:{:.4f}'.format(epoch+1, args.num_epochs, valid_epoch_acc))

        # append to experiment report (plain numbers, file closed right away)
        with open(f, "a") as report:
            print(f'{epoch+1}\t{train_epoch_loss}\t{train_epoch_acc}\t{valid_epoch_loss}\t{valid_epoch_acc}', file=report)

        # save best weights
        if valid_epoch_acc > best_acc:
            best_acc = valid_epoch_acc
            best_weights = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), f'models/{args.dataset}/{experiment}.pth')

    time_elapsed = time.time() - start
    with open(f, "a") as report:
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60), file=report)
        print('Best val Acc: {:4f}'.format(best_acc), file=report)

    # load_state_dict expects a state dict, not a file path; the in-memory
    # copy also works when no epoch improved and no file was written.
    model.load_state_dict(best_weights)
    return model
def train_model(args):
    """Train and evaluate ``MultiSpectrogramModel`` on IEMOCAP mel spectrograms.

    Reads from ``args``: ``batch_size``, ``out_channels1/2``,
    ``kernel_size_cnn1/2``, ``stride_size_cnn1/2``, ``kernel_size_pool1/2``,
    ``stride_size_pool1/2`` and ``weight``. The model is wrapped in
    ``DataParallel`` over GPUs 0-3 and trained with Adam for a fixed number
    of epochs. Summary statistics are appended to a hard-coded stats file.
    """
    stats_path = "/scratch/speech/models/classification/spec_multi_joint_stats_weight.txt"

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device_ids = [0, 1, 2, 3]
    batch_size = args.batch_size
    input_channels = 1
    out_channels = [args.out_channels1, args.out_channels2]
    kernel_size_cnn = [[args.kernel_size_cnn1, args.kernel_size_cnn2], [args.kernel_size_cnn2, args.kernel_size_cnn1]]
    stride_size_cnn = [[args.stride_size_cnn1, args.stride_size_cnn2], [args.stride_size_cnn2, args.stride_size_cnn1]]
    kernel_size_pool = [[args.kernel_size_pool1, args.kernel_size_pool2], [args.kernel_size_pool2, args.kernel_size_pool1]]
    stride_size_pool = [[args.stride_size_pool1, args.stride_size_pool2], [args.stride_size_pool2, args.stride_size_pool1]]
    hidden_dim = 200
    num_layers = 2
    dropout = 0
    num_labels = 4
    hidden_dim_lstm = 200
    epoch_num = 50
    num_layers_lstm = 2
    nfft = [512, 1024]
    weight = args.weight

    model = MultiSpectrogramModel(input_channels, out_channels, kernel_size_cnn, stride_size_cnn, kernel_size_pool,
                                  stride_size_pool, hidden_dim, num_layers, dropout, num_labels, batch_size,
                                  hidden_dim_lstm, num_layers_lstm, device, nfft, weight, False)

    n_params = str(sum(p.numel() for p in model.parameters() if p.requires_grad))
    print("============================ Number of parameters ====================================")
    print(n_params)

    path = "batch_size:{};out_channels:{};kernel_size_cnn:{};stride_size_cnn:{};kernel_size_pool:{};stride_size_pool:{}; weight:{}".format(args.batch_size, out_channels, kernel_size_cnn, stride_size_cnn, kernel_size_pool, stride_size_pool, weight)
    with open(stats_path, "a+") as f:
        f.write("\n"+"============ model starts ===========")
        f.write("\n"+"model_parameters: "+n_params+"\n"+path+"\n")

    model.cuda()
    model = DataParallel(model, device_ids=device_ids)
    model.train()

    # Adam with learning rate 0.001.
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Load the training and test data.
    training_data = IEMOCAP(name='mel', nfft=nfft, train=True)
    train_loader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=True, collate_fn=my_collate, num_workers=0, drop_last=True)
    testing_data = IEMOCAP(name='mel', nfft=nfft, train=False)
    test_loader = DataLoader(dataset=testing_data, batch_size=batch_size, shuffle=True, collate_fn=my_collate, num_workers=0, drop_last=True)

    test_acc = []
    train_acc = []
    test_loss = []
    train_loss = []
    for epoch in range(epoch_num):
        losses = 0
        correct = 0
        model.train()
        for j, (input_lstm, input1, input2, target, seq_length) in enumerate(train_loader):
            model.zero_grad()
            losses_batch, correct_batch = model(input_lstm, input1, input2, target, seq_length)
            loss = torch.mean(losses_batch, dim=0)
            correct_batch = torch.sum(correct_batch, dim=0)
            losses += loss.item() * batch_size
            loss.backward()
            # Squashed value of the "weight" entry, kept as a Python float for
            # inspection; it is not used further in this function.
            # ``.item()`` now applies to the whole sigmoid, not just the
            # denominator.
            weight = torch.sigmoid(10 * model.module.state_dict()["weight"]).item()
            optimizer.step()
            correct += correct_batch.item()
        # drop_last=True makes every batch hold exactly batch_size samples.
        accuracy = correct * 1.0 / ((j + 1) * batch_size)
        losses = losses / ((j + 1) * batch_size)

        losses_test = 0
        correct_test = 0
        model.eval()
        with torch.no_grad():
            for j, (input_lstm, input1, input2, target, seq_length) in enumerate(test_loader):
                losses_batch, correct_batch = model(input_lstm, input1, input2, target, seq_length)
                loss = torch.mean(losses_batch, dim=0)
                correct_batch = torch.sum(correct_batch, dim=0)
                losses_test += loss.item() * batch_size
                correct_test += correct_batch.item()

        accuracy_test = correct_test * 1.0 / ((j + 1) * batch_size)
        losses_test = losses_test / ((j + 1) * batch_size)

        # data gathering
        test_acc.append(accuracy_test)
        train_acc.append(accuracy)
        test_loss.append(losses_test)
        train_loss.append(losses)
        print("Epoch: {}-----------Training Loss: {} -------- Testing Loss: {} -------- Training Acc: {} -------- Testing Acc: {}".format(epoch+1, losses, losses_test, accuracy, accuracy_test)+"\n")
        with open(stats_path, "a+") as f:
            # Only the last epoch writes the summary lines.
            if epoch == epoch_num - 1:
                f.write("Best Accuracy:{:06.5f}".format(max(test_acc))+"\n")
                f.write("Average Top 10 Accuracy:{:06.5f}".format(np.mean(np.sort(np.array(test_acc))[-10:]))+"\n")
                f.write("=============== model ends ==================="+"\n")
    print("success:{}, Best Accuracy:{}".format(path, max(test_acc)))
def regulization(self, model, Lambda):
    """Return the L1 penalty ``Lambda * sum(|w|)`` over every parameter of ``model``.

    Parameters
    ----------
    model
        Object exposing ``parameters()`` that yields tensors.
    Lambda
        Multiplier applied to the summed absolute values.

    Returns
    -------
    A 0-dim tensor. When ``model`` has no parameters the penalty is zero
    (the concatenation of an empty list would otherwise raise).
    """
    # reshape() instead of view(): view() raises on non-contiguous parameters
    # (e.g. a parameter built from a transposed tensor).
    flat = [p.reshape(-1) for p in model.parameters()]
    if not flat:
        return torch.zeros(()) * Lambda
    return Lambda * torch.sum(torch.abs(torch.cat(flat)))
def k2(kesi, f_x, f_y, mean_logk, lamda):
    """Evaluate ``exp(mean_logk + sum(sqrt(lamda) * f_x * f_y * kesi, dim=1))``.

    The four tensors are multiplied element-wise (with broadcasting), the
    product is summed over dimension 1, shifted by ``mean_logk`` and
    exponentiated.
    # NOTE(review): the argument names suggest a truncated eigen-expansion of a
    # log-field (eigenvalues ``lamda``, random coefficients ``kesi``) - confirm.
    """
    weighted_terms = torch.sqrt(lamda) * f_x * f_y * kesi
    log_value = mean_logk + weighted_terms.sum(1)
    return log_value.exp()
def train(x, y):
    """Run one optimisation step on a batch and return the averaged losses.

    ``x`` is indexed by time step (``x[i]``) and ``y`` by reference sequence
    then time step (``y[m][i]``, five references are read). For every step
    the predictor is sampled five times from the same hidden state, the
    sample closest to the encoded target is decoded, and two terms are
    accumulated: a reconstruction/expectation term (``mse``) and a
    variation-matching term (``var``).

    Relies on module-level networks, optimizers, ``opt``, ``mse_criterion``
    and ``reparameterize``.

    Returns
    -------
    (mse, var) as numpy values, each divided by ``opt.n_past + opt.n_future``.
    """
    frame_predictor.zero_grad()
    posterior_mu.zero_grad()
    prior_mu.zero_grad()
    # var_encoder takes part in the graph (through z_t) and its optimizer is
    # stepped below, so its gradients must be cleared too; otherwise they
    # accumulate from one call to the next.
    var_encoder.zero_grad()
    encoder.zero_grad()
    decoder.zero_grad()

    # initialize the hidden state.
    frame_predictor.hidden = frame_predictor.init_hidden()
    posterior_mu.hidden = posterior_mu.init_hidden()
    prior_mu.hidden = prior_mu.init_hidden()

    mse = 0
    var = 0
    h_match_prev = [encoder(y[m][0])[0].detach() for m in range(5)]
    for i in range(1, opt.n_past + opt.n_future):
        h = encoder(x[i - 1])
        h_target = encoder(x[i])[0]
        h_match = [encoder(y[t][i])[0].detach() for t in range(5)]
        if opt.last_frame_skip or i < opt.n_past:
            h, skip = h
        else:
            h = h[0]

        # Frame-to-frame change of each reference sequence (detached features);
        # computed once and reused by every consumer below.
        motion = [h_match[m] - h_match_prev[m] for m in range(5)]

        ## Our work: at each time stamp we predict 5 random results ##
        select_tensor = torch.zeros((opt.batch_size, 5, opt.g_dim)).cuda()
        err_list = []
        pred_list = []
        # Snapshot of the predictor state so each of the 5 samples starts
        # from the same hidden state.
        pre_hidden = [(frame_predictor.hidden[0][0].clone(), frame_predictor.hidden[0][1].clone()),
                      (frame_predictor.hidden[1][0].clone(), frame_predictor.hidden[1][1].clone())]
        z_var = torch.mean(
            torch.cat([var_encoder(motion[m] + h).unsqueeze(1) for m in range(5)], 1), 1)
        mu = posterior_mu(h_target)
        mu_p = prior_mu(torch.cat(motion + [h], -1))
        for _ in range(5):
            frame_predictor.hidden = [(pre_hidden[0][0].clone(), pre_hidden[0][1].clone()),
                                      (pre_hidden[1][0].clone(), pre_hidden[1][1].clone())]
            z_t = reparameterize(mu, z_var)
            h_pred = frame_predictor(torch.cat(motion + [h, z_t], 1))
            pred_list.append(h_pred.unsqueeze(1))
            err_list.append(torch.mean(torch.abs(h_pred - h_target), -1))

        ## Our work: select the best match one as prediction ##
        err_tensor = torch.cat([err.unsqueeze(-1) for err in err_list], -1)
        min_idx = torch.argmin(err_tensor, -1)
        for bs in range(opt.batch_size):
            select_tensor[bs, min_idx[bs], :] = 1.0
        # One-hot selection: only the best sample per batch item survives the sum.
        h_pred = torch.sum(torch.cat(pred_list, 1) * select_tensor.detach(), 1)
        x_pred = decoder([h_pred, skip])

        ## Our work: match with the expection ##
        mse += (mse_criterion(x_pred, x[i]) + opt.alpha * mse_criterion(mu, mu_p))

        ## Our work: match with the variation ##
        ref_var = torch.std(torch.cat([d.unsqueeze(1) for d in motion], 1), 1)
        pre_var = torch.std(
            torch.cat([pred_list[m] - h.unsqueeze(1) for m in range(5)], 1), 1)
        var += mse_criterion(ref_var, pre_var)
        h_match_prev = h_match

    loss = mse + var * opt.beta
    loss.backward()

    frame_predictor_optimizer.step()
    posterior_mu_optimizer.step()
    prior_mu_optimizer.step()
    var_encoder_optimizer.step()
    encoder_optimizer.step()
    decoder_optimizer.step()

    n_steps = opt.n_past + opt.n_future
    return mse.data.cpu().numpy() / n_steps, var.data.cpu().numpy() / n_steps
def test_headmasking(self):
    """Check that a head mask zeroes the masked heads and lets gradients flow.

    For every model class a mask is built with head 0 of the first layer and
    all but the last head of the last layer disabled. The test verifies that
    a gradient reaches the mask and that the attention maps of the masked
    heads sum to zero while unmasked ones do not.
    """
    if not self.test_head_masking:
        return

    # Fixed seed only while the inputs are drawn, then re-seed.
    global_rng.seed(42)
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    global_rng.seed()

    config.output_attentions = True
    config.output_hidden_states = True
    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan

    for model_class in self.all_model_classes:
        model = model_class(config=configs_no_init)
        model.to(torch_device)
        model.eval()

        # Prepare head_mask
        # Set require_grad after having prepared the tensor to avoid error
        # (leaf variable has been moved into the graph interior)
        head_mask = torch.ones(self.model_tester.num_hidden_layers,
                               self.model_tester.num_attention_heads,
                               device=torch_device)
        head_mask[0, 0] = 0
        head_mask[-1, :-1] = 0
        head_mask.requires_grad_(requires_grad=True)

        inputs = inputs_dict.copy()
        inputs['head_mask'] = head_mask
        outputs = model(**inputs)

        # Test that we can get a gradient back for importance score computation
        total = sum(t.sum() for t in outputs[0]).sum()
        total.backward()
        multihead_outputs = head_mask.grad

        attentions = outputs[-1]
        hidden_states = outputs[-2]  # indexed as in the original; not inspected further

        # Remove Nan
        for att in attentions:
            # Check we don't have more than 25% nans (arbitrary)
            self.assertLess(torch.sum(torch.isnan(att)), att.numel() / 4)
        # remove them (the test is less complete)
        attentions = [att.masked_fill(torch.isnan(att), 0.0) for att in attentions]

        def head_total(layer, head):
            # Sum of one head's attention map in one layer.
            return attentions[layer][..., head, :, :].flatten().sum().item()

        self.assertIsNotNone(multihead_outputs)
        self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
        self.assertAlmostEqual(head_total(0, 0), 0.0)
        self.assertNotEqual(head_total(0, -1), 0.0)
        self.assertNotEqual(head_total(1, 0), 0.0)
        self.assertAlmostEqual(head_total(-1, -2), 0.0)
        self.assertNotEqual(head_total(-1, -1), 0.0)
def discriminator_criterion(
        G,
        D,
        reals,
        alpha=1.,
        device='cuda',
        wgan_lambda=10.0,  # Weight for the gradient penalty term.
        wgan_epsilon=0.001,  # Weight for the epsilon term, \epsilon_{drift}.
        wgan_target=1.0,  # Target value for gradient magnitudes.
        latent_size=512):  # Width of the latent vectors fed to G.
    """
    WGAN-GP discriminator loss with an epsilon (drift) penalty.

    PyTorch port of the TensorFlow ``D_wgangp_acgan`` loss, without the
    label-conditioning term and without dynamic loss scaling.

    Parameters
    ----------
    G
        Generator, called as ``G(latents, alpha=alpha)``.
    D
        Discriminator, called as ``D(images, alpha=alpha)``.
    reals
        Batch of real images; its first dimension is the batch size. The
        gradient norm is taken over dims 1-3, so images must be 4-D.
    alpha
        Forwarded unchanged to ``G`` and ``D``.
    device
        Device the sampled latents and mixing factors are moved to.
    wgan_lambda
        Weight for the gradient penalty term.
    wgan_epsilon
        Weight for the epsilon (drift) term.
    wgan_target
        Target value for gradient magnitudes.
    latent_size
        Number of latent dimensions sampled per image (default 512).

    Returns
    -------
    Scalar tensor: the batch mean of
    ``D(fake) - D(real) + gradient_penalty + epsilon_penalty``.
    """
    batch_size = reals.shape[0]
    latents = torch.randn([batch_size, latent_size]).to(device)
    fake_images_out = G(latents, alpha=alpha)
    real_scores_out = D(reals, alpha=alpha)
    fake_scores_out = D(fake_images_out, alpha=alpha)
    loss = fake_scores_out - real_scores_out

    # Gradient penalty, evaluated on random per-image interpolations between
    # real and generated images.
    mixing_factors = torch.rand([batch_size, 1, 1, 1]).to(device)
    mixed_images_out = torch.lerp(reals, fake_images_out, mixing_factors).to(device)
    mixed_scores_out = D(mixed_images_out, alpha=alpha)
    # originally wrapped in loss_scaling, but appears to not use it
    mixed_loss = torch.sum(mixed_scores_out)
    grad_outputs = torch.ones(mixed_loss.size()).to(device)
    # create_graph=True keeps the penalty differentiable w.r.t. D's weights.
    mixed_grads = torch.autograd.grad(mixed_loss,
                                      mixed_images_out,
                                      grad_outputs=grad_outputs,
                                      create_graph=True)[0]
    mixed_norms = torch.sqrt(torch.sum(mixed_grads**2, dim=[1, 2, 3]))
    gradient_penalty = ((mixed_norms - wgan_target)**2).reshape(-1, 1)
    loss += gradient_penalty * (wgan_lambda / (wgan_target**2))

    # Epsilon penalty: keeps the real scores from drifting away from zero.
    epsilon_penalty = real_scores_out**2
    loss += epsilon_penalty * wgan_epsilon

    # added mean in pytorch implementation; tf did not have!
    return loss.mean()
def forward(self, visible, training=False):
    """Run the recurrent model over one sequence, one time step at a time.

    For every step ``t >= 1`` the step-dependent biases are computed from the
    previous recurrent state, a 20-step Gibbs chain is run starting from the
    observed frame ``visible[t]``, and the recurrent state is advanced. When
    ``training`` is true, the free-energy difference between the observed
    frame and the chain's final sample is back-propagated and
    ``self.optimizer`` is stepped once per time step.

    Returns a tuple ``(total_cost, mse, reg_term)``:
    ``total_cost`` is the summed cross-entropy between each frame and
    ``sigmoid(bv_t)``, divided by the number of time steps, plus ``reg_term``;
    ``mse`` is a list with the mean absolute difference between each frame and
    its final Gibbs sample; ``reg_term`` is
    ``(norm(wuv) + norm(wuh)) * reg_factor``.
    """
    def get_rnn_hidden(v_t, u_tm1):
        # Next recurrent state from the current frame and the previous state.
        activation = torch.matmul(self.wvu, v_t) + torch.matmul(self.wuu, u_tm1) + self.bu
        return torch.tanh(activation)
    time_steps = visible.shape[0]
    u_tm1 = self.u0
    # NOTE(review): sum1 / sum2 are accumulated below but never read.
    sum1 = 0.0
    sum2 = 0.0
    total_cost = 0
    cost = 0
    mse = []
    # Step 0 has no predecessor; the loop starts at t = 1 with state u0.
    for t in range(1, time_steps):
        v_t = visible[t]
        v_tm1 = visible[t - 1]  # NOTE(review): unused since the chain starts from v_t
        #bh_t, bv_t = self.get_bias(u_tm1)
        # Hidden / visible biases conditioned on the previous recurrent state.
        bh_t = F.linear(u_tm1, self.wuh, self.bh)
        bv_t = F.linear(u_tm1, self.wuv, self.bv)
        ## gibbs sampling, started from the observed frame v_t (20 steps)
        # _, negative_sample = self.gibbs_sample(v_tm1, self.w, bh_t, bv_t, num_steps=20)
        v_ = v_t
        for _ in range(20):
            pre_h_, h_ = self.v_to_h(v_, bh_t)
            pre_v_, v_ = self.h_to_v(h_, bv_t)
        negative_sample = v_
        #mean_v = self.gibbs_step(negative_sample)[0]
        if training:
            self.optimizer.zero_grad()
            cost = self.free_energy(v_t) - self.free_energy(negative_sample)
            # The graph is kept for every step except the last one.
            cost.backward(retain_graph=(t != time_steps-1))
            #print(self.u0.grad)
            self.optimizer.step()
        # RBM Loss
        # cost += self.free_energy(v_t) - self.free_energy(negative_sample)
        # RNN Loss: binary cross-entropy between the frame and sigmoid(bv_t);
        # the 1e-6 terms keep the logarithms finite.
        y_t = torch.sigmoid(bv_t)
        total_cost += torch.sum(-v_t * torch.log(1e-6 + y_t) - (1 - v_t) * torch.log(1e-6 + 1 - y_t))
        ''' cost = self.free_energy(v_t) - self.free_energy(negative_sample) if training: self.optimizer.zero_grad() cost.backward(retain_graph=(t != time_steps-1)) self.optimizer.step() '''
        mse.append(torch.abs(v_t - negative_sample).mean().item())
        sum1 += v_t
        sum2 += negative_sample
        ut = get_rnn_hidden(v_t, u_tm1)
        u_tm1 = ut
    # regularization term
    # NOTE(review): divides by time_steps although only time_steps - 1 terms
    # were accumulated - confirm this is intended.
    total_cost /= time_steps
    reg_term = (torch.norm(self.wuv)+torch.norm(self.wuh) ) * self.reg_factor
    total_cost += reg_term
    return total_cost, mse,reg_term
def train(self, v0, vk, ph0, phk):
    """Apply one in-place update to ``self.W``, ``self.b`` and ``self.a``.

    ``W`` receives ``v0^T ph0 - vk^T phk``; ``b`` and ``a`` receive the sums
    over dimension 0 of ``v0 - vk`` and ``ph0 - phk`` respectively.
    # NOTE(review): looks like a contrastive-divergence step with no learning
    # rate; v0/vk presumably are the data and the k-step sample - confirm.
    """
    positive_assoc = torch.mm(v0.t(), ph0)
    negative_assoc = torch.mm(vk.t(), phk)
    self.W += positive_assoc - negative_assoc

    visible_delta = (v0 - vk).sum(0)
    self.b += visible_delta

    hidden_delta = (ph0 - phk).sum(0)
    self.a += hidden_delta
def plot(x, y, epoch):
    """Generate 20 sample sequences for a batch and save them as PNG and GIF.

    For the first ``opt.n_past`` steps the ground-truth frames are fed in
    (the networks are still run so their recurrent state advances);
    afterwards each frame is decoded from the predictor output and fed back.
    For up to 10 batch items the output shows the ground truth, the sample
    with the smallest summed squared error, and four randomly drawn samples.

    Relies on module-level networks, ``opt``, ``utils`` and ``reparameterize``.
    """
    nsample = 20
    gen_seq = [[] for _ in range(nsample)]
    gt_seq = [x[t] for t in range(len(x))]

    for sample in gen_seq:
        frame_predictor.hidden = frame_predictor.init_hidden()
        posterior_mu.hidden = posterior_mu.init_hidden()
        prior_mu.hidden = prior_mu.init_hidden()
        sample.append(x[0])
        x_in = x[0]
        h_match_prev = [encoder(y[m][0])[0].detach() for m in range(5)]
        for i in range(1, opt.n_eval):
            h_match = [encoder(y[m][i])[0].detach() for m in range(5)]
            h = encoder(x_in)
            if opt.last_frame_skip or i < opt.n_past:
                h, skip = h
            else:
                h, _ = h
            h = h.detach()

            conditioning = i < opt.n_past
            if conditioning:
                h_target = encoder(x[i])[0].detach()
                mu = posterior_mu(h_target)

            # Frame-to-frame change of every reference sequence.
            motion = [cur - prev for cur, prev in zip(h_match, h_match_prev)]
            mu_p = prior_mu(torch.cat(motion + [h], -1))
            ref_var = torch.mean(
                torch.cat([var_encoder(delta + h).unsqueeze(1) for delta in motion], 1), 1)

            if conditioning:
                z_t = reparameterize(mu, ref_var)
                # Output discarded: called to advance the predictor's hidden state.
                frame_predictor(torch.cat(motion + [h, z_t], 1))
                x_in = x[i]
            else:
                z_t = reparameterize(mu_p, ref_var)
                h = frame_predictor(torch.cat(motion + [h, z_t], 1)).detach()
                x_in = decoder([h, skip]).detach()
            sample.append(x_in)
            h_match_prev = h_match

    to_plot = []
    gifs = [[] for _ in range(opt.n_eval)]
    nrow = min(opt.batch_size, 10)
    steps = range(opt.n_eval)
    for i in range(nrow):
        # ground truth sequence
        to_plot.append([gt_seq[t][i] for t in steps])

        # best sequence
        min_mse = 1e7
        for s in range(nsample):
            mse = 0
            for t in steps:
                diff = gt_seq[t][i].data.cpu() - gen_seq[s][t][i].data.cpu()
                mse += torch.sum(diff**2)
            if mse < min_mse:
                min_mse = mse
                min_idx = s

        s_list = [min_idx] + [np.random.randint(nsample) for _ in range(4)]
        for s in s_list:
            to_plot.append([gen_seq[s][t][i] for t in steps])
        for t in steps:
            gifs[t].append([gt_seq[t][i]] + [gen_seq[s][t][i] for s in s_list])

    fname = '%s/gen/sample_%d.png' % (opt.log_dir, epoch)
    utils.save_tensors_image(fname, to_plot)

    fname = '%s/gen/sample_%d.gif' % (opt.log_dir, epoch)
    utils.save_gif(fname, gifs)
def _run_counts_and_positions(sorted_keys):
    """Return ``(counts, sub_idx)`` for a sorted 1-D key tensor.

    ``counts[k]`` is the length of the k-th run of equal keys and
    ``sub_idx[j]`` is the position of element ``j`` inside its run.
    """
    device = sorted_keys.device
    n = sorted_keys.shape[0]
    # indices of first occurences of each key
    idx_first = torch.where(
        torch.nn.functional.pad(sorted_keys[1:] != sorted_keys[:-1],
                                (1, 0), value=1))[0]
    run_len = idx_first[1:] - idx_first[:-1]
    # cumsum(offsets)[j] is the start index of the run containing j
    offsets = torch.zeros(n, device=device, dtype=torch.long)
    offsets[idx_first[1:]] = run_len
    sub_idx = (torch.arange(n, device=device, dtype=torch.long) -
               torch.cumsum(offsets, dim=0))
    counts = torch.cat([run_len, n - idx_first[-1:]], dim=0)
    return counts, sub_idx


def compute_adjacency_info(vertices: torch.Tensor, faces: torch.Tensor):
    """Build data structures to help speed up connectivity queries.

    Assumes a homogeneous mesh, i.e., each face has the same number of
    vertices. Only ``vertices.shape[0]`` and ``vertices.device`` are read
    from ``vertices``; ``faces`` holds vertex indices, one row per face.

    Each adjacency is returned as a pair ``AA, AA_count``:

        AA_count: [count_0, ..., count_n]

    with AA padded with -1:

        AA: [[aa_{0,0}, ..., aa_{0,count_0} (, -1, ..., -1)],
             [aa_{1,0}, ..., aa_{1,count_1} (, -1, ..., -1)],
             ...
             [aa_{n,0}, ..., aa_{n,count_n} (, -1, ..., -1)]]

    Returns
    -------
    tuple
        ``(edge2key, edges, vv, nb_edges_per_vertex, ve, nb_edges_per_vertex,
        vf, nb_faces_per_vertex, ff, nb_faces_per_face, ee, nb_edges_per_edge,
        ef, nb_faces_per_edge)`` where ``edges`` are the unique edges (sorted
        vertex pairs), ``edge2key`` maps an edge tuple to its row in
        ``edges``, and ``xy`` is the x-to-y adjacency (v: vertex, e: edge,
        f: face).
    """
    device = vertices.device
    facesize = faces.shape[1]
    nb_vertices = vertices.shape[0]
    nb_faces = faces.shape[0]
    edges = torch.cat([faces[:, i:i + 2] for i in range(facesize - 1)] +
                      [faces[:, [-1, 0]]], dim=0)
    # Sort the vertex of edges in increasing order
    edges = torch.sort(edges, dim=1)[0]
    # id of corresponding face in edges
    face_ids = torch.arange(nb_faces, device=device,
                            dtype=torch.long).repeat(facesize)
    # remove multiple occurences and sort by the first vertex
    # the edge key / id is fixed from now as the first axis position
    # edges_ids will give the key of the edges on the original vector
    edges, edges_ids = torch.unique(edges, sorted=True, return_inverse=True, dim=0)
    nb_edges = edges.shape[0]

    # EDGE2FACE
    sorted_edges_ids, order_edges_ids = torch.sort(edges_ids)
    sorted_faces_ids = face_ids[order_edges_ids]
    nb_faces_per_edge, sub_idx = _run_counts_and_positions(sorted_edges_ids)
    max_sub_idx = int(torch.max(nb_faces_per_edge))
    ef = torch.zeros((nb_edges, max_sub_idx), device=device, dtype=torch.long) - 1
    ef[sorted_edges_ids, sub_idx] = sorted_faces_ids

    # FACE2FACES
    nb_faces_per_face = (torch.stack([
        nb_faces_per_edge[edges_ids[i * nb_faces:(i + 1) * nb_faces]]
        for i in range(facesize)
    ], dim=1).sum(dim=1) - facesize)
    ff = torch.cat([
        ef[edges_ids[i * nb_faces:(i + 1) * nb_faces]]
        for i in range(facesize)
    ], dim=1)
    # remove self occurences
    ff[ff == torch.arange(nb_faces, device=device, dtype=torch.long).view(-1, 1)] = -1
    ff = torch.sort(ff, dim=-1, descending=True)[0]
    # a neighbour reached through several shared edges is counted once
    to_del = (ff[:, 1:] == ff[:, :-1]) & (ff[:, 1:] != -1)
    ff[:, 1:][to_del] = -1
    nb_faces_per_face = nb_faces_per_face - torch.sum(to_del, dim=1)
    max_sub_idx = int(torch.max(nb_faces_per_face))
    ff = torch.sort(ff, dim=-1, descending=True)[0][:, :max_sub_idx]

    # VERTEX2VERTICES and VERTEX2EDGES
    npy_edges = edges.cpu().numpy()
    edge2key = {tuple(npy_edges[i]): i for i in range(nb_edges)}
    # _edges and double_edges 2nd axis correspond to the triplet:
    # [left vertex, right vertex, edge key]
    _edges = torch.cat(
        [edges, torch.arange(nb_edges, device=device).view(-1, 1)], dim=1)
    double_edges = torch.cat([_edges, _edges[:, [1, 0, 2]]], dim=0)
    double_edges = torch.unique(double_edges, sorted=True, dim=0)
    nb_edges_per_vertex, sub_idx = _run_counts_and_positions(double_edges[:, 0])
    max_sub_idx = int(torch.max(nb_edges_per_vertex))
    vv = torch.zeros((nb_vertices, max_sub_idx), device=device, dtype=torch.long) - 1
    vv[double_edges[:, 0], sub_idx] = double_edges[:, 1]
    ve = torch.zeros((nb_vertices, max_sub_idx), device=device, dtype=torch.long) - 1
    ve[double_edges[:, 0], sub_idx] = double_edges[:, 2]

    # EDGE2EDGES
    ee = torch.cat([ve[edges[:, 0], :], ve[edges[:, 1], :]], dim=1)
    nb_edges_per_edge = (nb_edges_per_vertex[edges[:, 0]] +
                         nb_edges_per_vertex[edges[:, 1]] - 2)
    max_sub_idx = int(torch.max(nb_edges_per_edge))
    # remove self occurences
    ee[ee == torch.arange(nb_edges, device=device, dtype=torch.long).view(-1, 1)] = -1
    ee = torch.sort(ee, dim=-1, descending=True)[0][:, :max_sub_idx]

    # VERTEX2FACES
    vertex_ordered, order_vertex = torch.sort(faces.reshape(-1))
    # Floor division keeps the face ids integral; `/` is true division on
    # current PyTorch and would yield floats that cannot be stored in `vf`.
    face_ids_in_vertex_order = order_vertex // facesize
    # TODO(cfujitsang): it seems that nb_faces_per_vertex == nb_edges_per_vertex ?
    nb_faces_per_vertex, sub_idx = _run_counts_and_positions(vertex_ordered)
    max_sub_idx = int(torch.max(nb_faces_per_vertex))
    vf = torch.zeros((nb_vertices, max_sub_idx), device=device, dtype=torch.long) - 1
    vf[vertex_ordered, sub_idx] = face_ids_in_vertex_order

    return (
        edge2key,
        edges,
        vv,
        nb_edges_per_vertex,
        ve,
        nb_edges_per_vertex,
        vf,
        nb_faces_per_vertex,
        ff,
        nb_faces_per_face,
        ee,
        nb_edges_per_edge,
        ef,
        nb_faces_per_edge,
    )
def get_gradient_penalty(discriminator,
                         generated_output,
                         real_output,
                         alpha=1.,
                         lambda_=10.):
    """
    Get gradient penalty.

    Evaluates the discriminator on random per-image interpolations between
    real and generated images and penalises the squared deviation of the
    input-gradient norm from 1. All tensors are created on
    ``real_output.device``, so the function works on CPU as well as CUDA.

    Parameters
    ----------
    discriminator
        Called as ``discriminator(images, alpha=alpha)``.
    generated_output
        Generated images; truncated to the real batch size when its shape
        differs from ``real_output``.
    real_output
        Real images; must be 4-D since the norm is taken over dims 1-3.
    alpha
        Forwarded unchanged to the discriminator.
    lambda_
        Weight of the penalty.

    Returns
    -------
    Scalar tensor ``mean(lambda_ * (||grad|| - 1)^2)``.
    """
    if real_output.shape != generated_output.shape:
        generated_output = generated_output[:real_output.shape[0]]

    batch_size = real_output.shape[0]
    device = real_output.device

    # get epsilon
    # each image receives its own epsilon
    # (e.g., image 1 eps == .8047, image 2 eps == .1988, etc.),
    # stretched to the dim of each image
    epsilon = torch.rand(batch_size, 1, 1, 1).expand(real_output.shape).to(device)

    # get interpolation; .data cuts it off from the generator graph
    interpolation = (epsilon * real_output.data +
                     (1 - epsilon) * generated_output.data)
    interpolation.requires_grad = True

    # get interpolation logits
    interpolation_logits = discriminator(interpolation, alpha=alpha)

    # get gradients
    grad_outputs = torch.ones(interpolation_logits.size(), device=device)
    gradients = torch.autograd.grad(outputs=interpolation_logits,
                                    inputs=interpolation,
                                    grad_outputs=grad_outputs,
                                    create_graph=True,
                                    retain_graph=True)[0]
    # NOTE(review): detaching here makes the returned penalty a constant with
    # respect to the discriminator's weights, although create_graph=True was
    # requested above. Kept as-is for existing callers - confirm the intent.
    gradients = gradients.detach()

    # get gradient penalty
    mixed_norms = torch.sqrt(torch.sum(gradients**2, dim=[1, 2, 3]))
    gradient_penalty = (mixed_norms - 1)**2

    # release the interpolation buffer before returning
    del interpolation
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return (gradient_penalty * (lambda_ / 1.**2)).mean()
x = self.fc4(x) return x sae = SAE() criterion = nn.MSELoss() optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5) nb_epoch = 200 for epoch in range(1, nb_epoch + 1): train_loss = 0 s = 0. for id_user in range(nb_users): input = Variable(training_set[id_user]).unsqueeze(0) target = input.clone() if torch.sum(target.data > 0) > 0: output = sae(input) target.require_grad = False output[target == 0] = 0 loss = criterion(output, target) mean_corrector = nb_movies / \ float(torch.sum(target.data > 0) + 1e-10) loss.backward() train_loss += np.sqrt(loss.data * mean_corrector) s += 1. optimizer.step() print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss / s)) test_loss = 0 s = 0. for id_user in range(nb_users):
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Classify a batch of token sequences.

    The embedded text is optionally extended with ELMo and/or with the
    output of a pre-trained translation encoder (selected by the external
    ``cnn`` / ``lstm`` names), encoded, self-attended (biattention of the
    sequence with itself), integrated, pooled four ways (max, min, mean,
    self-attentive) and projected to class logits.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``. The ``"elmo"`` entry is
        removed temporarily and restored before returning.
    label : torch.LongTensor, optional (default = None)
        A variable representing the label for each instance in the batch.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        The unnormalised class scores.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_classes)`` representing a
        distribution over the label classes for each instance.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``label`` is given.

    Raises
    ------
    ConfigurationError
        If the model was built with ELMo but ``tokens`` has no ``"elmo"`` entry.
    """
    text_mask = util.get_text_field_mask(tokens).float()
    # Pop elmo tokens, since elmo embedder should not be present.
    elmo_tokens = tokens.pop("elmo", None)
    if tokens:
        embedded_text = self._text_field_embedder(tokens)
    else:
        # only using "elmo" for input
        embedded_text = None

    # Add the "elmo" key back to "tokens" if not None, since the tests and the
    # subsequent training epochs rely not being modified during forward()
    if elmo_tokens is not None:
        tokens["elmo"] = elmo_tokens

    # Create ELMo embeddings if applicable
    if self._elmo:
        if elmo_tokens is not None:
            elmo_representations = self._elmo(elmo_tokens)["elmo_representations"]
            # Pop from the end is more performant with list
            # (so the integrator representation is stored last, the input one before it)
            if self._use_integrator_output_elmo:
                integrator_output_elmo = elmo_representations.pop()
            if self._use_input_elmo:
                input_elmo = elmo_representations.pop()
            assert not elmo_representations
        else:
            raise ConfigurationError(
                "Model was built to use Elmo, but input text is not tokenized for Elmo.")

    if self._use_input_elmo:
        if embedded_text is not None:
            embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
        else:
            embedded_text = input_elmo

    # While using embeddings from the mt-cnn encoder, the hardcoded values for vocab_size can be initialised appropriately
    # NOTE(review): `cnn` and `lstm` are not defined in this method; presumably
    # module-level flags - confirm.
    # NOTE(review): the encoder is rebuilt and its weights re-read from disk on
    # every forward call; consider building it once outside forward().
    if cnn:
        embedded_text_cnn = embedded_text
        enc = Encoder(7855, 300, 600, 5, 3, 0.25, 'cuda')
        dec = Decoder(5893, 300, 600, 5, 3, 0.25, 1, 'cuda')
        cnn_model = Seq2Seq(enc, dec).cuda()
        cnn_model.load_state_dict(torch.load('../cnn_lstm_model.pt'))
        cnn_model.eval()
        # Only the first 256 embedding channels are fed to the encoder; its two
        # outputs are concatenated and appended to the original embeddings.
        v1, v2 = cnn_model.encoder(embedded_text[:,:,:256])
        v3 = torch.cat((v1,v2),2)
        embedded_text = torch.cat((embedded_text_cnn,v3),2)
    # While using embeddings from the mt-lstm encoder (either load from the saved model from the paper or the reproduced model)
    elif lstm:
        outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=None, vectors=None, layer0=True, residual_embeddings=True)
        outputs_both_layer_cove_with_glove.cuda()
        # Every sequence is passed with the full padded length as its length.
        embedded_text = outputs_both_layer_cove_with_glove(embedded_text,[embedded_text.shape[1]]*embedded_text.shape[0])

    dropped_embedded_text = self._embedding_dropout(embedded_text)
    pre_encoded_text = self._pre_encode_feedforward(dropped_embedded_text)
    encoded_tokens = self._encoder(pre_encoded_text, text_mask)

    # Compute biattention. This is a special case since the inputs are the same.
    attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous())
    attention_weights = util.masked_softmax(attention_logits, text_mask)
    encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

    # Build the input to the integrator
    integrator_input = torch.cat([encoded_tokens,
                                  encoded_tokens - encoded_text,
                                  encoded_tokens * encoded_text], 2)
    integrated_encodings = self._integrator(integrator_input, text_mask)

    # Concatenate ELMo representations to integrated_encodings if specified
    if self._use_integrator_output_elmo:
        integrated_encodings = torch.cat([integrated_encodings,
                                          integrator_output_elmo], dim=-1)

    # Simple Pooling layers
    # Masked positions are replaced by -1e7 / +1e7 so they never win the max / min.
    max_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings, text_mask.unsqueeze(2), -1e7)
    max_pool = torch.max(max_masked_integrated_encodings, 1)[0]
    min_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings, text_mask.unsqueeze(2), +1e7)
    min_pool = torch.min(min_masked_integrated_encodings, 1)[0]
    mean_pool = torch.sum(integrated_encodings, 1) / torch.sum(text_mask, 1, keepdim=True)

    # Self-attentive pooling layer
    # Run through linear projection. Shape: (batch_size, sequence length, 1)
    # Then remove the last dimension to get the proper attention shape (batch_size, sequence length).
    self_attentive_logits = self._self_attentive_pooling_projection(
        integrated_encodings).squeeze(2)
    self_weights = util.masked_softmax(self_attentive_logits, text_mask)
    self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

    pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], 1)
    pooled_representations_dropped = self._integrator_dropout(pooled_representations)

    logits = self._output_layer(pooled_representations_dropped)
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {'logits': logits, 'class_probabilities': class_probabilities}
    if label is not None:
        loss = self.loss(logits, label)
        for metric in self.metrics.values():
            metric(logits, label)
        output_dict["loss"] = loss

    return output_dict
def test(args, model, classifier, test_loader):
    """Evaluate ``classifier(model(x))`` over ``test_loader`` on the GPU.

    Both networks are put in eval mode and run without gradient tracking.
    Loss (mean cross-entropy) and accuracy are tracked with running meters
    and printed every ``args.print_freq`` batches.

    Returns
    -------
    (predictions, targets, softmax scores), each concatenated over all
    batches and moved to the CPU.
    """
    # switch to evaluate mode
    model.eval()
    classifier.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    total_pred, total_target, total_pred_score = [], [], []

    with torch.no_grad():
        end = time.time()
        for batch_idx, (samples, labels) in enumerate(tqdm(test_loader, disable=False)):
            # Cast, then move the batch to Cuda
            samples = samples.float()
            labels = labels.long()
            samples = samples.cuda()
            labels = labels.cuda()

            # compute output
            feats = model(samples)
            output = classifier(feats)
            pred_score = torch.softmax(output.detach_(), dim=-1)

            # compute loss and accuracy
            loss = F.cross_entropy(output, labels, reduction='mean')
            batch_size = labels.size(0)
            losses.update(loss.item(), batch_size)

            pred = torch.argmax(output, dim=1)
            n_correct = torch.sum(labels == pred).item()
            acc.update(n_correct / batch_size, batch_size)

            # Save pred, target to calculate metrics
            total_pred.append(pred)
            total_target.append(labels)
            total_pred_score.append(pred_score)

            # measure elapsed time
            now = time.time()
            batch_time.update(now - end)
            end = time.time()

            # print statistics and write summary every N batch
            if (batch_idx + 1) % args.print_freq == 0:
                template = ('Test: [{0}/{1}]\t'
                            'BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                            'loss {loss.val:.3f} ({loss.avg:.3f})\t'
                            'acc {acc.val:.3f} ({acc.avg:.3f})')
                print(template.format(batch_idx, len(test_loader),
                                      batch_time=batch_time, loss=losses, acc=acc))

    # Pred and target for performance metrics
    final_predictions = torch.cat(total_pred).to('cpu')
    final_targets = torch.cat(total_target).to('cpu')
    final_pred_score = torch.cat(total_pred_score).to('cpu')

    return final_predictions, final_targets, final_pred_score
def _evaluate(self, verbose=False):
    """Score the full dataset and report accuracy for several predictors.

    Accuracies on the train / valid / test index sets are computed for:
    the graph model alone ('Graph'), scores averaged over each node's
    neighbours in ``self.node2adj`` ('User'), a graph+user combination
    ('G+U'), the document-embedding classifier ('Text'), graph+text ('G+T')
    and user+text ('G+U+T').

    Returns ``(loss_train, loss_valid, loss_test, train_acc_sel,
    valid_acc_sel, test_acc_sel, preds_sel, result_table)`` where the
    ``*_sel`` values are chosen by ``self.PRED_TYPE`` and ``result_table``
    holds one ``[train, valid, test]`` accuracy row per predictor.
    """
    # Predict on full dataset
    embeds_0 = self.inputs[0]  # model.input_layer()
    embeds_1 = self.model.gc1([embeds_0] + self.inputs[1:])
    embeds_2 = self.model.gc2([embeds_1] + self.inputs[1:])
    scores = self.model.clf_bias(embeds_2)
    preds = torch.argmax(scores, dim=1)
    loss_train = self.cross_entropy_loss(scores[self.idx_train], self.labels_train)
    loss_valid = self.cross_entropy_loss(scores[self.idx_valid], self.labels_valid)
    loss_test = self.cross_entropy_loss(scores[self.idx_test], self.labels_test)
    correct_train = torch.sum(preds[self.idx_train] == self.labels_train)
    correct_valid = torch.sum(preds[self.idx_valid] == self.labels_valid)
    correct_test = torch.sum(preds[self.idx_test] == self.labels_test)
    train_acc_net = correct_train.item() / self.labels_train.size(0)
    valid_acc_net = correct_valid.item() / self.labels_valid.size(0)
    test_acc_net = correct_test.item() / self.labels_test.size(0)
    if verbose:
        print('Graph:', train_acc_net, valid_acc_net, test_acc_net)

    # Neighbour-averaged scores: each node gets the mean score of the nodes
    # listed for it in node2adj; nodes without an entry keep all-zero scores.
    # NOTE(review): 12127 (node count) and 3 (class count) are hard-coded -
    # confirm they match the dataset.
    scores_shareu = []
    scores_value = scores.data.cpu().numpy()
    for node in range(12127):
        scores_t = np.zeros(3)
        if (node not in self.node2adj):
            scores_shareu.append(scores_t)
            continue
        adj = list(self.node2adj[node])
        adj_coef = [1 / len(adj)] * len(adj)
        for user, coef in zip(adj, adj_coef):
            scores_t += coef * scores_value[user]
        scores_shareu.append(scores_t)
    scores_shareu = torch.FloatTensor(np.array(scores_shareu)).cuda()
    preds_shareu = torch.argmax(scores_shareu, dim=1)
    correct_train_shareu = torch.sum(
        preds_shareu[self.idx_train] == self.labels_train)
    correct_valid_shareu = torch.sum(
        preds_shareu[self.idx_valid] == self.labels_valid)
    correct_test_shareu = torch.sum(
        preds_shareu[self.idx_test] == self.labels_test)
    train_acc_shareu = correct_train_shareu.item(
    ) / self.labels_train.size(0)
    valid_acc_shareu = correct_valid_shareu.item(
    ) / self.labels_valid.size(0)
    test_acc_shareu = correct_test_shareu.item() / self.labels_test.size(0)
    if verbose:
        # NOTE(review): the source text is corrupted at this point ('******').
        # The rest of the 'User:' print and the code that defines
        # preds_netshareu and train/valid/test_acc_netshareu are missing and
        # must be restored from version control; the line below is not valid
        # Python as it stands.
        print('User:'******'G+U:', train_acc_netshareu, valid_acc_netshareu, test_acc_netshareu)

    # Text-only predictor: classify the document embeddings batch by batch.
    text_idx_perm = [i for i in range(self.num_docs)]
    scores_text = []
    for start in range(0, self.num_docs, self.text_batch_size):
        self.model.zero_grad()
        end = start + self.text_batch_size
        if (end > self.num_docs):
            end = self.num_docs
        doc_idx_list_raw = text_idx_perm[start:end]
        doctext_idx_list = torch.LongTensor(doc_idx_list_raw).cuda()
        batch_input = self.model.input_layer.get_doc_embed(
            doctext_idx_list)
        # torch.mm(model.gc2.W[0])
        scores_text.extend(
            list(self.model.clf_bias(batch_input).data.cpu().numpy()))
    scores_text = torch.FloatTensor(scores_text).cuda()
    preds_text = torch.argmax(scores_text, dim=1)
    # print(idx_train-num_non_docs)
    # print(preds_shareu[idx_train-num_non_docs])
    # exit()
    # Text scores are indexed by document, hence the num_non_docs offset.
    correct_train = torch.sum(
        preds_text[self.idx_train - self.num_non_docs] == self.labels_train)
    correct_valid = torch.sum(
        preds_text[self.idx_valid - self.num_non_docs] == self.labels_valid)
    correct_test = torch.sum(
        preds_text[self.idx_test - self.num_non_docs] == self.labels_test)
    train_acc_text = correct_train.item() / self.labels_train.size(0)
    valid_acc_text = correct_valid.item() / self.labels_valid.size(0)
    test_acc_text = correct_test.item() / self.labels_test.size(0)
    if verbose:
        print('Text:', train_acc_text, valid_acc_text, test_acc_text)

    # Graph + text: `scores` is modified in place from here on (`preds` was
    # computed before and is unaffected).
    scores[self.num_non_docs:] += scores_text
    preds_nettext = torch.argmax(scores, dim=1)
    correct_train = torch.sum(
        preds_nettext[self.idx_train] == self.labels_train)
    correct_valid = torch.sum(
        preds_nettext[self.idx_valid] == self.labels_valid)
    correct_test = torch.sum(
        preds_nettext[self.idx_test] == self.labels_test)
    train_acc_nettext = correct_train.item() / self.labels_train.size(0)
    valid_acc_nettext = correct_valid.item() / self.labels_valid.size(0)
    test_acc_nettext = correct_test.item() / self.labels_test.size(0)
    if verbose:
        print('G+T:', train_acc_nettext, valid_acc_nettext,
              test_acc_nettext)

    # User + text: `scores_shareu` is likewise modified in place.
    scores_shareu[self.num_non_docs:] += scores_text
    preds_all = torch.argmax(scores_shareu, dim=1)
    correct_train = torch.sum(
        preds_all[self.idx_train] == self.labels_train)
    correct_valid = torch.sum(
        preds_all[self.idx_valid] == self.labels_valid)
    correct_test = torch.sum(preds_all[self.idx_test] == self.labels_test)
    train_acc_all = correct_train.item() / self.labels_train.size(0)
    valid_acc_all = correct_valid.item() / self.labels_valid.size(0)
    test_acc_all = correct_test.item() / self.labels_test.size(0)
    if verbose:
        print('G+U+T:', train_acc_all, valid_acc_all, test_acc_all)

    # Pick the predictor whose metrics are reported as the headline numbers.
    if (self.PRED_TYPE == 'net'):
        train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
            train_acc_net, valid_acc_net, test_acc_net,
            preds.data.cpu().numpy())
    elif (self.PRED_TYPE == 'netshareu'):
        train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
            train_acc_netshareu, valid_acc_netshareu, test_acc_netshareu,
            preds_netshareu.data.cpu().numpy())
    elif (self.PRED_TYPE == 'text'):
        train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
            train_acc_text, valid_acc_text, test_acc_text,
            preds_text.data.cpu().numpy())
    elif (self.PRED_TYPE == 'all'):
        train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
            train_acc_all, valid_acc_all, test_acc_all,
            preds_all.data.cpu().numpy())
    else:
        print('wrong PRED_TYPE')
        exit()

    result_table = [
        [train_acc_net, valid_acc_net, test_acc_net],
        [train_acc_shareu, valid_acc_shareu, test_acc_shareu],
        [train_acc_netshareu, valid_acc_netshareu, test_acc_netshareu],
        [train_acc_text, valid_acc_text, test_acc_text],
        [train_acc_nettext, valid_acc_nettext, test_acc_nettext],
        [train_acc_all, valid_acc_all, test_acc_all]
    ]
    return (loss_train.item(), loss_valid.item(), loss_test.item(),
            train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel,
            result_table)
def draw_one_density_plot(self, ax, model, data_dict, traj_id,
                          multiply_by_poisson=False):
    """Draw prior-times-likelihood and encoder densities over a 2-D z0 grid.

    Two filled contour plots are drawn on ``ax``:

    * blue, opaque: ``exp(log p(z0) + log p(x | z0))`` evaluated on a
      ``50 x 50`` grid of initial latent states in ``[-5, 5]^2``
      (optionally plus the Poisson-process log-likelihood);
    * red, translucent: the encoder density ``q(z0 | x)`` on the same grid.

    Args:
        ax: Matplotlib-style axes; it is cleared with ``ax.cla()`` first.
        model: Object providing ``latent_dim``, ``input_dim``,
            ``use_poisson_proc``, ``diffeq_solver``, ``decoder``,
            ``z0_prior``, ``obsrv_std`` and ``encoder_z0``.
        data_dict: Mapping with the keys ``"data_to_predict"``,
            ``"tp_to_predict"``, ``"mask_predicted_data"``,
            ``"observed_data"``, ``"observed_tp"`` and ``"observed_mask"``.
        traj_id: Index of the trajectory to plot.
        multiply_by_poisson: If true, add the Poisson-process log-likelihood
            to the joint log-density before exponentiating.

    Raises:
        ValueError: If ``multiply_by_poisson`` is set but the model has
            ``use_poisson_proc`` disabled (there is no Poisson term to add).
    """
    if multiply_by_poisson and not model.use_poisson_proc:
        # Previously this surfaced much later as an unbound-local error.
        raise ValueError(
            "multiply_by_poisson=True requires model.use_poisson_proc")

    scale = 5
    cmap = add_white(plt.cm.get_cmap('Blues', 9))
    cmap2 = add_white(plt.cm.get_cmap('Reds', 9))

    data = data_dict["data_to_predict"]
    time_steps = data_dict["tp_to_predict"]
    mask = data_dict["mask_predicted_data"]

    observed_data = data_dict["observed_data"]
    observed_time_steps = data_dict["observed_tp"]
    observed_mask = data_dict["observed_mask"]

    npts = 50
    xx, yy, z0_grid = get_meshgrid(
        npts=npts, int_y1=(-scale, scale), int_y2=(-scale, scale))
    z0_grid = z0_grid.to(get_device(data))

    if model.latent_dim > 2:
        # Pad the remaining latent dimensions with zeros. new_zeros keeps the
        # padding on the same device (and dtype) as the grid; a plain
        # torch.zeros would live on the CPU and break torch.cat on GPU.
        padding = z0_grid.new_zeros(
            (z0_grid.size(0), model.latent_dim - 2))
        z0_grid = torch.cat((z0_grid, padding), 1)

    if model.use_poisson_proc:
        n_traj = z0_grid.size(0)
        # Append a vector of zeros to compute the integral of lambda and
        # also zeros for the first point of lambda.
        zeros = z0_grid.new_zeros(
            (n_traj, model.input_dim + model.latent_dim))
        z0_grid_aug = torch.cat((z0_grid, zeros), -1)
    else:
        z0_grid_aug = z0_grid

    # Shape of sol_y [n_traj_samples, n_samples, n_timepoints, n_latents]
    sol_y = model.diffeq_solver(z0_grid_aug.unsqueeze(0), time_steps)

    if model.use_poisson_proc:
        sol_y, log_lambda_y, int_lambda, _ = \
            model.diffeq_solver.ode_func.extract_poisson_rate(sol_y)

        # Sanity checks: the integral starts at zero and is strictly
        # positive at the last time point of the first sample.
        assert (torch.sum(int_lambda[:, :, 0, :]) == 0.)
        assert (torch.sum(int_lambda[0, 0, -1, :] <= 0) == 0.)

    pred_x = model.decoder(sol_y)

    # Replicate the selected trajectory (and its mask) once per grid point.
    one_traj = data[traj_id]
    one_traj_tiled = one_traj.repeat(npts**2, 1, 1).unsqueeze(0)
    mask_one_traj = None
    if mask is not None:
        mask_one_traj = mask[traj_id].unsqueeze(0)
        mask_one_traj = mask_one_traj.repeat(npts**2, 1, 1).unsqueeze(0)

    ax.cla()

    # Prior log-density of every grid point, summed over latent dimensions.
    prior_density_grid = model.z0_prior.log_prob(
        z0_grid.unsqueeze(0)).squeeze(0)
    prior_density_grid = torch.sum(prior_density_grid, -1)

    # =================================================
    # p(x | y(t0))
    masked_gaussian_log_density_grid = masked_gaussian_log_density(
        pred_x, one_traj_tiled,
        mask=mask_one_traj,
        obsrv_std=model.obsrv_std).squeeze(-1)

    # p(t | y(t0))
    if model.use_poisson_proc:
        poisson_info = {}
        poisson_info["int_lambda"] = int_lambda[:, :, -1, :]
        poisson_info["log_lambda_y"] = log_lambda_y

        poisson_log_density_grid = compute_poisson_proc_likelihood(
            one_traj_tiled, pred_x, poisson_info, mask=mask_one_traj)
        poisson_log_density_grid = poisson_log_density_grid.squeeze(0)

    # =================================================
    # p(x, y(t0))
    log_joint_density = prior_density_grid + masked_gaussian_log_density_grid
    if multiply_by_poisson:
        log_joint_density = log_joint_density + poisson_log_density_grid

    density_grid = torch.exp(log_joint_density)
    density_grid = torch.reshape(density_grid, (xx.shape[0], xx.shape[1]))
    density_grid = density_grid.cpu().numpy()

    ax.contourf(xx, yy, density_grid, cmap=cmap, alpha=1)

    # =================================================
    # q(y(t0) | x)
    ax.set_xlabel('z1(t0)')
    ax.set_ylabel('z2(t0)')

    # The encoder input is the observed data, with the observation mask
    # concatenated on the last axis when one is provided.
    data_w_mask = observed_data[traj_id].unsqueeze(0)
    if observed_mask is not None:
        data_w_mask = torch.cat(
            (data_w_mask, observed_mask[traj_id].unsqueeze(0)), -1)
    z0_mu, z0_std = model.encoder_z0(data_w_mask, observed_time_steps)

    if model.use_poisson_proc:
        # Keep only the first latent_dim entries of the encoder output.
        z0_mu = z0_mu[:, :, :model.latent_dim]
        z0_std = z0_std[:, :, :model.latent_dim]

    q_z0 = Normal(z0_mu, z0_std)

    # Encoder log-density of every grid point, summed over latent dimensions.
    q_density_grid = q_z0.log_prob(z0_grid)
    q_density_grid = torch.sum(q_density_grid, -1)
    density_grid = torch.exp(q_density_grid)

    density_grid = torch.reshape(density_grid, (xx.shape[0], xx.shape[1]))
    density_grid = density_grid.cpu().numpy()

    ax.contourf(xx, yy, density_grid, cmap=cmap2, alpha=0.3)
def train(args, model_teacher, model_student, classifier_teacher,
          classifier_student, train_labeled_loader, train_unlabeled_loader,
          optimizer, epoch):
    """Train the student for one epoch with teacher-generated pseudo-labels.

    The teacher model/classifier are put in eval mode and only used under
    ``torch.no_grad()`` to label the weakly augmented unlabeled batch. The
    student is trained on the sum of a supervised cross-entropy (labeled
    batch) and ``args.lambda_u`` times a consistency cross-entropy (strongly
    augmented unlabeled batch against the teacher's hard pseudo-labels).

    Args:
        args: Namespace providing ``lambda_u`` and ``print_freq``.
        model_teacher, classifier_teacher: Frozen feature extractor and head
            producing pseudo-labels.
        model_student, classifier_student: Feature extractor and head being
            optimized.
        train_labeled_loader: Iterable yielding ``(inputs, targets)``.
        train_unlabeled_loader: Iterable yielding ``(weak, strong)`` views.
        optimizer: Optimizer stepping the student parameters.
        epoch: Epoch number, used only in the progress message.

    Returns:
        Tuple ``(loss_avg, supervised_loss_avg, consistency_loss_avg,
        acc_avg)`` taken from the running meters.
    """
    model_teacher.eval()
    classifier_teacher.eval()
    model_student.train()
    classifier_student.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_x = AverageMeter()
    losses_u = AverageMeter()
    acc = AverageMeter()

    end = time.time()

    # zip stops as soon as the shorter of the two loaders is exhausted.
    train_loader = zip(train_labeled_loader, train_unlabeled_loader)
    for batch_idx, (data_x, data_u) in enumerate(tqdm(train_loader, disable=False)):
        # Time spent waiting for the loaders; without this update the "DT"
        # column in the log below never changes.
        data_time.update(time.time() - end)

        # Get inputs and target
        inputs_x, targets_x = data_x
        inputs_u_w, inputs_u_s = data_u

        # Cast and move everything to the GPU
        inputs_x = inputs_x.float().cuda()
        inputs_u_w = inputs_u_w.float().cuda()
        inputs_u_s = inputs_u_s.float().cuda()
        targets_x = targets_x.long().cuda()

        # Flatten any leading dimensions of the labeled batch; the image
        # size 3x256x256 is hard-coded here.
        inputs_x = inputs_x.reshape(-1, 3, 256, 256)
        targets_x = targets_x.reshape(-1, )

        # Compute pseudolabels for weak_unlabeled images using the teacher model
        with torch.no_grad():
            feat_u_w = model_teacher(inputs_u_w)
            logits_u_w = classifier_teacher(feat_u_w)

        # Compute output for labeled and strong_unlabeled images using the
        # student model in a single forward pass
        inputs = torch.cat((inputs_x, inputs_u_s))
        feats = model_student(inputs)
        logits = classifier_student(feats)
        batch_size = inputs_x.shape[0]
        logits_x = logits[:batch_size]    # labeled data
        logits_u_s = logits[batch_size:]  # unlabeled data
        del logits

        # Compute loss
        Supervised_loss = F.cross_entropy(logits_x, targets_x, reduction='mean')

        # Hard pseudo-labels: the teacher's most probable class per sample
        pseudo_label = torch.softmax(logits_u_w.detach(), dim=-1)
        _, targets_u = torch.max(pseudo_label, dim=-1)
        Consistency_loss = F.cross_entropy(logits_u_s, targets_u, reduction='mean')

        final_loss = Supervised_loss + args.lambda_u * Consistency_loss

        # compute gradient and do SGD step
        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        # compute loss and accuracy
        losses_x.update(Supervised_loss.item(), batch_size)
        losses_u.update(Consistency_loss.item(), batch_size)
        losses.update(final_loss.item(), batch_size)
        pred = torch.argmax(logits_x, dim=1)
        acc.update(torch.sum(targets_x == pred).item() / batch_size, batch_size)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print statistics and write summary every N batch
        if (batch_idx + 1) % args.print_freq == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'DT {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'acc {acc.val:.3f} ({acc.avg:.3f})\t'
                  'final_loss {final_loss.val:.3f} ({final_loss.avg:.3f})\t'
                  'Supervised_loss {Supervised_loss.val:.3f} ({Supervised_loss.avg:.3f})\t'
                  'Consistency_loss {Consistency_loss.val:.3f} ({Consistency_loss.avg:.3f})'.format(
                      epoch, batch_idx + 1, len(train_labeled_loader),
                      batch_time=batch_time, data_time=data_time, acc=acc,
                      final_loss=losses, Supervised_loss=losses_x,
                      Consistency_loss=losses_u))

    return losses.avg, losses_x.avg, losses_u.avg, acc.avg
remd = emd.emdModule() remd = remd.cuda() dis, ind = remd(point_a, point_b, 0.005, 300) for ass in range(B): point_c[ass, :, :] = point_c[ass, ind[ass].long(), :] int_lam = int(args.num_points * lam) int_lam = max(1, int_lam) random_point = torch.from_numpy( np.random.choice(1024, B, replace=False, p=None)) # kNN ind1 = torch.tensor(range(B)) query = point_a[ind1, random_point].view(B, 1, 3) dist = torch.sqrt( torch.sum( (point_a - query.repeat(1, args.num_points, 1))**2, 2)) idxs = dist.topk(int_lam, dim=1, largest=False, sorted=True).indices for i2 in range(B): points[i2, idxs[i2], :] = point_c[i2, idxs[i2], :] # adjust lambda to exactly match point ratio lam = int_lam * 1.0 / args.num_points points = points.transpose(2, 1) pred, trans_feat = model(points) loss = criterion(pred, target_a.long()) * ( 1. - lam) + criterion(pred, target_b.long()) * lam else: points = points.transpose(2, 1) pred, trans_feat = model(points) loss = criterion(pred, target.long())
def loss_KLDivergence(mu, sigma):
    """Return ``-0.5 * sum(1 + sigma - mu**2 - exp(sigma))`` over all elements.

    ``sigma`` enters the formula both linearly and through ``exp``, i.e. it
    is used the way a log-variance is in the closed-form KL divergence
    between a diagonal Gaussian and a standard normal. The result is a
    scalar tensor summed over every element of the inputs.
    """
    squared_mean = torch.pow(mu, 2)
    exp_sigma = torch.exp(sigma)
    elementwise = 1 + sigma - squared_mean - exp_sigma
    return -0.5 * torch.sum(elementwise)