def forward(self, images, questions):
    N, T, _, _, _ = images.size()  # bs x 5 x 3 x 224 x 224

    img_feats = self.cnn(images.contiguous().view(
        -1, images.size(2), images.size(3), images.size(4)))
    img_feats = self.cnn_fc_layer(img_feats)
    img_feats_tr = self.img_tr(img_feats)

    ques_feats = self.q_rnn(questions)
    ques_feats_repl = ques_feats.view(N, 1, -1).repeat(1, T, 1)
    ques_feats_repl = ques_feats_repl.view(N * T, -1)
    ques_feats_tr = self.ques_tr(ques_feats_repl)

    ques_img_feats = torch.cat([ques_feats_tr, img_feats_tr], 1)

    att_feats = self.att(ques_img_feats)
    att_probs = F.softmax(att_feats.view(N, T), dim=1)
    att_probs2 = att_probs.view(N, T, 1).repeat(1, 1, 64)

    att_img_feats = torch.mul(att_probs2, img_feats.view(N, T, 64))
    att_img_feats = torch.sum(att_img_feats, dim=1)

    mul_feats = torch.mul(ques_feats, att_img_feats)
    scores = self.classifier(mul_feats)

    return scores, att_probs
def forward(self, x):
    x = self.embed(x)
    x = self.dropout(x)
    bilstm_out, self.hidden = self.bilstm(x, self.hidden)

    bilstm_out = torch.transpose(bilstm_out, 0, 1)
    bilstm_out = torch.transpose(bilstm_out, 1, 2)
    bilstm_out = F.max_pool1d(bilstm_out, bilstm_out.size(2))
    bilstm_out = bilstm_out.squeeze(2)

    hidden2label = self.hidden2label1(F.tanh(bilstm_out))
    gate_layer = F.sigmoid(self.gate_layer(bilstm_out))

    # calculate highway layer values
    gate_hidden_layer = torch.mul(hidden2label, gate_layer)
    # Gating hidden2label instead, i.e.
    #   gate_input = torch.mul((1 - gate_layer), hidden2label)
    # would also run, but it does not match the Highway Networks formula.
    gate_input = torch.mul((1 - gate_layer), bilstm_out)
    highway_output = torch.add(gate_hidden_layer, gate_input)

    logit = self.logit_layer(highway_output)
    return logit
def forward(self, theta, matches, return_outliers=False):
    if isinstance(theta, Variable):  # handle normal batch transformations
        batch_size = theta.size()[0]
        theta = theta.clone()
        mask = self.geometricTnf(expand_dim(self.mask_id, 0, batch_size), theta)
        if return_outliers:
            mask_outliers = self.geometricTnf(expand_dim(1.0 - self.mask_id, 0, batch_size), theta)
        if self.normalize:
            epsilon = 1e-5
            mask = torch.div(mask,
                             torch.sum(torch.sum(torch.sum(mask + epsilon, 3), 2), 1)
                             .unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask))
            if return_outliers:
                mask_outliers = torch.div(mask_outliers,
                                          torch.sum(torch.sum(torch.sum(mask_outliers + epsilon, 3), 2), 1)
                                          .unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask_outliers))
        score = torch.sum(torch.sum(torch.sum(torch.mul(mask, matches), 3), 2), 1)
        if return_outliers:
            score_outliers = torch.sum(torch.sum(torch.sum(torch.mul(mask_outliers, matches), 3), 2), 1)
            return (score, score_outliers)
    elif isinstance(theta, list):
        # handle multiple transformations per batch item;
        # batch is in list format (used for RANSAC)
        batch_size = len(theta)
        score = []
        for b in range(batch_size):
            sample_size = theta[b].size(0)
            s = self.forward(theta[b],
                             expand_dim(matches[b, :, :, :].unsqueeze(0), 0, sample_size))
            score.append(s)
    return score
def train_init(init_net, meta_alpha, loss_fn, image, target_bbox, evaluator):
    init_net.train()

    # Draw pos/neg samples
    pos_examples = gen_samples(
        SampleGenerator('gaussian', image.size, 0.1, 1.2),
        target_bbox, opts['n_pos_init'], opts['overlap_pos_init'])
    neg_examples = np.concatenate([
        gen_samples(SampleGenerator('uniform', image.size, 1, 2, 1.1),
                    target_bbox, opts['n_neg_init'] // 2, opts['overlap_neg_init']),
        gen_samples(SampleGenerator('whole', image.size, 0, 1.2, 1.1),
                    target_bbox, opts['n_neg_init'] // 2, opts['overlap_neg_init'])])

    # Crop images
    crop_size = opts['img_size']
    padding = opts['padding']
    image = np.asarray(image)
    pos_regions = extract_regions(image, pos_examples, crop_size, padding)
    neg_regions = extract_regions(image, neg_examples, crop_size, padding)
    pos_regions_var = Variable(torch.from_numpy(pos_regions[:opts['batch_pos']]))
    neg_regions_var = Variable(torch.from_numpy(neg_regions[:opts['batch_neg']]))
    if opts['use_gpu']:
        pos_regions_var = pos_regions_var.cuda()
        neg_regions_var = neg_regions_var.cuda()

    # training
    tracker_init_weights = OrderedDict(
        (name, param) for (name, param) in init_net.named_parameters())
    tracker_keys = [name for (name, _) in init_net.named_parameters()]

    # the first iteration
    pos_score = init_net.forward(pos_regions_var)
    neg_score = init_net.forward(neg_regions_var)
    init_loss = loss_fn(pos_score, neg_score)
    init_acc, init_acc_pos, init_acc_neg = evaluator(pos_score, neg_score)
    grads = torch.autograd.grad(init_loss, tracker_init_weights.values(),
                                create_graph=True)
    tracker_weights = OrderedDict(
        (name, param - torch.mul(alpha, grad))
        for ((name, param), (_, alpha), grad)
        in zip(tracker_init_weights.items(), meta_alpha.items(), grads))

    # rest of iterations
    for i in range(opts['n_init_updates'] - 1):
        pos_score = init_net.forward(pos_regions_var, tracker_weights)
        neg_score = init_net.forward(neg_regions_var, tracker_weights)
        loss = loss_fn(pos_score, neg_score)
        grads = torch.autograd.grad(loss, tracker_weights.values(),
                                    create_graph=True)
        tracker_weights = OrderedDict(
            (name, param - torch.mul(alpha, grad))
            for ((name, param), (_, alpha), grad)
            in zip(tracker_weights.items(), meta_alpha.items(), grads))

    # update tracker
    init_net.copy_meta_weights(tracker_weights)
    init_net.eval()
    pos_score = init_net.forward(pos_regions_var)
    neg_score = init_net.forward(neg_regions_var)
    acc, acc_pos, acc_neg = evaluator(pos_score, neg_score)

    pos_regions_var = Variable(torch.from_numpy(pos_regions))
    neg_regions_var = Variable(torch.from_numpy(neg_regions))
    if opts['use_gpu']:
        pos_regions_var = pos_regions_var.cuda()
        neg_regions_var = neg_regions_var.cuda()
    pos_feats = init_net(pos_regions_var, out_layer='features')
    neg_feats = init_net(neg_regions_var, out_layer='features')
    return pos_feats.data.clone(), neg_feats.data.clone(), init_acc, acc
def updateGradInput(self, input, gradOutput):
    v1 = input[0]
    v2 = input[1]
    v1, v2 = self._makeContiguous(v1, v2)

    if len(self.gradInput) != 2:
        if self.gradInput[0] is None:
            self.gradInput[0] = v1.new()
        if self.gradInput[1] is None:
            self.gradInput[1] = v1.new()
        self.gradInput = self.gradInput[:2]

    gw1 = self.gradInput[0]
    gw2 = self.gradInput[1]
    gw1.resize_as_(v1).copy_(v2)
    gw2.resize_as_(v1).copy_(v1)

    torch.mul(self.w1, self.w22, out=self.buffer)
    gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
    gw1.mul_(self.w.expand_as(v1))

    torch.mul(self.w1, self.w32, out=self.buffer)
    gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
    gw2.mul_(self.w.expand_as(v1))

    go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
    gw1.mul_(go)
    gw2.mul_(go)

    return self.gradInput
def __call__(self, image_batch, theta_aff, theta_aff_tps, use_cuda=True):
    sampling_grid_aff = self.affTnf(image_batch=None,
                                    theta_batch=theta_aff.view(-1, 2, 3),
                                    return_sampling_grid=True,
                                    return_warped_image=False)
    sampling_grid_aff_tps = self.tpsTnf(image_batch=None,
                                        theta_batch=theta_aff_tps,
                                        return_sampling_grid=True,
                                        return_warped_image=False)

    if self.padding_crop_factor is not None:
        sampling_grid_aff_tps = sampling_grid_aff_tps * self.padding_crop_factor

    # put 1e10 value in region out of bounds of sampling_grid_aff
    in_bound_mask_aff = ((sampling_grid_aff[:, :, :, 0] > -1) *
                         (sampling_grid_aff[:, :, :, 0] < 1) *
                         (sampling_grid_aff[:, :, :, 1] > -1) *
                         (sampling_grid_aff[:, :, :, 1] < 1)).unsqueeze(3)
    in_bound_mask_aff = in_bound_mask_aff.expand_as(sampling_grid_aff)
    sampling_grid_aff = torch.mul(in_bound_mask_aff.float(), sampling_grid_aff)
    sampling_grid_aff = torch.add((in_bound_mask_aff.float() - 1) * (1e10), sampling_grid_aff)

    # compose transformations
    sampling_grid_aff_tps_comp = F.grid_sample(
        sampling_grid_aff.transpose(2, 3).transpose(1, 2),
        sampling_grid_aff_tps).transpose(1, 2).transpose(2, 3)

    # put 1e10 value in region out of bounds of sampling_grid_aff_tps_comp
    in_bound_mask_aff_tps = ((sampling_grid_aff_tps[:, :, :, 0] > -1) *
                             (sampling_grid_aff_tps[:, :, :, 0] < 1) *
                             (sampling_grid_aff_tps[:, :, :, 1] > -1) *
                             (sampling_grid_aff_tps[:, :, :, 1] < 1)).unsqueeze(3)
    in_bound_mask_aff_tps = in_bound_mask_aff_tps.expand_as(sampling_grid_aff_tps_comp)
    sampling_grid_aff_tps_comp = torch.mul(in_bound_mask_aff_tps.float(), sampling_grid_aff_tps_comp)
    sampling_grid_aff_tps_comp = torch.add((in_bound_mask_aff_tps.float() - 1) * (1e10), sampling_grid_aff_tps_comp)

    # sample transformed image
    warped_image_batch = F.grid_sample(image_batch, sampling_grid_aff_tps_comp)

    return warped_image_batch
def forward(self, input1):
    self.batchgrid3d = torch.zeros(torch.Size([input1.size(0)]) + self.grid3d.size())
    for i in range(input1.size(0)):
        self.batchgrid3d[i] = self.grid3d
    self.batchgrid3d = Variable(self.batchgrid3d)

    x = torch.sum(torch.mul(self.batchgrid3d, input1[:, :, :, 0:4]), 3)
    y = torch.sum(torch.mul(self.batchgrid3d, input1[:, :, :, 4:8]), 3)
    z = torch.sum(torch.mul(self.batchgrid3d, input1[:, :, :, 8:]), 3)

    r = torch.sqrt(x**2 + y**2 + z**2) + 1e-5
    theta = torch.acos(z / r) / (np.pi / 2) - 1
    # phi = torch.atan(y/x), with a quadrant correction for x < 0
    phi = torch.atan(y / (x + 1e-5)) + np.pi * x.lt(0).type(torch.FloatTensor) \
        * (y.ge(0).type(torch.FloatTensor) - y.lt(0).type(torch.FloatTensor))
    phi = phi / np.pi

    output = torch.cat([theta, phi], 3)
    return output
def forward(self, title, pg):
    r_gate = F.sigmoid(self.wrx(title) + self.wrh(pg))
    i_gate = F.sigmoid(self.wix(title) + self.wih(pg))
    n_gate = F.tanh(self.wnx(title) + torch.mul(r_gate, self.wnh(pg)))
    result = torch.mul(i_gate, pg) + torch.mul(torch.add(-i_gate, 1), n_gate)
    return result
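# A minimal, self-contained sketch of the GRU-style fusion cell above,
# assuming the six projections (wrx, wrh, wix, wih, wnx, wnh) are plain
# nn.Linear layers over a shared hidden size; the class name and sizes
# here are hypothetical, not taken from the original module.
import torch
import torch.nn as nn

class GatedFusion(nn.Module):
    def __init__(self, dim):
        super(GatedFusion, self).__init__()
        self.wrx, self.wrh = nn.Linear(dim, dim), nn.Linear(dim, dim)
        self.wix, self.wih = nn.Linear(dim, dim), nn.Linear(dim, dim)
        self.wnx, self.wnh = nn.Linear(dim, dim), nn.Linear(dim, dim)

    def forward(self, title, pg):
        r_gate = torch.sigmoid(self.wrx(title) + self.wrh(pg))        # reset gate
        i_gate = torch.sigmoid(self.wix(title) + self.wih(pg))        # update gate
        n_gate = torch.tanh(self.wnx(title) + r_gate * self.wnh(pg))  # candidate
        return i_gate * pg + (1 - i_gate) * n_gate                    # convex blend

# Usage: fuse a title vector into a paragraph vector of the same size.
fusion = GatedFusion(16)
out = fusion(torch.randn(4, 16), torch.randn(4, 16))  # shape (4, 16)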
def updateGradInput(self, input, y):
    v1 = input[0]
    v2 = input[1]

    gw1 = self.gradInput[0]
    gw2 = self.gradInput[1]
    gw1.resize_as_(v1).copy_(v2)
    gw2.resize_as_(v1).copy_(v1)

    torch.mul(self.w1, self.w22, out=self.buffer)
    gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
    gw1.mul_(self.w.expand_as(v1))

    torch.mul(self.w1, self.w32, out=self.buffer)
    gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
    gw2.mul_(self.w.expand_as(v1))

    # self._idx = self._outputs <= 0
    torch.le(self._outputs, 0, out=self._idx)
    self._idx = self._idx.view(-1, 1).expand(gw1.size())
    gw1[self._idx] = 0
    gw2[self._idx] = 0

    torch.eq(y, 1, out=self._idx)
    self._idx = self._idx.view(-1, 1).expand(gw2.size())
    gw1[self._idx] = gw1[self._idx].mul_(-1)
    gw2[self._idx] = gw2[self._idx].mul_(-1)

    if self.sizeAverage:
        gw1.div_(y.size(0))
        gw2.div_(y.size(0))

    return self.gradInput
def forward(self, inputs, targets, step, weight_constraint_lambda, logger):
    n = inputs.size(0)

    # Compute pairwise distance; replace by the official version when merged
    # features = F.normalize(inputs)
    features = inputs
    dist = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n)
    dist = dist + dist.t()
    dist.addmm_(1, -2, features, features.t())
    dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability

    # get the positive label mask
    mask = targets.expand(n, n).eq(targets.expand(n, n).t())
    mask = mask.float()

    positive_dist = torch.mul(dist, mask)
    negative_dist = torch.mul(mask, dist.max()) + torch.mul(dist, 1 - mask)

    indexes_ap = []
    indexes_ng = []
    dist_ap = []
    dist_an = []
    for i in range(n):
        pos_dist, pos_index = positive_dist[i].max(0)
        neg_dist, neg_index = negative_dist[i].min(0)
        dist_ap.append(pos_dist)
        dist_an.append(neg_dist)
        indexes_ap.append(pos_index)
        indexes_ng.append(neg_index)
    dist_ap = torch.cat(dist_ap)
    dist_an = torch.cat(dist_an)
    indexes_ap = torch.cat(indexes_ap)
    indexes_ng = torch.cat(indexes_ng)

    # Build (anchor, positive) and (anchor, negative) pairs for the adaptive metric
    pair_adp_inputs = []
    for i in range(n):
        pair_adp_inputs.append(torch.cat([inputs[i, :], inputs[indexes_ap.data[i], :]]))
        pair_adp_inputs.append(torch.cat([inputs[i, :], inputs[indexes_ng.data[i], :]]))
    pair_adp_inputs = torch.stack(pair_adp_inputs)

    # Compute adp_pairwise distance; replace by the official version when merged
    dist_adp = self.AdpsubM(pair_adp_inputs, n)  # [2*batchsize], ordered [ap, ng] per anchor
    dist_ap_adp = dist_adp[::2]
    dist_an_adp = dist_adp[1::2]

    # Compute ranking hinge loss
    y = dist_an.data.new()
    y.resize_as_(dist_an.data)
    y.fill_(1)
    y = Variable(y)

    trip_loss = self.softmargin_loss(dist_an - dist_ap, y)
    trip_loss_adp = self.softmargin_loss(dist_an_adp - dist_ap_adp, y)
    loss = trip_loss + trip_loss_adp

    if logger:
        logger.histo_summary('dist_apt', dist_adp.data.cpu().numpy(), step)
        logger.histo_summary('dist', dist.data.cpu().numpy(), step)
        logger.scalar_summary('trip_loss', trip_loss.data[0], step)

    prec = (dist_an.data > dist_ap.data).sum() * 1. / y.size(0)
    return trip_loss_adp, prec
def forward(self, x):
    x0 = self.conv.forward(x.float())
    x = self.pool_mil(x0)
    x = x.squeeze(2).squeeze(2)
    x1 = torch.add(torch.mul(x0.view(x.size(0), 1000, -1), -1), 1)
    cumprod = torch.cumprod(x1, 2)
    out = torch.max(x, torch.add(torch.mul(cumprod[:, :, -1], -1), 1))
    # out = F.softmax(out)
    return out
def forward(self, img, att_size=14):
    x0 = self.conv(img)
    x = self.pool_mil(x0)
    x = x.squeeze(2).squeeze(2)
    x = self.l1(x)
    x1 = torch.add(torch.mul(x.view(x.size(0), 1000, -1), -1), 1)
    cumprod = torch.cumprod(x1, 2)
    out = torch.max(x, torch.add(torch.mul(cumprod[:, :, -1], -1), 1))
    return out
def updateOutput(self, input):
    input1, input2 = input[0], input[1]

    if self.buffer is None:
        self.buffer = input1.new()

    torch.mul(input1, input2, out=self.buffer)
    torch.sum(self.buffer, 1, True, out=self.output)
    self.output.resize_(input1.size(0))
    return self.output
def custom_cross_entropy(x, y):
    sigmoid_x = torch.sigmoid(x)
    sigmoid_x2 = torch.sigmoid(x ** 2)
    neg_log_sigmoid_x = -1 * torch.log(sigmoid_x)
    neg_log_1_minus_sigmoid_x2 = -1 * torch.log(1 - sigmoid_x2)

    l1 = torch.mul(y, neg_log_sigmoid_x)
    l2 = torch.mul(1 - y, neg_log_1_minus_sigmoid_x2)
    return torch.sum(l1 + l2)
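# Quick numeric check of custom_cross_entropy (a sketch): for y == 1 only the
# -log(sigmoid(x)) term survives, so a confident positive logit costs little
# and a confident negative logit costs a lot.
import torch

x = torch.tensor([5.0, -5.0])
y = torch.tensor([1.0, 1.0])
print(custom_cross_entropy(x, y))  # ~ 0.0067 + 5.0067 = 5.0134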
def forward(self, x):
    bahs, chs, _, _ = x.size()

    # Channel squeeze-and-excitation: squeeze spatially, excite per channel
    # (view returns a new tensor with the same data but a different size).
    chn_se = self.avg_pool(x).view(bahs, chs)
    chn_se = self.channel_excitation(chn_se).view(bahs, chs, 1, 1)
    chn_se = torch.mul(x, chn_se)

    # Spatial squeeze-and-excitation
    spa_se = self.spatial_se(x)
    spa_se = torch.mul(x, spa_se)
    return torch.add(chn_se, 1, spa_se)
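# A self-contained sketch of the scSE block the forward above belongs to;
# channel_excitation and spatial_se are assumptions modeled on the usual
# concurrent spatial/channel SE design (Roy et al.), not the original module.
import torch
import torch.nn as nn

class SCSEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super(SCSEBlock, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.channel_excitation = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid())
        self.spatial_se = nn.Sequential(
            nn.Conv2d(channels, 1, kernel_size=1),
            nn.Sigmoid())

    def forward(self, x):
        b, c, _, _ = x.size()
        chn = self.channel_excitation(self.avg_pool(x).view(b, c)).view(b, c, 1, 1)
        return x * chn + x * self.spatial_se(x)  # cSE branch + sSE branch

y = SCSEBlock(32)(torch.randn(2, 32, 8, 8))  # shape (2, 32, 8, 8)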
def updateOutput(self, input):
    gaterInput, expertInputs = input

    # buffers
    if self._gaterView is None:
        self._gaterView = input[0].new()
    if self._expert is None:
        self._expert = input[0].new()
    if self._expertView is None:
        self._expertView = input[0].new()

    self.dimG = 1
    batchSize = gaterInput.size(0)

    if self.table or isinstance(expertInputs, list):
        self.table = True
        if gaterInput.size(self.dimG) != len(expertInputs):
            raise RuntimeError("Should be one gater output per expert")

        expertInput = expertInputs[0]
        if self.batchSize != batchSize:
            size = [1] * (expertInput.dim() + 1)
            if self.dimG > 0:
                size[0] = gaterInput.size(0)
            size[self.dim] = gaterInput.size(self.dimG)
            self.size = torch.Size(size)
            self.output.resize_as_(expertInput)
            self.backwardSetup = False
            self.batchSize = batchSize

        self._gaterView = gaterInput.view(self.size)
        self.output.zero_()
        # multiply accumulate gater outputs by their commensurate expert
        for i, expertInput in enumerate(expertInputs):
            gate = self._gaterView.select(self.dim, i).expand_as(expertInput)
            self.output.addcmul_(expertInput, gate)
    else:
        if self.batchSize != batchSize:
            size = [1] * expertInputs.dim()
            if self.dimG > 0:
                size[0] = gaterInput.size(0)
            size[self.dim] = gaterInput.size(self.dimG)
            self.size = torch.Size(size)
            self.output.resize_as_(expertInputs.select(self.dim, 0))
            self.batchSize = batchSize
            self.backwardSetup = False

        self._gaterView = gaterInput.view(self.size)
        torch.mul(self._gaterView.expand_as(expertInputs), expertInputs, out=self._expert)
        torch.sum(self._expert, self.dim, True, out=self.output)
        self.output.resize_as_(expertInputs.select(self.dim, 0))

    return self.output
def accGradParameters(self, input, gradOutput, scale=1):
    inputSize, outputSize = self.weight.size(0), self.weight.size(1)

    """
    dy_j    2 * c_j * c_j * (w_j - x)     c_j * c_j * (w_j - x)
    ---- = --------------------------- = ----------------------
    dw_j    2 || c_j * (w_j - x) ||               y_j

    dy_j    2 * c_j * (w_j - x)^2     c_j * (w_j - x)^2
    ---- = ------------------------ = ------------------
    dc_j    2 || c_j * (w_j - x) ||          y_j
    """
    # assumes a preceding call to updateGradInput
    if input.dim() == 1:
        self.gradWeight.add_(-scale, self._repeat2)

        self._repeat.div_(self.diagCov)
        self._repeat.mul_(self._repeat)
        self._repeat.mul_(self.diagCov)

        if input.type() == 'torch.cuda.FloatTensor':
            self._repeat2.resize_as_(self._expand4).copy_(self._expand4)
            self._repeat2.mul_(self._repeat)
        else:
            torch.mul(self._repeat, self._expand4, out=self._repeat2)

        self.gradDiagCov.add_(self._repeat2)
    elif input.dim() == 2:
        if self._sum is None:
            self._sum = input.new()
        torch.sum(self._repeat2, 0, True, out=self._sum)
        self._sum.resize_(inputSize, outputSize)
        self.gradWeight.add_(-scale, self._sum)

        if input.type() == 'torch.cuda.FloatTensor':
            # requires lots of memory, but minimizes cudaMallocs and loops
            self._repeat.div_(self._repeat3)
            self._repeat.mul_(self._repeat)
            self._repeat.mul_(self._repeat3)
            self._repeat2.resize_as_(self._expand4).copy_(self._expand4)
            self._repeat.mul_(self._repeat2)
        else:
            self._repeat.div_(self._expand3)
            self._repeat.mul_(self._repeat)
            self._repeat.mul_(self._expand3)
            self._repeat.mul_(self._expand4)

        torch.sum(self._repeat, 0, True, out=self._sum)
        self._sum.resize_(inputSize, outputSize)
        self.gradDiagCov.add_(scale, self._sum)
    else:
        raise RuntimeError("1D or 2D input expected")
def forward(self, input1):
    self.batchgrid = torch.zeros(torch.Size([input1.size(0)]) + self.grid.size())
    for i in range(input1.size(0)):
        self.batchgrid[i] = self.grid
    self.batchgrid = Variable(self.batchgrid)

    x = torch.mul(self.batchgrid, input1[:, :, :, 0:3])
    y = torch.mul(self.batchgrid, input1[:, :, :, 3:6])

    output = torch.cat([torch.sum(x, 3), torch.sum(y, 3)], 3)
    return output
def updateOutput(self, input):
    input1, input2 = input[0], input[1]
    input1, input2 = self._makeContiguous(input1, input2)

    if self.buffer is None:
        self.buffer = input1.new()
        self.w1 = input1.new()
        self.w22 = input1.new()
        self.w = input1.new()
        self.w32 = input1.new()
        self.ones = input1.new()

    torch.mul(input1, input2, out=self.buffer)
    torch.sum(self.buffer, 1, out=self.w1, keepdim=True)

    epsilon = 1e-12
    torch.mul(input1, input1, out=self.buffer)
    torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon)
    self.w22.reciprocal_()
    self.w.resize_as_(self.w22).copy_(self.w22)

    torch.mul(input2, input2, out=self.buffer)
    torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon)
    self.w32.reciprocal_()
    self.w.mul_(self.w32)

    self.w.sqrt_()
    torch.mul(self.w1, self.w, out=self.output)
    self.output.resize_(input1.size(0))

    return self.output
def accGradParameters(self, input, gradOutput, scale=1):
    if self._input is None:
        self._input = input.new()
        self._gradWeight = input.new()
        self._sum = input.new()

    batchSize = input.size(0)
    contiguousView(self._input, input, batchSize, -1)
    contiguousView(self._gradOutput, gradOutput, batchSize, -1)
    self._gradWeight = self.gradWeight.view(1, -1)

    torch.mul(self._input, self._gradOutput, out=self._repeat)
    torch.sum(self._repeat, 0, True, out=self._sum)
    self._gradWeight.add_(scale, self._sum)
def accGradParameters(self, input, gradOutput, scale=1):
    self._assertInputGradOutput(input, gradOutput)

    # make sure we have buffer:
    if self.buff1 is None:
        self.buff1 = input[0].new()
    self.buff1.resize_as_(input[0])

    # accumulate parameter gradients:
    for k in range(self.weight.size(0)):
        torch.mul(input[0], gradOutput.narrow(1, k, 1).expand_as(input[0]), out=self.buff1)
        self.gradWeight[k].addmm_(self.buff1.t(), input[1])

    if self.bias is not None:
        self.gradBias.add_(scale, gradOutput.sum(0, keepdim=False))
def backward(self, grad_output):
    input, output = self.saved_tensors
    grad_input = grad_output.new()

    if self._backend is not None:
        self._backend.SpatialCrossMapLRN_updateGradInput(
            self._backend.library_state,
            input,
            grad_output,
            grad_input,
            self.scale,
            output,
            self.size,
            self.alpha,
            self.beta,
            self.k
        )
    else:
        batch_size = input.size(0)
        channels = input.size(1)
        input_height = input.size(2)
        input_width = input.size(3)

        padded_ratio = input.new(channels + self.size - 1, input_height, input_width)
        accum_ratio = input.new(input_height, input_width)

        cache_ratio_value = 2 * self.alpha * self.beta / self.size
        inversePrePad = int(self.size - (self.size - 1) / 2)

        grad_input.resize_as_(input)
        torch.pow(self.scale, -self.beta, out=grad_input).mul_(grad_output)

        padded_ratio.zero_()
        padded_ratio_center = padded_ratio.narrow(0, inversePrePad, channels)
        for n in range(batch_size):
            torch.mul(grad_output[n], output[n], out=padded_ratio_center)
            padded_ratio_center.div_(self.scale[n])
            torch.sum(
                padded_ratio.narrow(0, 0, self.size - 1), 0, keepdim=False,
                out=accum_ratio)
            for c in range(channels):
                accum_ratio.add_(padded_ratio[c + self.size - 1])
                grad_input[n][c].addcmul_(-cache_ratio_value, input[n][c], accum_ratio)
                accum_ratio.add_(-1, padded_ratio[c])

    return grad_input
def forward(self, input_data):
    # input_data: batch_size * (T - 1) * input_size
    input_weighted = Variable(input_data.data.new(
        input_data.size(0), self.T - 1, self.input_size).zero_())
    input_encoded = Variable(input_data.data.new(
        input_data.size(0), self.T - 1, self.hidden_size).zero_())
    # hidden, cell: initial states with dimension hidden_size
    hidden = self.init_hidden(input_data)  # 1 * batch_size * hidden_size
    cell = self.init_hidden(input_data)

    for t in range(self.T - 1):
        # Eqn. 8: concatenate the hidden states with each predictor
        x = torch.cat((hidden.repeat(self.input_size, 1, 1).permute(1, 0, 2),
                       cell.repeat(self.input_size, 1, 1).permute(1, 0, 2),
                       input_data.permute(0, 2, 1)),
                      dim=2)  # batch_size * input_size * (2*hidden_size + T - 1)
        # Eqn. 9: Get attention weights
        x = self.attn_linear(x.view(-1, self.hidden_size * 2 + self.T - 1))  # (batch_size * input_size) * 1
        attn_weights = F.softmax(x.view(-1, self.input_size))  # batch_size * input_size, weights sum to 1
        # Eqn. 10: LSTM
        weighted_input = torch.mul(attn_weights, input_data[:, t, :])  # batch_size * input_size
        # Fix the warning about non-contiguous memory
        # see https://discuss.pytorch.org/t/dataparallel-issue-with-flatten-parameter/8282
        self.lstm_layer.flatten_parameters()
        _, lstm_states = self.lstm_layer(weighted_input.unsqueeze(0), (hidden, cell))
        hidden = lstm_states[0]
        cell = lstm_states[1]
        # Save output
        input_weighted[:, t, :] = weighted_input
        input_encoded[:, t, :] = hidden

    return input_weighted, input_encoded
def forward(self, inputs, input_lengths):
    """ Forward pass.

    # Arguments:
        inputs (torch.Variable): Tensor of input sequences
        input_lengths (torch.LongTensor): Lengths of the sequences

    # Return:
        Tuple of (representations, attentions if self.return_attention else None).
    """
    logits = inputs.matmul(self.attention_vector)
    unnorm_ai = (logits - logits.max()).exp()

    # Compute a mask for the attention on the padded sequences
    # See e.g. https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/5
    max_len = unnorm_ai.size(1)
    idxes = torch.arange(0, max_len, out=torch.LongTensor(max_len)).unsqueeze(0)
    mask = Variable((idxes < input_lengths.unsqueeze(1)).float())

    # apply mask and renormalize attention scores (weights)
    masked_weights = unnorm_ai * mask
    att_sums = masked_weights.sum(dim=1, keepdim=True)  # sums per sequence
    attentions = masked_weights.div(att_sums)

    # apply attention weights
    weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

    # get the final fixed vector representations of the sentences
    representations = weighted.sum(dim=1)

    return (representations, attentions if self.return_attention else None)
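# A minimal sketch of the padding-mask trick above, using modern indexing:
# positions with index < length are valid, everything else is zeroed out
# before the attention weights are renormalized.
import torch

lengths = torch.LongTensor([3, 1])
max_len = 4
idxes = torch.arange(0, max_len).unsqueeze(0)      # (1, max_len)
mask = (idxes < lengths.unsqueeze(1)).float()      # (2, max_len): rows of 1s then 0s
scores = torch.ones(2, max_len)                    # stand-in for unnormalized weights
masked = scores * mask
attn = masked / masked.sum(dim=1, keepdim=True)    # each row sums to 1
print(attn)  # [[1/3, 1/3, 1/3, 0], [1, 0, 0, 0]]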
def routing(self, x, b_IJ, W, batch_size, routing_iter):
    x1 = x.view(batch_size, 256, 1, 6, 6)
    x_tile = x1.repeat(1, 1, 10, 1, 1)
    x_view = x_tile.view(batch_size, 1152, 10, 8, 1)
    stride_i = W.repeat(batch_size, 1, 1, 1, 1)
    stride_j = stride_i.view(batch_size, 1152, 10, 16, 8)
    dot_op = torch.matmul(stride_j, x_view)
    dot_op_stopped = Variable(dot_op.data.clone(), requires_grad=False)

    for r_iter in range(routing_iter):
        id_capsule = F.softmax(b_IJ, dim=2)
        if r_iter == routing_iter - 1:
            route_I = torch.mul(id_capsule, dot_op)
            route_I_sum = torch.sum(route_I, dim=1, keepdim=True) + self.bias
            V_J = squash(route_I_sum, self.epsilon)
        if r_iter < routing_iter - 1:
            dot_op_stopped_tmp = dot_op_stopped.data.numpy()
            dot_op_stopped_tmp = np.reshape(dot_op_stopped_tmp,
                                            (batch_size, 1152, 10, 16, 1))
            id_capsule_tmp = id_capsule.data.numpy()
            route_I_tmp = id_capsule_tmp * dot_op_stopped_tmp
            route_I_tmp_sum = np.sum(route_I_tmp, axis=1, keepdims=True) + self.bias.data.numpy()
            V_J_tmp = squash(torch.Tensor(route_I_tmp_sum), self.epsilon)
            V_J_tmp_tiled = np.tile(V_J_tmp.numpy(), (1, 1152, 1, 1, 1))
            dot_op_stopped_tmp = np.reshape(dot_op_stopped_tmp,
                                            (batch_size, 1152, 10, 1, 16))
            u_produce_v = np.matmul(dot_op_stopped_tmp, V_J_tmp_tiled)
            b_IJ.data += torch.Tensor(u_produce_v)

    return V_J
def updateGradInput(self, input, gradOutput):
    assert input.dim() == 4

    if input.type() == 'torch.cuda.FloatTensor':
        self._backend.SpatialCrossMapLRN_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.scale,
            self.output,
            self.size,
            self.alpha,
            self.beta,
            self.k
        )
    else:
        batchSize = input.size(0)
        channels = input.size(1)
        inputHeight = input.size(2)
        inputWidth = input.size(3)

        if self.paddedRatio is None:
            self.paddedRatio = input.new()
        if self.accumRatio is None:
            self.accumRatio = input.new()
        self.paddedRatio.resize_(channels + self.size - 1, inputHeight, inputWidth)
        self.accumRatio.resize_(inputHeight, inputWidth)

        cacheRatioValue = 2 * self.alpha * self.beta / self.size
        inversePrePad = int(self.size - (self.size - 1) / 2)

        self.gradInput.resize_as_(input)
        torch.pow(self.scale, -self.beta, out=self.gradInput).mul_(gradOutput)

        self.paddedRatio.zero_()
        paddedRatioCenter = self.paddedRatio.narrow(0, inversePrePad, channels)
        for n in range(batchSize):
            torch.mul(gradOutput[n], self.output[n], out=paddedRatioCenter)
            paddedRatioCenter.div_(self.scale[n])
            torch.sum(self.paddedRatio.narrow(0, 0, self.size - 1), 0, keepdim=False,
                      out=self.accumRatio)
            for c in range(channels):
                self.accumRatio.add_(self.paddedRatio[c + self.size - 1])
                self.gradInput[n][c].addcmul_(-cacheRatioValue, input[n][c], self.accumRatio)
                self.accumRatio.add_(-1, self.paddedRatio[c])

    return self.gradInput
def train_init(tracker_net, meta_alpha, loss_fn,
               pos_regions, neg_regions, lh_pos_regions, lh_neg_regions,
               evaluator, train_all=False):
    if train_all:
        tracker_init_weights = OrderedDict(
            (name, param) for (name, param) in tracker_net.named_parameters())
        tracker_keys = [name for (name, _) in tracker_net.named_parameters()]
    else:
        tracker_init_weights = OrderedDict(
            (name, param) for (name, param) in tracker_net.named_parameters()
            if name.startswith('fc'))
        tracker_keys = [name for (name, _) in tracker_net.named_parameters()
                        if name.startswith('fc')]

    # the first iteration
    pos_score = tracker_net.forward(pos_regions)
    neg_score = tracker_net.forward(neg_regions)
    loss = loss_fn(pos_score, neg_score)
    grads = torch.autograd.grad(loss, tracker_init_weights.values(), create_graph=True)
    # `alpha` is the per-parameter learning rate drawn from meta_alpha
    tracker_weights = OrderedDict(
        (name, param - torch.mul(alpha, grad))
        for ((name, param), (_, alpha), grad)
        in zip(tracker_init_weights.items(), meta_alpha.items(), grads))

    for i in range(opts['n_init_updates'] - 1):
        pos_score = tracker_net.forward(pos_regions, tracker_weights)
        neg_score = tracker_net.forward(neg_regions, tracker_weights)
        loss = loss_fn(pos_score, neg_score)
        grads = torch.autograd.grad(loss, tracker_weights.values(), create_graph=True)
        tracker_weights = OrderedDict(
            (name, param - torch.mul(alpha, grad))
            for ((name, param), (_, alpha), grad)
            in zip(tracker_weights.items(), meta_alpha.items(), grads))

    lh_pos_score = tracker_net.forward(lh_pos_regions, tracker_weights)
    lh_neg_score = tracker_net.forward(lh_neg_regions, tracker_weights)
    lh_loss = loss_fn(lh_pos_score, lh_neg_score)
    lh_acc, lh_acc_pos, lh_acc_neg = evaluator(lh_pos_score, lh_neg_score)

    pos_score = tracker_net.forward(pos_regions, tracker_weights)
    neg_score = tracker_net.forward(neg_regions, tracker_weights)
    loss = loss_fn(pos_score, neg_score)
    acc, acc_pos, acc_neg = evaluator(pos_score, neg_score)

    # compute meta grads for the lookahead dataset
    grads = torch.autograd.grad(lh_loss, tracker_init_weights.values(), retain_graph=True)
    alpha_grads = torch.autograd.grad(lh_loss, meta_alpha.values())
    meta_init_grads = {}
    meta_alpha_grads = {}
    for i in range(len(tracker_keys)):
        meta_init_grads[tracker_keys[i]] = grads[i]
        meta_alpha_grads[tracker_keys[i]] = alpha_grads[i]
    return meta_init_grads, meta_alpha_grads, loss.data[0], lh_loss.data[0], acc, lh_acc
def updateGradInput(self, input, y):
    self.gradInput.resize_as_(input).copy_(y)
    self.gradInput[torch.mul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0

    if self.sizeAverage:
        self.gradInput.mul_(1. / input.nelement())

    return self.gradInput
def forward(self, lvec, rvec):
    mult_dist = torch.mul(lvec, rvec)
    abs_dist = torch.abs(torch.add(lvec, -rvec))
    vec_dist = torch.cat((mult_dist, abs_dist), 1)

    out = F.sigmoid(self.wh(vec_dist))
    out = F.log_softmax(self.wp(out))
    return out
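# A self-contained sketch of the similarity head above, in the style of the
# Tree-LSTM similarity model of Tai et al.; the layer sizes and class name
# are assumptions, not taken from the original module.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Similarity(nn.Module):
    def __init__(self, mem_dim, hidden_dim, num_classes):
        super(Similarity, self).__init__()
        self.wh = nn.Linear(2 * mem_dim, hidden_dim)
        self.wp = nn.Linear(hidden_dim, num_classes)

    def forward(self, lvec, rvec):
        # elementwise product captures agreement, absolute difference distance
        vec_dist = torch.cat((lvec * rvec, torch.abs(lvec - rvec)), 1)
        out = torch.sigmoid(self.wh(vec_dist))
        return F.log_softmax(self.wp(out), dim=1)

sim = Similarity(mem_dim=150, hidden_dim=50, num_classes=5)
log_probs = sim(torch.randn(3, 150), torch.randn(3, 150))  # shape (3, 5)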
def get_vert_context(self, vert_state, edge_state):
    verb_vert_state = vert_state[0]
    region_vert_state = vert_state[1:]
    verb_expanded_state = verb_vert_state.expand(region_vert_state.size(0),
                                                 verb_vert_state.size(0))

    verb_concat = torch.mul(verb_expanded_state, edge_state)
    region_concat = torch.mul(region_vert_state, edge_state)

    att_weighted_verb_per_edge = torch.mul(self.vert_att(verb_concat), edge_state)
    att_weighted_region = torch.mul(self.edge_att(region_concat), edge_state)
    att_weighted_verb = torch.sum(att_weighted_verb_per_edge, 0)

    vert_ctx = torch.cat((torch.unsqueeze(att_weighted_verb, 0), att_weighted_region), 0)
    return vert_ctx
def forward(self, x):
    # padding = self.dilation - (x.shape[-1] + self.dilation - 1) % self.dilation
    x = F.pad(x, (self.dilation, 0))  # left-pad only, keeping the convolution causal
    return torch.mul(self.tanh(self.conv_f(x)), self.sig(self.conv_g(x)))
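# A minimal sketch of the WaveNet-style gated activation used above, assuming
# conv_f and conv_g are Conv1d layers with the same kernel size and dilation
# (the left-pad of self.dilation above matches kernel_size == 2). The class
# name and parameters here are hypothetical.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedCausalConv(nn.Module):
    def __init__(self, channels, kernel_size=2, dilation=1):
        super(GatedCausalConv, self).__init__()
        self.pad = (kernel_size - 1) * dilation  # amount of left padding
        self.conv_f = nn.Conv1d(channels, channels, kernel_size, dilation=dilation)
        self.conv_g = nn.Conv1d(channels, channels, kernel_size, dilation=dilation)

    def forward(self, x):
        x = F.pad(x, (self.pad, 0))  # left-pad only: no future leakage
        return torch.tanh(self.conv_f(x)) * torch.sigmoid(self.conv_g(x))

y = GatedCausalConv(8, kernel_size=2, dilation=2)(torch.randn(1, 8, 16))  # (1, 8, 16)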
def forward(self, input, target, N_D, N_R):
    return torch.sum(-1. * torch.mul(target, input)
                     + N_D / float(N_R) * torch.mul((1 - target), (torch.exp(input) - 1)))
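# Numeric sketch of the loss above: with target 1 the term is -input; with
# target 0 it is (N_D / N_R) * (exp(input) - 1). The values below are
# hypothetical, just to trace the arithmetic.
import torch

inp = torch.tensor([0.5, 0.5])
tgt = torch.tensor([1.0, 0.0])
# -0.5 + (4 / 2) * (e^0.5 - 1) ~= -0.5 + 1.2974 ~= 0.7974
print(torch.sum(-1. * tgt * inp + 4 / 2. * (1 - tgt) * (torch.exp(inp) - 1)))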
def convert_from_tanh(input_tensor):
    out_tensor = torch.add(input_tensor, 1)
    out_tensor = torch.mul(out_tensor, 255 / 2)
    return out_tensor
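# Example: map a tanh-range tensor ([-1, 1]) back to pixel range ([0, 255]).
import torch

t = torch.tensor([-1.0, 0.0, 1.0])
print(convert_from_tanh(t))  # tensor([  0.0000, 127.5000, 255.0000])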
def forward(self, shading, albedo):
    # broadcast the single-channel shading map across the albedo channels
    self.shading = shading.repeat(1, self.nc, 1, 1)
    self.img = torch.mul(self.shading, albedo)
    return self.img
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors, 4].
        scores: (tensor) The class pred scores for the img, Shape: [num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The maximum number of box preds to consider.
    Return:
        The indices of the kept boxes with respect to num_priors.
    """

    keep = torch.Tensor(scores.size(0)).fill_(0).long()
    if boxes.numel() == 0:
        return keep
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    v, idx = scores.sort(0)  # sort in ascending order
    idx = idx[-top_k:]  # indices of the top-k largest vals
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # store element-wise max with next highest score
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1
        h = yy2 - yy1
        # check sizes of xx1 and xx2.. after each iteration
        w = torch.clamp(w, min=0.0)
        h = torch.clamp(h, min=0.0)
        inter = w * h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas
        union = (rem_areas - inter) + area[i]
        IoU = inter / union
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
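# Usage sketch for nms (assumes a PyTorch version where torch.clamp accepts
# tensor bounds): two heavily overlapping boxes and one separate box; with
# overlap=0.5 the lower-scoring duplicate is suppressed.
import torch

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 10., 10.],
                      [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep, count = nms(boxes, scores, overlap=0.5, top_k=200)
print(keep[:count])  # indices of kept boxes: tensor([0, 2])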
def forward(self, hidden, y_label, input_mask):
    """
    hidden: batch_size_x * (1 + batch_size_y) * max_seq_length * h_dim
    y_label: batch_size_x * batch_size_y
    input_mask: batch_size_x * (1 + batch_size_y) * max_seq_length
    """
    hidden = normalize(hidden)
    hidden = hidden.masked_fill(mask=~input_mask.unsqueeze(-1).expand_as(hidden),
                                value=torch.tensor(0))
    x = hidden[:, 0, :, :]   # x * seq * dim
    y = hidden[:, 1:, :, :]  # x * y * seq * dim

    # h[x][i][m] = \sum_{n,q} h[x][0][m][q] * h[x][i][n][q]
    x_score = x.unsqueeze(1)        # x * 1 * seq * dim
    y_score = torch.sum(y, dim=2)   # x * y * dim
    x_mask = input_mask[:, 0, :]    # x * seq

    score = self.attn_fn(x_score, y_score) / self.temperature  # x * y * seq
    score = score.masked_fill(mask=~x_mask.unsqueeze(1).expand_as(score),
                              value=float('-1e8'))
    attn = F.softmax(score, dim=2).unsqueeze(-1)  # batch_size_x * batch_size_y * max_seq_length * 1

    x_expand = x.unsqueeze(-1).expand(x.size(0), x.size(1), x.size(2), y.size(1))
    x_perm = x_expand.permute(0, 3, 2, 1)  # batch_size_x * batch_size_y * h_dim * max_seq_length
    x_embed = torch.matmul(x_perm, attn).squeeze(-1)
    if not self.cat:
        x_embed = normalize(x_embed)  # batch_size_x * batch_size_y * h_dim
    else:
        x_embed = normalize(torch.cat(
            (x_embed, x[:, -1, :].unsqueeze(1).expand_as(x_embed)), dim=-1))

    y_embed = y[:, :, -1, :]  # x * y * dim

    sim = self.sim_fn(x_embed, y_embed)  # x * y
    mse_loss = self.mse_loss(sim, y_label)
    weight = torch.ones_like(sim)
    weight[y_label == 1] = self.pos_weight
    mse = torch.mean(torch.mul(mse_loss, weight))
    # NOTE: this unweighted mean overrides the weighted mean above,
    # so pos_weight currently has no effect.
    mse = torch.mean(mse_loss)

    pos_wrong = sum(sim[y_label == 1.] < 0).item()
    neg_wrong = sum(sim[y_label == -1.] > 0).item()
    pos_count = len(y_label[y_label == 1.])
    neg_count = len(y_label[y_label == -1.])
    return mse, (attn.squeeze(-1), sim, y_label, pos_wrong, neg_wrong, pos_count, neg_count)
def score(self, potentials, parts, batch_dims=[0]):
    score = torch.mul(potentials, parts)
    batch = tuple((score.shape[b] for b in batch_dims))
    return self.semiring.prod(score.view(batch + (-1,)))
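# A plain-torch analogue of score() (a sketch, assuming a log-space semiring
# as in torch-struct, where semiring.prod over the flattened non-batch dims
# reduces to a sum of the selected log-potentials).
import torch

potentials = torch.randn(2, 3, 4)                    # batch x part dims
parts = torch.bernoulli(torch.full((2, 3, 4), 0.5))  # 0/1 indicator of parts
score = (potentials * parts).view(2, -1).sum(-1)     # one score per batch item
print(score.shape)  # torch.Size([2])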
def _fusion_classif(self, x_v, x_q):
    x_mm = torch.mul(x_v, x_q)
    return x_mm
def _attention(self, input_v, x_q_vec):
    batch_size = input_v.size(0)
    width = input_v.size(2)
    height = input_v.size(3)

    # Process visual before fusion
    # x_v = input_v.view(batch_size*width*height, dim_features)
    x_v = input_v
    x_v = F.dropout(x_v, p=self.opt['attention']['dropout_v'], training=self.training)
    x_v = self.conv_v_att(x_v)
    if 'activation_v' in self.opt['attention']:
        x_v = getattr(F, self.opt['attention']['activation_v'])(x_v)
    x_v = x_v.view(batch_size, self.opt['attention']['dim_v'], width * height)
    x_v = x_v.transpose(1, 2)

    # Process question before fusion
    x_q = F.dropout(x_q_vec, p=self.opt['attention']['dropout_q'], training=self.training)
    x_q = self.linear_q_att(x_q)
    if 'activation_q' in self.opt['attention']:
        x_q = getattr(F, self.opt['attention']['activation_q'])(x_q)
    x_q = x_q.view(batch_size, 1, self.opt['attention']['dim_q'])
    x_q = x_q.expand(batch_size, width * height, self.opt['attention']['dim_q'])

    # First multimodal fusion
    x_att = self._fusion_att(x_v, x_q)
    if 'activation_mm' in self.opt['attention']:
        x_att = getattr(F, self.opt['attention']['activation_mm'])(x_att)

    # Process attention vectors
    x_att = F.dropout(x_att, p=self.opt['attention']['dropout_mm'], training=self.training)
    # could be optimized to avoid two views and transposes
    x_att = x_att.view(batch_size, width, height, self.opt['attention']['dim_mm'])
    x_att = x_att.transpose(2, 3).transpose(1, 2)
    x_att = self.conv_att(x_att)
    x_att = x_att.view(batch_size, self.opt['attention']['nb_glimpses'], width * height)
    list_att_split = torch.split(x_att, 1, dim=1)
    list_att = []
    for x_att in list_att_split:
        x_att = x_att.contiguous()
        x_att = x_att.view(batch_size, width * height)
        x_att = F.softmax(x_att)
        list_att.append(x_att)

    # Apply attention vectors to input_v
    x_v = input_v.view(batch_size, self.opt['dim_v'], width * height)
    x_v = x_v.transpose(1, 2)

    list_v_att = []
    for i, x_att in enumerate(list_att):
        x_att = x_att.view(batch_size, width * height, 1)
        x_att = x_att.expand(batch_size, width * height, self.opt['dim_v'])
        x_v_att = torch.mul(x_att, x_v)
        x_v_att = x_v_att.sum(1)
        x_v_att = x_v_att.view(batch_size, self.opt['dim_v'])
        list_v_att.append(x_v_att)

    return list_v_att
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, train_loader, writer):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard log.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, index, time, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
            index = index.cuda()
            time = time.cuda()
        batch_size = (inputs[0][0].size(0)
                      if isinstance(inputs[0], list) else inputs[0].size(0))
        val_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"]
            metadata = meta["metadata"]

            if cfg.NUM_GPUS:
                preds = preds.cpu()
                ori_boxes = ori_boxes.cpu()
                metadata = metadata.cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds, ori_boxes, metadata)
        else:
            if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                if not cfg.CONTRASTIVE.KNN_ON:
                    return
                train_labels = (model.module.train_labels
                                if hasattr(model, "module") else model.train_labels)
                yd, yi = model(inputs, index, time)
                K = yi.shape[1]
                C = cfg.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM  # e.g. 400 for Kinetics-400
                candidates = train_labels.view(1, -1).expand(batch_size, -1)
                retrieval = torch.gather(candidates, 1, yi)
                retrieval_one_hot = torch.zeros((batch_size * K, C)).cuda()
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                yd_transform = yd.clone().div_(cfg.CONTRASTIVE.T).exp_()
                probs = torch.mul(
                    retrieval_one_hot.view(batch_size, -1, C),
                    yd_transform.view(batch_size, -1, 1),
                )
                preds = torch.sum(probs, 1)
            else:
                preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    # If running on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
                    batch_size * max(cfg.NUM_GPUS, 1),
                )
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err,
                        },
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars({"Val/mAP": val_meter.full_map}, global_step=cur_epoch)
        else:
            all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
            all_labels = [label.clone().detach() for label in val_meter.all_labels]
            if cfg.NUM_GPUS:
                all_preds = [pred.cpu() for pred in all_preds]
                all_labels = [label.cpu() for label in all_labels]
            writer.plot_eval(preds=all_preds, labels=all_labels, global_step=cur_epoch)

    val_meter.reset()
def test_precedence_semantics(self):
    """Test semantics for __torch_function__ for functions that take
    multiple arguments

    For functions that take multiple arguments, the appropriate
    __torch_function__ implementation to call is determined by
    examining the types of the arguments. The precedence order is
    left-to-right in the argument list, except subclasses are always
    checked before superclasses. The first result of calling the
    implementations in precedence order that is not NotImplemented is
    returned to the user. If all implementations return
    NotImplemented, a TypeError is raised.

    All cases are tested with functions implemented in C++ and either
    foo or baz, which are python functions defined above that are
    instrumented to obey the same dispatch rules as the functions in
    torch.functional.
    """
    # DiagonalTensor has a valid override and SubDiagonalTensor has an
    # override that returns NotImplemented so we should call the
    # DiagonalTensor implementation, returning -1
    t1 = DiagonalTensor(5, 2)
    t2 = SubDiagonalTensor(5, 2)
    self.assertEqual(torch.div(t1, t2), -1)
    self.assertEqual(torch.div(t2, t1), -1)
    self.assertEqual(foo(t1, t2), -1)
    self.assertEqual(foo(t2, t1), -1)

    # SubTensor has an implementation that returns NotImplemented as
    # well so it should behave exactly like SubDiagonalTensor in the
    # test above
    t3 = SubTensor([[1, 2], [1, 2]])
    self.assertEqual(torch.div(t1, t3), -1)
    self.assertEqual(torch.div(t3, t1), -1)
    self.assertEqual(foo(t1, t3), -1)
    self.assertEqual(foo(t3, t1), -1)

    # div between SubTensor and SubDiagonalTensor should raise
    # TypeError since both have an implementation that
    # explicitly returns NotImplemented
    with self.assertRaises(TypeError):
        torch.div(t2, t3)
    with self.assertRaises(TypeError):
        torch.div(t3, t2)
    with self.assertRaises(TypeError):
        foo(t2, t3)
    with self.assertRaises(TypeError):
        foo(t3, t2)

    # none of DiagonalTensor, SubDiagonalTensor, or SubTensor have a
    # mul or a baz implementation so all ops should raise TypeError
    with self.assertRaises(TypeError):
        torch.mul(t1, t1)
    with self.assertRaises(TypeError):
        torch.mul(t1, t2)
    with self.assertRaises(TypeError):
        torch.mul(t1, t3)
    with self.assertRaises(TypeError):
        torch.mul(t2, t1)
    with self.assertRaises(TypeError):
        torch.mul(t2, t2)
    with self.assertRaises(TypeError):
        torch.mul(t2, t3)
    with self.assertRaises(TypeError):
        torch.mul(t3, t1)
    with self.assertRaises(TypeError):
        torch.mul(t3, t2)
    with self.assertRaises(TypeError):
        torch.mul(t3, t3)
    with self.assertRaises(TypeError):
        baz(t1, t1)
    with self.assertRaises(TypeError):
        baz(t1, t2)
    with self.assertRaises(TypeError):
        baz(t1, t3)
    with self.assertRaises(TypeError):
        baz(t2, t1)
    with self.assertRaises(TypeError):
        baz(t2, t2)
    with self.assertRaises(TypeError):
        baz(t2, t3)
    with self.assertRaises(TypeError):
        baz(t3, t1)
    with self.assertRaises(TypeError):
        baz(t3, t2)
    with self.assertRaises(TypeError):
        baz(t3, t3)
def normalize_weights(network, eps=1e-3):
    """
    'Normalize' the weights of a network, so that for each hidden neuron, the
    norm of incoming weights to that neuron is sqrt(2), dividing the outputs
    of that neuron by the factor that the inputs were multiplied by. For a
    ReLU network, this operation preserves network functionality.

    network: a neural network. has to inherit from torch.nn.Module.
        Currently probably has to be an MLP.
    eps: a float that should be small relative to sqrt(2), to add stability.
    returns nothing: just modifies the network in-place
    """
    layers = get_weight_modules_from_live_net(network)
    for idx in range(len(layers) - 1):
        this_layer = layers[idx]
        next_layer = layers[idx + 1]
        assert 'fc_mod' in this_layer or 'conv_mod' in this_layer
        assert 'fc_mod' in next_layer or 'conv_mod' in next_layer

        inc_raw_weight_mod = (this_layer['fc_mod'] if 'fc_mod' in this_layer
                              else this_layer['conv_mod'])
        inc_raw_weights = inc_raw_weight_mod.weight
        inc_raw_bias = inc_raw_weight_mod.bias
        inc_weights_np = inc_raw_weights.detach().cpu().numpy()
        inc_biases_np = inc_raw_bias.detach().cpu().numpy()
        if 'bn_mod' in this_layer:
            bn_mod = this_layer['bn_mod']
            if hasattr(bn_mod, 'weight') and bn_mod.weight is not None:
                bn_weights_np = bn_mod.weight.detach().cpu().numpy()
                inc_weights_np = size_and_multiply_np(bn_weights_np, inc_weights_np)
            inc_weights_np = size_sqrt_divide_np(bn_mod.running_var, inc_weights_np)

        outgoing_weight_mod = (next_layer['fc_mod'] if 'fc_mod' in next_layer
                               else next_layer['conv_mod'])
        outgoing_weights = outgoing_weight_mod.weight
        num_neurons = inc_weights_np.shape[0]
        assert outgoing_weights.shape[1] % num_neurons == 0
        if 'fc_mod' in this_layer and 'fc_mod' in next_layer:
            assert outgoing_weights.shape[1] == num_neurons
        if 'conv_mod' in this_layer and 'conv_mod' in next_layer:
            assert outgoing_weights.shape[1] == num_neurons

        unsqueezed_bias = np.expand_dims(inc_biases_np, axis=1)
        flat_weights = inc_weights_np.reshape(inc_weights_np.shape[0], -1)
        all_inc_weights = np.concatenate((flat_weights, unsqueezed_bias), axis=1)
        scales = np.linalg.norm(all_inc_weights, axis=1)
        scales /= np.sqrt(2.)
        scales += eps
        scales = torch.from_numpy(scales)
        scales_rows = torch.unsqueeze(scales, 1)
        for i in range(2, len(inc_raw_weights.shape)):
            scales_rows = torch.unsqueeze(scales_rows, i)
        scales_mul = vector_stretch(scales, outgoing_weights.shape[1])
        for i in range(1, len(outgoing_weights.shape) - 1):
            scales_mul = torch.unsqueeze(scales_mul, i)

        incoming_weights_unpruned = True
        incoming_biases_unpruned = True
        outgoing_weights_unpruned = True
        for name, param in inc_raw_weight_mod.named_parameters():
            if name == 'weight_orig':
                param.data = torch.div(param, scales_rows)
                incoming_weights_unpruned = False
            if name == 'bias_orig':
                param.data = torch.div(param, scales)
                incoming_biases_unpruned = False
        for name, param in outgoing_weight_mod.named_parameters():
            if name == 'weight_orig':
                param.data = torch.mul(param, scales_mul)
                outgoing_weights_unpruned = False
        if incoming_weights_unpruned:
            inc_raw_weight_mod.weight.data = torch.div(inc_raw_weights, scales_rows)
        if incoming_biases_unpruned:
            inc_raw_weight_mod.bias.data = torch.div(inc_raw_bias, scales)
        if outgoing_weights_unpruned:
            outgoing_weight_mod.weight.data = torch.mul(outgoing_weights, scales_mul)
def run_test_row_parallel_linear(rank, model_parallel_size, filename, filename_rpc):
    dist_init(rank, model_parallel_size, filename, filename_rpc)

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print("> testing RowParallelLinear with model parallel size: {}".format(
            model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = layers.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(dLdA, input_size_coeff, dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print("   error in dLdA on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print("   error in dLdb on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print("   error in dLdX on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(" >> passed the test :-)")
def one_step_forward(self, input, state, for_out):
    """
    Given input and state, move one step forward in time. Also compute the
    derivative with respect to this input if it is not for_out.
    :param input: input at time t
    :param state: state at time t-1
    :param for_out: training or testing
    :return: state at time t, gradient with respect to input
    """
    gate_inputs = torch.matmul(torch.cat([input, state], 1), self.gate_kernel)
    gate_inputs = gate_inputs + self.gate_bias
    value = F.sigmoid(gate_inputs)
    r, u = torch.chunk(value, chunks=2, dim=1)

    r_state = r * state
    candidate = torch.matmul(torch.cat([input, r_state], 1), self.candidate_kernel)
    candidate = candidate + self.candidate_bias
    c = self.activation(candidate)
    new_h = u * state + (1 - u) * c

    if for_out or not self.derivative_needed:
        return new_h, None

    # Extract the ".data" for each of the variables so that further
    # computations do not affect the gradients of these variables.
    u = u.data
    c = c.data
    state = state.data
    value = value.data

    start = torch.ones([1, self.hidden_units])
    du = torch.mul(start, state)
    # dstate = torch.mul(start, u)
    du = du - torch.mul(start, c)
    dc = torch.mul(start, 1 - u)  # 50 x 50
    if self.activation == F.leaky_relu:
        dcandidate = (dc * torch.where(candidate > 0,
                                       torch.ones_like(candidate),
                                       0.01 * torch.ones_like(candidate)))  # 50x50 * 1x50
    else:
        dcandidate = dc * (1 - c**2)
    dinputs_rstate = torch.matmul(
        dcandidate, torch.transpose(self.candidate_kernel, dim0=1, dim1=0))  # 50 x 350
    dinputs = dinputs_rstate[:, :300]
    drstate = dinputs_rstate[:, 300:]
    # dstate = dstate + r * drstate
    dr = state * drstate
    dru = torch.cat([dr, du], dim=1)
    dgateinputs = dru * (value * (1 - value))
    dinputs_state = torch.matmul(
        dgateinputs, torch.transpose(self.gate_kernel, dim0=1, dim1=0))
    dinputs = dinputs + dinputs_state[:, :300]
    # dstate = dstate + dinputs_state[:, 300:]
    return new_h, dinputs
def native_layer_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    normalized_shape: List[int],
    mean: Tensor,
    rstd: Tensor,
    weight: Optional[Tensor],
    bias: Optional[Tensor],
    output_mask: List[bool],
) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
    input_shape = input.shape
    input_ndim = input.dim()

    axis = input_ndim - len(normalized_shape)
    inner_dims = input_shape[axis:]
    outer_dims = input_shape[:axis]
    inner_dim_indices = list(range(axis, input_ndim))
    outer_dim_indices = list(range(0, axis))

    N = 1
    for i in inner_dims:
        N *= i
    M = 1
    for i in outer_dims:
        M *= i

    if M <= 0 or N <= 0:
        return (
            input.new_zeros(input_shape),
            input.new_zeros(input_shape[axis:]),
            input.new_zeros(input_shape[axis:]),
        )

    mean_, rstd_ = recompute_mean_var(input, rstd, inner_dim_indices, keepdim=True)

    x_hat = (input - mean_) * rstd_
    if weight is not None:
        grad_x_hat = grad_out * weight
    else:
        grad_x_hat = grad_out
    a = grad_x_hat * N
    b = torch.sum(grad_x_hat, inner_dim_indices, True)
    c1 = torch.mul(grad_x_hat, x_hat)
    c2 = torch.sum(c1, inner_dim_indices, True)
    c3 = torch.mul(x_hat, c2)
    inner = a - b - c3

    if output_mask[0]:
        d_input: Optional[Tensor] = (rstd_ / N) * inner
    else:
        d_input = torch.zeros_like(input)  # should be None but doesn't work with vjp

    if output_mask[1] and weight is not None:
        if len(outer_dim_indices) > 0:
            d_weight: Optional[Tensor] = torch.sum(grad_out * x_hat, outer_dim_indices, False)
        else:
            d_weight = grad_out * x_hat
    elif weight is not None:
        d_weight = torch.zeros_like(weight)  # should be None but doesn't work with vjp
    else:
        d_weight = torch.zeros(())  # should be None but doesn't work with vjp

    if output_mask[2] and bias is not None:
        if len(outer_dim_indices) > 0:
            d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False)
        else:
            d_bias = grad_out
    elif bias is not None:
        d_bias = torch.zeros_like(bias)  # should be None but doesn't work with vjp
    else:
        d_bias = torch.zeros(())  # should be None but doesn't work with vjp

    return (d_input, d_weight, d_bias)
def _fusion_att(self, x_v, x_q):
    x_att = torch.mul(x_v, x_q)
    return x_att
def _abs_square(x):
    return torch.mul(x, x)
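# Sanity check (sketch): _abs_square(x) equals x.pow(2) elementwise for real
# inputs; note torch.mul(x, x) is x*x, which for complex x is not |x|^2.
import torch

x = torch.tensor([-3.0, 2.0])
print(_abs_square(x))  # tensor([9., 4.])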
def main(args):
    """ Main translation function """
    # Load arguments from checkpoint
    torch.manual_seed(args.seed)
    state_dict = torch.load(
        args.checkpoint_path,
        map_location=lambda s, l: default_restore_location(s, 'cpu'))
    args_loaded = argparse.Namespace(**{**vars(args), **vars(state_dict['args'])})
    args_loaded.data = args.data
    args = args_loaded
    utils.init_logging(args)

    # Load dictionaries
    src_dict = Dictionary.load(
        os.path.join(args.data, 'dict.{:s}'.format(args.source_lang)))
    logging.info('Loaded a source dictionary ({:s}) with {:d} words'.format(
        args.source_lang, len(src_dict)))
    tgt_dict = Dictionary.load(
        os.path.join(args.data, 'dict.{:s}'.format(args.target_lang)))
    logging.info('Loaded a target dictionary ({:s}) with {:d} words'.format(
        args.target_lang, len(tgt_dict)))

    # Load dataset
    test_dataset = Seq2SeqDataset(
        src_file=os.path.join(args.data, 'test.{:s}'.format(args.source_lang)),
        tgt_file=os.path.join(args.data, 'test.{:s}'.format(args.target_lang)),
        src_dict=src_dict, tgt_dict=tgt_dict)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, num_workers=1, collate_fn=test_dataset.collater,
        batch_sampler=BatchSampler(test_dataset, 9999999, args.batch_size,
                                   1, 0, shuffle=False, seed=args.seed))

    # Build model and criterion
    model = models.build_model(args, src_dict, tgt_dict)
    if args.cuda:
        model = model.cuda()
    model.eval()
    model.load_state_dict(state_dict['model'])
    logging.info('Loaded a model from checkpoint {:s}'.format(args.checkpoint_path))
    progress_bar = tqdm(test_loader, desc='| Generation', leave=False)

    # Iterate over the test set
    all_hyps = {}
    for i, sample in enumerate(progress_bar):

        # Create a beam search object for every input sentence in batch
        batch_size = sample['src_tokens'].shape[0]
        searches = [BeamSearch(args.beam_size, args.max_len - 1, tgt_dict.unk_idx)
                    for i in range(batch_size)]

        with torch.no_grad():
            # Compute the encoder output
            encoder_out = model.encoder(sample['src_tokens'], sample['src_lengths'])
            outlen = len(tgt_dict.string(sample['src_tokens']))
            print("-words len:", outlen)

            # __QUESTION 1: What is "go_slice" used for and what do its dimensions represent?
            go_slice = \
                torch.ones(sample['src_tokens'].shape[0], 1).fill_(tgt_dict.eos_idx).type_as(sample['src_tokens'])
            if args.cuda:
                go_slice = utils.move_to_cuda(go_slice)

            # Compute the decoder output at the first time step
            decoder_out, _ = model.decoder(go_slice, encoder_out)
            print("-content:", decoder_out)
            # Length-normalize the scores; NOTE: `a` is the length-penalty
            # exponent and must be defined elsewhere in this module.
            lp_y = 1 / ((5 + outlen**a) / ((5 + 1)**a))
            decoder_out = torch.mul(decoder_out, lp_y)
            print("-normalized:", decoder_out)

        # __QUESTION 2: Why do we keep one top candidate more than the beam size?
        log_probs, next_candidates = torch.topk(
            torch.log(torch.softmax(decoder_out, dim=2)),
            args.beam_size + 1, dim=-1)

        # Create number of beam_size beam search nodes for every input sentence
        for i in range(batch_size):
            for j in range(args.beam_size):
                best_candidate = next_candidates[i, :, j]
                backoff_candidate = next_candidates[i, :, j + 1]
                best_log_p = log_probs[i, :, j]
                backoff_log_p = log_probs[i, :, j + 1]
                next_word = torch.where(best_candidate == tgt_dict.unk_idx,
                                        backoff_candidate, best_candidate)
                log_p = torch.where(best_candidate == tgt_dict.unk_idx,
                                    backoff_log_p, best_log_p)
                log_p = log_p[-1]

                # Store the encoder_out information for the current input sentence and beam
                emb = encoder_out['src_embeddings'][:, i, :]
                lstm_out = encoder_out['src_out'][0][:, i, :]
                final_hidden = encoder_out['src_out'][1][:, i, :]
                final_cell = encoder_out['src_out'][2][:, i, :]
                try:
                    mask = encoder_out['src_mask'][i, :]
                except TypeError:
                    mask = None

                node = BeamSearchNode(searches[i], emb, lstm_out, final_hidden,
                                      final_cell, mask,
                                      torch.cat((go_slice[i], next_word)), log_p, 1)
                # __QUESTION 3: Why do we add the node with a negative score?
                searches[i].add(-node.eval(), node)

        # Start generating further tokens until max sentence length reached
        for _ in range(args.max_len - 1):

            # Get the current nodes to expand
            nodes = [n[1] for s in searches for n in s.get_current_beams()]
            if nodes == []:
                break  # All beams ended in EOS

            # Reconstruct prev_words, encoder_out from current beam search nodes
            prev_words = torch.stack([node.sequence for node in nodes])
            encoder_out["src_embeddings"] = torch.stack(
                [node.emb for node in nodes], dim=1)
            lstm_out = torch.stack([node.lstm_out for node in nodes], dim=1)
            final_hidden = torch.stack([node.final_hidden for node in nodes], dim=1)
            final_cell = torch.stack([node.final_cell for node in nodes], dim=1)
            encoder_out["src_out"] = (lstm_out, final_hidden, final_cell)
            try:
                encoder_out["src_mask"] = torch.stack(
                    [node.mask for node in nodes], dim=0)
            except TypeError:
                encoder_out["src_mask"] = None

            with torch.no_grad():
                # Compute the decoder output by feeding it the decoded sentence prefix
                decoder_out, _ = model.decoder(prev_words, encoder_out)

            # see __QUESTION 2
            log_probs, next_candidates = torch.topk(
                torch.log(torch.softmax(decoder_out, dim=2)),
                args.beam_size + 1, dim=-1)

            # Create number of beam_size next nodes for every current node
            for i in range(log_probs.shape[0]):
                for j in range(args.beam_size):
                    best_candidate = next_candidates[i, :, j]
                    backoff_candidate = next_candidates[i, :, j + 1]
                    best_log_p = log_probs[i, :, j]
                    backoff_log_p = log_probs[i, :, j + 1]
                    next_word = torch.where(best_candidate == tgt_dict.unk_idx,
                                            backoff_candidate, best_candidate)
                    log_p = torch.where(best_candidate == tgt_dict.unk_idx,
                                        backoff_log_p, best_log_p)
                    log_p = log_p[-1]
                    next_word = torch.cat((prev_words[i][1:], next_word[-1:]))

                    # Get parent node and beam search object for corresponding sentence
                    node = nodes[i]
                    search = node.search

                    # __QUESTION 4: How are "add" and "add_final" different?
                    # What would happen if we did not make this distinction?
# Store the node as final if EOS is generated if next_word[-1] == tgt_dict.eos_idx: node = BeamSearchNode( search, node.emb, node.lstm_out, node.final_hidden, node.final_cell, node.mask, torch.cat((prev_words[i][0].view([1]), next_word)), node.logp, node.length) search.add_final(-node.eval(), node) # Add the node to current nodes for next iteration else: node = BeamSearchNode( search, node.emb, node.lstm_out, node.final_hidden, node.final_cell, node.mask, torch.cat((prev_words[i][0].view([1]), next_word)), node.logp + log_p, node.length + 1) search.add(-node.eval(), node) # __QUESTION 5: What happens internally when we prune our beams? # How do we know we always maintain the best sequences? for search in searches: search.prune() # Segment into sentences best_sents = torch.stack( [search.get_best()[1].sequence[1:].cpu() for search in searches]) decoded_batch = best_sents.numpy() output_sentences = [ decoded_batch[row, :] for row in range(decoded_batch.shape[0]) ] # __QUESTION 6: What is the purpose of this for loop? temp = list() for sent in output_sentences: first_eos = np.where(sent == tgt_dict.eos_idx)[0] if len(first_eos) > 0: temp.append(sent[:first_eos[0]]) else: temp.append(sent) output_sentences = temp # Convert arrays of indices into strings of words output_sentences = [tgt_dict.string(sent) for sent in output_sentences] for ii, sent in enumerate(output_sentences): all_hyps[int(sample['id'].data[ii])] = sent # Write to file if args.output is not None: with open(args.output, 'w') as out_file: for sent_id in range(len(all_hyps.keys())): out_file.write(all_hyps[sent_id] + '\n')
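# --- A minimal sketch of GNMT-style length normalization, relating to the
# `lp_y` factor in main() above. Everything here is toy data; `alpha` (the
# penalty exponent) is an assumed hyperparameter, and note that this
# canonical formula raises the whole (5 + length) term to alpha, whereas the
# code above exponentiates `outlen` alone.
import torch

def length_penalty(length: int, alpha: float = 0.6) -> float:
    # lp(Y) = ((5 + |Y|) / 6) ** alpha, as in Wu et al. (2016)
    return ((5.0 + length) / 6.0) ** alpha

# Toy usage: compare cumulative log-probabilities of hypotheses of
# different lengths on a length-normalized scale.
log_probs = torch.tensor([-4.2, -6.1, -9.3])
lengths = [3, 5, 9]
normalized = log_probs / torch.tensor([length_penalty(n) for n in lengths])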
def train_dqn(model, options, resume):
    """Train DQN.

    model -- DQN model
    lr -- learning rate
    max_episode -- maximum episode
    resume -- resume previous model
    model_name -- checkpoint file name
    """
    best_time_step = 0.
    if resume:
        if options.weight is None:
            print('when resuming, you should give a weight file name.')
            return
        print('load previous model weight: {}'.format(options.weight))
        _, _, best_time_step = load_checkpoint(options.weight, model)

    flappyBird = game.GameState()
    optimizer = optim.RMSprop(model.parameters(), lr=options.lr)
    criterion = nn.MSELoss()

    action = [1, 0]
    o, r, terminal = flappyBird.frame_step(action)
    o = preprocess(o)
    model.set_initial_state()

    if options.cuda:
        model = model.cuda()

    # In the first `observation` time steps, we don't train the model
    for i in range(options.observation):
        action = model.get_action_randomly()
        o, r, terminal = flappyBird.frame_step(action)
        o = preprocess(o)
        model.store_transition(o, action, r, terminal)

    # Start training
    for episode in range(options.max_episode):
        model.time_step = 0
        model.set_train()
        total_reward = 0.

        # Begin an episode!
        while True:
            optimizer.zero_grad()
            action = model.get_action()
            o_next, r, terminal = flappyBird.frame_step(action)
            total_reward += options.gamma**model.time_step * r
            o_next = preprocess(o_next)
            model.store_transition(o_next, action, r, terminal)
            model.increase_time_step()

            # Step 1: obtain a random minibatch from replay memory
            minibatch = random.sample(model.replay_memory, options.batch_size)
            state_batch = np.array([data[0] for data in minibatch])
            action_batch = np.array([data[1] for data in minibatch])
            reward_batch = np.array([data[2] for data in minibatch])
            next_state_batch = np.array([data[3] for data in minibatch])
            state_batch_var = Variable(torch.from_numpy(state_batch))
            next_state_batch_var = Variable(torch.from_numpy(next_state_batch),
                                            volatile=True)
            if options.cuda:
                state_batch_var = state_batch_var.cuda()
                next_state_batch_var = next_state_batch_var.cuda()

            # Step 2: calculate the TD target y
            q_value_next = model.forward(next_state_batch_var)
            q_value = model.forward(state_batch_var)
            y = reward_batch.astype(np.float32)
            max_q, _ = torch.max(q_value_next, dim=1)
            for i in range(options.batch_size):
                if not minibatch[i][4]:  # non-terminal transition: bootstrap
                    y[i] += options.gamma * max_q.data[i]

            y = Variable(torch.from_numpy(y))
            action_batch_var = Variable(torch.from_numpy(action_batch))
            if options.cuda:
                y = y.cuda()
                action_batch_var = action_batch_var.cuda()

            # Select the Q-value of the action actually taken (one-hot mask)
            q_value = torch.sum(torch.mul(action_batch_var, q_value), dim=1)

            loss = criterion(q_value, y)
            loss.backward()
            optimizer.step()

            # When the bird dies, the episode ends
            if terminal:
                break

        print('episode: {}, epsilon: {:.4f}, max time step: {}, total reward: {:.6f}'.format(
            episode, model.epsilon, model.time_step, total_reward))

        # Anneal epsilon linearly towards final_e
        if model.epsilon > options.final_e:
            delta = (options.init_e - options.final_e) / options.exploration
            model.epsilon -= delta

        if episode % 100 == 0:
            ave_time = test_dqn(model, episode)

        if ave_time > best_time_step:
            best_time_step = ave_time
            save_checkpoint({
                'episode': episode,
                'epsilon': model.epsilon,
                'state_dict': model.state_dict(),
                'best_time_step': best_time_step,
            }, True, 'checkpoint-episode-%d.pth.tar' % episode)
        elif episode % options.save_checkpoint_freq == 0:
            save_checkpoint({
                'episode': episode,
                'epsilon': model.epsilon,
                'state_dict': model.state_dict(),
                'time_step': ave_time,
            }, False, 'checkpoint-episode-%d.pth.tar' % episode)
        else:
            continue
        print('save checkpoint, episode={}, ave time step={:.2f}'.format(
            episode, ave_time))
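# --- A minimal, self-contained sketch of the TD target in "Step 2" of
# train_dqn above, vectorized instead of the per-sample Python loop. All
# tensors are toy data; only the Bellman-target logic is illustrated.
import torch

gamma = 0.99
rewards = torch.tensor([1.0, -1.0, 0.1])           # r_t
terminal = torch.tensor([False, True, False])      # did the episode end?
q_next = torch.tensor([[0.2, 0.5], [0.1, 0.3], [0.4, 0.0]])  # Q(s', .)

max_q_next, _ = q_next.max(dim=1)
# y = r                     if terminal
# y = r + gamma * max Q(s') otherwise
y = rewards + gamma * max_q_next * (~terminal).float()
print(y)  # tensor([1.4950, -1.0000, 0.4960])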
def run_test_parallel_embedding(rank, model_parallel_size, filename, filename_rpc):
    dist_init(rank, model_parallel_size, filename, filename_rpc)

    if torch.distributed.get_rank() == 0:
        print('> testing parallel embedding with model parallel size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 17
    seq_length = 23
    vocab_size = 48
    hidden_size = 16
    seed = 1236

    set_random_seed(123)
    input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(
        0, vocab_size).cuda()
    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()

    # Reference: a plain (non-parallel) embedding
    set_random_seed(seed)
    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
    output = embedding_original(input_data)
    loss_original = torch.mul(output, loss_weight).sum()
    loss_original.backward()

    # Embedding sharded along the hidden dimension
    set_random_seed(seed)
    embedding_parallel = layers.ParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_parallel(input_data)
    loss_parallel = torch.mul(output, loss_weight).sum()
    loss_parallel.backward()

    # Embedding sharded along the vocabulary dimension
    set_random_seed(seed)
    embedding_vocab_parallel = layers.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_vocab_parallel(input_data)
    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
    loss_vocab_parallel.backward()

    torch.distributed.barrier()
    error = loss_parallel.sub(loss_original).abs()
    print('   error in loss (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    torch.distributed.barrier()
    error = loss_vocab_parallel.sub(loss_original).abs()
    print('   error in loss (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    # The gradient of each sharded embedding should match the corresponding
    # slice of the reference gradient: hidden-dim slice for ParallelEmbedding
    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   hidden_size // model_parallel_size,
                                   1)[mpu.get_model_parallel_rank()]
    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print('   error in grad (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    # ... and vocab-dim slice for VocabParallelEmbedding
    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   vocab_size // model_parallel_size,
                                   0)[mpu.get_model_parallel_rank()]
    error = embedding_vocab_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print('   error in grad (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
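# --- A minimal single-process sketch of the idea behind the vocab-sharded
# embedding tested above: each rank owns a contiguous slice of the vocabulary
# rows, looks up only the ids that fall into its slice, and the partial
# results are summed across ranks. The summation here stands in for the
# all-reduce; no distributed setup is required for this toy check.
import torch

vocab_size, hidden_size, world_size = 48, 16, 4
full_weight = torch.randn(vocab_size, hidden_size)
ids = torch.randint(0, vocab_size, (5,))

per_rank = vocab_size // world_size
partials = []
for rank in range(world_size):
    lo, hi = rank * per_rank, (rank + 1) * per_rank
    shard = full_weight[lo:hi]                        # rows owned by this rank
    mask = (ids >= lo) & (ids < hi)
    local_ids = (ids - lo).clamp(0, per_rank - 1)     # safe index into shard
    out = shard[local_ids] * mask.unsqueeze(1).float()  # zero non-owned rows
    partials.append(out)

combined = torch.stack(partials).sum(0)  # stands in for the all-reduce
assert torch.allclose(combined, full_weight[ids])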
def mean_squared_error(prediction, target):
    # Note: despite the name, this returns the *negative sum* of squared
    # errors per example (no averaging), so that higher scores are better.
    prediction, target = flatten(prediction), flatten(target)
    diff = prediction - target
    return -torch.sum(torch.mul(diff, diff), 1)
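# --- A toy usage sketch for mean_squared_error above. `flatten` is not shown
# in this snippet, so a plausible implementation (collapse everything after
# the batch dimension) is assumed here.
import torch

def flatten(x):
    return x.reshape(x.size(0), -1)

pred = torch.tensor([[1.0, 2.0], [0.0, 0.0]])
tgt = torch.tensor([[1.0, 0.0], [1.0, 1.0]])
scores = mean_squared_error(pred, tgt)
print(scores)  # tensor([-4., -2.]): the second example is the closer match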
def update_model(self) -> Tuple[torch.Tensor, ...]:
    """Train the model after each episode."""
    self.update_step += 1

    experiences, demos = self.memory.sample(), self.demo_memory.sample()
    states, actions, rewards, next_states, dones = experiences
    demo_states, demo_actions, _, _, _ = demos
    new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states)
    pred_actions, _, _, _, _ = self.actor(demo_states)

    # train alpha (the entropy temperature)
    if self.hyper_params["AUTO_ENTROPY_TUNING"]:
        alpha_loss = (-self.log_alpha *
                      (log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        alpha = self.log_alpha.exp()
    else:
        alpha_loss = torch.zeros(1)
        alpha = self.hyper_params["W_ENTROPY"]

    # Q function loss
    masks = 1 - dones
    q_1_pred = self.qf_1(states, actions)
    q_2_pred = self.qf_2(states, actions)
    v_target = self.vf_target(next_states)
    q_target = rewards + self.hyper_params["GAMMA"] * v_target * masks
    qf_1_loss = F.mse_loss(q_1_pred, q_target.detach())
    qf_2_loss = F.mse_loss(q_2_pred, q_target.detach())

    # V function loss
    v_pred = self.vf(states)
    q_pred = torch.min(self.qf_1(states, new_actions),
                       self.qf_2(states, new_actions))
    v_target = q_pred - alpha * log_prob
    vf_loss = F.mse_loss(v_pred, v_target.detach())

    # train Q functions
    self.qf_1_optimizer.zero_grad()
    qf_1_loss.backward()
    self.qf_1_optimizer.step()

    self.qf_2_optimizer.zero_grad()
    qf_2_loss.backward()
    self.qf_2_optimizer.step()

    # train V function
    self.vf_optimizer.zero_grad()
    vf_loss.backward()
    self.vf_optimizer.step()

    if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0:
        # bc loss: imitate demo actions only where the critic scores the
        # demonstration higher than the policy's own action
        qf_mask = torch.gt(
            self.qf_1(demo_states, demo_actions),
            self.qf_1(demo_states, pred_actions),
        ).to(device)
        qf_mask = qf_mask.float()
        n_qf_mask = int(qf_mask.sum().item())

        if n_qf_mask == 0:
            bc_loss = torch.zeros(1, device=device)
        else:
            bc_loss = (
                torch.mul(pred_actions, qf_mask) -
                torch.mul(demo_actions, qf_mask)
            ).pow(2).sum() / n_qf_mask

        # actor loss
        advantage = q_pred - v_pred.detach()
        actor_loss = (alpha * log_prob - advantage).mean()
        actor_loss = self.lambda1 * actor_loss + self.lambda2 * bc_loss

        # regularization (only if the action space is continuous)
        if not self.is_discrete:
            mean_reg = self.hyper_params["W_MEAN_REG"] * mu.pow(2).mean()
            std_reg = self.hyper_params["W_STD_REG"] * std.pow(2).mean()
            pre_activation_reg = self.hyper_params["W_PRE_ACTIVATION_REG"] * (
                pre_tanh_value.pow(2).sum(dim=-1).mean())
            actor_reg = mean_reg + std_reg + pre_activation_reg

            # actor loss + regularization
            actor_loss += actor_reg

        # train actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        common_utils.soft_update(self.vf, self.vf_target,
                                 self.hyper_params["TAU"])
    else:
        actor_loss = torch.zeros(1)
        n_qf_mask = 0

    return (
        actor_loss.item(),
        qf_1_loss.item(),
        qf_2_loss.item(),
        vf_loss.item(),
        alpha_loss.item(),
        n_qf_mask,
    )
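# --- A minimal sketch of the Polyak (soft) target update invoked via
# common_utils.soft_update in update_model above; the helper's name and the
# meaning of TAU are inferred from the call site, so treat this as an
# illustration rather than the library's actual implementation.
import torch
import torch.nn as nn

def soft_update(net: nn.Module, target_net: nn.Module, tau: float) -> None:
    """target <- tau * net + (1 - tau) * target, parameter-wise."""
    for p, tp in zip(net.parameters(), target_net.parameters()):
        tp.data.copy_(tau * p.data + (1.0 - tau) * tp.data)

# Toy usage: nudge a target value network towards the online one.
vf = nn.Linear(4, 1)
vf_target = nn.Linear(4, 1)
soft_update(vf, vf_target, tau=5e-3)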
def native_batch_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    weight: Optional[Tensor],
    running_mean: Optional[Tensor],
    running_var: Optional[Tensor],
    save_mean: Optional[Tensor],
    save_invstd: Optional[Tensor],
    train: bool,
    eps: float,
    output_mask: List[bool],
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
    input_shape = input.shape
    input_rank = input.dim()
    assert input_rank >= 2, "rank of the input must be at least 2"

    axis = 1  # channel axis
    num_features = prod(input_shape) / input_shape[axis]
    mean = save_mean
    invstd = save_invstd
    if train:
        assert save_mean is not None and save_invstd is not None, \
            "when train=True, save_mean and save_invstd are required"
        reduction_dims = [0] + list(range(2, input.dim()))
        assert invstd is not None  # for typing
        mean, invstd = recompute_mean_var(input, invstd, reduction_dims,
                                          keepdim=False)
    else:
        assert running_mean is not None and running_var is not None
        mean = running_mean
        invstd = torch.rsqrt(running_var + eps)

    # Shape that broadcasts per-channel statistics over the input
    broadcast_mask = [1] * input_rank
    broadcast_mask[axis] = input_shape[axis]

    reduction_axes: List[int] = []
    for i in range(input_rank):
        if i != axis:
            reduction_axes.append(i)

    mean = torch.reshape(mean, broadcast_mask)
    norm = 1.0 / num_features
    grad_output_sum = torch.sum(grad_out, reduction_axes)
    dot_p = torch.sum(grad_out * (input - mean), reduction_axes)

    grad_mean = torch.reshape(grad_output_sum * norm, broadcast_mask)
    proj_scale = torch.reshape(torch.mul(dot_p * norm, invstd * invstd),
                               broadcast_mask)

    if weight is None:
        grad_scale = torch.reshape(invstd, broadcast_mask) * 1.0
    else:
        grad_scale = torch.reshape(invstd * weight, broadcast_mask)

    if train:
        # Training mode: the batch statistics depend on the input, so the
        # gradient includes the projection and mean-correction terms
        proj = (input - mean) * proj_scale
        grad_input = ((grad_out - proj) - grad_mean) * grad_scale
    else:
        # Eval mode: running statistics are constants w.r.t. the input
        grad_input = grad_out * grad_scale

    if output_mask[1]:
        grad_weight = dot_p * invstd
    elif weight is not None:
        grad_weight = torch.zeros_like(weight)  # should be None but doesn't work with vjp
    else:
        grad_weight = torch.zeros(())  # should be None but doesn't work with vjp

    if output_mask[2]:
        grad_bias = grad_output_sum
    else:
        grad_bias = torch.zeros_like(grad_output_sum)  # should be None but doesn't work with vjp

    return (grad_input, grad_weight, grad_bias)
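# --- A small numerical check of the decomposition above against autograd,
# restricted to the eval-mode path (train=False) for simplicity: there,
# grad_input reduces to grad_out * invstd * weight. The reference gradient
# comes from differentiating F.batch_norm directly; tensors are toy data.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(4, 3, 5, 5, requires_grad=True)
weight = torch.randn(3, requires_grad=True)
bias = torch.randn(3, requires_grad=True)
running_mean = torch.randn(3)
running_var = torch.rand(3) + 0.5
eps = 1e-5

out = F.batch_norm(x, running_mean, running_var, weight, bias,
                   training=False, eps=eps)
grad_out = torch.randn_like(out)
(ref_grad_input,) = torch.autograd.grad(out, x, grad_out)

# Eval-mode formula from the function above
invstd = torch.rsqrt(running_var + eps).reshape(1, 3, 1, 1)
grad_input = grad_out * invstd * weight.reshape(1, 3, 1, 1)
assert torch.allclose(grad_input, ref_grad_input, atol=1e-6)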