def _get_sac_trainer_params(env, sac_model_params, use_gpu):
    """Assemble the positional and keyword arguments for a SAC trainer.

    Builds one or two Q-networks, a value network (plus a deep-copied
    target), and a Gaussian actor, all sized from the environment's
    normalization data, then prepares the action-range tensors.

    Args:
        env: Environment object; ``normalization``, ``normalization_action``
            and ``action_space.low`` / ``action_space.high`` are read.
        sac_model_params: SAC configuration; its ``q_network``,
            ``value_network``, ``actor_network`` and ``training`` sections
            are read here.
        use_gpu: When truthy, every network is moved with ``.cuda()`` before
            the target value network is copied.

    Returns:
        ``(trainer_args, trainer_kwargs)``: a list and a dict.
    """
    n_state = get_num_output_features(env.normalization)
    n_action = get_num_output_features(env.normalization_action)

    def build_q_network():
        # Both critics share the same architecture settings.
        return FullyConnectedParametricDQN(
            n_state,
            n_action,
            sac_model_params.q_network.layers,
            sac_model_params.q_network.activations,
        )

    q1_network = build_q_network()
    if sac_model_params.training.use_2_q_functions:
        q2_network = build_q_network()
    else:
        q2_network = None

    value_layers = [n_state] + sac_model_params.value_network.layers + [1]
    value_activations = sac_model_params.value_network.activations + ["linear"]
    value_network = FullyConnectedNetwork(value_layers, value_activations)

    actor_network = GaussianFullyConnectedActor(
        n_state,
        n_action,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        value_network.cuda()
        actor_network.cuda()

    # Copied after the optional move to GPU, so the target follows the source.
    value_network_target = deepcopy(value_network)

    # Training range is the open interval (-1, 1), nudged inward by 1e-6.
    train_shape = (1, n_action)
    train_low = torch.full(train_shape, -1 + 1e-6)
    train_high = torch.full(train_shape, 1 - 1e-6)

    # Serving range comes from the environment's action space, as float32.
    serve_low = torch.from_numpy(
        env.action_space.low.astype(np.float32)
    ).unsqueeze(dim=0)
    serve_high = torch.from_numpy(
        env.action_space.high.astype(np.float32)
    ).unsqueeze(dim=0)

    trainer_args = [
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        sac_model_params,
    ]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": train_low,
        "max_action_range_tensor_training": train_high,
        "min_action_range_tensor_serving": serve_low,
        "max_action_range_tensor_serving": serve_high,
    }
    return trainer_args, trainer_kwargs
def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True):
    """Create the linear layer plus its factorized-noise parameters.

    Args:
        in_features: Number of input features.
        out_features: Number of output features.
        sigma_zero: Base noise scale; the initial sigma value is
            ``sigma_zero / sqrt(in_features)``.
        bias: Forwarded to the parent constructor; when truthy a
            ``sigma_bias`` parameter is created as well.
    """
    super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias)
    noise_scale = sigma_zero / math.sqrt(in_features)
    weight_shape = (out_features, in_features)
    self.sigma_weight = nn.Parameter(torch.full(weight_shape, noise_scale))
    # Noise is stored as one row vector (inputs) and one column vector
    # (outputs); both are buffers, not trainable parameters.
    self.register_buffer("epsilon_input", torch.zeros(1, in_features))
    self.register_buffer("epsilon_output", torch.zeros(out_features, 1))
    if not bias:
        return
    self.sigma_bias = nn.Parameter(torch.full((out_features,), noise_scale))
def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
    """Create the linear layer plus per-weight noise parameters.

    Args:
        in_features: Number of input features.
        out_features: Number of output features.
        sigma_init: Constant used to fill the sigma parameters.
        bias: Forwarded to the parent constructor; when truthy, bias sigma
            and bias noise storage are created too.
    """
    super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
    weight_shape = (out_features, in_features)
    self.sigma_weight = nn.Parameter(torch.full(weight_shape, sigma_init))
    # Noise samples live in buffers so they are not trainable parameters.
    self.register_buffer("epsilon_weight", torch.zeros(*weight_shape))
    if bias:
        bias_shape = (out_features,)
        self.sigma_bias = nn.Parameter(torch.full(bias_shape, sigma_init))
        self.register_buffer("epsilon_bias", torch.zeros(*bias_shape))
    self.reset_parameters()
def test_apply_init():
    """Check that apply_leaf / apply_init fill the expected weights."""
    this_tests(apply_leaf, apply_init)
    model = simple_cnn(b, bn=True)

    def fill_weight_02(module):
        # Set ``weight`` to 0.2 where the module has one; otherwise pass through.
        if hasattr(module, 'weight'):
            return nn.init.constant_(module.weight, 0.2)
        return module

    def fill_07(tensor):
        return nn.init.constant_(tensor, 0.7)

    apply_leaf(model, fill_weight_02)
    apply_init(model, fill_07)

    conv1_w = torch.full([6, 3, 3, 3], 0.7)
    bn1_w = torch.full([6], 0.2)
    first_block = model[0]
    assert conv1_w.equal(first_block[0].weight), \
        "Expected first colvulition layer's weights to be %r" % conv1_w
    assert bn1_w.equal(first_block[2].weight), \
        "Expected first batch norm layers weights to be %r" % bn1_w
def __init__(self, memory_spec, algorithm, body):
    """Read the prioritized-replay settings, then run the parent constructor.

    Args:
        memory_spec: Spec from which ``alpha``, ``epsilon``, ``batch_size``,
            ``max_size`` and ``use_cer`` are copied onto ``self``.
        algorithm: Forwarded to the parent constructor.
        body: Forwarded to the parent constructor.
    """
    spec_keys = [
        'alpha',
        'epsilon',
        'batch_size',
        'max_size',
        'use_cer',
    ]
    util.set_attr(self, memory_spec, spec_keys)
    # Keep epsilon and alpha as one-element tensors rather than plain numbers.
    for attr_name in ('epsilon', 'alpha'):
        setattr(self, attr_name, torch.full((1,), getattr(self, attr_name)))
    super(PrioritizedReplay, self).__init__(memory_spec, algorithm, body)
def pad(self, minibatch):
    """Pad a batch of examples to the length of the longest example.

    Args:
        minibatch (List[torch.FloatTensor]): A list of audio data. Each
            item is indexed here as ``(n_feats, len)``: dimension 0 gives
            the feature count and dimension 1 the variable length.
            NOTE(review): the earlier docstring said ``1 x n_feats x len``,
            which does not match this indexing; confirm against callers.

    Returns:
        torch.FloatTensor or Tuple[torch.FloatTensor, List[int]]: The
        padded tensor of shape ``(batch_size, 1, n_feats, max_len)``,
        together with the list of lengths if ``self.include_lengths`` is
        true; otherwise just the padded tensor.
    """
    assert not self.pad_first and not self.truncate_first \
        and not self.fix_length and self.sequential
    minibatch = list(minibatch)
    lengths = [x.size(1) for x in minibatch]
    max_len = max(lengths)
    nfft = minibatch[0].size(0)
    # Use a float fill value: with an integer pad token, recent PyTorch
    # versions infer an integer dtype for ``torch.full`` and the float
    # spectrograms copied in below would be truncated.
    sounds = torch.full((len(minibatch), 1, nfft, max_len),
                        float(self.pad_token))
    for i, (spect, len_) in enumerate(zip(minibatch, lengths)):
        sounds[i, :, :, 0:len_] = spect
    if self.include_lengths:
        return (sounds, lengths)
    return sounds
def __init__(self, pad, bos, eos, batch_size, device, parallel_paths,
             min_length, block_ngram_repeat, exclusion_tokens,
             return_attention, max_length):
    """Initialise the shared decoding state.

    Args:
        pad, bos, eos: Special token indices, stored as given.
        batch_size: Number of examples; sizes the per-example result lists.
        device: Device on which the state tensors are created.
        parallel_paths: Number of hypotheses tracked per example.
        min_length, max_length: Length limits, stored as given.
        block_ngram_repeat: N-gram blocking setting, stored as given.
        exclusion_tokens: Tokens exempt from blocking, stored as given.
        return_attention: Stored as given.
    """
    # magic indices
    self.pad, self.bos, self.eos = pad, bos, eos

    # result caching: one independent list per batch element
    self.predictions = [[] for _ in range(batch_size)]
    self.scores = [[] for _ in range(batch_size)]
    self.attention = [[] for _ in range(batch_size)]

    # Every path starts as a length-1 sequence holding the BOS token.
    total_paths = batch_size * parallel_paths
    self.alive_seq = torch.full(
        [total_paths, 1], self.bos, dtype=torch.long, device=device)
    self.is_finished = torch.zeros(
        [batch_size, parallel_paths], dtype=torch.uint8, device=device)
    self.alive_attn = None

    # decoding options
    self.min_length = min_length
    self.max_length = max_length
    self.block_ngram_repeat = block_ngram_repeat
    self.exclusion_tokens = exclusion_tokens
    self.return_attention = return_attention

    self.done = False
def _viterbi_decode(self, feats):
    """Find the highest-scoring tag sequence for ``feats`` (decoding only).

    Args:
        feats: Iterable of per-step emission score tensors; each is added
            to a vector of ``self.tagset_size`` scores.

    Returns:
        ``(path_score, best_path)``: the score of the best path and the
        tag indices along it, with the start tag removed.
    """
    # Viterbi variables in log space; only START_TAG is reachable at first.
    scores = torch.full((1, self.tagset_size), -10000.)
    scores[0][self.tag_to_ix[START_TAG]] = 0

    # ``scores`` at step i holds the viterbi variables for step i-1.
    history = []
    for emission in feats:
        step_pointers = []
        step_scores = []
        for tag in range(self.tagset_size):
            # candidate[i] is the previous viterbi variable for tag i plus
            # the score of moving from i to ``tag``. Emissions are left out
            # because they do not affect which predecessor is best.
            candidate = scores + self.transitions[tag]
            winner = argmax(candidate)
            step_pointers.append(winner)
            step_scores.append(candidate[0][winner].view(1))
        # Add the emission scores to obtain this step's viterbi variables.
        scores = (torch.cat(step_scores) + emission).view(1, -1)
        history.append(step_pointers)

    # Transition to STOP_TAG
    final_scores = scores + self.transitions[self.tag_to_ix[STOP_TAG]]
    current_tag = argmax(final_scores)
    path_score = final_scores[0][current_tag]

    # Walk the back pointers from the last step to the first.
    best_path = [current_tag]
    for step_pointers in reversed(history):
        current_tag = step_pointers[current_tag]
        best_path.append(current_tag)

    # The last element found is the start tag; callers do not want it.
    first_tag = best_path.pop()
    assert first_tag == self.tag_to_ix[START_TAG]  # Sanity check
    best_path.reverse()
    return path_score, best_path
def _forward_alg(self, feats):
    """Run the forward algorithm to compute the log partition function.

    Args:
        feats: Iterable of per-step emission scores, indexed by tag.

    Returns:
        The log-sum-exp over all paths ending with a transition to STOP_TAG.
    """
    n_tags = self.tagset_size
    # In log space, START_TAG initially holds all of the score.
    forward_var = torch.full((1, n_tags), -10000.)
    forward_var[0][self.tag_to_ix[START_TAG]] = 0.

    # Iterate through the sentence
    for emission in feats:
        step_alphas = []
        for tag in range(n_tags):
            # The emission score is the same whatever the previous tag was,
            # so it is broadcast across all predecessors.
            emit = emission[tag].view(1, -1).expand(1, n_tags)
            # Entry i is the score of transitioning from i to ``tag``.
            trans = self.transitions[tag].view(1, -1)
            # Edge scores (i -> tag), reduced with log-sum-exp.
            edge_scores = forward_var + trans + emit
            step_alphas.append(log_sum_exp(edge_scores).view(1))
        forward_var = torch.cat(step_alphas).view(1, -1)

    stop_scores = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    return log_sum_exp(stop_scores)
def test_repeating_excluded_index_does_not_die(self):
    """Repeating an excluded token must never block a sample.

    Batch 0 repeats the excluded index; batch 1 repeats a normal index and
    is expected to be blocked once the n-gram limit is passed.
    """
    n_words = 100
    repeat_idx = 47  # will be repeated and should be blocked
    repeat_idx_ignored = 7  # will be repeated and should not be blocked
    ngram_repeat = 3
    neg_inf = -float('inf')
    for batch_sz in [1, 3, 17]:
        samp = RandomSampling(
            0, 1, 2, batch_sz, torch.device("cpu"), 0, ngram_repeat,
            {repeat_idx_ignored}, False, 30, 1., 5,
            torch.randint(0, 30, (batch_sz,)))
        multi = batch_sz > 1
        for step in range(ngram_repeat + 4):
            log_probs = torch.full((batch_sz, n_words), neg_inf)
            log_probs[0, repeat_idx_ignored] = 0
            if multi:
                log_probs[1, repeat_idx] = 0
                log_probs[2:, repeat_idx + step] = 0
            attn = torch.randn(1, batch_sz, 53)
            samp.advance(log_probs, attn)
            if step <= ngram_repeat:
                self.assertFalse(
                    samp.topk_scores.eq(self.BLOCKED_SCORE).any())
                continue
            # now batch 1 dies
            self.assertFalse(
                samp.topk_scores[0].eq(self.BLOCKED_SCORE).any())
            if multi:
                self.assertTrue(
                    samp.topk_scores[1].eq(self.BLOCKED_SCORE).all())
                self.assertFalse(
                    samp.topk_scores[2:].eq(self.BLOCKED_SCORE).any())
def test_advance_with_all_repeats_gets_blocked(self):
    """Every beam repeats one token and must end up blocked.

    Beams >= 1 carry dummy scores and therefore repeat as well.
    """
    beam_sz = 5
    n_words = 100
    repeat_idx = 47
    ngram_repeat = 3
    neg_inf = -float('inf')
    for batch_sz in [1, 3]:
        n_rows = batch_sz * beam_sz
        beam = BeamSearch(
            beam_sz, batch_sz, 0, 1, 2, 2,
            torch.device("cpu"), GlobalScorerStub(), 0, 30,
            False, ngram_repeat, set(),
            torch.randint(0, 30, (batch_sz,)), False, 0.)
        for step in range(ngram_repeat + 4):
            # predict repeat_idx over and over again
            log_probs = torch.full((n_rows, n_words), neg_inf)
            log_probs[0::beam_sz, repeat_idx] = 0
            attn = torch.randn(1, n_rows, 53)
            beam.advance(log_probs, attn)
            if step <= ngram_repeat:
                expected = torch.tensor(
                    [0] + [neg_inf] * (beam_sz - 1)).repeat(batch_sz, 1)
            else:
                expected = torch.tensor(
                    self.BLOCKED_SCORE).repeat(batch_sz, beam_sz)
            self.assertTrue(beam.topk_log_probs.equal(expected))
def test_advance_with_some_repeats_gets_blocked(self):
    """Beam 0 and beams >= 2 repeat and get blocked; beam 1 survives.

    Beams >= 2 only ever receive dummy scores, so they repeat too.
    """
    beam_sz = 5
    n_words = 100
    repeat_idx = 47
    ngram_repeat = 3
    neg_inf = -float('inf')
    beam = Beam(beam_sz, 0, 1, 2, n_best=2,
                exclusion_tokens=set(),
                global_scorer=GlobalScorerStub(),
                block_ngram_repeat=ngram_repeat)
    for step in range(ngram_repeat + 4):
        # non-interesting beams are going to get dummy values
        log_probs = torch.full((beam_sz, n_words), neg_inf)
        moving_idx = repeat_idx + step + 1
        if step == 0:
            # On the initial round only beam 0's scores matter. Make two
            # predictions: the top one keeps repeating in beam 0, the
            # second one lives on in beam 1.
            log_probs[0, repeat_idx] = -0.1
            log_probs[0, moving_idx] = -2.3
        else:
            # predict the same thing in beam 0
            log_probs[0, repeat_idx] = 0
            # continue pushing around what beam 1 predicts
            log_probs[1, moving_idx] = 0
        attn = torch.randn(beam_sz)
        beam.advance(log_probs, attn)

        # Position 0 must hold an unblocked hypothesis at every step.
        self.assertFalse(beam.scores[0].eq(self.BLOCKED_SCORE))
        if step <= ngram_repeat:
            self.assertFalse(beam.scores[1].eq(self.BLOCKED_SCORE))
        else:
            # now beam 0 dies (along with the others), beam 1 -> beam 0
            blocked_rest = torch.tensor(
                [self.BLOCKED_SCORE] * (beam_sz - 1))
            self.assertTrue(beam.scores[1:].equal(blocked_rest))
def test_advance_with_all_repeats_gets_blocked(self):
    """Every beam repeats one token and must end up blocked.

    Beams >= 1 carry dummy scores and therefore repeat as well.
    """
    beam_sz = 5
    n_words = 100
    repeat_idx = 47
    ngram_repeat = 3
    neg_inf = -float('inf')
    beam = Beam(beam_sz, 0, 1, 2, n_best=2,
                exclusion_tokens=set(),
                global_scorer=GlobalScorerStub(),
                block_ngram_repeat=ngram_repeat)
    for step in range(ngram_repeat + 4):
        # predict repeat_idx over and over again
        log_probs = torch.full((beam_sz, n_words), neg_inf)
        log_probs[0, repeat_idx] = 0
        attn = torch.randn(beam_sz)
        beam.advance(log_probs, attn)
        if step <= ngram_repeat:
            expected = torch.tensor([0] + [neg_inf] * (beam_sz - 1))
        else:
            expected = torch.tensor([self.BLOCKED_SCORE] * beam_sz)
        self.assertTrue(beam.scores.equal(expected))
def script_viterbi(unary, trans, start_idx, end_idx):
    # type: (Tensor, Tensor, int, int) -> Tuple[Tensor, Tensor]
    """Viterbi-decode one sequence.

    Args:
        unary: Per-step tag scores; dimension 0 is iterated as time and
            dimension 1 is the tag dimension.
        trans: Transition scores. ``trans[end_idx, :]`` is added for the
            final step, and the max is taken over dimension 1 at each step.
        start_idx: Tag index that holds all of the initial score.
        end_idx: Tag index used for the final transition.

    Returns:
        ``(path, path_score)``: the stacked best tag indices with the
        initial start entry dropped, and the score of that path.
    """
    n_steps = unary.size(0)
    n_tags = unary.size(1)
    scores = torch.full((1, n_tags), -1e4, dtype=unary.dtype, device=unary.device)
    scores[0, start_idx] = 0

    pointers = []
    for t in range(n_steps):
        best_scores, best_prev = torch.max(scores + trans, 1)
        pointers.append(best_prev)
        scores = (best_scores + unary[t, :]).unsqueeze(0)

    final_scores = scores.squeeze(0) + trans[end_idx, :]
    path_score, tag = torch.max(final_scores, 0)

    # Follow the back pointers from the last step down to the first.
    # Explicit index arithmetic is kept instead of ``reversed``.
    backward_path = [tag]
    n_pointers = len(pointers)
    for k in range(n_pointers):
        tag = pointers[n_pointers - k - 1][tag]
        backward_path.append(tag)

    forward_path = []
    n_path = len(backward_path)
    for k in range(n_path):
        forward_path.append(backward_path[n_path - k - 1])

    # The first entry is the start tag; it is not returned.
    return torch.stack(forward_path[1:]), path_score
def cross_entropy_loss(input, target):
    """Cross entropy between logits and a (soft) target distribution.

    Computes ``-sum(target * log_softmax(input)) / batch_size``. This is
    the same quantity as weighting the per-class cross entropy of every
    sample by ``target[:, class]`` and summing, but done in one pass.

    Compared with building helper tensors on the CPU, the result stays on
    the device of the inputs and follows their dtype, and no deprecated
    ``reduce`` argument is used.

    Args:
        input: Logits of shape ``(batch, num_classes)``.
        target: Per-class weights of shape ``(batch, num_classes)``,
            e.g. one-hot or soft labels.

    Returns:
        A zero-dimensional tensor holding the batch-averaged loss.
    """
    log_probs = F.log_softmax(input, dim=1)
    return -(target * log_probs).sum() / input.shape[0]
def test_beam_is_done_when_n_best_beams_eos_using_min_length(self):
    """The search is done once n_best beams have ended after min_length.

    This also checks that repeating is acceptable when
    block_ngram_repeat=0.
    """
    beam_sz = 5
    batch_sz = 3
    n_words = 100
    _non_eos_idxs = [47, 51, 13, 88, 99]
    valid_score_dist = torch.log_softmax(torch.tensor(
        [6., 5., 4., 3., 2., 1.]), dim=0)
    min_length = 5
    eos_idx = 2
    n_rows = batch_sz * beam_sz
    top_score = valid_score_dist[0]
    beam = BeamSearch(
        beam_sz, batch_sz, 0, 1, 2, 2,
        torch.device("cpu"), GlobalScorerStub(),
        min_length, 30, False, 0, set(),
        torch.randint(0, 30, (batch_sz,)), False, 0.)
    for step in range(min_length + 4):
        # non-interesting beams are going to get dummy values
        log_probs = torch.full((n_rows, n_words), -float('inf'))
        if step == 0:
            # "best" prediction is eos - that should be blocked
            log_probs[0::beam_sz, eos_idx] = top_score
            # include at least beam_sz predictions OTHER than EOS
            # that are greater than -1e20
            for j, score in zip(_non_eos_idxs, valid_score_dist[1:]):
                log_probs[0::beam_sz, j] = score
        else:
            if step > min_length:
                # past min_length, beam 0 predicts eos as well
                log_probs[0::beam_sz, eos_idx] = top_score
            # predict eos in beam 1
            log_probs[1::beam_sz, eos_idx] = top_score
            # provide beam_sz other good predictions in other beams
            for k, (j, score) in enumerate(
                    zip(_non_eos_idxs, valid_score_dist[1:])):
                beam_idx = min(beam_sz - 1, k)
                log_probs[beam_idx::beam_sz, j] = score

        attn = torch.randn(1, n_rows, 53)
        beam.advance(log_probs, attn)
        if step < min_length:
            self.assertFalse(beam.done)
            continue
        if step == min_length:
            # beam 1 dies on min_length
            self.assertTrue(beam.is_finished[:, 1].all())
            beam.update_finished()
            self.assertFalse(beam.done)
        else:  # step > min_length
            # beam 0 dies on the step after beam 1 dies
            self.assertTrue(beam.is_finished[:, 0].all())
            beam.update_finished()
            self.assertTrue(beam.done)
def _train(self, epoch):
    """Run one epoch of adversarial training.

    For every batch the discriminator is updated on the real batch and on
    a generated batch, then the generator is updated through the
    discriminator.

    Args:
        epoch: Epoch number; only used in the file name of the sample
            image saved when ``self.out_f`` is set.
    """
    # put model into train mode
    self.d_model.train()
    # TODO: why is the loader deep-copied before iterating?
    cp_loader = deepcopy(self.train_loader)
    if self.verbose:
        progress_bar = tqdm(total=len(cp_loader),
                            desc='Current Epoch',
                            file=sys.stdout,
                            leave=False,
                            ncols=75,
                            position=0,
                            unit=' Batch')
    else:
        progress_bar = None

    # Labels are explicitly floating point. With an integer fill value,
    # recent PyTorch versions make ``torch.full`` return an integer tensor,
    # which does not match the discriminator's floating-point outputs in
    # the loss (presumably a BCE-style loss; confirm against the caller).
    real_label = 1.0
    fake_label = 0.0
    for batch_idx, inputs in enumerate(cp_loader):
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        # train with real
        self.optimizer_d.zero_grad()
        inputs = inputs.to(self.device)
        batch_size = inputs.size(0)
        outputs = self.d_model(inputs)

        label = torch.full((batch_size,), real_label,
                           dtype=torch.float, device=self.device)
        loss_d_real = self.loss_function(outputs, label)
        loss_d_real.backward()

        # train with fake
        noise = torch.randn((batch_size, self.g_model.nz, 1, 1,),
                            device=self.device)
        fake_outputs = self.g_model(noise)
        label.fill_(fake_label)
        # Detach so this step does not backpropagate into the generator.
        outputs = self.d_model(fake_outputs.detach())
        loss_d_fake = self.loss_function(outputs, label)
        loss_d_fake.backward()
        self.optimizer_d.step()

        # (2) Update G network: maximize log(D(G(z)))
        self.g_model.zero_grad()
        label.fill_(real_label)
        outputs = self.d_model(fake_outputs)
        loss_g = self.loss_function(outputs, label)
        loss_g.backward()
        self.optimizer_g.step()

        if self.verbose:
            if batch_idx % 10 == 0:
                progress_bar.update(10)
        if self.out_f is not None and batch_idx % 100 == 0:
            fake = self.g_model(self.sample_noise)
            vutils.save_image(
                fake.detach(),
                '%s/fake_samples_epoch_%03d.png' % (self.out_f, epoch),
                normalize=True)
    if self.verbose:
        progress_bar.close()
def test_beam_is_done_when_n_best_beams_eos_using_min_length(self):
    """The beam is done once n_best beams have ended after min_length.

    This also checks that repeating is acceptable when
    block_ngram_repeat=0.
    """
    beam_sz = 5
    n_words = 100
    _non_eos_idxs = [47, 51, 13, 88, 99]
    valid_score_dist = torch.log_softmax(torch.tensor(
        [6., 5., 4., 3., 2., 1.]), dim=0)
    min_length = 5
    eos_idx = 2
    top_score = valid_score_dist[0]
    beam = Beam(beam_sz, 0, 1, eos_idx, n_best=2,
                exclusion_tokens=set(),
                min_length=min_length,
                global_scorer=GlobalScorerStub(),
                block_ngram_repeat=0)
    for step in range(min_length + 4):
        # non-interesting beams are going to get dummy values
        log_probs = torch.full((beam_sz, n_words), -float('inf'))
        if step == 0:
            # "best" prediction is eos - that should be blocked
            log_probs[0, eos_idx] = top_score
            # include at least beam_sz predictions OTHER than EOS
            # that are greater than -1e20
            for j, score in zip(_non_eos_idxs, valid_score_dist[1:]):
                log_probs[0, j] = score
        else:
            if step > min_length:
                # past min_length, beam 0 predicts eos as well
                log_probs[0, eos_idx] = top_score
            # predict eos in beam 1
            log_probs[1, eos_idx] = top_score
            # provide beam_sz other good predictions in other beams
            for k, (j, score) in enumerate(
                    zip(_non_eos_idxs, valid_score_dist[1:])):
                beam_idx = min(beam_sz - 1, k)
                log_probs[beam_idx, j] = score

        attn = torch.randn(beam_sz)
        beam.advance(log_probs, attn)
        if step < min_length:
            self.assertFalse(beam.done)
        elif step == min_length:
            # beam 1 dies on min_length
            first_finished = beam.finished[0]
            self.assertEqual(first_finished[1], beam.min_length + 1)
            self.assertEqual(first_finished[2], 1)
            self.assertFalse(beam.done)
        else:  # step > min_length
            # beam 0 dies on the step after beam 1 dies
            second_finished = beam.finished[1]
            self.assertEqual(second_finished[1], beam.min_length + 2)
            self.assertEqual(second_finished[2], 0)
            self.assertTrue(beam.done)
def test_neg_styblinski_tang_global_maximum(self, cuda=False):
    """At the global maximizer the value matches and the gradient vanishes."""
    device = torch.device("cuda" if cuda else "cpu")
    for dtype in (torch.float, torch.double):
        point = torch.full(
            (3,),
            GLOBAL_MAXIMIZER,
            device=device,
            dtype=dtype,
            requires_grad=True,
        )
        value = neg_styblinski_tang(point)
        value.backward()
        # Three coordinates, each contributing GLOBAL_MAXIMUM.
        self.assertAlmostEqual(value.item(), 3 * GLOBAL_MAXIMUM, places=4)
        largest_grad = point.grad.abs().max().item()
        self.assertLess(largest_grad, 1e-5)
def __init__(self, label_smoothing, tgt_vocab_size, padding_idx=0):
    """Precompute the smoothed target template.

    Args:
        label_smoothing: Smoothing mass; must satisfy ``0 < value <= 1``.
        tgt_vocab_size: Size of the target vocabulary.
        padding_idx: Vocabulary index whose template entry is set to 0.
    """
    assert label_smoothing > 0.0 and label_smoothing <= 1.0
    self.padding_idx = padding_idx
    super(LabelSmoothingLoss, self).__init__()

    # The mass is shared by every entry except two:
    # -1 for pad, -1 for gold-standard word
    n_smoothed = tgt_vocab_size - 2
    template = torch.full((tgt_vocab_size,), label_smoothing / n_smoothed)
    template[self.padding_idx] = 0
    self.register_buffer('one_hot', template.unsqueeze(0))

    self.confidence = 1.0 - label_smoothing
def test_doesnt_predict_eos_if_shorter_than_min_len(self):
    """EOS must not be selected before min_length steps have passed.

    Beam 0 always predicts EOS; the other beams predict non-EOS scores.
    """
    for batch_sz in [1, 3]:
        beam_sz = 5
        n_words = 100
        _non_eos_idxs = [47, 51, 13, 88, 99]
        valid_score_dist = torch.log_softmax(torch.tensor(
            [6., 5., 4., 3., 2., 1.]), dim=0)
        min_length = 5
        eos_idx = 2
        n_rows = batch_sz * beam_sz
        non_eos_scores = valid_score_dist[1:]
        lengths = torch.randint(0, 30, (batch_sz,))
        beam = BeamSearch(beam_sz, batch_sz, 0, 1, 2, 2,
                          torch.device("cpu"), GlobalScorerStub(),
                          min_length, 30, False, 0, set(),
                          lengths, False, 0.)
        all_attns = []
        for step in range(min_length + 4):
            # non-interesting beams are going to get dummy values
            log_probs = torch.full((n_rows, n_words), -float('inf'))
            # Beam 0 predicts EOS with the best score at every step; on
            # step 0 that prediction should be blocked.
            log_probs[0::beam_sz, eos_idx] = valid_score_dist[0]
            if step == 0:
                # include at least beam_sz predictions OTHER than EOS
                # that are greater than -1e20
                for j, score in zip(_non_eos_idxs, non_eos_scores):
                    log_probs[0::beam_sz, j] = score
            else:
                # provide beam_sz other good predictions
                for k, (j, score) in enumerate(
                        zip(_non_eos_idxs, non_eos_scores)):
                    beam_idx = min(beam_sz - 1, k)
                    log_probs[beam_idx::beam_sz, j] = score

            attn = torch.randn(1, n_rows, 53)
            all_attns.append(attn)
            beam.advance(log_probs, attn)
            if step < min_length:
                expected_score_dist = \
                    (step + 1) * non_eos_scores.unsqueeze(0)
                self.assertTrue(
                    beam.topk_log_probs.allclose(expected_score_dist))
            elif step == min_length:
                # now the top beam has ended and no others have
                self.assertTrue(beam.is_finished[:, 0].eq(1).all())
                self.assertTrue(beam.is_finished[:, 1:].eq(0).all())
            # step > min_length: not of interest, but we want to make sure
            # it keeps running since only beam 0 terminates and n_best = 2
def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100):
    """Precompute the smoothed target template.

    Args:
        label_smoothing: Smoothing mass; must satisfy ``0 < value <= 1``.
        tgt_vocab_size: Size of the target vocabulary.
        ignore_index: Target value to ignore. When it is a valid
            vocabulary index its template entry is set to 0. When it lies
            outside the vocabulary (as the default ``-100`` does) the
            template is left untouched.
    """
    assert 0.0 < label_smoothing <= 1.0
    self.ignore_index = ignore_index
    super(LabelSmoothingLoss, self).__init__()

    # A negative index would wrap around when used to index the tensor and
    # silently zero a real vocabulary entry (or raise IndexError for small
    # vocabularies), so only a genuine vocabulary index is zeroed.
    ignored_in_vocab = 0 <= ignore_index < tgt_vocab_size
    # The smoothing mass is shared by every entry except the gold-standard
    # word and, when it is part of the vocabulary, the ignored entry.
    excluded = 2 if ignored_in_vocab else 1
    smoothing_value = label_smoothing / (tgt_vocab_size - excluded)
    one_hot = torch.full((tgt_vocab_size,), smoothing_value)
    if ignored_in_vocab:
        one_hot[ignore_index] = 0
    self.register_buffer('one_hot', one_hot.unsqueeze(0))

    self.confidence = 1.0 - label_smoothing
def test_repeating_excluded_index_does_not_die(self):
    """A beam that repeats an excluded token must not be blocked.

    Beam 0 and beams >= 2 repeat; beam 2 repeats the excluded index.
    """
    beam_sz = 5
    n_words = 100
    repeat_idx = 47  # will be repeated and should be blocked
    repeat_idx_ignored = 7  # will be repeated and should not be blocked
    ngram_repeat = 3
    for batch_sz in [1, 3]:
        n_rows = batch_sz * beam_sz
        beam = BeamSearch(
            beam_sz, batch_sz, 0, 1, 2, 2,
            torch.device("cpu"), GlobalScorerStub(), 0, 30,
            False, ngram_repeat, {repeat_idx_ignored},
            torch.randint(0, 30, (batch_sz,)), False, 0.)
        for step in range(ngram_repeat + 4):
            # non-interesting beams are going to get dummy values
            log_probs = torch.full((n_rows, n_words), -float('inf'))
            moving_idx = repeat_idx + step + 1
            if step == 0:
                log_probs[0::beam_sz, repeat_idx] = -0.1
                log_probs[0::beam_sz, moving_idx] = -2.3
                log_probs[0::beam_sz, repeat_idx_ignored] = -5.0
            else:
                # predict the same thing in beam 0
                log_probs[0::beam_sz, repeat_idx] = 0
                # continue pushing around what beam 1 predicts
                log_probs[1::beam_sz, moving_idx] = 0
                # predict the allowed-repeat again in beam 2
                log_probs[2::beam_sz, repeat_idx_ignored] = 0
            attn = torch.randn(1, n_rows, 53)
            beam.advance(log_probs, attn)

            if step <= ngram_repeat:
                for col in range(3):
                    self.assertFalse(beam.topk_log_probs[:, col].eq(
                        self.BLOCKED_SCORE).any())
                continue

            # now beam 0 dies, beam 1 -> beam 0, beam 2 -> beam 1
            # and the rest die
            self.assertFalse(beam.topk_log_probs[:, 0].eq(
                self.BLOCKED_SCORE).any())
            # since all preds after step 0 are 0, we can check
            # that the beam is the correct idx by checking that
            # the curr score is the initial score
            self.assertTrue(beam.topk_log_probs[:, 0].eq(-2.3).all())
            self.assertFalse(beam.topk_log_probs[:, 1].eq(
                self.BLOCKED_SCORE).all())
            self.assertTrue(beam.topk_log_probs[:, 1].eq(-5.0).all())
            blocked_rest = torch.tensor(
                self.BLOCKED_SCORE).repeat(batch_sz, beam_sz - 2)
            self.assertTrue(
                beam.topk_log_probs[:, 2:].equal(blocked_rest))
def convert_to_roi_format(self, boxes):
    """Concatenate per-image boxes into one ROI tensor.

    Args:
        boxes: Sequence of box containers; each must support ``len()`` and
            expose a ``bbox`` tensor.

    Returns:
        A tensor whose first column is the position of the source
        container in ``boxes`` and whose remaining columns are the
        concatenated ``bbox`` values.
    """
    all_boxes = cat([item.bbox for item in boxes], dim=0)
    dtype = all_boxes.dtype
    device = all_boxes.device

    # One column of image indices per container, matching the boxes' dtype
    # and device so it can be concatenated with them.
    id_columns = []
    for image_idx, item in enumerate(boxes):
        id_columns.append(
            torch.full((len(item), 1), image_idx, dtype=dtype, device=device)
        )
    image_ids = cat(id_columns, dim=0)

    return torch.cat([image_ids, all_boxes], dim=1)
def test_doesnt_predict_eos_if_shorter_than_min_len(self):
    """EOS must not be selected before min_length steps have passed.

    Beam 0 always predicts EOS; the other beams predict non-EOS scores.
    This also checks that repeating is acceptable when
    block_ngram_repeat=0.
    """
    beam_sz = 5
    n_words = 100
    _non_eos_idxs = [47, 51, 13, 88, 99]
    valid_score_dist = torch.log_softmax(torch.tensor(
        [6., 5., 4., 3., 2., 1.]), dim=0)
    min_length = 5
    eos_idx = 2
    non_eos_scores = valid_score_dist[1:]
    beam = Beam(beam_sz, 0, 1, eos_idx, n_best=2,
                exclusion_tokens=set(),
                min_length=min_length,
                global_scorer=GlobalScorerStub(),
                block_ngram_repeat=0)
    for step in range(min_length + 4):
        # non-interesting beams are going to get dummy values
        log_probs = torch.full((beam_sz, n_words), -float('inf'))
        # Beam 0 predicts EOS with the best score at every step; on step 0
        # that prediction should be blocked.
        log_probs[0, eos_idx] = valid_score_dist[0]
        if step == 0:
            # include at least beam_sz predictions OTHER than EOS
            # that are greater than -1e20
            for j, score in zip(_non_eos_idxs, non_eos_scores):
                log_probs[0, j] = score
        else:
            # provide beam_sz other good predictions
            for k, (j, score) in enumerate(
                    zip(_non_eos_idxs, non_eos_scores)):
                beam_idx = min(beam_sz - 1, k)
                log_probs[beam_idx, j] = score

        attn = torch.randn(beam_sz)
        beam.advance(log_probs, attn)
        if step < min_length:
            expected_score_dist = (step + 1) * non_eos_scores
            self.assertTrue(beam.scores.allclose(expected_score_dist))
        elif step == min_length:
            # now the top beam has ended and no others have
            first_finished = beam.finished[0]
            # first beam finished had length beam.min_length
            self.assertEqual(first_finished[1], beam.min_length + 1)
            # first beam finished was 0
            self.assertEqual(first_finished[2], 0)
        # step > min_length: not of interest, but we want to make sure it
        # keeps running since only beam 0 terminates and n_best = 2
def __init__(self, beam_size, batch_size, pad, bos, eos, n_best, mb_device,
             global_scorer, min_length, max_length, return_attention,
             block_ngram_repeat, exclusion_tokens, memory_lengths,
             stepwise_penalty, ratio):
    """Initialise batched beam-search state.

    Args:
        beam_size: Number of beams per example.
        batch_size: Number of examples.
        pad, bos, eos: Special token indices (forwarded to the parent).
        n_best: Stored as given.
        mb_device: Device for most of the state tensors.
        global_scorer: Scorer object; its ``has_cov_pen`` flag is read.
        min_length, max_length: Length limits (forwarded to the parent).
        return_attention: Forwarded to the parent.
        block_ngram_repeat: Forwarded to the parent.
        exclusion_tokens: Forwarded to the parent.
        memory_lengths: Stored as given.
        stepwise_penalty: Chooses between the stepwise and the plain
            coverage-penalty flag.
        ratio: Stored as given.
    """
    super(BeamSearch, self).__init__(
        pad, bos, eos, batch_size, mb_device, beam_size, min_length,
        block_ngram_repeat, exclusion_tokens, return_attention,
        max_length)

    # beam parameters
    self.global_scorer = global_scorer
    self.beam_size = beam_size
    self.n_best = n_best
    self.batch_size = batch_size
    self.ratio = ratio

    # result caching
    self.hypotheses = [[] for _ in range(batch_size)]

    # beam state
    # NOTE(review): ``top_beam_finished`` and ``_batch_offset`` are created
    # without ``device=mb_device``, unlike the tensors around them.
    self.top_beam_finished = torch.zeros([batch_size], dtype=torch.uint8)
    self.best_scores = torch.full(
        [batch_size], -1e10, dtype=torch.float, device=mb_device)
    self._batch_offset = torch.arange(batch_size, dtype=torch.long)
    self._beam_offset = torch.arange(
        0, batch_size * beam_size, step=beam_size,
        dtype=torch.long, device=mb_device)
    # Within each example only the first beam starts at 0.0; the others
    # start at -inf.
    initial_beam_scores = [0.0] + [float("-inf")] * (beam_size - 1)
    self.topk_log_probs = torch.tensor(
        initial_beam_scores, device=mb_device).repeat(batch_size)
    self.select_indices = None
    self._memory_lengths = memory_lengths

    # buffers for the topk scores and 'backpointer'
    buffer_shape = (batch_size, beam_size)
    self.topk_scores = torch.empty(
        buffer_shape, dtype=torch.float, device=mb_device)
    self.topk_ids = torch.empty(
        buffer_shape, dtype=torch.long, device=mb_device)
    self._batch_index = torch.empty(
        [batch_size, beam_size], dtype=torch.long, device=mb_device)
    self.done = False

    # "global state" of the old beam
    self._prev_penalty = None
    self._coverage = None

    self._stepwise_cov_pen = (
        stepwise_penalty and self.global_scorer.has_cov_pen)
    self._vanilla_cov_pen = (
        not stepwise_penalty and self.global_scorer.has_cov_pen)
    self._cov_pen = self.global_scorer.has_cov_pen
def numericalize_inputs(cls, init_case, params):
    """Build a random, padded fake batch for the numericalize tests.

    Args:
        init_case: Dict providing ``"pad_index"`` (fill value for the
            padded positions) and ``"include_lengths"``.
        params: Dict providing ``"batch_size"``, ``"max_len"``,
            ``"full_length_seq"`` (index of the example forced to full
            length) and ``"nfeats"``.

    Returns:
        ``(fake_input, lengths)``. ``fake_input`` is a float tensor of
        shape ``(batch_size, 1, nfeats, max_len)``, or the tuple
        ``(tensor, lengths)`` when ``init_case["include_lengths"]`` is true.
    """
    bs = params["batch_size"]
    max_len = params["max_len"]
    nfeats = params["nfeats"]
    lengths = torch.randint(1, max_len, (bs,))
    # Guarantee that one example spans the whole padded length.
    lengths[params["full_length_seq"]] = max_len
    # Use a float fill value: with an integer pad index, recent PyTorch
    # versions infer an integer dtype for ``torch.full`` and the random
    # features written below would be truncated to integers.
    fake_input = torch.full(
        (bs, 1, nfeats, max_len), float(init_case["pad_index"]))
    for b in range(bs):
        length = int(lengths[b])
        fake_input[b, :, :, :length] = torch.randn((1, nfeats, length))
    if init_case["include_lengths"]:
        fake_input = (fake_input, lengths)
    return fake_input, lengths
def _forward_alg(self, feats):
    """Compute the log partition function with the forward algorithm.

    Args:
        feats: Iterable of per-step emission scores, indexed by tag.

    Returns:
        The log-sum-exp over all paths ending with a transition to STOP_TAG.
    """
    size = self.tagset_size
    # Forward variables in log space: START_TAG has all of the score.
    alphas = torch.full((1, size), -10000.)
    alphas[0][self.tag_to_ix[START_TAG]] = 0.

    for feat in feats:  # one step per element of the sentence
        # For each next tag: previous alphas, plus the transition scores
        # into that tag, plus that tag's emission score broadcast over all
        # previous tags. Log-sum-exp then reduces over the previous tags.
        per_tag = [
            log_sum_exp(
                alphas
                + self.transitions[nxt].view(1, -1)
                + feat[nxt].view(1, -1).expand(1, size)
            ).view(1)
            for nxt in range(size)
        ]
        alphas = torch.cat(per_tag).view(1, -1)

    closing = alphas + self.transitions[self.tag_to_ix[STOP_TAG]]
    alpha = log_sum_exp(closing)
    return alpha
def test_advance_with_some_repeats_gets_blocked(self):
    """Beam 0 and beams >= 2 repeat and get blocked; beam 1 survives.

    Beams >= 2 only ever receive dummy scores, so they repeat too.
    """
    beam_sz = 5
    n_words = 100
    repeat_idx = 47
    ngram_repeat = 3
    for batch_sz in [1, 3]:
        n_rows = batch_sz * beam_sz
        beam = BeamSearch(
            beam_sz, batch_sz, 0, 1, 2, 2,
            torch.device("cpu"), GlobalScorerStub(), 0, 30,
            False, ngram_repeat, set(),
            torch.randint(0, 30, (batch_sz,)), False, 0.)
        for step in range(ngram_repeat + 4):
            # non-interesting beams are going to get dummy values
            log_probs = torch.full((n_rows, n_words), -float('inf'))
            moving_idx = repeat_idx + step + 1
            if step == 0:
                # On the initial round only beam 0's scores matter. Make
                # two predictions: the top one keeps repeating in beam 0,
                # the second one lives on in beam 1.
                log_probs[0::beam_sz, repeat_idx] = -0.1
                log_probs[0::beam_sz, moving_idx] = -2.3
            else:
                # predict the same thing in beam 0
                log_probs[0::beam_sz, repeat_idx] = 0
                # continue pushing around what beam 1 predicts
                log_probs[1::beam_sz, moving_idx] = 0
            attn = torch.randn(1, n_rows, 53)
            beam.advance(log_probs, attn)

            if step <= ngram_repeat:
                for offset in (0, 1):
                    self.assertFalse(
                        beam.topk_log_probs[offset::beam_sz].eq(
                            self.BLOCKED_SCORE).any())
                continue

            # now beam 0 dies (along with the others), beam 1 -> beam 0
            self.assertFalse(
                beam.topk_log_probs[:, 0].eq(
                    self.BLOCKED_SCORE).any())
            blocked_rest = torch.tensor(
                self.BLOCKED_SCORE).repeat(batch_sz, beam_sz - 1)
            self.assertTrue(
                beam.topk_log_probs[:, 1:].equal(blocked_rest))
def test_repeating_excluded_index_does_not_die(self):
    """A beam that repeats an excluded token must not be blocked.

    Beam 0 and beams >= 2 repeat; beam 2 repeats the excluded index.
    """
    beam_sz = 5
    n_words = 100
    repeat_idx = 47  # will be repeated and should be blocked
    repeat_idx_ignored = 7  # will be repeated and should not be blocked
    ngram_repeat = 3
    beam = Beam(beam_sz, 0, 1, 2, n_best=2,
                exclusion_tokens={repeat_idx_ignored},
                global_scorer=GlobalScorerStub(),
                block_ngram_repeat=ngram_repeat)
    for step in range(ngram_repeat + 4):
        # non-interesting beams are going to get dummy values
        log_probs = torch.full((beam_sz, n_words), -float('inf'))
        moving_idx = repeat_idx + step + 1
        if step == 0:
            log_probs[0, repeat_idx] = -0.1
            log_probs[0, moving_idx] = -2.3
            log_probs[0, repeat_idx_ignored] = -5.0
        else:
            # predict the same thing in beam 0
            log_probs[0, repeat_idx] = 0
            # continue pushing around what beam 1 predicts
            log_probs[1, moving_idx] = 0
            # predict the allowed-repeat again in beam 2
            log_probs[2, repeat_idx_ignored] = 0
        attn = torch.randn(beam_sz)
        beam.advance(log_probs, attn)

        if step <= ngram_repeat:
            for pos in range(3):
                self.assertFalse(beam.scores[pos].eq(self.BLOCKED_SCORE))
            continue

        # now beam 0 dies, beam 1 -> beam 0, beam 2 -> beam 1
        # and the rest die
        self.assertFalse(beam.scores[0].eq(self.BLOCKED_SCORE))
        # since all preds after step 0 are 0, we can check
        # that the beam is the correct idx by checking that
        # the curr score is the initial score
        self.assertTrue(beam.scores[0].eq(-2.3))
        self.assertFalse(beam.scores[1].eq(self.BLOCKED_SCORE))
        self.assertTrue(beam.scores[1].eq(-5.0))
        blocked_rest = torch.tensor(
            [self.BLOCKED_SCORE] * (beam_sz - 2))
        self.assertTrue(beam.scores[2:].equal(blocked_rest))
def test_draw_boxes_grayscale():
    """Drawing a box on a single-channel image must return three channels."""
    gray_image = torch.full((1, 4, 4), 255, dtype=torch.uint8)
    single_box = torch.tensor([[0, 0, 3, 3]], dtype=torch.int64)
    drawn = utils.draw_bounding_boxes(
        image=gray_image, boxes=single_box, colors=["#1BBC9B"]
    )
    channel_count = drawn.size(0)
    assert channel_count == 3
def filter_proposals(
    self,
    proposals: Tensor,
    objectness: Tensor,
    image_shapes: List[Tuple[int, int]],
    num_anchors_per_level: List[int],
) -> Tuple[List[Tensor], List[Tensor]]:
    """Reduce the raw proposals of every image to the final set.

    The indices returned by ``self._get_top_n_idx`` are selected first.
    Each image's selection is then clipped to the image, filtered by
    ``remove_small_boxes`` and by ``self.score_thresh``, passed through
    ``batched_nms`` grouped by level, and cut to the first
    ``self.post_nms_top_n()`` surviving indices.

    Args:
        proposals: candidate boxes, indexed as ``[image, anchor]``.
        objectness: objectness logits; detached and reshaped here to
            ``(num_images, -1)``.
        image_shapes: one size tuple per image, forwarded to
            ``clip_boxes_to_image``.
        num_anchors_per_level: number of anchors each level contributes.

    Returns:
        Two lists with one entry per image: the kept boxes and their
        sigmoid objectness scores.
    """
    batch_size = proposals.shape[0]
    dev = proposals.device

    # gradients must not flow back through the objectness scores
    flat_objectness = objectness.detach().reshape(batch_size, -1)

    # tag every anchor with the index of the level it belongs to
    level_tags = [
        torch.full((count,), level, dtype=torch.int64, device=dev)
        for level, count in enumerate(num_anchors_per_level)
    ]
    level_ids = torch.cat(level_tags, dim=0).reshape(1, -1).expand_as(flat_objectness)

    # the pre-NMS selection is made independently for every level
    top_idx = self._get_top_n_idx(flat_objectness, num_anchors_per_level)
    row_idx = torch.arange(batch_size, device=dev)[:, None]

    picked_logits = flat_objectness[row_idx, top_idx]
    picked_levels = level_ids[row_idx, top_idx]
    picked_boxes = proposals[row_idx, top_idx]
    picked_probs = torch.sigmoid(picked_logits)

    final_boxes = []
    final_scores = []
    for boxes, scores, lvl, img_shape in zip(picked_boxes, picked_probs, picked_levels, image_shapes):
        boxes = clip_boxes_to_image(boxes, img_shape)

        # drop boxes that are too small
        large_enough = remove_small_boxes(boxes, self.min_size)
        boxes = boxes[large_enough]
        scores = scores[large_enough]
        lvl = lvl[large_enough]

        # drop low scores; ">=" is kept for backwards compatibility
        confident = torch.where(scores >= self.score_thresh)[0]
        boxes = boxes[confident]
        scores = scores[confident]
        lvl = lvl[confident]

        # NMS runs per level, then only the leading indices are kept
        survivors = batched_nms(boxes, scores, lvl, self.nms_thresh)
        survivors = survivors[: self.post_nms_top_n()]

        final_boxes.append(boxes[survivors])
        final_scores.append(scores[survivors])
    return final_boxes, final_scores
def est(cnn, mu, Str, fg, pro=None, pos=None, H=None):
    """Evaluate ``cnn`` on the train and test loaders and print loss/accuracy.

    Relies on module-level state that is not passed in: ``device``,
    ``train_loader``, ``test_loader``, ``train_data``, ``test_data``, ``X``,
    ``S_limit``, ``Cnt``, ``entropy`` and ``max_norm``.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened source. In particular, the feature-collection loops are assumed
    to belong to the ``else`` (``fg`` false) branches and the final
    ``H[i][y] /= Cnt[i][y]`` is assumed to run for every ``(i, y)`` pair --
    confirm against the original file.

    Args:
        cnn: model that is called on each batch and exposes ``feat`` (indexed
            per layer) and ``bid``.
        mu: not used by the current body (it only appears in commented-out
            code).
        Str: tag used in the checkpoint file name and printed before the
            metrics.
        fg: when true, compare ``cnn.feat[pos]`` with ``pro.feat[pos]``; when
            false, save ``cnn`` and accumulate entropies into ``H``.
        pro: reference model, only touched when ``fg`` is true.
        pos: feature index used for the comparison when ``fg`` is true.
        H: nested container indexed ``H[i][y]`` that is updated in place when
            ``fg`` is false.

    Returns:
        When ``fg`` is true, a 5-tuple ``(tr averaged over all batches, mean
        train loss, mean test loss, train accuracy, test accuracy)``.
        Otherwise the function falls off the end and returns ``None``.
    """
    global max_norm
    if (fg == True):
        cnn.bid = 6
        pro.bid = pos + 1
        pro.to(device)
        pro.eval()
    else:
        cnn.to(device)
        # checkpoint the model under a name derived from Str
        torch.save(cnn.state_dict(), 'model_{}.pt'.format(Str))
    loss_f = nn.CrossEntropyLoss()
    cnn.eval()
    test_correct = 0
    test_loss = 0
    train_correct = 0
    train_loss = 0
    i = 1
    tr = 0  # running sum of (1 - relative feature distance), fg mode only
    with torch.no_grad():
        for X_train, y_train in train_loader:
            X_train = X_train.to(device)
            y_train = y_train.to(device)
            outputs = cnn(X_train)
            if (fg == True):
                # forward pass only to populate pro.feat
                pro(X_train)
                # the "loss" here is the L2 distance between both feature maps
                loss = torch.dist(cnn.feat[pos], pro.feat[pos], p=2)
                tr += 1 - loss / pro.feat[pos].norm(p=2)
            else:
                loss = loss_f(outputs, y_train)
                label = y_train.cpu().numpy()
                for j in range(label.size):
                    y = label[j]
                    for i in range(6):
                        # buffer the feature of sample j per (layer, class)
                        X[i][y].append(cnn.feat[i][j])
                        # keep track of the largest feature norm per layer
                        if max_norm[i] < cnn.feat[i][j].norm(p=2):
                            max_norm[i] = cnn.feat[i][j].norm(p=2)
                        tmp = len(X[i][y])
                        if (tmp > S_limit[i]):
                            # buffer is full: stack it, score it, reset it
                            Q = torch.full([tmp, X[i][y][0].size()[0]],
                                           0,
                                           dtype=torch.float32).to(device)
                            for k in range(tmp):
                                Q[k] = X[i][y][k]
                            Cnt[i][y] += 1
                            H[i][y] += entropy(Q, max_norm[i])
                            X[i][y] = []
            y_pred = torch.max(outputs, 1).indices
            train_correct += torch.sum(y_pred == y_train).item()
            train_loss += loss.item()
        # same procedure on the test split
        for X_test, y_test in test_loader:
            X_test = X_test.to(device)
            y_test = y_test.to(device)
            outputs = cnn(X_test)
            if (fg == True):
                pro(X_test)
                loss = torch.dist(cnn.feat[pos], pro.feat[pos], p=2)
                tr += 1 - loss / pro.feat[pos].norm(p=2)
            else:
                loss = loss_f(outputs, y_test)
                label = y_test.cpu().numpy()
                for j in range(label.size):
                    y = label[j]
                    for i in range(6):
                        X[i][y].append(cnn.feat[i][j])
                        if max_norm[i] < cnn.feat[i][j].norm(p=2):
                            max_norm[i] = cnn.feat[i][j].norm(p=2)
                        tmp = len(X[i][y])
                        if (tmp > S_limit[i]):
                            Q = torch.full([tmp, X[i][y][0].size()[0]],
                                           0,
                                           dtype=torch.float32).to(device)
                            for k in range(tmp):
                                Q[k] = X[i][y][k]
                            Cnt[i][y] += 1
                            H[i][y] += entropy(Q, max_norm[i])
                            X[i][y] = []
            y_pred = torch.max(outputs, 1).indices
            test_correct += torch.sum(y_pred == y_test).item()
            test_loss += loss.item()
    torch.cuda.empty_cache()
    if (fg == False):
        for i in range(6):
            for y in range(10):
                tmp = len(X[i][y])
                # flush a leftover buffer only if it is more than half full
                if (tmp > 0.5 * S_limit[i]):
                    Q = torch.full([tmp, X[i][y][0].size()[0]],
                                   0,
                                   dtype=torch.float32).to(device)
                    for k in range(tmp):
                        Q[k] = X[i][y][k]
                    Cnt[i][y] += 1
                    H[i][y] += entropy(Q, max_norm[i])
                    X[i][y] = []
                # turn the accumulated sum into a mean over scored buffers
                H[i][y] /= Cnt[i][y]
    print(Str)
    print('Train Loss {:.4f}'.format(train_loss / len(train_loader)))
    print('Test Loss {:.4f}'.format(test_loss / len(test_loader)))
    print('Train Acc {:.4f}%'.format(train_correct / len(train_data) * 100))
    print('Test Acc {:.4f}%'.format(test_correct / len(test_data) * 100))
    if (fg == True):
        return tr / (len(train_loader) + len(test_loader)), train_loss / len(
            train_loader), test_loss / len(test_loader), train_correct / len(
                train_data), test_correct / len(test_data)
def collate(batch):
    """Merge several pre-batched samples into one width-padded batch.

    Every element of ``batch`` is a dict that already holds
    ``a_batch_size`` stacked samples. Images, masks and the optional
    ``fg_mask`` / ``changed_image`` tensors are right-padded along the
    last axis to the widest image; labels are padded with zeros.

    Behaviour notes:
      * A single-element ``batch`` is returned as is, after adding the
        ``'a_batch_size'`` key to it.
      * ``None`` elements are dropped (only when ``len(batch) > 1``).
      * ``top_and_bottom`` / ``center_line`` are ``None`` in the result
        as soon as any element has ``None`` for them.
      * Zero-initialised float buffers are created with ``torch.zeros``
        so that they are floating point on every PyTorch version
        (``torch.full(shape, 0)`` infers an integer dtype on recent
        versions and would truncate the data written into it).

    Args:
        batch: list of sample dicts (or ``None``) as produced by the
            dataset.

    Returns:
        dict with the keys ``image``, ``mask``, ``top_and_bottom``,
        ``center_line``, ``label``, ``style``, ``label_lengths``, ``gt``,
        ``spaced_label``, ``author``, ``name``, ``a_batch_size`` and,
        when present in the first element, ``fg_mask`` /
        ``changed_image``.
    """
    if len(batch) == 1:
        batch[0]['a_batch_size'] = batch[0]['image'].size(0)
        return batch[0]

    batch = [b for b in batch if b is not None]
    first = batch[0]
    a_batch_size = len(first['gt'])
    total = len(batch) * a_batch_size

    dim1 = first['image'].shape[1]
    dim2 = first['image'].shape[2]
    dim3 = max(b['image'].shape[3] for b in batch)
    max_label_len = max(b['label'].size(0) for b in batch)
    if first['spaced_label'] is not None:
        max_spaced_label_len = max(b['spaced_label'].size(0) for b in batch)
    else:
        max_spaced_label_len = None

    image_shape = (total, dim1, dim2, dim3)
    input_batch = torch.full(image_shape, PADDING_CONSTANT)
    mask_batch = torch.full(image_shape, PADDING_CONSTANT)
    if 'fg_mask' in first:
        fg_masks = torch.zeros((total, 1, dim2, dim3))
    if 'changed_image' in first:
        changed_batch = torch.full(image_shape, PADDING_CONSTANT)
    top_and_bottom_batch = torch.zeros((total, 2, dim3))
    # padded columns of the center line default to the vertical middle
    center_line_batch = torch.full((total, dim3), dim2 / 2)
    labels_batch = torch.zeros((max_label_len, total), dtype=torch.int32)
    if max_spaced_label_len is not None:
        spaced_labels_batch = torch.zeros(
            (max_spaced_label_len, total), dtype=torch.int32)
    else:
        spaced_labels_batch = None

    for i, item in enumerate(batch):
        rows = slice(i * a_batch_size, (i + 1) * a_batch_size)
        b_img = item['image']
        width = b_img.shape[3]
        label = item['label']
        b_top_and_bottom = item['top_and_bottom']
        b_center_line = item['center_line']

        input_batch[rows, :, :, :width] = b_img
        mask_batch[rows, :, :, :width] = item['mask']
        if 'fg_mask' in item:
            fg_masks[rows, :, :, :width] = item['fg_mask']
        if 'changed_image' in item:
            changed_batch[rows, :, :, :width] = item['changed_image']

        # One missing annotation invalidates the whole batched tensor;
        # once it is None later elements must not try to write into it.
        if b_top_and_bottom is None:
            top_and_bottom_batch = None
        elif top_and_bottom_batch is not None:
            top_and_bottom_batch[rows, :, :width] = b_top_and_bottom
        if b_center_line is None:
            center_line_batch = None
        elif center_line_batch is not None:
            center_line_batch[rows, :width] = b_center_line

        labels_batch[0:label.size(0), rows] = label
        if max_spaced_label_len is not None:
            spaced = item['spaced_label']
            spaced_labels_batch[0:spaced.size(0), rows] = spaced

    if first['style'] is None:
        style = None
    else:
        style = torch.cat([b['style'] for b in batch], dim=0)

    toRet = {
        "image": input_batch,
        "mask": mask_batch,
        "top_and_bottom": top_and_bottom_batch,
        "center_line": center_line_batch,
        "label": labels_batch,
        "style": style,
        "label_lengths": torch.cat([b['label_lengths'] for b in batch], dim=0),
        "gt": [g for b in batch for g in b['gt']],
        "spaced_label": spaced_labels_batch,
        "author": [a for b in batch for a in b['author']],
        "name": [n for b in batch for n in b['name']],
        "a_batch_size": a_batch_size,
    }
    if 'fg_mask' in first:
        toRet['fg_mask'] = fg_masks
    if 'changed_image' in first:
        toRet['changed_image'] = changed_batch
    return toRet
def _fast_translate_batch(self,
                          batch,
                          data,
                          max_length,
                          min_length=0,
                          n_best=1,
                          return_attention=False):
    """Batched beam search over the flat ``batch * beam`` dimension.

    Only a restricted configuration is supported; the assertions at the
    top reject everything else (non-text data, copy attention, beam
    dumping, prediction filtering, n-gram blocking, coverage penalty).

    Args:
        batch: batch object providing ``batch_size`` and ``src``.
        data: dataset object providing ``data_type``.
        max_length: number of decoding steps after which every beam is
            forced to finish.
        min_length: the end token is suppressed for steps below this.
        n_best: number of hypotheses to return per batch entry.
        return_attention: when true, ``attn["std"]`` is tracked and
            returned alongside each hypothesis.

    Returns:
        dict with the keys ``"predictions"``, ``"scores"``,
        ``"attention"`` (one list per batch entry), ``"gold_score"`` and
        ``"batch"``.
    """
    # TODO: faster code path for beam_size == 1.
    # TODO: support these blacklisted features.
    assert data.data_type == 'text'
    assert not self.copy_attn
    assert not self.dump_beam
    assert not self.use_filter_pred
    assert self.block_ngram_repeat == 0
    assert self.global_scorer.beta == 0

    beam_size = self.beam_size
    batch_size = batch.batch_size
    vocab = self.fields["tgt"].vocab
    start_token = vocab.stoi[inputters.BOS_WORD]
    end_token = vocab.stoi[inputters.EOS_WORD]

    # Encoder forward.
    src = inputters.make_features(batch, 'src', data.data_type)
    _, src_lengths = batch.src
    enc_states, memory_bank, src_lengths \
        = self.model.encoder(src, src_lengths)
    dec_states = self.model.decoder.init_decoder_state(
        src, memory_bank, enc_states, with_cache=True)

    # Tile states and memory beam_size times.
    dec_states.map_batch_fn(
        lambda state, dim: tile(state, beam_size, dim=dim))
    if isinstance(memory_bank, tuple):
        device = memory_bank[0].device
        memory_bank = tuple(tile(m, beam_size, dim=1) for m in memory_bank)
    else:
        memory_bank = tile(memory_bank, beam_size, dim=1)
        device = memory_bank.device
    memory_lengths = tile(src_lengths, beam_size)

    top_beam_finished = torch.zeros([batch_size], dtype=torch.uint8)
    batch_offset = torch.arange(batch_size, dtype=torch.long)
    beam_offset = torch.arange(
        0,
        batch_size * beam_size,
        step=beam_size,
        dtype=torch.long,
        device=device)
    alive_seq = torch.full(
        [batch_size * beam_size, 1],
        start_token,
        dtype=torch.long,
        device=device)
    alive_attn = None

    # Give full probability to the first beam on the first step.
    topk_log_probs = (
        torch.tensor([0.0] + [float("-inf")] * (beam_size - 1),
                     device=device).repeat(batch_size))

    # Structure that holds finished hypotheses.
    hypotheses = [[] for _ in range(batch_size)]

    results = {}
    results["predictions"] = [[] for _ in range(batch_size)]
    results["scores"] = [[] for _ in range(batch_size)]
    results["attention"] = [[] for _ in range(batch_size)]
    results["gold_score"] = [0] * batch_size
    results["batch"] = batch

    if self.mask is not None:
        # `device` is used instead of `memory_bank.device` because
        # memory_bank may be a tuple, which has no `.device`.
        mask = self.mask.get_log_probs_masking_tensor(
            src.squeeze(2), beam_size).to(device)

    for step in range(max_length):
        decoder_input = alive_seq[:, -1].view(1, -1, 1)

        # Decoder forward.
        dec_out, dec_states, attn = self.model.decoder(
            decoder_input,
            memory_bank,
            dec_states,
            memory_lengths=memory_lengths,
            step=step)

        # Generator forward.
        log_probs = self.model.generator.forward(dec_out.squeeze(0))
        vocab_size = log_probs.size(-1)

        if step < min_length:
            log_probs[:, end_token] = -1e20

        if self.mask is not None:
            log_probs = log_probs * mask

        # Multiply probs by the beam probability.
        log_probs += topk_log_probs.view(-1).unsqueeze(1)

        alpha = self.global_scorer.alpha
        length_penalty = ((5.0 + (step + 1)) / 6.0)**alpha

        # Flatten probs into a list of possibilities.
        curr_scores = log_probs / length_penalty
        curr_scores = curr_scores.reshape(-1, beam_size * vocab_size)
        topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1)

        # Recover log probs.
        topk_log_probs = topk_scores * length_penalty

        # Resolve beam origin and true word ids. Floor division keeps
        # the beam index integral; `Tensor.div` performs true division
        # on integer tensors in current PyTorch and would yield floats
        # that cannot be used as indices.
        topk_beam_index = topk_ids // vocab_size
        topk_ids = topk_ids.fmod(vocab_size)

        # Map beam_index to batch_index in the flat representation.
        batch_index = (
            topk_beam_index
            + beam_offset[:topk_beam_index.size(0)].unsqueeze(1))
        select_indices = batch_index.view(-1)

        # Append last prediction.
        alive_seq = torch.cat(
            [alive_seq.index_select(0, select_indices),
             topk_ids.view(-1, 1)], -1)
        if return_attention:
            current_attn = attn["std"].index_select(1, select_indices)
            if alive_attn is None:
                alive_attn = current_attn
            else:
                alive_attn = alive_attn.index_select(1, select_indices)
                alive_attn = torch.cat([alive_attn, current_attn], 0)

        is_finished = topk_ids.eq(end_token)
        if step + 1 == max_length:
            is_finished.fill_(1)

        # Save finished hypotheses.
        if is_finished.any():
            # Penalize beams that finished.
            topk_log_probs.masked_fill_(is_finished, -1e10)
            is_finished = is_finished.to('cpu')
            top_beam_finished |= is_finished[:, 0].eq(1)
            predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1))
            attention = (
                alive_attn.view(
                    alive_attn.size(0), -1, beam_size, alive_attn.size(-1))
                if alive_attn is not None else None)
            non_finished_batch = []
            for i in range(is_finished.size(0)):
                b = batch_offset[i]
                finished_hyp = is_finished[i].nonzero().view(-1)
                # Store finished hypotheses for this batch.
                for j in finished_hyp:
                    hypotheses[b].append((
                        topk_scores[i, j],
                        predictions[i, j, 1:],  # Ignore start_token.
                        attention[:, i, j, :memory_lengths[i]]
                        if attention is not None else None))
                # End condition is the top beam finished and we can
                # return n_best hypotheses.
                if top_beam_finished[i] and len(hypotheses[b]) >= n_best:
                    best_hyp = sorted(
                        hypotheses[b], key=lambda x: x[0], reverse=True)
                    for score, pred, hyp_attn in best_hyp[:n_best]:
                        results["scores"][b].append(score)
                        results["predictions"][b].append(pred)
                        results["attention"][b].append(
                            hyp_attn if hyp_attn is not None else [])
                else:
                    non_finished_batch.append(i)
            non_finished = torch.tensor(non_finished_batch)
            # If all sentences are translated, no need to go further.
            if len(non_finished) == 0:
                break
            # Remove finished batches for the next step.
            top_beam_finished = top_beam_finished.index_select(
                0, non_finished)
            batch_offset = batch_offset.index_select(0, non_finished)
            non_finished = non_finished.to(topk_ids.device)
            topk_log_probs = topk_log_probs.index_select(0, non_finished)
            batch_index = batch_index.index_select(0, non_finished)
            select_indices = batch_index.view(-1)
            alive_seq = predictions.index_select(0, non_finished) \
                .view(-1, alive_seq.size(-1))
            if alive_attn is not None:
                alive_attn = attention.index_select(1, non_finished) \
                    .view(alive_attn.size(0), -1, alive_attn.size(-1))

        # Reorder states.
        if isinstance(memory_bank, tuple):
            memory_bank = tuple(
                m.index_select(1, select_indices) for m in memory_bank)
        else:
            memory_bank = memory_bank.index_select(1, select_indices)
        memory_lengths = memory_lengths.index_select(0, select_indices)
        dec_states.map_batch_fn(
            lambda state, dim: state.index_select(dim, select_indices))
        if self.mask is not None:
            mask = mask.index_select(0, select_indices)

    return results
def beam_search(
    self,
    encoder_output,
    beam_size,
    store_alphas=False,
    store_beam=False,
    print_beam=False,
):
    """Generate and return the top k sequences using beam search.

    Args:
        encoder_output: encoder features of a single image; flattened
            here to ``(1, -1, encoder_dim)`` and expanded to
            ``beam_size`` rows.
        beam_size: number of hypotheses kept alive (k).
        store_alphas: also collect the per-step ``alpha`` returned by
            ``self.forward_step`` for every sequence.
        store_beam: record the live sequences after every step.
        print_beam: print the live beam after every step.

    Returns:
        Tuple ``(sorted_sequences, sorted_alphas, beam)``: the sequences
        as lists of word ids ordered by descending score, the matching
        alphas as nested lists (``None`` unless ``store_alphas``), and
        the per-step beams (empty unless ``store_beam``).
    """
    current_beam_width = beam_size

    enc_image_size = encoder_output.size(1)
    encoder_dim = encoder_output.size()[-1]

    # Flatten encoding
    encoder_output = encoder_output.view(1, -1, encoder_dim)

    # We'll treat the problem as having a batch size of k
    encoder_output = encoder_output.expand(
        beam_size, encoder_output.size(1), encoder_dim
    )

    # Tensor to store top k sequences; now they're just <start>
    top_k_sequences = torch.full(
        (beam_size, 1), self.word_map[TOKEN_START], dtype=torch.int64, device=device
    )

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(beam_size, device=device)

    if store_alphas:
        # Tensor to store top k sequences' alphas; now they're just 1s
        seqs_alpha = torch.ones(beam_size, 1, enc_image_size, enc_image_size).to(
            device
        )

    # Completed sequences, their scores and alphas, and the full beam
    complete_seqs = []
    complete_seqs_alpha = []
    complete_seqs_scores = []
    beam = []

    # Initialize hidden states
    states = self.init_hidden_states(encoder_output)

    # Start decoding
    for step in range(0, self.params["max_caption_len"] - 1):
        prev_words = top_k_sequences[:, step]

        prev_word_embeddings = self.word_embedding(prev_words)
        predictions, states, alpha = self.forward_step(
            encoder_output, prev_word_embeddings, states
        )
        scores = F.log_softmax(predictions, dim=1)

        # Add the new scores
        scores = top_k_scores.unsqueeze(1).expand_as(scores) + scores

        # At the first step all beams are identical, so only one branch
        # is considered; otherwise the top k would be k copies.
        if step == 0:
            scores = scores[0]

        # Find the top k of the flattened scores
        top_k_scores, top_k_words = scores.view(-1).topk(
            current_beam_width, 0, largest=True, sorted=True
        )

        # Convert flattened indices to (sequence, word) indices. Floor
        # division is required: "/" is true division on integer tensors
        # in current PyTorch and a float tensor cannot index below.
        prev_seq_inds = top_k_words // self.vocab_size  # (k)
        next_words = top_k_words % self.vocab_size  # (k)

        # Add new words to sequences
        top_k_sequences = torch.cat(
            (top_k_sequences[prev_seq_inds], next_words.unsqueeze(1)), dim=1
        )

        if print_beam:
            print_current_beam(top_k_sequences, top_k_scores, self.word_map)
        if store_beam:
            beam.append(top_k_sequences)

        # Store the new alphas
        if store_alphas:
            alpha = alpha.view(-1, enc_image_size, enc_image_size)
            seqs_alpha = torch.cat(
                (seqs_alpha[prev_seq_inds], alpha[prev_seq_inds].unsqueeze(1)),
                dim=1,
            )

        # Split into complete / incomplete based on the <end> token
        end_token = self.word_map[TOKEN_END]
        incomplete_inds = torch.nonzero(next_words != end_token).view(-1).tolist()
        complete_inds = torch.nonzero(next_words == end_token).view(-1).tolist()

        # Set aside complete sequences and reduce beam size accordingly
        if complete_inds:
            complete_seqs.extend(top_k_sequences[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
            if store_alphas:
                complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())

        # Stop if k captions have been completely generated
        current_beam_width = len(incomplete_inds)
        if current_beam_width == 0:
            break

        # Proceed with incomplete sequences
        top_k_sequences = top_k_sequences[incomplete_inds]
        surviving_parents = prev_seq_inds[incomplete_inds]
        for i in range(len(states)):
            states[i] = states[i][surviving_parents]
        encoder_output = encoder_output[surviving_parents]
        top_k_scores = top_k_scores[incomplete_inds]
        if store_alphas:
            seqs_alpha = seqs_alpha[incomplete_inds]

    # The length limit was hit: fall back to the unfinished sequences.
    if len(complete_seqs) < beam_size:
        complete_seqs.extend(top_k_sequences.tolist())
        complete_seqs_scores.extend(top_k_scores)
        if store_alphas:
            # stored as lists, like the alphas of completed sequences
            complete_seqs_alpha.extend(seqs_alpha.tolist())

    sorted_sequences = [
        sequence
        for _, sequence in sorted(
            zip(complete_seqs_scores, complete_seqs), reverse=True
        )
    ]
    sorted_alphas = None
    if store_alphas:
        sorted_alphas = [
            alpha
            for _, alpha in sorted(
                zip(complete_seqs_scores, complete_seqs_alpha), reverse=True
            )
        ]
    return sorted_sequences, sorted_alphas, beam
def data2x(self, features, device, y):
    """Assemble encoder ids, their token-type ids and decoder input ids.

    Args:
        features: mapping with the per-sample feature tensors read below
            (``hand``, ``discards``, ``melds``, ``doras``, ...). All
            pieces are concatenated along dim 1.
        device: device on which the constant id tensors are created.
        y: target ids; ``-100`` entries are treated as padding.

    Returns:
        Tuple ``(x, token_types, tgt_ids)``:
          * ``x`` -- the CLS id followed by every feature (offset by the
            matching ``self.*_offset`` where one is applied).
          * ``token_types`` -- one type id per column, listed in the
            same order as the pieces of ``x``.
          * ``tgt_ids`` -- ``y`` shifted right by one with
            ``self.tgt_cls_token_id`` in front and ``-100`` replaced by
            ``self.tgt_pad_token_id``.
    """
    hand = features['hand']
    discards = features['discards']
    melds = features['melds']
    batch_size = hand.size()[0]

    def constant_ids(width, token_id):
        # (batch_size, width) long tensor holding a single id
        return torch.full((batch_size, width), token_id,
                          dtype=torch.long, device=device)

    x = torch.cat(
        [
            constant_ids(1, self.cls_token_id),
            hand,
            discards[:, 0, :],
            discards[:, 1, :],
            discards[:, 2, :],
            discards[:, 3, :],
            melds[0][:, 0],
            melds[1][:, 0],
            melds[2][:, 0],
            melds[3][:, 0],
            features['action_meld_tiles'],
            features['menzen'] + self.menzen_offset,
            features['reach_state'] + self.reach_state_offset,
            features['n_reach'] + self.n_reach_offset,
            features['reach_ippatsu'] + self.reach_ippatsu_offset,
            features['doras'],
            features['dans'] + self.dans_offset,
            features['rates'] + self.rates_offset,
            features['scores'] + self.scores_offset,
            features['oya'] + self.oya_offset,
            features['n_honba'] + self.n_honba_offset,
            features['n_round'] + self.n_round_offset,
            features['sanma_or_yonma'] + self.sanma_or_yonma_offset,
            features['han_or_ton'] + self.han_or_ton_offset,
            features['aka_ari'] + self.aka_ari_offset,
            features['kui_ari'] + self.kui_ari_offset,
            features['shanten'] + self.shanten_offset,
            features['who'] + self.who_offset,
            features['sum_discards'] + self.sum_discards_offset,
        ],
        dim=1)

    hand_length = hand.size()[1]
    discard_length = discards.size()[2]
    dora_length = features['doras'].size()[1]

    # Must stay in the same order as the pieces of `x` above.
    token_types = torch.cat(
        [
            constant_ids(1, self.pad_token_id),
            constant_ids(hand_length, self.hand_token_id),
            constant_ids(discard_length, self.discard_0_token_id),
            constant_ids(discard_length, self.discard_1_token_id),
            constant_ids(discard_length, self.discard_2_token_id),
            constant_ids(discard_length, self.discard_3_token_id),
            melds[0][:, 1] + self.meld_0_base_token_id,
            melds[1][:, 1] + self.meld_1_base_token_id,
            melds[2][:, 1] + self.meld_2_base_token_id,
            melds[3][:, 1] + self.meld_3_base_token_id,
            constant_ids(4, self.action_meld_tiles_token_id),
            constant_ids(1, self.menzen_0_token_id),
            constant_ids(1, self.menzen_1_token_id),
            constant_ids(1, self.menzen_2_token_id),
            constant_ids(1, self.menzen_3_token_id),
            constant_ids(1, self.reach_state_0_token_id),
            constant_ids(1, self.reach_state_1_token_id),
            constant_ids(1, self.reach_state_2_token_id),
            constant_ids(1, self.reach_state_3_token_id),
            constant_ids(1, self.n_reach_token_id),
            constant_ids(1, self.reach_ippatsu_0_token_id),
            constant_ids(1, self.reach_ippatsu_1_token_id),
            constant_ids(1, self.reach_ippatsu_2_token_id),
            constant_ids(1, self.reach_ippatsu_3_token_id),
            constant_ids(dora_length, self.dora_token_id),
            constant_ids(1, self.dans_0_token_id),
            constant_ids(1, self.dans_1_token_id),
            constant_ids(1, self.dans_2_token_id),
            constant_ids(1, self.dans_3_token_id),
            constant_ids(1, self.rates_0_token_id),
            constant_ids(1, self.rates_1_token_id),
            constant_ids(1, self.rates_2_token_id),
            constant_ids(1, self.rates_3_token_id),
            constant_ids(1, self.scores_0_token_id),
            constant_ids(1, self.scores_1_token_id),
            constant_ids(1, self.scores_2_token_id),
            constant_ids(1, self.scores_3_token_id),
            constant_ids(1, self.oya_token_id),
            constant_ids(1, self.n_honba_token_id),
            constant_ids(1, self.n_round_token_id),
            constant_ids(1, self.sanma_or_yonma_token_id),
            constant_ids(1, self.han_or_ton_token_id),
            constant_ids(1, self.aka_ari_token_id),
            constant_ids(1, self.kui_ari_token_id),
            constant_ids(1, self.shanten_token_id),
            constant_ids(1, self.who_token_id),
            constant_ids(1, self.sum_discards_token_id),
        ],
        dim=1)

    # Decoder input: CLS followed by the targets shifted right by one.
    cls_tokens = constant_ids(1, self.tgt_cls_token_id)
    tgt_ids = torch.cat([cls_tokens, y[:, :-1]], dim=1)
    tgt_ids[tgt_ids == -100] = self.tgt_pad_token_id
    return x, token_types, tgt_ids
def add_lsqmodule(net, constr_weight):
    """Attach an ``LsqWeight`` quantizer to each Conv2d / Linear module of ``net``.

    The quantizer is stored on the module as ``wquantizer``; its initial
    scale is a one-element tensor holding the mean absolute weight of
    that module.
    """
    for _, layer in net.named_modules():
        if not isinstance(layer, (Conv2d, Linear)):
            continue
        mean_abs_weight = layer.weight.abs().mean().item()
        initial_scale = torch.full((1, ), mean_abs_weight)
        layer.wquantizer = LsqWeight(
            constraint=constr_weight, scale_init=initial_scale.clone()
        )
fake_label = 0
niter = 25
g_loss = []
d_loss = []

for epoch in range(niter):
    for i, data in enumerate(dataloader, 0):
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        # train with real
        netD.zero_grad()
        real_cpu = data[0].to(device)
        batch_size = real_cpu.size(0)
        # The dtype is pinned to float: without it torch.full infers the
        # dtype from `real_label`, and an integer label would produce a
        # long tensor that BCE-style criteria reject as a target.
        # NOTE(review): assumes `criterion` expects float targets
        # (e.g. nn.BCELoss) -- confirm against its definition.
        label = torch.full((batch_size, ), real_label,
                           dtype=torch.float, device=device)

        output = netD(real_cpu)
        errD_real = criterion(output, label)
        errD_real.backward()
        D_x = output.mean().item()

        # train with fake
        noise = torch.randn(batch_size, nz, 1, 1, device=device)
        fake = netG(noise)
        label.fill_(fake_label)
        # detach so this backward pass does not reach the generator
        output = netD(fake.detach())
        errD_fake = criterion(output, label)
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        errD = errD_real + errD_fake
def forward(self, inputs, triples, lengths, elmo_embedding, id2_ids_batch):
    """Encode a batch with knowledge-enhanced embeddings, an RNN and attention.

    Steps: (0) embed the tokens (ELMo lookup or ``self.embedding``) and
    append a knowledge vector to every position, (1) run the packed
    sequence through ``self.rnn``, (2) pool with attention, (3) project
    to log-probabilities.

    The method allocates with ``.cuda()`` throughout and therefore
    requires a CUDA device.

    Args:
        inputs: batch of token id sequences.
        triples: knowledge triples, indexed ``[sample][position]``.
        lengths: sequence lengths, forwarded to ``sort_batch_by_length``.
        elmo_embedding: mapping from the space-joined token ids of a
            sentence to an object whose ``.value`` is a numpy array;
            only read when ``pretrain_model_type == 'elmo'``.
        id2_ids_batch: per-position triple markers; a row of ``-1``
            means "no triples".  In the non-graph mode a marker of ``1``
            selects ``triples[...][1]`` and ``2`` selects
            ``triples[...][0]``.

    Returns:
        With ``attention_layer == 'att'``:
        ``(log_probs, attention_distribution)``.  Otherwise:
        ``(log_probs, attention_weights, attention_loss)`` where
        ``attention_loss`` sums the absolute pairwise cosine
        similarities of the first three class queries.
    """
    if self.args.pretrain_model_type == 'elmo':
        elmo_inputs = torch.Tensor().cuda()
        for i in range(len(inputs)):
            key = ' '.join(map(str, inputs[i].cpu().numpy()))
            elmo_input = torch.from_numpy(
                elmo_embedding[key].value).type(torch.cuda.FloatTensor)
            try:
                elmo_inputs = torch.cat(
                    (elmo_inputs, elmo_input.unsqueeze(dim=0)))
            except RuntimeError:
                # torch.cat rejected the sizes: retry with the sentence
                # cut to 128 positions.
                elmo_inputs = torch.cat(
                    (elmo_inputs,
                     elmo_input.unsqueeze(dim=0)[:, :128, :]), dim=0)
        inputs = elmo_inputs
    else:
        inputs = self.embedding(inputs)

    # Introducing external knowledge in different ways.
    t = torch.zeros(inputs.size(0), self.seq_length,
                    self.input_dim + self.triples_embedding_dim).cuda()

    # Sentinels and zero paddings never change, so build them once
    # instead of once per sample / per position.
    no_triples_seq = torch.full([self.seq_length, self.triples_number],
                                -1, dtype=torch.long).cuda()
    zero_seq_pad = torch.zeros(self.seq_length,
                               self.triples_embedding_dim).cuda()
    no_triples_row = torch.full([self.triples_number], -1,
                                dtype=torch.long).cuda()
    zero_row_pad = torch.zeros(self.triples_embedding_dim).cuda()

    if self.args.concat_mode == "graph_attention":
        for i in range(len(inputs)):
            if torch.equal(id2_ids_batch[i], no_triples_seq):
                t[i] = torch.cat((inputs[i], zero_seq_pad), dim=-1)
                continue
            for k in range(len(id2_ids_batch[i])):
                if torch.equal(id2_ids_batch[i][k], no_triples_row):
                    t[i][k] = torch.cat((inputs[i][k], zero_row_pad), dim=-1)
                    continue
                head_id, tail_id, relation_id = torch.chunk(
                    triples[i][k], 3, dim=1)
                head_embed = self.embeddings_entity(head_id).cuda()
                tail_embed = self.embeddings_entity(tail_id).cuda()
                relation_embed = self.embeddings_relation(relation_id).cuda()
                head_tail = torch.cat((head_embed, tail_embed), dim=2)

                # attention of every triple, driven by its relation
                head_tail_transformed = torch.tanh(
                    self.entity_transformed(head_tail))
                relation_transformed = torch.tanh(relation_embed)
                e_weight = (head_tail_transformed
                            * relation_transformed).sum(dim=2)
                alpha_weight = F.softmax(e_weight, dim=0)
                graph_embed = (alpha_weight.unsqueeze(1)
                               * head_tail).sum(dim=0)
                t[i][k] = torch.cat(
                    (inputs[i][k], graph_embed.squeeze(0)))
    else:
        for i in range(len(inputs)):
            if torch.equal(id2_ids_batch[i], no_triples_seq):
                t[i] = torch.cat((inputs[i], zero_seq_pad), dim=-1)
                continue
            averaged = {}
            for k in range(len(id2_ids_batch[i])):
                if torch.equal(id2_ids_batch[i][k], no_triples_row):
                    t[i][k] = torch.cat((inputs[i][k], zero_row_pad), dim=-1)
                    continue
                n_used = 0
                triple_sum = None
                for j in range(len(id2_ids_batch[i][k])):
                    marker = id2_ids_batch[i][k][j].item()
                    if marker == 1:
                        entity = self.embeddings_entity(triples[i][k][j][1])
                    elif marker == 2:
                        entity = self.embeddings_entity(triples[i][k][j][0])
                    else:
                        continue
                    inputs_triples = torch.cat((inputs[i][k], entity))
                    if n_used == 0:
                        triple_sum = inputs_triples
                    else:
                        triple_sum = triple_sum + inputs_triples
                    n_used += 1
                # positions without any usable triple keep their zeros
                if n_used != 0:
                    averaged[k] = triple_sum / n_used
            for k, vector in averaged.items():
                t[i][k] = vector

    # 1. input
    embedded_input = self.dropout_on_input_to_LSTM(t)
    (sorted_input, sorted_lengths, input_unsort_indices, _) = \
        sort_batch_by_length(embedded_input, lengths)
    packed_input = pack_padded_sequence(
        sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    packed_sorted_output, _ = self.rnn(packed_input)
    sorted_output, _ = pad_packed_sequence(
        packed_sorted_output, batch_first=True)
    output = sorted_output[input_unsort_indices]

    mask_type = torch.cuda.FloatTensor if inputs.is_cuda else torch.FloatTensor

    # 2. use attention
    if self.args.attention_layer == 'att':
        attention_logits = self.attention_weights(output).squeeze(-1)
        mask_attention_logits = (attention_logits != 0).type(mask_type)
        softmax_attention_logits = last_dim_softmax(
            attention_logits, mask_attention_logits)
        input_encoding = torch.bmm(
            softmax_attention_logits.unsqueeze(dim=1), output)
        input_encoding0 = input_encoding.squeeze(dim=1)
    else:
        querys = self.query_embedding(
            torch.arange(0, self.args.num_classes, 1).cuda())
        attention_weights = torch.Tensor(
            self.args.num_classes, len(output), len(output[0])).cuda()
        encoding_parts = []
        for i in range(self.args.num_classes):
            attention_logits = self.proquery_weights_mp(output)
            attention_logits = torch.bmm(
                attention_logits,
                querys[i].unsqueeze(dim=1).repeat(len(output), 1, 1)
            ).squeeze(dim=-1)
            mask_attention_logits = (attention_logits != 0).type(mask_type)
            softmax_attention_logits = last_dim_softmax(
                attention_logits, mask_attention_logits)
            input_encoding_part = torch.bmm(
                softmax_attention_logits.unsqueeze(dim=1), output)
            encoding_parts.append(input_encoding_part.squeeze(dim=1))
            attention_weights[i] = softmax_attention_logits
        # one class-specific encoding after the other on the last axis
        input_encoding = torch.cat(encoding_parts, dim=-1)

    # 3. run linear layer
    if self.args.attention_layer == 'att':
        input_encodings = self.dropout_on_input_to_linear_layer(
            input_encoding0)
        unattized_output = self.output_projection(input_encodings)
        output_distribution = F.log_softmax(unattized_output, dim=-1)
        return output_distribution, softmax_attention_logits.squeeze(dim=1)
    else:
        input_encodings = self.dropout_on_input_to_linear_layer(
            input_encoding)
        unattized_output = self.multi_output_projection(input_encodings)
        output_distribution = F.log_softmax(unattized_output, dim=-1)
        cos = torch.nn.CosineSimilarity(dim=0, eps=1e-16)
        attention_loss = abs(cos(querys[0], querys[1])) \
            + abs(cos(querys[1], querys[2])) \
            + abs(cos(querys[0], querys[2]))
        return output_distribution, attention_weights, attention_loss
def forward(
    self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
    """Run the detector on a batch of images.

    In training mode the targets are validated, anchors are matched against
    the target boxes and the losses from ``compute_loss`` are produced.
    Otherwise the head outputs are turned into detections and mapped back to
    the original image sizes.

    Args:
        images: List of image tensors; only the last two dimensions (taken
            as H and W) are read directly in this method.
        targets: Optional list of per-image dicts. Each dict must provide a
            ``"boxes"`` tensor of shape ``[N, 4]``. Must not be ``None`` in
            training mode.

    Returns:
        When scripting, the tuple ``(losses, detections)``; otherwise
        whatever ``self.eager_outputs(losses, detections)`` returns.
    """
    if self.training:
        # NOTE(review): the explicit if/else around torch._assert (instead of
        # a plain assert) presumably lets TorchScript refine the Optional
        # type of `targets` -- confirm before simplifying.
        if targets is None:
            torch._assert(False, "targets should not be none when in training mode")
        else:
            for target in targets:
                boxes = target["boxes"]
                if isinstance(boxes, torch.Tensor):
                    torch._assert(
                        len(boxes.shape) == 2 and boxes.shape[-1] == 4,
                        f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.",
                    )
                else:
                    torch._assert(False, f"Expected target boxes to be of type Tensor, got {type(boxes)}.")

    # get the original image sizes (recorded before self.transform changes them)
    original_image_sizes: List[Tuple[int, int]] = []
    for img in images:
        val = img.shape[-2:]
        torch._assert(
            len(val) == 2,
            f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
        )
        original_image_sizes.append((val[0], val[1]))

    # transform the input
    images, targets = self.transform(images, targets)

    # Check for degenerate boxes: a box is rejected when any of its last two
    # columns is <= the matching first two columns
    # (presumably an (x1, y1, x2, y2) layout -- confirm).
    if targets is not None:
        for target_idx, target in enumerate(targets):
            boxes = target["boxes"]
            degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
            if degenerate_boxes.any():
                # report only the first offending box of this target
                bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                degen_bb: List[float] = boxes[bb_idx].tolist()
                torch._assert(
                    False,
                    "All bounding boxes should have positive height and width."
                    f" Found invalid box {degen_bb} for target at index {target_idx}.",
                )

    # get the features from the backbone
    features = self.backbone(images.tensors)
    if isinstance(features, torch.Tensor):
        # a single-tensor backbone output is wrapped so both cases share one path
        features = OrderedDict([("0", features)])

    features = list(features.values())

    # compute the ssd heads outputs using the features
    head_outputs = self.head(features)

    # create the set of anchors
    anchors = self.anchor_generator(images, features)

    losses = {}
    detections: List[Dict[str, Tensor]] = []
    if self.training:
        matched_idxs = []
        if targets is None:
            torch._assert(False, "targets should not be none when in training mode")
        else:
            for anchors_per_image, targets_per_image in zip(anchors, targets):
                if targets_per_image["boxes"].numel() == 0:
                    # No target boxes for this image: every anchor gets -1
                    # (presumably the matcher's "no match" value -- confirm).
                    matched_idxs.append(
                        torch.full(
                            (anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device
                        )
                    )
                    continue

                match_quality_matrix = box_ops.box_iou(targets_per_image["boxes"], anchors_per_image)
                matched_idxs.append(self.proposal_matcher(match_quality_matrix))

            losses = self.compute_loss(targets, head_outputs, anchors, matched_idxs)
    else:
        detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes)
        # map detections from the transformed image sizes back to the originals
        detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)

    if torch.jit.is_scripting():
        # warn once per instance, then always return the full tuple
        if not self._has_warned:
            warnings.warn("SSD always returns a (Losses, Detections) tuple in scripting")
            self._has_warned = True
        return losses, detections
    return self.eager_outputs(losses, detections)
def forward(self, int_fill: int):
    """Return two 2x2 tensors built with ``torch.full``.

    Args:
        int_fill: Fill value for the first tensor.

    Returns:
        A tuple ``(a, b)`` where ``a`` is a 2x2 tensor filled with
        ``int_fill`` and ``b`` is a 2x2 tensor filled with ``1``.
    """
    # torch.Size is a tuple subclass and accepts a single iterable of ints;
    # passing the dimensions as separate positional arguments raises
    # TypeError.
    size = torch.Size([2, 2])
    a = torch.full(size, int_fill)
    b = torch.full(size, 1)
    return (a, b)
# Script fragment: show the training images, then start the GAN training
# loop. Only the discriminator half of one step is present in this snippet.
step = 0
print("<---------Training Images------------>")
plot_imgs(device, batch_size, img_size, data_root)
print("<---------Start Training------------>")
for epoch in range(epochs):
    for i, data in enumerate(dataloader, 0):
        #---------------TRAIN D-----------------#
        # Real batch: score it with D against an all-`real_label` target.
        netD.zero_grad() # clear gradients
        real_img = data[0].to(device)
        b_size = real_img.size(0)
        # dtype is pinned to float so the target stays floating point even
        # if real_label is an int
        label = torch.full((b_size, ), real_label, dtype=torch.float, device=device)
        output = netD(real_img).view(-1)
        errD_real = criterion(output, label)
        errD_real.backward()
        D_x = output.mean().item()  # mean D output on the real batch
        # Fake batch: generate from noise and score it against `fake_label`.
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        fake = netG(noise)
        label.fill_(fake_label)  # reuse the same label tensor in place
        # detach() keeps this backward pass from reaching netG's parameters
        output = netD(fake.detach()).view(-1)
        errD_fake = criterion(output, label)
        errD_fake.backward()
        D_G_z1 = output.mean().item()  # mean D output on the fake batch
        # Sum of both losses; the gradients were already accumulated by the
        # two backward() calls above.
        errD = errD_real + errD_fake
def nucleus_sampling(self, encoder_output, beam_size, top_p, print_beam=False):
    """Sample ``beam_size`` captions with nucleus (top-p) sampling.

    At every step, tokens outside the smallest set whose cumulative
    probability exceeds ``top_p`` are given a score of ``-inf`` and the next
    word of each live sequence is drawn from the remaining distribution.

    :param encoder_output: encoder features for a single image
    :param beam_size: number of sequences sampled in parallel
    :param top_p: cumulative probability threshold of the nucleus
    :param print_beam: if True, print the live sequences after every step
    :return: (sequences sorted by descending score, None, None)
    """
    width = beam_size

    # Flatten the encoding and replicate it once per sampled sequence.
    encoder_dim = encoder_output.size()[-1]
    encoder_output = encoder_output.view(1, -1, encoder_dim)
    encoder_output = encoder_output.expand(
        beam_size, encoder_output.size(1), encoder_dim
    )

    # Every sequence starts as just <start>, with a score of 0.
    top_k_sequences = torch.full(
        (beam_size, 1), self.word_map[TOKEN_START], dtype=torch.int64, device=device
    )
    top_k_scores = torch.zeros(beam_size, device=device)

    # Finished sequences and their scores.
    complete_seqs = []
    complete_seqs_scores = []

    states = self.init_hidden_states(encoder_output)

    for step in range(0, self.params["max_caption_len"] - 1):
        last_words = top_k_sequences[:, step]
        last_word_embeddings = self.word_embedding(last_words)
        predictions, states, alpha = self.forward_step(
            encoder_output, last_word_embeddings, states
        )
        scores = F.log_softmax(predictions, dim=1)

        sorted_logits, sorted_indices = torch.sort(scores, descending=True, dim=-1)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Mark tokens past the threshold; shift right by one so that the
        # first token crossing the threshold is still kept.
        drop_mask = cumulative_probs > top_p
        drop_mask[..., 1:] = drop_mask[..., :-1].clone()
        drop_mask[..., 0] = 0

        top_k_scores = torch.zeros(width, dtype=torch.float, device=device)
        top_k_words = torch.zeros(width, dtype=torch.long, device=device)

        for row_idx in range(0, width):
            row_scores = scores[row_idx]
            dropped_tokens = sorted_indices[row_idx][drop_mask[row_idx]]
            row_scores[dropped_tokens] = -float("inf")

            # Draw the next word from what is left of the distribution.
            top_k_words[row_idx] = torch.multinomial(
                torch.softmax(row_scores, -1), 1
            )
            top_k_scores[row_idx] = row_scores[top_k_words[row_idx]]

        # Append the sampled words to their sequences.
        top_k_sequences = torch.cat(
            (top_k_sequences, top_k_words.unsqueeze(1)), dim=1
        )

        if print_beam:
            print_current_beam(top_k_sequences, top_k_scores, self.word_map)

        # Split the sequences by whether they just produced <end>.
        is_end = top_k_words == self.word_map[TOKEN_END]
        incomplete_inds = torch.nonzero(~is_end).view(-1).tolist()
        complete_inds = torch.nonzero(is_end).view(-1).tolist()

        if complete_inds:
            complete_seqs.extend(top_k_sequences[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])

        # Stop as soon as every sequence is finished.
        width = len(incomplete_inds)
        if width == 0:
            break

        # Carry on with the unfinished sequences only.
        top_k_sequences = top_k_sequences[incomplete_inds]
        for state_idx, state in enumerate(states):
            states[state_idx] = state[incomplete_inds]
        encoder_output = encoder_output[incomplete_inds]
        top_k_scores = top_k_scores[incomplete_inds]

    # Sequences still unfinished at the length limit are returned as well.
    if len(complete_seqs) < beam_size:
        complete_seqs.extend(top_k_sequences.tolist())
        complete_seqs_scores.extend(top_k_scores)

    ranked = sorted(zip(complete_seqs_scores, complete_seqs), reverse=True)
    sorted_sequences = [sequence for _, sequence in ranked]
    return sorted_sequences, None, None
def mvae(mix, model, n_iter, device, proj_back=True, return_sigma=False):
    """Implementation of Multichannel Conditional VAE.

    It only works in the determined case (n_sources == n_channels).

    The separation matrix is initialized with ``ilrma`` and then refined for
    ``n_iter`` iterations. Each iteration updates the separation matrix,
    runs 50 Adam steps on the latent ``z`` and the speaker weights ``c``,
    and rescales the result.

    Args:
        mix (ndarray): (n_frequencies, n_channels, n_frames)
            STFT representation of the observed signal. A numpy or cupy
            array; the array module used throughout follows its type.
        model (cvae.CVAE): Trained Conditional VAE model.
        n_iter (int): Number of iterations.
        device (torch.device): Device used for computation.
        proj_back (bool): If use back-projection technique.
        return_sigma (bool): If also return estimated power spectrogram for
            each speaker.

    Returns:
        tuple[ndarray, ndarray]: Tuple of separated signal and
            separation matrix. The shapes of separated signal and separation
            matrix are (n_frequencies, n_sources, n_frames) and
            (n_frequencies, n_sources, n_channels), respectively.
            When ``return_sigma`` is true a third element is appended:
            the last ``sigma_sq`` tensor converted to a numpy array.

    Raises:
        ValueError: If ``mix`` is neither a numpy nor a cupy ndarray.
    """

    # Pick the array module (numpy or cupy) matching the input.
    if isinstance(mix, np.ndarray):
        if_use_cuda = False
        xp = np
    elif isinstance(mix, cp.ndarray):
        if_use_cuda = True
        xp = cp
    else:
        raise ValueError('A numpy.ndarray or cupy.ndarray instance should be '
                         'given as `mix` argument')

    n_freq, n_src, n_frame = mix.shape

    # Initial separation; note the fixed 30 iterations here are independent
    # of the `n_iter` argument.
    sep, sep_mat = ilrma(mix, n_iter=30, n_basis=2)
    sep_pow = xp.power(xp.abs(sep), 2)  # (n_freq, n_src, n_frame)
    # Speaker weights start uniform; they are optimized below.
    c = torch.full((n_src, model.n_speakers), 1 / model.n_speakers,
                   device=device, requires_grad=True)
    # Per-source log gain, initialized from the model's scalar log_g.
    log_g = torch.full((n_src, 1, 1), model.log_g.item(), device=device)

    with torch.no_grad():
        # Source axis first: (n_src, n_freq, n_frame).
        if if_use_cuda:
            sep_pow_tensor = to_tensor(sep_pow).transpose(0, 1)
        else:
            # NOTE(review): torch.from_numpy shares memory with `sep_pow`;
            # if `device` is the CPU the in-place clamp below may also
            # modify `sep_pow` -- confirm this is intended.
            sep_pow_tensor =\
                torch.from_numpy(sep_pow).transpose(0, 1).to(device)
        sep_pow_tensor.clamp_(EPS)
        z, _ = model.encode(sep_pow_tensor, c)
        # NOTE(review): `c` is passed as-is here, but through a softmax in
        # the optimization loop below.
        sigma_sq = (model.decode(z, c) + log_g).exp()
        sigma_sq.clamp_(min=EPS)
        sigma_reci = 1 / sigma_sq
        if if_use_cuda:
            sigma_reci = to_cupy(sigma_reci)
        else:
            sigma_reci = sigma_reci.numpy()
    # z was produced under no_grad; enable gradients so Adam can update it.
    z.requires_grad = True

    # One identity matrix per frequency bin: (n_freq, n_src, n_src).
    eye = xp.tile(xp.eye(n_src), (n_freq, 1, 1))

    for _ in range(n_iter):
        # --- update the separation matrix, one source row at a time ---
        for src in range(n_src):
            h = sigma_reci[src, :, :, None] @ xp.ones((1, n_src))
            h = mix.conj() @ (mix.swapaxes(1, 2) * h)
            u_mat = h.swapaxes(1, 2) / n_frame
            # EPS * eye is added before the linear solve
            h = sep_mat @ u_mat + EPS * eye
            sep_mat[:, src, :] = xp.linalg.solve(h, eye[:, :, src]).conj()
            # normalize the updated row
            h = sep_mat[:, src, None, :] @ u_mat
            h = (h @ sep_mat[:, src, :, None].conj()).squeeze(2)
            sep_mat[:, src, :] = (sep_mat[:, src, :] / xp.sqrt(h).conj())

        # --- recompute the separated signal and its power (in place) ---
        sep = sep_mat @ mix
        xp.power(xp.abs(sep), 2, out=sep_pow)
        xp.clip(sep_pow, a_min=EPS, a_max=None, out=sep_pow)

        # --- fit z and c to the current power estimate ---
        # A fresh optimizer is created each outer iteration.
        optimizer = torch.optim.Adam((z, c), lr=1e-3)
        if if_use_cuda:
            sep_pow_tensor = to_tensor(sep_pow).transpose(0, 1)
        else:
            sep_pow_tensor = \
                torch.from_numpy(sep_pow).transpose(0, 1).to(device)
        for _ in range(50):
            log_sigma_sq = model.decode(z, torch.softmax(c, dim=1)) + log_g
            loss = torch.sum(
                log_sigma_sq + (sep_pow_tensor.log() - log_sigma_sq).exp())
            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # --- rescale gains, variances and the separation matrix ---
        with torch.no_grad():
            sigma_sq = (model.decode(z, torch.softmax(c, dim=1)) + log_g).exp()
            lbd = torch.sum(sep_pow_tensor / sigma_sq, dim=(1, 2))
            lbd = lbd / n_freq / n_frame / log_g.squeeze(2).squeeze(1).exp()
            log_g[:, 0, 0] += torch.log(lbd)
            sigma_sq *= lbd.unsqueeze(1).unsqueeze(2)

            if if_use_cuda:
                sep_mat *= to_cupy(lbd.unsqueeze(0).unsqueeze(2))
                sigma_reci = to_cupy(1 / sigma_sq)
            else:
                sep_mat *= lbd.unsqueeze(0).unsqueeze(2).numpy()
                sigma_reci = (1 / sigma_sq).numpy()

    # Back-projection technique, using the first channel of `mix` as the
    # reference. `z` is reused here for the projection coefficients.
    if proj_back:
        z = projection_back(sep, mix[:, 0, :])
        sep *= xp.conj(z[:, :, None])

    if return_sigma:
        return sep, sep_mat, sigma_sq.cpu().numpy()
    else:
        return sep, sep_mat
def forward(self, encoder_output, target_captions=None, decode_lengths=None):
    """
    Decode captions one timestep at a time.

    Outside training mode every sequence is decoded for up to
    ``max_caption_len`` steps, and its decode length is shortened once the
    <end> token has been produced.

    :param encoder_output: output features of the encoder
    :param target_captions: encoded target captions, shape: (batch_size, max_caption_length)
    :param decode_lengths: caption lengths, shape: (batch_size, 1)
    :return: scores for vocabulary, decode lengths, weights
    """
    batch_size = encoder_output.size(0)

    # Flatten the image into (batch_size, num_positions, feature_dim).
    feature_dim = encoder_output.size(-1)
    encoder_output = encoder_output.view(batch_size, -1, feature_dim)

    if not self.training:
        decode_lengths = torch.full(
            (batch_size,),
            self.params["max_caption_len"],
            dtype=torch.int64,
            device=device,
        )

    # Initial LSTM state.
    states = self.init_hidden_states(encoder_output)

    # Output buffers for the word scores and the attention weights.
    max_steps = max(decode_lengths)
    scores = torch.zeros((batch_size, max_steps, self.vocab_size), device=device)
    alphas = torch.zeros(
        batch_size, max_steps, encoder_output.size(1), device=device
    )

    # Every sequence starts from the <start> token.
    prev_words = torch.full(
        (batch_size,), self.word_map[TOKEN_START], dtype=torch.int64, device=device
    )

    for t in range(max_steps):
        if not self.training:
            # Sequences that emitted <end> at the previous step are capped
            # at the current timestep.
            end_id = self.word_map[TOKEN_END]
            ended = torch.nonzero(prev_words == end_id).view(-1).tolist()
            ended_lengths = decode_lengths[ended]
            decode_lengths[ended] = torch.min(
                ended_lengths,
                torch.full_like(ended_lengths, t, device=device),
            )

        # Nothing left to decode once no sequence reaches past step t.
        active = torch.nonzero(decode_lengths > t).view(-1)
        if active.numel() == 0:
            break

        prev_words_embedded = self.word_embedding(prev_words)
        step_scores, states, step_alphas = self.forward_step(
            encoder_output, prev_words_embedded, states
        )

        # Choose the words fed into the next timestep.
        prev_words = self.update_previous_word(step_scores, target_captions, t)

        # Only unfinished sequences write into the output buffers.
        scores[active, t, :] = step_scores[active]
        if step_alphas is not None:
            alphas[active, t, :] = step_alphas[active]

    return scores, decode_lengths, alphas
for i, data in enumerate(dataloader): niter = epoch * len(dataloader) + i # Save just first batch of real data for displaying if i == 0: real_display = data.cpu() ############################ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) ########################### # Train with real data netD.zero_grad() real = data.to(device) batch_size, seq_len = real.size(0), real.size(1) label = torch.full((batch_size, seq_len, 1), real_label, device=device) # real = real.type(torch.DoubleTensor) output = netD(real) # .type(torch.DoubleTensor) errD_real = criterion(output, label) errD_real.backward() D_x = output.mean().item() # Train with fake data noise = torch.randn(batch_size, seq_len, nz, device=device) if opt.delta_condition: # Sample a delta for each batch and concatenate to the noise for each timestep deltas = dataset.sample_deltas(batch_size).unsqueeze(2).repeat( 1, seq_len, 1) noise = torch.cat((noise, deltas), dim=2)
def generate_square_subsequent_mask(sz: int) -> Tensor:
    r"""Build the ``(sz, sz)`` additive mask for a sequence.

    Entries strictly above the main diagonal are masked with
    ``float('-inf')``; entries on and below the diagonal are left
    unmasked as ``float(0.0)``.
    """
    all_masked = torch.full((sz, sz), float('-inf'))
    # triu zeroes everything below the first super-diagonal
    return all_masked.triu(diagonal=1)
def main(noise_factor, data, gan_model):
    """Train a GAN on a torchvision dataset whose real images are corrupted.

    Real batches are flattened and passed through ``salt_and_pepper`` with
    probability ``noise_factor`` before being shown to the discriminator.
    Generator samples are saved once per epoch, and a final batch of 10000
    samples is saved for FID evaluation.

    Args:
        noise_factor: Passed as ``p`` to ``salt_and_pepper``; also part of
            the result directory name.
        data: Name of a dataset class in ``torchvision.datasets``; also part
            of the result directory name.
        gan_model: Used only to build the result directory name.

    Side effects:
        Downloads the dataset to ``./data`` and writes arrays (via
        ``np.save``) and the loss plot into
        ``./gan_mnist/<gan_model><data><noise_factor>``.
    """
    ############################
    result_dir = './gan_mnist/' + gan_model + data + str(noise_factor)
    BATCH_SIZE = 64
    WORKERS = 2
    NGPU = 1
    Z_dim = 100
    X_dim = 784
    Img_dim = 28  # only referenced by the disabled imshow preview below
    LR = 0.0002
    N_EPOCHS = 200
    ###########################

    transform = transforms.Compose([transforms.ToTensor()])
    # transforms.Normalize([0.5], [0.5])])

    dataset_class = getattr(torchvision.datasets, data)
    trainset = dataset_class(root='./data', train=True,
                             download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                              shuffle=True, num_workers=WORKERS)

    # Decide which device we want to run on
    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and NGPU > 0) else "cpu")

    netG = Gen(NGPU).to(device)
    netD = Dis(NGPU).to(device)

    # Handle multi-gpu if desired
    if (device.type == 'cuda') and (NGPU > 1):
        netG = nn.DataParallel(netG, list(range(NGPU)))
        netD = nn.DataParallel(netD, list(range(NGPU)))

    print(netG)
    print(netD)
    print(device)

    # D outputs logits; the sigmoid is applied only for the reported stats.
    criterion = nn.BCEWithLogitsLoss()
    sig = nn.Sigmoid()

    # Create batch of latent vectors that we will use to visualize
    # the result of the generator
    fixed_noise = torch.randn(64, Z_dim, device=device)

    # Establish convention for real and fake labels
    real_label = 1
    fake_label = 0

    # Setup Adam optimizers
    optimizerD = optim.Adam(netD.parameters(), lr=LR)
    optimizerG = optim.Adam(netG.parameters(), lr=LR)

    # Lists to keep track of progress
    G_losses = []
    D_losses = []

    # results save folder; makedirs also creates the missing parent
    # './gan_mnist', which a plain os.mkdir would fail on
    os.makedirs(result_dir, exist_ok=True)

    print("Starting Training Loop...")
    # For each epoch
    for epoch in range(N_EPOCHS):
        # For each batch (named `batch` so the `data` argument is not shadowed)
        for i, batch in enumerate(trainloader):
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ###########################
            ## Train with all-real batch
            netD.zero_grad()
            # Format batch
            real = batch[0].to(device)
            b_size = real.size(0)
            # real_label is an int, so without an explicit dtype torch.full
            # infers an integer tensor, which BCEWithLogitsLoss rejects as a
            # target for float logits.
            label = torch.full((b_size, ), real_label,
                               dtype=torch.float, device=device)
            # Forward pass real batch through D
            real = real.view(-1, X_dim)
            real = salt_and_pepper(real, device, p=noise_factor).to(device)
            output = netD(real).view(-1)
            # Calculate loss on all-real batch
            errD_real = criterion(output, label)
            # Calculate gradients for D in backward pass
            errD_real.backward()
            output = sig(output)
            D_x = output.mean().item()

            ## Train with all-fake batch
            # Generate batch of latent vectors
            noise = torch.randn(b_size, Z_dim, device=device)
            # Generate fake image batch with G
            fake = netG(noise)
            label.fill_(fake_label)
            # Classify all fake batch with D
            # Detach to avoid training G on these labels (&save time)
            output = netD(fake.detach()).view(-1)
            # Calculate D's loss on the all-fake batch
            errD_fake = criterion(output, label)
            # Calculate the gradients for this batch
            errD_fake.backward()
            output = sig(output)
            D_G_z1 = output.mean().item()
            # Add the gradients from the all-real and all-fake batches
            errD = errD_real + errD_fake
            # Update D
            optimizerD.step()

            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            netG.zero_grad()
            label.fill_(real_label)  # fake labels are real for generator cost
            # Since we just updated D, perform another forward pass of
            # all-fake batch through D
            output = netD(fake).view(-1)
            # Calculate G's loss based on this output
            errG = criterion(output, label)
            # Calculate gradients for G
            errG.backward()
            output = sig(output)
            D_G_z2 = output.mean().item()
            # Update G
            optimizerG.step()

            # Output training stats
            if i % 1000 == 0:
                print(
                    f'[{epoch}/{N_EPOCHS}], {i}, {len(trainloader)}, Loss_D: {errD.item()}, '
                    f'Loss_G: {errG.item()}, D(x): {D_x}, D(G(z)): {D_G_z1}/{D_G_z2}'
                )

            # Save Losses for plotting later
            G_losses.append(errG.item())
            D_losses.append(errD.item())

            # Check how the generator is doing by saving G's output on
            # fixed_noise.
            # NOTE(review): this fires once per epoch, at the batch whose
            # index equals len(trainloader) % 10 -- confirm that this is the
            # intended trigger.
            if i == len(trainloader) % 10:
                # if epoch == N_EPOCHS-1 and i == len(trainloader)-1:
                with torch.no_grad():
                    fake = netG(fixed_noise).detach().cpu()
                np.save(result_dir + '/' + str(epoch), fake.numpy())
                showloss(G_losses, D_losses, result_dir)
                # imshow(torch.reshape(fake, (64, 1, Img_dim, Img_dim)), result_dir)

    # result for FID; no_grad avoids building an autograd graph for the
    # 10000-sample forward pass
    z_ = torch.randn(10000, Z_dim, device=device)  # 10000
    with torch.no_grad():
        z_fid = netG(z_).detach().cpu()
    np.save(result_dir + '/result4FID', z_fid.numpy())
def dcgan(dat, netG, netD, args):
    """Train ``netG`` / ``netD`` with the standard DCGAN objective.

    Args:
        dat: Mapping whose ``'X_train'`` entry is the tensor of training
            images; it is moved to ``args.device`` as a whole.
        netG: Generator, called with noise of shape
            ``(batch, args.nz, 1, 1)``.
        netD: Discriminator, called with a batch of images.
        args: Namespace providing ``device``, ``num_gen_images``, ``nz``,
            ``lrD``, ``lrG``, ``beta1``, ``epochs``, ``batchSize``, ``log``,
            ``save_imgs_every``, ``save_ckpt_every``, ``results_folder`` and
            ``dataset``.

    Side effects:
        Prints progress, and periodically writes sample images and
        generator checkpoints into ``args.results_folder``.
    """
    device = args.device
    # Move everything to the *configured* device. Calling .cuda() whenever
    # CUDA happens to be available put the models on the default GPU even
    # when args.device named the CPU or another GPU, while the data below
    # always follows args.device.
    netG.to(device)
    netD.to(device)
    criterion.to(device)
    criterion_mse.to(device)

    X_training = dat['X_train'].to(device)
    fixed_noise = torch.randn(args.num_gen_images, args.nz, 1, 1, device=device)
    optimizerD = optim.Adam(netD.parameters(), lr=args.lrD, betas=(args.beta1, 0.999))
    optimizerG = optim.Adam(netG.parameters(), lr=args.lrG, betas=(args.beta1, 0.999))

    for epoch in range(1, args.epochs + 1):
        for i in range(0, len(X_training), args.batchSize):
            # (1) Update D network: real batch, then fake batch
            netD.zero_grad()
            # slicing clamps at the end of the data, so the last batch may
            # simply be shorter
            real_cpu = X_training[i:i + args.batchSize].to(device)

            batch_size = real_cpu.size(0)
            # Explicit float dtype: with an integer label constant torch.full
            # would otherwise infer an integer tensor, which the criterion
            # cannot use as a target for float outputs.
            label = torch.full((batch_size, ), real_label,
                               dtype=torch.float, device=device)

            output = netD(real_cpu)
            errD_real = criterion(output, label)
            errD_real.backward()
            D_x = output.mean().item()

            # train with fake
            noise = torch.randn(batch_size, args.nz, 1, 1, device=device)
            fake = netG(noise)
            label.fill_(fake_label)
            # detach() keeps this backward pass out of the generator
            output = netD(fake.detach())
            errD_fake = criterion(output, label)
            errD_fake.backward()
            D_G_z1 = output.mean().item()
            errD = errD_real + errD_fake
            optimizerD.step()

            # (2) Update G network: maximize log(D(G(z)))
            netG.zero_grad()
            label.fill_(real_label)
            output = netD(fake)
            errG = criterion(output, label)
            errG.backward()
            D_G_z2 = output.mean().item()
            optimizerG.step()

            ## log performance
            if i % args.log == 0:
                print(
                    'Epoch [%d/%d] .. Batch [%d/%d] .. Loss_D: %.4f .. Loss_G: %.4f .. D(x): %.4f .. D(G(z)): %.4f / %.4f'
                    % (epoch, args.epochs, i, len(X_training), errD.item(),
                       errG.item(), D_x, D_G_z1, D_G_z2))

        print('*' * 100)
        print('End of epoch {}'.format(epoch))
        print('*' * 100)

        if epoch % args.save_imgs_every == 0:
            # sampling only: no autograd graph needed
            with torch.no_grad():
                fake = netG(fixed_noise).detach()
            vutils.save_image(fake,
                              '%s/dcgan_%s_fake_epoch_%03d.png' %
                              (args.results_folder, args.dataset, epoch),
                              normalize=True,
                              nrow=20)

        if epoch % args.save_ckpt_every == 0:
            torch.save(
                netG.state_dict(),
                os.path.join(
                    args.results_folder,
                    'netG_dcgan_%s_epoch_%s.pth' % (args.dataset, epoch)))
def beam_search(
        decoder: Decoder,
        size: int,
        bos_index: int, eos_index: int, pad_index: int,
        encoder_output: Tensor, encoder_hidden: Tensor,
        src_mask: Tensor, max_output_length: int, alpha: float,
        embed: Embeddings, n_best: int = 1) -> (np.array, np.array):
    """
    Beam search with size k.
    Inspired by OpenNMT-py, adapted for Transformer.

    In each decoding step, find the k most likely partial hypotheses.
    A batch element is considered done as soon as its top beam is finished
    (or the maximum output length is reached); it is then removed from the
    set of hypotheses that are decoded further.

    :param decoder: decoder module, called once per step
    :param size: size of the beam
    :param bos_index: index every hypothesis starts with
    :param eos_index: index that marks a hypothesis as finished
    :param pad_index: value used to pad the stacked output
    :param encoder_output: encoder states, tiled `size` times along dim 0
    :param encoder_hidden: passed to the decoder (and used to initialize
        the hidden state of recurrent decoders)
    :param src_mask: source mask, tiled `size` times along dim 0
    :param max_output_length: maximum number of decoding steps
    :param alpha: `alpha` factor for length penalty (disabled if <= -1)
    :param embed: target embedding applied to the decoder input
    :param n_best: return this many hypotheses, <= beam (currently only 1)
    :return:
        - stacked_output: output hypotheses (2d array of indices),
        - stacked_attention_scores: attention scores (currently always None)
    """
    assert size > 0, 'Beam size must be >0.'
    assert n_best <= size, 'Can only return {} best hypotheses.'.format(size)

    # init
    transformer = isinstance(decoder, TransformerDecoder)
    batch_size = src_mask.size(0)
    att_vectors = None  # not used for Transformer

    # Recurrent models only: initialize RNN hidden state
    # pylint: disable=protected-access
    if not transformer:
        hidden = decoder._init_hidden(encoder_hidden)
    else:
        hidden = None

    # tile encoder states and decoder initial states beam_size times
    if hidden is not None:
        hidden = tile(hidden, size, dim=1)  # layers x batch*k x dec_hidden_size

    encoder_output = tile(encoder_output.contiguous(), size,
                          dim=0)  # batch*k x src_len x enc_hidden_size
    src_mask = tile(src_mask, size, dim=0)  # batch*k x 1 x src_len

    # Transformer only: create target mask
    if transformer:
        trg_mask = src_mask.new_ones([1, 1, 1])  # transformer only
    else:
        trg_mask = None

    # numbering elements in the batch
    batch_offset = torch.arange(
        batch_size, dtype=torch.long, device=encoder_output.device)

    # numbering elements in the extended batch, i.e. beam size copies of each
    # batch element
    beam_offset = torch.arange(
        0,
        batch_size * size,
        step=size,
        dtype=torch.long,
        device=encoder_output.device)

    # keeps track of the top beam size hypotheses to expand for each element
    # in the batch to be further decoded (that are still "alive")
    alive_seq = torch.full(
        [batch_size * size, 1],
        bos_index,
        dtype=torch.long,
        device=encoder_output.device)

    # Give full probability to the first beam on the first step.
    topk_log_probs = torch.zeros(batch_size, size, device=encoder_output.device)
    topk_log_probs[:, 1:] = float("-inf")

    # Structure that holds finished hypotheses.
    hypotheses = [[] for _ in range(batch_size)]

    results = {
        "predictions": [[] for _ in range(batch_size)],
        "scores": [[] for _ in range(batch_size)],
        "gold_score": [0] * batch_size,
    }

    for step in range(max_output_length):

        # This decides which part of the predicted sentence we feed to the
        # decoder to make the next prediction.
        # For Transformer, we feed the complete predicted sentence so far.
        # For Recurrent models, only feed the previous target word prediction
        if transformer:  # Transformer
            decoder_input = alive_seq  # complete prediction so far
        else:  # Recurrent
            decoder_input = alive_seq[:, -1].view(-1, 1)  # only the last word

        # expand current hypotheses
        # decode one single step
        # logits: logits for final softmax
        # pylint: disable=unused-variable
        trg_embed = embed(decoder_input)
        logits, hidden, att_scores, att_vectors = decoder(
            encoder_output=encoder_output,
            encoder_hidden=encoder_hidden,
            src_mask=src_mask,
            trg_embed=trg_embed,
            hidden=hidden,
            prev_att_vector=att_vectors,
            unroll_steps=1,
            trg_mask=trg_mask  # subsequent mask for Transformer only
        )

        # For the Transformer we made predictions for all time steps up to
        # this point, so we only want to know about the last time step.
        if transformer:
            logits = logits[:, -1]  # keep only the last time step
            hidden = None  # we don't need to keep it for transformer

        # batch*k x trg_vocab
        log_probs = F.log_softmax(logits, dim=-1).squeeze(1)

        # multiply probs by the beam probability (=add logprobs)
        log_probs += topk_log_probs.view(-1).unsqueeze(1)
        curr_scores = log_probs.clone()

        # compute length penalty
        if alpha > -1:
            length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha
            curr_scores /= length_penalty

        # flatten log_probs into a list of possibilities
        curr_scores = curr_scores.reshape(-1, size * decoder.output_size)

        # pick currently best top k hypotheses (flattened order)
        topk_scores, topk_ids = curr_scores.topk(size, dim=-1)

        if alpha > -1:
            # recover original log probs
            topk_log_probs = topk_scores * length_penalty
        else:
            topk_log_probs = topk_scores.clone()

        # reconstruct beam origin and true word ids from flattened order.
        # The beam index must stay an integer tensor because it is used for
        # index_select below; a plain `div` performs true division on
        # current PyTorch and would return floats.
        topk_beam_index = topk_ids.div(decoder.output_size,
                                       rounding_mode="floor")
        topk_ids = topk_ids.fmod(decoder.output_size)

        # map beam_index to batch_index in the flat representation
        batch_index = (
            topk_beam_index
            + beam_offset[:topk_beam_index.size(0)].unsqueeze(1))
        select_indices = batch_index.view(-1)

        # append latest prediction
        alive_seq = torch.cat(
            [alive_seq.index_select(0, select_indices),
             topk_ids.view(-1, 1)], -1)  # batch_size*k x hyp_len

        is_finished = topk_ids.eq(eos_index)
        if step + 1 == max_output_length:
            # out of steps: force every hypothesis to count as finished
            is_finished.fill_(True)
        # end condition is whether the top beam is finished
        end_condition = is_finished[:, 0].eq(True)

        # save finished hypotheses
        if is_finished.any():
            predictions = alive_seq.view(-1, size, alive_seq.size(-1))
            for i in range(is_finished.size(0)):
                # index of this element in the original batch
                b = batch_offset[i]
                if end_condition[i]:
                    is_finished[i].fill_(1)
                finished_hyp = is_finished[i].nonzero().view(-1)
                # store finished hypotheses for this batch
                for j in finished_hyp:
                    # Check if the prediction has more than one EOS.
                    # If it has more than one EOS, it means that the
                    # prediction should have already been added to
                    # the hypotheses, so you don't have to add them again.
                    if (predictions[i, j, 1:] == eos_index).nonzero().numel() \
                            < 2:
                        # ignore start_token
                        hypotheses[b].append(
                            (topk_scores[i, j], predictions[i, j, 1:])
                        )
                # if the batch reached the end, save the n_best hypotheses
                if end_condition[i]:
                    best_hyp = sorted(
                        hypotheses[b], key=lambda x: x[0], reverse=True)
                    for n, (score, pred) in enumerate(best_hyp):
                        if n >= n_best:
                            break
                        results["scores"][b].append(score)
                        results["predictions"][b].append(pred)
            non_finished = end_condition.eq(False).nonzero().view(-1)
            # if all sentences are translated, no need to go further
            # pylint: disable=len-as-condition
            if len(non_finished) == 0:
                break
            # remove finished batches for the next step
            topk_log_probs = topk_log_probs.index_select(0, non_finished)
            batch_index = batch_index.index_select(0, non_finished)
            batch_offset = batch_offset.index_select(0, non_finished)
            alive_seq = predictions.index_select(0, non_finished) \
                .view(-1, alive_seq.size(-1))

        # reorder indices, outputs and masks
        select_indices = batch_index.view(-1)
        encoder_output = encoder_output.index_select(0, select_indices)
        src_mask = src_mask.index_select(0, select_indices)

        if hidden is not None and not transformer:
            if isinstance(hidden, tuple):
                # for LSTMs, states are tuples of tensors
                h, c = hidden
                h = h.index_select(1, select_indices)
                c = c.index_select(1, select_indices)
                hidden = (h, c)
            else:
                # for GRUs, states are single tensors
                hidden = hidden.index_select(1, select_indices)

        if att_vectors is not None:
            att_vectors = att_vectors.index_select(0, select_indices)

    def pad_and_stack_hyps(hyps, pad_value):
        """Stack variable-length 1d arrays into one array padded with pad_value."""
        max_len = max(h.shape[0] for h in hyps)
        filled = np.full((len(hyps), max_len), pad_value, dtype=int)
        for row, hyp in enumerate(hyps):
            filled[row, :hyp.shape[0]] = hyp
        return filled

    # from results to stacked outputs
    assert n_best == 1
    # only works for n_best=1 for now
    final_outputs = pad_and_stack_hyps([r[0].cpu().numpy() for r in
                                        results["predictions"]],
                                       pad_value=pad_index)

    return final_outputs, None
def presgan(dat, netG, netD, log_sigma, args):
    """Train a class-conditional generator/discriminator pair with learned noise.

    Each step adds ``softplus(log_sigma)``-scaled Gaussian noise to real and
    generated images, updates the discriminator, then updates the generator
    and ``log_sigma``.  When ``args.lambda_`` is non-zero an extra entropy
    term is built from samples returned by ``hmc.get_samples``.

    Args:
        dat: mapping with ``'X_train'`` (images) and ``'Y_train'`` (integer
            class labels) tensors.
        netG: generator, called as ``netG(noise, one_hot_labels)``.
        netD: discriminator, called as ``netD(images, one_hot_labels)``.
        log_sigma: tensor optimised together with the generator; it is viewed
            as ``(1, 1, args.imageSize, args.imageSize)`` after softplus.
        args: namespace of hyper-parameters and paths (``device``,
            ``batchSize``, ``epochs``, ``nz``, ``n_classes``, ``lambda_``,
            ``results_folder``, ...).

    Side effects:
        Mutates ``X_train`` / ``Y_train`` slices in place (samples whose label
        is 2 are overwritten by a neighbouring sample), appends to
        ``<results_folder>/log.csv``, writes TensorBoard scalars, sample
        images and checkpoints.

    Relies on module-level ``criterion``, ``criterion_mse``, ``real_label``,
    ``fake_label`` and ``hmc``.
    """
    writer = SummaryWriter(log_dir='tensorboard' + args.dataset)
    device = args.device
    if torch.cuda.is_available():
        print("cuda")
        netG.cuda()
        netD.cuda()
        criterion.cuda()
        criterion_mse.cuda()

    X_training = dat['X_train'].to(device)
    fixed_noise = torch.randn(args.num_gen_images, args.nz, 1, 1, device=device)
    torch.manual_seed(123)
    Y_training = dat['Y_train'].to(device)
    NUM_CLASS = args.n_classes

    optimizerD = optim.Adam(netD.parameters(), lr=args.lrD, betas=(args.beta1, 0.999))
    optimizerG = optim.Adam(netG.parameters(), lr=args.lrG, betas=(args.beta1, 0.999))
    sigma_optimizer = optim.Adam([log_sigma], lr=args.sigma_lr, betas=(args.beta1, 0.999))
    if args.restrict_sigma:
        # Bounds are expressed in softplus-inverse space so they can be
        # applied directly to log_sigma.
        logsigma_min = math.log(math.exp(args.sigma_min) - 1.0)
        logsigma_max = math.log(math.exp(args.sigma_max) - 1.0)

    stepsize = args.stepsize_num / args.nz
    # NOTE(review): if dat['Y_train'] already lives on `device`, this is the
    # same tensor as Y_training and the relabelling below is visible through
    # both names; otherwise they are independent copies -- confirm intent.
    Y_forY_training = dat['Y_train'].to(device)
    # Configured batch size.  It must stay constant for the whole run, so the
    # HMC branch below uses its own local names instead of rebinding it.
    bsz = args.batchSize

    for epoch in range(1, args.epochs + 1):
        for i in range(0, len(X_training), bsz):
            stop = min(bsz, len(X_training[i:]))
            real_cpu = X_training[i:i + stop].to(device)
            y_real_cpu = Y_forY_training[i:i + stop].to(device)

            # Replace every sample labelled 2 by its predecessor in the batch
            # (or by its successor when it is the first sample).
            for k in range(len(y_real_cpu)):
                if y_real_cpu[k] != 2:
                    continue
                src = k - 1 if k > 0 else k + 1
                if src >= len(y_real_cpu):
                    # Single-sample batch: there is no neighbour to copy.
                    continue
                y_real_cpu[k] = y_real_cpu[src]
                real_cpu[k, :] = real_cpu[src, :]
            X_training[i:i + stop] = real_cpu
            Y_forY_training[i:i + stop] = y_real_cpu

            sigma_x = F.softplus(log_sigma).view(1, 1, args.imageSize, args.imageSize)

            netD.zero_grad()
            batch_size = real_cpu.size(0)
            # NOTE(review): dtype is inferred from the module-level
            # real_label; confirm it matches what `criterion` expects.
            labelv = torch.full((batch_size, ), real_label).to(device)

            # Train discriminator on real (noised) data and real labels.
            y_labels = Y_training[i:i + stop].to(device)
            y_one_hot = torch.FloatTensor(batch_size, NUM_CLASS).to(device)
            y_one_hot = y_one_hot.zero_().scatter_(
                1, y_labels.view(batch_size, 1), 1).to(device)

            noise_eta = torch.randn_like(real_cpu).to(device)
            noised_data = real_cpu + sigma_x.detach() * noise_eta
            out_real = netD(noised_data, y_one_hot)
            errD_real = criterion(out_real, labelv)
            errD_real.backward()
            D_x = out_real.mean().item()

            # Generate images from random labels and let the discriminator
            # classify them as fake.
            rand_y_one_hot = torch.FloatTensor(batch_size, NUM_CLASS).zero_().to(device)
            rand_y_one_hot.scatter_(
                1,
                torch.randint(0, NUM_CLASS, size=(batch_size, 1), device=device),
                1)

            noise = torch.randn(batch_size, args.nz, 1, 1, device=device)
            mu_fake = netG(noise, rand_y_one_hot)
            fake = mu_fake + sigma_x * noise_eta
            labelv = labelv.fill_(fake_label).to(device)
            out_fake = netD(fake.detach(), rand_y_one_hot)
            errD_fake = criterion(out_fake, labelv)
            errD_fake.backward()
            D_G_z1 = out_fake.mean().item()
            errD = errD_real + errD_fake
            optimizerD.step()

            # Update G network: maximize log(D(G(z))).
            netG.zero_grad()
            sigma_optimizer.zero_grad()

            rand_y_one_hot = torch.FloatTensor(batch_size, NUM_CLASS).zero_().to(device)
            rand_y_one_hot = rand_y_one_hot.scatter_(
                1,
                torch.randint(0, NUM_CLASS, size=(batch_size, 1), device=device),
                1).to(device)

            labelv = labelv.fill_(real_label).to(device)
            gen_input = torch.randn(batch_size, args.nz, 1, 1, device=device)
            out = netG(gen_input, rand_y_one_hot)
            noise_eta = torch.randn_like(out)
            g_fake_data = out + noise_eta * sigma_x

            dg_fake_decision = netD(g_fake_data, rand_y_one_hot)
            g_error_gan = criterion(dg_fake_decision, labelv)
            D_G_z2 = dg_fake_decision.mean().item()

            if args.lambda_ == 0:
                g_error_gan.backward()
                optimizerG.step()
                sigma_optimizer.step()
            else:
                hmc_samples, hmc_labels, acceptRate, stepsize = hmc.get_samples(
                    netG, g_fake_data.detach(), rand_y_one_hot.detach(),
                    gen_input.clone(), sigma_x.detach(), args.burn_in,
                    args.num_samples_posterior, args.leapfrog_steps, stepsize,
                    args.flag_adapt, args.hmc_learning_rate, args.hmc_opt_accept)

                num_hmc, latent_dim = hmc_samples.size()
                hmc_samples = hmc_samples.view(num_hmc, latent_dim, 1, 1).to(device)
                hmc_labels = hmc_labels.to(device)
                mean_output = netG(hmc_samples, hmc_labels)

                # Average the generator output over the posterior samples;
                # they are stacked along dim 0 in chunks of the batch size.
                fake_bsz = g_fake_data.size(0)
                mean_output_summed = torch.zeros_like(g_fake_data).to(device)
                for cnt in range(args.num_samples_posterior):
                    mean_output_summed = mean_output_summed + mean_output[
                        cnt * fake_bsz:(cnt + 1) * fake_bsz]
                mean_output_summed = mean_output_summed / args.num_samples_posterior

                c = ((g_fake_data - mean_output_summed) / sigma_x**2).detach()
                g_error_entropy = torch.mul(c, out + sigma_x * noise_eta).mean(0).sum()

                g_error = g_error_gan - args.lambda_ * g_error_entropy
                g_error.backward()
                optimizerG.step()
                sigma_optimizer.step()

            if args.restrict_sigma:
                log_sigma.data.clamp_(min=logsigma_min, max=logsigma_max)

            # Log performance.
            if i % args.log == 0:
                print(
                    'Epoch [%d/%d] .. Batch [%d/%d] .. Loss_D: %.4f .. Loss_G: %.4f .. D(x): %.4f .. D(G(z)): %.4f / %.4f'
                    % (epoch, args.epochs, i, len(X_training), errD.data,
                       g_error_gan.data, D_x, D_G_z1, D_G_z2))
                # newline='' lets the csv module control line endings.
                with open('%s/log.csv' % args.results_folder, 'a', newline='') as f:
                    r = csv.writer(f)
                    # Columns: Loss_G, Loss_D, D(x), D(G(z))
                    r.writerow([g_error_gan.data, errD.data, D_x, D_G_z2])

            if i % (2 * args.log) == 0:
                t_iter = (epoch * len(X_training) + i) / bsz
                writer.add_scalar('Loss_G', g_error_gan.data, t_iter)
                writer.add_scalar('Loss_D', errD.data, t_iter)
                writer.add_scalar('D(x)', D_x, t_iter)
                writer.add_scalar('D(G(z))', D_G_z2, t_iter)

        print('*' * 100)
        print('End of epoch {}'.format(epoch))
        print('sigma min: {} .. sigma max: {}'.format(torch.min(sigma_x), torch.max(sigma_x)))
        print('*' * 100)
        if args.lambda_ > 0:
            print(
                '| MCMC diagnostics ====> | stepsize: {} | min ar: {} | mean ar: {} | max ar: {} |'
                .format(stepsize,
                        acceptRate.min().item(),
                        acceptRate.mean().item(),
                        acceptRate.max().item()))

        if epoch % args.save_imgs_every == 0:
            rand_y_one_hot = torch.FloatTensor(args.num_gen_images, NUM_CLASS).zero_().to(device)
            rand_y_one_hot = rand_y_one_hot.scatter_(
                1,
                torch.randint(0, NUM_CLASS, size=(args.num_gen_images, 1), device=device),
                1).to(device)
            fake = netG(fixed_noise, rand_y_one_hot).detach()
            vutils.save_image(fake,
                              '%s/presgan_%s_fake_epoch_%03d.png' %
                              (args.results_folder, args.dataset, epoch),
                              normalize=True,
                              nrow=20)

        if epoch % args.save_ckpt_every == 0:
            torch.save(
                netG.state_dict(),
                os.path.join(
                    args.results_folder,
                    'netG_presgan_%s_epoch_%s.pth' % (args.dataset, epoch)))
            torch.save(
                log_sigma,
                os.path.join(args.results_folder,
                             'log_sigma_%s_%s.pth' % (args.dataset, epoch)))
            torch.save(
                netD.state_dict(),
                os.path.join(
                    args.results_folder,
                    'netD_presgan_%s_epoch_%s.pth' % (args.dataset, epoch)))

    # Flush pending TensorBoard events and release the file handle.
    writer.close()
def learn(self, batch, max_episode_len, train_step):
    """Run one optimisation step on a batch of whole episodes.

    The sampled data is four-dimensional: (1) episode index, (2) transition
    index inside the episode, (3) agent index, (4) the observation features.
    Action selection needs the recurrent hidden state in addition to the
    current inputs, and the hidden state depends on earlier experience, so
    transitions cannot be sampled independently.  Instead several episodes
    are drawn at once and the transitions at the same position of every
    episode are fed to the network together.

    Every value of ``batch`` is replaced in place by a tensor on
    ``self.args.device``.

    :param batch: dict of per-episode arrays (keys used here: ``o``, ``s``,
        ``next_s``, ``a``, ``r``, ``avail_a``, ``next_avail_a``, ``done``,
        ``padded``)
    :param max_episode_len: forwarded to ``self.get_q``
    :param train_step: training step counter; controls target-network syncs
    :return: None
    """
    # Number of episodes in the batch; used to size the hidden states.
    episode_num = batch['o'].shape[0]
    self.init_hidden(episode_num)

    # Convert everything to tensors; actions are indices, hence long.
    for key in batch.keys():
        wanted_dtype = torch.long if key == 'a' else torch.float
        batch[key] = torch.as_tensor(batch[key],
                                     dtype=wanted_dtype,
                                     device=self.args.device)

    field_names = ('s', 'next_s', 'a', 'r', 'avail_a', 'next_avail_a', 'done')
    s, next_s, a, r, avail_a, next_avail_a, done = (
        batch[name] for name in field_names)

    # Padded transitions must not contribute any TD-error.
    mask = 1 - batch["padded"].float()

    # Q values for the current and next state,
    # (episode, max_episode_len, n_agents, n_actions).
    eval_qs, target_qs = self.get_q(batch, episode_num, max_episode_len)

    # Q value of the action that was actually taken.
    eval_qsa = torch.gather(eval_qs, dim=3, index=a).squeeze(3)

    # Compute Q_tot.
    if self.args.alg == 'qatten':
        eval_q_total, q_attend_regs, head_entropies = self.eval_mix_net(
            eval_qsa, s, a)
    else:
        eval_q_total = self.eval_mix_net(eval_qsa, s)
        qstar_q_total, qstar_loss, q_attend_regs = None, None, None

    # Mask out unavailable actions before taking the max.  A large negative
    # constant is used because -inf produced NaNs.
    target_qs[next_avail_a == 0.0] = -9999999
    target_qsa = target_qs.max(dim=3)[0]

    if self.wqmix > 0:
        # TODO: find the joint action maximising Q_tot.  QMIX is monotonic,
        # so maximising each agent's Q maximises Q_tot; the joint action is
        # therefore each agent's greedy action.
        argmax_u = target_qs.argmax(dim=3).unsqueeze(3)
        qstar_eval_qs, qstar_target_qs = self.get_q(
            batch, episode_num, max_episode_len, True)
        # Pick the Q values of the relevant actions.
        qstar_eval_qs = torch.gather(qstar_eval_qs, dim=3,
                                     index=a).squeeze(3)
        qstar_target_qs = torch.gather(qstar_target_qs, dim=3,
                                       index=argmax_u).squeeze(3)
        # Feed-forward mixers produce Q*.
        qstar_q_total = self.qstar_eval_mix(qstar_eval_qs, s)
        next_q_total = self.qstar_target_mix(qstar_target_qs, next_s)
    elif self.args.alg == 'qatten':
        target_next_actions = target_qs.max(dim=3)[1].unsqueeze(-1).detach()
        next_q_total, q_attend_regs, _ = self.target_mix_net(
            target_qsa, next_s, target_next_actions)
    else:
        # Target Q_tot.
        next_q_total = self.target_mix_net(target_qsa, next_s)

    target_q_total = r + self.args.gamma * next_q_total * (1 - done)

    if self.wqmix > 0:
        # Weighted QMIX: every entry starts at alpha, which keeps the
        # weights in (0, 1].
        # TODO: the paper describes the concrete parameter settings.
        weights = torch.full(eval_q_total.shape,
                             self.alpha,
                             device=self.args.device)
        if self.args.alg == 'cwqmix':
            reference_q = qstar_q_total
        elif self.args.alg == 'owqmix':
            reference_q = eval_q_total
        else:
            raise Exception("模型不存在")
        error = mask * (target_q_total - reference_q)
        weights[error > 0] = 1.

        # Loss for the Q* networks.
        qstar_error = mask * (qstar_q_total - target_q_total.detach())
        qstar_loss = (qstar_error**2).sum() / mask.sum()
    else:
        weights = torch.as_tensor(np.ones(eval_q_total.shape),
                                  dtype=torch.float,
                                  device=self.args.device)

    # TD error.
    # TODO: does detaching the weights make a difference here?
    td_error = mask * (eval_q_total - target_q_total.detach())
    loss = (weights.detach() * td_error**2).sum() / mask.sum()
    if self.args.alg == 'qatten':
        loss += q_attend_regs
    elif self.wqmix > 0:
        loss += qstar_loss

    self.optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(self.eval_params, self.args.clip_norm)
    self.optimizer.step()

    # Periodically copy the online networks into the target networks.
    sync_due = (train_step > 0
                and train_step % self.args.target_update_period == 0)
    if sync_due:
        self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
        self.target_mix_net.load_state_dict(self.eval_mix_net.state_dict())
        if self.wqmix > 0:
            self.qstar_target_rnn.load_state_dict(
                self.qstar_eval_rnn.state_dict())
            self.qstar_target_mix.load_state_dict(
                self.qstar_eval_mix.state_dict())
def full(shape, fill_value, dtype, ctx):
    """Return a tensor of ``shape`` filled with ``fill_value``.

    Thin adapter over ``th.full``: ``dtype`` is passed through unchanged and
    ``ctx`` is used as the ``device`` argument.
    """
    placement = {"dtype": dtype, "device": ctx}
    return th.full(shape, fill_value, **placement)
def aug_test_vote(self, imgs, img_metas, rescale=False):
    """Test-time augmentation with box voting.

    Runs the bbox head on every augmented view, filters each view's boxes
    with ``self.remove_boxes`` using ``self.test_cfg.scale_ranges``, merges
    the views, applies ``self.bboxes_vote`` per class and keeps at most 1000
    detections.

    Args:
        imgs: augmented inputs, forwarded to ``self.extract_feats``.
        img_metas: per-view meta information; ``img_metas[0][0]`` must carry
            ``'scale_factor'``.
        rescale: when False the boxes are multiplied by the first view's
            scale factor before conversion; when True they are returned as
            merged.

    Returns:
        The value of ``bbox2result`` for the final boxes and labels.
    """
    # Features are recomputed here to save memory.
    feats = self.extract_feats(imgs)

    per_view_bboxes, per_view_labels = [], []
    for view_idx, (x, img_meta) in enumerate(zip(feats, img_metas)):
        # Only one image in the batch.
        # TODO more flexible
        head_outs = self.bbox_head(x)
        head_args = head_outs + (img_meta, self.test_cfg, False, True)
        view_bboxes, view_labels = self.bbox_head.get_bboxes(*head_args)[0]
        # Consecutive pairs of views share one scale range.
        scale_range = self.test_cfg.scale_ranges[view_idx // 2]
        keeped = self.remove_boxes(view_bboxes, scale_range[0],
                                   scale_range[1])
        per_view_bboxes.append(view_bboxes[keeped, :])
        per_view_labels.append(view_labels[keeped])

    # After merging, bboxes are rescaled to the original image size.
    merged_bboxes, merged_labels = self.merge_aug_vote_results(
        per_view_bboxes, per_view_labels, img_metas)

    voted_bboxes, voted_labels = [], []
    for cls_id in range(80):
        inds = (merged_labels == cls_id).nonzero().squeeze(1)
        cls_scores_in = merged_bboxes[inds, 4]
        cls_boxes_in = merged_bboxes[inds, :4].view(-1, 4)
        cls_boxes, cls_scores = self.bboxes_vote(cls_boxes_in, cls_scores_in)
        if len(cls_boxes) == 0:
            continue
        voted_bboxes.append(
            torch.cat([cls_boxes, cls_scores[:, None]], dim=1))
        voted_labels.append(
            torch.full((cls_boxes.shape[0], ),
                       cls_id,
                       dtype=torch.int64,
                       device=cls_scores.device))

    if len(voted_bboxes) > 0:
        det_bboxes = torch.cat(voted_bboxes, dim=0)
        det_labels = torch.cat(voted_labels)
    else:
        det_bboxes = merged_bboxes.new_zeros((0, 5))
        det_labels = merged_bboxes.new_zeros((0, ), dtype=torch.long)

    # Keep the highest-scoring detections when there are too many.
    max_per_img = 1000
    num_dets = det_bboxes.shape[0]
    if num_dets > max_per_img:
        cls_scores = det_bboxes[:, 4]
        image_thresh, _ = torch.kthvalue(cls_scores.cpu(),
                                         num_dets - max_per_img + 1)
        keep = torch.nonzero(cls_scores >= image_thresh.item(),
                             as_tuple=False).squeeze(1)
        det_bboxes = det_bboxes[keep]
        det_labels = det_labels[keep]

    if rescale:
        _det_bboxes = det_bboxes
    else:
        _det_bboxes = det_bboxes.clone()
        _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor']

    return bbox2result(_det_bboxes, det_labels, self.bbox_head.num_classes)
def train(model, model2, dset_loaders, criterion, BCEcriterion, epoch, phase,
          optimizer, optimizer_Global, args, logger, use_gpu):
    """Train ``model`` for one epoch with an auxiliary paired/unpaired loss.

    For every batch three losses are back-propagated before both optimizers
    step: the classification loss ``criterion(outputs, targets)``, a BCE loss
    on ``model2(target_mi, outputs)`` against ones (paired samples) and a BCE
    loss on ``model2`` fed with the outputs rolled by two positions along the
    batch axis against zeros (unpaired samples).

    Args:
        model: network being trained; put into training mode here.
        model2: scorer called as ``model2(one_hot_targets, outputs)``.
        dset_loaders: mapping from phase name to a data loader that exposes
            ``.dataset``.
        criterion: classification loss.
        BCEcriterion: binary loss for the paired/unpaired scores.
        epoch: current epoch index (logging only).
        phase: key into ``dset_loaders``.
        optimizer: optimizer stepped after the backward passes.
        optimizer_Global: second optimizer stepped after the backward passes.
        args: namespace providing ``epochs`` and ``interval``.
        logger: logger used for the epoch summary.
        use_gpu: move every batch to CUDA when true.
    """
    model.train()
    logger.info('-' * 10)
    logger.info('Epoch {}/{}'.format(epoch, args.epochs - 1))
    logger.info('Current Learning rate: {}'.format(showLR(optimizer)))

    running_loss, running_corrects, global_loss, running_all = 0., 0., 0., 0.
    since = time.time()
    last_time_batch_idx = -1

    loader = dset_loaders[phase]
    num_batches = len(loader)
    for batch_idx, (inputs, targets) in enumerate(loader):
        batch_size = inputs.size(0)
        # Float fill values: an integer fill value yields an integer tensor
        # on current PyTorch, which a BCE criterion rejects as a target.
        label_real = torch.full((batch_size, ), 1.0)
        label_fake = torch.full((batch_size, ), 0.0)
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()
            label_fake = label_fake.cuda()
            label_real = label_real.cuda()
        # One-hot targets are later combined with the final representations
        # inside model2.  Built on both the CPU and the GPU path so that
        # `target_mi` is always defined.
        target_mi = make_one_hot_global(targets, 1000)

        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)

        optimizer.zero_grad()
        optimizer_Global.zero_grad()

        loss = criterion(outputs, targets)
        loss.backward()

        # Paired samples (global).
        info_real_output_global = model2(target_mi, outputs)
        loss_real_global = BCEcriterion(info_real_output_global.squeeze(),
                                        label_real)
        loss_real_global.backward(retain_graph=True)

        # Unpaired samples (global): outputs shifted by two positions.
        info_fake_output_global = model2(
            target_mi,
            torch.cat((outputs[2:, ...], outputs[0:2, ...]), dim=0))
        loss_fake_global = BCEcriterion(info_fake_output_global.squeeze(),
                                        label_fake)
        loss_fake_global.backward()

        optimizer.step()
        optimizer_Global.step()

        # Statistics.
        running_loss += loss.item() * batch_size
        batch_correct = (preds == targets.data).sum().item()
        running_corrects += batch_correct
        running_all += len(inputs)
        error_info_global = loss_real_global.item() + loss_fake_global.item()
        global_loss += error_info_global * batch_size

        if batch_idx % args.interval == 0 or (batch_idx == num_batches - 1):
            # max(..., 1) avoids dividing by zero for a single-batch loader.
            progress = 100. * batch_idx / max(num_batches - 1, 1)
            print(
                'Process: [{:5.0f}/{:5.0f} ({:.0f}%)]\tLoss batch: {:.4f}\tLoss total: {:.4f}\tAcc batch:{:.4f}\tAcc total:{:.4f}\tEstimated time:{:5.0f}s\r'
                .format(
                    running_all, len(loader.dataset), progress, float(loss),
                    float(running_loss) / running_all,
                    float(batch_correct) / len(inputs),
                    float(running_corrects) / running_all,
                    (time.time() - since) /
                    (batch_idx - last_time_batch_idx) *
                    (num_batches - batch_idx - 1)))
            last_time_batch_idx = batch_idx
            since = time.time()

    dataset_size = len(loader.dataset)
    loss_epoch = float(running_loss) / dataset_size
    acc_epoch = float(running_corrects) / dataset_size
    global_loss_epoch = float(global_loss) / dataset_size
    logger.info(
        '{} Epoch:\t{:2}\tLoss: {:.4f}\tAcc:{:.4f}\tglobal:{:.4f}\n'.format(
            phase, epoch, loss_epoch, acc_epoch, global_loss_epoch))
def test_draw_boxes_colors(colors):
    """Check that ``draw_bounding_boxes`` runs with the given ``colors``.

    Draws the module-level ``boxes`` onto an all-zero 3x100x100 uint8 image;
    the result is not inspected, only the absence of an exception matters.
    """
    canvas = torch.zeros((3, 100, 100), dtype=torch.uint8)
    utils.draw_bounding_boxes(canvas,
                              boxes,
                              fill=False,
                              width=7,
                              colors=colors)
def evaluate(self, input: RLEstimatorInput, **kwargs) -> EstimatorResults:
    """Estimate the target policy's value for every logged initial state.

    For each ``(state, mdps)`` entry of ``input.log`` this
      1. computes importance weights with ``self._calc_weights``,
      2. builds the cumulative discounted weighted estimates ``wdrs`` and
         the per-horizon partial returns ``gs``,
      3. derives a covariance matrix of ``gs`` plus a bias term obtained
         from bootstrap resampling (5th/95th percentile bounds), and
      4. minimises ``x @ cov @ x.T`` over softmax-normalised weights ``x``
         with SGD, reporting ``x @ gs.sum(0)`` as the estimate.

    Args:
        input: estimator input; ``value_function`` must not be None.
        **kwargs: optional ``num_resamples`` (default 200),
            ``loss_threhold`` (default 1e-5; the key spelling is part of the
            interface) and ``lr`` (default 1e-4).

    Returns:
        An ``EstimatorResults`` with one ``EstimatorResult`` per state.
    """
    assert input.value_function is not None
    logging.info(f"{self}: start evaluating")
    stime = time.process_time()
    results = EstimatorResults()

    num_resamples = kwargs.get("num_resamples", 200)
    loss_threhold = kwargs.get("loss_threhold", 0.00001)
    lr = kwargs.get("lr", 0.0001)
    logging.info(
        f"  params: num_resamples[{num_resamples}], "
        f"loss_threshold[{loss_threhold}], "
        f"lr[{lr}]"
    )

    for state, mdps in input.log.items():
        n = len(mdps)
        horizon = max(len(mdp) for mdp in mdps)
        ws = self._calc_weights(n, horizon, zip_longest(*mdps), input.target_policy)
        # Weights of the previous step; the first step uses uniform 1/n.
        last_ws = torch.zeros((n, horizon), device=self._device)
        last_ws[:, 0] = 1.0 / n
        last_ws[:, 1:] = ws[:, :-1]
        # discount[t] = gamma ** t
        discount = torch.full((horizon,), input.gamma, device=self._device)
        discount[0] = 1.0
        discount = discount.cumprod(0)

        rs = torch.zeros((n, horizon))
        vs = torch.zeros((n, horizon))
        qs = torch.zeros((n, horizon))
        # zip_longest pads shorter trajectories with None; those cells
        # keep their zero initialisation.
        for j, ts in enumerate(zip_longest(*mdps)):
            for i, t in enumerate(ts):
                if t is not None and t.action is not None:
                    qs[i, j] = input.value_function(t.last_state, t.action)
                    vs[i, j] = input.value_function(t.last_state)
                    rs[i, j] = t.reward
        vs = vs.to(device=self._device)
        qs = qs.to(device=self._device)
        rs = rs.to(device=self._device)

        wdrs = ((ws * (rs - qs) + last_ws * vs) * discount).cumsum(1)
        wdr = wdrs[:, -1].sum(0)
        next_vs = torch.zeros((n, horizon), device=self._device)
        next_vs[:, :-1] = vs[:, 1:]
        gs = wdrs + ws * next_vs * discount
        gs_normal = gs.sub(torch.mean(gs, 0))
        assert n > 1
        omiga = (n / (n - 1.0)) * torch.einsum("ij,ik->jk", gs_normal, gs_normal)

        # Bootstrap: resample trajectories with replacement.
        resample_wdrs = torch.zeros((num_resamples,))
        for k in range(num_resamples):
            samples = random.choices(range(n), k=n)
            sws = ws[samples, :]
            last_sws = last_ws[samples, :]
            srs = rs[samples, :]
            svs = vs[samples, :]
            sqs = qs[samples, :]
            resample_wdrs[k] = (
                ((sws * (srs - sqs) + last_sws * svs).sum(0) * discount)
                .sum()
                .item()
            )
        resample_wdrs, _ = resample_wdrs.to(device=self._device).sort(0)
        lb = torch.min(wdr, resample_wdrs[int(round(0.05 * num_resamples))])
        ub = torch.max(wdr, resample_wdrs[int(round(0.95 * num_resamples)) - 1])
        # Distance of each partial return from the [lb, ub] interval.
        b = torch.tensor(
            list(
                map(
                    lambda a: a - ub if a > ub else (a - lb if a < lb else 0.0),
                    # pyre-fixme[6]: Expected `Iterable[Variable[_T1]]` for 2nd
                    #  param but got `Tensor`.
                    gs.sum(0),
                )
            ),
            device=self._device,
        )
        b.unsqueeze_(0)
        bb = b * b.t()
        cov = omiga + bb

        # Minimise x @ cov @ x.T over softmax(logits) with SGD.  `logits`
        # stays the leaf tensor owned by the optimizer and the weights are
        # re-derived from it on every iteration; rebinding the leaf to its
        # own softmax would cut the optimizer out of the loop.
        logits = torch.zeros((1, horizon), device=self._device, requires_grad=True)
        optimizer = torch.optim.SGD([logits], lr=lr)
        last_y = 0.0
        for i in range(100):
            x = torch.nn.functional.softmax(logits, dim=1)
            y = torch.mm(torch.mm(x, cov), x.t())
            if abs(y.item() - last_y) < loss_threhold:
                print(f"{i}: {last_y} -> {y.item()}")
                break
            last_y = y.item()
            optimizer.zero_grad()
            # `cov` is built once outside the loop; retain the graph in
            # case it carries autograd history that is needed again.
            y.backward(retain_graph=True)
            optimizer.step()
        x = torch.nn.functional.softmax(logits, dim=1)
        estimate = torch.mm(x, gs.sum(0, keepdim=True).t())

        if input.ground_truth is not None:
            ground_truth = input.ground_truth(state)
        else:
            ground_truth = None
        results.append(
            EstimatorResult(
                self._log_reward(input.gamma, mdps), estimate, ground_truth
            )
        )

    logging.info(
        f"{self}: finishing evaluating["
        f"process_time={time.process_time() - stime}]"
    )
    return results
def find_top_rrpn_proposals(
    proposals,
    pred_objectness_logits,
    image_sizes,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_size,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals.
    Then, for each image, drop non-finite proposals, clip the boxes, remove
    small boxes, apply per-level NMS and keep the `post_nms_topk` highest
    scoring survivors.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        image_sizes (list[tuple]): sizes (h, w) for each image
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RRPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RRPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_size (float): minimum proposal box side length in pixels (absolute units wrt
            input images).
        training (bool): unused by this function; the selection is done per
            image regardless of its value. Kept so existing callers keep working.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, (proposals_i, logits_i) in enumerate(zip(proposals, pred_objectness_logits)):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 5

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i, ), level_id, dtype=torch.int64, device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = RotatedBoxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        # Per-image view of the level ids. Every filter applied to the boxes
        # below must be applied to it too, otherwise the three sequences
        # handed to NMS no longer line up.
        lvl = level_ids

        valid_mask = torch.isfinite(
            boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
        if not valid_mask.all():
            boxes = boxes[valid_mask]
            scores_per_img = scores_per_img[valid_mask]
            lvl = lvl[valid_mask]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_size)
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]

        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
def forward(self, s, nrows=None, ncols=None, exp=False, exp_alpha=20,
            dummy_row=False, dtype=torch.float32):
    """Alternately normalise ``s`` over its row and column index.

    Runs ``self.max_iter`` iterations; even iterations divide every entry
    by the sum over the row index (dim 1), odd iterations by the sum over
    the column index (dim 2).  Only the top-left ``nrows[b] x ncols[b]``
    block of each batch element takes part; everything outside it is set
    to zero.

    Args:
        s: tensor of shape ``(batch, rows, cols)`` or, for matrices stacked
            on a trailing axis, ``(batch, rows, cols, k)``.  It is not
            modified.
        nrows: per-batch number of valid rows; ``None`` means all rows.
        ncols: per-batch number of valid columns; ``None`` means all
            columns.
        exp: if true, ``s`` is replaced by ``exp(exp_alpha * s)`` at the
            start of every iteration.
        exp_alpha: scale used when ``exp`` is true.
        dummy_row: if true, pad ``s`` with ``cols - rows`` extra rows filled
            with ``self.epsilon`` before normalising and strip them from
            the result.  Requires ``nrows`` and ``ncols``.
        dtype: not used by this implementation.

    Returns:
        The normalised tensor with the same shape as the input ``s``.
    """
    batch_size = s.shape[0]

    if dummy_row:
        dummy_shape = list(s.shape)
        dummy_shape[1] = s.shape[2] - s.shape[1]
        s = torch.cat((s, torch.full(dummy_shape, 0.).to(s.device)), dim=1)
        new_nrows = ncols
        for b in range(batch_size):
            s[b, nrows[b]:new_nrows[b], :ncols[b]] = self.epsilon
        nrows = new_nrows

    num_rows, num_cols = s.shape[1], s.shape[2]
    row_norm_ones = torch.zeros(batch_size, num_rows, num_rows,
                                device=s.device)  # size: row x row
    col_norm_ones = torch.zeros(batch_size, num_cols, num_cols,
                                device=s.device)  # size: col x col
    # The valid block of each batch element does not change between
    # iterations, so the slices are built once.  Missing sizes default to
    # the full extent of the matching axis (rows -> dim 1, cols -> dim 2).
    valid_blocks = []
    for b in range(batch_size):
        row_slice = slice(0, nrows[b] if nrows is not None else num_rows)
        col_slice = slice(0, ncols[b] if ncols is not None else num_cols)
        row_norm_ones[b, row_slice, row_slice] = 1
        col_norm_ones[b, col_slice, col_slice] = 1
        valid_blocks.append((row_slice, col_slice))

    # for Sinkhorn stacked on last dimension
    if len(s.shape) == 4:
        row_norm_ones = row_norm_ones.unsqueeze(-1)
        col_norm_ones = col_norm_ones.unsqueeze(-1)

    # Out-of-place so the caller's tensor is left untouched.
    s = s + self.epsilon

    for i in range(self.max_iter):
        if exp:
            s = torch.exp(exp_alpha * s)
        if i % 2 == 1:
            # column norm: sum over the column index
            norm_sum = torch.sum(
                torch.mul(s.unsqueeze(3), col_norm_ones.unsqueeze(1)), dim=2)
        else:
            # row norm: sum over the row index
            norm_sum = torch.sum(
                torch.mul(row_norm_ones.unsqueeze(3), s.unsqueeze(1)), dim=2)

        # Reciprocal of the sums inside the valid block, zero elsewhere.
        tmp = torch.zeros_like(s)
        for b, (row_slice, col_slice) in enumerate(valid_blocks):
            tmp[b, row_slice, col_slice] = 1 / norm_sum[b, row_slice, col_slice]
        s = s * tmp

    if dummy_row and dummy_shape[1] > 0:
        s = s[:, :-dummy_shape[1]]

    return s