def plot_attention(self, source_inputs, target_inputs, dataflow):
    """Collect attention heat-maps for the first sentence pair of a batch.

    The source and target token lists are rebuilt through the fields named
    by ``dataflow[0]`` / ``dataflow[1]``.  If both fit within
    ``self.visual_limit`` tokens, ``self.attention_flag`` is cleared and
    ``self.attention_maps`` is replaced by a list of ``(name, figure)``
    pairs: for every layer and head, the encoder self-attention, the
    decoder self-attention and the decoder cross-attention, in that order.
    Otherwise nothing is modified.

    Args:
        source_inputs: batch handed to the source field's ``reverse``.
        target_inputs: batch handed to the target field's ``reverse``.
        dataflow: pair of keys into ``self.fields`` (source, target).
    """
    src_field = self.fields[dataflow[0]]
    trg_field = self.fields[dataflow[1]]

    # Only the first reversed sentence of each batch is visualised.
    src = ([src_field.init_token]
           + src_field.reverse(source_inputs)[0].split()
           + ['<eos>'])
    trg = [trg_field.init_token] + trg_field.reverse(target_inputs)[0].split()

    # Guard clause: skip sentences too long to plot.
    if len(src) > self.visual_limit or len(trg) > self.visual_limit:
        return

    self.attention_flag = False
    self.attention_maps = []

    for layer_idx in range(self.args.n_layers):
        enc_layer = self.encoder.layers[layer_idx]
        dec_layer = self.decoder.layers[layer_idx]
        for head_idx in range(self.args.n_heads):
            # (title template, column tokens, row tokens, attention block)
            panels = (
                ('Self-attention (Enc) L{}/H{}', src, src, enc_layer.selfattn),
                ('Self-attention (Dec) L{}/H{}', trg, trg, dec_layer.selfattn),
                ('Cross-attention L{}/H{}', src, trg, dec_layer.crossattn),
            )
            for title, col_tokens, row_tokens, attn_block in panels:
                weights = attn_block.layer.attention.p_attn[head_idx].detach()
                # Crop the stored map to the actual token counts.
                cropped = weights[:len(row_tokens), :len(col_tokens)]
                fig = visualize_attention(col_tokens, row_tokens, cropped)
                self.attention_maps.append(
                    (title.format(layer_idx, head_idx), fig))
def evaluate(model, load_path):
    """Load the best saved parameters into ``model`` and plot attention.

    Reads ``<load_path>/trained_params_best.npz``, copies every array into
    the matching parameter of the Blocks model built from ``model.cost``,
    then runs one training batch through a compiled function returning
    ``model.location`` and ``model.scale`` and calls ``visualize_attention``
    for the first ten items of that batch.

    Args:
        model: object exposing ``cost``, ``location`` and ``scale``.
        load_path: directory containing ``trained_params_best.npz``.

    Raises:
        AssertionError: if a saved array's shape differs from the shape of
            the parameter it is loaded into.
    """
    # The archive is binary: text mode breaks on Python 3 and on platforms
    # that translate newlines.  Arrays are read lazily from the open file,
    # so all lookups stay inside the ``with`` block.
    with open(load_path + '/trained_params_best.npz', 'rb') as params_file:
        loaded = np.load(params_file)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        for param_name, param in params_dicts.items():
            # '/f_6_.W' --> 'f_6_.W'
            saved_name = param_name[param_name.find('/') + 1:]
            saved_value = loaded[saved_name]  # read once, used twice
            assert param.get_value().shape == saved_value.shape, (
                'shape mismatch for parameter %s' % saved_name)
            param.set_value(saved_value)

    train_data_stream, valid_data_stream = get_cmv_v2_streams(100)
    # T x B x F  (layout as noted by the original author -- TODO confirm)
    # Built-in next() works on both Python 2 and Python 3 iterators.
    data = next(train_data_stream.get_epoch_iterator())
    cg = ComputationGraph(model.cost)
    attention_fn = theano.function(cg.inputs,
                                   [model.location, model.scale],
                                   on_unused_input='ignore',
                                   allow_input_downcast=True)
    res = attention_fn(data[1], data[0])
    for i in range(10):
        visualize_attention(data[0][:, i, :],
                            res[0][:, i, :],
                            res[1][:, i, :],
                            prefix=str(i))
def main():
    """Train the dual-source attention graph and log attention plots.

    For each epoch this runs ``g.num_batch`` training steps, prints the
    losses fetched by the last step, saves a checkpoint, and writes two
    attention images (one per source) to the supervisor's summary log.
    The images are built in a separate graph (``summary_g``) so that the
    training graph is not modified while the supervisor manages it.
    """
    g = DualSourceAttentionGraph()
    print("Training Graph loaded")
    summary_g = tf.Graph()
    with g.graph.as_default():
        # Training
        sv = tf.train.Supervisor(logdir=hp.logdir,
                                 save_model_secs=0,
                                 summary_op=g.merged)
        # Same fetch list every step; build it once.
        fetches = [
            g.train_op, g.mean_loss1, g.mean_loss2, g.mean_loss,
            g.alignment_history1, g.alignment_history2, g.x1, g.x2
        ]
        with sv.managed_session() as sess:
            for epoch in range(1, hp.num_epochs + 1):
                if sv.should_stop():
                    break
                print("epoch={}".format(epoch))
                for step in tqdm(range(g.num_batch),
                                 total=g.num_batch,
                                 ncols=70,
                                 leave=False,
                                 unit='b'):
                    (_, l1, l2, l, alignment_history1, alignment_history2,
                     x1, x2) = sess.run(fetches)
                # Values below come from the last batch of the epoch.
                print("mean_loss1={}, mean_loss2={}, mean_loss={}".format(
                    l1, l2, l))

                # Write checkpoint files at every epoch
                gs = sess.run(g.global_step)
                sv.saver.save(
                    sess,
                    hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))

                with summary_g.as_default():
                    with tf.Session(graph=summary_g) as summary_sess:
                        # x1[0] / x2[0] are decoded as raw int32 buffers;
                        # np.frombuffer is the supported replacement for
                        # the deprecated binary mode of np.fromstring.
                        visualize_attention(alignment_history1[0], [
                            idx2char_kana[idx] + ' '
                            for idx in np.frombuffer(x1[0], np.int32)
                        ])
                        plot1 = figure_to_tensor()
                        attention_image1 = tf.summary.image(
                            "attention1 %d" % gs, plot1)

                        # Second source: pair alignment_history2 with x2 so
                        # the plot shows that source's own attention instead
                        # of repeating alignment_history1.
                        visualize_attention(alignment_history2[0], [
                            idx2phone[idx]
                            for idx in np.frombuffer(x2[0], np.int32)
                        ])
                        plot2 = figure_to_tensor()
                        attention_image2 = tf.summary.image(
                            "attention2 %d" % gs, plot2)

                        merged = summary_sess.run(
                            tf.summary.merge(
                                [attention_image1, attention_image2]))
                        sv.summary_computed(sess, merged, gs)
def training_loop(train_dict, val_dict, idx_dict, encoder, decoder, criterion,
                  optimizer, opts):
    """Runs the main training loop; evaluates the model on the val set every epoch.

        * Prints training and val loss each epoch.
        * Prints qualitative translation results each epoch using TEST_SENTENCE
        * Saves an attention map for TEST_WORD_ATTN each epoch
        * Writes a checkpoint whenever the validation loss improves on the
          best value seen so far.

    Arguments:
        train_dict: The training word pairs, organized by source and target lengths.
        val_dict: The validation word pairs, organized by source and target lengths.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model (with or without attention) to generate output tokens.
        criterion: Used to compute the CrossEntropyLoss for each decoder output.
        optimizer: Implements a step rule to update the parameters of the encoder and decoder.
        opts: The command-line arguments.

    Returns:
        None. Results are written to ``opts.checkpoint_path`` and stdout.
    """
    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    best_val_loss = 1e6
    train_losses = []
    val_losses = []

    # The context manager closes the log even if training raises.
    log_path = os.path.join(opts.checkpoint_path, 'loss_log.txt')
    with open(log_path, 'w') as loss_log:
        for epoch in range(opts.nepochs):
            # Only the first parameter group's learning rate is decayed.
            optimizer.param_groups[0]['lr'] *= opts.lr_decay

            epoch_losses = []

            for key in train_dict:
                input_strings, target_strings = zip(*train_dict[key])
                input_tensors = [
                    torch.LongTensor(
                        utils.string_to_index_list(s, char_to_index, end_token))
                    for s in input_strings
                ]
                target_tensors = [
                    torch.LongTensor(
                        utils.string_to_index_list(s, char_to_index, end_token))
                    for s in target_strings
                ]

                num_tensors = len(input_tensors)
                num_batches = int(np.ceil(num_tensors / float(opts.batch_size)))

                for batch_idx in range(num_batches):
                    start = batch_idx * opts.batch_size
                    end = start + opts.batch_size

                    inputs = utils.to_var(
                        torch.stack(input_tensors[start:end]), opts.cuda)
                    targets = utils.to_var(
                        torch.stack(target_tensors[start:end]), opts.cuda)

                    # The slice may be short for the last batch, so read
                    # the actual batch size from the tensor.
                    BS = inputs.size(0)

                    encoder_annotations, encoder_hidden = encoder(inputs)

                    # The last hidden state of the encoder becomes the first
                    # hidden state of the decoder
                    decoder_hidden = encoder_hidden

                    # BS x 1 column filled with the start token
                    start_vector = torch.ones(BS).long().unsqueeze(1) * start_token
                    decoder_input = utils.to_var(start_vector, opts.cuda)

                    loss = 0.0

                    seq_len = targets.size(1)  # Gets seq_len from BS x seq_len

                    # One draw per batch: the whole sequence is either
                    # teacher-forced or free-running.
                    use_teacher_forcing = (
                        np.random.rand() < opts.teacher_forcing_ratio)

                    # Distinct loop variable: the batch index is not reused.
                    for t in range(seq_len):
                        decoder_output, decoder_hidden, attention_weights = decoder(
                            decoder_input, decoder_hidden, encoder_annotations)

                        current_target = targets[:, t]
                        # cross entropy between the decoder distribution and GT
                        loss += criterion(decoder_output, current_target)

                        if use_teacher_forcing:
                            # With teacher forcing, use the ground-truth token
                            # to condition the next step
                            decoder_input = current_target.unsqueeze(1)
                        else:
                            # Without teacher forcing, use the model's own
                            # predictions to condition the next step
                            ni = F.softmax(decoder_output, dim=1).data.max(1)[1]
                            decoder_input = utils.to_var(ni.unsqueeze(1),
                                                         opts.cuda)

                    loss /= float(seq_len)
                    epoch_losses.append(loss.item())

                    # Zero gradients
                    optimizer.zero_grad()

                    # Compute gradients
                    loss.backward()

                    # Update the parameters of the encoder and decoder
                    optimizer.step()

            train_loss = np.mean(epoch_losses)
            val_loss = evaluate(val_dict, encoder, decoder, idx_dict, criterion,
                                opts)

            if val_loss < best_val_loss:
                # Record the new best so later, worse epochs do not
                # overwrite the checkpoint.
                best_val_loss = val_loss
                checkpoint(encoder, decoder, idx_dict, opts)

            if not opts.no_attention:
                # Save attention maps for the fixed word TEST_WORD_ATTN
                # throughout training
                utils.visualize_attention(
                    TEST_WORD_ATTN,
                    encoder,
                    decoder,
                    idx_dict,
                    opts,
                    save=os.path.join(
                        opts.checkpoint_path,
                        'train_attns/attn-epoch-{}.png'.format(epoch)))

            gen_string = utils.translate_sentence(TEST_SENTENCE, encoder,
                                                  decoder, idx_dict, opts)
            print(
                "Epoch: {:3d} | Train loss: {:.3f} | Val loss: {:.3f} | Gen: {:20s}"
                .format(epoch, train_loss, val_loss, gen_string))

            loss_log.write('{} {} {}\n'.format(epoch, train_loss, val_loss))
            loss_log.flush()

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            save_loss_plot(train_losses, val_losses, opts)
parser = argparse.ArgumentParser() parser.add_argument('--load', type=str, help='Path to checkpoint directory.') parser.add_argument('--cuda', action='store_true', default=False, help='Use GPU.') return parser if __name__ == '__main__': parser = create_parser() opts = parser.parse_args() encoder, decoder, idx_dict = load(opts) for word in words: translated = utils.translate(word, encoder, decoder, idx_dict, opts) print('{} --> {}'.format(word, translated)) utils.visualize_attention(word, encoder, decoder, idx_dict, opts, save=os.path.join(opts.load, '{}.pdf'.format(word)))
def evaluate(model, load_path, plot):
    """Restore the best saved parameters into ``model``; optionally plot.

    Reads ``trained_params_best.npz`` from ``load_path`` and copies every
    array into the matching parameter of the Blocks model built from
    ``model.cost``.  When ``plot`` is true it additionally visualises the
    attention (``model.location`` / ``model.scale``) for the first ten
    items of one training batch and draws the logged training curves via
    ``plot_curves``.

    Args:
        model: object exposing ``cost``, ``location`` and ``scale``.
        load_path: directory holding the parameter archive; also passed to
            ``plot_curves`` as ``path``.  A trailing separator is optional.
        plot: when false, only the parameters are restored.

    Raises:
        AssertionError: if a saved array's shape differs from the shape of
            the parameter it is loaded into.
    """
    import os  # function-scope import: only needed to build the archive path

    # os.path.join yields the same file as plain concatenation when
    # load_path ends with a separator, and also works when it does not.
    params_path = os.path.join(load_path, 'trained_params_best.npz')

    # The archive is binary: text mode breaks on Python 3 and on platforms
    # that translate newlines.  Arrays are read lazily from the open file,
    # so all lookups stay inside the ``with`` block.
    with open(params_path, 'rb') as params_file:
        loaded = np.load(params_file)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        for param_name, param in params_dicts.items():
            # '/f_6_.W' --> 'f_6_.W'
            saved_name = param_name[param_name.find('/') + 1:]
            saved_value = loaded[saved_name]  # read once, used twice
            assert param.get_value().shape == saved_value.shape, (
                'shape mismatch for parameter %s' % saved_name)
            param.set_value(saved_value)

    if not plot:
        return

    train_data_stream, valid_data_stream = get_streams(20)
    # T x B x F  (layout as noted by the original author -- TODO confirm)
    # Built-in next() works on both Python 2 and Python 3 iterators.
    data = next(train_data_stream.get_epoch_iterator())
    cg = ComputationGraph(model.cost)
    attention_fn = theano.function(cg.inputs,
                                   [model.location, model.scale],
                                   on_unused_input='ignore',
                                   allow_input_downcast=True)
    res = attention_fn(data[1], data[0])
    for i in range(10):
        visualize_attention(data[0][:, i, :],
                            res[0][:, i, :],
                            res[1][:, i, :],
                            image_shape=(512, 512),
                            prefix=str(i))

    plot_curves(path=load_path,
                to_be_plotted=['train_categoricalcrossentropy_apply_cost',
                               'valid_categoricalcrossentropy_apply_cost'],
                yaxis='Cross Entropy',
                titles=['train', 'valid'],
                main_title='CE')

    # The learning rate is only logged for training, hence the same
    # channel is given twice.
    plot_curves(path=load_path,
                to_be_plotted=['train_learning_rate',
                               'train_learning_rate'],
                yaxis='lr',
                titles=['train', 'train'],
                main_title='lr')

    plot_curves(path=load_path,
                to_be_plotted=['train_total_gradient_norm',
                               'valid_total_gradient_norm'],
                yaxis='GradientNorm',
                titles=['train', 'valid'],
                main_title='GradientNorm')

    # One figure per gradient-norm channel; each suffix is listed once so
    # no figure is drawn twice.
    grad_suffixes = [
        '_total_gradient_norm',
        '_/lstmattention.W_patch_grad_norm',
        '_/lstmattention.W_state_grad_norm',
        '_/lstmattention.initial_cells_grad_norm',
        '_/lstmattention.initial_location_grad_norm',
        '_/lstmattention/lstmattention_mlp/linear_0.W_grad_norm',
        '_/lstmattention/lstmattention_mlp/linear_1.W_grad_norm',
        '_/mlp/linear_0.W_grad_norm',
        '_/mlp/linear_1.W_grad_norm',
    ]
    for grad in grad_suffixes:
        plot_curves(path=load_path,
                    to_be_plotted=['train' + grad, 'valid' + grad],
                    yaxis='GradientNorm',
                    titles=['train', 'valid'],
                    # Strip characters that do not belong in a title.
                    main_title=grad.replace(
                        "_", "").replace("/", "").replace(".", ""))

    plot_curves(path=load_path,
                to_be_plotted=[
                    'train_misclassificationrate_apply_error_rate',
                    'valid_misclassificationrate_apply_error_rate'],
                yaxis='Error rate',
                titles=['train', 'valid'],
                main_title='Error')
    # Call form prints the same text on Python 2 and Python 3.
    print('plot printed')
data = ds.get_epoch_iterator(as_dict=True).next() inputs = ComputationGraph(model.patch).inputs f = theano.function(inputs, [model.location, model.scale, model.patch, model.downn_sampled_input]) res = f(data['features']) location, scale, patch, downn_sampled_input = res # os.makedirs('res_frames/') # os.makedirs('res_patch/') # os.makedirs('res_downn_sampled_input/') # for i, f in enumerate(data['features']): # plt.imshow(f[0].reshape(100, 100), cmap=plt.gray(), # interpolation='nearest') # plt.savefig('res_frames/img_' + str(i) + '.png') # for i, p in enumerate(patch): # plt.imshow(p[0, 0], cmap=plt.gray(), interpolation='nearest') # plt.savefig('res_patch/img_' + str(i) + '.png') # for i, d in enumerate(downn_sampled_input): # plt.imshow(d[0, 0], cmap=plt.gray(), interpolation='nearest') # plt.savefig('res_downn_sampled_input/img_' + str(i) + '.png') for i in range(10): visualize_attention(data['features'][:, i], (location[:, i] + 1) * 512 / 2, scale[:, i] + 1 + 0.24, image_shape=(512, 512), prefix=str(i)) else: # evaluate(model, 'results/v2_len10_mlp_2015_11_13_at_18_37/', # plot=False) train(model, configs)
# Experiment settings for this run; 'classifier_dims' starts from the
# LSTM width that is already present in configs.
configs.update({
    'cropper_input_shape': (200, 320),
    'patch_shape': (32, 32),
    'num_channels': 3,
    'classifier_dims': [configs['lstm_dim'], 64, 31],
    'load_pretrained': False,
    'test_model': True,
    'l2_reg': 0.001,
})

# Every run gets its own output directory, named by start time.
timestr = time.strftime("%Y_%m_%d_at_%H_%M")
configs['save_path'] += timestr
save_path = configs['save_path']
os.makedirs(save_path)

# Mirror log records into <save_path>/log.txt.
log_path = os.path.join(save_path, 'log.txt')
fh = logging.FileHandler(log_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# Record the full configuration at the top of the log.
for item in configs:
    logger.info('%s: %s' % (item, str(configs[item])))

model = setup_model(configs)

# Flip by hand to evaluate a stored run instead of training.
eval_ = False
if not eval_:
    # evaluate(model, 'results/Cook_n_2016_03_05_at_00_42/', configs)
    train(model, configs)
else:
    eval_function = evaluate(
        model, 'results/Cook_32016_03_10_at_20_40/', configs)
    # analyze('results/Cook_2_CNN2016_03_06_at_23_56/')
    visualize_attention(model, configs, eval_function)
# Experiment settings for this run.  'lstm_dim' is stored first because
# 'classifier_dims' is derived from it.
configs['lstm_dim'] = 128
configs.update({
    'attention_mlp_hidden_dims': [128],
    'cropper_input_shape': (100, 100),
    'patch_shape': (28, 28),
    'num_channels': 1,
    'classifier_dims': [configs['lstm_dim'], 64, 10],
    'load_pretrained': False,
    'test_model': True,
    'l2_reg': 0.001,
})

# Every run gets its own output directory, named by start time.
timestr = time.strftime("%Y_%m_%d_at_%H_%M")
configs['save_path'] += timestr
save_path = configs['save_path']
os.makedirs(save_path)

# Mirror log records into <save_path>/log.txt.
log_path = os.path.join(save_path, 'log.txt')
fh = logging.FileHandler(log_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# Record the full configuration at the top of the log.
for item in configs:
    logger.info('%s: %s' % (item, str(configs[item])))

model = setup_model(configs)

# Flip by hand to evaluate a stored run instead of training.
eval_ = False
if not eval_:
    # evaluate(model, 'results/CMV_Hard_len10_2016_02_22_at_21_00/')
    train(model, configs)
else:
    eval_function = evaluate(
        model, 'results/BMNIST_Learn_2016_02_25_at_23_50/', configs)
    analyze('results/BMNIST_Learn_2016_02_25_at_23_50/')
    visualize_attention(model, configs, eval_function)
# Train the transformer model with the options in `args`, then translate
# TEST_SENTENCE as a qualitative check.
print_opts(args)
transformer_encoder, transformer_decoder = train(args)
# NOTE(review): None is passed as the fourth argument of translate_sentence;
# presumably the index dictionary -- confirm against its signature.
translated = translate_sentence(TEST_SENTENCE, transformer_encoder, transformer_decoder, None, args)
print("source:\t\t{} \ntranslated:\t{}".format(TEST_SENTENCE, translated))
# The bare string below is a text cell kept from the notebook export; it
# has no runtime effect.
"""Try translating different sentences by changing the variable TEST_SENTENCE. Identify two distinct failure modes and briefly describe them."""
# Re-run the translation with TEST_SENTENCE rebound to `test_cases`.
TEST_SENTENCE = test_cases
translated = translate_sentence(TEST_SENTENCE, transformer_encoder, transformer_decoder, None, args)
print("source:\t\t{} \ntranslated:\t{}".format(TEST_SENTENCE, translated))
# NOTE(review): if `exit` is the builtin, the script stops here and the
# attention visualizations below never run -- confirm this is intended.
exit()
# Text cell kept from the notebook export; no runtime effect.
"""# Attention Visualizations One of the benefits of using attention is that it allows us to gain insight into the inner workings of the model. By visualizing the attention weights generated for the input tokens in each decoder step, we can see where the model focuses while producing each output token. The code in this section loads the model you trained from the previous section and uses it to translate a given set of words: it prints the translations and display heatmaps to show how attention is used at each step. Play around with visualizing attention maps generated by the previous two models you've trained. Inspect visualizations in one success and one failure case for both models. """
# Attention map for the RNN encoder/decoder pair, saved as
# additive_attention.pdf.
TEST_WORD_ATTN = 'street'
visualize_attention(TEST_WORD_ATTN, rnn_attn_encoder, rnn_attn_decoder, None, args, save="additive_attention.pdf")
# Same word through the transformer pair, saved as
# scaled_dot_product_attention.pdf.
TEST_WORD_ATTN = 'street'
visualize_attention(TEST_WORD_ATTN, transformer_encoder, transformer_decoder, None, args, save="scaled_dot_product_attention.pdf")