def plot_attention(self, source_inputs, target_inputs, dataflow):
        """Render attention heat-maps for the first example of a batch.

        The source token list is the source field's ``init_token``, the
        whitespace-split first item of ``reverse(source_inputs)`` and a
        literal ``'<eos>'``.  The target token list is built the same way
        from ``target_inputs`` but without the trailing ``'<eos>'``.

        Nothing happens when either token list is longer than
        ``self.visual_limit``.  Otherwise ``self.attention_flag`` is set to
        ``False`` and ``self.attention_maps`` is rebuilt as a list of
        ``(name, figure)`` pairs: for every layer and head, the encoder
        self-attention, decoder self-attention and cross-attention figures,
        in that order.

        Args:
            source_inputs: batch handed to the source field's ``reverse``.
            target_inputs: batch handed to the target field's ``reverse``.
            dataflow: pair of keys into ``self.fields``; index 0 selects the
                source field and index 1 the target field.
        """
        source_field = self.fields[dataflow[0]]
        src = [source_field.init_token]
        src = src + source_field.reverse(source_inputs)[0].split() + ['<eos>']

        target_field = self.fields[dataflow[1]]
        trg = [target_field.init_token]
        trg = trg + target_field.reverse(target_inputs)[0].split()

        n_src, n_trg = len(src), len(trg)
        if n_src > self.visual_limit or n_trg > self.visual_limit:
            # Too many tokens to draw a readable heat-map.
            return

        self.attention_flag = False
        self.attention_maps = []
        for layer_idx in range(self.args.n_layers):
            for head_idx in range(self.args.n_heads):
                # Each probability map is cropped to the real token counts
                # (presumably to drop padded positions -- TODO confirm).
                enc_probs = self.encoder.layers[
                    layer_idx].selfattn.layer.attention.p_attn
                heatmap = visualize_attention(
                    src, src, enc_probs[head_idx].detach()[:n_src, :n_src])
                label = 'Self-attention (Enc) L{}/H{}'.format(
                    layer_idx, head_idx)
                self.attention_maps.append((label, heatmap))

                dec_probs = self.decoder.layers[
                    layer_idx].selfattn.layer.attention.p_attn
                heatmap = visualize_attention(
                    trg, trg, dec_probs[head_idx].detach()[:n_trg, :n_trg])
                label = 'Self-attention (Dec) L{}/H{}'.format(
                    layer_idx, head_idx)
                self.attention_maps.append((label, heatmap))

                cross_probs = self.decoder.layers[
                    layer_idx].crossattn.layer.attention.p_attn
                heatmap = visualize_attention(
                    src, trg, cross_probs[head_idx].detach()[:n_trg, :n_src])
                label = 'Cross-attention L{}/H{}'.format(layer_idx, head_idx)
                self.attention_maps.append((label, heatmap))
Esempio n. 2
0
def evaluate(model, load_path):
    """Restore the best saved parameters and plot attention for 10 examples.

    Reads ``load_path + '/trained_params_best.npz'``, copies every array into
    the matching parameter of the Blocks model built from ``model.cost``,
    then compiles a Theano function returning ``model.location`` and
    ``model.scale`` and calls ``visualize_attention`` for the first ten
    examples of one training batch.

    Args:
        model: object exposing ``cost``, ``location`` and ``scale``.
        load_path: directory holding ``trained_params_best.npz``.

    Raises:
        AssertionError: if a saved array's shape differs from the shape of
            the parameter it is loaded into.
    """
    # np.load needs a binary handle; a text-mode handle breaks on Python 3.
    with open(load_path + '/trained_params_best.npz', 'rb') as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        for param_name, param in params_dicts.items():
            # '/f_6_.W' --> 'f_6_.W'
            saved_name = param_name[param_name.find('/') + 1:]
            # Read the array from the archive once and reuse it.
            saved_value = loaded[saved_name]
            assert param.get_value().shape == saved_value.shape
            param.set_value(saved_value)

    train_data_stream, _ = get_cmv_v2_streams(100)
    # T x B x F
    # The next() builtin works on both Python 2 and Python 3 iterators.
    data = next(train_data_stream.get_epoch_iterator())
    cg = ComputationGraph(model.cost)
    attention_fn = theano.function(cg.inputs, [model.location, model.scale],
                                   on_unused_input='ignore',
                                   allow_input_downcast=True)
    # NOTE(review): assumes cg.inputs is ordered (data[1], data[0]) -- confirm.
    res = attention_fn(data[1], data[0])
    for i in range(10):
        visualize_attention(data[0][:, i, :],
                            res[0][:, i, :],
                            res[1][:, i, :],
                            prefix=str(i))
Esempio n. 3
0
def evaluate(model, load_path):
    """Load the best checkpoint into ``model`` and visualize its attention.

    The archive ``load_path + '/trained_params_best.npz'`` is read and each
    array is written into the corresponding parameter of the Blocks model
    built from ``model.cost``.  A Theano function returning
    ``model.location`` and ``model.scale`` is then evaluated on one training
    batch, and ``visualize_attention`` is called for its first ten examples.

    Args:
        model: object exposing ``cost``, ``location`` and ``scale``.
        load_path: directory holding ``trained_params_best.npz``.

    Raises:
        AssertionError: if a saved array's shape differs from the shape of
            the parameter it is loaded into.
    """
    # Binary mode is required by np.load (text mode fails on Python 3).
    with open(load_path + '/trained_params_best.npz', 'rb') as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        for param_name, param in params_dicts.items():
            # '/f_6_.W' --> 'f_6_.W'
            slash_index = param_name.find('/')
            saved_name = param_name[slash_index + 1:]
            # Fetch the saved array once instead of once per use.
            saved_value = loaded[saved_name]
            assert param.get_value().shape == saved_value.shape
            param.set_value(saved_value)

    train_data_stream, _ = get_cmv_v2_streams(100)
    # T x B x F
    # Use the next() builtin: the .next() method only exists on Python 2.
    data = next(train_data_stream.get_epoch_iterator())
    cg = ComputationGraph(model.cost)
    attention_fn = theano.function(cg.inputs, [model.location, model.scale],
                                   on_unused_input='ignore',
                                   allow_input_downcast=True)
    # NOTE(review): assumes cg.inputs is ordered (data[1], data[0]) -- confirm.
    res = attention_fn(data[1], data[0])
    for i in range(10):
        visualize_attention(data[0][:, i, :],
                            res[0][:, i, :], res[1][:, i, :], prefix=str(i))
def main():
    """Train the dual-source attention graph, logging attention plots per epoch.

    For every epoch from 1 to ``hp.num_epochs`` this runs ``g.num_batch``
    training steps, saves a checkpoint under ``hp.logdir`` and pushes two
    image summaries to the supervisor: the first alignment history against
    the first source ``x1`` and the second alignment history against the
    second source ``x2``, each taken from the first example of the last
    batch run in that epoch.
    """
    g = DualSourceAttentionGraph()
    print("Training Graph loaded")

    # The image summaries are built in their own graph, separate from the
    # training graph.
    summary_g = tf.Graph()

    with g.graph.as_default():

        # Training
        sv = tf.train.Supervisor(logdir=hp.logdir,
                                 save_model_secs=0,
                                 summary_op=g.merged)

        with sv.managed_session() as sess:
            for epoch in range(1, hp.num_epochs + 1):
                if sv.should_stop():
                    break
                print("epoch={}".format(epoch))
                for step in tqdm(range(g.num_batch),
                                 total=g.num_batch,
                                 ncols=70,
                                 leave=False,
                                 unit='b'):
                    _, l1, l2, l, alignment_history1, alignment_history2, x1, x2 = sess.run(
                        [
                            g.train_op, g.mean_loss1, g.mean_loss2,
                            g.mean_loss, g.alignment_history1,
                            g.alignment_history2, g.x1, g.x2
                        ])
                    print("mean_loss1={}, mean_loss2={}, mean_loss={}".format(
                        l1, l2, l))

                # Write checkpoint files at every epoch
                gs = sess.run(g.global_step)
                sv.saver.save(
                    sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))

                # NOTE(review): np.fromstring on binary input is deprecated in
                # recent NumPy; np.frombuffer is the documented replacement.
                with summary_g.as_default():
                    with tf.Session(graph=summary_g) as summary_sess:
                        visualize_attention(alignment_history1[0], [
                            idx2char_kana[idx] + ' '
                            for idx in np.fromstring(x1[0], np.int32)
                        ])
                        plot1 = figure_to_tensor()
                        attention_image1 = tf.summary.image(
                            "attention1 %d" % gs, plot1)

                        # The second plot pairs the second alignment history
                        # with the second source; it used to redraw the
                        # first alignment history.
                        visualize_attention(alignment_history2[0], [
                            idx2phone[idx]
                            for idx in np.fromstring(x2[0], np.int32)
                        ])
                        plot2 = figure_to_tensor()
                        attention_image2 = tf.summary.image(
                            "attention2 %d" % gs, plot2)
                        merged = summary_sess.run(
                            tf.summary.merge(
                                [attention_image1, attention_image2]))
                        sv.summary_computed(sess, merged, gs)
Esempio n. 5
0
def training_loop(train_dict, val_dict, idx_dict, encoder, decoder, criterion,
                  optimizer, opts):
    """Runs the main training loop; evaluates the model on the val set every epoch.
        * Prints training and val loss each epoch.
        * Prints qualitative translation results each epoch using TEST_SENTENCE
        * Saves an attention map for TEST_WORD_ATTN each epoch (unless opts.no_attention is set)
        * Checkpoints the models whenever the val loss improves on the best seen so far.
        * Writes "epoch train_loss val_loss" lines to loss_log.txt in opts.checkpoint_path.

    Arguments:
        train_dict: The training word pairs, organized by source and target lengths.
        val_dict: The validation word pairs, organized by source and target lengths.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model (with or without attention) to generate output tokens.
        criterion: Used to compute the CrossEntropyLoss for each decoder output.
        optimizer: Implements a step rule to update the parameters of the encoder and decoder.
        opts: The command-line arguments.
    """

    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    best_val_loss = 1e6
    train_losses = []
    val_losses = []

    log_path = os.path.join(opts.checkpoint_path, 'loss_log.txt')

    # The context manager closes the log file even if training raises.
    with open(log_path, 'w') as loss_log:
        for epoch in range(opts.nepochs):

            # The learning rate is decayed at the start of every epoch,
            # including the first one.
            optimizer.param_groups[0]['lr'] *= opts.lr_decay

            epoch_losses = []

            for key in train_dict:

                input_strings, target_strings = zip(*train_dict[key])
                input_tensors = [
                    torch.LongTensor(
                        utils.string_to_index_list(s, char_to_index,
                                                   end_token))
                    for s in input_strings
                ]
                target_tensors = [
                    torch.LongTensor(
                        utils.string_to_index_list(s, char_to_index,
                                                   end_token))
                    for s in target_strings
                ]

                num_tensors = len(input_tensors)
                num_batches = int(
                    np.ceil(num_tensors / float(opts.batch_size)))

                for batch_idx in range(num_batches):

                    start = batch_idx * opts.batch_size
                    end = start + opts.batch_size

                    inputs = utils.to_var(
                        torch.stack(input_tensors[start:end]), opts.cuda)
                    targets = utils.to_var(
                        torch.stack(target_tensors[start:end]), opts.cuda)

                    # The last batch of a group may hold fewer than
                    # opts.batch_size examples.
                    BS = inputs.size(0)

                    encoder_annotations, encoder_hidden = encoder(inputs)

                    # The last hidden state of the encoder becomes the first hidden state of the decoder
                    decoder_hidden = encoder_hidden

                    # BS x 1 column filled with the start token
                    start_vector = torch.ones(BS).long().unsqueeze(
                        1) * start_token
                    decoder_input = utils.to_var(start_vector, opts.cuda)

                    loss = 0.0

                    seq_len = targets.size(1)  # Gets seq_len from BS x seq_len

                    # Teacher forcing is decided once per batch.
                    use_teacher_forcing = np.random.rand(
                    ) < opts.teacher_forcing_ratio

                    # A dedicated index keeps the batch index from being
                    # shadowed by the decoding loop.
                    for step in range(seq_len):
                        decoder_output, decoder_hidden, _ = decoder(
                            decoder_input, decoder_hidden,
                            encoder_annotations)

                        current_target = targets[:, step]
                        # cross entropy between the decoder distribution and GT
                        loss += criterion(decoder_output, current_target)

                        if use_teacher_forcing:
                            # With teacher forcing, use the ground-truth token to condition the next step
                            decoder_input = current_target.unsqueeze(1)
                        else:
                            # Without teacher forcing, use the model's own predictions to condition the next step
                            ni = F.softmax(decoder_output,
                                           dim=1).data.max(1)[1]
                            decoder_input = utils.to_var(
                                ni.unsqueeze(1), opts.cuda)

                    loss /= float(seq_len)
                    epoch_losses.append(loss.item())

                    # Zero gradients
                    optimizer.zero_grad()

                    # Compute gradients
                    loss.backward()

                    # Update the parameters of the encoder and decoder
                    optimizer.step()

            train_loss = np.mean(epoch_losses)
            val_loss = evaluate(val_dict, encoder, decoder, idx_dict,
                                criterion, opts)

            if val_loss < best_val_loss:
                # Record the new best so later epochs only checkpoint on a
                # real improvement; it used to stay at its initial value.
                best_val_loss = val_loss
                checkpoint(encoder, decoder, idx_dict, opts)

            if not opts.no_attention:
                # Save attention maps for the fixed word TEST_WORD_ATTN throughout training
                utils.visualize_attention(
                    TEST_WORD_ATTN,
                    encoder,
                    decoder,
                    idx_dict,
                    opts,
                    save=os.path.join(
                        opts.checkpoint_path,
                        'train_attns/attn-epoch-{}.png'.format(epoch)))

            gen_string = utils.translate_sentence(TEST_SENTENCE, encoder,
                                                  decoder, idx_dict, opts)
            print(
                "Epoch: {:3d} | Train loss: {:.3f} | Val loss: {:.3f} | Gen: {:20s}"
                .format(epoch, train_loss, val_loss, gen_string))

            loss_log.write('{} {} {}\n'.format(epoch, train_loss, val_loss))
            loss_log.flush()

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            save_loss_plot(train_losses, val_losses, opts)
Esempio n. 6
0
    parser = argparse.ArgumentParser()
    parser.add_argument('--load',
                        type=str,
                        help='Path to checkpoint directory.')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=False,
                        help='Use GPU.')
    return parser


if __name__ == '__main__':
    # Script entry point: parse the command line, restore the encoder,
    # decoder and index dictionary via load(opts), then translate every
    # entry of `words` and save one attention figure per word.
    parser = create_parser()
    opts = parser.parse_args()
    encoder, decoder, idx_dict = load(opts)

    for word in words:
        translated = utils.translate(word, encoder, decoder, idx_dict, opts)
        print('{} --> {}'.format(word, translated))

        # The figure is written to "<word>.pdf" under opts.load.
        attention_pdf = os.path.join(opts.load, '{}.pdf'.format(word))
        utils.visualize_attention(
            word, encoder, decoder, idx_dict, opts, save=attention_pdf)
Esempio n. 7
0
def evaluate(model, load_path, plot):
    """Restore the best saved parameters and optionally produce all plots.

    Reads ``load_path + 'trained_params_best.npz'`` (``load_path`` must end
    with a path separator) and copies every array into the matching
    parameter of the Blocks model built from ``model.cost``.  When ``plot``
    is truthy it also calls ``visualize_attention`` for the first ten
    examples of one training batch and draws the training curves with
    ``plot_curves``.

    Args:
        model: object exposing ``cost``, ``location`` and ``scale``.
        load_path: directory prefix holding ``trained_params_best.npz``;
            also passed to ``plot_curves`` as ``path``.
        plot: when falsy, only the parameters are loaded.

    Raises:
        AssertionError: if a saved array's shape differs from the shape of
            the parameter it is loaded into.
    """
    # np.load needs a binary handle; a text-mode handle breaks on Python 3.
    with open(load_path + 'trained_params_best.npz', 'rb') as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        for param_name, param in params_dicts.items():
            # '/f_6_.W' --> 'f_6_.W'
            saved_name = param_name[param_name.find('/') + 1:]
            # Read the array from the archive once and reuse it.
            saved_value = loaded[saved_name]
            assert param.get_value().shape == saved_value.shape
            param.set_value(saved_value)

    if plot:
        train_data_stream, _ = get_streams(20)
        # T x B x F
        # The next() builtin works on both Python 2 and Python 3 iterators.
        data = next(train_data_stream.get_epoch_iterator())
        cg = ComputationGraph(model.cost)
        attention_fn = theano.function(cg.inputs,
                                       [model.location, model.scale],
                                       on_unused_input='ignore',
                                       allow_input_downcast=True)
        # NOTE(review): assumes cg.inputs is ordered (data[1], data[0]) -- confirm.
        res = attention_fn(data[1], data[0])
        for i in range(10):
            visualize_attention(data[0][:, i, :],
                                res[0][:, i, :], res[1][:, i, :],
                                image_shape=(512, 512), prefix=str(i))

        plot_curves(path=load_path,
                    to_be_plotted=['train_categoricalcrossentropy_apply_cost',
                                   'valid_categoricalcrossentropy_apply_cost'],
                    yaxis='Cross Entropy',
                    titles=['train', 'valid'],
                    main_title='CE')

        # The learning rate is only logged for training, so the same series
        # is passed for both curves.
        plot_curves(path=load_path,
                    to_be_plotted=['train_learning_rate',
                                   'train_learning_rate'],
                    yaxis='lr',
                    titles=['train', 'train'],
                    main_title='lr')

        plot_curves(path=load_path,
                    to_be_plotted=['train_total_gradient_norm',
                                   'valid_total_gradient_norm'],
                    yaxis='GradientNorm',
                    titles=['train', 'valid'],
                    main_title='GradientNorm')

        # One gradient-norm plot per monitored quantity.  The list used to
        # name '_total_gradient_norm' twice, drawing the same plot twice.
        for grad in ['_total_gradient_norm',
                     '_/lstmattention.W_patch_grad_norm',
                     '_/lstmattention.W_state_grad_norm',
                     '_/lstmattention.initial_cells_grad_norm',
                     '_/lstmattention.initial_location_grad_norm',
                     '_/lstmattention/lstmattention_mlp/linear_0.W_grad_norm',
                     '_/lstmattention/lstmattention_mlp/linear_1.W_grad_norm',
                     '_/mlp/linear_0.W_grad_norm',
                     '_/mlp/linear_1.W_grad_norm']:
            plot_curves(path=load_path,
                        to_be_plotted=['train' + grad,
                                       'valid' + grad],
                        yaxis='GradientNorm',
                        titles=['train',
                                'valid'],
                        # Strip "_", "/" and "." from the key to form the title.
                        main_title=grad.replace(
                            "_", "").replace("/", "").replace(".", ""))

        plot_curves(path=load_path,
                    to_be_plotted=[
                        'train_misclassificationrate_apply_error_rate',
                        'valid_misclassificationrate_apply_error_rate'],
                    yaxis='Error rate',
                    titles=['train', 'valid'],
                    main_title='Error')
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('plot printed')
Esempio n. 8
0
            data = ds.get_epoch_iterator(as_dict=True).next()
            inputs = ComputationGraph(model.patch).inputs
            f = theano.function(inputs,
                                [model.location, model.scale,
                                 model.patch, model.downn_sampled_input])
            res = f(data['features'])
            location, scale, patch, downn_sampled_input = res
            # os.makedirs('res_frames/')
            # os.makedirs('res_patch/')
            # os.makedirs('res_downn_sampled_input/')
            # for i, f in enumerate(data['features']):
            #     plt.imshow(f[0].reshape(100, 100), cmap=plt.gray(),
            #                interpolation='nearest')
            #     plt.savefig('res_frames/img_' + str(i) + '.png')
            # for i, p in enumerate(patch):
            #     plt.imshow(p[0, 0], cmap=plt.gray(), interpolation='nearest')
            #     plt.savefig('res_patch/img_' + str(i) + '.png')
            # for i, d in enumerate(downn_sampled_input):
            #     plt.imshow(d[0, 0], cmap=plt.gray(), interpolation='nearest')
            #     plt.savefig('res_downn_sampled_input/img_' + str(i) + '.png')

            for i in range(10):
                visualize_attention(data['features'][:, i],
                                    (location[:, i] + 1) * 512 / 2,
                                    scale[:, i] + 1 + 0.24,
                                    image_shape=(512, 512), prefix=str(i))
        else:
            # evaluate(model, 'results/v2_len10_mlp_2015_11_13_at_18_37/',
            #          plot=False)
            train(model, configs)
Esempio n. 9
0
        configs['cropper_input_shape'] = (200, 320)
        configs['patch_shape'] = (32, 32)
        configs['num_channels'] = 3
        configs['classifier_dims'] = [configs['lstm_dim'], 64, 31]
        configs['load_pretrained'] = False
        configs['test_model'] = True
        configs['l2_reg'] = 0.001

    timestr = time.strftime("%Y_%m_%d_at_%H_%M")
    save_path = configs['save_path'] + timestr
    configs['save_path'] = save_path
    log_path = os.path.join(save_path, 'log.txt')
    os.makedirs(save_path)
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    for item in configs:
        logger.info(item + ': %s' % str(configs[item]))

    model = setup_model(configs)

    eval_ = False
    if eval_:
        eval_function = evaluate(
            model, 'results/Cook_32016_03_10_at_20_40/', configs)
        # analyze('results/Cook_2_CNN2016_03_06_at_23_56/')
        visualize_attention(model, configs, eval_function)
    else:
        # evaluate(model, 'results/Cook_n_2016_03_05_at_00_42/', configs)
        train(model, configs)
Esempio n. 10
0
        configs['lstm_dim'] = 128
        configs['attention_mlp_hidden_dims'] = [128]
        configs['cropper_input_shape'] = (100, 100)
        configs['patch_shape'] = (28, 28)
        configs['num_channels'] = 1
        configs['classifier_dims'] = [configs['lstm_dim'], 64, 10]
        configs['load_pretrained'] = False
        configs['test_model'] = True
        configs['l2_reg'] = 0.001
        timestr = time.strftime("%Y_%m_%d_at_%H_%M")
        save_path = configs['save_path'] + timestr
        configs['save_path'] = save_path
        log_path = os.path.join(save_path, 'log.txt')
        os.makedirs(save_path)
        fh = logging.FileHandler(filename=log_path)
        fh.setLevel(logging.DEBUG)
        logger.addHandler(fh)
        for item in configs:
            logger.info(item + ': %s' % str(configs[item]))

        model = setup_model(configs)

        eval_ = False
        if eval_:
            eval_function = evaluate(model, 'results/BMNIST_Learn_2016_02_25_at_23_50/', configs)
            analyze('results/BMNIST_Learn_2016_02_25_at_23_50/')
            visualize_attention(model, configs, eval_function)
        else:
            # evaluate(model, 'results/CMV_Hard_len10_2016_02_22_at_21_00/')
            train(model, configs)
# Pass the run options to print_opts, then train; train(args) returns the
# transformer encoder and decoder used by the rest of this script.
print_opts(args)
transformer_encoder, transformer_decoder = train(args)

# Translate the current TEST_SENTENCE with the models trained above.
# NOTE(review): None is passed in the slot that other call sites in this
# file fill with idx_dict -- confirm translate_sentence accepts that here.
translated = translate_sentence(TEST_SENTENCE, transformer_encoder, transformer_decoder, None, args)
print("source:\t\t{} \ntranslated:\t{}".format(TEST_SENTENCE, translated))

"""Try translating different sentences by changing the variable TEST_SENTENCE. Identify two distinct failure modes and briefly describe them."""

# Rebind TEST_SENTENCE to test_cases (defined outside this excerpt) and
# translate again with the same models.
TEST_SENTENCE = test_cases
translated = translate_sentence(TEST_SENTENCE, transformer_encoder, transformer_decoder, None, args)
print("source:\t\t{} \ntranslated:\t{}".format(TEST_SENTENCE, translated))
# NOTE(review): exit() ends the script at this point, so any statements
# placed after it never run -- confirm this is intentional.
exit()

"""# Attention Visualizations

One of the benefits of using attention is that it allows us to gain insight into the inner workings of the model.

By visualizing the attention weights generated for the input tokens in each decoder step, we can see where the model focuses while producing each output token.

The code in this section loads the model you trained in the previous section and uses it to translate a given set of words: it prints the translations and displays heatmaps to show how attention is used at each step.

Play around with visualizing attention maps generated by the previous two models you've trained. Inspect visualizations in one success and one failure case for both models.
"""

# Attention figure for the RNN attention encoder/decoder pair, saved to
# "additive_attention.pdf".
TEST_WORD_ATTN = 'street'
visualize_attention(TEST_WORD_ATTN, rnn_attn_encoder, rnn_attn_decoder, None, args, save="additive_attention.pdf")

# Same word with the transformer encoder/decoder pair, saved to
# "scaled_dot_product_attention.pdf".
TEST_WORD_ATTN = 'street'
visualize_attention(TEST_WORD_ATTN, transformer_encoder, transformer_decoder, None, args, save="scaled_dot_product_attention.pdf")