Example #1
File: train.py Project: pxu4114/CMR
def validate(opt, val_loader, model, audio):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, aud_embs = encode_data(model, val_loader, audio,
                                               opt.log_step, logging.info)

    # image to caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure=opt.measure)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # caption to image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs,
                                          cap_embs,
                                          measure=opt.measure)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))
    # image to audio retrieval
    (r1ia, r5ia, r10ia, medria, meanria) = i2t(img_embs,
                                               aud_embs,
                                               measure=opt.measure)
    logging.info("image to audio: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ia, r5ia, r10ia, medria, meanria))
    # audio to image retrieval
    (r1ai, r5ai, r10ai, medrai, meanrai) = t2i(img_embs,
                                               aud_embs,
                                               measure=opt.measure)
    logging.info("audio to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ai, r5ai, r10ai, medrai, meanrai))
    # caption to audio retrieval
    (r1ca, r5ca, r10ca, medrca, meanrca) = i2t(cap_embs,
                                               aud_embs,
                                               measure=opt.measure,
                                               npts=5000)
    logging.info("text to audio: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ca, r5ca, r10ca, medrca, meanrca))
    # audio to caption retrieval
    (r1ac, r5ac, r10ac, medrac, meanrac) = i2t(aud_embs,
                                               cap_embs,
                                               measure=opt.measure,
                                               npts=5000)
    logging.info("audio to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ac, r5ac, r10ac, medrac, meanrac))
    # sum of recalls to be used for early stopping
    currscore = (r1 + r5 + r10 + r1i + r5i + r10i +
                 r1ia + r5ia + r10ia + r1ai + r5ai + r10ai +
                 r1ca + r5ca + r10ca + r1ac + r5ac + r10ac)
    # pdb.set_trace()
    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
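
None of the examples on this page include the i2t/t2i helpers they call. For reference only, here is a minimal sketch of the rank-based scorer such helpers typically implement in VSE++-style codebases; it assumes 5 captions per image stored consecutively, L2-normalized embeddings, and cosine similarity, and each project's real helper may differ in signature and conventions:

import numpy as np

def i2t_recall(img_embs, cap_embs, npts=None):
    # Image-to-text retrieval: rank every caption for each image and record
    # where the best-ranked ground-truth caption lands.
    if npts is None:
        npts = img_embs.shape[0] // 5
    ranks = np.zeros(npts)
    for index in range(npts):
        im = img_embs[5 * index].reshape(1, -1)
        sims = np.dot(im, cap_embs.T).flatten()  # cosine if rows are normalized
        order = np.argsort(sims)[::-1]           # best match first
        ranks[index] = min(np.where(order == i)[0][0]
                           for i in range(5 * index, 5 * index + 5))
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    return r1, r5, r10, np.floor(np.median(ranks)) + 1, ranks.mean() + 1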
Example #2
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(
        model, val_loader, opt.log_step, logging.info)

    print(img_embs.shape[0] // 5, "Images", cap_embs.shape[0], "texts for validate")

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure=opt.measure)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(
        img_embs, cap_embs, measure=opt.measure)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)

    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)

    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
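
Several examples log through a module-level tb_logger with log_value(name, value, step=...). That matches the tensorboard_logger package, configured once at startup; a minimal sketch of that wiring (the log directory is illustrative):

import tensorboard_logger as tb_logger

# one-time setup, typically in main(); any module importing tb_logger
# can then emit scalars against a global step counter
tb_logger.configure('runs/my_experiment', flush_secs=5)
tb_logger.log_value('rsum', 0.0, step=0)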
Example #3
def validate(opt, val_loader, model, tb_writer):
    # compute the encoding for all the validation images and captions
    # with torch.no_grad():
    img_embs, cap_embs = encode_data(model, val_loader, opt.log_step,
                                     logging.info)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs,
                                     cap_embs,
                                     measure=opt.test_measure)
    print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
          (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs,
                                          cap_embs,
                                          measure=opt.test_measure)
    print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
          (r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_writer.add_scalar('data/r1', r1, model.Eiters)
    tb_writer.add_scalar('data/r5', r5, model.Eiters)
    tb_writer.add_scalar('data/r10', r10, model.Eiters)
    tb_writer.add_scalar('data/medr', medr, model.Eiters)
    tb_writer.add_scalar('data/meanr', meanr, model.Eiters)
    tb_writer.add_scalar('data/r1i', r1i, model.Eiters)
    tb_writer.add_scalar('data/r5i', r5i, model.Eiters)
    tb_writer.add_scalar('data/r10i', r10i, model.Eiters)
    tb_writer.add_scalar('data/medri', medri, model.Eiters)
    tb_writer.add_scalar('data/meanri', meanri, model.Eiters)
    tb_writer.add_scalar('data/rsum', currscore, model.Eiters)

    return currscore
Example #4
        def eval_model():
            print('evaluating model...')
            weights = model.get_weights()
            emb_w = weights[0]
            im_w = weights[1]
            im_b = weights[2]
            gru_weights = weights[3:12]

            test_model_im = Model(input=image_input, output=emb_image)
            test_model_im.set_weights([im_w, im_b])
            test_model_im.compile(optimizer='adam', loss=contrastive_loss)
            test_model_cap = Model(input=cap_input, output=emb_cap)
            test_model_cap.set_weights([emb_w]+ gru_weights)
            test_model_cap.compile(optimizer='adam', loss=contrastive_loss)

            test_cap, test_im = test_iter.all()
            all_caps = numpy.zeros(shape=(len(test_cap),model_config['max_cap_length']))
            all_images = numpy.zeros(shape=(len(test_cap), model_config['dim_cnn']))
            pred_cap = test_model_cap.predict(test_cap)
            pred_im = test_model_im.predict(test_im)
            test_errs = compute_errors(pred_cap, pred_im)

            r10_c, rmean_c = t2i(test_errs)
            r10_i, rmean_i = i2t(test_errs)
            print("Image to text: %.1f %.1f" % (r10_i, rmean_i))
            print("Text to image: %.1f %.1f" % (r10_c, rmean_c))
Example #5
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    vid_seq_embs, para_seq_embs, clip_embs, cap_embs, _, _, num_clips, cur_vid_total = encode_data(
        opt, model, val_loader, opt.log_step, logging.info, contextual_model=True)

    # caption retrieval
#    vid_clip_rep, _, _ = i2t(clip_embs, cap_embs, measure=opt.measure)
    # image retrieval
#    cap_clip_rep, _, _ = t2i(clip_embs, cap_embs, measure=opt.measure)

    # caption retrieval
    vid_seq_rep, top1_v2p, rank_vid_v2p = i2t(vid_seq_embs, para_seq_embs, measure=opt.measure)
    # image retrieval
    para_seq_rep, top1_p2v, rank_para_p2v = t2i(vid_seq_embs, para_seq_embs, measure=opt.measure)

    currscore = vid_seq_rep['sum'] + para_seq_rep['sum']

#    logging.info("Clip to Sent: %.1f, %.1f, %.1f, %.1f, %.1f" %
#         (vid_clip_rep['r1'], vid_clip_rep['r5'], vid_clip_rep['r10'], vid_clip_rep['medr'], vid_clip_rep['meanr']))
#    logging.info("Sent to Clip: %.1f, %.1f, %.1f, %.1f, %.1f" %
#         (cap_clip_rep['r1'], cap_clip_rep['r5'], cap_clip_rep['r10'], cap_clip_rep['medr'], cap_clip_rep['meanr']))
    logging.info("Video to Paragraph: %.1f, %.1f, %.1f, %.1f, %.1f" %
         (vid_seq_rep['r1'], vid_seq_rep['r5'], vid_seq_rep['r10'], vid_seq_rep['medr'], vid_seq_rep['meanr']))
    logging.info("Paragraph to Video: %.1f, %.1f, %.1f, %.1f, %.1f" %
         (para_seq_rep['r1'], para_seq_rep['r5'], para_seq_rep['r10'], para_seq_rep['medr'], para_seq_rep['meanr']))
    logging.info("Currscore: %.1f" % (currscore))

    # record metrics in tensorboard
#    LogReporter(tb_logger, vid_clip_rep, model.Eiters, 'clip')
#    LogReporter(tb_logger, cap_clip_rep, model.Eiters, 'clipi')
    LogReporter(tb_logger, vid_seq_rep, model.Eiters, 'seq')
    LogReporter(tb_logger, para_seq_rep, model.Eiters, 'seqi')
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
Example #6
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(model, val_loader, opt.log_step,
                                     logging.info)

    # caption retrieval
    (r1, r5, r10, medr, meanr), d_i2t = i2t(img_embs,
                                            cap_embs,
                                            measure=opt.measure)

    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri), d_t2i = t2i(img_embs,
                                                 cap_embs,
                                                 measure=opt.measure)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('recall@1_text', r1, step=model.Eiters)
    tb_logger.log_value('recall@5_text', r5, step=model.Eiters)
    tb_logger.log_value('recall@10_text', r10, step=model.Eiters)
    tb_logger.log_value('med-r_text', medr, step=model.Eiters)
    tb_logger.log_value('mean-r_text', meanr, step=model.Eiters)
    tb_logger.log_value('recall@1_im', r1i, step=model.Eiters)
    tb_logger.log_value('recall@5_im', r5i, step=model.Eiters)
    tb_logger.log_value('recall@10_im', r10i, step=model.Eiters)
    tb_logger.log_value('med-r_im', medri, step=model.Eiters)
    tb_logger.log_value('mean-r_im', meanri, step=model.Eiters)
    tb_logger.log_value('recall_sum', currscore, step=model.Eiters)

    return currscore, d_i2t
Example #7
        def eval_model():
            print('evaluating model...')
            weights = model.get_weights()
            for j in range(len(weights)):
                print(weights[j].shape)
            emb_w = weights[0]
            im_w = weights[4]
            im_b = weights[5]
            gru_weights = weights[1:4]

            test_model_im = Model(inputs=image_input, outputs=emb_image)
            test_model_im.set_weights([im_w, im_b])
            test_model_im.compile(optimizer='adam', loss=contrastive_loss)
            test_model_cap = Model(inputs=cap_input, outputs=emb_cap)
            test_model_cap.set_weights([emb_w]+ gru_weights)
            test_model_cap.compile(optimizer='adam', loss=contrastive_loss)

            test_cap, test_im = test_iter.all()
            all_caps = numpy.zeros(shape=(len(test_cap),model_config['max_cap_length']))
            all_images = numpy.zeros(shape=(len(test_cap), model_config['dim_cnn']))
            pred_cap = test_model_cap.predict(test_cap)
            pred_im = test_model_im.predict(test_im)
            test_errs = compute_errors(pred_cap, pred_im)
            
            r10_c, rmean_c = t2i(test_errs)
            r10_i, rmean_i = i2t(test_errs)
            print("Image to text: %.1f %.1f" % (r10_i, rmean_i))
            print("Text to image: %.1f %.1f" % (r10_c, rmean_c))
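
Examples #4 and #7 pass the predicted caption and image embeddings to a compute_errors helper whose output matrix is what t2i/i2t rank over. Its definition is not shown; a plausible stand-in, assuming "error" means negative cosine similarity so that lower is better:

import numpy

def compute_errors_sketch(pred_cap, pred_im):
    # pairwise caption-image 'errors': shape (n_captions, n_images)
    cap = pred_cap / numpy.linalg.norm(pred_cap, axis=1, keepdims=True)
    im = pred_im / numpy.linalg.norm(pred_im, axis=1, keepdims=True)
    return -numpy.dot(cap, im.T)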
Example #8
def validate(opt, val_loader, model):

    # compute the encoding for all the validation images and captions
    with torch.no_grad():
        img_embs, cap_embs, cap_lens, freqs = encode_data(
            model, val_loader, opt.log_step, logging.info)

        img_embs = numpy.array(
            [img_embs[i] for i in range(0, len(img_embs), 1)])
        start = time.time()

        # find the similarity between every caption and image in the validation set?
        if opt.cross_attn == 't2i':
            sims, _ = shard_xattn_t2i(img_embs,
                                      cap_embs,
                                      cap_lens,
                                      freqs,
                                      opt,
                                      shard_size=opt.shard_size)
        elif opt.cross_attn == 'i2t':
            sims, _ = shard_xattn_i2t(img_embs,
                                      cap_embs,
                                      cap_lens,
                                      freqs,
                                      opt,
                                      shard_size=opt.shard_size)
        else:
            raise NotImplementedError
        end = time.time()
        print("calculate similarity time:", end - start)

        # caption retrieval (find the right text with every image)
        (r1, r5, r10, r20, r50, medr, meanr) = i2t(img_embs, cap_embs,
                                                   cap_lens, sims)
        logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f %.1f %.1f" %
                     (r1, r5, r10, r20, r50, medr, meanr))
        # image retrieval (find the right image for every text)
        (r1i, r5i, r10i, r20i, r50i, medri,
         meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
        logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f %.1f %.1f" %
                     (r1i, r5i, r10i, r20i, r50i, medri, meanri))
        # sum of recalls to be used for early stopping
        currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
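
The shard_xattn_t2i/shard_xattn_i2t helpers are not shown, but the shard_size argument points at the usual blocking pattern: build the full image-caption similarity matrix block by block so that no more than shard_size x shard_size pairs are scored at once. A generic sketch of that pattern (the sim_fn callback is an assumption standing in for the cross-attention scorer):

import numpy as np

def shard_sims(img_embs, cap_embs, sim_fn, shard_size=128):
    # fill the (n_images, n_captions) similarity matrix in square blocks
    n_im, n_cap = len(img_embs), len(cap_embs)
    sims = np.zeros((n_im, n_cap), dtype=np.float32)
    for i in range(0, n_im, shard_size):
        for j in range(0, n_cap, shard_size):
            sims[i:i + shard_size, j:j + shard_size] = sim_fn(
                img_embs[i:i + shard_size], cap_embs[j:j + shard_size])
    return sims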
Example #9
def evaluate(img_ids,
             img_embs,
             t_embs,
             measure='cosine',
             n_caption=2,
             val_metric='map',
             direction='t2i'):
    count = {}
    for iid in img_ids:
        if int(iid) not in count:
            count[int(iid)] = (1, 0)
        else:
            count[int(iid)] = (count[int(iid)][0] + 1, 0)
    img_mask, text_mask = [False for _ in img_ids], [True for _ in img_ids]
    for idx, iid in enumerate(img_ids):
        c, u = count[int(iid)]
        if c >= n_caption and u == 0:
            img_mask[idx] = True
            count[int(iid)] = (c, 1)
        elif c >= n_caption and u == 1:
            count[int(iid)] = (c, 2)
        else:
            text_mask[idx] = False

    img_ids = [x for idx, x in enumerate(img_ids) if img_mask[idx]]
    img_embs = img_embs[img_mask]
    t_embs = t_embs[text_mask]

    c2i_all_errors = evaluation.cal_error(img_embs, t_embs, measure)

    if val_metric == "recall":
        # meme retrieval
        (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors,
                                                         n_caption=n_caption)
        # caption retrieval
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors,
                                                    n_caption=n_caption)
    elif val_metric == "map":
        # meme retrieval
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption)
        # caption retrieval
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption)

    currscore = 0
    if val_metric == "recall":
        if direction == 'i2t' or direction == 'all':
            rsum = r1 + r5 + r10
            currscore += rsum
        if direction == 't2i' or direction == 'all':
            rsumi = r1i + r5i + r10i
            currscore += rsumi
    elif val_metric == "map":
        if direction == 'i2t' or direction == 'all':
            currscore += i2t_map_score
        if direction == 't2i' or direction == 'all':
            currscore += t2i_map_score

    return currscore
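
Example #9 relies on evaluation.t2i_map / evaluation.i2t_map, which are not shown. Assuming the convention used above (caption k belongs to image k // n_caption, and the error matrix is lower-is-better), text-to-image mAP with a single relevant image per caption reduces to mean reciprocal rank; a hedged sketch:

import numpy as np

def t2i_map_sketch(c2i_errors, n_caption=2):
    # c2i_errors[cap, img]: lower means a better match
    aps = []
    for k in range(c2i_errors.shape[0]):
        order = np.argsort(c2i_errors[k])               # best image first
        rank = np.where(order == k // n_caption)[0][0]  # rank of the true image
        aps.append(1.0 / (rank + 1))                    # AP with one relevant item
    return float(np.mean(aps))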
Example #10
    def valid_retrieval(self):
        '''The task is performed 5 times on 1000-image subsets of the test set
        and the results are averaged. Our best results are obtained with a
        different strategy: images are resized to 400x400 irrespective of
        their size and aspect ratio.'''
        mean_loss = 0
        embed_txts = []
        embed_imgs = []

        if self.init_from or (self.mode == 'val'):
            self.txt_enc, self.img_enc = self.restore_model(
                self.init_from, self.main_dir, self.model_name)

        self.txt_enc.eval()
        self.img_enc.eval()

        data_iter = iter(self.validloader)
        iters = len(data_iter)
        mean_loss = 0

        for j in range(iters):
            img, tokens, _ = next(data_iter)

            tokens = tokens.to(self.device)
            img = img.to(self.device)

            embed_txt = self.txt_enc(tokens)
            embed_img = self.img_enc(img)  #.type(torch.cuda.DoubleTensor)

            ## Compute loss.
            loss = self.hard_negative_loss(
                embed_img, embed_txt) + self.hard_negative_loss(
                    embed_txt, embed_img)

            mean_loss += loss
            embed_txts.extend(embed_txt)
            embed_imgs.extend(embed_img)

        mean_loss /= iters
        r = np.zeros(4)
        ri = np.zeros(4)

        for i in range(5):
            r += i2t(embed_imgs[1000 * i:1000 * (i + 1)],
                     embed_txts[1000 * i:1000 * (i + 1)])
            ri += t2i(embed_imgs[1000 * i:1000 * (i + 1)],
                      embed_txts[1000 * i:1000 * (i + 1)])

        r /= 5
        ri /= 5

        print("Image to text: %.1f, %.1f, %.1f, %.1f" %
              (r[0], r[1], r[2], r[3]))
        print("Text to image: %.1f, %.1f, %.1f, %.1f" %
              (ri[0], ri[1], ri[2], ri[3]))

        return r, ri, mean_loss
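
self.hard_negative_loss is called in both directions above but not defined in this excerpt. The name matches the VSE++ max-violation triplet loss, so a plausible sketch (assuming row-aligned, L2-normalized batches; the margin value is illustrative):

import torch

def hard_negative_loss(anchors, positives, margin=0.2):
    # (B, B) cosine similarities; the diagonal holds the matching pairs
    scores = anchors @ positives.t()
    diag = scores.diag().view(-1, 1)
    cost = (margin + scores - diag).clamp(min=0)  # hinge against every column
    cost.fill_diagonal_(0)                        # a pair is not its own negative
    return cost.max(dim=1)[0].sum()               # keep only the hardest negative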
Example #11
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, cap_lens = encode_data(model, val_loader, opt.log_step,
                                               logging.info)

    img_embs = numpy.array([img_embs[i] for i in range(0, len(img_embs), 5)])

    print("Img shape in validate:", img_embs.shape)

    start = time.time()
    if opt.cross_attn == 't2i':
        sims = shard_xattn_t2i(img_embs,
                               cap_embs,
                               cap_lens,
                               opt,
                               shard_size=128)
    elif opt.cross_attn == 'i2t':
        sims = shard_xattn_i2t(img_embs,
                               cap_embs,
                               cap_lens,
                               opt,
                               shard_size=128)
    else:
        raise NotImplementedError
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
Example #12
def validate(opt, val_loader, model, vocab):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(model, val_loader, opt.log_step,
                                     logger.info, vocab)
    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure='cosine')
    logger.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, measure='cosine')
    logger.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                (r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    return currscore
Example #13
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, cap_lens = encode_data(model, val_loader, opt.log_step,
                                               logging.info)

    # clear duplicate 5*images and keep 1*images
    img_embs = numpy.array([img_embs[i] for i in range(0, len(img_embs), 5)])

    # record computation time of validation
    start = time.time()
    sims = shard_attn_scores(model,
                             img_embs,
                             cap_embs,
                             cap_lens,
                             opt,
                             shard_size=100)
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))

    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    r_sum = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('r_sum', r_sum, step=model.Eiters)

    return r_sum
Example #14
def validate(opt, val_loader, model, num_offsets=10):
  # compute the encoding for all the validation images and captions
  img_seq_embs, cap_seq_embs = encode_eval_data(
    model, val_loader, opt.log_step, logging.info, num_offsets=num_offsets)

  for _offset in xrange(num_offsets):
    logging.info("Offset: %.1f" % _offset )

    # caption retrieval
    (seq_r1, seq_r5, seq_r10, seq_medr, seq_meanr) = i2t(
        img_seq_embs[_offset], cap_seq_embs[_offset], measure=opt.measure)
    logging.info("seq_Image to seq_text: %.1f, %.1f, %.1f, %.1f, %.1f" %
          (seq_r1, seq_r5, seq_r10, seq_medr, seq_meanr))
    # image retrieval
    (seq_r1i, seq_r5i, seq_r10i, seq_medri, seq_meanri) = t2i(
        img_seq_embs[_offset], cap_seq_embs[_offset], measure=opt.measure)
    logging.info("seq_Text to seq_image: %.1f, %.1f, %.1f, %.1f, %.1f" %
          (seq_r1i, seq_r5i, seq_r10i, seq_medri, seq_meanri))
Example #15
def validate(opt, val_loader, model, lang, n=5):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, val_loss = encode_data(model, val_loader, opt.log_step,
                                               logging.info)
    if lang in ['en', 'de']:
        n = 5
    else:
        n = 1
    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs,
                                     cap_embs,
                                     measure=opt.measure,
                                     n=n)
    logging.info(
        "%s Image to text: R@1 %.1f | R@5 %.1f | R@10 %.1f | Medr %.1f | Meanr %.1f"
        % (lang, r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs,
                                          cap_embs,
                                          measure=opt.measure,
                                          n=n)
    logging.info(
        "%s Text to image: R@1 %.1f | R@5 %.1f | R@10 %.1f | Medr %.1f | Meanr %.1f"
        % (lang, r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)
    tb_logger.log_value('valid', val_loss, step=model.Eiters)

    return currscore
Example #16
def validate(opt, val_loader, model, tb_logger):
    # compute the encoding for all the validation images and captions
    print("start validate")
    model.val_start()

    img_embs, cap_embs, cap_masks = encode_data(model, val_loader,
                                                opt.log_step, logging.info)

    # caption retrieval
    (i2t_r1, i2t_r5, i2t_r10, i2t_medr,
     i2t_meanr), (t2i_r1, t2i_r5, t2i_r10, t2i_medr,
                  t2i_meanr) = i2t(img_embs,
                                   cap_embs,
                                   cap_masks,
                                   measure=opt.measure,
                                   model=model)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (i2t_r1, i2t_r5, i2t_r10, i2t_medr, i2t_meanr))
    # image retrieval
    #(r1i, r5i, r10i, medri, meanr) = t2i(
    #    img_embs, cap_embs, measure=opt.measure, model=model)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (t2i_r1, t2i_r5, t2i_r10, t2i_medr, t2i_meanr))
    # sum of recalls to be used for early stopping
    currscore = i2t_r1 + i2t_r5 + i2t_r10 + t2i_r1 + t2i_r5 + t2i_r10

    # record metrics in tensorboard
    tb_logger.log_value('i2t_r1', i2t_r1, step=model.Eiters)
    tb_logger.log_value('i2t_r5', i2t_r5, step=model.Eiters)
    tb_logger.log_value('i2t_r10', i2t_r10, step=model.Eiters)
    tb_logger.log_value('i2t_medr', i2t_medr, step=model.Eiters)
    tb_logger.log_value('i2t_meanr', i2t_meanr, step=model.Eiters)
    tb_logger.log_value('t2i_r1', t2i_r1, step=model.Eiters)
    tb_logger.log_value('t2i_r5', t2i_r5, step=model.Eiters)
    tb_logger.log_value('t2i_r10', t2i_r10, step=model.Eiters)
    tb_logger.log_value('t2i_medr', t2i_medr, step=model.Eiters)
    tb_logger.log_value('t2i_meanr', t2i_meanr, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
Example #17
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    start = time.time()
    img_embs, cap_embs, cap_lens = encode_data(model, val_loader, opt,
                                               opt.log_step, logging.info)
    end = time.time()
    print("calculate backbone time:", end - start)

    img_embs = numpy.array([img_embs[i] for i in range(0, len(img_embs), 5)])

    start = time.time()
    sims = 1 - cdist(img_embs, cap_embs, metric='cosine')
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
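
A note on the similarity step above: 1 - cdist(a, b, metric='cosine') is exactly the cosine-similarity matrix, so when the encoder already L2-normalizes its outputs the same result comes from a single matrix product:

import numpy as np
from scipy.spatial.distance import cdist

a, b = np.random.randn(4, 8), np.random.randn(6, 8)
an = a / np.linalg.norm(a, axis=1, keepdims=True)
bn = b / np.linalg.norm(b, axis=1, keepdims=True)
assert np.allclose(1 - cdist(a, b, metric='cosine'), an @ bn.T)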
Example #18
def trainer(
        data='coco',  #f8k, f30k, coco
        margin=0.2,
        dim=1024,
        dim_image=4096,
        dim_word=300,
        encoder='gru',  # gru OR bow
        max_epochs=15,
        dispFreq=10,
        decay_c=0.,
        grad_clip=2.,
        maxlen_w=100,
        optimizer='adam',
        batch_size=128,
        saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz',
        validFreq=100,
        lrate=0.0002,
        reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    print(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print('reloading...' + saveto)
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print('Loading dataset')
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print('Creating dictionary')
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print('Dictionary size: ' + str(n_words))
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    # before any regularizer
    print('Building f_log_probs...', )
    f_log_probs = theano.function(inps, cost, profile=False)
    print('Done')

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print('Building f_cost...', )
    f_cost = theano.function(inps, cost, profile=False)
    print('Done')

    print('Building sentence encoder')
    trng, inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print('Building image encoder')
    trng, inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print('Building f_grad...', )
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t**2).sum()
                                         for k, t in tparams.iteritems()],
                                    profile=False)

    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (grad_clip**2),
                              g / tensor.sqrt(g2) * grad_clip, g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print('Building optimizers...', )
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print('Optimization')

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    curr = 0.
    n_samples = 0

    for eidx in xrange(max_epochs):

        print('Epoch ', eidx)

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x, mask, im = homogeneous_data.prepare_data(x,
                                                        im,
                                                        worddict,
                                                        maxlen=maxlen_w,
                                                        n_words=n_words)

            if x is None:
                print('Minibatch with zero sample under length ', maxlen_w)
                uidx -= 1
                continue

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ',
                      ud)

            if numpy.mod(uidx, validFreq) == 0:

                print('Computing results...')
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['f_senc'] = f_senc
                curr_model['f_ienc'] = f_ienc

                ls = encode_sentences(curr_model, dev[0])
                lim = encode_images(curr_model, dev[1])

                (r1, r5, r10, medr) = i2t(lim, ls)
                print("Image to text: %.1f, %.1f, %.1f, %.1f" %
                      (r1, r5, r10, medr))
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print("Text to image: %.1f, %.1f, %.1f, %.1f" %
                      (r1i, r5i, r10i, medri))

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print('Saving...', )
                    params = unzip(tparams)
                    numpy.savez(saveto, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print('Done')

        print('Seen %d samples' % n_samples)
Example #19
def test(test_loader,
         model,
         tb_logger,
         measure='cosine',
         log_step=10,
         ndcg_scorer=None):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(model, test_loader, log_step,
                                     logging.info)

    if measure == 'cosine':
        sim_fn = cosine_sim
    elif measure == 'dot':
        sim_fn = dot_sim

    results = []
    for i in range(5):
        r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                     cap_embs[i * 5000:(i + 1) * 5000],
                     None,
                     None,
                     return_ranks=True,
                     ndcg_scorer=ndcg_scorer,
                     fold_index=i)
        print(
            "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f"
            % r)
        ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                       cap_embs[i * 5000:(i + 1) * 5000],
                       None,
                       None,
                       return_ranks=True,
                       ndcg_scorer=ndcg_scorer,
                       fold_index=i)
        if i == 0:
            rt, rti = rt0, rti0
        print(
            "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f"
            % ri)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
        results += [list(r) + list(ri) + [ar, ari, rsum]]

    print("-----------------------------------")
    print("Mean metrics: ")
    mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
    print("rsum: %.1f" % (mean_metrics[16] * 6))
    print("Average i2t Recall: %.1f" % mean_metrics[14])
    print(
        "Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f"
        % mean_metrics[:7])
    print("Average t2i Recall: %.1f" % mean_metrics[15])
    print(
        "Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f"
        % mean_metrics[7:14])

    # record metrics in tensorboard
    tb_logger.add_scalar('test/r1', mean_metrics[0], model.Eiters)
    tb_logger.add_scalar('test/r5', mean_metrics[1], model.Eiters)
    tb_logger.add_scalar('test/r10', mean_metrics[2], model.Eiters)
    tb_logger.add_scalars('test/mean_ndcg', {
        'rougeL': mean_metrics[5],
        'spice': mean_metrics[6]
    }, model.Eiters)
    tb_logger.add_scalar('test/r1i', mean_metrics[7], model.Eiters)
    tb_logger.add_scalar('test/r5i', mean_metrics[8], model.Eiters)
    tb_logger.add_scalar('test/r10i', mean_metrics[9], model.Eiters)
    tb_logger.add_scalars('test/mean_ndcg_i', {
        'rougeL': mean_metrics[12],
        'spice': mean_metrics[13]
    }, model.Eiters)
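
Example #19 selects cosine_sim or dot_sim from the measure argument (in this excerpt sim_fn is presumably consumed inside the i2t/t2i helpers). Minimal definitions consistent with that usage, offered as an assumption rather than the project's actual code:

import numpy as np

def dot_sim(im, s):
    # plain inner product between image and sentence embeddings
    return np.dot(im, s.T)

def cosine_sim(im, s):
    # inner product on L2-normalized rows, i.e. cosine similarity
    im = im / np.linalg.norm(im, axis=1, keepdims=True)
    s = s / np.linalg.norm(s, axis=1, keepdims=True)
    return np.dot(im, s.T)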
Example #20
def trainer(load_from=None, save_dir='snapshots', name='anon', **kwargs):
    """
    :param load_from: location to load parameters + options from
    :param name: name of model, used as location to save parameters + options
    """

    curr_model = dict()

    # load old model, including parameters, but overwrite with new options
    if load_from:
        print 'reloading...' + load_from
        with open('%s.pkl' % load_from, 'rb') as f:
            curr_model = pkl.load(f)
    else:
        curr_model['options'] = {}

    for k, v in kwargs.iteritems():
        curr_model['options'][k] = v

    model_options = curr_model['options']

    # initialize logger
    import datetime
    timestampedName = datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '_' + name

    from logger import Log
    log = Log(name=timestampedName,
              hyperparams=model_options,
              saveDir='vis/training',
              xLabel='Examples Seen',
              saveFrequency=1)

    print curr_model['options']

    # Load training and development sets
    print 'Loading dataset'
    dataset = load_dataset(model_options['data'],
                           cnn=model_options['cnn'],
                           load_train=True)
    train = dataset['train']
    dev = dataset['dev']

    # Create dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train['caps'] + dev['caps'])
    print 'Dictionary size: ' + str(len(worddict))
    curr_model['worddict'] = worddict
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    pkl.dump(curr_model, open('%s/%s.pkl' % (save_dir, name), 'wb'))

    print 'Loading data'
    train_iter = datasource.Datasource(train,
                                       batch_size=model_options['batch_size'],
                                       worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if load_from is not None and os.path.exists(load_from):
        params = load_params(load_from, params)

    tparams = init_tparams(params)

    inps, cost = build_model(tparams, model_options)

    print 'Building sentence encoder'
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print 'Building errors..'
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model['f_senc'] = f_senc
    curr_model['f_ienc'] = f_ienc
    curr_model['f_err'] = f_err

    if model_options['grad_clip'] > 0.:
        grads = [maxnorm(g, model_options['grad_clip']) for g in grads]

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams,
                                                               grads, inps,
                                                               cost)

    print 'Optimization'

    uidx = 0
    curr = 0
    n_samples = 0

    for eidx in xrange(model_options['max_epochs']):

        print 'Epoch ', eidx

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options['lrate'])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, model_options['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                log.update({'Error': float(cost)}, n_samples)

            if numpy.mod(uidx, model_options['validFreq']) == 0:

                print 'Computing results...'

                # encode sentences efficiently
                dev_s = encode_sentences(
                    curr_model,
                    dev_caps,
                    batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(dev_errs,
                                                              vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1, r5, r10, medr, meanr)
                log.update(
                    {
                        'R@1': r1,
                        'R@5': r5,
                        'R@10': r10,
                        'median_rank': medr,
                        'mean_rank': meanr
                    }, n_samples)
                print "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1i, r5i, r10i, medri, meanri)
                log.update(
                    {
                        'Image2Caption_R@1': r1i,
                        'Image2Caption_R@5': r5i,
                        'Image2Caption_R@10': r10i,
                        'Image2Caption_median_rank': medri,
                        'Image2Caption_mean_rank': meanri
                    }, n_samples)

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot
                    # Save parameters
                    print 'Saving...',
                    numpy.savez('%s/%s' % (save_dir, name), **unzip(tparams))
                    print 'Done'
                    vis_details['hyperparams'] = model_options
                    # Save visualization details
                    with open(
                            'vis/roc/%s/%s.json' %
                        (model_options['data'], timestampedName), 'w') as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    try:
                        index = json.load(open('vis/roc/index.json', 'r'))
                    except IOError:
                        index = {model_options['data']: []}

                    models = index[model_options['data']]
                    if timestampedName not in models:
                        models.append(timestampedName)

                    with open('vis/roc/index.json', 'w') as f:
                        json.dump(index, f)

        print 'Seen %d samples' % n_samples
Example #21
def trainer(load_from=None, save_dir="snapshots", name="anon", **kwargs):
    """
    :param load_from: location to load parameters + options from
    :param name: name of model, used as location to save parameters + options
    """

    curr_model = dict()

    # load old model, including parameters, but overwrite with new options
    if load_from:
        print "reloading..." + load_from
        with open("%s.pkl" % load_from, "rb") as f:
            curr_model = pkl.load(f)
    else:
        curr_model["options"] = {}

    for k, v in kwargs.iteritems():
        curr_model["options"][k] = v

    model_options = curr_model["options"]

    # initialize logger
    import datetime

    timestampedName = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_" + name

    from logger import Log

    log = Log(
        name=timestampedName, hyperparams=model_options, saveDir="vis/training", xLabel="Examples Seen", saveFrequency=1
    )

    print curr_model["options"]

    # Load training and development sets
    print "Loading dataset"
    dataset = load_dataset(model_options["data"], cnn=model_options["cnn"], load_train=True)
    train = dataset["train"]
    dev = dataset["dev"]

    # Create dictionary
    print "Creating dictionary"
    worddict = build_dictionary(train["caps"] + dev["caps"])
    print "Dictionary size: " + str(len(worddict))
    curr_model["worddict"] = worddict
    curr_model["options"]["n_words"] = len(worddict) + 2

    # save model
    pkl.dump(curr_model, open("%s/%s.pkl" % (save_dir, name), "wb"))

    print "Loading data"
    train_iter = datasource.Datasource(train, batch_size=model_options["batch_size"], worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print "Building model"
    params = init_params(model_options)
    # reload parameters
    if load_from is not None and os.path.exists(load_from):
        params = load_params(load_from, params)

    tparams = init_tparams(params)

    inps, cost = build_model(tparams, model_options)

    print "Building sentence encoder"
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print "Building image encoder"
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print "Building f_grad...",
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print "Building errors.."
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model["f_senc"] = f_senc
    curr_model["f_ienc"] = f_ienc
    curr_model["f_err"] = f_err

    if model_options["grad_clip"] > 0.0:
        grads = [maxnorm(g, model_options["grad_clip"]) for g in grads]

    lr = tensor.scalar(name="lr")
    print "Building optimizers...",
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options["optimizer"])(lr, tparams, grads, inps, cost)

    print "Optimization"

    uidx = 0
    curr = 0
    n_samples = 0

    for eidx in xrange(model_options["max_epochs"]):

        print "Epoch ", eidx

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options["lrate"])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print "NaN detected"
                return 1.0, 1.0, 1.0

            if numpy.mod(uidx, model_options["dispFreq"]) == 0:
                print "Epoch ", eidx, "Update ", uidx, "Cost ", cost, "UD ", ud
                log.update({"Error": float(cost)}, n_samples)

            if numpy.mod(uidx, model_options["validFreq"]) == 0:

                print "Computing results..."

                # encode sentences efficiently
                dev_s = encode_sentences(curr_model, dev_caps, batch_size=model_options["batch_size"])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(dev_errs, vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr)
                log.update({"R@1": r1, "R@5": r5, "R@10": r10, "median_rank": medr, "mean_rank": meanr}, n_samples)
                print "Image to text (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri)
                log.update(
                    {
                        "Image2Caption_R@1": r1i,
                        "Image2Caption_R@5": r5i,
                        "Image2CaptionR@10": r10i,
                        "Image2Caption_median_rank": medri,
                        "Image2Caption_mean_rank": meanri,
                    },
                    n_samples,
                )

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot
                    # Save parameters
                    print "Saving...",
                    numpy.savez("%s/%s" % (save_dir, name), **unzip(tparams))
                    print "Done"
                    vis_details["hyperparams"] = model_options
                    # Save visualization details
                    with open("vis/roc/%s/%s.json" % (model_options["data"], timestampedName), "w") as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    try:
                        index = json.load(open("vis/roc/index.json", "r"))
                    except IOError:
                        index = {model_options["data"]: []}

                    models = index[model_options["data"]]
                    if timestampedName not in models:
                        models.append(timestampedName)

                    with open("vis/roc/index.json", "w") as f:
                        json.dump(index, f)

        print "Seen %d samples" % n_samples
Example #22
def validate_split(opt,
                   vid_data_loader,
                   text_data_loader,
                   model,
                   measure='cosine'):
    # compute the encoding for all the validation video and captions

    model.val_start()
    video_embs, video_ids = evaluation.encode_text_or_vid(
        model.embed_vis, vid_data_loader)
    cap_embs, caption_ids = evaluation.encode_text_or_vid(
        model.embed_txt, text_data_loader)

    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, measure)
    if opt.val_metric == "recall":

        # video retrieval
        if opt.testCollection.startswith('msvd'):
            (r1i, r5i, r10i, medri, meanri,
             t2i_map_score) = evaluation.t2i_varied(c2i_all_errors,
                                                    caption_ids, video_ids)
        else:
            (r1i, r5i, r10i, medri,
             meanri) = evaluation.t2i(c2i_all_errors, n_caption=opt.n_caption)
        print(" * Text to video:")
        print(" * r_1_5_10: {}".format(
            [round(r1i, 3), round(r5i, 3),
             round(r10i, 3)]))
        print(" * medr, meanr: {}".format([round(medri, 3), round(meanri, 3)]))
        print(" * " + '-' * 10)

        # caption retrieval
        if opt.testCollection.startswith('msvd'):
            (r1, r5, r10, medr, meanr,
             i2t_map_score) = evaluation.i2t_varied(c2i_all_errors,
                                                    caption_ids, video_ids)
        else:
            (r1, r5, r10, medr,
             meanr) = evaluation.i2t(c2i_all_errors, n_caption=opt.n_caption)
        print(" * Video to text:")
        print(" * r_1_5_10: {}".format(
            [round(r1, 3), round(r5, 3),
             round(r10, 3)]))
        print(" * medr, meanr: {}".format([round(medr, 3), round(meanr, 3)]))
        print(" * " + '-' * 10)

        # record metrics in tensorboard
        tb_logger.log_value('r1', r1, step=model.Eiters)
        tb_logger.log_value('r5', r5, step=model.Eiters)
        tb_logger.log_value('r10', r10, step=model.Eiters)
        tb_logger.log_value('medr', medr, step=model.Eiters)
        tb_logger.log_value('meanr', meanr, step=model.Eiters)
        tb_logger.log_value('r1i', r1i, step=model.Eiters)
        tb_logger.log_value('r5i', r5i, step=model.Eiters)
        tb_logger.log_value('r10i', r10i, step=model.Eiters)
        tb_logger.log_value('medri', medri, step=model.Eiters)
        tb_logger.log_value('meanri', meanri, step=model.Eiters)

    elif opt.val_metric == "map":
        i2t_map_score = evaluation.i2t_map(c2i_all_errors,
                                           n_caption=opt.n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors,
                                           n_caption=opt.n_caption)
        tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters)
        tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters)
        print('i2t_map', i2t_map_score)
        print('t2i_map', t2i_map_score)

    currscore = 0
    if opt.val_metric == "recall":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += (r1 + r5 + r10)
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += (r1i + r5i + r10i)
    elif opt.val_metric == "map":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += i2t_map_score
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += t2i_map_score

    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
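The i2t/t2i helpers these examples import follow the standard ranking recipe for Recall@K plus median and mean rank. A minimal self-contained sketch of that recipe, assuming cosine similarity and a one-to-one pairing between rows of the two embedding matrices (the real helpers also handle several captions per item and other measures):

import numpy as np

def recall_metrics(query_embs, target_embs, ks=(1, 5, 10)):
    # Cosine similarity via dot products of L2-normalized rows.
    q = query_embs / np.linalg.norm(query_embs, axis=1, keepdims=True)
    t = target_embs / np.linalg.norm(target_embs, axis=1, keepdims=True)
    sims = q.dot(t.T)  # higher is better
    # Rank of the matching target (same row index) for each query.
    order = np.argsort(-sims, axis=1)
    ranks = np.array([int(np.where(order[i] == i)[0][0])
                      for i in range(len(order))])
    recalls = [100.0 * np.mean(ranks < k) for k in ks]
    return (*recalls, np.median(ranks) + 1, ranks.mean() + 1)

emb_a = np.random.randn(100, 64)
emb_b = emb_a + 0.1 * np.random.randn(100, 64)  # noisy paired embeddings
r1, r5, r10, medr, meanr = recall_metrics(emb_a, emb_b)
print("%.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr))
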
Ejemplo n.º 23
0
def trainer(data='f30k',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            max_epochs=15,
            encoder='lstm',
            dispFreq=10,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/f30K',
            validFreq=100,
            early_stop=20,
            lrate=1e-3,
            reload_=False):
    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    logging.info(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        logging.info('reloading...' + saveto)
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    logging.info('loading dataset')
    titles, album_ims, artist, genre = load_dataset(data)
    artist_string = artist
    genre_string = genre

    # Create and save dictionary
    if os.path.exists('%s.dictionary.pkl' % saveto):
        logging.info('loading dict from...' + saveto)
        with open('%s.dictionary.pkl' % saveto, 'rb') as wdict:
            worddict = pkl.load(wdict)
        n_words = len(worddict)
        model_options['n_words'] = n_words
        logging.info('Dictionary size: ' + str(n_words))
    else:

        logging.info('Create dictionary')
        worddict = build_dictionary(titles + artist + genre)[0]
        n_words = len(worddict)
        model_options['n_words'] = n_words
        logging.info('Dictionary words: ' + str(n_words))
        with open('%s.dictionary.pkl' % saveto, 'wb') as f:
            pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.items():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'
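    # Indices 0 and 1 are reserved for the end-of-sentence and unknown-word
    # tokens, so the inverse mapping overwrites them explicitly.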

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData(
        [titles, album_ims, artist, genre],
        batch_size=batch_size,
        maxlen=maxlen_w)

    img_sen_model = Img_Sen_Artist_Ranking(model_options)
    # TODO: load a previously saved model state dict if one exists
    if os.path.exists('%s_model_%s.pkl' % (saveto, encoder)):
        logging.info('Loading model...')
        # pkl.dump(model_options, open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
        img_sen_model.load_state_dict(
            torch.load('%s_model_%s.pkl' % (saveto, encoder)))
        logging.info('Done')
    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin).cuda()

    # materialize the filter so params can be reused for gradient clipping below
    params = list(filter(lambda p: p.requires_grad, img_sen_model.parameters()))
    optimizer = torch.optim.Adam(params, lr=lrate)
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=0.1,
                                  patience=40,
                                  mode='min',
                                  verbose=True,
                                  threshold=1e-8)

    uidx = 0
    curr = 0.0
    n_samples = 0

    # For Early-stopping
    best_r1, best_r5, best_r10, best_medr = 0.0, 0.0, 0.0, 0
    best_step = 0

    writer = SummaryWriter()
    for eidx in range(max_epochs):

        for x, im, artist, genre in train_iter:
            n_samples += len(x)
            uidx += 1

            x, im, artist, genre = homogeneous_data.prepare_data(
                x,
                im,
                artist,
                genre,
                worddict,
                maxlen=maxlen_w,
                n_words=n_words)

            if x is None:
                logging.info('Minibatch with zero sample under length %d',
                             maxlen_w)
                uidx -= 1
                continue

            x = Variable(torch.from_numpy(x).cuda())
            im = Variable(torch.from_numpy(im).cuda())
            artist = Variable(torch.from_numpy(artist).cuda())
            genre = Variable(torch.from_numpy(genre).cuda())
            # Update
            x1, im1, artist, genre = img_sen_model(x, im, artist, genre)

            # run validation on the input before the trainer sees it
            if numpy.mod(uidx, validFreq) == 0:
                img_sen_model.eval()
                with torch.no_grad():
                    print('Epoch ', eidx, '\tUpdate@ ', uidx, '\tCost ',
                          cost.data.item())
                    writer.add_scalar('Evaluation/Validation_Loss',
                                      cost.data.item(), uidx)
                    (r1, r5, r10, medr) = i2t(im1, x1)  # use the embedded captions, not raw indices
                    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f" %
                                 (r1, r5, r10, medr))

                    (r1g, r5g, r10g, medrg) = i2t(im1, genre)
                    logging.info("Image to genre: %.1f, %.1f, %.1f, %.1f" %
                                 (r1g, r5g, r10g, medrg))

                    (r1a, r5a, r10a, medra) = i2t(im1, artist)
                    logging.info("Image to Artist: %.1f, %.1f, %.1f, %.1f" %
                                 (r1a, r5a, r10a, medra))

                    logging.info("Cal Recall@K ")
                    writer.add_scalars('Validation Recall/Image2Album', {
                        'r@1': r1,
                        'r@5': r5,
                        'r@10': r10
                    }, uidx)

                    writer.add_scalars('Validation Recall/Image2Genres', {
                        'r@1': r1g,
                        'r@5': r5g,
                        'r@10': r10g
                    }, uidx)

                    writer.add_scalars('Validation Recall/Image2Artist', {
                        'r@1': r1a,
                        'r@5': r5a,
                        'r@10': r10a
                    }, uidx)

                    curr_step = uidx // validFreq

                    currscore = r1 + r5 + r10 + r1a + r5a + r10a + r1g + r5g + r10g - medr - medrg - medra
                    if currscore > curr:
                        curr = currscore
                        best_r1, best_r5, best_r10, best_medr = r1, r5, r10, medr
                        best_r1g, best_r5g, best_r10g, best_medrg = r1g, r5g, r10g, medrg
                        best_step = curr_step

                        # Save model
                        logging.info('Saving model...')
                        pkl.dump(
                            model_options,
                            open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                        torch.save(img_sen_model.state_dict(),
                                   '%s_model_%s.pkl' % (saveto, encoder))
                        logging.info('Done')

                    if curr_step - best_step > early_stop:
                        logging.info('early stopping, jumping now...')
                        logging.info("Image to text: %.1f, %.1f, %.1f, %.1f" %
                                     (best_r1, best_r5, best_r10, best_medr))
                        logging.info(
                            "Image to genre: %.1f, %.1f, %.1f, %.1f" %
                            (best_r1g, best_r5g, best_r10g, best_medrg))

                        # instead of returning, lower the learning rate and continue
                        lrate = 1e-4
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lrate

            img_sen_model.train()
            cost = loss_fn(im1, x1, artist, genre)
            writer.add_scalar('Evaluation/training_Loss', cost, uidx)

            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm_(params, grad_clip)

            optimizer.step()
            scheduler.step(cost.data.item())

        #scheduler.step(cost.data.item())
        logging.info('Seen %d samples' % n_samples)
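The snippet above calls scheduler.step(cost) on every minibatch with patience=40; ReduceLROnPlateau is more commonly driven once per validation round by a validation metric. A runnable sketch of that conventional usage (the linear model and the constant loss are toy stand-ins, not from this project):

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5, mode='min')

for epoch in range(30):
    # ... one epoch of training would go here ...
    val_loss = 1.0  # toy stand-in: a validation loss that has plateaued
    scheduler.step(val_loss)  # step once per validation round, not per batch
    print(epoch, optimizer.param_groups[0]['lr'])
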
Ejemplo n.º 24
0
def main():
    # parse options
    parser = TrainOptions()
    opts = parser.parse()

    # data loader
    print('\n--- load dataset ---')
    vocab = pickle.load(
        open(os.path.join(opts.vocab_path, '%s_vocab.pkl' % opts.data_name),
             'rb'))
    vocab_size = len(vocab)
    opts.vocab_size = vocab_size
    torch.backends.cudnn.enabled = False
    # Load data loaders
    train_loader, val_loader = data.get_loaders(opts.data_name, vocab,
                                                opts.crop_size,
                                                opts.batch_size, opts.workers,
                                                opts)
    test_loader = data.get_test_loader('test', opts.data_name, vocab,
                                       opts.crop_size, opts.batch_size,
                                       opts.workers, opts)
    # model
    print('\n--- load subspace ---')
    subspace = model_2.VSE(opts)
    subspace.setgpu()
    print('\n--- load model ---')
    model = DRIT(opts)
    model.setgpu(opts.gpu)
    if opts.resume is None:  # no model has been saved previously
        model.initialize()
        ep0 = -1
        total_it = 0
    else:
        ep0, total_it = model.resume(opts.resume)
    model.set_scheduler(opts, last_ep=ep0)
    ep0 += 1
    print('start the training at epoch %d' % (ep0))

    # saver for display and output
    saver = Saver(opts)

    # train
    print('\n--- train ---')
    max_it = 500000
    score = 0.0
    subspace.train_start()
    for ep in range(ep0, opts.pre_iter):
        print('-----ep:{} --------'.format(ep))
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()

            img, cap = subspace.train_emb(images,
                                          captions,
                                          lengths,
                                          ids,
                                          pre=True)  #[b,1024]

            subspace.pre_optimizer.zero_grad()
            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)

            model.pretrain_ae(img, cap)

            if opts.grad_clip > 0:
                clip_grad_norm(subspace.params, opts.grad_clip)

            subspace.pre_optimizer.step()

    for ep in range(ep0, opts.n_ep):
        subspace.train_start()
        adjust_learning_rate(opts, subspace.optimizer, ep)
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()

            img, cap = subspace.train_emb(images, captions, lengths,
                                          ids)  #[b,1024]

            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)

            subspace.optimizer.zero_grad()

            for p in model.disA.parameters():
                p.requires_grad = True
            for p in model.disB.parameters():
                p.requires_grad = True
            for p in model.disA_attr.parameters():
                p.requires_grad = True
            for p in model.disB_attr.parameters():
                p.requires_grad = True

            for i in range(opts.niters_gan_d):  #5
                model.update_D(img, cap)

            for p in model.disA.parameters():
                p.requires_grad = False
            for p in model.disB.parameters():
                p.requires_grad = False
            for p in model.disA_attr.parameters():
                p.requires_grad = False
            for p in model.disB_attr.parameters():
                p.requires_grad = False

            for i in range(opts.niters_gan_enc):
                model.update_E(img, cap)  # use the new content loss function

            subspace.optimizer.step()

            print('total_it: %d (ep %d, it %d), lr %09f' %
                  (total_it, ep, it, model.gen_opt.param_groups[0]['lr']))
            total_it += 1

        # decay learning rate
        if opts.n_ep_decay > -1:
            model.update_lr()

        # save result image
        #saver.write_img(ep, model)
        if (ep + 1) % opts.n_ep == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/final_model.pth' % (filename), ep, total_it)
            torch.save(subspace.state_dict(),
                       '%s/final_subspace.pth' % (filename))
        elif (ep + 1) % 10 == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/%s_model.pth' % (filename, str(ep + 1)), ep,
                       total_it)
            torch.save(subspace.state_dict(),
                       '%s/%s_subspace.pth' % (filename, str(ep + 1)))

        if (ep + 1) % opts.model_save_freq == 0:
            a = None
            b = None
            c = None
            d = None
            subspace.val_start()
            for it, (images, captions, lengths, ids) in enumerate(test_loader):
                if it >= opts.val_iter:
                    break
                images = images.cuda(opts.gpu).detach()
                captions = captions.cuda(opts.gpu).detach()

                img_emb, cap_emb = subspace.forward_emb(images,
                                                        captions,
                                                        lengths,
                                                        volatile=True)

                img = img_emb.view(images.size(0), -1, 32, 32)
                cap = cap_emb.view(images.size(0), -1, 32, 32)
                image1, text1 = model.test_model2(img, cap)
                img2 = image1.view(images.size(0), -1)
                cap2 = text1.view(images.size(0), -1)

                if a is None:
                    a = np.zeros(
                        (opts.val_iter * opts.batch_size, img_emb.size(1)))
                    b = np.zeros(
                        (opts.val_iter * opts.batch_size, cap_emb.size(1)))

                    c = np.zeros(
                        (opts.val_iter * opts.batch_size, img2.size(1)))
                    d = np.zeros(
                        (opts.val_iter * opts.batch_size, cap2.size(1)))

                a[ids] = img_emb.data.cpu().numpy().copy()
                b[ids] = cap_emb.data.cpu().numpy().copy()

                c[ids] = img2.data.cpu().numpy().copy()
                d[ids] = cap2.data.cpu().numpy().copy()

            aa = torch.from_numpy(a)
            bb = torch.from_numpy(b)

            cc = torch.from_numpy(c)
            dd = torch.from_numpy(d)

            (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medr, r1, r5, r10))

            (r1i, r5i, r10i, medri, meanr) = t2i(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medri, r1i, r5i, r10i))

            (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1, r2, r3, r4))

            (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1i, r2i, r3i, r4i))

            curr = r2 + r3 + r4 + r2i + r3i + r4i

            if curr > score:
                score = curr
                print('save model')
                filename = os.path.join(opts.result_dir, opts.name)
                model.save('%s/best_model.pth' % (filename), ep, total_it)
                torch.save(subspace.state_dict(),
                           '%s/subspace.pth' % (filename))

            a = None
            b = None
            c = None
            d = None

            for it, (images, captions, lengths, ids) in enumerate(test_loader):

                images = images.cuda(opts.gpu).detach()
                captions = captions.cuda(opts.gpu).detach()

                img_emb, cap_emb = subspace.forward_emb(images,
                                                        captions,
                                                        lengths,
                                                        volatile=True)

                img = img_emb.view(images.size(0), -1, 32, 32)
                cap = cap_emb.view(images.size(0), -1, 32, 32)
                image1, text1 = model.test_model2(img, cap)
                img2 = image1.view(images.size(0), -1)
                cap2 = text1.view(images.size(0), -1)

                if a is None:
                    a = np.zeros((len(test_loader.dataset), img_emb.size(1)))
                    b = np.zeros((len(test_loader.dataset), cap_emb.size(1)))

                    c = np.zeros((len(test_loader.dataset), img2.size(1)))
                    d = np.zeros((len(test_loader.dataset), cap2.size(1)))

                a[ids] = img_emb.data.cpu().numpy().copy()
                b[ids] = cap_emb.data.cpu().numpy().copy()

                c[ids] = img2.data.cpu().numpy().copy()
                d[ids] = cap2.data.cpu().numpy().copy()

            aa = torch.from_numpy(a)
            bb = torch.from_numpy(b)

            cc = torch.from_numpy(c)
            dd = torch.from_numpy(d)

            (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
            print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medr, r1, r5, r10))

            (r1i, r5i, r10i, medri, meanr) = t2i(aa, bb, measure=opts.measure)
            print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medri, r1i, r5i, r10i))

            (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
            print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1, r2, r3, r4))

            (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
            print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1i, r2i, r3i, r4i))

    return
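Both evaluation loops in this example scatter each batch into preallocated numpy arrays with a[ids] = ..., which keeps rows aligned with dataset order even if the loader shuffles. A self-contained illustration of the pattern:

import numpy as np

dataset_size, dim = 6, 4
a = np.zeros((dataset_size, dim))
# Batches arrive in arbitrary order, each with the dataset indices it covers.
for ids, batch in [((4, 0, 2), np.ones((3, dim))),
                   ((5, 1, 3), 2 * np.ones((3, dim)))]:
    a[list(ids)] = batch  # scatter rows back to their dataset positions
print(a[:, 0])  # -> [1. 2. 1. 2. 1. 2.]
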
Ejemplo n.º 25
0
        c = np.zeros((opts.batch_size * opts.test_iter, img2.size(1)))
        d = np.zeros((opts.batch_size * opts.test_iter, cap2.size(1)))

    a[ids] = img_emb.data.cpu().numpy().copy()
    b[ids] = cap_emb.data.cpu().numpy().copy()

    c[ids] = img2.data.cpu().numpy().copy()
    d[ids] = cap2.data.cpu().numpy().copy()

aa = torch.from_numpy(a)
bb = torch.from_numpy(b)

cc = torch.from_numpy(c)
dd = torch.from_numpy(d)

#print(image1.size())
(r1, r5, r10, medr, meanr) = i2t(aa, bb)
print('subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medr, r1, r5, r10))

(r1i, r5i, r10i, medri, meanri) = t2i(aa, bb)
print('subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medri, r1i, r5i, r10i))

(r2, r3, r4, m1, m2) = i2t(cc, dd)
print('encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1, r2, r3, r4))

(r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd)
print('encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1i, r2i, r3i, r4i))
Ejemplo n.º 26
0
    def train_matching_gan(self):
        margin_ranking_loss = nn.MarginRankingLoss(self.margin)

        for epoch in range(self.epochs):
            for sample in self.match_data_loader:
                images = sample['images']
                sentences = sample['sentences']
                unmatched_images = sample['unmatched_images']
                unmatched_sentences = sample['unmatched_sentences']

                images = torch.tensor(images, requires_grad=False).cuda()
                sentences = torch.tensor(sentences, requires_grad=False).cuda()
                unmatched_images = torch.tensor(unmatched_images,
                                                requires_grad=False).cuda()
                unmatched_sentences = torch.tensor(unmatched_sentences,
                                                   requires_grad=False).cuda()

                # Update the discriminator
                self.match_discriminator_optimizer.zero_grad()

                fake_images = self.image_generator(sentences)
                if self.arguments['use_sentence_generator']:
                    # fake_sentences = self.sentence_generator(images)
                    with torch.no_grad():
                        image_features = self.downsapmle_block(images)
                        fake_sentences = self.sentence_decoder_block.sample(
                            image_features)

                        fake_sentences = convert_indexes2sentence(
                            self.arguments['idx2word'], fake_sentences)
                        xs = []
                        for sentence in fake_sentences:
                            sentence = [
                                self.arguments['word2idx'][w]
                                if self.arguments['word2idx'][w] <
                                self.arguments['word_number'] else 1
                                for w in sentence.split()
                            ]
                            x = np.zeros(
                                self.arguments['sentence_max_length']).astype(
                                    'int64')
                            if len(sentence
                                   ) < self.arguments['sentence_max_length']:
                                x[:len(sentence)] = sentence
                            else:
                                x[:] = sentence[:self.arguments[
                                    'sentence_max_length']]
                            xs.append(x)
                        fake_sentences = np.stack(xs, 0)
                        fake_sentences = torch.LongTensor(fake_sentences)
                        fake_sentences = torch.tensor(
                            fake_sentences, requires_grad=False).cuda()

                matching_scores = self.match_discriminator(images, sentences)
                unmatched_sentence_scores = self.match_discriminator(
                    images, unmatched_sentences)
                unmatched_image_scores = self.match_discriminator(
                    unmatched_images, sentences)
                fake_image_scores = self.match_discriminator(
                    fake_images, sentences)

                real_labels = torch.ones(images.size(0)).cuda()
                loss1 = margin_ranking_loss(matching_scores,
                                            unmatched_sentence_scores,
                                            real_labels)
                loss2 = margin_ranking_loss(matching_scores,
                                            unmatched_image_scores,
                                            real_labels)
                loss3 = margin_ranking_loss(fake_image_scores,
                                            unmatched_image_scores,
                                            real_labels)

                if self.arguments['use_sentence_generator']:
                    # Score the generated sentences once the reference scores
                    # and labels are available, and outside torch.no_grad() so
                    # the discriminator receives gradients for this term.
                    fake_sentence_scores = self.match_discriminator(
                        images, fake_sentences)
                    loss4 = margin_ranking_loss(fake_sentence_scores,
                                                unmatched_sentence_scores,
                                                real_labels)
                    discriminator_loss = loss1 + loss2 + loss3 + loss4
                else:
                    discriminator_loss = loss1 + loss2 + loss3

                discriminator_loss.backward()

                self.match_discriminator_optimizer.step()
                print("Epoch: %d, discriminator_loss= %f" %
                      (epoch, discriminator_loss.data))

            if (epoch + 1) == self.epochs:
                save_discriminator_checkpoint(self.match_discriminator,
                                              self.model_save_path, epoch)

            val_images, val_sentences = load_validation_set(self.arguments)
            val_sentences = torch.tensor(val_sentences,
                                         requires_grad=False).cuda()
            val_images = torch.tensor(val_images, requires_grad=False).cuda()
            i2t_r1, i2t_r5, i2t_r10, i2t_medr = i2t(self.match_discriminator,
                                                    val_images, val_sentences)
            t2i_r1, t2i_r5, t2i_r10, t2i_medr = t2i(self.match_discriminator,
                                                    val_sentences, val_images)
            print "Image to Text: %.2f, %.2f, %.2f, %.2f" \
                  % (i2t_r1, i2t_r5, i2t_r10, i2t_medr)
            print "Text to Image: %.2f, %.2f, %.2f, %.2f" \
                  % (t2i_r1, t2i_r5, t2i_r10, t2i_medr)
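All the discriminator losses above rely on nn.MarginRankingLoss, which for target y = 1 computes max(0, -y * (x1 - x2) + margin), i.e. it penalizes pairs where the first score fails to beat the second by the margin. A quick runnable check:

import torch
import torch.nn as nn

loss_fn = nn.MarginRankingLoss(margin=0.5)
matched = torch.tensor([0.9, 0.2])    # scores that should be higher
unmatched = torch.tensor([0.1, 0.4])  # scores that should be lower
target = torch.ones(2)                # y = 1: first argument should rank higher
print(loss_fn(matched, unmatched, target))  # mean of max(0, 0.5 - (x1 - x2)) = 0.35
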
Ejemplo n.º 27
0
def trainer(data='coco',  #f8k, f30k, coco
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',  # gru OR bow
            max_epochs=15,
            dispFreq=10,
            decay_c=0.,
            grad_clip=2.,
            maxlen_w=100,
            optimizer='adam',
            batch_size = 128,
            saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz',
            validFreq=100,
            lrate=0.0002,
            reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl'%saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'Loading dataset'
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train[0]+dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl'%saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building sentence encoder'
    trng, inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    trng, inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False)
    f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False)

    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip**2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads
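    # Global-norm clipping: when the norm of the concatenated gradient vector
    # exceeds grad_clip, every gradient is rescaled by grad_clip / sqrt(g2),
    # bounding the update size while preserving the gradient direction.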

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]], batch_size=batch_size, maxlen=maxlen_w)

    uidx = 0
    curr = 0.
    n_samples = 0
    
    for eidx in xrange(max_epochs):

        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x, mask, im = homogeneous_data.prepare_data(x, im, worddict, maxlen=maxlen_w, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, validFreq) == 0:

                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['f_senc'] = f_senc
                curr_model['f_ienc'] = f_ienc

                ls = encode_sentences(curr_model, dev[0])
                lim = encode_images(curr_model, dev[1])

                (r1, r5, r10, medr) = i2t(lim, ls)
                print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print 'Saving...',
                    params = unzip(tparams)
                    numpy.savez(saveto, **params)
                    pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
                    print 'Done'

        print 'Seen %d samples'%n_samples
Ejemplo n.º 28
0
def trainer(data='coco',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            max_epochs=15,
            encoder='lstm',
            dispFreq=10,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/coco',
            validFreq=100,
            early_stop=20,
            lrate=0.0002,
            reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'loading dataset'
    train, dev = load_dataset(data)

    # Create and save dictionary
    print 'Create dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    img_sen_model = ImgSenRanking(model_options)
    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin)
    loss_fn = loss_fn.cuda()

    params = filter(lambda p: p.requires_grad, img_sen_model.parameters())
    optimizer = torch.optim.Adam(params, lrate)

    uidx = 0
    curr = 0.0
    n_samples = 0

    # For Early-stopping
    best_r1, best_r5, best_r10, best_medr = 0.0, 0.0, 0.0, 0
    best_r1i, best_r5i, best_r10i, best_medri = 0.0, 0.0, 0.0, 0
    best_step = 0

    for eidx in xrange(max_epochs):

        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x, im = homogeneous_data.prepare_data(x,
                                                  im,
                                                  worddict,
                                                  maxlen=maxlen_w,
                                                  n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            x = Variable(torch.from_numpy(x).cuda())
            im = Variable(torch.from_numpy(im).cuda())
            # Update
            x, im = img_sen_model(x, im)
            cost = loss_fn(im, x)
            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm(params, grad_clip)
            optimizer.step()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, '\tUpdate ', uidx, '\tCost ', cost.data.cpu(
                ).numpy()[0]

            if numpy.mod(uidx, validFreq) == 0:

                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['img_sen_model'] = img_sen_model

                ls, lim = encode_sentences(curr_model, dev[0]), encode_images(
                    curr_model, dev[1])

                r_time = time.time()
                (r1, r5, r10, medr) = i2t(lim, ls)
                print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10,
                                                                 medr)
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i,
                                                                 r10i, medri)

                print "Cal Recall@K using %ss" % (time.time() - r_time)

                curr_step = uidx / validFreq

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore
                    best_r1, best_r5, best_r10, best_medr = r1, r5, r10, medr
                    best_r1i, best_r5i, best_r10i, best_medri = r1i, r5i, r10i, medri
                    best_step = curr_step

                    # Save model
                    print 'Saving model...',
                    pkl.dump(
                        model_options,
                        open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                    torch.save(img_sen_model.state_dict(),
                               '%s_model_%s.pkl' % (saveto, encoder))
                    print 'Done'

                if curr_step - best_step > early_stop:
                    print 'Early stopping ...'
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (
                        best_r1, best_r5, best_r10, best_medr)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (
                        best_r1i, best_r5i, best_r10i, best_medri)
                    return 0

        print 'Seen %d samples' % n_samples
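PairwiseRankingLoss is imported from the project in these trainers; a common formulation of such a bidirectional hinge loss over an image-sentence similarity matrix is sketched below, assuming L2-normalized embeddings (this is the usual VSE-style loss, not necessarily the exact one these repositories implement):

import torch
import torch.nn.functional as F

def pairwise_ranking_loss(im, s, margin=0.2):
    # im, s: (batch, dim) L2-normalized embeddings; row i of each is a matched pair.
    scores = im @ s.t()                                  # cosine similarities
    diag = scores.diag().view(-1, 1)
    cost_s = (margin + scores - diag).clamp(min=0)       # image vs. wrong captions
    cost_im = (margin + scores - diag.t()).clamp(min=0)  # caption vs. wrong images
    mask = torch.eye(scores.size(0), dtype=torch.bool)
    cost_s = cost_s.masked_fill(mask, 0)                 # ignore the matched pairs
    cost_im = cost_im.masked_fill(mask, 0)
    return cost_s.sum() + cost_im.sum()

im = F.normalize(torch.randn(4, 8), dim=1)
s = F.normalize(torch.randn(4, 8), dim=1)
print(pairwise_ranking_loss(im, s))
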
Ejemplo n.º 29
0
def validate(val_loader,
             model,
             tb_logger,
             measure='cosine',
             log_step=10,
             ndcg_scorer=None,
             alignment_mode=None):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, img_lenghts, cap_lenghts = encode_data(
        model, val_loader, log_step, logging.info)

    # initialize similarity matrix evaluator
    sim_matrix_fn = AlignmentContrastiveLoss(
        aggregation=alignment_mode,
        return_similarity_mat=True) if alignment_mode is not None else None

    if measure == 'cosine':
        sim_fn = cosine_sim
    elif measure == 'dot':
        sim_fn = dot_sim
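    # NOTE: sim_fn is selected here but not used in this excerpt; the i2t/t2i
    # calls below receive measure and sim_matrix_fn directly.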

    # caption retrieval
    (r1, r5, r10, medr, meanr, mean_rougel_ndcg,
     mean_spice_ndcg) = i2t(img_embs,
                            cap_embs,
                            img_lenghts,
                            cap_lenghts,
                            measure=measure,
                            ndcg_scorer=ndcg_scorer,
                            sim_function=sim_matrix_fn)
    logging.info(
        "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f"
        % (r1, r5, r10, medr, meanr, mean_rougel_ndcg, mean_spice_ndcg))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri, mean_rougel_ndcg_i,
     mean_spice_ndcg_i) = t2i(img_embs,
                              cap_embs,
                              img_lenghts,
                              cap_lenghts,
                              ndcg_scorer=ndcg_scorer,
                              measure=measure,
                              sim_function=sim_matrix_fn)

    logging.info(
        "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f"
        %
        (r1i, r5i, r10i, medri, meanri, mean_rougel_ndcg_i, mean_spice_ndcg_i))
    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i
    spice_ndcg_sum = mean_spice_ndcg + mean_spice_ndcg_i

    # record metrics in tensorboard
    tb_logger.add_scalar('r1', r1, model.Eiters)
    tb_logger.add_scalar('r5', r5, model.Eiters)
    tb_logger.add_scalar('r10', r10, model.Eiters)
    tb_logger.add_scalars('mean_ndcg', {
        'rougeL': mean_rougel_ndcg,
        'spice': mean_spice_ndcg
    }, model.Eiters)
    tb_logger.add_scalar('medr', medr, model.Eiters)
    tb_logger.add_scalar('meanr', meanr, model.Eiters)
    tb_logger.add_scalar('r1i', r1i, model.Eiters)
    tb_logger.add_scalar('r5i', r5i, model.Eiters)
    tb_logger.add_scalar('r10i', r10i, model.Eiters)
    tb_logger.add_scalars('mean_ndcg_i', {
        'rougeL': mean_rougel_ndcg_i,
        'spice': mean_spice_ndcg_i
    }, model.Eiters)
    tb_logger.add_scalar('medri', medri, model.Eiters)
    tb_logger.add_scalar('meanri', meanri, model.Eiters)
    tb_logger.add_scalar('rsum', currscore, model.Eiters)
    tb_logger.add_scalar('spice_ndcg_sum', spice_ndcg_sum, model.Eiters)

    return currscore, spice_ndcg_sum
Ejemplo n.º 30
0
def trainer(load_from=None,
            save_dir='snapshots',
            name='anon',
            **kwargs):
    """
    :param load_from: location to load parameters + options from
    :param name: name of model, used as location to save parameters + options
    """

    curr_model = dict()

    # load old model, including parameters, but overwrite with new options
    if load_from:
        print 'reloading...' + load_from
        with open('%s.pkl'%load_from, 'rb') as f:
            curr_model = pkl.load(f)
    else:
        curr_model['options'] = {}

    for k, v in kwargs.iteritems():
        curr_model['options'][k] = v

    model_options = curr_model['options']

    # initialize logger
    import datetime
    timestampedName = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + name

    from logger import Log
    log = Log(name=timestampedName, hyperparams=model_options, saveDir='vis/training',
              xLabel='Examples Seen', saveFrequency=1)


    print curr_model['options']

    # Load training and development sets
    print 'Loading dataset'
    dataset = load_dataset(model_options['data'], cnn=model_options['cnn'], load_train=True)
    train = dataset['train']
    dev = dataset['dev']

    # Create dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train['caps']+dev['caps'])
    print 'Dictionary size: ' + str(len(worddict))
    curr_model['worddict'] = worddict
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    pkl.dump(curr_model, open('%s/%s.pkl' % (save_dir, name), 'wb'))


    print 'Loading data'
    train_iter = datasource.Datasource(train, batch_size=model_options['batch_size'], worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if load_from is not None and os.path.exists(load_from):
        params = load_params(load_from, params)

    tparams = init_tparams(params)

    inps, cost = build_model(tparams, model_options)

    print 'Building sentence encoder'
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print 'Building errors..'
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model['f_senc'] = f_senc
    curr_model['f_ienc'] = f_ienc
    curr_model['f_err'] = f_err

    if model_options['grad_clip'] > 0.:
        grads = [maxnorm(g, model_options['grad_clip']) for g in grads]

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost)

    print 'Optimization'

    uidx = 0
    curr = 0
    n_samples = 0

    for eidx in xrange(model_options['max_epochs']):

        print 'Epoch ', eidx

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options['lrate'])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, model_options['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                log.update({'Error': float(cost)}, n_samples)


            if numpy.mod(uidx, model_options['validFreq']) == 0:

                print 'Computing results...'

                # encode sentences efficiently
                dev_s = encode_sentences(curr_model, dev_caps, batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)


                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(dev_errs, vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr)
                log.update({'R@1': r1, 'R@5': r5, 'R@10': r10, 'median_rank': medr, 'mean_rank': meanr}, n_samples)
                print "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri)
                log.update({'Image2Caption_R@1': r1i, 'Image2Caption_R@5': r5i, 'Image2CaptionR@10': r10i, 'Image2Caption_median_rank': medri, 'Image2Caption_mean_rank': meanri}, n_samples)

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot
                    # Save parameters
                    print 'Saving...',
                    numpy.savez('%s/%s'%(save_dir, name), **unzip(tparams))
                    print 'Done'
                    vis_details['hyperparams'] = model_options
                    # Save visualization details
                    with open('vis/roc/%s/%s.json' % (model_options['data'], timestampedName), 'w') as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    try:
                        index = json.load(open('vis/roc/index.json', 'r'))
                    except IOError:
                        index = {model_options['data']: []}
                    models = index[model_options['data']]
                    if timestampedName not in models:
                        models.append(timestampedName)

                    with open('vis/roc/index.json', 'w') as f:
                        json.dump(index, f)

        print 'Seen %d samples'%n_samples
Ejemplo n.º 31
0
def validate(opt, val_loader, model, measure='cosine'):
    # compute the encoding for all the validation video and captions
    video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data(
        model, val_loader, opt.log_step, logging.info)

    # we load data as video-sentence pairs
    # but we only need to forward each video once for evaluation
    # so we get the video set and mask out same videos with feature_mask
    feature_mask = []
    evaluate_videos = set()
    for video_id in video_ids:
        feature_mask.append(video_id not in evaluate_videos)
        evaluate_videos.add(video_id)
    video_embs = video_embs[feature_mask]
    video_ids = [x for idx, x in enumerate(video_ids) if feature_mask[idx]]
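    # e.g. video_ids = [3, 3, 7] gives feature_mask = [True, False, True], so
    # only the first embedding of each repeated video is kept.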

    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, measure)
    if opt.val_metric == "recall":

        # video retrieval
        (r1i, r5i, r10i, medri,
         meanri) = evaluation.t2i(c2i_all_errors, n_caption=opt.n_caption)
        print(" * Text to video:")
        print(" * r_1_5_10: {}".format(
            [round(r1i, 3), round(r5i, 3),
             round(r10i, 3)]))
        print(" * medr, meanr: {}".format([round(medri, 3), round(meanri, 3)]))
        print(" * " + '-' * 10)

        # caption retrieval
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors,
                                                    n_caption=opt.n_caption)
        print(" * Video to text:")
        print(" * r_1_5_10: {}".format(
            [round(r1, 3), round(r5, 3),
             round(r10, 3)]))
        print(" * medr, meanr: {}".format([round(medr, 3), round(meanr, 3)]))
        print(" * " + '-' * 10)

        # record metrics in tensorboard
        tb_logger.log_value('r1', r1, step=model.Eiters)
        tb_logger.log_value('r5', r5, step=model.Eiters)
        tb_logger.log_value('r10', r10, step=model.Eiters)
        tb_logger.log_value('medr', medr, step=model.Eiters)
        tb_logger.log_value('meanr', meanr, step=model.Eiters)
        tb_logger.log_value('r1i', r1i, step=model.Eiters)
        tb_logger.log_value('r5i', r5i, step=model.Eiters)
        tb_logger.log_value('r10i', r10i, step=model.Eiters)
        tb_logger.log_value('medri', medri, step=model.Eiters)
        tb_logger.log_value('meanri', meanri, step=model.Eiters)

    elif opt.val_metric == "map":
        i2t_map_score = evaluation.i2t_map(c2i_all_errors,
                                           n_caption=opt.n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors,
                                           n_caption=opt.n_caption)
        tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters)
        tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters)
        print('i2t_map', i2t_map_score)
        print('t2i_map', t2i_map_score)

    currscore = 0
    if opt.val_metric == "recall":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += (r1 + r5 + r10)
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += (r1i + r5i + r10i)
    elif opt.val_metric == "map":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += i2t_map_score
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += t2i_map_score

    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
Ejemplo n.º 32
0
def trainer(data='coco',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',
            max_epochs=15,
            dispFreq=10,
            decay_c=0.0,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/coco',
            validFreq=100,
            lrate=0.0002,
            concat=True,
            reload_=False):

    hyper_params = {
        'data': data,
        'encoder': encoder,
        'batch_size': batch_size,
        'time': cur_time,
        'lrate': lrate,
        'concat': concat,
    }

    i2t_r1 = dict([('i2t_recall', 'r1')] + hyper_params.items())
    i2t_r5 = dict([('i2t_recall', 'r5')] + hyper_params.items())
    i2t_r10 = dict([('i2t_recall', 'r10')] + hyper_params.items())
    t2i_r1 = dict([('t2i_recall', 'r1')] + hyper_params.items())
    t2i_r5 = dict([('t2i_recall', 'r5')] + hyper_params.items())
    t2i_r10 = dict([('t2i_recall', 'r10')] + hyper_params.items())

    i2t_med = dict([('i2t_med', 'i2t_med')] + hyper_params.items())
    t2i_med = dict([('t2i_med', 't2i_med')] + hyper_params.items())

    agent = Agent(port=5020)
    i2t_r1_agent = agent.register(i2t_r1, 'recall', overwrite=True)
    i2t_r5_agent = agent.register(i2t_r5, 'recall', overwrite=True)
    i2t_r10_agent = agent.register(i2t_r10, 'recall', overwrite=True)
    t2i_r1_agent = agent.register(t2i_r1, 'recall', overwrite=True)
    t2i_r5_agent = agent.register(t2i_r5, 'recall', overwrite=True)
    t2i_r10_agent = agent.register(t2i_r10, 'recall', overwrite=True)

    i2t_med_agent = agent.register(i2t_med, 'median', overwrite=True)
    t2i_med_agent = agent.register(t2i_med, 'median', overwrite=True)

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_
    model_options['concat'] = concat

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'loading dataset'
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print 'Create dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    img_sen_model = ImgSenRanking(model_options)
    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin)
    loss_fn = loss_fn.cuda()

    params = filter(lambda p: p.requires_grad, img_sen_model.parameters())
    optimizer = torch.optim.Adam(params, lrate)

    uidx = 0
    curr = 0.0
    n_samples = 0

    for eidx in xrange(max_epochs):

        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x_id, im = homogeneous_data.prepare_data(x,
                                                     im,
                                                     worddict,
                                                     maxlen=maxlen_w,
                                                     n_words=n_words)

            if x_id is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            x_id = Variable(torch.from_numpy(x_id).cuda())
            im = Variable(torch.from_numpy(im).cuda())
            # Update
            ud_start = time.time()
            x, im = img_sen_model(x_id, im, x)
            cost = loss_fn(im, x)
            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm(params, grad_clip)
            optimizer.step()
            ud = time.time() - ud_start

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost.data.cpu(
                ).numpy()[0], 'UD ', ud

            if numpy.mod(uidx, validFreq) == 0:

                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['img_sen_model'] = img_sen_model

                ls, lim = encode_sentences(curr_model, dev[0]), encode_images(
                    curr_model, dev[1])

                r1, r5, r10, medr = 0.0, 0.0, 0.0, 0
                r1i, r5i, r10i, medri = 0.0, 0.0, 0.0, 0
                r_time = time.time()
                if data in ('arch', 'arch_small'):
                    (r1, r5, r10, medr) = i2t_arch(lim, ls)
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5,
                                                                     r10, medr)
                    (r1i, r5i, r10i, medri) = t2i_arch(lim, ls)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (
                        r1i, r5i, r10i, medri)
                else:
                    (r1, r5, r10, medr) = i2t(lim, ls)
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5,
                                                                     r10, medr)
                    (r1i, r5i, r10i, medri) = t2i(lim, ls)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (
                        r1i, r5i, r10i, medri)

                print "Cal Recall@K using %ss" % (time.time() - r_time)

                record_num = uidx // validFreq  # integer record index
                agent.append(i2t_r1_agent, record_num, r1)
                agent.append(i2t_r5_agent, record_num, r5)
                agent.append(i2t_r10_agent, record_num, r10)
                agent.append(t2i_r1_agent, record_num, r1i)
                agent.append(t2i_r5_agent, record_num, r5i)
                agent.append(t2i_r10_agent, record_num, r10i)

                agent.append(i2t_med_agent, record_num, medr)
                agent.append(t2i_med_agent, record_num, medri)

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print 'Saving model...',
                    pkl.dump(
                        model_options,
                        open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                    torch.save(img_sen_model.state_dict(),
                               '%s_model_%s.pkl' % (saveto, encoder))
                    print 'Done'

        print 'Seen %d samples' % n_samples
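
# Reload sketch (not from the original source; it simply mirrors the save
# calls above, assuming the same saveto/encoder values):
#
#   model_options = pkl.load(open('%s_params_%s.pkl' % (saveto, encoder), 'rb'))
#   model = ImgSenRanking(model_options).cuda()
#   model.load_state_dict(torch.load('%s_model_%s.pkl' % (saveto, encoder)))
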
def trainer(**kwargs):
    """
    Train the model according to the input parameters.
    Information about the input parameters is available in parameters.py.
    """
    # Timing
    print('Starting time:', datetime.now())
    sys.stdout.flush()
    t_start_train = time.time()

    # Model options
    # load old model, including parameters, but overwrite with new options

    # Extract model options from arguments
    model_options = {}
    for k, v in kwargs.items():
        model_options[k] = v

    # Print input options
    print('PARAMETERS BEFORE LOADING:')
    for k, v in model_options.items():
        print('{:>26}: {}'.format(k, v))
    sys.stdout.flush()

    # Reload options if required
    curr_model = dict()
    if model_options['reload_']:
        # Reload model parameters
        opt_filename_reload = get_opt_filename(model_options, previous=True)
        print('reloading...', opt_filename_reload)
        sys.stdout.flush()
        try:
            with open(opt_filename_reload, 'rb') as f:
                curr_model = pkl.load(f)
        except Exception:
            print('Failed to reload parameters; using only the supplied '
                  'parameters instead')
            curr_model['options'] = {}

        # Check if we reload from best model or last model
        if model_options['load_from'] in ['Best', 'best', 'B', 'b']:
            load_from_best = True
            print('Loading from Best saved model in validation results')
        elif model_options['load_from'] in ['Last', 'last', 'L', 'l']:
            load_from_best = False
            print('Loading from Last saved model')
        else:
            print('Unknown choice for "load_from" parameter:',
                  model_options['load_from'])
            print('Please choose one of:', ['Best', 'best', 'B', 'b'],
                  ['Last', 'last', 'L', 'l'])
            print('Using Last as default')
            load_from_best = False

        # Reload end-point parameters
        state_filename = get_sol_filename(model_options,
                                          best=load_from_best,
                                          previous=True)
        print('reloading...', state_filename)
        sys.stdout.flush()
        try:
            with open(state_filename, 'rb') as f:
                state_params = pkl.load(f)
            if load_from_best:
                init_epoch = state_params['epoch']
                solution = state_params
            else:
                init_epoch = state_params['epoch_done'] + 1
                solution = state_params['solution']
            best_val_score = solution['best_val_score']
            n_samples = solution['samples_seen']
        except Exception:
            print('Failed to reload state parameters, starting from 0')
            init_epoch = 0
            best_val_score = 0
            n_samples = 0

    else:
        curr_model['options'] = {}
        init_epoch = 0
        best_val_score = 0
        n_samples = 0

    # Overwrite loaded options with input options
    for k, v in kwargs.items():
        curr_model['options'][k] = v
    model_options = curr_model['options']

    # Print final options loaded
    if model_options['reload_']:
        print('PARAMETERS AFTER LOADING:')
        for k, v in model_options.items():
            print('{:>26}: {}'.format(k, v))
        sys.stdout.flush()

    # Load training and development sets
    print('Loading dataset')
    sys.stdout.flush()

    dataset = load_dataset(dataset_name=model_options['data'],
                           embedding=model_options['embedding'],
                           path_to_data=model_options['data_path'],
                           test_subset=model_options['test_subset'],
                           load_train=True,
                           fold=0)
    train = dataset['train']
    dev = dataset['val']

    # Create word dictionary
    print('Creating dictionary')
    sys.stdout.flush()
    worddict = build_dictionary(train['caps'] + dev['caps'])
    print('Dictionary size: ' + str(len(worddict)))
    sys.stdout.flush()
    curr_model['worddict'] = worddict
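    # the +2 below presumably reserves two extra ids for special tokens
    # (e.g. <eos> and UNK, as in the previous example)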
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    opt_filename_save = get_opt_filename(model_options, previous=False)
    print('Saving model parameters in', opt_filename_save)
    sys.stdout.flush()
    try:
        os.makedirs(os.path.dirname(opt_filename_save))
    except OSError:
        # the directory may already exist
        pass
    with open(opt_filename_save, 'wb') as f:
        pkl.dump(curr_model, f)

    # Load data from dataset
    print('Loading data')
    sys.stdout.flush()
    train_iter = datasource.Datasource(train,
                                       batch_size=model_options['batch_size'],
                                       worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print('Building model')
    sys.stdout.flush()
    params = init_params(model_options)

    # Reload network parameters, i.e. the weights
    if model_options['reload_']:
        params_filename = get_npz_filename(model_options,
                                           best=load_from_best,
                                           previous=True)
        params = load_params(params_filename, params)

    tparams = init_tparams(params)
    inps, cost = build_model(tparams, model_options)

    print('Building sentence encoder')
    sys.stdout.flush()
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print('Building image encoder')
    sys.stdout.flush()
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print('Building f_grad...')
    sys.stdout.flush()
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print('Building errors...')
    sys.stdout.flush()
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model['f_senc'] = f_senc
    curr_model['f_ienc'] = f_ienc
    curr_model['f_err'] = f_err

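    # maxnorm presumably rescales each gradient so that its norm does not
    # exceed grad_clip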
    if model_options['grad_clip'] > 0.:
        grads = [maxnorm(g, model_options['grad_clip']) for g in grads]

    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    sys.stdout.flush()
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams,
                                                               grads, inps,
                                                               cost)
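    # eval() resolves the optimizer name (e.g. 'adam') to a builder function
    # defined elsewhere in the project; a lookup in an explicit dict of
    # optimizer builders would be a safer alternative to eval()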

    # Get names for the files to save model and solution
    sol_filename_best = get_sol_filename(model_options,
                                         best=True,
                                         previous=False)
    sol_filename_last = get_sol_filename(model_options,
                                         best=False,
                                         previous=False)
    params_filename_best = get_npz_filename(model_options,
                                            best=True,
                                            previous=False)
    params_filename_last = get_npz_filename(model_options,
                                            best=False,
                                            previous=False)

    print('PATHS TO MODELS:')
    for filename in [
            sol_filename_best, sol_filename_last, params_filename_best,
            params_filename_last
    ]:
        print(filename)
        sys.stdout.flush()
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError:
            # the directory may already exist
            pass

    # Start optimization
    print('Optimization')
    sys.stdout.flush()

    uidx = 0

    # Timing
    t_start = time.time()
    print('Starting time:', datetime.now())

    for eidx in range(init_epoch, model_options['max_epochs']):
        t_start_epoch = time.time()
        print('Epoch ', eidx)
        sys.stdout.flush()

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options['lrate'])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                sys.stdout.flush()
                return 1., 1., 1.

            if numpy.mod(uidx, model_options['dispFreq']) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ',
                      ud)
                sys.stdout.flush()

            if numpy.mod(uidx, model_options['validFreq']) == 0:
                print('Computing results...')
                sys.stdout.flush()

                # encode sentences efficiently
                dev_s = encode_sentences(
                    curr_model,
                    dev_caps,
                    batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr) = i2t(dev_errs)
                (r1i, r5i, r10i, medri, meanri) = t2i(dev_errs)
                print("Text to image (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" %
                      (r1i, r5i, r10i, medri, meanri))
                sys.stdout.flush()
                print("Image to text (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" %
                      (r1, r5, r10, medr, meanr))
                sys.stdout.flush()

                # Score
                val_score = r1 + r5 + r10 + r1i + r5i + r10i
                if val_score > best_val_score:

                    print('BEST MODEL FOUND')
                    print('Score:', val_score)
                    print('Previous best score:', best_val_score)
                    best_val_score = val_score
                    # Join in a results dict
                    results_dict = build_results_dict(r1, r5, r10, medr, r1i,
                                                      r5i, r10i, medri)

                    # Save parameters
                    print('Saving...', end=' ')
                    sys.stdout.flush()
                    numpy.savez(params_filename_best, **unzip(tparams))
                    print('Done')
                    sys.stdout.flush()

                    # Update solution
                    solution = OrderedDict([
                        ('epoch', eidx), ('update', uidx),
                        ('samples_seen', n_samples),
                        ('best_val_score', best_val_score),
                        ('best_val_res', results_dict),
                        ('time_until_results',
                         str(timedelta(seconds=(time.time() - t_start_train))))
                    ])
                    with open(sol_filename_best, 'wb') as f:
                        pkl.dump(solution, f)

        print('Seen %d samples' % n_samples)
        sys.stdout.flush()

        # Timing
        t_epoch = time.time() - t_start_epoch
        t_epoch_avg = (time.time() - t_start) / (eidx + 1 - (init_epoch))
        print('Time for this epoch:', str(timedelta(seconds=t_epoch)),
              'Average:', str(timedelta(seconds=t_epoch_avg)))
        t_2_complete = t_epoch_avg * (model_options['max_epochs'] - (eidx + 1))
        print('Time since start session:',
              str(timedelta(seconds=time.time() - t_start)),
              'Estimated time to complete training:',
              str(timedelta(seconds=t_2_complete)))
        print('Current time:', datetime.now())
        sys.stdout.flush()

        # Save the current model state; 'solution' is undefined until the
        # first validation improvement (or a successful reload), hence the
        # NameError fallback below
        try:
            state_params = OrderedDict([('epoch_done', eidx),
                                        ('solution', solution)])
        except NameError:
            solution = OrderedDict([
                ('epoch', eidx), ('update', uidx), ('samples_seen', n_samples),
                ('best_val_score', best_val_score),
                ('time_until_results',
                 str(timedelta(seconds=(time.time() - t_start_train))))
            ])
            state_params = OrderedDict([('epoch_done', eidx),
                                        ('solution', solution)])
        with open(sol_filename_last, 'wb') as f:
            pkl.dump(state_params, f)

        # Save parameters
        print('Saving LAST npz...', end=' ')
        sys.stdout.flush()
        numpy.savez(params_filename_last, **unzip(tparams))
        print('Done')
        sys.stdout.flush()

    return solution
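
# Usage sketch (hypothetical values; the accepted keys are documented in
# parameters.py per the docstring, and every name below appears in the
# function body above):
#
#   solution = trainer(data='coco', embedding='vse', data_path='./data',
#                      test_subset='val', reload_=False, load_from='Best',
#                      batch_size=128, max_epochs=30, lrate=0.0002,
#                      grad_clip=2.0, optimizer='adam', dispFreq=10,
#                      validFreq=500)
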
Example #34
def main():
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    testCollection = opt.testCollection
    n_caption = opt.n_caption
    resume = os.path.join(opt.logger_name, opt.checkpoint_name)

    if not os.path.exists(resume):
        logging.info(resume + ' does not exist.')
        sys.exit(0)

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']
    if not hasattr(options, 'concate'):
        setattr(options, "concate", "full")

    trainCollection = options.trainCollection
    output_dir = resume.replace(trainCollection, testCollection)
    output_dir = output_dir.replace('/%s/' % options.cv_name,
                                    '/results/%s/' % trainCollection)
    result_pred_sents = os.path.join(output_dir, 'id.sent.score.txt')
    pred_error_matrix_file = os.path.join(output_dir,
                                          'pred_errors_matrix.pth.tar')
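    # checkToSkip presumably returns True when the output file already exists
    # and overwriting is disabled, in which case the run is skipped;
    # makedirsforfile creates the parent directories for the output file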
    if checkToSkip(pred_error_matrix_file, opt.overwrite):
        sys.exit(0)
    makedirsforfile(pred_error_matrix_file)

    # data loader prepare
    caption_files = {
        'test':
        os.path.join(rootpath, testCollection, 'TextData',
                     '%s.caption.txt' % testCollection)
    }
    img_feat_path = os.path.join(rootpath, testCollection, 'FeatureData',
                                 options.visual_feature)
    visual_feats = {'test': BigFile(img_feat_path)}
    assert options.visual_feat_dim == visual_feats['test'].ndims
    video2frames = {
        'test':
        read_dict(
            os.path.join(rootpath, testCollection, 'FeatureData',
                         options.visual_feature, 'video2frames.txt'))
    }

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'bow',
                                  options.vocab + '.pkl')
    bow_vocab = pickle.load(open(bow_vocab_file, 'rb'))
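    # get_text_encoder('bow') presumably returns a bag-of-words encoder
    # class/factory that is then instantiated with the loaded vocabulary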
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'rnn',
                                  options.vocab + '.pkl')
    rnn_vocab = pickle.load(open(rnn_vocab_file, 'rb'))
    options.vocab_size = len(rnn_vocab)

    # Construct the model
    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.Eiters = checkpoint['Eiters']
    model.val_start()

    if testCollection.startswith('msvd'):  # or testCollection.startswith('msrvtt')
        # set data loader
        video_ids_list = data.read_video_ids(caption_files['test'])
        vid_data_loader = data.get_vis_data_loader(visual_feats['test'],
                                                   opt.batch_size,
                                                   opt.workers,
                                                   video2frames['test'],
                                                   video_ids=video_ids_list)
        text_data_loader = data.get_txt_data_loader(caption_files['test'],
                                                    rnn_vocab, bow2vec,
                                                    opt.batch_size,
                                                    opt.workers)
        # mapping
        video_embs, video_ids = evaluation.encode_text_or_vid(
            model.embed_vis, vid_data_loader)
        cap_embs, caption_ids = evaluation.encode_text_or_vid(
            model.embed_txt, text_data_loader)
    else:
        # set data loader
        data_loader = data.get_test_data_loaders(caption_files,
                                                 visual_feats,
                                                 rnn_vocab,
                                                 bow2vec,
                                                 opt.batch_size,
                                                 opt.workers,
                                                 opt.n_caption,
                                                 video2frames=video2frames)
        # mapping
        video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data(
            model, data_loader['test'], opt.log_step, logging.info)
        # remove duplicate videos: each video is paired with n_caption
        # captions, so its embedding repeats every n_caption rows
        idx = range(0, video_embs.shape[0], n_caption)
        video_embs = video_embs[idx, :]
        video_ids = video_ids[::n_caption]

    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs,
                                          options.measure)
    torch.save(
        {
            'errors': c2i_all_errors,
            'videos': video_ids,
            'captions': caption_ids
        }, pred_error_matrix_file)
    print("write into: %s" % pred_error_matrix_file)

    if testCollection.startswith('msvd'):  # or testCollection.startswith('msrvtt')
        # caption retrieval
        (r1, r5, r10, medr, meanr,
         i2t_map_score) = evaluation.i2t_varied(c2i_all_errors, caption_ids,
                                                video_ids)
        # video retrieval
        (r1i, r5i, r10i, medri, meanri,
         t2i_map_score) = evaluation.t2i_varied(c2i_all_errors, caption_ids,
                                                video_ids)
    else:
        # caption retrieval
        (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors,
                                                         n_caption=n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption)

        # video retrieval
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors,
                                                    n_caption=n_caption)
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption)

    print(" * Text to Video:")
    print(" * r_1_5_10, medr, meanr: {}".format([
        round(r1i, 1),
        round(r5i, 1),
        round(r10i, 1),
        round(medri, 1),
        round(meanri, 1)
    ]))
    print(" * recall sum: {}".format(round(r1i + r5i + r10i, 1)))
    print(" * mAP: {}".format(round(t2i_map_score, 3)))
    print(" * " + '-' * 10)

    # caption retrieval
    print(" * Video to text:")
    print(" * r_1_5_10, medr, meanr: {}".format([
        round(r1, 1),
        round(r5, 1),
        round(r10, 1),
        round(medr, 1),
        round(meanr, 1)
    ]))
    print(" * recall sum: {}".format(round(r1 + r5 + r10, 1)))
    print(" * mAP: {}".format(round(i2t_map_score, 3)))
    print(" * " + '-' * 10)