def extract_feature(params):
    """Extract ResNet-152 ``pool5`` features for every annotation box.

    Each annotation box is cropped from its image, resized so that its longer
    side is 224 pixels, placed in the middle of a zero-filled 3x224x224
    array and passed through ``L.ResNet152Layers``.  The collected ``pool5``
    outputs are written with ``np.save`` to
    ``<save_dir>/prepro/<dataset>_<splitBy>/<ann_feats>``.

    Args:
        params: Option dict.  Keys read here: ``dataset``,
            ``coco_image_root`` or ``gta_image_root``, ``save_dir``,
            ``splitBy``, ``old``, ``data_json``, ``data_h5``, ``ann_feats``,
            ``batch_size`` and ``gpu_id``.  When ``params['old']`` is truthy,
            ``data_json``, ``data_h5`` and ``ann_feats`` are prefixed with
            ``'old'`` in place before the loader is built.

    Raises:
        ValueError: If ``params['dataset']`` is not one of the supported
            datasets (previously this surfaced later as an unbound
            ``image_root``).
    """
    dataset = params['dataset']
    if dataset in ['refcoco', 'refcoco+', 'refcocog']:
        image_root = params['coco_image_root']
    elif dataset == 'refgta':
        image_root = params['gta_image_root']
    else:
        raise ValueError('unsupported dataset: {!r}'.format(dataset))
    target_save_dir = osp.join(params['save_dir'], 'prepro',
                               dataset + '_' + params['splitBy'])

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['ann_feats'] = 'old' + params['ann_feats']

    loader = DataLoader(params)

    # model setting
    batch_size = params['batch_size']
    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    res = L.ResNet152Layers()
    res.to_gpu(gpu_id)
    chainer.config.train = False
    chainer.config.enable_backprop = False

    anns = loader.anns
    images = loader.Images
    perm = np.arange(len(anns))
    # Per-channel offsets subtracted after the RGB -> BGR flip below
    # (presumably the usual ImageNet BGR means -- confirm).
    bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)

    ann_feats = []
    for bs in tqdm(range(0, len(anns), batch_size)):
        batch = []
        for ix in perm[bs:bs + batch_size]:
            ann = anns[ix]
            h5_id = ann['h5_id']
            assert h5_id == ix, 'h5_id not match'
            img = images[ann['image_id']]
            x1, y1, w, h = ann['box']

            # Close the file handle as soon as the pixels are decoded.
            with Image.open(os.path.join(image_root,
                                         img['file_name'])) as src:
                image = src.convert('RGB')

            # Longer side becomes 224; the shorter side keeps the aspect
            # ratio but is clamped to >= 1 so that a very thin box does not
            # ask PIL for a zero-sized image.
            if h <= w:
                nh, nw = max(1, int(224.0 / w * h)), 224
            else:
                nh, nw = 224, max(1, int(224.0 / h * w))

            # Image.LANCZOS is the same filter formerly exposed as
            # Image.ANTIALIAS (removed in Pillow 10).
            image = image.crop((x1, y1, x1 + w, y1 + h)).resize(
                (nw, nh), Image.LANCZOS)
            image = np.array(image).astype(np.float32)[:, :, ::-1]
            image -= bgr_mean
            image = image.transpose((2, 0, 1))

            # Centre the resized crop on a zero canvas along its short axis.
            pad_image = np.zeros((3, 224, 224), dtype=np.float32)
            if nh <= nw:
                top = (224 - nh) // 2
                pad_image[:, top:top + nh, :] = image
            else:
                left = (224 - nw) // 2
                pad_image[:, :, left:left + nw] = image
            batch.append(pad_image)

        batch = Variable(xp.array(batch, dtype=xp.float32))
        feature = res(batch, layers=['pool5'])
        feature = cuda.to_cpu(feature['pool5'].data)
        ann_feats.extend(feature)

    np.save(os.path.join(target_save_dir, params['ann_feats']), ann_feats)
def extract_feature(params):
    """Extract spatial ResNet-152 ``res5`` features for every annotation box.

    Each annotation box is cropped from its image, resized by
    ``keep_asR_resize`` and passed through ``L.ResNet152Layers``.  The
    ``res5`` maps are moved to channel-last layout, reshaped to
    ``(-1, 36, 2048)`` and saved to ``params['sp_ann_feats']``; the resize
    shapes returned by ``keep_asR_resize`` are saved to
    ``params['ann_shapes']``.  Both files go under
    ``<save_dir>/prepro/<dataset>_<splitBy>/``.

    Args:
        params: Option dict.  Keys read here: ``dataset``,
            ``coco_image_root`` or ``gta_image_root``, ``save_dir``,
            ``splitBy``, ``old``, ``data_json``, ``data_h5``, ``ann_feats``,
            ``sp_ann_feats``, ``ann_shapes``, ``batch_size`` and ``gpu_id``.
            When ``params['old']`` is truthy these file-name entries are
            prefixed with ``'old'`` in place.

    Raises:
        ValueError: If ``params['dataset']`` is not one of the supported
            datasets.
    """
    dataset = params['dataset']
    if dataset in ['refcoco', 'refcoco+', 'refcocog']:
        image_root = params['coco_image_root']
    elif dataset == 'refgta':
        image_root = params['gta_image_root']
    else:
        raise ValueError('unsupported dataset: {!r}'.format(dataset))
    target_save_dir = osp.join(params['save_dir'], 'prepro',
                               dataset + '_' + params['splitBy'])

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['ann_feats'] = 'old' + params['ann_feats']
        # The files written below are sp_ann_feats / ann_shapes, and the
        # training code that loads them applies the same 'old' prefix, so
        # they must be prefixed here as well.
        params['sp_ann_feats'] = 'old' + params['sp_ann_feats']
        params['ann_shapes'] = 'old' + params['ann_shapes']

    loader = DataLoader(params)

    # model setting
    batch_size = params['batch_size']
    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    res = L.ResNet152Layers()
    res.to_gpu(gpu_id)
    chainer.config.train = False
    chainer.config.enable_backprop = False

    anns = loader.anns
    images = loader.Images
    perm = np.arange(len(anns))
    # Per-channel offsets subtracted after the RGB -> BGR flip below
    # (presumably the usual ImageNet BGR means -- confirm).
    bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)

    ann_feats = []
    shapes = []
    for bs in tqdm(range(0, len(anns), batch_size)):
        batch = []
        for ix in perm[bs:bs + batch_size]:
            ann = anns[ix]
            h5_id = ann['h5_id']
            assert h5_id == ix, 'h5_id not match'
            img = images[ann['image_id']]
            x1, y1, w, h = ann['box']

            # Close the file handle as soon as the pixels are decoded.
            with Image.open(os.path.join(image_root,
                                         img['file_name'])) as src:
                image = src.convert('RGB').crop((x1, y1, x1 + w, y1 + h))

            image, resize_shape = keep_asR_resize(image)
            shapes.append(resize_shape)
            image = np.array(image).astype(np.float32)[:, :, ::-1]
            image -= bgr_mean
            image = image.transpose((2, 0, 1))
            batch.append(image)

        batch = Variable(xp.array(batch, dtype=xp.float32))
        feature = res(batch, layers=['res5'])
        feature = cuda.to_cpu(feature['res5'].data)
        # (N, C, H, W) -> (N, H, W, C), then flatten the spatial grid; the
        # hard-coded 36 assumes keep_asR_resize yields a 6x6 res5 map
        # -- TODO confirm.
        ann_feats.extend(
            np.transpose(feature, (0, 2, 3, 1)).reshape(-1, 36, 2048))

    np.save(os.path.join(target_save_dir, params['sp_ann_feats']), ann_feats)
    np.save(os.path.join(target_save_dir, params['ann_shapes']), shapes)
def extract_feature(params):
    """Extract spatial ResNet-152 ``res5`` features for every whole image.

    Images are resized to 224x224 for the refcoco family and to 480x288 for
    refgta, passed through ``L.ResNet152Layers``, and the ``res5`` maps are
    stored channel-last with the spatial grid flattened (49 positions for
    the refcoco family, 135 for refgta).  The result is saved with
    ``np.save`` to ``<save_dir>/prepro/<dataset>_<splitBy>/<image_feats>``.

    Args:
        params: Option dict.  Keys read here: ``dataset``,
            ``coco_image_root`` or ``gta_image_root``, ``save_dir``,
            ``splitBy``, ``old``, ``data_json``, ``data_h5``,
            ``image_feats``, ``batch_size`` and ``gpu_id``.  When
            ``params['old']`` is truthy, ``data_json``, ``data_h5`` and
            ``image_feats`` are prefixed with ``'old'`` in place.

    Raises:
        ValueError: If ``params['dataset']`` is not one of the supported
            datasets.
    """
    dataset = params['dataset']
    is_coco = dataset in ['refcoco', 'refcoco+', 'refcocog']
    if is_coco:
        image_root = params['coco_image_root']
        resize_to = (224, 224)
        n_positions = 49
    elif dataset == 'refgta':
        image_root = params['gta_image_root']
        resize_to = (480, 288)
        n_positions = 135
    else:
        raise ValueError('unsupported dataset: {!r}'.format(dataset))
    target_save_dir = osp.join(params['save_dir'], 'prepro',
                               dataset + '_' + params['splitBy'])

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']

    loader = DataLoader(params)

    # model setting
    batch_size = params['batch_size']
    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    res = L.ResNet152Layers()
    res.to_gpu(gpu_id)
    chainer.config.train = False
    chainer.config.enable_backprop = False

    images = loader.images
    perm = np.arange(len(images))
    # Per-channel offsets subtracted after the RGB -> BGR flip below
    # (presumably the usual ImageNet BGR means -- confirm).
    bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)

    image_feats = []
    for bs in tqdm(range(0, len(images), batch_size)):
        batch = []
        for ix in perm[bs:bs + batch_size]:
            # Close the file handle as soon as the pixels are decoded.
            with Image.open(os.path.join(image_root,
                                         images[ix]['file_name'])) as src:
                image = src.convert('RGB')
            # Image.LANCZOS is the same filter formerly exposed as
            # Image.ANTIALIAS (removed in Pillow 10).
            image = image.resize(resize_to, Image.LANCZOS)
            image = np.array(image).astype(np.float32)[:, :, ::-1]
            image -= bgr_mean
            image = image.transpose((2, 0, 1))
            batch.append(image)

        batch = Variable(xp.array(batch, dtype=xp.float32))
        feature = res(batch, layers=['res5'])
        feature = cuda.to_cpu(feature['res5'].data)
        # (N, C, H, W) -> (N, H, W, C), then flatten the spatial grid.
        image_feats.extend(
            np.transpose(feature, (0, 2, 3, 1)).reshape(-1, n_positions, 2048))

    np.save(os.path.join(target_save_dir, params['image_feats']), image_feats)
def test_net(args):
    """Run the restored network over the test images and dump probabilities.

    Batches are pulled from ``DataLoader`` until it reports a wrap-around;
    the rows of the final batch that overshoot ``loader.train_size`` are
    discarded.  One CSV row per image (name followed by its probabilities)
    is written to ``args.output_file``.

    Args:
        args: Namespace-like object providing ``batch_size``,
            ``input_folder``, ``model`` and ``output_file``.
    """
    import sys

    tf.set_random_seed(42)
    np.random.seed(42)

    loader = DataLoader(args.batch_size, args.input_folder, info_path=None,
                        val_split=0, test=True)
    vgg16 = VGG16(trainable=False, layer='fc6')
    net = fishTest(vgg16, lr=None)
    # Not referenced below; restoring goes through optimistic_restore.
    # NOTE(review): kept because constructing it touches the graph --
    # confirm whether it can be dropped.
    saver = tf.train.Saver(max_to_keep=1)

    test_names = []
    test_probs = []
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        optimistic_restore(sess, args.model)
        iter_ix = 0
        while True:
            print('{}/{}'.format(iter_ix, loader.train_size))
            val_img, _, wrap, img_names = loader.next_batch()
            probs = sess.run(net.probs, feed_dict={net.input: val_img})
            batch_probs = probs.tolist()
            iter_ix += args.batch_size
            if wrap:
                # Trim only the rows that run past the end of the data set.
                # The former slice `[:-(iter_ix - train_size)]` became `[:0]`
                # and dropped the entire last batch whenever the data size
                # was an exact multiple of the batch size.
                overflow = iter_ix - loader.train_size
                if overflow > 0:
                    img_names = img_names[:-overflow]
                    batch_probs = batch_probs[:-overflow]
                test_names += img_names
                test_probs += batch_probs
                break
            test_names += img_names
            test_probs += batch_probs

    print(len(test_names))
    print(len(test_probs))

    # csv.writer needs a text-mode file opened with newline='' on Python 3
    # and a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(args.output_file, 'w', newline='')
    else:
        csvfile = open(args.output_file, 'wb')
    with csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for img_name, probs in zip(test_names, test_probs):
            writer.writerow([img_name] + probs)
def _save_line_plot(title, series, out_path):
    """Plot each ``(label, values)`` pair against its index and save it."""
    plt.title(title)
    for label, values in series:
        plt.plot(np.arange(len(values)), values, label=label)
    plt.legend()
    plt.savefig(out_path)
    plt.close()


def train_all(params):
    """Jointly train the visual encoder and language model with a listener reward.

    Every iteration draws a training batch, builds (positive, visual
    negative, language negative) triplets, and optimises the sum of the
    language-model loss and the reward-based loss from a ``ListenerReward``
    whose weights are loaded from ``<model_dir>/<id>.h5`` (no optimizer is
    attached to it here).  For ``refgta`` with ``params['ranking']`` set, a
    rank loss is added after iteration 8000.  Every
    ``save_checkpoint_every`` iterations the validation split is scored,
    the best ``ve``/``lm`` weights are saved and the history curves are
    written under ``log_graph/<dataset>_<splitBy>``.

    The training loop has no exit condition; it runs until interrupted.

    Args:
        params: Option dict (dataset/paths, feature file names, margins,
            logging and checkpoint intervals, learning-rate decay options).
            When ``params['old']`` is truthy the file-name entries and
            ``id`` are prefixed with ``'old'`` in place.

    Raises:
        ValueError: If ``params['dataset']`` is not one of the supported
            datasets (previously this surfaced later as an unbound
            ``global_shapes``).
    """
    dataset = params['dataset']
    if dataset in ['refcoco', 'refcoco+', 'refcocog']:
        global_shapes = (224, 224)
    elif dataset == 'refgta':
        global_shapes = (480, 288)
    else:
        raise ValueError('unsupported dataset: {!r}'.format(dataset))

    split_tag = dataset + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', split_tag)
    graph_dir = osp.join('log_graph', split_tag)
    model_dir = osp.join(params['save_dir'], 'model', split_tag)

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']
        params['sp_ann_feats'] = 'old' + params['sp_ann_feats']
        params['ann_feats'] = 'old' + params['ann_feats']
        params['ann_shapes'] = 'old' + params['ann_shapes']
        params['id'] = 'old' + params['id']
        params['word_emb_path'] = 'old' + params['word_emb_path']

    # The curves are saved into graph_dir at every checkpoint; make sure it
    # exists now rather than failing at the first checkpoint.
    if not osp.isdir(graph_dir):
        os.makedirs(graph_dir)

    loader = DataLoader(params)

    # model setting
    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    featsOpt = {
        'sp_ann': osp.join(target_save_dir, params['sp_ann_feats']),
        'ann_input': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats']),
        'shapes': osp.join(target_save_dir, params['ann_shapes'])
    }
    loader.loadFeats(featsOpt, mmap_mode=False)
    loader.shuffle('train')

    ve = VisualEncoder(res6=L.ResNet152Layers().fc6,
                       global_shapes=global_shapes).to_gpu(gpu_id)
    rl_crit = ListenerReward(len(loader.ix_to_word),
                             global_shapes=global_shapes).to_gpu(gpu_id)
    lm = LanguageModel(len(loader.ix_to_word), loader.seq_length,
                       global_shapes,
                       res6=L.ResNet152Layers().fc6).to_gpu(gpu_id)
    serializers.load_hdf5(osp.join(model_dir, params['id'] + ".h5"), rl_crit)

    ve_optim = optimizers.Adam(alpha=4e-5, beta1=0.8)
    lm_optim = optimizers.Adam(alpha=4e-4, beta1=0.8)
    ve_optim.setup(ve)
    lm_optim.setup(lm)
    ve_optim.add_hook(chainer.optimizer.GradientClipping(params['grad_clip']))
    lm_optim.add_hook(chainer.optimizer.GradientClipping(params['grad_clip']))

    ## non-finetune layer
    ve.joint_enc.W.update_rule.hyperparam.alpha = 4e-4
    ve.joint_enc.b.update_rule.hyperparam.alpha = 4e-4
    lm.gaussian_p.x_var.update_rule.hyperparam.alpha = 1e-2
    lm.gaussian_p.y_var.update_rule.hyperparam.alpha = 1e-2
    ve.gaussian_p.x_var.update_rule.hyperparam.alpha = 1e-2
    ve.gaussian_p.y_var.update_rule.hyperparam.alpha = 1e-2

    iteration = 0
    lam = params['rank_lam']
    val_loss_history = []
    val_loss_lm_s_history = []
    val_loss_lm_l_history = []
    val_acc_history = []
    val_rank_acc_history = []
    # Infinity instead of a fixed 100 so the first validation result always
    # counts as an improvement, whatever the loss scale.
    min_val_loss = float('inf')

    while True:
        chainer.config.train = True
        chainer.config.enable_backprop = True
        ve.zerograds()
        lm.zerograds()
        rl_crit.zerograds()

        data = loader.getBatch('train', params)

        pos_feats = Variable(xp.array(data['feats'], dtype=xp.float32))
        pos_sp_cxt_feats = Variable(
            xp.array(data['sp_cxt_feats'], dtype=xp.float32))
        pos_sp_ann_feats = Variable(
            xp.array(data['sp_ann_feats'], dtype=xp.float32))
        neg_feats = Variable(xp.array(data['neg_feats'], dtype=xp.float32))
        neg_pos_sp_cxt_feats = Variable(
            xp.array(data['neg_sp_cxt_feats'], dtype=xp.float32))
        neg_pos_sp_ann_feats = Variable(
            xp.array(data['neg_sp_ann_feats'], dtype=xp.float32))

        # Triplet layout along axis 0: [positive pair, negative visual with
        # the positive sentence, positive visual with a negative sentence].
        local_shapes = np.concatenate([
            data['local_shapes'], data['neg_local_shapes'],
            data['local_shapes']
        ], axis=0)
        feats = F.concat([pos_feats, neg_feats, pos_feats], axis=0)
        sp_cxt_feats = F.concat(
            [pos_sp_cxt_feats, neg_pos_sp_cxt_feats, pos_sp_cxt_feats],
            axis=0)
        sp_ann_feats = F.concat(
            [pos_sp_ann_feats, neg_pos_sp_ann_feats, pos_sp_ann_feats],
            axis=0)
        seqz = np.concatenate([data['seqz'], data['seqz'], data['neg_seqz']],
                              axis=0)
        lang_last_ind = calc_max_ind(seqz)
        seqz = Variable(xp.array(seqz, dtype=xp.int32))

        coord = cuda.to_cpu(
            feats[:, sum(ve.feat_ind[:1]):sum(ve.feat_ind[:2])].data)
        local_sp_coord, global_sp_coord = calc_coordinate_feature(
            coord, local_shapes, global_shapes=global_shapes)
        local_sp_coord = xp.array(local_sp_coord, dtype=xp.float32)
        global_sp_coord = xp.array(global_sp_coord, dtype=xp.float32)

        # encode vis feature
        vis_feats = ve(feats, sp_cxt_feats, coord)
        sp_feats, sp_feats_emb = lm.calc_spatial_features(
            sp_cxt_feats, sp_ann_feats, local_sp_coord, global_sp_coord)
        logprobs = lm(vis_feats, sp_feats, sp_feats_emb, coord, seqz,
                      lang_last_ind)

        # lang loss
        pairP, vis_unpairP, lang_unpairP = F.split_axis(logprobs, 3, axis=1)
        pair_num, _, lang_unpair_num = np.split(lang_last_ind, 3)
        num_labels = {'T': xp.array(pair_num), 'F': xp.array(lang_unpair_num)}
        lm_flows = {
            'T': pairP,
            'visF': [pairP, vis_unpairP],
            'langF': [pairP, lang_unpairP]
        }
        lm_loss = lm_crits(lm_flows,
                           num_labels,
                           params['lm_margin'],
                           vlamda=params['vis_rank_weight'],
                           llamda=params['lang_rank_weight'])

        # RL loss (pos,pos): only the first third of the triplet batch
        rl_vis_feats = F.split_axis(vis_feats, 3, axis=0)[0]
        rl_coord = np.split(coord, 3, axis=0)[0]
        rl_sp_vis_feats = F.split_axis(sp_feats, 3, axis=0)[0]
        rl_sp_vis_emb = F.split_axis(sp_feats_emb, 3, axis=0)[0]
        sampled_seq, sample_log_probs = lm.sample(rl_vis_feats,
                                                  rl_sp_vis_feats,
                                                  rl_sp_vis_emb, rl_coord)
        sampled_lang_last_ind = calc_max_ind(sampled_seq)
        rl_loss = rl_crit(pos_feats, pos_sp_cxt_feats, rl_coord, sampled_seq,
                          sample_log_probs, sampled_lang_last_ind)

        loss = lm_loss + rl_loss
        print(lm_loss, rl_loss)

        if (params['dataset'] == 'refgta' and params['ranking']
                and iteration > 8000):
            lam += 0.4 / 8000
            score = F.sum(pairP, axis=0) / (xp.array(pair_num + 1))
            rank_loss = calc_rank_loss(score, data['rank'], margin=0.01) * lam
            loss += rank_loss

        loss.backward()
        ve_optim.update()
        lm_optim.update()

        if data['bounds']['wrapped']:
            print('one epoch finished!')
            loader.shuffle('train')

        if iteration % params['losses_log_every'] == 0:
            acc = xp.where(rl_crit.reward > 0.5, 1, 0).mean()
            print('{} iter : train loss {}, acc : {} reward_mean : {}'.format(
                iteration, loss.data, acc, rl_crit.reward.mean()))

        if (iteration % params['save_checkpoint_every'] == 0
                and iteration > 0):
            chainer.config.train = False
            chainer.config.enable_backprop = False
            loader.resetImageIterator('val')
            loss_sum = 0
            loss_generation = 0
            loss_lm_margin = 0
            loss_evals = 0
            accuracy = 0
            rank_acc = 0
            rank_num = 0
            while True:
                data = loader.getImageBatch('val', params)
                img_ann_ids = data['img_ann_ids']
                sent_ids = data['sent_ids']
                gd_ixs = data['gd_ixs']
                feats = Variable(xp.array(data['feats'], dtype=xp.float32))
                sp_cxt_feats = Variable(
                    xp.array(data['sp_cxt_feats'], dtype=xp.float32))
                sp_ann_feats = Variable(
                    xp.array(data['sp_ann_feats'], dtype=xp.float32))
                local_shapes = data['local_shapes']
                seqz = data['seqz']
                lang_last_ind = calc_max_ind(seqz)
                scores = []
                for i, _sent_id in enumerate(sent_ids):
                    gd_ix = gd_ixs[i]
                    # Pair sentence i with every annotation of the image.
                    sent_seqz = np.concatenate(
                        [[seqz[i]] for _ in range(len(img_ann_ids))], axis=0)
                    one_last_ind = np.array([lang_last_ind[i]] *
                                            len(img_ann_ids))
                    sent_seqz = Variable(xp.array(sent_seqz, dtype=xp.int32))

                    # NOTE(review): the visual/spatial encodings below do not
                    # depend on i; they are recomputed per sentence as in the
                    # original -- hoist only if ve/lm are confirmed stateless.
                    coord = cuda.to_cpu(
                        feats[:,
                              sum(ve.feat_ind[:1]):sum(ve.feat_ind[:2])].data)
                    local_sp_coord, global_sp_coord = calc_coordinate_feature(
                        coord, local_shapes, global_shapes=global_shapes)
                    local_sp_coord = xp.array(local_sp_coord,
                                              dtype=xp.float32)
                    global_sp_coord = xp.array(global_sp_coord,
                                               dtype=xp.float32)
                    vis_feats = ve(feats, sp_cxt_feats, coord)
                    sp_feats, sp_feats_emb = lm.calc_spatial_features(
                        sp_cxt_feats, sp_ann_feats, local_sp_coord,
                        global_sp_coord)
                    logprobs = lm(vis_feats, sp_feats, sp_feats_emb, coord,
                                  sent_seqz, one_last_ind).data

                    lm_generation_loss = lm_crits(
                        {
                            'T': logprobs[:, gd_ix, xp.newaxis]
                        }, {
                            'T': one_last_ind[gd_ix, np.newaxis]
                        },
                        params['lm_margin'],
                        vlamda=0,
                        llamda=0).data
                    lm_scores = -computeLosses(logprobs, one_last_ind)
                    lm_margin_loss, pos_sc, max_neg_sc = compute_margin_loss(
                        lm_scores, gd_ix, params['lm_margin'])
                    scores.append(lm_scores[gd_ix])

                    loss_generation += lm_generation_loss
                    loss_lm_margin += lm_margin_loss
                    loss_sum += lm_generation_loss + lm_margin_loss
                    loss_evals += 1
                    if pos_sc > max_neg_sc:
                        accuracy += 1

                if params['dataset'] == 'refgta':
                    rank_a, rank_n = calc_rank_acc(scores, data['rank'])
                    rank_acc += rank_a
                    rank_num += rank_n

                print('{} iter | {}/{} validating acc : {}'.format(
                    iteration, data['bounds']['it_pos_now'],
                    data['bounds']['it_max'], accuracy / loss_evals))

                if data['bounds']['wrapped']:
                    print('validation finished!')
                    fin_val_loss = cuda.to_cpu(loss_sum / loss_evals)
                    loss_generation = cuda.to_cpu(loss_generation / loss_evals)
                    loss_lm_margin = cuda.to_cpu(loss_lm_margin / loss_evals)
                    fin_val_acc = accuracy / loss_evals
                    break

            val_loss_history.append(fin_val_loss)
            val_loss_lm_s_history.append(loss_generation)
            val_loss_lm_l_history.append(loss_lm_margin)
            val_acc_history.append(fin_val_acc)

            run_tag = params['id'] + params['id2']
            if min_val_loss > fin_val_loss:
                print('val loss {} -> {} improved!'.format(
                    min_val_loss, val_loss_history[-1]))
                min_val_loss = fin_val_loss
                serializers.save_hdf5(osp.join(model_dir, run_tag + "ve.h5"),
                                      ve)
                serializers.save_hdf5(osp.join(model_dir, run_tag + "lm.h5"),
                                      lm)

            ## graph
            _save_line_plot(
                "accuracy", [("val_accuracy", val_acc_history)],
                os.path.join(graph_dir, run_tag + "_joint_acc.png"))
            _save_line_plot(
                "loss", [("all_loss", val_loss_history),
                         ("generation_loss", val_loss_lm_s_history)],
                os.path.join(graph_dir, run_tag + "_joint_loss.png"))
            _save_line_plot(
                "loss", [("lm_comp_loss", val_loss_lm_l_history)],
                os.path.join(graph_dir, run_tag + "_joint_comp_loss.png"))
            if params['dataset'] == 'refgta':
                val_rank_acc_history.append(rank_acc / rank_num)
                _save_line_plot(
                    "rank loss", [("rank_acc", val_rank_acc_history)],
                    os.path.join(graph_dir, run_tag + "_joint_rank_acc.png"))

        if (iteration > params['learning_rate_decay_start']
                and params['learning_rate_decay_start'] >= 0):
            frac = (iteration - params['learning_rate_decay_start']
                    ) / params['learning_rate_decay_every']
            decay_factor = math.pow(0.1, frac)
            # NOTE(review): the factor is multiplied into the *current* alpha
            # on every pass through here, so the decay compounds; confirm this
            # is intended rather than alpha = base_alpha * decay_factor.
            ve_optim.alpha *= decay_factor
            lm_optim.alpha *= decay_factor

        iteration += 1
def _save_line_plot(title, series, out_path):
    """Plot each ``(label, values)`` pair against its index and save it."""
    plt.title(title)
    for label, values in series:
        plt.plot(np.arange(len(values)), values, label=label)
    plt.legend()
    plt.savefig(out_path)
    plt.close()


def train_all(params):
    """Jointly train visual encoder, language encoder, CCA embedding and LM.

    Every iteration draws a training batch, builds (positive, visual
    negative, language negative) triplets, and optimises the sum of the
    embedding loss, the language-model loss and the reward-based loss from a
    ``ListenerReward`` whose weights are loaded from
    ``<model_dir>/<id>.h5`` (no optimizer is attached to it here).  Every
    ``save_checkpoint_every`` iterations the validation split is scored,
    the best ``ve``/``le``/``cca``/``lm`` weights are saved and the history
    curves are written under ``log_graph/<dataset>_<splitBy>``.

    The options are also dumped to ``setting.json`` in the working
    directory.  The training loop has no exit condition; it runs until
    interrupted.

    Args:
        params: Option dict (dataset/paths, feature file names, margins,
            logging, hard-mining and checkpoint intervals, learning-rate
            decay options).  When ``params['old']`` is truthy the file-name
            entries and ``id`` are prefixed with ``'old'`` in place.
    """
    split_tag = params['dataset'] + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', split_tag)
    graph_dir = osp.join('log_graph', split_tag)
    model_dir = osp.join(params['save_dir'], 'model', split_tag)

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']
        params['ann_feats'] = 'old' + params['ann_feats']
        params['id'] = 'old' + params['id']
        params['word_emb_path'] = 'old' + params['word_emb_path']

    with open('setting.json', 'w') as f:
        json.dump(params, f)

    # graph_dir is nested ('log_graph/<...>'), so create missing parents too;
    # os.mkdir fails when 'log_graph' itself does not exist yet.
    if not osp.isdir(graph_dir):
        os.makedirs(graph_dir)

    loader = DataLoader(params)

    # model setting
    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    featsOpt = {
        'ann': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats'])
    }
    loader.loadFeats(featsOpt)
    loader.shuffle('train')

    ve = VisualEncoder(res6=L.ResNet152Layers().fc6).to_gpu(gpu_id)
    if 'attention' in params['id']:
        print('attention language encoder')
        le = LanguageEncoderAttn(len(loader.ix_to_word))
        rl_crit = ListenerReward(len(loader.ix_to_word),
                                 attention=True).to_gpu(gpu_id)
    else:
        le = LanguageEncoder(len(loader.ix_to_word))
        rl_crit = ListenerReward(len(loader.ix_to_word),
                                 attention=False).to_gpu(gpu_id)
    cca = CcaEmbedding().to_gpu(gpu_id)
    lm = LanguageModel(len(loader.ix_to_word), loader.seq_length)

    # le / lm are moved to the GPU only after the (CPU-side) embedding
    # assignment below.
    if params['pretrained_w']:
        print('pretrained word embedding...')
        word_emb = load_vcab_init(
            loader.word_to_ix,
            osp.join(target_save_dir, params['word_emb_path']))
        le.word_emb.W.data = word_emb
        # NOTE(review): sharing the embedding with the LM is placed inside the
        # pretrained branch; the flattened source does not show the nesting
        # -- confirm it is not meant to be unconditional.
        lm.word_emb = le.word_emb
    le.to_gpu(gpu_id)
    lm.to_gpu(gpu_id)
    serializers.load_hdf5(osp.join(model_dir, params['id'] + ".h5"), rl_crit)

    ve_optim = optimizers.Adam(alpha=4e-5, beta1=0.8)
    le_optim = optimizers.Adam(alpha=4e-4, beta1=0.8)
    cca_optim = optimizers.Adam(alpha=4e-4, beta1=0.8)
    lm_optim = optimizers.Adam(alpha=4e-4, beta1=0.8)
    ve_optim.setup(ve)
    le_optim.setup(le)
    cca_optim.setup(cca)
    lm_optim.setup(lm)
    ve_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    le_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    cca_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    lm_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    ve.joint_enc.W.update_rule.hyperparam.alpha = 4e-4
    ve.joint_enc.b.update_rule.hyperparam.alpha = 4e-4

    iteration = 0
    val_loss_history = []
    val_loss_lm_s_history = []
    val_loss_lm_l_history = []
    val_loss_l_history = []
    val_acc_history = []
    val_rank_acc_history = []
    # Infinity instead of a fixed 100 so the first validation result always
    # counts as an improvement, whatever the loss scale.
    min_val_loss = float('inf')

    while True:
        chainer.config.train = True
        chainer.config.enable_backprop = True
        ve.zerograds()
        le.zerograds()
        cca.zerograds()
        lm.zerograds()
        rl_crit.zerograds()

        data = loader.getBatch('train', params)

        pos_feats = Variable(xp.array(data['feats'], dtype=xp.float32))
        neg_feats = Variable(xp.array(data['neg_feats'], dtype=xp.float32))

        # Triplet layout along axis 0: [positive pair, negative visual with
        # the positive sentence, positive visual with a negative sentence].
        feats = F.concat([pos_feats, neg_feats, pos_feats], axis=0)
        seqz = np.concatenate([data['seqz'], data['seqz'], data['neg_seqz']],
                              axis=0)
        lang_last_ind = calc_max_ind(seqz)
        seqz = Variable(xp.array(seqz, dtype=xp.int32))

        vis_enc_feats = ve(feats)
        lang_enc_feats = le(seqz, lang_last_ind)
        cossim, vis_emb_feats = cca(vis_enc_feats, lang_enc_feats)
        vis_feats = vis_combine(vis_enc_feats, vis_emb_feats)
        logprobs = lm(vis_feats, seqz, lang_last_ind)

        # emb loss
        pairSim, vis_unpairSim, lang_unpairSim = F.split_axis(cossim, 3,
                                                              axis=0)
        emb_flows = {
            'vis': [pairSim, vis_unpairSim],
            'lang': [pairSim, lang_unpairSim]
        }
        emb_loss = emb_crits(emb_flows, params['emb_margin'])

        # lang loss
        pairP, vis_unpairP, lang_unpairP = F.split_axis(logprobs, 3, axis=1)
        pair_num, _, lang_unpair_num = np.split(lang_last_ind, 3)
        num_labels = {'T': pair_num, 'F': lang_unpair_num}
        lm_flows = {
            'T': pairP,
            'visF': [pairP, vis_unpairP],
            'langF': [pairP, lang_unpairP]
        }
        lm_loss = lm_crits(lm_flows,
                           num_labels,
                           params['lm_margin'],
                           vlamda=params['vis_rank_weight'],
                           llamda=params['lang_rank_weight'])

        # RL loss: (pos, pos) pairs only, i.e. the first third of the batch
        rl_vis_feats = F.split_axis(vis_feats, 3, axis=0)[0]
        sampled_seq, sample_log_probs = lm.sample(rl_vis_feats)
        sampled_lang_last_ind = calc_max_ind(sampled_seq)
        rl_loss = rl_crit(pos_feats, sampled_seq, sample_log_probs,
                          sampled_lang_last_ind)

        loss = emb_loss + lm_loss + rl_loss
        print(emb_loss, lm_loss, rl_loss)

        loss.backward()
        ve_optim.update()
        le_optim.update()
        cca_optim.update()
        lm_optim.update()

        if data['bounds']['wrapped']:
            print('one epoch finished!')
            loader.shuffle('train')

        # NOTE(review): placed at loop level (runs every iteration when
        # check_sent is set); the flattened source does not show whether it
        # was nested under the epoch-wrap check -- confirm.
        if params['check_sent']:
            sampled_sents = loader.decode_sequence(cuda.to_cpu(sampled_seq),
                                                   sampled_lang_last_ind)
            for i, sent in enumerate(sampled_sents):
                print('sampled sentence : ', ' '.join(sent))
                print('reward : ', rl_crit.reward[i])

        if iteration % params['losses_log_every'] == 0:
            acc = xp.where(rl_crit.reward > 0.5, 1, 0).mean()
            print('{} iter : train loss {}, acc : {}, reward_mean : {}'.format(
                iteration, loss.data, acc, rl_crit.reward.mean()))

        if (iteration % params['mine_hard_every'] == 0 and iteration > 0
                and params['mine_hard']):
            make_graph(ve, cca, loader, 'train', params, xp)

        if (iteration % params['save_checkpoint_every'] == 0
                and iteration > 0):
            chainer.config.train = False
            chainer.config.enable_backprop = False
            loader.resetImageIterator('val')
            loss_sum = 0
            loss_generation = 0
            loss_lm_margin = 0
            loss_emb_margin = 0
            loss_evals = 0
            accuracy = 0
            rank_acc = 0
            rank_num = 0
            while True:
                data = loader.getImageBatch('val', params)
                img_ann_ids = data['img_ann_ids']
                sent_ids = data['sent_ids']
                gd_ixs = data['gd_ixs']
                feats = Variable(xp.array(data['feats'], dtype=xp.float32))
                seqz = data['seqz']
                lang_last_ind = calc_max_ind(seqz)
                scores = []
                for i, _sent_id in enumerate(sent_ids):
                    gd_ix = gd_ixs[i]
                    # Pair sentence i with every annotation of the image.
                    sent_seqz = np.concatenate(
                        [[seqz[i]] for _ in range(len(img_ann_ids))], axis=0)
                    one_last_ind = np.array([lang_last_ind[i]] *
                                            len(img_ann_ids))
                    sent_seqz = Variable(xp.array(sent_seqz, dtype=xp.int32))

                    vis_enc_feats = ve(feats)
                    lang_enc_feats = le(sent_seqz, one_last_ind)
                    cossim, vis_emb_feats = cca(vis_enc_feats, lang_enc_feats)
                    vis_feats = vis_combine(vis_enc_feats, vis_emb_feats)
                    logprobs = lm(vis_feats, sent_seqz, one_last_ind).data

                    lm_generation_loss = lm_crits(
                        {
                            'T': logprobs[:, gd_ix, xp.newaxis]
                        }, {
                            'T': one_last_ind[gd_ix, np.newaxis]
                        },
                        params['lm_margin'],
                        vlamda=0,
                        llamda=0).data
                    lm_scores = -computeLosses(logprobs, one_last_ind)
                    lm_margin_loss, _, _ = compute_margin_loss(
                        lm_scores, gd_ix, params['lm_margin'])
                    scores.append(lm_scores[gd_ix])

                    # Accuracy is judged on the embedding similarity, not on
                    # the LM scores.
                    emb_margin_loss, pos_sc, max_neg_sc = compute_margin_loss(
                        cossim.data, gd_ix, params['emb_margin'])

                    loss_generation += lm_generation_loss
                    loss_lm_margin += lm_margin_loss
                    loss_emb_margin += emb_margin_loss
                    loss_sum += (lm_generation_loss + lm_margin_loss +
                                 emb_margin_loss)
                    loss_evals += 1
                    if pos_sc > max_neg_sc:
                        accuracy += 1

                if params['dataset'] == 'refgta':
                    rank_a, rank_n = calc_rank_acc(scores, data['rank'])
                    rank_acc += rank_a
                    rank_num += rank_n

                print('{} iter | {}/{} validating acc : {}'.format(
                    iteration, data['bounds']['it_pos_now'],
                    data['bounds']['it_max'], accuracy / loss_evals))

                if data['bounds']['wrapped']:
                    print('validation finished!')
                    fin_val_loss = cuda.to_cpu(loss_sum / loss_evals)
                    loss_generation = cuda.to_cpu(loss_generation / loss_evals)
                    loss_lm_margin = cuda.to_cpu(loss_lm_margin / loss_evals)
                    loss_emb_margin = cuda.to_cpu(loss_emb_margin / loss_evals)
                    fin_val_acc = accuracy / loss_evals
                    break

            val_loss_history.append(fin_val_loss)
            val_loss_lm_s_history.append(loss_generation)
            val_loss_lm_l_history.append(loss_lm_margin)
            val_loss_l_history.append(loss_emb_margin)
            val_acc_history.append(fin_val_acc)

            run_tag = params['id'] + params['id2']
            if min_val_loss > fin_val_loss:
                print('val loss {} -> {} improved!'.format(
                    min_val_loss, val_loss_history[-1]))
                min_val_loss = fin_val_loss
                serializers.save_hdf5(osp.join(model_dir, run_tag + "ve.h5"),
                                      ve)
                serializers.save_hdf5(osp.join(model_dir, run_tag + "le.h5"),
                                      le)
                serializers.save_hdf5(osp.join(model_dir, run_tag + "cca.h5"),
                                      cca)
                serializers.save_hdf5(osp.join(model_dir, run_tag + "lm.h5"),
                                      lm)

            ## graph
            _save_line_plot(
                "accuracy", [("val_accuracy", val_acc_history)],
                os.path.join(graph_dir, run_tag + "_joint_acc.png"))
            _save_line_plot(
                "loss", [("all_loss", val_loss_history),
                         ("generation_loss", val_loss_lm_s_history)],
                os.path.join(graph_dir, run_tag + "_joint_loss.png"))
            _save_line_plot(
                "loss", [("lm_comp_loss", val_loss_lm_l_history),
                         ("comp_loss", val_loss_l_history)],
                os.path.join(graph_dir, run_tag + "_joint_comp_loss.png"))
            if params['dataset'] == 'refgta':
                print(rank_num)
                val_rank_acc_history.append(rank_acc / rank_num)
                _save_line_plot(
                    "rank loss", [("rank_acc", val_rank_acc_history)],
                    os.path.join(graph_dir, run_tag + "_rank_acc.png"))

        if (iteration > params['learning_rate_decay_start']
                and params['learning_rate_decay_start'] >= 0):
            frac = (iteration - params['learning_rate_decay_start']
                    ) / params['learning_rate_decay_every']
            decay_factor = math.pow(0.1, frac)
            # NOTE(review): the factor is multiplied into the *current* alpha
            # on every pass through here, so the decay compounds; confirm this
            # is intended rather than alpha = base_alpha * decay_factor.
            ve_optim.alpha *= decay_factor
            le_optim.alpha *= decay_factor
            cca_optim.alpha *= decay_factor
            lm_optim.alpha *= decay_factor

        iteration += 1
def eval_all(params):
    """Generate a sentence per test reference and score them.

    Loads the trained ``ve``/``lm`` weights, decodes every batch of
    ``params['split']`` (``lm.max_sample`` when ``beam_width == 1``,
    otherwise ``beam_search``), runs ``language_eval`` on the predictions
    and writes three JSON files under ``result/<dataset>_<splitBy>/``: the
    raw predictions (``...raw.json``), the language statistics
    (``....json``) and all beam candidates (``...all_beam.json``).

    Args:
        params: Option dict (dataset/paths, feature file names, ``id``,
            ``id2``, ``split``, ``beam_width``, ``gpu_id``).  When
            ``params['old']`` is truthy some entries are prefixed with
            ``'old'`` in place.

    Raises:
        ValueError: If ``params['dataset']`` is not one of the supported
            datasets (previously this surfaced later as an unbound
            ``global_shapes``).
    """
    dataset = params['dataset']
    if dataset in ['refcoco', 'refcoco+', 'refcocog']:
        global_shapes = (224, 224)
    elif dataset == 'refgta':
        global_shapes = (480, 288)
    else:
        raise ValueError('unsupported dataset: {!r}'.format(dataset))

    split_tag = dataset + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', split_tag)
    model_dir = osp.join(params['save_dir'], 'model', split_tag)
    result_dir = osp.join('result', split_tag)
    if not osp.isdir(result_dir):
        os.makedirs(result_dir)

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        # NOTE(review): the two *_h5 keys prefixed here are not the keys used
        # to build featsOpt below (image_feats / ann_feats / sp_ann_feats /
        # ann_shapes), so the feature files are loaded un-prefixed -- confirm
        # whether those keys should be prefixed instead.
        params['image_feats_h5'] = 'old' + params['image_feats_h5']
        params['ann_feats_h5'] = 'old' + params['ann_feats_h5']
        params['id'] = 'old' + params['id']

    loader = DataLoader(params)
    featsOpt = {
        'sp_ann': osp.join(target_save_dir, params['sp_ann_feats']),
        'ann_input': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats']),
        'shapes': osp.join(target_save_dir, params['ann_shapes'])
    }
    loader.loadFeats(featsOpt)
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    ve = VisualEncoder(global_shapes=global_shapes).to_gpu(gpu_id)
    lm = LanguageModel(len(loader.ix_to_word), loader.seq_length,
                       global_shapes).to_gpu(gpu_id)

    run_tag = params['id'] + params['id2']
    serializers.load_hdf5(osp.join(model_dir, run_tag + "ve.h5"), ve)
    serializers.load_hdf5(osp.join(model_dir, run_tag + "lm.h5"), lm)

    beam_width = params['beam_width']
    predictions = []
    beam_all_results = []
    while True:
        data = loader.getTestBatch(params['split'], params)
        ref_ids = data['ref_ids']
        feats = Variable(xp.array(data['feats'], dtype=xp.float32))
        sp_cxt_feats = Variable(
            xp.array(data['sp_cxt_feats'], dtype=xp.float32))
        sp_ann_feats = Variable(
            xp.array(data['sp_ann_feats'], dtype=xp.float32))
        local_shapes = data['local_shapes']

        coord = data['feats'][:, sum(ve.feat_ind[:1]):sum(ve.feat_ind[:2])]
        local_sp_coord, global_sp_coord = calc_coordinate_feature(
            coord, local_shapes, global_shapes=global_shapes)
        local_sp_coord = xp.array(local_sp_coord, dtype=xp.float32)
        global_sp_coord = xp.array(global_sp_coord, dtype=xp.float32)

        vis_feats = ve(feats, sp_cxt_feats, coord)
        sp_feats, sp_feats_emb = lm.calc_spatial_features(
            sp_cxt_feats, sp_ann_feats, local_sp_coord, global_sp_coord)

        if beam_width == 1:
            results = lm.max_sample(vis_feats)
        else:
            beam_results, _ = beam_search(lm, vis_feats, sp_feats,
                                          sp_feats_emb, coord, beam_width)
            # Keep only the top candidate of each beam as the prediction.
            results = [result[0]['sent'] for result in beam_results]
            ppls = [result[0]['ppl'] for result in beam_results]

        for i, result in enumerate(results):
            gen_sentence = ' '.join(
                [loader.ix_to_word[str(w)] for w in result])
            if beam_width == 1:
                print(gen_sentence)
            else:
                print(gen_sentence, ', ppl : ', ppls[i])
            predictions.append({'ref_id': ref_ids[i], 'sent': gen_sentence})
            if beam_width > 1:
                beam_all_results.append({
                    'ref_id': ref_ids[i],
                    'beam': beam_results[i]
                })

        print('evaluating validation performance... {}/{}'.format(
            data['bounds']['it_pos_now'], data['bounds']['it_max']))

        if data['bounds']['wrapped']:
            print('validation finished!')
            break

    lang_stats = language_eval(predictions, params['split'], params)
    print(lang_stats)
    print('sentence mean length: ',
          np.mean([len(pred['sent'].split()) for pred in predictions]))

    # All three outputs share the same file-name stem.
    out_stem = osp.join(result_dir,
                        run_tag + str(beam_width) + params['split'])
    with open(out_stem + 'raw.json', 'w') as f:
        json.dump(predictions, f)
    with open(out_stem + '.json', 'w') as f:
        json.dump(lang_stats, f)
    with open(out_stem + 'all_beam.json', 'w') as f:
        json.dump(beam_all_results, f)
def eval_all(params):
    """Attach generated object captions to every entry and pickle the result.

    For each entry of ``train_entries`` the 36 boxes of its image are turned
    from (x1, y1, x2, y2) into (x1, y1, w, h) in place, features are fetched
    per box with ``fetch_feats``, and up to three beam-search sentences per
    box are stored in ``entry['object_captions']``. Finally the whole
    ``train_entries`` list is pickled to ``VQA_ref_testdataset_v3.pkl``.

    ``train_entries``, ``train_iid2id``, ``all_image_feats``, ``all_boxes``
    and ``fetch_feats`` are not defined in this function and must exist in
    the enclosing module.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``data_json``, ``data_h5``, ``image_feats``,
            ``ann_feats``, ``id``, ``id2``, ``gpu_id``, ``beam_width``.
            The dict is modified in place when ``old`` is truthy.
    """
    dataset_tag = params['dataset'] + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', dataset_tag)
    model_dir = osp.join(params['save_dir'], 'model', dataset_tag)

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']
        params['ann_feats'] = 'old' + params['ann_feats']
        params['id'] = 'old' + params['id']

    loader = DataLoader(params)
    featsOpt = {
        'ann': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats'])
    }
    loader.loadFeats(featsOpt)

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    if 'attention' in params['id']:
        print('attn')
        le = LanguageEncoderAttn(len(loader.ix_to_word)).to_gpu(gpu_id)
    else:
        le = LanguageEncoder(len(loader.ix_to_word)).to_gpu(gpu_id)
    ve = VisualEncoder().to_gpu(gpu_id)
    cca = CcaEmbedding().to_gpu(gpu_id)
    lm = LanguageModel(len(loader.ix_to_word),
                       loader.seq_length).to_gpu(gpu_id)

    model_prefix = params['id'] + params['id2']
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "ve.h5"), ve)
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "le.h5"), le)
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "cca.h5"), cca)
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "lm.h5"), lm)

    for num, entry in enumerate(train_entries):
        print("{}/{}".format(num, len(train_entries)))
        image_id = entry['image_id']
        idx = train_iid2id[image_id]
        features = all_image_feats[idx]
        boxes = all_boxes[idx]
        referring_expression = []

        # Convert corner boxes to (x, y, width, height).
        # NOTE(review): this mutates `boxes` in place. If `all_boxes[idx]` is
        # a view (in-memory array) and several entries share an image, the
        # conversion would be applied more than once -- confirm.
        for m in range(36):
            bbox = boxes[m]
            bbox[2] -= bbox[0]
            bbox[3] -= bbox[1]
            boxes[m] = bbox

        for k in range(36):
            feats = fetch_feats(entry, features, boxes, loader, k, params)
            feats = Variable(xp.array(feats, dtype=xp.float32))
            vis_enc_feats = ve(feats)
            # No sentence is available here, so the visual encoding is fed
            # to the CCA embedding in place of a language encoding.
            lang_enc_feats = vis_enc_feats
            _, vis_emb_feats = cca(vis_enc_feats, lang_enc_feats)
            vis_feats = vis_combine(vis_enc_feats, vis_emb_feats)
            beam_results = beam_search(lm, vis_feats, params['beam_width'])
            results = [result['sent'] for result in beam_results[0]]
            results = results[:3]  # keep at most three sentences per box
            gen_sentence = [
                ' '.join([loader.ix_to_word[str(w)] for w in result])
                for result in results
            ]
            referring_expression.append(gen_sentence)
        entry['object_captions'] = referring_expression

    # Use a context manager so the file is flushed and closed even on error.
    with open('VQA_ref_testdataset_v3.pkl', 'wb') as f:
        pickle.dump(train_entries, f)
def eval_all(params):
    """Decode a sentence for every reference of a split and evaluate them.

    Restores the visual encoder, language encoder, CCA embedding and language
    model from ``<save_dir>/model/<dataset>_<splitBy>/``, decodes each test
    batch (``max_sample`` when ``params['beam_width'] == 1``, otherwise the
    first beam returned by ``beam_search``), and prints the statistics
    returned by ``language_eval``. Nothing is returned or written to disk.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``data_json``, ``data_h5``, ``image_feats``,
            ``ann_feats``, ``id``, ``id2``, ``gpu_id``, ``split``,
            ``beam_width``. Modified in place when ``old`` is truthy.
    """
    split_tag = params['dataset'] + '_' + params['splitBy']
    prepro_dir = osp.join(params['save_dir'], 'prepro', split_tag)
    weights_dir = osp.join(params['save_dir'], 'model', split_tag)

    if params['old']:
        for key in ('data_json', 'data_h5', 'image_feats', 'ann_feats', 'id'):
            params[key] = 'old' + params[key]

    loader = DataLoader(params)
    loader.loadFeats({
        'ann': osp.join(prepro_dir, params['ann_feats']),
        'img': osp.join(prepro_dir, params['image_feats']),
    })

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    vocab_size = len(loader.ix_to_word)
    if 'attention' in params['id']:
        print('attn')
        le = LanguageEncoderAttn(vocab_size).to_gpu(gpu_id)
    else:
        le = LanguageEncoder(vocab_size).to_gpu(gpu_id)
    ve = VisualEncoder().to_gpu(gpu_id)
    cca = CcaEmbedding().to_gpu(gpu_id)
    lm = LanguageModel(vocab_size, loader.seq_length).to_gpu(gpu_id)

    weight_prefix = params['id'] + params['id2']
    for suffix, link in (("ve.h5", ve), ("le.h5", le), ("cca.h5", cca),
                         ("lm.h5", lm)):
        serializers.load_hdf5(osp.join(weights_dir, weight_prefix + suffix),
                              link)

    predictions = []
    beam_all_results = []
    while True:
        batch = loader.getTestBatch(params['split'], params)
        ref_ids = batch['ref_ids']
        image_id = batch['image_id']
        lang_last_ind = calc_max_ind(batch['seqz'])  # result is not used below
        feats = Variable(xp.array(batch['feats'], dtype=xp.float32))

        vis_enc_feats = ve(feats)
        # "fake" language input: the visual encoding stands in for the
        # language encoding when calling the CCA embedding.
        _, vis_emb_feats = cca(vis_enc_feats, vis_enc_feats)
        vis_feats = vis_combine(vis_enc_feats, vis_emb_feats)

        greedy = params['beam_width'] == 1
        if greedy:
            results = lm.max_sample(vis_feats)
        else:
            beam_results = beam_search(lm, vis_feats, params['beam_width'])
            results = [beams[0]['sent'] for beams in beam_results]
            ppls = [beams[0]['ppl'] for beams in beam_results]

        for i, word_ids in enumerate(results):
            gen_sentence = ' '.join(
                loader.ix_to_word[str(w)] for w in word_ids)
            if greedy:
                print(gen_sentence)
            else:
                print(gen_sentence, 'image_id : ', image_id)
            predictions.append({'ref_id': ref_ids[i], 'sent': gen_sentence})
            if params['beam_width'] > 1:
                beam_all_results.append({
                    'ref_id': ref_ids[i],
                    'beam': beam_results[i],
                })

        bounds = batch['bounds']
        print('evaluating validation performance... {}/{}'.format(
            bounds['it_pos_now'], bounds['it_max']))
        if bounds['wrapped']:
            print('validation finished!')
            break

    lang_stats = language_eval(predictions, params['split'], params)
    print(lang_stats)
def eval_all(params):
    """Evaluate comprehension accuracy of the CCA / language-model scorer.

    For every sentence of every image in ``params['split']`` the sentence is
    scored against all candidate regions of that image; the sentence counts as
    correct when the ground-truth region scores strictly higher than the best
    other region (as reported by ``compute_margin_loss``). The final accuracy
    (percent) is written to ``result/<dataset><split><id><mode><lamda>comp.txt``.

    Scoring depends on ``params['mode']``:
        0 -- language-model score (negated ``computeLosses``),
        1 -- cosine similarity from the CCA embedding,
        2 -- cosine similarity + ``params['lamda']`` * language-model score.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``id``, ``id2``, ``gpu_id``, ``split``,
            ``mode``, ``lamda``, ``ann_feats``, ``image_feats`` (plus
            ``data_json`` / ``data_h5`` when the ``old`` branch applies).
            Modified in place when the ``old`` branch applies.

    Raises:
        ValueError: If ``params['mode']`` is not 0, 1 or 2.
    """
    import os  # local import: only needed to create the output directory

    if params['mode'] not in (0, 1, 2):
        # Previously an unknown mode crashed inside the loop with a NameError.
        raise ValueError('Unsupported mode: {}'.format(params['mode']))

    dataset_tag = params['dataset'] + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', dataset_tag)
    model_dir = osp.join(params['save_dir'], 'model', dataset_tag)

    if params['old'] and params['dataset'] in [
            'refcoco', 'refcoco+', 'refcocog'
    ]:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']
        params['ann_feats'] = 'old' + params['ann_feats']
        params['id'] = 'old' + params['id']

    loader = DataLoader(params)
    featsOpt = {
        'ann': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats'])
    }
    loader.loadFeats(featsOpt)

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    ve = VisualEncoder().to_gpu(gpu_id)
    if 'attention' in params['id']:
        print('attn')
        le = LanguageEncoderAttn(len(loader.ix_to_word)).to_gpu(gpu_id)
    else:
        le = LanguageEncoder(len(loader.ix_to_word)).to_gpu(gpu_id)
    cca = CcaEmbedding().to_gpu(gpu_id)
    lm = LanguageModel(len(loader.ix_to_word),
                       loader.seq_length).to_gpu(gpu_id)

    model_prefix = params['id'] + params['id2']
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "ve.h5"), ve)
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "le.h5"), le)
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "cca.h5"), cca)
    serializers.load_hdf5(osp.join(model_dir, model_prefix + "lm.h5"), lm)

    accuracy = 0
    loss_evals = 0
    while True:
        data = loader.getImageBatch(params['split'], params)
        img_ann_ids = data['img_ann_ids']
        sent_ids = data['sent_ids']
        gd_ixs = data['gd_ixs']
        feats = Variable(xp.array(data['feats'], dtype=xp.float32))
        seqz = data['seqz']
        lang_last_ind = calc_max_ind(seqz)

        for i, sent_id in enumerate(sent_ids):
            gd_ix = gd_ixs[i]
            # Repeat sentence i once per candidate region so it can be scored
            # against all of them in a single forward pass.
            sent_seqz = np.concatenate(
                [[seqz[i]] for _ in range(len(img_ann_ids))], axis=0)
            one_last_ind = np.array([lang_last_ind[i]] * len(img_ann_ids))
            sent_seqz = Variable(xp.array(sent_seqz, dtype=xp.int32))

            vis_enc_feats = ve(feats)
            lang_enc_feats = le(sent_seqz, one_last_ind)
            cossim, vis_emb_feats = cca(vis_enc_feats, lang_enc_feats)
            vis_feats = vis_combine(vis_enc_feats, vis_emb_feats)
            logprobs = lm(vis_feats, sent_seqz, one_last_ind).data
            lm_scores = -computeLosses(logprobs, one_last_ind)

            if params['mode'] == 0:
                _, pos_sc, max_neg_sc = compute_margin_loss(
                    lm_scores, gd_ix, 0)
            elif params['mode'] == 1:
                _, pos_sc, max_neg_sc = compute_margin_loss(
                    cossim.data, gd_ix, 0)
            else:  # mode == 2, validated above
                scores = cossim.data + params['lamda'] * lm_scores
                _, pos_sc, max_neg_sc = compute_margin_loss(
                    scores, gd_ix, 0)

            if pos_sc > max_neg_sc:
                accuracy += 1
            loss_evals += 1
            print('{}-th: evaluating [{}] ... image[{}/{}] sent[{}], acc={}'.
                  format(loss_evals, params['split'],
                         data['bounds']['it_pos_now'],
                         data['bounds']['it_max'], i,
                         accuracy * 100.0 / loss_evals))

        if data['bounds']['wrapped']:
            print('validation finished!')
            # Make sure the output directory exists so the result is not lost
            # after the whole evaluation has run.
            if not osp.isdir('result'):
                os.makedirs('result')
            result_path = osp.join(
                'result',
                params['dataset'] + params['split'] + params['id'] +
                str(params['mode']) + str(params['lamda']) + 'comp.txt')
            # Open in write mode and store the accuracy string; the context
            # manager closes the file even if the write fails.
            with open(result_path, 'w') as f:
                f.write(str(accuracy * 100.0 / loss_evals))
            break
def train_vl(params):
    """Train the visual encoder, language encoder and metric network.

    Each training batch is built from three groups of (region, sentence)
    pairs -- (positive region, sentence), (negative region, sentence) and
    (positive region, negative sentence) -- labelled 1, 0 and 0, and optimised
    with sigmoid cross-entropy. Every ``params['save_checkpoint_every']``
    iterations the 'val' split is evaluated, loss/accuracy curves are saved
    under ``log_graph/<dataset>_<splitBy>/`` and, when validation accuracy
    improves, the three networks are saved together as
    ``<model_dir>/<id>.h5``.

    The loop has no exit condition; it runs until the process is stopped.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``id``, ``ann_feats``, ``image_feats``,
            ``batch_size``, ``gpu_id``, ``seq_per_ref``, ``pretrained_w``,
            ``word_emb_path``, ``losses_log_every``,
            ``save_checkpoint_every``, ``learning_rate_decay_start``,
            ``learning_rate_decay_every`` (plus ``data_json`` / ``data_h5``
            when ``old`` is truthy). Modified in place when ``old`` is truthy.
    """
    dataset_tag = params['dataset'] + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', dataset_tag)
    graph_dir = osp.join('log_graph', dataset_tag)
    model_dir = osp.join(params['save_dir'], 'model', dataset_tag)
    if not osp.isdir(graph_dir):
        os.makedirs(graph_dir)
    if not osp.isdir(model_dir):
        os.makedirs(model_dir)

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']
        params['ann_feats'] = 'old' + params['ann_feats']
        params['id'] = 'old' + params['id']
        params['word_emb_path'] = 'old' + params['word_emb_path']

    loader = DataLoader(params)
    featsOpt = {
        'ann': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats'])
    }
    loader.loadFeats(featsOpt)
    loader.shuffle('train')

    # model setting
    batch_size = params['batch_size']
    gpu_id = params['gpu_id']
    seq_per_ref = params['seq_per_ref']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    ve = VisualEncoder(res6=L.ResNet152Layers().fc6).to_gpu(gpu_id)
    if 'attention' in params['id']:
        print('attention language encoder')
        le = LanguageEncoderAttn(len(loader.ix_to_word))
        save_model = ListenerReward(len(loader.ix_to_word), attention=True)
    else:
        le = LanguageEncoder(len(loader.ix_to_word))
        save_model = ListenerReward(len(loader.ix_to_word), attention=False)

    if params['pretrained_w']:
        print('pretrained word embedding...')
        word_emb = load_vcab_init(
            loader.word_to_ix,
            osp.join(target_save_dir, params['word_emb_path']))
        le.word_emb.W.data = word_emb
    le.to_gpu(gpu_id)
    me = MetricNet().to_gpu(gpu_id)

    # Base learning rates; kept so the decay below can be computed from the
    # initial value instead of compounding on an already-decayed one.
    ve_base_alpha = 4e-5
    le_base_alpha = 4e-4
    me_base_alpha = 4e-4
    ve_optim = optimizers.Adam(alpha=ve_base_alpha, beta1=0.8)
    le_optim = optimizers.Adam(alpha=le_base_alpha, beta1=0.8)
    me_optim = optimizers.Adam(alpha=me_base_alpha, beta1=0.8)
    ve_optim.setup(ve)
    le_optim.setup(le)
    me_optim.setup(me)
    ve_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    le_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    me_optim.add_hook(chainer.optimizer.GradientClipping(0.1))
    # The joint encoding layer gets its own, larger learning rate.
    ve.joint_enc.W.update_rule.hyperparam.alpha = 4e-4
    ve.joint_enc.b.update_rule.hyperparam.alpha = 4e-4
    base_alphas = ((ve_optim, ve_base_alpha), (le_optim, le_base_alpha),
                   (me_optim, me_base_alpha))

    iteration = 0
    epoch = 0
    val_loss_history = []
    val_acc_history = []
    val_rank_acc_history = []
    min_val_loss = 100
    max_acc = 0

    while True:
        chainer.config.train = True
        chainer.config.enable_backprop = True
        ve.zerograds()
        le.zerograds()
        me.zerograds()

        data = loader.getBatch('train', params)
        pos_feats = Variable(xp.array(data['feats'], dtype=xp.float32))
        neg_feats = Variable(xp.array(data['neg_feats'], dtype=xp.float32))
        # Three groups: (pos region, sent), (neg region, sent),
        # (pos region, neg sent) -- matching the label blocks 1 / 0 / 0.
        feats = F.concat([pos_feats, neg_feats, pos_feats], axis=0)
        seqz = np.concatenate(
            [data['seqz'], data['seqz'], data['neg_seqz']], axis=0)
        lang_last_ind = calc_max_ind(seqz)
        seqz = Variable(xp.array(seqz, dtype=xp.int32))
        labels = Variable(
            xp.concatenate([
                xp.ones((batch_size * seq_per_ref)),
                xp.zeros((batch_size * seq_per_ref)),
                xp.zeros((batch_size * seq_per_ref))
            ]).astype(xp.int32))

        vis_enc_feats = ve(feats)
        lang_enc_feats = le(seqz, lang_last_ind)
        score = me(vis_enc_feats, lang_enc_feats).reshape(labels.shape)
        loss = F.sigmoid_cross_entropy(score, labels)

        loss.backward()
        ve_optim.update()
        le_optim.update()
        me_optim.update()

        if data['bounds']['wrapped']:
            print('{} epoch finished!'.format(epoch))
            loader.shuffle('train')
            epoch += 1

        if iteration % params['losses_log_every'] == 0:
            print('{} iter ({} epoch): train loss {}'.format(
                iteration, epoch, loss.data))

        ## validation
        if (iteration % params['save_checkpoint_every'] == 0
                and iteration > 0):
            chainer.config.train = False
            chainer.config.enable_backprop = False
            loader.resetImageIterator('val')
            loss_sum = 0
            loss_evals = 0
            accuracy = 0
            rank_acc = 0
            rank_num = 0
            while True:
                data = loader.getImageBatch('val', params)
                img_ann_ids = data['img_ann_ids']
                sent_ids = data['sent_ids']
                gd_ixs = data['gd_ixs']
                feats = Variable(xp.array(data['feats'], dtype=xp.float32))
                seqz = data['seqz']
                scores = []
                for i, sent_id in enumerate(sent_ids):
                    # Compute the score of this sentence against every
                    # candidate region in the image.
                    gd_ix = gd_ixs[i]
                    labels = xp.zeros(len(img_ann_ids), dtype=xp.int32)
                    labels[gd_ix] = 1
                    labels = Variable(labels)

                    sent_seqz = np.concatenate(
                        [[seqz[i]] for _ in range(len(img_ann_ids))],
                        axis=0)
                    lang_last_ind = calc_max_ind(sent_seqz)
                    sent_seqz = Variable(xp.array(sent_seqz, dtype=xp.int32))

                    vis_enc_feats = ve(feats)
                    lang_enc_feats = le(sent_seqz, lang_last_ind)
                    score = me(vis_enc_feats,
                               lang_enc_feats).reshape(labels.shape)
                    loss = F.sigmoid_cross_entropy(score, labels)
                    scores.append(score[gd_ix].data)

                    loss_sum += loss.data
                    loss_evals += 1
                    _, pos_sc, max_neg_sc = compute_margin_loss(
                        score.data, gd_ix, 0)
                    if pos_sc > max_neg_sc:
                        accuracy += 1

                if params['dataset'] == 'refgta':
                    rank_a, rank_n = calc_rank_acc(scores, data['rank'])
                    rank_acc += rank_a
                    rank_num += rank_n
                print('{} iter | {}/{} validating acc : {}'.format(
                    iteration, data['bounds']['it_pos_now'],
                    data['bounds']['it_max'], accuracy / loss_evals))

                if data['bounds']['wrapped']:
                    print('validation finished!')
                    fin_val_loss = cuda.to_cpu(loss_sum / loss_evals)
                    fin_val_acc = accuracy / loss_evals
                    break

            val_loss_history.append(fin_val_loss)
            val_acc_history.append(fin_val_acc)
            if min_val_loss > fin_val_loss:
                print('val loss {} -> {} improved!'.format(
                    min_val_loss, val_loss_history[-1]))
                min_val_loss = fin_val_loss
            if max_acc < fin_val_acc:
                # Keep only the best-accuracy checkpoint.
                # NOTE(review): indentation reconstructed from collapsed
                # source; saving is assumed to belong to this branch.
                max_acc = fin_val_acc
                save_model.ve = ve
                save_model.le = le
                save_model.me = me
                serializers.save_hdf5(
                    osp.join(model_dir, params['id'] + ".h5"), save_model)

            ## graph
            plt.title("accuracy")
            plt.plot(np.arange(len(val_acc_history)),
                     val_acc_history,
                     label="val_accuracy")
            plt.legend()
            plt.savefig(os.path.join(graph_dir, params['id'] + "_acc.png"))
            plt.close()

            plt.title("loss")
            plt.plot(np.arange(len(val_loss_history)),
                     val_loss_history,
                     label="val_loss")
            plt.legend()
            plt.savefig(os.path.join(graph_dir, params['id'] + "_loss.png"))
            plt.close()

            if params['dataset'] == 'refgta':
                print(rank_num)
                val_rank_acc_history.append(rank_acc / rank_num)
                plt.title("rank loss")
                plt.plot(np.arange(len(val_rank_acc_history)),
                         val_rank_acc_history,
                         label="rank_acc")
                plt.legend()
                plt.savefig(
                    os.path.join(graph_dir, params['id'] + "_rank_acc.png"))
                plt.close()

        # learning rate decay
        decay_start = params['learning_rate_decay_start']
        if iteration > decay_start and decay_start >= 0:
            frac = ((iteration - decay_start) /
                    params['learning_rate_decay_every'])
            decay_factor = math.pow(0.1, frac)
            # Fix: the factor is already cumulative (it grows with
            # `iteration`), so apply it to the base rate. Multiplying the
            # current alpha every iteration compounded the decay and drove
            # the learning rate to ~0 almost immediately.
            for optim, base_alpha in base_alphas:
                optim.alpha = base_alpha * decay_factor

        iteration += 1
def eval_all(params):
    """Compute per-box combined visual features and store them in HDF5.

    For every entry of ``train_entries`` the 36 boxes of its image are
    converted in place from (x1, y1, x2, y2) to (x1, y1, w, h); each box is
    encoded with the visual encoder and CCA embedding, and the combined
    feature is stored in a ``[81434, 36, 1024]`` array. The array is written
    to ``new_hdf5/add_test36.hdf5`` together with ``all_image_feats``,
    ``all_sp_feats`` and ``all_boxes``.

    ``train_entries``, ``train_iid2id``, ``all_image_feats``, ``all_sp_feats``,
    ``all_boxes`` and ``fetch_feats`` are not defined in this function and
    must exist in the enclosing module.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``batch_size``, ``old``, ``data_json``, ``data_h5``,
            ``image_feats``, ``ann_feats``, ``id``, ``id2``, ``gpu_id``.
            Modified in place when ``old`` is truthy.
    """
    split_tag = params['dataset'] + '_' + params['splitBy']
    prepro_dir = osp.join(params['save_dir'], 'prepro', split_tag)
    weights_dir = osp.join(params['save_dir'], 'model', split_tag)
    batch_size = params['batch_size']  # read but not used below

    if params['old']:
        for key in ('data_json', 'data_h5', 'image_feats', 'ann_feats', 'id'):
            params[key] = 'old' + params[key]

    loader = DataLoader(params)
    loader.loadFeats({
        'ann': osp.join(prepro_dir, params['ann_feats']),
        'img': osp.join(prepro_dir, params['image_feats']),
    })

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    vocab_size = len(loader.ix_to_word)
    if 'attention' in params['id']:
        print('attn')
        le = LanguageEncoderAttn(vocab_size).to_gpu(gpu_id)
    else:
        le = LanguageEncoder(vocab_size).to_gpu(gpu_id)
    ve = VisualEncoder().to_gpu(gpu_id)
    cca = CcaEmbedding().to_gpu(gpu_id)
    lm = LanguageModel(vocab_size, loader.seq_length).to_gpu(gpu_id)

    weight_prefix = params['id'] + params['id2']
    for suffix, link in (("ve.h5", ve), ("le.h5", le), ("cca.h5", cca),
                         ("lm.h5", lm)):
        serializers.load_hdf5(osp.join(weights_dir, weight_prefix + suffix),
                              link)

    # Original author's note on split sizes:
    # train: 82783 , val: 40504, test: 81434
    add_feats = np.zeros([81434, 36, 1024])

    total = len(train_entries)
    for num, entry in enumerate(train_entries):
        print("{}/{}".format(num, total))
        idx = train_iid2id[entry['image_id']]
        features = all_image_feats[idx]
        boxes = all_boxes[idx]

        # Corner format -> (x, y, width, height), row by row, in place.
        for row in range(36):
            box = boxes[row]
            box[2] -= box[0]
            box[3] -= box[1]
            boxes[row] = box

        for k in range(36):
            # Shapes below are the original author's notes, not verified here:
            # feats [1, 6249]; encodings / embedding [1, 512]; combined [1, 1024].
            raw = fetch_feats(entry, features, boxes, loader, k, params)
            feats = Variable(xp.array(raw, dtype=xp.float32))
            vis_enc_feats = ve(feats)
            # The visual encoding stands in for the language encoding.
            _, vis_emb_feats = cca(vis_enc_feats, vis_enc_feats)
            combined = vis_combine(vis_enc_feats, vis_emb_feats)
            combined = chainer.cuda.to_cpu(combined.data)
            add_feats[idx][k] = np.array(combined)

    datasets = (
        ('image_features', all_image_feats, (82783, 36, 2048)),
        ('spatial_features', all_sp_feats, (82783, 36, 6)),
        ('image_bb', all_boxes, (82783, 36, 4)),
        ('additional_feats', add_feats, (82783, 36, 1024)),
    )
    with h5py.File('new_hdf5/add_test36.hdf5', 'w') as hf:
        for name, payload, max_shape in datasets:
            hf.create_dataset(name, data=payload, maxshape=max_shape)
def main(params):
    """Rerank beam-search sentences per image and evaluate the selection.

    Loads previously saved beams from a JSON file, computes confusion scores
    with the visual encoder / language encoder / CCA embedding
    (``calc_confusion``), builds unary and pairwise potentials, selects one
    sentence per reference with ``bilp`` and evaluates the selection with
    ``RefEvaluation`` and ``CrossEvaluation``. When
    ``params['write_result'] > 0`` the evaluation is also written to a
    ``*_out.json`` file under ``cache/lang/<dataset>_<splitBy>/``.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``split``, ``id``, ``beam_width``,
            ``ann_feats_h5``, ``image_feats_h5``, ``gpu_id``, ``model_root``,
            ``data_root``, ``write_result``, ``model_id`` (plus ``data_json``
            / ``data_h5`` when ``old`` is truthy). Modified in place when
            ``old`` is truthy.

    Raises:
        ValueError: If no reranking weights are defined for
            ``params['dataset']``.
    """
    target_save_dir = osp.join(params['save_dir'], 'prepro',
                               params['dataset'] + '_' + params['splitBy'])

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats_h5'] = 'old' + params['image_feats_h5']
        params['ann_feats_h5'] = 'old' + params['ann_feats_h5']
        params['id'] = 'old' + params['id']

    # NOTE(review): the file name is appended directly to target_save_dir
    # without a path separator -- confirm this matches where beams are saved.
    with open(target_save_dir + params["split"] + '_' + params['id'] +
              str(params['beam_width']) + '.json') as f:
        data = json.load(f)
    ref_to_beams = {item['ref_id']: item['beam'] for item in data}

    # add ref_id to each beam
    for ref_id, beams in ref_to_beams.items():
        for beam in beams:
            beam['ref_id'] = ref_id  # make up ref_id in beam

    loader = DataLoader(params)
    featsOpt = {
        'ann': osp.join(target_save_dir, params['ann_feats_h5']),
        'img': osp.join(target_save_dir, params['image_feats_h5'])
    }
    loader.loadFeats(featsOpt)

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    ve = VisualEncoder().to_gpu(gpu_id)
    if 'attention' in params['id']:
        le = LanguageEncoderAttn(len(loader.ix_to_word)).to_gpu(gpu_id)
    else:
        le = LanguageEncoder(len(loader.ix_to_word)).to_gpu(gpu_id)
    cca = CcaEmbedding().to_gpu(gpu_id)

    model_prefix = (params['model_root'] + params['dataset'] + '_' +
                    params['splitBy'] + '/' + params['id'])
    serializers.load_hdf5(model_prefix + "ve.h5", ve)
    serializers.load_hdf5(model_prefix + "le.h5", le)
    serializers.load_hdf5(model_prefix + "cca.h5", cca)

    img_to_ref_ids, img_to_ref_confusion = calc_confusion(
        loader, data, ref_to_beams, ve, le, cca, params, xp)

    sys.path.insert(0, osp.join('pyutils', 'refer2'))
    sys.path.insert(0, osp.join('pyutils', 'refer2', 'evaluation'))
    from refer import REFER
    from refEvaluation import RefEvaluation
    from crossEvaluation import CrossEvaluation
    refer = REFER(params['data_root'],
                  params['dataset'],
                  params['splitBy'],
                  old_version=params['old'])

    # Per-dataset (lambda1, lambda2) weights passed to compute_unary.
    rerank_lambdas = {
        'refcoco': (5, 5),
        'refcoco+': (5, 5),
        'refcocog': (5, 5),
    }
    if params['dataset'] not in rerank_lambdas:
        # The original called an undefined `error(...)`, i.e. a NameError.
        raise ValueError(
            'No such dataset option for {}'.format(params['dataset']))
    lambda1, lambda2 = rerank_lambdas[params['dataset']]

    # compute unary potential, img_to_ref_unary
    Res = []
    for image_id in img_to_ref_confusion:
        # ref_ids and confusion matrices for this image
        img_ref_ids = img_to_ref_ids[image_id]
        ref_to_confusion = img_to_ref_confusion[image_id]
        # compute unary potential for each ref_id
        for ref_id in img_ref_ids:
            confusion = ref_to_confusion[ref_id]  # (beam_size, #img_ref_ids)
            beams = ref_to_beams[ref_id]  # [{ppl, sent, logp}] of beam_size
            compute_unary(ref_id, beams, confusion, img_ref_ids, lambda1,
                          lambda2)
        # here's more preparation
        ref_beam_to_ix, ix_to_ref_beam, all_beams = make_index(
            img_ref_ids, ref_to_beams)
        # compute pairwise potentials
        pairwise_ref_beam_ids = compute_pairwise(img_ref_ids, ref_to_beams)
        # call cplex
        res = bilp(img_ref_ids, ref_to_beams, all_beams,
                   pairwise_ref_beam_ids, ref_beam_to_ix, loader)
        Res += res

    # evaluate
    refEval = RefEvaluation(refer, Res)
    refEval.evaluate()
    overall = {}
    for metric, score in refEval.eval.items():
        overall[metric] = score
    print(overall)

    if params['write_result'] > 0:
        file_name = (params['model_id'] + '_' + params['split'] +
                     '_beamrerank.json')
        result_path = osp.join('cache', 'lang',
                               params['dataset'] + '_' + params['splitBy'],
                               file_name)
        refToEval = refEval.refToEval
        for res in Res:
            ref_id, sent = res['ref_id'], res['sent']
            refToEval[ref_id]['sent'] = sent
        # Strip the '.json' suffix before appending '_out.json'.
        with open(result_path[:-5] + '_out.json', 'w') as outfile:
            json.dump({'overall': overall, 'refToEval': refToEval}, outfile)

    # CrossEvaluation takes as input [{ref_id, sent}]
    ceval = CrossEvaluation(refer, Res)
    ceval.cross_evaluate()
    ceval.make_ref_to_evals()
    ref_to_evals = ceval.ref_to_evals  # {ref_id: {ref_id: {method: score}}}

    # compute cross score
    xcider = ceval.Xscore('CIDEr')
def eval_all(params):
    """Evaluate comprehension accuracy of the listener / language model.

    For every sentence of every image in ``params['split']`` the sentence is
    scored against all candidate regions of that image; it counts as correct
    when the ground-truth region scores strictly higher than the best other
    region (as reported by ``compute_margin_loss``). The final accuracy
    (percent) is written to
    ``result/<dataset>_<splitBy>/<id><id2><mode><lamda>comp.txt``.

    Scoring depends on ``params['mode']``:
        0 -- language-model score (negated ``computeLosses``),
        1 -- sigmoid of ``ListenerReward.calc_score``,
        2 -- listener score + ``params['lamda']`` * language-model score.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``id``, ``id2``, ``gpu_id``, ``split``,
            ``mode``, ``lamda``, ``sp_ann_feats``, ``ann_feats``,
            ``image_feats``, ``ann_shapes`` (plus ``data_json``, ``data_h5``,
            ``ann_feats_input``, ``shapes`` when ``old`` is truthy).
            Modified in place when ``old`` is truthy.

    Raises:
        ValueError: If ``params['mode']`` is not 0, 1 or 2, or if
            ``params['dataset']`` is not a supported dataset.
    """
    if params['mode'] not in (0, 1, 2):
        # Previously an unknown mode crashed inside the loop with a NameError.
        raise ValueError('Unsupported mode: {}'.format(params['mode']))

    dataset_tag = params['dataset'] + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', dataset_tag)
    model_dir = osp.join(params['save_dir'], 'model', dataset_tag)
    result_dir = osp.join('result', dataset_tag)
    if not osp.isdir(result_dir):
        os.makedirs(result_dir)

    if params['old']:
        # NOTE(review): the *_h5 keys are derived from the non-_h5 keys and
        # featsOpt below reads un-prefixed keys -- confirm this is intended.
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats_h5'] = 'old' + params['image_feats']
        params['ann_feats_h5'] = 'old' + params['ann_feats']
        params['ann_feats_input'] = 'old' + params['ann_feats_input']
        params['shapes'] = 'old' + params['shapes']
        params['id'] = 'old' + params['id']

    if params['dataset'] in ['refcoco', 'refcoco+', 'refcocog']:
        global_shapes = (224, 224)
    elif params['dataset'] == 'refgta':
        global_shapes = (480, 288)
    else:
        # Previously this fell through and crashed later with a NameError.
        raise ValueError(
            'No such dataset option for {}'.format(params['dataset']))

    loader = DataLoader(params)
    featsOpt = {
        'sp_ann': osp.join(target_save_dir, params['sp_ann_feats']),
        'ann_input': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats']),
        'shapes': osp.join(target_save_dir, params['ann_shapes'])
    }
    loader.loadFeats(featsOpt)

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    ve = VisualEncoder(global_shapes=global_shapes).to_gpu(gpu_id)
    rl_crit = ListenerReward(len(loader.ix_to_word),
                             global_shapes=global_shapes).to_gpu(gpu_id)
    lm = LanguageModel(len(loader.ix_to_word), loader.seq_length,
                       global_shapes).to_gpu(gpu_id)

    serializers.load_hdf5(osp.join(model_dir, params['id'] + ".h5"), rl_crit)
    serializers.load_hdf5(
        osp.join(model_dir, params['id'] + params['id2'] + "ve.h5"), ve)
    serializers.load_hdf5(
        osp.join(model_dir, params['id'] + params['id2'] + "lm.h5"), lm)

    accuracy = 0
    loss_evals = 0
    while True:
        data = loader.getImageBatch(params['split'], params)
        img_ann_ids = data['img_ann_ids']
        sent_ids = data['sent_ids']
        gd_ixs = data['gd_ixs']
        feats = Variable(xp.array(data['feats'], dtype=xp.float32))
        sp_cxt_feats = Variable(
            xp.array(data['sp_cxt_feats'], dtype=xp.float32))
        sp_ann_feats = Variable(
            xp.array(data['sp_ann_feats'], dtype=xp.float32))
        local_shapes = data['local_shapes']
        seqz = data['seqz']
        lang_last_ind = calc_max_ind(seqz)

        for i, sent_id in enumerate(sent_ids):
            gd_ix = gd_ixs[i]
            # Repeat sentence i once per candidate region so it can be scored
            # against all of them in a single forward pass.
            sent_seqz = np.concatenate(
                [[seqz[i]] for _ in range(len(img_ann_ids))], axis=0)
            one_last_ind = np.array([lang_last_ind[i]] * len(img_ann_ids))
            sent_seqz = Variable(xp.array(sent_seqz, dtype=xp.int32))

            # Column slice of the features selected by ve.feat_ind
            # (the second feature group), moved to host memory.
            coord = cuda.to_cpu(
                feats[:, sum(ve.feat_ind[:1]):sum(ve.feat_ind[:2])].data)
            local_sp_coord, global_sp_coord = calc_coordinate_feature(
                coord, local_shapes, global_shapes=global_shapes)
            local_sp_coord = xp.array(local_sp_coord, dtype=xp.float32)
            global_sp_coord = xp.array(global_sp_coord, dtype=xp.float32)

            vis_feats = ve(feats, sp_cxt_feats, coord)
            sp_feats, sp_feats_emb = lm.calc_spatial_features(
                sp_cxt_feats, sp_ann_feats, local_sp_coord, global_sp_coord)
            logprobs = lm(vis_feats, sp_feats, sp_feats_emb, coord,
                          sent_seqz, one_last_ind).data
            lm_scores = -cuda.to_cpu(computeLosses(logprobs, one_last_ind))
            score = cuda.to_cpu(
                F.sigmoid(
                    rl_crit.calc_score(feats, sp_cxt_feats, coord, sent_seqz,
                                       one_last_ind)).data)[:, 0]

            if params['mode'] == 0:
                _, pos_sc, max_neg_sc = compute_margin_loss(
                    lm_scores, gd_ix, 0)
            elif params['mode'] == 1:
                _, pos_sc, max_neg_sc = compute_margin_loss(score, gd_ix, 0)
            else:  # mode == 2, validated above
                scores = score + params['lamda'] * lm_scores
                _, pos_sc, max_neg_sc = compute_margin_loss(
                    scores, gd_ix, 0)

            if pos_sc > max_neg_sc:
                accuracy += 1
            loss_evals += 1
            print('{}-th: evaluating [{}] ... image[{}/{}] sent[{}], acc={}'.
                  format(loss_evals, params['split'],
                         data['bounds']['it_pos_now'],
                         data['bounds']['it_max'], i,
                         accuracy * 100.0 / loss_evals))

        if data['bounds']['wrapped']:
            print('validation finished!')
            result_path = osp.join(
                result_dir,
                params['id'] + params['id2'] + str(params['mode']) +
                str(params['lamda']) + 'comp.txt')
            # The context manager closes the file even if the write fails.
            with open(result_path, 'w') as f:
                f.write(str(accuracy * 100.0 / loss_evals))
            break
def main(params):
    """Rerank beam-search sentences per image using the listener model.

    Loads previously saved beams from a JSON file, computes confusion scores
    with a trained ``ListenerReward`` (``calc_confusion``), builds unary and
    pairwise potentials, selects one sentence per reference with ``bilp`` and
    evaluates the selection with ``RefEvaluation`` and ``CrossEvaluation``.
    When ``params['write_result'] > 0`` the evaluation is also written to
    ``<id><id2>_out.json`` in the current directory.

    Args:
        params: Option dict. Keys read here: ``save_dir``, ``dataset``,
            ``splitBy``, ``old``, ``split``, ``id``, ``id2``, ``beam_width``,
            ``coco_image_root`` or ``gta_image_root``, ``sp_ann_feats``,
            ``ann_feats``, ``image_feats``, ``ann_shapes``, ``gpu_id``,
            ``data_root``, ``write_result`` (plus ``data_json`` / ``data_h5``
            when ``old`` is truthy). Modified in place when ``old`` is truthy.

    Raises:
        ValueError: If ``params['dataset']`` is not a supported dataset.
    """
    dataset_tag = params['dataset'] + '_' + params['splitBy']
    target_save_dir = osp.join(params['save_dir'], 'prepro', dataset_tag)
    model_dir = osp.join(params['save_dir'], 'model', dataset_tag)

    if params['old']:
        params['data_json'] = 'old' + params['data_json']
        params['data_h5'] = 'old' + params['data_h5']
        params['image_feats'] = 'old' + params['image_feats']
        params['ann_feats'] = 'old' + params['ann_feats']
        params['id'] = 'old' + params['id']

    if params['dataset'] in ['refcoco', 'refcoco+', 'refcocog']:
        global_shapes = (224, 224)
        image_root = params['coco_image_root']
    elif params['dataset'] == 'refgta':
        global_shapes = (480, 288)
        image_root = params['gta_image_root']
    else:
        # Fail before any file or model is loaded. The original only checked
        # later, via an undefined `error(...)` call (a NameError).
        raise ValueError(
            'No such dataset option for {}'.format(params['dataset']))

    # NOTE(review): the file name is appended directly to target_save_dir
    # without a path separator -- confirm this matches where beams are saved.
    with open(target_save_dir + params["split"] + '_' + params['id'] +
              params['id2'] + str(params['beam_width']) + '.json') as f:
        data = json.load(f)
    ref_to_beams = {item['ref_id']: item['beam'] for item in data}

    # add ref_id to each beam
    for ref_id, beams in ref_to_beams.items():
        for beam in beams:
            beam['ref_id'] = ref_id  # make up ref_id in beam

    loader = DataLoader(params)
    featsOpt = {
        'sp_ann': osp.join(target_save_dir, params['sp_ann_feats']),
        'ann_input': osp.join(target_save_dir, params['ann_feats']),
        'img': osp.join(target_save_dir, params['image_feats']),
        'shapes': osp.join(target_save_dir, params['ann_shapes'])
    }
    loader.loadFeats(featsOpt)
    loader.shuffle('train')
    # NOTE(review): features are loaded a second time after the shuffle;
    # kept as in the original -- confirm whether this is required.
    loader.loadFeats(featsOpt)

    # Inference only: these are process-wide Chainer settings.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    gpu_id = params['gpu_id']
    cuda.get_device(gpu_id).use()
    xp = cuda.cupy

    rl_crit = ListenerReward(len(loader.ix_to_word),
                             global_shapes=global_shapes).to_gpu(gpu_id)
    serializers.load_hdf5(osp.join(model_dir, params['id'] + ".h5"), rl_crit)

    img_to_ref_ids, img_to_ref_confusion = calc_confusion(
        loader, data, ref_to_beams, rl_crit, params, xp)

    sys.path.insert(0, osp.join('pyutils', 'refer2'))
    sys.path.insert(0, osp.join('pyutils', 'refer2', 'evaluation'))
    from refer import REFER
    from refEvaluation import RefEvaluation
    from crossEvaluation import CrossEvaluation
    refer = REFER(params['data_root'],
                  image_root,
                  params['dataset'],
                  params['splitBy'],
                  old_version=params['old'])

    # Per-dataset (lambda1, lambda2) weights passed to compute_unary.
    rerank_lambdas = {
        'refcoco': (5, 5),
        'refcoco+': (5, 5),
        'refcocog': (5, 5),
        'refgta': (5, 5),
    }
    lambda1, lambda2 = rerank_lambdas[params['dataset']]

    # compute unary potential, img_to_ref_unary
    Res = []
    for image_id in img_to_ref_confusion:
        # ref_ids and confusion matrices for this image
        img_ref_ids = img_to_ref_ids[image_id]
        ref_to_confusion = img_to_ref_confusion[image_id]
        # compute unary potential for each ref_id
        for ref_id in img_ref_ids:
            confusion = ref_to_confusion[ref_id]  # (beam_size, #img_ref_ids)
            beams = ref_to_beams[ref_id]  # [{ppl, sent, logp}] of beam_size
            compute_unary(ref_id, beams, confusion, img_ref_ids, lambda1,
                          lambda2)
        # here's more preparation
        ref_beam_to_ix, ix_to_ref_beam, all_beams = make_index(
            img_ref_ids, ref_to_beams)
        # compute pairwise potentials
        pairwise_ref_beam_ids = compute_pairwise(img_ref_ids, ref_to_beams)
        # call cplex
        res = bilp(img_ref_ids, ref_to_beams, all_beams,
                   pairwise_ref_beam_ids, ref_beam_to_ix, loader)
        Res += res

    # evaluate
    eval_cider_r = params['dataset'] == 'refgta'
    refEval = RefEvaluation(refer, Res, eval_cider_r=eval_cider_r)
    refEval.evaluate()
    overall = {}
    for metric, score in refEval.eval.items():
        overall[metric] = score
    print(overall)

    if params['write_result'] > 0:
        refToEval = refEval.refToEval
        for res in Res:
            ref_id, sent = res['ref_id'], res['sent']
            refToEval[ref_id]['sent'] = sent
        with open(params['id'] + params['id2'] + '_out.json',
                  'w') as outfile:
            json.dump({'overall': overall, 'refToEval': refToEval}, outfile)

    # CrossEvaluation takes as input [{ref_id, sent}]
    ceval = CrossEvaluation(refer, Res)
    ceval.cross_evaluate()
    ceval.make_ref_to_evals()
    ref_to_evals = ceval.ref_to_evals  # {ref_id: {ref_id: {method: score}}}

    # compute cross score
    xcider = ceval.Xscore('CIDEr')