def generate(**kwargs):
    """Caption a single image with a ResNet-50 encoder and a trained CaptionModel.

    Keyword arguments are copied onto the module-level ``opt`` config before
    anything is loaded. The generated captions are printed, one per line.
    """
    # Fixed: ``t.cuda.is_available`` was referenced without being called, so
    # the condition was always truthy and 'cuda' was chosen on CPU-only hosts.
    # NOTE(review): `device` is not applied below; the device transfers look
    # like they were commented out in the original.
    device = t.device('cuda') if t.cuda.is_available() else t.device('cpu')
    for k, v in kwargs.items():
        setattr(opt, k, v)

    data = t.load(opt.caption_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    transforms = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize([0.485, 0.456, 0.406],
                                [0.229, 0.224, 0.225]),
    ])
    # NOTE(review): the image-loading expression was missing from the source;
    # `Image.open(opt.test_img)` is an assumption -- confirm the attribute name.
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Use ResNet-50 as a feature extractor: the classifier head is replaced
    # by an identity function so the pooled features are returned.
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    img_feats = resnet50(img).detach()

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model.load_state_dict(t.load(opt.model_path, map_location='cpu'))

    # NOTE(review): the indexed tensor was missing from the source;
    # `img_feats.data[0]` (features of the single image) is assumed.
    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
def generate(**kwargs):
    """Caption the image at ``opt.test_img`` and print the captions.

    Keyword arguments override attributes of a fresh ``Config``. The image is
    encoded with a pretrained ResNet-50 and decoded by a ``CaptionModel``
    restored from ``opt.model_ckpt``.
    """
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize,
    ])
    # NOTE(review): the image-loading expression was missing from the source;
    # `Image.open(opt.test_img)` is assumed -- confirm.
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with ResNet-50 (classifier head -> identity).
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    # NOTE(review): both device transfers were missing from the source and
    # are restored here as `.to(device)` -- confirm.
    resnet50.to(device)
    img = img.to(device)
    img_feats = resnet50(img).detach()

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.to(device)

    # NOTE(review): indexed tensor restored as `img_feats.data` -- confirm.
    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
def train(**kwargs):
    """Train a CaptionModel with cross-entropy loss.

    Keyword arguments are copied onto the module-level ``opt`` config. A
    checkpoint is written every ``opt.save_model`` epochs.
    """
    device = t.device('cuda') if t.cuda.is_available() else t.device('cpu')
    for k, v in kwargs.items():
        setattr(opt, k, v)

    dataloader = get_dataloader(opt)
    model = CaptionModel(opt, dataloader.dataset.word2ix,
                         dataloader.dataset.id2ix)
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path, map_location='cpu'))
    t.backends.cudnn.enabled = False
    # NOTE(review): the right-hand side was missing from the source;
    # `model.to(device)` is assumed.
    model = model.to(device)

    # NOTE(review): the optimizer's second argument was missing from the
    # source; `lr=opt.lr` is assumed -- confirm.
    optimizer = Adam(model.parameters(), lr=opt.lr)
    criterion = t.nn.CrossEntropyLoss()

    for epoch in range(opt.max_epoch):
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(
                enumerate(dataloader)):
            imgs = Variable(imgs).to(device)
            captions = Variable(captions).to(device)
            pred, _ = model(imgs, captions, lengths)
            # Flatten the padded captions so they align with the packed
            # predictions returned by the model.
            target_captions = pack_padded_sequence(captions, lengths)[0]
            loss = criterion(pred, target_captions)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("Current Loss: ", loss.item())

        if (epoch + 1) % opt.save_model == 0:
            # NOTE(review): the start of the save call was missing from the
            # source; saving the model's state_dict is assumed.
            t.save(model.state_dict(),
                   "checkpoints/{}.pth".format(epoch))
def generate(**kwargs):
    """Caption the image at ``opt.test_img`` and print the captions.

    Keyword arguments override attributes of a fresh ``Config``. The image is
    encoded with a pretrained ResNet-50 and decoded by a ``CaptionModel``
    restored from ``opt.model_ckpt``.
    """
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize,
    ])
    # NOTE(review): the image-loading expression was missing from the source;
    # `Image.open(opt.test_img)` is assumed -- confirm.
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with ResNet-50 (classifier head -> identity).
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    # NOTE(review): both device transfers were missing from the source and
    # are restored here as `.to(device)` -- confirm.
    resnet50.to(device)
    img = img.to(device)
    img_feats = resnet50(img).detach()

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.to(device)

    # NOTE(review): indexed tensor restored as `img_feats.data` -- confirm.
    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
def generate(**kwargs):
    """Caption every pre-extracted test feature and write ``submit.json``.

    Keyword arguments override attributes of a fresh ``Config``. Features are
    read from ``test_results2.pth`` and image ids from ``test_imgs.pth``; the
    first generated sentence per image is stored, with the ``</EOS>`` marker
    removed.
    """
    import json

    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    test_datas = t.load('test_results2.pth')
    imgs = t.load('test_imgs.pth')

    # Caption model
    model = CaptionModel(opt, None, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.cuda()

    results = []
    for ii, (img_feat, img_id) in tqdm.tqdm(
            enumerate(zip(test_datas, imgs))):
        sentences = model.generate(img_feat)
        results.append({
            'image_id': img_id.replace('.jpg', ''),
            'caption': sentences[0].replace('</EOS>', ''),
        })
        if ii % 1000 == 0:
            # Fixed: the Python 2 `print` statement is a syntax error on
            # Python 3; the call form prints the same text on both.
            print(sentences[0])

    with open('submit.json', 'w') as f:
        json.dump(results, f)
def generate(**kwargs):
    """Caption the image at ``opt.test_img`` and print the captions.

    Keyword arguments override attributes of a fresh ``Config``. The image is
    encoded with a pretrained ResNet-50 and decoded by a ``CaptionModel``
    restored from ``opt.model_ckpt``.
    """
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]
    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        # `Scale` was the old name of `Resize`; the other variants of this
        # function in the file already use `Resize`.
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize,
    ])
    # NOTE(review): the image-loading expression was missing from the source;
    # `Image.open(opt.test_img)` is assumed -- confirm.
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with ResNet-50 (classifier head -> identity).
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    if opt.use_gpu:
        resnet50.cuda()
        img = img.cuda()
    # `volatile=True` is no longer honoured by PyTorch; `no_grad()` is the
    # supported way to disable autograd for inference.
    with t.no_grad():
        img_feats = resnet50(Variable(img))

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
        model.cuda()

    # NOTE(review): indexed tensor restored as `img_feats.data` -- confirm.
    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
def generate(**kwargs):
    """Caption the image at ``opt.test_img`` and print the captions.

    Keyword arguments override attributes of a fresh ``Config``. The image is
    encoded with a pretrained ResNet-50 (under ``no_grad``) and decoded by a
    ``CaptionModel`` restored from ``opt.model_ckpt``.
    """
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]
    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize,
    ])
    # NOTE(review): the image-loading expression was missing from the source;
    # `Image.open(opt.test_img)` is assumed -- confirm.
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with ResNet-50 (classifier head -> identity).
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    if opt.use_gpu:
        resnet50.cuda()
        img = img.cuda()
    with t.no_grad():
        img_feats = resnet50(Variable(img))

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
        model.cuda()

    # NOTE(review): indexed tensor restored as `img_feats.data` -- confirm.
    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
def run_inference(config, curr_ckpt_path):
    """
    Main inference function. Builds and executes the model.

    Restores the checkpoint at `curr_ckpt_path`, captions every inference
    batch, then writes a COCO-style JSON file (and, when
    `c.save_attention_maps` is set, a pickle of the raw outputs) into
    `c.infer_save_path`. Throughput is appended to `infer_speed.txt`.
    """
    ckpt_dir, ckpt_file = os.path.split(curr_ckpt_path)
    ckpt_num = P_CKPT.findall(ckpt_file)[0]  # Checkpoint number

    # Setup input pipeline & Build model
    print('TensorFlow version: r{}'.format(tf.__version__))
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(config.rand_seed)
        inputs_man = inputs.InputManager(config, is_inference=True)
        c = inputs_man.config
        batch_size = c.batch_size_infer

        with tf.name_scope('infer'):
            m_infer = CaptionModel(c,
                                   mode='infer',
                                   batch_ops=inputs_man.batch_infer,
                                   reuse=False,
                                   name='inference')
        init_fn = tf.local_variables_initializer()
        saver = tf.train.Saver()

    filenames = inputs_man.filenames_infer
    r = config.per_process_gpu_memory_fraction
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=r)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options),
                      graph=g)
    num_batches = int(c.split_sizes['infer'] / batch_size)

    raw_outputs = dict(captions={},
                       attention={},
                       image_ids={},
                       beam_size=c.infer_beam_size,
                       max_caption_length=c.infer_max_length,
                       checkpoint_path=curr_ckpt_path,
                       checkpoint_number=ckpt_num)
    coco_json = []
    with sess:
        # Restore model from checkpoint
        saver.restore(sess, curr_ckpt_path)
        g.finalize()

        print("INFO: Graph constructed. Starting inference.")
        start_time = time.time()

        desc = 'Inference: checkpoint {}'.format(ckpt_num)
        for step in tqdm(range(num_batches), desc=desc, ncols=100):
            # NOTE(review): the session call was missing from the source;
            # the fetched op `m_infer.infer_output` is an assumption.
            word_ids, attn_maps = sess.run(m_infer.infer_output)
            captions = id_to_caption(word_ids, c)

            # Get image ids, compile results
            batch_start = step * batch_size
            batch_end = (step + 1) * batch_size
            batch_filenames = filenames[batch_start:batch_end]

            for i, f in enumerate(batch_filenames):
                stem = f.replace('.jpg', '')
                matches = P_COCO.findall(stem)
                # Fixed: the fallback used to call int() on the empty match
                # list (TypeError); it now parses the file stem itself.
                image_id = int(matches[0]) if matches else int(stem)
                raw_outputs['captions'][f] = captions[i]
                raw_outputs['attention'][f] = attn_maps[i]
                raw_outputs['image_ids'][f] = image_id
                coco_json.append(
                    dict(image_id=image_id, caption=unicode(captions[i])))

        print("\nExample captions:\n{}\n".format("\n".join(captions[:3])))
        t = time.time() - start_time
        sess.close()

    # Ensure correctness
    assert len(filenames) == len(list(set(filenames)))
    assert len(filenames) == len(coco_json)
    assert len(filenames) == len(raw_outputs['image_ids'].keys())

    # Dump output files
    raw_output_fname = 'outputs___{}.pkl'.format(ckpt_num)
    coco_json_fname = 'captions___{}.json'.format(ckpt_num)

    # Captions with attention maps
    if c.save_attention_maps:
        with open(pjoin(c.infer_save_path, raw_output_fname), 'wb') as f:
            pickle.dump(raw_outputs, f, pickle.HIGHEST_PROTOCOL)

    # Captions with image ids
    with open(pjoin(c.infer_save_path, coco_json_fname), 'w') as f:
        json.dump(coco_json, f)

    # Write the settings header only once, the first time the file is made.
    speed_file = pjoin(c.infer_save_path, 'infer_speed.txt')
    if not os.path.isfile(speed_file):
        out = [
            'Using GPU #: {}'.format(c.gpu),
            'Inference batch size: {}'.format(c.batch_size_infer),
            'Inference beam size: {}'.format(c.infer_beam_size),
            ''
        ]
        with open(speed_file, 'a') as f:
            f.write('\r\n'.join(out))
    with open(speed_file, 'a') as f:
        f.write('\r\n{}'.format(len(filenames) / t))

    print("\nINFO: Inference completed. Time taken: {:4.2f} mins\n".format(
        t / 60))
def main(hf,
         f_type,
         capl=16,
         d_w2v=512,
         output_dim=512,
         feature_shape=None,
         lr=0.01,
         batch_size=64,
         total_epoch=100,
         file=None,
         pretrained_model=None):
    '''
    Train and evaluate the video caption model, checkpointing every epoch.

    capl: the length of caption
    feature_shape: per-video feature shape as a tuple; it is concatenated
        onto ``(None,)`` below, so it must not be left as None.
    '''
    # Create vocabulary
    v2i, train_data, val_data, test_data = \
        MsrDataUtil.create_vocabulary_word2vec(
            file, capl=capl, v2i={'': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3})
    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    input_video = tf.placeholder(tf.float32,
                                 shape=(None, ) + feature_shape,
                                 name='input_video')
    input_captions = tf.placeholder(tf.int32,
                                    shape=(None, capl),
                                    name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl, len(v2i)))

    captionModel = CaptionModel.CaptionModel(input_video, input_captions,
                                             voc_size, d_w2v, output_dim)
    predict_score, predict_words = captionModel.build_model()

    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y,
                                                   logits=predict_score)
    loss = tf.reduce_mean(loss) + sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    optimizer = tf.train.RMSPropOptimizer(lr,
                                          decay=0.9,
                                          momentum=0.0,
                                          epsilon=1e-8)
    train = optimizer.minimize(loss)

    # Configure the runtime environment
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    config.log_device_placement = False
    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()
    # NOTE(review): `init` was built but never run in the source as received;
    # the `sess.run(init)` call appears to have been lost -- confirm.
    sess.run(init)

    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)

        for epoch in xrange(total_epoch):
            print('Epoch: %d/%d, Batch_size: %d' %
                  (epoch + 1, total_epoch, batch_size))

            # Train phase
            tic = time.time()
            total_loss = exe_train(sess, train_data, batch_size, v2i, hf,
                                   feature_shape, train, loss, input_video,
                                   input_captions, y, capl=capl)
            print('    --Train--, Loss: %.5f, .......Time:%.3f' %
                  (total_loss, time.time() - tic))

            # Test phase
            tic = time.time()
            js = exe_test(sess, test_data, batch_size, v2i, i2v, hf,
                          feature_shape, predict_words, input_video,
                          input_captions, y, capl=capl)
            print('    --Val--, .......Time:%.3f' % (time.time() - tic))

            # Save model
            export_path = ('/home/xyj/usr/local/saved_model/msrvtt2017/s2s'
                           + '_' + f_type + '/' + 'lr' + str(lr)
                           + '_f' + str(feature_shape[0]))
            if not os.path.exists(export_path + '/model'):
                os.makedirs(export_path + '/model')
                print('mkdir %s' % export_path + '/model')
            if not os.path.exists(export_path + '/res'):
                os.makedirs(export_path + '/res')
                print('mkdir %s' % export_path + '/res')

            # Evaluate
            res_path = (export_path + '/res/' + f_type + '_E'
                        + str(epoch + 1) + '.json')
            evaluate_mode_by_shell(res_path, js)

            # NOTE(review): the callee was missing from the source;
            # `saver.save(` is restored from the argument list.
            save_path = saver.save(
                sess, export_path + '/model/' + 'E' + str(epoch + 1) + '_L'
                + str(total_loss) + '.ckpt')
            print("Model saved in file: %s" % save_path)
'num_chunks': opt.num_chunks, 'mode': 'test' } train_loader = DataLoader(train_opt) val_loader = DataLoader(val_opt) test_loader = DataLoader(test_opt) opt.vocab = train_loader.get_vocab() opt.vocab_size = train_loader.get_vocab_size() opt.seq_length = train_loader.get_seq_length() opt.feat_dims = train_loader.get_feat_dims() opt.history_file = opt.model_file.replace('.pth', '_history.json', 1)'Building model...') model = CaptionModel(opt) xe_criterion = CrossEntropyCriterion() rl_criterion = RewardCriterion() if torch.cuda.is_available(): model.cuda() xe_criterion.cuda() rl_criterion.cuda()'Start training...') start = optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate) infos = train(model, xe_criterion,
test_loader = DataLoader(test_opt)'Loading model: %s', opt.model_file) checkpoint = torch.load(opt.model_file) checkpoint_opt = checkpoint['opt'] opt.model_type = checkpoint_opt.model_type opt.vocab = checkpoint_opt.vocab opt.vocab_size = checkpoint_opt.vocab_size opt.seq_length = checkpoint_opt.seq_length opt.feat_dims = checkpoint_opt.feat_dims assert opt.vocab_size == test_loader.get_vocab_size() assert opt.seq_length == test_loader.get_seq_length() assert opt.feat_dims == test_loader.get_feat_dims()'Building model...') model = CaptionModel(opt)'Loading state from the checkpoint...') model.load_state_dict(checkpoint['model']) xe_criterion = CrossEntropyCriterion() model.cuda() xe_criterion.cuda()'Start testing...') test(model, xe_criterion, test_loader, opt)'Time: %s', - start) test_loader.close()
def train(**kwargs):
    """Train the CaptionModel, plotting progress to Visdom.

    Keyword arguments override attributes of a fresh ``Config``. Every
    ``opt.plot_every`` batches the running loss, the first image of the
    batch with its reference caption, and the model's generated captions
    are sent to the visualizer.
    """
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    opt.caption_data_path = 'caption.pth'  # raw caption data
    opt.test_img = ''  # input image
    # opt.model_ckpt='caption_0914_1947' # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    # NOTE(review): the optimizer argument was missing from the source;
    # `opt.lr` is assumed.
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    # NOTE(review): the batches are moved to `device` below; the model is
    # assumed to have been moved there too -- confirm.
    model.to(device)

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(
                enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            # NOTE(review): both right-hand sides were missing from the
            # source; `.to(device)` is assumed.
            imgs = imgs.to(device)
            captions = captions.to(device)
            # The last time step is dropped from the decoder input.
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.item())

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])

                # Show the raw image together with its reference caption.
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                # NOTE(review): the opening of this call was missing from
                # the source; `Image.open(img_path).convert(` is assumed.
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                # NOTE(review): indexed tensor restored as `captions.data`.
                raw_caption = captions.data[:, 0]
                # int() turns each tensor element into a plain key for the
                # vocabulary lookup.
                raw_caption = ''.join(
                    [_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # Show the captions generated by the network.
                # NOTE(review): indexed tensor restored as `imgs.data`.
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
        # NOTE(review): no checkpointing call is visible in the source as
        # received; a per-epoch `model.save()` is assumed -- confirm.
        model.save()
'Process the train data with and', 'first to get "model_info.json", "train_descriptions.pkl" and "train.pkl".' ) embedding_matrix = get_embeddings(root_path, model_info['vocab_size'], embedding_dim, model_info['wordtoidx']) train_dataset = SampleDataset(train_descriptions, train_img_features, model_info['wordtoidx'], model_info['max_length']) train_loader = DataLoader(train_dataset, batch_size, collate_fn=my_collate) caption_model = CaptionModel(model_info['vocab_size'], embedding_dim, hidden_size=hidden_size, embedding_matrix=embedding_matrix, embedding_train=True) init_weights(caption_model, embedding_pretrained=True) # we will ignore the pad token in true target set criterion = nn.CrossEntropyLoss(ignore_index=0) optimizer = torch.optim.Adam(caption_model.parameters(), lr=0.01) clip = 1 start = time() print(f'Training...')
batch_size=batch_size, max_length=max_length, shuffle=True, num_workers=workers) print("trainloader ok") valDataLoader = get_iterator(valDset, vocab, batch_size=eval_batch_size, max_length=max_length, shuffle=False, num_workers=workers) model = CaptionModel(cnn, vocab, embedding_size=embedding_size, rnn_size=rnn_size, num_layers=num_layers, share_embedding_weights=share_weights) if 'cuda' in type: cudnn.benchmark = True model.cuda() optimizer = select_optimizer(optimizer, params=model.parameters(), lr=learning_rate) regime = lambda e: { 'lr': learning_rate * (lr_decay**e), 'momentum': momentum, 'weight_decay': weight_decay }
def main(hf, f_type, capl=16, d_w2v=512, output_dim=512,
         feature_shape=None, unsup_training_feature_shape=None,
         lr=0.01, batch_size=64, total_epoch=100, unsup_epoch=None,
         file=None, pretrained_model=None):
    '''
    Run unsupervised pre-training followed by supervised caption training.

    capl: the length of caption
    feature_shape: per-video feature shape as a tuple; it is concatenated
        onto ``(None,)`` below, so it must not be left as None.
    unsup_epoch: number of unsupervised epochs; passed straight to
        ``xrange`` so it must be an integer.
    '''
    # Create vocabulary
    v2i, train_data, val_data, test_data = \
        MsrDataUtil.create_vocabulary_word2vec(
            file, capl=capl, v2i={'': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3})
    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    input_video = tf.placeholder(tf.float32,
                                 shape=(None,) + feature_shape,
                                 name='input_video')
    input_captions = tf.placeholder(tf.int32, shape=(None, capl),
                                    name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl))

    unsup_input_video = tf.placeholder(tf.float32,
                                       shape=(None,) + (40, 2048),
                                       name='unsup_input_video')
    unsup_decoder_feature = tf.placeholder(tf.float32,
                                           shape=(None,) + (40, 2048),
                                           name='unsup_decoder_feature')
    true_video = tf.placeholder(tf.float32, shape=(None,) + (40, 2048),
                                name='true_video')

    attentionCaptionModel = CaptionModel.UnsupTrainingAttentionCaptionModel(
        input_video, input_captions, unsup_input_video,
        unsup_decoder_feature, voc_size, d_w2v, output_dim,
        T_k=[1, 2, 4, 8])
    predict_score, predict_words, predict_vector = \
        attentionCaptionModel.build_model()

    huber_Loss = Losses.Huber_Loss(predict_vector, true_video)
    # NOTE(review): the right-hand side was missing from the source;
    # `huber_Loss.build()` is an assumption about the Losses API -- confirm.
    unsup_training_loss = huber_Loss.build()
    print('unsup_training_loss.get_shape().as_list()',
          unsup_training_loss.get_shape().as_list())
    unsup_training_loss = tf.reduce_mean(
        tf.reduce_sum(unsup_training_loss, axis=[1, 2])
        + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9,
                                       beta2=0.999, epsilon=1e-08,
                                       use_locking=False, name='Adam')

    # Each gradient is clipped on its own to a norm of 10.
    gvs = optimizer.compute_gradients(unsup_training_loss)
    capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var)
                  for grad, var in gvs]
    unsup_training = optimizer.apply_gradients(capped_gvs)

    caption_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=predict_score) + sum(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    caption_loss = tf.reduce_mean(caption_loss)

    caption_gvs = optimizer.compute_gradients(caption_loss)
    caption_capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var)
                          for grad, var in caption_gvs]
    caption_training = optimizer.apply_gradients(caption_capped_gvs)

    # Configure the runtime environment
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    config.log_device_placement = False
    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()
    # NOTE(review): `init` was built but never run in the source as received;
    # the `sess.run(init)` call appears to have been lost -- confirm.
    sess.run(init)

    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)

        export_path = ('/home/xyj/usr/local/saved_model/msrvtt2017/'
                       + f_type + '/' + 'lr' + str(lr)
                       + '_f' + str(feature_shape[0])
                       + '_B' + str(batch_size))

        # Unsupervised training
        for epoch in xrange(unsup_epoch):
            print('Unsupervised Epoch: %d/%d, Batch_size: %d'
                  % (epoch + 1, unsup_epoch, batch_size))

            # Train phase
            tic = time.time()
            total_loss = exe_unsup_train(
                sess, train_data, batch_size, v2i, hf,
                unsup_training_feature_shape, unsup_training,
                unsup_training_loss, unsup_input_video,
                unsup_decoder_feature, true_video, capl=capl)
            print('    --Unsupervised Training--, Loss: %.5f, .......Time:%.3f'
                  % (total_loss, time.time() - tic))

            # Test phase
            tic = time.time()
            total_loss = exe_unsup_test(
                sess, test_data, batch_size, v2i, hf,
                unsup_training_feature_shape, unsup_training_loss,
                unsup_input_video, unsup_decoder_feature, true_video,
                capl=capl)
            print('    --Unsupervised Testing--, Loss: %.5f, .......Time:%.3f'
                  % (total_loss, time.time() - tic))

            if not os.path.exists(export_path + '/unsupervised'):
                os.makedirs(export_path + '/unsupervised')
                print('mkdir %s' % export_path + '/unsupervised')
            # NOTE(review): the callee and first argument were missing from
            # the source; `saver.save(sess, ...)` is assumed.
            save_path = saver.save(
                sess, export_path + '/unsupervised/' + 'E' + str(epoch + 1)
                + '_L' + str(total_loss) + '.ckpt')

        # Supervised caption training
        for epoch in xrange(total_epoch):
            # Train phase
            print('Epoch: %d/%d, Batch_size: %d'
                  % (epoch + 1, total_epoch, batch_size))
            tic = time.time()
            total_loss = exe_train(
                sess, train_data, batch_size, v2i, hf, feature_shape,
                caption_training, caption_loss, input_video, input_captions,
                y, capl=capl)
            print('    --Train--, Loss: %.5f, .......Time:%.3f'
                  % (total_loss, time.time() - tic))

            # Test phase
            tic = time.time()
            js = exe_test(sess, test_data, batch_size, v2i, i2v, hf,
                          feature_shape, predict_words, input_video,
                          input_captions, y, capl=capl)
            print('    --Val--, .......Time:%.3f' % (time.time() - tic))

            # Save model
            if not os.path.exists(export_path + '/model'):
                os.makedirs(export_path + '/model')
                print('mkdir %s' % export_path + '/model')
            if not os.path.exists(export_path + '/res'):
                os.makedirs(export_path + '/res')
                print('mkdir %s' % export_path + '/res')

            # Evaluate
            res_path = (export_path + '/res/' + f_type + '_E'
                        + str(epoch + 1) + '.json')
            evaluate_mode_by_shell(res_path, js)

            # NOTE(review): the callee and first argument were missing from
            # the source; `saver.save(sess, ...)` is assumed.
            save_path = saver.save(
                sess, export_path + '/model/' + 'E' + str(epoch + 1) + '_L'
                + str(total_loss) + '.ckpt')
            print("Model saved in file: %s" % save_path)
IMAGENET_STD = [0.229, 0.224, 0.225] normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN,std=IMAGENET_STD) transforms = tv.transforms.Compose([ tv.transforms.Scale(opt.scale_size), tv.transforms.CenterCrop(opt.img_size), tv.transforms.ToTensor(), normalize ]) img_ = img = transforms(img_).unsqueeze(0) img_.resize((int(img_.width*256/img_.height),256)) # 用resnet50来提取图片特征 # 如果本地没有预训练的模型文件,会自动下载 resnet50 = tv.models.resnet50(True).eval() del resnet50.fc resnet50.fc = lambda x:x if opt.use_gpu: resnet50.cuda() img = img.cuda() img_feats = resnet50(Variable(img,volatile=True)) # Caption模型 model = CaptionModel(opt,word2ix,ix2word) model = model.load(opt.model_ckpt).eval() if opt.use_gpu: model.cuda() results = model.generate([0]) print('\r\n'.join(results))
def main(unused_argv):
    """Entry point: validate flags, derive paths, then train or evaluate.

    Raises:
        Exception: on unexpected positional arguments, a missing
            ``--experiment_name``, a missing ``--ckpt_load_dir`` in eval
            mode, an unsupported ``--data_source``, or an unknown ``--mode``.
    """
    # Check the supplied arguments
    if len(unused_argv) != 1:
        raise Exception(
            "There is a problem with how you entered flags: %s" % unused_argv)
    if not FLAGS.experiment_name:
        raise Exception("You need to specify --experiment_name")
    if FLAGS.mode == "eval" and not FLAGS.ckpt_load_dir:
        raise Exception(
            "You need to specify a directory to load the checkpoint for eval")
    if FLAGS.data_source not in ("ssd", "ram"):
        raise Exception(
            "You need to specify how to load data. Choose from ram and ssd.")

    # All derived paths hang off the directory that contains main.py.
    main_dir = os.path.dirname(os.path.abspath(__file__))
    FLAGS.MAIN_DIR = main_dir
    FLAGS.DATA_DIR = os.path.join(main_dir, "data")
    FLAGS.EXPERIMENTS_DIR = os.path.join(main_dir, "experiments")
    FLAGS.train_dir = os.path.join(FLAGS.EXPERIMENTS_DIR,
                                   FLAGS.experiment_name)
    FLAGS.bestmodel_dir = os.path.join(FLAGS.train_dir, "best_checkpoint")
    # Prediction results written during training (used for evaluation).
    FLAGS.train_res_dir = os.path.join(FLAGS.train_dir, "myCaptions.json")
    FLAGS.glove_path = os.path.join(main_dir, "glove.6B.300d.trimmed.txt")
    FLAGS.goldAnn_train_dir = os.path.join(
        main_dir, "coco/annotations/captions_train2014.json")
    FLAGS.goldAnn_val_dir = os.path.join(
        main_dir, "coco/annotations/captions_val2014.json")

    # Load embedding matrix and vocab mappings
    random_init = (FLAGS.special_token == "train")
    emb_matrix, word2id, id2word = get_glove(FLAGS.glove_path, 300,
                                             random_init=random_init)

    # Initialize model
    caption_model = CaptionModel(FLAGS, id2word, word2id, emb_matrix)

    # Some GPU settings
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    if FLAGS.mode == "train":
        # Setup train dir and logfile
        if not os.path.exists(FLAGS.train_dir):
            os.makedirs(FLAGS.train_dir)
        log_handler = logging.FileHandler(
            os.path.join(FLAGS.train_dir, "log.txt"))
        logging.getLogger().addHandler(log_handler)

        # Make bestmodel dir if necessary
        if not os.path.exists(FLAGS.bestmodel_dir):
            os.makedirs(FLAGS.bestmodel_dir)

        with tf.Session(config=config) as sess:
            # Load most recent model
            initialize_model(sess, caption_model, FLAGS.train_dir,
                             expect_exists=False)
            caption_model.train(sess)

    # Sample evaluation command:
    #   python main.py --mode=eval --experiment_name=baseline \
    #       --ckpt_load_dir=./experiments/baseline/best_checkpoint
    elif FLAGS.mode == "eval":
        print("Starting official evaluation...")
        with tf.Session(config=config) as sess:
            initialize_model(sess, caption_model, FLAGS.ckpt_load_dir,
                             expect_exists=True)
            # Replace mode with 'test' to evaluate on the test set.
            scores = caption_model.check_metric(sess, mode='val',
                                                num_samples=0)
            for metric_name, metric_score in scores.items():
                print("{}: {}".format(metric_name, metric_score))

    else:
        raise Exception("Unexpected value of FLAGS.mode: %s" % FLAGS.mode)
def train(**kwargs):
    """Train the CaptionModel on GPU, plotting loss and perplexity to Visdom.

    Keyword arguments override attributes of a fresh ``Config``. Every
    ``opt.plot_every`` batches the running loss/perplexity, the first image
    of the batch, its reference caption and the generated captions are sent
    to the visualizer. The model is saved every 100 epochs.
    """
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    model = CaptionModel(opt, None, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)

    optimizer = model.get_optimizer(opt.lr1)
    criterion = t.nn.CrossEntropyLoss()

    model.cuda()
    criterion.cuda()

    loss_meter = meter.AverageValueMeter()
    perplexity = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        perplexity.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(
                enumerate(dataloader)):
            optimizer.zero_grad()
            imgs = Variable(imgs.cuda())
            captions = Variable(captions.cuda())
            # The last time step is dropped from the decoder input. (The
            # original also sliced once before the GPU transfer; that value
            # was overwritten here and has been removed.)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()

            # NOTE(review): both meter arguments were missing from the
            # source (only `[0]` remained, presumably from `loss.data[0]`);
            # `.item()` is used as elsewhere in this file -- confirm.
            loss_meter.add(loss.item())
            perplexity.add(t.exp(loss.data).item())

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])
                vis.plot('perplexity', perplexity.value()[0])

                # Show the raw image.
                raw_img = _data['train']['ix2id'][indexes[0]]
                img_path = '/data/image/ai_cha/caption/ai_challenger_caption_train_20170902/caption_train_images_20170902/' + raw_img
                # NOTE(review): the opening of this call was missing from
                # the source; `Image.open(img_path).convert(` is assumed.
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)
                vis.img('raw', raw_img)

                # Show the reference caption.
                # NOTE(review): indexed tensor restored as `captions.data`.
                raw_caption = captions.data[:, 0]
                # int() turns each tensor element into a plain key for the
                # vocabulary lookup.
                raw_caption = ''.join(
                    [_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')

                # Show the captions generated by the network.
                # NOTE(review): indexed tensor restored as `imgs.data`.
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')

        if (epoch + 1) % 100 == 0:
            # NOTE(review): the body of this branch was missing from the
            # source; `model.save()` is assumed -- confirm.
            model.save()
def train(**kwargs):
    """Train the CaptionModel with cross-entropy loss.

    NOTE(review): ``kwargs`` is accepted but not applied to the config in
    this variant, as in the original.
    """
    opt = Config()
    opt.caption_data_path = 'caption.pth'  # raw caption data
    opt.test_img = ''  # input image
    # opt.model_ckpt='caption_0914_1947' # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    # NOTE(review): the optimizer argument was missing from the source;
    # `opt.lr` is assumed.
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(
                enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            if opt.use_gpu:
                imgs = imgs.cuda()
                captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            # The last time step is dropped from the decoder input.
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            # NOTE(review): the meter argument was missing from the source
            # (only `[0]` remained, presumably from `loss.data[0]`);
            # `.item()` is used as elsewhere in this file -- confirm.
            loss_meter.add(loss.item())

            # The periodic Visdom visualization that followed here was
            # disabled in the original (wrapped in a bare string literal),
            # so it is omitted.
        # NOTE(review): no checkpointing call is visible in the source as
        # received; a per-epoch `model.save()` is assumed -- confirm.
        model.save()
transform=mytransform, train=True) flicker8k_val = FlickrDataLoader.Flicker8k(img_dir, cap_path, val_txt, transform=mytransform, train=True) with open('feat6k.npy', 'r') as f: feat_tr = np.load(f) with open('capt6k.pkl', 'r') as f: caption_trn = pickle.load(f) with open('feat.pkl', 'r') as f: feat_val = pickle.load(f) with open('capt1k.pkl', 'r') as f: caption_val = pickle.load(f) model = CaptionModel(bsz=1, feat_dim=(196, 512), n_voc=5834, n_embed=512, n_hidden=1024).cuda() criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=0.001) train(epoches=1) with open('model_t.pth', 'r') as f: model.load_state_dict(torch.load(f))
def train_fn(config):
    """Main training function. To be called by `try_to_train()`.

    Builds a training and a validation ``CaptionModel`` in one graph, then
    steps from the restored global step up to ``c.max_step``, logging,
    checkpointing and evaluating along the way.

    Args:
        config: run configuration.  It is handed to the input manager, whose
            own ``config`` attribute (``c`` below) is used from then on.
    """
    # print('TensorFlow version: r{}'.format(tf.__version__))
    print('INFO: Logging to `{}`.'.format(config.log_path))

    # Setup input pipeline & Build model
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(config.rand_seed)
        if config.token_type == 'radix':
            inputs_man = inputs.InputManager_Radix(config)
        elif config.token_type == 'char':
            inputs_man = inputs.InputManager_Char(config)
        else:
            inputs_man = inputs.InputManager(config)
        c = inputs_man.config

        num_batches = int(c.split_sizes['train'] / c.batch_size_train)
        lr = c.lr_start
        n_steps_log = int(num_batches / c.num_logs_per_epoch)

        with tf.name_scope('train'):
            m_train = CaptionModel(c,
                                   mode='train',
                                   batch_ops=inputs_man.batch_train,
                                   reuse=False,
                                   name='train')
            m_train.dset_size = c.split_sizes['train']

        with tf.name_scope('valid'):
            m_valid = CaptionModel(c,
                                   mode='eval',
                                   batch_ops=inputs_man.batch_eval,
                                   reuse=True,
                                   name='valid')
            m_valid.dset_size = c.split_sizes['valid']

        init_fn = tf.global_variables_initializer()
        # Two savers: one restricted to the variables collected under the
        # 'Model' scope, one covering every variable in the graph.
        model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'Model')
        model_saver = tf.train.Saver(var_list=model_vars,
                                     max_to_keep=c.max_saves)
        saver = tf.train.Saver(max_to_keep=2)

    r = c.per_process_gpu_memory_fraction
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=r)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options), graph=g)
    summary_writer = tf.summary.FileWriter(c.log_path, g)

    with sess:
        # NOTE(review): `init_fn` was built but never run in the text as
        # received; initialising before the restore is a reconstruction.
        sess.run(init_fn)
        # Restore model from checkpoint if provided
        lr = m_train.restore_model(sess, saver, lr)
        g.finalize()
        # ops.get_model_size(scope_or_list=m_train._get_trainable_vars(),
        ops.get_model_size(scope_or_list='Model/decoder/rnn_decoder',
                           log_path=c.log_path)
        # NOTE(review): the right-hand side was stripped in the original
        # (`start_step = n_steps_log = ...`); reading the global step is a
        # reconstruction.
        start_step = sess.run(m_train.global_step)
        n_steps_log = int(n_steps_log / 5)

        print('INFO: Graph constructed. Training begins now.')
        start_epoch = time.time()

        for step in xrange(start_step, c.max_step):
            epoch = int(step / num_batches) + 1

            # NOTE(review): in the three fetches below the `sess.run(` call
            # and the last fetch of the first two lists were stripped;
            # `m_train.lr` is inferred from the `lr` unpack target -- confirm
            # the attribute name on CaptionModel.

            # Write summary to disk once every `n_steps_log * 5` steps
            if (step + 1) % (n_steps_log * 5) == 0:
                ppl, summary, global_step, lr = sess.run([
                    m_train.dec_log_ppl,
                    m_train.summary_op,
                    m_train.global_step,
                    m_train.lr,
                ])
                t = time.time() - start_epoch
                speed = (step + 1 - start_step) * c.batch_size_train / t
                print(' Training speed: {:7.2f} examples/sec.'.format(speed))
                summary_writer.add_summary(summary, global_step)
                value_summary({'train/speed': speed},
                              summary_writer, global_step)
            # Quick logging
            elif (step + 1) % n_steps_log == 0:
                ppl, global_step, lr = sess.run([
                    m_train.dec_log_ppl,
                    m_train.global_step,
                    m_train.lr,
                ])
                ppl = np.exp(ppl)
                logstr = 'Epoch {:2d} ~~ {:6.2f} % ~ '.format(
                    epoch, ((step % num_batches) + 1) / num_batches * 100)
                logstr += 'Perplexity {:8.4f} ~ LR {:5.3e} ~ '.format(ppl, lr)
                logstr += 'Step {}'.format(global_step)
                print(' ' + logstr)
            else:
                ppl, global_step = sess.run([m_train.dec_log_ppl,
                                             m_train.global_step])

            # Checkpoint once per epoch, or twice per epoch when an epoch has
            # more than 5000 batches; skip a checkpoint that falls within the
            # last 100 steps because the final step always saves.
            if num_batches > 5000:
                save = (step + 1) % int(num_batches / 2) == 0
            else:
                save = (step + 1) % num_batches == 0
            save = save and (step + 100) < c.max_step

            # Evaluation and save model
            if save or (step + 1) == c.max_step:
                # NOTE(review): both `<saver>.save(sess` prefixes were
                # stripped; which saver writes which path is a
                # reconstruction.
                model_saver.save(sess, c.save_path + '_compact', global_step)
                saver.save(sess, c.save_path, global_step)
                _run_eval_loop(sess, c, m_valid, summary_writer, global_step)

            # End of an epoch
            if (step + 1) % num_batches == 0:
                if c.legacy:
                    lr = _lr_reduce_check(config, epoch, lr)
                    m_train.update_lr(sess, lr)
                t = time.time() - start_epoch
                print('\n\n>>> Epoch {:3d} complete'.format(epoch))
                print('>>> Time taken: {:10.2f} minutes\n\n'.format(t / 60))
                start_epoch = time.time()
                start_step = step + 1

        sess.close()
        print('\n\nINFO: Training completed.')
def main():
    """Parse CLI arguments, then train and evaluate the caption model.

    For every epoch from ``args.start_epoch`` to ``args.epochs`` the model is
    trained on the training iterator, evaluated on the validation iterator,
    the two perplexities are logged and a checkpoint is written.
    """
    # Local imports keep this block self-contained; both modules belong to
    # dependencies the surrounding code already relies on.
    from datetime import datetime

    import torch

    global args
    args = parser.parse_args()
    # NOTE(review): the attribute name was stripped in the original
    # (`if is '': ='%Y-...')`); `args.save` and the `datetime.now().strftime`
    # call are reconstructions.  The original also compared with `is ''`,
    # which tests identity rather than equality.
    if not args.save:
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar')

    logging.debug("run arguments: %s", args)
    logging.info("using pretrained cnn %s", args.cnn)
    cnn = resnet.__dict__[args.cnn](pretrained=True)

    vocab = build_vocab()
    model = CaptionModel(cnn, vocab,
                         embedding_size=args.embedding_size,
                         rnn_size=args.rnn_size,
                         num_layers=args.num_layers,
                         share_embedding_weights=args.share_weights)

    train_data = get_iterator(get_coco_data(vocab, train=True),
                              batch_size=args.batch_size,
                              max_length=args.max_length,
                              shuffle=True,
                              num_workers=args.workers)
    val_data = get_iterator(get_coco_data(vocab, train=False),
                            batch_size=args.eval_batch_size,
                            max_length=args.max_length,
                            shuffle=False,
                            num_workers=args.workers)

    if 'cuda' in args.type:
        cudnn.benchmark = True
        model.cuda()

    # NOTE(review): the last argument was stripped in the original;
    # `lr=args.lr` is a reconstruction.
    optimizer = select_optimizer(
        args.optimizer, params=model.parameters(), lr=args.lr)

    def regime(e):
        """Return the optimizer settings for epoch ``e`` (decayed lr)."""
        # NOTE(review): the base of the decay was stripped in the original;
        # `args.lr` is a reconstruction.
        return {'lr': args.lr * (args.lr_decay ** e),
                'momentum': args.momentum,
                'weight_decay': args.weight_decay}

    model.finetune_cnn(False)

    def forward(model, data, epoch, training=True, optimizer=None):
        """Run one pass over ``data`` and return the average perplexity.

        ``epoch`` is only used for logging; the original read it implicitly
        from the enclosing loop.
        """
        use_cuda = 'cuda' in args.type
        loss = nn.CrossEntropyLoss()
        perplexity = AverageMeter()
        batch_time = AverageMeter()
        data_time = AverageMeter()

        if training:
            model.train()
        else:
            model.eval()

        end = time.time()
        for i, (imgs, (captions, lengths)) in enumerate(data):
            data_time.update(time.time() - end)
            if use_cuda:
                imgs = imgs.cuda()
                # `async` is a reserved word since Python 3.7; PyTorch
                # renamed the argument to `non_blocking`.
                captions = captions.cuda(non_blocking=True)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            # Replaces the removed `Variable(..., volatile=not training)`:
            # gradients are recorded only while training.
            with torch.set_grad_enabled(training):
                pred, _ = model(imgs, input_captions, lengths)
                err = loss(pred, target_captions)
            # NOTE(review): the original read `math.exp([0])` with the
            # receiver stripped; `err.item()` is the scalar loss.
            perplexity.update(math.exp(err.item()))

            if training:
                optimizer.zero_grad()
                err.backward()
                clip_grad_norm(model.rnn.parameters(), args.grad_clip)
                optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format(
                                 epoch, i, len(data),
                                 phase='TRAINING' if training else 'EVALUATING',
                                 batch_time=batch_time,
                                 data_time=data_time, perp=perplexity))

        return perplexity.avg

    for epoch in range(args.start_epoch, args.epochs):
        if epoch >= args.finetune_epoch:
            model.finetune_cnn(True)
        optimizer = adjust_optimizer(optimizer, epoch, regime)

        # Train
        train_perp = forward(
            model, train_data, epoch, training=True, optimizer=optimizer)
        # Evaluate
        val_perp = forward(model, val_data, epoch, training=False)

        logging.info('\n Epoch: {0}\t'
                     'Training Perplexity {train_perp:.4f} \t'
                     'Validation Perplexity {val_perp:.4f} \n'
                     .format(epoch + 1, train_perp=train_perp,
                             val_perp=val_perp))
        model.save_checkpoint(checkpoint_file % (epoch + 1))
def main(hf, f_type, final=False, capl=16, d_w2v=512, output_dim=512,
         feature_shape=None, lr=0.01, batch_size=64, total_epoch=100,
         file=None, pretrained_model=None):
    """Build the GRU attention caption model, then train and test it.

    Args:
        hf: feature handle, passed through unchanged to ``exe_train`` and
            ``exe_test``.
        f_type: feature-type tag; only used to build the export path.
        final: when true the vocabulary and splits come from
            ``MsrFinalDataUtil`` (train/test only) instead of ``MsrDataUtil``.
        capl: the length of caption.
        d_w2v: passed to the caption model after the vocabulary size.
        output_dim: passed to the caption model after ``d_w2v``.
        feature_shape: per-sample shape of the video feature placeholder
            (tuple or list).  Required despite the ``None`` default.
        lr: Adam learning rate.
        batch_size: training batch size (testing uses a batch size of 1).
        total_epoch: number of epochs; also the saver's ``max_to_keep``.
        file: handed to ``create_vocabulary_word2vec``.
        pretrained_model: optional checkpoint to restore before training.

    Raises:
        TypeError: if ``feature_shape`` is not supplied.
    """
    if feature_shape is None:
        # The original failed later with an opaque "can only concatenate
        # tuple" TypeError; fail early with the same exception type.
        raise TypeError('feature_shape is required, e.g. (frames, dim)')
    feature_shape = tuple(feature_shape)

    # Create vocabulary
    if final:
        v2i, train_data, test_data = MsrFinalDataUtil.create_vocabulary_word2vec(
            file, capl=capl, word_threshold=1,
            v2i={'': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3},
            num_training=9000)
    else:
        v2i, train_data, val_data, test_data = MsrDataUtil.create_vocabulary_word2vec(
            file, capl=capl, word_threshold=1,
            v2i={'': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3})

    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    input_video = tf.placeholder(tf.float32, shape=(None, ) + feature_shape,
                                 name='input_video')
    input_captions = tf.placeholder(tf.int32, shape=(None, capl),
                                    name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl))

    # NOTE(review): max_len is hard-coded to 16 while the placeholders use
    # `capl`; the two agree only for the default capl -- confirm intent.
    attentionCaptionModel = CaptionModel.GRUAttentionBeamsearchCaptionModel(
        input_video, input_captions, voc_size, d_w2v, output_dim,
        max_len=16, beamsearch_batchsize=1, beam_size=5)
    (predict_score, predict_words, loss_mask, finished_beam,
     logprobs_finished_beams, past_logprobs) = attentionCaptionModel.build_model()

    # Per-sample loss: cross-entropy summed over the last axis, divided by the
    # summed loss mask, then averaged over the batch plus regularisation.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=predict_score)
    loss = tf.reduce_sum(loss, reduction_indices=[-1]) / tf.reduce_sum(
        loss_mask, reduction_indices=[-1])
    loss = tf.reduce_mean(loss) + sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9,
                                       beta2=0.999, epsilon=1e-08,
                                       use_locking=False, name='Adam')

    # Each gradient is clipped on its own to a norm of at most 10.
    gvs = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var)
                  for grad, var in gvs]
    train = optimizer.apply_gradients(capped_gvs)

    # optimizer = tf.train.RMSPropOptimizer(lr,decay=0.9, momentum=0.0, epsilon=1e-8)
    # train = optimizer.minimize(loss)

    # Configure the runtime environment
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement = False
    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()

    # Loop-invariant, so built once instead of once per epoch.
    export_path = '/home/xyj/usr/local/saved_model/msrvtt2017/s2s' + '_' + f_type + '/' + 'lr' + str(
        lr) + '_f' + str(feature_shape[0]) + '_B' + str(batch_size)

    with sess.as_default():
        # NOTE(review): `init` was built but never run in the text as
        # received; running it before the optional restore (so restored
        # weights are not overwritten) is a reconstruction.
        sess.run(init)
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)

        for epoch in xrange(total_epoch):
            # # # shuffle
            print('Epoch: %d/%d, Batch_size: %d' %
                  (epoch + 1, total_epoch, batch_size))

            # train phase
            tic = time.time()
            total_loss = exe_train(sess, train_data, batch_size, v2i, hf,
                                   feature_shape, train, loss, input_video,
                                   input_captions, y, capl=capl)
            print(' --Train--, Loss: %.5f, .......Time:%.3f' %
                  (total_loss, time.time() - tic))

            # test phase
            tic = time.time()
            js = exe_test(sess, test_data, 1, v2i, i2v, hf, feature_shape,
                          predict_words, input_video, input_captions, y,
                          finished_beam, logprobs_finished_beams,
                          past_logprobs, capl=capl)
            print(' --Val--, ......Time:%.3f' % (time.time() - tic))

            # save model
            # NOTE(review): no `saver.save(...)` call is present in the text
            # as received even though the model directory is created here --
            # confirm whether one was lost.
            for sub_dir in (export_path + '/model', export_path + '/res'):
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir)
                    print('mkdir %s' % sub_dir)

            # eval
            res_path = export_path + '/res/E' + str(epoch + 1) + '.json'
            evaluate_mode_by_shell(res_path, js)