def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', type=str, default='result',
                        help='Output directory')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--max-iters', type=int, default=50000,
                        help='Maximum number of iterations to train')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Minibatch size')
    parser.add_argument('--dropout-ratio', type=float, default=0.5,
                        help='Language model dropout ratio')
    parser.add_argument('--val-keep-quantity', type=int, default=100,
                        help='Keep every N-th validation image')
    parser.add_argument('--val-iter', type=int, default=100,
                        help='Run validation every N-th iteration')
    parser.add_argument('--log-iter', type=int, default=1,
                        help='Log every N-th iteration')
    parser.add_argument('--snapshot-iter', type=int, default=1000,
                        help='Model snapshot every N-th iteration')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length when using the LSTM layer')
    args = parser.parse_args()

    # Load the MSCOCO dataset. Assumes that the dataset has already been
    # downloaded, e.g. using the `download.py` script
    train, val = datasets.get_mscoco(args.mscoco_root)

    # Validation samples are used to address overfitting and to see how well
    # the model generalizes to yet unseen data. However, since the number of
    # these samples in MSCOCO is quite large (~200k) and thus takes time to
    # evaluate, you may choose to use only a fraction of the available samples
    val = val[::args.val_keep_quantity]

    # Number of unique words found in the dataset
    vocab_size = len(train.vocab)

    # Instantiate the model to be trained, either with LSTM layers or with
    # NStepLSTM layers
    model = ImageCaptionModel(
        vocab_size, dropout_ratio=args.dropout_ratio, rnn=args.rnn)

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    def transform(in_data):
        # Called for each sample; applies the necessary preprocessing to the
        # image, such as resizing and normalizing
        img, caption = in_data
        img = model.prepare(img)
        return img, caption

    # We need to preprocess the images since their sizes may vary (and the
    # model requires that they all have the exact same fixed size)
    train = TransformDataset(train, transform)
    val = TransformDataset(val, transform)

    train_iter = iterators.MultiprocessIterator(
        train, args.batch_size, shared_mem=700000)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batch_size, repeat=False, shuffle=False, shared_mem=700000)

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    def converter(batch, device):
        # The converter receives a batch of input samples and may modify it
        # if necessary. In our case, we need to align the captions depending
        # on whether we are using LSTM layers or NStepLSTM layers in the model
        if args.rnn == 'lstm':
            max_caption_length = args.max_caption_length
        elif args.rnn == 'nsteplstm':
            max_caption_length = None
        else:
            raise ValueError('Invalid RNN type.')
        return datasets.converter(
            batch, device, max_caption_length=max_caption_length)

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer=optimizer, device=args.gpu, converter=converter)
    trainer = training.Trainer(
        updater, out=args.out, stop_trigger=(args.max_iters, 'iteration'))
    trainer.extend(
        extensions.Evaluator(
            val_iter,
            target=model,
            converter=converter,
            device=args.gpu),
        trigger=(args.val_iter, 'iteration'))
    trainer.extend(
        extensions.LogReport(
            ['main/loss', 'validation/main/loss'],
            trigger=(args.log_iter, 'iteration')))
    trainer.extend(
        extensions.PlotReport(
            ['main/loss', 'validation/main/loss'],
            trigger=(args.log_iter, 'iteration')))
    trainer.extend(
        extensions.PrintReport(
            ['elapsed_time', 'epoch', 'iteration', 'main/loss',
             'validation/main/loss']),
        trigger=(args.log_iter, 'iteration'))

    # Save model snapshots so that later on, we can load them and generate
    # new captions for any image. This can be done in the `predict.py` script
    trainer.extend(
        extensions.snapshot_object(model, 'model_{.updater.iteration}'),
        trigger=(args.snapshot_iter, 'iteration'))
    trainer.extend(extensions.ProgressBar())
    trainer.run()
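# Illustrative sketch of the caption alignment the converter above delegates
# to `datasets.converter` (this is NOT the project's implementation; the
# helper name and the padding value of -1 are assumptions). With plain LSTM
# layers every caption in the minibatch is padded/truncated to a common
# length, while NStepLSTM layers accept variable-length captions as they are.
from chainer.dataset import concat_examples


def pad_captions_sketch(batch, device=None, max_caption_length=None):
    imgs = [img for img, _ in batch]
    captions = [caption if max_caption_length is None
                else caption[:max_caption_length]
                for _, caption in batch]
    imgs = concat_examples(imgs, device=device)
    if max_caption_length is None:
        # NStepLSTM case: keep a list of variable-length arrays (device
        # transfer of the individual captions is omitted in this sketch)
        return imgs, captions
    # LSTM case: pad the shorter captions so all share one fixed length
    captions = concat_examples(captions, device=device, padding=-1)
    return imgs, captions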
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', type=str, default='result',
                        help='Output directory')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--max-iters', type=int, default=50000,
                        help='Maximum number of iterations to train')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Minibatch size')
    parser.add_argument('--dropout-ratio', type=float, default=0.5,
                        help='Language model dropout ratio')
    parser.add_argument('--val-keep-quantity', type=int, default=100,
                        help='Keep every N-th validation image')
    parser.add_argument('--val-iter', type=int, default=100,
                        help='Run validation every N-th iteration')
    parser.add_argument('--log-iter', type=int, default=1,
                        help='Log every N-th iteration')
    parser.add_argument('--snapshot-iter', type=int, default=1000,
                        help='Model snapshot every N-th iteration')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length when using the LSTM layer')
    args = parser.parse_args()

    # Load the MSCOCO dataset. Assumes that the dataset has already been
    # downloaded, e.g. using the `download.py` script
    train, val = datasets.get_mscoco(args.mscoco_root)

    # Validation samples are used to address overfitting and to see how well
    # the model generalizes to yet unseen data. However, since the number of
    # these samples in MSCOCO is quite large (~200k) and thus takes time to
    # evaluate, you may choose to use only a fraction of the available samples
    val = val[::args.val_keep_quantity]

    # Number of unique words found in the dataset
    vocab_size = len(train.vocab)

    # Instantiate the model to be trained, either with LSTM layers or with
    # NStepLSTM layers
    model = ImageCaptionModel(vocab_size, dropout_ratio=args.dropout_ratio,
                              rnn=args.rnn)

    if args.gpu >= 0:
        # chainer.backends.cuda.get_device_from_id(args.gpu).use()
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    def transform(in_data):
        # Called for each sample; applies the necessary preprocessing to the
        # image, such as resizing and normalizing
        img, caption = in_data
        img = model.prepare(img)
        return img, caption

    # We need to preprocess the images since their sizes may vary (and the
    # model requires that they all have the exact same fixed size)
    train = TransformDataset(train, transform)
    val = TransformDataset(val, transform)

    train_iter = iterators.MultiprocessIterator(train, args.batch_size,
                                                shared_mem=700000)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batch_size, repeat=False, shuffle=False, shared_mem=700000)

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    def converter(batch, device):
        # The converter receives a batch of input samples and may modify it
        # if necessary. In our case, we need to align the captions depending
        # on whether we are using LSTM layers or NStepLSTM layers in the model
        if args.rnn == 'lstm':
            max_caption_length = args.max_caption_length
        elif args.rnn == 'nsteplstm':
            max_caption_length = None
        else:
            raise ValueError('Invalid RNN type.')
        return datasets.converter(batch, device,
                                  max_caption_length=max_caption_length)

    updater = training.updater.StandardUpdater(
        train_iter, optimizer=optimizer, device=args.gpu, converter=converter)
    trainer = training.Trainer(updater, out=args.out,
                               stop_trigger=(args.max_iters, 'iteration'))
    trainer.extend(
        extensions.Evaluator(val_iter, target=model, converter=converter,
                             device=args.gpu),
        trigger=(args.val_iter, 'iteration'))
    trainer.extend(
        extensions.LogReport(['main/loss', 'validation/main/loss'],
                             trigger=(args.log_iter, 'iteration')))
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              trigger=(args.log_iter, 'iteration')))
    trainer.extend(
        extensions.PrintReport([
            'elapsed_time', 'epoch', 'iteration', 'main/loss',
            'validation/main/loss'
        ]),
        trigger=(args.log_iter, 'iteration'))

    # Save model snapshots so that later on, we can load them and generate
    # new captions for any image. This can be done in the `predict.py` script
    trainer.extend(
        extensions.snapshot_object(model, 'model_{.updater.iteration}'),
        trigger=(args.snapshot_iter, 'iteration'))
    trainer.extend(extensions.ProgressBar())
    trainer.run()
        train_loss_batch, _ = sess.run(
            [model.total_loss, model.train_op],
            feed_dict={model.handle: training_handle,
                       model.max_len: [gs + 1]})
        _loss.append(train_loss_batch)
        if i % 1000 == 0:
            print("step: {0:d}, minibatch training loss: {1:.4f}".format(
                i, train_loss_batch))
    loss_this_epoch = np.sum(_loss)
    end = time.perf_counter()
    print('Epoch {:2d} - train loss: {:.4f} - time: {:.4f}'.format(
        epoch, np.mean(_loss), end - start))
    saver.save(sess, model.hps.ckpt_dir + 'model.ckpt', global_step=gs)
    print("save checkpoint in {}".format(
        model.hps.ckpt_dir + 'model.ckpt-' + str(gs)))


if __name__ == '__main__':
    hparams = get_hparams()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    model = ImageCaptionModel(hparams, mode='train')
    model.build()

    # Write the graph for TensorBoard; this session is only used for that
    writer = tf.summary.FileWriter("../tensorboard/", graph=sess.graph)
    sess.close()

    training_filenames = glob.glob('../data/tfrecord/train-*')
    num_train_records = get_num_records(training_filenames)
    train(training_filenames, num_train_records, model)
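# A minimal sketch of where `training_handle` above typically comes from in a
# TF 1.x feedable-iterator setup (an assumption about this project, not its
# actual `train()` code; record parsing via `dataset.map(...)` is omitted).
import tensorflow as tf


def make_training_handle(sess, filenames, batch_size=64):
    dataset = tf.data.TFRecordDataset(filenames).repeat().batch(batch_size)
    train_iterator = dataset.make_one_shot_iterator()
    # The evaluated string handle is what gets fed into `model.handle`
    return sess.run(train_iterator.string_handle())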
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--img', type=str, help='Image path')
    parser.add_argument('--img-dir', type=str,
                        help='Image directory path, instead of a single image')
    parser.add_argument('--model', type=str, default='result/model_1000',
                        help='Trained model path')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length generated')
    args = parser.parse_args()

    # Load the dataset to obtain the vocabulary, which is needed to convert
    # predicted tokens into actual words
    train, _ = datasets.get_mscoco(args.mscoco_root)
    vocab = train.vocab
    ivocab = {v: k for k, v in vocab.items()}

    model = ImageCaptionModel(len(train.vocab), rnn=args.rnn)
    serializers.load_npz(args.model, model)

    if args.img_dir:
        # Read all images in the directory
        img_paths = [
            i for i in glob.glob(os.path.join(args.img_dir, '*'))
            if i.endswith(('png', 'jpg'))
        ]
        img_paths = sorted(img_paths)
    else:
        # Load a single image
        img_paths = [args.img]

    if not img_paths:
        raise IOError('No images found for the given path')

    imgs = []
    for img_path in img_paths:
        img = Image.open(img_path)
        img = model.prepare(img)
        imgs.append(img)
    imgs = np.asarray(imgs)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
        imgs = chainer.cuda.to_gpu(imgs)

    bos = vocab['<bos>']
    eos = vocab['<eos>']
    with chainer.using_config('train', False), \
            chainer.no_backprop_mode():
        captions = model.predict(
            imgs, bos=bos, eos=eos,
            max_caption_length=args.max_caption_length)
    captions = chainer.cuda.to_cpu(captions)

    # Print the predicted captions
    file_names = [os.path.basename(path) for path in img_paths]
    max_length = max(len(name) for name in file_names)
    for file_name, caption in zip(file_names, captions):
        caption = ' '.join(ivocab[token] for token in caption)
        caption = caption.replace('<bos>', '').replace('<eos>', '').strip()
        print(('{0:' + str(max_length) + '} {1}').format(file_name, caption))
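# Conceptual sketch of the greedy decoding behind `model.predict` above (an
# assumption for illustration, not ImageCaptionModel's actual code): start
# from <bos>, repeatedly take the most likely next token, and stop at <eos>
# or once the caption reaches max_caption_length.
import numpy as np


def greedy_decode_sketch(step_fn, bos, eos, max_caption_length):
    # `step_fn(prev_token)` is a hypothetical callable returning a probability
    # vector over the vocabulary for the next token given the previous one
    caption = [bos]
    for _ in range(max_caption_length):
        next_token = int(np.argmax(step_fn(caption[-1])))
        caption.append(next_token)
        if next_token == eos:
            break
    return caption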
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--img', type=str, help='Image path')
    parser.add_argument('--img-dir', type=str,
                        help='Image directory path, instead of a single image')
    parser.add_argument('--model', type=str, default='result/model_1000',
                        help='Trained model path')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length generated')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device', type=int, nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)
    if device.xp is chainerx:
        sys.stderr.write('This example does not support ChainerX devices.\n')
        sys.exit(1)

    print('Device: {}'.format(device))
    print()

    # Load the dataset to obtain the vocabulary, which is needed to convert
    # predicted tokens into actual words
    train, _ = datasets.get_mscoco(args.mscoco_root)
    vocab = train.vocab
    ivocab = {v: k for k, v in vocab.items()}

    model = ImageCaptionModel(len(train.vocab), rnn=args.rnn)
    serializers.load_npz(args.model, model)
    model.to_device(device)

    if args.img_dir:
        # Read all images in the directory
        img_paths = [
            i for i in glob.glob(os.path.join(args.img_dir, '*'))
            if i.endswith(('png', 'jpg'))
        ]
        img_paths = sorted(img_paths)
    else:
        # Load a single image
        img_paths = [args.img]

    if not img_paths:
        raise IOError('No images found for the given path')

    imgs = []
    for img_path in img_paths:
        img = Image.open(img_path)
        img = model.prepare(img)
        imgs.append(img)
    imgs = np.asarray(imgs)
    imgs = device.send(imgs)

    bos = vocab['<bos>']
    eos = vocab['<eos>']
    with chainer.using_config('train', False), \
            chainer.no_backprop_mode():
        captions = model.predict(
            imgs, bos=bos, eos=eos,
            max_caption_length=args.max_caption_length)
    captions = chainer.get_device('@numpy').send(captions)

    # Print the predicted captions
    file_names = [os.path.basename(path) for path in img_paths]
    max_length = max(len(name) for name in file_names)
    for file_name, caption in zip(file_names, captions):
        caption = ' '.join(ivocab[token] for token in caption)
        caption = caption.replace('<bos>', '').replace('<eos>', '').strip()
        print(('{0:' + str(max_length) + '} {1}').format(file_name, caption))
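# Quick illustration of the `--device` specifiers the script above passes
# straight to `chainer.get_device` (the values here are examples): a negative
# integer or '@numpy' selects NumPy/CPU, a non-negative integer or '@cupy:N'
# selects GPU N through CuPy, and ChainerX specifiers are rejected by this
# particular example.
import chainer

cpu_device = chainer.get_device('-1')       # same device as chainer.get_device('@numpy')
gpu_device = chainer.get_device('@cupy:0')  # same device as chainer.get_device('0'); requires CuPy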
    return pd.DataFrame({
        'img_id': img_ids,
        'caption': caps
    }).set_index(['img_id'])


def read_map():
    encode_map = pickle.load(open('../data/enc_map.pkl', 'rb'))  # token => id
    decode_map = pickle.load(open('../data/dec_map.pkl', 'rb'))  # id => token
    return encode_map, decode_map


if __name__ == '__main__':
    encode_map, decode_map = read_map()
    img_test = joblib.load('../data/processed/valid.joblib')

    hps = get_hparams()
    tf.reset_default_graph()
    model = ImageCaptionModel(hps, mode='test')
    model.build()

    predict = generate_captions(model, decode_map=decode_map,
                                encode_map=encode_map, img_test=img_test)
    predict.to_csv('../evaluation/captions.csv')

    os.chdir('../evaluation')
    subprocess.run(['./gen_score', '-i', 'captions.csv', '-r', 'score.csv'])
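# Minimal sketch of the id-to-token decoding that `generate_captions` has to
# perform with `decode_map` (the helper name and the '<eos>' stop convention
# are assumptions, not the project's exact implementation).
def ids_to_caption(token_ids, decode_map, eos_token='<eos>'):
    words = []
    for token_id in token_ids:
        word = decode_map[token_id]
        if word == eos_token:
            break  # stop at the end-of-sentence marker
        words.append(word)
    return ' '.join(words)
# usage: caption = ids_to_caption(predicted_ids, decode_map)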
                           image_dir=IMAGE_DIR,
                           log_path=LOG_PATH,
                           if_extract_feature=IF_EXTRACT_FEATURE)
data_handler.load_preprocess()

generator = DataGenerator(data_path=LOG_PATH,
                          batch_size=BATCH_SIZE,
                          data_handler=data_handler)
print('Train Size : {}'.format(generator.training_dataset.shape[0]))
print('Validation Size : {}'.format(generator.validation_dataset.shape[0]))

model = ImageCaptionModel(max_token_length=generator.MAX_TOKEN_LENGTH,
                          vocabulary_size=generator.VOCABULARY_SIZE,
                          num_image_features=generator.IMG_FEATS,
                          hidden_size=128,
                          embedding_size=128)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())
print('Number of parameters:', model.count_params())

training_history_filename = LOG_PATH + 'training_history.log'
csv_logger = CSVLogger(training_history_filename, append=False)
model_names = ('../trained_models/IAPR_2012/' +
               'iapr_weights.{epoch:02d}-{val_loss:.2f}.hdf5')
model_checkpoint = ModelCheckpoint(model_names,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--img', type=str, help='Image path')
    parser.add_argument('--img-dir', type=str,
                        help='Image directory path, instead of a single image')
    parser.add_argument('--model', type=str, default='result/model_1000',
                        help='Trained model path')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length generated')
    args = parser.parse_args()

    # Load the dataset to obtain the vocabulary, which is needed to convert
    # predicted tokens into actual words
    train, _ = datasets.get_mscoco(args.mscoco_root)
    vocab = train.vocab
    ivocab = {v: k for k, v in vocab.items()}

    model = ImageCaptionModel(len(train.vocab), rnn=args.rnn)
    serializers.load_npz(args.model, model)

    if args.img_dir:
        # Read all images in the directory
        img_paths = [
            i for i in glob.glob(os.path.join(args.img_dir, '*'))
            if i.endswith(('png', 'jpg'))]
        img_paths = sorted(img_paths)
    else:
        # Load a single image
        img_paths = [args.img]

    if not img_paths:
        raise IOError('No images found for the given path')

    imgs = []
    for img_path in img_paths:
        img = Image.open(img_path)
        img = model.prepare(img)
        imgs.append(img)
    imgs = np.asarray(imgs)

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
        imgs = chainer.backends.cuda.to_gpu(imgs)

    bos = vocab['<bos>']
    eos = vocab['<eos>']
    with chainer.using_config('train', False), \
            chainer.no_backprop_mode():
        captions = model.predict(
            imgs, bos=bos, eos=eos,
            max_caption_length=args.max_caption_length)
    captions = chainer.backends.cuda.to_cpu(captions)

    # Print the predicted captions
    file_names = [os.path.basename(path) for path in img_paths]
    max_length = max(len(name) for name in file_names)
    for file_name, caption in zip(file_names, captions):
        caption = ' '.join(ivocab[token] for token in caption)
        caption = caption.replace('<bos>', '').replace('<eos>', '').strip()
        print(('{0:' + str(max_length) + '} {1}').format(file_name, caption))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--img', type=str, help='Image path')
    parser.add_argument('--img-dir', type=str,
                        help='Image directory path, instead of a single image')
    parser.add_argument('--model', type=str, default='result/model_1000',
                        help='Trained model path')
    parser.add_argument('--dataset-name', type=str, default='mscoco',
                        choices=["mscoco", "stair_captions"],
                        help='Dataset name')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length generated')
    parser.add_argument('--out', type=str, default='prediction.json',
                        help='Json file to save predicted captions')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Minibatch size')
    args = parser.parse_args()

    # Set path to annotation files
    if args.dataset_name == "mscoco":
        train_anno = "annotations/captions_train2014.json"
        val_anno = "annotations/captions_val2014.json"
    elif args.dataset_name == "stair_captions":
        train_anno = "annotations/stair_captions_v1.2_train_tokenized.json"
        val_anno = "annotations/stair_captions_v1.2_val_tokenized.json"

    # Load the dataset to obtain the vocabulary, which is needed to convert
    # predicted tokens into actual words
    train, _ = datasets.get_mscoco(args.mscoco_root, train_anno=train_anno,
                                   val_anno=val_anno,
                                   dataset_name=args.dataset_name)
    vocab = train.vocab
    ivocab = {v: k for k, v in vocab.items()}

    model = ImageCaptionModel(len(train.vocab), rnn=args.rnn)
    serializers.load_npz(args.model, model)

    if args.img_dir:
        # Read all images in the directory
        img_paths = [
            i for i in glob.glob(os.path.join(args.img_dir, '*'))
            if i.endswith(('png', 'jpg'))
        ]
        img_paths = sorted(img_paths)
    else:
        # Load a single image
        img_paths = [args.img]

    if not img_paths:
        raise IOError('No images found for the given path')

    img_paths = np.random.permutation(img_paths)

    results = dict()
    for i in range(0, len(img_paths), args.batch_size):
        img_paths_sub = img_paths[i:i + args.batch_size]
        imgs = []
        for img_path in img_paths_sub:
            img = Image.open(img_path)
            img = model.prepare(img)
            imgs.append(img)
        imgs = np.asarray(imgs)

        if args.gpu >= 0:
            chainer.backends.cuda.get_device_from_id(args.gpu).use()
            model.to_gpu()
            imgs = chainer.backends.cuda.to_gpu(imgs)

        bos = vocab['<bos>']
        eos = vocab['<eos>']
        with chainer.using_config('train', False), \
                chainer.no_backprop_mode():
            captions = model.predict(
                imgs, bos=bos, eos=eos,
                max_caption_length=args.max_caption_length)
        captions = chainer.backends.cuda.to_cpu(captions)

        # Print the predicted captions
        file_names = [os.path.basename(path) for path in img_paths_sub]
        max_length = max(len(name) for name in file_names)
        for file_name, caption in zip(file_names, captions):
            caption = ' '.join(ivocab[token] for token in caption)
            caption = caption.replace('<bos>', '')
            end = caption.find('<eos>')
            caption = caption[:end].strip()
            # caption = caption.replace('<bos>', '').replace('<eos>', '').strip()
            results[file_name] = caption
            print(('{0:' + str(max_length) + '} {1}').format(
                file_name, caption))

    # Save captions
    with open(args.out, "w") as f:
        json.dump(results, f, indent=4)
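# Small usage sketch: the JSON written above maps each image file name to its
# predicted caption, so it can be reloaded for inspection or scoring like this
# (the file name matches the script's --out default).
import json

with open('prediction.json') as f:
    results = json.load(f)
for file_name, caption in sorted(results.items()):
    print(file_name, '->', caption)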