def extract_features(raw, dataset, mode, max_frames=-1, overwrite=False,
                     aencoder=AppearanceEncoder(), mencoder=MotionEncoder()):
    """
    Builds appearance and motion features for a list of videos.

    :param raw: Raw dataset of videos for which to extract features.
    :param dataset: Dataset in which to place the resultant features.
    :param mode: Dataset mode (train, val, test).
    :param max_frames: Maximum number of allowable frames in a given video (must be a positive integer).
    :param overwrite: Unless this flag is specified, will fail rather than overwrite an existing cache.
    :param aencoder: Encoder used for appearance.
    :param mencoder: Encoder used for motion.
    :return: Numpy array of features with shape
        [len(videos), max_frames, aencoder.feature_size() + mencoder.feature_size()],
        where index i corresponds to the ith video in sorted(videos).
        (The motion branch is currently disabled below, so only appearance features are written.)
    """
    assert isinstance(raw, str)
    assert isinstance(dataset, str)
    assert isinstance(mode, str) and mode in ("train", "val", "test"), \
        "Extraction mode must be train, val, or test; got {}".format(mode)
    assert isinstance(max_frames, int) and max_frames > 0, "max_frames must be a positive integer"
    assert isinstance(aencoder, nn.Module)
    assert isinstance(mencoder, nn.Module)

    raw_dir = _util.get_raw_dataset_by_name(raw)
    dataset_dir = _util.get_dataset_by_name(dataset, mode=mode, create=True)

    # Resolve each cached video id to its mp4 file in the raw dataset.
    video_ids = sorted(set(_util.load_array(dataset_dir, "video_ids")))
    videos = [os.path.join(raw_dir, mode, "{}.mp4".format(video_id)) for video_id in video_ids]
    for video_path in videos:
        assert os.path.exists(video_path), "Cannot find mp4 video @ {}".format(video_path)

    aencoder = aencoder.cuda(1)
    # mencoder = mencoder.cuda(0)

    # num_features = aencoder.feature_size() + mencoder.feature_size()
    num_features = aencoder.feature_size()
    features = np.zeros((len(videos), max_frames, num_features), dtype=np.float32)
    for i, video_path in enumerate(tqdm(videos)):
        # Sample and preprocess frames, then batch them as NCHW tensors on the GPU.
        frames, _ = sample_frames(video_path, max_frames)
        frames = np.array([preprocess_frame(f) for f in frames])
        frames = frames.transpose(0, 3, 1, 2)
        frames = torch.from_numpy(frames).cuda(1)

        # clips = np.array([[preprocess_frame(f) for f in clip] for clip in clips])
        # clips = clips.transpose(0, 4, 1, 2, 3)
        # clips = torch.from_numpy(clips).cuda(0)

        af = aencoder.forward(frames)
        # mf = mencoder.forward(clips)
        af = af.cpu().detach().numpy()
        # mf = mf.cpu().detach().numpy()

        features[i, :frames.shape[0], :] = af  # np.concatenate([af, mf], axis=1)

    _util.dump_array(dataset_dir, "frames", 100, features, overwrite=overwrite)
    return features
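# Illustrative usage only (not part of the original source): extract appearance features for
# each split of a raw dataset. The dataset names are assumptions; build_cache() must already
# have written the per-mode "video_ids" arrays that extract_features() reads, and the GPU
# layout assumed inside extract_features (device index 1) must be available.
def _example_extract_features() -> None:
    for mode in ("train", "val", "test"):
        extract_features("MSRVTT", "MSRVTT-cache", mode, max_frames=30, overwrite=True)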
def __init__(self, dataset: str, mode: str) -> None:
    super().__init__()
    dataset_dir = _util.get_dataset_by_name(dataset, mode)

    # Cached per-video appearance features and the per-caption video ids.
    self._features = _util.load_array(dataset_dir, "frames")
    self._video_ids = _util.load_array(dataset_dir, "video_ids")

    # Map each video id to its row in the features array. Rows follow sorted unique ids,
    # matching the ordering used in extract_features.
    self._vid2idx = {
        video_id: idx
        for idx, video_id in enumerate(sorted(set(self._video_ids)))
    }
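# A minimal, hypothetical sketch (not from the original source) of how indexing could pair
# the i-th cached caption with its video's feature row: self._video_ids holds one id per
# caption, while self._features holds one row per unique video id, so _vid2idx bridges the two.
def __getitem__(self, index):
    video_id = self._video_ids[index]
    return self._features[self._vid2idx[video_id]], video_id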
def vocab(dataset: Optional[str] = "MSRVTT", threshold: int = 3):
    if vocab.inst is None:
        if dataset is None:
            vocab.inst = Vocabulary(threshold)
        else:
            dataset_dir = _util.get_dataset_by_name(dataset)
            with open(os.path.join(dataset_dir, Vocabulary.PICKLE_FILE), 'rb') as f:
                vocab.inst = pickle.load(f)
            if not isinstance(vocab.inst, Vocabulary):
                raise TypeError('Pickled object @ {} not of type {}'.format(
                    dataset_dir, Vocabulary))
    return vocab.inst
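# The memoized singleton above assumes the attribute `vocab.inst` exists before the first
# call; presumably the module initializes it next to the definition, e.g. (an assumption):
vocab.inst = None


# Illustrative usage only: the first call unpickles the cached Vocabulary for the dataset,
# and every later call returns that same instance.
def _example_vocab_usage() -> None:
    v = vocab("MSRVTT")           # Loads <dataset_dir>/<Vocabulary.PICKLE_FILE> on first use.
    assert vocab("MSRVTT") is v   # Subsequent calls hit the memoized instance.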
def _build_cache(dataset: str, mode: str, sentences: List[str], video_ids: List[str],
                 vocab: Vocabulary, max_words: int, overwrite: bool) -> np.ndarray:
    dataset_dir = _util.get_dataset_by_name(dataset, mode, create=True)

    captions = []
    cap_lens = []
    _logger.info("Building {} cache...".format(mode))
    for i, sentence in enumerate(tqdm(sentences)):
        # Encode the sentence as [START, w_1, ..., w_n]; END is appended below.
        caption = [vocab[Token.START]]
        caption += map(vocab.__getitem__, map(str.lower, nltk.tokenize.word_tokenize(sentence)))
        cap_lens.append(len(caption) + 1)  # Plus one for Token.END.

        # Truncate over-long captions so that, with END appended, they fit in max_words.
        if len(caption) >= max_words:
            _logger.warning("Truncating caption {} from {} words to {}".format(
                i, len(caption) + 1, max_words))
            caption = caption[:max_words - 1]
        caption.append(vocab[Token.END])

        # Right-pad to a fixed length of max_words.
        caption += [vocab[Token.PAD]] * (max_words - len(caption))
        assert len(caption) == max_words
        captions.append(caption)

    captions = np.array(captions)
    cap_lens = np.array(cap_lens)
    video_ids = np.array(video_ids)

    _logger.info("Saving cache...")
    _util.dump_array(dataset_dir, "captions", 10000, captions, overwrite=overwrite)
    _util.dump_array(dataset_dir, "cap_lens", 100000, cap_lens, overwrite=overwrite)
    _util.dump_array(dataset_dir, "video_ids", 100000, video_ids, overwrite=overwrite)
    return captions
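# Worked sketch (illustrative only) of the per-sentence encoding performed in _build_cache():
# tokenize, lower-case, map tokens to indices, bracket with START/END, then right-pad with PAD
# up to max_words. Assumes the pickled Vocabulary cache already exists so vocab() can load it;
# the sample sentence and max_words value are arbitrary.
def _example_caption_encoding(sentence: str = "a man plays guitar", max_words: int = 8) -> list:
    v = vocab()
    caption = [v[Token.START]]
    caption += [v[word.lower()] for word in nltk.tokenize.word_tokenize(sentence)]
    caption.append(v[Token.END])
    caption += [v[Token.PAD]] * (max_words - len(caption))
    assert len(caption) == max_words
    return caption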
def build_cache(raw: str, dataset: str, threshold: int, max_words: int,
                train_fraction: float = 1., overwrite: bool = False) -> None:
    """
    Builds caption cache files for a raw dataset based on annotations.json.

    :param raw: Raw dataset of videos for which to build the cache.
    :param dataset: Dataset in which to place the resultant cache files.
    :param threshold: Number of occurrences under which a token will be unk'd.
    :param max_words: Maximum number of allowable words in a caption (will pad to this length).
    :param train_fraction: Fraction of training annotations to use; defaults to 1.0 (all of train).
    :param overwrite: Unless this flag is specified, will fail rather than overwrite an existing cache.
    """
    assert isinstance(raw, str)
    assert isinstance(dataset, str)
    assert isinstance(train_fraction, float) and 0 < train_fraction <= 1.
    assert isinstance(max_words, int) and max_words > 0, "max_words must be a positive integer"

    train_dir = _util.get_raw_dataset_by_name(raw, mode="train")
    val_dir = _util.get_raw_dataset_by_name(raw, mode="val")
    test_dir = _util.get_raw_dataset_by_name(raw, mode="test")

    train_ann = os.path.join(train_dir, "annotations.json")
    val_ann = os.path.join(val_dir, "annotations.json")
    test_ann = os.path.join(test_dir, "annotations.json")
    assert os.path.exists(train_ann), "Could not find train annotations.json in raw dataset {}".format(raw)
    assert os.path.exists(val_ann), "Could not find val annotations.json in raw dataset {}".format(raw)
    assert os.path.exists(test_ann), "Could not find test annotations.json in raw dataset {}".format(raw)

    with open(train_ann, 'r') as f:
        train_ann = json.load(f)
    with open(val_ann, 'r') as f:
        val_ann = json.load(f)
    with open(test_ann, 'r') as f:
        test_ann = json.load(f)

    # Collect (video_id, caption) pairs for each split.
    train_sentences = []
    train_video_ids = []
    for sentence in train_ann["sentences"]:
        train_video_ids.append(sentence["video_id"])
        train_sentences.append(sentence["caption"])

    val_sentences = []
    val_video_ids = []
    for sentence in val_ann["sentences"]:
        val_video_ids.append(sentence["video_id"])
        val_sentences.append(sentence["caption"])

    test_sentences = []
    test_video_ids = []
    for sentence in test_ann["sentences"]:
        test_video_ids.append(sentence["video_id"])
        test_sentences.append(sentence["caption"])

    # Build the vocabulary over all splits, then cache each split's captions.
    all_sentences = train_sentences + val_sentences + test_sentences
    vocab = build_vocabulary(all_sentences, threshold)

    last_train_idx = int(len(train_ann["sentences"]) * train_fraction)
    _logger.info("Using {} of {} total train sentences".format(
        last_train_idx, len(train_ann["sentences"])))

    _build_cache(dataset, "train", train_sentences[:last_train_idx],
                 train_video_ids[:last_train_idx], vocab, max_words, overwrite)
    val_sentences = _build_cache(dataset, "val", val_sentences, val_video_ids, vocab, max_words, overwrite)
    test_sentences = _build_cache(dataset, "test", test_sentences, test_video_ids, vocab, max_words, overwrite)

    with open(os.path.join(_util.get_dataset_by_name(dataset), Vocabulary.PICKLE_FILE), 'wb') as f:
        pickle.dump(vocab, f)

    # Write COCO-style reference files for the evaluation splits.
    prepare_gt(os.path.join(_util.get_dataset_by_name(dataset, mode="val"), "reference.json"),
               val_sentences, val_video_ids)
    prepare_gt(os.path.join(_util.get_dataset_by_name(dataset, mode="test"), "reference.json"),
               test_sentences, test_video_ids)
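# Illustrative usage only: build the caption cache, vocabulary pickle, and reference.json
# files for a raw dataset. The names "MSRVTT" / "MSRVTT-cache" are assumptions; threshold
# and max_words mirror the defaults used in the surrounding functions.
def _example_build_cache() -> None:
    build_cache("MSRVTT", "MSRVTT-cache", threshold=3, max_words=30,
                train_fraction=1.0, overwrite=True)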
def train(
        # General training hyperparameters.
        dataset: str,
        num_epochs: int = 100,
        batch_size: int = 128,
        # Learning-rate schedulers.
        learning_rate: float = 3e-4,
        ss_factor: int = 24,
        min_ss: float = 0.6,
        # Representation hyperparameters.
        projected_size: int = 500,
        hidden_size: int = 1024,  # Hidden size of the recurrent cells.
        mid_size: int = 128,  # Dimension of the boundary-detection layer.
        # REVIEW josephz: Remove this?
        # frame_shape: tuple=(3, 224, 224),  # Video frame shape.
        a_feature_size: int = 2048,  # Appearance model feature-dimension size.
        # REVIEW josephz: Remove this?
        # m_feature_size=4096,  # Motion model feature-dimension size.
        # Maximum-size hyperparameters.
        # frame_sample_rate: int=10,  # Sample rate of video frames.
        max_frames: int = 30,  # Maximum length of the video-frame sequence.
        max_words: int = 30,  # Maximum length of the caption-word sequence.
        # Misc hyperparameters.
        ckpt_freq: int = 3,
        use_cuda: bool = False,
        use_ckpt: bool = False,
        use_argmax: bool = False,
        seed: int = 0,
):
    """
    Args:
        dataset (str): Dataset to train on.
        num_epochs (int): Number of epochs to train for.
        batch_size (int): Batch size to train with.
        learning_rate (float): Learning rate.
        ss_factor (int): Scheduled-sampling factor, used to compute the teacher-forcing ratio.
        min_ss (float): Minimum teacher-forcing ratio.
        projected_size (int): Projection size for the Encoder-Decoder model.
        hidden_size (int): Hidden state size for the recurrent network in the encoder.
        mid_size (int): Hidden state size for the Boundary Detector network in the encoder.
        a_feature_size (int): Input feature size for the Encoder network.
        max_frames (int): Maximum length of the video-frame sequence.
        max_words (int): Maximum length of the caption-word sequence.
        ckpt_freq (int): Frequency (in epochs) at which to compute evaluation metrics and save a checkpoint.
        use_cuda (bool): Whether to use CUDA devices.
        use_ckpt (bool): Whether to load a checkpoint if possible.
        use_argmax (bool): Whether to use greedy (argmax) rather than multinomial sampling during decoding.
        seed (int): Random seed.

    Effects:
        Produces several outputs:
          - Checkpoints (model weights).
          - Logs (tensorboard logs).
    """
    # Set seeds.
    torch.random.manual_seed(seed)
    np.random.seed(seed)

    # Prepare output paths.
    # REVIEW josephz: This is unbelievably hacky, but we want an easy way to allow the user to set and track
    #   hyperparameters using the cmd_line interface. This should probably be abstracted in utility.py.
    hparams = locals()
    params = {
        arg_name: hparams[arg_name]
        for arg_name in inspect.signature(train).parameters.keys()
    }
    ckpt_path = _util.get_weights_path_by_param(reuse=False, **params)
    print("Saving checkpoints to '{ckpt_path}', you may visualize in tensorboard with the following:\n\n"
          "\t`tensorboard --logdir={ckpt_path}`\n".format(ckpt_path=ckpt_path))

    # Setup logging paths.
    log_path = os.path.join(ckpt_path, 'logs')
    _util.mkdir(log_path)
    _tb_logger.configure(log_path, flush_secs=10)

    # REVIEW josephz: Todo, clean this up.
    banet_pth_path_fmt = os.path.join(ckpt_path, '{:04d}_{:04d}.pth')
    best_banet_pth_path = os.path.join(ckpt_path, 'weights.pth')
    optimizer_pth_path = os.path.join(ckpt_path, 'optimizer.pth')
    best_optimizer_pth_path = os.path.join(ckpt_path, 'best_optimizer.pth')

    # Load vocabulary.
    vocab_size = len(vocab())

    # Load reference captions for COCO evaluation.
    # val_dir = _util.get_dataset_by_name(dataset, mode='val')
    # val_reference_txt_path = os.path.join(val_dir, 'reference.json')
    # val_prediction_txt_path = os.path.join(val_dir, 'prediction.txt')
    # reference = COCO(val_reference_txt_path)
    eval_mode = 'val'
    eval_dir = _util.get_dataset_by_name(dataset, mode=eval_mode)
    test_reference_txt_path = os.path.join(eval_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(eval_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)
    print("Evaluating on '{}'".format(eval_dir))

    # Initialize the model.
    banet = _models.BANet(a_feature_size, projected_size, mid_size, hidden_size,
                          max_frames, max_words, use_cuda=use_cuda)

    # Load model weights if possible.
    if use_ckpt:
        pretrained_path = os.path.join(_util.get_raw_dataset_by_name('MSRVTT'), 'pretrained_weights.pth')
        weights = torch.load(pretrained_path)
        # REVIEW josephz: Figure out how to load the decoder weights partially:
        #   https://discuss.pytorch.org/t/how-to-load-part-of-pre-trained-model/1113/6
        del weights['decoder.word_embed.weight']
        del weights['decoder.word_restore.bias']
        del weights['decoder.word_restore.weight']
        banet.load_state_dict(weights, strict=False)
    if use_cuda:
        banet.cuda()

    # Initialize loss and optimizer.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(banet.parameters(), lr=learning_rate)
    if os.path.exists(optimizer_pth_path) and use_ckpt:
        optimizer.load_state_dict(torch.load(optimizer_pth_path))

    # Initialize dataloaders.
    train_loader = _data.get_train_dataloader(dataset, batch_size=batch_size)
    eval_loader = _data.get_eval_dataloader(dataset, eval_mode, batch_size=batch_size)
    num_train_steps = len(train_loader)
    num_eval_steps = len(eval_loader)

    # Begin training loop.
    print("Training Configuration:")
    print("\tLearning Rate: '{0:.4f}'".format(learning_rate))
    print("\tScheduled Sampling:")
    print("\t\tMin Teacher-Forcing Ratio: '{0:.4f}'".format(min_ss))
    print("\t\tScheduled Factor: '{0:.4f}'".format(ss_factor))
    print("\tBatch Size: '{}'".format(batch_size))
    print("\tEpochs: '{}'".format(num_epochs))
    print("\tDataset: '{}'".format(dataset))
    print("\tCheckpoint Path: '{}'".format(ckpt_path))

    best_meteor = 0
    loss_count = 0
    for epoch in range(num_epochs):
        # Anneal the teacher-forcing ratio with an inverse-sigmoid schedule, clamped at min_ss.
        epsilon = max(min_ss, ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
        print('epoch:%d\tepsilon:%.8f' % (epoch, epsilon))
        _tb_logger.log_value('epsilon', epsilon, epoch)

        for i, (videos, captions, cap_lens, video_ids) in tqdm.tqdm(
                enumerate(train_loader, start=1), total=num_train_steps):
            if use_cuda:
                videos = videos.cuda()
                targets = captions.cuda()
            else:
                targets = captions

            # Zero the gradients and run the encoder-decoder model.
            optimizer.zero_grad()
            outputs, video_encoded = banet(videos, targets,
                                           teacher_forcing_ratio=epsilon, use_argmax=use_argmax)

            # NOTE: Usually the last batch is smaller than the selected batch_size, so we dynamically
            #   compute the correct batch size to use here rather than throwing away the last
            #   training batch.
            bsz = len(targets)

            # Un-pad and flatten the outputs and labels.
            outputs = torch.cat([outputs[j][:cap_lens[j]] for j in range(bsz)], dim=0)
            targets = torch.cat([targets[j][:cap_lens[j]] for j in range(bsz)], dim=0)
            outputs = outputs.view(-1, vocab_size)
            targets = targets.view(-1)

            # Compute loss for back-propagation.
            # assert all(targets > 0) and all(outputs > 0)
            loss = criterion(outputs, targets)
            loss_val = loss.item()
            _tb_logger.log_value('loss', loss_val, epoch * num_train_steps + i)
            loss_count += loss_val
            # REVIEW josephz: Is there grad_norm?
            loss.backward()
            optimizer.step()

            eval_steps = 25
            if i % eval_steps == 0 or bsz < batch_size:
                # Report the running mean loss and perplexity, then print a few sampled captions.
                loss_count /= eval_steps if bsz == batch_size else i % eval_steps
                perplexity = np.exp(loss_count)
                print('Epoch [%d/%d]:\n\tStep [%d/%d]\n\tLoss: %.4f\n\tPerplexity: %5.4f' %
                      (epoch, num_epochs, i, num_train_steps, loss_count, perplexity))
                _tb_logger.log_value('perplexity', perplexity, epoch * num_train_steps + i)
                loss_count = 0

                tokens = banet.decoder.sample(video_encoded)
                for j in range(5):
                    we = vocab().decode(tokens.data[j].squeeze())
                    gt = vocab().decode(captions[j].squeeze())
                    print('\t\t[vid_id={}]'.format(video_ids[j]))
                    print('\t\t\tWE: %s\n\t\t\tGT: %s' % (we, gt))

        # Finally, compute evaluation metrics and save the best models.
        if epoch % ckpt_freq == 0:
            # Save epoch checkpoint.
            banet_pth_path = banet_pth_path_fmt.format(epoch, num_epochs)
            print("Saving checkpoints to '{}'".format(banet_pth_path))
            torch.save(banet.state_dict(), banet_pth_path)
            torch.save(optimizer.state_dict(), optimizer_pth_path)

            # Compute evaluation.
            banet.eval()
            print("Computing metrics...")
            metrics = _train.eval_step(eval_loader, banet, test_prediction_txt_path,
                                       reference, use_cuda=use_cuda)
            for k, v in metrics.items():
                _tb_logger.log_value(k, v, epoch)
                if k == 'METEOR' and v > best_meteor:
                    # Save the best model based on the METEOR metric.
                    # For reference, see https://www.cs.cmu.edu/~alavie/papers/BanerjeeLavie2005-final.pdf
                    print("Saving best checkpoint of metric: '{}'".format(v))
                    shutil.copy2(banet_pth_path, best_banet_pth_path)
                    shutil.copy2(optimizer_pth_path, best_optimizer_pth_path)
                    best_meteor = v
            banet.train()
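# Illustrative sketch (not part of the original source) of the scheduled-sampling schedule
# used in train(): the teacher-forcing ratio epsilon follows an inverse-sigmoid decay in the
# epoch index, clamped below by min_ss, i.e. epsilon = max(min_ss, k / (k + exp(epoch / k)))
# with k = ss_factor. With the defaults (ss_factor=24, min_ss=0.6), epsilon starts at
# 24 / 25 = 0.96 and bottoms out at 0.6.
def _example_ss_schedule(num_epochs: int = 100, ss_factor: int = 24, min_ss: float = 0.6) -> list:
    return [max(min_ss, ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
            for epoch in range(num_epochs)]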
def evaluate(raw: str, dataset: str, mode: str, weights_path: str,
             batch_size: int = 64, use_cuda: bool = False) -> None:
    dataset_dir = _util.get_dataset_by_name(dataset, mode)
    raw_dir = _util.get_raw_dataset_by_name(raw, mode)

    # Recover the hyperparameters the checkpoint was trained with.
    model, run, args, weights_path = _util.get_params_by_weights_path(weights_path)
    a_feature_size = int(args["a_feature_size"])
    projected_size = int(args["projected_size"])
    mid_size = int(args["mid_size"])
    hidden_size = int(args["hidden_size"])
    max_frames = int(args["max_frames"])
    max_words = int(args["max_words"])

    banet = _models.BANet(a_feature_size, projected_size, mid_size, hidden_size,
                          max_frames, max_words, use_cuda=use_cuda)
    pretrained_path = os.path.join(weights_path, "weights.pth")
    weights = torch.load(pretrained_path)
    banet.load_state_dict(weights)
    if use_cuda:
        banet.cuda()

    print("Computing metrics...")
    eval_loader = _data.get_eval_dataloader(dataset, mode, batch_size=batch_size)
    test_reference_txt_path = os.path.join(dataset_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(dataset_dir, 'prediction.txt')

    reference = COCO(test_reference_txt_path)
    _train.eval_step(eval_loader, banet, test_prediction_txt_path, reference, use_cuda=use_cuda)

    # Must switch to a new loader which provides captions.
    eval_loader = _data.get_dataloader(dataset, mode, batch_size=batch_size)
    for i, (videos, captions, cap_lens, video_ids) in tqdm(enumerate(eval_loader, start=1),
                                                           total=len(eval_loader)):
        if use_cuda:
            videos = videos.cuda()

        video_encoded = banet.encoder(videos)
        tokens = banet.decoder.sample(video_encoded)

        # vid_paths = [os.path.join(raw_dir, "{}.mp4".format(video_id)) for video_id in video_ids]
        for j in range(len(tokens)):
            # vid = imageio.get_reader(vid_paths[j]).iter_data()
            print('[vid_id={}]'.format(video_ids[j]))
            print("gt :", vocab().decode(captions[j]))
            print("pred:", vocab().decode(tokens.data[j].squeeze()))
            print()
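# Illustrative usage only: evaluate a trained checkpoint on the test split. The weights_path
# below is a placeholder for a checkpoint directory produced by train(); it must contain
# weights.pth and encode the hyperparameters recoverable via _util.get_params_by_weights_path.
def _example_evaluate() -> None:
    evaluate("MSRVTT", "MSRVTT-cache", "test",
             weights_path="path/to/checkpoint/dir", batch_size=64, use_cuda=True)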