Example #1
def extract_features(raw, dataset, mode, max_frames=-1, overwrite=False, aencoder=AppearanceEncoder(), mencoder=MotionEncoder()):
    """
    Builds appearance and motion features for a list of videos.

    :param raw: Raw dataset of videos for which to extract features.
    :param dataset: Dataset in which to place the resultant features.
    :param mode: Dataset mode (train, val, test).
    :param max_frames: Maximum number of allowable frames in a given video.
    :param overwrite: Unless this flag is specified, will fail rather than overwrite existing cache.
    :param aencoder: Encoder used for appearance.
    :param mencoder: Encoder used for motion.
    :return: Numpy array of features with shape [len(videos), max_frames, aencoder.feature_size() + mencoder.feature_size()],
    where index i corresponds to the ith video in sorted(videos).
    """
    assert isinstance(raw, str)
    assert isinstance(dataset, str)
    assert isinstance(mode, str) and mode in ("train", "val", "test"), \
        "Extraction mode must be train, val, or test; got {}".format(mode)
    assert isinstance(max_frames, int) and max_frames > 0, "max_frames must be a positive integer"
    assert isinstance(aencoder, nn.Module)
    assert isinstance(mencoder, nn.Module)

    raw_dir = _util.get_raw_dataset_by_name(raw)
    dataset_dir = _util.get_dataset_by_name(dataset, mode=mode, create=True)

    video_ids = sorted(set(_util.load_array(dataset_dir, "video_ids")))
    videos = [os.path.join(raw_dir, mode, "{}.mp4".format(video_id)) for video_id in video_ids]

    for video_path in videos:
        assert os.path.exists(video_path), "Cannot find mp4 video @ {}".format(video_path)

    aencoder = aencoder.cuda(1)
    # mencoder = mencoder#.cuda(0)
    # num_features = aencoder.feature_size() + mencoder.feature_size()
    num_features = aencoder.feature_size()

    features = np.zeros((len(videos), max_frames, num_features), dtype=np.float32)
    for i, video_path in enumerate(tqdm(videos)):
        frames, _ = sample_frames(video_path, max_frames)

        frames = np.array([preprocess_frame(f) for f in frames])
        frames = frames.transpose(0, 3, 1, 2)
        frames = torch.from_numpy(frames).cuda(1)

        # clips = np.array([[preprocess_frame(f) for f in clip] for clip in clips])
        # clips = clips.transpose(0, 4, 1, 2, 3)
        # clips = torch.from_numpy(clips).cuda(0)

        af = aencoder.forward(frames)
        # mf = mencoder.forward(clips)

        af = af.cpu().detach().numpy()
        # mf = mf.cpu().detach().numpy()

        features[i, :frames.shape[0], :] = af  #np.concatenate([af, mf], axis=1)

    _util.dump_array(dataset_dir, "frames", 100, features, overwrite=overwrite)
    return features
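
A minimal invocation sketch, assuming the MSR-VTT naming used elsewhere in these examples; the dataset names and max_frames value are illustrative, and the -1 default for max_frames must be overridden to satisfy the assertion above.

features = extract_features(
    raw="MSRVTT",       # raw video dataset registered with _util
    dataset="MSRVTT",   # processed dataset that will receive the cached features
    mode="train",
    max_frames=30,      # must be a positive integer; the -1 default fails the assert
    overwrite=True,
)
print(features.shape)   # (num_videos, max_frames, aencoder.feature_size())
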
Example #2
    def __init__(self, dataset: str, mode: str) -> None:
        super().__init__()

        dataset_dir = _util.get_dataset_by_name(dataset, mode)
        self._features = _util.load_array(dataset_dir, "frames")
        self._video_ids = _util.load_array(dataset_dir, "video_ids")
        self._vid2idx = {
            video_id: idx
            for idx, video_id in enumerate(sorted(set(self._video_ids)))
        }
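
The excerpt shows only the constructor. Below is a sketch of the companion __len__/__getitem__ such a torch-style dataset presumably pairs with; these method bodies are assumptions, not the repo's code. They use the _vid2idx mapping to look up the feature row that extract_features caches once per unique, sorted video id.

    def __len__(self):
        return len(self._video_ids)

    def __getitem__(self, idx):
        video_id = self._video_ids[idx]
        # Features are stored once per unique video id (sorted order, as in
        # extract_features above); map the repeated id back to its feature row.
        return self._features[self._vid2idx[video_id]], video_id
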
Example #3
def vocab(dataset: Optional[str] = "MSRVTT", threshold: int = 3):
    if getattr(vocab, "inst", None) is None:  # cache the loaded Vocabulary on a function attribute
        if dataset is None:
            vocab.inst = Vocabulary(threshold)
        else:
            dataset_dir = _util.get_dataset_by_name(dataset)
            with open(os.path.join(dataset_dir, Vocabulary.PICKLE_FILE),
                      'rb') as f:
                vocab.inst = pickle.load(f)
            if not isinstance(vocab.inst, Vocabulary):
                raise TypeError('Pickled object @ {} not of type {}'.format(
                    dataset_dir, Vocabulary))
    return vocab.inst
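
A usage sketch of the function-attribute caching above. The excerpt never initializes vocab.inst; presumably the original module sets it to None right after the definition (with the getattr guard above, the first call also works without that line). The sketch assumes the "MSRVTT" dataset cache and its pickled Vocabulary already exist on disk.

vocab.inst = None                  # module-level initialization assumed in the original source

v = vocab("MSRVTT", threshold=3)   # first call unpickles the Vocabulary from the dataset dir
assert vocab() is v                # subsequent calls return the cached instance
vocab_size = len(v)                # vocabulary size, as used by train() below
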
Example #4
def _build_cache(dataset: str, mode: str, sentences: List[str],
                 video_ids: List[str], vocab: Vocabulary, max_words: int,
                 overwrite: bool) -> np.ndarray:
    dataset_dir = _util.get_dataset_by_name(dataset, mode, create=True)

    captions = []
    cap_lens = []
    _logger.info("Building {} cache...".format(mode))
    for i, sentence in enumerate(tqdm(sentences)):
        caption = [vocab[Token.START]]

        caption += map(vocab.__getitem__,
                       map(str.lower, nltk.tokenize.word_tokenize(sentence)))
        cap_lens.append(len(caption) + 1)  # plus one for Token.END

        if len(caption) >= max_words:
            _logger.warning("Truncating caption {} from {} words to {}".format(
                i,
                len(caption) + 1, max_words))
            caption = caption[:max_words - 1]
        caption.append(vocab[Token.END])

        caption += [vocab[Token.PAD]] * (max_words - len(caption))

        assert len(caption) == max_words
        captions.append(caption)

    captions = np.array(captions)
    cap_lens = np.array(cap_lens)
    video_ids = np.array(video_ids)

    _logger.info("Saving cache...")
    _util.dump_array(dataset_dir,
                     "captions",
                     10000,
                     captions,
                     overwrite=overwrite)
    _util.dump_array(dataset_dir,
                     "cap_lens",
                     100000,
                     cap_lens,
                     overwrite=overwrite)
    _util.dump_array(dataset_dir,
                     "video_ids",
                     100000,
                     video_ids,
                     overwrite=overwrite)

    return captions
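
A toy illustration of the caption layout produced by the loop above. The token strings and ids below are stand-ins for the repo's Token.START/END/PAD entries and pickled Vocabulary.

toy_vocab = {"<start>": 1, "<end>": 2, "<pad>": 0, "a": 4, "man": 5, "is": 6, "cooking": 7}
max_words = 8

caption = [toy_vocab["<start>"]] + [toy_vocab[w] for w in "a man is cooking".split()]
cap_len = len(caption) + 1                           # +1 accounts for <end>
caption.append(toy_vocab["<end>"])
caption += [toy_vocab["<pad>"]] * (max_words - len(caption))

print(caption)   # [1, 4, 5, 6, 7, 2, 0, 0]
print(cap_len)   # 6 -- padding is excluded from the recorded length
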
Example #5
def build_cache(raw: str,
                dataset: str,
                threshold: int,
                max_words: int,
                train_fraction: float = 1.,
                overwrite: bool = False) -> None:
    """
    Builds caption cache files for a raw dataset based on annotations.json.

    :param raw: Raw dataset of videos for which to build cache.
    :param dataset: Dataset in which to place the resultant cache files.
    :param threshold: Number of occurrences under which a token will be unk'd.
    :param max_words: Maximum number of allowable words in a caption (will pad to this length).
    :param train_fraction: Amount of training annotations to use, defaults to 1.0 (all of train).
    :param overwrite: Unless this flag is specified, will fail rather than overwrite existing cache.
    """
    assert isinstance(raw, str)
    assert isinstance(dataset, str)
    assert isinstance(train_fraction, float) and 0 < train_fraction <= 1.
    assert isinstance(
        max_words,
        int) and max_words > 0, "max_words must be a positive integer"

    train_dir = _util.get_raw_dataset_by_name(raw, mode="train")
    val_dir = _util.get_raw_dataset_by_name(raw, mode="val")
    test_dir = _util.get_raw_dataset_by_name(raw, mode="test")

    train_ann = os.path.join(train_dir, "annotations.json")
    val_ann = os.path.join(val_dir, "annotations.json")
    test_ann = os.path.join(test_dir, "annotations.json")

    assert os.path.exists(
        train_ann
    ), "Could not find train annotations.json in raw dataset {}".format(raw)
    assert os.path.exists(
        val_ann
    ), "Could not find val annotations.json in raw dataset {}".format(raw)
    assert os.path.exists(
        test_ann
    ), "Could not find test annotations.json in raw dataset {}".format(raw)

    with open(train_ann, 'r') as f:
        train_ann = json.load(f)
    with open(val_ann, 'r') as f:
        val_ann = json.load(f)
    with open(test_ann, 'r') as f:
        test_ann = json.load(f)

    train_sentences = []
    train_video_ids = []
    for sentence in train_ann["sentences"]:
        train_video_ids.append(sentence["video_id"])
        train_sentences.append(sentence["caption"])

    val_sentences = []
    val_video_ids = []
    for sentence in val_ann["sentences"]:
        val_video_ids.append(sentence["video_id"])
        val_sentences.append(sentence["caption"])

    test_sentences = []
    test_video_ids = []
    for sentence in test_ann["sentences"]:
        test_video_ids.append(sentence["video_id"])
        test_sentences.append(sentence["caption"])

    all_sentences = train_sentences + val_sentences + test_sentences
    vocab = build_vocabulary(all_sentences, threshold)

    last_train_idx = int(len(train_ann["sentences"]) * train_fraction)
    _logger.info("Using {} of {} total train sentences".format(
        last_train_idx, len(train_ann["sentences"])))

    _build_cache(dataset, "train", train_sentences[:last_train_idx],
                 train_video_ids[:last_train_idx], vocab, max_words, overwrite)
    val_sentences = _build_cache(dataset, "val", val_sentences, val_video_ids,
                                 vocab, max_words, overwrite)
    test_sentences = _build_cache(dataset, "test", test_sentences,
                                  test_video_ids, vocab, max_words, overwrite)

    with open(
            os.path.join(_util.get_dataset_by_name(dataset),
                         Vocabulary.PICKLE_FILE), 'wb') as f:
        pickle.dump(vocab, f)

    prepare_gt(
        os.path.join(_util.get_dataset_by_name(dataset, mode="val"),
                     "reference.json"), val_sentences, val_video_ids)
    prepare_gt(
        os.path.join(_util.get_dataset_by_name(dataset, mode="test"),
                     "reference.json"), test_sentences, test_video_ids)
Example #6
def train(
    # General training hyperparameters.
    dataset: str,
    num_epochs: int = 100,
    batch_size: int = 128,

    # Learning rate schedulers.
    learning_rate: float = 3e-4,
    ss_factor: int = 24,
    min_ss: float = 0.6,

    # Representation hyperparameters.
    projected_size: int = 500,
    hidden_size: int = 1024,  # Hidden size of the recurrent cells.
    mid_size: int = 128,  # Dimension of the boundary detection layer.

    # REVIEW josephz: Remove this?
    # frame_shape: tuple=(3, 224, 224),  # Video frame shape.
    a_feature_size: int = 2048,  # Appearance model feature-dimension size.
    # REVIEW josephz: Remove this?
    # m_feature_size=4096,  # Motion model feature-dimension size.

    # Maximum-size hyperparameters.
    # frame_sample_rate: int=10,  # Sample rate of video frames.
    max_frames: int = 30,  # Maximum length of the video-frame sequence.
    max_words: int = 30,  # Maximum length of the caption-word sequence.

    # Misc hyperparameters.
    ckpt_freq: int = 3,
    use_cuda: bool = False,
    use_ckpt: bool = False,
    use_argmax: bool = False,
    seed: int = 0,
):
    """

    Args:
        dataset (str): Dataset to train on.
        num_epochs (int): Number of epochs to train for.
        batch_size (int): Batch size to train with.

        learning_rate (float): Learning rate.
        ss_factor (int): Scheduled Sampling factor, to compute a teacher-forcing ratio.
        min_ss (float): Minimum teacher-forcing ratio.

        projected_size (int): Projection size for the Encoder-Decoder model.
        hidden_size (int): Hidden state size for the recurrent network in the encoder.
        mid_size (int): Hidden state size for the Boundary Detector network in the encoder.
        a_feature_size: Input feature size for the Encoder network.

        max_frames (int): Maximum length of the video-frame sequence.
        max_words (int): Maximum length of the caption-word sequence.

        ckpt_freq (int): Frequency to compute evaluation metrics and save checkpoint.
        use_cuda (bool): Flag whether to use CUDA devices.
        use_ckpt (bool): Flag on whether to load checkpoint if possible.
        use_argmax (bool): Flag on whether to use greedy or multinomial sampling during decoding.
        seed (int): Random seed.

    Effects:
        We will have several outputs:
            - Checkpoints (model weights)
            - Logs (tensorboard logs)
    """
    # Set seeds.
    torch.random.manual_seed(seed)
    np.random.seed(seed)

    # Prepare output paths.
    # REVIEW josephz: This is unbelievably hacky, but we want an easy way to allow the user to set and track
    #   hyperparameters using the cmd_line interface? This should probably be abstracted in utility.py.
    hparams = locals()
    params = {
        arg_name: hparams[arg_name]
        for arg_name in inspect.signature(train).parameters.keys()
    }

    ckpt_path = _util.get_weights_path_by_param(reuse=False, **params)
    print(
        "Saving checkpoints to '{ckpt_path}', you may visualize in tensorboard with the following: \n\n\t`tensorboard --logdir={ckpt_path}`\n"
        .format(ckpt_path=ckpt_path))

    # Setup logging paths.
    log_path = os.path.join(ckpt_path, 'logs')
    _util.mkdir(log_path)
    _tb_logger.configure(log_path, flush_secs=10)

    # REVIEW josephz: Todo, clean this up.
    banet_pth_path_fmt = os.path.join(ckpt_path, '{:04d}_{:04d}.pth')
    best_banet_pth_path = os.path.join(ckpt_path, 'weights.pth')
    optimizer_pth_path = os.path.join(ckpt_path, 'optimizer.pth')
    best_optimizer_pth_path = os.path.join(ckpt_path, 'best_optimizer.pth')

    # Load Vocabulary.
    vocab_size = len(vocab())

    # Load Reference for COCO.
    # val_dir = _util.get_dataset_by_name(dataset, mode='val')
    # val_reference_txt_path = os.path.join(val_dir, 'reference.json')
    # val_prediction_txt_path = os.path.join(val_dir, 'prediction.txt')
    # reference = COCO(val_reference_txt_path)

    eval_mode = 'val'
    eval_dir = _util.get_dataset_by_name(dataset, mode=eval_mode)
    test_reference_txt_path = os.path.join(eval_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(eval_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)
    print("Evaluating on '{}'".format(eval_dir))

    # Initialize the model.
    banet = _models.BANet(a_feature_size,
                          projected_size,
                          mid_size,
                          hidden_size,
                          max_frames,
                          max_words,
                          use_cuda=use_cuda)

    # Load model weights if possible.
    if use_ckpt:
        pretrained_path = os.path.join(_util.get_raw_dataset_by_name('MSRVTT'),
                                       'pretrained_weights.pth')
        weights = torch.load(pretrained_path)

        # REVIEW josephz: Figure out how to do the decoder weights partially:
        #   https://discuss.pytorch.org/t/how-to-load-part-of-pre-trained-model/1113/6
        del weights['decoder.word_embed.weight']
        del weights['decoder.word_restore.bias']
        del weights['decoder.word_restore.weight']
        banet.load_state_dict(weights, strict=False)
    if use_cuda:
        banet.cuda()

    # Initialize loss and optimizer.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(banet.parameters(), lr=learning_rate)
    if os.path.exists(optimizer_pth_path) and use_ckpt:
        optimizer.load_state_dict(torch.load(optimizer_pth_path))

    # Initialize Dataloaders.
    train_loader = _data.get_train_dataloader(dataset, batch_size=batch_size)
    eval_loader = _data.get_eval_dataloader(dataset,
                                            eval_mode,
                                            batch_size=batch_size)

    num_train_steps = len(train_loader)
    num_eval_steps = len(eval_loader)

    # Begin Training Loop.
    print("Training Configuration:")
    print("\tLearning Rate: '{0:.4f}'".format(learning_rate))
    print("\tScheduled Sampling:")
    print("\t\tMax Teacher Forcing Rate: '{0:.4f}'".format(min_ss))
    print("\t\tScheduled Factor: '{0:.4f}'".format(ss_factor))
    print("\tBatch Size: '{}'".format(batch_size))
    print("\tEpochs: '{}'".format(num_epochs))
    print("\tDataset: '{}'".format(dataset))
    print("\tCheckpoint Path: '{}'".format(ckpt_path))

    best_meteor = 0
    loss_count = 0
    for epoch in range(num_epochs):
        epsilon = max(min_ss,
                      ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
        print('epoch:%d\tepsilon:%.8f' % (epoch, epsilon))
        _tb_logger.log_value('epsilon', epsilon, epoch)

        for i, (videos, captions, cap_lens,
                video_ids) in tqdm.tqdm(enumerate(train_loader, start=1),
                                        total=num_train_steps):
            if use_cuda:
                videos = videos.cuda()
                targets = captions.cuda()
            else:
                targets = captions

            # Zero the gradients and run the encoder-decoder model.
            optimizer.zero_grad()
            outputs, video_encoded = banet(videos,
                                           targets,
                                           teacher_forcing_ratio=epsilon,
                                           use_argmax=use_argmax)

            # NOTE: Usually the last batch is less than the selected batch_size, so we dynamically
            #       compute the correct batch_size to use here, rather than throwing away the last
            #       training batch.
            bsz = len(targets)

            # Un-pad and flatten the outputs and labels.
            outputs = torch.cat([outputs[j][:cap_lens[j]] for j in range(bsz)],
                                dim=0)
            targets = torch.cat([targets[j][:cap_lens[j]] for j in range(bsz)],
                                dim=0)

            outputs = outputs.view(-1, vocab_size)
            targets = targets.view(-1)

            # Compute loss for back-propagation.
            # assert all(targets > 0) and all(outputs > 0)
            loss = criterion(outputs, targets)
            loss_val = loss.item()
            _tb_logger.log_value('loss', loss_val, epoch * num_train_steps + i)
            loss_count += loss_val
            # REVIEW josephz: Is there grad_norm?
            loss.backward()
            optimizer.step()

            eval_steps = 25
            if i % eval_steps == 0 or bsz < batch_size:
                # Normalize by the number of batches accumulated since the last reset
                # (avoids a zero divisor when a smaller final batch lands on a multiple of eval_steps).
                loss_count /= eval_steps if i % eval_steps == 0 else i % eval_steps
                perplexity = np.exp(loss_count)
                print(
                    'Epoch [%d/%d]:\n\tStep [%d/%d]\n\tLoss: %.4f\n\tPerplexity: %5.4f'
                    % (epoch, num_epochs, i, num_train_steps, loss_count,
                       perplexity))
                _tb_logger.log_value('perplexity', perplexity,
                                     epoch * num_train_steps + i)
                loss_count = 0
                tokens = banet.decoder.sample(video_encoded)
                for j in range(min(5, bsz)):  # the final batch may hold fewer than 5 samples
                    we = vocab().decode(tokens.data[j].squeeze())
                    gt = vocab().decode(captions[j].squeeze())
                    print('\t\t[vid_id={}]'.format(video_ids[j]))
                    print('\t\t\tWE: %s\n\t\t\tGT: %s' % (we, gt))

        # Finally, compute evaluation metrics and save the best models.
        if epoch % ckpt_freq == 0:
            # Save epoch checkpoint.
            banet_pth_path = banet_pth_path_fmt.format(epoch, num_epochs)
            print("Saving checkpoints to '{}'".format(banet_pth_path))
            torch.save(banet.state_dict(), banet_pth_path)
            torch.save(optimizer.state_dict(), optimizer_pth_path)

            # Compute evaluation.
            banet.eval()
            print("Computing Metrics:...")
            metrics = _train.eval_step(eval_loader,
                                       banet,
                                       test_prediction_txt_path,
                                       reference,
                                       use_cuda=use_cuda)
            for k, v in metrics.items():
                _tb_logger.log_value(k, v, epoch)
                if k == 'METEOR' and v > best_meteor:
                    # Save the best model based on the METEOR metric.
                    # For reference, see https://www.cs.cmu.edu/~alavie/papers/BanerjeeLavie2005-final.pdf
                    print("Saving best checkpoint of metric: '{}'".format(
                        best_meteor))
                    shutil.copy2(banet_pth_path, best_banet_pth_path)
                    shutil.copy2(optimizer_pth_path, best_optimizer_pth_path)
                    best_meteor = v
            banet.train()
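
The scheduled-sampling curve computed at the top of each epoch can be previewed in isolation. A small sketch with the defaults from the signature above (ss_factor=24, min_ss=0.6):

import numpy as np

ss_factor, min_ss = 24, 0.6
for epoch in (0, 25, 50, 75, 100):
    epsilon = max(min_ss, ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
    print(epoch, round(epsilon, 3))
# The teacher-forcing ratio starts at 24/25 = 0.96 and decays toward the min_ss floor,
# so later epochs feed more of the model's own samples back into the decoder.
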
Example #7
def evaluate(raw: str,
             dataset: str,
             mode: str,
             weights_path: str,
             batch_size: int = 64,
             use_cuda: bool = False) -> None:
    dataset_dir = _util.get_dataset_by_name(dataset, mode)
    raw_dir = _util.get_raw_dataset_by_name(raw, mode)

    model, run, args, weights_path = _util.get_params_by_weights_path(
        weights_path)

    a_feature_size = int(args["a_feature_size"])
    projected_size = int(args["projected_size"])
    mid_size = int(args["mid_size"])
    hidden_size = int(args["hidden_size"])
    max_frames = int(args["max_frames"])
    max_words = int(args["max_words"])
    banet = _models.BANet(a_feature_size,
                          projected_size,
                          mid_size,
                          hidden_size,
                          max_frames,
                          max_words,
                          use_cuda=use_cuda)

    pretrained_path = os.path.join(weights_path, "weights.pth")
    weights = torch.load(pretrained_path)
    banet.load_state_dict(weights)
    if use_cuda:
        banet.cuda()

    print("Computing metrics...")
    eval_loader = _data.get_eval_dataloader(dataset,
                                            mode,
                                            batch_size=batch_size)
    test_reference_txt_path = os.path.join(dataset_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(dataset_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)

    _train.eval_step(eval_loader,
                     banet,
                     test_prediction_txt_path,
                     reference,
                     use_cuda=use_cuda)

    # Must switch to a new loader which provides captions.
    eval_loader = _data.get_dataloader(dataset, mode, batch_size=batch_size)
    for i, (videos, captions, cap_lens,
            video_ids) in tqdm(enumerate(eval_loader, start=1),
                               total=len(eval_loader)):
        if use_cuda:
            videos = videos.cuda()

        video_encoded = banet.encoder(videos)
        tokens = banet.decoder.sample(video_encoded)

        # vid_paths = [os.path.join(raw_dir, "{}.mp4".format(video_id)) for video_id in video_ids]

        for j in range(len(tokens)):
            # vid = imageio.get_reader(vid_paths[j]).iter_data()

            print('[vid_id={}]'.format(video_ids[j]))
            print("gt  :", vocab().decode(captions[j]))
            print("pred:", vocab().decode(tokens.data[j].squeeze()))
            print()
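
A minimal invocation sketch for evaluate. The checkpoint directory layout depends on _util.get_params_by_weights_path, so the path below is purely illustrative.

evaluate(
    raw="MSRVTT",
    dataset="MSRVTT",
    mode="test",
    weights_path="/path/to/checkpoint_dir",   # hypothetical; must contain weights.pth
    batch_size=64,
    use_cuda=True,
)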