Code example #1
File: train.py  Project: Worm4047/TVR
def start_training():
    logger.info("Setup config, data and model...")
    opt = BaseOptions().parse()
    set_seed(opt.seed)
    if opt.debug:  # keep the model running deterministically
        # 'cudnn.benchmark = True' enables auto-tuning to find the best algorithm for a specific input/net config.
        # Enable it only when the input size is fixed.
        cudnn.benchmark = False
        cudnn.deterministic = True

    opt.writer = SummaryWriter(opt.tensorboard_log_dir)
    opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n"
    opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n"

    train_dataset = ExCLDataset(
        dset_name=opt.dset_name,
        data_path=opt.train_path,
        desc_bert_path_or_handler=opt.desc_bert_path,
        sub_bert_path_or_handler=opt.sub_bert_path,
        max_desc_len=opt.max_desc_l,
        max_ctx_len=opt.max_ctx_l,
        vid_feat_path_or_handler=opt.vid_feat_path,
        clip_length=opt.clip_length,
        ctx_mode=opt.ctx_mode,
        h5driver=opt.h5driver,
        data_ratio=opt.data_ratio,
        normalize_vfeat=not opt.no_norm_vfeat,
        normalize_tfeat=not opt.no_norm_tfeat,
    )

    if opt.eval_path is not None:
        eval_dataset = ExCLDataset(
            dset_name=opt.dset_name,
            data_path=opt.eval_path,
            desc_bert_path_or_handler=train_dataset.desc_bert_h5,
            sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None,
            max_desc_len=opt.max_desc_l,
            max_ctx_len=opt.max_ctx_l,
            vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None,
            clip_length=opt.clip_length,
            ctx_mode=opt.ctx_mode,
            h5driver=opt.h5driver,
            data_ratio=opt.data_ratio,
            normalize_vfeat=not opt.no_norm_vfeat,
            normalize_tfeat=not opt.no_norm_tfeat,
            video_duration_idx_path=opt.video_duration_idx_path,
            eval_split_name=opt.eval_split_name
        )
    else:
        eval_dataset = None

    model_config = EDict(
        visual_input_size=opt.vid_feat_size,
        sub_input_size=opt.sub_feat_size,  # for both desc and subtitles
        query_input_size=opt.q_feat_size,  # for both desc and subtitles
        hidden_size=opt.hidden_size,
        drop=opt.drop,
        ctx_mode=opt.ctx_mode,  # video, sub or video_sub
        initializer_range=opt.initializer_range
    )
    logger.info("model_config {}".format(model_config))
    model = EXCL(model_config)
    count_parameters(model)
    logger.info("Start Training...")
    train(model, train_dataset, eval_dataset, opt)
    return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
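Note: model_config is built with easydict's EasyDict (imported as EDict), so the model code can read every option as an attribute as well as a dict key. A minimal standalone sketch of that behavior (not part of the project code):

from easydict import EasyDict as EDict

cfg = EDict(hidden_size=256, drop=0.1, ctx_mode="video_sub")
assert cfg.hidden_size == 256     # attribute-style access
assert cfg["drop"] == 0.1         # plain dict access still works
cfg.initializer_range = 0.02      # new fields can be added either way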
Code example #2
def main():
    opt = get_args()

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    train_dataset = RCDataset(dset_name=opt.dset_name,
                              data_dir=opt.data_dir,
                              video_feature_dir=opt.video_feature_dir,
                              duration_file=opt.v_duration_file,
                              word2idx_path=opt.word2idx_path,
                              max_t_len=opt.max_t_len,
                              max_v_len=opt.max_v_len,
                              max_n_sen=opt.max_n_sen,
                              mode="train",
                              recurrent=opt.recurrent,
                              untied=opt.untied or opt.mtrans)
    # add 10 to max_n_sen so that the inference stage uses all the segments
    val_dataset = RCDataset(dset_name=opt.dset_name,
                            data_dir=opt.data_dir,
                            video_feature_dir=opt.video_feature_dir,
                            duration_file=opt.v_duration_file,
                            word2idx_path=opt.word2idx_path,
                            max_t_len=opt.max_t_len,
                            max_v_len=opt.max_v_len,
                            max_n_sen=opt.max_n_sen + 10,
                            mode="val",
                            recurrent=opt.recurrent,
                            untied=opt.untied or opt.mtrans)

    if opt.recurrent:
        collate_fn = caption_collate
    else:  # single sentence (including untied)
        collate_fn = single_sentence_collate
    train_loader = DataLoader(train_dataset,
                              collate_fn=collate_fn,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_workers,
                              pin_memory=opt.pin_memory)
    val_loader = DataLoader(val_dataset,
                            collate_fn=collate_fn,
                            batch_size=opt.val_batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=opt.pin_memory)

    opt.vocab_size = len(train_dataset.word2idx)
    print(json.dumps(vars(opt), indent=4, sort_keys=True))

    device = torch.device("cuda" if opt.cuda else "cpu")
    rt_config = EDict(
        xl_grad=opt.xl_grad,  # enable back-propagation for transformerXL model
        hidden_size=opt.hidden_size,
        intermediate_size=opt.intermediate_size,  # after each self attention
        vocab_size=opt.vocab_size,  # get from word2idx
        word_vec_size=opt.word_vec_size,
        video_feature_size=opt.video_feature_size,
        max_position_embeddings=opt.max_v_len + opt.max_t_len,  # get from max_seq_len
        max_v_len=opt.max_v_len,  # max length of the videos
        max_t_len=opt.max_t_len,  # max length of the text
        type_vocab_size=opt.type_vocab_size,
        layer_norm_eps=opt.layer_norm_eps,  # bert layernorm
        hidden_dropout_prob=opt.hidden_dropout_prob,  # applies everywhere except attention
        num_hidden_layers=opt.num_hidden_layers,  # number of transformer layers
        num_attention_heads=opt.num_attention_heads,
        attention_probs_dropout_prob=opt.attention_probs_dropout_prob,  # applies only to self attention
        n_memory_cells=opt.n_memory_cells,  # memory size will be (n_memory_cells, D)
        memory_dropout_prob=opt.memory_dropout_prob,
        initializer_range=opt.initializer_range,
        label_smoothing=opt.label_smoothing,
        share_wd_cls_weight=opt.share_wd_cls_weight)
    if opt.recurrent:
        if opt.xl:
            logger.info("Use recurrent model - TransformerXL" +
                        " (with gradient)" if opt.xl_grad else "")
            model = TransformerXL(rt_config)
        else:
            logger.info("Use recurrent model - Mine")
            model = RecursiveTransformer(rt_config)
    else:  # single sentence, including untied
        if opt.untied:
            logger.info("Use untied non-recurrent single sentence model")
            model = NonRecurTransformerUntied(rt_config)
        elif opt.mtrans:
            logger.info(
                "Use masked transformer -- another non-recurrent single sentence model"
            )
            model = MTransformer(rt_config)
        else:
            logger.info("Use non-recurrent single sentence model")
            model = NonRecurTransformer(rt_config)

    if opt.glove_path is not None:
        if hasattr(model, "embeddings"):
            logger.info("Load GloVe as word embedding")
            model.embeddings.set_pretrained_embedding(
                torch.from_numpy(torch.load(opt.glove_path)).float(),
                freeze=opt.freeze_glove)
        else:
            logger.warning(
                "This model has no embeddings, cannot load glove vectors into the model"
            )

    count_parameters(model)
    if hasattr(model, "embeddings") and hasattr(model.embeddings,
                                                "word_embeddings"):
        count_parameters(model.embeddings.word_embeddings)

    train(model, train_loader, val_loader, device, opt)
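The rt_config EDict above is passed directly to the model constructors (TransformerXL, RecursiveTransformer, etc.), which read the options as attributes. A hedged sketch of that pattern with a made-up TinyHead module (illustrative only, not from this project):

import torch.nn as nn
from easydict import EasyDict as EDict

class TinyHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        # config may be an EasyDict or any object exposing these attributes
        self.drop = nn.Dropout(config.hidden_dropout_prob)
        self.proj = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, x):
        return self.proj(self.drop(x))

head = TinyHead(EDict(hidden_size=768, vocab_size=1000, hidden_dropout_prob=0.1))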
Code example #3
File: config.py  Project: ascust/wsscoseg
from easydict import EasyDict as EDict

conf = EDict()

conf.CLASS_NUM = 21  # in this case, the VOC dataset (20 classes + background)
conf.MEAN_RGB = (123, 117, 104)  # RGB order, not BGR
conf.WD = 5e-4
conf.MOMENTUM = 0.9
conf.WORKSPACE = 512
conf.DOWN_SAMPLE_SCALE = 8

# train the init model
conf.LR_INIT = 16e-4
conf.EPOCH_SIZE_INIT = 200
conf.MAX_EPOCH_INIT = 40
conf.BATCH_SIZE_INIT = 16
conf.CROP_SIZE_INIT = 320
conf.SCALE_RANGE_INIT = [0.7, 1.3]

# train the final model
conf.LR_FINAL = 16e-4
conf.EPOCH_SIZE_FINAL = 700
conf.MAX_EPOCH_FINAL = 40
conf.BATCH_SIZE_FINAL = 16
conf.CROP_SIZE_FINAL = 320
conf.SCALE_RANGE_FINAL = [0.7, 1.3]

# for evaluating the init and final models
conf.CPU_WORKER_NUM = 8

conf.EVAL_WAIT_TIME = 0.3  # in hours
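Because conf is a module-level EasyDict, other scripts can read the settings as attributes once the module is imported; a misspelled field raises AttributeError instead of silently returning a default. A small illustrative sketch (the import path config is an assumption):

from config import conf

lr = conf.LR_INIT                  # 0.0016
batch_size = conf.BATCH_SIZE_INIT  # 16
try:
    conf.BATCHSIZE_INIT            # typo
except AttributeError:
    print("unknown config field")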
Code example #4
File: train.py  Project: Worm4047/TVR
def start_training():
    logger.info("Setup config, data and model...")
    opt = BaseOptions().parse()
    set_seed(opt.seed)
    if opt.debug:  # keep the model running deterministically
        # 'cudnn.benchmark = True' enables auto-tuning to find the best algorithm for a specific input/net config.
        # Enable it only when the input size is fixed.
        cudnn.benchmark = False
        cudnn.deterministic = True

    opt.writer = SummaryWriter(opt.tensorboard_log_dir)
    opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n"
    opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n"

    train_dataset = StartEndDataset(
        dset_name=opt.dset_name,
        data_path=opt.train_path,
        desc_bert_path_or_handler=opt.desc_bert_path,
        sub_bert_path_or_handler=opt.sub_bert_path,
        max_desc_len=opt.max_desc_l,
        max_ctx_len=opt.max_ctx_l,
        vid_feat_path_or_handler=opt.vid_feat_path,
        clip_length=opt.clip_length,
        ctx_mode=opt.ctx_mode,
        h5driver=opt.h5driver,
        data_ratio=opt.data_ratio,
        normalize_vfeat=not opt.no_norm_vfeat,
        normalize_tfeat=not opt.no_norm_tfeat,
    )

    if opt.eval_path is not None:
        # val dataset, used to get eval loss
        train_eval_dataset = StartEndDataset(
            dset_name=opt.dset_name,
            data_path=opt.eval_path,
            desc_bert_path_or_handler=train_dataset.desc_bert_h5,
            sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None,
            max_desc_len=opt.max_desc_l,
            max_ctx_len=opt.max_ctx_l,
            vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None,
            clip_length=opt.clip_length,
            ctx_mode=opt.ctx_mode,
            h5driver=opt.h5driver,
            data_ratio=opt.data_ratio,
            normalize_vfeat=not opt.no_norm_vfeat,
            normalize_tfeat=not opt.no_norm_tfeat
        )

        eval_dataset = StartEndEvalDataset(
            dset_name=opt.dset_name,
            eval_split_name=opt.eval_split_name,  # should only be val set
            data_path=opt.eval_path,
            desc_bert_path_or_handler=train_dataset.desc_bert_h5,
            sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None,
            max_desc_len=opt.max_desc_l,
            max_ctx_len=opt.max_ctx_l,
            video_duration_idx_path=opt.video_duration_idx_path,
            vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None,
            clip_length=opt.clip_length,
            ctx_mode=opt.ctx_mode,
            data_mode="query",
            h5driver=opt.h5driver,
            data_ratio=opt.data_ratio,
            normalize_vfeat=not opt.no_norm_vfeat,
            normalize_tfeat=not opt.no_norm_tfeat
        )
    else:
        eval_dataset = None

    model_config = EDict(
        merge_two_stream=not opt.no_merge_two_stream,  # merge video and subtitles
        cross_att=not opt.no_cross_att,  # use cross-attention when encoding video and subtitles
        span_predictor_type=opt.span_predictor_type,
        encoder_type=opt.encoder_type,  # gru, lstm, transformer
        add_pe_rnn=opt.add_pe_rnn,  # add positional encoding for RNNs
        pe_type=opt.pe_type,
        visual_input_size=opt.vid_feat_size,
        sub_input_size=opt.sub_feat_size,  # for both desc and subtitles
        query_input_size=opt.q_feat_size,  # for both desc and subtitles
        hidden_size=opt.hidden_size,
        stack_conv_predictor_conv_kernel_sizes=opt.stack_conv_predictor_conv_kernel_sizes,
        conv_kernel_size=opt.conv_kernel_size,
        conv_stride=opt.conv_stride,
        max_ctx_l=opt.max_ctx_l,
        max_desc_l=opt.max_desc_l,
        input_drop=opt.input_drop,
        cross_att_drop=opt.cross_att_drop,
        drop=opt.drop,
        n_heads=opt.n_heads,  # self-att heads
        initializer_range=opt.initializer_range,  # for linear layer
        ctx_mode=opt.ctx_mode,  # video, sub or video_sub
        margin=opt.margin,  # margin for ranking loss
        ranking_loss_type=opt.ranking_loss_type,  # loss type, 'hinge' or 'lse'
        lw_neg_q=opt.lw_neg_q,  # loss weight for neg. query and pos. context
        lw_neg_ctx=opt.lw_neg_ctx,  # loss weight for pos. query and neg. context
        lw_st_ed=0,  # will be assigned dynamically at training time
        use_hard_negative=False,  # reset at each epoch
        hard_pool_size=opt.hard_pool_size,
        use_self_attention=not opt.no_self_att,  # whether to use self attention
        no_modular=opt.no_modular
    )
    logger.info("model_config {}".format(model_config))
    model = XML(model_config)
    count_parameters(model)
    logger.info("Start Training...")
    train(model, train_dataset, train_eval_dataset, eval_dataset, opt)
    return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
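Note how the eval datasets reuse the already-open HDF5 handles from train_dataset (desc_bert_h5, sub_bert_h5, vid_feat_h5) instead of re-opening the files from their paths. A minimal sketch of that *_path_or_handler idiom, under the assumption that the datasets open feature files with h5py (simplified, not the actual StartEndDataset code; the filename is a placeholder):

import h5py

def open_or_reuse(path_or_handler, driver=None):
    # accept either a filesystem path or an already open h5py.File
    if isinstance(path_or_handler, h5py.File):
        return path_or_handler
    return h5py.File(path_or_handler, "r", driver=driver)

vid_feat_h5 = open_or_reuse("vid_feats.h5")     # first dataset opens the file
vid_feat_h5_again = open_or_reuse(vid_feat_h5)  # second dataset reuses the handle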
Code example #5
def start_training():
    logger.info("Setup config, data and model...")
    opt = BaseOptions().parse()
    set_seed(opt.seed)
    if opt.debug:  # keep the model running deterministically
        # 'cudnn.benchmark = True' enables auto-tuning to find the best algorithm for a specific input/net config.
        # Enable it only when the input size is fixed.
        cudnn.benchmark = False
        cudnn.deterministic = True

    opt.writer = SummaryWriter(opt.tensorboard_log_dir)
    opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n"
    opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n"

    train_dataset = RetrievalDataset(
        dset_name=opt.dset_name,
        data_path=opt.train_path,
        desc_bert_path_or_handler=opt.desc_bert_path,
        sub_bert_path_or_handler=opt.sub_bert_path,
        vid_feat_path_or_handler=opt.vid_feat_path,
        max_desc_len=opt.max_desc_l,
        max_ctx_len=opt.max_ctx_l,
        ctx_mode=opt.ctx_mode,
        h5driver=opt.h5driver,
        data_ratio=opt.data_ratio,
        normalize_vfeat=not opt.no_norm_vfeat,
        normalize_tfeat=not opt.no_norm_tfeat,
    )

    if opt.eval_path is not None:
        eval_dataset = RetrievalEvalDataset(
            dset_name=opt.dset_name,
            eval_split_name=opt.eval_split_name,  # should only be val set
            data_path=opt.eval_path,
            desc_bert_path_or_handler=train_dataset.desc_bert_h5,
            sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None,
            max_desc_len=opt.max_desc_l,
            max_ctx_len=opt.max_ctx_l,
            video_duration_idx_path=opt.video_duration_idx_path,
            vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None,
            ctx_mode=opt.ctx_mode,
            data_mode="query",
            h5driver=opt.h5driver,
            data_ratio=opt.data_ratio,
            normalize_vfeat=not opt.no_norm_vfeat,
            normalize_tfeat=not opt.no_norm_tfeat,
        )
    else:
        eval_dataset = None

    model_config = EDict(
        ctx_mode=opt.ctx_mode,
        text_input_size=opt.sub_feat_size,
        vid_input_size=opt.vid_feat_size,
        output_size=opt.output_size,
        margin=opt.margin,  # margin for ranking loss
    )
    logger.info("model_config {}".format(model_config))
    model = MEE(model_config)
    count_parameters(model)
    logger.info("Start Training...")
    train(model, train_dataset, eval_dataset, opt)
    return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
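The normalize_vfeat=not opt.no_norm_vfeat / normalize_tfeat=not opt.no_norm_tfeat pairs invert "negative" command-line switches so feature normalization stays enabled by default. Assuming BaseOptions uses ordinary argparse store_true flags, the pattern looks roughly like this (illustrative only):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no_norm_vfeat", action="store_true",
                    help="disable normalization of video features")
parser.add_argument("--no_norm_tfeat", action="store_true",
                    help="disable normalization of text features")
opt = parser.parse_args([])               # defaults: both False

normalize_vfeat = not opt.no_norm_vfeat   # True unless --no_norm_vfeat is passed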
Code example #6
        return aug_image


# Sample configuration for photometric augmentation
photometric_transform_config = EDict({
    'trans_photo': {
        'smoothing_aug': True,
        'rescale_down_prob': 0.2,
        'rescale_down': 0.5,
        'rescale_down_linear_upsample_prob': 1.0,
        'gauss_smooth_k_min': 2,
        'gauss_smooth_k_max': 5,
        'noise_aug': False,
        'poisson_noise_prob': 0.4,
        'speckle_noise_mean': 0,
        'speckle_noise_sigma': 0.025,
        'gaussian_noise_mean': 0,
        'gaussian_noise_sigma': 0.025,
        'graylevel_aug': True,
        'max_rand_contrast': 0.2,
        'max_rand_brightness': 15,
        'log_transform_prob': 0.2,
        'gamma_corr_prob': 0.2,
        'dec_contrast_aug': False,
        'dec_contrast_ksize': 5  # increase to a larger odd value to reduce the texture-removal effect on the images
    },
    'edge_strength': False  # always keep this False
})

def __main__():
    # Create a photometric transform instance for use in a PyTorch transform pipeline
    photo_transformer = PhotometricTransform(photometric_transform_config)
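EasyDict converts nested dicts recursively, so the 'trans_photo' block above is itself an EDict and its settings can be read with chained attribute access, for example:

cfg = photometric_transform_config
print(cfg.trans_photo.gauss_smooth_k_max)  # 5
print(cfg.edge_strength)                   # False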
Code example #7
def main():
    opt = get_args()

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    train_dataset = TVCaptionDataset(ctx_mode=opt.ctx_mode,
                                     data_ratio=opt.data_ratio,
                                     data_path=opt.train_path,
                                     sub_meta_path=opt.sub_meta_path,
                                     vid_h5_path_or_handler=opt.vid_feat_path,
                                     word2idx_path=opt.word2idx_path,
                                     max_cap_len=opt.max_cap_len,
                                     max_sub_len=opt.max_sub_len,
                                     max_v_len=opt.max_v_len,
                                     h5driver=opt.h5driver,
                                     clip_length=1.5,
                                     normalize_vfeat=not opt.no_norm_vfeat,
                                     is_eval=False)
    eval_dataset = TVCaptionDataset(
        ctx_mode=opt.ctx_mode,
        # data_ratio=opt.data_ratio,
        data_ratio=1.0,
        data_path=opt.eval_path,
        sub_meta_path=opt.sub_meta_path,
        vid_h5_path_or_handler=train_dataset.vid_h5 if "video" in opt.ctx_mode else None,
        word2idx_path=opt.word2idx_path,
        max_cap_len=opt.max_cap_len,
        max_sub_len=opt.max_sub_len,
        max_v_len=opt.max_v_len,
        h5driver=opt.h5driver,
        clip_length=1.5,
        normalize_vfeat=not opt.no_norm_vfeat,
        is_eval=True  # only set to True at inference
    )

    train_loader = DataLoader(train_dataset,
                              collate_fn=caption_collate,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_workers,
                              pin_memory=opt.pin_memory)
    eval_loader = DataLoader(eval_dataset,
                             collate_fn=caption_collate,
                             batch_size=opt.eval_batch_size,
                             shuffle=False,
                             num_workers=opt.num_workers,
                             pin_memory=opt.pin_memory)

    opt.vocab_size = len(train_dataset.word2idx)
    pprint.pprint(vars(opt))

    rt_config = EDict(
        hidden_size=opt.hidden_size,
        intermediate_size=opt.intermediate_size,  # after each self attention
        vocab_size=opt.vocab_size,  # get from word2idx
        word_vec_size=opt.word_vec_size,
        video_feature_size=opt.vid_feat_size,
        max_position_embeddings=max(opt.max_v_len + opt.max_sub_len, opt.max_cap_len),  # get from max_seq_len
        type_vocab_size=opt.type_vocab_size,
        layer_norm_eps=opt.layer_norm_eps,  # bert layernorm
        hidden_dropout_prob=opt.hidden_dropout_prob,  # applies everywhere except attention
        num_hidden_layers=opt.num_hidden_layers,  # number of transformer layers
        num_attention_heads=opt.num_attention_heads,
        attention_probs_dropout_prob=opt.attention_probs_dropout_prob,  # applies only to self attention
        initializer_range=opt.initializer_range,
        label_smoothing=opt.label_smoothing,
        share_wd_cls_weight=opt.share_wd_cls_weight)
    model = MMT(rt_config)

    if opt.glove_path is not None:
        if hasattr(model, "embeddings"):
            logger.info("Load GloVe as word embedding")
            model.embeddings.set_pretrained_embedding(
                torch.from_numpy(torch.load(opt.glove_path)).float(),
                freeze=opt.freeze_glove)
        else:
            logger.warning(
                "This model has no embeddings, cannot load glove vectors into the model"
            )

    train(model, train_loader, eval_loader, opt)
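The GloVe block assumes opt.glove_path points to a torch-saved numpy array of shape (vocab_size, word_vec_size) whose rows are aligned with word2idx. A rough sketch of what set_pretrained_embedding likely wraps (an assumption, not the actual MMT code; the path is a placeholder):

import torch
import torch.nn as nn

def set_pretrained_embedding_sketch(weights, freeze=True):
    # weights: FloatTensor of shape (vocab_size, word_vec_size)
    return nn.Embedding.from_pretrained(weights, freeze=freeze)

glove_np = torch.load("glove_words.pt")  # placeholder path; numpy array saved with torch.save
emb = set_pretrained_embedding_sketch(torch.from_numpy(glove_np).float(), freeze=True)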