def start_training(): logger.info("Setup config, data and model...") opt = BaseOptions().parse() set_seed(opt.seed) if opt.debug: # keep the model run deterministically # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. # Enable this only when input size is fixed. cudnn.benchmark = False cudnn.deterministic = True opt.writer = SummaryWriter(opt.tensorboard_log_dir) opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" train_dataset = ExCLDataset( dset_name=opt.dset_name, data_path=opt.train_path, desc_bert_path_or_handler=opt.desc_bert_path, sub_bert_path_or_handler=opt.sub_bert_path, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, vid_feat_path_or_handler=opt.vid_feat_path, clip_length=opt.clip_length, ctx_mode=opt.ctx_mode, h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat, ) if opt.eval_path is not None: eval_dataset = ExCLDataset( dset_name=opt.dset_name, data_path=opt.eval_path, desc_bert_path_or_handler=train_dataset.desc_bert_h5, sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, clip_length=opt.clip_length, ctx_mode=opt.ctx_mode, h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat, video_duration_idx_path=opt.video_duration_idx_path, eval_split_name=opt.eval_split_name ) else: eval_dataset = None model_config = EDict( visual_input_size=opt.vid_feat_size, sub_input_size=opt.sub_feat_size, # for both desc and subtitles query_input_size=opt.q_feat_size, # for both desc and subtitles hidden_size=opt.hidden_size, drop=opt.drop, ctx_mode=opt.ctx_mode, # video, sub or video_sub initializer_range=opt.initializer_range ) logger.info("model_config {}".format(model_config)) model = EXCL(model_config) count_parameters(model) logger.info("Start Training...") train(model, train_dataset, eval_dataset, opt) return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
def main():
    opt = get_args()

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    train_dataset = RCDataset(
        dset_name=opt.dset_name,
        data_dir=opt.data_dir, video_feature_dir=opt.video_feature_dir,
        duration_file=opt.v_duration_file,
        word2idx_path=opt.word2idx_path, max_t_len=opt.max_t_len,
        max_v_len=opt.max_v_len, max_n_sen=opt.max_n_sen, mode="train",
        recurrent=opt.recurrent, untied=opt.untied or opt.mtrans)
    # add 10 to max_n_sen so the inference stage uses all the segments
    val_dataset = RCDataset(
        dset_name=opt.dset_name,
        data_dir=opt.data_dir, video_feature_dir=opt.video_feature_dir,
        duration_file=opt.v_duration_file,
        word2idx_path=opt.word2idx_path, max_t_len=opt.max_t_len,
        max_v_len=opt.max_v_len, max_n_sen=opt.max_n_sen + 10, mode="val",
        recurrent=opt.recurrent, untied=opt.untied or opt.mtrans)

    if opt.recurrent:
        collate_fn = caption_collate
    else:  # single sentence (including untied)
        collate_fn = single_sentence_collate
    train_loader = DataLoader(train_dataset, collate_fn=collate_fn,
                              batch_size=opt.batch_size, shuffle=True,
                              num_workers=opt.num_workers, pin_memory=opt.pin_memory)
    val_loader = DataLoader(val_dataset, collate_fn=collate_fn,
                            batch_size=opt.val_batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=opt.pin_memory)

    opt.vocab_size = len(train_dataset.word2idx)
    print(json.dumps(vars(opt), indent=4, sort_keys=True))

    device = torch.device("cuda" if opt.cuda else "cpu")
    rt_config = EDict(
        xl_grad=opt.xl_grad,  # enable back-propagation for the TransformerXL model
        hidden_size=opt.hidden_size,
        intermediate_size=opt.intermediate_size,  # after each self-attention layer
        vocab_size=opt.vocab_size,  # taken from word2idx
        word_vec_size=opt.word_vec_size,
        video_feature_size=opt.video_feature_size,
        max_position_embeddings=opt.max_v_len + opt.max_t_len,  # max sequence length
        max_v_len=opt.max_v_len,  # max length of the videos
        max_t_len=opt.max_t_len,  # max length of the text
        type_vocab_size=opt.type_vocab_size,
        layer_norm_eps=opt.layer_norm_eps,  # bert layernorm
        hidden_dropout_prob=opt.hidden_dropout_prob,  # applies everywhere except attention
        num_hidden_layers=opt.num_hidden_layers,  # number of transformer layers
        num_attention_heads=opt.num_attention_heads,
        attention_probs_dropout_prob=opt.attention_probs_dropout_prob,  # applies only to self-attention
        n_memory_cells=opt.n_memory_cells,  # memory size will be (n_memory_cells, D)
        memory_dropout_prob=opt.memory_dropout_prob,
        initializer_range=opt.initializer_range,
        label_smoothing=opt.label_smoothing,
        share_wd_cls_weight=opt.share_wd_cls_weight)

    if opt.recurrent:
        if opt.xl:
            logger.info("Use recurrent model - TransformerXL"
                        + (" (with gradient)" if opt.xl_grad else ""))
            model = TransformerXL(rt_config)
        else:
            logger.info("Use recurrent model - Mine")
            model = RecursiveTransformer(rt_config)
    else:  # single sentence, including untied
        if opt.untied:
            logger.info("Use untied non-recurrent single-sentence model")
            model = NonRecurTransformerUntied(rt_config)
        elif opt.mtrans:
            logger.info("Use masked transformer -- another non-recurrent single-sentence model")
            model = MTransformer(rt_config)
        else:
            logger.info("Use non-recurrent single-sentence model")
            model = NonRecurTransformer(rt_config)

    if opt.glove_path is not None:
        if hasattr(model, "embeddings"):
            logger.info("Load GloVe as word embedding")
            model.embeddings.set_pretrained_embedding(
                torch.from_numpy(torch.load(opt.glove_path)).float(), freeze=opt.freeze_glove)
        else:
            logger.warning("This model has no embeddings, cannot load GloVe vectors into the model")

    count_parameters(model)
    if hasattr(model, "embeddings") and hasattr(model.embeddings, "word_embeddings"):
        count_parameters(model.embeddings.word_embeddings)

    train(model, train_loader, val_loader, device, opt)
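# A standard script entry point (not shown in this excerpt) would simply call main():
if __name__ == "__main__":
    main()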
from easydict import EasyDict as EDict

conf = EDict()
conf.CLASS_NUM = 21  # in this case the VOC dataset (20 classes + background)
conf.MEAN_RGB = (123, 117, 104)  # RGB order, not BGR
conf.WD = 5e-4
conf.MOMENTUM = 0.9
conf.WORKSPACE = 512
conf.DOWN_SAMPLE_SCALE = 8

# train init model
conf.LR_INIT = 16e-4
conf.EPOCH_SIZE_INIT = 200
conf.MAX_EPOCH_INIT = 40
conf.BATCH_SIZE_INIT = 16
conf.CROP_SIZE_INIT = 320
conf.SCALE_RANGE_INIT = [0.7, 1.3]

# train final model
conf.LR_FINAL = 16e-4
conf.EPOCH_SIZE_FINAL = 700
conf.MAX_EPOCH_FINAL = 40
conf.BATCH_SIZE_FINAL = 16
conf.CROP_SIZE_FINAL = 320
conf.SCALE_RANGE_FINAL = [0.7, 1.3]

# for evaluating the init and final models
conf.CPU_WORKER_NUM = 8
conf.EVAL_WAIT_TIME = 0.3  # hours
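# The *_INIT / *_FINAL fields above describe two training phases that differ only in
# EPOCH_SIZE. A small hypothetical helper (not part of the original config file) to
# collect one phase's hyperparameters in one place:
def phase_hyperparams(cfg, phase):
    """Return the per-phase settings; phase is 'INIT' or 'FINAL'."""
    assert phase in ("INIT", "FINAL")
    return {
        "lr": getattr(cfg, "LR_" + phase),
        "epoch_size": getattr(cfg, "EPOCH_SIZE_" + phase),
        "max_epoch": getattr(cfg, "MAX_EPOCH_" + phase),
        "batch_size": getattr(cfg, "BATCH_SIZE_" + phase),
        "crop_size": getattr(cfg, "CROP_SIZE_" + phase),
        "scale_range": getattr(cfg, "SCALE_RANGE_" + phase),
    }


# e.g. phase_hyperparams(conf, "INIT") -> {"lr": 0.0016, "epoch_size": 200, ...}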
def start_training(): logger.info("Setup config, data and model...") opt = BaseOptions().parse() set_seed(opt.seed) if opt.debug: # keep the model run deterministically # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. # Enable this only when input size is fixed. cudnn.benchmark = False cudnn.deterministic = True opt.writer = SummaryWriter(opt.tensorboard_log_dir) opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" train_dataset = StartEndDataset( dset_name=opt.dset_name, data_path=opt.train_path, desc_bert_path_or_handler=opt.desc_bert_path, sub_bert_path_or_handler=opt.sub_bert_path, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, vid_feat_path_or_handler=opt.vid_feat_path, clip_length=opt.clip_length, ctx_mode=opt.ctx_mode, h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat, ) if opt.eval_path is not None: # val dataset, used to get eval loss train_eval_dataset = StartEndDataset( dset_name=opt.dset_name, data_path=opt.eval_path, desc_bert_path_or_handler=train_dataset.desc_bert_h5, sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, clip_length=opt.clip_length, ctx_mode=opt.ctx_mode, h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat ) eval_dataset = StartEndEvalDataset( dset_name=opt.dset_name, eval_split_name=opt.eval_split_name, # should only be val set data_path=opt.eval_path, desc_bert_path_or_handler=train_dataset.desc_bert_h5, sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, video_duration_idx_path=opt.video_duration_idx_path, vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, clip_length=opt.clip_length, ctx_mode=opt.ctx_mode, data_mode="query", h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat ) else: eval_dataset = None model_config = EDict( merge_two_stream=not opt.no_merge_two_stream, # merge video and subtitles cross_att=not opt.no_cross_att, # use cross-attention when encoding video and subtitles span_predictor_type=opt.span_predictor_type, # span_predictor_type encoder_type=opt.encoder_type, # gru, lstm, transformer add_pe_rnn=opt.add_pe_rnn, # add pe for RNNs pe_type=opt.pe_type, # visual_input_size=opt.vid_feat_size, sub_input_size=opt.sub_feat_size, # for both desc and subtitles query_input_size=opt.q_feat_size, # for both desc and subtitles hidden_size=opt.hidden_size, # stack_conv_predictor_conv_kernel_sizes=opt.stack_conv_predictor_conv_kernel_sizes, # conv_kernel_size=opt.conv_kernel_size, conv_stride=opt.conv_stride, max_ctx_l=opt.max_ctx_l, max_desc_l=opt.max_desc_l, input_drop=opt.input_drop, cross_att_drop=opt.cross_att_drop, drop=opt.drop, n_heads=opt.n_heads, # self-att heads initializer_range=opt.initializer_range, # for linear layer ctx_mode=opt.ctx_mode, # video, sub or video_sub margin=opt.margin, # margin for ranking loss ranking_loss_type=opt.ranking_loss_type, # loss type, 'hinge' or 'lse' lw_neg_q=opt.lw_neg_q, # loss weight for neg. 
query and pos. context lw_neg_ctx=opt.lw_neg_ctx, # loss weight for pos. query and neg. context lw_st_ed=0, # will be assigned dynamically at training time use_hard_negative=False, # reset at each epoch hard_pool_size=opt.hard_pool_size, use_self_attention=not opt.no_self_att, # whether to use self attention no_modular=opt.no_modular ) logger.info("model_config {}".format(model_config)) model = XML(model_config) count_parameters(model) logger.info("Start Training...") train(model, train_dataset, train_eval_dataset, eval_dataset, opt) return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
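# `start_training` returns the results dir and eval settings, presumably so a caller can
# run evaluation afterwards. A hedged sketch of such a caller (the post-training step is
# an assumption, not shown in this excerpt):
if __name__ == "__main__":
    results_dir, eval_split_name, eval_path, debug_mode = start_training()
    # a follow-up inference/evaluation step would typically locate the checkpoint
    # saved under `results_dir` and score it on `eval_split_name`
    if not debug_mode:
        print("Finished training, results saved under:", results_dir)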
def start_training(): logger.info("Setup config, data and model...") opt = BaseOptions().parse() set_seed(opt.seed) if opt.debug: # keep the model run deterministically # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. # Enable this only when input size is fixed. cudnn.benchmark = False cudnn.deterministic = True opt.writer = SummaryWriter(opt.tensorboard_log_dir) opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" train_dataset = RetrievalDataset( dset_name=opt.dset_name, data_path=opt.train_path, desc_bert_path_or_handler=opt.desc_bert_path, sub_bert_path_or_handler=opt.sub_bert_path, vid_feat_path_or_handler=opt.vid_feat_path, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, ctx_mode=opt.ctx_mode, h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat, ) if opt.eval_path is not None: eval_dataset = RetrievalEvalDataset( dset_name=opt.dset_name, eval_split_name=opt.eval_split_name, # should only be val set data_path=opt.eval_path, desc_bert_path_or_handler=train_dataset.desc_bert_h5, sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, max_desc_len=opt.max_desc_l, max_ctx_len=opt.max_ctx_l, video_duration_idx_path=opt.video_duration_idx_path, vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, ctx_mode=opt.ctx_mode, data_mode="query", h5driver=opt.h5driver, data_ratio=opt.data_ratio, normalize_vfeat=not opt.no_norm_vfeat, normalize_tfeat=not opt.no_norm_tfeat, ) else: eval_dataset = None model_config = EDict( ctx_mode=opt.ctx_mode, text_input_size=opt.sub_feat_size, vid_input_size=opt.vid_feat_size, # output_size=opt.output_size, margin=opt.margin, # margin for ranking loss ) logger.info("model_config {}".format(model_config)) model = MEE(model_config) count_parameters(model) logger.info("Start Training...") train(model, train_dataset, eval_dataset, opt) return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
    return aug_image  # (closes the augmentation function defined above, not shown in this excerpt)


# SAMPLE CONFIGURATION FOR PHOTOMETRIC AUGMENTATION
photometric_transform_config = EDict({
    'trans_photo': {
        'smoothing_aug': True,
        'rescale_down_prob': 0.2,
        'rescale_down': 0.5,
        'rescale_down_linear_upsample_prob': 1.0,
        'gauss_smooth_k_min': 2,
        'gauss_smooth_k_max': 5,
        'noise_aug': False,
        'poisson_noise_prob': 0.4,
        'speckle_noise_mean': 0,
        'speckle_noise_sigma': 0.025,
        'gaussian_noise_mean': 0,
        'gaussian_noise_sigma': 0.025,
        'graylevel_aug': True,
        'max_rand_contrast': 0.2,
        'max_rand_brightness': 15,
        'log_transform_prob': 0.2,
        'gamma_corr_prob': 0.2,
        'dec_contrast_aug': False,
        'dec_contrast_ksize': 5  # increase to larger odd values to reduce the texture-removal effect
    },
    'edge_strength': False  # keep this False
})


if __name__ == "__main__":
    # create a transform instance usable inside a PyTorch transform pipeline
    photo_transformer = PhotometricTransform(photometric_transform_config)
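    # The call signature of PhotometricTransform is not shown in this excerpt; the lines
    # below assume the instance is callable on an RGB image array (torchvision-style),
    # which is an assumption rather than the confirmed interface.
    import numpy as np

    dummy_image = np.random.randint(0, 256, size=(240, 320, 3), dtype=np.uint8)  # fake RGB frame
    augmented = photo_transformer(dummy_image)  # hypothetical callable usage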
def main():
    opt = get_args()

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    train_dataset = TVCaptionDataset(
        ctx_mode=opt.ctx_mode,
        data_ratio=opt.data_ratio,
        data_path=opt.train_path,
        sub_meta_path=opt.sub_meta_path,
        vid_h5_path_or_handler=opt.vid_feat_path,
        word2idx_path=opt.word2idx_path,
        max_cap_len=opt.max_cap_len,
        max_sub_len=opt.max_sub_len,
        max_v_len=opt.max_v_len,
        h5driver=opt.h5driver,
        clip_length=1.5,
        normalize_vfeat=not opt.no_norm_vfeat,
        is_eval=False)
    eval_dataset = TVCaptionDataset(
        ctx_mode=opt.ctx_mode,
        # data_ratio=opt.data_ratio,
        data_ratio=1.0,
        data_path=opt.eval_path,
        sub_meta_path=opt.sub_meta_path,
        vid_h5_path_or_handler=train_dataset.vid_h5 if "video" in opt.ctx_mode else None,
        word2idx_path=opt.word2idx_path,
        max_cap_len=opt.max_cap_len,
        max_sub_len=opt.max_sub_len,
        max_v_len=opt.max_v_len,
        h5driver=opt.h5driver,
        clip_length=1.5,
        normalize_vfeat=not opt.no_norm_vfeat,
        is_eval=True  # only set to True at inference
    )

    train_loader = DataLoader(train_dataset, collate_fn=caption_collate,
                              batch_size=opt.batch_size, shuffle=True,
                              num_workers=opt.num_workers, pin_memory=opt.pin_memory)
    eval_loader = DataLoader(eval_dataset, collate_fn=caption_collate,
                             batch_size=opt.eval_batch_size, shuffle=False,
                             num_workers=opt.num_workers, pin_memory=opt.pin_memory)

    opt.vocab_size = len(train_dataset.word2idx)
    pprint.pprint(vars(opt))

    rt_config = EDict(
        hidden_size=opt.hidden_size,
        intermediate_size=opt.intermediate_size,  # after each self-attention layer
        vocab_size=opt.vocab_size,  # taken from word2idx
        word_vec_size=opt.word_vec_size,
        video_feature_size=opt.vid_feat_size,
        max_position_embeddings=max(opt.max_v_len + opt.max_sub_len, opt.max_cap_len),  # max sequence length
        type_vocab_size=opt.type_vocab_size,
        layer_norm_eps=opt.layer_norm_eps,  # bert layernorm
        hidden_dropout_prob=opt.hidden_dropout_prob,  # applies everywhere except attention
        num_hidden_layers=opt.num_hidden_layers,  # number of transformer layers
        num_attention_heads=opt.num_attention_heads,
        attention_probs_dropout_prob=opt.attention_probs_dropout_prob,  # applies only to self-attention
        initializer_range=opt.initializer_range,
        label_smoothing=opt.label_smoothing,
        share_wd_cls_weight=opt.share_wd_cls_weight)
    model = MMT(rt_config)

    if opt.glove_path is not None:
        if hasattr(model, "embeddings"):
            logger.info("Load GloVe as word embedding")
            model.embeddings.set_pretrained_embedding(
                torch.from_numpy(torch.load(opt.glove_path)).float(), freeze=opt.freeze_glove)
        else:
            logger.warning("This model has no embeddings, cannot load GloVe vectors into the model")

    train(model, train_loader, eval_loader, opt)