def main(args):
    """Load a trained model and answer questions with it.

    Three modes, chosen from the supplied args:
      * single-example: both --question and --image given;
      * interactive: --image given but no question/feature HDF5 files --
        repeatedly reads questions from stdin;
      * batch: evaluates over an HDF5 question/feature split.

    The model is either a baseline model (--baseline_model) or a
    (program generator, execution engine) pair.
    """
    if args.debug_every <= 1:
        pdb.set_trace()
    model = None
    if args.baseline_model is not None:
        print('Loading baseline model from ', args.baseline_model)
        model, _ = utils.load_baseline(args.baseline_model)
        if args.vocab_json is not None:
            new_vocab = utils.load_vocab(args.vocab_json)
            model.rnn.expand_vocab(new_vocab['question_token_to_idx'])
    elif (args.program_generator is not None
          and args.execution_engine is not None):
        pg, _ = utils.load_program_generator(args.program_generator,
                                             args.model_type)
        ee, _ = utils.load_execution_engine(
            args.execution_engine, verbose=False, model_type=args.model_type)
        if args.vocab_json is not None:
            new_vocab = utils.load_vocab(args.vocab_json)
            pg.expand_encoder_vocab(new_vocab['question_token_to_idx'])
        model = (pg, ee)
    else:
        # BUG FIX: the two literals previously concatenated without a
        # separating space, printing "--program_generatorand".
        print('Must give either --baseline_model or --program_generator '
              'and --execution_engine')
        return

    dtype = torch.FloatTensor
    if args.use_gpu == 1:
        dtype = torch.cuda.FloatTensor

    if args.question is not None and args.image is not None:
        run_single_example(args, model, dtype, args.question)
    # Interactive mode
    elif (args.image is not None and args.input_question_h5 is None
          and args.input_features_h5 is None):
        # Extract image features once, then answer questions in a loop.
        feats_var = extract_image_features(args, dtype)
        print(colored('Ask me something!', 'cyan'))
        while True:
            # Get user question
            question_raw = input(">>> ")
            run_single_example(args, model, dtype, question_raw, feats_var)
    else:
        vocab = load_vocab(args)
        loader_kwargs = {
            'question_h5': args.input_question_h5,
            'feature_h5': args.input_features_h5,
            'vocab': vocab,
            'batch_size': args.batch_size,
        }
        if args.num_samples is not None and args.num_samples > 0:
            loader_kwargs['max_samples'] = args.num_samples
        if args.family_split_file is not None:
            with open(args.family_split_file, 'r') as f:
                loader_kwargs['question_families'] = json.load(f)
        with ClevrDataLoader(**loader_kwargs) as loader:
            run_batch(args, model, dtype, loader)
def main(args):
    """Evaluate a trained model on one example or on a dataset split.

    Question/feature HDF5 paths are derived from --data_dir and --part.
    The model is either a baseline (--baseline_model) or a
    (program generator, execution engine) pair; when only an execution
    engine is given, its checkpoint is reused for the program generator.
    """
    if not args.program_generator:
        args.program_generator = args.execution_engine
    input_question_h5 = os.path.join(args.data_dir,
                                     '{}_questions.h5'.format(args.part))
    input_features_h5 = os.path.join(args.data_dir,
                                     '{}_features.h5'.format(args.part))

    model = None
    if args.baseline_model is not None:
        print('Loading baseline model from ', args.baseline_model)
        model, _ = utils.load_baseline(args.baseline_model)
        if args.vocab_json is not None:
            expanded = utils.load_vocab(args.vocab_json)
            model.rnn.expand_vocab(expanded['question_token_to_idx'])
    elif args.program_generator is not None and args.execution_engine is not None:
        pg, _ = utils.load_program_generator(args.program_generator)
        ee, _ = utils.load_execution_engine(args.execution_engine,
                                            verbose=False)
        if args.vocab_json is not None:
            expanded = utils.load_vocab(args.vocab_json)
            pg.expand_encoder_vocab(expanded['question_token_to_idx'])
        model = (pg, ee)
    else:
        print(
            'Must give either --baseline_model or --program_generator and --execution_engine'
        )
        return

    dtype = torch.cuda.FloatTensor if args.use_gpu == 1 else torch.FloatTensor

    if args.question is not None and args.image is not None:
        # Single-example mode.
        run_single_example(args, model, dtype, args.question)
        return

    # Batch mode over the chosen split.
    vocab = load_vocab(args)
    loader_kwargs = {
        'question_h5': input_question_h5,
        'feature_h5': input_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
    }
    if args.num_samples is not None and args.num_samples > 0:
        loader_kwargs['max_samples'] = args.num_samples
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            loader_kwargs['question_families'] = json.load(f)
    with ClevrDataLoader(**loader_kwargs) as loader:
        run_batch(args, model, dtype, loader)
def get_execution_engine(args):
    """Build (or resume) the execution engine and return (module, kwargs).

    Resumes from args.execution_engine_start_from when given; otherwise
    constructs a FiLMedNet (model_type == 'FiLM') or a ModuleNet from the
    command-line hyperparameters. The returned kwargs dict is what gets
    saved in checkpoints so the model can be rebuilt later.
    """
    vocab = utils.load_vocab(args.vocab_json)
    if args.execution_engine_start_from is not None:
        ee, kwargs = utils.load_execution_engine(
            args.execution_engine_start_from, model_type=args.model_type)
    else:
        kwargs = {
            'vocab': vocab,
            'feature_dim': parse_int_list(args.feature_dim),
            'stem_batchnorm': args.module_stem_batchnorm == 1,
            'stem_num_layers': args.module_stem_num_layers,
            'module_dim': args.module_dim,
            'module_residual': args.module_residual == 1,
            'module_batchnorm': args.module_batchnorm == 1,
            'classifier_proj_dim': args.classifier_proj_dim,
            'classifier_downsample': args.classifier_downsample,
            'classifier_fc_layers': parse_int_list(args.classifier_fc_dims),
            'classifier_batchnorm': args.classifier_batchnorm == 1,
            'classifier_dropout': args.classifier_dropout,
            'encoder_vocab_size': len(vocab['question_token_to_idx']),
            'decoder_vocab_size': len(vocab['program_token_to_idx']),
            'wordvec_dim': args.rnn_wordvec_dim,
            'hidden_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,  # 0e-2
        }
        if args.model_type == 'FiLM':
            kwargs['num_modules'] = args.num_modules
            kwargs['stem_kernel_size'] = args.module_stem_kernel_size
            kwargs['stem_stride'] = args.module_stem_stride
            kwargs['stem_padding'] = args.module_stem_padding
            kwargs['module_num_layers'] = args.module_num_layers
            kwargs['module_batchnorm_affine'] = args.module_batchnorm_affine == 1
            kwargs['module_dropout'] = args.module_dropout
            kwargs['module_input_proj'] = args.module_input_proj
            kwargs['module_kernel_size'] = args.module_kernel_size
            kwargs['use_gamma'] = args.use_gamma == 1
            kwargs['use_beta'] = args.use_beta == 1
            kwargs['use_coords'] = args.use_coords
            kwargs['debug_every'] = args.debug_every
            kwargs['print_verbose_every'] = args.print_verbose_every
            kwargs['condition_method'] = args.condition_method
            kwargs['condition_pattern'] = parse_int_list(args.condition_pattern)
            kwargs['parameter_efficient'] = args.program_generator_parameter_efficient == 1
            kwargs['output_batchnorm'] = args.rnn_output_batchnorm == 1
            kwargs['bidirectional'] = args.bidirectional == 1
            kwargs['rnn_time_step'] = args.rnn_time_step
            kwargs['encoder_type'] = args.encoder_type
            kwargs['decoder_type'] = args.decoder_type
            kwargs['gamma_option'] = args.gamma_option
            kwargs['gamma_baseline'] = args.gamma_baseline  # 1
            ee = FiLMedNet(**kwargs)
        else:
            ee = ModuleNet(**kwargs)
    # FIX: guard the GPU move so CPU-only machines don't crash; this
    # matches the sibling get_execution_engine variant in this file.
    if torch.cuda.is_available():
        ee.cuda()
    ee.train()
    return ee, kwargs
def get_baseline_model(args):
    """Build (or resume) a non-modular baseline VQA model.

    Supports model_type 'LSTM', 'CNN+LSTM' and 'CNN+LSTM+SA'; resumes
    from args.baseline_start_from when given. Returns (model, kwargs),
    where kwargs is saved in checkpoints for later reconstruction.

    Raises:
        ValueError: if model_type is not one of the supported baselines
            and no checkpoint was supplied.
    """
    vocab = utils.load_vocab(args.vocab_json)
    if args.baseline_start_from is not None:
        model, kwargs = utils.load_baseline(args.baseline_start_from)
    elif args.model_type == 'LSTM':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = LstmModel(**kwargs)
    elif args.model_type == 'CNN+LSTM':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'cnn_feat_dim': parse_int_list(args.feature_dim),
            'cnn_num_res_blocks': args.cnn_num_res_blocks,
            'cnn_res_block_dim': args.cnn_res_block_dim,
            'cnn_proj_dim': args.cnn_proj_dim,
            'cnn_pooling': args.cnn_pooling,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = CnnLstmModel(**kwargs)
    elif args.model_type == 'CNN+LSTM+SA':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'cnn_feat_dim': parse_int_list(args.feature_dim),
            'stacked_attn_dim': args.stacked_attn_dim,
            'num_stacked_attn': args.num_stacked_attn,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = CnnLstmSaModel(**kwargs)
    else:
        # BUG FIX: previously an unrecognized model type fell through all
        # branches and raised a confusing NameError on `model` below.
        raise ValueError('Unrecognized baseline model type "%s"' % args.model_type)
    if model.rnn.token_to_idx != vocab['question_token_to_idx']:
        # Make sure new vocab is superset of old
        for k, v in model.rnn.token_to_idx.items():
            assert k in vocab['question_token_to_idx']
            assert vocab['question_token_to_idx'][k] == v
        for token, idx in vocab['question_token_to_idx'].items():
            model.rnn.token_to_idx[token] = idx
        kwargs['vocab'] = vocab
        model.rnn.expand_vocab(vocab['question_token_to_idx'])
    model.cuda()
    model.train()
    return model, kwargs
def main(args):
    """Training entry point: optionally randomizes the checkpoint path,
    stages the HDF5 inputs on local disk, builds train/val loaders
    (with program-depth filtering) and runs the training loop."""
    if args.randomize_checkpoint_path == 1:
        root, ext = os.path.splitext(args.checkpoint_path)
        args.checkpoint_path = '%s_%06d%s' % (root, random.randint(1, 1000000), ext)
    print('Will save checkpoints to %s' % args.checkpoint_path)

    vocab = utils.load_vocab(args.vocab_json)

    # Optionally stage the HDF5 inputs on local scratch for faster reads.
    staged = (
        ('train_question_h5', '/tmp/train_questions.h5'),
        ('train_features_h5', '/tmp/train_features.h5'),
        ('val_question_h5', '/tmp/val_questions.h5'),
        ('val_features_h5', '/tmp/val_features.h5'),
    )
    if args.use_local_copies == 1:
        for attr, local_path in staged:
            shutil.copy(getattr(args, attr), local_path)
            setattr(args, attr, local_path)

    question_families = None
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)

    train_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'shuffle': args.shuffle_train_data == 1,
        'question_families': question_families,
        'min_program_depth': args.min_program_depth,
        'max_program_depth': args.max_program_depth,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
        'drop_last': True,
    }
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'question_families': question_families,
        'min_program_depth': args.min_program_depth,
        'max_program_depth': args.max_program_depth,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
    }

    with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
            ClevrDataLoader(**val_loader_kwargs) as val_loader:
        train_loop(args, train_loader, val_loader)

    if args.use_local_copies == 1 and args.cleanup_local_copies == 1:
        for _, local_path in staged:
            os.remove(local_path)
def main(args):
    """Validate a checkpoint at one program depth and record the accuracy
    in a JSON document mapping depth -> accuracy (args.output_json).

    Requires min_program_depth == max_program_depth, since each run
    contributes exactly one depth entry.
    """
    assert args.min_program_depth == args.max_program_depth, \
        "This script is for validating at one singular depth."
    if args.randomize_checkpoint_path == 1:
        name, ext = os.path.splitext(args.checkpoint_path)
        num = random.randint(1, 1000000)
        args.checkpoint_path = '%s_%06d%s' % (name, num, ext)
    print('Will save checkpoints to %s' % args.checkpoint_path)
    vocab = utils.load_vocab(args.vocab_json)
    if args.use_local_copies == 1:
        # Stage the HDF5 inputs on local disk for faster reads.
        shutil.copy(args.train_question_h5, '/tmp/train_questions.h5')
        shutil.copy(args.train_features_h5, '/tmp/train_features.h5')
        shutil.copy(args.val_question_h5, '/tmp/val_questions.h5')
        shutil.copy(args.val_features_h5, '/tmp/val_features.h5')
        args.train_question_h5 = '/tmp/train_questions.h5'
        args.train_features_h5 = '/tmp/train_features.h5'
        args.val_question_h5 = '/tmp/val_questions.h5'
        args.val_features_h5 = '/tmp/val_features.h5'
    question_families = None
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'question_families': question_families,
        'min_program_depth': args.min_program_depth,
        'max_program_depth': args.max_program_depth,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
    }
    with ClevrDataLoader(**val_loader_kwargs) as val_loader:
        val_acc = None
        if len(val_loader) > 0:
            val_acc = validation_procedure(args, val_loader)
        if val_acc is not None:
            depth_accs = dict()
            if os.path.exists(args.output_json):
                with open(args.output_json) as fd:
                    depth_accs = json.load(fd)
            # BUG FIX: json.load returns string keys, but the depth was an
            # int, so `program_depth not in depth_accs` was always True and
            # the duplicate check (and the warning below) never fired.
            program_depth = str(args.min_program_depth)
            if program_depth not in depth_accs:
                depth_accs[program_depth] = val_acc
                with open(args.output_json, 'w') as fd:
                    json.dump(depth_accs, fd)
            else:
                print(f'WARNING: depth {program_depth} is already in document.')
def main(args):
    """Training entry point: pins visible GPUs, optionally randomizes the
    checkpoint path, stages data locally, builds the two loaders and
    hands off to train_loop."""
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_visible
    if args.randomize_checkpoint_path == 1:  # default 0
        base, ext = os.path.splitext(args.checkpoint_path)
        args.checkpoint_path = '%s_%06d%s' % (base, random.randint(1, 1000000), ext)
    print('Will save checkpoints to %s' % args.checkpoint_path)

    vocab = utils.load_vocab(args.vocab_json)

    if args.use_local_copies == 1:  # default 0
        # Stage HDF5 inputs under /tmp and repoint args at the copies.
        shutil.copy(args.train_question_h5, '/tmp/train_questions.h5')
        shutil.copy(args.train_features_h5, '/tmp/train_features.h5')
        shutil.copy(args.val_question_h5, '/tmp/val_questions.h5')
        shutil.copy(args.val_features_h5, '/tmp/val_features.h5')
        args.train_question_h5 = '/tmp/train_questions.h5'
        args.train_features_h5 = '/tmp/train_features.h5'
        args.val_question_h5 = '/tmp/val_questions.h5'
        args.val_features_h5 = '/tmp/val_features.h5'

    question_families = None
    if args.family_split_file is not None:  # default None
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)

    # Settings shared by the train and val loaders.
    common_kwargs = {
        'vocab': vocab,
        'batch_size': args.batch_size,
        'question_families': question_families,  # None
        'num_workers': args.loader_num_workers,  # 1,default
    }
    train_loader_kwargs = dict(common_kwargs)
    train_loader_kwargs.update({
        'question_h5': args.train_question_h5,  # path
        'feature_h5': args.train_features_h5,
        'shuffle': args.shuffle_train_data == 1,
        'max_samples': args.num_train_samples,  # None
    })
    val_loader_kwargs = dict(common_kwargs)
    val_loader_kwargs.update({
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'max_samples': args.num_val_samples,
    })

    with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
            ClevrDataLoader(**val_loader_kwargs) as val_loader:
        train_loop(args, train_loader, val_loader)

    if args.use_local_copies == 1 and args.cleanup_local_copies == 1:  # 0
        for tmp_path in ('/tmp/train_questions.h5', '/tmp/train_features.h5',
                         '/tmp/val_questions.h5', '/tmp/val_features.h5'):
            os.remove(tmp_path)
def check_accuracy(args, program_generator, execution_engine, baseline_model,
                   loader):
    """Measure accuracy of the active model over `loader`.

    For model_type 'PG' the metric is exact-match accuracy of sampled
    programs against ground truth; for every other model type it is
    answer-classification accuracy. Models are flipped to eval mode for
    the measurement and back to train mode before returning.

    Returns:
        float: fraction of correct predictions.
    """
    set_mode('eval', [program_generator, execution_engine, baseline_model])
    num_correct, num_samples = 0, 0
    for batch in loader:
        questions, _, feats, answers, programs, _ = batch
        if isinstance(questions, list):
            questions = questions[0]
        questions_var = questions.to(device=args.device)
        feats_var = feats.to(device=args.device)
        # BUG FIX: answers_var was built from `feats` (copy-paste slip);
        # it must mirror the answers tensor.
        answers_var = answers.to(device=args.device)
        if programs[0] is not None:
            programs_var = Variable(programs.cuda(), volatile=True)

        scores = None  # Use this for everything but PG
        if args.model_type == 'PG':
            # Exact-match program accuracy, one question at a time.
            vocab = utils.load_vocab(args.vocab_json)
            for i in range(questions.size(0)):
                program_pred = program_generator.sample(
                    Variable(questions[i:i + 1].cuda(), volatile=True))
                program_pred_str = vr.preprocess.decode(
                    program_pred, vocab['program_idx_to_token'])
                program_str = vr.preprocess.decode(
                    programs[i], vocab['program_idx_to_token'])
                if program_pred_str == program_str:
                    num_correct += 1
                num_samples += 1
        elif args.model_type == 'EE':
            scores = execution_engine(feats_var, programs_var)
        elif args.model_type == 'PG+EE':
            programs_pred = program_generator.reinforce_sample(questions_var,
                                                               argmax=True)
            scores = execution_engine(feats_var, programs_pred)
        elif args.model_type == 'FiLM':
            programs_pred = program_generator(questions_var)
            scores = execution_engine(feats_var, programs_pred)
        elif args.model_type in ['LSTM', 'CNN+LSTM', 'CNN+LSTM+SA']:
            scores = baseline_model(questions_var, feats_var)

        if scores is not None:
            _, preds = scores.data.cpu().max(1)
            num_correct += (preds == answers).sum()
            num_samples += preds.size(0)
        if args.num_val_samples is not None and num_samples >= args.num_val_samples:
            break
    set_mode('train', [program_generator, execution_engine, baseline_model])
    acc = float(num_correct) / num_samples
    return acc
def get_execution_engine(args):
    """Construct the execution engine, or resume it from a checkpoint.

    Returns (module, kwargs); kwargs is the constructor-argument dict
    that gets stored in checkpoints for later reconstruction.
    """
    vocab = utils.load_vocab(args.vocab_json)
    if args.execution_engine_start_from is not None:
        # Resume from a saved checkpoint.
        ee, kwargs = utils.load_execution_engine(
            args.execution_engine_start_from, model_type=args.model_type)
    else:
        kwargs = {
            'vocab': vocab,
            'feature_dim': parse_int_list(args.feature_dim),
            'stem_batchnorm': args.module_stem_batchnorm == 1,
            'stem_num_layers': args.module_stem_num_layers,
            'module_dim': args.module_dim,
            'module_residual': args.module_residual == 1,
            'module_batchnorm': args.module_batchnorm == 1,
            'classifier_proj_dim': args.classifier_proj_dim,
            'classifier_downsample': args.classifier_downsample,
            'classifier_fc_layers': parse_int_list(args.classifier_fc_dims),
            'classifier_batchnorm': args.classifier_batchnorm == 1,
            'classifier_dropout': args.classifier_dropout,
        }
        if args.model_type == 'FiLM':
            # FiLM-specific constructor arguments.
            kwargs.update({
                'num_modules': args.num_modules,
                'stem_kernel_size': args.module_stem_kernel_size,
                'stem_stride': args.module_stem_stride,
                'stem_padding': args.module_stem_padding,
                'module_num_layers': args.module_num_layers,
                'module_batchnorm_affine': args.module_batchnorm_affine == 1,
                'module_dropout': args.module_dropout,
                'module_input_proj': args.module_input_proj,
                'module_kernel_size': args.module_kernel_size,
                'use_gamma': args.use_gamma == 1,
                'use_beta': args.use_beta == 1,
                'use_coords': args.use_coords,
                'debug_every': args.debug_every,
                'print_verbose_every': args.print_verbose_every,
                'condition_method': args.condition_method,
                'with_cbn': args.with_cbn,
                'final_resblock_with_cbn': args.final_resblock_with_cbn,
                'condition_pattern': parse_int_list(args.condition_pattern),
            })
            ee = FiLMedNet(**kwargs)
        else:
            ee = ModuleNet(**kwargs)
    ee.cuda()
    ee.train()
    return ee, kwargs
def get_execution_engine(args):
    """Build or resume the execution engine and return (module, kwargs).

    Any model_type starting with 'FiLM' yields a FiLMedNet; the
    'FiLM+ResNet0'/'FiLM+ResNet1' variants select a ResNet stem (fixed
    weights for ResNet0). All other model types yield a ModuleNet.
    """
    vocab = utils.load_vocab(args.vocab_json)
    resume_from = args.execution_engine_start_from
    if resume_from is not None:
        ee, kwargs = utils.load_execution_engine(
            resume_from, model_type=args.model_type)
    else:
        kwargs = dict(
            vocab=vocab,
            feature_dim=parse_int_list(args.feature_dim),
            stem_batchnorm=args.module_stem_batchnorm == 1,
            stem_num_layers=args.module_stem_num_layers,
            module_dim=args.module_dim,
            module_residual=args.module_residual == 1,
            module_batchnorm=args.module_batchnorm == 1,
            classifier_proj_dim=args.classifier_proj_dim,
            classifier_downsample=args.classifier_downsample,
            classifier_fc_layers=parse_int_list(args.classifier_fc_dims),
            classifier_batchnorm=args.classifier_batchnorm == 1,
            classifier_dropout=args.classifier_dropout,
        )
        if args.model_type.startswith('FiLM'):
            kwargs['num_modules'] = args.num_modules
            kwargs['stem_use_resnet'] = args.model_type in ('FiLM+ResNet1',
                                                            'FiLM+ResNet0')
            kwargs['stem_resnet_fixed'] = args.model_type == 'FiLM+ResNet0'
            kwargs['stem_kernel_size'] = args.module_stem_kernel_size
            kwargs['stem_stride2_freq'] = args.module_stem_stride2_freq
            kwargs['stem_padding'] = args.module_stem_padding
            kwargs['module_num_layers'] = args.module_num_layers
            kwargs['module_batchnorm_affine'] = args.module_batchnorm_affine == 1
            kwargs['module_dropout'] = args.module_dropout
            kwargs['module_input_proj'] = args.module_input_proj
            kwargs['module_kernel_size'] = args.module_kernel_size
            kwargs['use_gamma'] = args.use_gamma == 1
            kwargs['use_beta'] = args.use_beta == 1
            kwargs['use_coords'] = args.use_coords
            kwargs['debug_every'] = args.debug_every
            kwargs['print_verbose_every'] = args.print_verbose_every
            kwargs['condition_method'] = args.condition_method
            kwargs['condition_pattern'] = parse_int_list(args.condition_pattern)
            ee = FiLMedNet(**kwargs)
        else:
            ee = ModuleNet(**kwargs)
    # Move to GPU when one is present; otherwise stay on CPU.
    if torch.cuda.is_available():
        ee.cuda()
    else:
        ee.cpu()
    ee.train()
    return ee, kwargs
def get_program_generator(args):
    """Build (or resume) the program generator, optionally wrapping it in
    DataParallel, and return (module, kwargs).

    When resuming, the encoder vocabulary is expanded if the current
    vocab is larger than the checkpoint's.
    """
    vocab = utils.load_vocab(args.vocab_json)
    if args.program_generator_start_from is not None:  # it is None
        pg, kwargs = utils.load_program_generator(
            args.program_generator_start_from, model_type=args.model_type)
        cur_vocab_size = pg.encoder_embed.weight.size(0)
        if cur_vocab_size != len(vocab['question_token_to_idx']):
            print('Expanding vocabulary of program generator')
            pg.expand_encoder_vocab(vocab['question_token_to_idx'])
            kwargs['encoder_vocab_size'] = len(vocab['question_token_to_idx'])
    else:
        kwargs = {
            'encoder_vocab_size': len(vocab['question_token_to_idx']),
            'decoder_vocab_size': len(vocab['program_token_to_idx']),
            'wordvec_dim': args.rnn_wordvec_dim,
            'hidden_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,  # 0e-2
        }
        if args.model_type == 'FiLM':
            kwargs['parameter_efficient'] = args.program_generator_parameter_efficient == 1
            kwargs['output_batchnorm'] = args.rnn_output_batchnorm == 1
            kwargs['bidirectional'] = args.bidirectional == 1
            kwargs['encoder_type'] = args.encoder_type
            kwargs['decoder_type'] = args.decoder_type
            kwargs['gamma_option'] = args.gamma_option
            kwargs['gamma_baseline'] = args.gamma_baseline
            kwargs['num_modules'] = args.num_modules
            kwargs['module_num_layers'] = args.module_num_layers
            kwargs['module_dim'] = args.module_dim
            kwargs['debug_every'] = args.debug_every
            pg = FiLMGen(**kwargs)
        else:
            pg = Seq2Seq(**kwargs)
    pg.cuda()
    pg.encoder_rnn.flatten_parameters()
    if args.gpu_devices:
        gpu_id = parse_int_list(args.gpu_devices)
        pg = DataParallel(pg, device_ids=gpu_id)
    pg.train()
    # BUG FIX: `pg.module` exists only when DataParallel wrapped the
    # model; the previous unconditional access raised AttributeError on
    # the single-device path (no --gpu_devices).
    if isinstance(pg, DataParallel):
        pg.module.encoder_rnn.flatten_parameters()
    return pg, kwargs
def get_program_generator(args):
    """Build or resume the program generator and return (module, kwargs).

    model_type values starting with 'FiLM' produce a FiLMGen (the
    'FiLM+BoW' variant forces a bag-of-words encoder); anything else
    produces a Seq2Seq generator.
    """
    vocab = utils.load_vocab(args.vocab_json)
    question_vocab_size = len(vocab['question_token_to_idx'])
    resume_from = args.program_generator_start_from
    if resume_from is not None:
        pg, kwargs = utils.load_program_generator(
            resume_from, model_type=args.model_type)
        if pg.encoder_embed.weight.size(0) != question_vocab_size:
            # The current vocab grew since the checkpoint was written.
            print('Expanding vocabulary of program generator')
            pg.expand_encoder_vocab(vocab['question_token_to_idx'])
            kwargs['encoder_vocab_size'] = question_vocab_size
    else:
        kwargs = {
            'encoder_vocab_size': question_vocab_size,
            'decoder_vocab_size': len(vocab['program_token_to_idx']),
            'wordvec_dim': args.rnn_wordvec_dim,
            'hidden_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
        }
        if args.model_type.startswith('FiLM'):
            kwargs.update({
                'parameter_efficient': args.program_generator_parameter_efficient == 1,
                'output_batchnorm': args.rnn_output_batchnorm == 1,
                'bidirectional': args.bidirectional == 1,
                'encoder_type': args.encoder_type,
                'decoder_type': args.decoder_type,
                'gamma_option': args.gamma_option,
                'gamma_baseline': args.gamma_baseline,
                'num_modules': args.num_modules,
                'module_num_layers': args.module_num_layers,
                'module_dim': args.module_dim,
                'debug_every': args.debug_every,
            })
            if args.model_type == 'FiLM+BoW':
                kwargs['encoder_type'] = 'bow'
            pg = FiLMGen(**kwargs)
        else:
            pg = Seq2Seq(**kwargs)
    # Move to GPU when available, otherwise stay on CPU.
    if torch.cuda.is_available():
        pg.cuda()
    else:
        pg.cpu()
    pg.train()
    return pg, kwargs
def rewrite_programs(src_dir, dst_dir):
    """Rewrite program annotations from `src_dir` into `dst_dir`, adding
    shortcut arguments to the functions that need them.

    Writes an updated vocab.json (filter-like functions get arity 2) and,
    for each of the train/val/test splits, a copy of the questions HDF5
    file whose 'programs' dataset is replaced by the shortcut-augmented
    programs (re-padded to the new maximum program length).
    """
    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)
    vocab = load_vocab(os.path.join(src_dir, 'vocab.json'))
    # Keep a pristine copy: add_shortcuts below needs the pre-edit vocab.
    old_vocab = copy.deepcopy(vocab)
    arity = vocab['program_token_arity']
    # Step 1: change the arity of filters
    for func in arity.keys():
        if needs_shortcut(func):
            arity[func] = 2
    with open(os.path.join(dst_dir, 'vocab.json'), 'w') as dst:
        json.dump(vocab, dst)
    for part in ['train', 'val', 'test']:
        src_questions = "{}/{}_questions.h5".format(src_dir, part)
        dst_questions = "{}/{}_questions.h5".format(dst_dir, part)
        # FIX: open the source read-only. h5py's historical default mode
        # ('a') could lock or modify the source file, and omitting the
        # mode is deprecated in recent h5py releases.
        with h5py.File(src_questions, 'r') as src_file:
            programs = src_file['programs']
            prog_wshortcuts = []
            for i in range(len(programs)):
                prog_wshortcuts.append(add_shortcuts(programs[i], old_vocab)[1])
        new_max_program_len = max(len(p) for p in prog_wshortcuts)
        shutil.copyfile(src_questions, dst_questions)
        with h5py.File(dst_questions, 'a') as dst_file:
            # Replace the dataset wholesale; shapes may have changed.
            del dst_file['programs']
            program_dataset = dst_file.create_dataset(
                'programs', (len(prog_wshortcuts), new_max_program_len),
                dtype=numpy.int64)
            for i in range(len(prog_wshortcuts)):
                program_dataset[i, :len(prog_wshortcuts[i])] = prog_wshortcuts[i]
def train_loop(args, train_loader, val_loader):
    # Main training loop: builds the model(s) selected by args.model_type,
    # iterates over train_loader until args.num_iterations steps, records
    # losses, and periodically checkpoints (saving best-so-far weights by
    # validation accuracy).
    vocab = utils.load_vocab(args.vocab_json)
    program_generator, pg_kwargs, pg_optimizer = None, None, None
    execution_engine, ee_kwargs, ee_optimizer = None, None, None
    baseline_model, baseline_kwargs, baseline_optimizer = None, None, None
    baseline_type = None
    # NOTE(review): these three are never read again; the checkpoint code
    # below uses best_pg_state / best_ee_state / best_baseline_state,
    # which are first assigned inside the val-accuracy-improved branch.
    pg_best_state, ee_best_state, baseline_best_state = None, None, None

    # Set up model
    optim_method = getattr(torch.optim, args.optimizer)
    if args.model_type in ['FiLM', 'PG', 'PG+EE']:
        # These model types need a program generator (conditioning net).
        program_generator, pg_kwargs = get_program_generator(args)
        pg_optimizer = optim_method(program_generator.parameters(),
                                    lr=args.learning_rate,
                                    weight_decay=args.weight_decay)
        print('Here is the conditioning network:')
        print(program_generator)
    if args.model_type in ['FiLM', 'EE', 'PG+EE']:
        # These model types need an execution engine (conditioned net).
        execution_engine, ee_kwargs = get_execution_engine(args)
        ee_optimizer = optim_method(execution_engine.parameters(),
                                    lr=args.learning_rate,
                                    weight_decay=args.weight_decay)
        print('Here is the conditioned network:')
        print(execution_engine)
    if args.model_type in ['LSTM', 'CNN+LSTM', 'CNN+LSTM+SA']:
        baseline_model, baseline_kwargs = get_baseline_model(args)
        params = baseline_model.parameters()
        if args.baseline_train_only_rnn == 1:
            # Optionally freeze everything except the RNN.
            params = baseline_model.rnn.parameters()
        baseline_optimizer = optim_method(params,
                                          lr=args.learning_rate,
                                          weight_decay=args.weight_decay)
        print('Here is the baseline model')
        print(baseline_model)
        baseline_type = args.model_type
    loss_fn = torch.nn.CrossEntropyLoss().cuda()

    # Running statistics; also dumped into every checkpoint.
    stats = {
        'train_losses': [], 'train_rewards': [], 'train_losses_ts': [],
        'train_accs': [], 'val_accs': [], 'val_accs_ts': [],
        'best_val_acc': -1, 'model_t': 0,
    }
    t, epoch, reward_moving_average = 0, 0, 0
    set_mode('train', [program_generator, execution_engine, baseline_model])
    print('train_loader has %d samples' % len(train_loader.dataset))
    print('val_loader has %d samples' % len(val_loader.dataset))

    # Timing bookkeeping (only reported when args.time == 1).
    num_checkpoints = 0
    epoch_start_time = 0.0
    epoch_total_time = 0.0
    train_pass_total_time = 0.0
    val_pass_total_time = 0.0
    running_loss = 0.0
    while t < args.num_iterations:
        if (epoch > 0) and (args.time == 1):
            epoch_time = time.time() - epoch_start_time
            epoch_total_time += epoch_time
            print(colored('EPOCH PASS AVG TIME: ' + str(epoch_total_time / epoch), 'white'))
            print(colored('Epoch Pass Time : ' + str(epoch_time), 'white'))
        epoch_start_time = time.time()
        epoch += 1
        print('Starting epoch %d' % epoch)
        for batch in train_loader:
            t += 1
            questions, _, feats, answers, programs, _ = batch
            if isinstance(questions, list):
                questions = questions[0]
            questions_var = Variable(questions.cuda())
            feats_var = Variable(feats.cuda())
            answers_var = Variable(answers.cuda())
            if programs[0] is not None:
                programs_var = Variable(programs.cuda())
            reward = None
            if args.model_type == 'PG':
                # Train program generator with ground-truth programs
                pg_optimizer.zero_grad()
                loss = program_generator(questions_var, programs_var)
                loss.backward()
                pg_optimizer.step()
            elif args.model_type == 'EE':
                # Train execution engine with ground-truth programs
                ee_optimizer.zero_grad()
                scores = execution_engine(feats_var, programs_var)
                loss = loss_fn(scores, answers_var)
                loss.backward()
                ee_optimizer.step()
            elif args.model_type in ['LSTM', 'CNN+LSTM', 'CNN+LSTM+SA']:
                baseline_optimizer.zero_grad()
                baseline_model.zero_grad()
                scores = baseline_model(questions_var, feats_var)
                loss = loss_fn(scores, answers_var)
                loss.backward()
                baseline_optimizer.step()
            elif args.model_type == 'PG+EE':
                # Joint training: sample programs, score answers, and use
                # REINFORCE with a moving-average baseline for the PG.
                programs_pred = program_generator.reinforce_sample(
                    questions_var)
                scores = execution_engine(feats_var, programs_pred)
                loss = loss_fn(scores, answers_var)
                _, preds = scores.data.cpu().max(1)
                raw_reward = (preds == answers).float()
                reward_moving_average *= args.reward_decay
                reward_moving_average += ((1.0 - args.reward_decay)
                                          * raw_reward.mean())
                centered_reward = raw_reward - reward_moving_average
                if args.train_execution_engine == 1:
                    ee_optimizer.zero_grad()
                    loss.backward()
                    ee_optimizer.step()
                if args.train_program_generator == 1:
                    pg_optimizer.zero_grad()
                    program_generator.reinforce_backward(
                        centered_reward.cuda())
                    pg_optimizer.step()
            elif args.model_type == 'FiLM':
                if args.set_execution_engine_eval == 1:
                    set_mode('eval', [execution_engine])
                programs_pred = program_generator(questions_var)
                scores = execution_engine(feats_var, programs_pred)
                loss = loss_fn(scores, answers_var)
                pg_optimizer.zero_grad()
                ee_optimizer.zero_grad()
                if args.debug_every <= -2:
                    pdb.set_trace()
                loss.backward()
                if args.debug_every < float('inf'):
                    # Sanity-check gradients for NaNs when debugging is on.
                    check_grad_num_nans(execution_engine, 'FiLMedNet')
                    check_grad_num_nans(program_generator, 'FiLMGen')
                if args.train_program_generator == 1:
                    if args.grad_clip > 0:
                        torch.nn.utils.clip_grad_norm(
                            program_generator.parameters(), args.grad_clip)
                    pg_optimizer.step()
                if args.train_execution_engine == 1:
                    if args.grad_clip > 0:
                        torch.nn.utils.clip_grad_norm(
                            execution_engine.parameters(), args.grad_clip)
                    ee_optimizer.step()

            # Loss is accumulated every step and flushed into stats every
            # record_loss_every steps.
            # NOTE(review): loss.data[0] is the pre-0.4 PyTorch idiom for
            # extracting a scalar; newer torch would need loss.item().
            if t % args.record_loss_every == 0:
                running_loss += loss.data[0]
                avg_loss = running_loss / args.record_loss_every
                print(t, avg_loss)
                stats['train_losses'].append(avg_loss)
                stats['train_losses_ts'].append(t)
                if reward is not None:
                    stats['train_rewards'].append(reward)
                running_loss = 0.0
            else:
                running_loss += loss.data[0]

            if t % args.checkpoint_every == 0:
                # Periodic evaluation + checkpointing.
                num_checkpoints += 1
                print('Checking training accuracy ... ')
                start = time.time()
                train_acc = check_accuracy(args, program_generator,
                                           execution_engine, baseline_model,
                                           train_loader)
                if args.time == 1:
                    train_pass_time = (time.time() - start)
                    train_pass_total_time += train_pass_time
                    print(colored('TRAIN PASS AVG TIME: ' + str(train_pass_total_time / num_checkpoints), 'red'))
                    print(colored('Train Pass Time : ' + str(train_pass_time), 'red'))
                print('train accuracy is', train_acc)
                print('Checking validation accuracy ...')
                start = time.time()
                val_acc = check_accuracy(args, program_generator,
                                         execution_engine, baseline_model,
                                         val_loader)
                if args.time == 1:
                    val_pass_time = (time.time() - start)
                    val_pass_total_time += val_pass_time
                    print(colored('VAL PASS AVG TIME: ' + str(val_pass_total_time / num_checkpoints), 'cyan'))
                    print(colored('Val Pass Time : ' + str(val_pass_time), 'cyan'))
                print('val accuracy is ', val_acc)
                stats['train_accs'].append(train_acc)
                stats['val_accs'].append(val_acc)
                stats['val_accs_ts'].append(t)
                if val_acc > stats['best_val_acc']:
                    # Capture the weights of the best model seen so far.
                    stats['best_val_acc'] = val_acc
                    stats['model_t'] = t
                    best_pg_state = get_state(program_generator)
                    best_ee_state = get_state(execution_engine)
                    best_baseline_state = get_state(baseline_model)
                # Save the full checkpoint (weights + config + stats)
                # with torch.save, then a weight-free JSON copy alongside.
                checkpoint = {
                    'args': args.__dict__,
                    'program_generator_kwargs': pg_kwargs,
                    'program_generator_state': best_pg_state,
                    'execution_engine_kwargs': ee_kwargs,
                    'execution_engine_state': best_ee_state,
                    'baseline_kwargs': baseline_kwargs,
                    'baseline_state': best_baseline_state,
                    'baseline_type': baseline_type,
                    'vocab': vocab
                }
                for k, v in stats.items():
                    checkpoint[k] = v
                print('Saving checkpoint to %s' % args.checkpoint_path)
                torch.save(checkpoint, args.checkpoint_path)
                del checkpoint['program_generator_state']
                del checkpoint['execution_engine_state']
                del checkpoint['baseline_state']
                with open(args.checkpoint_path + '.json', 'w') as f:
                    json.dump(checkpoint, f)

            if t == args.num_iterations:
                break
def check_accuracy(args, scene2action_model, program_generator,
                   execution_engine, baseline_model, loader):  ##### Modified #####
    """Compute accuracy of the current models over `loader`.

    Before scoring, each example's feature map is augmented by rolling out the
    scene2action policy: the model repeatedly picks a viewpoint (action) and the
    corresponding auxiliary features are added to the scene until it emits
    `args.end_action_index` or `args.maximum_action_number` steps elapse.

    Args:
        args: parsed command-line namespace (model_type, end_action_index, ...).
        scene2action_model: policy network mapping (scene, question) -> action.
        program_generator / execution_engine / baseline_model: any may be None
            depending on args.model_type; only the relevant ones are called.
        loader: yields (questions, _, feats, feats_aux, answers, programs, _).

    Returns:
        float accuracy in [0, 1] (program-string match for 'PG', answer match
        otherwise). Returns 0.0 if the loader yields no samples.
    """
    set_mode('eval', [
        scene2action_model, program_generator, execution_engine, baseline_model
    ])
    num_correct, num_samples = 0, 0
    for batch in loader:
        ##### Modified #####
        questions, _, feats, feats_aux, answers, programs, _ = batch
        if isinstance(questions, list):
            questions = questions[0]
        questions_var = Variable(questions.cuda(), volatile=True)
        feats_var = Variable(feats.cuda(), volatile=True)
        ##### Modified #####
        feats_var_aux = Variable(feats_aux.cuda(), volatile=True)
        # BUG FIX: previously wrapped `feats` here, so answers_var silently held
        # the feature tensor instead of the ground-truth answers.
        answers_var = Variable(answers.cuda(), volatile=True)
        if programs[0] is not None:
            # NOTE(review): programs_var is only bound when ground-truth
            # programs exist; the 'EE' branch below assumes they do.
            programs_var = Variable(programs.cuda(), volatile=True)

        ##### Modified #####------------------------------------------
        ### Preprocess batch: roll out the scene2action policy per example ###
        for turn in range(feats_var.size(0)):
            current_action = -1
            current_count = 0
            current_scene = feats_var[turn]
            current_question = questions_var[turn]
            while ((current_action != args.end_action_index)
                   and (current_count < args.maximum_action_number)):
                current_count = current_count + 1
                current_action, _ = scene2action_model(current_scene,
                                                       current_question)
                if (current_action != args.end_action_index):
                    if (current_action == 0):
                        # Action 0 re-adds the original viewpoint's features.
                        current_scene = current_scene + feats_var[turn]
                    else:
                        # Actions 1..K select auxiliary viewpoint K-1.
                        current_scene = current_scene + feats_var_aux[turn][
                            current_action - 1]
            ### Feed the accumulated scene to the VQA model ###
            feats_var[turn] = current_scene
        ### Preprocess batch ended ###
        ##### Modified #####------------------------------------------

        scores = None  # Used for everything but PG.
        if args.model_type == 'PG':
            # PG accuracy = exact match of decoded predicted vs. GT program.
            vocab = utils.load_vocab(args.vocab_json)
            for i in range(questions.size(0)):
                program_pred = program_generator.sample(
                    Variable(questions[i:i + 1].cuda(), volatile=True))
                program_pred_str = vr.preprocess.decode(
                    program_pred, vocab['program_idx_to_token'])
                program_str = vr.preprocess.decode(
                    programs[i], vocab['program_idx_to_token'])
                if program_pred_str == program_str:
                    num_correct += 1
                num_samples += 1
        elif args.model_type == 'EE':
            scores = execution_engine(feats_var, programs_var)
        elif args.model_type == 'PG+EE':
            programs_pred = program_generator.reinforce_sample(questions_var,
                                                               argmax=True)
            scores = execution_engine(feats_var, programs_pred)
        elif args.model_type == 'FiLM':
            programs_pred = program_generator(questions_var)
            scores = execution_engine(feats_var, programs_pred)
        elif args.model_type in ['LSTM', 'CNN+LSTM', 'CNN+LSTM+SA']:
            scores = baseline_model(questions_var, feats_var)

        if scores is not None:
            _, preds = scores.data.cpu().max(1)
            num_correct += (preds == answers).sum()
            num_samples += preds.size(0)
        if args.num_val_samples is not None and num_samples >= args.num_val_samples:
            break

    set_mode('train', [
        scene2action_model, program_generator, execution_engine, baseline_model
    ])
    # Guard against an empty loader so we fail soft instead of dividing by zero.
    if num_samples == 0:
        return 0.0
    acc = float(num_correct) / num_samples
    return acc
def train_loop(args, train_loader, val_loader, writer):
    """Main training loop: trains the scene2action policy (REINFORCE-style,
    per example) alongside one of the VQA model families selected by
    args.model_type (PG / EE / PG+EE / FiLM / LSTM baselines).

    Logs losses and rollout statistics to the TensorBoard `writer` and saves a
    full checkpoint (all sub-model states + stats) every args.checkpoint_every
    iterations.
    """
    vocab = utils.load_vocab(args.vocab_json)
    program_generator, pg_kwargs, pg_optimizer = None, None, None
    execution_engine, ee_kwargs, ee_optimizer = None, None, None
    baseline_model, baseline_kwargs, baseline_optimizer = None, None, None
    baseline_type = None
    pg_best_state, ee_best_state, baseline_best_state = None, None, None

    # Set up model
    optim_method = getattr(torch.optim, args.optimizer)
    ##### Modified #####
    # Scene-to-action policy network: picks which auxiliary viewpoint's
    # features to merge into the scene before the VQA model runs.
    scene2action_model, scene2action_optimizer = None, None
    s2a_kwargs = {
        'feat_dim': args.scene2action_feat_dim,
        'hidden_dim': args.scene2action_hidden_dim,
        'action_dim': args.scene2action_action_dim,
        'dropout': args.scene2action_dropout,
        'word_vocab_size': args.scene2action_word_vocab_size,
        'word_embed_size': args.scene2action_word_embed_size,
        'lstm_hidden_size': args.scene2action_lstm_hidden_size,
        'lstm_num_layers': args.scene2action_lstm_num_layers,
    }
    scene2action_model = Scene2Action(**s2a_kwargs)
    scene2action_model.cuda()
    scene2action_model.train()
    scene2action_optimizer = optim_method(scene2action_model.parameters(),
                                          lr=args.learning_rate,
                                          weight_decay=args.weight_decay)
    print("Here is the scene to action network: ")
    print(scene2action_model)
    if args.model_type in ['FiLM', 'PG', 'PG+EE']:
        program_generator, pg_kwargs = get_program_generator(args)
        pg_optimizer = optim_method(program_generator.parameters(),
                                    lr=args.learning_rate,
                                    weight_decay=args.weight_decay)
        print('Here is the conditioning network:')
        print(program_generator)
    if args.model_type in ['FiLM', 'EE', 'PG+EE']:
        execution_engine, ee_kwargs = get_execution_engine(args)
        ee_optimizer = optim_method(execution_engine.parameters(),
                                    lr=args.learning_rate,
                                    weight_decay=args.weight_decay)
        print('Here is the conditioned network:')
        print(execution_engine)
    if args.model_type in ['LSTM', 'CNN+LSTM', 'CNN+LSTM+SA']:
        baseline_model, baseline_kwargs = get_baseline_model(args)
        params = baseline_model.parameters()
        if args.baseline_train_only_rnn == 1:
            # Optionally freeze everything but the RNN in the baseline.
            params = baseline_model.rnn.parameters()
        baseline_optimizer = optim_method(params,
                                          lr=args.learning_rate,
                                          weight_decay=args.weight_decay)
        print('Here is the baseline model')
        print(baseline_model)
        baseline_type = args.model_type

    loss_fn = torch.nn.CrossEntropyLoss().cuda()
    ##### Modified #####
    # NOTE(review): loss_s2a is declared but never used below; the policy loss
    # is built manually from the per-step VQA losses instead.
    loss_s2a = torch.nn.CrossEntropyLoss().cuda()
    ##### Modified #####
    stats = {
        'train_losses': [],
        'train_rewards': [],
        'train_losses_ts': [],
        'train_accs': [],
        'val_accs': [],
        'val_accs_ts': [],
        'best_val_acc': -1,
        'model_t': 0,
        'train_losses_s2a': [],
    }

    t, epoch, reward_moving_average = 0, 0, 0
    ##### Modified #####
    set_mode('train', [
        scene2action_model, program_generator, execution_engine, baseline_model
    ])
    print('train_loader has %d samples' % len(train_loader.dataset))
    print('val_loader has %d samples' % len(val_loader.dataset))
    num_checkpoints = 0
    epoch_start_time = 0.0
    epoch_total_time = 0.0
    train_pass_total_time = 0.0
    val_pass_total_time = 0.0
    running_loss = 0.0
    while t < args.num_iterations:
        if (epoch > 0) and (args.time == 1):
            epoch_time = time.time() - epoch_start_time
            epoch_total_time += epoch_time
            print(
                colored(
                    'EPOCH PASS AVG TIME: ' + str(epoch_total_time / epoch),
                    'white'))
            print(colored('Epoch Pass Time : ' + str(epoch_time), 'white'))
        epoch_start_time = time.time()
        epoch += 1
        print('Starting epoch %d' % epoch)
        for batch in train_loader:
            t += 1
            print(
                "Current batch " + str(t)
            )  #------------------------------------------------------------------------------------#
            ##### Modified #####
            questions, _, feats, feats_aux, answers, programs, _ = batch
            if isinstance(questions, list):
                questions = questions[0]
            questions_var = Variable(questions.cuda())
            feats_var = Variable(feats.cuda())
            ##### Modified ####
            feats_var_aux = Variable(feats_aux.cuda())
            answers_var = Variable(answers.cuda())
            #print("answers var 0 " + str(answers_var.size(0)))  ### 64
            if programs[0] is not None:
                programs_var = Variable(programs.cuda())
            reward = None
            ##### Modified #####---------------------------------------------------------
            ### Preprocess batch started ###
            # For each data in the current batch (step-by-step): roll out the
            # scene2action policy, using the frozen VQA model's loss at each
            # step as a (negative) reward, then update the policy.
            avg_action_length = 0.0
            total_repeated_rate = 0.0
            count_turn = 0
            for turn in range(feats_var.size(0)):
                #print("Current turn " + str(turn))  #------------------------------------------------------------------------------------#
                # VQA model is frozen while scoring rollout steps.
                set_mode('eval', [program_generator, execution_engine])
                current_action = -1
                current_count = 0
                current_scene = feats_var[turn]
                current_question = questions_var[turn]
                current_answer = answers_var[turn]
                actions = []
                flag = 0
                while ((current_action != args.end_action_index)
                       and (current_count < args.maximum_action_number)):
                    current_count += 1
                    current_action, action_propability = scene2action_model(
                        current_scene, current_question)
                    actions.append(current_action.item())
                    #print("Current_action " + str(current_action))
                    #print("action_propability " + str(action_propability))
                    #print("current action after " + str(current_action))
                    if (current_action != args.end_action_index):
                        if (current_action == 0):
                            current_scene = current_scene + feats_var[turn]
                        else:
                            current_scene = current_scene + feats_var_aux[
                                turn][current_action - 1]
                    ##### To check #####
                    temp_question = current_question.view(
                        1, -1).clone()  ##### To check #####
                    programs_pred = program_generator(temp_question)
                    # assumes feature maps are (256, 16, 16) — TODO confirm.
                    current_scene = current_scene.view(
                        1, 256, 16, 16)  ##### To check #####
                    scores = execution_engine(current_scene, programs_pred)
                    current_answer = current_answer.view(
                        -1)  ##### To check #####
                    loss_current = loss_fn(scores, current_answer)
                    # Accumulate per-step VQA loss as the step's (minus) reward.
                    # NOTE(review): going through .data.cpu().numpy() detaches
                    # these tensors from the autograd graph — verify gradients
                    # actually reach scene2action_model.
                    if (flag == 0):
                        Minus_reward_currentturn = torch.zeros(1, 1)
                        Minus_reward_currentturn[0][0] = torch.from_numpy(
                            np.array(loss_current.data.cpu().numpy()))
                    else:
                        temp = torch.zeros(1, 1)
                        temp[0][0] = torch.from_numpy(
                            np.array(loss_current.data.cpu().numpy()))
                        Minus_reward_currentturn = torch.cat(
                            (Minus_reward_currentturn, temp), 0)
                    ### ??? ###
                    # Accumulate the log-probability (logit) of each action.
                    if (flag == 0):
                        Logits_currentturn = torch.zeros(1, 1)
                        Logits_currentturn[0][0] = torch.from_numpy(
                            np.array(action_propability.data.cpu().numpy()))
                        flag = flag + 1
                    else:
                        temp = torch.zeros(1, 1)
                        temp[0][0] = torch.from_numpy(
                            np.array(action_propability.data.cpu().numpy()))
                        Logits_currentturn = torch.cat(
                            (Logits_currentturn, temp), 0)
                    ##### To check #####
                    #print("Current Action " + str(current_action))
                #------------------------------------------------------------------------------------#
                # Discounted/normalized returns for the rollout.
                vt = decay_normalize_loss(Minus_reward_currentturn,
                                          args.scene2action_gamma)
                Logits_currentturn = Logits_currentturn.cuda()
                vt = vt.cuda()
                current_route_length = torch.tensor(len(actions))
                current_route_length_cuda = current_route_length.cuda()
                ##### Modified #####
                ### Add route loss here ### route_loss_rate
                # Policy loss = mean(logit * return) + penalty on route length.
                loss_current_s2a = torch.mean(
                    Logits_currentturn *
                    vt) + args.route_loss_rate * current_route_length_cuda
                loss_current_s2a.requires_grad = True
                scene2action_optimizer.zero_grad()
                loss_current_s2a = loss_current_s2a.cuda()
                loss_current_s2a.backward()
                scene2action_optimizer.step()
                # Replace this example's features with the accumulated scene.
                feats_var[turn] = current_scene
                # Rollout statistics: average length and repeated-action rate.
                avg_action_length = avg_action_length + len(actions)
                repeated_actions = 0
                for a1 in range(len(actions) - 1):
                    for a2 in range(a1 + 1, len(actions)):
                        if actions[a1] == actions[a2]:
                            repeated_actions = repeated_actions + 1
                            break
                total_repeated_rate = total_repeated_rate + (
                    repeated_actions) / (float)(len(actions))
                count_turn = count_turn + 1
            # Logs the last example's policy loss for this batch.
            writer.add_scalar('scene2action_loss', loss_current_s2a.item(), t)
            ### Add avg length, no_repeated rate
            writer.add_scalar('avg_route_length',
                              avg_action_length / (float)(count_turn), t)
            writer.add_scalar('avg_repeat_rate',
                              total_repeated_rate / (float)(count_turn), t)
            ### Turn actions to image ###
            # Visualize the last rollout's action sequence as a small image
            # (assumes action indices < 13 and at most 13 steps — TODO confirm).
            image_ = torch.zeros(3, 13, 13)
            print("actions ----------------------- ")
            print(actions)
            for ii_ in range(len(actions)):
                image_[0][actions[ii_]][ii_] = 0
                image_[1][actions[ii_]][ii_] = 1
                image_[2][actions[ii_]][ii_] = 1
            writer.add_image('selected_viewpoints', image_, t)
            set_mode('train', [
                scene2action_model, program_generator, execution_engine,
                baseline_model
            ])
            #exit()
            ### Preprocess batch ended ###
            ##### Modified #####---------------------------------------------------------
            if args.model_type == 'PG':
                # Train program generator with ground-truth programs
                pg_optimizer.zero_grad()
                loss = program_generator(questions_var, programs_var)
                loss.backward()
                pg_optimizer.step()
            elif args.model_type == 'EE':
                # Train execution engine with ground-truth programs
                ee_optimizer.zero_grad()
                scores = execution_engine(feats_var, programs_var)
                loss = loss_fn(scores, answers_var)
                loss.backward()
                ee_optimizer.step()
            elif args.model_type in ['LSTM', 'CNN+LSTM', 'CNN+LSTM+SA']:
                baseline_optimizer.zero_grad()
                baseline_model.zero_grad()
                scores = baseline_model(questions_var, feats_var)
                loss = loss_fn(scores, answers_var)
                loss.backward()
                baseline_optimizer.step()
            elif args.model_type == 'PG+EE':
                # REINFORCE for the program generator, supervised for the EE.
                programs_pred = program_generator.reinforce_sample(
                    questions_var)
                scores = execution_engine(feats_var, programs_pred)
                loss = loss_fn(scores, answers_var)
                _, preds = scores.data.cpu().max(1)
                raw_reward = (preds == answers).float()
                reward_moving_average *= args.reward_decay
                reward_moving_average += (
                    1.0 - args.reward_decay) * raw_reward.mean()
                centered_reward = raw_reward - reward_moving_average
                if args.train_execution_engine == 1:
                    ee_optimizer.zero_grad()
                    loss.backward()
                    ee_optimizer.step()
                if args.train_program_generator == 1:
                    pg_optimizer.zero_grad()
                    program_generator.reinforce_backward(
                        centered_reward.cuda())
                    pg_optimizer.step()
            elif args.model_type == 'FiLM':
                if args.set_execution_engine_eval == 1:
                    set_mode('eval', [execution_engine])
                programs_pred = program_generator(questions_var)
                scores = execution_engine(feats_var, programs_pred)
                loss = loss_fn(scores, answers_var)
                pg_optimizer.zero_grad()
                ee_optimizer.zero_grad()
                if args.debug_every <= -2:
                    pdb.set_trace()
                loss.backward()
                if args.debug_every < float('inf'):
                    check_grad_num_nans(execution_engine, 'FiLMedNet')
                    check_grad_num_nans(program_generator, 'FiLMGen')
                if args.train_program_generator == 1:
                    if args.grad_clip > 0:
                        torch.nn.utils.clip_grad_norm(
                            program_generator.parameters(), args.grad_clip)
                    pg_optimizer.step()
                if args.train_execution_engine == 1:
                    if args.grad_clip > 0:
                        torch.nn.utils.clip_grad_norm(
                            execution_engine.parameters(), args.grad_clip)
                    ee_optimizer.step()

            if t % args.record_loss_every == 0:
                running_loss += loss.data.item()
                avg_loss = running_loss / args.record_loss_every
                print(t, avg_loss)
                stats['train_losses'].append(avg_loss)
                stats['train_losses_ts'].append(t)
                ##### Modified #####
                # NOTE(review): appends the loss *tensor*, not .item() —
                # this also lands in the torch.save'd checkpoint below.
                stats['train_losses_s2a'].append(loss_current_s2a)
                if reward is not None:
                    stats['train_rewards'].append(reward)
                running_loss = 0.0
            else:
                running_loss += loss.data.item()
            writer.add_scalar('train_losses', loss.data.item(), t)

            if t % args.checkpoint_every == 0:
                num_checkpoints += 1
                print('Checking training accuracy ... ')
                start = time.time()
                """ ##### Modified #####
                train_acc = check_accuracy(args, scene2action_model, program_generator, execution_engine, baseline_model, train_loader)
                if args.time == 1:
                    train_pass_time = (time.time() - start)
                    train_pass_total_time += train_pass_time
                    print(colored('TRAIN PASS AVG TIME: ' + str(train_pass_total_time / num_checkpoints), 'red'))
                    print(colored('Train Pass Time : ' + str(train_pass_time), 'red'))
                print('train accuracy is', train_acc)
                print('Checking validation accuracy ...')
                start = time.time()
                ##### Modified ##### """
                val_acc = check_accuracy(args, scene2action_model,
                                         program_generator, execution_engine,
                                         baseline_model, val_loader)
                if args.time == 1:
                    val_pass_time = (time.time() - start)
                    val_pass_total_time += val_pass_time
                    print(
                        colored(
                            'VAL PASS AVG TIME: ' +
                            str(val_pass_total_time / num_checkpoints),
                            'cyan'))
                    print(
                        colored('Val Pass Time : ' + str(val_pass_time),
                                'cyan'))
                print('val accuracy is ', val_acc)
                ### Val_acc
                writer.add_scalar('val_acc', val_acc, t)
                """ 
                stats['train_accs'].append(train_acc)
                stats['val_accs'].append(val_acc)
                stats['val_accs_ts'].append(t)
                #if val_acc > stats['best_val_acc']:
                stats['best_val_acc'] = val_acc
                stats['model_t'] = t
                """
                ##### Modified #####
                # Checkpoint is saved unconditionally every interval (the
                # best-val-acc gating above is commented out).
                best_scene2action_state = get_state(scene2action_model)
                best_pg_state = get_state(program_generator)
                best_ee_state = get_state(execution_engine)
                best_baseline_state = get_state(baseline_model)
                ##### Modified #####
                checkpoint = {
                    'args': args.__dict__,
                    'scene2action_kwargs': s2a_kwargs,
                    'scene2action_state': best_scene2action_state,
                    'program_generator_kwargs': pg_kwargs,
                    'program_generator_state': best_pg_state,
                    'execution_engine_kwargs': ee_kwargs,
                    'execution_engine_state': best_ee_state,
                    'baseline_kwargs': baseline_kwargs,
                    'baseline_state': best_baseline_state,
                    'baseline_type': baseline_type,
                    'vocab': vocab
                }
                for k, v in stats.items():
                    checkpoint[k] = v
                print('Saving checkpoint to %s' % args.checkpoint_path)
                torch.save(checkpoint,
                           args.checkpoint_path + '_' + str(t) + '.pt')
                ##### Modified #####
                # Strip bulky model states before any JSON dump (kept disabled).
                del checkpoint['scene2action_state']
                del checkpoint['program_generator_state']
                del checkpoint['execution_engine_state']
                del checkpoint['baseline_state']
                #with open(args.checkpoint_path + '.json', 'w') as f:
                #    json.dump(checkpoint, f)

            if t == args.num_iterations:
                break
def main(args):
    """Training entry point: seeds for reproducibility, sets up TensorBoard
    logging, optionally stages the HDF5 data locally, builds train/val
    loaders, and runs train_loop.
    """
    torch.autograd.set_detect_anomaly(True)
    # for reproducibility
    torch.cuda.set_device(args.device_idd)  # NOTE(review): 'device_idd' matches the arg name used elsewhere — confirm
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    # BUG FIX: was 'torch.backends.cudnn.benckmark' (typo), which created a
    # junk attribute and left cuDNN benchmarking untouched, defeating the
    # determinism setup above.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    if args.randomize_checkpoint_path == 1:
        name, ext = os.path.splitext(args.checkpoint_path)
        num = random.randint(1, 1000000)
        args.checkpoint_path = '%s_%06d%s' % (name, num, ext)
    print('Will save checkpoints to %s' % args.checkpoint_path)

    # Loaded here only to fail fast if the vocab file is missing/invalid;
    # train_loop loads its own copy.
    vocab = utils.load_vocab(args.vocab_json)

    ##### Modified #####
    ### For Tensorboard ###
    #log_interval_num = args.log_interval
    #save_interval_num = args.save_interval
    log_dir = os.path.join(args.root_log_dir, args.log_dir)
    # Intentionally errors if the log dir already exists, so a rerun cannot
    # silently mix event files from two runs.
    os.mkdir(log_dir)
    os.mkdir(os.path.join(log_dir, 'runs'))
    writer = SummaryWriter(log_dir=os.path.join(log_dir, 'runs'))
    ##### Modified #####

    if args.use_local_copies == 1:
        # Stage question/feature HDF5 files (incl. auxiliary features) on
        # local disk and repoint args at the copies.
        shutil.copy(args.train_question_h5, '/tmp/train_questions.h5')
        shutil.copy(args.train_features_h5, '/tmp/train_features.h5')
        shutil.copy(args.train_features_h5_aux, '/tmp/train_features_aux.h5')
        shutil.copy(args.val_question_h5, '/tmp/val_questions.h5')
        shutil.copy(args.val_features_h5, '/tmp/val_features.h5')
        shutil.copy(args.val_features_h5_aux, '/tmp/val_features_aux.h5')
        args.train_question_h5 = '/tmp/train_questions.h5'
        args.train_features_h5 = '/tmp/train_features.h5'
        args.train_features_h5_aux = '/tmp/train_features_aux.h5'
        args.val_question_h5 = '/tmp/val_questions.h5'
        args.val_features_h5 = '/tmp/val_features.h5'
        args.val_features_h5_aux = '/tmp/val_features_aux.h5'

    question_families = None
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)

    ##### Modified #####
    train_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'feature_h5_aux': args.train_features_h5_aux,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'shuffle': args.shuffle_train_data == 1,
        'question_families': question_families,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
    }
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'feature_h5_aux': args.val_features_h5_aux,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'question_families': question_families,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
    }

    with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
         ClevrDataLoader(**val_loader_kwargs) as val_loader:
        train_loop(args, train_loader, val_loader, writer)
    ##### Modified #####

    if args.use_local_copies == 1 and args.cleanup_local_copies == 1:
        os.remove('/tmp/train_questions.h5')
        os.remove('/tmp/train_features.h5')
        os.remove('/tmp/train_features_aux.h5')
        os.remove('/tmp/val_questions.h5')
        os.remove('/tmp/val_features.h5')
        os.remove('/tmp/val_features_aux.h5')
def main(args):
    """Training entry point (SLURM/cluster variant): resolves data paths,
    optionally rsyncs the HDF5 files to a node-local cache, builds the
    loaders (recording max program arity/depth on args), and runs train_loop.
    """
    if args.randomize_checkpoint_path == 1:
        name, ext = os.path.splitext(args.checkpoint_path)
        num = random.randint(1, 1000000)
        args.checkpoint_path = '%s_%06d%s' % (name, num, ext)
    print('Will save checkpoints to %s' % args.checkpoint_path)

    # Resolve all data files relative to --data_dir when given.
    if args.data_dir:
        args.train_question_h5 = os.path.join(args.data_dir,
                                              args.train_question_h5)
        args.train_features_h5 = os.path.join(args.data_dir,
                                              args.train_features_h5)
        args.val_question_h5 = os.path.join(args.data_dir,
                                            args.val_question_h5)
        args.val_features_h5 = os.path.join(args.data_dir,
                                            args.val_features_h5)
        args.vocab_json = os.path.join(args.data_dir, args.vocab_json)

    if not args.checkpoint_path:
        # Fall back to the SLURM job id as checkpoint name on a cluster.
        if 'SLURM_JOB_ID' in os.environ:
            args.checkpoint_path = os.environ['SLURM_JOB_ID'] + '.pt'
        else:
            raise NotImplementedError()

    vocab = utils.load_vocab(args.vocab_json)

    if args.use_local_copies == 1:
        # Cache the data on node-local storage (prefer the fast disk).
        if os.path.exists('/Tmpfast'):
            tmp = '/Tmpfast/'
        else:
            tmp = '/Tmp/'
        if not os.path.exists(tmp + 'bahdanau'):
            os.mkdir(tmp + 'bahdanau')
        if not os.path.exists(tmp + 'bahdanau/clevr'):
            os.mkdir(tmp + 'bahdanau/clevr')
        root = tmp + 'bahdanau/clevr/'

        def rsync_copy_if_not_exists(src, dst):
            # Best-effort copy; skipped when a cached copy is already present.
            if not os.path.exists(dst):
                os.system("rsync -vrz --progress {} {}".format(src, dst))

        rsync_copy_if_not_exists(args.train_question_h5,
                                 root + 'train_questions.h5')
        rsync_copy_if_not_exists(args.train_features_h5,
                                 root + 'train_features.h5')
        rsync_copy_if_not_exists(args.val_question_h5,
                                 root + 'val_questions.h5')
        rsync_copy_if_not_exists(args.val_features_h5,
                                 root + 'val_features.h5')
        args.train_question_h5 = root + 'train_questions.h5'
        args.train_features_h5 = root + 'train_features.h5'
        args.val_question_h5 = root + 'val_questions.h5'
        args.val_features_h5 = root + 'val_features.h5'

    question_families = None
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)

    train_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'shuffle': args.shuffle_train_data == 1,
        'question_families': question_families,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
    }
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'question_families': question_families,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
    }

    with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
         ClevrDataLoader(**val_loader_kwargs) as val_loader:
        args.max_program_module_arity = max(train_loader.max_arity,
                                            val_loader.max_arity)
        args.max_program_tree_depth = max(train_loader.max_depth,
                                          val_loader.max_depth)
        train_loop(args, train_loader, val_loader)

    if args.use_local_copies == 1 and args.cleanup_local_copies == 1:
        # BUG FIX: previously removed hard-coded '/tmp/*.h5' paths that this
        # function never creates (the copies live under `root`, and args.*
        # were repointed there above). Remove the files actually copied.
        os.remove(args.train_question_h5)
        os.remove(args.train_features_h5)
        os.remove(args.val_question_h5)
        os.remove(args.val_features_h5)
parser.add_argument('--batch_size', default=64, type=int) parser.add_argument('--learning_rate', default=5e-4, type=float) parser.add_argument('--reward_decay', default=0.9, type=float) parser.add_argument('--film_gen_weight_decay', default=0, type=float) parser.add_argument('--filmed_net_weight_decay', default=0, type=float) args = parser.parse_args() exp_dir = os.path.join(args.exp_dir, args.exp_name) if not os.path.exists(exp_dir): os.mkdir(exp_dir) logger = create_logger(os.path.join(exp_dir, 'log.txt')) logger.info(args) vocab = utils.load_vocab(os.path.join(args.data_dir, 'vocab.json')) film_gen = FiLMGen(encoder_vocab_size=len(vocab['question_token_to_idx']), wordvec_dim=args.rnn_wordvec_dim, hidden_dim=args.rnn_hidden_dim, rnn_num_layers=args.rnn_num_layers, rnn_dropout=0, output_batchnorm=False, bidirectional=False, encoder_type=args.encoder_type, decoder_type=args.decoder_type, gamma_option=args.gamma_option, gamma_baseline=1, num_modules=args.num_modules, module_num_layers=args.module_num_layers, module_dim=args.module_dim,
language=args.sw_language, config=args.sw_config) print('ShapeWorld dataset: {} (variant: {})'.format(dataset, args.sw_variant)) print('Config: ' + str(args.sw_config)) dataset = torch_util.ShapeWorldDataset(dataset=dataset, # include_model=True mode=(None if args.sw_mode == 'none' else args.sw_mode), epoch=(args.num_samples is None)) loader = ShapeWorldDataLoader(dataset=dataset, batch_size=args.batch_size) # num_workers=1 model = None if args.baseline_model is not None: assert args.model_type in ('LSTM', 'CNN+LSTM', 'CNN+LSTM+SA') print('Loading baseline model from', args.baseline_model) model, _ = utils.load_baseline(args.baseline_model) if args.vocab_json is not None: new_vocab = utils.load_vocab(args.vocab_json) model.rnn.expand_vocab(new_vocab['question_token_to_idx']) elif args.program_generator is not None and args.execution_engine is not None: pg, _ = utils.load_program_generator(args.program_generator, args.model_type) ee, _ = utils.load_execution_engine( args.execution_engine, verbose=False, model_type=args.model_type) if args.vocab_json is not None: new_vocab = utils.load_vocab(args.vocab_json) pg.expand_encoder_vocab(new_vocab['question_token_to_idx']) model = (pg, ee) else: print('Must give either --baseline_model or --program_generator and --execution_engine') return if torch.cuda.is_available(): dtype = torch.cuda.FloatTensor
def main(args):
    """Inference entry point: runs a trained model on a single example, in
    interactive mode, on a ShapeWorld dataset (with optional prediction dump
    and HTML visualization), or over CLEVR-style HDF5 inputs.
    """
    if args.debug_every <= 1:
        pdb.set_trace()

    if args.sw_name is not None or args.sw_config is not None:
        # ShapeWorld path: build the dataset/loader here; consumed further down.
        assert args.image is None and args.question is None
        from shapeworld import Dataset, torch_util
        from shapeworld.datasets import clevr_util

        class ShapeWorldDataLoader(torch_util.ShapeWorldDataLoader):
            """Adapts ShapeWorld batches to the (question, image, feats,
            answer, program_seq, program_json) tuples the run code expects."""

            def __iter__(self):
                for batch in super(ShapeWorldDataLoader, self).__iter__():
                    if "caption" in batch:
                        question = batch["caption"].long()
                    else:
                        question = batch["question"].long()
                    if args.sw_features == 1:
                        image = batch["world_features"]
                    else:
                        image = batch["world"]
                    feats = image
                    if "agreement" in batch:
                        answer = batch["agreement"].long()
                    else:
                        answer = batch["answer"].long()
                    if "caption_model" in batch:
                        assert args.sw_name.startswith(
                            "clevr") or args.sw_program == 3
                        program_seq = batch["caption_model"]
                        # .apply_(callable=(lambda model: clevr_util.parse_program(mode=0, model=model)))
                    elif "question_model" in batch:
                        program_seq = batch["question_model"]
                    elif "caption" in batch:
                        if args.sw_program == 1:
                            program_seq = batch["caption_pn"].long()
                        elif args.sw_program == 2:
                            program_seq = batch["caption_rpn"].long()
                        else:
                            program_seq = [None]
                    else:
                        program_seq = [None]
                    # program_seq = torch.IntTensor([0 for _ in batch['question']])
                    program_json = dict()
                    yield question, image, feats, answer, program_seq, program_json

        dataset = Dataset.create(
            dtype=args.sw_type,
            name=args.sw_name,
            variant=args.sw_variant,
            language=args.sw_language,
            config=args.sw_config,
        )
        print("ShapeWorld dataset: {} (variant: {})".format(
            dataset, args.sw_variant))
        print("Config: " + str(args.sw_config))

        # The program vocab is read from the .vocab file saved next to
        # whichever model checkpoint was supplied.
        if args.program_generator is not None:
            with open(args.program_generator + ".vocab", "r") as filehandle:
                vocab = json.load(filehandle)
        elif args.execution_engine is not None:
            with open(args.execution_engine + ".vocab", "r") as filehandle:
                vocab = json.load(filehandle)
        elif args.baseline_model is not None:
            with open(args.baseline_model + ".vocab", "r") as filehandle:
                vocab = json.load(filehandle)
        else:
            # BUG FIX: previously `vocab` was left unbound here and the next
            # line raised a bare NameError; fail with an actionable message.
            raise ValueError(
                'Need one of --program_generator, --execution_engine or '
                '--baseline_model to locate the .vocab file')
        program_token_to_idx = vocab["program_token_to_idx"]

        include_model = args.model_type in ("PG", "EE", "PG+EE") and (
            args.sw_name.startswith("clevr") or args.sw_program == 3)
        if include_model:

            def preprocess(model):
                # Encode a (question/caption) model into a fixed-length program.
                if args.sw_name.startswith("clevr"):
                    program_prefix = vr.programs.list_to_prefix(
                        model["program"])
                else:
                    program_prefix = clevr_util.parse_program(mode=0,
                                                              model=model)
                program_str = vr.programs.list_to_str(program_prefix)
                program_tokens = tokenize(program_str)
                program_encoded = encode(program_tokens, program_token_to_idx)
                # Pad to length 27 — presumably the max program length used at
                # training time; TODO confirm against the trained model.
                program_encoded += [
                    program_token_to_idx["<NULL>"]
                    for _ in range(27 - len(program_encoded))
                ]
                return np.asarray(program_encoded, dtype=np.int64)

            if args.sw_name.startswith("clevr"):
                preprocessing = dict(question_model=preprocess)
            else:
                preprocessing = dict(caption_model=preprocess)
        elif args.sw_program in (1, 2):

            def preprocess(caption_pn):
                # Shift token ids by 2 to make room for special tokens, mark
                # the first padding position as <END>, and prepend <START>.
                caption_pn += (caption_pn > 0) * 2
                for n, symbol in enumerate(caption_pn):
                    if symbol == 0:
                        caption_pn[n] = 2
                        break
                caption_pn = np.concatenate(([1], caption_pn))
                return caption_pn

            if args.sw_program == 1:
                preprocessing = dict(caption_pn=preprocess)
            else:
                preprocessing = dict(caption_rpn=preprocess)
        else:
            preprocessing = None

        dataset = torch_util.ShapeWorldDataset(
            dataset=dataset,
            mode=(None if args.sw_mode == "none" else args.sw_mode),
            include_model=include_model,
            epoch=(args.num_samples is None),
            preprocessing=preprocessing,
        )
        loader = ShapeWorldDataLoader(dataset=dataset,
                                      batch_size=args.batch_size)

    # Load the model(s) selected by the CLI arguments.
    model = None
    if args.model_type in ("CNN", "LSTM", "CNN+LSTM", "CNN+LSTM+SA"):
        assert args.baseline_model is not None
        print("Loading baseline model from", args.baseline_model)
        model, _ = utils.load_baseline(args.baseline_model)
        if args.vocab_json is not None:
            new_vocab = utils.load_vocab(args.vocab_json)
            model.rnn.expand_vocab(new_vocab["question_token_to_idx"])
    elif args.program_generator is not None and args.execution_engine is not None:
        pg, _ = utils.load_program_generator(args.program_generator,
                                             args.model_type)
        ee, _ = utils.load_execution_engine(args.execution_engine,
                                            verbose=False,
                                            model_type=args.model_type)
        if args.vocab_json is not None:
            new_vocab = utils.load_vocab(args.vocab_json)
            pg.expand_encoder_vocab(new_vocab["question_token_to_idx"])
        model = (pg, ee)
    elif args.model_type == "FiLM":
        # FiLM stores both sub-networks in one checkpoint file.
        assert args.baseline_model is not None
        pg, _ = utils.load_program_generator(args.baseline_model,
                                             args.model_type)
        ee, _ = utils.load_execution_engine(args.baseline_model,
                                            verbose=False,
                                            model_type=args.model_type)
        if args.vocab_json is not None:
            new_vocab = utils.load_vocab(args.vocab_json)
            pg.expand_encoder_vocab(new_vocab["question_token_to_idx"])
        model = (pg, ee)
    else:
        print(
            "Must give either --baseline_model or --program_generator and --execution_engine"
        )
        return

    if torch.cuda.is_available():
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    if args.question is not None and args.image is not None:
        run_single_example(args, model, dtype, args.question)
    # Interactive mode
    elif (args.image is not None and args.input_question_h5 is None
          and args.input_features_h5 is None):
        feats_var = extract_image_features(args, dtype)
        print(colored("Ask me something!", "cyan"))
        while True:
            # Get user question
            question_raw = input(">>> ")
            run_single_example(args, model, dtype, question_raw, feats_var)
    elif args.sw_name is not None or args.sw_config is not None:
        predictions, visualization = run_batch(args, model, dtype, loader)
        if args.sw_pred_dir is not None:
            # Dump per-example (correct, agreement, caption) lines.
            assert args.sw_pred_name is not None
            pred_dir = os.path.join(
                args.sw_pred_dir,
                dataset.dataset.type,
                dataset.dataset.name,
                dataset.dataset.variant,
            )
            if not os.path.isdir(pred_dir):
                os.makedirs(pred_dir)
            id2word = dataset.dataset.vocabulary(value_type="language")
            with open(
                    os.path.join(
                        pred_dir,
                        args.sw_pred_name + "-" + args.sw_mode + ".txt"),
                    "w",
            ) as filehandle:
                filehandle.write("".join(
                    "{} {} {}\n".format(correct, agreement, " ".join(
                        id2word[c] for c in caption))
                    for correct, agreement, caption in zip(
                        predictions["correct"],
                        predictions["agreement"],
                        predictions["caption"],
                    )))
            print("Predictions saved")
        if args.sw_vis_dir is not None:
            # Render the worlds to PNGs plus an index HTML page.
            assert args.sw_vis_name is not None
            from io import BytesIO
            from shapeworld.world import World
            vis_dir = os.path.join(
                args.sw_vis_dir,
                dataset.dataset.type,
                dataset.dataset.name,
                dataset.dataset.variant,
            )
            image_dir = os.path.join(vis_dir, args.sw_mode, "images")
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)
            worlds = np.transpose(visualization["world"], (0, 2, 3, 1))
            for n in range(worlds.shape[0]):
                image = World.get_image(world_array=worlds[n])
                image_bytes = BytesIO()
                image.save(image_bytes, format="png")
                with open(os.path.join(image_dir, "world-{}.png".format(n)),
                          "wb") as filehandle:
                    filehandle.write(image_bytes.getvalue())
                image_bytes.close()
            with open(
                    os.path.join(
                        vis_dir,
                        args.sw_vis_name + "-" + args.sw_mode + ".html"),
                    "w",
            ) as filehandle:
                html = dataset.dataset.get_html(
                    generated=visualization,
                    image_format="png",
                    image_dir=(args.sw_mode + "/images/"),
                )
                filehandle.write(html)
            print("Visualization saved")
    else:
        # CLEVR-style HDF5 batch evaluation.
        vocab = load_vocab(args)
        loader_kwargs = {
            "question_h5": args.input_question_h5,
            "feature_h5": args.input_features_h5,
            "vocab": vocab,
            "batch_size": args.batch_size,
        }
        if args.family_split_file is not None:
            with open(args.family_split_file, "r") as f:
                loader_kwargs["question_families"] = json.load(f)
        with ClevrDataLoader(**loader_kwargs) as loader:
            run_batch(args, model, dtype, loader)
def main(args):
    """Training entry point.

    Two data paths, selected by the CLI flags:
      * ShapeWorld mode (``--sw_name`` or ``--sw_config`` given): builds a
        ShapeWorld agreement dataset, derives/extends a vocab, writes it next
        to the checkpoint, wraps the dataset in loaders, and calls
        ``train_loop``.
      * CLEVR/HDF5 mode (otherwise): loads a vocab JSON, optionally stages the
        question/feature HDF5 files to /tmp, builds ``ClevrDataLoader``s, and
        calls ``train_loop``.

    Side effects: may rewrite ``args.checkpoint_path``, ``args.feature_dim``,
    ``args.vocab_json`` and the ``args.*_h5`` paths; writes a ``.vocab`` JSON
    file; may copy to and delete files under ``/tmp``.
    """
    # Optionally suffix the checkpoint path with a random 6-digit tag so
    # concurrent runs don't clobber each other's checkpoints.
    if args.randomize_checkpoint_path == 1:
        name, ext = os.path.splitext(args.checkpoint_path)
        num = random.randint(1, 1000000)
        args.checkpoint_path = '%s_%06d%s' % (name, num, ext)
    print('Will save checkpoints to %s' % args.checkpoint_path)

    if args.sw_name is not None or args.sw_config is not None:
        # --- ShapeWorld mode ---
        # Imported lazily so the CLEVR path works without shapeworld installed.
        from shapeworld import Dataset, torch_util
        from shapeworld.datasets import clevr_util

        class ShapeWorldDataLoader(torch_util.ShapeWorldDataLoader):
            # Adapter: re-shapes ShapeWorld batches into the 6-tuple
            # (question, image, feats, answer, program_seq, program_json)
            # that train_loop's CLEVR-style consumers expect.

            def __init__(self, **kwargs):
                super(ShapeWorldDataLoader, self).__init__(**kwargs)

            def __iter__(self):
                for batch in super(ShapeWorldDataLoader, self).__iter__():
                    question = batch['caption'].long()
                    # The raw world image doubles as the "features" tensor here
                    # (no pretrained feature extractor in this path).
                    image = batch['world']
                    feats = batch['world']
                    answer = batch['agreement'].long()
                    if 'caption_model' in batch:
                        # Convert ShapeWorld caption models to CLEVR-style
                        # program sequences in-place via apply_.
                        program_seq = batch['caption_model'].apply_(callable=(lambda model: clevr_util.parse_program(mode=0, model=model)))
                    else:
                        # No caption model available: emit a zero placeholder
                        # per caption so downstream unpacking still works.
                        program_seq = torch.IntTensor([0 for _ in batch['caption']])
                    program_json = dict()
                    yield question, image, feats, answer, program_seq, program_json

        dataset = Dataset.create(dtype='agreement', name=args.sw_name, variant=args.sw_variant, language=args.sw_language, config=args.sw_config)
        print('ShapeWorld dataset: {} (variant: {})'.format(dataset, args.sw_variant))
        print('Config: ' + str(args.sw_config))

        if args.program_generator_start_from is None:
            # Fresh vocab: shift ShapeWorld word indices up by 2 to reserve
            # slots for <START>/<END>; index 0 stays the padding/<NULL> slot.
            question_token_to_idx = {
                word: index + 2 if index > 0 else 0
                for word, index in dataset.vocabularies['language'].items()
            }
            question_token_to_idx['<NULL>'] = 0
            question_token_to_idx['<START>'] = 1
            question_token_to_idx['<END>'] = 2
            vocab = dict(
                question_token_to_idx=question_token_to_idx,
                # NOTE(review): original author flagged this as "missing!!!" —
                # only the special tokens are present; real program tokens are
                # never added. Confirm whether program supervision is unused
                # in ShapeWorld mode before relying on this vocab.
                program_token_to_idx={'<NULL>': 0, '<START>': 1, '<END>': 2},  # missing!!!
                answer_token_to_idx={'false': 0, 'true': 1}
            )
            with open(args.checkpoint_path + '.vocab', 'w') as filehandle:
                json.dump(vocab, filehandle)
        else:
            # Warm start: reuse the previous run's vocab and append any new
            # ShapeWorld words after the existing indices, then re-save it
            # alongside the new checkpoint.
            with open(args.program_generator_start_from + '.vocab', 'r') as filehandle:
                vocab = json.load(filehandle)
            question_token_to_idx = vocab['question_token_to_idx']
            index = len(question_token_to_idx)
            for word in dataset.vocabularies['language']:
                if word not in question_token_to_idx:
                    question_token_to_idx[word] = index
                    index += 1
            with open(args.checkpoint_path + '.vocab', 'w') as filehandle:
                json.dump(vocab, filehandle)

        # world_shape() is reversed here — presumably (H, W, C) -> (C, W, H)
        # for a channels-first consumer; TODO confirm against train_loop.
        args.feature_dim = ','.join(str(n) for n in reversed(dataset.world_shape()))
        args.vocab_json = args.checkpoint_path + '.vocab'

        train_dataset = torch_util.ShapeWorldDataset(dataset=dataset, mode='train')  # , include_model=True)
        train_loader = ShapeWorldDataLoader(dataset=train_dataset, batch_size=args.batch_size)  # num_workers=1

        if args.sw_mixer == 1:
            # Mixer dataset: validate each constituent sub-dataset separately,
            # so val_loader becomes a list of loaders.
            val_loader = list()
            for d in dataset.datasets:
                # epoch=True only when num_val_samples is unset, i.e. iterate
                # one full epoch instead of a fixed sample count.
                val_dataset = torch_util.ShapeWorldDataset(dataset=d, mode='validation', epoch=(args.num_val_samples is None))
                val_loader.append(ShapeWorldDataLoader(dataset=val_dataset, batch_size=args.batch_size))  # num_workers=1
        else:
            val_dataset = torch_util.ShapeWorldDataset(dataset=dataset, mode='validation', epoch=(args.num_val_samples is None))
            val_loader = ShapeWorldDataLoader(dataset=val_dataset, batch_size=args.batch_size)  # num_workers=1

        train_loop(args, train_loader, val_loader)
    else:
        # --- CLEVR/HDF5 mode ---
        vocab = utils.load_vocab(args.vocab_json)

        # Optionally stage the HDF5 inputs on local disk (/tmp) — presumably
        # to avoid slow network-filesystem reads during training.
        if args.use_local_copies == 1:
            shutil.copy(args.train_question_h5, '/tmp/train_questions.h5')
            shutil.copy(args.train_features_h5, '/tmp/train_features.h5')
            shutil.copy(args.val_question_h5, '/tmp/val_questions.h5')
            shutil.copy(args.val_features_h5, '/tmp/val_features.h5')
            args.train_question_h5 = '/tmp/train_questions.h5'
            args.train_features_h5 = '/tmp/train_features.h5'
            args.val_question_h5 = '/tmp/val_questions.h5'
            args.val_features_h5 = '/tmp/val_features.h5'

        # Optional restriction to a subset of question families.
        question_families = None
        if args.family_split_file is not None:
            with open(args.family_split_file, 'r') as f:
                question_families = json.load(f)

        train_loader_kwargs = {
            'question_h5': args.train_question_h5,
            'feature_h5': args.train_features_h5,
            'vocab': vocab,
            'batch_size': args.batch_size,
            'shuffle': args.shuffle_train_data == 1,
            'question_families': question_families,
            'max_samples': args.num_train_samples,
            'num_workers': args.loader_num_workers,
        }
        val_loader_kwargs = {
            'question_h5': args.val_question_h5,
            'feature_h5': args.val_features_h5,
            'vocab': vocab,
            'batch_size': args.batch_size,
            'question_families': question_families,
            'max_samples': args.num_val_samples,
            'num_workers': args.loader_num_workers,
        }

        # Context managers ensure both HDF5-backed loaders are closed even if
        # train_loop raises.
        with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
             ClevrDataLoader(**val_loader_kwargs) as val_loader:
            train_loop(args, train_loader, val_loader)

        # Remove the /tmp staging copies only when both flags opt in.
        if args.use_local_copies == 1 and args.cleanup_local_copies == 1:
            os.remove('/tmp/train_questions.h5')
            os.remove('/tmp/train_features.h5')
            os.remove('/tmp/val_questions.h5')
            os.remove('/tmp/val_features.h5')