def run():
    # The first parameter of get_system_corpus specifies the system to be benchmarked.
    # This string should match a name in the Docker compose file or a name listed in
    # get_system_corpus.
    #
    # The second parameter specifies the corpus to be used for the benchmark;
    # see tp.Corpus for the possible options.
    system_corpus = get_system_corpus('dialogflow', tp.Corpus.SNIPS2017)
    evaluate(system_corpus)
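# Usage sketch: the same entry point can benchmark a different system/corpus pair.
# 'rasa' and Corpus.ASKUBUNTU are illustrative names here — the valid options are
# whatever the Docker compose file and tp.Corpus actually define.
#
#     system_corpus = get_system_corpus('rasa', tp.Corpus.ASKUBUNTU)
#     evaluate(system_corpus)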
def evaluate_and_write(args, model, tasks, splits_to_write):
    """ Evaluate a model on dev and/or test, then write predictions """
    val_results, val_preds = evaluate.evaluate(model, tasks, args.batch_size,
                                               args.cuda, "val")
    if 'val' in splits_to_write:
        evaluate.write_preds(tasks, val_preds, args.run_dir, 'val',
                             strict_glue_format=args.write_strict_glue_format)
    if 'test' in splits_to_write:
        _, te_preds = evaluate.evaluate(model, tasks, args.batch_size, args.cuda,
                                        "test")
        evaluate.write_preds(tasks, te_preds, args.run_dir, 'test',
                             strict_glue_format=args.write_strict_glue_format)

    run_name = args.get("run_name", os.path.basename(args.run_dir))
    results_tsv = os.path.join(args.exp_dir, "results.tsv")
    log.info("Writing results for split 'val' to %s", results_tsv)
    evaluate.write_results(val_results, results_tsv, run_name=run_name)
def validate(model, valid_loader, valid_df, args, tokenizer, ner_index,
             save_result=False, progress=False, limit=None, decode_mode='greedy'):
    run_root = Path('../experiments/' + args.run_root)
    predictions = predict(model, valid_loader, args, tokenizer, progress=True,
                          limit=limit, decode_mode=decode_mode)
    # valid_df = valid_df.loc[ner_index, :]
    # new_predictions = []
    # for index, item in enumerate(ner_index):
    #     if ner_index[index]:
    #         new_predictions.append(predictions[index])
    # predictions = new_predictions
    valid_label = valid_df['eval_label'].tolist()
    print_label = valid_df['label'].tolist()
    a = valid_df['a'].tolist()
    b = valid_df['b'].tolist()
    current = valid_df['current'].tolist()
    # print(len(predictions), len(valid_label))
    predictions = [' '.join(x) for x in predictions]
    valid_metric = evaluate(predictions, valid_label)
    print(valid_metric)
    print('------------')
    for i, (a, b, current, p, l) in enumerate(
            zip(a, b, current, predictions, print_label)):
        print(a, ' | ', b, ' | ', current, ' | ', p.replace(' ', ''), ' | ', l)
        if i >= args.print_num:
            break
    return valid_metric
def test_default_subword_model(
        checkpoint_path='/content/gdrive/My Drive/NMT/unittests/checkpoints/',
        config_path='/content/gdrive/My Drive/NMT/configs/',
        corpus_path='/content/gdrive/My Drive/NMT/unittests/first_ten_sentences/'):
    hyperparams = import_configs(config_path=config_path, unittesting=True)
    # use subword-level vocab
    hyperparams["vocab_type"] = "subword_joint"
    # hyperparams["learning_rate"] = .01  # increase learning rate
    print(f"vocab_type: {hyperparams['vocab_type']}")
    print(f"tie_weights: {hyperparams['tie_weights']}")
    construct_model_data("train.de", "train.en", hyperparams=hyperparams,
                         corpus_path=corpus_path, checkpoint_path=checkpoint_path,
                         overfit=True)

    # A model of sufficient capacity should be able to bring the loss down to ~zero.
    model, loss = train(total_epochs=100, early_stopping=False,
                        checkpoint_path=checkpoint_path, save=False, write=True)
    assert loss < .01

    model_data = retrieve_model_data(checkpoint_path=checkpoint_path)
    dev_batches = model_data["dev_batches"]  # holds the training data, bc overfit=True
    dev_references = model_data["references"]  # holds the training data, bc overfit=True
    idx_to_trg_word = model_data["idx_to_trg_word"]

    # Greedy search should be able to perfectly predict the training data.
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100

    # Beam search should be able to perfectly predict the training data.
    model.decoder.set_inference_alg("beam_search")
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100
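# A minimal sketch of a BLEU-based evaluate() consistent with the `bleu >= 100`
# assertions above, assuming sacrebleu (the project's actual scorer may differ).
# sacrebleu reports corpus BLEU on a 0-100 scale, so an exact match scores 100.0.
import sacrebleu

def evaluate_bleu_sketch(translations, references):
    # translations: list[str]; references: list[str], one reference per sentence.
    return sacrebleu.corpus_bleu(translations, [references]).score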
def main(project_parameters):
    result = None
    if project_parameters.mode == 'train':
        result = train(project_parameters=project_parameters)
    elif project_parameters.mode == 'evaluate':
        if project_parameters.predefined_dataset is not None:
            print('temporarily does not support predefined dataset.')
        else:
            evaluate(project_parameters=project_parameters)
    elif project_parameters.mode == 'predict':
        if project_parameters.use_gui:
            gui = GUI(project_parameters=project_parameters)
            gui.run()
        else:
            result = Predict(project_parameters=project_parameters)(
                data_path=project_parameters.data_path)
            print(('{},' * project_parameters.num_classes).format(
                *project_parameters.classes)[:-1])
            print(result)
    elif project_parameters.mode == 'tune':
        result = tune(project_parameters=project_parameters)
    return result
def analyse_fans(header, the_url, her_info, db):
    """
    Fetch the fans on the first page of a blogger's fan list and analyse
    whether any of them is the person I am looking for.
    :param header: browser headers, including the Cookie
    :param the_url: URL of the blogger's fan list
    :param her_info: her information
    :param db: database manager
    :return: None
    """
    # Fetch the HTML of the blogger's fan page.
    html_str = util.get_html(header=header, the_url=the_url)

    # ----------------------------- Step 1 -------------------------------------------
    # Get the fan list.
    fan_list = fans.get_fans_list(html_str)
    # print("Found %s fans" % len(fan_list))
    # Currently only the first fan is taken.
    first_fan = [fan_list[0], ]
    print(('Found fan: %s' % first_fan[0].__str__()).encode('gbk', 'ignore').decode('gbk'))
    for fan in first_fan:
        # ------------------------ Step 2 -------------------------------------------
        # Estimate how likely this person is the one I am looking for.
        chance = analyse.evaluate(fan.__dict__, her_info)
        # print(fan)
        if chance > 0:
            # -------------------- Step 3.2 -----------------------------------------
            # Search keywords; the more detailed and precise, the better.
            key_words = her_info['key_words']
            # print(key_words)
            # Fetch more information about this fan.
            print('Analysing fan "%s"...' % fan.name)
            match_school, count = search_more_info_of_fan(
                header, fan.url, key_words, '成都医学院')
            # print(match_school, count)
            if count == -1 and not match_school:
                # Returned because searches were too frequent; fall back to
                # searching all of the user's posts instead.
                print('Searching all of the user\'s posts...')
                # -------------------- Step 4.1 ---------------------------------------
                match_school, count = find_more_info_in_fan_assays(
                    header, fan, key_words, '成都医学院')
            if match_school or count > 0:
                # -------------------- Step 5.1 ----------------------------------------
                print('Found a matching fan', fan)
                db.add_a_fan(fan, match_school, count)
                mail.send_email('Found a matching fan', fan.__str__())
            else:
                # -------------------- Step 5.2 ----------------------------------------
                print('Analysis complete; this fan is not the person I am looking for')
def evaluate_model(self):
    ''' Evaluate the current model's performance. '''
    try:
        x, y = np.array(self.keys_use), np.array(self.values_use)
        x_train = np.array(x[:-self.span])
        y_train = y[:-self.span]
        x_test = np.array(x[-self.span:])
        y_test = y[-self.span:]
        self.model.fit(x_train, y_train)
        y_pred = self.model.predict(x_test)
        evaluation = evaluate(y_test, y_pred)
        self.model_details = {
            "r2": evaluation["r2"],
            "msle": evaluation["msle"]
        }
    except Exception as ex:
        logger.error(ex)
def main(cl_arguments):
    ''' Train or load a model. Evaluate on some tasks. '''
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)

    # Logistics #
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)  # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)  # e.g. <project_dir>/jiant-demo/sst
    log.getLogger().addHandler(log.FileHandler(args.local_log_path))
    if cl_args.remote_log:
        gcp.configure_remote_logging(args.remote_log_name)
    if cl_args.notify:
        from src import emails
        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)
    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")
    _try_logging_git_info()

    log.info("Parsed args: \n%s", args)
    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected"
                                       " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning("GPU access failed. You might be using a CPU-only "
                        "installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # Prepare data #
    log.info("Loading tasks...")
    start_time = time.time()
    train_tasks, eval_tasks, vocab, word_embs = build_tasks(args)
    if any(t.val_metric_decreases for t in train_tasks) and any(
            not t.val_metric_decreases for t in train_tasks):
        log.warn("\tMixing training tasks with increasing and decreasing val metrics!")
    tasks = sorted(set(train_tasks + eval_tasks), key=lambda x: x.name)
    log.info('\tFinished loading tasks in %.3fs', time.time() - start_time)
    log.info('\t Tasks: {}'.format([task.name for task in tasks]))

    # Build or load model #
    log.info('Building model...')
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info('\tFinished building model in %.3fs', time.time() - start_time)

    # Check that necessary parameters are set for each step. Exit with error if not.
    steps_log = []
    if not args.load_eval_checkpoint == 'none':
        assert_for_log(os.path.exists(args.load_eval_checkpoint),
                       "Error: Attempting to load model from non-existent path: [%s]" %
                       args.load_eval_checkpoint)
        assert_for_log(
            not args.do_train,
            "Error: Attempting to train a model and then replace that model with "
            "one from a checkpoint.")
        steps_log.append("Loading model from path: %s" % args.load_eval_checkpoint)

    if args.do_train:
        assert_for_log(args.train_tasks != "none",
                       "Error: Must specify at least one training task: [%s]" %
                       args.train_tasks)
        assert_for_log(
            args.val_interval % args.bpp_base == 0,
            "Error: val_interval [%d] must be divisible by bpp_base [%d]" %
            (args.val_interval, args.bpp_base))
        steps_log.append("Training model on tasks: %s" % args.train_tasks)

    if args.train_for_eval:
        steps_log.append("Re-training model for individual eval tasks")
        assert_for_log(
            args.eval_val_interval % args.bpp_base == 0,
            "Error: eval_val_interval [%d] must be divisible by bpp_base [%d]" %
            (args.eval_val_interval, args.bpp_base))
        assert_for_log(len(set(train_tasks).intersection(eval_tasks)) == 0 or
                       args.allow_reuse_of_pretraining_parameters or
                       args.do_train == 0,
                       "If you're pretraining on a task you plan to reuse as a "
                       "target task, set\n"
                       "allow_reuse_of_pretraining_parameters = 1 (risky), or train "
                       "in two steps:\n"
                       " train with do_train = 1, train_for_eval = 0, stop, and "
                       "restart with\n"
                       " do_train = 0 and train_for_eval = 1.")
    if args.do_eval:
        assert_for_log(args.eval_tasks != "none",
                       "Error: Must specify at least one eval task: [%s]" %
                       args.eval_tasks)
        steps_log.append("Evaluating model on tasks: %s" % args.eval_tasks)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    log.info("Will run the following steps:\n%s", '\n'.join(steps_log))
    if args.do_train:
        # Train on train tasks #
        log.info("Training...")
        params = build_trainer_params(args, task_names=[])
        stop_metric = train_tasks[0].val_metric if len(train_tasks) == 1 else 'macro_avg'
        should_decrease = train_tasks[0].val_metric_decreases if len(train_tasks) == 1 else False
        trainer, _, opt_params, schd_params = build_trainer(params, model,
                                                            args.run_dir,
                                                            should_decrease)
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        best_epochs = trainer.train(train_tasks, stop_metric, args.batch_size,
                                    args.bpp_base, args.weighting_method,
                                    args.scaling_method, to_train, opt_params,
                                    schd_params, args.shared_optimizer,
                                    args.load_model, phase="main")

    # Select model checkpoint from main training run to load
    if not args.train_for_eval:
        log.info("In strict mode because train_for_eval is off. "
                 "Will crash if any tasks are missing from the checkpoint.")
        strict = True
    else:
        strict = False

    if args.train_for_eval and not args.allow_reuse_of_pretraining_parameters:
        # If we're training models for evaluation, which is always done from scratch
        # with a fresh optimizer, we shouldn't load parameters for those models.
        # Usually, there won't be trained parameters to skip, but this can happen
        # if a run is killed during the train_for_eval phase.
        task_names_to_avoid_loading = [task.name for task in eval_tasks]
    else:
        task_names_to_avoid_loading = []

    if not args.load_eval_checkpoint == "none":
        log.info("Loading existing model from %s...", args.load_eval_checkpoint)
        load_model_state(model, args.load_eval_checkpoint, args.cuda,
                         task_names_to_avoid_loading, strict=strict)
    else:
        # Look for eval checkpoints (available only if we're restoring from a run
        # that already finished), then look for training checkpoints.
        eval_best = glob.glob(os.path.join(args.run_dir, "model_state_eval_best.th"))
        if len(eval_best) > 0:
            load_model_state(model, eval_best[0], args.cuda,
                             task_names_to_avoid_loading, strict=strict)
        else:
            macro_best = glob.glob(os.path.join(
                args.run_dir, "model_state_main_epoch_*.best_macro.th"))
            if len(macro_best) > 0:
                assert_for_log(len(macro_best) == 1,
                               "Too many best checkpoints. Something is wrong.")
                load_model_state(model, macro_best[0], args.cuda,
                                 task_names_to_avoid_loading, strict=strict)
            else:
                assert_for_log(args.allow_untrained_encoder_parameters,
                               "No best checkpoint found to evaluate.")
                log.warning("Evaluating untrained encoder parameters!")

    # Train just the task-specific components for eval tasks.
    if args.train_for_eval:
        # might be empty if no elmo. scalar_mix_0 should always be pretrain scalars
        elmo_scalars = [(n, p) for n, p in model.named_parameters()
                        if "scalar_mix" in n and "scalar_mix_0" not in n]
        # fails when sep_embs_for_skip is 0 and elmo_scalars has nonzero length
        assert_for_log(not elmo_scalars or args.sep_embs_for_skip,
                       "Error: ELMo scalars loaded and will be updated in "
                       "train_for_eval but they should not be updated! Check "
                       "sep_embs_for_skip flag or make an issue.")
        for task in eval_tasks:
            # Skip mnli-diagnostic.
            # This has to be handled differently from probing tasks because probing
            # tasks require the "is_probing_task" flag to be set to True. For
            # mnli-diagnostic this flag will be False because it is part of GLUE,
            # and "is_probing_task" is a global flag specific to a run, not to a task.
            if task.name == 'mnli-diagnostic':
                continue
            pred_module = getattr(model, "%s_mdl" % task.name)
            to_train = elmo_scalars + [(n, p) for n, p in
                                       pred_module.named_parameters()
                                       if p.requires_grad]
            # Look for <task_name>_<param_name>, then eval_<param_name>
            params = build_trainer_params(args, task_names=[task.name, 'eval'])
            trainer, _, opt_params, schd_params = build_trainer(
                params, model, args.run_dir, task.val_metric_decreases)
            best_epoch = trainer.train([task], task.val_metric, args.batch_size, 1,
                                       args.weighting_method, args.scaling_method,
                                       to_train, opt_params, schd_params,
                                       args.shared_optimizer, load_model=False,
                                       phase="eval")

            # Now that we've trained a model, revert to the normal checkpoint
            # logic for this task.
            task_names_to_avoid_loading.remove(task.name)

            # The best checkpoint will accumulate the best parameters for each task.
            # This logic looks strange. We think it works.
            best_epoch = best_epoch[task.name]
            layer_path = os.path.join(args.run_dir, "model_state_eval_best.th")
            load_model_state(model, layer_path, args.cuda,
                             skip_task_models=task_names_to_avoid_loading,
                             strict=strict)

    if args.do_eval:
        # Evaluate #
        log.info("Evaluating...")
        val_results, val_preds = evaluate.evaluate(model, eval_tasks,
                                                   args.batch_size, args.cuda,
                                                   "val")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        if 'val' in splits_to_write:
            evaluate.write_preds(eval_tasks, val_preds, args.run_dir, 'val',
                                 strict_glue_format=args.write_strict_glue_format)
        if 'test' in splits_to_write:
            _, te_preds = evaluate.evaluate(model, eval_tasks, args.batch_size,
                                            args.cuda, "test")
            evaluate.write_preds(eval_tasks, te_preds, args.run_dir, 'test',
                                 strict_glue_format=args.write_strict_glue_format)
        run_name = args.get("run_name", os.path.basename(args.run_dir))

        results_tsv = os.path.join(args.exp_dir, "results.tsv")
        log.info("Writing results for split 'val' to %s", results_tsv)
        evaluate.write_results(val_results, results_tsv, run_name=run_name)

    log.info("Done!")
def main():
    args = parse_arguments()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    worker_init = WorkerInitObj(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True

    device, args = setup_training(args)
    model, optimizer, criterion = prepare_model_and_optimizer(args, device)
    pool = ProcessPoolExecutor(1)
    train_iter = subsetDataloader(path=args.train_path, batch_size=args.batch_size,
                                  worker_init=worker_init)
    test_iter = subsetDataloader(path=args.val_path, batch_size=args.batch_size,
                                 worker_init=worker_init)

    print('-' * 50 + 'args' + '-' * 50)
    for k in list(vars(args).keys()):
        print('{0}: {1}'.format(k, vars(args)[k]))
    print('-' * 30)
    print(model)
    print('-' * 50 + 'args' + '-' * 50)

    global_step = 0
    global_auc = 0
    s_time_train = time.time()
    for epoch in range(args.epoch):
        dataset_future = pool.submit(subsetDataloader, args.train_path,
                                     args.batch_size, worker_init)
        for step, batch in enumerate(train_iter):
            model.train()
            labels = batch['label'].to(device).float()
            batch = {
                t: {k: v.to(device) for k, v in d.items()}
                for t, d in batch.items() if isinstance(d, dict)
            }
            optimizer.zero_grad()
            logits = model(batch)
            # print('logits', logits)
            # print('label', labels)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # evaluate
            if global_step != 0 and global_step % args.eval_freq == 0:
                s_time_eval = time.time()
                model.eval()
                auc = evaluate(model, test_iter, device)
                e_time_eval = time.time()
                print('-' * 68)
                print('Epoch:[{0}] Step:[{1}] AUC:[{2}] time:[{3}s]'.format(
                    epoch, global_step, format(auc, '.4f'),
                    format(e_time_eval - s_time_eval, '.4f')))
                if auc > global_auc:
                    model_to_save = model.module if hasattr(model,
                                                            'module') else model
                    output_save_file = os.path.join(
                        args.output_dir,
                        "{}_auc_{}_step_{}_ckpt.pt".format(
                            args.model_name, format(auc, '.4f'), global_step))
                    if os.path.exists(output_save_file):
                        os.system('rm -rf {}'.format(output_save_file))
                    torch.save(
                        {
                            'model': model_to_save.state_dict(),
                            'name': args.model_name
                        }, output_save_file)
                    print('Epoch:[{0}] Step:[{1}] SavePath:[{2}]'.format(
                        epoch, global_step, output_save_file))
                    global_auc = auc
                print('-' * 68)

            # log
            if global_step != 0 and global_step % args.log_freq == 0:
                e_time_train = time.time()
                print('Epoch:[{0}] Step:[{1}] Loss:[{2}] Lr:[{3}] time:[{4}s]'.format(
                    epoch, global_step, format(loss.item(), '.4f'),
                    format(optimizer.param_groups[0]['lr'], '.6'),
                    format(e_time_train - s_time_train, '.4f')))
                s_time_train = time.time()
            global_step += 1
        del train_iter
        train_iter = dataset_future.result(timeout=None)
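# A small stdlib-only helper sketching the "keep only the best checkpoint" step
# above without shelling out to `rm -rf` (save_best_checkpoint is a hypothetical
# name; os.remove suffices because the target is a single file).
import os
import torch

def save_best_checkpoint(state_dict, model_name, path):
    if os.path.exists(path):
        os.remove(path)
    torch.save({'model': state_dict, 'name': model_name}, path)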
def test_evaluate_fail():
    with pytest.raises(Exception):
        # invalid input shape
        y_test = np.array([1, 2, 3])
        y_pred = np.array([1, 2])
        result = evaluate(y_test, y_pred)
def test_evaluate_pass():
    y_test = np.array([1, 2, 3])
    y_pred = np.array([1, 2, 2.2])
    expected = {"r2": 0.6800000000000002, "msle": 0.01659768149770578}
    result = evaluate(y_test, y_pred)
    assert result == expected
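# A minimal sketch of an evaluate() consistent with the two tests above, assuming
# scikit-learn's metrics (the project's real implementation may differ). Both
# metrics raise ValueError on mismatched input lengths, which is the failure mode
# test_evaluate_fail exercises.
from sklearn.metrics import mean_squared_log_error, r2_score

def evaluate_sketch(y_test, y_pred):
    return {"r2": r2_score(y_test, y_pred),
            "msle": mean_squared_log_error(y_test, y_pred)}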
def main():
    arg = args()
    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)
    assert arg.exp_name.split('/')[0] == 'o', \
        "'o' is the directory of experiment, --exp_name o/..."
    output_dir = arg.exp_name
    if arg.local_rank == 0:
        save_scripts_in_exp_dir(output_dir)
    logger = logging_set(output_dir, arg.local_rank)
    logger.info(arg)
    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)
    config = edict(yaml.load(open(arg.cfg, 'r')))
    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search
    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize
    config.num_workers = arg.num_workers
    print(
        'GPU memory : \ntotal | used\n',
        os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        ).read())
    logger.info(
        '------------------------------ configuration ---------------------------')
    logger.info(
        '\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------')

    best = 0
    criterion = MSELoss()
    Arch = bulid_up_network(config, criterion)
    if config.train.arch_search_strategy == 'random':
        logger.info("==> random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()
    if arg.param_flop:
        Arch._print_info()

    if len(arg.gpu) > 1:
        use_multi_gpu = True
        if arg.distributed:
            torch.distributed.init_process_group(backend="nccl")
            # torch.distributed.init_process_group(backend="nccl", init_method='env://')
            local_rank = torch.distributed.get_rank()
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            Arch.to(device)
            Arch = torch.nn.parallel.DistributedDataParallel(
                Arch,
                device_ids=[local_rank],
                output_device=local_rank,
                find_unused_parameters=True)
            logger.info("local rank = {}".format(local_rank))
        else:
            Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    Search = Search_Arch(Arch.module, config) if use_multi_gpu else Search_Arch(
        Arch, config)  # Arch.module for nn.DataParallel
    search_strategy = config.train.arch_search_strategy
    if not arg.distributed:
        train_queue, arch_queue, valid_queue = Dataloaders(
            search_strategy, config, arg)
    else:
        train_queue, \
        arch_queue, \
        valid_queue, \
        train_sampler_dist, = Dataloaders(search_strategy, config, arg)
    # Note: if the search strategy is `None` or `sync`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient', 'sync'
    ]
    if search_strategy == 'sync':
        # The arch_parameters are also registered among the model's parameters,
        # so the weight optimizer will update the arch_parameters as well.
        logger.info(
            "sync: the arch_parameters are also optimized by the weight optimizer "
            "synchronously")
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # If the search strategy is None, random, second_order_gradient, etc.,
        # the arch_parameters are filtered out of the weight optimizer.
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
    #                                             step_size=config.train.lr_step_size,
    #                                             gamma=config.train.lr_decay_gamma)
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best_result
    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+==")
    begin, end = config.train.epoch_begin, config.train.epoch_end
    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler, output_dir,
                                    logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir, logger)
    for epoch in range(begin, end):
        lr = scheduler.get_lr()[0]
        logger.info(
            '==> time:({})--training...... current learning rate is {:.7f}'.format(
                datetime.datetime.now(), lr))
        if arg.distributed:
            train_sampler_dist.set_epoch(epoch)
            # valid_sampler_dist.set_epoch(epoch)
        train(
            epoch, train_queue, arch_queue, Arch, Search, criterion, optimizer,
            lr, search_strategy, output_dir, logger, config, arg,
        )
        scheduler.step()
        if not arg.distributed or (arg.distributed and arg.local_rank == 0):
            eval_results = evaluate(Arch, valid_queue, config, output_dir)
            if use_multi_gpu:
                best = save_model(epoch, best, eval_results, Arch.module,
                                  optimizer, scheduler, output_dir, logger)
            else:
                best = save_model(epoch, best, eval_results, Arch, optimizer,
                                  scheduler, output_dir, logger)
def parseArgs():
    """
    Parses received arguments using argparse.
    :return:
    """
    parser = argparse.ArgumentParser('Test and evaluate Ring Confidential Transactions')
    parser.add_argument('-rs', '--ringsizes', required=True, nargs='*', type=int,
                        help="Define the size of the ring.")
    parser.add_argument('-c', '--curves', required=False, nargs='*',
                        help="Elliptic curve to employ.")
    parser.add_argument('-m', '--message', required=False,
                        help="Message to sign.")
    parser.add_argument('-o', '--output', required=False,
                        help="Destination file to save the output graphics.")
    return parser.parse_args()


"""
Reads and parses the arguments. Calls the evaluation function.
"""
if __name__ == '__main__':
    args = parseArgs()
    curves = ['secp192r1']
    message = 'I voted for Kodos'
    output = 'comparative'
    if args.curves is None:
        args.curves = curves
    if args.message is None:
        args.message = message
    if args.output is None:
        args.output = output
    evaluate(args)
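# Example invocation (hypothetical script name; the flags are those defined in
# parseArgs above, and only --ringsizes is required):
#
#     python benchmark_ringct.py -rs 2 4 8 16 -c secp192r1 -m "I voted for Kodos" -o comparative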
def train(train_loop_func, args, logger):
    # Setup multi-GPU if necessary
    # args.distributed = False
    # if 'WORLD_SIZE' in os.environ:
    #     args.distributed = int(os.environ['WORLD_SIZE']) > 1
    #     args.distributed = True
    # if args.distributed:
    #     torch.cuda.set_device(args.local_rank)
    #     torch.distributed.init_process_group(backend='nccl')
    #     args.N_gpu = torch.distributed.get_world_size()
    # else:
    #     args.N_gpu = 1
    if args.seed is None:
        args.seed = np.random.randint(10000)
    # if args.distributed:
    #     args.seed = (args.seed + torch.distributed.get_rank()) % 2 ** 32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    model = EfficientNet.from_pretrained('efficientnet-b4', num_classes=4)
    if args.local_rank is not None:
        torch.distributed.init_process_group(backend="nccl")
        torch.cuda.set_device(args.local_rank)
    model = model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.002},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=args.learning_rate)
    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    if args.local_rank is not None:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Setup data, defaults
    train, test = construct_dataset(args.data)
    random.shuffle(train)
    train = train[:len(train)]
    split_position = int(len(train) * 0.96)
    train_dataset = ALASKA2Dataset(train[:split_position], root_dir=args.data,
                                   augmented=True)
    val_dataset = ALASKA2Dataset(train[split_position:], root_dir=args.data,
                                 augmented=False)
    test_dataset = ALASKA2Dataset(test, root_dir=args.data, augmented=False)
    if args.local_rank is not None:
        train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True)
        train_sampler.set_epoch(0)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  drop_last=False, num_workers=4, shuffle=False,
                                  sampler=train_sampler)
    val_dataloader = DataLoader(val_dataset, batch_size=args.eval_batch_size,
                                drop_last=False, num_workers=4, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=args.eval_batch_size,
                                 drop_last=False, num_workers=4, shuffle=False)
    mean, std = generate_mean_std(amp=args.amp)
    # args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.5, patience=2, verbose=False, threshold_mode='abs')

    start_epoch = 1
    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            model, optimizer, scheduler, start_epoch = load_checkpoint(
                args.checkpoint, model, optimizer, scheduler)
            start_epoch += 1  # the epoch saved is the previous (completed) epoch
        else:
            print('Provided checkpoint is not path to a file')
            return

    # loss_function = nn.CrossEntropyLoss()
    loss_function = LabelSmoothing()
    if args.mode == 'evaluation':
        acc = evaluate(model, val_dataloader, args, mean, std, loss_function)
        print('Model precision {} mAP'.format(acc))
        return
    elif args.mode == 'testing':
        test_(model, test_dataloader, args, mean, std)
        return

    for epoch in range(start_epoch, args.epochs + 1):
        print("-----------------------")
        print("Local Rank: {}, Epoch: {}, Training ...".format(args.local_rank,
                                                               epoch))
        print("Epoch {} of {}".format(epoch, args.epochs))
        print("Total number of parameters trained this epoch: ",
              sum(p.numel() for pg in optimizer.param_groups
                  for p in pg['params'] if p.requires_grad))
        avg_loss = train_loop_func(model, loss_function, optimizer,
                                   train_dataloader, None, args, mean, std)
        # logger.update_epoch_time(epoch, end_epoch_time)
        print("saving model...")
        obj = {'epoch': epoch,
               'model': model.module.state_dict(),  # model.state_dict() for non-DataParallel models
               'optimizer': optimizer.state_dict(),
               'scheduler': scheduler.state_dict()}
        if args.local_rank in [0, None]:
            torch.save(obj, f'./saved/{args.backbone}_epoch_{epoch}.pt')
        print("Starting evaluation")
        val_loss = evaluate(model, val_dataloader, args, mean, std, loss_function)
        test_(model, test_dataloader, args, mean, std, epoch)
        scheduler.step(val_loss)
def main(config):
    CASE_NUM = config['case_num']
    DATASET = config['dataset']
    NORMALIZATION = config['normalization']
    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']
    TB_STATE = config['use_tensorboard']
    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    GAMMA = config['gamma']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = make_dir(RESULT_ROOT_DIR, str(CASE_NUM), overwrite=args.overwrite)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    # =============================================== Select data and construct
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname
    data_train = NLUDataset(data_path, mode='train', normalization=NORMALIZATION,
                            random_seed=42)
    dataloader_train = DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=True,
                                  num_workers=4)
    data_valid = NLUDataset(data_path, mode='valid', normalization=NORMALIZATION,
                            random_seed=42)
    dataloader_valid = DataLoader(data_valid, batch_size=BATCH_SIZE, shuffle=True,
                                  num_workers=4)
    data_test = NLUDataset(data_path, mode='test', normalization=NORMALIZATION,
                           random_seed=42)
    dataloader_test = DataLoader(data_test, batch_size=BATCH_SIZE, shuffle=True,
                                 num_workers=4)
    num_train_samples = data_train.__len__()
    classes = data_train.labels
    num_classes = len(classes)

    # =============================================== Initialize model and optimizer
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda':
        print('Using GPU, %s' % torch.cuda.get_device_name(0))
    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA,
                       BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)
    optimizer = select_optimizer(OPTIM_TYPE, net.parameters(), LR, L2_DECAY)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=LR_STEP,
                                          gamma=LR_DECAY)

    # =============================================== Train
    it = 0
    train_losses, valid_losses, valid_accs = {}, {}, {}
    best_validation_acc = 0
    log_term = 5
    for epoch in range(MAX_EPOCH):
        # ------------------------------------------------ One epoch start
        one_epoch_start = time.time()
        print('Epoch {} / Learning Rate: {:.0e}'.format(epoch,
                                                        scheduler.get_lr()[0]))
        # ------------------------------------------------ Train
        train_losses, it, net, optimizer, scheduler \
            = train_1epoch(dataloader_train, device, train_losses, it, net,
                           loss_fn, optimizer, scheduler, log_every=log_term)
        # ------------------------------------------------ Validation
        valid_acc, valid_loss = evaluate(dataloader_valid, device, net, loss_fn)
        valid_losses[it] = valid_loss
        valid_accs[it] = valid_acc
        # ------------------------------------------------ Save model
        saved = ''
        if valid_acc > best_validation_acc:
            best_validation_acc = valid_acc
            saved = save_ckpt(ckpt_path, net, best_validation_acc)
        print('Epoch {} / Valid loss: {:.4f}, Valid acc: {:.4f} {}'.format(
            epoch, valid_loss, valid_acc, saved))
        # ------------------------------------------------ One epoch end
        curr_time = time.time()
        print("One epoch time = %.2f s" % (curr_time - one_epoch_start))
        print('#------------------------------------------------------#')
    save_train_log(result_dir, train_losses, valid_losses, valid_accs,
                   best_validation_acc)

    # =============================================== Test
    net, best_validation_acc = load_ckpt(ckpt_path, net)
    test_acc, test_loss = evaluate(dataloader_test, device, net, loss_fn)
    return test_acc
def main(config, run_preprocessing, run_data_upload, log_dir):
    # Load experiment configuration
    with open(config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # Get db connection
    conn = sql_utils.get_connection()

    # Get basic info of experiment
    exp_version = config['version']
    exp_name = config["experiment_name"]
    exp_time = date_utils.get_current_time_string()[2:]
    username = getpass.getuser()[0]
    terminal_width = int(os.popen('stty size', 'r').read().split()[1])
    print(
        f'Running Experiment: {username}_{exp_version}_{exp_name}_{exp_time}\n'
        f'{"-" * terminal_width}\n')

    # Preprocessing
    preprocessing_prefix = config['preprocessing_config']['prefix']
    if not run_preprocessing:
        print('Preprocessing skipped.')
    else:
        print('Preprocessing ...')
        run_preprocess(conn, config['preprocessing_config'],
                       run_data_upload=run_data_upload)
        print('Preprocessing done.')

    # Get temporal configuration information
    train_dates_list, test_dates_list = parse_temporal_config(
        config['temporal_config'])

    # Training and evaluation
    test_results_over_time = []
    experiment_loop = tqdm.tqdm(list(zip(train_dates_list, test_dates_list)),
                                desc='Experiment Repeats')
    for train_dates, test_dates in experiment_loop:
        split_time_abbr = date_utils.date_to_string(test_dates['label_start_time'])
        split_time_abbr = split_time_abbr.replace('-', '')[2:]
        split_name = f'{split_time_abbr}'
        print(split_name)
        prefix = f'{username}_{exp_version}_{exp_name}_{exp_time}_{split_name}'
        experiment_table_prefix = f'experiments.{prefix}'
        train_save_dir = os.path.join(os.getcwd(), log_dir, prefix,
                                      'train_' + exp_time)
        test_save_dir = os.path.join(os.getcwd(), log_dir, prefix,
                                     'test_' + exp_time)

        # Prepare cohort as specified by our experiment configuration
        tqdm.tqdm.write('\nPreparing cohorts ...')
        train_feature_splits, train_label_splits = [], []
        for i, train_dates_aod in enumerate(train_dates):
            train_feature_table, train_label_table = prepare_cohort(
                config, train_dates_aod, test_dates, preprocessing_prefix,
                experiment_table_prefix + f'_split{i}', include_test=False)[:2]
            train_feature_splits.append(train_feature_table)
            train_label_splits.append(train_label_table)
        test_feature_table, test_label_table = prepare_cohort(
            config, train_dates[-1], test_dates, preprocessing_prefix,
            experiment_table_prefix, include_train=False)[2:]
        train_feature_table = f'{experiment_table_prefix}_train_features'
        sql_utils.merge_tables(train_feature_splits, train_feature_table)
        train_label_table = f'{experiment_table_prefix}_train_labels'
        sql_utils.merge_tables(train_label_splits, train_label_table)

        # Delete intermediate cohort tables
        for i in range(len(train_dates)):
            cohort_table_name = f'{experiment_table_prefix}_split{i}_cohort'
            sql_utils.run_sql_from_string(conn, f'drop table {cohort_table_name};')

        # Train models as specified by our experiment configuration
        tqdm.tqdm.write('Training ...')
        model_summaries = train(config, train_feature_table, train_label_table,
                                discard_columns=['split'],
                                save_dir=train_save_dir)

        # Evaluate our models on the training data
        model_paths = glob.glob(f'{train_save_dir}/*.pkl')
        tqdm.tqdm.write('Evaluating on training data ...')
        train_results = evaluate(config, train_feature_table, train_label_table,
                                 model_paths, model_summaries,
                                 discard_columns=['split'],
                                 log_dir=train_save_dir)

        # Evaluate our models on the test data
        tqdm.tqdm.write('Evaluating on test data ...')
        test_results = evaluate(config, test_feature_table, test_label_table,
                                model_paths, model_summaries,
                                save_preds_to_db=True,
                                save_prefix=f'{prefix}_test',
                                log_dir=test_save_dir)
        test_results_over_time.append(test_results)

        # Save results to database
        train_results_name = f'{prefix}_train_results'
        test_results_name = f'{prefix}_test_results'
        train_results.to_sql(train_results_name, conn, schema='results')
        test_results.to_sql(test_results_name, conn, schema='results')

    # Plot test results over time
    test_results_tables_prefix = f'{username}_{exp_version}_{exp_name}_{exp_time}'
    plot_utils.plot_results_over_time(test_results_tables_prefix)
def evaluate_model(args):
    from src.evaluate import main as evaluate
    return evaluate(args.dataset_path, args.checkpoint_path, args.force)
def main(config):
    CASE_NUM = config['case_num']
    DATASET = config['dataset']
    NORMALIZATION = config['normalization']
    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']
    TB_STATE = config['use_tensorboard']
    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    GAMMA = config['gamma']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = RESULT_ROOT_DIR + '/' + str(CASE_NUM)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    #%% Select data and construct the test loader
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname
    data_test = NLUDataset(data_path, mode='test', random_seed=42)
    dataloader_test = DataLoader(data_test, batch_size=BATCH_SIZE, shuffle=True,
                                 num_workers=4)
    classes = data_test.labels
    num_classes = len(classes)

    #%% Initialize the model
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda':
        print('Using GPU, %s' % torch.cuda.get_device_name(0))
    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA,
                       BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)

    #%% Evaluate on the test set
    net, best_validation_acc = load_ckpt(ckpt_path, net)
    start_time = time.time()
    test_acc, test_loss = evaluate(dataloader_test, device, net, loss_fn)
    curr_time = time.time()
    ttt = curr_time - start_time
    tt1 = ttt / data_test.__len__()
    print('########################################################')
    print('# Test accuracy of %d: %.4f' % (CASE_NUM, test_acc))
    print("# Average %.6f s to process one input" % (tt1))
    print('########################################################')
def train(args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    ssd300 = model(args)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)
    loss_func.cuda()
    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep,
                            gamma=0.1)
    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    val_dataloader, inv_map = get_val_dataloader(args)
    train_loader = get_train_loader(args, dboxes)

    acc = 0
    logger = Logger(args.batch_size, args.local_rank)
    for epoch in range(0, args.epochs):
        logger.start_epoch()
        scheduler.step()
        iteration = train_loop(ssd300, loss_func, epoch, optimizer, train_loader,
                               iteration, logger, args)
        logger.end_epoch()
        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                print('Epoch {:2d}, Accuracy: {:4f} mAP'.format(epoch, acc))
        if args.data_pipeline == 'dali':
            train_loader.reset()
    return acc, logger.average_speed()
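# A sketch of the "tencent trick" parameter grouping used by the SSD training
# functions above, following the common convention from NVIDIA's SSD example
# (the repo's actual implementation may differ): 1-D parameters, i.e. biases and
# BatchNorm scale/shift, get no weight decay; everything else keeps the default.
def tencent_trick_sketch(model):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # 1-D tensors cover biases and BatchNorm scale/shift parameters.
        if len(param.shape) == 1 or name.endswith(".bias"):
            no_decay.append(param)
        else:
            decay.append(param)
    return [{'params': no_decay, 'weight_decay': 0.0},
            {'params': decay}]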
def train(total_epochs=30, early_stopping=True, threshold=5,
          checkpoint_path='/content/gdrive/My Drive/NMT/checkpoints/my_model/',
          save=True, write=True):
    ### immutable training session data ###
    model_data = retrieve_model_data(checkpoint_path=checkpoint_path)
    train_batches = model_data["train_batches"]
    dev_batches = model_data["dev_batches"]
    dev_references = model_data["references"]
    idx_to_trg_word = model_data["idx_to_trg_word"]
    hyperparams = model_data["hyperparams"]
    #######################################

    ### mutable training session data ###
    model, optimizer, checkpoint = load_checkpoint(hyperparams, checkpoint_path,
                                                   "most_recent_model")
    epoch = checkpoint["epoch"]
    epoch_loss = checkpoint["epoch_loss"]
    bleu = checkpoint["bleu"]
    prev_bleu = checkpoint["prev_bleu"]
    best_bleu = checkpoint["best_bleu"]
    bad_epochs_count = checkpoint["bad_epochs_count"]
    #######################################

    if epoch == 0:
        # loaded a checkpoint that has been trained for zero epochs.
        print("training model from scratch...")
        print()
        start_epoch = 1
    else:
        print(f"loaded model checkpoint from epoch: {epoch:02d}")
        print(f"loss: {epoch_loss:.4f}, bleu: {bleu:.2f}, "
              f"prev_bleu: {prev_bleu:.2f}, best_bleu: {best_bleu:.2f}, "
              f"bad_epochs_count: {bad_epochs_count:02d}")
        start_epoch = epoch + 1
        print(f"resuming training from epoch {start_epoch}...")
        print()

    ### training loop ##############################
    for epoch in range(start_epoch, total_epochs + 1):
        epoch_loss = 0.
        random.shuffle(train_batches)
        epoch_start_time = time.time()
        for batch in train_batches:
            epoch_loss += training_step(model, optimizer, batch)
        epoch_time = time.time() - epoch_start_time

        dev_translations, preds_time, post_time = predict(model, dev_batches,
                                                          idx_to_trg_word,
                                                          checkpoint_path, epoch,
                                                          write=write)
        bleu = evaluate(dev_translations, dev_references)
        model.train()
        model.encoder.train()
        model.decoder.train()
        report_stats(epoch, epoch_loss, epoch_time, preds_time, bleu,
                     checkpoint_path, post_time)

        if early_stopping:
            # If this epoch's model performed better on the dev set than the
            # previous epoch's model, bad_epochs_count resets to 0 (it need not
            # have outperformed the best model, just the most recent one).
            bad_epochs_count = (bad_epochs_count + 1) if epoch > 1 and bleu <= prev_bleu else 0
            if bleu > best_bleu:
                best_bleu = bleu
                # When training terminates, we can load the best model rather
                # than the potentially suboptimal model of the final epoch.
                store_checkpoint(model, optimizer, epoch, epoch_loss, bleu,
                                 prev_bleu, best_bleu, bad_epochs_count,
                                 checkpoint_path, "best_model")
            if bad_epochs_count == threshold:
                # early-stopping threshold met
                best_model, optimizer, checkpoint = load_checkpoint(
                    hyperparams, checkpoint_path, "best_model")
                return best_model, checkpoint["epoch_loss"]
        if save:
            # Store a checkpoint each epoch, e.g. so training can resume later.
            store_checkpoint(model, optimizer, epoch, epoch_loss, bleu, prev_bleu,
                             best_bleu, bad_epochs_count, checkpoint_path,
                             "most_recent_model")
        prev_bleu = bleu
    ################################################

    if early_stopping:
        best_model, optimizer, checkpoint = load_checkpoint(
            hyperparams, checkpoint_path, "best_model")
        return best_model, checkpoint["epoch_loss"]
    else:
        return model, epoch_loss
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='smddp', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)
    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)
    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()
    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep,
                            gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')
    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300,
                            args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc:
                                    storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}
    total_time = 0
    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)
    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    train_loader, val_dataloader, encoder,
                                    iteration, logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time
        if torch.distributed.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)
        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()
    if torch.distributed.get_rank() == 0:
        DLLogger.log((), {'Total training time': '%.2f secs' % total_time})
    logger.log_summary()
def predict_from_raw_dataset():
    # `e` is assumed to be an evaluation module imported elsewhere in this file.
    e.evaluate()
def train(train_loop_func, logger, args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)
    ssd300 = SSD300(backbone=args.backbone)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)
    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()
    if args.fp16 and not args.amp:
        ssd300 = network_to_half(ssd300)
    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep,
                            gamma=0.1)
    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc:
                                    storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            ssd300.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}
    total_time = 0
    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)
    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    train_loader, val_dataloader, encoder,
                                    iteration, logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time
        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)
        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)
        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
def train(args):
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)
    ssd300 = SSD300(len(cocoGt.cats) + 1)
    args.learning_rate = args.learning_rate * \
        args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)
    ssd300.cuda()
    loss_func.cuda()
    if args.fp16:
        ssd300 = network_to_half(ssd300)
    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep,
                            gamma=0.1)
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)
    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    avg_loss = 0.0
    acc = 0
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end
    args.train_annotate = os.path.join(args.data,
                                       "annotations/instances_train2017.json")
    args.train_coco_root = os.path.join(args.data, "train2017")
    local_seed = set_seeds(args)
    if args.data_pipeline == 'no_dali':
        train_trans = SSDTransformer(dboxes, args, (300, 300), val=False)
        train_dataset = get_train_dataset(args, train_trans)
        train_loader = get_train_loader(train_dataset, args, args.num_workers)
    elif args.data_pipeline == 'dali':
        train_loader = get_train_dali_loader(args, dboxes, local_seed)

    for epoch in range(args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        epoch_loop(train_loader, args, ssd300, time.time(), loss_func, optimizer,
                   iteration, avg_loss, batch_perf, epoch)
        torch.cuda.synchronize()
        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        try:
            train_loader.reset()
        except AttributeError:
            pass
    if args.local_rank == 0:
        print("Training end: Average speed: {:3f} img/sec, "
              "Total time: {:3f} sec, Final accuracy: {:3f} mAP".format(
                  args.N_gpu * args.batch_size / batch_perf.avg,
                  time.time() - train_start, acc))
def main():
    args = parse_arguments()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    worker_init = WorkerInitObj(args.seed)

    device, args = setup_training(args)
    test_data = prepare_test_data(args)
    model, optimizer, criterion = prepare_model_and_optimizer(args, device)
    pool = ProcessPoolExecutor(1)
    train_iter = ml_1mTrainDataLoader(path=args.train_path,
                                      num_negs=args.num_negs,
                                      batch_size=args.train_batch_size,
                                      seed=args.seed, worker_init=worker_init)

    print('-' * 50 + 'args' + '-' * 50)
    for k in list(vars(args).keys()):
        print('{0}: {1}'.format(k, vars(args)[k]))
    print('-' * 30)
    print(model)
    print('-' * 50 + 'args' + '-' * 50)

    global_step = 0
    global_HR = 0.0
    global_NDCG = 0.0
    s_time_train = time.time()
    for epoch in range(args.epoch):
        dataset_future = pool.submit(ml_1mTrainDataLoader, args.train_path,
                                     args.num_negs, args.train_batch_size,
                                     args.seed, worker_init)
        for step, batch in enumerate(train_iter):
            model.train()
            batch = [t.to(device) for t in batch]
            users, items, labels = batch
            logits = model(users, items)
            loss = criterion(logits, labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # evaluate
            if global_step != 0 and global_step % args.eval_freq == 0:
                s_time_eval = time.time()
                model.eval()
                hits, ndcgs = evaluate(model, test_data, device, args.topk)
                e_time_eval = time.time()
                print('-' * 68)
                print('Epoch:[{0}] Step:[{1}] HR:[{2}] NDCG:[{3}] time:[{4}s]'.format(
                    epoch, global_step, format(hits, '.4f'), format(ndcgs, '.4f'),
                    format(e_time_eval - s_time_eval, '.4f')))
                if hits > global_HR and ndcgs > global_NDCG:
                    model_to_save = model.module if hasattr(model,
                                                            'module') else model
                    output_save_file = os.path.join(
                        args.output_dir,
                        "{}_hr_{}_ndcg_{}_step_{}_ckpt.pt".format(
                            args.model_name, format(hits, '.4f'),
                            format(ndcgs, '.4f'), global_step))
                    if os.path.exists(output_save_file):
                        os.system('rm -rf {}'.format(output_save_file))
                    torch.save(
                        {
                            'model': model_to_save.state_dict(),
                            'name': args.model_name
                        }, output_save_file)
                    print('Epoch:[{0}] Step:[{1}] SavePath:[{2}]'.format(
                        epoch, global_step, output_save_file))
                    global_HR = hits
                    global_NDCG = ndcgs
                print('-' * 68)

            # log
            if global_step != 0 and global_step % args.log_freq == 0:
                e_time_train = time.time()
                print('Epoch:[{0}] Step:[{1}] Loss:[{2}] Lr:[{3}] time:[{4}s]'.format(
                    epoch, global_step, format(loss.item(), '.4f'),
                    format(optimizer.param_groups[0]['lr'], '.6'),
                    format(e_time_train - s_time_train, '.4f')))
                s_time_train = time.time()
            global_step += 1
        del train_iter
        train_iter = dataset_future.result(timeout=None)
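# A minimal sketch of the HR@k / NDCG@k metrics that evaluate() above appears to
# report (hit_ratio_and_ndcg is a hypothetical helper, not this project's actual
# implementation). For one test case, the positive item is ranked among sampled
# negatives; HR@k is 1 if it lands in the top k, and NDCG@k discounts its rank
# logarithmically.
import math

def hit_ratio_and_ndcg(ranked_items, positive_item, k=10):
    top_k = ranked_items[:k]
    if positive_item in top_k:
        rank = top_k.index(positive_item)  # 0-based rank within the top-k list
        return 1.0, 1.0 / math.log2(rank + 2)
    return 0.0, 0.0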
val_n = 0
for batch in dataloader_val:
    val_n += batch_size
    input_batches, input_lengths = batch['input'], batch['length'].numpy().tolist()
    input_batches, input_lengths = zip(
        *sorted(zip(input_batches, input_lengths), key=lambda x: x[1],
                reverse=True))
    input_batches, input_lengths = torch.stack(input_batches), list(input_lengths)
    input_batches = input_batches[:, :max(input_lengths)]
    input_batches = input_batches.transpose(0, 1)
    val_loss, real, generated = evaluate(encoder, decoder, input_batches,
                                         input_lengths, input_batches,
                                         input_lengths, batch_size, lang1)
    print_loss_total += val_loss
    if val_n % print_every_val == 0:
        logger.info('\n-- Real sentence: {0},\n-- Generated sentence: {1}'.format(
            ' '.join(real), ' '.join(generated)))
    val_n += 1
print_loss_avg = print_loss_total / val_n
print_loss_total = 0
print_summary = 'VAL_LOSS_INFO: Epoch:%d - Batch:%d - Val_loss:%.4f' % (
    epoch, batch_n, print_loss_avg)
logger.info(print_summary)
torch.cuda.empty_cache()
def test_outdrop(
        checkpoint_path='/content/gdrive/My Drive/NMT/unittests/checkpoints/',
        config_path='/content/gdrive/My Drive/NMT/configs/',
        corpus_path='/content/gdrive/My Drive/NMT/unittests/first_ten_sentences/'
):
    hyperparams = import_configs(config_path=config_path, unittesting=True)
    # use word-level vocab
    hyperparams["vocab_type"] = "word"
    hyperparams["trim_type"] = "top_k"
    hyperparams["enc_dropout"] = .5
    hyperparams["dec_dropout"] = .5
    print(f"hidden size: {hyperparams['dec_hidden_size']}")
    construct_model_data("train.de", "train.en", hyperparams=hyperparams,
                         corpus_path=corpus_path,
                         checkpoint_path=checkpoint_path, overfit=True)
    # model of sufficient capacity should be able to bring loss down to ~zero.
    model, loss = train(total_epochs=100, early_stopping=False,
                        checkpoint_path=checkpoint_path, save=False, write=False)
    assert loss < .01
    model_data = retrieve_model_data(checkpoint_path=checkpoint_path)
    dev_batches = model_data["dev_batches"]  # holds the training data, bc overfit=True
    dev_references = model_data["references"]  # holds the training data, bc overfit=True
    idx_to_trg_word = model_data["idx_to_trg_word"]
    # greedy search should be able to perfectly predict the training data.
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100
    # beam search should be able to perfectly predict the training data.
    model.decoder.set_inference_alg("beam_search")
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100


# def test_default_subword_model():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "subword_joint"
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#         corpus_path=corpus_path, overfit=True, write=False
#         )
#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # default word model, except don't divide scores by the scaling factor inside the attention fn.
# def test_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "word"
#     hyperparams["trim_type"] = "top_k"
#     hyperparams["attention_fn"] = "dot_product"
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#         corpus_path=corpus_path, overfit=True, write=False
#         )
#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # no weight tying, no additional attention layer
# def test_no_tying():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "word"
#     hyperparams["trim_type"] = "top_k"
#     hyperparams["attention_layer"] = False
#     hyperparams["tie_weights"] = False
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#         corpus_path=corpus_path, overfit=True, write=False
#         )
#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # no weight tying and no attention mechanism.
# def test_no_attn_no_tying(): # hyperparams = import_configs(config_path=config_path, unittesting=True) # hyperparams["vocab_type"] = "word" # hyperparams["trim_type"] = "top_k" # hyperparams["attention_fn"] = "none" # hyperparams["attention_layer"] = False # hyperparams["tie_weights"] = False # train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams, # corpus_path=corpus_path, overfit=True, write=False # ) # predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path) # # default model, except dropout after lstm is turned on. # def test_dropout(): # hyperparams = import_configs(config_path=config_path, unittesting=True) # hyperparams["enc_dropout"] = 0.2 # hyperparams["dec_dropout"] = 0.2 # train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams, # corpus_path=corpus_path, overfit=True, write=False # ) # predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path) # # ensure still works on cpu. # # must change runtime type to cpu before performing this test # # def test_default_word_model_cpu(): # # hyperparams = import_configs(config_path=config_path, unittesting=True) # # hyperparams["vocab_type"] = "word" # # hyperparams["trim_type"] = "top_k" # # hyperparams["device"] = "cpu" # # train_batches, dev_batches, test_batches, vocabs, ref_corpuses, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams, # # corpus_path=corpus_path, overfit=True, write=False # # ) # # predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path) # # simplest possible model. # # - unidirectional encoder. # # - no attention mechanism. 
# def test_uni_no_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["attention_fn"] = "none"
#     constrain_configs(hyperparams) # ensure passes constraint-check
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#         corpus_path=corpus_path, overfit=True, write=False
#         )
#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # two-layer vanilla network with layer_to_layer decoder_init_scheme
# def test_layer_to_layer_uni_no_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["enc_num_layers"] = 2
#     hyperparams["dec_num_layers"] = 2
#     hyperparams["decoder_init_scheme"] = "layer_to_layer"
#     hyperparams["attention_fn"] = "none"
#     hyperparams["bidirectional"] = False
#     constrain_configs(hyperparams) # ensure passes constraint-check
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#         corpus_path=corpus_path, overfit=True, write=False
#         )
#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # two-layer vanilla network with final_to_first decoder_init_scheme
# def test_final_to_first_uni_no_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["enc_num_layers"] = 2
#     hyperparams["dec_num_layers"] = 2
#     hyperparams["decoder_init_scheme"] = "final_to_first"
#     hyperparams["attention_fn"] = "none"
#     hyperparams["bidirectional"] = False
#     constrain_configs(hyperparams) # ensure passes constraint-check
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#         corpus_path=corpus_path, overfit=True, write=False
#         )
#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # associate an epoch number with each saved model, so we can verify the correct model was stored.
# def test_early_stopping():
#     # set random seed
#     pass
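# The commented-out tests above all call a predict_train_data() helper that
# isn't defined in this file. Judging from the live tests, it presumably
# overfits the model and then checks that greedy and beam search both
# reproduce the training references. A hedged sketch:
def predict_train_data(hyperparams, train_batches, dev_batches, references,
                       idx_to_trg_word, checkpoint_path):
    # model of sufficient capacity should be able to bring loss down to ~zero.
    model, loss = train(total_epochs=100, early_stopping=False,
                        checkpoint_path=checkpoint_path, save=False,
                        write=False)
    assert loss < .01
    # greedy search (the default inference algorithm) should perfectly
    # predict the training data.
    translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                 checkpoint_path)
    assert evaluate(translations, references) >= 100
    # beam search should, too.
    model.decoder.set_inference_alg("beam_search")
    translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                 checkpoint_path)
    assert evaluate(translations, references) >= 100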
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(10000)
    if args.distributed:
        # give every rank a distinct but deterministic seed
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    # args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    print(f"Actual starting LR: {args.learning_rate}")
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    # optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
    #                             momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True)
    optimizer = torch.optim.AdamW(tencent_trick(ssd300),
                                  lr=args.learning_rate,
                                  betas=(0.8, 0.999),
                                  eps=1e-08,
                                  weight_decay=0.01,
                                  amsgrad=True)

    # scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    # scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=20, T_mult=1, eta_min=1e-6)
    scheduler = CosineAnnealingLR(optimizer=optimizer,
                                  T_max=args.epochs,
                                  eta_min=1e-6)
    # scheduler = OneCycleLR(optimizer, max_lr=0.003, epochs=41, steps_per_epoch=173)
    # scheduler = CyclicLR(optimizer, base_lr=args.learning_rate, max_lr=2*args.learning_rate,
    #                      step_size_up=173*3, step_size_down=173*10)

    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300,
                            args.checkpoint)
            checkpoint = torch.load(
                args.checkpoint,
                map_location=lambda storage, loc: storage.cuda(
                    torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        # scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    scheduler, train_loader, val_dataloader,
                                    encoder, iteration, logger, args, mean,
                                    std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time
        # step the scheduler once per epoch, after the optimizer steps:
        # https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
        scheduler.step()

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {:.2f}s'.format(total_time))
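# Restoring one of the checkpoints written by the loop above is the mirror
# image of the save. The helper below is a sketch (the function name is made
# up, but the dict keys match the obj saved above):
import torch


def resume_from_checkpoint(path, ssd300, optimizer, scheduler,
                           distributed=False):
    """Sketch: restore model/optimizer/scheduler state and return the
    (epoch, iteration) pair to resume from."""
    checkpoint = torch.load(path, map_location='cpu')
    model = ssd300.module if distributed else ssd300
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    return checkpoint['epoch'], checkpoint['iteration']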
def main():
    arg = args()
    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)
    print(arg.exp_name.split('/')[0])
    assert arg.exp_name.split('/')[0] == 'o', \
        "'o' is the directory of experiments, --exp_name o/..."
    output_dir = arg.exp_name
    logger = logging_set(output_dir)
    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu
    torch.backends.cudnn.enabled = True
    config = edict(yaml.load(open(arg.cfg, 'r'), Loader=yaml.FullLoader))
    config.test.dataset_name = arg.dataset
    config.test.flip_test = arg.flip_test
    config.test.batchsize = 128
    config.model.margin_to_border = arg.margin
    logger.info(
        '------------------------------ configuration ---------------------------')
    logger.info('\n==> available {} GPUs, numbers are {}\n'.format(
        torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"]))
    logger.info(pprint.pformat(config))
    logger.info(
        '-------------------------------------------------------------------------')
    criterion = MSELoss()
    Arch = bulid_up_network(config, criterion)
    if arg.param_flop:
        Arch._print_info()
    logger.info("=========> current architecture's values before evaluation")
    if hasattr(Arch.backbone, "alphas"):
        Arch.backbone._show_alpha()
        Arch.backbone._show_beta()
        for id, group in enumerate(Arch.groups):
            group._show_alpha()
            group._show_beta()
    if arg.test_model:
        logger.info('\n===> load ckpt from: {}'.format(arg.test_model))
        Arch.load_state_dict(torch.load(arg.test_model))
    elif config.test.ckpt != '':
        logger.info('\n===> load ckpt from: ' + config.test.ckpt + ' ...')
        Arch.load_state_dict(torch.load(config.test.ckpt))
    elif os.path.exists(os.path.join(output_dir, 'best_ckpt.tar')):
        logger.info('\n===> load ckpt from: ' +
                    os.path.join(output_dir, 'best_ckpt.tar'))
        Arch.load_state_dict(
            torch.load(os.path.join(output_dir, 'best_ckpt.tar')))
    else:
        logger.info('\n===> no ckpt found, using the initial model ...')
        # raise ValueError
    # logger.info(Arch.backbone.alphas)
    logger.info("=========> architecture's parameters")
    if hasattr(Arch, "backbone"):
        if hasattr(Arch.backbone, "alphas"):
            Arch.backbone._show_alpha(original_value=False)
            Arch.backbone._show_beta(original_value=False)
            for g in Arch.groups:
                g._show_alpha(original_value=False)
                g._show_beta(original_value=False)
    Arch = torch.nn.DataParallel(Arch).cuda()

    valid_dataset = dataset_(config,
                             config.images_root_dir,
                             config.annotation_root_dir,
                             mode='val',
                             transform=torchvision.transforms.Compose([
                                 torchvision.transforms.ToTensor(),
                                 torchvision.transforms.Normalize(
                                     mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
                             ]))
    # test_img(valid_dataset, output_dir)
    valid_dt_dataset = dataset_(config,
                                config.images_root_dir,
                                config.person_detection_results_path,
                                mode='dt',
                                dataset=config.test.dataset_name,
                                transform=torchvision.transforms.Compose([
                                    torchvision.transforms.ToTensor(),
                                    torchvision.transforms.Normalize(
                                        mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])
                                ]))
    if arg.use_dt:
        logger.info("\n>>> use detection results")
        valid_dataloader = torch.utils.data.DataLoader(
            valid_dt_dataset,
            batch_size=config.test.batchsize,
            shuffle=False,
            num_workers=4,
            pin_memory=True)
    else:
        logger.info("\n>>> use ground-truth bboxes")
        valid_dataloader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=config.test.batchsize,
            shuffle=False,
            num_workers=4,
            pin_memory=True)

    if arg.visualize:
        for i in range(len(valid_dataset)):
            imageid = 185250  # an image_id from the COCO val set
            if valid_dataset[i][1] != imageid:  # choose an image_id
                continue
            print(valid_dataset[i][1])
            sample = valid_dataset[i]
"visualize the predicted heatmap of image id {} ".format( imageid)) img = sample[0].unsqueeze(0) #samples = next(iter(valid_dataloader)) #img = samples[0] output = Arch(img) print(img.size(), output.size()) visualize_heatamp(img, output, 'heatmaps', show_img=False) break results = evaluate(Arch, valid_dataloader, config, output_dir) logger.info('map = {}'.format(results))
def main():
    arg = args()
    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)
    assert arg.exp_name.split('/')[0] == 'o', \
        "'o' is the directory of experiments, --exp_name o/..."
    output_dir = arg.exp_name
    save_scripts_in_exp_dir(output_dir)
    logger = logging_set(output_dir)
    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)
    config = edict(yaml.load(open(arg.cfg, 'r'), Loader=yaml.FullLoader))
    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search
    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize
    config.num_workers = arg.num_workers
    print(
        'GPU memory : \ntotal | used\n',
        os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        ).read())
    logger.info(
        '------------------------------ configuration ---------------------------')
    logger.info(
        '\n==> {} GPUs available, visible devices are {}, current device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info(
        '-------------------------------------------------------------------------')
    criterion = MSELoss()
    Arch = bulid_up_network(config, criterion)
    if config.train.arch_search_strategy == 'random':
        logger.info("==> random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()
    if arg.param_flop:
        Arch._print_info()
    # dump_input = torch.rand((1, 3, 128, 128))
    # graph = SummaryWriter(output_dir + '/log')
    # graph.add_graph(Arch, (dump_input, ))
    if len(arg.gpu) > 1:
        use_multi_gpu = True
        Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()
    Search = Search_Arch(Arch.module, config) if use_multi_gpu else Search_Arch(
        Arch, config)  # Arch.module for nn.DataParallel
    search_strategy = config.train.arch_search_strategy
    train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config,
                                                       arg)
    # Note: if the search strategy is 'None' or 'sync', arch_queue is None!
    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # the arch parameters are registered among the model's parameters,
        # so the weight optimizer also updates them
        logger.info(
            "sync: the arch parameters are optimized by the weight optimizer synchronously")
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # for the None, random and gradient-based strategies, the arch
        # parameters are filtered out of the weight optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config.train.lr_step_size,
    #                                             gamma=config.train.lr_decay_gamma)
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best result so far
    best = 0
    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+==")
    begin, end = config.train.epoch_begin, config.train.epoch_end
    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir,
                                    logger)

    for epoch in range(begin, end):
        lr = scheduler.get_lr()[0]  # scheduler.get_last_lr()[0] on PyTorch >= 1.4
        logger.info(
            '==> time:({}) -- training ... current learning rate is {:.7f}'.format(
                datetime.datetime.now(), lr))
        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()
        eval_results = evaluate(Arch, valid_queue, config, output_dir)
        if use_multi_gpu:
            best = save_model(epoch, best, eval_results, Arch.module,
                              optimizer, scheduler, output_dir, logger)
        else:
            best = save_model(epoch, best, eval_results, Arch, optimizer,
                              scheduler, output_dir, logger)

        # visualize heatmaps every 5 epochs
        if arg.visualize and epoch % 5 == 0:
            for i in range(len(valid_queue.dataset)):
                if valid_queue.dataset[i][1] != 185250:  # choose an image_id
                    continue
                print(valid_queue.dataset[i][1])
                sample = valid_queue.dataset[i]
                img = sample[0].unsqueeze(0)
                # samples = next(iter(valid_dataloader))
                # img = samples[0]
                output = Arch(img)
                print(img.size(), output.size())
                visualize_heatamp(img, output, 'heatmaps', show_img=False)
                break
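# filter_arch_parameters() above is what keeps the architecture parameters
# out of the weight optimizer for the non-sync strategies. A plausible
# sketch, assuming the arch parameters carry 'alpha'/'beta' in their names
# (per the _show_alpha/_show_beta methods used throughout), not the repo's
# actual implementation:
def filter_arch_parameters(model):
    """Sketch: yield only weight parameters, skipping arch parameters."""
    for name, param in model.named_parameters():
        if 'alpha' in name or 'beta' in name:
            continue  # architecture parameter; updated by the arch search
        yield param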