Example #1
def run():
    # the first parameter of get_system_corpus specifies the system to be benchmarked.
    # this string should match a name in the Docker compose file or a name listed in get_system_corpus
    #
    # the second parameter specifies the corpus to be used for the benchmark
    # see tp.Corpus for the possible options
    system_corpus = get_system_corpus('dialogflow', tp.Corpus.SNIPS2017)
    evaluate(system_corpus)
Example #2
def evaluate_and_write(args, model, tasks, splits_to_write):
    """ Evaluate a model on dev and/or test, then write predictions """
    val_results, val_preds = evaluate.evaluate(model, tasks, args.batch_size, args.cuda, "val")
    if 'val' in splits_to_write:
        evaluate.write_preds(tasks, val_preds, args.run_dir, 'val',
                             strict_glue_format=args.write_strict_glue_format)
    if 'test' in splits_to_write:
        _, te_preds = evaluate.evaluate(model, tasks, args.batch_size, args.cuda, "test")
        evaluate.write_preds(tasks, te_preds, args.run_dir, 'test',
                             strict_glue_format=args.write_strict_glue_format)
    run_name = args.get("run_name", os.path.basename(args.run_dir))

    results_tsv = os.path.join(args.exp_dir, "results.tsv")
    log.info("Writing results for split 'val' to %s", results_tsv)
    evaluate.write_results(val_results, results_tsv, run_name=run_name)
Example #3
def validate(model, valid_loader, valid_df, args, tokenizer, ner_index, save_result=False, progress=False, limit=None,
             decode_mode='greedy'):
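    # Run prediction over the validation loader, score the joined predictions with evaluate(),
    # and print the first few example rows (a, b, current, prediction, label) for inspection.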
    run_root = Path('../experiments/' + args.run_root)
    predictions = predict(model, valid_loader, args, tokenizer, progress=True, limit=limit, decode_mode=decode_mode)
    # valid_df = valid_df.loc[ner_index,:]
    # new_predictions = []
    # for index, item in enumerate(ner_index):
    #     if ner_index[index]:
    #         new_predictions.append(predictions[index])
    # predictions = new_predictions
    valid_label = valid_df['eval_label'].tolist()
    print_label = valid_df['label'].tolist()
    a = valid_df['a'].tolist()
    b = valid_df['b'].tolist()
    current = valid_df['current'].tolist()
    # print(len(predictions),len(valid_label))
    predictions = [' '.join(x) for x in predictions]
    valid_metric = evaluate(predictions, valid_label)
    print(valid_metric)
    print('------------')
    for i, (a, b, current, p, l) in enumerate(zip(a, b, current, predictions, print_label)):
        print(a,' | ', b,' | ', current,' | ', p.replace(' ',''),' | ', l)
        if i >= args.print_num:
            break
    return valid_metric
Example #4
def test_default_subword_model(
        checkpoint_path='/content/gdrive/My Drive/NMT/unittests/checkpoints/',
        config_path='/content/gdrive/My Drive/NMT/configs/',
        corpus_path='/content/gdrive/My Drive/NMT/unittests/first_ten_sentences/'
):
    hyperparams = import_configs(config_path=config_path, unittesting=True)
    # use subword-level vocab
    hyperparams["vocab_type"] = "subword_joint"
    #hyperparams["learning_rate"] = .01 # increase learning rate
    print(f"vocab_type: {hyperparams['vocab_type']}")
    print(f"tie_weights: {hyperparams['tie_weights']}")

    construct_model_data("train.de",
                         "train.en",
                         hyperparams=hyperparams,
                         corpus_path=corpus_path,
                         checkpoint_path=checkpoint_path,
                         overfit=True)

    # model of sufficient capacity should be able to bring loss down to ~zero.
    model, loss = train(total_epochs=100,
                        early_stopping=False,
                        checkpoint_path=checkpoint_path,
                        save=False,
                        write=True)
    assert loss < .01

    model_data = retrieve_model_data(checkpoint_path=checkpoint_path)
    dev_batches = model_data["dev_batches"]  # holds the training data, because overfit=True
    dev_references = model_data["references"]  # holds the training data, because overfit=True
    idx_to_trg_word = model_data["idx_to_trg_word"]

    # greedy search should be able to perfectly predict the training data.
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100

    # beam search should be able to perfectly predict the training data.
    model.decoder.set_inference_alg("beam_search")
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100
Example #5
def main(project_parameters):
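    # Dispatch on project_parameters.mode: train, evaluate, predict (optionally via a GUI), or tune.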
    result = None
    if project_parameters.mode == 'train':
        result = train(project_parameters=project_parameters)
    elif project_parameters.mode == 'evaluate':
        if project_parameters.predefined_dataset is not None:
            print('temporarily does not support predefined dataset.')
        else:
            evaluate(project_parameters=project_parameters)
    elif project_parameters.mode == 'predict':
        if project_parameters.use_gui:
            gui = GUI(project_parameters=project_parameters)
            gui.run()
        else:
            result = Predict(project_parameters=project_parameters)(
                data_path=project_parameters.data_path)
            print(('{},' * project_parameters.num_classes).format(
                *project_parameters.classes)[:-1])
            print(result)
    elif project_parameters.mode == 'tune':
        result = tune(project_parameters=project_parameters)
    return result
Example #6
def analyse_fans(header, the_url, her_info, db):
    """
    获取博主粉丝列表首页的粉丝,并进行分析判断是不是我想找的人
    :param header: 浏览器头,包含Cookie
    :param the_url: 博主的粉丝列表链接
    :param her_info: 她的信息
    :param db: 数据库管理
    :return: 无
    """
    # Fetch the HTML of the blogger's follower page
    html_str = util.get_html(header=header, the_url=the_url)

    # -----------------------------Step 1-------------------------------------------------
    # Get the list of fans
    fan_list = fans.get_fans_list(html_str)
    # print("找到粉丝%s个" % len(fan_list))
    # For now, only take the first one
    first_fan = [
        fan_list[0],
    ]
    print(('找到粉丝:%s' % first_fan[0].__str__()).encode('gbk',
                                                      'ignore').decode('gbk'))
    for fan in first_fan:
        # ------------------------Step 2-------------------------------------------------
        # Estimate the likelihood that this person is the one I am looking for
        chance = analyse.evaluate(fan.__dict__, her_info)
        # print(fan)
        if chance > 0:
            # --------------------Step 3.2-----------------------------------------------
            # Search keywords; the more detailed and precise, the better
            key_words = her_info['key_words']
            # print(key_words)
            # Get more information about this fan
            print('分析粉丝"%s"中...' % fan.name)
            match_school, count = search_more_info_of_fan(
                header, fan.url, key_words, '成都医学院')
            # print(match_school, count)

            if count == -1 and not match_school:  # returned because searches were too frequent; fall back to searching all of the user's Weibo posts
                print('搜索用户的所有微博中...')
                # --------------------Step 4.1---------------------------------------------
                match_school, count = find_more_info_in_fan_assays(
                    header, fan, key_words, '成都医学院')
            if match_school or count > 0:
                # --------------------Step 5.1----------------------------------------------
                print('找到符合条件的粉丝', fan)
                db.add_a_fan(fan, match_school, count)
                mail.send_email('找到符合条件的粉丝', fan.__str__())
            else:
                # --------------------Step 5.2----------------------------------------------
                print('分析完成,该粉丝不是我要找的')
Example #7
def evaluate_model(self):
    ''' Evaluate the current model's performance. '''
    try:
        x, y = np.array(self.keys_use), np.array(self.values_use)
        x_train = np.array(x[:-self.span])
        y_train = y[:-self.span]
        x_test = np.array(x[-self.span:])
        y_test = y[-self.span:]
        self.model.fit(x_train, y_train)
        y_pred = self.model.predict(x_test)
        evaluation = evaluate(y_test, y_pred)
        self.model_details = {
            "r2": evaluation["r2"],
            "msle": evaluation["msle"]
        }
    except Exception as ex:
        logger.error(ex)
Example #8
def main(cl_arguments):
    ''' Train or load a model. Evaluate on some tasks. '''
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)

    # Logistics #
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)      # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)      # e.g. <project_dir>/jiant-demo/sst
    log.getLogger().addHandler(log.FileHandler(args.local_log_path))

    if cl_args.remote_log:
        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from src import emails
        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _try_logging_git_info()

    log.info("Parsed args: \n%s", args)

    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected"
                                       " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning(
                "GPU access failed. You might be using a CPU-only installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # Prepare data #
    log.info("Loading tasks...")
    start_time = time.time()
    train_tasks, eval_tasks, vocab, word_embs = build_tasks(args)
    if any([t.val_metric_decreases for t in train_tasks]) and any(
            [not t.val_metric_decreases for t in train_tasks]):
        log.warn("\tMixing training tasks with increasing and decreasing val metrics!")
    tasks = sorted(set(train_tasks + eval_tasks), key=lambda x: x.name)
    log.info('\tFinished loading tasks in %.3fs', time.time() - start_time)
    log.info('\t Tasks: {}'.format([task.name for task in tasks]))

    # Build or load model #
    log.info('Building model...')
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info('\tFinished building model in %.3fs', time.time() - start_time)

    # Check that necessary parameters are set for each step. Exit with error if not.
    steps_log = []

    if not args.load_eval_checkpoint == 'none':
        assert_for_log(os.path.exists(args.load_eval_checkpoint),
                       "Error: Attempting to load model from non-existent path: [%s]" %
                       args.load_eval_checkpoint)
        assert_for_log(
            not args.do_train,
            "Error: Attempting to train a model and then replace that model with one from a checkpoint.")
        steps_log.append("Loading model from path: %s" % args.load_eval_checkpoint)

    if args.do_train:
        assert_for_log(args.train_tasks != "none",
                       "Error: Must specify at least on training task: [%s]" % args.train_tasks)
        assert_for_log(
            args.val_interval %
            args.bpp_base == 0, "Error: val_interval [%d] must be divisible by bpp_base [%d]" %
            (args.val_interval, args.bpp_base))
        steps_log.append("Training model on tasks: %s" % args.train_tasks)

    if args.train_for_eval:
        steps_log.append("Re-training model for individual eval tasks")
        assert_for_log(
            args.eval_val_interval %
            args.bpp_base == 0, "Error: eval_val_interval [%d] must be divisible by bpp_base [%d]" %
            (args.eval_val_interval, args.bpp_base))
        assert_for_log(len(set(train_tasks).intersection(eval_tasks)) == 0
                       or args.allow_reuse_of_pretraining_parameters
                       or args.do_train == 0,
                       "If you're pretraining on a task you plan to reuse as a target task, set\n"
                       "allow_reuse_of_pretraining_parameters = 1(risky), or train in two steps:\n"
                       "  train with do_train = 1, train_for_eval = 0, stop, and restart with\n"
                       "  do_train = 0 and train_for_eval = 1.")

    if args.do_eval:
        assert_for_log(args.eval_tasks != "none",
                       "Error: Must specify at least one eval task: [%s]" % args.eval_tasks)
        steps_log.append("Evaluating model on tasks: %s" % args.eval_tasks)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    log.info("Will run the following steps:\n%s", '\n'.join(steps_log))
    if args.do_train:
        # Train on train tasks #
        log.info("Training...")
        params = build_trainer_params(args, task_names=[])
        stop_metric = train_tasks[0].val_metric if len(train_tasks) == 1 else 'macro_avg'
        should_decrease = train_tasks[0].val_metric_decreases if len(train_tasks) == 1 else False
        trainer, _, opt_params, schd_params = build_trainer(params, model,
                                                            args.run_dir,
                                                            should_decrease)
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        best_epochs = trainer.train(train_tasks, stop_metric,
                                    args.batch_size, args.bpp_base,
                                    args.weighting_method, args.scaling_method,
                                    to_train, opt_params, schd_params,
                                    args.shared_optimizer, args.load_model, phase="main")

    # Select model checkpoint from main training run to load
    if not args.train_for_eval:
        log.info("In strict mode because train_for_eval is off. "
                 "Will crash if any tasks are missing from the checkpoint.")
        strict = True
    else:
        strict = False

    if args.train_for_eval and not args.allow_reuse_of_pretraining_parameters:
        # If we're training models for evaluation, which is always done from scratch with a fresh
        # optimizer, we shouldn't load parameters for those models.
        # Usually, there won't be trained parameters to skip, but this can happen if a run is killed
        # during the train_for_eval phase.
        task_names_to_avoid_loading = [task.name for task in eval_tasks]
    else:
        task_names_to_avoid_loading = []

    if not args.load_eval_checkpoint == "none":
        log.info("Loading existing model from %s...", args.load_eval_checkpoint)
        load_model_state(model, args.load_eval_checkpoint,
                         args.cuda, task_names_to_avoid_loading, strict=strict)
    else:
        # Look for eval checkpoints (available only if we're restoring from a run that already
        # finished), then look for training checkpoints.
        eval_best = glob.glob(os.path.join(args.run_dir,
                                           "model_state_eval_best.th"))
        if len(eval_best) > 0:
            load_model_state(
                model,
                eval_best[0],
                args.cuda,
                task_names_to_avoid_loading,
                strict=strict)
        else:
            macro_best = glob.glob(os.path.join(args.run_dir,
                                                "model_state_main_epoch_*.best_macro.th"))
            if len(macro_best) > 0:
                assert_for_log(len(macro_best) == 1,
                               "Too many best checkpoints. Something is wrong.")
                load_model_state(
                    model,
                    macro_best[0],
                    args.cuda,
                    task_names_to_avoid_loading,
                    strict=strict)
            else:
                assert_for_log(
                    args.allow_untrained_encoder_parameters,
                    "No best checkpoint found to evaluate.")
                log.warning("Evaluating untrained encoder parameters!")

    # Train just the task-specific components for eval tasks.
    if args.train_for_eval:
        # might be empty if no elmo. scalar_mix_0 should always be pretrain scalars
        elmo_scalars = [(n, p) for n, p in model.named_parameters() if
                        "scalar_mix" in n and "scalar_mix_0" not in n]
        # fails when sep_embs_for_skip is 0 and elmo_scalars has nonzero length
        assert_for_log(not elmo_scalars or args.sep_embs_for_skip,
                       "Error: ELMo scalars loaded and will be updated in train_for_eval but "
                       "they should not be updated! Check sep_embs_for_skip flag or make an issue.")
        for task in eval_tasks:
            # Skip mnli-diagnostic
            # This has to be handled differently than probing tasks because probing tasks require the "is_probing_task"
            # to be set to True. For mnli-diagnostic this flag will be False because it is part of GLUE and
            # "is_probing_task is global flag specific to a run, not to a task.
            if task.name == 'mnli-diagnostic':
                continue
            pred_module = getattr(model, "%s_mdl" % task.name)
            to_train = elmo_scalars + [(n, p)
                                       for n, p in pred_module.named_parameters() if p.requires_grad]
            # Look for <task_name>_<param_name>, then eval_<param_name>
            params = build_trainer_params(args, task_names=[task.name, 'eval'])
            trainer, _, opt_params, schd_params = build_trainer(params, model,
                                                                args.run_dir,
                                                                task.val_metric_decreases)
            best_epoch = trainer.train([task], task.val_metric,
                                       args.batch_size, 1,
                                       args.weighting_method, args.scaling_method,
                                       to_train, opt_params, schd_params,
                                       args.shared_optimizer, load_model=False, phase="eval")

            # Now that we've trained a model, revert to the normal checkpoint logic for this task.
            task_names_to_avoid_loading.remove(task.name)

            # The best checkpoint will accumulate the best parameters for each task.
            # This logic looks strange. We think it works.
            best_epoch = best_epoch[task.name]
            layer_path = os.path.join(args.run_dir, "model_state_eval_best.th")
            load_model_state(
                model,
                layer_path,
                args.cuda,
                skip_task_models=task_names_to_avoid_loading,
                strict=strict)

    if args.do_eval:
        # Evaluate #
        log.info("Evaluating...")
        val_results, val_preds = evaluate.evaluate(model, eval_tasks,
                                                   args.batch_size,
                                                   args.cuda, "val")

        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        if 'val' in splits_to_write:
            evaluate.write_preds(eval_tasks, val_preds, args.run_dir, 'val',
                                 strict_glue_format=args.write_strict_glue_format)
        if 'test' in splits_to_write:
            _, te_preds = evaluate.evaluate(model, eval_tasks,
                                            args.batch_size, args.cuda, "test")
            evaluate.write_preds(tasks, te_preds, args.run_dir, 'test',
                                 strict_glue_format=args.write_strict_glue_format)
        run_name = args.get("run_name", os.path.basename(args.run_dir))

        results_tsv = os.path.join(args.exp_dir, "results.tsv")
        log.info("Writing results for split 'val' to %s", results_tsv)
        evaluate.write_results(val_results, results_tsv, run_name=run_name)

    log.info("Done!")
Example #9
def main():
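    # Seed every RNG for reproducibility, build the model/optimizer/criterion, then train,
    # periodically evaluating AUC on the validation set and checkpointing the best model.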
    args = parse_arguments()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    worker_init = WorkerInitObj(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True
    device, args = setup_training(args)
    model, optimizer, criterion = prepare_model_and_optimizer(args, device)
    pool = ProcessPoolExecutor(1)
    train_iter = subsetDataloader(path=args.train_path,
                                  batch_size=args.batch_size,
                                  worker_init=worker_init)
    test_iter = subsetDataloader(path=args.val_path,
                                 batch_size=args.batch_size,
                                 worker_init=worker_init)

    print('-' * 50 + 'args' + '-' * 50)
    for k in list(vars(args).keys()):
        print('{0}: {1}'.format(k, vars(args)[k]))
    print('-' * 30)
    print(model)
    print('-' * 50 + 'args' + '-' * 50)

    global_step = 0
    global_auc = 0

    s_time_train = time.time()
    for epoch in range(args.epoch):

        dataset_future = pool.submit(subsetDataloader, args.train_path,
                                     args.batch_size, worker_init)

        for step, batch in enumerate(train_iter):

            model.train()
            labels = batch['label'].to(device).float()
            batch = {
                t: {k: v.to(device)
                    for k, v in d.items()}
                for t, d in batch.items() if isinstance(d, dict)
            }

            optimizer.zero_grad()
            logits = model(batch)
            # print('logits', logits)
            # print('label', labels)
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            # evaluate
            if global_step != 0 and global_step % args.eval_freq == 0:
                s_time_eval = time.time()
                model.eval()
                auc = evaluate(model, test_iter, device)
                e_time_eval = time.time()
                print('-' * 68)
                print('Epoch:[{0}] Step:[{1}] AUC:[{2}] time:[{3}s]'.format(
                    epoch, global_step, format(auc, '.4f'),
                    format(e_time_eval - s_time_eval, '.4f')))

                if auc > global_auc:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_save_file = os.path.join(
                        args.output_dir, "{}_auc_{}_step_{}_ckpt.pt".format(
                            args.model_name, format(auc, '.4f'), global_step))

                    if os.path.exists(output_save_file):
                        os.system('rm -rf {}'.format(output_save_file))
                    torch.save(
                        {
                            'model': model_to_save.state_dict(),
                            'name': args.model_name
                        }, output_save_file)
                    print('Epoch:[{0}] Step:[{1}] SavePath:[{2}]'.format(
                        epoch, global_step, output_save_file))
                    global_auc = auc
                print('-' * 68)

            # log
            if global_step != 0 and global_step % args.log_freq == 0:
                e_time_train = time.time()
                print('Epoch:[{0}] Step:[{1}] Loss:[{2}] Lr:[{3}] time:[{4}s]'.
                      format(epoch, global_step, format(loss.item(), '.4f'),
                             format(optimizer.param_groups[0]['lr'], '.6'),
                             format(e_time_train - s_time_train, '.4f')))
                s_time_train = time.time()

            global_step += 1

        del train_iter
        train_iter = dataset_future.result(timeout=None)
Example #10
def test_evaluate_fail():
    with pytest.raises(Exception):
        # invalid input shape
        y_test = np.array([1, 2, 3])
        y_pred = np.array([1, 2])
        result = evaluate(y_test, y_pred)
Example #11
def test_evaluate_pass():
    y_test = np.array([1, 2, 3])
    y_pred = np.array([1, 2, 2.2])
    expected = {"r2": 0.6800000000000002, "msle": 0.01659768149770578}
    result = evaluate(y_test, y_pred)
    assert result == expected
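For reference, a minimal sketch of an evaluate() helper consistent with the metrics asserted above, assuming scikit-learn is available (the project's actual implementation may differ):

def evaluate(y_test, y_pred):
    # Hypothetical helper: returns the R^2 and mean squared log error checked by the tests above.
    from sklearn.metrics import r2_score, mean_squared_log_error
    return {
        "r2": r2_score(y_test, y_pred),
        "msle": mean_squared_log_error(y_test, y_pred),
    }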
Example #12
def main():
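    # NAS training entry point: set up the experiment directory and config, pick an
    # architecture-search strategy, build dataloaders and the optimizer/scheduler, then
    # alternate train()/evaluate() each epoch, keeping the best checkpoint.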

    arg = args()

    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)

    assert arg.exp_name.split(
        '/')[0] == 'o', "'o' is the experiment output directory; use --exp_name o/..."
    output_dir = arg.exp_name

    if arg.local_rank == 0:
        save_scripts_in_exp_dir(output_dir)

    logger = logging_set(output_dir, arg.local_rank)
    logger.info(arg)
    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    config = edict(yaml.load(open(arg.cfg, 'r'), Loader=yaml.FullLoader))

    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search

    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize

    config.num_workers = arg.num_workers

    print(
        'GPU memory : \ntotal | used\n',
        os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        ).read())

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info(
        '\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------'
    )

    best = 0

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if config.train.arch_search_strategy == 'random':

        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)

        Arch.arch_parameters_random_search()

    if arg.param_flop:
        Arch._print_info()

    if len(arg.gpu) > 1:
        use_multi_gpu = True

        if arg.distributed:
            torch.distributed.init_process_group(backend="nccl")
            #torch.distributed.init_process_group(backend="nccl",init_method='env://')
            local_rank = torch.distributed.get_rank()
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            Arch.to(device)

            Arch = torch.nn.parallel.DistributedDataParallel(
                Arch,
                device_ids=[local_rank],
                output_device=local_rank,
                find_unused_parameters=True)
            logger.info("local rank = {}".format(local_rank))
        else:
            Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    Search = Search_Arch(Arch.module,
                         config) if use_multi_gpu else Search_Arch(
                             Arch, config)  # Arch.module for nn.DataParallel

    search_strategy = config.train.arch_search_strategy

    if not arg.distributed:
        train_queue, arch_queue, valid_queue = Dataloaders(
            search_strategy, config, arg)
    else:
        train_queue, \
        arch_queue, \
        valid_queue, \
        train_sampler_dist, = Dataloaders(search_strategy,config,arg)
    #Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # arch_parameters is also registered to model's parameters
        # so the weight-optimizer will also update the arch_parameters
        logger.info(
            "sync: The arch_parameters is also optimized by weight-optmizer synchronously"
        )
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )

    else:
        # if the search strategy is None, random, second_order_gradient, etc.,
        # the arch_parameters are filtered out of the weight-optimizer's parameter list
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer,  step_size = config.train.lr_step_size,
    #                                                       gamma = config.train.lr_decay_gamma )
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best_result

    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=="
    )
    begin, end = config.train.epoch_begin, config.train.epoch_end

    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir,
                                    logger)

    for epoch in range(begin, end):

        lr = scheduler.get_lr()[0]
        logger.info(
            '==>time:({})--training...... current learning rate is {:.7f}'.
            format(datetime.datetime.now(), lr))

        if arg.distributed:
            train_sampler_dist.set_epoch(epoch)
            #valid_sampler_dist.set_epoch(epoch)

        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()

        if not arg.distributed or (arg.distributed and arg.local_rank == 0):

            eval_results = evaluate(Arch, valid_queue, config, output_dir)

            if use_multi_gpu:
                best = save_model(epoch, best, eval_results, Arch.module,
                                  optimizer, scheduler, output_dir, logger)
            else:
                best = save_model(epoch, best, eval_results, Arch, optimizer,
                                  scheduler, output_dir, logger)
Example #13
def parseArgs():
    """
    Parses received arguments using argparse.
    :return:
    """
    parser = argparse.ArgumentParser('Test and evaluate Ring Confidential Transactions')
    parser.add_argument('-rs', '--ringsizes', required=True, nargs='*', type=int, help="Define the size of the ring.")
    parser.add_argument('-c', '--curves', required=False, nargs='*', help="Elliptic curve to employ.")
    parser.add_argument('-m', '--message', required=False, help="Message to sign.")
    parser.add_argument('-o', '--output', required=False, help="Destination file to save the output graphics.")
    return parser.parse_args()


"""
    Reads and parses the arguments.
    Calls the evaluation function.    
"""
if __name__ == '__main__':
    args = parseArgs()
    curves = ['secp192r1']
    message = 'I voted for Kodos'
    output = 'comparative'
    if args.curves is None:
        args.curves = curves
    if args.message is None:
        args.message = message
    if args.output is None:
        args.output = output
    evaluate(args)
Example #14
def train(train_loop_func, args, logger):
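    # Fine-tune an EfficientNet-B4 classifier (optionally with DDP and AMP), validate and
    # checkpoint after every epoch, and step a ReduceLROnPlateau scheduler on the validation loss.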
    # Setup multi-GPU if necessary
    # args.distributed = False
    # if 'WORLD_SIZE' in os.environ:
    #     args.distributed = int(os.environ['WORLD_SIZE']) > 1
    #     args.distributed = True
    #     if args.distributed:
    #         torch.cuda.set_device(args.local_rank)
    #         torch.distributed.init_process_group(backend='nccl')
    #         args.N_gpu = torch.distributed.get_world_size()
    #     else:
    #         args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(10000)

    # if args.distributed:
    #     args.seed = (args.seed + torch.distributed.get_rank()) % 2 ** 32

    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    model = EfficientNet.from_pretrained('efficientnet-b4', num_classes=4)

    if args.local_rank is not None:
        torch.distributed.init_process_group(backend="nccl")
        torch.cuda.set_device(args.local_rank)
    model = model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.002},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if args.local_rank is not None:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)

    # Setup data, defaults

    train, test = construct_dataset(args.data)
    random.shuffle(train)

    train = train[:len(train)]
    split_position = int(len(train) * 0.96)

    train_dataset = ALASKA2Dataset(train[:split_position], root_dir=args.data, augmented=True)
    val_dataset = ALASKA2Dataset(train[split_position:], root_dir=args.data, augmented=False)
    test_dataset = ALASKA2Dataset(test, root_dir=args.data, augmented=False)

    if args.local_rank is not None:
        train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True)
        train_sampler.set_epoch(0)
    else:
        train_sampler = RandomSampler(train_dataset)

    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, drop_last=False,
                                  num_workers=4, shuffle=False, sampler=train_sampler)
    val_dataloader = DataLoader(val_dataset, batch_size=args.eval_batch_size, drop_last=False,
                                num_workers=4, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=args.eval_batch_size, drop_last=False,
                                 num_workers=4, shuffle=False)

    mean, std = generate_mean_std(amp=args.amp)

    # args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2,
                                                           verbose=False, threshold_mode='abs')

    start_epoch = 1
    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            model, optimizer, scheduler, start_epoch = load_checkpoint(
                args.checkpoint, model, optimizer, scheduler)

            start_epoch += 1  # this is because the epoch saved is the previous epoch
        else:
            print('Provided checkpoint is not a path to a file')
            return

    # loss_function = nn.CrossEntropyLoss()
    loss_function = LabelSmoothing()

    if args.mode == 'evaluation':
        acc = evaluate(model, val_dataloader, args, mean, std, loss_function)

        print('Model precision {} mAP'.format(acc))
        return
    elif args.mode == 'testing':
        test_(model, test_dataloader, args, mean, std)
        return

    for epoch in range(start_epoch, args.epochs + 1):
        print("-----------------------")
        print("Local Rank: {}, Epoch: {}, Training ...".format(args.local_rank, epoch))
        print("Epoch {} of {}".format(epoch, args.epochs))

        print("Total number of parameters trained this epoch: ",
              sum(p.numel() for pg in optimizer.param_groups for p in pg['params'] if
                  p.requires_grad))

        avg_loss = train_loop_func(model, loss_function, optimizer, train_dataloader, None, args,
                                   mean, std)

        # logger.update_epoch_time(epoch, end_epoch_time)
        print("saving model...")
        obj = {'epoch': epoch,
               'model': model.module.state_dict(), # model.state_dict() for non DataParallel model
               'optimizer': optimizer.state_dict(),
               'scheduler': scheduler.state_dict()}

        if args.local_rank in [0, None]:
            torch.save(obj, f'./saved/{args.backbone}_epoch_{epoch}.pt')

        print("Incepe evaluarea")
        val_loss = evaluate(model, val_dataloader, args, mean, std, loss_function)
        test_(model, test_dataloader, args, mean, std, epoch)

        scheduler.step(val_loss)
Example #15
def main(config):
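    # Read hyperparameters from the config dict, build train/valid/test dataloaders, train with
    # per-epoch validation, checkpoint the best model, and return the test accuracy.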
    CASE_NUM = config['case_num']

    DATASET = config['dataset']
    NORMALIZATION = config['normalization']

    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']
    TB_STATE = config['use_tensorboard']

    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    GAMMA = config['gamma']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = make_dir(RESULT_ROOT_DIR,
                          str(CASE_NUM),
                          overwrite=args.overwrite)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    # =============================================== Select data and construct
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname

    data_train = NLUDataset(data_path,
                            mode='train',
                            normalization=NORMALIZATION,
                            random_seed=42)
    dataloader_train = DataLoader(data_train,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=4)

    data_valid = NLUDataset(data_path,
                            mode='valid',
                            normalization=NORMALIZATION,
                            random_seed=42)
    dataloader_valid = DataLoader(data_valid,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=4)

    data_test = NLUDataset(data_path,
                           mode='test',
                           normalization=NORMALIZATION,
                           random_seed=42)
    dataloader_test = DataLoader(data_test,
                                 batch_size=BATCH_SIZE,
                                 shuffle=True,
                                 num_workers=4)

    num_train_samples = data_train.__len__()
    classes = data_train.labels
    num_classes = len(classes)

    # =============================================== Initialize model and optimizer
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda': print('Using GPU, %s' % torch.cuda.get_device_name(0))

    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA,
                       BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)
    optimizer = select_optimizer(OPTIM_TYPE, net.parameters(), LR, L2_DECAY)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=LR_STEP,
                                          gamma=LR_DECAY)

    # =============================================== Train
    it = 0
    train_losses, valid_losses, valid_accs = {}, {}, {}
    best_validation_acc = 0
    log_term = 5

    for epoch in range(MAX_EPOCH):
        #------------------------------------------------ One epoch start
        one_epoch_start = time.time()
        print('Epoch {} / Learning Rate: {:.0e}'.format(
            epoch,
            scheduler.get_lr()[0]))
        #------------------------------------------------ Train
        train_losses, it, net, optimizer, scheduler \
            = train_1epoch(dataloader_train, device, train_losses, it, net, loss_fn, optimizer, scheduler, log_every=log_term)
        #------------------------------------------------ Validation
        valid_acc, valid_loss = evaluate(dataloader_valid, device, net,
                                         loss_fn)
        valid_losses[it] = valid_loss
        valid_accs[it] = valid_acc
        #------------------------------------------------ Save model
        saved = ''
        if valid_acc > best_validation_acc:
            best_validation_acc = valid_acc
            saved = save_ckpt(ckpt_path, net, best_validation_acc)
        print('Epoch {} / Valid loss: {:.4f}, Valid acc: {:.4f} {}'.format(
            epoch, valid_loss, valid_acc, saved))
        #------------------------------------------------ One epoch end
        curr_time = time.time()
        print("One epoch time = %.2f s" % (curr_time - one_epoch_start))
        print('#------------------------------------------------------#')

    save_train_log(result_dir, train_losses, valid_losses, valid_accs,
                   best_validation_acc)

    # =============================================== Test
    net, best_validation_acc = load_ckpt(ckpt_path, net)
    test_acc, test_loss = evaluate(dataloader_test, device, net, loss_fn)

    return test_acc
Example #16
def main(config, run_preprocessing, run_data_upload, log_dir):
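    # Experiment driver: load the YAML config, optionally run preprocessing, then for each
    # temporal split prepare cohorts, train models, evaluate on train and test data, write
    # the results to the database, and finally plot test results over time.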
    # Load experiment configuration
    with open(config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # Get db connection
    conn = sql_utils.get_connection()

    # Get basic info of experiment
    exp_version = config['version']
    exp_name = config["experiment_name"]
    exp_time = date_utils.get_current_time_string()[2:]
    username = getpass.getuser()[0]

    terminal_width = int(os.popen('stty size', 'r').read().split()[1])
    print(
        f'Running Experiment: {username}_{exp_version}_{exp_name}_{exp_time}\n{"-" * terminal_width}\n'
    )

    # Preprocessing
    preprocessing_prefix = config['preprocessing_config']['prefix']
    if not run_preprocessing:
        print('Preprocessing skipped.')
    else:
        print('Preprocessing ...')
        run_preprocess(conn,
                       config['preprocessing_config'],
                       run_data_upload=run_data_upload)
        print('Preprocessing done.')

    # Get temporal configuration information
    train_dates_list, test_dates_list = parse_temporal_config(
        config['temporal_config'])

    # Training and evaluation
    test_results_over_time = []
    experiment_loop = tqdm.tqdm(list(zip(train_dates_list, test_dates_list)),
                                desc='Experiment Repeats')
    for train_dates, test_dates in experiment_loop:
        split_time_abbr = date_utils.date_to_string(
            test_dates['label_start_time'])
        split_time_abbr = split_time_abbr.replace('-', '')[2:]
        split_name = f'{split_time_abbr}'
        print(split_name)
        prefix = f'{username}_{exp_version}_{exp_name}_{exp_time}_{split_name}'
        experiment_table_prefix = f'experiments.{prefix}'
        train_save_dir = os.path.join(os.getcwd(), log_dir, prefix,
                                      'train_' + exp_time)
        test_save_dir = os.path.join(os.getcwd(), log_dir, prefix,
                                     'test_' + exp_time)

        # Prepare cohort as specified by our experiment configuration
        tqdm.tqdm.write('\nPreparing cohorts ...')
        train_feature_splits, train_label_splits = [], []
        for i, train_dates_aod in enumerate(train_dates):
            train_feature_table, train_label_table = prepare_cohort(
                config,
                train_dates_aod,
                test_dates,
                preprocessing_prefix,
                experiment_table_prefix + f'_split{i}',
                include_test=False)[:2]
            train_feature_splits.append(train_feature_table)
            train_label_splits.append(train_label_table)
        test_feature_table, test_label_table = prepare_cohort(
            config,
            train_dates[-1],
            test_dates,
            preprocessing_prefix,
            experiment_table_prefix,
            include_train=False)[2:]
        train_feature_table = f'{experiment_table_prefix}_train_features'
        sql_utils.merge_tables(train_feature_splits, train_feature_table)
        train_label_table = f'{experiment_table_prefix}_train_labels'
        sql_utils.merge_tables(train_label_splits, train_label_table)

        # Delete intermediate cohort tables
        for i in range(len(train_dates)):
            cohort_table_name = f'{experiment_table_prefix}_split{i}_cohort'
            sql_utils.run_sql_from_string(conn,
                                          f'drop table {cohort_table_name};')

        # Train models as specified by our experiment configuration
        tqdm.tqdm.write('Training ...')
        model_summaries = train(config,
                                train_feature_table,
                                train_label_table,
                                discard_columns=['split'],
                                save_dir=train_save_dir)

        # Evaluate our models on the training data
        model_paths = glob.glob(f'{train_save_dir}/*.pkl')
        tqdm.tqdm.write('Evaluating on training data ...')
        train_results = evaluate(config,
                                 train_feature_table,
                                 train_label_table,
                                 model_paths,
                                 model_summaries,
                                 discard_columns=['split'],
                                 log_dir=train_save_dir)

        # Evaluate our models on the test data
        tqdm.tqdm.write('Evaluating on test data ...')
        test_results = evaluate(config,
                                test_feature_table,
                                test_label_table,
                                model_paths,
                                model_summaries,
                                save_preds_to_db=True,
                                save_prefix=f'{prefix}_test',
                                log_dir=test_save_dir)
        test_results_over_time.append(test_results)

        # Save results to database
        train_results_name = f'{prefix}_train_results'
        test_results_name = f'{prefix}_test_results'
        train_results.to_sql(train_results_name, conn, schema='results')
        test_results.to_sql(test_results_name, conn, schema='results')

    # Plot test results over time
    test_results_tables_prefix = f'{username}_{exp_version}_{exp_name}_{exp_time}'
    plot_utils.plot_results_over_time(test_results_tables_prefix)
Example #17
def evaluate_model(args):
    from src.evaluate import main as evaluate
    return evaluate(args.dataset_path, args.checkpoint_path, args.force)
Example #18
def main(config):
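    # Test-only variant: rebuild the model from the config, load the saved checkpoint, and
    # report test accuracy and the average per-sample inference time.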
    CASE_NUM = config['case_num']

    DATASET = config['dataset']
    NORMALIZATION = config['normalization']

    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']
    TB_STATE = config['use_tensorboard']

    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    GAMMA = config['gamma']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = RESULT_ROOT_DIR + '/' + str(CASE_NUM)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    #%%
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname

    data_test = NLUDataset(data_path, mode='test', random_seed=42)
    dataloader_test = DataLoader(data_test,
                                 batch_size=BATCH_SIZE,
                                 shuffle=True,
                                 num_workers=4)

    classes = data_test.labels
    num_classes = len(classes)

    #%%
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda': print('Using GPU, %s' % torch.cuda.get_device_name(0))

    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA,
                       BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)

    #%%
    net, best_validation_acc = load_ckpt(ckpt_path, net)

    start_time = time.time()
    test_acc, test_loss = evaluate(dataloader_test, device, net, loss_fn)
    curr_time = time.time()
    ttt = curr_time - start_time
    tt1 = ttt / data_test.__len__()

    print('########################################################')
    print('# Test accuracy of %d: %.4f' % (CASE_NUM, test_acc))
    print("# Average %.6f s to process one input" % (tt1))
    print('########################################################')
Example #19
def train(args):
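    # SSD300 training on COCO: optionally distributed/FP16, SGD with a MultiStepLR schedule,
    # and mAP evaluation on the epochs listed in args.evaluation.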
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    ssd300 = model(args)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)

    loss_func.cuda()

    optimizer = torch.optim.SGD(
        tencent_trick(ssd300), 
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    scheduler = MultiStepLR(
        optimizer=optimizer, 
        milestones=args.multistep, 
        gamma=0.1)

    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    val_dataloader, inv_map = get_val_dataloader(args)
    train_loader = get_train_loader(args, dboxes)

    acc = 0
    logger = Logger(args.batch_size, args.local_rank)
    
    for epoch in range(0, args.epochs):
        logger.start_epoch()
        scheduler.step()

        iteration = train_loop(
            ssd300, loss_func, epoch, optimizer, 
            train_loader, iteration, logger, args)

        logger.end_epoch()

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                print('Epoch {:2d}, Accuracy: {:4f} mAP'.format(epoch, acc))

        if args.data_pipeline == 'dali':
            train_loader.reset()

    return acc, logger.average_speed()
Example #20
def train(total_epochs=30,
          early_stopping=True,
          threshold=5,
          checkpoint_path='/content/gdrive/My Drive/NMT/checkpoints/my_model/',
          save=True,
          write=True):
    ### immutable training session data ###
    model_data = retrieve_model_data(checkpoint_path=checkpoint_path)
    train_batches = model_data["train_batches"]
    dev_batches = model_data["dev_batches"]
    dev_references = model_data["references"]
    idx_to_trg_word = model_data["idx_to_trg_word"]
    hyperparams = model_data["hyperparams"]
    #######################################

    ### mutable training session data ###
    model, optimizer, checkpoint = load_checkpoint(hyperparams,
                                                   checkpoint_path,
                                                   "most_recent_model")
    epoch = checkpoint["epoch"]
    epoch_loss = checkpoint["epoch_loss"]
    bleu = checkpoint["bleu"]
    prev_bleu = checkpoint["prev_bleu"]
    best_bleu = checkpoint["best_bleu"]
    bad_epochs_count = checkpoint["bad_epochs_count"]
    #######################################

    if epoch == 0:
        # loaded a checkpoint that has been trained for zero epochs.
        print("training model from scratch...")
        print()
        start_epoch = 1
    else:
        print(f"loaded model checkpoint from epoch: {epoch:02d}")
        print(
            f"loss: {epoch_loss:.4f}, bleu: {bleu:.2f}, prev_bleu: {prev_bleu:.2f}, best_bleu: {best_bleu:.2f}, bad_epochs_count: {bad_epochs_count:02d}"
        )
        start_epoch = epoch + 1
        print(f"resuming training from epoch {start_epoch}...")
        print()

    ### training loop ##############################
    for epoch in range(start_epoch, total_epochs + 1):
        epoch_loss = 0.
        random.shuffle(train_batches)
        epoch_start_time = time.time()
        for batch in train_batches:
            epoch_loss += training_step(model, optimizer, batch)
        epoch_time = time.time() - epoch_start_time

        dev_translations, preds_time, post_time = predict(model,
                                                          dev_batches,
                                                          idx_to_trg_word,
                                                          checkpoint_path,
                                                          epoch,
                                                          write=write)
        bleu = evaluate(dev_translations, dev_references)

        model.train()
        model.encoder.train()
        model.decoder.train()
        report_stats(epoch, epoch_loss, epoch_time, preds_time, bleu,
                     checkpoint_path, post_time)

        if early_stopping:
            # if this epoch model performed better on dev set than prev epoch model,
            # bad_epochs_count resets to 0. (need not have outperformed best model,
            # just the most recent model).
            bad_epochs_count = (bad_epochs_count +
                                1) if epoch > 1 and bleu <= prev_bleu else 0
            if bleu > best_bleu:
                best_bleu = bleu
                # when terminates, can load best model, rather than potentially suboptimal model of final epoch.
                store_checkpoint(model, optimizer, epoch, epoch_loss, bleu,
                                 prev_bleu, best_bleu, bad_epochs_count,
                                 checkpoint_path, "best_model")
            if bad_epochs_count == threshold:
                # early-stopping threshold met
                best_model, optimizer, checkpoint = load_checkpoint(
                    hyperparams, checkpoint_path, "best_model")
                return best_model, checkpoint["epoch_loss"]

        if save:
            # store checkpoint each epoch, e.g., so can pick up training at later time.
            store_checkpoint(model, optimizer, epoch, epoch_loss, bleu,
                             prev_bleu, best_bleu, bad_epochs_count,
                             checkpoint_path, "most_recent_model")

        prev_bleu = bleu
    ################################################

    if early_stopping:
        best_model, optimizer, checkpoint = load_checkpoint(
            hyperparams, checkpoint_path, "best_model")
        return best_model, checkpoint["epoch_loss"]
    else:
        return model, epoch_loss
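
A minimal sketch of the patience bookkeeping used in the NMT training loop above, assuming only a list of per-epoch dev BLEU scores; the function and variable names here are illustrative, not part of the original code.

# Hedged sketch of the early-stopping counter in train() above: the count resets
# whenever the current epoch beats the previous epoch's BLEU, otherwise it grows.
def should_stop(bleu_history, threshold):
    """Return True once BLEU has failed to beat the previous epoch `threshold` times in a row."""
    bad_epochs = 0
    for prev, curr in zip(bleu_history, bleu_history[1:]):
        bad_epochs = bad_epochs + 1 if curr <= prev else 0
        if bad_epochs == threshold:
            return True
    return False

# e.g. should_stop([10.0, 12.5, 12.1, 12.0, 11.8], threshold=3) -> True
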
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287  # number of images in COCO train2017

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # 'smddp' is the SageMaker distributed data parallel backend (the other examples use 'nccl')
        torch.distributed.init_process_group(backend='smddp', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)


    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader, val_dataloader, encoder, iteration,
                                    logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        # guard against calling get_rank() when no process group was initialized (single-GPU mode)
        if not args.distributed or torch.distributed.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()

    if not args.distributed or torch.distributed.get_rank() == 0:
        DLLogger.log((), { 'Total training time': '%.2f' % total_time + ' secs' })
        logger.log_summary()
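
The SSD examples above and below scale the base learning rate linearly with the number of GPUs and the per-GPU batch size (relative to a reference batch of 32). A small self-contained sketch of that rule, with illustrative numbers:

# Hedged sketch of the linear LR-scaling rule used above (base LR tuned for batch size 32 on 1 GPU).
def scaled_lr(base_lr, n_gpu, batch_size, reference_batch=32):
    """Scale the learning rate proportionally to the effective global batch size."""
    return base_lr * n_gpu * (batch_size / reference_batch)

# e.g. 8 GPUs at per-GPU batch 64 -> 16x the base LR
print(scaled_lr(2.6e-3, n_gpu=8, batch_size=64))  # 0.0416
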
Exemple #22
0
def predict_from_raw_dataset():
    # 'e' is an evaluator object defined elsewhere in the original module; it is not shown in this snippet.
    e.evaluate()
def train(train_loop_func, logger, args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=args.backbone)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size /
                                                            32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    if args.fp16 and not args.amp:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=args.multistep,
                            gamma=0.1)
    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)
    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.
                                    cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            ssd300.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    train_loader, val_dataloader, encoder,
                                    iteration, logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)

            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
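
Both SSD variants save and restore model, optimizer and scheduler state every epoch. A hedged, self-contained sketch of that round trip; `net`, `optimizer` and `scheduler` are placeholders, and the key names mirror the dictionaries used above:

import torch

# Hedged sketch of the checkpoint round trip used in the SSD examples above.
def save_ckpt(path, net, optimizer, scheduler, epoch, iteration):
    torch.save({'epoch': epoch + 1,            # epoch to resume from
                'iteration': iteration,
                'model': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()}, path)

def load_ckpt(path, net, optimizer, scheduler):
    ckpt = torch.load(path, map_location='cpu')
    net.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    return ckpt['epoch'], ckpt['iteration']
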
Exemple #24
0
def train(args):
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(len(cocoGt.cats) + 1)
    args.learning_rate = args.learning_rate * \
        args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)

    ssd300.cuda()
    loss_func.cuda()

    if args.fp16:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=args.multistep,
                            gamma=0.1)

    if args.fp16:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    avg_loss = 0.0
    acc = 0
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end

    args.train_annotate = os.path.join(args.data,
                                       "annotations/instances_train2017.json")
    args.train_coco_root = os.path.join(args.data, "train2017")
    local_seed = set_seeds(args)

    if args.data_pipeline == 'no_dali':
        train_trans = SSDTransformer(dboxes, args, (300, 300), val=False)
        train_dataset = get_train_dataset(args, train_trans)
        train_loader = get_train_loader(train_dataset, args, args.num_workers)
    elif args.data_pipeline == 'dali':
        train_loader = get_train_dali_loader(args, dboxes, local_seed)

    for epoch in range(args.epochs):
        start_epoch_time = time.time()
        scheduler.step()

        epoch_loop(train_loader, args, ssd300, time.time(), loss_func,
                   optimizer, iteration, avg_loss, batch_perf, epoch)
        torch.cuda.synchronize()

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)

        try:
            train_loader.reset()
        except AttributeError:
            pass

    if args.local_rank == 0:
        print(
            "Training end: Average speed: {:3f} img/sec, Total time: {:3f} sec, Final accuracy: {:3f} mAP"
            .format(args.N_gpu * args.batch_size / batch_perf.avg,
                    time.time() - train_start, acc))
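
Exemple #24 measures per-batch performance with an `AverageMeter`, whose definition is not shown. The sketch below is a common implementation it presumably resembles; this is an assumption, not the repository's actual class:

class AverageMeter:
    """Tracks a running average, e.g. seconds per batch in the loop above."""
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count
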
Exemple #25
0
def main():
    args = parse_arguments()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    worker_init = WorkerInitObj(args.seed)
    device, args = setup_training(args)
    test_data = prepare_test_data(args)
    model, optimizer, criterion = prepare_model_and_optimizer(args, device)

    pool = ProcessPoolExecutor(1)
    train_iter = ml_1mTrainDataLoader(path=args.train_path,
                                      num_negs=args.num_negs,
                                      batch_size=args.train_batch_size,
                                      seed=args.seed,
                                      worker_init=worker_init)

    print('-' * 50 + 'args' + '-' * 50)
    for k in list(vars(args).keys()):
        print('{0}: {1}'.format(k, vars(args)[k]))
    print('-' * 30)
    print(model)
    print('-' * 50 + 'args' + '-' * 50)

    global_step = 0
    global_HR = 0.0
    global_NDCG = 0.0

    s_time_train = time.time()
    for epoch in range(args.epoch):

        dataset_future = pool.submit(ml_1mTrainDataLoader, args.train_path,
                                     args.num_negs, args.train_batch_size,
                                     args.seed, worker_init)

        for step, batch in enumerate(train_iter):

            model.train()
            batch = [t.to(device) for t in batch]
            users, items, labels = batch

            logits = model(users, items)
            loss = criterion(logits, labels.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #evaluate
            if global_step != 0 and global_step % args.eval_freq == 0:
                s_time_eval = time.time()
                model.eval()
                hits, ndcgs = evaluate(model, test_data, device, args.topk)
                e_time_eval = time.time()
                print('-' * 68)
                print('Epoch:[{0}] Step:[{1}] HR:[{2}] NDCG:[{3}] time:[{4}s]'.
                      format(epoch, global_step, format(hits, '.4f'),
                             format(ndcgs, '.4f'),
                             format(e_time_eval - s_time_eval, '.4f')))

                if hits > global_HR and ndcgs > global_NDCG:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_save_file = os.path.join(
                        args.output_dir,
                        "{}_hr_{}_ndcg_{}_step_{}_ckpt.pt".format(
                            args.model_name, format(hits, '.4f'),
                            format(ndcgs, '.4f'), global_step))

                    if os.path.exists(output_save_file):
                        os.system('rm -rf {}'.format(output_save_file))
                    torch.save(
                        {
                            'model': model_to_save.state_dict(),
                            'name': args.model_name
                        }, output_save_file)
                    print('Epoch:[{0}] Step:[{1}] SavePath:[{2}]'.format(
                        epoch, global_step, output_save_file))
                    global_HR = hits
                    global_NDCG = ndcgs
                print('-' * 68)

            #log
            if global_step != 0 and global_step % args.log_freq == 0:
                e_time_train = time.time()
                print('Epoch:[{0}] Step:[{1}] Loss:[{2}] Lr:[{3}] time:[{4}s]'.
                      format(epoch, global_step, format(loss.item(), '.4f'),
                             format(optimizer.param_groups[0]['lr'], '.6'),
                             format(e_time_train - s_time_train, '.4f')))
                s_time_train = time.time()

            global_step += 1

        del train_iter
        train_iter = dataset_future.result(timeout=None)
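
The `evaluate()` call in Exemple #25 returns hit ratio and NDCG. A hedged sketch of those two metrics for the usual leave-one-out setup, where one held-out positive item is ranked against sampled negatives (names are illustrative):

import math

def hit_and_ndcg(ranked_items, positive_item, k):
    """HR@k is 1 if the held-out item appears in the top-k; NDCG@k discounts it by rank."""
    topk = ranked_items[:k]
    if positive_item not in topk:
        return 0.0, 0.0
    rank = topk.index(positive_item)          # 0-based position in the ranking
    return 1.0, 1.0 / math.log2(rank + 2)

# e.g. hit_and_ndcg([42, 7, 13], positive_item=7, k=3) -> (1.0, 0.6309...)
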
Exemple #26
0
                # fragment from inside a larger training loop: a periodic validation pass
                val_n = 0
                for batch in dataloader_val:
                    val_n += batch_size
                    input_batches, input_lengths = batch['input'], batch[
                        'length'].numpy().tolist()
                    input_batches, input_lengths = zip(
                        *sorted(zip(input_batches, input_lengths),
                                key=lambda x: x[1],
                                reverse=True))
                    input_batches, input_lengths = torch.stack(
                        input_batches), list(input_lengths)
                    input_batches = input_batches[:, :max(input_lengths)]
                    input_batches = input_batches.transpose(0, 1)

                    val_loss, real, generated = evaluate(
                        encoder, decoder, input_batches, input_lengths,
                        input_batches, input_lengths, batch_size, lang1)
                    print_loss_total += val_loss  # accumulate the validation loss returned by evaluate()

                    if val_n % print_every_val == 0:
                        logger.info(
                            '\n-- Real sentence: {0},\n-- Generated sentence: {1}'
                            .format(' '.join(real), ' '.join(generated)))
                    val_n += 1
                print_loss_avg = print_loss_total / val_n
                print_loss_total = 0
                print_summary = 'VAL_LOSS_INFO: Epoch:%d - Batch:%d - Val_loss:%.4f' % (
                    epoch, batch_n, print_loss_avg)
                logger.info(print_summary)

            torch.cuda.empty_cache()
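
The validation fragment above sorts each batch by sequence length, longest first, which is what packed RNN sequences require. A small self-contained sketch of that step:

import torch

# Hedged sketch of the sort-by-length step used above (needed before pack_padded_sequence).
def sort_batch_by_length(sequences, lengths):
    """Sort padded sequences and their lengths in descending length order."""
    pairs = sorted(zip(sequences, lengths), key=lambda x: x[1], reverse=True)
    sequences, lengths = zip(*pairs)
    return torch.stack(sequences), list(lengths)
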
Exemple #27
0
def test_outdrop(
        checkpoint_path='/content/gdrive/My Drive/NMT/unittests/checkpoints/',
        config_path='/content/gdrive/My Drive/NMT/configs/',
        corpus_path='/content/gdrive/My Drive/NMT/unittests/first_ten_sentences/'
):
    hyperparams = import_configs(config_path=config_path, unittesting=True)
    # use word-level vocab
    hyperparams["vocab_type"] = "word"
    hyperparams["trim_type"] = "top_k"
    hyperparams["enc_dropout"] = .5
    hyperparams["dec_dropout"] = .5
    print(f"hidden size: {hyperparams['dec_hidden_size']}")

    construct_model_data("train.de",
                         "train.en",
                         hyperparams=hyperparams,
                         corpus_path=corpus_path,
                         checkpoint_path=checkpoint_path,
                         overfit=True)

    # model of sufficient capacity should be able to bring loss down to ~zero.
    model, loss = train(total_epochs=100,
                        early_stopping=False,
                        checkpoint_path=checkpoint_path,
                        save=False,
                        write=False)
    assert loss < .01

    model_data = retrieve_model_data(checkpoint_path=checkpoint_path)
    dev_batches = model_data[
        "dev_batches"]  # holds the training data, bc overfit=True
    dev_references = model_data[
        "references"]  # holds the training data, bc overfit=True
    idx_to_trg_word = model_data["idx_to_trg_word"]

    # greedy search should be able to perfectly predict the training data.
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100

    # beam search should be able to perfectly predict the training data.
    model.decoder.set_inference_alg("beam_search")
    dev_translations, _, _ = predict(model, dev_batches, idx_to_trg_word,
                                     checkpoint_path)
    bleu = evaluate(dev_translations, dev_references)
    assert bleu >= 100
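
The `evaluate()` helper in these NMT tests presumably reports corpus-level BLEU. A hedged sketch of the same sanity check using sacrebleu; this is an assumption about the metric, not the repository's own implementation:

import sacrebleu

# Hedged sketch: corpus BLEU over lists of detokenized hypothesis and reference strings.
def corpus_bleu(translations, references):
    return sacrebleu.corpus_bleu(translations, [references]).score

# A model that has memorized its training data should score close to 100:
# assert corpus_bleu(dev_translations, dev_references) >= 99.9
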


# def test_default_subword_model():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "subword_joint"
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # default word model, except dn divide scores by scaling factor inside attention fn.
# def test_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "word"
#     hyperparams["trim_type"] = "top_k"
#     hyperparams["attention_fn"] = "dot_product"
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # no weight tying, no additional attention layer
# def test_no_tying():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "word"
#     hyperparams["trim_type"] = "top_k"
#     hyperparams["attention_layer"] = False
#     hyperparams["tie_weights"] = False

#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # no weight tying and no attention mechanism.
# def test_no_attn_no_tying():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["vocab_type"] = "word"
#     hyperparams["trim_type"] = "top_k"
#     hyperparams["attention_fn"] = "none"
#     hyperparams["attention_layer"] = False
#     hyperparams["tie_weights"] = False

#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # default model, except dropout after lstm is turned on.
# def test_dropout():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["enc_dropout"] = 0.2
#     hyperparams["dec_dropout"] = 0.2

#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # ensure still works on cpu.
# # must change runtime type to cpu before performing this test
# # def test_default_word_model_cpu():
# #     hyperparams = import_configs(config_path=config_path, unittesting=True)
# #     hyperparams["vocab_type"] = "word"
# #     hyperparams["trim_type"] = "top_k"
# #     hyperparams["device"] = "cpu"
# #     train_batches, dev_batches, test_batches, vocabs, ref_corpuses, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
# #                         corpus_path=corpus_path, overfit=True, write=False
# #                         )

# #     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # simplest possible model.
# # - unidirectional encoder.
# # - no attention mechanism.
# def test_uni_no_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["attention_fn"] = "none"

#     constrain_configs(hyperparams) # ensure passes constraint-check
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # two-layer vanilla network with layer_to_layer decoder_init_scheme
# def test_layer_to_layer_uni_no_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["enc_num_layers"] = 2
#     hyperparams["dec_num_layers"] = 2
#     hyperparams["decoder_init_scheme"] = "layer_to_layer"
#     hyperparams["attention_fn"] = "none"
#     hyperparams["bidirectional"] = False
#     constrain_configs(hyperparams) # ensure passes constraint-check
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # two-layer vanilla network with final_to_first decoder_init_scheme
# def test_final_to_first_uni_no_attn():
#     hyperparams = import_configs(config_path=config_path, unittesting=True)
#     hyperparams["enc_num_layers"] = 2
#     hyperparams["dec_num_layers"] = 2
#     hyperparams["decoder_init_scheme"] = "final_to_first"
#     hyperparams["attention_fn"] = "none"
#     hyperparams["bidirectional"] = False
#     constrain_configs(hyperparams) # ensure passes constraint-check
#     train_batches, dev_batches, vocabs, hyperparams = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
#                         corpus_path=corpus_path, overfit=True, write=False
#                         )

#     predict_train_data(hyperparams, train_batches, dev_batches, ref_corpuses["train.en"], vocabs["idx_to_trg_word"], checkpoint_path)

# # associate some epoch number with saved model, so can verify stored correct model.
# def test_early_stopping():
#     # set random seed
#     pass
Exemple #28
0
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    # args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)

    print(f"Actual starting LR: {args.learning_rate}")

    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    # optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
    #                                 momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True)
    optimizer = torch.optim.AdamW(tencent_trick(ssd300),
                                  lr=args.learning_rate,
                                  betas=(0.8, 0.999),
                                  eps=1e-08,
                                  weight_decay=0.01,
                                  amsgrad=True)

    # scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    # scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=20, T_mult=1, eta_min=1e-6)
    scheduler = CosineAnnealingLR(optimizer=optimizer,
                                  T_max=args.epochs,
                                  eta_min=1e-6)

    # scheduler = OneCycleLR(optimizer, max_lr=0.003, epochs=41, steps_per_epoch=173)
    # scheduler = CyclicLR(optimizer, base_lr=args.learning_rate, max_lr=2*args.learning_rate,
    #                      step_size_up=173*3, step_size_down=173*10)

    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300,
                            args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.
                                    cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        # scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    scheduler, train_loader, val_dataloader,
                                    encoder, iteration, logger, args, mean,
                                    std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        # https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
        scheduler.step()

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)

            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
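
Exemple #28 replaces MultiStepLR with CosineAnnealingLR and steps it once per epoch, after the optimizer updates. A tiny runnable sketch of that per-epoch schedule with a toy parameter (values are illustrative):

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

# Tiny sketch of the per-epoch cosine schedule used above, on a toy parameter.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

for epoch in range(10):
    optimizer.step()                      # training steps for the epoch would go here
    scheduler.step()                      # decay once per epoch, after the optimizer updates
    print(epoch, optimizer.param_groups[0]['lr'])
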
Exemple #29
0
def main():

    arg = args()

    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)
    print(arg.exp_name.split('/')[0])
    assert arg.exp_name.split(
        '/')[0] == 'o', "'o' is the experiment directory; use --exp_name o/..."

    output_dir = arg.exp_name

    logger = logging_set(output_dir)
    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu
    torch.backends.cudnn.enabled = True

    config = edict(yaml.load(open(arg.cfg, 'r')))

    config.test.dataset_name = arg.dataset
    config.test.flip_test = arg.flip_test
    config.test.batchsize = 128
    config.model.margin_to_border = arg.margin

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info('\n==> available {} GPUs, visible devices are {}\n'.format(
        torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"]))
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------'
    )

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if arg.param_flop:
        Arch._print_info()

    logger.info("=========>current architecture's values before evaluate")

    if hasattr(Arch.backbone, "alphas"):

        Arch.backbone._show_alpha()
        Arch.backbone._show_beta()

    for id, group in enumerate(Arch.groups):

        group._show_alpha()
        group._show_beta()

    if arg.test_model:
        logger.info('\n===> load ckpt in : {}'.format(arg.test_model))
        Arch.load_state_dict(torch.load(arg.test_model))
    elif config.test.ckpt != '':
        logger.info('\n===> load ckpt in : ' + config.test.ckpt + '...')
        Arch.load_state_dict(torch.load(config.test.ckpt))
    elif os.path.exists(os.path.join(output_dir, 'best_ckpt.tar')):
        logger.info('\n===> load ckpt in : ' +
                    os.path.join(output_dir, 'best_ckpt.tar'))
        Arch.load_state_dict(
            torch.load(os.path.join(output_dir, 'best_ckpt.tar')))
    else:
        logger.info('\n===> no ckpt found, using the initial model ...')
        #raise ValueError
    #logger.info(Arch.backbone.alphas)

    logger.info("=========>Architecture's parameters")
    if hasattr(Arch, "backbone"):
        if hasattr(Arch.backbone, "alphas"):
            Arch.backbone._show_alpha(original_value=False)
            Arch.backbone._show_beta(original_value=False)
        for g in Arch.groups:
            g._show_alpha(original_value=False)
            g._show_beta(original_value=False)

    Arch = torch.nn.DataParallel(Arch).cuda()

    valid_dataset = dataset_(config,
                             config.images_root_dir,
                             config.annotation_root_dir,
                             mode='val',
                             transform=torchvision.transforms.Compose([
                                 torchvision.transforms.ToTensor(),
                                 torchvision.transforms.Normalize(
                                     mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
                             ]))
    #test_img(valid_dataset,output_dir)
    valid_dt_dataset = dataset_(config,
                                config.images_root_dir,
                                config.person_detection_results_path,
                                mode='dt',
                                dataset=config.test.dataset_name,
                                transform=torchvision.transforms.Compose([
                                    torchvision.transforms.ToTensor(),
                                    torchvision.transforms.Normalize(
                                        mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])
                                ]))

    if arg.use_dt:

        logger.info("\n >>> use detection results ")
        valid_dataloader = torch.utils.data.DataLoader(
            valid_dt_dataset,
            batch_size=config.test.batchsize,
            shuffle=False,
            num_workers=4,
            pin_memory=True)
    else:
        logger.info("\n >>> use groundtruth bbox ")
        valid_dataloader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=config.test.batchsize,
            shuffle=False,
            num_workers=4,
            pin_memory=True)

    if arg.visualize:
        for i in range(len(valid_dataset)):
            imageid = 185250  # coco val set

            if valid_dataset[i][1] != imageid:  # choose an image_id
                continue
            print(valid_dataset[i][1])
            sample = valid_dataset[i]
            logger.info(
                "visualize the predicted heatmap of image id {} ".format(
                    imageid))
            img = sample[0].unsqueeze(0)
            #samples = next(iter(valid_dataloader))
            #img = samples[0]
            output = Arch(img)
            print(img.size(), output.size())
            visualize_heatamp(img, output, 'heatmaps', show_img=False)
            break

    results = evaluate(Arch, valid_dataloader, config, output_dir)
    logger.info('map = {}'.format(results))
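
Both pose-estimation examples load their YAML config with `edict(yaml.load(open(...)))`, which on recent PyYAML requires an explicit Loader. A safer equivalent, offered as a suggestion rather than the repository's code:

import yaml
from easydict import EasyDict as edict

# Hedged sketch: same config loading as above, but with safe_load and a context manager
# so it works on recent PyYAML versions and closes the file handle.
def load_config(path):
    with open(path, 'r') as f:
        return edict(yaml.safe_load(f))
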
Exemple #30
0
def main():

    arg = args()

    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)

    assert arg.exp_name.split(
        '/')[0] == 'o', "'o' is the experiment directory; use --exp_name o/..."
    output_dir = arg.exp_name

    save_scripts_in_exp_dir(output_dir)

    logger = logging_set(output_dir)

    logger.info(
        '\n================ experiment name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    config = edict(yaml.load(open(arg.cfg, 'r')))

    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search

    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize

    config.num_workers = arg.num_workers

    print(
        'GPU memory : \ntotal | used\n',
        os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        ).read())

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info(
        '\n==> available {} GPUs, visible devices are {}, current device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------'
    )

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if config.train.arch_search_strategy == 'random':

        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()

    if arg.param_flop:
        Arch._print_info()

    # dump_input = torch.rand((1,3,128,128))
    # graph = SummaryWriter(output_dir+'/log')
    # graph.add_graph(Arch, (dump_input, ))

    if len(arg.gpu) > 1:
        use_multi_gpu = True
        Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    Search = Search_Arch(Arch.module,
                         config) if use_multi_gpu else Search_Arch(
                             Arch, config)  # Arch.module for nn.DataParallel
    search_strategy = config.train.arch_search_strategy
    train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config,
                                                       arg)
    #Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # arch_parameters is also registered to model's parameters
        # so the weight-optimizer will also update the arch_parameters
        logger.info(
            "sync: The arch_parameters is also optimized by weight-optmizer synchronously"
        )
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )

    else:
        # if search strategy is None,random,second_order_gradient and so on
        # the arch_parameters will be filtered by the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer,  step_size = config.train.lr_step_size,
    #                                                       gamma = config.train.lr_decay_gamma )
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)

    # best_result
    best = 0

    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=="
    )
    begin, end = config.train.epoch_begin, config.train.epoch_end

    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler,
                                    output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir,
                                    logger)

    for epoch in range(begin, end):

        lr = scheduler.get_lr()[0]
        logger.info(
            '==>time:({})--training...... current learning rate is {:.7f}'.
            format(datetime.datetime.now(), lr))

        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()

        eval_results = evaluate(Arch, valid_queue, config, output_dir)
        if use_multi_gpu:
            best = save_model(epoch, best, eval_results, Arch.module,
                              optimizer, scheduler, output_dir, logger)
        else:

            best = save_model(epoch, best, eval_results, Arch, optimizer,
                              scheduler, output_dir, logger)

        ## visualize_heatamp
        if arg.visualize and epoch % 5 == 0:
            for i in range(len(valid_queue.dataset)):

                if valid_queue.dataset[i][1] != 185250:  # choose an image_id
                    continue
                print(valid_queue.dataset[i][1])
                sample = valid_queue.dataset[i]

                img = sample[0].unsqueeze(0)
                #samples = next(iter(valid_dataloader))
                #img = samples[0]
                output = Arch(img)
                print(img.size(), output.size())
                visualize_heatamp(img, output, 'heatmaps', show_img=False)
                break
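
Exemple #30 hands `filter_arch_parameters(Arch)` to the weight optimizer so the architecture parameters are excluded from weight updates; the helper itself is not shown. The sketch below is a guess at its shape, based only on the `_show_alpha`/`_show_beta` calls above, and the parameter-name filter is an assumption:

# Hedged sketch of what filter_arch_parameters() presumably does: hand the weight
# optimizer every parameter except the architecture weights (here assumed to carry
# 'alpha'/'beta' in their names, which is only a guess, not the repository's code).
def filter_arch_parameters(model):
    return [p for name, p in model.named_parameters()
            if 'alpha' not in name and 'beta' not in name]
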