Example #1
def summarize_model(model_name, dataset_name, novalidate):
    """
    Trains the model with k-fold cross-validation and then generate the summary.
    Similar to sklearn.model_selection.cross_val_predict
    :param model_name: filename  in model package representing the pipeline or model object
    :param dataset_name: dataset name to load
    :param novalidate: If True, trains the data into whole dataset and save the model
    """
    model = __import__("models.%s" % model_name, globals(), locals(),
                       ['model']).model

    X, y = load_dataset(dataset_name)
    y_complete_pred = np.zeros_like(y).astype('float')

    if not novalidate:
        folds = load_folds()
        for i, (train_index, val_index) in enumerate(folds):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_val)[:, 1]
            y_pred_train = model.predict_proba(X_train)[:, 1]

            # Copying the values to generate predictions of complete dataset
            y_complete_pred[val_index] = y_pred

            # Saving the model
            save_model(model, "%s_%s_fold%d" % (dataset_name, model_name, i))

            print "[Fold %d]: " % (i + 1)
            print "Fold Summary: ",
            print "Training AUPRC - %8.4f" % average_precision_score(
                y_train, y_pred_train)
            analyze_results(y_val, y_pred)
            print

        y_complete_pred.dump(
            os.path.join(RESULTS_PATH,
                         '%s_%s.npy' % (dataset_name, model_name)))
        save_features(y_complete_pred,
                      'probs/%s_%s' % (dataset_name, model_name))

        print "Complete Summary: ",
        analyze_results(y, y_complete_pred)
        print "\nModel parameters: "
        pprint.pprint(model.get_params(), indent=4, depth=1)
        print
    else:
        model.fit(X, y)
        y_pred_train = model.predict_proba(X)[:, 1]
        print "Training AUPRC - %8.4f" % average_precision_score(
            y, y_pred_train)
        save_model(model, '%s_%s' % (dataset_name, model_name))
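The docstring's comparison to cross_val_predict can be made concrete. As a minimal sketch, assuming model is a scikit-learn compatible estimator and folds is the same list of (train_index, val_index) pairs returned by load_folds(), the same out-of-fold probabilities could be produced with:

# Sketch only: out-of-fold predictions via scikit-learn, under the
# assumptions stated above (scikit-learn compatible model, same folds).
from sklearn.model_selection import cross_val_predict

y_complete_pred = cross_val_predict(model, X, y, cv=folds,
                                    method='predict_proba')[:, 1]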
Example #2
def train(**kwargs):
    # Update the configuration from command-line arguments
    opt = DefaultConfig()
    opt.parse(kwargs)
    print("Configuration complete")

    # Optimizer
    learning_rate = opt.learning_rate
    # The optimizer defaults to Adam
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=0.5,
                                       beta2=0.9)
    if opt.optimizer_type == "SGD":
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    elif opt.optimizer_type == "Momentum":
        momentum = opt.momentum
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=momentum)
    elif opt.optimizer_type == "Adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           beta1=0.5,
                                           beta2=0.9)

    # Build the static graph
    with tf.Graph().as_default():
        with tf.name_scope("inputs"):
            inputs = tf.placeholder("float", [None, 24, 2, 2],
                                    name="model_input")
            labels = tf.placeholder("float", [None, 72, 14, 2], name="labels")

        # Define the model; collect and group the trainable parameters
        model = []
        if opt.model_type == 1:  # deconvolution
            gmodel = GModel(opt.batch_size, opt.normal_type, True,
                            "generate_model")
            model.append(gmodel)
        elif opt.model_type == 2:  # deconvolution + learnable pooling
            gmodel = GModel(opt.batch_size, opt.normal_type, True,
                            "generate_model")
            model.append(gmodel)
            learningpoolingmodel = LearningPoolingModel(
                opt.batch_size, opt.normal_type, True, opt.model_2_layers,
                "learning_pooling_model")
            model.append(learningpoolingmodel)
        elif opt.model_type == 3:  # deconvolution + GAN
            gmodel = GModel(opt.batch_size, opt.normal_type, True,
                            "generate_model")
            model.append(gmodel)
            dmodel = DModel(opt.batch_size, opt.normal_type, True,
                            opt.GAN_type, "discriminate_model")
            model.append(dmodel)
        # print(model)

        # Collect and group the parameters that need training.
        # Because of the dependency on tf.GraphKeys.UPDATE_OPS below, get_vars must be
        # called after calculate_loss; otherwise all_vars would be empty.
        def get_vars():
            all_vars = tf.trainable_variables()
            # print(all_vars)
            gg_vars = [var for var in all_vars if "generate_model" in var.name]
            dd_vars = [
                var for var in all_vars if "discriminate_model" in var.name
            ]
            ll_pp_vars = [
                var for var in all_vars if "learning_pooling_model" in var.name
            ]
            return gg_vars, dd_vars, ll_pp_vars

        # Add a dependency on update_ops, otherwise batch normalization breaks!
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.device(opt.gpu_num):
            if opt.model_type == 1:  # deconvolution
                pre_loss, mse, pred = model[0].calculate_loss(inputs, labels)
                g_vars, _, _ = get_vars()
                with tf.control_dependencies(update_ops):
                    train_ops = optimizer.minimize(pre_loss, var_list=g_vars)
            elif opt.model_type == 2:  # deconvolution + learnable pooling
                _, mse, pred = model[0].calculate_loss(inputs, labels)
                l_p_loss = model[1].calculate_loss(pred, labels,
                                                   opt.model_2_scale)
                g_vars, _, l_p_vars = get_vars()
                with tf.control_dependencies(update_ops):
                    train_ops = optimizer.minimize(l_p_loss,
                                                   var_list=g_vars + l_p_vars)
            elif opt.model_type == 3:  # deconvolution + GAN
                pre_loss, mse, pred = model[0].calculate_loss(inputs, labels)
                gen_loss, dis_loss = model[1].calculate_loss(pred, labels)
                g_vars, d_vars, _ = get_vars()
                with tf.control_dependencies(update_ops):
                    # Train the D network --> the G network --> the prior network (i.e. the G network)
                    d_train_ops = optimizer.minimize(dis_loss, var_list=d_vars)
                    g_train_ops = optimizer.minimize(gen_loss, var_list=g_vars)
                    pre_train_ops = optimizer.minimize(pre_loss,
                                                       var_list=g_vars)

        tf.summary.scalar("MSE", mse)

        tf.add_to_collection("input_batch", inputs)
        tf.add_to_collection("predictions", pred)

        saver = tf.train.Saver()
        init = tf.global_variables_initializer()

        # Start training
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = opt.per_process_gpu_memory_fraction
        with tf.Session(config=config) as sess:
            # Initialize the variables first
            sess.run(init)

            if opt.model_type == 1:
                model_type = "model_1"
            elif opt.model_type == 2:
                model_type = "model_2_" + str(opt.model_2_layers)
            elif opt.model_type == 3:
                model_type = "model_3"
            summary_path = opt.summary_path + model_type + "\\data_SNR_" + str(
                opt.SNR)
            writer = tf.summary.FileWriter(summary_path, sess.graph)
            merge_ops = tf.summary.merge_all()

            start = time.time()

            data_path = opt.train_data_path + "data_SNR_" + str(opt.SNR)
            # Define the training dataset
            train_dataset = CSISet(data_path,
                                   opt.batch_size,
                                   True,
                                   state="train")
            # Define the validation dataset
            validation_dataset = CSISet(data_path,
                                        opt.batch_size,
                                        True,
                                        state="validation")

            # Keep intermediate train/validation values for later plotting
            train_mse_for_plot = []
            valid_mse_for_plot = []

            for num in range(opt.num_epoch):
                # Check whether the learning rate should change.
                # NOTE: the optimizer was built with the original float value, so this
                # reassignment only affects the printout unless the learning rate is
                # fed through a placeholder or variable.
                if opt.optimizer_type == "Momentum" and (
                        num % opt.learning_rate_change_epoch) == 0:
                    learning_rate *= opt.learning_rate_decay
                    print("Epoch %i starting, current learning rate is %f" %
                          (num, learning_rate))

                for ii, (batch_x,
                         batch_y) in enumerate(train_dataset.get_data()):
                    if opt.model_type == 1 or opt.model_type == 2:
                        _, train_mse, summary = sess.run(
                            [train_ops, mse, merge_ops],
                            feed_dict={
                                inputs: batch_x,
                                labels: batch_y
                            })
                    elif opt.model_type == 3:
                        _, _, _, train_mse, summary = sess.run(
                            [d_train_ops, g_train_ops, pre_train_ops, mse,
                             merge_ops],
                            feed_dict={
                                inputs: batch_x,
                                labels: batch_y
                            })
                    writer.add_summary(summary)

                    if (ii + 1) % 1000 == 0:
                        print("epoch-%d, batch_num-%d: training MSE on the current batch is %f" %
                              (num + 1, ii + 1, train_mse))

                        # Evaluate on the validation set every 1000 batches
                        validate_mse = 0
                        jj = 0
                        for (validate_x,
                             validate_y) in validation_dataset.get_data():
                            temp_mse = sess.run(mse,
                                                feed_dict={
                                                    inputs: validate_x,
                                                    labels: validate_y
                                                })
                            validate_mse += temp_mse
                            jj += 1
                        # Average over the number of validation batches
                        validate_mse = validate_mse / jj
                        print("epoch-%d: current average validation MSE is %f" %
                              (num + 1, validate_mse))
                        train_mse_for_plot.append(train_mse)
                        valid_mse_for_plot.append(validate_mse)

            end = time.time()

            utils.print_time(start, end, "跑完" + str(opt.num_epoch) + "个epoch")

            plot_path = opt.result_path + model_type + "\\data_SNR_" + str(
                opt.SNR) + "\\train"
            utils.plot_fig(train_mse_for_plot, valid_mse_for_plot, plot_path)
            print("训练过程中最小验证误差是%f" % min(valid_mse_for_plot))

            # Save the model file
            model_file = opt.model_path + model_type + "\\data_SNR_" + str(
                opt.SNR) + "\\data_SNR_" + str(opt.SNR)
            model_utils.save_model(saver, sess, model_file)
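The UPDATE_OPS comments above reflect a general TF1 pattern: batch-norm moving statistics are only refreshed when the update ops run together with the train op. A minimal standalone sketch of that pattern (TF 1.x API, with a hypothetical loss):

import tensorflow as tf  # TF 1.x API

x = tf.placeholder(tf.float32, [None, 8])
is_training = tf.placeholder(tf.bool, [])
h = tf.layers.batch_normalization(x, training=is_training)
loss = tf.reduce_mean(tf.square(h))

# Without this dependency, the moving mean/variance used at inference
# would never be updated.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)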
Example #3
def train_model(config):
    if config.start_date is not None:
        print("Training start date: ", config.start_date)
    if config.end_date is not None:
        print("Training end date: ", config.end_date)

    print("Loading training data from %s ..." % config.datafile)
    train_data = None
    valid_data = None

    if (config.validation_size > 0.0) or (config.split_date is not None):
        train_data, valid_data = data_utils.load_train_valid_data(config)
    else:
        train_data = data_utils.load_all_data(config, is_training_only=True)
        valid_data = train_data

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_model(session, config, verbose=True)

        params = model_utils.get_scaling_params(config,
                                                train_data,
                                                verbose=True)
        model.set_scaling_params(session, **params)

        noise_model = None
        if config.training_noise is not None:
            print("Training noise level: %.2f * 1-stdev" %
                  config.training_noise)
            noise_model = NoiseModel(seed=config.seed,
                                     scaling_params=params,
                                     degree=config.training_noise)

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)
        sys.stdout.flush()

        train_history = list()
        valid_history = list()

        lr = model.set_learning_rate(session, config.learning_rate)

        train_data.cache(verbose=True)
        valid_data.cache(verbose=True)

        for i in range(config.max_epoch):

            (train_mse, valid_mse) = run_epoch(session,
                                               model,
                                               train_data,
                                               valid_data,
                                               keep_prob=config.keep_prob,
                                               passes=config.passes,
                                               noise_model=noise_model,
                                               verbose=True)
            print((
                'Epoch: %d Train MSE: %.6f Valid MSE: %.6f Learning rate: %.4f'
            ) % (i + 1, train_mse, valid_mse, lr))
            sys.stdout.flush()

            train_history.append(train_mse)
            valid_history.append(valid_mse)

            if re.match("Gradient|Momentum", config.optimizer):
                lr = model_utils.adjust_learning_rate(session, model, lr,
                                                      config.lr_decay,
                                                      train_history)

            if not os.path.exists(config.model_dir):
                print("Creating directory %s" % config.model_dir)
                os.mkdir(config.model_dir)

            if math.isnan(valid_mse):
                print("Training failed due to nan.")
                quit()
            elif stop_training(config, valid_history):
                print("Training stopped.")
                quit()
            else:
                if ((config.early_stop is None)
                        or (valid_history[-1] <= min(valid_history))):
                    model_utils.save_model(session, config, i)
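stop_training is not shown here. A plausible sketch consistent with the early-stop message above (a hypothetical helper assuming config.early_stop is the patience in epochs, not necessarily the repository's implementation):

def stop_training(config, valid_history):
    """Sketch: stop when the best validation MSE is more than
    config.early_stop epochs old."""
    if config.early_stop is None:
        return False
    best_epoch = valid_history.index(min(valid_history))
    return (len(valid_history) - 1 - best_epoch) >= config.early_stop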
Example #4
def train_model(config):
    if config.start_date is not None:
        print("Training start date: ", config.start_date)
    if config.end_date is not None:
        print("Training end date: ", config.end_date)

    print("Loading training data from %s ..." % config.datafile)
    train_data = None
    valid_data = None

    if (config.validation_size > 0.0) or (config.split_date is not None):
        train_data, valid_data = data_utils.load_train_valid_data(config)
    else:
        train_data = data_utils.load_all_data(config, is_training_only=True)
        valid_data = train_data
        
    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
    tf_config.gpu_options.allow_growth = True

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_model(session, config, verbose=True)

        if config.data_scaler is not None:
            start_time = time.time()
            print("Calculating scaling parameters ...", end=' '); sys.stdout.flush()
            scaling_params = train_data.get_scaling_params(config.data_scaler)
            model.set_scaling_params(session,**scaling_params)
            print("done in %.2f seconds."%(time.time() - start_time))
            print("%-10s %-6s %-6s"%('feature','mean','std'))
            for i in range(len(train_data.feature_names)):
                center = "%.4f"%scaling_params['center'][i];
                scale  = "%.4f"%scaling_params['scale'][i];
                print("%-10s %-6s %-6s"%(train_data.feature_names[i],
                                         center,scale))
            sys.stdout.flush()

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)

        train_history = list()
        valid_history = list()

        lr = model.set_learning_rate(session, config.learning_rate)

        train_data.cache(verbose=True)
        valid_data.cache(verbose=True)

        for i in range(config.max_epoch):

            # Guard the nan check below in case neither UQ branch runs
            is_metric_nan = False

            # MVE Epoch
            if config.UQ_model_type == 'MVE':
                (train_mse, train_mse_var, valid_mse, valid_mse_var) = run_epoch_mve(
                    session, model, train_data, valid_data,
                    keep_prob=config.keep_prob,
                    passes=config.passes,
                    verbose=True)
                # Status to check if valid mse is nan, used to stop training
                if math.isnan(valid_mse):
                    is_metric_nan = True
                else:
                    is_metric_nan = False
                print('Epoch: %d Train MSE: %.8f Valid MSE: %.8f Learning rate: %.4f' %
                      (i + 1, train_mse, valid_mse, lr))
                print('Epoch: %d Train MSE_w_variance: %.8f Valid MSE_w_variance: %.8f Learning rate: %.4f' %
                      (i + 1, train_mse_var, valid_mse_var, lr))
                sys.stdout.flush()

                train_history.append(train_mse_var)
                valid_history.append(valid_mse_var)

            # PIE Epoch
            elif config.UQ_model_type == 'PIE':
                (train_mpiw, train_picp, train_picp_loss, valid_mpiw, valid_picp, valid_picp_loss) = \
                    run_epoch_pie(session, model, train_data, valid_data,
                                  keep_prob=config.keep_prob,
                                  passes=config.passes,
                                  verbose=True)

                train_loss = train_mpiw + config.picp_lambda * train_picp_loss
                valid_loss = valid_mpiw + config.picp_lambda * valid_picp_loss
                # Status to check if valid loss is nan, used to stop training
                if math.isnan(valid_loss):
                    is_metric_nan = True
                else:
                    is_metric_nan = False

                print('Epoch: %d Train MPIW: %.8f Valid MPIW: %.8f Learning rate: %.4f' %
                      (i + 1, train_mpiw, valid_mpiw, lr))
                print('Epoch: %d Train PICP: %.8f Valid PICP: %.8f' %
                      (i + 1, train_picp, valid_picp))
                print('Epoch: %d Train LOSS: %.8f Valid LOSS: %.8f' %
                      (i + 1, train_loss, valid_loss))

                sys.stdout.flush()

                train_history.append(train_loss)
                valid_history.append(valid_loss)

            if re.match("Gradient|Momentum", config.optimizer):
                lr = model_utils.adjust_learning_rate(session, model, 
                                                      lr, config.lr_decay, train_history)

            if not os.path.exists(config.model_dir):
                print("Creating directory %s" % config.model_dir)
                os.mkdir(config.model_dir)

            if is_metric_nan:
                print("Training failed due to nan.")
                quit()
            elif stop_training(config, valid_history):
                print("Training stopped.")
                quit()
            else:
                if ((config.early_stop is None)
                        or (valid_history[-1] <= min(valid_history))):
                    model_utils.save_model(session, config, i)
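For context on the PIE quantities combined above: PICP is the fraction of targets that fall inside the predicted interval, and MPIW is the mean interval width, so the loss trades sharpness against coverage. A hedged numpy sketch with hypothetical lower, upper, and y arrays (the repository's run_epoch_pie may compute these differently):

import numpy as np

def interval_metrics(lower, upper, y):
    """Sketch: mean prediction interval width (MPIW) and prediction
    interval coverage probability (PICP) for given interval bounds."""
    covered = (y >= lower) & (y <= upper)
    mpiw = float(np.mean(upper - lower))
    picp = float(np.mean(covered))
    return mpiw, picp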
Example #5
def fit_model(args, X, y):
    print(f'Fitting model for {args.model}:')

    auc = EpochScoring(scoring='roc_auc', lower_is_better=False)
    apr = EpochScoring(scoring='average_precision', lower_is_better=False)
    lrs = LRScheduler(policy='StepLR', step_size=10, gamma=0.5)  # defined but not used in the callbacks below

    if args.model == 'glm':
        glm = LogitNet(alpha=0.5, n_lambda=50, n_jobs=-1)
        glm.fit(X, y)

        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
        net = LogitNet(alpha=0.5, n_lambda=1, lambda_path=[glm.lambda_best_])

    elif args.model == 'standard':

        net = NeuralNetClassifier(
            models.MpraDense,
            
            batch_size=256,
            optimizer=torch.optim.Adam,
            optimizer__weight_decay=2e-6,
            lr=1e-4,
            max_epochs=20,
            module__n_input=1079,
            module__n_units=(400, 250),
            module__dropout=0.3,
            
            callbacks=[auc, apr],
            iterator_train__shuffle=True,
            train_split=None
        )

    elif args.model == 'neighbors':

        net = NeuralNetClassifier(
            models.MpraFullCNN,

            batch_size=256,
            optimizer=torch.optim.Adam,
            optimizer__weight_decay=1e-2,
            lr=5e-5,
            max_epochs=20,

            callbacks=[auc, apr],
            iterator_train__shuffle=True,
            train_split=None      
        )

    # generate CV predictions
    np.random.seed(1000)
    torch.manual_seed(1000)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1000)
    cv_scores = cross_val_predict(net, X, y, cv=kf,
                                  method='predict_proba', n_jobs=-1)
    AUC = roc_auc_score(y, cv_scores[:, 1])
    APR = average_precision_score(y, cv_scores[:, 1])
    print('\tAUC ', np.round(AUC, 4))
    print('\tAPR ', np.round(APR, 4))

    save_scores(args, cv_scores[:, 1], y)

    # refit and store model on all data
    net.fit(X, y)
    save_model(net, args.project, args.model)
Example #6
                        type=float,
                        default=1.0)
    parser.add_argument('--project_id',
                        help='ID (not name) of your project',
                        required=True)
    parser.add_argument(
        '--job-dir',
        help='Output directory for model, automatically provided by gcloud',
        required=True)

    args = parser.parse_args()
    arguments = args.__dict__

    print(arguments)

    estimator, acc_eval = model.train_and_evaluate(
        arguments['eval_size'], arguments['frac'], arguments['WE_max_df'],
        arguments['WE_min_df'], arguments['FT_norm'], arguments['M_alpha'],
        arguments['max_nb_label'])

    if estimator is not None:
        loc = model_utils.save_model(estimator, arguments['job_dir'],
                                     'stackoverlow')
        print("Saved model to {}".format(loc))

    # this is for hyperparameter tuning
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='accuracy',
        metric_value=acc_eval,
        global_step=0)
Example #7
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--config",
                        default=None,
                        type=str,
                        required=True,
                        help="the training config file")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--multi_task",
                        action="store_true",
                        help="training with multi task schema")
    parser.add_argument("--debug",
                        action="store_true",
                        help="in debug mode, will not enable wandb log")
    parser.add_argument("--use_wandb",
                        action="store_true",
                        help="whether or not use wandb")
    args = parser.parse_args()
    cfg = parse_cfg(pathlib.Path(args.config))

    # set CUDA_VISIBLE_DEVICES and get num_gpus
    if args.local_rank == -1:  # not distributed
        os.environ["CUDA_VISIBLE_DEVICES"] = cfg["system"][
            "cuda_visible_devices"]
        num_gpus = torch.cuda.device_count()
        args.distributed = False
    else:  # distributed
        torch.cuda.set_device(args.local_rank)
        num_gpus = 1
        args.distributed = True
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    logger.info(
        "num_gpus: {}, distributed training: {}, 16-bits training: {}".format(
            num_gpus, bool(args.local_rank != -1), cfg["train"]["fp16"]))
    cudnn.benchmark = True

    cfg["train"]["output_dir"] = cfg["train"]["output_dir"] + "/" + \
                                 cfg["train"]["task_name"] + "_" + \
                                 cfg["train"]["model_name"] + "_" + \
                                 cfg["data"]["corpus"]

    output_dir_pl = pathlib.Path(cfg["train"]["output_dir"])
    if output_dir_pl.exists():
        logger.warning(
            "output directory ({}) already exists, continuing after 2 seconds..."
            .format(output_dir_pl))
        time.sleep(2)
    else:
        output_dir_pl.mkdir(parents=True, exist_ok=True)

    if not args.debug and args.use_wandb:
        config_dictionary = dict(yaml=cfg, params=args)
        wandb.init(config=config_dictionary,
                   project="nlp-task",
                   dir=cfg["train"]["output_dir"])
        wandb.run.name = cfg["data"]["corpus"] + '-' + cfg["train"][
            "pretrained_tag"] + '-' + time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())
        wandb.config.update(args)
        wandb.run.save()

    if cfg["optimizer"]["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(cfg["optimizer"]["gradient_accumulation_steps"]))

    # true batch_size in training
    cfg["train"]["batch_size"] = cfg["train"]["batch_size"] // cfg[
        "optimizer"]["gradient_accumulation_steps"]

    # the type of label_map is bidict
    # label_map[x] = xx, label_map.inv[xx] = x
    label_map, num_labels = get_label_map(cfg)
    tokenizer, model = get_tokenizer_and_model(cfg, label_map, num_labels)

    # check model details on wandb
    if not args.debug and args.use_wandb:
        wandb.watch(model)

    num_examples, train_dataloader = get_dataloader(cfg,
                                                    tokenizer,
                                                    num_labels,
                                                    "train",
                                                    debug=args.debug)
    _, eval_dataloader = get_dataloader(cfg,
                                        tokenizer,
                                        num_labels,
                                        "dev",
                                        debug=args.debug)

    # total training steps (including multi epochs)
    num_training_steps = int(
        len(train_dataloader) //
        cfg["optimizer"]["gradient_accumulation_steps"] *
        cfg["train"]["train_epochs"])

    optimizer = AdamW(params=model.parameters(), lr=cfg["optimizer"]["lr"])
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=cfg["optimizer"]["num_warmup_steps"],
        num_training_steps=num_training_steps)

    scaler = None
    model = model.cuda()
    if cfg["train"]["fp16"] and _use_apex:
        logger.error("using apex amp for fp16...")
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    elif cfg["train"]["fp16"] and _use_native_amp:
        logger.error("using pytorch native amp for fp16...")
        scaler = torch.cuda.amp.GradScaler()
    elif cfg["train"]["fp16"] and (_use_apex is False
                                   and _use_native_amp is False):
        logger.error("your environment DO NOT support fp16 training...")
        exit()

    if cfg["system"]["distributed"]:
        # TODO distributed debug
        model.cuda(args.local_rank)
        from torch.nn.parallel import DistributedDataParallel as DDP
        model = DDP(model, device_ids=[args.local_rank])
    elif num_gpus > 1:
        model = torch.nn.DataParallel(model)

    # Train
    logger.info("start training on train set")
    epoch = 0
    best_score = -1
    for _ in trange(int(cfg["train"]["train_epochs"]), desc="Epoch"):
        best = False
        # train loop in one epoch
        train_loop(cfg, model, train_dataloader, optimizer, lr_scheduler,
                   num_gpus, epoch, scaler, args.debug, args.use_wandb)
        # begin to evaluate
        logger.info("running evaluation on dev set")
        score = eval_loop(cfg, tokenizer, model, eval_dataloader, label_map,
                          args.debug, args.use_wandb)
        if best_score < score:
            best_score = score
            best = True
        # Save a trained model and the associated configuration
        save_model(cfg, tokenizer, model, best)

        epoch += 1

    # Test Eval
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logger.info("running evaluation on final test set")
        # TODO add stand alone test set
        _, eval_dataloader = get_dataloader(cfg,
                                            tokenizer,
                                            num_labels,
                                            "dev",
                                            debug=args.debug)
        score = eval_loop(cfg, tokenizer, model, eval_dataloader, label_map,
                          args.debug, args.use_wandb)
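The division of batch_size by gradient_accumulation_steps above implies the usual accumulation loop: each optimizer step aggregates gradients from several micro-batches so the effective batch size stays constant. A minimal PyTorch sketch, where model, loader, optimizer, and compute_loss are all hypothetical placeholders:

# Sketch of gradient accumulation under the assumptions stated above.
accum_steps = 4  # e.g. cfg["optimizer"]["gradient_accumulation_steps"]
optimizer.zero_grad()
for step, (x, y) in enumerate(loader):
    loss = compute_loss(model, x, y) / accum_steps  # scale so gradients average
    loss.backward()                                 # gradients accumulate in .grad
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()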
Example #8
def train(args):
    # parameters
    num_gpus = args.num_gpu
    batch_size = args.batch_size * num_gpus
    start_epoch = args.start_epoch
    epochs = args.epochs
    torch.backends.cudnn.benchmark = True
    if num_gpus > 1:
        # net = nn.DataParallel(net)
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = "6066"
        torch.distributed.init_process_group(backend='nccl',
                                             world_size=1,
                                             rank=0,
                                             init_method='env://')
    # variables
    device = cfg.device(num_gpus)
    train_loader = cfg.train_loader(num_gpus)
    val_loader = cfg.val_loader()
    net = cfg.model()
    # criterion = nn.CrossEntropyLoss().to(device)
    criterion = SoftCrossEntropyLoss(label_smoothing=0.1,
                                     num_classes=cfg.num_classes).to(device)
    optimizer = optim.SGD(model_utils.split_weights(net),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 40], gamma=0.1)
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    writer = SummaryWriter(log_dir=cfg.log_dir)

    # load weights or init weights
    if start_epoch > 0:
        weight_path = cfg.snapshot_save_path.format(start_epoch)
        print("load weight from file {}".format(weight_path))
        checkpoint = torch.load(weight_path)
        net.load_state_dict(checkpoint)
    else:
        model_utils.init_weights(net)
    net.to(device)

    if num_gpus > 1:
        # net = nn.DataParallel(net)
        net = nn.parallel.DistributedDataParallel(net)
    print("type of net:{}".format(type(net)))

    # training
    for epoch in range(start_epoch, epochs):
        running_loss, running_corrects = 0.0, 0.0
        start_time = timeit.default_timer()
        net.train()
        # scheduler.step(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        _add_weight_history(writer, net, epoch)
        for images, labels in tqdm(train_loader):
            # print(type(labels), type(images))
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()  # zero the gradients
            if args.mixup:
                inputs, targets_a, targets_b, lam = model_utils.mix_up_data(
                    images, labels, args.alpha, True)
                outputs = net(inputs)
                loss_func = model_utils.mix_up_criterion(
                    targets_a, targets_b, lam)
                loss = loss_func(criterion, outputs)
            else:
                outputs = net.forward(images)  # [B,class_logits]
                loss = criterion(outputs, labels)
            # backward
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels).item()

        train_loss = running_loss / len(train_loader.dataset)
        train_acc = 100. * running_corrects / len(train_loader.dataset)

        # Log to TensorBoard
        writer.add_scalar('scalar/learning_rate',
                          optimizer.param_groups[0]['lr'], epoch + 1)
        writer.add_scalar('scalar/train_loss', train_loss, epoch + 1)
        writer.add_scalar('scalar/train_acc', train_acc, epoch + 1)
        # Print status
        print("[{}] Epoch: {}/{} Loss: {:03f} Acc: {:03f}".format(
            'train', epoch + 1, epochs, train_loss, train_acc))
        stop_time = timeit.default_timer()
        print("Execution time: " + str(stop_time - start_time) + "\n")
        # Validation set
        acc = val(net, val_loader, device)
        writer.add_scalar('scalar/val_acc', acc, epoch + 1)
        print('Epoch: {}/{}  Val Acc:{:03f}'.format(epoch + 1, epochs, acc))

        # Save an intermediate snapshot
        if (epoch + 1) % cfg.SNAPSHOT == 0:
            # torch.save(net.state_dict(), cfg.snapshot_save_path.format(epoch + 1))
            model_utils.save_model(net,
                                   cfg.snapshot_save_path.format(epoch + 1))
    # Save the final model
    # torch.save(net.state_dict(), cfg.save_path)
    model_utils.save_model(net, cfg.save_path)
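mix_up_data and mix_up_criterion are not shown. In the standard mixup formulation (Zhang et al., 2018) they would look roughly like this sketch, which is an assumption rather than the repository's code:

import numpy as np
import torch

def mix_up_data(x, y, alpha=1.0, use_cuda=True):
    """Sketch of standard mixup: convex-combine inputs, keep both label sets.
    use_cuda is kept for signature compatibility; the device comes from x."""
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, y, y[index], lam

def mix_up_criterion(y_a, y_b, lam):
    """Sketch: mix the two losses with the same coefficient lam."""
    return lambda criterion, pred: (lam * criterion(pred, y_a) +
                                    (1 - lam) * criterion(pred, y_b))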
Example #9
def train_model(config):
    print("Loading training data ...")
    train_data = None
    valid_data = None

    if config.early_stop is None:
        train_data = data_utils.load_all_data(config, is_training_only=True)
        valid_data = train_data
    else:
        train_data, valid_data = data_utils.load_train_valid_data(config)
        

    if config.start_date is not None:
        print("Training start date: ", config.start_date)
    if config.end_date is not None:
        print("Training end date: ", config.end_date)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_model(session, config, verbose=True)

        if config.data_scaler is not None:
            start_time = time.time()
            print("Calculating scaling parameters ...", end=' '); sys.stdout.flush()
            scaling_params = train_data.get_scaling_params(config.data_scaler)
            model.set_scaling_params(session,**scaling_params)
            print("done in %.2f seconds."%(time.time() - start_time))
            #print(scaling_params['center'])
            #print(scaling_params['scale'])
            #exit(0)

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)

        train_history = list()
        valid_history = list()

        lr = model.set_learning_rate(session, config.learning_rate)

        train_data.cache(verbose=True)
        valid_data.cache(verbose=True)

        for i in range(config.max_epoch):

            (train_mse, valid_mse) = run_epoch(session, model, train_data, valid_data,
                                               keep_prob=config.keep_prob,
                                               passes=config.passes,
                                               verbose=True)
            print(('Epoch: %d Train MSE: %.6f Valid MSE: %.6f Learning rate: %.4f') %
                  (i + 1, train_mse, valid_mse, lr))
            sys.stdout.flush()

            train_history.append(train_mse)
            valid_history.append(valid_mse)

            if re.match("Gradient|Momentum", config.optimizer):
                lr = model_utils.adjust_learning_rate(session, model,
                                                      lr, config.lr_decay, train_history)

            if not os.path.exists(config.model_dir):
                print("Creating directory %s" % config.model_dir)
                os.mkdir(config.model_dir)

            if math.isnan(valid_mse):
                print("Training failed due to nan.")
                quit()
            elif stop_training(config, valid_history):
                print("Training stopped.")
                quit()
            else:
                if ((config.early_stop is None) or
                        (valid_history[-1] <= min(valid_history))):
                    model_utils.save_model(session, config, i)
Example #10
def run(cv_method='loo', anom_type='mean'):
    args = config.parse_arguments(
        argv[1] if len(argv) >= 2 else _DEFAULT_CONFIG)
    # Load the data
    us_maize_regions = [
        'Indiana', 'Illinois', 'Ohio', 'Nebraska', 'Iowa', 'Minnesota'
    ]  # Growing season: April through to September

    data = data_loading.load_temp_precip_data('Maize',
                                              'Spring',
                                              'USA',
                                              us_maize_regions,
                                              range(3, 9),
                                              anom_type=anom_type)

    if args.model.lower() == 'corr_bvg' or args.model.lower() == 'uncorr_bvg':
        save_path = f'models/saved_models/{args.model}_save'
        load_path = f'{save_path}.pkl'
        if not os.path.exists(load_path):
            model = models.models.fetch_model(args.model)
            save_model(model=model, file_path=save_path)
        else:
            # Load model to circumvent compile time
            model = load_model(load_path)
        batched = False
        # Fit the model
        fit = model.sampling(data,
                             chains=args.chains,
                             iter=args.iter,
                             verbose=args.verbose,
                             seed=args.seed)
    elif args.model.lower() == 'gp':
        kernel = RBF(length_scale=0.5)
        model = GaussianProcessRegressor(kernel=kernel,
                                         normalize_y=True,
                                         random_state=42)
        batched = True
    elif args.model.lower() == 'lr':
        # model = LinearRegression(fit_intercept=True, normalize=True)
        model = RidgeCV()
        batched = True
    else:
        raise ValueError('Invalid model type.')

    if cv_method == 'rolling':
        # Rolling-origin cross-validation
        print("===> Rolling-origin CV")
        cv_results = validation.sliding_window_cv(model,
                                                  data,
                                                  args,
                                                  batched=batched)
    elif cv_method == 'time-series':
        # Time-series cross validation, incrementing by one year each split
        print("===> Time-series CV")
        n_splits = 34
        cv_results = validation.time_series_cv(model,
                                               data,
                                               args,
                                               n_splits=n_splits,
                                               batched=batched)
    elif cv_method == 'loo':
        # LOO cross-validation
        print("===> LOO CV")
        cv_results = validation.leave_p_out_cv(model,
                                               data,
                                               args,
                                               p=1,
                                               batched=batched)
    else:
        # LTO cross-validation
        print("===> LTO CV")
        cv_results = validation.leave_p_out_cv(model,
                                               data,
                                               args,
                                               p=3,
                                               batched=batched)

    print_metrics(cv_results)
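validation.leave_p_out_cv is project code, but the split logic for p=1 matches the usual leave-one-group-out idea. A hedged sketch over a hypothetical years array (one entry per sample):

import numpy as np

def leave_one_year_out(years):
    """Sketch: yield (train_idx, test_idx) pairs, holding out one year at a time."""
    for held_out in np.unique(years):
        yield (np.where(years != held_out)[0],
               np.where(years == held_out)[0])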
Example #11
def main(config):

	# set up workspace
	work_space = config["workspace"]
	tf_board = config["tf_board"]
	setup_workpath(work_space)
	name = config["Name"]

	# Construct or load embeddings
	print("Initializing embeddings ...")
	vocab_size = config["embeddings"]["vocab_size"]
	embed_size = config["embeddings"]["embed_size"]

	# Build the model and compute losses
	(encode_num_layers, encode_num_units, encode_cell_type, encode_bidir,
	 attn_num_units, decode_num_layers, decode_num_units, decode_cell_type,
	 use_user_feat, use_gate_memory, use_user_desc, use_blog_user_coattn,
	 use_external_desc_express, use_external_feat_express,
	 user_feat_dim, user_feat_unit, user_feat_mem_unit,
	 desc_rnn_unit, desc_attn_num_units, user_map_unit,
	 ) = get_pcgn_model_config(config)

	(train_file, dev_file,
	 source_max_length, target_max_length, desc_max_length,
	 gpu_fraction, gpu_id, train_steps, checkpoint_every, print_every,
	 batch_size, is_beam_search, beam_size, infer_max_iter,
	 l2_regularize, learning_rate, max_checkpoints, max_gradient_norm,
	 ) = get_pcgn_training_config(config)

	train_set = read_data(train_file)
	print(' # train data:', len(train_set))
	dev_set = read_data(dev_file)
	print(' # dev data:', len(dev_set))

	print("Building model architecture ")
	pcg_model = PCGNModel(
		mode='train', model_name=name,
		vocab_size=vocab_size, embedding_size=embed_size,
		encode_num_layers=encode_num_layers, encode_num_units=encode_num_units,
		encode_cell_type=encode_cell_type, encode_bidir=encode_bidir,
		attn_num_units=attn_num_units, decode_num_layers=decode_num_layers,
		decode_num_units=decode_num_units, decode_cell_type=decode_cell_type,
		use_user_feat=use_user_feat, use_gate_memory=use_gate_memory,
		use_user_desc=use_user_desc, use_blog_user_coattn=use_blog_user_coattn,
		use_external_desc_express=use_external_desc_express, use_external_feat_express=use_external_feat_express,

		user_feat_dim=user_feat_dim, user_feat_unit=user_feat_unit, user_feat_mem_unit=user_feat_mem_unit,
		desc_rnn_unit=desc_rnn_unit, desc_attn_num_units=desc_attn_num_units, user_map_unit=user_map_unit,

		batch_size=batch_size, beam_search=is_beam_search, beam_size=beam_size, infer_max_iter=infer_max_iter, target_max_length=target_max_length,
		l2_regularize=l2_regularize, learning_rate=learning_rate, max_to_keep=max_checkpoints, max_gradient_norm=max_gradient_norm,
	)

	print("\tDone.")


	logdir = '%s/nn_models/' % work_space

	# Set up session
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
	                            visible_device_list=gpu_id, allow_growth=True)

	sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
											gpu_options=gpu_options))
	init = tf.global_variables_initializer()
	sess.run(init)

	# tensorboard
	if use_tensorboard:
		train_writer = tf.summary.FileWriter(tf_board + 'train/', sess.graph)
		test_writer = tf.summary.FileWriter(tf_board + 'test/', sess.graph)

	try:
		saved_global_step = load_model(pcg_model.saver, sess, logdir)
		if saved_global_step is None:
			saved_global_step = -1

	except Exception:
		print("Something went wrong while restoring checkpoint. "
			  "Training is terminated to avoid the overwriting.")
		raise

	# ##### Training #####

	# Training
	last_saved_step = saved_global_step
	num_steps = saved_global_step + train_steps
	steps = []
	previous_losses = []
	lr = pcg_model.learning_rate

	print("Start training ...")
	print('steps per epoch:',len(train_set)//batch_size)
	try:
		for step in range(saved_global_step + 1, num_steps):
			start_time = time.time()

			batch = get_pcgn_batch(train_set, 'train', batch_size, source_max_length, target_max_length, desc_max_length)
			loss_value = pcg_model.train(sess, batch)
			previous_losses.append(loss_value)
			lr_decay_step = 10
			if step % 500 == 0 and len(previous_losses) - 5 > lr_decay_step and np.mean(previous_losses[-5:]) >= np.mean(previous_losses[-lr_decay_step - 5:-5]):
				lr = pcg_model.learning_rate
				if lr > 0.00001:
					pcg_model.learning_rate = lr * 0.9
					print('learning rate decay:', lr * 0.9)
			duration = (time.time() - start_time)
			if step % print_every == 0 and step != 0:
				# train perplexity
				t_perp = pcg_model.compute_perplexity(sess, batch)
				if use_tensorboard:
					add_summary(train_writer, step, 'train perplexity', t_perp)

				# eval perplexity
				dev_str = ""
				if dev_set is not None:
					eval_batch = get_pcgn_batch(dev_set, 'train', batch_size, source_max_length, target_max_length, desc_max_length)
					eval_perp = pcg_model.compute_perplexity(sess, eval_batch)
					with open(logdir + 'eval_perp.txt', 'a', encoding='utf-8') as f:
						f.write('{}\t{}\n'.format(str(step),str(eval_perp)))

					if use_tensorboard:
						add_summary(test_writer, step, 'eval perplexity', eval_perp)
					dev_str += "val_prep: {:.3f}\n".format(eval_perp)

				steps.append(step)
				ep = step // (len(train_set) // batch_size)
				info = 'epoch {:d}, step {:d}, lr: {:.5f}, loss = {:.6f}, perp: {:.3f}\n{}({:.3f} sec/step)'
				print(info.format(ep, step, lr, loss_value, t_perp, dev_str, duration))

			if step % checkpoint_every == 0:
				save_model(pcg_model.saver, sess, logdir, step)
				last_saved_step = step

	except KeyboardInterrupt:
		# Introduce a line break after ^C so save message is on its own line.
		print()

	finally:
		if step > last_saved_step:
			save_model(pcg_model.saver, sess, logdir, step)
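The decay rule embedded in the training loop above compares the mean of the last five losses against the mean of an earlier window and multiplies the rate by 0.9 on a plateau. A simplified sketch of that idea as a standalone helper (an approximation of the inline logic, not a drop-in replacement):

def maybe_decay_lr(previous_losses, lr, factor=0.9, window=5, min_lr=1e-5):
    """Sketch: decay lr when the recent mean loss stopped improving
    relative to the previous window."""
    if len(previous_losses) < 2 * window:
        return lr
    recent = sum(previous_losses[-window:]) / window
    older = sum(previous_losses[-2 * window:-window]) / window
    if recent >= older and lr > min_lr:
        lr *= factor
    return lr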
Example #12
def main():
    # Arguments
    ###########################################################################
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception:
        logging.error("Missing or invalid arguments.")
        exit(1)

    # Logging
    ###########################################################################
    logging.basicConfig(
        filename=os.path.join("logs", config.exp_name + ".log"),
        format="[%(asctime)s] - [%(levelname)s]: %(message)s",
        filemode="a",
        level=logging.DEBUG,
    )
    logging.info("Logging started.")
    logging.info("Keras version: {}".format(keras_version))

    # Session
    ###########################################################################
    sess = tf.Session()
    K.set_session(sess)

    # create experiment related directories
    ###########################################################################
    create_dirs([config.summary_dir, config.checkpoint_dir])

    # Initialize the model
    ###########################################################################
    model_formicID = load_model(config=config, num_species=97)
    model_formicID = compile_model(model=model_formicID, config=config)
    model_formicID = weights_load(
        model=model_formicID,
        weights="experiments/T97_CaAll_QuM_ShSti_AugM_D05_LR0001_E200_I4_def_clean/checkpoint/weights_55-1.76.hdf5",
    )

    # Training in batches with iterator
    ###########################################################################
    history = trainer_dir(
        model=model_formicID,
        config=config,
        callbacks=build_logger(config=config, model=model_formicID),
    )
    save_model(model=model_formicID,
               filename="final_weights.hdf5",
               config=config)

    # Evaluation
    ###########################################################################
    plot_history(history=history, config=config, theme="ggplot", save=None)
    evaluator(model=model_formicID, config=config, test_dir=None)

    # Testing
    ###########################################################################
    Y_true, Y_pred, labels, species_dict = predictor(
        model=model_formicID,
        config=config,
        # species_json="data/species_dict.json",
        plot=True,
        n_img=10,
        n_cols=3,
    )
    predictor_reports(
        Y_true=Y_true,
        Y_pred=Y_pred,
        config=config,
        species_dict=species_dict,
        target_names=labels,
        digits=5,
    )
    plot_confusion_matrix(
        Y_pred=Y_pred,
        Y_true=Y_true,
        config=config,
        target_names=labels,
        species_dict=species_dict,
        title=None,
        cmap="viridis",
        normalize=True,
        scores=True,
        score_size=8,
        save="confusion_matrix.png",
    )
    # Footer
    ###########################################################################
    K.clear_session()
    logging.info("Logging ended.")
Example #13
def train_model(train_anns,
                eval_anns,
                model,
                device,
                model_save_dir,
                ckpt='model_latest.pth',
                num_epoches=10,
                batch_size=4,
                ap='ap_50',
                ap_thre=0.5,
                ap_range=3,
                ap_shift_thre=0.001):
    # optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # load pretrain best model
    ckpt_path = os.path.join(model_save_dir, ckpt)
    model, optimizer, start_epoch = load_model(model, ckpt_path, optimizer)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=num_epoches)

    # train/eval dataset
    dataset = Detection_Dataset_anns(train_anns, get_transform(True))
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             pin_memory=True,
                                             num_workers=4,
                                             collate_fn=collate_fn)
    dataset_eval = Detection_Dataset_anns(eval_anns, get_transform(False))
    dataloader_eval = torch.utils.data.DataLoader(dataset_eval,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=4,
                                                  collate_fn=collate_fn)
    # Metrics to monitor
    writer = SummaryWriter(log_dir='runs/{}'.format(get_curtime()))
    ap_records = {'ap_50': [], 'ap_75': [], 'ap_shift': []}

    for epoch in range(start_epoch, num_epoches):  # epoch must start from 0; there is a warm-up inside
        # train
        train_one_epoch(model,
                        optimizer,
                        dataloader,
                        device,
                        epoch,
                        print_freq=10,
                        writer=writer,
                        begin_step=epoch * len(dataloader))
        # store & update lr
        writer.add_scalar('Train/lr',
                          optimizer.param_groups[0]["lr"],
                          global_step=epoch)
        lr_scheduler.step()

        # eval after each train
        evals = evaluate(model, dataloader_eval, device, writer, epoch)

        # states
        ap_records['ap_50'].append(evals['ap_50'])
        ap_records['ap_75'].append(evals['ap_75'])
        if len(ap_records[ap]) >= ap_range:
            ap_shift = lasso_shift(ap_records[ap][-ap_range:])
        else:
            ap_shift = 0
        ap_records['ap_shift'].append(ap_shift)

        writer.add_scalar('Accuracy/AP_shift', ap_shift, global_step=epoch)

        if evals[ap] > ap_thre:
            ckpt_path = os.path.join(model_save_dir,
                                     'model_{}.pth'.format(epoch))
            save_model(ckpt_path, model, epoch, optimizer)

            if 0 < ap_shift < ap_shift_thre:  # break and save ap records
                best_idx_in_range = ap_records[ap].index(
                    max(ap_records[ap][-ap_range:]))
                best_epoch = epoch - ap_range + 1 + best_idx_in_range

                print('best epoch:', best_epoch)
                save_clean_best_model(best_epoch, model_save_dir)
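lasso_shift is not shown; from its use above it measures how much the AP values still move over the last ap_range epochs, and training stops once that shift is positive but below ap_shift_thre. A purely hypothetical stand-in (the real helper may well compute something more elaborate):

def lasso_shift(values):
    """Hypothetical sketch: dispersion of recent AP values as a
    stability signal. Not the repository's implementation."""
    return max(values) - min(values)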
Example #14
def main(opt):
    path_save_model_ = './model_save/'
    if not os.path.exists(path_save_model_):
        os.mkdir(path_save_model_)

    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    opt = opts().update_dataset_info_and_set_heads(opt, LoadImagesAndLabels)

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    cuda = torch.cuda.is_available()
    device_ = torch.device('cuda' if cuda else 'cpu')
    opt.device = device_
    chunk_sizes_ = [8]
    gpus_ = [0]
    # resnet_18, resnet_34, resnet_50, resnet_101, resnet_152
    model_arch = 'resnet_34'
    print('Creating model...')

    num_layer = int(model_arch.split("_")[1])
    num_classes = 1
    heads_ = {'hm': num_classes, 'wh': 2, 'reg': 2}

    print('heads : {}'.format(heads_))
    model = resnet(num_layers=num_layer,
                   heads=heads_,
                   head_conv=64,
                   pretrained=True)  # res_18
    # print(model)

    batch_size_ = 16
    num_workers_ = 4
    learning_rate_ = 1.25e-4
    path_load_model_ = './model_save/model_hand_last.pth'
    # path_load_model_ = ''
    lr_step_ = [190, 220]

    optimizer = torch.optim.Adam(model.parameters(), learning_rate_)
    start_epoch = 0
    if os.path.exists(path_load_model_):
        model, optimizer, start_epoch = load_model(model, path_load_model_,
                                                   optimizer, True,
                                                   learning_rate_, lr_step_)

    trainer = CtdetTrainer(opt, model, optimizer)

    trainer.set_device(gpus_, chunk_sizes_, device_)

    print('load train_dataset')
    train_dataset = LoadImagesAndLabels(state='train', path_='../done/')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size_,
                                               shuffle=True,
                                               num_workers=num_workers_,
                                               pin_memory=False,
                                               drop_last=True)

    print('Starting training...')
    print("using arch      : {}".format(model_arch))
    print('num_classes     : {}'.format(num_classes))
    print('batch_size      : {}'.format(batch_size_))
    print('num_workers     : {}'.format(num_workers_))
    print('learning_rate   : {}'.format(learning_rate_))
    print('lr_step         : {}'.format(lr_step_))
    print('path_load_model : {}'.format(path_load_model_))

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        log_dict_train, _ = trainer.train(epoch, train_loader)

        save_model(path_save_model_ + 'model_hand_last.pth', epoch, model,
                   optimizer)
        if epoch % 1 == 0:
            save_model(path_save_model_ + 'hand_epoch{}.pth'.format(epoch),
                       epoch, model, optimizer)

        if epoch in lr_step_:
            save_model(path_save_model_ + 'model_hand_{}.pth'.format(epoch),
                       epoch, model, optimizer)
            lr = learning_rate_ * (0.1**(lr_step_.index(epoch) + 1))  # use the local lr_step_ checked above
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
def run_episode(agent, env, pub_result, pub_get_action, run_id, episode_number,
                global_step, param_dictionary, start_time, scores, episodes,
                log, log_title, save_model_to_disk):
    """
    Runs an episode in the chosen stage of the Gazebo Simulation
    Episode ends when a goal is reached or after episode max steps in the
    agent is exceeded

    :param agent: RL agent to act in the Gazebo environment
    :param env: Gazebo simulation
    :param pub_result: Rospy publisher for the latest score and q-values
    :param pub_get_action: Rospy publisher of the latest action and reward
    :param run_id: ID for logging purposes
    :param episode_number: current episode number
    :param global_step: all steps of all episodes counted
    :param param_dictionary: dict which contains all model parameters
    :param start_time: start time of run
    :param scores: list of cumulated rewards per episode
    :param episodes: list containing all episodes done
    :param log: logging object
    :param log_title: string title of log
    :param save_model_to_disk: boolean if model should be saved to disk
    """
    result = Float32MultiArray()
    get_action = Float32MultiArray()

    state = env.reset()
    score = 0
    for episode_step in range(agent.episode_max_step):
        goal = env.get_goal()
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        if episode_step >= agent.episode_max_step - 1:
            rospy.loginfo("Time out!!")
            done = True

        position = env.get_position()
        agent.append_memory(state, action, reward, next_state, done)
        log_utils.make_log_entry(log, log_title, run_id, episode_number,
                                 episode_step, state, next_state, goal,
                                 position, action, agent.q_value, reward, done)

        if len(agent.memory) >= agent.train_start:
            if global_step <= agent.target_update:
                agent.train_model()
            else:
                agent.train_model(True)

        score += reward
        state = next_state
        get_action.data = [action, score, reward]
        pub_get_action.publish(get_action)

        if save_model_to_disk and episode_step == 0:
            save_model(agent, param_dictionary, episode_number)

        if done:
            result.data = [score, np.max(agent.q_value)]
            pub_result.publish(result)
            agent.update_target_model()
            scores.append(score)
            episodes.append(episode_number)
            log_episode_info(episode_number, score, agent, start_time)

            param_keys = ['epsilon', 'episode']
            param_values = [agent.epsilon, episode_number]
            param_dictionary = dict(zip(param_keys, param_values))

            return run_id, global_step, param_dictionary

        global_step += 1
        if global_step % agent.target_update == 0:
            rospy.loginfo("UPDATE TARGET NETWORK")