Esempio n. 1
0
def get_config(
    files_list,
    input_names=["state_1", "state_2"],
    output_names=["Qvalue_1", "Qvalue_2"],
    agents=2,
):
    """This is only used during training."""
    expreplay = ExpReplay(
        predictor_io_names=(input_names, output_names),
        player=get_player(task="train", files_list=files_list, agents=agents),
        state_shape=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        init_exploration=1.0,
        update_frequency=UPDATE_FREQ,
        history_len=FRAME_HISTORY,
        agents=agents,
    )

    return TrainConfig(
        # dataflow=expreplay,
        data=QueueInput(expreplay),
        model=Model(agents=agents),
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
                RunOp(DQNModel.update_target_param, verbose=True),
                # update target network every 10k steps
                every_k_steps=10000 // UPDATE_FREQ,
            ),
            expreplay,
            ScheduledHyperParamSetter("learning_rate", [(60, 4e-4),
                                                        (100, 2e-4)]),
            ScheduledHyperParamSetter(
                ObjAttrParam(expreplay, "exploration"),
                # 1->0.1 in the first million steps
                [(0, 1), (10, 0.1), (320, 0.01)],
                interp="linear",
            ),
            PeriodicTrigger(
                Evaluator(
                    nr_eval=EVAL_EPISODE,
                    input_names=input_names,
                    output_names=output_names,
                    files_list=files_list,
                    get_player_fn=get_player,
                    agents=agents,
                ),
                every_k_epochs=EPOCHS_PER_EVAL,
            ),
            HumanHyperParamSetter("learning_rate"),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
Esempio n. 2
0
def get_config():
    """This is only used during training."""
    expreplay = ExpReplay(predictor_io_names=(['state'], ['Qvalue']),
                          player=get_player(directory=data_dir,
                                            task='train',
                                            files_list=train_data_fpaths),
                          state_shape=OBSERVATION_DIMS,
                          batch_size=BATCH_SIZE,
                          memory_size=MEMORY_SIZE,
                          init_memory_size=INIT_MEMORY_SIZE,
                          init_exploration=1.0,
                          update_frequency=UPDATE_FREQ,
                          frame_history_len=FRAME_HISTORY)

    return TrainConfig(
        # dataflow=expreplay,
        data=QueueInput(expreplay),
        model=Model(),
        callbacks=[  # TODO: periodically save videos
            ModelSaver(checkpoint_dir="model_checkpoints",
                       keep_checkpoint_every_n_hours=0.25,
                       max_to_keep=1000),  # TODO: og was just ModelSaver()
            PeriodicTrigger(
                RunOp(DQNModel.update_target_param, verbose=True),
                # update target network every 10k/freq steps
                every_k_steps=10000 // UPDATE_FREQ),
            # expreplay,
            ScheduledHyperParamSetter('learning_rate', [(60, 4e-4),
                                                        (100, 2e-4)]),
            ScheduledHyperParamSetter(
                ObjAttrParam(expreplay, 'exploration'),
                # 1->0.1 in the first 10M steps
                [(0, 1), (100, 0.1), (120, 0.01)],
                interp='linear'),
            PeriodicTrigger(  # runs exprelay._trigger()
                expreplay, every_k_steps=5000),
            PeriodicTrigger(
                # eval_model_multithread(pred, EVAL_EPISODE, get_player)
                Evaluator(nr_eval=EVAL_EPISODE,
                          input_names=['state'],
                          output_names=['Qvalue'],
                          directory=data_dir,
                          files_list=test_data_fpaths,
                          get_player_fn=get_player),
                every_k_steps=10000 // UPDATE_FREQ),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=NUM_EPOCHS,
    )
Esempio n. 3
0
def train(args):
    data_folder = args.get("data_folder")
    save_folder = args.get("save_folder")
    image_size = args.get("image_size")
    max_epoch = args.get("max_epoch")
    save_epoch = args.get("save_epoch") or max_epoch // 10
    # Scale lr and steps_per_epoch accordingly.
    # Make sure the total number of gradient evaluations is consistent.
    n_gpu = args.get("n_gpu") or 1
    batch_size = args.get("batch_size") or BATCH
    equi_batch_size = max(n_gpu, 1) * batch_size
    lr = args.get("lr") or LR
    lr *= equi_batch_size
    steps_per_epoch = args.get("steps_per_epoch") or 1000
    steps_per_epoch /= equi_batch_size
    image_steps = args.get("image_steps") or steps_per_epoch // 10
    scalar_steps = args.get("scalar_steps")
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act_input") == "identity" else (-1, 1)

    if save_folder == None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder,
                  image_size,
                  zmin=zmin,
                  zmax=zmax,
                  batch=batch_size)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, Style2PO(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(),
                            every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate', [(start_dec_epoch, lr),
                                                        (max_epoch, 0)],
                                      interp="linear"),
            PeriodicTrigger(VisualizeTestSet(data_folder, image_size),
                            every_k_epochs=max(1, max_epoch // 100)),
            #MergeAllSummaries(period=scalar_steps), # scalar only, slowdown in training, use TCMalloc
            MergeAllSummaries(period=image_steps, key="image_summaries"),
            MergeAllSummaries(key="acti_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
Esempio n. 4
0
def get_config():
    """This is only used during training."""
    expreplay = ExpReplay(predictor_io_names=(['state'], ['Qvalue']),
                          player=get_player(directory=data_dir,
                                            task='train',
                                            files_list=train_list),
                          state_shape=IMAGE_SIZE,
                          batch_size=BATCH_SIZE,
                          memory_size=MEMORY_SIZE,
                          init_memory_size=INIT_MEMORY_SIZE,
                          init_exploration=1.0,
                          update_frequency=UPDATE_FREQ,
                          history_len=FRAME_HISTORY)

    return TrainConfig(
        # dataflow=expreplay,
        data=QueueInput(expreplay),
        model=Model(),
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
                RunOp(DQNModel.update_target_param, verbose=True),
                # update target network every 10k steps
                every_k_steps=10000 // UPDATE_FREQ),
            expreplay,
            ScheduledHyperParamSetter('learning_rate', [(60, 4e-4),
                                                        (100, 2e-4)]),
            ScheduledHyperParamSetter(
                ObjAttrParam(expreplay, 'exploration'),
                # 1->0.1 in the first million steps
                [(0, 1), (10, 0.1), (320, 0.01)],
                interp='linear'),
            PeriodicTrigger(Evaluator(nr_eval=EVAL_EPISODE,
                                      input_names=['state'],
                                      output_names=['Qvalue'],
                                      directory=data_dir,
                                      files_list=test_list,
                                      get_player_fn=get_player),
                            every_k_epochs=EPOCHS_PER_EVAL),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
Esempio n. 5
0
def train_net(net,
              session_init,
              batch_size,
              num_epochs,
              train_dataflow,
              val_dataflow):

    num_towers = max(get_num_gpu(), 1)
    batch_per_tower = batch_size // num_towers
    logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, batch_per_tower))

    num_training_samples = 1281167
    step_size = num_training_samples // batch_size
    max_iter = (num_epochs - 1) * step_size
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, 0.5), (max_iter, 0)],
            interp='linear',
            step_based=True),
        EstimatedTimeLeft()]

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(
            input=QueueInput(val_dataflow),
            infs=infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            input=val_dataflow,
            infs=infs,
            gpus=list(range(num_towers))))

    config = TrainConfig(
        dataflow=train_dataflow,
        model=net,
        callbacks=callbacks,
        session_init=session_init,
        steps_per_epoch=step_size,
        max_epoch=num_epochs)

    launch_train_with_config(
        config=config,
        trainer=SyncMultiGPUTrainerParameterServer(num_towers))
Esempio n. 6
0
def get_config(model, conf):
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch
    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(45, 1e-2), (60, 1e-3),
                                                        (65, 1e-4), (70, 1e-5),
                                                        (75, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=80,
                       nr_tower=nr_tower)
Esempio n. 7
0
def get_config(files_list, data_type, trainable_variables):
    """This is only used during training."""
    expreplay = ExpReplay(
        predictor_io_names=(['state'], ['Qvalue']),
        player=get_player(task='train',
                          files_list=files_list,
                          data_type=data_type),
        state_shape=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        init_exploration=0.8,  #0.0
        ###############################################################################
        # HITL UPDATE
        update_frequency=INIT_UPDATE_FREQ,
        ###############################################################################
        history_len=FRAME_HISTORY,
        arg_type=data_type)

    return TrainConfig(
        # dataflow=expreplay,
        data=QueueInput(expreplay),
        model=Model(IMAGE_SIZE, FRAME_HISTORY, METHOD, NUM_ACTIONS, GAMMA,
                    trainable_variables),
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
                RunOp(DQNModel.update_target_param, verbose=True),
                # update target network every 10k steps
                every_k_steps=10000 // UPDATE_FREQ),
            expreplay,
            ScheduledHyperParamSetter('learning_rate', [(60, 4e-4),
                                                        (100, 2e-4)]),
            ScheduledHyperParamSetter(
                ObjAttrParam(expreplay, 'exploration'),
                # 1->0.1 in the first million steps
                [(0, 0.8), (1000000, 0.1), (32000000, 0.01)],
                interp='linear',
                step_based=True),
            ###############################################################################
            # HITL UPDATE
            # Here the number of steps taken in the environment is increased from 0, during
            # the pretraining phase, to 4 to allow the agent to take 4 steps in the env
            # between each TD update.
            ScheduledHyperParamSetter(ObjAttrParam(expreplay,
                                                   'update_frequency'),
                                      [(0, INIT_UPDATE_FREQ),
                                       (NUM_PRETRAIN, UPDATE_FREQ)],
                                      interp=None,
                                      step_based=True),

            ###############################################################################
            PeriodicTrigger(Evaluator(nr_eval=EVAL_EPISODE,
                                      input_names=['state'],
                                      output_names=['Qvalue'],
                                      files_list=files_list,
                                      data_type=data_type,
                                      get_player_fn=get_player),
                            every_k_steps=STEPS_PER_EVAL),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=MAX_EPOCHS,
    )
Esempio n. 8
0
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act") == "identity" else (-1, 1)

    if save_folder == None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, AdaptiveSynTex(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(),
                            every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate', [(start_dec_epoch, lr),
                                                        (max_epoch, 0)],
                                      interp="linear"),
            #PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
            MergeAllSummaries(period=scalar_steps),  # scalar only
            MergeAllSummaries(period=image_steps, key="image_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
 def _network_specific_callbacks(self):
     self.callbacks.append(
         ScheduledHyperParamSetter("learning_rate",
                                   [(1, 0.0002),
                                    (int(self.args.epochs / 2), 0.0001)]))
Esempio n. 10
0
def train(checkpoint_dir,
          model_name,
          dataset,
          num_epochs,
          quant_type,
          batch_size_per_gpu,
          lr=None,
          post_quantize_only=False):
    train_data, test_data, (img_shape,
                            label_shape) = datasets.DATASETS[dataset]()

    num_gpus = max(gpu.get_num_gpu(), 1)
    effective_batch_size = batch_size_per_gpu * num_gpus
    train_data = BatchData(train_data, batch_size_per_gpu)
    test_data = BatchData(test_data, batch_size_per_gpu, remainder=True)
    steps_per_epoch = len(train_data) // num_gpus

    if lr:
        if isinstance(lr, str):
            lr = ast.literal_eval(lr)
        if isinstance(lr, float):
            lr_schedule = [(0, lr)]
        else:
            lr_schedule = lr
    else:
        lr_schedule = [(0, 0.005), (8, 0.1), (25, 0.005), (30, 0)]

    if num_epochs is None:
        num_epochs = lr_schedule[-1][0]
    if post_quantize_only:
        start_quantising_at_epoch = 0
    else:
        start_quantising_at_epoch = lr_schedule[-2][0] if len(
            lr_schedule) > 1 else max(0, num_epochs - 5)

    logger.info(f"Training with LR schedule: {str(lr_schedule)}")
    logger.info(f"Quantising at epoch {start_quantising_at_epoch}")

    # train_data = FakeData([(batch_size_per_gpu,) + img_shape, (batch_size_per_gpu, ) + label_shape])

    model_func, input_spec, output_spec = get_model_func(
        "train",
        model_name,
        quant_type,
        img_shape,
        num_classes=label_shape[0],
        quant_delay=steps_per_epoch * start_quantising_at_epoch)
    target_spec = [
        tf.TensorSpec(t.shape, t.dtype, name=t.name.split("/")[-1] + "_target")
        for t in output_spec
    ]
    model = KerasModel(get_model=model_func,
                       input_signature=input_spec,
                       target_signature=target_spec,
                       input=train_data,
                       trainer=SyncMultiGPUTrainerParameterServer(
                           num_gpus, ps_device='gpu'))

    lr = tf.get_variable('learning_rate',
                         initializer=lr_schedule[0][1],
                         trainable=False)
    tf.summary.scalar('learning_rate-summary', lr)
    model.compile(optimizer=tf.train.MomentumOptimizer(learning_rate=lr,
                                                       momentum=0.9),
                  loss="categorical_crossentropy",
                  metrics=["categorical_accuracy"])

    model.fit(steps_per_epoch=steps_per_epoch,
              max_epoch=num_epochs,
              callbacks=[
                  ModelSaver(max_to_keep=1, checkpoint_dir=checkpoint_dir),
                  DataParallelInferenceRunner(
                      test_data, ScalarStats(model._stats_to_inference),
                      num_gpus),
                  ScheduledHyperParamSetter('learning_rate',
                                            lr_schedule,
                                            interp="linear"),
                  StatMonitorParamSetter('learning_rate',
                                         'validation_categorical_accuracy',
                                         lambda x: x / 2,
                                         threshold=0.001,
                                         last_k=10,
                                         reverse=True)
              ],
              session_init=SaverRestore(checkpoint_dir + "/checkpoint")
              if post_quantize_only else None)
Esempio n. 11
0
    if save_dir is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_dir)

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=CifarResNet(n=NUM_UNITS,
                          mult_decay=mult_decay,
                          lr_init=lr_base * 0.1),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(
                dataset_test,
                [ScalarStats('cost'),
                 ClassificationError('wrong_vector')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, lr_base), (82, lr_base * 0.1),
                                       (123, lr_base * 0.01),
                                       (164, lr_base * 0.002)])
        ],
        max_epoch=200,
        session_init=SmartInit(args.load),
    )
    num_gpu = max(get_num_gpu(), 1)
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(num_gpu))