Example #1
def train(args, logdir):

    # model
    # model = Net1()

    # dataflow
    # df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True))

    train_conf = TrainConfig(
        # model=model,
        # data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        # max_epoch=hp.train1.num_epochs,
        # steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
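
As scraped, this example leaves `model`, `data`, `max_epoch`, and `steps_per_epoch` commented out, so `TrainConfig` would raise at construction, and nothing ever launches the training. A minimal completion sketch, assuming the commented-out `Net1`/`Net1DataFlow` pieces, the `hp` hyperparameters, and the `launch_train_with_config`/`SimpleTrainer` pattern used by the sibling examples:

# Hedged completion sketch; every name below is assumed from the other examples.
model = Net1()
df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)
train_conf = TrainConfig(
    model=model,
    data=QueueInput(df(n_prefetch=1000, n_thread=4)),
    callbacks=[ModelSaver(checkpoint_dir=logdir)],
    max_epoch=hp.train1.num_epochs,
    steps_per_epoch=hp.train1.steps_per_epoch,
    session_config=session_conf,
)
launch_train_with_config(train_conf, trainer=SimpleTrainer())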
Example #2
def get_config():
    M = Model()

    dataflow = data_io
    from tensorpack.callbacks.base import Callback

    class CBSyncWeight(Callback):
        def _before_run(self, ctx):
            if self.local_step % 10 == 0:
                return [M._sync_op_pred]

    import functools
    from tensorpack.train.config import TrainConfig
    from tensorpack.callbacks.saver import ModelSaver
    from tensorpack.callbacks.graph import RunOp
    from tensorpack.callbacks.param import ScheduledHyperParamSetter, HumanHyperParamSetter, HyperParamSetterWithFunc
    from tensorpack.tfutils import sesscreate
    from tensorpack.tfutils.common import get_default_sess_config
    import tensorpack.tfutils.symbolic_functions as symbf

    sigma_beta_steering = symbf.get_scalar_var('actor/sigma_beta_steering',
                                               0.3,
                                               summary=True,
                                               trainable=False)
    sigma_beta_accel = symbf.get_scalar_var('actor/sigma_beta_accel',
                                            0.3,
                                            summary=True,
                                            trainable=False)

    return TrainConfig(
        model=M,
        data=dataflow,
        callbacks=[
            ModelSaver(),
            HyperParamSetterWithFunc(
                'learning_rate/actor',
                functools.partial(M._calc_learning_rate, 'actor')),
            HyperParamSetterWithFunc(
                'learning_rate/critic',
                functools.partial(M._calc_learning_rate, 'critic')),

            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            # HumanHyperParamSetter('learning_rate'),
            # HumanHyperParamSetter('entropy_beta'),
            ScheduledHyperParamSetter('actor/sigma_beta_accel', [(1, 0.2),
                                                                 (2, 0.01)]),
            ScheduledHyperParamSetter('actor/sigma_beta_steering',
                                      [(1, 0.1), (2, 0.01)]),
            CBSyncWeight(),
            data_io,
            # PeriodicTrigger(Evaluator(
            #     EVAL_EPISODE, ['state'], ['policy'], get_player),
            #     every_k_epochs=3),
        ] + evaluators,
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
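
`CBSyncWeight` works because tensorpack merges whatever `_before_run` returns into the fetches of the next `session.run`. A minimal standalone sketch of the same pattern; `sync_op` is a hypothetical op name standing in for `M._sync_op_pred`:

import tensorflow as tf
from tensorpack.callbacks.base import Callback

class PeriodicSync(Callback):
    # Hedged sketch: piggyback an extra op onto every 10th training step.
    def _setup_graph(self):
        # 'sync_op' is a placeholder name; look up whatever op the model built.
        self._op = tf.get_default_graph().get_operation_by_name('sync_op')

    def _before_run(self, ctx):
        if self.local_step % 10 == 0:
            return [self._op]  # merged into this step's fetches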
Example #3
def main(cfg):
    print(cfg)
    
    tf.reset_default_graph()
    
    logger.set_logger_dir('tflogs', action='d')

    copyfile(hydra.utils.to_absolute_path('model.py'), 'model.py')
    copyfile(hydra.utils.to_absolute_path('dataflow.py'), 'dataflow.py')
    
    if cfg.cat_name == 'smpl':
        train_df = SMPLDataFlow(cfg, True, 1000)
        val_df = VisSMPLDataFlow(cfg, True, 1000, port=1080)
    else:
        train_df = ShapeNetDataFlow(cfg, cfg.data.train_txt, True)
        val_df = VisDataFlow(cfg, cfg.data.val_txt, False, port=1080)
    
    config = TrainConfig(
        model=Model(cfg),
        dataflow=BatchData(PrefetchData(train_df, cpu_count() // 2, cpu_count() // 2), cfg.batch_size),
        callbacks=[
            ModelSaver(),
            SimpleMovingAverage(['recon_loss', 'GAN/loss_d', 'GAN/loss_g', 'GAN/gp_loss', 'symmetry_loss'], 100),
            PeriodicTrigger(val_df, every_k_steps=30)
        ],
        monitors=tensorpack.train.DEFAULT_MONITORS() + [ScalarPrinter(enable_step=True, enable_epoch=False)],
        max_epoch=10
    )
    launch_train_with_config(config, SimpleTrainer())
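
The `dataflow` argument above follows tensorpack's usual recipe: parallelize the raw flow first, then batch. The same pipeline, sketched with the pieces named (the batch size of 32 is illustrative):

from multiprocessing import cpu_count
from tensorpack.dataflow import BatchData, PrefetchData

# Hedged sketch: PrefetchData forks worker processes that keep a buffer of
# datapoints filled; BatchData then groups single points into batches.
par_df = PrefetchData(train_df, nr_prefetch=cpu_count() // 2,
                      nr_proc=cpu_count() // 2)
batched_df = BatchData(par_df, 32)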
Example #4
def train(args, logdir1, logdir2):
    # model
    model = Net2()

    preprocessing(data_path, logdir2)

    # dataflow
    df = Net2DataFlow(data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    # session_conf = tf.ConfigProto(
    #     gpu_options=tf.GPUOptions(
    #         allow_growth=True,
    #         per_process_gpu_memory_fraction=0.6,
    #     ),
    # )

    dataset_size = len(glob.glob(data_path + '/wav/*.wav'))
    print("\t\data_path : ", data_path)
    print("\t\tDataset Size : ", dataset_size)
    print("\t\tBatch Size : ", hp.train2.batch_size)
    print("\t\tSteps per epoch : ", (dataset_size // hp.train2.batch_size))
    from time import sleep
    sleep(10)

    session_inits = []
    ckpt2 = '{}/{}'.format(
        logdir2,
        args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=dataset_size // hp.train2.batch_size,
        session_init=ChainInit(session_inits))
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        gpu_list = list(map(int, args.gpu.split(',')))
        trainer = SyncMultiGPUTrainerReplicated(gpu_list)
        # trainer = AsyncMultiGPUTrainer(gpu_list, False)
    else:
        # gpu_list would be undefined here, so fall back to a single-GPU trainer
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
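
Note that `SyncMultiGPUTrainerReplicated` is handed `gpu_list` here but an integer (`hp.*.num_gpu`) in other examples; both forms appear across this listing. A two-line sketch of the two call styles:

trainer = SyncMultiGPUTrainerReplicated(2)       # tower count, as with hp.train2.num_gpu
trainer = SyncMultiGPUTrainerReplicated([0, 1])  # explicit GPU ids, as with gpu_list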
Example #5
def train(args, logdir):

    # model
    model = Net1()

    preprocessing(data_path)
    preprocessing(test_path)

    # dataflow
    df = Net1DataFlow(data_path, hp.train1.batch_size)
    df_test = Net1DataFlow(test_path, hp.train1.batch_size)

    #datas = df.get_data()
    #print(datas[1])
    # set logger for event and model saver
    logger.set_logger_dir(logdir)
    #session_conf = tf.ConfigProto(
    #    gpu_options=tf.GPUOptions(
    #        allow_growth=True,
    #    ),)

    # cv test code
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=hp.train1.batch_size * 10, n_thread=1)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(
                df_test(n_prefetch=1),
                ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        #session_config=session_conf
    )
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    num_gpu = hp.train1.num_gpu

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        num_gpu = len(args.gpu.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
Example #6
def train(args, logdir1, logdir2):
    # model
    model = Net2()

    # dataflow
    df = Net2DataFlow(hp.train2.data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    session_conf = tf.ConfigProto(
        # log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(
            # allow_growth=True,
            per_process_gpu_memory_fraction=0.6,
        ),
    )

    session_inits = []
    ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(session_inits),
        session_config=session_conf
    )
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    #trainer = SyncMultiGPUTrainerParameterServer(hp.train2.num_gpu)
    trainer = SimpleTrainer()
    launch_train_with_config(train_conf, trainer=trainer)
Example #7
def train(args, logdir):

    # model
    print("####model")
    model = Net1()

    # dataflow
    print("####dataflow")
    df = Net1DataFlow(hp.Train1.data_path, hp.Train1.batch_size)

    # set logger for event and model saver
    print("####logger")
    logger.set_logger_dir(logdir)

    print("####session_conf")
    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True),
        allow_soft_placement=True)

    print("####train_conf")
    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=5)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.Train1.num_epochs,
        steps_per_epoch=hp.Train1.steps_per_epoch,
        session_config=session_conf)
    print("####ckpt")
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    print("####trainer")
    trainer = SyncMultiGPUTrainerReplicated(hp.Train1.num_gpu)

    print("####launch_train_with_config")
    launch_train_with_config(train_conf, trainer=trainer)
Example #8
def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    TIMIT_TRAIN_WAV = 'TIMIT/TRAIN/*/*/*.npz'
    TIMIT_TEST_WAV = 'TIMIT/TEST/*/*/*.npz'

    print(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV))
    print(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV))

    df = Net1DataFlow(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV), hp.train1.batch_size)
    df_test = Net1DataFlow(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV), hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)
    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(df_test(n_prefetch=1),
                            ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        #session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if hp.default.use_gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = hp.default.gpu_list
        train_conf.nr_tower = len(hp.default.gpu_list.split(','))
        num_gpu = len(hp.default.gpu_list.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
Example #9
def train(args, logdir):

    # model
    model = Net()

    # dataflow
    df = NetDataFlow(hp.train.data_path, hp.train.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),)
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.45  # cap this process at 45% of GPU memory

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)

    launch_train_with_config(train_conf, trainer=trainer)
Example #10
def train(args, logdir2):
    # model
    model = Net2()

    # dataflow
    df = Net2DataFlow(hp.train2.mel_path, hp.train2.ppgs_path,
                      hp.train2.batch_size)
    session_inits = []
    ckpt2 = '{}/{}'.format(
        logdir2,
        args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    # ckpt1 = tf.train.latest_checkpoint(logdir1)
    # if ckpt1:
    #     session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(session_inits))
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train2.num_gpu)
    print("strated trainer")
    launch_train_with_config(train_conf, trainer=trainer)
Example #11
    model = IAFVocoder(batch_size=hp.train.batch_size, length=hp.signal.length)

    # dataset
    dataset = Dataset(hp.data_path,
                      hp.train.batch_size,
                      length=hp.signal.length)
    print('dataset size is {}'.format(len(dataset.wav_files)))

    # set logger for event and model saver
    logger.set_logger_dir(hp.logdir)

    train_conf = TrainConfig(
        model=model,
        data=TFDatasetInput(dataset()),
        callbacks=[
            ModelSaver(checkpoint_dir=hp.logdir),
            RunUpdateOps()  # for batch norm, exponential moving average
            # TODO GenerateCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
    )
    ckpt = '{}/{}'.format(
        hp.logdir, ckpt) if ckpt else tf.train.latest_checkpoint(hp.logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, gpu))
        train_conf.nr_tower = len(gpu)
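
The fragment stops after configuring the towers and never starts training. A hedged completion, assuming the script otherwise follows the same shape as the other examples:

# Hedged sketch: single-process launch; the multi-GPU examples in this
# listing pass SyncMultiGPUTrainerReplicated instead of SimpleTrainer.
launch_train_with_config(train_conf, trainer=SimpleTrainer())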
Example #12
def get_config():
    M = Model()

    name_base = str(uuid.uuid1())[:6]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR',
                              '/tmp/.ipcpipe').rstrip('/')
    if not os.path.exists(PIPE_DIR):
        os.makedirs(PIPE_DIR)
    else:
        os.system('rm -f {}/sim-*'.format(PIPE_DIR))
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    # AgentTorcs * SIMULATOR_PROC, AgentReplay * SIMULATOR_PROC
    procs = [
        MySimulatorWorker(k, namec2s, names2c)
        for k in range(SIMULATOR_PROC * 2)
    ]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, M)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)

    class CBSyncWeight(Callback):
        def _after_run(self, ctx, _):
            if self.local_step > 1 and self.local_step % SIMULATOR_PROC == 0:
                # print("before step ",self.local_step)
                return [M._td_sync_op]

        def _before_run(self, ctx):
            if self.local_step % 10 == 0:
                return [M._sync_op, M._td_sync_op]
            if self.local_step % SIMULATOR_PROC == 0 and 0:  # 'and 0' keeps this branch disabled
                return [M._td_sync_op]

    import functools
    return TrainConfig(
        model=M,
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            HyperParamSetterWithFunc(
                'learning_rate/actor',
                functools.partial(M._calc_learning_rate, 'actor')),
            HyperParamSetterWithFunc(
                'learning_rate/critic',
                functools.partial(M._calc_learning_rate, 'critic')),

            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            # HumanHyperParamSetter('learning_rate'),
            # HumanHyperParamSetter('entropy_beta'),
            # ScheduledHyperParamSetter('actor/sigma_beta_accel', [(1, 0.2), (2, 0.01), (3, 1e-3), (4, 1e-4)]),
            # ScheduledHyperParamSetter('actor/sigma_beta_steering', [(1, 0.1), (2, 0.01), (3, 1e-3), (4, 1e-4)]),
            master,
            StartProcOrThread(master),
            CBSyncWeight(),
            # CBTDSyncWeight()
            # PeriodicTrigger(Evaluator(
            #     EVAL_EPISODE, ['state'], ['policy'], get_player),
            #     every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
Example #13
    audio_meta = AudioMeta(hp.train.data_path)
    if args.remote:
        df = get_remote_dataflow(args.port, hp.train.batch_size)
    else:
        df = DataLoader(audio_meta, hp.train.batch_size).dataflow(
            nr_prefetch=5000,
            nr_thread=int(multiprocessing.cpu_count() // 1.5))

    # set logger for event and model saver
    logger.set_logger_dir(hp.logdir)
    if True:  # placeholder switch kept from the original
        train_conf = TrainConfig(
            model=ClassificationModel(num_classes=audio_meta.num_speaker,
                                      **hp.model),
            data=FlexibleQueueInput(df, capacity=500),
            callbacks=[ModelSaver(checkpoint_dir=hp.logdir),
                       EvalCallback()],
            steps_per_epoch=hp.train.steps_per_epoch,
            # session_config=session_config
        )

        ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(
            hp.logdir)
        if ckpt and not args.r:
            train_conf.session_init = SaverRestore(ckpt)

        if args.gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
            train_conf.nr_tower = len(args.gpu.split(','))

        trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)
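
As elsewhere, the config still has to be handed to the trainer; the fragment ends just before that call. A one-line hedged completion:

launch_train_with_config(train_conf, trainer=trainer)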