# Example #1
# 0
def do_setup_and_start_training(modules,
                                configs,
                                rank,
                                size,
                                device_list,
                                single=False):
    """Initialize training on one rank (optionally distributed) and run it.

    Parameters
    ----------
    modules : dict
        Factory callables keyed by ``'model'``, ``'memory'`` and ``'loss'``.
    configs : dict
        Matching configuration objects keyed by ``'model'``, ``'memory'``,
        ``'loss'`` and ``'trainer'``.
    rank : int
        This process's rank; also indexes into ``device_list``.
    size : int
        World size passed to ``torch.distributed``.
    device_list : sequence of int
        CUDA device ids, one per rank.
    single : bool
        When True, skip distributed setup and wrap the model in
        ``DummySharedWrapper`` instead of DistributedDataParallel.

    Returns
    -------
    Whatever ``Trainer.train()`` returns (training statistics).
    """
    if not single:
        # NOTE(review): hard-coded rendezvous address/port; this overwrites
        # any externally provided MASTER_ADDR / MASTER_PORT.
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29509'
        dist.init_process_group('nccl', rank=rank, world_size=size)

    try:
        with torch.cuda.device(device_list[rank]):
            print('initializing model on rank %d' % rank)
            if single:
                shared_model = DummySharedWrapper(modules['model'](
                    configs['model'])).cuda()
            else:
                shared_model = torch.nn.parallel.DistributedDataParallel(
                    modules['model'](configs['model']).cuda(),
                    device_ids=[device_list[rank]],
                    find_unused_parameters=True)
            model = DistributedWrapper(shared_model)
            memory = modules['memory'](configs['memory'])
            loss_function = modules['loss'](configs['loss'])
            trainer_config = configs['trainer']
            # Give every rank its own log file.
            trainer_config.log_path = trainer_config.log_path + '%d.log' % rank
            if rank > 0:
                # Only rank 0 writes checkpoints.
                trainer_config.save_frequency = 0
            trainer = Trainer(model, memory, loss_function, trainer_config)
            print('starting training process on rank %d' % rank)
            stats = trainer.train()
    finally:
        # Fix: release the NCCL process group even when training raises —
        # the original never called destroy_process_group() on any path.
        if not single and dist.is_initialized():
            dist.destroy_process_group()

    print('done %d' % rank)
    return stats
    # --- NOTE(review): UNREACHABLE CODE ---------------------------------
    # Everything below sits after the ``return stats`` above at the same
    # indentation, so it is still part of this function but can never run.
    # It references names not defined in this scope (``output_folder``,
    # ``args``, ``actor_config``, plus ``model``/``loss_function`` from the
    # dead branch) and reads like a separate training script pasted in by
    # mistake.  Kept verbatim; delete it or move it to its own function.
    trainer_config = TrainerConfig()
    trainer_config.log_path = os.path.join(output_folder, 'training_log.log')

    trainer = Trainer(model, ReplayPILDataset(MemoryConfigPIL()),
                      loss_function, trainer_config)

    print('Running instance segmentation only pre-training')

    # Stage 1: instance-segmentation-only pre-training.
    actor_config.instance_only = True
    loss_function.config.instance_only = True
    trainer_config.checkpoint_path = os.path.join(
        output_folder, args.checkpoint_prefix + 'inst_only_')

    # Mass-prediction head disabled during pre-training.
    model.toggle_mass_head(False)

    trainer.train()

    print('Training with force prediction')

    # Stage 2: full training including force/mass prediction.
    actor_config.instance_only = False
    loss_function.config.instance_only = False
    trainer_config.checkpoint_path = os.path.join(output_folder,
                                                  args.checkpoint_prefix)
    # Schedules are called with (episode, episodes); presumably they return
    # per-episode step counts -- TODO confirm against Trainer.
    trainer_config.update_schedule = lambda episode, episodes: int(
        15 + 20 * episode / episodes)
    trainer_config.poking_schedule = lambda episode, episodes: 10
    # Fresh replay memory for the second stage.
    trainer.memory = ReplayPILDataset(MemoryConfigPIL())

    model.toggle_mass_head(True)
    # Detection net frozen here; ``unfreeze = 100`` presumably re-enables it
    # after 100 episodes -- TODO confirm against Trainer implementation.
    model.toggle_detection_net(False)
    trainer_config.unfreeze = 100