def do_setup_and_start_training(modules, configs, rank, size, device_list, single=False):
    """Initialize one training process (rank) and run training to completion.

    Args:
        modules: dict of factories with keys 'model', 'memory', 'loss'.
        configs: dict of matching configs, plus a 'trainer' config object.
        rank: this process's rank within the group (0-based).
        size: total world size (number of processes).
        device_list: CUDA device index per rank; this process uses
            ``device_list[rank]``.
        single: when True, skip process-group setup and wrap the model in
            ``DummySharedWrapper`` instead of DDP.

    Returns:
        Whatever ``Trainer.train()`` returns (training statistics).
    """
    if not single:
        # NCCL rendezvous point — every rank must use the same address/port.
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29509'
        dist.init_process_group('nccl', rank=rank, world_size=size)

    with torch.cuda.device(device_list[rank]):
        print('initializing model on rank %d' % rank)
        net = modules['model'](configs['model'])
        if single:
            shared = DummySharedWrapper(net).cuda()
        else:
            shared = torch.nn.parallel.DistributedDataParallel(
                net.cuda(),
                device_ids=[device_list[rank]],
                find_unused_parameters=True)
        model = DistributedWrapper(shared)

        replay = modules['memory'](configs['memory'])
        criterion = modules['loss'](configs['loss'])

        run_cfg = configs['trainer']
        # Per-rank log file. NOTE(review): this mutates the shared config
        # object in place, so a second call would append the suffix again.
        run_cfg.log_path = run_cfg.log_path + '%d.log' % rank
        if rank > 0:
            # Only rank 0 is allowed to write checkpoints.
            run_cfg.save_frequency = 0

        trainer = Trainer(model, replay, criterion, run_cfg)
        print('starting training process on rank %d' % rank)
        stats = trainer.train()
        print('done %d' % rank)
        return stats
# ---- Shared trainer used for both training stages below ----
trainer_config = TrainerConfig()
trainer_config.log_path = os.path.join(output_folder, 'training_log.log')
trainer = Trainer(model, ReplayPILDataset(MemoryConfigPIL()), loss_function,
                  trainer_config)

# Stage 1: pre-train with instance segmentation only.
# Mass head is disabled; checkpoints get the 'inst_only_' prefix.
print('Running instance segmentation only pre-training')
actor_config.instance_only = True
loss_function.config.instance_only = True
trainer_config.checkpoint_path = os.path.join(
    output_folder, args.checkpoint_prefix + 'inst_only_')
model.toggle_mass_head(False)
trainer.train()

# Stage 2: full training with force prediction.
# The same trainer/config objects are reused and mutated in place.
print('Training with force prediction')
actor_config.instance_only = False
loss_function.config.instance_only = False
trainer_config.checkpoint_path = os.path.join(output_folder,
                                              args.checkpoint_prefix)
# Schedules take (episode, episodes) and return an int count per episode.
# presumably update count ramps 15 -> 35 over training — TODO confirm units.
trainer_config.update_schedule = lambda episode, episodes: int(
    15 + 20 * episode / episodes)
trainer_config.poking_schedule = lambda episode, episodes: 10
# Fresh replay buffer for stage 2; mass head on, detection net frozen.
trainer.memory = ReplayPILDataset(MemoryConfigPIL())
model.toggle_mass_head(True)
model.toggle_detection_net(False)
# NOTE(review): unfreeze appears to be an episode index at which frozen
# parts are re-enabled — verify against Trainer's implementation.
trainer_config.unfreeze = 100