def main():
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
                                  normalize, logger, it, num_vis, tag='val_images')
            del loss_dict, gloss, gout
        else:
            break

    save_epoch = opts.save_epoch
    if epoch % save_epoch == 0 and epoch > save_epoch - 1 and logging:
        print('Saving checkpoint')
        utils.save_model(
            os.path.join(opts.log_dir, 'model' + str(epoch) + '.pt'),
            epoch, netG, netD, opts)
        utils.save_optim(
            os.path.join(opts.log_dir, 'optim' + str(epoch) + '.pt'),
            epoch, optG_temporal, optG_graphic, optD)


if __name__ == '__main__':
    parser = config.init_parser()
    opts, args = parser.parse_args(sys.argv)
    if opts.num_gpu > 1:
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '8888'
        mp.spawn(train_gamegan, nprocs=opts.num_gpu, args=(opts, ))
    else:
        train_gamegan(opts.gpu, opts)
def train_from_folder(
    data = './data',
    results_dir = './results',
    models_dir = './models',
    name = 'default',
    new = False,
    load_from = -1,
    image_size = 128,
    network_capacity = 16,
    fmap_max = 512,
    transparent = False,
    batch_size = 5,
    gradient_accumulate_every = 6,
    num_train_steps = 150000,
    learning_rate = 2e-4,
    lr_mlp = 0.1,
    ttur_mult = 1.5,
    rel_disc_loss = False,
    num_workers = None,
    save_every = 1000,
    evaluate_every = 1000,
    generate = False,
    num_generate = 1,
    generate_interpolation = False,
    interpolation_num_steps = 100,
    save_frames = False,
    num_image_tiles = 8,
    trunc_psi = 0.75,
    mixed_prob = 0.9,
    fp16 = False,
    no_pl_reg = False,
    cl_reg = False,
    fq_layers = [],
    fq_dict_size = 256,
    attn_layers = [],
    no_const = False,
    aug_prob = 0.,
    aug_types = ['translation', 'cutout'],
    top_k_training = False,
    generator_top_k_gamma = 0.99,
    generator_top_k_frac = 0.5,
    dataset_aug_prob = 0.,
    multi_gpus = False,
    calculate_fid_every = None,
    calculate_fid_num_images = 12800,
    clear_fid_cache = False,
    seed = 42,
    log = False
):
    model_args = dict(
        name = name,
        results_dir = results_dir,
        models_dir = models_dir,
        batch_size = batch_size,
        gradient_accumulate_every = gradient_accumulate_every,
        image_size = image_size,
        network_capacity = network_capacity,
        fmap_max = fmap_max,
        transparent = transparent,
        lr = learning_rate,
        lr_mlp = lr_mlp,
        ttur_mult = ttur_mult,
        rel_disc_loss = rel_disc_loss,
        num_workers = num_workers,
        save_every = save_every,
        evaluate_every = evaluate_every,
        num_image_tiles = num_image_tiles,
        trunc_psi = trunc_psi,
        fp16 = fp16,
        no_pl_reg = no_pl_reg,
        cl_reg = cl_reg,
        fq_layers = fq_layers,
        fq_dict_size = fq_dict_size,
        attn_layers = attn_layers,
        no_const = no_const,
        aug_prob = aug_prob,
        aug_types = cast_list(aug_types),
        top_k_training = top_k_training,
        generator_top_k_gamma = generator_top_k_gamma,
        generator_top_k_frac = generator_top_k_frac,
        dataset_aug_prob = dataset_aug_prob,
        calculate_fid_every = calculate_fid_every,
        calculate_fid_num_images = calculate_fid_num_images,
        clear_fid_cache = clear_fid_cache,
        mixed_prob = mixed_prob,
        log = log
    )

    if generate:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        for num in tqdm(range(num_generate)):
            model.evaluate(f'{samples_name}-{num}', num_image_tiles)
        print(f'sample images generated at {results_dir}/{name}/{samples_name}')
        return

    if generate_interpolation:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        model.generate_interpolation(samples_name, num_image_tiles,
                                     num_steps = interpolation_num_steps, save_frames = save_frames)
        print(f'interpolation generated at {results_dir}/{name}/{samples_name}')
        return

    world_size = torch.cuda.device_count()
    if world_size == 1 or not multi_gpus:
        run_training(0, 1, model_args, data, load_from, new, num_train_steps, name, seed)
        return

    mp.spawn(run_training,
             args=(world_size, model_args, data, load_from, new, num_train_steps, name, seed),
             nprocs=world_size,
             join=True)
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--num_gpus', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--interval', type=int, default=1)
    parser.add_argument('--video_title', type=str, default='flow')
    parser.add_argument('--cmap', type=str, default='viridis')
    parser.add_argument('--fps', type=float, default=50)
    parser.add_argument(
        '--cascade', default=False, action="store_true",
        help="Whether to show intermediate layers in visualisation.")
    args = parser.parse_args()

    assert isinstance(args.num_gpus, int)
    assert args.num_gpus > 0 and args.num_gpus <= torch.cuda.device_count()

    if args.num_gpus == 1:
        train(**args.__dict__)
    else:
        # torch multiprocessing
        mp.spawn(
            train_distributed,  # train function specifically for DDP
            args=(args.batch_size, args.num_gpus, args.epochs, args.interval,
                  args.video_title, args.cmap, args.fps, args.cascade),
            nprocs=args.num_gpus,
            join=True)
def global_update(self, state, lr, E=1):
    """Execute one round of serial global update"""
    self._send(state)
    mp.spawn(self._client_update, (lr, E), nprocs=self.client_count)
    self._recv()
    return self._fed_avg(), sum(self.losses) / self.client_count
            visuals = OrderedDict([('synthesized_image', trainer.get_latest_generated()),
                                   ('real_image', data_i['image'])])
            visualizer.display_current_results(visuals, epoch, iter_counter.total_steps_so_far)

            if rank == 0:
                print('saving the latest model (epoch %d, total_steps %d)' %
                      (epoch, iter_counter.total_steps_so_far))
                trainer.save('latest')
                iter_counter.record_current_iter()

        trainer.update_learning_rate(epoch)
        iter_counter.record_epoch_end()

        if (epoch % opt.save_epoch_freq == 0 or epoch == iter_counter.total_epochs) and (rank == 0):
            print('saving the model at the end of epoch %d, iters %d' %
                  (epoch, iter_counter.total_steps_so_far))
            trainer.save(epoch)

    print('Training was successfully finished.')


if __name__ == '__main__':
    global TrainOptions
    TrainOptions = TrainOptions()
    opt = TrainOptions.parse(save=True)
    opt.world_size = opt.num_gpu
    opt.mpdist = True
    mp.set_start_method('spawn', force=True)

    mp.spawn(main_worker, nprocs=opt.world_size, args=(opt.world_size, opt))
    model = UGATITPlus(args)
    model.train()
    model.cuda()
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    model.G_A = torch.nn.parallel.DistributedDataParallel(model.G_A, device_ids=[rank])
    model.G_B = torch.nn.parallel.DistributedDataParallel(model.G_B, device_ids=[rank])
    model.D_A = torch.nn.parallel.DistributedDataParallel(model.D_A, device_ids=[rank])
    model.D_B = torch.nn.parallel.DistributedDataParallel(model.D_B, device_ids=[rank])

    dataset = UnpairDataset(args)
    sample = torch.utils.data.distributed.DistributedSampler(dataset)
    train_loader = DataLoader(dataset, args.batchsize, num_workers=args.worker, sampler=sample)

    trainer = Train(model, train_loader, sample, args)
    trainer.train()


if __name__ == '__main__':
    args = cfgs
    port_id = 10000 + np.random.randint(0, 5000)
    args.dist_url = 'tcp://127.0.0.1:' + str(port_id)
    args.gpus_num = torch.cuda.device_count()
    mp.spawn(main_worker, nprocs=args.gpus_num, args=(args, ))
def start_predicting(self, trainer):
    mp.spawn(self.new_process, **self.mp_spawn_kwargs)
parser.add_argument('--project', default='albert-train', type=str, required=False,
                    help='project name for wandb')

args = parser.parse_args()
args.save = os.path.abspath(
    os.path.join(os.getcwd(), "save", f'{get_model_filename(args)}.pth'))

for arg in vars(args):
    print(arg, getattr(args, arg))

if torch.cuda.is_available():
    args.n_gpu = torch.cuda.device_count() if args.gpu is None else 1
else:
    args.n_gpu = 0

set_seed(args)

if not os.path.exists(os.path.dirname(args.save)):
    os.makedirs(os.path.dirname(args.save))

if 1 < args.n_gpu:
    # noinspection PyTypeChecker
    mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu, join=True)
else:
    train_model(rank=0 if args.gpu is None else args.gpu, world_size=args.n_gpu, args=args)
    training_dbs = [datasets[dataset](config["db"], split=train_split, sys_config=system_config)
                    for _ in range(workers)]
    validation_db = datasets[dataset](config["db"], split=val_split, sys_config=system_config)

    if rank == 0:
        print("system config...")
        pprint.pprint(system_config.full)

        print("db config...")
        pprint.pprint(training_dbs[0].configs)

        print("len of db: {}".format(len(training_dbs[0].db_inds)))
        print("distributed: {}".format(args.distributed))

    train(training_dbs, validation_db, system_config, model, args)


if __name__ == "__main__":
    args = parse_args()

    distributed = args.distributed
    world_size = args.world_size

    if distributed and world_size < 0:
        raise ValueError("world size must be greater than 0 in distributed training")

    ngpus_per_node = torch.cuda.device_count()
    if distributed:
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        main(None, ngpus_per_node, args)
def _test_model_impl(
        self, mode, name, model, eager_quantizable_model,
        check_with_eager=True,
        diff_of_quant=None,
        diff_from_eager=None):
    if diff_of_quant is None or diff_from_eager is None:
        diff_of_quant = {}
        diff_from_eager = {}

    if mode not in diff_of_quant or mode not in diff_from_eager:
        diff_of_quant[mode] = {}
        diff_from_eager[mode] = {}

    input_tensor = torch.rand(1, 3, 224, 224)
    input_tensor_inception = torch.rand(1, 3, 299, 299)
    output_value = torch.randint(0, 1, (1,))

    # print('quantizing:', name, ' mode:', mode)
    if name == 'inception_v3':
        input_value = input_tensor_inception
    else:
        input_value = input_tensor

    qconfig = default_qconfig if mode == 'static' else default_qat_qconfig
    qconfig_dict = {'': qconfig}
    graph_module = symbolic_trace(model)
    # print('graph module:', graph_module.src)
    script = torch.jit.script(graph_module)

    # make sure graph module and script module are both runnable
    original_out = graph_module(input_value)
    is_not_tuple_out = not isinstance(original_out, tuple)
    script_out = script(input_value)
    self.assertEqual(
        (original_out - script_out).abs().max(), 0,
        'Result of original graph module and script module does not match')

    # set to train just before quantization
    if mode != 'static':
        model.train()

    graph_module = fuse_fx(graph_module)
    prepared = prepare_fx(graph_module, qconfig_dict)

    if mode == 'ddp':
        mp.spawn(run_ddp,
                 args=(world_size, prepared),
                 nprocs=world_size,
                 join=True)
    elif mode == 'qat':
        assert prepared.training, 'prepared must be in training mode for qat'
        optimizer = torch.optim.SGD(prepared.parameters(), lr=0.0001)
        criterion = nn.CrossEntropyLoss()
        train_one_epoch(prepared, criterion, optimizer,
                        [(input_value, output_value)], torch.device('cpu'), 1)
    else:
        for i in range(10):
            prepared(input_value)

    # print('after observation root:', prepared.root)

    qgraph = convert_fx(prepared)
    # print('after quantization root:', qgraph.root)
    # print('after quantization code:', qgraph.src)
    qgraph.eval()
    qgraph_script = torch.jit.script(qgraph)
    # print('quantized and scripted:', qgraph_script.graph)

    qgraph_out = qgraph(input_value)
    qgraph_script = qgraph_script(input_value)

    if is_not_tuple_out:
        diff_of_quant[mode][name] = (original_out - qgraph_out).abs().max()
        assert torch.allclose(qgraph_out, qgraph_script), 'graph, scripted graph'
    else:
        print('tuple output')

    if eager_quantizable_model is not None:
        # comparing to eager mode quantization
        qeager = eager_quantizable_model
        ref_out = qeager(input_value)
        qeager.qconfig = qconfig
        if mode == 'static':
            qeager.fuse_model()
            prepare(qeager, inplace=True)
        else:
            qeager.train()
            qeager.fuse_model()
            prepare_qat(qeager, inplace=True)

        # calibration
        if mode == 'ddp':
            mp.spawn(run_ddp,
                     args=(world_size, qeager),
                     nprocs=world_size,
                     join=True)
        elif mode == 'qat':
            assert qeager.training, 'qeager should be in training mode for qat'
            optimizer = torch.optim.SGD(qeager.parameters(), lr=0.0001)
            train_one_epoch(qeager, criterion, optimizer,
                            [(input_value, output_value)], torch.device('cpu'), 1)
        else:
            for i in range(10):
                qeager(input_value)

        # print('ref after observation:', qeager)

        convert(qeager, inplace=True)
        qeager.eval()

        # print('ref after quantization:', qeager)
        qeager_out = qeager(input_value)
        qeager_script = torch.jit.script(qeager)
        qscript_out = qeager_script(input_value)

        if is_not_tuple_out:
            diff_from_eager[mode][name] = (qeager_out - qgraph_out).abs().max()
            if check_with_eager:
                self.assertEqual(diff_from_eager[mode][name], 0,
                                 'Result of graph mode quantization and ' +
                                 'eager mode quantization on model: ' + name +
                                 ' should match. Mode: ' + mode +
                                 ' diff:' + str(diff_from_eager[mode][name]))
            smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                     total_reward * 0.1)
            logger.info("Process {} Episode {} total reward={:.2f}".format(
                rank, episode, smoothed_total_reward))

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")
                    # will cause torch RPC to complain
                    # since other processes may have not finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while dqn_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            dqn_apex.update()


if __name__ == "__main__":
    # spawn 4 sub processes
    # Process 0 and 1 will be workers(samplers)
    # Process 2 and 3 will be learners
    spawn(main, nprocs=4)
def train_from_folder(
    data='./data',
    results_dir='./results',
    models_dir='./models',
    name='default',
    new=False,
    load_from=-1,
    image_size=256,
    optimizer='adam',
    fmap_max=512,
    transparent=False,
    greyscale=False,
    batch_size=10,
    gradient_accumulate_every=4,
    num_train_steps=150000,
    learning_rate=2e-4,
    save_every=1000,
    evaluate_every=1000,
    generate=False,
    generate_types=['default', 'ema'],
    generate_interpolation=False,
    aug_test=False,
    aug_prob=None,
    aug_types=['cutout', 'translation'],
    dataset_aug_prob=0.,
    attn_res_layers=[32],
    freq_chan_attn=False,
    disc_output_size=1,
    dual_contrast_loss=False,
    antialias=False,
    interpolation_num_steps=100,
    save_frames=False,
    num_image_tiles=None,
    num_workers=None,
    multi_gpus=False,
    calculate_fid_every=None,
    calculate_fid_num_images=12800,
    clear_fid_cache=False,
    seed=42,
    amp=False,
    show_progress=False,
):
    num_image_tiles = default(num_image_tiles, 4 if image_size > 512 else 8)

    model_args = dict(name=name,
                      results_dir=results_dir,
                      models_dir=models_dir,
                      batch_size=batch_size,
                      gradient_accumulate_every=gradient_accumulate_every,
                      attn_res_layers=cast_list(attn_res_layers),
                      freq_chan_attn=freq_chan_attn,
                      disc_output_size=disc_output_size,
                      dual_contrast_loss=dual_contrast_loss,
                      antialias=antialias,
                      image_size=image_size,
                      num_image_tiles=num_image_tiles,
                      optimizer=optimizer,
                      num_workers=num_workers,
                      fmap_max=fmap_max,
                      transparent=transparent,
                      greyscale=greyscale,
                      lr=learning_rate,
                      save_every=save_every,
                      evaluate_every=evaluate_every,
                      aug_prob=aug_prob,
                      aug_types=cast_list(aug_types),
                      dataset_aug_prob=dataset_aug_prob,
                      calculate_fid_every=calculate_fid_every,
                      calculate_fid_num_images=calculate_fid_num_images,
                      clear_fid_cache=clear_fid_cache,
                      amp=amp)

    if generate:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        checkpoint = model.checkpoint_num
        dir_result = model.generate(samples_name, num_image_tiles, checkpoint, generate_types)
        print(f'sample images generated at {dir_result}')
        return

    if generate_interpolation:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        model.generate_interpolation(samples_name, num_image_tiles,
                                     num_steps=interpolation_num_steps, save_frames=save_frames)
        print(f'interpolation generated at {results_dir}/{name}/{samples_name}')
        return

    if show_progress:
        model = Trainer(**model_args)
        model.show_progress(num_images=num_image_tiles, types=generate_types)
        return

    if aug_test:
        DiffAugmentTest(data=data, image_size=image_size, batch_size=batch_size,
                        types=aug_types, nrow=num_image_tiles)
        return

    world_size = torch.cuda.device_count()
    if world_size == 1 or not multi_gpus:
        run_training(0, 1, model_args, data, load_from, new, num_train_steps, name, seed)
        return

    mp.spawn(run_training,
             args=(world_size, model_args, data, load_from, new, num_train_steps, name, seed),
             nprocs=world_size,
             join=True)
def _train_procedure(self, net, n_idx, retraining, keep_ratio, s_idx=0, r_idx=0):
    # the parameters
    steps_per_epoch = len(self.train_loader)
    params = self.retrain_params if retraining else self.train_params

    # check the file names we need
    file_name_net = self._get_net_name(net, n_idx, retraining, keep_ratio, s_idx, r_idx, False)
    file_name_check = self._get_net_name(net, n_idx, retraining, keep_ratio, s_idx, r_idx, True)
    file_name_rewind = self._get_net_name(
        net, n_idx, retraining, keep_ratio, s_idx, r_idx, False, False, True,
    )
    file_name_best = self._get_net_name(
        net, n_idx, retraining, keep_ratio, s_idx, r_idx,
        get_checkpoint=False, get_best=True,
    )

    # get test metrics assembled
    metrics_test = get_test_metrics(params)

    # set up the train logger
    # doing this before returning with pre-trained net is important so that
    # we don't have old data stored in the train logger.
    if self._train_logger is not None:
        self._train_logger.initialize(
            net_class_name=type(net).__name__,
            is_retraining=retraining,
            num_epochs=params["numEpochs"],
            steps_per_epoch=steps_per_epoch,
            early_stop_epoch=params["earlyStopEpoch"],
            metrics_test=metrics_test,
            n_idx=n_idx,
            r_idx=r_idx,
            s_idx=s_idx,
        )

    # check if network is already pretrained and done. then we can return
    found_trained_net, _ = load_checkpoint(
        file_name_net,
        net,
        train_logger=self._train_logger,
        loc=str(next(net.parameters()).device),
    )
    if found_trained_net:
        print("Loading pre-trained network...")
        return

    # retrieve net handle
    if hasattr(net, "compressed_net"):
        net_handle = net.compressed_net
    else:
        net_handle = net

    # enable grad computations
    torch.set_grad_enabled(True)

    # empty gpu cache to make sure everything is ready for retraining
    torch.cuda.empty_cache()

    # register sparsity pattern before retraining
    if retraining:
        net_handle.register_sparsity_pattern()

    args = (
        self.num_gpus,
        self.train_loader.num_workers,
        net_handle,
        retraining,
        self.train_loader.dataset,
        self.valid_loader.dataset,
        self.train_loader.collate_fn,
        params,
        self._train_logger,
        file_name_check,
        file_name_rewind,
        file_name_best,
    )

    # setup torch.distributed and spawn processes
    if not retraining or net.retrainable:
        if self.num_gpus > 1:
            os.environ["MASTER_ADDR"] = "127.0.0.1"
            os.environ["MASTER_PORT"] = "12355"
            mp.spawn(train_with_worker, nprocs=self.num_gpus, args=args)
        else:
            train_with_worker(0, *args)

    # disable grad computations
    torch.set_grad_enabled(False)

    # load result into this net here
    load_checkpoint(
        file_name_check,
        net_handle,
        train_logger=self._train_logger,
        loc=str(next(net_handle.parameters()).device),
    )

    # then overwrite with early stopping checkpoint (net only, no logger!)
    found_best, epoch_best = load_checkpoint(
        file_name_best,
        net_handle,
        loc=str(next(net_handle.parameters()).device),
    )
    if found_best:
        print(f"Loaded early stopping checkpoint from epoch: {epoch_best}")

    # store full net as well
    save_checkpoint(file_name_net, net, params["numEpochs"], self._train_logger)

    # delete checkpoint to save storage
    delete_checkpoint(file_name_check)
    delete_checkpoint(file_name_best)
    cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus / task')
    cluster.add_slurm_cmd(cmd='account', value='def-training-wa_gpu', comment='account')
    cluster.add_slurm_cmd(cmd='reservation', value='hackathon-wr_gpu', comment='reservation')
    cluster.add_slurm_cmd(cmd='mem-per-cpu', value='10g', comment='memory per CPU')

    # Notify job status
    cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

    # Set job options
    cluster.per_experiment_nb_gpus = 1
    cluster.per_experiment_nb_nodes = 1
    cluster.job_time = '3:00:00'

    # Run models on cluster
    cluster.optimize_parallel_cluster_cpu(
        mp.spawn(train, nprocs=args.gpus, args=(args, )),
        nb_trials=20,
        job_name='first_tt_batch',
        job_display_name='CNN_ddp_tune')
def start_training(self, trainer):
    mp.spawn(self.new_process, **self.mp_spawn_kwargs)

    # reset optimizers, since main process is never used for training and thus does not have a valid optim state
    trainer.optimizers = []
def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn, args=(world_size, ), nprocs=world_size, join=True)
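For context, a minimal sketch of a worker function that run_demo could spawn; the function body, backend, address, and port below are assumptions, not part of the snippet above.

# Hypothetical demo_fn: mp.spawn calls it as demo_fn(rank, world_size),
# passing the process index as the first argument.
import os
import torch.distributed as dist

def demo_fn(rank, world_size):
    # rendezvous settings are assumed here; real code may set them elsewhere
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=rank, world_size=world_size)
    # ... per-process work goes here (e.g. wrap a model in DistributedDataParallel) ...
    dist.destroy_process_group()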
        # loss
        batch_loss = cross_entropy_loss(y_pred, y)
        epoch_loss = loss_container(batch_loss.item())

        # metrics: top-1,5 error
        epoch_acc = raw_metric(y_pred, y)

    # end of validation
    epoch_loss = torch.tensor(epoch_loss).cuda()
    epoch_acc = torch.tensor(epoch_acc).cuda()
    dist.all_reduce(epoch_loss, op=dist.ReduceOp.SUM)
    dist.all_reduce(epoch_acc, op=dist.ReduceOp.SUM)
    epoch_loss = epoch_loss.item() / ngpus_per_node
    epoch_acc = epoch_acc.cpu().numpy() / ngpus_per_node

    logs['val_{}'.format(loss_container.name)] = epoch_loss
    logs['val_{}'.format(raw_metric.name)] = epoch_acc

    end_time = time.time()
    batch_info = 'Val Loss {:.4f}, Val Acc ({:.2f})'.format(epoch_loss, epoch_acc[0])

    # write log for this epoch
    if local_rank == 0:
        logging.info('Valid: {}, Time {:3.2f}'.format(batch_info, end_time - start_time))
        logging.info('')


if __name__ == '__main__':
    ngpus_per_node = torch.cuda.device_count()
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, ''))
    cleanup_nccl()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # training settings
    parser.add_argument('--num_gpus', type=int, default=1)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--batch_size', type=int, default=4096)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--interval', type=int, default=2)
    parser.add_argument('--save', type=int, default=10)
    parser.add_argument('--num_workers', type=int, default=4,
                        help="Number of workers for dataloader.")
    parser.add_argument('--num_basis', type=int, default=100,
                        help="Number of SVD reduced basis elements to use")
    parser.add_argument('--verbose', default=False, action="store_true")
    # parser.add_argument('--profile', default=False, action="store_true")
    args = parser.parse_args()

    assert isinstance(args.num_gpus, int), "num_gpus argument must be an integer."
    assert args.num_gpus > 0 and args.num_gpus <= torch.cuda.device_count(), \
        f"{args.num_gpus} not a valid number of GPU devices."

    # data distributed parallel
    mp.spawn(
        train,
        args=tuple(args.__dict__.values()),  # assumes parser loaded in correct order
        nprocs=args.num_gpus,
        join=True
    )
    print(('Validation ||' + (' %s: %.3f |' * len(losses)) + ')') % tuple(loss_labels), flush=True)


def compute_validation_map(yolact_net, dataset):
    with torch.no_grad():
        yolact_net.eval()

        logger = logging.getLogger("yolact.eval")
        logger.info("Computing validation mAP (this may take a while)...")

        eval_script.evaluate(yolact_net, dataset, train_mode=True, train_cfg=cfg)
        yolact_net.train()


def setup_eval():
    eval_script.parse_args([
        '--no_bar',
        '--fast_eval',
        '--max_images=' + str(args.validation_size)
    ])


if __name__ == '__main__':
    if args.num_gpus is None:
        args.num_gpus = torch.cuda.device_count()

    if args.num_gpus > 1:
        mp.spawn(train, nprocs=args.num_gpus, args=(args, ), daemon=False)
    else:
        train(0, args=args)
def main():
    ngpus_per_node = torch.cuda.device_count()
    worldsize = ngpus_per_node
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, ))
def main():
    world_size = torch.cuda.device_count()
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = FLAGS.port
    mp.spawn(train, nprocs=world_size, args=(world_size, FLAGS), join=True)
def fit(self,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None):
    r"""
    Runs the full optimization routine.

    Args:
        model: Model to fit.

        train_dataloader: A Pytorch DataLoader with training samples. If the model has
            a predefined train_dataloader method this will be skipped.

        val_dataloaders: Either a single Pytorch Dataloader or a list of them, specifying validation samples.
            If the model has a predefined val_dataloaders method this will be skipped

    Example::

        # Option 1,
        # Define the train_dataloader() and val_dataloader() fxs
        # in the lightningModule
        # RECOMMENDED FOR MOST RESEARCH AND APPLICATIONS TO MAINTAIN READABILITY
        trainer = Trainer()
        model = LightningModule()
        trainer.fit(model)

        # Option 2
        # in production cases we might want to pass different datasets to the same model
        # Recommended for PRODUCTION SYSTEMS
        train, val = DataLoader(...), DataLoader(...)
        trainer = Trainer()
        model = LightningModule()
        trainer.fit(model, train_dataloader=train, val_dataloaders=val)

        # Option 1 & 2 can be mixed, for example the training set can be
        # defined as part of the model, and validation can then be fed to .fit()
    """
    # bind logger and other properties
    model.logger = self.logger
    self.copy_trainer_model_properties(model)

    # clean hparams
    if hasattr(model, 'hparams'):
        parsing.clean_namespace(model.hparams)

    # set up the passed in dataloaders (if needed)
    self.__attach_dataloaders(model, train_dataloader, val_dataloaders)

    # check that model is configured correctly
    self.check_model_configuration(model)

    # callbacks
    self.on_fit_start()
    if self.is_function_implemented('on_fit_start'):
        model.on_fit_start()

    # on multi-gpu jobs we only want to manipulate (download, etc) on node_rank=0, local_rank=0
    # or in the case where each node needs to do its own manipulation in which case just local_rank=0
    if self.can_prepare_data():
        model.prepare_data()
        self._is_data_prepared = True

    # Run auto batch size scaling
    if self.auto_scale_batch_size:
        if isinstance(self.auto_scale_batch_size, bool):
            self.auto_scale_batch_size = 'power'
        self.scale_batch_size(model, mode=self.auto_scale_batch_size)
        model.logger = self.logger  # reset logger binding

    # Run learning rate finder:
    if self.auto_lr_find:
        self._run_lr_finder_internally(model)
        model.logger = self.logger  # reset logger binding

    # route to appropriate start method
    # when using multi-node or DDP within a node start each module in a separate process
    if self.use_ddp2:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])

        # torchelastic or general non_slurm ddp2
        elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ):
            task = int(os.environ['LOCAL_RANK'])

        self.ddp_train(task, model)

    elif self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)

        # torchelastic or general non_slurm ddp
        elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ):
            task = int(os.environ['LOCAL_RANK'])
            self.ddp_train(task, model)

        elif self.distributed_backend == 'cpu_ddp':
            self.__set_random_port()
            self.model = model
            mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model, ))

        elif self.distributed_backend == 'ddp_spawn':
            model.share_memory()

            # spin up peers
            mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model, ))

        elif self.distributed_backend == 'ddp':
            self.spawn_ddp_children(model)

    # 1 gpu or dp option triggers training using DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.dp_train(model)

    elif self.use_horovod:
        self.horovod_train(model)

    elif self.single_gpu:
        self.single_gpu_train(model)

    elif self.use_tpu:  # pragma: no-cover
        rank_zero_info(f'training on {self.tpu_cores} TPU cores')

        # COLAB_GPU is an env var available by default in Colab environments.
        start_method = 'fork' if self.on_colab_kaggle else 'spawn'

        # track for predict
        self.model = model

        # train
        if self.tpu_id is not None:
            self.tpu_train(self.tpu_id, model)
        else:
            xmp.spawn(self.tpu_train, args=(model, ), nprocs=self.tpu_cores, start_method=start_method)

        # load weights if not interrupted
        self.load_spawn_weights(model)
        self.model = model

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException(
                'amp + cpu is not supported. Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

        self.run_pretrain_routine(model)

    # callbacks
    self.on_fit_end()

    # model hooks
    if self.is_function_implemented('on_fit_end'):
        model.on_fit_end()

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
    workers = []
    for i in range(num_workers):
        color = colours[i % len(colours)]
        worker_id = "worker_{}".format(i + 1)
        worker_network = ActorCriticModel(image_dim, color_dim, n_classes, C, lr,
                                          cpu_device, weight_repository)
        worker = Worker(worker_id, env_name, worker_network, n_steps, gamma, color, step_queue)
        workers.append(worker)

    print("Running {} Workers".format(num_workers))

    threads = []
    for worker in workers:
        t = mp.spawn(run, args=(worker, ), nprocs=1, join=False, daemon=False, start_method='spawn')
        threads.append(t)

    steps = []
    losses = []
    batch_size = 1028
    while True:
        worker_steps = step_queue.get()

        if len(steps) > batch_size:
            states = []
            actions = []
            rewards = []
            for step in steps:
        if hasattr(args, 'writer') and args.writer:
            args.writer.add_scalar("Loss/train", train_loss, epoch)
            args.writer.add_scalar("Accu/test", test_accu1, epoch)
            args.writer.add_scalar("Misc/learning_rate", lr, epoch)


if __name__ == '__main__':
    args = parser.parse_args()

    output_prefix = f"ssl_{args.ssl}_{args.arch}"
    output_prefix += "/session_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    if not hasattr(args, 'output_dir'):
        args.output_dir = args.data_dir
    args.output_dir = os.path.join(args.output_dir, output_prefix)
    os.makedirs(args.output_dir)
    print("=> results will be saved to {}".format(args.output_dir))

    args = dist.init_distributed_mode(args)
    if args.mp_dist:
        if args.world_size > args.ngpus:
            print(f"Training with {args.world_size // args.ngpus} nodes, "
                  f"waiting until all nodes join before starting training")
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main, args=(args, ), nprocs=args.ngpus, join=True)
    else:
        main(args.gpu, args)
def run_training(training_fn, nprocs, *args):
    mp.spawn(fn=training_fn, args=(nprocs, *args), nprocs=nprocs, join=True)
if __name__ == '__main__':
    opt = get_opt()

    opt.device = torch.device('cpu' if opt.no_cuda else 'cuda')
    if not opt.no_cuda:
        cudnn.benchmark = True
    if opt.accimage:
        torchvision.set_image_backend('accimage')

    opt.ngpus_per_node = torch.cuda.device_count()
    if opt.distributed:
        opt.world_size = opt.ngpus_per_node * opt.world_size
        mp.spawn(main_worker, nprocs=opt.ngpus_per_node, args=(opt,))
    else:
        main_worker(-1, opt)

'''
# for rate pretrain and fine tune
python main_rate.py \
    --label_path /scr-ssd/enguyen/slowed_clips_0.5x/frames_fps16/meta/video_metadata.json \
    --video_id_path /scr-ssd/enguyen/432_meta/clip_ids_split_merged.json \
    --frame_dir /scr-ssd/enguyen/slowed_clips_0.5x/frames_fps16/ \
    --image_size 224 \
    --result_path /vision2/u/enguyen/results/rate_pred/run5_stoch_window24 \
    --dataset cpr_rate \
    --n_classes 2 \
def run_distributed(fn, config, args):
    try:
        mp.spawn(fn, args=(config, args), nprocs=args.n_gpus, join=True)
    except:
        cleanup()
        dist.init_process_group(
            backend="gloo", rank=rank, world_size=2)

        # Initialize RPC.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options)

        # Trainer just waits for RPCs from master.
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options)
        # parameter server do nothing
        pass

    # block until all rpcs finish
    rpc.shutdown()


if __name__ == "__main__":
    # 2 trainers, 1 parameter server, 1 master.
    world_size = 4
    mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True)
# END run_worker
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu", action="store_true", help="If set, we only use CPU.")
    parser.add_argument("--single_gpu", action="store_true", help="If set, we only use single GPU.")
    parser.add_argument("--fp16", action="store_true", help="If set, we will use fp16.")

    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )

    # environment arguments
    parser.add_argument('-s', '--seed', default=1, type=int, metavar='N',
                        help='manual random seed')
    parser.add_argument('-n', '--num_nodes', default=1, type=int, metavar='N',
                        help='number of nodes')
    parser.add_argument('-g', '--gpus_per_node', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--node_rank', default=0, type=int,
                        help='ranking within the nodes')

    # experiments specific arguments
    parser.add_argument('--debug_mode',
                        action='store_true',
                        dest='debug_mode',
                        help='weather this is debug mode or normal')

    parser.add_argument(
        "--model_class_name",
        type=str,
        help="Set the model class of the experiment.",
    )

    parser.add_argument(
        "--experiment_name",
        type=str,
        help="Set the name of the experiment. [model_name]/[data]/[task]/[other]",
    )

    parser.add_argument("--save_prediction",
                        action='store_true',
                        dest='save_prediction',
                        help='Do we want to save prediction')

    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')

    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=16,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=64,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )

    parser.add_argument("--max_length", default=160, type=int,
                        help="Max length of the sequences.")

    parser.add_argument("--warmup_steps", default=-1, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--learning_rate", default=1e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")

    parser.add_argument(
        "--eval_frequency",
        default=1000,
        type=int,
        help="set the evaluation frequency, evaluate every X global step.",
    )

    parser.add_argument("--train_data", type=str,
                        help="The training data used in the experiments.")
    parser.add_argument("--train_weights", type=str,
                        help="The training data weights used in the experiments.")
    parser.add_argument("--eval_data", type=str,
                        help="The training data used in the experiments.")

    args = parser.parse_args()

    if args.cpu:
        args.world_size = 1
        train(-1, args)
    elif args.single_gpu:
        args.world_size = 1
        train(0, args)
    else:  # distributed multiGPU training
        #########################################################
        args.world_size = args.gpus_per_node * args.num_nodes  #
        # os.environ['MASTER_ADDR'] = '152.2.142.184'  # This is the IP address for nlp5
        # maybe we will automatically retrieve the IP later.
        os.environ['MASTER_PORT'] = '88888'  #
        mp.spawn(train, nprocs=args.gpus_per_node, args=(args, ))  # spawn how many process in this node
    parser.add_argument("--eval_model_name",
                        default=None,
                        type=str,
                        help="The filename of the model to be loaded from the directory specified in --model_dir")
    parser.add_argument('--mp', '-mp',
                        action='store_true',
                        help="Multiprocessing option")

    args = parser.parse_args()
    procs = []
    use_mp = args.mp
    for split in args.split_schemes:
        flags = Flags()
        args_dict = args.__dict__
        for arg in args_dict:
            setattr(flags, arg, args_dict[arg])
        setattr(flags, "cv", True if flags.fold_num > 2 else False)
        setattr(flags, "views", [(cv, pv) for cv, pv in zip(args.comp_view, args.prot_view)])
        flags['split'] = split
        flags['predict_cold'] = split == 'cold_drug_target'
        flags['cold_drug'] = split == 'cold_drug'
        flags['cold_target'] = split == 'cold_target'
        flags['cold_drug_cluster'] = split == 'cold_drug_cluster'
        flags['split_warm'] = split == 'warm'

        if use_mp:
            p = mp.spawn(fn=main, args=(flags,), join=False)
            procs.append(p)
            # p.start()
        else:
            main(0, flags)

    for proc in procs:
        proc.join()