def train_model_distributed(config):
    assert (
        config.use_cuda_if_available and torch.cuda.is_available()
    ) or config.distributed_world_size == 1, (
        "distributed training is only available for GPU training"
    )
    assert (
        config.distributed_world_size == 1
        or not config.task.__class__.__name__ == "DisjointMultitask.Config"
    ), "Distributed training currently not supported for DisjointMultitask"
    assert (
        config.distributed_world_size == 1
        or config.distributed_world_size <= torch.cuda.device_count()
    ), (
        f"Only {torch.cuda.device_count()} GPUs are available, "
        f"{config.distributed_world_size} GPUs were requested"
    )
    print(f"\n=== Starting training, World size is {config.distributed_world_size}")
    if not config.use_cuda_if_available or not torch.cuda.is_available():
        # No usable GPU: fall back to a single in-process run.
        run_single(0, config_to_json(PyTextConfig, config), 1, None)
    else:
        # Rendezvous the spawned workers through a shared sync file.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".dist_sync"
        ) as sync_file:
            dist_init_method = "file://" + sync_file.name
            spawn(
                run_single,
                (
                    config_to_json(PyTextConfig, config),
                    config.distributed_world_size,
                    dist_init_method,
                ),
                config.distributed_world_size,
            )
def train_model_distributed(config, metric_channels: Optional[List[Channel]]):
    assert (
        config.use_cuda_if_available and torch.cuda.is_available()
    ) or config.distributed_world_size == 1, (
        "distributed training is only available for GPU training"
    )
    assert (
        config.distributed_world_size == 1
        or config.distributed_world_size <= torch.cuda.device_count()
    ), (
        f"Only {torch.cuda.device_count()} GPUs are available, "
        f"{config.distributed_world_size} GPUs were requested"
    )
    print(f"\n=== Starting training, World size is {config.distributed_world_size}")
    if not config.use_cuda_if_available or not torch.cuda.is_available():
        # Single-process path: run in-process without distributed init.
        run_single(
            rank=0,
            config_json=config_to_json(PyTextConfig, config),
            world_size=1,
            dist_init_method=None,
            metadata=None,
            metric_channels=metric_channels,
        )
    else:
        # Rendezvous the spawned workers through a file:// init method.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".dist_sync"
        ) as sync_file:
            dist_init_method = "file://" + sync_file.name
            metadata = prepare_task_metadata(config)
            spawn(
                run_single,
                (
                    config_to_json(PyTextConfig, config),
                    config.distributed_world_size,
                    dist_init_method,
                    metadata,
                    [],
                ),
                config.distributed_world_size,
            )
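# Both train_model_distributed variants pass run_single to spawn without a
# rank: torch.multiprocessing.spawn calls fn(i, *args) for each i in
# range(nprocs), so the rank arrives as the first positional argument and the
# args tuple supplies the rest. A minimal, self-contained sketch of that
# contract (the worker name and values below are illustrative, not taken from
# the code above):
import torch.multiprocessing as mp


def worker(rank, world_size, dist_init_method):
    # rank is injected by spawn; world_size and dist_init_method come from args.
    print(f"worker {rank}/{world_size} rendezvous at {dist_init_method}")


if __name__ == "__main__":
    mp.spawn(worker, args=(2, "file:///tmp/dist_sync"), nprocs=2)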
            valid_loader, device, rank, writer, save_root=cfg.predicted_path)
    else:
        raise Exception(f"wrong value for argument -mode: {args.mode}")
    writer = cfg.logger.write(writer)
    writer.close()


if __name__ == '__main__':
    args = get_args()
    if args.mode == 'test_topic':
        parts = ['train', 'test_topic']
    else:
        parts = ['train', 'valid']
    data = load_data(args.dataset, parts)
    # Label each loaded dataset by the part it was loaded from.
    for prt, ds in zip(parts, data):
        cfg.logger.log(f"{prt} examples: {len(ds)}")
    cfg.logger.log('Main starting point...')
    if args.distributed:
        spawn(main, args=(cfg.n_devices, args, data), nprocs=cfg.n_devices)
    else:
        main(args=args, data=data)
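# main is called two ways above: spawn(main, args=(cfg.n_devices, args, data),
# nprocs=cfg.n_devices) prepends the worker rank, giving main(rank, n_devices,
# args, data), while the non-distributed branch calls main(args=args,
# data=data) directly. A signature sketch that satisfies both call sites (the
# defaults and body are assumptions, not taken from the script above):
import torch


def main(rank=0, n_devices=1, args=None, data=None):
    # Each spawned worker binds to the GPU matching its rank; the direct
    # call falls back to rank 0 (or CPU when CUDA is unavailable).
    device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(f"process {rank}/{n_devices} using {device}")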
    print(all_filenames)
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
    # Re-index the merged rows so ids are unique across the combined file.
    combined_csv['id'] = range(len(combined_csv))
    combined_csv.to_csv(f"data/{save_prefix}_{split}.csv", index=False)
    # Remove the per-worker shards once they have been merged.
    for file_path in all_filenames:
        os.remove(file_path)


if __name__ == '__main__':
    args = get_args()
    split = args.split
    save_prefix = args.save_prefix
    save_root = Path("models/summarization/data")
    root2data = Path(args.root2data)
    n_procs = args.n_procs
    if args.mode == "merge":
        merge_csvs(split, save_root, save_prefix)
    elif args.mode == "preprocess":
        papers_files = get_files(root2data, split, 'fragments')
        abstracts_files = get_files(root2data, split, 'abstracts')
        # assert n_procs >= len(papers_files)
        spawn(
            main,
            args=(n_procs, args, papers_files, abstracts_files),
            nprocs=min(n_procs, len(papers_files)),
        )
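# In the preprocess branch, spawn again injects the worker rank as the first
# argument, so the spawned function sees (rank, n_procs, args, papers_files,
# abstracts_files). A common pattern with nprocs=min(n_procs, len(papers_files))
# is to shard the file lists by rank so every worker gets disjoint work; the
# sketch below (process_shard is a hypothetical helper, not the main above)
# shows a round-robin shard:
def process_shard(rank, n_procs, papers_files, abstracts_files):
    # Worker `rank` takes every n_procs-th file from each list.
    for paper, abstract in zip(papers_files[rank::n_procs],
                               abstracts_files[rank::n_procs]):
        print(f"worker {rank}: {paper} -> {abstract}")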