Example #1
import tempfile

import torch
from torch.multiprocessing import spawn

# run_single, config_to_json, and PyTextConfig are defined in the
# surrounding PyText module.


def train_model_distributed(config):
    assert (config.use_cuda_if_available and
            torch.cuda.is_available()) or config.distributed_world_size == 1, (
                "distributed training is only available for GPU training")
    assert (
        config.distributed_world_size == 1
        or not config.task.__class__.__name__ == "DisjointMultitask.Config"
    ), "Distributed training currently not supported for DisjointMultitask"
    assert (config.distributed_world_size == 1
            or config.distributed_world_size <= torch.cuda.device_count()), (
                f"Only {torch.cuda.device_count()} GPUs are available, "
                "{config.distributed_world_size} GPUs were requested")

    print(
        f"\n=== Starting training, World size is {config.distributed_world_size}"
    )
    if not config.use_cuda_if_available or not torch.cuda.is_available():
        run_single(0, config_to_json(PyTextConfig, config), 1, None)
    else:
        with tempfile.NamedTemporaryFile(delete=False,
                                         suffix=".dist_sync") as sync_file:
            dist_init_method = "file://" + sync_file.name
            spawn(
                run_single,
                (
                    config_to_json(PyTextConfig, config),
                    config.distributed_world_size,
                    dist_init_method,
                ),
                config.distributed_world_size,
            )
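
For reference, torch.multiprocessing.spawn invokes its target as fn(i, *args), where i is the process index it assigns, so run_single above is called as run_single(rank, config_json, world_size, dist_init_method). A minimal sketch of a spawn-compatible worker (the body is an assumption, not PyText's actual run_single):

import torch
import torch.distributed as dist

def run_single(rank, config_json, world_size, dist_init_method):
    # every spawned process joins the group via the shared sync file
    if world_size > 1:
        dist.init_process_group(
            backend="nccl",
            init_method=dist_init_method,
            rank=rank,
            world_size=world_size,
        )
        torch.cuda.set_device(rank)
    # ... deserialize config_json and run the training loop ...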
Example #2
# As in Example #1, this needs tempfile, torch, and torch.multiprocessing's
# spawn; Channel, prepare_task_metadata, run_single, config_to_json, and
# PyTextConfig come from the surrounding PyText module, and Optional/List
# from typing.
def train_model_distributed(config, metric_channels: Optional[List[Channel]]):
    assert (
        config.use_cuda_if_available and torch.cuda.is_available()
    ) or config.distributed_world_size == 1, (
        "distributed training is only available for GPU training"
    )
    assert (
        config.distributed_world_size == 1
        or config.distributed_world_size <= torch.cuda.device_count()
    ), (
        f"Only {torch.cuda.device_count()} GPUs are available, "
        "{config.distributed_world_size} GPUs were requested"
    )

    print(f"\n=== Starting training, World size is {config.distributed_world_size}")
    if not config.use_cuda_if_available or not torch.cuda.is_available():
        run_single(
            rank=0,
            config_json=config_to_json(PyTextConfig, config),
            world_size=1,
            dist_init_method=None,
            metadata=None,
            metric_channels=metric_channels,
        )
    else:
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".dist_sync"
        ) as sync_file:
            dist_init_method = "file://" + sync_file.name
            metadata = prepare_task_metadata(config)
            spawn(
                run_single,
                (
                    config_to_json(PyTextConfig, config),
                    config.distributed_world_size,
                    dist_init_method,
                    metadata,
                    [],
                ),
                config.distributed_world_size,
            )
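
Compared with Example #1, this variant computes prepare_task_metadata(config) once in the parent and ships the result to every spawned worker, and it forwards metric_channels only on the single-process path (spawned workers receive an empty list). The file:// init method works because every process opens the same on-disk rendezvous file; a minimal standalone sketch of that rendezvous, using the gloo backend so it also runs without GPUs:

import tempfile
import torch.distributed as dist
from torch.multiprocessing import spawn

def worker(rank, init_method, world_size):
    # all ranks meet at the shared file, then synchronize once
    dist.init_process_group(
        backend="gloo",
        init_method=init_method,
        rank=rank,
        world_size=world_size,
    )
    dist.barrier()
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    with tempfile.NamedTemporaryFile(delete=False, suffix=".dist_sync") as f:
        init_method = "file://" + f.name
    spawn(worker, (init_method, world_size), world_size)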
Example #3
# NOTE: this snippet begins mid-call; the callee and its leading arguments
# are truncated in the source.
                                       valid_loader,
                                       device,
                                       rank,
                                       writer,
                                       save_root=cfg.predicted_path)

    else:
        raise ValueError(f"wrong value for argument -mode: {args.mode}")

    writer = cfg.logger.write(writer)
    writer.close()

if __name__ == '__main__':

    args = get_args()

    if args.mode == 'test_topic':
        parts = ['train', 'test_topic']
    else:
        parts = ['train', 'valid']
    data = load_data(args.dataset, parts)

    # zip against the parts actually loaded so the labels match the datasets
    for prt, ds in zip(parts, data):
        cfg.logger.log(f"{prt} examples: {len(ds)}")

    cfg.logger.log('Main starting point...')
    if args.distributed:
        spawn(main, args=(cfg.n_devices, args, data), nprocs=cfg.n_devices)
    else:
        main(args=args, data=data)
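
Note that spawn calls its target as fn(rank, *args), so on the distributed path main receives (rank, cfg.n_devices, args, data) positionally, while the direct call above passes only keywords. The definition isn't shown; a signature with defaults for the single-process path is assumed:

def main(rank=0, n_devices=1, args=None, data=None):
    # rank is 0 on the direct path and the spawn-assigned
    # process index otherwise
    ...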
Example #4
    # NOTE: this snippet begins mid-function, apparently the tail of
    # merge_csvs (invoked below); the code that builds all_filenames (a list
    # of per-process CSV paths) is truncated in the source. It relies on
    # pandas (pd), os, and pathlib.Path.
    print(all_filenames)
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
    combined_csv['id'] = range(len(combined_csv))
    combined_csv.to_csv(f"data/{save_prefix}_{split}.csv", index=False)

    for file_path in all_filenames:
        os.remove(file_path)


if __name__ == '__main__':
    args = get_args()

    split = args.split
    save_prefix = args.save_prefix
    save_root = Path("models/summarization/data")
    root2data = Path(args.root2data)
    n_procs = args.n_procs

    if args.mode == "merge":
        merge_csvs(split, save_root, save_prefix)

    elif args.mode == "preprocess":
        papers_files = get_files(root2data, split, 'fragments')
        abstracts_files = get_files(root2data, split, 'abstracts')

        # assert n_procs >= len(papers_files)

        spawn(main,
              args=(n_procs, args, papers_files, abstracts_files),
              nprocs=min(n_procs, len(papers_files)))
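
The two modes are complementary: "preprocess" fans the input files out across processes, and "merge" concatenates the per-process CSVs (renumbering id) into one file. How main shards the files is not shown; a common pattern, assumed here, is round-robin slicing by rank, with each worker writing its own CSV for merge_csvs to pick up later:

def main(rank, n_procs, args, papers_files, abstracts_files):
    # hypothetical sharding: worker `rank` handles every n_procs-th file
    my_papers = papers_files[rank::n_procs]
    my_abstracts = abstracts_files[rank::n_procs]
    # ... preprocess the shard and write a per-rank CSV that the
    # "merge" mode later concatenates ...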