Code Example #1
        def train_loop_per_worker():
            import pandas as pd

            rank = train.world_rank()
            data_shard = train.get_dataset_shard("train")
            start = time.perf_counter()
            num_epochs, num_batches, num_bytes = 0, 0, 0
            batch_delays = []

            print("Starting train loop on worker", rank)
            while time.perf_counter() - start < runtime_seconds:
                num_epochs += 1
                batch_start = time.perf_counter()
                for batch in data_shard.iter_batches(
                    prefetch_blocks=prefetch_blocks, batch_size=batch_size
                ):
                    batch_delay = time.perf_counter() - batch_start
                    batch_delays.append(batch_delay)
                    num_batches += 1
                    if isinstance(batch, pd.DataFrame):
                        num_bytes += int(
                            batch.memory_usage(index=True, deep=True).sum()
                        )
                    elif isinstance(batch, np.ndarray):
                        num_bytes += batch.nbytes
                    else:
                        # NOTE: This isn't recursive and will just return the size of
                        # the object pointers if the batch is a list of non-primitive types.
                        num_bytes += sys.getsizeof(batch)
                    train.report(
                        bytes_read=num_bytes,
                        num_batches=num_batches,
                        num_epochs=num_epochs,
                        batch_delay=batch_delay,
                    )
                    batch_start = time.perf_counter()
            delta = time.perf_counter() - start

            print("Time to read all data", delta, "seconds")
            print(
                "P50/P95/Max batch delay (s)",
                np.quantile(batch_delays, 0.5),
                np.quantile(batch_delays, 0.95),
                np.max(batch_delays),
            )
            print("Num epochs read", num_epochs)
            print("Num batches read", num_batches)
            print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
            print(
                "Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s"
            )

            if rank == 0:
                print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
Code Example #2
            def with_sampler(loader):
                # Automatically set the DistributedSampler

                # If you're using a sampler, the DataLoader shuffle flag must be set to
                # False. Shuffling is instead determined by the shuffle argument passed
                # to the DistributedSampler constructor.

                # If no sampler is passed to the DataLoader constructor, Torch
                # constructs a default sampler. The default sampler is a RandomSampler
                # if shuffling is enabled and a SequentialSampler otherwise. DataLoader
                # does not have a shuffle attribute, so we instead identify whether
                # shuffling is enabled by checking the default sampler type.
                shuffle = not isinstance(loader.sampler, SequentialSampler)

                def seeded_worker_init_fn(worker_init_fn):
                    def wrapper(worker_id):
                        worker_seed = torch.initial_seed() % 2**32
                        np.random.seed(worker_seed)
                        random.seed(worker_seed)
                        worker_init_fn(worker_id)

                    return wrapper

                worker_init_fn = loader.worker_init_fn
                generator = loader.generator
                if self._seed is not None:
                    worker_init_fn = seeded_worker_init_fn(
                        loader.worker_init_fn)
                    generator = torch.Generator()
                    generator.manual_seed(self._seed)

                using_default_sampler = isinstance(
                    loader.sampler, (SequentialSampler, RandomSampler))
                if not using_default_sampler and train.world_rank() == 0:
                    logger.warn(
                        f"The {loader.sampler.__class__.__name__} will be overwritten "
                        "with a DistributedSampler. You can disable this by setting "
                        "`with_sampler` to False in `prepare_data_loader`.")

                data_loader_args = {
                    "dataset": loader.dataset,
                    "batch_size": loader.batch_size,
                    "shuffle": False,
                    "num_workers": loader.num_workers,
                    "collate_fn": loader.collate_fn,
                    "pin_memory": loader.pin_memory,
                    "drop_last": loader.drop_last,
                    "timeout": loader.timeout,
                    "worker_init_fn": worker_init_fn,
                    "generator": generator,
                    "sampler": DistributedSampler(loader.dataset,
                                                  shuffle=shuffle),
                }
                return DataLoader(**data_loader_args)
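The helper above amounts to rebuilding the DataLoader with its own shuffle flag off and a DistributedSampler that takes over shuffling. A standalone sketch of that transformation, with an illustrative TensorDataset (note that DistributedSampler reads the rank and world size from torch.distributed unless num_replicas and rank are passed explicitly):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.randn(128, 4), torch.randn(128, 1))
loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Shuffling moves from the DataLoader into the sampler.
dist_loader = DataLoader(
    dataset,
    batch_size=loader.batch_size,
    shuffle=False,
    sampler=DistributedSampler(dataset, shuffle=True),
)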
Code Example #3
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = train.get_dataset_shard("train")
    validation_dataset = train.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if train.world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                    device)
        else:
            result = {}
        train.report(**result)
        results.append(result)
        train.save_checkpoint(model=model)

    return results
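To run this train_func, the driver passes datasets under the names "train" and "validation" so get_dataset_shard can find them. A minimal sketch, assuming the legacy ray.train.Trainer API and an illustrative synthetic dataset:

import ray
from ray.train import Trainer

train_ds = ray.data.from_items([{"x": float(x), "y": 2.0 * x} for x in range(200)])
val_ds = ray.data.from_items([{"x": float(x), "y": 2.0 * x} for x in range(20)])

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3},
    dataset={"train": train_ds, "validation": val_ds},
)
trainer.shutdown()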
Code Example #4
File: check_ingest.py Project: tchordia/ray
        def train_loop_per_worker():
            rank = train.world_rank()
            data_shard = train.get_dataset_shard("train")
            start = time.perf_counter()
            num_epochs, num_batches, num_bytes = 0, 0, 0
            batch_delays = []

            print("Starting train loop on worker", rank)
            while time.perf_counter() - start < runtime_seconds:
                num_epochs += 1
                batch_start = time.perf_counter()
                for batch in data_shard.iter_batches(
                        prefetch_blocks=prefetch_blocks,
                        batch_size=batch_size):
                    batch_delay = time.perf_counter() - batch_start
                    batch_delays.append(batch_delay)
                    num_batches += 1
                    num_bytes += int(
                        batch.memory_usage(index=True, deep=True).sum())
                    train.report(
                        bytes_read=num_bytes,
                        num_batches=num_batches,
                        num_epochs=num_epochs,
                        batch_delay=batch_delay,
                    )
                    batch_start = time.perf_counter()
            delta = time.perf_counter() - start

            print("Time to read all data", delta, "seconds")
            print(
                "P50/P95/Max batch delay (s)",
                np.quantile(batch_delays, 0.5),
                np.quantile(batch_delays, 0.95),
                np.max(batch_delays),
            )
            print("Num epochs read", num_epochs)
            print("Num batches read", num_batches)
            print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
            print("Mean throughput", round(num_bytes / (1024 * 1024) / delta,
                                           2), "MiB/s")

            if rank == 0:
                print("Ingest stats from rank=0:\n\n{}".format(
                    data_shard.stats()))
Code Example #5
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to setup DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local enviroment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added. Aside from duplicating
    # functionality with our callbacks, the Wandb callback causes training
    # to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )
    trainer.train(resume_from_checkpoint=checkpoint_path)
    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
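The loop above is driven by a user-supplied trainer_init_per_worker that builds a transformers.Trainer from the already-sharded datasets. A sketch of such a factory; the model name and TrainingArguments values are placeholders, and note that the loop rejects "steps" for the evaluation, save, and logging strategies:

import transformers


def trainer_init_per_worker(train_dataset, eval_dataset, **config):
    # Placeholder model and arguments; adjust to the actual task.
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        config.get("model_name", "bert-base-uncased"))
    args = transformers.TrainingArguments(
        output_dir="./hf_output",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        num_train_epochs=config.get("epochs", 1),
    )
    return transformers.Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )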
Code Example #6
 def train_func():
     train.report(rank=train.world_rank())
Code Example #7
 def train_func():
     return train.world_rank()
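Code examples #6 and #7 differ only in how the rank leaves the worker: #6 streams it out through train.report(), while #7 returns it so the driver collects one value per worker. A sketch of the driver side, assuming the legacy ray.train.Trainer API:

from ray.train import Trainer

trainer = Trainer(backend="torch", num_workers=4)
trainer.start()
# run() gathers each worker's return value, e.g. [0, 1, 2, 3]
# for the train_func in code example #7.
ranks = trainer.run(train_func)
trainer.shutdown()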
Code Example #8
def train_loop_per_worker(train_loop_config):
    dataset = train_loop_config["dataset_fn"]()
    batch_size = train_loop_config["batch_size"]
    num_epochs = train_loop_config["num_epochs"]

    data = dataset[0]
    train_idx = data.train_mask.nonzero(as_tuple=False).view(-1)
    train_idx = train_idx.split(train_idx.size(0) // train.world_size())[
        train.world_rank()
    ]

    train_loader = NeighborSampler(
        data.edge_index,
        node_idx=train_idx,
        sizes=[25, 10],
        batch_size=batch_size,
        shuffle=True,
    )

    # Disable distributed sampler since the train_loader has already been split above.
    train_loader = train.torch.prepare_data_loader(train_loader, add_dist_sampler=False)

    # Do validation on rank 0 worker only.
    if train.world_rank() == 0:
        subgraph_loader = NeighborSampler(
            data.edge_index, node_idx=None, sizes=[-1], batch_size=2048, shuffle=False
        )
        subgraph_loader = train.torch.prepare_data_loader(
            subgraph_loader, add_dist_sampler=False
        )

    model = SAGE(dataset.num_features, 256, dataset.num_classes)
    model = train.torch.prepare_model(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    x, y = data.x.to(train.torch.get_device()), data.y.to(train.torch.get_device())

    for epoch in range(num_epochs):
        model.train()

        # ``batch_size`` is the number of samples in the current batch.
        # ``n_id`` are the ids of all the nodes used in the computation. This is
        # needed to pull in the necessary features just for the current batch that is
        # being trained on.
        # ``adjs`` is a list of 3-element tuples ``(edge_index, e_id, size)``
        # for each sample in the batch, where ``edge_index`` represents the
        # edges of the sampled subgraph, ``e_id`` are the ids of the edges in
        # the sample, and ``size`` holds the shape of the subgraph.
        # See ``torch_geometric.loader.neighbor_sampler.NeighborSampler`` for more info.
        for batch_size, n_id, adjs in train_loader:
            optimizer.zero_grad()
            out = model(x[n_id], adjs)
            loss = F.nll_loss(out, y[n_id[:batch_size]])
            loss.backward()
            optimizer.step()

        if train.world_rank() == 0:
            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")

        train_accuracy = validation_accuracy = test_accuracy = None

        # Do validation on rank 0 worker only.
        if train.world_rank() == 0:
            model.eval()
            with torch.no_grad():
                out = model.module.test(x, subgraph_loader)
            res = out.argmax(dim=-1) == data.y
            train_accuracy = int(res[data.train_mask].sum()) / int(
                data.train_mask.sum()
            )
            validation_accuracy = int(res[data.val_mask].sum()) / int(
                data.val_mask.sum()
            )
            test_accuracy = int(res[data.test_mask].sum()) / int(data.test_mask.sum())

        train.report(
            train_accuracy=train_accuracy,
            validation_accuracy=validation_accuracy,
            test_accuracy=test_accuracy,
        )
Code Example #9
def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    device = torch.device(f"cuda:{train.local_rank()}"
                          if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    train_dataset_pipeline = train.get_dataset_shard("train_dataset")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = train.get_dataset_shard("test_dataset")
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer,
            num_features)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Record and log stats.
        train.report(
            train_acc=train_acc,
            train_loss=train_running_loss,
            test_acc=test_acc,
            test_loss=test_running_loss,
        )

        # Checkpoint model.
        module = net.module if isinstance(net,
                                          DistributedDataParallel) else net
        train.save_checkpoint(model_state_dict=module.state_dict())

    if train.world_rank() == 0:
        return module.cpu()
Code Example #10
 def train_func():
     if (train.world_rank()) == 0:
         train.save_checkpoint(epoch=0)
     else:
         train.report(iter=0)
Code Example #11
def train_func(config):
    is_distributed = config.get("is_distributed", False)
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    if is_distributed:
        device = torch.device(f"cuda:{train.local_rank()}" if use_gpu
                              and torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(
            "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    if is_distributed:
        train_dataset_pipeline = train.get_dataset_shard("train_dataset")
        train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
        test_dataset = train.get_dataset_shard("test_dataset")
    else:
        train_dataset_epoch_iterator = config["train_dataset"].iter_epochs()
        test_dataset = config["test_dataset"]
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    # Setup Tensorboard and MLflow.
    if is_distributed:
        # Setup is done through Callback.
        pass
    else:
        writer = SummaryWriter()
        mlflow.start_run()
        mlflow_config = config.copy()
        mlflow_config.pop("test_dataset")
        mlflow_config.pop("train_dataset")
        mlflow.log_params(mlflow_config)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    if is_distributed:
        net = DistributedDataParallel(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer)
        train_acc = train_num_correct / train_num_total
        print(
            f"epoch [{epoch + 1}]: training accuracy: {train_num_correct} / {train_num_total} = {train_acc:.4f}"
        )

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(
            f"epoch [{epoch + 1}]: testing accuracy: {test_num_correct} / {test_num_total} = {test_acc:.4f}"
        )

        # Record and log stats.
        if is_distributed:
            train.report(train_acc=train_acc,
                         train_loss=train_running_loss,
                         test_acc=test_acc,
                         test_loss=test_running_loss)
        else:
            writer.add_scalar("Accuracy/train", train_acc, epoch)
            writer.add_scalar("Loss/train", train_running_loss, epoch)
            writer.add_scalar("Accuracy/test", test_acc, epoch)
            writer.add_scalar("Loss/test", test_running_loss, epoch)
            writer.flush()

            mlflow.log_metrics({
                "train_acc": train_acc,
                "train_loss": train_running_loss,
                "test_acc": test_acc,
                "test_loss": test_running_loss
            })

        # Checkpoint model.
        if is_distributed:
            import copy
            model_copy = copy.deepcopy(net.module)
            train.save_checkpoint(
                model_state_dict=model_copy.cpu().state_dict())
        else:
            torch.save(net.state_dict(), f"models/model-epoch-{epoch}.torch")

    # Shutdown Tensorboard and MLflow.
    if is_distributed:
        pass
    else:
        writer.close()
        # mlflow.end_run()

    if is_distributed:
        if train.world_rank() == 0:
            return net.module.cpu()
        else:
            return None
    else:
        return net
Code Example #12
File: train_loop_utils.py Project: parasj/ray
    def prepare_data_loader(
        self,
        data_loader: torch.utils.data.DataLoader,
        add_dist_sampler: bool = True,
        move_to_device: bool = True,
        auto_transfer: bool = True,
    ) -> torch.utils.data.DataLoader:
        """Prepares DataLoader for distributed execution.

        This allows you to use the same exact code regardless of number of
        workers or the device type being used (CPU, GPU).

        Args:
            data_loader (torch.utils.data.DataLoader): The DataLoader to
                prepare.
            add_dist_sampler: Whether to add a DistributedSampler to
                the provided DataLoader.
            move_to_device: If set, automatically move the data
                returned by the data loader to the correct device.
            auto_transfer: If set and device is GPU, another CUDA stream
                is created to automatically copy data from host (CPU) memory
                to device (GPU) memory (the default CUDA stream still runs the
                training procedure). If device is CPU, it will be disabled
                regardless of the setting. This configuration will be ignored
                if ``move_to_device`` is False.
        """

        # Backwards compatibility
        try:
            world_size = session.get_world_size()
            world_rank = session.get_world_rank()
        except Exception:
            world_size = train.world_size()
            world_rank = train.world_rank()

        # Only add Distributed Sampler if the following conditions hold:
        # 1. More than one training worker is being used.
        # 2. A DistributedSampler has not already been added by the user.
        # 3. The dataset is not an IterableDataset. Samplers do not work with
        # IterableDatasets.
        if (world_size > 1
                and not isinstance(data_loader.sampler, DistributedSampler)
                and not (hasattr(data_loader, "dataset")
                         and isinstance(data_loader.dataset, IterableDataset))
                and add_dist_sampler):

            def with_sampler(loader):
                # Automatically set the DistributedSampler

                # If you're using a sampler, the DataLoader shuffle flag must be set to
                # False. Shuffling is instead determined by the shuffle argument passed
                # to the DistributedSampler constructor.

                # If no sampler is passed to the DataLoader constructor, Torch
                # constructs a default sampler. The default sampler is a RandomSampler
                # if shuffling is enabled and a SequentialSampler otherwise. DataLoader
                # does not have a shuffle attribute, so we instead identify whether
                # shuffling is enabled by checking the default sampler type.
                shuffle = not isinstance(loader.sampler, SequentialSampler)

                def seeded_worker_init_fn(worker_init_fn):
                    def wrapper(worker_id):
                        worker_seed = torch.initial_seed() % 2**32
                        np.random.seed(worker_seed)
                        random.seed(worker_seed)
                        worker_init_fn(worker_id)

                    return wrapper

                worker_init_fn = loader.worker_init_fn
                generator = loader.generator
                if self._seed is not None:
                    worker_init_fn = seeded_worker_init_fn(
                        loader.worker_init_fn)
                    generator = torch.Generator()
                    generator.manual_seed(self._seed)

                using_default_sampler = isinstance(
                    loader.sampler, (SequentialSampler, RandomSampler))
                if not using_default_sampler and world_rank == 0:
                    logger.warn(
                        f"The {loader.sampler.__class__.__name__} will be overwritten "
                        "with a DistributedSampler. You can disable this by setting "
                        "`with_sampler` to False in `prepare_data_loader`.")

                data_loader_args = {
                    "dataset": loader.dataset,
                    "batch_size": loader.batch_size,
                    "shuffle": False,
                    "num_workers": loader.num_workers,
                    "collate_fn": loader.collate_fn,
                    "pin_memory": loader.pin_memory,
                    "drop_last": loader.drop_last,
                    "timeout": loader.timeout,
                    "worker_init_fn": worker_init_fn,
                    "generator": generator,
                    "sampler": DistributedSampler(loader.dataset,
                                                  shuffle=shuffle),
                }
                return DataLoader(**data_loader_args)

            data_loader = with_sampler(data_loader)

        if move_to_device:
            device = self.get_device()
            data_loader = _WrappedDataLoader(data_loader, device,
                                             auto_transfer)

        return data_loader
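From user code, this method is reached through train.torch.prepare_data_loader() inside the training function. A typical call site, sketched with an illustrative TensorDataset:

import torch
from torch.utils.data import DataLoader, TensorDataset
from ray import train


def train_func():
    dataset = TensorDataset(torch.randn(256, 8), torch.randn(256, 1))
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    # Adds a DistributedSampler when more than one worker is used and
    # moves each batch to the worker's device, per the code above.
    loader = train.torch.prepare_data_loader(loader)
    for x, y in loader:
        ...  # forward/backward pass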