Example #1
def test_simple_linears():
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    inputs = torch.rand(8, 1)
    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1))

    # Without Pipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_without_pipe = sum_grad(model.parameters())

    zero_grad(model.parameters())

    # With Pipe
    model = Pipe(model, [2, 2], devices=["cpu", "cpu"], chunks=4)

    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_with_pipe = sum_grad(model.parameters())

    # Both grads should be identical.
    assert torch.allclose(grad_with_pipe, grad_without_pipe)
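
Note on the balance argument: Pipe(model, [2, 2], ...) assigns consecutive layers of the nn.Sequential to partitions by count, so the four Linear layers above split two-and-two. A manual restatement of that split, for illustration only (not fairscale API), given the original four-layer nn.Sequential before wrapping:

# What balance=[2, 2] means for the original nn.Sequential (illustrative only):
partition_0 = nn.Sequential(model[0], model[1])  # Linear(1, 2), Linear(2, 4) on "cpu"
partition_1 = nn.Sequential(model[2], model[3])  # Linear(4, 2), Linear(2, 1) on "cpu"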
Example #2
def _train_pipe_model(model, use_fp16=False, checkpoint="never", chunks=1):
    model = copy.deepcopy(model)
    model = Pipe(
        model,
        balance=[1] * torch.cuda.device_count(),
        devices=list(range(torch.cuda.device_count())),
        chunks=chunks,
        checkpoint=checkpoint,
    )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    return _train(model, optimizer, use_fp16)
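
_train is a helper defined elsewhere in the same test suite and is not shown here. A hypothetical minimal version, assuming random inputs whose shape fits the model's first layer, could look like:

def _train(model, optimizer, use_fp16):
    # Hypothetical sketch (assumption): one optimizer step on random data.
    inputs = torch.rand(8, 1).to(model.devices[0])  # input shape is an assumption
    if use_fp16:
        model = model.half()
        inputs = inputs.half()
    optimizer.zero_grad()
    loss = model(inputs).float().mean()
    loss.backward()
    optimizer.step()
    return model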
Example #3
def simple_linears(pipeline_style):
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    set_random_seed(12345)
    inputs = torch.rand(8, 1)
    model = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Without Pipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_without_pipe = [
        sum_grad([*model[0].parameters(), *model[1].parameters()]),
        sum_grad([*model[2].parameters(), *model[3].parameters()]),
    ]

    ref_without_pipe = [p.grad for p in model.parameters()]

    zero_grad(model.parameters())

    # With Pipe
    model = Pipe(model, [2, 2],
                 style=pipeline_style,
                 worker_map=get_worker_map(),
                 chunks=4)

    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
        grad_with_pipe = sum_grad(
            model.pipeline.mp_partitions[0].module.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
        grad_with_pipe = sum_grad(
            model.pipeline.mp_partitions[0].module.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])
    torch.distributed.barrier()
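
The rank check above is the crux of the multiprocess pipeline style: only the rank that holds the last partition sees the real output and can compute the loss, while earlier ranks call back_helper so their partitions still take part in the backward pass. Condensed to the general pattern (restating the test, assuming the final stage lives on the highest rank):

outputs = model(inputs)
if model.group.rank() == model.group.size() - 1:
    outputs.mean().backward()   # last stage: compute the loss, start backward
else:
    model.back_helper(outputs)  # earlier stages: receive gradients from downstream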
Example #4
def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint):
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError(
            'Please install fairscale with: pip install fairscale')
    super().__init__()
    assert isinstance(encoder, FairseqEncoder)
    assert isinstance(decoder, FairseqDecoder)
    module_list = nn.Sequential(
        encoder.embedding_layer,
        *list(encoder.encoder_layers),
        encoder.final_layer_norm,
        decoder.embedding_layer,
        *list(decoder.decoder_layers),
        decoder.decoder_output_layer,
    )
    self.devices = devices
    self.model = Pipe(
        module_list,
        balance=balance,
        devices=devices,
        chunks=chunks,
        checkpoint=checkpoint,
    )
    self.encoder_max_positions = self.max_positions_helper(
        encoder.embedding_layer, 'max_source_positions')
    self.decoder_max_positions = self.max_positions_helper(
        decoder.embedding_layer, 'max_target_positions')
    self.adaptive_softmax = getattr(decoder, 'adaptive_softmax', None)
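
max_positions_helper is defined on the enclosing fairseq model class and is not shown here. Judging only from the call sites above, a hypothetical sketch is a small attribute lookup on the embedding layer:

def max_positions_helper(self, embedding_layer, attr='max_source_positions'):
    # Hypothetical sketch (assumption): e.g. embedding_layer.max_source_positions.
    return getattr(embedding_layer, attr)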
Example #5
    def init_components(
        self,
        model_fn=None,
        criterion_fn=None,
        optimizer_fn=None,
        scheduler_fn=None,
    ):
        """Inits the runs components."""
        model = model_fn()

        if "balance" not in self.pipe_kwargs:
            warnings.warn(
                "With FairScale Pipe setup, "
                "you need to specify ``balance`` under ``pipe_kwargs``. "
                "Generating balance automatically. (Experimental feature)")
            self.pipe_kwargs["balance"] = _generate_balance(
                self.device_count, len(model))
        pipe_model = Pipe(model, **self.pipe_kwargs)
        del model

        # criterion
        criterion = criterion_fn()
        # optimizer
        optimizer = optimizer_fn(pipe_model)
        # scheduler
        scheduler = scheduler_fn()

        return pipe_model, criterion, optimizer, scheduler
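
_generate_balance is a private helper that is not shown here. A plausible sketch, assuming it spreads layers as evenly as possible across the available devices:

def _generate_balance(device_count, num_layers):
    # Hypothetical even split (assumption): give the remainder, one layer
    # each, to the first few devices.
    base, remainder = divmod(num_layers, device_count)
    return [base + (1 if i < remainder else 0) for i in range(device_count)]

Under this sketch, _generate_balance(3, 8) would return [3, 3, 2].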
Example #6
def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError("Please install fairscale with: pip install fairscale")
    self.use_pipeline = encoder_module_list is not None
    if not self.use_pipeline:
        self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
        self.encoder_layers = nn.Sequential(
            *[TransformerEncoderLayer(args) for _ in range(args.encoder_layers)])
        if isinstance(embed_tokens, nn.ModuleList):
            emb_dim = sum(e.embedding_dim for e in embed_tokens)
        else:
            emb_dim = embed_tokens.embedding_dim
        self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
    else:
        encoder_balance = utils.eval_str_list(args.pipeline_encoder_balance, type=int)
        encoder_devices = utils.eval_str_list(args.pipeline_encoder_devices, type=int)
        assert sum(encoder_balance) == len(encoder_module_list), (
            f"Sum of encoder_balance={encoder_balance} is not equal "
            f"to num_encoder_modules={len(encoder_module_list)}")
        self.model = Pipe(
            module=nn.Sequential(*encoder_module_list),
            balance=encoder_balance,
            devices=encoder_devices,
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )
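
utils.eval_str_list comes from fairseq and parses a CLI string into a typed list. Illustrative input/output (the argument values below are assumptions, not from the source):

encoder_balance = utils.eval_str_list("[2, 2, 2]", type=int)   # -> [2, 2, 2]
encoder_devices = utils.eval_str_list("[0, 1, 2]", type=int)   # -> [0, 1, 2]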
Example #7
def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint):
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError(
            "Please install fairscale with: pip install fairscale")
    super().__init__()
    assert isinstance(encoder, FairseqEncoder)
    assert isinstance(decoder, FairseqDecoder)
    encoder_module_list = ([encoder.embedding_layer] +
                           list(encoder.encoder_layers) +
                           [encoder.final_layer_norm])
    self.num_encoder_modules = len(encoder_module_list)
    decoder_module_list = ([decoder.embedding_layer] +
                           list(decoder.decoder_layers) +
                           [decoder.decoder_output_layer])
    self.num_decoder_modules = len(decoder_module_list)
    module_list = encoder_module_list + decoder_module_list
    self.devices = devices
    self.model = Pipe(
        nn.Sequential(*module_list),
        balance=balance,
        devices=devices,
        chunks=chunks,
        checkpoint=checkpoint,
    )
    self.encoder_max_positions = self.max_positions_helper(
        encoder.embedding_layer, "max_source_positions")
    self.decoder_max_positions = self.max_positions_helper(
        decoder.embedding_layer, "max_target_positions")
    self.adaptive_softmax = getattr(decoder, "adaptive_softmax", None)
    # Note: to be populated during inference.
    self.encoder = None
    self.decoder = None
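
The recorded module counts make it possible to recover the encoder and decoder halves from the flat pipeline later on. A sketch of what populating self.encoder and self.decoder during inference could rely on (an assumption; the actual inference code is not shown):

# Hypothetical recovery of the two halves via the recorded counts:
encoder_modules = module_list[:self.num_encoder_modules]
decoder_modules = module_list[self.num_encoder_modules:]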
Example #8
def benchmark_single_process(args):
    """Benchmark a given model using a single process and multiple devices."""

    # MPI_PORT is a module-level constant defined elsewhere in the benchmark script.
    init_method_pgroup = "tcp://localhost:{}".format(MPI_PORT)
    torch.distributed.init_process_group(backend="gloo",
                                         rank=0,
                                         world_size=1,
                                         init_method=init_method_pgroup)

    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
    assert num_devices > 0
    init_random_seed(0)

    benchmark_config = create_benchmark_config(args.model_name)
    model_specs = get_model_specs(args.model_name)
    model_config = create_model_config(args,
                                       benchmark_config=benchmark_config,
                                       model_specs=model_specs)
    model = model_config["model"]

    balance = generate_balance(min(num_devices, 4), len(model))
    pipe_model = Pipe(model,
                      balance,
                      chunks=args.chunks,
                      checkpoint=args.checkpoint)
    del model
    del model_config["model"]

    if args.dry_run:
        train(model_config, pipe_model, benchmark_config, model_specs, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config,
                                 model_specs, args)
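
Note that even this single-process benchmark initializes a one-rank gloo process group first, presumably so that this version of Pipe can rely on torch.distributed being initialized. A minimal invocation sketch (the argparse fields are inferred from how args is used above; the values are assumptions):

import argparse

args = argparse.Namespace(
    model_name="lm",      # assumed benchmark model name
    chunks=4,             # micro-batches per batch
    checkpoint="never",   # activation checkpointing mode
    dry_run=True,         # single training pass instead of the full benchmark
)
benchmark_single_process(args)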
Example #9
def make_model(device, ntokens):
    ninp = 50  # embedding dimension
    nhid = 50  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1

    model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)
    # num_devices is a module-level global defined elsewhere in the script.
    balance = generate_balance(min(num_devices, 4), len(model))
    p = Pipe(model, balance, chunks=len(balance))

    criterion = nn.CrossEntropyLoss()
    lr = 0.001  # learning rate

    try:
        # Use the FP16-capable Adam when the Precision enum was imported
        # (e.g. from fairscale.optim); otherwise the name is undefined.
        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.PURE_FP16)
    except NameError:
        # Fall back to a plain Adam without mixed-precision support.
        optimizer = Adam(p.parameters(), lr=lr)
    scaler = GradScaler()

    return p, criterion, optimizer, scaler
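
A sketch of how one training step might consume the returned objects, assuming data and targets are pre-batched tensors and recalling that Pipe leaves the output on the last device:

p, criterion, optimizer, scaler = make_model(device, ntokens)
optimizer.zero_grad()
output = p(data)                    # data: assumed pre-shaped input batch
loss = criterion(output.view(-1, ntokens), targets.to(output.device))
loss.backward()
optimizer.step()

(The returned scaler supports optional loss scaling and is left unused in this minimal sketch.)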
Example #10
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    decoder_module_list=None,
):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError(
            "Please install fairscale with: pip install fairscale")
    if decoder_module_list is None:
        embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
        layers = [
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ]
        decoder_output_layer = TransformerDecoderOutputLayer(
            args, embed_tokens, dictionary)
        decoder_module_list = [embedding_layer] + layers + [decoder_output_layer]
    self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None
    if self.use_pipeline:
        decoder_balance = utils.eval_str_list(args.pipeline_decoder_balance, type=int)
        decoder_devices = utils.eval_str_list(args.pipeline_decoder_devices, type=int)
        assert sum(decoder_balance) == len(decoder_module_list), (
            f"Sum of decoder_balance={decoder_balance} is not equal "
            f"to num_decoder_modules={len(decoder_module_list)}")
        self.model = Pipe(
            module=nn.Sequential(*decoder_module_list),
            balance=decoder_balance,
            devices=decoder_devices,
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )
    else:
        self.embedding_layer = decoder_module_list[0]
        self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1])
        self.decoder_output_layer = decoder_module_list[-1]
Example #11
def benchmark_single_process(args):
    """Benchmark a given model using a single process and multiple devices."""

    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
    assert num_devices > 0
    init_random_seed(0)

    benchmark_config = create_benchmark_config(args.model_name)
    model_config = create_model_config(args, benchmark_config=benchmark_config)
    model = model_config["model"]

    balance = generate_balance(min(num_devices, 4), len(model))
    pipe_model = Pipe(model, balance, chunks=args.chunks, checkpoint=args.checkpoint)
    del model
    del model_config["model"]

    if args.dry_run:
        train(model_config, pipe_model, benchmark_config, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config, args)
Example #12
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size",
                        type=int,
                        default=64,
                        metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size",
                        type=int,
                        default=1000,
                        metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs",
                        type=int,
                        default=14,
                        metavar="N",
                        help="number of epochs to train (default: 14)")
    parser.add_argument("--lr",
                        type=float,
                        default=1.0,
                        metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument("--gamma",
                        type=float,
                        default=0.7,
                        metavar="M",
                        help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--dry-run",
                        action="store_true",
                        default=False,
                        help="quickly check a single pass")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model",
                        action="store_true",
                        default=False,
                        help="For Saving the current Model")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    kwargs = {"batch_size": args.batch_size}
    kwargs.update({"num_workers": 1, "pin_memory": True, "shuffle": True})

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data",
                              train=True,
                              download=True,
                              transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

    model = net  # net is a module-level nn.Sequential (see the sketch after this example)
    model = Pipe(model, balance=[6, 6], devices=[0, 1], chunks=2)
    device = model.devices[0]

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        tic = time.perf_counter()
        train(args, model, device, train_loader, optimizer, epoch)
        toc = time.perf_counter()
        print(f">>> TRANING Time {toc - tic:0.4f} seconds")

        tic = time.perf_counter()
        test(model, device, test_loader)
        toc = time.perf_counter()
        print(f">>> TESTING Time {toc - tic:0.4f} seconds")
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
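
net is a module-level nn.Sequential defined elsewhere in the script; the balance=[6, 6] split requires it to expose twelve top-level modules. A hypothetical MNIST-shaped definition that satisfies that constraint:

# Hypothetical 12-module net matching balance=[6, 6] (assumption; the real
# definition lives at module scope in the original script):
net = nn.Sequential(
    nn.Conv2d(1, 32, 3, 1), nn.ReLU(),
    nn.Conv2d(32, 64, 3, 1), nn.ReLU(),
    nn.MaxPool2d(2), nn.Flatten(),
    nn.Linear(9216, 128), nn.ReLU(), nn.Dropout(0.25),
    nn.Linear(128, 10), nn.Dropout(0.5), nn.LogSoftmax(dim=1),
)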