Example 1
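PyTorch reference for the T5 layer norm: loads the weight exported by FlexFlow, runs a forward/backward pass with an MSE loss, and saves the output along with the output and weight gradients.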
def run():
    # Initialize the T5 layer norm and load the weight from FlexFlow
    HIDDEN_SIZE = 512
    t5_layernorm = T5LayerNorm(HIDDEN_SIZE).to(DEVICE)
    t5_layernorm_weight = torch.load(os.path.join(OUT_DIR, "ff_layernorm_weight.pt"))
    assert t5_layernorm.weight.shape == t5_layernorm_weight.shape, (
        "Shape mismatch: "
        f"FF={t5_layernorm_weight.shape} torch={t5_layernorm.weight.shape}"
    )
    t5_layernorm.weight = torch.nn.Parameter(t5_layernorm_weight.to(DEVICE))

    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE),
        dtype="float32",
    ).to(DEVICE)
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE),
        dtype="float32",
    ).to(DEVICE)

    output = t5_layernorm(inp)
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))

    t5_layernorm.zero_grad()
    output.retain_grad()
    loss_fn = torch.nn.MSELoss(reduction="mean")
    loss = loss_fn(output, label)
    loss.backward()
    torch.save(
        t5_layernorm.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt")
    )
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
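Every run() in this listing builds its inputs and labels with gen_tensor, which is not defined here. The sketch below is only an assumption of how such a helper could behave: a reproducible tensor of a given shape and dtype, with low/high bounds for integer data. The fixed seed, the normal distribution for floats, and the exact signature are assumptions, not the real helper.

import torch

def gen_tensor(shape, dtype="float32", low=None, high=None):
    # Hypothetical stand-in for the gen_tensor helper used in these examples.
    # A fixed generator seed (assumed) keeps the FlexFlow and PyTorch runs on
    # identical data.
    generator = torch.Generator().manual_seed(42)
    if dtype == "int64":
        # Integer tensors (e.g. token ids) drawn uniformly from [low, high).
        return torch.randint(low, high, shape, generator=generator,
                             dtype=torch.int64)
    # Float tensors drawn from a standard normal distribution (assumption).
    return torch.randn(shape, generator=generator, dtype=torch.float32)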
Example 2
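FlexFlow side of the element-wise multiply test: builds an FFModel that multiplies two input tensors, runs a forward/backward pass, and saves the output and its gradient.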
def run():
    INPUT_SIZE = 512
    SEQ_LENGTH = 5
    inp1: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                    dtype="float32")
    inp2: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                    dtype="float32")
    label: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                     dtype="float32")

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT)
    input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.multiply(x=input_tensor_1,
                                     y=input_tensor_2,
                                     name="multiply")

    # compile
    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel,
                       ((input_tensor_1, inp1), (input_tensor_2, inp2)), label)
    assert len(dls) == 3
    inp1_dl, inp2_dl, label_dl = dls

    # forward/backward pass
    run_fwd_bwd(ffmodel, ffconfig, (inp1_dl, inp2_dl), label_dl)

    # save data
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel,
                        os.path.join(OUT_DIR, "ff_out_grad.pt"))
Example 3
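PyTorch reference for the embedding test: loads the FlexFlow-exported embedding weight, runs a forward/backward pass with an MSE loss, and saves the output along with the output and weight gradients.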
def run():
    NUM_EMBEDDINGS = 250112
    EMBEDDING_DIM = 512
    embedding = torch.nn.Embedding(
        num_embeddings=NUM_EMBEDDINGS,
        embedding_dim=EMBEDDING_DIM,
        device=DEVICE,
    )
    embedding_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt"))
    assert embedding_weight.shape == embedding.weight.shape, (
        "Shape mismatch: "
        f"FF={embedding_weight.shape} torch={embedding.weight.shape}"
    )
    embedding.weight = torch.nn.Parameter(embedding_weight.to(DEVICE))

    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=NUM_EMBEDDINGS,
    ).to(DEVICE)
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM),
        dtype="float32",
    ).to(DEVICE)

    output = embedding(inp)
    embedding.zero_grad()
    output.retain_grad()
    loss_fn = torch.nn.MSELoss(reduction="mean")
    loss = loss_fn(output, label)
    loss.backward()
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
    torch.save(embedding.weight.grad.cpu(),
               os.path.join(OUT_DIR, "torch_weight_grad.pt"))
Example 4
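FlexFlow side of the getitem/slicing test: slices an int64 attention mask to shape (BATCH_SIZE, 1, 1, SEQ_LENGTH) via GetItemNode.slice_tensor, runs the forward pass only, and saves the output.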
def run():
    """Checks the ``getitem()`` code path for tensor slicing."""
    attention_mask = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=2,
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="float32",
    )  # only needed to satisfy init_ffmodel; no backward pass is run

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    attention_mask_tensor = ffmodel.create_tensor(
        attention_mask.shape,
        DataType.DT_INT64,
    )
    extended_attention_mask = GetItemNode.slice_tensor(
        ffmodel,
        attention_mask_tensor,
        (slice(None, None, None), None, None, slice(None, None, None)),
        "slice",
    )

    compile_ffmodel(ffmodel)
    dls = init_ffmodel(
        ffmodel, ((attention_mask_tensor, attention_mask),), label,
    )
    assert len(dls) == 2
    inp_dl, label_dl = dls
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl, run_bwd=False)

    save_tensor_ff(extended_attention_mask, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
Example 5
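FlexFlow side of the Conv2D test: a 3-to-5-channel, 3x3, stride-1, unpadded convolution (512x512 input, 510x510 output); saves the output, its gradient, and the weight/bias tensors plus their gradients.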
def run():
    KERNEL_SIZE = 3
    INPUT_SIZE = 512
    IN_CHANNELS = 3
    OUTPUT_SIZE = 510
    OUT_CHANNELS = 5
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE), dtype="float32")
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, OUT_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE), dtype="float32")

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.conv2d(input=input_tensor,
                                   out_channels=OUT_CHANNELS,
                                   kernel_h=KERNEL_SIZE,
                                   kernel_w=KERNEL_SIZE,
                                   stride_h=1,
                                   stride_w=1,
                                   padding_h=0,
                                   padding_w=0,
                                   name="conv2d")

    # compile model
    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp), ), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls

    # forward/back pass
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl, ), label_dl)

    conv2d_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(conv2d_layer, Conv2D)
    conv2d_weight: Parameter = conv2d_layer.get_weight_tensor()
    conv2d_bias: Parameter = conv2d_layer.get_bias_tensor()

    # save output data
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel,
                        os.path.join(OUT_DIR, "ff_out_grad.pt"))

    # save layer data
    save_param_ff(conv2d_weight, ffmodel, os.path.join(OUT_DIR,
                                                       "ff_weight.pt"))
    save_param_ff(conv2d_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt"))
    save_param_grad_ff(conv2d_weight, ffmodel,
                       os.path.join(OUT_DIR, "ff_weight_grad.pt"))
    save_param_grad_ff(conv2d_bias, ffmodel,
                       os.path.join(OUT_DIR, "ff_bias_grad.pt"))
Example 6
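FlexFlow side of the dense (Linear) test: a 512-to-128 dense layer; saves the output, its gradient, and the weight/bias tensors plus their gradients.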
def run():
    # create input, label tensors
    INPUT_SIZE = 512
    OUTPUT_SIZE = 128
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
        dtype="float32",
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE),
        dtype="float32",
    )

    # initialize ffmodel object
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.dense(
        input=input_tensor,
        out_dim=OUTPUT_SIZE,
        name="linear",
    )

    # compile model
    compile_ffmodel(ffmodel)

    # fails here
    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls

    # forward/back pass
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)

    # get linear layer
    linear_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(linear_layer, Linear)
    linear_weight: Parameter = linear_layer.get_weight_tensor()
    linear_bias: Parameter = linear_layer.get_bias_tensor()

    # save output data
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel,
                        os.path.join(OUT_DIR, "ff_out_grad.pt"))

    # save layer data
    save_param_ff(linear_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_ff(linear_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt"))
    save_param_grad_ff(linear_weight, ffmodel,
                       os.path.join(OUT_DIR, "ff_weight_grad.pt"))
    save_param_grad_ff(linear_bias, ffmodel,
                       os.path.join(OUT_DIR, "ff_bias_grad.pt"))
Example 7
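PyTorch reference for the Linear test: loads the FlexFlow-exported weight and bias, runs a forward/backward pass with an MSE loss, and saves the output along with the output, weight, and bias gradients.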
def run():
    # define layer in pytorch
    INPUT_SIZE = 512
    OUTPUT_SIZE = 128
    linear = torch.nn.Linear(in_features=INPUT_SIZE,
                             out_features=OUTPUT_SIZE).to(DEVICE)

    # get weight/bias from ff files, check same shape
    linear_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt"))
    linear_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt"))
    assert linear.weight.shape == linear_weight.shape, (
        "Shape mismatch: "
        f"FF={linear_weight.shape} torch={linear.weight.shape}")
    assert linear.bias.shape == linear_bias.shape, (
        "Shape mismatch: "
        f"FF={linear_bias.shape} torch={linear.bias.shape}")

    # set weight/bias
    linear.weight = torch.nn.Parameter(linear_weight.to(DEVICE))
    linear.bias = torch.nn.Parameter(linear_bias.to(DEVICE))

    # generate input/label tensors w/ gen_tensor
    inp: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                   dtype="float32").to(DEVICE)
    label: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE),
                                     dtype="float32").to(DEVICE)

    # run the input through the layer
    output = linear(inp)
    linear.zero_grad()
    output.retain_grad()

    # loss function
    loss_fn = torch.nn.MSELoss(reduction="mean")
    loss = loss_fn(output, label)

    # backpropagate
    loss.backward()

    # save out, out grad, layer weight & bias gradients
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
    torch.save(linear.weight.grad.cpu(),
               os.path.join(OUT_DIR, "torch_weight_grad.pt"))
    torch.save(linear.bias.grad.cpu(),
               os.path.join(OUT_DIR, "torch_bias_grad.pt"))
Example 8
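FlexFlow side of the layer-norm test: normalizes over the last dimension with affine parameters and eps=1e-6; saves the output, its gradient, and the weight/bias tensors plus their gradients.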
def run():
    HIDDEN_SIZE = 512
    EPS = 1e-6
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE),
        dtype="float32",
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE),
        dtype="float32",
    )

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.layer_norm(
        input=input_tensor,
        axes=[len(input_tensor.dims) - 1],  # normalize over the last dimension
        elementwise_affine=True,
        eps=EPS,
        name="layernorm",
    )

    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp), ), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl, ), label_dl)

    layernorm_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(layernorm_layer, LayerNorm)
    layernorm_weight: Parameter = layernorm_layer.get_weight_tensor()
    layernorm_bias: Parameter = layernorm_layer.get_bias_tensor()
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel,
                        os.path.join(OUT_DIR, "ff_out_grad.pt"))
    save_param_ff(layernorm_weight, ffmodel,
                  os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_ff(layernorm_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt"))
    save_param_grad_ff(layernorm_weight, ffmodel,
                       os.path.join(OUT_DIR, "ff_weight_grad.pt"))
    save_param_grad_ff(layernorm_bias, ffmodel,
                       os.path.join(OUT_DIR, "ff_bias_grad.pt"))
Example 9
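PyTorch reference for the element-wise multiply test: multiplies the two generated inputs, backpropagates an MSE loss, and saves the output and its gradient.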
def run():
    INPUT_SIZE = 512
    SEQ_LENGTH = 5

    inp1: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                    dtype="float32").to(DEVICE)
    inp2: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                    dtype="float32").to(DEVICE)
    label: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE),
                                     dtype="float32").to(DEVICE)
    output = torch.mul(input=inp1, other=inp2).to(DEVICE)
    # The inputs do not require grad, so mark the product itself as requiring
    # grad so that backward() populates output.grad for comparison.
    output.requires_grad = True
    output.retain_grad()

    loss_fn = torch.nn.MSELoss(reduction="mean")
    loss = loss_fn(output, label)
    loss.backward()
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
Example 10
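PyTorch reference for the getitem/slicing test: extends a generated attention mask to shape (BATCH_SIZE, 1, 1, SEQ_LENGTH) by indexing and saves the result.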
def run():
    """Checks the ``getitem()`` code path for tensor slicing."""
    attention_mask = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=2,
    ).to(DEVICE)
    # Extend to shape (BATCH_SIZE, 1, 1, SEQ_LENGTH)
    extended_attention_mask = attention_mask[:, None, None, :]
    torch.save(extended_attention_mask.cpu(),
               os.path.join(OUT_DIR, "torch_out.pt"))
Example 11
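PyTorch reference for the layer-norm test: loads the FlexFlow-exported weight and bias, runs a forward/backward pass with an MSE loss, and saves the output along with the output, weight, and bias gradients.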
def run():
    HIDDEN_SIZE = 512
    EPS = 1e-6
    layernorm = torch.nn.LayerNorm(
        normalized_shape=HIDDEN_SIZE,
        eps=EPS,
        elementwise_affine=True,
    ).to(DEVICE)
    layernorm_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt"))
    layernorm_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt"))
    assert layernorm.weight.shape == layernorm_weight.shape, (
        "Shape mismatch: "
        f"FF={layernorm_weight.shape} torch={layernorm.weight.shape}")
    assert layernorm.bias.shape == layernorm_bias.shape, (
        "Shape mismatch: "
        f"FF={layernorm_bias.shape} torch={layernorm.bias.shape}")
    layernorm.weight = torch.nn.Parameter(layernorm_weight.to(DEVICE))
    layernorm.bias = torch.nn.Parameter(layernorm_bias.to(DEVICE))

    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE),
        dtype="float32",
    ).to(DEVICE)
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE),
        dtype="float32",
    ).to(DEVICE)

    output = layernorm(inp)
    layernorm.zero_grad()
    output.retain_grad()
    loss_fn = torch.nn.MSELoss(reduction="mean")
    loss = loss_fn(output, label)
    loss.backward()
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
    torch.save(layernorm.weight.grad.cpu(),
               os.path.join(OUT_DIR, "torch_weight_grad.pt"))
    torch.save(layernorm.bias.grad.cpu(),
               os.path.join(OUT_DIR, "torch_bias_grad.pt"))
Example 12
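FlexFlow side of the embedding test: a 250112 x 512 embedding table with a normal initializer and no aggregation; saves the output, its gradient, the weight, and the weight gradient.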
def run():
    NUM_EMBEDDINGS = 250112
    EMBEDDING_DIM = 512
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=NUM_EMBEDDINGS,
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM),
        dtype="float32",
    )

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_INT64)
    output_tensor = ffmodel.embedding(
        input=input_tensor,
        num_embeddings=NUM_EMBEDDINGS,
        embedding_dim=EMBEDDING_DIM,
        aggr=AggrMode.AGGR_MODE_NONE,
        kernel_initializer=NormInitializer(seed=42, mean=0, stddev=1),
        name="embedding",
    )
    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)

    embedding_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(embedding_layer, Embedding)
    embedding_weight: Parameter = embedding_layer.get_weight_tensor()
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
    save_param_ff(embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_grad_ff(
        embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt")
    )
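Each FlexFlow-side run() above writes ff_*.pt files and its PyTorch counterpart writes the matching torch_*.pt files into OUT_DIR. The actual comparison harness is not part of this listing; the sketch below only illustrates, under an assumed helper name and an assumed tolerance, how the paired tensors could be checked against each other.

import os
import torch

def check_pair(ff_name, torch_name, out_dir, atol=1e-5):
    # Hypothetical check that a saved FlexFlow tensor matches its PyTorch
    # reference within an assumed absolute tolerance.
    ff_tensor = torch.load(os.path.join(out_dir, ff_name))
    torch_tensor = torch.load(os.path.join(out_dir, torch_name))
    assert ff_tensor.shape == torch_tensor.shape, (
        f"Shape mismatch: FF={ff_tensor.shape} torch={torch_tensor.shape}")
    assert torch.allclose(ff_tensor, torch_tensor, atol=atol), (
        f"{ff_name} and {torch_name} differ beyond atol={atol}")

# Example usage for one of the layer-norm pairs:
# check_pair("ff_out.pt", "torch_out.pt", OUT_DIR)
# check_pair("ff_out_grad.pt", "torch_out_grad.pt", OUT_DIR)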