def run():
    """Run a torch T5LayerNorm with FlexFlow's exported weight and save results.

    Loads the weight FlexFlow wrote to ``ff_layernorm_weight.pt``, runs a
    forward pass on a generated input, backpropagates an MSE loss against a
    generated label, and saves the output plus weight/output gradients for
    cross-framework comparison.
    """
    HIDDEN_SIZE = 512
    # Build the torch module and install FlexFlow's exported weight.
    t5_layernorm = T5LayerNorm(HIDDEN_SIZE).to(DEVICE)
    t5_layernorm_weight = torch.load(os.path.join(OUT_DIR, "ff_layernorm_weight.pt"))
    assert t5_layernorm.weight.shape == t5_layernorm_weight.shape, (
        "Shape mismatch: "
        f"FF={t5_layernorm_weight.shape} torch={t5_layernorm.weight.shape}"
    )
    t5_layernorm.weight = torch.nn.Parameter(t5_layernorm_weight.to(DEVICE))

    shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
    inp: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)
    label: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)

    # Forward pass; save the output before running backward.
    output = t5_layernorm(inp)
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))

    t5_layernorm.zero_grad()
    output.retain_grad()  # keep .grad on the non-leaf output
    loss = torch.nn.MSELoss(reduction="mean")(output, label)
    loss.backward()

    torch.save(
        t5_layernorm.weight.grad.cpu(),
        os.path.join(OUT_DIR, "torch_weight_grad.pt"),
    )
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
def run():
    """Build a FlexFlow element-wise multiply model, run fwd/bwd, save results."""
    INPUT_SIZE = 512
    SEQ_LENGTH = 5
    shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
    inp1: torch.Tensor = gen_tensor(shape, dtype="float32")
    inp2: torch.Tensor = gen_tensor(shape, dtype="float32")
    label: torch.Tensor = gen_tensor(shape, dtype="float32")

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor_1 = ffmodel.create_tensor(inp1.shape, DataType.DT_FLOAT)
    input_tensor_2 = ffmodel.create_tensor(inp2.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.multiply(x=input_tensor_1, y=input_tensor_2, name="multiply")

    # Compile, then bind both inputs and the label to data loaders.
    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor_1, inp1), (input_tensor_2, inp2)), label)
    assert len(dls) == 3
    inp1_dl, inp2_dl, label_dl = dls

    # Forward/backward pass.
    run_fwd_bwd(ffmodel, ffconfig, (inp1_dl, inp2_dl), label_dl)

    # Save the output and its gradient for comparison against torch.
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
def run():
    """Run a torch Embedding with FlexFlow's weight, backprop MSE, save tensors."""
    NUM_EMBEDDINGS = 250112
    EMBEDDING_DIM = 512
    embedding = torch.nn.Embedding(
        num_embeddings=NUM_EMBEDDINGS,
        embedding_dim=EMBEDDING_DIM,
        device=DEVICE,
    )
    # Install FlexFlow's exported embedding table after a shape check.
    embedding_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt"))
    assert embedding_weight.shape == embedding.weight.shape
    embedding.weight = torch.nn.Parameter(embedding_weight.to(DEVICE))

    # int64 token ids in [0, NUM_EMBEDDINGS).
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=NUM_EMBEDDINGS,
    ).to(DEVICE)
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM),
        dtype="float32",
    ).to(DEVICE)

    output = embedding(inp)
    embedding.zero_grad()
    output.retain_grad()  # keep .grad on the non-leaf output
    loss = torch.nn.MSELoss(reduction="mean")(output, label)
    loss.backward()

    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
    torch.save(embedding.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt"))
def run():
    """Checks the ``getitem()`` code path for tensor slicing."""
    attention_mask = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=2,
    )
    # Required by the data-loader plumbing, but otherwise unused.
    label: torch.Tensor = gen_tensor((BATCH_SIZE, SEQ_LENGTH), dtype="float32")

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    attention_mask_tensor = ffmodel.create_tensor(
        attention_mask.shape,
        DataType.DT_INT64,
    )
    # FlexFlow equivalent of attention_mask[:, None, None, :].
    extended_attention_mask = GetItemNode.slice_tensor(
        ffmodel,
        attention_mask_tensor,
        (slice(None, None, None), None, None, slice(None, None, None)),
        "slice",
    )

    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((attention_mask_tensor, attention_mask),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls
    # Forward only — run_bwd=False skips the backward pass.
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl, run_bwd=False)

    save_tensor_ff(extended_attention_mask, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
def run():
    """Build a FlexFlow Conv2D model, run fwd/bwd, and save outputs and params."""
    KERNEL_SIZE = 3
    INPUT_SIZE = 512
    IN_CHANNELS = 3
    # With stride 1 and no padding: 512 - 3 + 1 = 510.
    OUTPUT_SIZE = 510
    OUT_CHANNELS = 5
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, IN_CHANNELS, INPUT_SIZE, INPUT_SIZE),
        dtype="float32",
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, OUT_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE),
        dtype="float32",
    )

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.conv2d(
        input=input_tensor,
        out_channels=OUT_CHANNELS,
        kernel_h=KERNEL_SIZE,
        kernel_w=KERNEL_SIZE,
        stride_h=1,
        stride_w=1,
        padding_h=0,
        padding_w=0,
        name="conv2d",
    )

    # Compile model and bind the data loaders.
    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls

    # Forward/back pass.
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)

    conv2d_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(conv2d_layer, Conv2D)
    conv2d_weight: Parameter = conv2d_layer.get_weight_tensor()
    conv2d_bias: Parameter = conv2d_layer.get_bias_tensor()

    # Save output data.
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
    # Save layer data.
    save_param_ff(conv2d_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_ff(conv2d_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt"))
    save_param_grad_ff(conv2d_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt"))
    save_param_grad_ff(conv2d_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias_grad.pt"))
def run():
    """Build a FlexFlow dense (linear) model, run fwd/bwd, save outputs/params.

    Fix: ``out_dim`` was a hard-coded literal ``128`` even though
    ``OUTPUT_SIZE`` defines the label's last dimension; use the constant so
    the layer width and the label shape cannot drift apart.
    """
    # Create input, label tensors.
    INPUT_SIZE = 512
    OUTPUT_SIZE = 128
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), dtype="float32"
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE), dtype="float32"
    )

    # Initialize ffmodel object.
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.dense(
        input=input_tensor,
        out_dim=OUTPUT_SIZE,  # was a literal 128 duplicating OUTPUT_SIZE
        name="linear",
    )

    # Compile model.
    # NOTE(review): the original carried a "# fails here" marker on this
    # call — confirm whether compilation still fails in this configuration.
    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls

    # Forward/back pass.
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)

    # Get linear layer and its parameters.
    linear_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(linear_layer, Linear)
    linear_weight: Parameter = linear_layer.get_weight_tensor()
    linear_bias: Parameter = linear_layer.get_bias_tensor()

    # Save output data.
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
    # Save layer data.
    save_param_ff(linear_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_ff(linear_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt"))
    save_param_grad_ff(linear_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt"))
    save_param_grad_ff(linear_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias_grad.pt"))
def run():
    """Run a torch Linear with FlexFlow weight/bias, backprop MSE, save tensors.

    Fix: ``in_features``/``out_features`` were hard-coded literals (512/128)
    even though ``INPUT_SIZE``/``OUTPUT_SIZE`` are defined right above; use
    the constants so the layer cannot drift from the input/label shapes.
    """
    # Define layer in pytorch.
    INPUT_SIZE = 512
    OUTPUT_SIZE = 128
    linear = torch.nn.Linear(
        in_features=INPUT_SIZE, out_features=OUTPUT_SIZE
    ).to(DEVICE)

    # Get weight/bias from ff files, check same shape.
    linear_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt"))
    linear_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt"))
    assert linear.weight.shape == linear_weight.shape, (
        "Shape mismatch: "
        f"FF={linear_weight.shape} torch={linear.weight.shape}")
    assert linear.bias.shape == linear_bias.shape, (
        "Shape mismatch: "
        f"FF={linear_bias.shape} torch={linear.bias.shape}")

    # Set weight/bias.
    linear.weight = torch.nn.Parameter(linear_weight.to(DEVICE))
    linear.bias = torch.nn.Parameter(linear_bias.to(DEVICE))

    # Generate input/label tensors w/ gen_tensor.
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE), dtype="float32"
    ).to(DEVICE)
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE), dtype="float32"
    ).to(DEVICE)

    # Get output running input through layer.
    output = linear(inp)
    linear.zero_grad()
    output.retain_grad()  # keep .grad on the non-leaf output

    # Backpropagate an MSE loss.
    loss_fn = torch.nn.MSELoss(reduction="mean")
    loss = loss_fn(output, label)
    loss.backward()

    # Save out, out grad, layer weight & bias gradients.
    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
    torch.save(linear.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt"))
    torch.save(linear.bias.grad.cpu(), os.path.join(OUT_DIR, "torch_bias_grad.pt"))
def run():
    """Build a FlexFlow LayerNorm model, run fwd/bwd, and save outputs/params."""
    HIDDEN_SIZE = 512
    EPS = 1e-6
    shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
    inp: torch.Tensor = gen_tensor(shape, dtype="float32")
    label: torch.Tensor = gen_tensor(shape, dtype="float32")

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_FLOAT)
    output_tensor = ffmodel.layer_norm(
        input=input_tensor,
        axes=[len(input_tensor.dims) - 1],  # normalize over the last dimension
        elementwise_affine=True,
        eps=EPS,
        name="layernorm",
    )

    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)

    layernorm_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(layernorm_layer, LayerNorm)
    layernorm_weight: Parameter = layernorm_layer.get_weight_tensor()
    layernorm_bias: Parameter = layernorm_layer.get_bias_tensor()

    # Save output/gradient data, then the affine parameters and their grads.
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
    save_param_ff(layernorm_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_ff(layernorm_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias.pt"))
    save_param_grad_ff(layernorm_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt"))
    save_param_grad_ff(layernorm_bias, ffmodel, os.path.join(OUT_DIR, "ff_bias_grad.pt"))
def run():
    """Element-wise multiply two generated tensors, backprop MSE, save results."""
    INPUT_SIZE = 512
    SEQ_LENGTH = 5
    shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
    inp1: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)
    inp2: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)
    label: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)

    output = torch.mul(input=inp1, other=inp2).to(DEVICE)
    # The inputs carry no grad history, so mark the product itself as
    # requiring grad in order to read output.grad after backward.
    output.requires_grad = True
    output.retain_grad()
    loss = torch.nn.MSELoss(reduction="mean")(output, label)
    loss.backward()

    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
def run():
    """Checks the ``getitem()`` code path for tensor slicing."""
    attention_mask = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=2,
    ).to(DEVICE)
    # Insert two singleton axes: (BATCH_SIZE, SEQ_LENGTH) ->
    # (BATCH_SIZE, 1, 1, SEQ_LENGTH).
    extended_attention_mask = attention_mask[:, None, None, :]
    torch.save(extended_attention_mask.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
def run():
    """Run torch LayerNorm with FlexFlow weight/bias, backprop MSE, save tensors."""
    HIDDEN_SIZE = 512
    EPS = 1e-6
    layernorm = torch.nn.LayerNorm(
        normalized_shape=HIDDEN_SIZE,
        eps=EPS,
        elementwise_affine=True,
    ).to(DEVICE)

    # Install FlexFlow's exported affine parameters after shape checks.
    layernorm_weight = torch.load(os.path.join(OUT_DIR, "ff_weight.pt"))
    layernorm_bias = torch.load(os.path.join(OUT_DIR, "ff_bias.pt"))
    assert layernorm.weight.shape == layernorm_weight.shape, (
        "Shape mismatch: "
        f"FF={layernorm_weight.shape} torch={layernorm.weight.shape}")
    assert layernorm.bias.shape == layernorm_bias.shape, (
        "Shape mismatch: "
        f"FF={layernorm_bias.shape} torch={layernorm.bias.shape}")
    layernorm.weight = torch.nn.Parameter(layernorm_weight.to(DEVICE))
    layernorm.bias = torch.nn.Parameter(layernorm_bias.to(DEVICE))

    shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
    inp: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)
    label: torch.Tensor = gen_tensor(shape, dtype="float32").to(DEVICE)

    output = layernorm(inp)
    layernorm.zero_grad()
    output.retain_grad()  # keep .grad on the non-leaf output
    loss = torch.nn.MSELoss(reduction="mean")(output, label)
    loss.backward()

    torch.save(output.cpu(), os.path.join(OUT_DIR, "torch_out.pt"))
    torch.save(output.grad.cpu(), os.path.join(OUT_DIR, "torch_out_grad.pt"))
    torch.save(layernorm.weight.grad.cpu(), os.path.join(OUT_DIR, "torch_weight_grad.pt"))
    torch.save(layernorm.bias.grad.cpu(), os.path.join(OUT_DIR, "torch_bias_grad.pt"))
def run():
    """Build a FlexFlow embedding model, run fwd/bwd, and save outputs/params."""
    NUM_EMBEDDINGS = 250112
    EMBEDDING_DIM = 512
    # int64 token ids in [0, NUM_EMBEDDINGS).
    inp: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH),
        dtype="int64",
        low=0,
        high=NUM_EMBEDDINGS,
    )
    label: torch.Tensor = gen_tensor(
        (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_DIM),
        dtype="float32",
    )

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(inp.shape, DataType.DT_INT64)
    output_tensor = ffmodel.embedding(
        input=input_tensor,
        num_embeddings=NUM_EMBEDDINGS,
        embedding_dim=EMBEDDING_DIM,
        aggr=AggrMode.AGGR_MODE_NONE,
        kernel_initializer=NormInitializer(seed=42, mean=0, stddev=1),
        name="embedding",
    )

    compile_ffmodel(ffmodel)
    dls = init_ffmodel(ffmodel, ((input_tensor, inp),), label)
    assert len(dls) == 2
    inp_dl, label_dl = dls
    run_fwd_bwd(ffmodel, ffconfig, (inp_dl,), label_dl)

    embedding_layer: Op = ffmodel.get_layers()[0]
    assert isinstance(embedding_layer, Embedding)
    embedding_weight: Parameter = embedding_layer.get_weight_tensor()

    # Save output/gradient data, then the embedding table and its gradient.
    save_tensor_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out.pt"))
    save_tensor_grad_ff(output_tensor, ffmodel, os.path.join(OUT_DIR, "ff_out_grad.pt"))
    save_param_ff(embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight.pt"))
    save_param_grad_ff(
        embedding_weight, ffmodel, os.path.join(OUT_DIR, "ff_weight_grad.pt")
    )