def popart_result_and_model(popart_config, weight_decay=0, lr=0, l1_lambda=0):
    """Run a Bert feed-forward graph with an L1 loss and a factory-built optimizer.

    A fresh ``popart.Builder`` and ``Bert`` model are created from
    ``popart_config``, one random input is fed through
    ``Bert.feed_forward``, and the graph is executed through ``run_py``.

    Args:
        popart_config: Configuration object; this function reads its
            ``popart_dtype``, ``dtype``, ``batch_size``,
            ``sequence_length`` and ``hidden_size`` attributes.
        weight_decay: Forwarded to ``MockArgs`` as its second argument.
        lr: Forwarded to ``MockArgs`` as its first argument.
        l1_lambda: Third argument of ``popart.L1Loss``.

    Returns:
        A 4-tuple ``(input_array, outputs, proto, post_proto)`` where
        ``input_array`` is the generated input, ``proto`` is the model proto
        taken from the builder, and ``outputs`` / ``post_proto`` are the two
        values returned by ``run_py``.
    """
    graph_builder = popart.Builder()
    model = Bert(popart_config, builder=graph_builder)

    # The input is declared 2-D: (batch_size * sequence_length, hidden_size).
    tensor_dtype = popart_config.popart_dtype
    flat_shape = [
        popart_config.batch_size * popart_config.sequence_length,
        popart_config.hidden_size,
    ]
    in_info = popart.TensorInfo(tensor_dtype, flat_shape)
    in_id = graph_builder.addInputTensor(in_info)

    # Normally distributed input (mean 0, std 0.02) cast to the config dtype.
    sample = np.random.normal(0, 0.02, in_info.shape())
    feed = {in_id: sample.astype(popart_config.dtype)}

    fwd_out = model.feed_forward(in_id)
    proto = graph_builder.getModelProto()

    loss = popart.L1Loss(fwd_out, "l1LossVal", l1_lambda)

    mock_iteration = MockIteration()
    mock_args = MockArgs(lr, weight_decay)
    factory = BaseOptimizerFactory(mock_args, mock_iteration, model.tensors)
    optimizer = factory.create()

    anchors = (fwd_out, loss.output(0))
    outputs, post_proto = run_py(proto,
                                 feed,
                                 anchors,
                                 loss=loss,
                                 optimizer=optimizer)

    return feed[in_id], outputs, proto, post_proto
def popart_result_and_model(popart_config, is_bwd=False):
    """Run a Bert feed-forward graph, optionally with an L1 loss and ConstSGD.

    Args:
        popart_config: Configuration object; this function reads its
            ``popart_dtype``, ``dtype``, ``batch_size``,
            ``sequence_length`` and ``hidden_size`` attributes.
        is_bwd: When true, an L1 loss (lambda 0.1) and a
            ``popart.ConstSGD(0.01)`` optimizer are passed to ``run_py`` and
            the loss output is requested alongside the model output. When
            false, only the model output is requested.

    Returns:
        A 4-tuple ``(input_array, outputs, proto, post_proto)`` where
        ``input_array`` is the generated input, ``proto`` is the model proto
        taken from the builder, and ``outputs`` / ``post_proto`` are the two
        values returned by ``run_py``.
    """
    graph_builder = popart.Builder()
    model = Bert(popart_config, builder=graph_builder)

    # The input is declared 2-D: (batch_size * sequence_length, hidden_size).
    tensor_dtype = popart_config.popart_dtype
    flat_shape = [
        popart_config.batch_size * popart_config.sequence_length,
        popart_config.hidden_size,
    ]
    in_info = popart.TensorInfo(tensor_dtype, flat_shape)
    in_id = graph_builder.addInputTensor(in_info)

    # Normally distributed input (mean 0, std 0.02) cast to the config dtype.
    sample = np.random.normal(0, 0.02, in_info.shape())
    feed = {in_id: sample.astype(popart_config.dtype)}

    fwd_out = model.feed_forward(in_id)
    proto = graph_builder.getModelProto()

    if not is_bwd:
        # Forward-only run: no loss, no optimizer.
        outputs, post_proto = run_py(proto, feed, fwd_out)
    else:
        loss_scale = 0.1
        loss = popart.L1Loss(fwd_out, "l1LossVal", loss_scale)
        sgd = popart.ConstSGD(0.01)
        outputs, post_proto = run_py(proto,
                                     feed,
                                     (fwd_out, loss.output(0)),
                                     loss=loss,
                                     optimizer=sgd)

    return feed[in_id], outputs, proto, post_proto
def popart_result_and_model(popart_config, is_bwd=False, momentum=0.0):
    """Run a Bert feed-forward graph, optionally with an L1 loss and SGD.

    NOTE(review): the training path reads ``lr`` and ``num_reps_bwd``, which
    are neither parameters nor locals of this function. They are resolved
    from the enclosing scope at call time, presumably as module-level
    constants; confirm they are defined wherever this helper lives.

    Args:
        popart_config: Configuration object; this function reads its
            ``popart_dtype``, ``dtype``, ``micro_batch_size``,
            ``sequence_length`` and ``hidden_size`` attributes.
        is_bwd: When true, a sum-reduced L1 loss (lambda 0.1) is added to the
            graph and an optimizer is passed to ``run_py``. When false, only
            the model output is requested.
        momentum: When greater than 0.0, a ``popart.SGD`` with this momentum
            and zero weight decay is used; otherwise ``popart.ConstSGD``.
            Only consulted when ``is_bwd`` is true.

    Returns:
        A 4-tuple ``(input_array, outputs, proto, post_proto)`` where
        ``input_array`` is the generated input, ``proto`` is the model proto
        taken from the model's builder, and ``outputs`` / ``post_proto`` are
        the two values returned by ``run_py``.
    """
    model = Bert(popart_config)
    graph_builder = model.builder

    # The input is declared 2-D:
    # (micro_batch_size * sequence_length, hidden_size).
    tensor_dtype = popart_config.popart_dtype
    flat_shape = [
        popart_config.micro_batch_size * popart_config.sequence_length,
        popart_config.hidden_size,
    ]
    in_info = popart.TensorInfo(tensor_dtype, flat_shape)
    in_id = graph_builder.addInputTensor(in_info)

    # Normally distributed input (mean 0, std 0.02) cast to the config dtype.
    sample = np.random.normal(0, 0.02, in_info.shape())
    feed = {in_id: sample.astype(popart_config.dtype)}

    fwd_out = model.feed_forward(in_id)

    if not is_bwd:
        # Forward-only run: no loss, no optimizer.
        proto = graph_builder.getModelProto()
        outputs, post_proto = run_py(proto, feed, fwd_out)
        return feed[in_id], outputs, proto, post_proto

    # The loss op is added to the graph before the proto is serialised.
    loss = graph_builder.aiGraphcore.l1loss(
        [fwd_out],
        0.1,
        debugContext="l1LossVal",
        reduction=popart.ReductionType.Sum)
    proto = graph_builder.getModelProto()

    if momentum > 0.0:
        sgd_settings = {
            "defaultLearningRate": (lr, False),
            "defaultMomentum": (momentum, False),
            "defaultWeightDecay": (0.0, False),
        }
        optimizer = popart.SGD(sgd_settings)
    else:
        optimizer = popart.ConstSGD(lr)

    outputs, post_proto = run_py(proto,
                                 feed,
                                 (fwd_out, loss),
                                 loss=loss,
                                 optimizer=optimizer,
                                 num_reps=num_reps_bwd)
    return feed[in_id], outputs, proto, post_proto
def popart_result_and_model(popart_config, weight_decay=0.0, lr=0.0, l1_lambda=0.0):
    """Run a Bert feed-forward graph with an L1 loss and a factory-built optimizer.

    A ``Bert`` model is created from ``popart_config``, one random input is
    fed through ``Bert.feed_forward``, a sum-reduced L1 loss is added to the
    graph, and the graph is executed through ``run_py``.

    Args:
        popart_config: Configuration object; this function reads its
            ``popart_dtype``, ``dtype``, ``micro_batch_size``,
            ``sequence_length`` and ``hidden_size`` attributes.
        weight_decay: Forwarded to ``MockArgs`` as its third argument.
        lr: Forwarded to ``MockArgs`` as its second argument.
        l1_lambda: Scale passed to ``aiGraphcore.l1loss``.

    Returns:
        A 4-tuple ``(input_array, outputs, proto, post_proto)`` where
        ``input_array`` is the generated input, ``proto`` is the model proto
        taken from the model's builder, and ``outputs`` / ``post_proto`` are
        the two values returned by ``run_py``.
    """
    model = Bert(popart_config)
    graph = model.builder

    # The input is declared 2-D:
    # (micro_batch_size * sequence_length, hidden_size).
    tensor_dtype = popart_config.popart_dtype
    flat_shape = [
        popart_config.micro_batch_size * popart_config.sequence_length,
        popart_config.hidden_size,
    ]
    in_info = popart.TensorInfo(tensor_dtype, flat_shape)
    in_id = graph.addInputTensor(in_info)

    # Normally distributed input (mean 0, std 0.02) cast to the config dtype.
    sample = np.random.normal(0, 0.02, in_info.shape())
    feed = {in_id: sample.astype(popart_config.dtype)}

    fwd_out = model.feed_forward(in_id)

    # The loss op is added to the graph before the proto is serialised.
    loss = graph.aiGraphcore.l1loss([fwd_out],
                                    l1_lambda,
                                    debugContext="l1LossVal",
                                    reduction=popart.ReductionType.Sum)
    proto = graph.getModelProto()

    mock_iteration = MockIteration()
    mock_args = MockArgs("SGD", lr, weight_decay)
    factory = BaseOptimizerFactory(mock_args, mock_iteration, model.tensors)
    optimizer = factory.create()

    outputs, post_proto = run_py(proto,
                                 feed,
                                 (fwd_out, loss),
                                 loss=loss,
                                 optimizer=optimizer)

    return feed[in_id], outputs, proto, post_proto