def main(arg_list=None):
    """Validate the checkpoints stored in the requested checkpoint directory.

    ``arg_list`` is handed to ``parse_args``, which yields this script's own
    options plus the arguments left over. The leftovers, extended with the
    checkpoint directory and (when absent) a config path, are parsed by
    ``utils.parse_bert_args``.

    Args:
        arg_list: Argument list passed straight to ``parse_args``
            (defaults to ``None``).

    Returns:
        The result of ``validate_checkpoints``.
    """
    script_opts, forwarded = parse_args(arg_list)
    ckpt_root = script_opts.checkpoint_dir
    forwarded += ["--checkpoint-dir", ckpt_root]

    # No explicit config given: fall back to the one that should have been
    # saved alongside the checkpoints.
    if "--config" not in forwarded:
        forwarded += ["--config", find_checkpoint_config(ckpt_root)]

    bert_args = utils.parse_bert_args(forwarded)

    if not script_opts.no_logger_setup:
        setup_logger(logging.getLevelName('INFO'))

    forced_options = {
        # Variable weights must be forced in inference mode, otherwise the
        # model weights cannot be overridden for each new checkpoint.
        "variable_weights_inference": True,
        # Required to allow squeezed models to fit.
        "max_copy_merge_size": 32000,
    }
    for option, value in forced_options.items():
        setattr(bert_args, option, value)

    logger.info("Program Start")

    # `parse_bert_args` suffixes the user-supplied checkpoint path with the
    # current date/time. Rather than modify core Bert code, drop the directory
    # it created (its config is not needed) and point back at the parent.
    timestamped_dir = bert_args.checkpoint_dir
    shutil.rmtree(timestamped_dir)
    bert_args.checkpoint_dir = os.path.dirname(timestamped_dir)

    logger.info(
        f"Validating over checkpoints in directory {bert_args.checkpoint_dir}")

    validation_args = utils.get_validation_args(bert_args)
    return validate_checkpoints(script_opts, validation_args)
# Define a specific Handler for this file that removes the root name. console = logging.StreamHandler() console.setLevel(log_level) formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', '%Y-%m-%d %H:%M:%S') console.setFormatter(formatter) logger.addHandler(console) logger.propagate = False if __name__ == "__main__": args = utils.parse_bert_args() setup_logger(logging.getLevelName(args.log_level)) logger.info("Program Start") logger.info("Hostname: " + socket.gethostname()) logger.info("Command Executed: " + str(sys.argv)) # Run the main inference/training session by default if args.inference or not args.no_training: main(args) # If this was a training session and validation isn't disabled; validate. if not args.inference and not args.no_validation and not args.no_model_save: logger.info("Doing Validation") main(utils.get_validation_args(args)) logger.info("Program Finished")
def main(args):
    """Build the BERT model described by ``args`` and run it.

    With ``args.inference`` set, an inference session is created and
    ``bert_infer_loop`` is run. Otherwise a training session is run unless
    ``args.no_training`` is set, after which ``main`` is called again with
    ``utils.get_validation_args(args)`` unless ``args.no_validation`` is set.

    Args:
        args: Parsed options object. Read directly here: ``seed``,
            ``inference``, ``no_training``, ``no_validation`` and
            ``aggregate_metrics_over_steps``; it is also passed on to the
            graph, dataset, device and session helpers.

    Returns:
        A ``(session, iteration)`` tuple. ``session`` is ``None`` when no
        session was created, i.e. training mode with ``args.no_training``.
    """
    set_library_seeds(args.seed)

    config = bert_config_from_args(args)
    initializers = bert_pretrained_initialisers(config, args)

    logger.info("Building Model")
    # Specifying ai.onnx opset9 for the slice syntax
    # TODO: Change slice to opset10
    model = Bert(config,
                 builder=popart.Builder(
                     opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1}),
                 initializers=initializers)

    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks)

    if args.inference:
        outputs = bert_add_infer_outputs(model, logits)
        losses = []
        writer = None
    else:
        predictions, probs = bert_infer_graph(model, logits)
        losses = bert_loss_graph(model, probs, labels)
        outputs = bert_add_validation_outputs(model, predictions, losses)
        writer = bert_writer(args)

    dataset = get_bert_dataset(model,
                               args,
                               [indices, positions, segments, masks, labels])
    logger.info(f"Dataset length: {len(dataset)}")

    data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

    iteration = Iteration(
        args,
        batches_per_step=dataset.batches_per_step,
        steps_per_epoch=len(dataset),
        writer=writer,
        recording_steps=args.aggregate_metrics_over_steps)

    # Only the first value is needed to acquire the device.
    request_ipus, _ = calc_required_ipus(args, model)
    device = acquire_device(args, request_ipus)

    # Bound up front so the final `return` is valid even when neither an
    # inference nor a training session gets created below.
    session = None

    if args.inference:
        session, anchors = bert_inference_session(
            model, args, data_flow, losses, device)
        logger.info("Inference Started")
        bert_infer_loop(args, session, dataset, logits, anchors, iteration)
        device.detach()
    else:
        if args.no_training:
            # Nothing will run on this device; release it before the nested
            # validation run acquires its own.
            device.detach()
        else:
            optimizer_factory = ScheduledOptimizerFactory(
                args, iteration, model.pipeline_stage_tensors)
            session, anchors = bert_training_session(
                model, args, data_flow, losses, device, optimizer_factory)
            logger.info("Training Started")
            bert_train_loop(args, session, writer,
                            dataset, labels, predictions, losses, anchors,
                            iteration, optimizer_factory)
            device.detach()
            logger.info("Training Finished")

        if not args.no_validation:
            logger.info("Doing Validation")
            # NOTE(review): this recursion ends only if the validation args
            # select the inference branch above - confirm in
            # `utils.get_validation_args`.
            main(utils.get_validation_args(args))

    return session, iteration