def monitor(monitor_config: AMLTensorBoardMonitorConfig, azure_config: AzureConfig) -> None:
    """
    Starts TensorBoard monitoring as per the provided arguments.

    The runs to monitor are either the ones listed explicitly in
    ``monitor_config.run_ids`` or, when that is None, the runs of
    ``monitor_config.experiment_name`` filtered by ``monitor_config.run_status``.
    The process exits with status 1 when there is nothing to monitor.

    :param monitor_config: The config containing information on which runs that need be monitored.
    :param azure_config: An AzureConfig object with secrets/keys to access the workspace.
    """
    # Fetch AzureML workspace and the experiment runs in it
    workspace = azure_config.get_workspace()

    if monitor_config.run_ids is not None:
        if not monitor_config.run_ids:
            print("At least one run_recovery_id must be given for monitoring.")
            sys.exit(1)
        exp_runs = [azure_util.fetch_run(workspace, run_id) for run_id in monitor_config.run_ids]
    else:
        if monitor_config.experiment_name not in workspace.experiments:
            print(f"The experiment: {monitor_config.experiment_name} doesn't "
                  f"exist in the {monitor_config.workspace_name} workspace.")
            sys.exit(1)

        experiment = Experiment(workspace, monitor_config.experiment_name)
        # An empty/missing status string means "no status filter".
        filters = common_util.get_items_from_string(monitor_config.run_status) \
            if monitor_config.run_status else []
        exp_runs = azure_util.fetch_runs(experiment, filters)

        if not exp_runs:
            _msg = "No runs to monitor"
            if monitor_config.run_status:
                # Leading space keeps the two parts of the sentence apart.
                _msg += f" with status [{monitor_config.run_status}]."
            print(_msg)
            sys.exit(1)

    # Start TensorBoard on executing machine
    ts = Tensorboard(exp_runs, local_root=str(monitor_config.local_root), port=monitor_config.port)

    print("==============================================================================")
    for run in exp_runs:
        print(f"Run URL: {run.get_portal_url()}")
    print("TensorBoard URL: ")
    ts.start()
    try:
        print("==============================================================================\n\n")
        input("Press Enter to close TensorBoard...")
    finally:
        # Always shut TensorBoard down, even if the wait is interrupted (e.g. Ctrl+C / EOF).
        ts.stop()
class Train():
    """
    Command-line driven training job that runs inside an Azure ML run.

    On construction the command-line arguments are parsed and the current
    run context, its experiment/workspace, a TensorBoard instance attached to
    the run and the configured datastore are resolved. ``training`` then
    downloads the image-set lists, trains the ``YOLOv3`` model and uploads the
    zipped checkpoints, logs and saved model to the run.
    """

    def __init__(self):
        """Parse the command-line arguments and bind the Azure ML run context."""
        self._parser = argparse.ArgumentParser("train")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")
        self._args = self._parser.parse_args()

        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._tb = Tensorboard([self._run])
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)

    def __get_mime_type(self, file_path):
        """Return the result of ``mime_content_type`` for ``file_path``."""
        return mime_content_type(file_path)

    def training(self):
        """
        Train the model and upload the resulting artifacts to the run.

        Steps: download the image-set lists, build the model, train for
        ``cfg.TRAIN.EPOCHS`` epochs while TensorBoard is running, save the
        model, zip and upload checkpoints/logs/model, tag the run with the
        release id and mark it complete.
        """
        self.__getDataset()
        trainset = Dataset('train')
        logdir = "./data/log"
        steps_per_epoch = len(trainset)
        global_steps = tf.Variable(1, trainable=False, dtype=tf.int64)
        warmup_steps = cfg.TRAIN.WARMUP_EPOCHS * steps_per_epoch
        total_steps = cfg.TRAIN.EPOCHS * steps_per_epoch

        input_tensor = tf.keras.layers.Input([416, 416, 3])
        conv_tensors = YOLOv3(input_tensor)

        # The model exposes, for every output branch, the raw conv tensor
        # followed by its decoded prediction; __train_step relies on this order.
        output_tensors = []
        for i, conv_tensor in enumerate(conv_tensors):
            pred_tensor = decode(conv_tensor, i)
            output_tensors.append(conv_tensor)
            output_tensors.append(pred_tensor)

        model = tf.keras.Model(input_tensor, output_tensors)
        optimizer = tf.keras.optimizers.Adam()

        # Start from an empty log directory so old summaries are not mixed in.
        if os.path.exists(logdir):
            shutil.rmtree(logdir)
        writer = tf.summary.create_file_writer(logdir)

        self._tb.start()
        try:
            for epoch in range(cfg.TRAIN.EPOCHS):
                print(epoch)
                for image_data, target in trainset:
                    self.__train_step(image_data, target, model, global_steps,
                                      writer, optimizer, warmup_steps,
                                      total_steps)
                model.save_weights(self._args.ckpt_path)
        finally:
            # Stop TensorBoard even when a training step fails.
            self._tb.stop()

        model.save("./models")

        # NOTE(review): the "checkpoint" folder is hard-coded here while the
        # weights are written to --ckpt_path; they only match for the default.
        zipFolder("check.zip", "checkpoint")
        zipFolder("log.zip", "data/log")
        zipFolder("model.zip", "models")

        self._run.upload_file(name='check.zip', path_or_stream="check.zip")
        print(
            f"Uploaded the checkpoints to experiment {self._run.experiment.name}"
        )
        self._run.upload_file(name='log.zip', path_or_stream="log.zip")
        print(f"Uploaded the tfruns to experiment {self._run.experiment.name}")
        self._run.upload_file(name='model.zip', path_or_stream="model.zip")
        print(f"Uploaded the model to experiment {self._run.experiment.name}")

        print("Following files are uploaded")
        print(self._run.get_file_names())

        self._run.add_properties({
            "release_id": self._args.release_id,
            "run_type": "train"
        })
        print(f"added properties: {self._run.properties}")

        self._run.complete()

    def __getDataset(self):
        """
        Download the train/test image-set lists from the storage container.

        The first blob matching the prefix ``voc_train.txt`` and the first one
        matching ``voc_test.txt`` are written below ``./data/dataset/``. The
        target directory is created when it does not exist yet.
        """
        voc_train = self._datastore.blob_service.list_blobs(
            self._args.storage_container, prefix='voc_train.txt')
        voc_test = self._datastore.blob_service.list_blobs(
            self._args.storage_container, prefix='voc_test.txt')

        voc_train_imagesets = list(voc_train)
        print("Succesfully get voc_train.txt")
        voc_test_imagesets = list(voc_test)
        print("Succesfully get voc_test.txt")

        for blob in (voc_train_imagesets[0], voc_test_imagesets[0]):
            local_path = f'./data/dataset/{blob.name}'
            # The download opens the local file for writing, so its parent
            # directory has to exist beforehand.
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            self._datastore.blob_service.get_blob_to_path(
                self._args.storage_container, blob.name, local_path)

    def __train_step(self, image_data, target, model, global_steps, writer,
                     optimizer, warmup_steps, total_steps):
        """
        Run one optimisation step, update the learning rate and log summaries.

        :param image_data: Input batch passed to ``model``.
        :param target: Per-branch targets; ``target[i]`` is unpacked into ``compute_loss``.
        :param model: Model returning (conv, pred) pairs for three branches.
        :param global_steps: Step counter variable, incremented by one here.
        :param writer: Summary writer receiving the learning rate and losses.
        :param optimizer: Optimizer whose ``lr`` is reassigned every step.
        :param warmup_steps: Number of steps of the linear warm-up phase.
        :param total_steps: Total number of training steps of the schedule.
        """
        with tf.GradientTape() as tape:
            pred_result = model(image_data, training=True)
            giou_loss = conf_loss = prob_loss = 0

            # Outputs are interleaved as conv_0, pred_0, conv_1, pred_1, ...
            for i in range(3):
                conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
                loss_items = compute_loss(pred, conv, *target[i], i)
                giou_loss += loss_items[0]
                conf_loss += loss_items[1]
                prob_loss += loss_items[2]

            total_loss = giou_loss + conf_loss + prob_loss

        gradients = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        tf.print(
            "=> STEP %4d   lr: %.6f   giou_loss: %4.2f   conf_loss: %4.2f   "
            "prob_loss: %4.2f   total_loss: %4.2f" %
            (global_steps, optimizer.lr.numpy(), giou_loss, conf_loss,
             prob_loss, total_loss))

        # Learning-rate schedule: linear ramp up to LR_INIT during warm-up,
        # afterwards a cosine curve from LR_INIT down to LR_END.
        global_steps.assign_add(1)
        if global_steps < warmup_steps:
            lr = global_steps / warmup_steps * cfg.TRAIN.LR_INIT
        else:
            lr = cfg.TRAIN.LR_END + 0.5 * (
                cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * ((1 + tf.cos(
                    (global_steps - warmup_steps) /
                    (total_steps - warmup_steps) * np.pi)))
        optimizer.lr.assign(lr.numpy())

        with writer.as_default():
            tf.summary.scalar("lr", optimizer.lr, step=global_steps)
            tf.summary.scalar("loss/total_loss", total_loss, step=global_steps)
            tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps)
            tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps)
            tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps)
        writer.flush()
nargs='+', default=None, help='runids to create') return parser.parse_args() args = parse_args() print(args) if args.runids: # get workspace ws = Workspace.from_config() # set the expiriment experiment_name = 'test' exp = Experiment(workspace=ws, name=experiment_name) runs = [] for idx in args.runids: run = Run(exp, idx) runs.append(run) tb = Tensorboard(runs) tb.start() ## Wait for input to stop tensorboard. print('Enter to stop tensorboard') input() tb.stop()
def main():
    """
    Run the experiment for training.

    Submits ``train_keras.py`` to the ``cpu-cluster`` compute target with the
    MNIST file dataset mounted as input, streams the run to a local
    TensorBoard instance until the run has finished and the user presses
    Enter, and finally registers the trained model together with the last
    logged training/validation metrics.
    """
    work_space = Workspace.from_config()

    # Set up the dataset for training
    datastore = work_space.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist"))

    # Set up the experiment for training
    experiment = Experiment(workspace=work_space, name="keras-lenet-train")
    # azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000
    config = ScriptRunConfig(
        source_directory=".",
        script="train_keras.py",
        compute_target="cpu-cluster",
        arguments=[
            "--data_folder",
            dataset.as_named_input("input").as_mount(),
        ],
    )

    # Set up the TensorFlow/Keras environment
    environment = Environment("keras-environment")
    environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.7.7",
        pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"])
    config.run_config.environment = environment

    # Run the experiment for training
    run = experiment.submit(config)
    aml_url = run.get_portal_url()
    print(
        "Submitted to an Azure Machine Learning compute cluster. Click on the link below"
    )
    print("")
    print(aml_url)

    tboard = Tensorboard([run])
    # If successful, start() returns a string with the URI of the instance.
    tboard.start(start_browser=True)
    try:
        run.wait_for_completion(show_output=True)
        print("Press enter to stop")
        input()
    finally:
        # Stop the streaming in every case (failed run, Ctrl+C, EOF);
        # otherwise TensorBoard keeps running after this script ends.
        tboard.stop()

    # Register Model
    metrics = run.get_metrics()
    run.register_model(
        model_name="keras_mnist",
        tags={
            "data": "mnist",
            "model": "classification"
        },
        model_path="outputs/keras_lenet.h5",
        model_framework=Model.Framework.TENSORFLOW,
        model_framework_version="2.3.1",
        properties={
            # Each metric is indexed with [-1], i.e. the last logged value.
            "train_loss": metrics["train_loss"][-1],
            "train_accuracy": metrics["train_accuracy"][-1],
            "val_loss": metrics["val_loss"][-1],
            "val_accuracy": metrics["val_accuracy"][-1],
        },
    )