Code example #1
# Assumed imports: AMLTensorBoardMonitorConfig, AzureConfig, azure_util and
# common_util are project-internal helpers; the rest is stdlib / azureml-sdk.
import sys

from azureml.core import Experiment
from azureml.tensorboard import Tensorboard


def monitor(monitor_config: AMLTensorBoardMonitorConfig,
            azure_config: AzureConfig) -> None:
    """
    Starts TensorBoard monitoring as per the provided arguments.
    :param monitor_config: The config describing which runs should be monitored.
    :param azure_config: An AzureConfig object with secrets/keys to access the workspace.
    """
    # Fetch AzureML workspace and the experiment runs in it
    workspace = azure_config.get_workspace()

    if monitor_config.run_ids is not None:
        if len(monitor_config.run_ids) == 0:
            print("At least one run_recovery_id must be given for monitoring.")
            sys.exit(1)
        exp_runs = [
            azure_util.fetch_run(workspace, run_id)
            for run_id in monitor_config.run_ids
        ]
    else:
        if monitor_config.experiment_name not in workspace.experiments:
            print(f"The experiment: {monitor_config.experiment_name} doesn't "
                  f"exist in the {monitor_config.workspace_name} workspace.")
            sys.exit(1)

        experiment = Experiment(workspace, monitor_config.experiment_name)
        filters = common_util.get_items_from_string(
            monitor_config.run_status) if monitor_config.run_status else []

        exp_runs = azure_util.fetch_runs(experiment, filters)

        if len(exp_runs) == 0:
            _msg = "No runs to monitor"
            if monitor_config.run_status:
                _msg += f"with status [{monitor_config.run_status}]."
            print(_msg)
            sys.exit(1)

    # Start TensorBoard on executing machine
    ts = Tensorboard(exp_runs,
                     local_root=str(monitor_config.local_root),
                     port=monitor_config.port)

    print(
        "=============================================================================="
    )
    for run in exp_runs:
        print(f"Run URL: {run.get_portal_url()}")
    print("TensorBoard URL: ")
    ts.start()
    print(
        "==============================================================================\n\n"
    )
    input("Press Enter to close TensorBoard...")
    ts.stop()
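
The listing above depends on project-specific config classes. The same flow in
a self-contained form, a minimal sketch assuming a local config.json for the
workspace and a hypothetical run ID:

from azureml.core import Workspace
from azureml.tensorboard import Tensorboard

ws = Workspace.from_config()
run = ws.get_run("my_experiment_1602000000_abcd1234")  # hypothetical run ID
tb = Tensorboard([run], port=6006)
print(f"TensorBoard URL: {tb.start()}")  # start() returns the local URI
input("Press Enter to close TensorBoard...")
tb.stop()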
Code example #2
File: train.py Project: tusharkalecam/MLOps-YoloV3
# Assumed imports: cfg, Dataset, YOLOv3, decode, compute_loss, zipFolder and
# mime_content_type are project-specific helpers from the MLOps-YoloV3 repo.
import argparse
import os
import shutil

import numpy as np
import tensorflow as tf
from azureml.core import Datastore, Run
from azureml.tensorboard import Tensorboard


class Train:
    def __init__(self):
        self._parser = argparse.ArgumentParser("train")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")

        self._args = self._parser.parse_args()
        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._tb = Tensorboard([self._run])
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)

    def __get_mime_type(self, file_path):
        return mime_content_type(file_path)

    def training(self):

        self.__getDataset()

        trainset = Dataset('train')
        logdir = "./data/log"
        steps_per_epoch = len(trainset)
        global_steps = tf.Variable(1, trainable=False, dtype=tf.int64)
        warmup_steps = cfg.TRAIN.WARMUP_EPOCHS * steps_per_epoch
        total_steps = cfg.TRAIN.EPOCHS * steps_per_epoch

        input_tensor = tf.keras.layers.Input([416, 416, 3])
        conv_tensors = YOLOv3(input_tensor)

        output_tensors = []
        for i, conv_tensor in enumerate(conv_tensors):
            pred_tensor = decode(conv_tensor, i)
            output_tensors.append(conv_tensor)
            output_tensors.append(pred_tensor)

        model = tf.keras.Model(input_tensor, output_tensors)
        optimizer = tf.keras.optimizers.Adam()
        if os.path.exists(logdir):
            shutil.rmtree(logdir)
        writer = tf.summary.create_file_writer(logdir)

        self._tb.start()
        for epoch in range(cfg.TRAIN.EPOCHS):
            print(epoch)
            for image_data, target in trainset:
                self.__train_step(image_data, target, model, global_steps,
                                  writer, optimizer, warmup_steps, total_steps)
            model.save_weights(self._args.ckpt_path)
        self._tb.stop()
        model.save(f"./models")

        zipFolder("check.zip", "checkpoint")
        zipFolder("log.zip", "data/log")
        zipFolder("model.zip", "models")

        self._run.upload_file(name='check.zip', path_or_stream="check.zip")
        print(
            f"Uploaded the checkpoints to experiment {self._run.experiment.name}"
        )
        self._run.upload_file(name='log.zip', path_or_stream="log.zip")
        print(f"Uploaded the tfruns to experiment {self._run.experiment.name}")
        self._run.upload_file(name='model.zip', path_or_stream="model.zip")
        print(f"Uploaded the model to experiment {self._run.experiment.name}")

        print("Following files are uploaded")
        print(self._run.get_file_names())

        self._run.add_properties({
            "release_id": self._args.release_id,
            "run_type": "train"
        })
        print(f"added properties: {self._run.properties}")

        self._run.complete()

    def __getDataset(self):
        voc_train = self._datastore.blob_service.list_blobs(
            self._args.storage_container, prefix='voc_train.txt')
        voc_test = self._datastore.blob_service.list_blobs(
            self._args.storage_container, prefix='voc_test.txt')

        voc_train_imagesets = list(voc_train)
        print("Successfully fetched voc_train.txt")
        voc_test_imagesets = list(voc_test)
        print("Successfully fetched voc_test.txt")

        self._datastore.blob_service.get_blob_to_path(
            self._args.storage_container, voc_train_imagesets[0].name,
            f'./data/dataset/{voc_train_imagesets[0].name}')
        self._datastore.blob_service.get_blob_to_path(
            self._args.storage_container, voc_test_imagesets[0].name,
            f'./data/dataset/{voc_test_imagesets[0].name}')

    def __train_step(self, image_data, target, model, global_steps, writer,
                     optimizer, warmup_steps, total_steps):
        with tf.GradientTape() as tape:
            pred_result = model(image_data, training=True)
            giou_loss = conf_loss = prob_loss = 0

            for i in range(3):
                conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
                loss_items = compute_loss(pred, conv, *target[i], i)
                giou_loss += loss_items[0]
                conf_loss += loss_items[1]
                prob_loss += loss_items[2]

            total_loss = giou_loss + conf_loss + prob_loss

            gradients = tape.gradient(total_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))
            tf.print(
                "=> STEP %4d   lr: %.6f   giou_loss: %4.2f   conf_loss: %4.2f   "
                "prob_loss: %4.2f   total_loss: %4.2f" %
                (global_steps, optimizer.lr.numpy(), giou_loss, conf_loss,
                 prob_loss, total_loss))
            global_steps.assign_add(1)
            if global_steps < warmup_steps:
                lr = global_steps / warmup_steps * cfg.TRAIN.LR_INIT
            else:
                lr = cfg.TRAIN.LR_END + 0.5 * (
                    cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * ((1 + tf.cos(
                        (global_steps - warmup_steps) /
                        (total_steps - warmup_steps) * np.pi)))
            optimizer.lr.assign(lr.numpy())

            with writer.as_default():
                tf.summary.scalar("lr", optimizer.lr, step=global_steps)
                tf.summary.scalar("loss/total_loss",
                                  total_loss,
                                  step=global_steps)
                tf.summary.scalar("loss/giou_loss",
                                  giou_loss,
                                  step=global_steps)
                tf.summary.scalar("loss/conf_loss",
                                  conf_loss,
                                  step=global_steps)
                tf.summary.scalar("loss/prob_loss",
                                  prob_loss,
                                  step=global_steps)
            writer.flush()
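
For reference, the learning-rate schedule inside __train_step (linear warmup
followed by half-cosine decay) isolated as a plain function; a sketch, with
lr_init / lr_end standing in for cfg.TRAIN.LR_INIT / cfg.TRAIN.LR_END:

import numpy as np

def yolo_lr(step, warmup_steps, total_steps, lr_init=1e-4, lr_end=1e-6):
    """Linear warmup to lr_init, then cosine decay down to lr_end."""
    if step < warmup_steps:
        return step / warmup_steps * lr_init
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return lr_end + 0.5 * (lr_init - lr_end) * (1 + np.cos(progress * np.pi))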
Code example #3
import argparse

from azureml.core import Experiment, Run, Workspace
from azureml.tensorboard import Tensorboard


def parse_args():
    # The original listing is truncated here; the imports above, the function
    # header and the parser setup are reconstructed from the surviving tail.
    parser = argparse.ArgumentParser()
    parser.add_argument('--runids',
                        nargs='+',
                        default=None,
                        help='runids to create')

    return parser.parse_args()


args = parse_args()

print(args)

if args.runids:
    # get workspace
    ws = Workspace.from_config()

    # set the experiment
    experiment_name = 'test'
    exp = Experiment(workspace=ws, name=experiment_name)

    runs = []
    for idx in args.runids:
        run = Run(exp, idx)
        runs.append(run)
    tb = Tensorboard(runs)
    tb.start()

    # Wait for input to stop TensorBoard.
    input('Press Enter to stop TensorBoard...')
    tb.stop()
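
Invocation for the listing above (the script name and run IDs are hypothetical):

python start_tensorboard.py --runids exp_0001_aaaa exp_0002_bbbb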
Code example #4
# Assumed imports for this listing (azureml-sdk and azureml-tensorboard):
from azureml.core import (Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import Model
from azureml.tensorboard import Tensorboard


def main():
    """
    Run the experiment for training
    """
    work_space = Workspace.from_config()

    # Set up the dataset for training
    datastore = work_space.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist"))

    # Set up the experiment for training
    experiment = Experiment(workspace=work_space, name="keras-lenet-train")
    # azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000
    config = ScriptRunConfig(
        source_directory=".",
        script="train_keras.py",
        compute_target="cpu-cluster",
        arguments=[
            "--data_folder",
            dataset.as_named_input("input").as_mount(),
        ],
    )

    # Set up the TensorFlow/Keras environment
    environment = Environment("keras-environment")
    environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.7.7",
        pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"])
    config.run_config.environment = environment

    # Run the experiment for training
    run = experiment.submit(config)
    aml_url = run.get_portal_url()
    print(
        "Submitted to an Azure Machine Learning compute cluster. Click on the link below"
    )
    print("")
    print(aml_url)

    tboard = Tensorboard([run])
    # If successful, start() returns a string with the URI of the instance.
    tboard.start(start_browser=True)
    run.wait_for_completion(show_output=True)
    # After the job completes, stop() the streaming session, otherwise it will
    # continue to run.
    input("Press Enter to stop TensorBoard...")
    tboard.stop()

    # Register Model
    metrics = run.get_metrics()
    run.register_model(
        model_name="keras_mnist",
        tags={
            "data": "mnist",
            "model": "classification"
        },
        model_path="outputs/keras_lenet.h5",
        model_framework=Model.Framework.TENSORFLOW,
        model_framework_version="2.3.1",
        properties={
            "train_loss": metrics["train_loss"][-1],
            "train_accuracy": metrics["train_accuracy"][-1],
            "val_loss": metrics["val_loss"][-1],
            "val_accuracy": metrics["val_accuracy"][-1],
        },
    )
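
Once registered, the model can be fetched back in a later session. A minimal
sketch, assuming the same config.json and the model name used in
register_model above:

from azureml.core import Workspace
from azureml.core.model import Model

ws = Workspace.from_config()
model = Model(ws, name="keras_mnist")  # latest registered version by default
print(model.name, model.version, model.properties)
model.download(target_dir="downloaded_model", exist_ok=True)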