Example 1
    def start_experiment(self, engine):
        # log the initial artifacts in the dir
        mlflow.log_artifacts(args.logdir, "training")
        mlflow.log_metric("finished", False)

    def on_train_end(self, args, state, control, **kwargs):
        if self._initialized and state.is_world_process_zero:
            if self._log_artifacts:
                logger.info("Logging artifacts. This may take time.")
                mlflow.log_artifacts(args.output_dir)
            mlflow.end_run()

    log_scalar('test_loss', test_loss, step)
    log_scalar('test_accuracy', test_accuracy, step)


def log_scalar(name, value, step):
    """Log a scalar value to both MLflow and TensorBoard"""
    writer.add_scalar(name, value, step)
    mlflow.log_metric(name, value)


with mlflow.start_run():
    # Log our parameters into mlflow
    for key, value in vars(args).items():
        mlflow.log_param(key, value)

    # Create a SummaryWriter to write TensorBoard events locally
    output_dir = dirpath = tempfile.mkdtemp()
    writer = SummaryWriter(output_dir)
    print("Writing TensorBoard events locally to %s\n" % output_dir)

    # Perform the training
    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test(epoch)

    # Upload the TensorBoard event logs as a run artifact
    print("Uploading TensorBoard events as a run artifact...")
    mlflow.log_artifacts(output_dir, artifact_path="events")
    print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s" %
          os.path.join(mlflow.get_artifact_uri(), "events"))
Example 4
def run_evaluation(
    *,
    model_hash: str,
    thr: float = 0.5,
    dpath: str = "datasets/clouds/38-Cloud/38-Cloud_test",
    gtpath: str = "datasets/clouds/38-Cloud/38-Cloud_test/Entire_scene_gts",
    vpath: str = "datasets/clouds/38-Cloud/38-Cloud_test/Natural_False_Color",
    rpath: str = "artifacts/",
    vids: ("v", multi(min=1)),
    batch_size: int = 32,
    img_ids: ("iid", multi(min=0)),
    mlflow: bool = False,
    run_name: str = None,
):
    """
    Load model given model hash and get evaluation metrics on 38-Cloud testset.

    :param model_hash: MLFlow hash of the model to load.
    :param thr: threshold to be used during evaluation.
    :param dpath: path to dataset.
    :param gtpath: path to dataset ground truths.
    :param vpath: path to dataset (false color) visualisation images.
    :param rpath: path to directory where results
                  and artifacts should be logged.
    :param vids: tuple of ids of images which should be used to create
                 visualisations. If contains '*' visualisations will be
                 created for all images in the dataset.
    :type vids: tuple[str]
    :param batch_size: size of generated batches, only one batch is loaded
           to memory at a time.
    :param img_ids: if given, process only these images.
    :type img_ids: list[int]
    :param mlflow: whether to use MLFlow.
    :param run_name: name of the run.
    """
    snow_imgs = [
        "LC08_L1TP_064015_20160420_20170223_01_T1",
        "LC08_L1TP_035035_20160120_20170224_01_T1",
        "LC08_L1TP_050024_20160520_20170324_01_T1",
    ]
    if img_ids == []:
        img_ids = None
    else:
        snow_imgs = list(set(snow_imgs) & set(img_ids))
    dpath, gtpath, vpath, rpath = make_paths(dpath, gtpath, vpath, rpath)
    rpath = rpath / uuid.uuid4().hex
    print(f"Working dir: {os.getcwd()}, "
          + f"artifacts dir: {rpath}",
          flush=True)
    mpath = Path(
        f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
        # Change init_model to model for old models
        + "artifacts/init_model/data/model.h5"
    )
    # WARNING: If ever upgraded to tf2.x, this way of using metrics will not work
    # because loaded metrics will become MeanMetricWrapper objects in tf2.x and
    # this script isn't prepared for such objects (because MeanMetricWrapper has state,
    # as opposed to present stateless metric functions).
    model = keras.models.load_model(
        mpath,
        custom_objects={
            "jaccard_index_loss": losses.JaccardIndexLoss(),
            "jaccard_index_metric": losses.JaccardIndexMetric(),
            "dice_coeff_metric": losses.DiceCoefMetric(),
            "recall": losses.recall,
            "precision": losses.precision,
            "specificity": losses.specificity,
            # F1 score is needed for old models
            # "f1_score": losses.f1_score,
            "tf": tf,
        },
    )
    model.load_weights(
        f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
        + "artifacts/best_weights/best_weights"
    )
    metrics, _ = evaluate_model(
        model=model,
        thr=thr,
        dpath=dpath,
        gtpath=gtpath,
        vpath=vpath,
        rpath=rpath,
        vids=vids,
        batch_size=batch_size,
        img_ids=img_ids,
        mlflow=mlflow,
        run_name=run_name,
    )
    mean_metrics = {}
    mean_metrics_snow = {}
    for key, value in metrics.items():
        mean_metrics[key] = np.mean(list(value.values()))
        mean_metrics_snow[f"snow_{key}"] = np.mean(
            [value[x] for x in snow_imgs])
    print(mean_metrics, mean_metrics_snow)
    if mlflow:
        log_metrics(mean_metrics)
        log_metrics(mean_metrics_snow)
        log_artifacts(rpath)
Example 5
    def log_artifacts(cls, dir_path, artifact_path=None):
        mlflow.log_artifacts(dir_path, artifact_path)
Example 6
def main():
    with mlflow.start_run(run_name="Gen Expert Data"):
        import argparse
        parser = argparse.ArgumentParser()
        parser.add_argument('expert_policy_file', type=str)
        parser.add_argument('envname', type=str)
        parser.add_argument('--render',
                            type=str2bool,
                            nargs='?',
                            const=True,
                            default=False)
        parser.add_argument("--max_timesteps", type=int)
        parser.add_argument('--num_rollouts',
                            type=int,
                            default=20,
                            help='Number of expert roll outs')
        args = parser.parse_args()
        for k, v in vars(args).items():
            mlflow.log_param(k, v)

        print('loading and building expert policy')
        policy_fn = load_policy.load_policy(args.expert_policy_file)
        print('loaded and built')

        with tf.Session():
            tf_util.initialize()
            returns = []
            observations = []
            actions = []
            steps = []
            for i in range(args.num_rollouts):
                env = gym.make(args.envname)
                if args.render:
                    video_dir = "./videos/{0}/".format(time())
                    env = wrappers.Monitor(env, video_dir, force=True)
                max_steps = args.max_timesteps or env.spec.max_episode_steps
                print("max_steps set to {0}".format(max_steps))
                print('iter', i)
                obs = env.reset()
                done = False
                totalr = 0.
                trial_steps = 0
                while not done:
                    action = policy_fn(obs[None, :])
                    #action = env.action_space.sample()
                    observations.append(obs)
                    actions.append(action)
                    obs, r, done, _ = env.step(action)
                    totalr += r
                    trial_steps += 1
                    if args.render:
                        env.render(mode='rgb_array')
                    if trial_steps % 100 == 0:
                        print("%i/%i" % (trial_steps, max_steps))
                    if trial_steps >= max_steps:
                        print("hit max_steps")
                        break
                returns.append(totalr)
                steps.append(trial_steps)
                env.close()
                if args.render:
                    mlflow.log_artifacts(video_dir)

            for s in steps:
                mlflow.log_metric('steps', s)
            for r in returns:
                mlflow.log_metric('returns', r)
            mlflow.log_metric('mean return', np.mean(returns))
            mlflow.log_metric('std of return', np.std(returns))

            expert_data = {
                'observations': np.array(observations),
                'actions': np.array(actions)
            }

            if not os.path.exists('expert_data'):
                os.makedirs('expert_data')
            filename = os.path.join('expert_data', args.envname + '.pkl')
            with open(filename, 'wb') as f:
                pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
            mlflow.log_artifact(filename, artifact_path="expert_data_file")
Example 7
    def train_models(self, args, base_line=True):
        """
        Train the model and log all the MLflow Metrics
        :param args: command line arguments. If no arguments then use default
        :param base_line: Default flag. Create Baseline model
        """
        # Create TensorFlow Session
        sess = tf.InteractiveSession()

        # Configure output_dir
        output_dir = tempfile.mkdtemp()

        #
        # initialize some classes
        #
        kdata_cls = KIMDB_Data_Utils()
        ktrain_cls = KTrain()
        kplot_cls = KPlot()

        #
        # get IMDB Data
        #
        (train_data,
         train_labels), (test_data, test_labels) = kdata_cls.fetch_imdb_data()

        #
        # prepare and vectorize data
        #
        x_train = kdata_cls.prepare_vectorized_sequences(train_data)
        x_test = kdata_cls.prepare_vectorized_sequences(test_data)

        y_train = kdata_cls.prepare_vectorized_labels(train_labels)
        y_test = kdata_cls.prepare_vectorized_labels(test_labels)

        image_dir = ktrain_cls.get_directory_path("images")
        model_dir = ktrain_cls.get_directory_path("models")

        graph_label_loss = 'Baseline Model: Training and Validation Loss'
        graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
        graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

        if not base_line:
            graph_label_loss = 'Experimental: Training and Validation Loss'
            graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
            graph_image_loss_png = os.path.join(image_dir,
                                                'experimental_loss.png')
            graph_image_acc_png = os.path.join(image_dir,
                                               'experimental_accuracy.png')

        kmodel = KModel()
        if base_line:
            print("Baseline Model:")
            model = kmodel.build_basic_model()
        else:
            print("Experiment Model:")
            model = kmodel.build_experimental_model(args.hidden_layers,
                                                    args.output)

        history = ktrain_cls.compile_and_fit_model(model,
                                                   x_train,
                                                   y_train,
                                                   epochs=args.epochs,
                                                   loss=args.loss,
                                                   output_dir=output_dir)
        model.summary()
        ktrain_cls.print_metrics(history)
        figure_loss = kplot_cls.plot_loss_graph(history, graph_label_loss)
        figure_loss.savefig(graph_image_loss_png)
        figure_acc = kplot_cls.plot_accuracy_graph(history, graph_label_acc)
        figure_acc.savefig(graph_image_acc_png)
        results = ktrain_cls.evaluate_model(model, x_test, y_test)
        print("Average Probability Results:")
        print(results)
        print()
        print("Predictions Results:")
        predictions = model.predict(x_test)
        print(predictions)

        mlflow_server = args.tracking_server
        #
        # We don't want to force people to have tracking server
        # running on localhost as it tracks in mlruns directory
        if mlflow_server:
            # Tracking URI
            if not mlflow_server.startswith("http"):
                mlflow_tracking_uri = 'http://' + mlflow_server + ':5000'
            else:
                mlflow_tracking_uri = mlflow_server
            # Set the Tracking URI
            mlflow.set_tracking_uri(mlflow_tracking_uri)
            print("MLflow Tracking URI: %s" % mlflow_tracking_uri)
        else:
            print("MLflow Tracking URI: %s" % "local directory 'mlruns'")

        with mlflow.start_run():
            # print out current run_uuid
            run_uuid = mlflow.active_run().info.run_uuid
            print("MLflow Run ID: %s" % run_uuid)

            # log parameters
            mlflow.log_param("hidden_layers", args.hidden_layers)
            mlflow.log_param("output", args.output)
            mlflow.log_param("epochs", args.epochs)
            mlflow.log_param("loss_function", args.loss)

            # calculate metrics
            binary_loss = ktrain_cls.get_binary_loss(history)
            binary_acc = ktrain_cls.get_binary_acc(history)
            validation_loss = ktrain_cls.get_validation_loss(history)
            validation_acc = ktrain_cls.get_validation_acc(history)
            average_loss = results[0]
            average_acc = results[1]

            # log metrics
            mlflow.log_metric("binary_loss", binary_loss)
            mlflow.log_metric("binary_acc", binary_acc)
            mlflow.log_metric("validation_loss", validation_loss)
            mlflow.log_metric("validation_acc", validation_acc)
            mlflow.log_metric("average_loss", average_loss)
            mlflow.log_metric("average_acc", average_acc)

            # log artifacts
            mlflow.log_artifacts(image_dir, "images")

            # log model
            mlflow.keras.log_model(model, "models")

            # save model locally
            pathdir = "keras_models/" + run_uuid
            model_dir = self.get_directory_path(pathdir, False)
            ktrain_cls.keras_save_model(model, model_dir)

            # Write out TensorFlow events as a run artifact
            print("Uploading TensorFlow events as a run artifact.")
            mlflow.log_artifacts(output_dir, artifact_path="events")

        print("loss function use", args.loss)
Example 8
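# Truncated snippet (appears to run inside a loop over a sample index i): inputs are built
# from timepoints and sampled points, scopyon.generate_images renders them, and the inputs,
# rendered images and ground-truth spot data are saved as .npy files under `artifacts`,
# which is finally logged to the active MLflow run.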
                             numpy.ones((points.shape[0], 1),
                                        dtype=numpy.float64))))
              for t, points in zip(timepoints, samples)]
    ret = list(
        scopyon.generate_images(inputs,
                                num_frames=num_frames,
                                config=config,
                                rng=rng,
                                full_output=True))

    inputs_ = []
    for t, data in inputs:
        inputs_.extend(([t] + list(row) for row in data))
    inputs_ = numpy.array(inputs_)
    numpy.save(artifacts / f"inputs{i:03d}.npy", inputs_)

    numpy.save(artifacts / f"images{i:03d}.npy",
               numpy.array([img.as_array() for img, infodict in ret]))

    true_data = []
    for t, (_, infodict) in zip(timepoints, ret):
        true_data.extend([t, key] + list(value)
                         for key, value in infodict['true_data'].items())
    true_data = numpy.array(true_data)
    numpy.save(artifacts / f"true_data{i:03d}.npy", true_data)

#!ls ./artifacts

log_artifacts(artifactsPath)
mlflow.end_run()
Example 9
    def train_models(self, args, base_line=True):
        """
        Train the model and log all the MLflow Metrics
        :param args: command line arguments. If no arguments then use default
        :param base_line: Default flag. Create Baseline model
        """

        #
        # initialize some classes
        #
        kdata_cls = KIMDB_Data_Utils()
        ktrain_cls = KTrain()
        kplot_cls = KPlot()

        start_time = time()
        #
        # get IMDB Data
        #
        (train_data, train_labels), (test_data, test_labels) = kdata_cls.fetch_imdb_data()

        #
        # prepare and vectorize data
        #
        x_train = kdata_cls.prepare_vectorized_sequences(train_data)
        x_test = kdata_cls.prepare_vectorized_sequences(test_data)

        y_train = kdata_cls.prepare_vectorized_labels(train_labels)
        y_test = kdata_cls.prepare_vectorized_labels(test_labels)

        image_dir = ktrain_cls.get_directory_path("images")
        model_dir = ktrain_cls.get_directory_path("models")

        graph_label_loss = 'Baseline Model: Training and Validation Loss'
        graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir,'baseline_loss.png')
        graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

        if not base_line:
            graph_label_loss = 'Experimental: Training and Validation Loss'
            graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
            graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
            graph_image_acc_png = os.path.join(image_dir,'experimental_accuracy.png')

        kmodel = KModel()
        if base_line:
            print("Baseline Model:")
            model = kmodel.build_basic_model()
        else:
            print("Experiment Model:")
            model = kmodel.build_experimental_model(args.hidden_layers, args.output)

        history = ktrain_cls.compile_and_fit_model(model, x_train, y_train, epochs=args.epochs, loss=args.loss)
        model.summary()
        ktrain_cls.print_metrics(history)

        figure_loss = kplot_cls.plot_loss_graph(history, graph_label_loss)
        figure_loss.savefig(graph_image_loss_png)

        figure_acc = kplot_cls.plot_accuracy_graph(history, graph_label_acc)
        figure_acc.savefig(graph_image_acc_png)

        results = ktrain_cls.evaluate_model(model, x_test, y_test)

        print("Average Probability Results:")
        print(results)

        print()
        print("Predictions Results:")
        predictions = model.predict(x_test)
        print(predictions)

        timed = time() - start_time

        with mlflow.start_run():
            # log parameters
            mlflow.log_param("hidden_layers", args.hidden_layers)
            mlflow.log_param("output", args.output)
            mlflow.log_param("epochs", args.epochs)
            mlflow.log_param("loss_function", args.loss)
            # log metrics
            mlflow.log_metric("binary_loss", ktrain_cls.get_binary_loss(history))
            mlflow.log_metric("binary_acc",  ktrain_cls.get_binary_acc(history))
            mlflow.log_metric("validation_loss", ktrain_cls.get_binary_loss(history))
            mlflow.log_metric("validation_acc", ktrain_cls.get_validation_acc(history))
            mlflow.log_metric("average_loss", results[0])
            mlflow.log_metric("average_acc", results[1])
            # log artifacts
            mlflow.log_artifacts(image_dir, "images")
            # log model
            mlflow.keras.log_model(model, "models")

        print("This model took", timed, " seconds to train and test.")
        print("loss function use", args.loss)
Example 10
def train_model(args, base_line=True):
    '''
    Train model function
    '''
    graph_label_loss = 'Baseline Model: Training and Validation Loss'
    graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
    graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
    graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

    if not base_line:
        graph_label_loss = 'Experimental: Training and Validation Loss'
        graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
        graph_image_acc_png = os.path.join(image_dir,
                                           'experimental_accuracy.png')

    image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        validation_split=validation_split)

    train_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='training')

    validation_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='validation')

    # Create the model
    model = Sequential()

    model.add(
        Conv2D(args.filters,
               kernel_size=args.kernel_size,
               activation='relu',
               padding='same',
               input_shape=(img_width, img_height, img_num_channels)))
    model.add(Flatten())
    model.add(Dense(args.output, activation='softmax'))

    # Compile the model
    model.compile(loss=args.loss,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  validation_data=validation_generator)

    model.summary()

    print_metrics(history)
    figure_loss = plot_loss_graph(history, graph_label_loss)
    figure_loss.savefig(graph_image_loss_png)
    figure_acc = plot_accuracy_graph(history, graph_label_acc)
    figure_acc.savefig(graph_image_acc_png)
    # print('==================================================')
    # predictions = model.predict(TEST_DATA_DIR)
    # print(predictions)
    # print('==================================================')

    #mlflow.set_experiment(args.experiment_name)
    with mlflow.start_run():
        # print out current run_uuid
        run_uuid = mlflow.active_run().info.run_uuid
        print("MLflow Run ID: %s" % run_uuid)

        # mlflow.create_experiment("Training CNN Model", artifact_location=None)

        # log parameters
        mlflow.log_param("Filters", args.filters)
        mlflow.log_param("Kernel Size", args.kernel_size)
        mlflow.log_param("Output", args.output)
        mlflow.log_param("Epochs", args.epochs)
        mlflow.log_param("Loss", args.loss)
        mlflow.log_param("Optimize", args.optimizer)

        # calculate metrics
        binary_loss = get_binary_loss(history)
        binary_acc = get_binary_acc(history)
        validation_loss = get_validation_loss(history)
        validation_acc = get_validation_acc(history)

        # log metrics
        mlflow.log_metric("binary_loss", binary_loss)
        mlflow.log_metric("binary_acc", binary_acc)
        mlflow.log_metric("validation_loss", validation_loss)
        mlflow.log_metric("validation_acc", validation_acc)

        # log artifacts
        mlflow.log_artifacts(image_dir, "images")

        # log model
        mlflow.keras.log_model(model, "models")

        # save model locally
        pathdir = "../data/out/keras_models/" + run_uuid
        # keras_save_model(model, pathdir)

        # Write out TensorFlow events as a run artifact
        print("Uploading TensorFlow events as a run artifact.")
        mlflow.log_artifacts(output_dir, artifact_path="events")
        mlflow.end_run()
Example 11
    def run(self, args: argparse.Namespace) -> None:
        logger.info("Load config from %s", args.config)
        config = load_yaml(minato.cached_path(args.config), args.overrides)

        logger.info("Configuration: %s", str(config))
        builder = ConfigBuilder.build(config)
        model = builder.model
        train_file = args.train or builder.train_file
        validation_file = args.validation or builder.validation_file

        if not train_file:
            raise ConfigurationError("train file is required.")

        logger.info("Start training...")
        logger.info("Training data: %s", str(train_file))
        logger.info("Validation data: %s", str(validation_file))

        params = {
            "command": " ".join(sys.argv),
            "config_file": args.config,
            "train_file": train_file,
            "validation_file": validation_file,
            "serialization_dir": args.serialization_dir,
            "config": config,
        }

        with _mlflow_start_run():
            serialization_dir = args.serialization_dir
            if args.serialization_dir is None and mlflow is None:
                serialization_dir = "./output"

            with create_workdir(
                    serialization_dir,
                    exist_ok=args.force,
            ) as workdir:
                workdir = workdir.absolute()
                try:
                    with open(workdir / "config.yaml", "w") as f:
                        yaml.dump(config, f)

                    with open(workdir / "params.json", "w") as f:
                        json.dump(params, f, indent=2)

                    if mlflow is not None:
                        logger.info("Log params to mlflow")
                        mlflow.log_params(params)

                    metrics = model.train(train_file, validation_file, workdir)

                    if mlflow is not None:
                        logger.info("Log metrics to mlflow")
                        mlflow.log_metrics(metrics)

                    logger.info("Training completed")
                    logger.info("Training metrics: %s",
                                json.dumps(metrics, indent=2))

                    with open(workdir / "metrics.json", "w") as metrics_file:
                        json.dump(metrics, metrics_file)

                    with open(workdir / "model.pkl", "wb") as model_file:
                        pickle.dump(model, model_file)
                finally:
                    if mlflow is not None:
                        logger.info("Log metrics to mlflow")
                        mlflow.log_artifacts(str(workdir))

        logger.info("Done!")


def main(learning_rate, batch_size, checkpoint_base_path, data_path,
         tracking_url):

    checkpoint_path = os.path.join(checkpoint_base_path, current_timestamp())
    os.makedirs(checkpoint_path, exist_ok=True)

    params = {
        'hidden_size': 512,
        'keep_rate': 0.8,
        'learning_rate': learning_rate,
        'nb_epochs': 1,
        'batch_size': batch_size,
        'checkpoint_path': checkpoint_path
    }

    # Configure the location where tracking data will be written to. In real life
    # this would be a remote MLflow Tracking Server (using HTTP) or something like
    # S3, HDFS, etc.
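    # For example (hypothetical values): a remote HTTP tracking server such as
    # "http://mlflow.example.com:5000", or a local directory such as "./mlruns".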
    mlflow.set_tracking_uri(tracking_url)

    # Set name of experiment
    mlflow.set_experiment('MNIST_TF_Estimator')

    with mlflow.start_run() as run:

        # Log parameters in MLFlow
        for name, value in params.items():
            mlflow.log_param(name, value)

        def train_input_fn():
            ds = dataset.train(data_path)
            ds = ds.shuffle(buffer_size=50000)
            ds = ds.take(5000)  # just to speed up training
            ds = ds.batch(params['batch_size'])
            ds = ds.repeat(params['nb_epochs'])
            return ds

        def eval_input_fn():
            ds = dataset.test(data_path)
            ds = ds.batch(params['batch_size'])
            return ds

        run_config = tf.estimator.RunConfig(log_step_count_steps=50)
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           model_dir=checkpoint_path,
                                           params=params,
                                           config=run_config)

        estimator.train(input_fn=train_input_fn)
        eval_results = estimator.evaluate(input_fn=eval_input_fn)
        tf.logging.info('Eval loss: %s' % eval_results['loss'])
        tf.logging.info('Eval accuracy: %s' % eval_results['accuracy'])

        # Log results in MLFlow
        mlflow.log_metric("eval_loss", eval_results['loss'])
        mlflow.log_metric("eval_acc", eval_results['accuracy'])

        # Send checkpoint and event files to MLFlow
        mlflow.log_artifacts(checkpoint_path)

        # Export the latest checkpoint as SavedModel
        feat_spec = {
            "images": tf.placeholder("float", name="images", shape=[None, 784])
        }
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            feat_spec)
        export_dir_base = os.path.join(checkpoint_path, 'saved_models')
        saved_estimator_path = estimator.export_saved_model(
            export_dir_base, receiver_fn).decode("utf-8")

        tf.logging.info('SavedModel has been exported to %s' %
                        saved_estimator_path)

        # Log the SavedModel as MLFlow model
        mlflow.tensorflow.log_model(tf_saved_model_dir=saved_estimator_path,
                                    tf_meta_graph_tags=[tag_constants.SERVING],
                                    tf_signature_def_key="serving_default",
                                    artifact_path="exported_model")
Example 13
#
# Code snippet for https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.log_artifacts
#
import warnings
import os
import json
import mlflow

if __name__ == "__main__":

    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Create some artifacts data to preserve
    features = "rooms, zipcode, median_price, school_rating, transport"
    data = {"state": "TX", "Available": 25, "Type": "Detached"}

    # Create couple of artifact files under the directory "data"
    os.makedirs("data", exist_ok=True)
    with open("data/data.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    with open("data/features.txt", 'w') as f:
        f.write(features)

    # Write all files in "data" to root artifact_uri/states
    with mlflow.start_run():
        mlflow.log_artifacts("data", artifact_path="states")

Example 14
def log_artifact(source_dir, artifact_path=None):
    print(f"[INFO] Logging artifacts in {source_dir}...")
    mlflow.log_artifacts(source_dir, artifact_path=artifact_path)
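# Hypothetical usage (directory name is illustrative):
# log_artifact("./outputs")                       # log under the run's root artifact URI
# log_artifact("./outputs", artifact_path="out")  # log under the "out" subdirectory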
Example 15
# Log a metric; metrics can be updated throughout the run
mlflow.log_metric("foo", random())
mlflow.log_metric("foo", random()+1)
mlflow.log_metric("foo", random()+2)
mlflow.log_metric("foo", random()+3)

mlflow.log_metric("bar", random())
mlflow.log_metric("bar", random()+1)
mlflow.log_metric("bar", random()+2)
mlflow.log_metric("bar", random()+3)

# Log an artifact (output file)
os.makedirs("outputs", exist_ok=True)
with open("outputs/test.txt", "w") as f:
    f.write("hello world!")
mlflow.log_artifacts("outputs")

mlflow.end_run()

# Step 1: Set your Environment Variables

get_ipython().run_line_magic('env', 'NEPTUNE_API_TOKEN=ANONYMOUS')
get_ipython().run_line_magic('env', 'NEPTUNE_PROJECT=shared/mlflow-integration')

# Step 2: Sync your MLruns with Neptune

get_ipython().system(' neptune mlflow')

# **Note:**  
# You can specify the path to the directory where the 'mlruns' directory is. 
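# For example, a hypothetical invocation that points the sync at the parent directory of
# 'mlruns' (assuming the neptune-mlflow CLI accepts a path argument):
# get_ipython().system(' neptune mlflow /path/to/mlruns_parent_dir')
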
                            random_state=123)
rf.fit(train_resampled.drop(outcome, axis=1), train_resampled[outcome])

# evaluate model
train_predictions = rf.predict(train_resampled.drop(outcome, axis=1))
test_predictions = rf.predict(test.drop(outcome, axis=1))
train_auc = roc_auc_score(train_resampled[outcome], train_predictions)
test_auc = roc_auc_score(test[outcome], test_predictions)

# log data
import mlflow
import tempfile

mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('case-study-one')
mlflow.start_run()

mlflow.log_param('n_estimators', n_estimators)
mlflow.log_param('max_features', max_features)
mlflow.log_metric('train_auc', train_auc)
mlflow.log_metric('test_auc', test_auc)

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'train.csv')
    train.to_csv(path)
    mlflow.log_artifacts(tmp)

mlflow.sklearn.log_model(rf, 'model')

mlflow.end_run()
Example 17
def train_elasticnet(datasets,
                     in_alpha,
                     in_l1_ratio,
                     trial=None,
                     verbose=True):
    from sklearn.linear_model import ElasticNet

    train_x = datasets['train_x']
    train_y = datasets['train_y']
    test_x = datasets['test_x']
    test_y = datasets['test_y']

    alpha, l1_ratio = eval_parameters(in_alpha, in_l1_ratio)
    print("Parameters (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))

    server_uri = "http://localhost:5000"
    #mlflow.set_tracking_uri(server_uri)
    #mlflow.set_experiment("wine6")
    with mlflow.start_run():
        # train with ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Evaluate Metrics
        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.set_tag("algo", "ElastiNet")
        if trial != None:
            mlflow.set_tag("trial", trial)

        # store info
        if verbose:
            with tempfile.TemporaryDirectory() as tmpdirname:
                output_train_data_summary(tmpdirname, train_x, train_y)
                output_enet_coefs(tmpdirname, train_x.columns, lr)

                # plots
                plot_enet_feature_importance(tmpdirname, train_x.columns,
                                             lr.coef_)
                # Call plot_enet_descent_path
                #image = plot_enet_descent_path(tmpdirname, train_x, train_y, l1_ratio)

                # Log artifacts (output files)
                mlflow.log_artifacts(tmpdirname, artifact_path="artifacts")

        # store model
        mlflow.sklearn.log_model(lr, "model")

        return rmse
Example 18
    def log_artifacts(cls, dir_path, artifact_path=None):
        try:
            mlflow.log_artifacts(dir_path, artifact_path)
        except ConnectionError:
            logger.warning("ConnectionError in logging artifacts to MLFlow")
Example 19
def train_and_evaluate(args):
    """Trains and evaluates the Keras model.

    Uses the Keras model defined in model.py and trains on data loaded and
    preprocessed in utils.py. Saves the trained model in TensorFlow SavedModel
    format to the path defined in part by the --job-dir argument.

    The History object returns:
        {'loss': [0.5699903990809373, 0.3629718415849791],
         'acc': [0.78604823, 0.8331693],
         'val_loss': [0.3966572880744934, 0.3477487564086914],
         'val_acc': [0.8278044, 0.8281116],
         'lr': [0.02, 0.015]}
    Args:
      args: dictionary of arguments - see get_args() for details
    """

    logging.info('Resume training: {}'.format(args.reuse_job_dir))
    if not args.reuse_job_dir:
        if tf.io.gfile.exists(args.job_dir):
            tf.io.gfile.rmtree(args.job_dir)
            logging.info('Deleted job_dir {} to avoid re-use'.format(
                args.job_dir))
    else:
        logging.info('Reusing job_dir {} if it exists'.format(args.job_dir))

    train_x, train_y, eval_x, eval_y = utils.load_data(args.train_files,
                                                       args.eval_files)
    # dimensions
    num_train_examples, input_dim = train_x.shape
    num_eval_examples = eval_x.shape[0]

    # Create the Keras Model
    keras_model = model.create_keras_model(input_dim=input_dim,
                                           learning_rate=args.learning_rate)

    # Pass a numpy array by passing DataFrame.values
    training_dataset = model.input_fn(features=train_x.values,
                                      labels=train_y,
                                      shuffle=True,
                                      num_epochs=args.num_epochs,
                                      batch_size=args.batch_size)

    # Pass a numpy array by passing DataFrame.values
    validation_dataset = model.input_fn(features=eval_x.values,
                                        labels=eval_y,
                                        shuffle=False,
                                        num_epochs=args.num_epochs,
                                        batch_size=num_eval_examples)

    start_time = time()
    # Set MLflow tracking URI
    if args.mlflow_tracking_uri:
        mlflow.set_tracking_uri(args.mlflow_tracking_uri)
    # Train model
    with mlflow.start_run() as active_run:
        run_id = active_run.info.run_id

        # Callbacks
        class MlflowCallback(tf.keras.callbacks.Callback):
            # This function will be called after training completes.
            def on_train_end(self, logs=None):
                mlflow.log_param('num_layers', len(self.model.layers))
                mlflow.log_param('optimizer_name',
                                 type(self.model.optimizer).__name__)

        # MLflow callback
        mlflow_callback = MlflowCallback()
        # Setup Learning Rate decay callback.
        lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(
            lambda epoch: args.learning_rate + 0.02 * (0.5**(1 + epoch)),
            verbose=False)
        # Setup TensorBoard callback.
        tensorboard_path = os.path.join(args.job_dir, run_id, 'tensorboard')
        tensorboard_callback = tf.keras.callbacks.TensorBoard(tensorboard_path,
                                                              histogram_freq=1)

        history = keras_model.fit(
            training_dataset,
            steps_per_epoch=int(num_train_examples / args.batch_size),
            epochs=args.num_epochs,
            validation_data=validation_dataset,
            validation_steps=args.eval_steps,
            verbose=1,
            callbacks=[
                lr_decay_callback, tensorboard_callback, mlflow_callback
            ])
        metrics = history.history
        logging.info(metrics)
        keras_model.summary()
        mlflow.log_param('train_files', args.train_files)
        mlflow.log_param('eval_files', args.eval_files)
        mlflow.log_param('num_epochs', args.num_epochs)
        mlflow.log_param('batch_size', args.batch_size)
        mlflow.log_param('learning_rate', args.learning_rate)
        mlflow.log_param('train_samples', num_train_examples)
        mlflow.log_param('eval_samples', num_eval_examples)
        mlflow.log_param('eval_steps', args.eval_steps)
        mlflow.log_param('steps_per_epoch',
                         int(num_train_examples / args.batch_size))
        # Add metrics
        _mlflow_log_metrics(metrics, 'loss')
        _mlflow_log_metrics(metrics, 'acc')
        _mlflow_log_metrics(metrics, 'val_loss')
        _mlflow_log_metrics(metrics, 'val_acc')
        _mlflow_log_metrics(metrics, 'lr')
        # Export SavedModel
        model_local_path = os.path.join(args.job_dir, run_id, 'model')
        tf.keras.experimental.export_saved_model(keras_model, model_local_path)
        # Define artifacts.
        logging.info('Model exported to: {}'.format(model_local_path))
        # MLflow workaround since it is unable to read GCS paths.
        # https://github.com/mlflow/mlflow/issues/1765
        if model_local_path.startswith('gs://'):
            logging.info('Creating temp folder')
            temp = tempfile.mkdtemp()
            model_deployment.copy_artifacts(model_local_path, temp)
            model_local_path = os.path.join(temp, 'model')

        if tensorboard_path.startswith('gs://'):
            logging.info('Creating temp folder')
            temp = tempfile.mkdtemp()
            model_deployment.copy_artifacts(tensorboard_path, temp)
            tensorboard_path = temp

        mlflow.tensorflow.log_model(tf_saved_model_dir=model_local_path,
                                    tf_meta_graph_tags=[tag_constants.SERVING],
                                    tf_signature_def_key='serving_default',
                                    artifact_path='model')
        # Reloading the model
        pyfunc_model = mlflow.pyfunc.load_model(
            mlflow.get_artifact_uri('model'))
        logging.info('Uploading TensorFlow events as a run artifact.')
        mlflow.log_artifacts(tensorboard_path)
        logging.info('Launch TensorBoard with:\n\ntensorboard --logdir=%s' %
                     tensorboard_path)
        duration = time() - start_time
        mlflow.log_metric('duration', duration)
        mlflow.end_run()
        if model_local_path.startswith(
                'gs://') and tensorboard_path.startswith('gs://'):
            shutil.rmtree(model_local_path)
            shutil.rmtree(tensorboard_path)

    # Deploy model to AI Platform.
    if args.deploy_gcp:
        # Create AI Platform helper instance.
        if not args.project_id:
            raise ValueError('No Project is defined')
        if not args.gcs_bucket:
            raise ValueError('No GCS bucket')
        model_helper = model_deployment.AIPlatformModel(
            project_id=args.project_id)
        # Copy local model to GCS for deployment.
        if not model_local_path.startswith('gs://'):
            model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id,
                                          'model')
            model_deployment.copy_artifacts(model_local_path, model_gcs_path)
        # Create model
        model_helper.create_model(args.model_name)
        # Create model version
        model_helper.deploy_model(model_gcs_path, args.model_name, run_id,
                                  args.run_time_version)
        logging.info('Model deployment in GCP completed')
    logging.info(
        'This model took: {} seconds to train and test.'.format(duration))
Example 20
def run_evaluation(
    *,
    model_hash: str,
    thr: float = 0.5,
    dpath: str = "datasets/clouds/Landsat-Cloud-Cover-Assessment-" +
    "Validation-Data-Partial",
    rpath: str = "artifacts/",
    vids: ("v", multi(min=1)),
    batch_size: int = 32,
    bands: ("b", multi(min=1)),
    bands_names: ("bn", multi(min=1)),
    img_ids: ("iid", multi(min=0)),
    resize: bool = False,
    normalize: bool = False,
    standardize: bool = False,
    mlflow: bool = False,
    run_name: str = None,
):
    """
    Load model given model hash and get evaluation metrics on L8CCA testset.

    :param model_hash: MLFlow hash of the model to load.
    :param thr: threshold to be used during evaluation.
    :param dpath: path to dataset.
    :param rpath: path to directory where results
                  and artifacts should be logged.
    :param vids: tuple of ids of images which should be used to create
                 visualisations. If contains '*' visualisations will be
                 created for all images in the dataset.
    :type vids: tuple[str]
    :param batch_size: size of generated batches, only one batch is loaded
           to memory at a time.
    :param bands: band numbers to load
    :type bands: list[int]
    :param bands_names: names of the bands to load. Should have the same number
                        of elements as bands.
    :type bands_names: list[str]
    :param img_ids: if given, process only these images.
    :type img_ids: list[int]
    :param resize: whether to resize loaded img to gt.
    :param normalize: whether to normalize the image.
    :param standardize: whether to standardize the image.
    :param mlflow: whether to use MLFlow.
    :param run_name: name of the run.
    """
    snow_imgs = ["LC82271192014287LGN00", "LC81321192014054LGN00"]
    if img_ids == []:
        img_ids = None
    else:
        snow_imgs = list(set(snow_imgs) & set(img_ids))
    dpath, rpath = make_paths(dpath, rpath)
    rpath = rpath / uuid.uuid4().hex
    print(f"Working dir: {os.getcwd()}, " + f"artifacts dir: {rpath}",
          flush=True)
    mpath = Path(f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
                 # Change init_model to model for old models
                 + "artifacts/init_model/data/model.h5")
    # WARNING: If ever upgraded to tf2.x, this way of using metrics will not work
    # because loaded metrics will become MeanMetricWrapper objects in tf2.x and
    # this script isn't prepared for such objects (because MeanMetricWrapper has state,
    # as opposed to present stateless metric functions).
    model = keras.models.load_model(
        mpath,
        custom_objects={
            "jaccard_index_loss": losses.JaccardIndexLoss(),
            "jaccard_index_metric": losses.JaccardIndexMetric(),
            "dice_coeff_metric": losses.DiceCoefMetric(),
            "recall": losses.recall,
            "precision": losses.precision,
            "specificity": losses.specificity,
            # F1 score is needed for old models
            # "f1_score": losses.f1_score,
            "tf": tf,
        },
    )
    model.load_weights(f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/" +
                       "artifacts/best_weights/best_weights")
    metrics, _ = evaluate_model(
        model=model,
        thr=thr,
        dpath=dpath,
        rpath=rpath,
        vids=vids,
        batch_size=batch_size,
        bands=bands,
        bands_names=bands_names,
        img_ids=img_ids,
        resize=resize,
        normalize=normalize,
        standardize=standardize,
        mlflow=mlflow,
        run_name=run_name,
    )
    mean_metrics = {}
    mean_metrics_snow = {}
    for key, value in metrics.items():
        mean_metrics[key] = np.mean(list(value.values()))
        mean_metrics_snow[f"snow_{key}"] = np.mean(
            [value[x] for x in snow_imgs])
    print(mean_metrics, mean_metrics_snow)
    if mlflow:
        log_metrics(mean_metrics)
        log_metrics(mean_metrics_snow)
        log_artifacts(rpath)
Example 21
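# Partial snippet: depending on the estimator type, mean log-likelihood-ratio or score values
# are logged as MLflow metrics, the per-method results directory is logged as artifacts, and
# expected limits are then computed (the limits API resembles MadMiner's, but that is an
# assumption).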
if gen_method in ratio_estimator_methods:
    llr_means = np.mean(llr, axis=0)
    mlflow.log_metrics({
        "theta 0 LLR": llr_means[0],
        "theta 1 LLR": llr_means[1],
    })

elif gen_method in score_estimator_methods:
    score_means = np.mean(scores, axis=0)
    mlflow.log_metrics({
        "theta 0 score": score_means[0],
        "theta 1 score": score_means[1],
    })

mlflow.log_artifacts(f"{results_dir}/{gen_method}")

#################################
## Calculating expected limits ##
#################################

for flag in include_xsec:

    _, p_values, best_fit_index, _, _, _ = limits.expected_limits(
        mode="histo",
        theta_true=theta_true,
        grid_ranges=theta_ranges,
        grid_resolutions=resolutions,
        include_xsec=flag,
        luminosity=luminosity,
        hist_vars=[histogram_var],
Example 22
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
    specified in the configs.
    """
    logger.info(
        'performing training in {}D over fold {} on experiment {} with model {}'
        .format(cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    if hasattr(cf, "optimizer") and cf.optimizer.lower() == "adam":
        logger.info("Using Adam optimizer.")
        optimizer = torch.optim.Adam(utils.parse_params_for_optim(
            net,
            weight_decay=cf.weight_decay,
            exclude_from_wd=cf.exclude_from_wd),
                                     lr=cf.learning_rate[0])
    else:
        logger.info("Using AdamW optimizer.")
        optimizer = torch.optim.AdamW(utils.parse_params_for_optim(
            net,
            weight_decay=cf.weight_decay,
            exclude_from_wd=cf.exclude_from_wd),
                                      lr=cf.learning_rate[0])

    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode=cf.scheduling_mode,
            factor=cf.lr_decay_factor,
            patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(
            checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    # Prepare MLFlow
    best_loss = 1e3
    step = 1
    mlflow.log_artifacts(cf.exp_dir, "exp")

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        bix = 0
        seen_pids = []
        while True:
            bix = bix + 1
            try:
                batch = next(batch_gen['train'])
            except StopIteration:
                break
            for pid in batch['pid']:
                seen_pids.append(pid)
            # print(f'\rtr. batch {bix}: {batch["pid"]}')
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            print(
                '\rtr. batch {0} (ep. {1}) fw {2:.2f}s / bw {3:.2f} s / total {4:.2f} s || '
                .format(bix + 1, epoch, tic_bw - tic_fw,
                        time.time() - tic_bw,
                        time.time() - tic_fw) + results_dict['logger_string'],
                flush=True,
                end="")
            train_results_list.append(
                ({k: v
                  for k, v in results_dict.items()
                  if k != "seg_preds"}, batch["pid"]))
        print(f"Seen pids (unique): {len(np.unique(seen_pids))}")
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(
            train_results_list, monitor_metrics['train'])

        logger.info('generating training example plot.')
        utils.split_off_process(
            plot_batch_prediction,
            batch,
            results_dict,
            cf,
            outfile=os.path.join(cf.plot_dir,
                                 'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                while True:
                    try:
                        batch = next(batch_gen[cf.val_mode])
                    except StopIteration:
                        break
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch,
                                                         is_validation=True)
                    val_results_list.append(({
                        k: v
                        for k, v in results_dict.items() if k != "seg_preds"
                    }, batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(
                    val_results_list, monitor_metrics['val'])
                best_model_path = model_selector.run_model_selection(
                    net, optimizer, monitor_metrics, epoch)
                # Save best model
                mlflow.log_artifacts(
                    best_model_path,
                    os.path.join("exp", os.path.basename(cf.fold_dir),
                                 'best_checkpoint'))

            # Save logs and plots
            mlflow.log_artifacts(os.path.join(cf.exp_dir, "logs"),
                                 os.path.join("exp", 'logs'))
            mlflow.log_artifacts(
                cf.plot_dir, os.path.join("exp",
                                          os.path.basename(cf.plot_dir)))

            # update monitoring and prediction plots
            monitor_metrics.update({
                "lr": {
                    str(g): group['lr']
                    for (g, group) in enumerate(optimizer.param_groups)
                }
            })

            # replace tboard metrics with MLFlow
            #logger.metrics2tboard(monitor_metrics, global_step=epoch)
            mlflow.log_metric('learning rate', optimizer.param_groups[0]['lr'],
                              cf.num_epochs * cf.fold + epoch)
            for key in ['train', 'val']:
                for tag, val in monitor_metrics[key].items():
                    val = val[-1]  # maybe remove list wrapping, recording in evaluator?
                    if 'loss' in tag.lower() and not np.isnan(val):
                        mlflow.log_metric(f'{key}_{tag}', val,
                                          cf.num_epochs * cf.fold + epoch)
                    elif not np.isnan(val):
                        mlflow.log_metric(f'{key}_{tag}', val,
                                          cf.num_epochs * cf.fold + epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} ({} train / {} val)'.format(
                epoch, utils.get_formatted_duration(epoch_time, "ms"),
                utils.get_formatted_duration(train_time, "ms"),
                utils.get_formatted_duration(epoch_time - train_time, "ms")))
            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            utils.split_off_process(plot_batch_prediction,
                                    batch,
                                    results_dict,
                                    cf,
                                    outfile=os.path.join(
                                        cf.plot_dir,
                                        'pred_example_{}_val.png'.format(
                                            cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch - 1]
    # Save whole experiment to MLFlow
    mlflow.log_artifacts(cf.exp_dir, "exp")
Example 23
import os
import random
import shutil
import sys
import tempfile

import mlflow
from mlflow import log_metric, log_param, log_artifacts, get_artifact_uri, active_run,\
    get_tracking_uri, log_artifact

if __name__ == "__main__":
    print("Running {} with tracking URI {}".format(sys.argv[0],
                                                   get_tracking_uri()))
    log_param("param1", 5)
    log_metric("foo", 5)
    log_metric("foo", 6)
    log_metric("foo", 7)
    log_metric("random_int", random.randint(0, 100))
    run_id = active_run().info.run_id
    # Get run metadata & data from the tracking server
    service = mlflow.tracking.MlflowClient()
    run = service.get_run(run_id)
    print("Metadata & data for run with UUID %s: %s" % (run_id, run))
    local_dir = tempfile.mkdtemp()
    message = "test artifact written during run %s within artifact URI %s\n" \
              % (active_run().info.run_id, get_artifact_uri())
    try:
        file_path = os.path.join(local_dir, "some_output_file.txt")
        with open(file_path, "w") as handle:
            handle.write(message)
        log_artifacts(local_dir, "some_subdir")
        log_artifact(file_path, "another_dir")
    finally:
        shutil.rmtree(local_dir)
Example 24
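# Truncated snippet: previously generated images are reloaded, spots are detected with
# scopyon.analysis.spot_detection, the detections for each sample are saved as .npy files
# under `artifacts`, and the results directory (referenced here via `generated_data`) is
# logged before ending the MLflow run.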
for i in range(num_samples):
    imgs = [
        scopyon.Image(data)
        for data in numpy.load(inputpath / f"images{i:03d}.npy")
    ]
    spots = [
        scopyon.analysis.spot_detection(img.as_array(),
                                        processes=nproc,
                                        min_sigma=min_sigma,
                                        max_sigma=max_sigma,
                                        threshold=threshold,
                                        overlap=overlap) for img in imgs
    ]

    spots_ = []
    for t, data in zip(timepoints, spots):
        spots_.extend(([t] + list(row) for row in data))
    spots_ = numpy.array(spots_)
    numpy.save(artifacts / f"spots{i:03d}.npy", spots_)

    print("{} spots are detected in {} frames.".format(len(spots_), len(imgs)))

warnings.resetwarnings()

#!ls ./artifacts

#log_artifacts("./artifacts")
log_artifacts(generated_data)
mlflow.end_run()
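The loop above only prints the per-sample spot counts; if they should also appear in the MLflow run, a line like the following could be added inside the loop, before `mlflow.end_run()` (a sketch, not part of the original notebook; `mlflow`, `spots_` and `i` as already defined above).

# record the spot count of sample i as a step-indexed MLflow metric
mlflow.log_metric("num_spots", len(spots_), step=i)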
Esempio n. 25
0
        train_time = timeit.default_timer()
        os.system('python {} --mode {} --train-csv {} --model-dir {}'.format(
            train_file,
            'regression' if dataset[-1] == 'r' else 'classification',
            '{}/{}/train.csv'.format(data_dir, dataset),
            '{}/{}/'.format(result_dir, dataset)))
        train_time = timeit.default_timer() - train_time

        pred_time = timeit.default_timer()
        os.system('python {} --prediction-csv {} --test-csv {} --model-dir {}'.
                  format(predict_file,
                         '{}/{}/pred.csv'.format(result_dir, dataset),
                         '{}/{}/test.csv'.format(data_dir, dataset),
                         '{}/{}/'.format(result_dir, dataset)))
        pred_time = timeit.default_timer() - pred_time

        df = pd.read_csv('{}/{}/test-target.csv'.format(data_dir, dataset))
        df_pred = pd.read_csv('{}/{}/pred.csv'.format(result_dir, dataset))
        df = pd.merge(df, df_pred, on='line_id')  # 'on' and 'left_index' cannot be combined in pandas.merge

        score = roc_auc_score(df.target.values, df.prediction.values) if dataset[-1] == 'c' else \
                np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
        print('Score {:0.5f}'.format(score))

        n = dataset.split('_')[1]
        mlflow.log_metric('score_{}'.format(n), score)
        mlflow.log_metric('train_time_{}'.format(n), train_time)
        mlflow.log_metric('test_time_{}'.format(n), pred_time)

    mlflow.log_artifacts('./')
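The benchmark shells out with `os.system`, which silently ignores non-zero exit codes. A hedged alternative for the training call (not from the original script; `train_file`, `dataset`, `data_dir` and `result_dir` as defined above) is `subprocess.run` with `check=True`:

import subprocess

# fail fast if the training subprocess exits with a non-zero status
subprocess.run(
    ["python", train_file,
     "--mode", "regression" if dataset[-1] == "r" else "classification",
     "--train-csv", f"{data_dir}/{dataset}/train.csv",
     "--model-dir", f"{result_dir}/{dataset}/"],
    check=True,
)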
Esempio n. 26
0
def rbf_svr_tuning(c = [0.001, 0.01, 0.1, 1, 10], gamma = [0.001, 0.01, 0.1, 1, 10], k = 5,
             train_data_path = '../data/training_data.csv', save_model = False, tracking_uri = "http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("c", c)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path = train_data_path)
    (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps = [('scaling', StandardScaler()),
                                 ('regression', SVR(kernel = 'rbf'))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__C'] = c
    hyperparams['regression__gamma'] = gamma

    print("Training started...\n")

    # Grid-search the SVR pipeline over the hyperparameter grid using all available processors
    modelCV = GridSearchCV(estimator = pipeline,
                           param_grid = hyperparams,
                           cv = k,
                           scoring = 'neg_mean_squared_error',
                           n_jobs = -1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # flip the sign of neg_mean_squared_error back to MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(f"\nBest parameter set found for the training set:\n{modelCV.best_params_}")

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_c = modelCV.best_params_['regression__C']
    best_gamma = modelCV.best_params_['regression__gamma']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is C
    criteria = 'c'
    mlflow.set_tag("criteria", criteria)
    param_values = c

    # Predict on the test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(steps = [('scaler', StandardScaler()),
                                  ('regression', SVR(
                                        C = param_value,
                                        gamma = best_gamma,
                                        kernel = 'rbf'))])
        param = {'regression__C': param_value, 'regression__gamma': best_gamma}

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria, training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print("\nEvaluation finished. Training final model with train + test data with the best hyperparameters...")
        final_model = Pipeline(steps = [('scaler', StandardScaler()),
                                        ('regression', SVR(
                                            C = param_list[best_index]['regression__C'],
                                            gamma = best_gamma,
                                            kernel = 'rbf'))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
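If `save_model` is enabled, the pipeline logged with `mlflow.sklearn.log_model` can later be reloaded for inference. A minimal sketch (an assumption, not part of the original function; `<run_id>` is a placeholder for the run that saved the model):

import mlflow.sklearn

model_uri = "runs:/<run_id>/model"
loaded_model = mlflow.sklearn.load_model(model_uri)
predictions = loaded_model.predict(X_test)  # X_test as prepared in the function above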
Esempio n. 27
0
        plt.title('XGBoost Classification ' + metric_name)
        plt.savefig("artifacts/" + metric_name + ".png")
        plt.show()

    ##################################
    # Plot decision tree, feature importance stack and precision-recall curve
    # ................................
    precision, recall, _ = precision_recall_curve(test.get_label(),
                                                  predictions)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.savefig("artifacts/PR_curve.png")
    plt.show()

    plt.rcParams["figure.figsize"] = (14, 7)
    xgb.plot_importance(clf, grid=False)
    plt.savefig("artifacts/importance.png")
    plt.show()

    plt.rcParams["figure.figsize"] = (14, 3)
    xgb.plot_tree(clf)
    plt.title('Decision Tree')
    plt.savefig("artifacts/tree.png")
    plt.show()

    # persist the model and save artifacts to mlflow
    clf.save_model('artifacts/xgb.model')
    mlflow.log_artifacts("artifacts")
Esempio n. 28
0
                callback=callback,
                log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # If the algorithm doesn't reset or close the environment, this script will do it in
    # order to correctly log all the simulation data (Energyplus + Sinergym
    # logs)
    if env.simulator._episode_existed:
        env.close()

    # ---------------------------------------------------------------------------- #
    #                           Mlflow artifacts storage                           #
    # ---------------------------------------------------------------------------- #
    if args.mlflow_store:
        # Send the simulation output and tensorboard logs to mlflow as artifacts.
        mlflow.log_artifacts(local_dir=env.simulator._env_working_dir_parent,
                             artifact_path=name)
        if args.evaluation:
            mlflow.log_artifacts(local_dir='best_model/' + name,
                                 artifact_path='best_model/' + name)
        # If tensorboard is active (in local) we should send to mlflow
        if args.tensorboard and 'gs://' + args.bucket_name not in args.tensorboard:
            mlflow.log_artifacts(local_dir=args.tensorboard + '/' + name,
                                 artifact_path=os.path.abspath(
                                     args.tensorboard).split('/')[-1] + '/' +
                                 name)

    # ---------------------------------------------------------------------------- #
    #                          Google Cloud Bucket Storage                         #
    # ---------------------------------------------------------------------------- #
    if args.remote_store:
        # Initiate Google Cloud client
Esempio n. 29
0
import os
from random import random, randint
from mlflow import log_metric, log_param, log_artifacts

if __name__ == "__main__":
    # Log a parameter (key-value pair)
    log_param("param1", randint(0, 100))
    
    # Log a metric; metrics can be updated throughout the run
    log_metric("foo", random())
    log_metric("foo", random() + 1)
    log_metric("foo", random() + 2)
    
    # Log an artifact (output file)
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
        
    with open("outputs/test.txt", "w") as f:
        f.write("hello world!")
        
    log_artifacts("outputs")
Esempio n. 30
0
        if not is_nni_run_standalone():
            # Report final training results to NNI (NNI HP or NNI Classic NAS APIs)
            # TODO: make sure `valid_state.metrics` is ordered so that reported default metric to NNI is always the same
            # dict_values is not subscriptable; take the first validation metric as NNI's 'default'
            nni.report_final_result({'default': next(iter(valid_evaluator.state.metrics.values())), **train_evaluator.state.metrics, **valid_evaluator.state.metrics})
        return (valid_evaluator.state.metrics, state)
    except Exception as e:
        logging.error(
            f'Ignite training loop of "{type(model).__name__}" model failed, exception "{e}" raised{deepcv.utils.NL}### Traceback ###{deepcv.utils.NL}{traceback.format_exc()}')
        raise RuntimeError(f'Error: `{e}` exception raised during ignite training loop of "{type(model).__name__}" model...') from e
    finally:
        if backend_conf.rank == 0:
            tb_logger.close()
        if hp['log_output_dir_to_mlflow'] and mlflow.active_run():
            logging.info('Logging training output directory as mlflow artifacts...')
            mlflow.log_artifacts(str(output_path))
            # TODO: log and replace artifacts to mlflow at every epochs?
            # TODO: make sure all artifacts are loaded synchronously here
            # shutil.rmtree(output_path)


def _setup_distributed_training(device, backend_conf: BackendConfig, model: torch.nn.Module, batch_shape: torch.Size, use_sync_batch_norm: bool = False) -> torch.nn.Module:
    if backend_conf.distributed:
        # Setup distributed training with `torch.distributed`
        dist.init_process_group(backend_conf.dist_backend, init_method=backend_conf.dist_url)
        assert backend_conf.is_cuda, 'Error: Distributed training must be run on GPU(s).'
        torch.cuda.set_device(backend_conf.device)
        # TODO: make sure we dont want to add more device IDs here (see distributed examples in Ignite or PyTorch)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[backend_conf.local_rank, ], output_device=backend_conf.local_rank)

        # `map` expects the function first; a generator expression avoids the mistake entirely
        if use_sync_batch_norm and any(isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)) for m in model.modules()):