Example no. 1
def create_dict_and_corpus():
    # split the text into words
    words = list(map(lambda x: x.lower().split(), texts))

    # create dict file
    DICT_FILE = "news.topics.dict"
    dictionary = gensim.corpora.Dictionary(words)
    dictionary.filter_extremes(no_below=3, no_above=0.6, keep_n=2000000)
    dictionary.filter_n_most_frequent(15)
    dictionary.save(DICT_FILE)
    mlflow.log_artifact(DICT_FILE)

    print("dictionary tokenid", len(dictionary.token2id))
    print("words", words[:1])

    corpus = list(map(lambda x: dictionary.doc2bow(x), words))
    print("corpus length", len(corpus))

    return dictionary, corpus
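A quick driver for this example, as a hedged sketch: the function reads a module-level texts list (as in the source project) and logs the dictionary file, so it expects an active MLflow run. The texts value here is illustrative.

import gensim
import mlflow

# Illustrative input; the source project defines `texts` elsewhere.
texts = ["first news item about markets", "second news item about sports"]

with mlflow.start_run():
    dictionary, corpus = create_dict_and_corpus()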
Example no. 2
def test_download_artifacts(artifacts_server, tmpdir):
    url = artifacts_server.url
    mlflow.set_tracking_uri(url)

    tmp_path_a = tmpdir.join("a.txt")
    tmp_path_a.write("0")
    tmp_path_b = tmpdir.join("b.txt")
    tmp_path_b.write("1")
    with mlflow.start_run() as run:
        mlflow.log_artifact(tmp_path_a)
        mlflow.log_artifact(tmp_path_b, "dir")

    client = mlflow.tracking.MlflowClient()
    dest_path = client.download_artifacts(run.info.run_id, "")
    assert sorted(os.listdir(dest_path)) == ["a.txt", "dir"]
    assert read_file(os.path.join(dest_path, "a.txt")) == "0"
    dest_path = client.download_artifacts(run.info.run_id, "dir")
    assert os.listdir(dest_path) == ["b.txt"]
    assert read_file(os.path.join(dest_path, "b.txt")) == "1"
Example no. 3
def generate_submission(args):
    reviews_file = f"{args.latest_dir}/{args.dataset_name}_reviews_{args.local_id}.tsv"
    df_reviews = pd.read_csv(reviews_file, sep='\t')

    submission_file = f"{args.submissions_dir}/{args.dataset_name}_submission_{args.local_id}.txt"
    with open(submission_file, 'w') as fw:
        for i, row in tqdm(df_reviews.iterrows()):
            text = row.text_a
            label = row.label
            fw.write(f"{text}\t{label}\n")
    logger.info(f"Saved {df_reviews.shape[0]} lines in {submission_file}")

    #  ----- Tracking -----
    if args.do_experiment:
        mlflow.log_param("submission_file", submission_file)
        mlflow.log_artifact(submission_file)

    from theta.modeling import archive_local_model
    archive_local_model(args, submission_file)
Example no. 4
 def on_epoch_end(self, epoch, logs=None):
     self.epoch = epoch
     for k in logs.keys():
         if k.startswith('val_'):
             mlflow.log_metric('Validation ' + k[4:], logs[k], epoch)
         else:
             mlflow.log_metric('Epoch ' + k, logs[k], epoch)
     if config.mlflow.checkpoints.frequency() and epoch > 0 and epoch % config.mlflow.checkpoints.frequency() == 0:
         filename = os.path.join(self.temp_dir, '%d%s' % (epoch, self.model_extension))
         save_model(self.model, filename)
         if config.mlflow.checkpoints.only_save_latest():
             old = filename
             filename = os.path.join(self.temp_dir, 'latest' + self.model_extension)
             os.rename(old, filename)
         mlflow.log_artifact(filename, 'checkpoints')
         if os.path.isdir(filename):
             shutil.rmtree(filename)
         else:
             os.remove(filename)
Example no. 5
def test_download_artifacts_from_uri():
    with mlflow.start_run() as run:
        with TempDir() as tmp:
            local_path = tmp.path("test")
            with open(local_path, "w") as f:
                f.write("test")
            mlflow.log_artifact(local_path, "test")
    command = ["mlflow", "artifacts", "download", "-u"]
    # Test with run uri
    run_uri = "runs:/{run_id}/test".format(run_id=run.info.run_id)
    actual_uri = posixpath.join(run.info.artifact_uri, "test")
    for uri in (run_uri, actual_uri):
        p = Popen(command + [uri], stdout=PIPE, stderr=STDOUT)
        output = p.stdout.readlines()
        downloaded_file_path = output[-1].strip()
        downloaded_file = os.listdir(downloaded_file_path)[0]
        with open(os.path.join(downloaded_file_path, downloaded_file),
                  "r") as f:
            assert f.read() == "test"
Example no. 6
def download_csv(company_abbreviation: str):
    dataset_url = f"{QUANLD_API}/{company_abbreviation}.csv"

    with mlflow.start_run(run_name="download"):
        local_dir = create_tmp_dir()
        local_filename = path.join(local_dir, DATASET_NAME)
        print(f"Downloading {dataset_url} to {local_filename}")

        dataset = requests.get(dataset_url)
        decoded_content = dataset.content.decode("utf-8").splitlines()

        with open(local_filename, "w", newline="") as file:
            writer = csv.writer(file, delimiter=",", quoting=csv.QUOTE_MINIMAL)
            for line in decoded_content:
                columns = line.split(",")
                writer.writerow(columns)

        print(f"Uploading stock market data: {local_filename}")
        mlflow.log_artifact(local_filename, DATASET_ARTIFACT_DIR)
Example no. 7
def plot_trajectories(traj1, traj2, labels, figname):
    fig, ax = plt.subplots(len(traj1), 1)
    for row, (t1, t2, label) in enumerate(zip(traj1, traj2, labels)):
        if t2 is not None:
            ax[row].plot(t1, label='True')
            ax[row].plot(t2, '--', label='Pred')
        else:
            ax[row].plot(t1)
        steps = range(0, t1.shape[0] + 1, 288)
        days = np.array(list(range(len(steps)))) + 7
        ax[row].set(xticks=steps,
                    xticklabels=days,
                    ylabel=label,
                    xlim=(0, len(t1)))
        ax[row].tick_params(labelbottom=False)
        ax[row].axvspan(2016, 4032, facecolor='grey', alpha=0.25, zorder=-100)
        ax[row].axvspan(4032, 6048, facecolor='grey', alpha=0.5, zorder=-100)
    ax[row].tick_params(labelbottom=True)
    ax[row].set_xlabel('Day')
    ax[0].text(64,
               30,
               '             Train                ',
               bbox={
                   'facecolor': 'white',
                   'alpha': 0.5
               })
    ax[0].text(2064,
               30,
               '           Validation           ',
               bbox={
                   'facecolor': 'grey',
                   'alpha': 0.25
               })
    ax[0].text(4116,
               30,
               '              Test                ',
               bbox={
                   'facecolor': 'grey',
                   'alpha': 0.5
               })
    plt.tight_layout()
    plt.savefig(figname)
    mlflow.log_artifact(figname)
Example no. 8
def log_fobj(fobj, fname):
    """Log the contents of fobj as an artifact."""
    if hasattr(fobj, "encoding"):
        mode = "wt"
    else:
        mode = "wb"

    with tempfile.TemporaryDirectory() as tempdir:
        target_fname = os.path.join(tempdir, fname)
        with open(target_fname, mode) as target:
            while True:
                chunk = fobj.read(1024 * 1024)
                if len(chunk) == 0:
                    break

                target.write(chunk)

        log_artifact(target_fname)
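Usage sketch for log_fobj (assuming an active MLflow run): a text-mode file object carries an encoding attribute, so the temporary copy is opened in "wt" mode; a binary file object lacks it and is copied in "wb" mode.

with open("training.log", "r") as fobj:
    log_fobj(fobj, "training.log")   # text mode, copied with "wt"

with open("model.bin", "rb") as fobj:
    log_fobj(fobj, "model.bin")      # binary mode, copied with "wb"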
Example no. 9
def saveData(dataBuffer: list, filename: str):
    """
    Save generated codes and embedding vectors

    Args:
      dataBuffer: (list) Each list element contains (embedding,label)
      filename: (str) Filename to use for saving data

    Returns:

    """

    dataArray = np.array([x[0].cpu().numpy() for x in dataBuffer],
                         dtype=np.float32)
    targetArray = np.array([x[1].cpu().numpy() for x in dataBuffer],
                           dtype=np.float32)
    np.savez_compressed(filename, data=dataArray, targets=targetArray)
    mlflow.log_artifact(filename)
    return
Example no. 10
 def __exit__(self, *args):
     if self.disabled:
         mlflow.set_tracking_uri(self._archived_tracking_uri)
         try:
             mlflow.end_run()
         except Exception:
             # this is meant to kill stupid mlflow errors on program end,
             # as it seems they register end_run() to be called on program
             # end using atexit._run_exitfuncs
             pass
         return
     runtime = time.time() - self.start_time
     mlflow.log_metrics({'runtime_in_sec': runtime})
     self.artifactory.log_artifacts(artifacts_dir_paths=None)
     self.artifactory.close()
     self.logger.close()
     mlflow.log_artifact(local_path=self.log_fpath)
     self.mlflow_run.__exit__(*args)
     os.remove(self.log_fpath)
Example no. 11
def missing_state_remove_75_percent(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis = 1)

    zero_values = set(dataset.columns[dataset.eq('0').mean() > .75])
    dataset = dataset.drop(zero_values, axis = 1)

    zero_values = set(dataset.columns[dataset.eq('0').mean() > 0])
    for feature in zero_values:
        dataset[f'{feature}_is_missing'] = dataset[feature].apply(lambda f: 1 if f == '0' else 0)

    features_columns = [column for column in dataset.columns if '_is_missing' not in column]
    dataset[features_columns] = label_encode(dataset[features_columns])
    dataset.to_csv("datasets/experiments/missing_state_remove_75_percent.csv", index = False)
    log_artifact("datasets/experiments/missing_state_remove_75_percent.csv")
Example no. 12
def save_amap(amap: Dict[int, int], path):
    """
    Save atomic number to index map.

    Parameters
    ----------
    amap:
        Atomic numbers to index map
    path:
        Save path
    """

    # Original amap contains np.int64
    converted = {int(k): int(v) for k, v in amap.items()}

    with open(path, "w") as fout:
        json.dump(converted, fout)

    mlflow.log_artifact(path)
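A usage sketch for save_amap: json.dump cannot serialize np.int64 keys, which is why the function converts them to plain int first.

import numpy as np
import mlflow

with mlflow.start_run():
    # np.int64 keys, as produced by the original pipeline
    save_amap({np.int64(1): np.int64(0), np.int64(6): np.int64(1)}, "amap.json")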
Example no. 13
def make_submission(validation_name, X, cv, run_id):

    X['preds'] = X['preds'].round().astype('int')
    sub_name = f'sub_{validation_name}_1fold_{cv}_{run_id}'
    valid_sub_dir = f'{SUB_DIR}/{sub_name}'
    if not os.path.exists(valid_sub_dir):
        os.mkdir(valid_sub_dir)

    X['preds'].to_csv(f'{valid_sub_dir}/{validation_name}.predict',
                      index=False,
                      header=None)
    with zipfile.ZipFile(f'{SUB_DIR}/{sub_name}.zip',
                         'w',
                         compression=zipfile.ZIP_DEFLATED) as new_zip:
        new_zip.write(f'{valid_sub_dir}/{validation_name}.predict',
                      arcname=f'{validation_name}.predict')
    shutil.rmtree(valid_sub_dir)
    mlflow.log_artifact(f'{SUB_DIR}/{sub_name}.zip')
    return
Example no. 14
def test_log_artifact():
    artifact_src_dir = tempfile.mkdtemp()
    # Create artifacts
    _, path0 = tempfile.mkstemp(dir=artifact_src_dir)
    _, path1 = tempfile.mkstemp(dir=artifact_src_dir)
    for i, path in enumerate([path0, path1]):
        with open(path, "w") as handle:
            handle.write("%s" % str(i))
    # Log an artifact, verify it exists in the directory returned by get_artifact_uri
    # after the run finishes
    artifact_parent_dirs = ["some_parent_dir", None]
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = mlflow.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)
            mlflow.log_artifact(path0, parent_dir)
        expected_dir = (
            os.path.join(run_artifact_dir, parent_dir)
            if parent_dir is not None
            else run_artifact_dir
        )
        assert os.listdir(expected_dir) == [os.path.basename(path0)]
        logged_artifact_path = os.path.join(expected_dir, os.path.basename(path0))
        assert filecmp.cmp(logged_artifact_path, path0, shallow=False)
    # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = mlflow.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)

            mlflow.log_artifacts(artifact_src_dir, parent_dir)
        # Check that the logged artifacts match
        expected_artifact_output_dir = (
            os.path.join(run_artifact_dir, parent_dir)
            if parent_dir is not None
            else run_artifact_dir
        )
        dir_comparison = filecmp.dircmp(artifact_src_dir, expected_artifact_output_dir)
        assert len(dir_comparison.left_only) == 0
        assert len(dir_comparison.right_only) == 0
        assert len(dir_comparison.diff_files) == 0
        assert len(dir_comparison.funny_files) == 0
Example no. 15
def log_to_mlflow(params,
                  metrics,
                  artifact_dict,
                  tags=None,
                  experiment_name="default",
                  run_name=None):
    """Logs metrics, parameters and artifacts to MLflow

    Args:
        params (dict of {str: str}): input parameters to the model
        metrics (dict of {str: numeric}): metrics output from the model
        artifact_dict (dict): file paths of artifacts
        tags (dict): dict of tags
        experiment_name (str): name of the MLflow experiment (default: "default")
        run_name (str): name of the MLflow run (default: None)

    """

    mlflow.set_tracking_uri(TRACKING_URL)
    mlflow.set_experiment(experiment_name)

    if isinstance(params, ModelBuildingSessionParams):
        params = params.dict()
    if isinstance(metrics, ModelBuildingSessionMetrics):
        metrics = metrics.dict()
    if isinstance(artifact_dict, ModelBuildingSessionOutputArtifacts):
        artifact_dict = artifact_dict.dict()

    params = flatten(params)
    metrics = flatten(metrics)
    artifact_dict = flatten(artifact_dict)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(params)
        for metric, value in metrics.items():
            if isinstance(value, numbers.Number):
                mlflow.log_metric(key=metric, value=value)
        for artifact, path in artifact_dict.items():
            if path is not None and os.path.isfile(path):
                mlflow.log_artifact(path)
        if tags is not None:
            mlflow.set_tags(tags)
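An illustrative call, with made-up parameter and artifact values (TRACKING_URL, flatten, and the ModelBuildingSession* classes come from the surrounding module):

log_to_mlflow(
    params={"model": "xgboost", "max_depth": 6},
    metrics={"rmse": 0.42},
    artifact_dict={"predictions": "outputs/predictions.csv"},
    tags={"stage": "dev"},
    experiment_name="demo",
    run_name="baseline",
)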
Example no. 16
def bagging_rgcn(adj_matrix_files, node_labels_file, node_features_file,
                 use_cuda, params, metadata):
    mlflow.set_experiment("LOOCV")

    with mlflow.start_run(run_name=RUN_NAME):
        mlflow.log_param("model", MODEL_NAME)
        u_mlflow.add_params(**params)
        u_mlflow.add_metadata(metadata)
        mlflow.set_tag("use_cuda", use_cuda)
        mlflow.log_param("merged_layers", len(adj_matrix_files) == 1)

        labels = data_loaders.load_labels(node_labels_file, use_cuda=use_cuda)
        n_nodes = labels.size(0)

        if node_features_file is not None:
            mlflow.log_param("node_features", True)
            mlflow.log_artifact(node_features_file, "inputs")
            features = data_loaders.load_node_features(node_features_file,
                                                       use_cuda)
        else:
            mlflow.log_param("node_features", False)
            features = None

        graph = data_loaders.load_graph(
            adj_matrix_files,
            n_nodes,
            add_edge_type=True,
            add_node_ids=True,
            normalization=NORMALIZATION,
            use_cuda=use_cuda,
        )

        print(RUN_NAME)
        ranks_df = loocv.run(labels=labels,
                             model_class=Bagging,
                             bagging_model=RGCN,
                             graph=graph,
                             features=features,
                             n_rels=len(adj_matrix_files),
                             **params)

        data_savers.save_ranks(ranks_df, n_nodes, RUN_NAME, params)
Example no. 17
    def __log(self, _type, log_obj, prepend=''):
        self.fprint(f"Logging {_type}...")

        if not self.mlflow_logging_enabled:
            print(log_obj)
            return

        if _type == 'params' and self.mlflow_param_logging_enabled:
            log_obj = prepend_key(log_obj, prepend)
            mlflow.log_params(log_obj)

        if _type == 'metrics' and self.mlflow_metric_logging_enabled:
            log_obj['metrics'] = prepend_key(log_obj['metrics'], prepend)
            mlflow.log_metrics(**log_obj)

        if _type == 'artifact' and self.mlflow_artifact_logging_enabled:
            mlflow.log_artifact(log_obj)

        if _type == 'artifacts' and self.mlflow_artifact_logging_enabled:
            mlflow.log_artifacts(log_obj)
Example no. 18
def new_model_log(**kwargs):
    """
    Hijack the mlflow.models.Model.log method and upload the .whylogs.yaml configuration to the model path
    This will allow us to pick up the configuration later under /opt/ml/model/.whylogs.yaml path
    """
    import mlflow

    global _original_model_log

    if not os.path.isfile(WHYLOG_YAML):
        logger.warning(
            'Unable to detect .whylogs.yaml file under current directory. whylogs will write to local disk in the '
            'container')
        _original_model_log(**kwargs)
        return
    if _original_model_log is None:
        raise RuntimeError(
            'MlFlow is not patched. Please call whylogs.enable_mlflow()')
    mlflow.log_artifact(WHYLOG_YAML, kwargs['artifact_path'])
    _original_model_log(**kwargs)
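The hijack described in the docstring relies on standard monkey-patching. A minimal sketch of how the patch might be installed (illustrative only, not the actual whylogs internals; it assumes Model.log is always invoked with keyword arguments, as new_model_log expects):

import mlflow.models

_original_model_log = None

def enable_patch():
    # Save the original classmethod (accessing it on the class yields a
    # bound callable), then swap in the wrapper defined above.
    global _original_model_log
    if _original_model_log is None:
        _original_model_log = mlflow.models.Model.log
        mlflow.models.Model.log = new_model_log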
Example no. 19
def make_predictions(X_test: pd.DataFrame, clf: Any,
                     parameters: Dict[str, Any]) -> np.ndarray:

    SK_ID_CURR_index = X_test['SK_ID_CURR']
    feats = [
        f for f in X_test.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]
    X_test = X_test[feats]

    y_test_pred = clf.predict_proba(X_test,
                                    num_iteration=clf.best_iteration_)[:, 1]

    X_test['TARGET'] = y_test_pred
    X_test['SK_ID_CURR'] = SK_ID_CURR_index
    X_test[['SK_ID_CURR', 'TARGET']].to_csv("submission.csv", index=False)

    mlflow.log_artifact("submission.csv")

    return y_test_pred
Example no. 20
    def log_results(self, extracted_features):
        with mlflow.start_run() as mlrun:
            local_dir = tempfile.mkdtemp()
            extracted_features_pkl = os.path.join(local_dir,
                                                  'extracted_features_df.pkl')

            pickle.dump(extracted_features, open(extracted_features_pkl, 'wb'))
            print('Going to log at..', extracted_features_pkl)

            mlflow.log_artifact(extracted_features_pkl, "extracted_features")
            mlflow.log_param('bigram_min_count', 2)
            mlflow.log_param('bigram_thresh', 10)
            mlflow.log_param('trigram_min_count', 3)
            mlflow.log_param('trigram_thresh', 10)

            unigram_distr, bigram_distr, trigram_distr = get_phrase_distribution(
                extracted_features.trigrams)
            mlflow.log_metric('unigrams_count', len(unigram_distr))
            mlflow.log_metric('bigrams_count', len(bigram_distr))
            mlflow.log_metric('trigrams_count', len(trigram_distr))
Example no. 21
def test_artifact_can_be_downloaded_from_absolute_uri_successfully(tmpdir):
    artifact_file_name = "artifact.txt"
    artifact_text = "Sample artifact text"
    local_artifact_path = tmpdir.join(artifact_file_name).strpath
    with open(local_artifact_path, "w") as out:
        out.write(artifact_text)

    logged_artifact_path = "artifact"
    with mlflow.start_run():
        mlflow.log_artifact(local_path=local_artifact_path,
                            artifact_path=logged_artifact_path)
        artifact_uri = mlflow.get_artifact_uri(
            artifact_path=logged_artifact_path)

    downloaded_artifact_path = os.path.join(
        _download_artifact_from_uri(artifact_uri), artifact_file_name)
    assert downloaded_artifact_path != local_artifact_path
    assert downloaded_artifact_path != logged_artifact_path
    with open(downloaded_artifact_path, "r") as f:
        assert f.read() == artifact_text
Example no. 22
def log(test_df: pd.DataFrame, score_train: float,
        score_val: float, model_param: dict):
    mlflow.set_tracking_uri('http://localhost:5000')
    experiment_name = 'coursera'
    mlflow.set_experiment(experiment_name)
    tracking = mlflow.tracking.MlflowClient()
    experiment = tracking.get_experiment_by_name(experiment_name)
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        model_name = model_param.pop('name')
        for name, value in model_param[model_name].items():
            mlflow.log_param(f"model_{name}", value)
        mlflow.log_param("model_name", model_name)

        mlflow.log_metric("train_rsme", score_train)
        mlflow.log_metric("val_rsme", score_val)

        filename = 'submission.csv'
        test_df.to_csv(filename, index=False)
        mlflow.log_artifact(filename)
        os.remove(filename)
Example no. 23
def load_raw_data(url):
    with mlflow.start_run() as mlrun:
        local_dir = tempfile.mkdtemp()
        local_filename = os.path.join(local_dir, "ml-20m.zip")
        print("Downloading %s to %s" % (url, local_filename))
        r = requests.get(url, stream=True)
        with open(local_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

        extracted_dir = os.path.join(local_dir, "ml-20m")
        print("Extracting %s into %s" % (local_filename, extracted_dir))
        with zipfile.ZipFile(local_filename, "r") as zip_ref:
            zip_ref.extractall(local_dir)

        ratings_file = os.path.join(extracted_dir, "ratings.csv")

        print("Uploading ratings: %s" % ratings_file)
        mlflow.log_artifact(ratings_file, "ratings-csv-dir")
Example no. 24
def run(alpha, run_origin, log_artifact):
    with mlflow.start_run(run_name=run_origin) as run:
        print("runId:",run.info.run_uuid)
        print("artifact_uri:",mlflow.get_artifact_uri())
        print("alpha:",alpha)
        print("log_artifact:",log_artifact)
        print("run_origin:",run_origin)
        mlflow.log_param("alpha", alpha)
        mlflow.log_metric("rmse", 0.789)
        mlflow.set_tag("run_origin", run_origin)
        mlflow.set_tag("log_artifact", log_artifact)
        if log_artifact:
            with open("info.txt", "w") as f:
                f.write("Hi artifact")
            mlflow.log_artifact("info.txt")

        params = [ Param("p1","0.1"), Param("p2","0.2") ]
        metrics = [ Metric("m1",0.1,now), Metric("m2",0.2,now) ]
        tags = [ RunTag("t1","hi1"), RunTag("t2","hi2") ]
        client.log_batch(run.info.run_uuid, metrics, params, tags)
Example no. 25
    def log_artifact(self, src_file_path: str):
        """
        Make a copy of the file under the logging directory.

        Args:
            src_file_path:
                Path of the file. If path is not a child of the logging directory, the file will be copied.
                If ``with_mlflow`` is True, ``mlflow.log_artifact`` will be called (then another copy will be made).
        """
        logging_path = os.path.abspath(self.logging_directory)
        src_file_path = os.path.abspath(src_file_path)

        if os.path.commonpath([logging_path]) != os.path.commonpath(
            [logging_path, src_file_path]):
            shutil.copy(src_file_path, self.logging_directory)

        if self.with_mlflow:
            import mlflow
            mlflow.log_artifact(src_file_path)
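Usage sketch (experiment here stands in for an instance of the surrounding class): the commonpath check means a file outside the logging directory is copied in, while a file already inside is left where it is and only re-logged to MLflow when with_mlflow is set.

experiment.log_artifact("/tmp/figures/loss.png")  # outside: copied into logging_directory
experiment.log_artifact(os.path.join(experiment.logging_directory, "log.txt"))  # inside: no copy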
Example no. 26
def test_download_artifact_from_absolute_uri_persists_data_to_specified_output_directory(tmpdir):
    artifact_file_name = "artifact.txt"
    artifact_text = "Sample artifact text"
    local_artifact_path = tmpdir.join(artifact_file_name).strpath
    with open(local_artifact_path, "w") as out:
        out.write(artifact_text)

    logged_artifact_subdir = "logged_artifact"
    with mlflow.start_run():
        mlflow.log_artifact(local_path=local_artifact_path, artifact_path=logged_artifact_subdir)
        artifact_uri = mlflow.get_artifact_uri(artifact_path=logged_artifact_subdir)

    artifact_output_path = tmpdir.join("artifact_output").strpath
    os.makedirs(artifact_output_path)
    _download_artifact_from_uri(artifact_uri=artifact_uri, output_path=artifact_output_path)
    assert logged_artifact_subdir in os.listdir(artifact_output_path)
    assert artifact_file_name in os.listdir(
        os.path.join(artifact_output_path, logged_artifact_subdir))
    with open(os.path.join(
            artifact_output_path, logged_artifact_subdir, artifact_file_name), "r") as f:
        assert f.read() == artifact_text
Example no. 27
    def _log_image_artifact(
        self,
        do_plot,
        artifact_name,
    ):
        import matplotlib.pyplot as pyplot

        artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".png"
        artifact_file_local_path = self.temp_dir.path(artifact_file_name)

        try:
            pyplot.clf()
            do_plot()
            pyplot.savefig(artifact_file_local_path)
        finally:
            pyplot.close(pyplot.gcf())

        mlflow.log_artifact(artifact_file_local_path)
        artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name))
        artifact._load(artifact_file_local_path)
        self.artifacts[artifact_name] = artifact
Example no. 28
            def _save(self, data: Any):
                # _get_save_path needs to be called before super, otherwise
                # it will throw exception that file under path already exist.
                local_path = (self._get_save_path() if hasattr(
                    self, "_version") else self._filepath)
                # it must be converted to a string with as_posix()
                # for logging on remote storage like Azure S3
                local_path = local_path.as_posix()

                super()._save(data)
                if self.run_id:
                    # if a run id is specified, we have to use mlflow client
                    # to avoid potential conflicts with an already active run
                    mlflow_client = MlflowClient()
                    mlflow_client.log_artifact(
                        run_id=self.run_id,
                        local_path=local_path,
                        artifact_path=self.artifact_path,
                    )
                else:
                    mlflow.log_artifact(local_path, self.artifact_path)
Example no. 29
    def run(self):
        data_df = pd.read_csv(self.input_file_path)

        with open(self.requires().output().path, 'rb') as f:
            tfidf = pickle.load(f)

        featurizer = MNBFeaturizer(tfidf)
        logger.info("Fitting MNB for category {}".format(self.category_name))
        featurizer.fit(data_df['comment_text'], data_df[self.category_name])
        try_mkdir(self.artefact_output_path)
        featurizer.save(self.output().path)

        try:
            mlflow.set_experiment(f'/mnb_category_{self.category_name}')
            with mlflow.start_run():
                logger.info("Sending MNB artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception as e:
            logger.error(
                "Something went wrong while trying to use MLflow tracking: %s",
                e)
Example no. 30
    def log_checkpoint(self, data: NumpySequences, checkpoint_num: int) -> None:
        print(f"Logging checkpoint {checkpoint_num}")
        mlflow.keras.log_model(self.combined, f"models/gan-{checkpoint_num}")
        mlflow.keras.log_model(self.generator, f"models/generator-{checkpoint_num}")
        mlflow.keras.log_model(
            self.discriminator, f"models/discriminator-{checkpoint_num}"
        )

        filename = f"confusion-{checkpoint_num}.txt"
        with open(filename, "w") as file:
            samples = numpy.concatenate(
                [self.generate(Gan.N_SAMPLES, i) for i in range(self.num_classes)]
            )
            confusion_matrix = create_confusion_matrix(
                samples, data, data.window_sequence.num_classes
            )
            numpy.set_printoptions(formatter={"float": "{: 0.3f}".format})
            file.write(str(confusion_matrix))
            numpy.set_printoptions()
        mlflow.log_artifact(filename)
        os.remove(filename)
Example no. 31
File: main.py Project: Daiver/jff
        # Log a metric; metrics can be updated throughout the run
        for i in range(200):
            time.sleep(0.1)
            log_metric("foo1", 1 * i)
            log_metric("foo2", 2 * i)
            log_metric("foo3", 3 * i)
            log_metric("foo4", 3 * i)
            log_metric("foo5", 3 * i)
            log_metric("foo6", 3 * i)
            log_metric("foo7", 3 * i)
            log_metric("foo8", 3 * i)
            log_metric("foo9", 3 * i)
            log_metric("foo10", 3 * i)
            log_metric("foo11", 3 * i)
            log_metric("foo12", 3 * i)
            log_metric("foo13", 3 * i)
            log_metric("foo14", 3 * i)
            log_metric("foo15", 3 * i)
            log_metric("foo16", 3 * i)
            log_metric("foo17", 3 * i)
            log_metric("foo18", 3 * i)
            log_metric("foo19", 3 * i)
            log_metric("foo20", 3 * i)

            # Log an artifact (output file)
            with open("output.txt", "w") as f:
                f.write("Hello world!{}".format(i))
            log_artifact("output.txt")