Code Example #1
def test_save_and_load_dict():
    with tempfile.TemporaryDirectory() as dp:
        d = {"hello": "world"}
        fp = Path(dp, "d.json")
        utils.save_dict(d=d, filepath=fp)
        d = utils.load_dict(filepath=fp)
        assert d["hello"] == "world"
Code Example #2
def compute_features(params: Namespace):
    """Compute features to use for training.

    Args:
        params (Namespace): Input parameters for operations.

    Returns:
        The features DataFrame and the list of feature records that were saved.
    """
    # Set up
    utils.set_seed(seed=params.seed)

    # Load data
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
    )
    projects = utils.load_json_from_url(url=projects_url)
    df = pd.DataFrame(projects)

    # Compute features
    df["text"] = df.title + " " + df.description
    df.drop(columns=["title", "description"], inplace=True)
    df = df[["id", "created_on", "text", "tags"]]

    # Save
    features = df.to_dict(orient="records")
    df_dict_fp = Path(config.DATA_DIR, "features.json")
    utils.save_dict(d=features, filepath=df_dict_fp)

    return df, features
Code Example #3
File: cli.py  Project: Peaky8linders/applied-ml
def behavioral_reevaluation(
    model_dir: Path = config.MODEL_DIR,
):  # pragma: no cover, requires changing existing runs
    """Reevaluate existing runs on current behavioral tests in eval.py.
    This is possible since behavioral tests are inputs applied to black box
    models and compared with expected outputs. There is not dependency on
    data or model versions.

    Args:
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.

    Raises:
        ValueError: Run id doesn't exist in experiment.
    """

    # Generate behavioral report
    artifacts = main.load_artifacts(model_dir=model_dir)
    artifacts["performance"]["behavioral"] = eval.get_behavioral_report(
        artifacts=artifacts)
    mlflow.log_metric("behavioral_score",
                      artifacts["performance"]["behavioral"]["score"])

    # Log updated performance
    utils.save_dict(artifacts["performance"],
                    Path(model_dir, "performance.json"))
Code Example #4
File: cli.py  Project: Peaky8linders/applied-ml
def train_model(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    model_dir: Optional[Path] = Path(config.MODEL_DIR),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        params_fp (Path, optional): Location of parameters to use for training. Defaults to `config/params.json`.
        model_dir (Path, optional): Location of model artifacts. Defaults to config.MODEL_DIR.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Load parameters
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        # Train
        artifacts = main.run(params=params)

        # Set tags
        tags = {}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": performance["behavioral"]["score"],
            "slices_f1": performance["slices"]["overall"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["params"]))

    # Save for repo
    with open(Path(model_dir, "params.json"), "w") as fp:
        json.dump(vars(params), fp=fp, indent=2, cls=NumpyEncoder)
    artifacts["label_encoder"].save(Path(model_dir, "label_encoder.json"))
    artifacts["tokenizer"].save(Path(model_dir, "tokenizer.json"))
    torch.save(artifacts["model"].state_dict(), Path(model_dir, "model.pt"))
    utils.save_dict(performance, Path(model_dir, "performance.json"))
Code Example #5
def download_auxiliary_data():
    """Load auxiliary data from URL and save to local drive."""
    # Download auxiliary data
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Auxiliary data downloaded!")
Code Example #6
    def update_behavioral_report(run_id):
        with mlflow.start_run(run_id=run_id):
            # Generate behavioral report
            artifacts = main.load_artifacts(run_id=run_id)
            behavioral_report = eval.get_behavioral_report(artifacts=artifacts)
            mlflow.log_metric("behavioral_score", behavioral_report["score"])

            # Log artifacts
            with tempfile.TemporaryDirectory() as dp:
                utils.save_dict(behavioral_report,
                                Path(dp, "behavioral_report.json"))
                mlflow.log_artifacts(dp)
        logger.info(f"Updated behavioral report for run_id {run_id}")
Code Example #7
def train_model(
    args_fp: Path = Path(config.CONFIG_DIR, "args.json"),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training. Defaults to `config/args.json`.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = main.run(args=args)

        # Set tags
        tags = {"data_version": artifacts["data_version"]}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": artifacts["behavioral_report"]["score"],
            "slices_f1": performance["slices"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            utils.save_dict(artifacts["behavioral_report"],
                            Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["args"]))
Code Example #8
File: main.py  Project: 00mjk/applied-ml
def download_data():
    """Download data from online to local drive.

    Note:
        We could've just copied files from `datasets` but
        we'll use this later on with other data sources.
    """
    # Download data
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json"
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    projects = utils.load_json_from_url(url=projects_url)
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=projects, filepath=projects_fp)
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Data downloaded!")
Code Example #9
def optimize(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    study_name: Optional[str] = "optimization",
    num_trials: int = 100,
) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's parameters into `config/params.json`.

    Args:
        params_fp (Path, optional): Location of parameters to use for training
                                    (only num_samples, num_epochs, etc. are used).
                                    Defaults to `config/params.json`.
        study_name (str, optional): Name of the study to save trial runs under. Defaults to `optimization`.
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting parameters (not actually used but needed for set up)
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name,
                                direction="maximize",
                                pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(),
                                     metric_name="f1")
    study.optimize(
        lambda trial: main.objective(params, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**params.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    utils.save_dict(params, params_fp, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
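
NumpyEncoder, passed via cls= here and in Code Example #4, is not defined on this page. A typical sketch, assuming it is a json.JSONEncoder subclass that converts NumPy types into JSON-serializable Python types, would be:

import json

import numpy as np


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder for NumPy scalars and arrays (assumed sketch)."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)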
Code Example #10
File: main.py  Project: 00mjk/applied-ml
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
                                  Defaults to `config/args.json`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(
        run_name="cnn"
    ) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))
    logger.info(json.dumps(performance["overall"], indent=2))