Example #1
def predict_tags(
        text: Optional[
            str] = "Transfer learning with BERT for self-supervised learning",
        run_id: str = open(Path(config.MODEL_DIR, "run_id.txt")).read(),
) -> Dict:
    """Predict tags for a give input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
                              Defaults to "Transfer learning with BERT for self-supervised learning".
        run_id (str): ID of the model run to load artifacts from. Defaults to the run ID stored in config.MODEL_DIR.

    Raises:
        ValueError: Run id doesn't exist in experiment.

    Returns:
        Predicted tags for input text.
    """
    # Predict
    artifacts = main.load_artifacts(run_id=run_id)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))

    return prediction
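
Note that the `run_id` default above is evaluated once, when the function is defined, because the file read sits in the signature. A minimal sketch of a call-time alternative (the `MODEL_DIR` constant and the placeholder return value are assumptions, not part of the original):

from pathlib import Path
from typing import Dict, Optional

MODEL_DIR = Path("model")  # assumed stand-in for config.MODEL_DIR


def predict_tags_lazy(
    text: Optional[str] = "Transfer learning with BERT for self-supervised learning",
    run_id: Optional[str] = None,
) -> Dict:
    """Variant that reads run_id.txt at call time instead of definition time."""
    if run_id is None:
        run_id = Path(MODEL_DIR, "run_id.txt").read_text().strip()
    # The real example would now call main.load_artifacts and predict.predict.
    return {"text": text, "run_id": run_id}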
Example #2
def predict_tags(
    text: Optional[
        str] = "Transfer learning with BERT for self-supervised learning",
    model_dir: Path = config.MODEL_DIR,
) -> Dict:
    """Predict tags for a give input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
                              Defaults to "Transfer learning with BERT for self-supervised learning".
        model_dir (Path): location of model artifacts. Defaults to config.MODEL_DIR.

    Raises:
        ValueError: Run id doesn't exist in experiment.

    Returns:
        Predicted tags for input text.
    """
    # Predict
    artifacts = main.load_artifacts(model_dir=model_dir)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))

    return prediction
Example #3
def predict_tags(
    text: str = "Transfer learning with BERT for self-supervised learning",
    run_id: str = "",
) -> Dict:
    """Predict tags for a give input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
                              Defaults to "Transfer learning with BERT for self-supervised learning".
        run_id (str, optional): ID of the run to load model artifacts from.
                                Defaults to model with lowest `best_val_loss` from the `best` experiment.

    Returns:
        Predicted tags for input text.
    """
    # Get best run
    if not run_id:
        experiment_id = mlflow.get_experiment_by_name("best").experiment_id
        all_runs = mlflow.search_runs(
            experiment_ids=experiment_id,
            order_by=["metrics.best_val_loss ASC"],
        )
        run_id = all_runs.iloc[0].run_id

    # Predict
    prediction = predict.predict(texts=[text], run_id=run_id)
    logger.info(json.dumps(prediction, indent=2))

    return prediction
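
The best-run lookup can be exercised on its own; a sketch that wraps the same `mlflow.search_runs` call (the experiment name and metric come from the example above, the error messages are assumptions):

import mlflow


def get_best_run_id(experiment_name: str = "best", metric: str = "best_val_loss") -> str:
    """Return the ID of the run with the lowest `metric` in `experiment_name`."""
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(f"Experiment {experiment_name} does not exist.")
    all_runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=[f"metrics.{metric} ASC"],  # ascending so the lowest loss comes first
    )
    if all_runs.empty:
        raise ValueError(f"No runs found in experiment {experiment_name}.")
    return all_runs.iloc[0].run_id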
Example #4
def train_model(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    model_dir: Optional[Path] = Path(config.MODEL_DIR),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        params_fp (Path, optional): Parameters to use for training. Defaults to `config/params.json`.
        model_dir (Path): location of model artifacts. Defaults to config.MODEL_DIR.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Set experiment and start run
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        run_id = mlflow.active_run().info.run_id

        # Train
        artifacts = main.run(params=params)

        # Set tags
        tags = {}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": performance["behavioral"]["score"],
            "slices_f1": performance["slices"]["overall"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            utils.save_dict(vars(artifacts["params"]),
                            Path(dp, "params.json"),
                            cls=NumpyEncoder)
            utils.save_dict(performance, Path(dp, "performance.json"))
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["params"]))

    # Save for repo
    open(Path(model_dir, "run_id.txt"), "w").write(run_id)
    utils.save_dict(vars(params),
                    Path(model_dir, "params.json"),
                    cls=NumpyEncoder)
    utils.save_dict(performance, Path(model_dir, "performance.json"))
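
The MLflow bookkeeping above follows one pattern: scalar metrics via `log_metrics`, files staged in a temporary directory and shipped with `log_artifacts`, hyperparameters via `log_params`. A stripped-down, self-contained sketch with placeholder values:

import json
import tempfile
from pathlib import Path

import mlflow

mlflow.set_experiment(experiment_name="demo")
with mlflow.start_run(run_name="model"):
    mlflow.log_metrics({"f1": 0.7, "best_val_loss": 0.35})    # scalar metrics
    with tempfile.TemporaryDirectory() as dp:                 # file artifacts
        Path(dp, "performance.json").write_text(json.dumps({"f1": 0.7}))
        mlflow.log_artifacts(dp)
    mlflow.log_params({"lr": 2e-4, "num_epochs": 10})         # hyperparameters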
Example #5
def health_check():
    response = requests.get(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/",
        headers=headers,
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
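
`headers` is defined at module level in the source; a self-contained version of the same check (the JSON accept header is an assumption):

import os

import requests


def health_check() -> dict:
    """Ping the service root and return the parsed JSON response."""
    headers = {"accept": "application/json"}  # assumed; module-level in the source
    response = requests.get(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/",
        headers=headers,
    )
    response.raise_for_status()
    return response.json()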
Example #6
def load_best_artifacts():
    global runs, run_ids, best_artifacts, best_run_id
    runs = utils.get_sorted_runs(experiment_name="best",
                                 order_by=["metrics.f1 DESC"])
    run_ids = [run["run_id"] for run in runs]
    best_run_id = run_ids[0]
    best_artifacts = predict.load_artifacts(run_id=best_run_id)
    logger.info(
        "Loaded trained model and other required artifacts for inference!")
Example #7
def set_artifact_metadata():
    """Set the artifact URI for all experiments and runs.
    Used when transferring experiments from other locations (ex. Colab).

    Note:
        check out the [optimize.ipynb](https://colab.research.google.com/github/GokuMohandas/applied-ml/blob/main/notebooks/optimize.ipynb){:target="_blank"} notebook for how to train on Google Colab and transfer to local.
    """
    def set_artifact_location(var, fp):
        """Set variable's yaml value on file at fp."""
        with open(fp) as f:
            metadata = yaml.safe_load(f)

        # Set new value
        experiment_id = metadata[var].split("/")[-1]
        artifact_location = Path("file://", config.EXPERIMENTS_DIR,
                                 experiment_id)
        metadata[var] = str(artifact_location)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    def set_artifact_uri(var, fp):
        """Set variable's yaml value on file at fp."""
        with open(fp) as f:
            metadata = yaml.safe_load(f)

        # Set new value
        experiment_id = metadata[var].split("/")[-3]
        run_id = metadata[var].split("/")[-2]
        artifact_uri = Path(
            "file://",
            config.EXPERIMENTS_DIR,
            experiment_id,
            run_id,
            "artifacts",
        )
        metadata[var] = str(artifact_uri)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    # Get artifact location
    experiment_meta_yamls = list(
        Path(config.EXPERIMENTS_DIR).glob("*/meta.yaml"))
    for meta_yaml in experiment_meta_yamls:
        set_artifact_location(var="artifact_location", fp=meta_yaml)
        logger.info(f"Set artfifact location for {meta_yaml}")

    # Change artifact URI
    run_meta_yamls = list(Path(config.EXPERIMENTS_DIR).glob("*/*/meta.yaml"))
    for meta_yaml in run_meta_yamls:
        set_artifact_uri(var="artifact_uri", fp=meta_yaml)
        logger.info(f"Set artfifact URI for {meta_yaml}")
Example #8
    def update_behavioral_report(run_id):
        with mlflow.start_run(run_id=run_id):
            # Generate behavioral report
            artifacts = main.load_artifacts(run_id=run_id)
            behavioral_report = eval.get_behavioral_report(artifacts=artifacts)
            mlflow.log_metric("behavioral_score", behavioral_report["score"])

            # Log artifacts
            with tempfile.TemporaryDirectory() as dp:
                utils.save_dict(behavioral_report,
                                Path(dp, "behavioral_report.json"))
                mlflow.log_artifacts(dp)
        logger.info(f"Updated behavioral report for run_id {run_id}")
Example #9
def performance(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag: str = "workspace",
    verbose: bool = True,
):
    if tag == "workspace":
        performance = utils.load_dict(filepath=Path(config.MODEL_DIR, "performance.json"))
    else:
        url = f"https://raw.githubusercontent.com/{author}/{repo}/{tag}/model/performance.json"
        performance = utils.load_json_from_url(url=url)
    if verbose:
        logger.info(json.dumps(performance, indent=2))
    return performance
Example #10
def train_model(
    args_fp: Path = Path(config.CONFIG_DIR, "args.json"),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training. Defaults to `config/args.json`.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name
                          ) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = main.run(args=args)

        # Set tags
        tags = {"data_version": artifacts["data_version"]}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": artifacts["behavioral_report"]["score"],
            "slices_f1": performance["slices"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            utils.save_dict(artifacts["behavioral_report"],
                            Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["args"]))
Example #11
def predict():
    data = {
        "run_id": "",
        "texts": [
            {"text": "Transfer learning with transformers for self-supervised learning."},
            {"text": "Generative adversarial networks in both PyTorch and TensorFlow."},
        ],
    }
    response = requests.post(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/predict",
        headers=headers,
        data=json.dumps(data),
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
Example #12
def predict_tags(
    text: Optional[
        str] = "Transfer learning with BERT for self-supervised learning",
    experiment_name: Optional[str] = "best",
    run_id: Optional[str] = "",
) -> Dict:
    """Predict tags for a give input text using a trained model.

    Warning:
        Make sure that you have a trained model first!

    Args:
        text (str, optional): Input text to predict tags for.
                              Defaults to "Transfer learning with BERT for self-supervised learning".
        experiment_name (str, optional): Name of the experiment to fetch run from.
        run_id (str, optional): ID of the run to load model artifacts from.
                                Defaults to run with highest F1 score.

    Raises:
        ValueError: Run id doesn't exist in experiment.

    Returns:
        Predicted tags for input text.
    """
    # Get sorted runs
    runs = utils.get_sorted_runs(
        experiment_name=experiment_name,
        order_by=["metrics.f1 DESC"],
    )
    run_ids = [run["run_id"] for run in runs]

    # Get best run
    if not run_id:
        run_id = run_ids[0]

    # Validate run id
    if run_id not in run_ids:  # pragma: no cover, simple value check
        raise ValueError(
            f"Run_id {run_id} does not exist in experiment {experiment_name}")

    # Predict
    artifacts = main.load_artifacts(run_id=run_id)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))

    return prediction
Example #13
def download_data():
    """Download data from online to local drive.

    Note:
        We could've just copied files from `datasets` but
        we'll use this later on with other data sources.
    """
    # Download data
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json"
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    projects = utils.load_json_from_url(url=projects_url)
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=projects, filepath=projects_fp)
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Data downloaded!")
Example #14
def optimize(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    study_name: Optional[str] = "optimization",
    num_trials: int = 100,
) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's parameters into `config/params.json`.

    Args:
        params_fp (Path, optional): Location of parameters (just using num_samples,
                                  num_epochs, etc.) to use for training.
                                  Defaults to `config/params.json`.
        study_name (str, optional): Name of the study to save trial runs under. Defaults to `optimization`.
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting parameters (not actually used but needed for set up)
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name,
                                direction="maximize",
                                pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(),
                                     metric_name="f1")
    study.optimize(
        lambda trial: main.objective(params, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**params.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(params_fp, "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
Example #15
def optimize(num_trials: int = 100) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's arguments into `config/args.json`.

    Args:
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting arguments (not actually used but needed for set up)
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(
        study_name="optimization", direction="maximize", pruner=pruner
    )
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="f1"
    )
    study.optimize(
        lambda trial: train.objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(
        ["value"], ascending=False
    )  # sort by metric
    trials_df.to_csv(
        Path(config.EXPERIMENTS_DIR, "trials.csv"), index=False
    )  # save

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**args.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(Path(config.CONFIG_DIR, "args.json"), "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
Example #16
def objective(args: Namespace, trial: optuna.trial._trial.Trial) -> float:
    """Objective function for optimization trials.
    Args:
        args (Namespace): Input arguments for each trial (see `config/args.json` for argument names).
        trial (optuna.trial._trial.Trial): Optuna optimization trial.
    Returns:
        F1 score from evaluating the trained model on the test data split.
    """
    # Parameters (to tune)
    args.embedding_dim = trial.suggest_int("embedding_dim", 128, 512)
    args.num_filters = trial.suggest_int("num_filters", 128, 512)
    args.hidden_dim = trial.suggest_int("hidden_dim", 128, 512)
    args.dropout_p = trial.suggest_uniform("dropout_p", 0.3, 0.8)
    args.lr = trial.suggest_loguniform("lr", 5e-5, 5e-4)

    # Train (can move some of these outside for efficiency)
    logger.info(f"\nTrial {trial.number}:")
    logger.info(json.dumps(trial.params, indent=2))
    artifacts = run(args=args, trial=trial)

    # Set additional attributes
    args = artifacts["args"]
    performance = artifacts["performance"]
    logger.info(json.dumps(performance["overall"], indent=2))
    trial.set_user_attr("threshold", args.threshold)
    trial.set_user_attr("precision", performance["overall"]["precision"])
    trial.set_user_attr("recall", performance["overall"]["recall"])
    trial.set_user_attr("f1", performance["overall"]["f1"])

    return performance["overall"]["f1"]
Example #17
def diff(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag_a: str = "workspace",
    tag_b: str = "",
):  # pragma: no cover, can't be certain what diffs will exist
    # Tag b
    if tag_b == "":
        tags_url = f"https://api.github.com/repos/{author}/{repo}/tags"
        tag_b = utils.load_json_from_url(url=tags_url)[0]["name"]
    logger.info(f"Comparing {tag_a} with {tag_b}:")

    # Params
    params_a = params(author=author, repo=repo, tag=tag_a, verbose=False)
    params_b = params(author=author, repo=repo, tag=tag_b, verbose=False)
    params_diff = utils.dict_diff(d_a=params_a, d_b=params_b, d_a_name=tag_a, d_b_name=tag_b)
    logger.info(f"Parameter differences: {json.dumps(params_diff, indent=2)}")

    # Performance
    performance_a = performance(author=author, repo=repo, tag=tag_a, verbose=False)
    performance_b = performance(author=author, repo=repo, tag=tag_b, verbose=False)
    performance_diff = utils.dict_diff(
        d_a=performance_a, d_b=performance_b, d_a_name=tag_a, d_b_name=tag_b
    )
    logger.info(f"Performance differences: {json.dumps(performance_diff, indent=2)}")

    return params_diff, performance_diff
Example #18
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
                                  Defaults to `config/args.json`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(
        run_name="cnn"
    ) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))
    logger.info(json.dumps(performance["overall"], indent=2))
Example #19
    def train(
        self,
        num_epochs: int,
        patience: int,
        train_dataloader: torch.utils.data.DataLoader,
        val_dataloader: torch.utils.data.DataLoader,
    ) -> Tuple:
        """Training loop.

        Args:
            num_epochs (int): Maximum number of epochs to train for (can stop earlier based on performance).
            patience (int): Number of consecutive epochs of degrading performance to tolerate before stopping early.
            train_dataloader (torch.utils.data.DataLoader): Dataloader object with training data split.
            val_dataloader (torch.utils.data.DataLoader): Dataloader object with validation data split.

        Raises:
            optuna.TrialPruned: Early stopping of the optimization trial if poor performance.

        Returns:
            The best validation loss and the trained model from that point.
        """

        best_val_loss = np.inf
        best_model = None
        _patience = patience
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            self.scheduler.step(val_loss)

            # Pruning based on the intermediate value
            if self.trial:
                self.trial.report(val_loss, epoch)
                if self.trial.should_prune():  # pragma: no cover, optuna pruning
                    logger.info("Unpromising trial pruned!")
                    raise optuna.TrialPruned()

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience  # reset _patience
            else:  # pragma: no cover, simple subtraction
                _patience -= 1
            if not _patience:  # pragma: no cover, simple break
                logger.info("Stopping early!")
                break

            # Logging
            logger.info(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
        return best_val_loss, best_model
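
The early-stopping bookkeeping is independent of any model; a stripped-down sketch with a fake validation curve (values are illustrative only):

import numpy as np

val_losses = [0.9, 0.7, 0.6, 0.65, 0.64, 0.63]  # fake validation curve
patience = 3
best_val_loss, _patience = np.inf, patience
for epoch, val_loss in enumerate(val_losses):
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        _patience = patience  # reset on improvement
    else:
        _patience -= 1
    if not _patience:
        print(f"Stopping early at epoch {epoch + 1}")
        break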
Example #20
def clean_experiments(experiments_to_keep: str = "best"):
    """Removes all experiments besides the
    ones specified in `experiments_to_keep`.

    Args:
        experiments_to_keep (str): Comma-separated string of experiments to keep.
    """
    # Get experiments to keep
    experiments_to_keep = list(
        set([exp.strip() for exp in experiments_to_keep.split(",")]))
    if not len(experiments_to_keep) or not experiments_to_keep[0]:
        raise ValueError("You must keep at least one experiment.")

    # Filter and delete
    client = mlflow.tracking.MlflowClient()
    for experiment in client.list_experiments():
        if experiment.name not in experiments_to_keep:  # pragma: no cover, mlflow function
            logger.info(f"Deleting Experiment {experiment.name}")
            client.delete_experiment(experiment_id=experiment.experiment_id)

    # Delete MLFlow trash
    shutil.rmtree(Path(config.EXPERIMENTS_DIR, ".trash"))
    logger.info(f"Cleared experiments besides {experiments_to_keep}")
Example #21
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=args.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=args.max_filter_size)
    test_dataset = data.CNNTextDataset(X=X_test,
                                       y=y_test,
                                       max_filter_size=args.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=args.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=args.batch_size)
    test_dataloader = test_dataset.create_dataloader(
        batch_size=args.batch_size)

    # Load artifacts
    runs = utils.get_sorted_runs(experiment_name="best",
                                 order_by=["metrics.f1 DESC"])
    run_ids = [run["run_id"] for run in runs]
    artifacts = main.load_artifacts(run_id=run_ids[0],
                                    device=torch.device("cpu"))

    # Evaluation
    device = torch.device("cpu")
    performance, behavioral_report = evaluate(
        artifacts=artifacts,
        dataloader=test_dataloader,
        df=test_df,
        device=device,
    )
    logger.info(json.dumps(performance, indent=2))
    logger.info(json.dumps(behavioral_report, indent=2))
Example #22
def run(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.
    1. Set seed
    2. Set device
    3. Load data
    4. Clean data
    5. Preprocess data
    6. Encode labels
    7. Split data
    8. Tokenize inputs
    9. Create dataloaders
    10. Initialize model
    11. Train model
    12. Evaluate model
    Args:
        args (Namespace): Input arguments for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.
    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    df, projects_dict, tags_dict = data.load(
        shuffle=args.shuffle, num_samples=args.num_samples
    )

    # 4. Clean data
    df, tags_dict, tags_above_frequency = data.clean(
        df=df, tags_dict=tags_dict, min_tag_freq=args.min_tag_freq
    )

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=args.lower, stem=args.stem)

    # 6. Encode labels
    y, class_weights, label_encoder = data.encode_labels(labels=df.tags)

    # 7. Split data
    utils.set_seed(seed=args.seed)  # needed for skmultilearn
    X_train, X_val, X_test, y_train, y_val, y_test = data.split(
        X=df.text.to_numpy(), y=y, train_size=args.train_size
    )

    # 8. Tokenize inputs
    X_train, tokenizer = data.tokenize_text(
        X=X_train, char_level=args.char_level
    )
    X_val, _ = data.tokenize_text(
        X=X_val, char_level=args.char_level, tokenizer=tokenizer
    )
    X_test, _ = data.tokenize_text(
        X=X_test, char_level=args.char_level, tokenizer=tokenizer
    )

    # 9. Create dataloaders
    train_dataloader = data.get_dataloader(
        data=[X_train, y_train],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    val_dataloader = data.get_dataloader(
        data=[X_val, y_val],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    test_dataloader = data.get_dataloader(
        data=[X_test, y_test],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )

    # 10. Initialize model
    model = models.initialize_model(
        args=args,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Arguments: {json.dumps(args.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    args, model, loss = train(
        args=args,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    device = torch.device("cpu")
    performance = evaluate(
        dataloader=test_dataloader,
        model=model.to(device),
        device=device,
        threshold=args.threshold,
        classes=label_encoder.classes,
    )

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
        "performance": performance,
    }
Example #23
def diff(commit_a: str = "workspace",
         commit_b: str = "head"):  # pragma: no cover
    """Compare relevant differences (params, metrics) between commits.
    Inspired by DVC's `dvc metrics diff` but repurposed to
    display diffs pertinent to our experiments.

    Args:
        commit_a (str, optional): Primary commit. Defaults to "workspace".
        commit_b (str, optional): Commit to compare to. Defaults to "head".

    Raises:
        ValueError: Invalid commit.
    """
    diffs = {}
    commits = ["a", "b"]
    if commit_a.lower() in ("head", "current"):
        commit_a = "main"
    if commit_b.lower() in ("head", "current"):
        commit_b = "main"

    # Get params
    params = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            params[commits[i]] = utils.load_dict(
                filepath=Path(config.CONFIG_DIR, "params.json"))
            continue
        params_url = (
            f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/params.json"
        )
        params[commits[i]] = utils.load_json_from_url(url=params_url)

    # Parameter differences
    diffs["params"] = {}
    for arg in params["a"]:
        a = params["a"][arg]
        b = params["b"][arg]
        if a != b:
            diffs["params"][arg] = {commit_a: a, commit_b: b}
    logger.info(
        f"Parameter differences:\n{json.dumps(diffs['params'], indent=2)}")

    # Get metrics
    metrics = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            metrics[commits[i]] = utils.load_dict(
                filepath=Path(config.MODEL_DIR, "performance.json"))
            continue
        metrics_url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/performance.json"
        metrics[commits[i]] = utils.load_json_from_url(url=metrics_url)

    # Recursively flatten
    metrics_a = pd.json_normalize(metrics["a"],
                                  sep=".").to_dict(orient="records")[0]
    metrics_b = pd.json_normalize(metrics["b"],
                                  sep=".").to_dict(orient="records")[0]
    if metrics_a.keys() != metrics_b.keys():
        raise Exception(
            "Cannot compare these commits because they have different metrics."
        )

    # Metric differences
    diffs["metrics"] = {}
    diffs["metrics"]["improvements"] = {}
    diffs["metrics"]["regressions"] = {}
    for metric in metrics_a:
        if ((metric in metrics_b) and (metrics_a[metric] != metrics_b[metric])
                and (isinstance(metrics_a[metric], numbers.Number))
                and (metric.split(".")[-1] != "num_samples")):
            item = {
                commit_a: metrics_a[metric],
                commit_b: metrics_b[metric],
                "diff": metrics_a[metric] - metrics_b[metric],
            }
            if item["diff"] >= 0.0:
                diffs["metrics"]["improvements"][metric] = item
            else:
                diffs["metrics"]["regressions"][metric] = item
    logger.info(
        f"Metric differences:\n{json.dumps(diffs['metrics'], indent=2)}")

    return diffs
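
`pd.json_normalize` is what lets the nested performance dictionaries be compared key by key; a quick illustration with made-up metrics:

import pandas as pd

performance = {"overall": {"f1": 0.71, "precision": 0.78}, "slices": {"overall": {"f1": 0.65}}}
flat = pd.json_normalize(performance, sep=".").to_dict(orient="records")[0]
print(flat)  # {'overall.f1': 0.71, 'overall.precision': 0.78, 'slices.overall.f1': 0.65}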
Example #24
def load_artifacts():
    global artifacts
    run_id = open(Path(config.MODEL_DIR, "run_id.txt")).read()
    artifacts = main.load_artifacts(run_id=run_id)
    logger.info("Ready for inference!")
Example #25
def load_artifacts():
    global artifacts
    artifacts = main.load_artifacts(model_dir=config.MODEL_DIR)
    logger.info("Ready for inference!")
Example #26
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
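
The class-weight step above is just inverse label frequency; a tiny standalone illustration (the label indices are made up):

import numpy as np

label_indices = [0, 0, 0, 1, 1, 2]                     # every label occurrence, as class indices
counts = np.bincount(label_indices)                    # array([3, 2, 1])
class_weights = {i: 1.0 / count for i, count in enumerate(counts)}
print(class_weights)                                   # {0: 0.333..., 1: 0.5, 2: 1.0}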