Example #1
def load(shuffle: bool, num_samples: int = 0) -> Tuple[pd.DataFrame, Dict, Dict]:
    """Load the data from the local drive into a Pandas DataFrame.

    Args:
        shuffle (bool): Shuffle the data.
        num_samples (int, optional): Number of samples to include (used for quick testing). Defaults to 0, which includes all samples.

    Returns:
        DataFrame, projects dictionary, and tags dictionary.
    """
    # Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects_dict = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.load_dict(filepath=tags_fp)

    # Create dataframe
    df = pd.DataFrame(projects_dict)

    # Shuffling since projects are chronologically organized
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)

    # Subset
    if num_samples:
        df = df[:num_samples]

    return df, projects_dict, tags_dict
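
# Usage sketch for the loader above, assuming `config.DATA_DIR` holds the two
# JSON files referenced in the function body:
df, projects_dict, tags_dict = load(shuffle=True)
df_small, _, _ = load(shuffle=False, num_samples=100)  # quick smoke-test subset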
Example #2
def load_artifacts(run_id: str, device: torch.device = torch.device("cpu")) -> Dict:
    """Load artifacts for current model.

    Args:
        run_id (str): ID of the model run to load artifacts.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    artifact_uri = mlflow.get_run(run_id=run_id).info.artifact_uri.split("file://")[-1]
    params = Namespace(**utils.load_dict(filepath=Path(artifact_uri, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(artifact_uri, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(artifact_uri, "tokenizer.json"))
    model_state = torch.load(Path(artifact_uri, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(artifact_uri, "performance.json"))

    # Initialize model
    model = models.initialize_model(
        params=params, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
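
# Usage sketch (the run ID is illustrative, borrowed from Example #22 below):
artifacts = load_artifacts(run_id="264ac530b78c42608e5dea1086bc2c73")
print(artifacts["performance"]["overall"]["f1"])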
Example #3
def load_artifacts(
    model_dir: Path = config.MODEL_DIR,
    device: torch.device = torch.device("cpu")
) -> Dict:
    """Load artifacts for current model.

    Args:
        model_dir (Path): location of model artifacts. Defaults to config.MODEL_DIR.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    params = Namespace(**utils.load_dict(
        filepath=Path(model_dir, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(
        fp=Path(model_dir, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(model_dir, "tokenizer.json"))
    model_state = torch.load(Path(model_dir, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(model_dir, "performance.json"))

    # Initialize model
    model = models.initialize_model(params=params,
                                    vocab_size=len(tokenizer),
                                    num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
Example #4
def train_model(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    model_dir: Optional[Path] = Path(config.MODEL_DIR),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        params_fp (Path, optional): Parameters to use for training. Defaults to `config/params.json`.
        model_dir (Path): Location of model artifacts. Defaults to config.MODEL_DIR.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Load parameters
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        run_id = mlflow.active_run().info.run_id

        # Train
        artifacts = main.run(params=params)

        # Set tags
        tags = {}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": performance["behavioral"]["score"],
            "slices_f1": performance["slices"]["overall"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            utils.save_dict(vars(artifacts["params"]),
                            Path(dp, "params.json"),
                            cls=NumpyEncoder)
            utils.save_dict(performance, Path(dp, "performance.json"))
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["params"]))

    # Save for repo
    with open(Path(model_dir, "run_id.txt"), "w") as fp:
        fp.write(run_id)
    utils.save_dict(vars(params),
                    Path(model_dir, "params.json"),
                    cls=NumpyEncoder)
    utils.save_dict(performance, Path(model_dir, "performance.json"))
Example #5
def test_save_and_load_dict():
    with tempfile.TemporaryDirectory() as dp:
        d = {"hello": "world"}
        fp = Path(dp, "d.json")
        utils.save_dict(d=d, filepath=fp)
        d = utils.load_dict(filepath=fp)
        assert d["hello"] == "world"
Example #6
def performance(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag: str = "workspace",
    verbose: bool = True,
):
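    """Return performance metrics, either from the local workspace (`tag="workspace"`) or from a tagged commit on GitHub."""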
    if tag == "workspace":
        performance = utils.load_dict(filepath=Path(config.MODEL_DIR, "performance.json"))
    else:
        url = f"https://raw.githubusercontent.com/{author}/{repo}/{tag}/model/performance.json"
        performance = utils.load_json_from_url(url=url)
    if verbose:
        logger.info(json.dumps(performance, indent=2))
    return performance
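
# `utils.load_json_from_url` is assumed to fetch and parse a raw JSON file; a
# minimal sketch using only the standard library:
import json
from urllib.request import urlopen

def load_json_from_url(url: str) -> dict:
    """Download a JSON payload from `url` and parse it into a dictionary."""
    with urlopen(url) as response:
        return json.loads(response.read())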
Example #7
def train_model(
    args_fp: Path = Path(config.CONFIG_DIR, "args.json"),
    experiment_name: Optional[str] = "best",
    run_name: Optional[str] = "model",
) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training. Defaults to `config/args.json`.
        experiment_name (str, optional): Name of the experiment to save the run to. Defaults to `best`.
        run_name (str, optional): Name of the run. Defaults to `model`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = main.run(args=args)

        # Set tags
        tags = {"data_version": artifacts["data_version"]}
        mlflow.set_tags(tags)

        # Log metrics
        performance = artifacts["performance"]
        logger.info(json.dumps(performance["overall"], indent=2))
        metrics = {
            "precision": performance["overall"]["precision"],
            "recall": performance["overall"]["recall"],
            "f1": performance["overall"]["f1"],
            "best_val_loss": artifacts["loss"],
            "behavioral_score": artifacts["behavioral_report"]["score"],
            "slices_f1": performance["slices"]["f1"],
        }
        mlflow.log_metrics(metrics)

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
            artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
            torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
            utils.save_dict(performance, Path(dp, "performance.json"))
            utils.save_dict(artifacts["behavioral_report"],
                            Path(dp, "behavioral_report.json"))
            mlflow.log_artifacts(dp)
        mlflow.log_params(vars(artifacts["args"]))
Example #8
def compute_features(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
) -> None:
    """Compute and save features for training.

    Args:
        params_fp (Path, optional): Location of parameters (just using num_samples,
                                    num_epochs, etc.) to use for training.
                                    Defaults to `config/params.json`.
    """
    # Parameters
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Compute features
    data.compute_features(params=params)
    logger.info("✅ Computed features!")
Example #9
def optimize(
    params_fp: Path = Path(config.CONFIG_DIR, "params.json"),
    study_name: Optional[str] = "optimization",
    num_trials: int = 100,
) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's parameters into `config/params.json`.

    Args:
        params_fp (Path, optional): Location of parameters (just using num_samples,
                                    num_epochs, etc.) to use for training.
                                    Defaults to `config/params.json`.
        study_name (str, optional): Name of the study to save trial runs under. Defaults to `optimization`.
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting parameters (not actually used but needed for setup)
    params = Namespace(**utils.load_dict(filepath=params_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name,
                                direction="maximize",
                                pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(),
                                     metric_name="f1")
    study.optimize(
        lambda trial: main.objective(params, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"], ascending=False)

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**params.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(params_fp, "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
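
# `main.objective` is not shown here. A hedged sketch of an Optuna objective
# compatible with this setup (the hyperparameter names are illustrative; it
# returns the value being maximized and stashes the tuned threshold that is
# read back above via `study.best_trial.user_attrs`):
def objective(params, trial):
    """Hypothetical objective: train with suggested hyperparameters, return f1."""
    params.lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    params.dropout_p = trial.suggest_float("dropout_p", 0.3, 0.8)
    artifacts = main.run(params=params, trial=trial)  # as in Example #19
    trial.set_user_attr("threshold", artifacts["params"].threshold)
    return artifacts["performance"]["overall"]["f1"]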
Example #10
def optimize(num_trials: int = 100) -> None:
    """Optimize a subset of hyperparameters towards an objective.

    This saves the best trial's arguments into `config/args.json`.

    Args:
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting arguments (not actually used but needed for setup)
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(
        study_name="optimization", direction="maximize", pruner=pruner
    )
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="f1"
    )
    study.optimize(
        lambda trial: train.objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(
        ["value"], ascending=False
    )  # sort by metric
    trials_df.to_csv(
        Path(config.EXPERIMENTS_DIR, "trials.csv"), index=False
    )  # save

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**args.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(Path(config.CONFIG_DIR, "args.json"), "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))
Example #11
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.

    Args:
        args_fp (Path, optional): Location of arguments to use for training.
                                  Defaults to `config/args.json`.
    """
    # Load arguments
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(
        run_name="cnn"
    ) as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))
    logger.info(json.dumps(performance["overall"], indent=2))
Example #12
def load_artifacts(
        run_id: str,
        device: torch.device = torch.device("cpu"),
) -> Dict:
    """Load artifacts for a particular `run_id`.

    Args:
        run_id (str): ID of the run to load model artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    client = mlflow.tracking.MlflowClient()
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.LabelEncoder.load(
            fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    run = mlflow.get_run(run_id=run_id)
    args = Namespace(**run.data.params)
    model = models.initialize_model(args=args,
                                    vocab_size=len(tokenizer),
                                    num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
Example #13
def df():
    # Load features
    params_fp = Path(config.CONFIG_DIR, "params.json")
    params = Namespace(**utils.load_dict(filepath=params_fp))
    df, _ = data.compute_features(params=params)
    return df
Example #14
def df():
    projects_fp = Path(config.DATA_DIR, "projects.json")
    projects_dict = utils.load_dict(filepath=projects_fp)
    df = pd.DataFrame(projects_dict)
    return df
Example #15
def tags():
    tags_fp = Path(config.DATA_DIR, "tags.json")
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    tags = list(tags_dict.keys())
    return tags
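
# `utils.list_to_dict` is assumed to re-key a list of records by one of their
# fields; a minimal sketch:
from typing import Dict, List

def list_to_dict(list_of_dicts: List[Dict], key: str) -> Dict:
    """[{"tag": "cv", ...}, {"tag": "nlp", ...}] -> {"cv": {...}, "nlp": {...}}"""
    return {item[key]: item for item in list_of_dicts}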
Example #16

if __name__ == "__main__":  # pragma: no cover, playground for eval components
    import json
    from argparse import Namespace
    from pathlib import Path

    import numpy as np
    import pandas as pd

    from tagifai import config, data, main, utils
    from tagifai.config import logger

    # Load arguments
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(projects)
    if args.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
Example #17
def diff(commit_a: str = "workspace",
         commit_b: str = "head"):  # pragma: no cover
    """Compare relevant differences (params, metrics) between commits.
    Inspired by DVC's `dvc metrics diff`but repurposed to
    display diffs pertinent to our experiments.

    Args:
        commit_a (str, optional): Primary commit. Defaults to "workspace".
        commit_b (str, optional): Commit to compare to. Defaults to "head".

    Raises:
        ValueError: Invalid commit.
    """
    diffs = {}
    commits = ["a", "b"]
    if commit_a.lower() in ("head", "current"):
        commit_a = "main"
    if commit_b.lower() in ("head", "current"):
        commit_b = "main"

    # Get params
    params = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            params[commits[i]] = utils.load_dict(
                filepath=Path(config.CONFIG_DIR, "params.json"))
            continue
        params_url = (
            f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/params.json"
        )
        params[commits[i]] = utils.load_json_from_url(url=params_url)

    # Parameter differences
    diffs["params"] = {}
    for arg in params["a"]:
        a = params["a"][arg]
        b = params["b"][arg]
        if a != b:
            diffs["params"][arg] = {commit_a: a, commit_b: b}
    logger.info(
        f"Parameter differences:\n{json.dumps(diffs['params'], indent=2)}")

    # Get metrics
    metrics = {"a": {}, "b": {}}
    for i, commit in enumerate([commit_a, commit_b]):
        if commit == "workspace":
            metrics[commits[i]] = utils.load_dict(
                filepath=Path(config.MODEL_DIR, "performance.json"))
            continue
        metrics_url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/performance.json"
        metrics[commits[i]] = utils.load_json_from_url(url=metrics_url)

    # Recursively flatten
    metrics_a = pd.json_normalize(metrics["a"],
                                  sep=".").to_dict(orient="records")[0]
    metrics_b = pd.json_normalize(metrics["b"],
                                  sep=".").to_dict(orient="records")[0]
    if metrics_a.keys() != metrics_b.keys():
        raise Exception(
            "Cannot compare these commits because they have different metrics."
        )

    # Metric differences
    diffs["metrics"] = {}
    diffs["metrics"]["improvements"] = {}
    diffs["metrics"]["regressions"] = {}
    for metric in metrics_a:
        if ((metric in metrics_b) and (metrics_a[metric] != metrics_b[metric])
                and (isinstance(metrics_a[metric], numbers.Number))
                and (metric.split(".")[-1] != "num_samples")):
            item = {
                commit_a: metrics_a[metric],
                commit_b: metrics_b[metric],
                "diff": metrics_a[metric] - metrics_b[metric],
            }
            if item["diff"] >= 0.0:
                diffs["metrics"]["improvements"][metric] = item
            else:
                diffs["metrics"]["regressions"][metric] = item
    logger.info(
        f"Metric differences:\n{json.dumps(diffs['metrics'], indent=2)}")

    return diffs
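
# The flattening step above relies on `pd.json_normalize` with a dotted
# separator, which turns nested metrics into flat, comparable keys:
import pandas as pd

metrics = {"overall": {"f1": 0.8}, "slices": {"overall": {"f1": 0.7}}}
flat = pd.json_normalize(metrics, sep=".").to_dict(orient="records")[0]
# -> {"overall.f1": 0.8, "slices.overall.f1": 0.7}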
Example #18
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=0.05,
                                                           patience=5)

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(params.num_epochs,
                                              params.patience,
                                              train_dataloader, val_dataloader)

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
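
# `find_best_threshold` is not defined in this example. A plausible sketch,
# sweeping decision thresholds and keeping the one with the best micro f1
# (consistent with the `np.where(prob >= threshold, ...)` usage in Example #22):
import numpy as np
from sklearn.metrics import f1_score

def find_best_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    """Return the probability threshold that maximizes micro-averaged f1."""
    best_threshold, best_f1 = 0.5, 0.0
    for threshold in np.arange(0.05, 1.0, 0.05):
        y_pred = np.where(y_prob >= threshold, 1, 0)
        f1 = f1_score(y_true, y_pred, average="micro")
        if f1 > best_f1:
            best_threshold, best_f1 = float(threshold), f1
    return best_threshold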
Example #19
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
Example #20
st.title("Tagifai · MLOps · Made With ML")
"""by [Goku Mohandas](https://twitter.com/GokuMohandas)"""
st.info("🔍 Explore the different pages below.")

# Pages
pages = ["Data", "Performance", "Inference", "Inspection"]
st.header("Pages")
selected_page = st.radio("Select a page:", pages, index=2)

if selected_page == "Data":
    st.header("Data")

    # Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    col1, col2 = st.beta_columns(2)
    with col1:
        st.subheader("Projects (sample)")
        st.write(projects[0])
    with col2:
        st.subheader("Tags")
        tag = st.selectbox("Choose a tag", list(tags_dict.keys()))
        st.write(tags_dict[tag])

    # Dataframe
    df = pd.DataFrame(projects)
    st.text(f"Projects (count: {len(df)}):")
    st.write(df)
Example #21
def get_performance(model_dir):
    performance = utils.load_dict(filepath=Path(model_dir, "performance.json"))
    return performance
Example #22
def predict(texts: List, run_id: str) -> Dict:
    """Predict tags for an input text using the
    best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    predict(texts=texts, run_id="264ac530b78c42608e5dea1086bc2c73")
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input argument `texts` can hold multiple input texts, so the resulting prediction list will have `len(texts)` items.

    Args:
        texts (List): List of input texts to predict tags for.
        run_id (str): ID of the run to load model artifacts from.

    Returns:
        Predicted tags for input texts.
    """
    # Load artifacts from run
    client = mlflow.tracking.MlflowClient()
    run = mlflow.get_run(run_id=run_id)
    device = torch.device("cpu")
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.LabelEncoder.load(
            fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        # performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    args = Namespace(**run.data.params)
    model = train.initialize_model(args=args,
                                   vocab_size=len(tokenizer),
                                   num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    # Prepare data
    preprocessed_texts = [data.preprocess(text) for text in texts]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = label_encoder.encode(
        [np.array([label_encoder.classes[0]] * len(X))])
    dataset = data.CNNTextDataset(X=X,
                                  y=y_filler,
                                  max_filter_size=int(args.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(args.batch_size))

    # Get predictions
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = np.array(
        [np.where(prob >= float(args.threshold), 1, 0) for prob in y_prob])
    tags = label_encoder.decode(y_pred)
    predictions = [{
        "input_text": texts[i],
        "preprocessed_text": preprocessed_texts[i],
        "predicted_tags": tags[i],
    } for i in range(len(tags))]

    return predictions