Example #1
def setup_method(self):
    """Called before every method."""
    self.X = [[4, 2, 3, 0], [2, 4, 3, 3], [2, 3, 0, 0]]
    self.y = [[0, 1], [1, 1], [1, 0]]
    self.max_filter_size = 2
    self.batch_size = 1
    self.dataset = data.CNNTextDataset(X=self.X, y=self.y, max_filter_size=self.max_filter_size)
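This fixture only builds the dataset. As a purely illustrative follow-up (not part of the original suite), a test could exercise the same `create_dataloader` method used in the later examples, assuming each batch unpacks into padded inputs and labels:

```python
def test_create_dataloader(self):
    """Hypothetical check that the fixture's dataset batches correctly."""
    dataloader = self.dataset.create_dataloader(batch_size=self.batch_size)
    batch_X, batch_y = next(iter(dataloader))  # assumes (inputs, labels) batches
    assert len(batch_X) == self.batch_size
    assert len(batch_y) == self.batch_size
```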
Example #2
def evaluate(
        df: pd.DataFrame,
        artifacts: Dict,
        device: torch.device = torch.device("cpu"),
) -> Tuple:
    """Evaluate performance on data.

    Args:
        df (pd.DataFrame): Dataframe (used for slicing).
        artifacts (Dict): Artifacts needed for inference.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Ground truth and predicted labels, performance.
    """
    # Artifacts
    params = artifacts["params"]
    model = artifacts["model"]
    tokenizer = artifacts["tokenizer"]
    label_encoder = artifacts["label_encoder"]
    model = model.to(device)
    classes = label_encoder.classes

    # Create dataloader
    X = np.array(tokenizer.texts_to_sequences(df.text.to_numpy()),
                 dtype="object")
    y = label_encoder.encode(df.tags)
    dataset = data.CNNTextDataset(X=X,
                                  y=y,
                                  max_filter_size=int(params.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(params.batch_size))

    # Determine predictions using threshold
    trainer = train.Trainer(model=model, device=device)
    y_true, y_prob = trainer.predict_step(dataloader=dataloader)
    y_pred = np.array(
        [np.where(prob >= float(params.threshold), 1, 0) for prob in y_prob])

    # Evaluate performance
    performance = get_metrics(df=df,
                              y_true=y_true,
                              y_pred=y_pred,
                              classes=classes)
    performance["behavioral"] = get_behavioral_report(artifacts=artifacts)

    return y_true, y_pred, performance
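A minimal usage sketch for `evaluate` (the run id is reused from Example #3, the dataframe contents are illustrative, and the `load_artifacts` helper referenced there is assumed to be available):

```python
artifacts = load_artifacts(run_id="264ac530b78c42608e5dea1086bc2c73")
test_df = pd.DataFrame({
    "text": ["Transfer learning with BERT."],
    "tags": [["natural-language-processing", "transformers"]],
})
y_true, y_pred, performance = evaluate(df=test_df, artifacts=artifacts)
print(performance["behavioral"])
```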
Example #3
def predict(
    texts: List, artifacts: Dict,
    device: torch.device = torch.device("cpu")) -> List[Dict]:
    """Predict tags for an input text using the
    best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    artifacts = load_artifacts(run_id="264ac530b78c42608e5dea1086bc2c73")
    predict(texts=texts, artifacts=artifacts)
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input parameter `texts` can hold multiple input texts, so the returned list of predictions will have `len(texts)` items.

    Args:
        texts (List): List of input texts to predict tags for.
        artifacts (Dict): Artifacts needed for inference.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Predicted tags for each of the input texts.

    """
    # Retrieve artifacts
    params = artifacts["params"]
    label_encoder = artifacts["label_encoder"]
    tokenizer = artifacts["tokenizer"]
    model = artifacts["model"]

    # Prepare data
    preprocessed_texts = [
        data.preprocess(
            text,
            lower=bool(strtobool(str(
                params.lower))),  # params.lower could be str/bool
            stem=bool(strtobool(str(params.stem))),
        ) for text in texts
    ]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts),
                 dtype="object")
    y_filler = np.zeros((len(X), len(label_encoder)))
    dataset = data.CNNTextDataset(X=X,
                                  y=y_filler,
                                  max_filter_size=int(params.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(params.batch_size))

    # Get predictions
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = [
        np.where(prob >= float(params.threshold), 1, 0) for prob in y_prob
    ]
    tags = label_encoder.decode(y_pred)
    predictions = [{
        "input_text": texts[i],
        "preprocessed_text": preprocessed_texts[i],
        "predicted_tags": tags[i],
    } for i in range(len(tags))]

    return predictions
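The docstring's usage snippet assumes a `load_artifacts` helper that is not defined in this section. A rough sketch of what it might look like, pieced together from the MLflow loading logic in Example #7 (the function name, file layout, and encoder class are assumptions):

```python
def load_artifacts(run_id: str,
                   device: torch.device = torch.device("cpu")) -> Dict:
    """Hypothetical loader mirroring the artifact download in Example #7."""
    client = mlflow.tracking.MlflowClient()
    run = mlflow.get_run(run_id=run_id)
    with tempfile.TemporaryDirectory() as fp:
        # Download the run's artifacts to a temporary directory
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)

    # Rebuild the model from the logged parameters and restore its weights
    params = Namespace(**run.data.params)
    model = models.initialize_model(params=params,
                                    vocab_size=len(tokenizer),
                                    num_classes=len(label_encoder),
                                    device=device)
    model.load_state_dict(model_state)
    return {"params": params, "label_encoder": label_encoder,
            "tokenizer": tokenizer, "model": model}
```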
Example #4
    print(f"{len(cv_transformers_df)} projects")
    print(cv_transformers_df[["text", "tags"]].head())
    short_text_df = slice_dataframe(test_df, short_text)
    print(f"{len(short_text_df)} projects")
    print(short_text_df[["text", "tags"]].head())

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=args.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=args.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=args.max_filter_size)
    test_dataset = data.CNNTextDataset(X=X_test,
                                       y=y_test,
                                       max_filter_size=args.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=args.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=args.batch_size)
    test_dataloader = test_dataset.create_dataloader(
        batch_size=args.batch_size)

    # Load artifacts
    runs = utils.get_sorted_runs(experiment_name="best",
Example #5
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
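A hedged usage sketch for `run`; the `params.json` file name mirrors the `args.json` pattern in Example #7 and is an assumption:

```python
# Illustrative only: the config file name and its keys are assumptions.
params = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json")))
artifacts = run(params=params)
print(f"Best validation loss: {artifacts['loss']:.4f}")
```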
Example #6
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=0.05,
                                                           patience=5)

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(params.num_epochs,
                                              params.patience,
                                              train_dataloader, val_dataloader)

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
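Because `train` accepts an optional Optuna trial, a hyperparameter search can wrap it in an objective function. A minimal sketch, assuming `lr` and `dropout_p` are tunable attributes on `params` (the parameter names and ranges are illustrative):

```python
def objective(trial: optuna.trial.Trial, params: Namespace) -> float:
    """Hypothetical Optuna objective that minimizes the best validation loss."""
    # Overwrite assumed entries on params with suggested values
    params.lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    params.dropout_p = trial.suggest_float("dropout_p", 0.3, 0.8)
    artifacts = train(params=params, trial=trial)
    return artifacts["loss"]

# study = optuna.create_study(direction="minimize")
# study.optimize(lambda trial: objective(trial, params), n_trials=10)
```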
Example #7
def predict(texts: List, run_id: str) -> List[Dict]:
    """Predict tags for an input text using the
    best model from the `best` experiment.

    Usage:

    ```python
    texts = ["Transfer learning with BERT."]
    predict(texts=texts, run_id="264ac530b78c42608e5dea1086bc2c73")
    ```
    <pre>
    [
      {
          "input_text": "Transfer learning with BERT.",
          "preprocessed_text": "transfer learning bert",
          "predicted_tags": [
            "attention",
            "language-modeling",
            "natural-language-processing",
            "transfer-learning",
            "transformers"
          ]
      }
    ]
    </pre>

    Note:
        The input argument `texts` can hold multiple input texts, so the returned list of predictions will have `len(texts)` items.

    Args:
        texts (List): List of input texts to predict tags for.
        run_id (str): ID of the run to load model artifacts from.

    Returns:
        Predicted tags for input texts.
    """
    # Load artifacts from run
    client = mlflow.tracking.MlflowClient()
    run = mlflow.get_run(run_id=run_id)
    device = torch.device("cpu")
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        args = Namespace(**utils.load_dict(
            filepath=Path(config.CONFIG_DIR, "args.json")))
        label_encoder = data.LabelEncoder.load(
            fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        # performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    args = Namespace(**run.data.params)
    model = train.initialize_model(args=args,
                                   vocab_size=len(tokenizer),
                                   num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    # Prepare data
    preprocessed_texts = [data.preprocess(text) for text in texts]
    X = np.array(tokenizer.texts_to_sequences(preprocessed_texts),
                 dtype="object")
    y_filler = label_encoder.encode(
        [np.array([label_encoder.classes[0]] * len(X))])
    dataset = data.CNNTextDataset(X=X,
                                  y=y_filler,
                                  max_filter_size=int(args.max_filter_size))
    dataloader = dataset.create_dataloader(batch_size=int(args.batch_size))

    # Get predictions
    trainer = train.Trainer(model=model, device=device)
    _, y_prob = trainer.predict_step(dataloader)
    y_pred = np.array(
        [np.where(prob >= float(args.threshold), 1, 0) for prob in y_prob])
    tags = label_encoder.decode(y_pred)
    predictions = [{
        "input_text": texts[i],
        "preprocessed_text": preprocessed_texts[i],
        "predicted_tags": tags[i],
    } for i in range(len(tags))]

    return predictions