Example #1
def generate_slice_labels(X, Y, slice_funcs):
    """
    Args:
        X: [N x D] data
        Y: [N x 1] categorical labels \in {1, 2}
        slice_funcs [dict]: mapping slice_names to slice_fn(X),
            which returns an [N x 1] boolean mask indicating whether each
            example is in the slice

    Returns:
        slice_labels [dict]: mapping slice_names to dict of {
            pred: [N x 1] \in {0, 1, 2}; the original Y labels, abstaining
                (with 0) on examples not in the slice
            ind: [N x 1] \in {1, 2}; the slice-membership mask in
                categorical format
        }
    """
    slice_labels = {}
    for slice_name, slice_fn in slice_funcs.items():
        slice_mask = slice_fn(X)
        Y_gt = Y.copy()
        # if not in slice, abstain with label = 0
        Y_gt[np.logical_not(slice_mask)] = 0

        # convert from True/False mask -> 1,2 categorical labels
        categorical_indicator = convert_labels(slice_mask.astype(int),
                                               "onezero", "categorical")

        slice_labels[slice_name] = {"ind": categorical_indicator, "pred": Y_gt}

    return slice_labels
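
# Illustrative usage sketch (not part of the original snippet). It assumes the
# conventions documented above: categorical labels in {1, 2}, abstains encoded
# as 0, and a convert_labels helper that maps True -> 1 and False -> 2.
import numpy as np

X_demo = np.array([[0.2], [1.5], [3.0]])
Y_demo = np.array([1, 2, 1])  # categorical labels in {1, 2}
slice_funcs_demo = {"small": lambda X: X[:, 0] < 1.0}

demo_labels = generate_slice_labels(X_demo, Y_demo, slice_funcs_demo)
# demo_labels["small"]["pred"] -> array([1, 0, 0])  (abstains outside the slice)
# demo_labels["small"]["ind"]  -> array([1, 2, 2])  (in-slice indicator)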
Example #2
    def score(self, X, Y, metric='f1', verbose=True):
        """Return the score(s) of the model's predictions on X against Y.

        Args:
            X: input data
            Y: categorical gold labels
            metric: a metric name or list of metric names to compute
            verbose: if True, print each score
        """
        Y = convert_labels(Y, 'categorical', 'onezero')
        Y_p = self.predict(X)

        metric_list = metric if isinstance(metric, list) else [metric]
        scores = []
        for metric_name in metric_list:
            score = metric_score(Y, Y_p, metric_name)
            scores.append(score)
            if verbose:
                print(f"{metric_name.capitalize()}: {score:.3f}")

        # Return a bare score for a single metric, otherwise the full list
        if len(scores) == 1:
            return scores[0]
        return scores
Example #3
def plot_calibration_plot(Y_probs, Y_gold, bins=20, title=None):
    """Plot the binned accuracy of predictions as a function of their predicted probability

    Args:
        Y_probs: An [n] or [n, 1] np.ndarray of probabilities (floats in [0,1])
        Y_gold: An [n] or [n, 1] np.ndarray of gold labels

    For a well-behaved classifier, the plot should be a U-shape.
    """
    # For now, we only tackle binary classification with categorical labels
    assert all(Y_gold > 0)
    assert all(Y_gold <= 2)

    if Y_probs.ndim > 1:
        print("Plotting probabilities from the first column of Y_probs")
        Y_probs = Y_probs[:, 0]
    Y_preds = convert_labels((Y_probs > 0.5).astype(np.int64), "onezero",
                             "categorical")

    correct_idxs = Y_preds == Y_gold
    centers = []
    accuracies = []
    interval = 1 / bins
    for i in range(bins + 1):
        if i == bins:
            bin_idxs = (interval * i <= Y_probs) * (Y_probs <= 1)
        else:
            bin_idxs = (interval * i <= Y_probs) * (Y_probs < interval *
                                                    (i + 1))
        num_in_bin = sum(bin_idxs)
        # Guard against empty bins, which would otherwise divide by zero
        bin_accuracy = (sum(bin_idxs * correct_idxs) / num_in_bin
                        if num_in_bin else float("nan"))
        centers.append(interval * (i + 0.5))
        accuracies.append(bin_accuracy)

    plt.plot(centers, accuracies)
    plt.xlim((0, 1.025))
    plt.xlabel("Probability")
    plt.ylabel("Accuracy")
    if isinstance(title, str):
        plt.title(title)
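
# Illustrative usage sketch (not part of the original snippet). It assumes
# categorical gold labels in {1, 2}, probabilities for the positive class,
# and that numpy and matplotlib are importable.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
demo_probs = rng.uniform(0, 1, size=500)
# A perfectly calibrated predictor: gold label is 1 with probability demo_probs
demo_golds = np.where(rng.uniform(0, 1, size=500) < demo_probs, 1, 2)
plot_calibration_plot(demo_probs, demo_golds, bins=10, title="Calibration")
plt.show()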
Example #4
# # Train the Generative Model

# After visualizing the label functions and their associated properties, it is time to work on the generative model. As in most machine learning pipelines, the first step is to find the best hyperparameters for this model. Using grid search, the following parameters were optimized: the number of burn-in steps, the regularization strength, and the number of epochs to train the model.

# ## Set the hyperparameter grid search

# In[13]:

import numpy as np

regularization_grid = np.round(np.linspace(0.1, 6, num=25), 3)

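# A fuller grid covering all three hyperparameters mentioned above might look like the sketch below. This is purely illustrative: the parameter names and the burn-in/epoch ranges are placeholders rather than values from this analysis; only the regularization grid is defined in the cell above.

# In[ ]:

hyperparameter_grid = {
    "reg_param": regularization_grid,
    "burn_in": np.arange(0, 101, 25),    # placeholder range
    "epochs": np.arange(50, 301, 50),    # placeholder range
}
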
# ## What are the best hyperparameters for the conditionally independent model?

# In[14]:

L = convert_labels(label_matricies['train'].toarray(), 'plusminus',
                   'categorical')
L_dev = convert_labels(label_matricies['dev'].toarray(), 'plusminus',
                       'categorical')
L_test = convert_labels(label_matricies['test'].toarray(), 'plusminus',
                        'categorical')

validation_data = list(
    zip([L[:, :7], L[:, :24], L], [L_dev[:, :7], L_dev[:, :24], L_dev]))
test_data = list(
    zip([L[:, :7], L[:, :24], L], [L_test[:, :7], L_test[:, :24], L_test]))
model_labels = ["Distant Supervision (DS)", "DS+User Defined Rules", "All"]

# In[15]:

model_grid_search = {}
for model_data, model_label in zip(validation_data, model_labels):
Example #5
def categorical_cross_entropy(X, Y):
    """Binary cross-entropy on sigmoid outputs, with categorical {1, 2}
    targets converted to {1, 0}."""
    return F.binary_cross_entropy(
        torch.sigmoid(X["data"]),
        convert_labels(Y, "categorical", "onezero").float())
Example #6
    def train(self, X, Y, X_dev=None, Y_dev=None, **kwargs):
        # Convert categorical {1, 2} labels to {1, 0} before fitting the
        # underlying sklearn-style model
        Y_bin = convert_labels(Y, 'categorical', 'onezero')
        self.model.fit(X, Y_bin)
Example #7
def create_dataframe(
    task_name,
    model,
    dl,
    target_uids=None,
    max_batches=None,
    bert_model="bert-base-uncased",
):
    """Create dataframe with datapoint, predicted score, and true label.

    Args:
        task_name: task to create evaluation information for
        model: MetalModel object of model to evaluate
        dl: DataLoader object for task_name task
        target_uids: uids to evaluate on; if None, evaluate on all examples
        max_batches: number of batches to evaluate before stopping (useful
            for large datasets)
        bert_model: name of the BERT model whose tokenizer is used to
            reconstruct sentences from token ids

    Returns:
        DataFrame with one row per datapoint: sentences, predicted score,
        true label, binary prediction, and uid
    """
    if task_name == "MNLI":
        raise NotImplementedError("We currently assume binary tasks")

    # Use BERT model to convert tokenization to sentence
    data = {
        "sentence1": [],
        "sentence2": [],
        "label": [],
        "score": [],
        "uid": []
    }
    do_lower_case = "uncased" in bert_model
    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=do_lower_case)

    # Create a list of examples and associated predicted score and true label
    count = 0
    all_uids = dl.dataset.uids

    # assuming data_loader batch_size=1
    for (x, y), uid in tqdm(zip(list(dl), all_uids)):
        if target_uids and uid not in target_uids:
            continue

        tokens_idx = x[0][0]
        tokens = tokenizer.convert_ids_to_tokens(tokens_idx.numpy())
        phrases = (" ".join(tokens).replace("[PAD]",
                                            "").replace("[CLS]",
                                                        "").split("[SEP]"))
        data["sentence1"] += [phrases[0]]
        if len(phrases) > 1:
            data["sentence2"] += [phrases[1]]
        else:
            data["sentence2"] += ["NA"]

        # Probability of the positive class (first column of the predicted probs)
        probs = np.array(model.calculate_probs(x, [task_name])[task_name])
        scores = probs[:, 0]

        # Score is the predicted probabilistic label, label is the ground truth
        data["score"] += list(scores)
        data["label"] += list(y[task_name].numpy())
        data["uid"].append(uid)
        count += 1
        if max_batches and count > max_batches:
            break

    # Create DataFrame with datapoint, score, label, pred, uid
    df_error = pd.DataFrame(
        data, columns=["sentence1", "sentence2", "score", "label", "uid"])
    df_error["label"] = convert_labels(df_error["label"].values, "categorical",
                                       "onezero")
    df_error["pred"] = 1 * (df_error["score"] > 0.5)

    df_error["is_wrong"] = df_error["pred"] != df_error["label"]
    return df_error
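
# Illustrative usage sketch (not part of the original snippet). The task name,
# model, and dataloader below are placeholders for objects built elsewhere in
# the pipeline; only the call pattern is shown.
#
#     df = create_dataframe("RTE", model, dataloaders["RTE"], max_batches=1000)
#     errors = df[df["is_wrong"]]
#     print(errors[["sentence1", "sentence2", "score", "label"]].head())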