import numpy as np

from metal.utils import convert_labels  # assumed import path for the label-conversion helper


def generate_slice_labels(X, Y, slice_funcs):
    """
    Args:
        X: [N x D] data
        Y: [N x 1] categorical labels in {1, 2}
        slice_funcs [dict]: mapping slice_names to slice_fn(X), which returns an
            [N x 1] boolean mask indicating whether each example is in the slice

    Returns:
        slice_labels [dict]: mapping slice_names to dict of
            {
                pred: [N x 1] in {0, 1, 2}; the original Y, abstaining
                    (with 0) on examples not in the slice
                ind: [N x 1] in {1, 2}; the slice-membership mask in
                    categorical format
            }
    """
    slice_labels = {}
    for slice_name, slice_fn in slice_funcs.items():
        slice_mask = slice_fn(X)
        Y_gt = Y.copy()

        # if not in slice, abstain with label = 0
        Y_gt[np.logical_not(slice_mask)] = 0

        # convert from True/False mask -> 1, 2 categorical labels
        categorical_indicator = convert_labels(
            slice_mask.astype(np.int64), "onezero", "categorical"
        )
        slice_labels[slice_name] = {"ind": categorical_indicator, "pred": Y_gt}

    return slice_labels
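# Hypothetical usage sketch for generate_slice_labels; the data, labels, and slice
# function below are made up, and Y follows MeTaL's categorical {1, 2} convention.
X_toy = np.array([[0.5], [-1.0], [2.0], [-0.2]])         # [N x D] data
Y_toy = np.array([1, 2, 1, 2])                           # categorical labels in {1, 2}
slice_funcs_toy = {"x_positive": lambda X: X[:, 0] > 0}  # boolean slice-membership mask

toy_slice_labels = generate_slice_labels(X_toy, Y_toy, slice_funcs_toy)
# toy_slice_labels["x_positive"]["pred"] -> [1, 0, 1, 0]  (0 = abstain outside the slice)
# toy_slice_labels["x_positive"]["ind"]  -> [1, 2, 1, 2]  (1 = in slice, 2 = not in slice)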
def score(self, X, Y, metric='f1', verbose=True):
    Y = convert_labels(Y, 'categorical', 'onezero')
    Y_p = self.predict(X)

    metric_list = metric if isinstance(metric, list) else [metric]
    scores = []
    for metric in metric_list:
        score = metric_score(Y, Y_p, metric)
        scores.append(score)
        if verbose:
            print(f"{metric.capitalize()}: {score:.3f}")

    # Return a bare float when a single metric was requested
    if len(scores) == 1:
        return scores[0]
    else:
        return scores
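# Hypothetical usage sketch for score(); `model` stands in for an instance of the
# (unnamed) class this method belongs to, and X_dev / Y_dev are placeholder arrays.
# Metric names follow MeTaL's metric_score (e.g. "accuracy", "precision", "f1").
f1 = model.score(X_dev, Y_dev, metric="f1")                      # single float
acc, f1 = model.score(X_dev, Y_dev, metric=["accuracy", "f1"])   # list of scores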
import matplotlib.pyplot as plt
import numpy as np

from metal.utils import convert_labels  # assumed import path for the label-conversion helper


def plot_calibration_plot(Y_probs, Y_gold, bins=20, title=None):
    """Plot the accuracy of predictions binned by their confidence

    Args:
        Y_probs: An [n] or [n, 1] np.ndarray of probabilities (floats in [0, 1])
        Y_gold: An [n] or [n, 1] np.ndarray of gold labels

    For a well-behaved classifier, the plot should be a U-shape.
    """
    # For now, we only tackle binary classification with categorical labels
    assert all(Y_gold > 0)
    assert all(Y_gold <= 2)

    if Y_probs.ndim > 1:
        print("Plotting probabilities from the first column of Y_probs")
        Y_probs = Y_probs[:, 0]
    Y_preds = convert_labels((Y_probs > 0.5).astype(np.int64), "onezero", "categorical")

    correct_idxs = Y_preds == Y_gold
    centers = []
    accuracies = []
    interval = 1 / bins
    for i in range(bins + 1):
        if i == bins:
            bin_idxs = (interval * i <= Y_probs) * (Y_probs <= 1)
        else:
            bin_idxs = (interval * i <= Y_probs) * (Y_probs < interval * (i + 1))
        if sum(bin_idxs) == 0:
            continue  # skip empty bins to avoid division by zero
        bin_accuracy = sum(bin_idxs * correct_idxs) / sum(bin_idxs)
        centers.append(interval * (i + 0.5))
        accuracies.append(bin_accuracy)

    plt.plot(centers, accuracies)
    plt.xlim((0, 1.025))
    plt.xlabel("Probability")
    plt.ylabel("Accuracy")
    if isinstance(title, str):
        plt.title(title)
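# Hypothetical usage sketch for plot_calibration_plot; Y_probs_dev (predicted
# probabilities of class 1) and Y_dev (gold labels in {1, 2}) are placeholders.
plot_calibration_plot(Y_probs_dev, Y_dev, bins=20, title="Dev-set calibration")
plt.show()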
# # Train the Generative Model

# After visualizing the label functions and their associated properties, it is time
# to work on the generative model. As with most machine learning pipelines, the
# first step is to find the best hyperparameters for this model. Using a grid
# search, the following parameters were optimized: the amount of burn-in, the
# strength of regularization, and the number of epochs to run the model.

# ## Set the hyperparameter grid search

# In[13]:


# np (numpy) is assumed to be imported earlier in the notebook; pd.np is deprecated
regularization_grid = np.round(np.linspace(0.1, 6, num=25), 3)


# ## What are the best hyperparameters for the conditionally independent model?

# In[14]:


L = convert_labels(label_matricies['train'].toarray(), 'plusminus', 'categorical')
L_dev = convert_labels(label_matricies['dev'].toarray(), 'plusminus', 'categorical')
L_test = convert_labels(label_matricies['test'].toarray(), 'plusminus', 'categorical')

validation_data = list(
    zip([L[:, :7], L[:, :24], L], [L_dev[:, :7], L_dev[:, :24], L_dev])
)
test_data = list(
    zip([L[:, :7], L[:, :24], L], [L_test[:, :7], L_test[:, :24], L_test])
)

model_labels = ["Distant Supervision (DS)", "DS+User Defined Rules", "All"]


# In[15]:


model_grid_search = {}
for model_data, model_label in zip(validation_data, model_labels):
import torch
import torch.nn.functional as F


def categorical_cross_entropy(X, Y):
    # Binary cross-entropy on sigmoid scores, with categorical {1, 2} labels
    # converted to {1, 0} targets
    return F.binary_cross_entropy(
        torch.sigmoid(X["data"]),
        convert_labels(Y, "categorical", "onezero").float(),
    )
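# Hypothetical usage sketch for categorical_cross_entropy; the tensors are made up,
# and convert_labels is assumed to accept torch LongTensors (as MeTaL's helper does).
logits = {"data": torch.tensor([2.0, -1.0, 0.5])}  # raw scores for three examples
Y_cat = torch.tensor([1, 2, 1])                    # categorical labels in {1, 2}
loss = categorical_cross_entropy(logits, Y_cat)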
def train(self, X, Y, X_dev=None, Y_dev=None, **kwargs):
    # Convert categorical {1, 2} labels to {1, 0} before fitting the wrapped model
    Y_bin = convert_labels(Y, 'categorical', 'onezero')
    self.model.fit(X, Y_bin)
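# Hypothetical usage sketch; `wrapper` stands in for an instance of the (unnamed)
# class this train() method belongs to, whose self.model is assumed to be an
# sklearn-style estimator exposing fit().
wrapper.train(X_train, Y_train)  # Y_train passed in categorical {1, 2} format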
import numpy as np
import pandas as pd
from pytorch_pretrained_bert import BertTokenizer  # assumed source of BertTokenizer
from tqdm import tqdm

from metal.utils import convert_labels  # assumed import path for the label-conversion helper


def create_dataframe(
    task_name,
    model,
    dl,
    target_uids=None,
    max_batches=None,
    bert_model="bert-base-uncased",
):
    """Create dataframe with datapoint, predicted score, and true label.

    Args:
        task_name: task to create evaluation information for
        model: MetalModel object of model to evaluate
        dl: DataLoader object for task_name task
        target_uids: uids to evaluate on
        max_batches: number of batches to eval before stopping (useful for large datasets)

    Returns:
        DataFrame object: info about datapoints, labels, score
    """
    if task_name == "MNLI":
        raise NotImplementedError("We currently assume binary tasks")

    # Use the BERT tokenizer to convert token ids back into sentences
    data = {"sentence1": [], "sentence2": [], "label": [], "score": [], "uid": []}
    do_lower_case = "uncased" in bert_model
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    # Create a list of examples and associated predicted score and true label
    count = 0
    all_uids = dl.dataset.uids

    # assuming data_loader batch_size=1
    for (x, y), uid in tqdm(zip(list(dl), all_uids)):
        if target_uids and uid not in target_uids:
            continue

        tokens_idx = x[0][0]
        tokens = tokenizer.convert_ids_to_tokens(tokens_idx.numpy())
        phrases = (
            " ".join(tokens).replace("[PAD]", "").replace("[CLS]", "").split("[SEP]")
        )
        data["sentence1"] += [phrases[0]]
        if len(phrases) > 1:
            data["sentence2"] += [phrases[1]]
        else:
            data["sentence2"] += ["NA"]
        scores = np.array(model.calculate_probs(x, [task_name])[task_name])[:, 0]

        # Score is the predicted probabilistic label, label is the ground truth
        data["score"] += list(scores)
        data["label"] += list(y[task_name].numpy())
        data["uid"].append(uid)
        count += 1
        if max_batches and count > max_batches:
            break

    # Create DataFrame with datapoint, score, label, pred, uid
    df_error = pd.DataFrame(
        data, columns=["sentence1", "sentence2", "score", "label", "uid"]
    )
    df_error["label"] = convert_labels(df_error["label"].values, "categorical", "onezero")
    df_error["pred"] = 1 * (df_error["score"] > 0.5)
    df_error["is_wrong"] = df_error["pred"] != df_error["label"]
    return df_error
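# Hypothetical usage sketch for create_dataframe; "RTE" (a binary GLUE task),
# trained_model, and task_dl are placeholders for a real task name, a MetalModel,
# and a batch_size=1 DataLoader.
df = create_dataframe("RTE", trained_model, task_dl, max_batches=500)
errors = df[df["is_wrong"]]  # rows where the thresholded prediction disagrees with the label
print(errors[["sentence1", "sentence2", "score", "label"]].head())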