def ranking_acc_f1(gold, outputs, probs):
    """A convenience custom function that returns accuracy, f1, and their mean
    for ranking task heads."""
    # Convert categorical gold labels in {1, 2} to one-zero labels in {1, 0}
    gold = (1 - gold) + 1
    # Threshold the ranking probabilities at 0.5 to get one-zero predictions
    outputs = 1 * (probs.reshape((-1,)) > 0.5)
    accuracy = metric_score(gold, outputs, metric="accuracy")
    f1 = metric_score(gold, outputs, metric="f1")
    return {"accuracy": accuracy, "f1": f1, "acc_f1": np.mean([accuracy, f1])}
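# A minimal, self-contained sketch (hypothetical data, numpy only) of the two
# label conversions ranking_acc_f1 performs: categorical gold labels in {1, 2}
# become one-zero labels in {1, 0}, and probabilities are thresholded at 0.5.
import numpy as np

gold = np.array([1, 2, 2, 1])  # categorical: 1 = positive, 2 = negative
probs = np.array([[0.9], [0.2], [0.7], [0.6]])
print((1 - gold) + 1)                    # [1 0 0 1] (one-zero gold)
print(1 * (probs.reshape((-1,)) > 0.5))  # [1 0 1 1] (one-zero predictions)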
def score_task(self, X, Y, t=0, metric="accuracy", verbose=True, **kwargs):
    """Scores the predictive performance of the Classifier on task t

    Args:
        X: The input for the predict_task method
        Y: A [n] or [n, 1] np.ndarray or torch.Tensor of gold labels in
            {1,...,K_t}
        t: The task index to score
        metric: The metric with which to score performance on this task
    Returns:
        The (float) score of the Classifier for the specified task and metric
    """
    Y = self._to_numpy(Y)
    Y_tp = self.predict_task(X, t=t, **kwargs)
    probs = self.predict_proba(X)[t]
    score = metric_score(Y, Y_tp, metric, ignore_in_gold=[0], probs=probs, **kwargs)
    if verbose:
        print(f"[t={t}] {metric.capitalize()}: {score:.7f}")
    return score
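# Hypothetical usage sketch for score_task (model, X_dev, and Y_dev are
# placeholders for a trained multi-task classifier and its dev-set data, not
# objects defined in this file): score task 1 with F1 instead of accuracy.
#
#   f1_t1 = model.score_task(X_dev, Y_dev, t=1, metric="f1", verbose=True)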
def _calculate_standard_metrics(self, model, data_loader, target_metrics, metrics_dict, split):
    target_standard_metrics = []
    for split_metric in target_metrics:
        metric = self.remove_split_prefix(split_metric)
        if metric in standard_metric_names:
            target_standard_metrics.append(metric)

    # Only calculate predictions if at least one standard metric requires it
    if target_standard_metrics:
        if model.multitask:
            # For multitask models, use the score method for aggregation.
            # This may be inefficient if there are multiple target metrics,
            # since we re-predict for each one.
            for metric in target_standard_metrics:
                score = model.score(data_loader, metric, verbose=False)
                metrics_dict[self.add_split_prefix(metric, split)] = score
        else:
            # For singletask models, predict once and reuse Y_probs/Y_preds
            # for all metric calculations
            Y_preds, Y, Y_probs = model._get_predictions(data_loader, return_probs=True)
            for metric in target_standard_metrics:
                score = metric_score(Y, Y_preds, metric, probs=Y_probs)
                metrics_dict[self.add_split_prefix(metric, split)] = score
    return metrics_dict
def score(self, X, Y, metric=["accuracy"], break_ties="random", verbose=True, **kwargs):
    """Scores the predictive performance of the Classifier on all tasks

    Args:
        X: The input for the predict method
        Y: An [n] or [n, 1] torch.Tensor or np.ndarray of gold labels in
            {1,...,K_t}
        metric: A metric (string) with which to score performance or a list of
            such metrics
        break_ties: How to break ties when making predictions
    Returns:
        scores: A (float) score or a list of such scores if metric is a list
    """
    Y = self._to_numpy(Y)
    Y_p = self.predict(X, break_ties=break_ties, **kwargs)

    metric_list = metric if isinstance(metric, list) else [metric]
    scores = []
    for metric in metric_list:
        score = metric_score(Y, Y_p, metric, ignore_in_gold=[0])
        scores.append(score)
        if verbose:
            print(f"{metric.capitalize()}: {score:.3f}")

    if len(scores) == 1:
        return scores[0]
    else:
        return scores
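# Hypothetical usage sketch (model, X_dev, and Y_dev are placeholders):
# passing a list of metrics returns a list of scores in the same order,
# while a single string returns a bare float.
#
#   acc = model.score(X_dev, Y_dev, metric="accuracy")
#   acc, f1 = model.score(X_dev, Y_dev, metric=["accuracy", "f1"])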
def score(self, Y, Y_probs, Y_preds, target_metrics=None):
    """Calculates and returns a metrics_dict for a given set of predictions and labels

    Args:
        Y: an [n] list of gold labels
        Y_probs: an [n] list of probabilities
        Y_preds: an [n] list of predictions
        target_metrics: a list of simple metrics to calculate
    Returns:
        a metrics_dict object of the form: {metric1: score1, ..., metricN: scoreN}

    Note that the returned metrics dict will be transformed to have full metric
    names (e.g., "accuracy" -> "foo_task/bar_payload/accuracy") in the trainer.
    """
    self.validate_target_metrics(target_metrics)

    # TODO: Tighten this up; it can be much more efficient.
    # The main issue is that we currently require Y/Y_probs/Y_preds to be lists
    # so that they can support sequence-based tasks that have arbitrary-length
    # labels. But there is certainly a way we can be more strict/certain about
    # what our data types will be and do a much more efficient slice operation
    # instead of a list comprehension.

    # Identify all examples with at least one non-zero (i.e., non-abstain) label
    active = [bool(y != 0) for y in Y]
    if sum(active) != len(active):
        Y = [y for a, y in zip(active, Y) if a]
        if Y_probs:
            Y_probs = [y for a, y in zip(active, Y_probs) if a]
        if Y_preds:
            Y_preds = [y for a, y in zip(active, Y_preds) if a]

    simple_metrics_dict = {}
    for metric in self.standard_metrics:
        # If target metrics were specified and this is not one of them, skip it
        if target_metrics and metric not in target_metrics:
            continue
        score = metric_score(Y, Y_preds, metric, probs=Y_probs)
        simple_metrics_dict[metric] = score

    for metric, custom_metric_func in self.custom_metric_map.items():
        # If target metrics were specified and this is not one of them, skip it
        if target_metrics and metric not in target_metrics:
            continue
        # If the current metric is already in simple_metrics_dict, skip it.
        # This is possible because a custom_metric_func can return multiple metrics.
        if metric in simple_metrics_dict:
            continue
        custom_metric_dict = custom_metric_func(Y, Y_preds, probs=Y_probs)
        for custom_metric, score in custom_metric_dict.items():
            if not target_metrics or custom_metric in target_metrics:
                simple_metrics_dict[custom_metric] = score

    return simple_metrics_dict
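# A self-contained sketch (toy lists, standard library only) of the abstain
# filtering above: examples whose gold label is 0 are dropped from Y, Y_probs,
# and Y_preds before any metric is computed.
Y = [1, 0, 2, 1]
Y_preds = [1, 2, 2, 2]
active = [bool(y != 0) for y in Y]
Y = [y for a, y in zip(active, Y) if a]              # [1, 2, 1]
Y_preds = [y for a, y in zip(active, Y_preds) if a]  # [1, 2, 2]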
def score(
    self,
    data,
    metric="accuracy",
    break_ties="random",
    verbose=True,
    print_confusion_matrix=True,
    **kwargs,
):
    """Scores the predictive performance of the Classifier on all tasks

    Args:
        data: a Pytorch DataLoader, Dataset, or tuple with Tensors (X, Y):
            X: The input for the predict method
            Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels in
                {1,...,k}
        metric: A metric (string) with which to score performance or a list of
            such metrics
        break_ties: A tie-breaking policy (see Classifier._break_ties())
        verbose: The verbosity for just this score method; it will not update
            the class config.
        print_confusion_matrix: Print confusion matrix (overwritten to False if
            verbose=False)
    Returns:
        scores: A (float) score or a list of such scores if kwarg metric is a list
    """
    Y_p, Y, Y_s = self._get_predictions(data, break_ties=break_ties, return_probs=True, **kwargs)

    # Evaluate on the specified metrics
    return_list = isinstance(metric, list)
    metric_list = metric if return_list else [metric]
    scores = []
    for metric in metric_list:
        score = metric_score(Y, Y_p, metric, probs=Y_s, ignore_in_gold=[0])
        scores.append(score)
        if verbose:
            if not isinstance(score, list):
                print(f"{metric.capitalize()}: {score:.7f}")
            else:
                print(f"{metric.capitalize()}: {score}")

    # Optionally print confusion matrix
    if print_confusion_matrix and verbose:
        confusion_matrix(Y, Y_p, pretty_print=True)

    # If a single metric was given as a string (not a list), return a float
    if len(scores) == 1 and not return_list:
        return scores[0]
    else:
        return scores
def score(
    self,
    X,
    Y,
    metric="accuracy",
    reduce="mean",
    break_ties="random",
    verbose=True,
    **kwargs,
):
    """Scores the predictive performance of the Classifier on all tasks

    Args:
        X: The input for the predict method
        Y: A t-length list of [n] or [n, 1] np.ndarrays or torch.Tensors of
            gold labels in {1,...,K_t}
        metric: The metric with which to score performance on each task
        reduce: How to reduce the scores of multiple tasks:
            None: return a t-length list of scores
            'mean': return the mean score across tasks
        break_ties: How to break ties when making predictions
    Returns:
        scores: A (float) score or a t-length list of such scores if reduce=None
    """
    self._check(Y, typ=list)
    Y = [self._to_numpy(Y_t) for Y_t in Y]
    Y_p = self.predict(X, break_ties=break_ties, **kwargs)
    self._check(Y_p, typ=list)

    task_scores = []
    for t, Y_tp in enumerate(Y_p):
        score = metric_score(Y[t], Y_tp, metric, ignore_in_gold=[0])
        task_scores.append(score)

    # TODO: Other options for reduce, including scoring only certain
    # primary tasks, and converting to end labels using TaskGraph...
    if reduce is None:
        score = task_scores
    elif reduce == "mean":
        score = np.mean(task_scores)
    else:
        raise Exception(f"Keyword reduce='{reduce}' not recognized.")

    if verbose:
        if reduce is None:
            for t, score_t in enumerate(score):
                print(f"{metric.capitalize()} (t={t}): {score_t:0.3f}")
        else:
            print(f"{metric.capitalize()}: {score:.3f}")

    return score
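# A self-contained sketch (toy scores, numpy only) of the reduce step: with
# reduce=None the per-task scores come back as a list; with reduce="mean"
# they collapse to a single float.
import numpy as np

task_scores = [0.90, 0.80, 0.85]
print(task_scores)           # reduce=None   -> [0.9, 0.8, 0.85]
print(np.mean(task_scores))  # reduce="mean" -> 0.85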
def score(self, X, Y, metric="f1", verbose=True):
    """Scores the predictive performance of the Classifier on one-zero labels"""
    # Convert categorical gold labels in {1, 2} to one-zero labels in {1, 0}
    Y = convert_labels(Y, "categorical", "onezero")
    Y_p = self.predict(X)

    metric_list = metric if isinstance(metric, list) else [metric]
    scores = []
    for metric in metric_list:
        score = metric_score(Y, Y_p, metric)
        scores.append(score)
        if verbose:
            print(f"{metric.capitalize()}: {score:.3f}")

    if len(scores) == 1:
        return scores[0]
    else:
        return scores
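# A minimal stand-in (hypothetical, numpy only) for the 'categorical' ->
# 'onezero' conversion that convert_labels performs here: class 1 stays 1
# and class 2 maps to 0, so downstream metrics see one-zero labels.
import numpy as np

def categorical_to_onezero(Y):
    # {1, 2} -> {1, 0}
    return 2 - np.asarray(Y)

print(categorical_to_onezero([1, 2, 2, 1]))  # [1 0 0 1]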
def score(self, probs, target_probs):
    """Calculates a metrics_dict per task from predicted and target probabilities"""
    metrics = defaultdict(dict)
    for task_idx, _ in enumerate(probs):
        probs_t = torch.tensor(probs[task_idx]).double()
        preds_t = soft_to_hard(probs_t, break_ties="random")
        target_probs_t = torch.tensor(target_probs[task_idx]).double()
        targets = soft_to_hard(target_probs_t, break_ties="random")
        for metric in METRICS_LIST:
            # Shift 0-indexed hard labels to 1-indexed (categorical) labels
            # before scoring
            metrics[self.idx_to_task[task_idx]][metric] = metric_score(
                targets + 1, preds_t + 1, metric, probs=probs_t)
    return metrics
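# A minimal stand-in (hypothetical) for soft_to_hard with break_ties="random":
# take the argmax of each probability row, choosing uniformly among tied
# maxima. Returned labels are 0-indexed, which is why the caller adds 1.
import numpy as np

def soft_to_hard_sketch(probs, rng=np.random.default_rng(0)):
    preds = []
    for row in np.asarray(probs):
        best = np.flatnonzero(row == row.max())
        preds.append(rng.choice(best))  # random tie-breaking among maxima
    return np.array(preds)

print(soft_to_hard_sketch([[0.7, 0.3], [0.5, 0.5]]))  # e.g., [0 1]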
def score(
    self,
    X,
    Y,
    metric=["accuracy"],
    break_ties="random",
    verbose=True,
    **kwargs,
):
    """Scores the predictive performance of the Classifier on all tasks

    Args:
        X: The input for the predict method
        Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels in
            {1,...,k}
        metric: A metric (string) with which to score performance or a list of
            such metrics
        break_ties: How to break ties when making predictions
        verbose: The verbosity for just this score method; it will not update
            the class config.
    Returns:
        scores: A (float) score or a list of such scores if metric is a list
    """
    Y = self._to_numpy(Y)
    Y_p = self.predict(X, break_ties=break_ties, **kwargs)

    metric_list = metric if isinstance(metric, list) else [metric]
    scores = []
    for metric in metric_list:
        score = metric_score(Y, Y_p, metric, ignore_in_gold=[0])
        scores.append(score)
        if verbose:
            print(f"{metric.capitalize()}: {score:.3f}")

    if len(scores) == 1:
        return scores[0]
    else:
        return scores
def score(
    self,
    data,
    metric="accuracy",
    validation_task=None,
    reduce="mean",
    break_ties="random",
    verbose=True,
    print_confusion_matrix=False,
    **kwargs,
):
    """Scores the predictive performance of the Classifier on all tasks

    Args:
        data: either a Pytorch Dataset, DataLoader, or tuple supplying (X, Y):
            X: The input for the predict method
            Y: A t-length list of [n] or [n, 1] np.ndarrays or torch.Tensors
                of gold labels in {1,...,K_t}
        metric: The metric with which to score performance on each task
        validation_task: int: return the score for this task number only
        reduce: How to reduce the scores of multiple tasks:
            None: return a t-length list of scores
            'mean': return the mean score across tasks
        break_ties: How to break ties when making predictions
    Returns:
        scores: A (float) score or a t-length list of such scores if reduce=None
    """
    Y_p, Y, Y_s = self._get_predictions(data, break_ties=break_ties, return_probs=True, **kwargs)

    # TODO: Handle multiple metrics...
    metric_list = metric if isinstance(metric, list) else [metric]
    if len(metric_list) > 1:
        raise NotImplementedError(
            "Multiple metrics for multi-task score() not yet supported.")
    metric = metric_list[0]

    # Return the score for task validation_task only
    if validation_task is not None:
        score = metric_score(
            Y[validation_task],
            Y_p[validation_task],
            metric,
            probs=Y_s[validation_task],
            ignore_in_gold=[0],
        )
        if verbose:
            print(f"{metric.capitalize()}: {score:.7f}")
        return score

    task_scores = []
    for t, Y_tp in enumerate(Y_p):
        score = metric_score(Y[t], Y_tp, metric, probs=Y_s[t], ignore_in_gold=[0])
        task_scores.append(score)

    # TODO: Other options for reduce, including scoring only certain
    # primary tasks, and converting to end labels using TaskGraph...
    if reduce is None:
        score = task_scores
    elif reduce == "mean":
        score = np.mean(task_scores)
    else:
        raise Exception(f"Keyword reduce='{reduce}' not recognized.")

    if verbose:
        if reduce is None:
            for t, score_t in enumerate(score):
                print(f"{metric.capitalize()} (t={t}): {score_t:0.3f}")
        else:
            print(f"{metric.capitalize()}: {score:.7f}")

    return score
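# Hypothetical usage sketch (model and dev_loader are placeholders): score
# only task 0 on a dev DataLoader instead of averaging across all tasks.
#
#   f1_t0 = model.score(dev_loader, metric="f1", validation_task=0)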
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 108  # using get_frm_output_size()

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = 'cpu'

    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]), metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model - no temporal modelling
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"], Y["dev"], n_epochs=500, log_train_every=50)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]), metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model without temporal modelling - naive model
    # L["train"]: (18850, 5), L["dev"]: (1500, 5), Y["dev"]: (1500,)
    m_per_task = L["train"].todense().shape[1]  # 5
    MRI_data_naive = {
        'Li_train': torch.FloatTensor(np.array(L["train"].todense().astype('int_'))),
        'Li_dev': torch.FloatTensor(np.array(L["dev"].todense())),
        'R_dev': Y["dev"],
    }
    MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device)

    # training naive model
    naive_model = DPLabelModel(
        m=m_per_task,
        T=1,
        edges=[],
        coverage_sets=[[0]] * m_per_task,
        mu_sharing=[[i] for i in range(m_per_task)],
        phi_sharing=[],
        device=device,
        # class_balance=MRI_data_naive['class_balance'],
        seed=0)
    optimize(naive_model,
             L_hat=MRI_data_naive['Li_train'],
             num_iter=300,
             lr=1e-3,
             momentum=0.8,
             clamp=True,
             seed=0)

    # evaluating naive model
    R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.numpy()
    # Map categorical predictions in {1, 2} back to one-zero labels in {1, 0}
    R_pred = 2 - R_pred
    for metric in ['accuracy', 'f1', 'recall', 'precision']:
        score = metric_score(MRI_data_naive['R_dev'], R_pred, metric)
        print(f"{metric.capitalize()}: {score:.3f}")

    # training label model with temporal modelling
    # reshaping dataset
    num_frames = 50
    n_patients_train = round(L["train"].todense().shape[0] / num_frames)  # 377
    n_patients_dev = round(L["dev"].todense().shape[0] / num_frames)  # 30
    Ltrain = np.reshape(np.array(L["train"].todense()), (n_patients_train, num_frames, -1))
    Ldev = np.reshape(np.array(L["dev"].todense()), (n_patients_dev, num_frames, -1))
    Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames))
    # Ltrain: (377, 50, 5), Ldev: (30, 50, 5), Ydev: (30, 50)

    # subsampling: selecting frames 3, 13, 23, 33, 43
    indices = np.linspace(2, 42, 5).astype(int)
    m_per_task = 5
    T = 5
    Ltrain_small = Ltrain[:, indices, :]  # shape (377, 5, 5)
    Ldev_small = Ldev[:, indices, :]  # shape (30, 5, 5)
    Ydev_small = Ydev[:, indices]  # shape (30, 5)
    Ltrain_small = np.reshape(Ltrain_small, ((n_patients_train * T), m_per_task))  # shape (1885, 5)
    Ldev_small = np.reshape(Ldev_small, ((n_patients_dev * T), m_per_task))  # shape (150, 5)
    Ydev_small = np.reshape(Ydev_small, ((n_patients_dev * T),))  # shape (150,)

    MRI_data_temporal = {
        'Li_train': torch.LongTensor(Ltrain_small).view(n_patients_train, (m_per_task * T)),
        'Li_dev': torch.LongTensor(Ldev_small).view(n_patients_dev, (m_per_task * T)),
        'R_dev': torch.LongTensor(Ydev_small)[::T] * (2**T - 1),
        'm': m_per_task * T,
        'T': T,
    }
    MRI_data_temporal['class_balance'] = normalize(
        (MRI_data_temporal['R_dev'].unsqueeze(1) ==
         torch.arange(2**T, device=device).unsqueeze(0)).sum(0).float(),
        dim=0,
        p=1)

    max_seed = 10
    temporal_models = [None] * max_seed
    for seed in range(max_seed):
        markov_model = DPLabelModel(
            m=m_per_task * T,
            T=T,
            edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)],
            coverage_sets=[[t] for t in range(T) for _ in range(m_per_task)],
            mu_sharing=[[t * m_per_task + i for t in range(T)]
                        for i in range(m_per_task)],
            phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i)
                          for t in range(T - 1)] for i in range(m_per_task)],
            device=device,
            class_balance=MRI_data_temporal['class_balance'],
            seed=seed)
        optimize(markov_model,
                 L_hat=MRI_data_temporal['Li_train'],
                 num_iter=1000,
                 lr=1e-5,
                 momentum=0.8,
                 clamp=True,
                 verbose=False,
                 seed=seed)
        temporal_models[seed] = markov_model

    for seed, model in enumerate(temporal_models):
        R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu())
        F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0, R_pred.cpu() > 0, 'f1')
        accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(), R_pred.cpu(), 'accuracy')
        print(f"seed={seed} accuracy={accuracy:.3f} F1={F1:.3f}")
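# A self-contained sketch (toy shapes, numpy only) of the subsampling and
# reshape above: pick 5 frames per patient with np.linspace, then flatten
# (patients, frames, LFs) into (patients * frames, LFs).
import numpy as np

Ldev = np.zeros((30, 50, 5))                 # (patients, frames, LFs)
indices = np.linspace(2, 42, 5).astype(int)  # [ 2 12 22 32 42]
Ldev_small = Ldev[:, indices, :]             # (30, 5, 5)
print(np.reshape(Ldev_small, (30 * 5, 5)).shape)  # (150, 5)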
def acc_f1(gold, outputs, **kwargs):
    """A convenience custom function that returns accuracy, f1, and their mean"""
    accuracy = metric_score(gold, outputs, metric="accuracy")
    f1 = metric_score(gold, outputs, metric="f1")
    return {"accuracy": accuracy, "f1": f1, "acc_f1": np.mean([accuracy, f1])}
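# Hypothetical usage sketch: acc_f1 follows the custom-metric convention used
# above (gold and predicted labels plus optional kwargs, returning a dict of
# named scores), so it can be called directly on toy categorical labels.
# Assumes metric_score and np are in scope as in the rest of this file.
#
#   acc_f1([1, 1, 2, 2], [1, 2, 2, 2])
#   # -> {"accuracy": 0.75, "f1": ..., "acc_f1": ...}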
def test_metric_score(self):
    gold = [1, 1, 1, 2, 2]
    pred = [1, 1, 1, 2, 1]
    acc = accuracy_score(gold, pred)
    met = metric_score(gold, pred, metric="accuracy")
    self.assertAlmostEqual(acc, met)
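# A companion test sketch (hypothetical, assuming metric_score's ignore_in_gold
# keyword drops examples whose gold label is listed, as used throughout the
# score methods above): abstains (label 0) should not count against accuracy.
def test_metric_score_ignore_in_gold(self):
    gold = [0, 1, 1, 2, 2]
    pred = [1, 1, 1, 2, 2]
    met = metric_score(gold, pred, metric="accuracy", ignore_in_gold=[0])
    self.assertAlmostEqual(met, 1.0)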