def run_on_all_states(f, index_slice=None):
    if index_slice is not None:
        states = list(us.STATES)[index_slice]
    else:
        states = list(us.STATES)
    run_task = catch_errors(f)
    results = [run_task(state) for state in states]
    successes = sum(result is Result.Success for result in results)
    errors = sum(result is Result.Error for result in results)
    printer = Printer()
    printer.info("Final result:")
    printer.info(f"{successes} were created successfully. {errors} errored.")
    printer.table(
        list(
            zip(
                [name for name in states],
                [
                    str(result) if result is not None else "Error"
                    for result in results
                ],
            )
        ),
        header=("State", "Created"),
        divider=True,
    )
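# The snippet above relies on a `Result` enum and a `catch_errors` decorator
# defined elsewhere. A minimal sketch of what they could look like, assuming
# each task either succeeds or raises; these definitions are illustrative
# assumptions, not the original implementations:
import functools
from enum import Enum

from wasabi import Printer


class Result(Enum):
    Success = "success"
    Error = "error"


def catch_errors(f):
    """Wrap a task so exceptions are reported and mapped to Result.Error."""

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        try:
            f(*args, **kwargs)
            return Result.Success
        except Exception as err:
            Printer().fail(f"Task failed: {err}")
            return Result.Error

    return wrapper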
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and what they assign and require, as
    well as any problems if available.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems".
    """
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
    for i, (name, pipe) in enumerate(nlp.pipeline):
        requires = getattr(pipe, "requires", [])
        assigns = getattr(pipe, "assigns", [])
        retok = getattr(pipe, "retokenizes", False)
        overview.append((i, name, requires, assigns, retok))
        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
    msg.divider("Pipeline Overview")
    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
    msg.table(overview, header=header, divider=True, multiline=True)
    n_problems = sum(len(p) for p in problems.values())
    if any(p for p in problems.values()):
        msg.divider("Problems ({})".format(n_problems))
        for name, problem in problems.items():
            if problem:
                problem = ", ".join(problem)
                msg.warn("'{}' requirements not met: {}".format(name, problem))
    else:
        msg.good("No problems found.")
    if no_print:
        return {"overview": overview, "problems": problems}
def evaluate(
    model,
    data_path,
    gpu_id=-1,
    gold_preproc=False,
    displacy_path=None,
    displacy_limit=25,
    return_scores=False,
):
    """
    Evaluate a model. To render a sample of parses in an HTML file, set an
    output directory as the displacy_path argument.
    """
    msg = Printer()
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
    results = {
        "Time": "%.2f s" % (end - begin),
        "Words": nwords,
        "Words/s": "%.0f" % (nwords / (end - begin)),
        "TOK": "%.2f" % scorer.token_acc,
        "POS": "%.2f" % scorer.tags_acc,
        "UAS": "%.2f" % scorer.uas,
        "LAS": "%.2f" % scorer.las,
        "NER P": "%.2f" % scorer.ents_p,
        "NER R": "%.2f" % scorer.ents_r,
        "NER F": "%.2f" % scorer.ents_f,
    }
    msg.table(results, title="Results")
    if displacy_path:
        docs, golds = zip(*dev_docs)
        render_deps = "parser" in nlp.meta.get("pipeline", [])
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
        )
        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
    if return_scores:
        return scorer.scores
def info(
    model: Optional[str] = None,
    *,
    markdown: bool = False,
    silent: bool = True,
    exclude: Optional[List[str]] = None,
) -> Union[str, dict]:
    msg = Printer(no_print=silent, pretty=not silent)
    if not exclude:
        exclude = []
    if model:
        title = f"Info about pipeline '{model}'"
        data = info_model(model, silent=silent)
    else:
        title = "Info about spaCy"
        data = info_spacy()
    raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
    if "Pipelines" in data and isinstance(data["Pipelines"], dict):
        data["Pipelines"] = ", ".join(
            f"{n} ({v})" for n, v in data["Pipelines"].items()
        )
    markdown_data = get_markdown(data, title=title, exclude=exclude)
    if markdown:
        if not silent:
            print(markdown_data)
        return markdown_data
    if not silent:
        table_data = {k: v for k, v in data.items() if k not in exclude}
        msg.table(table_data, title=title)
    return raw_data
def print_textcats_auc_per_cat(
    msg: Printer, scores: Dict[str, Dict[str, float]]
) -> None:
    msg.table(
        [(k, f"{v:.2f}") for k, v in scores.items()],
        header=("", "ROC AUC"),
        aligns=("l", "r"),
        title="Textcat ROC AUC (per label)",
    )
def print_prf_per_type(
    msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
) -> None:
    data = [
        (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
        for k, v in scores.items()
    ]
    msg.table(
        data,
        header=("", "P", "R", "F"),
        aligns=("l", "r", "r", "r"),
        title=f"{name} (per {type})",
    )
def print_prf_per_type(
    msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
) -> None:
    data = []
    for key, value in scores.items():
        row = [key]
        for k in ("p", "r", "f"):
            v = value[k]
            row.append(f"{v * 100:.2f}" if isinstance(v, (int, float)) else v)
        data.append(row)
    msg.table(
        data,
        header=("", "P", "R", "F"),
        aligns=("l", "r", "r", "r"),
        title=f"{name} (per {type})",
    )
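# A minimal usage sketch for the two helpers above. The nested layout of
# `scores` assumed here (per-label dicts with "p"/"r"/"f" fractions between
# 0 and 1, which the helpers scale to percentages) is inferred from the code;
# the labels and values are made up:
from wasabi import Printer

example_scores = {
    "PERSON": {"p": 0.91, "r": 0.88, "f": 0.89},
    "ORG": {"p": 0.84, "r": 0.79, "f": 0.81},
}
print_prf_per_type(Printer(), example_scores, name="NER", type="type")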
def info(model=None, markdown=False, silent=False):
    """
    Print info about the spaCy installation. If a model shortcut link is
    specified as an argument, print model information. The --markdown flag
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    msg = Printer()
    if model:
        if util.is_package(model):
            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail("Can't find model meta.json", meta_path, exits=1)
        meta = srsly.read_json(meta_path)
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
        else:
            meta["source"] = path2str(model_path)
        if not silent:
            title = "Info about model '{}'".format(model)
            model_meta = {
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    data = {
        "spaCy version": about.__version__,
        "Location": path2str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": list_models(),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
def evaluate(self, data: List[Example]):
    msg = Printer()
    formatted_data, _ = self._format_data(data)
    sc = self.nlp.evaluate(formatted_data, batch_size=64)
    msg.divider("Recognizer Results")
    result = [
        ("Precision", f"{sc.ents_p:.3f}"),
        ("Recall", f"{sc.ents_r:.3f}"),
        ("F-Score", f"{sc.ents_f:.3f}"),
    ]
    msg.table(result)

    table_data = []
    for label, scores in sorted(sc.ents_per_type.items(), key=lambda tup: tup[0]):
        table_data.append(
            (label, f"{scores['p']:.3f}", f"{scores['r']:.3f}", f"{scores['f']:.3f}")
        )
    header = ("Label", "Precision", "Recall", "F-Score")
    msg.table(table_data, header=header, divider=True)
    return sc
def evaluate(
    model: str,
    data_path: Path,
    output: Optional[Path] = None,
    use_gpu: int = -1,
    gold_preproc: bool = False,
    displacy_path: Optional[Path] = None,
    displacy_limit: int = 25,
    silent: bool = True,
    spans_key: str = "sc",
) -> Dict[str, Any]:
    msg = Printer(no_print=silent, pretty=not silent)
    fix_random_seed()
    setup_gpu(use_gpu, silent=silent)
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = Corpus(data_path, gold_preproc=gold_preproc)
    nlp = util.load_model(model)
    dev_dataset = list(corpus(nlp))
    scores = nlp.evaluate(dev_dataset)
    metrics = {
        "TOK": "token_acc",
        "TAG": "tag_acc",
        "POS": "pos_acc",
        "MORPH": "morph_acc",
        "LEMMA": "lemma_acc",
        "UAS": "dep_uas",
        "LAS": "dep_las",
        "NER P": "ents_p",
        "NER R": "ents_r",
        "NER F": "ents_f",
        "TEXTCAT": "cats_score",
        "SENT P": "sents_p",
        "SENT R": "sents_r",
        "SENT F": "sents_f",
        "SPAN P": f"spans_{spans_key}_p",
        "SPAN R": f"spans_{spans_key}_r",
        "SPAN F": f"spans_{spans_key}_f",
        "SPEED": "speed",
    }
    results = {}
    data = {}
    for metric, key in metrics.items():
        if key in scores:
            if key == "cats_score":
                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
            if isinstance(scores[key], (int, float)):
                if key == "speed":
                    results[metric] = f"{scores[key]:.0f}"
                else:
                    results[metric] = f"{scores[key]*100:.2f}"
            else:
                results[metric] = "-"
            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]

    msg.table(results, title="Results")
    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
        docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
        render_deps = "parser" in factory_names
        render_ents = "ner" in factory_names
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
        )
        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)

    if output_path is not None:
        srsly.write_json(output_path, data)
        msg.good(f"Saved results to {output_path}")
    return data
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    msg = Printer()
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
    incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
    incompat_models.update(
        [d["name"] for _, d in model_links.items() if not d["compat"]]
    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
class PrecisionRecallFMeasure(BaseMetric, ClassNursery):
    def __init__(self, idx2labelname_mapping: Optional[Dict[int, str]] = None):
        """
        Parameters
        ----------
        idx2labelname_mapping : Dict[int, str]
            Mapping from index to label. If this is not provided, the class
            indices are used in all the reports.
        """
        super(PrecisionRecallFMeasure, self).__init__()
        self.idx2labelname_mapping = idx2labelname_mapping
        self.msg_printer = Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils(
            idx2labelname_mapping=idx2labelname_mapping
        )

        # Set up counters to calculate true positives, false positives,
        # false negatives and true negatives.
        # The keys are the different class indices in the dataset and the
        # values are the number of true positives, false positives,
        # false negatives and true negatives for the dataset.
        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}

    def print_confusion_metrics(
        self,
        predicted_probs: torch.FloatTensor,
        labels: torch.LongTensor,
        labels_mask: Optional[torch.ByteTensor] = None,
    ) -> None:
        """Prints the confusion matrix.

        Parameters
        ----------
        predicted_probs : torch.FloatTensor
            Predicted probabilities ``[batch_size, num_classes]``
        labels : torch.LongTensor
            True labels of the size ``[batch_size, 1]``
        labels_mask : Optional[torch.ByteTensor]
            Labels mask indicating 1 in those places where the true label is
            ignored, otherwise 0. It should be of the same size as labels.
        """
        assert predicted_probs.ndimension() == 2, self.msg_printer.fail(
            "The predicted probs should "
            "have 2 dimensions. The probs "
            "that you passed have shape "
            "{0}".format(predicted_probs.size())
        )

        assert labels.ndimension() == 2, self.msg_printer.fail(
            "The labels should have 2 dimensions. "
            "The labels that you passed have shape "
            "{0}".format(labels.size())
        )

        if labels_mask is None:
            labels_mask = torch.zeros_like(labels).type(torch.ByteTensor)

        # TODO: for now k=1, change it to different number of ks
        top_probs, top_indices = predicted_probs.topk(k=1, dim=1)

        # convert to 1d numpy
        top_indices_numpy = top_indices.cpu().numpy().tolist()

        # convert labels to 1 dimension
        true_labels_numpy = labels.cpu().numpy().tolist()

        confusion_mtrx, classes = self.classification_metrics_utils.get_confusion_matrix_and_labels(
            predicted_tag_indices=top_indices_numpy,
            true_tag_indices=true_labels_numpy,
            masked_label_indices=labels_mask,
        )

        if self.idx2labelname_mapping is not None:
            classes_with_names = [
                f"cls_{class_}({self.idx2labelname_mapping[class_]})"
                for class_ in classes
            ]
        else:
            classes_with_names = classes

        assert (
            len(classes) == confusion_mtrx.shape[1]
        ), f"len(classes) = {len(classes)} confusion matrix shape {confusion_mtrx.shape}"

        header = [f"{class_}" for class_ in classes]
        header.insert(0, "pred(cols)/true(rows)")

        confusion_mtrx = pd.DataFrame(confusion_mtrx)
        confusion_mtrx.insert(0, "class_name", classes_with_names)

        self.msg_printer.table(
            data=confusion_mtrx.values.tolist(), header=header, divider=True
        )

    def calc_metric(
        self, iter_dict: Dict[str, Any], model_forward_dict: Dict[str, Any]
    ) -> None:
        """Updates the values being tracked for calculating the metric.

        For precision, recall and F-measure, we update the true positives,
        false positives and false negatives of the different classes being
        tracked.

        Parameters
        ----------
        iter_dict : Dict[str, Any]
            The ``iter_dict`` from the dataset is expected to have ``label``,
            which holds the labels for the instances. They are usually of the
            size ``[batch_size]``. Optionally there can be a ``label_mask`` of
            the size ``[batch_size]``. The ``label_mask`` is 1 where the label
            should be masked, otherwise 0.
        model_forward_dict : Dict[str, Any]
            The dictionary obtained after a forward pass. The forward pass is
            expected to produce ``normalized_probs``, usually of the size
            ``[batch_size, num_classes]``.
        """
        normalized_probs = model_forward_dict["normalized_probs"]
        labels = iter_dict["label"]
        labels_mask = iter_dict.get("label_mask")
        if labels_mask is None:
            labels_mask = torch.zeros_like(labels).type(torch.ByteTensor)

        normalized_probs = normalized_probs.cpu()
        labels = labels.cpu()

        assert normalized_probs.ndimension() == 2, self.msg_printer.fail(
            "The predicted probs should "
            "have 2 dimensions. The probs "
            "that you passed have shape "
            "{0}".format(normalized_probs.size())
        )

        assert labels.ndimension() == 2, self.msg_printer.fail(
            "The labels should have 2 dimensions. "
            "The labels that you passed have shape "
            "{0}".format(labels.size())
        )

        # TODO: for now k=1, change it to different number of ks
        top_probs, top_indices = normalized_probs.topk(k=1, dim=1)

        # convert to 1d numpy
        top_indices_numpy = top_indices.cpu().numpy().tolist()

        # convert labels to 1 dimension
        true_labels_numpy = labels.cpu().numpy().tolist()

        labels_mask = labels_mask.tolist()

        confusion_mtrx, classes = self.classification_metrics_utils.get_confusion_matrix_and_labels(
            true_tag_indices=true_labels_numpy,
            predicted_tag_indices=top_indices_numpy,
            masked_label_indices=labels_mask,
        )

        # This seems to be the correct way to calculate tps, fps and fns.
        # See https://stackoverflow.com/a/43331484/2704763 (checked 18 July 2019).

        # calculate tps
        tps = np.around(np.diag(confusion_mtrx), decimals=4)

        # calculate fps
        fps = np.around(np.sum(confusion_mtrx, axis=0) - tps, decimals=4)

        # calculate fns
        fns = np.around(np.sum(confusion_mtrx, axis=1) - tps, decimals=4)

        tps = tps.tolist()
        fps = fps.tolist()
        fns = fns.tolist()

        class_tps_mapping = dict(zip(classes, tps))
        class_fps_mapping = dict(zip(classes, fps))
        class_fns_mapping = dict(zip(classes, fns))

        self.tp_counter = merge_dictionaries_with_sum(
            self.tp_counter, class_tps_mapping
        )
        self.fp_counter = merge_dictionaries_with_sum(
            self.fp_counter, class_fps_mapping
        )
        self.fn_counter = merge_dictionaries_with_sum(
            self.fn_counter, class_fns_mapping
        )

    def get_metric(self) -> Dict[str, Any]:
        """Returns the different values being tracked to calculate precision,
        recall and F-measure.

        Returns
        -------
        Dict[str, Any]
            A dictionary with the following key-value pairs:

            precision: Dict[str, float]
                The precision for the different classes
            recall: Dict[str, float]
                The recall values for the different classes
            fscore: Dict[str, float]
                The fscore values for the different classes
            num_tp: Dict[str, int]
                The number of true positives for the different classes
            num_fp: Dict[str, int]
                The number of false positives for the different classes
            num_fn: Dict[str, int]
                The number of false negatives for the different classes
            macro_precision: float
                The macro precision value considering all the different classes
            macro_recall: float
                The macro recall value considering all the different classes
            macro_fscore: float
                The macro fscore value considering all the different classes
            micro_precision: float
                The micro precision value considering all the different classes
            micro_recall: float
                The micro recall value considering all the different classes
            micro_fscore: float
                The micro fscore value considering all the different classes
        """
        precision_dict, recall_dict, fscore_dict = self.classification_metrics_utils.get_prf_from_counters(
            tp_counter=self.tp_counter,
            fp_counter=self.fp_counter,
            fn_counter=self.fn_counter,
        )

        # For a detailed discussion on micro and macro scores, please follow
        # https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin

        # micro scores
        micro_precision, micro_recall, micro_fscore = self.classification_metrics_utils.get_micro_prf_from_counters(
            tp_counter=self.tp_counter,
            fp_counter=self.fp_counter,
            fn_counter=self.fn_counter,
        )

        # macro scores
        macro_precision, macro_recall, macro_fscore = self.classification_metrics_utils.get_macro_prf_from_prf_dicts(
            precision_dict=precision_dict,
            recall_dict=recall_dict,
            fscore_dict=fscore_dict,
        )

        return {
            "precision": precision_dict,
            "recall": recall_dict,
            "fscore": fscore_dict,
            "num_tp": self.tp_counter,
            "num_fp": self.fp_counter,
            "num_fn": self.fn_counter,
            "macro_precision": macro_precision,
            "macro_recall": macro_recall,
            "macro_fscore": macro_fscore,
            "micro_precision": micro_precision,
            "micro_recall": micro_recall,
            "micro_fscore": micro_fscore,
        }

    def reset(self) -> None:
        """Resets all the counters.

        Resets the ``tp_counter`` (true positives), ``fp_counter``
        (false positives), ``fn_counter`` (false negatives) and
        ``tn_counter`` (true negatives).
        """
        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}

    def report_metrics(self, report_type="wasabi"):
        """Reports metrics in a printable format.

        Parameters
        ----------
        report_type : str
            Select one of ``[wasabi, paper]``.
            If ``wasabi``, returns a printable table that represents the
            precision, recall and F-measure for the different classes.
            If ``paper``, returns just the fscores per class followed by the
            micro and macro fscores.
        """
        accuracy_metrics = self.get_metric()
        precision = accuracy_metrics["precision"]
        recall = accuracy_metrics["recall"]
        fscore = accuracy_metrics["fscore"]
        macro_precision = accuracy_metrics["macro_precision"]
        macro_recall = accuracy_metrics["macro_recall"]
        macro_fscore = accuracy_metrics["macro_fscore"]
        micro_precision = accuracy_metrics["micro_precision"]
        micro_recall = accuracy_metrics["micro_recall"]
        micro_fscore = accuracy_metrics["micro_fscore"]

        if report_type == "wasabi":
            table = self.classification_metrics_utils.generate_table_report_from_counters(
                tp_counter=self.tp_counter,
                fp_counter=self.fp_counter,
                fn_counter=self.fn_counter,
            )
            return table

        elif report_type == "paper":
            # Refer to the paper "Logical Structure Recovery in Scholarly
            # Articles with Rich Document Features", Table 2. This generates
            # just the fscores and returns them.
            class_nums = fscore.keys()
            class_nums = sorted(class_nums, reverse=False)
            fscores = [fscore[class_num] for class_num in class_nums]
            fscores.extend([micro_fscore, macro_fscore])
            return fscores
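# For reference, the arithmetic that the tp/fp/fn counters above ultimately
# feed into. This is a standalone sketch of the standard per-class
# precision/recall/F1 formulas, not the ClassificationMetricsUtils
# implementation itself:
def prf_from_counts(tp: float, fp: float, fn: float):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fscore = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )
    return precision, recall, fscore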
class PrecisionRecallFMeasure(BaseMetric, ClassNursery):
    def __init__(self, datasets_manager: DatasetsManager):
        """
        Parameters
        ----------
        datasets_manager : DatasetsManager
            The datasets manager managing the labels and other information.
        """
        super(PrecisionRecallFMeasure, self).__init__(
            datasets_manager=datasets_manager
        )
        self.datasets_manager = datasets_manager
        self.idx2labelname_mapping = None
        self.msg_printer = Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils()
        self.label_namespace = self.datasets_manager.label_namespaces[0]
        self.normalized_probs_namespace = "normalized_probs"
        self.label_numericalizer = self.datasets_manager.namespace_to_numericalizer[
            self.label_namespace
        ]

        # Set up counters to calculate true positives, false positives,
        # false negatives and true negatives.
        # The keys are the different class indices in the dataset and the
        # values are the number of true positives, false positives,
        # false negatives and true negatives for the dataset.
        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}

    def print_confusion_metrics(
        self,
        predicted_probs: torch.FloatTensor,
        labels: torch.LongTensor,
        labels_mask: Optional[torch.ByteTensor] = None,
    ) -> None:
        """Prints the confusion matrix.

        Parameters
        ----------
        predicted_probs : torch.FloatTensor
            Predicted probabilities ``[batch_size, num_classes]``
        labels : torch.LongTensor
            True labels of the size ``[batch_size, 1]``
        labels_mask : Optional[torch.ByteTensor]
            Labels mask indicating 1 in those places where the true label is
            ignored, otherwise 0. It should be of the same size as labels.
        """
        assert predicted_probs.ndimension() == 2, self.msg_printer.fail(
            "The predicted probs should "
            "have 2 dimensions. The probs "
            "that you passed have shape "
            "{0}".format(predicted_probs.size())
        )

        assert labels.ndimension() == 2, self.msg_printer.fail(
            "The labels should have 2 dimensions. "
            "The labels that you passed have shape "
            "{0}".format(labels.size())
        )

        if labels_mask is None:
            labels_mask = torch.zeros_like(labels, dtype=torch.bool)

        # TODO: for now k=1, change it to different number of ks
        top_probs, top_indices = predicted_probs.topk(k=1, dim=1)

        # convert to 1d numpy
        top_indices_numpy = top_indices.cpu().numpy().tolist()

        # convert labels to 1 dimension
        true_labels_numpy = labels.cpu().numpy().tolist()

        (
            confusion_mtrx,
            classes,
        ) = self.classification_metrics_utils.get_confusion_matrix_and_labels(
            predicted_tag_indices=top_indices_numpy,
            true_tag_indices=true_labels_numpy,
            true_masked_label_indices=labels_mask,
        )

        if self.idx2labelname_mapping is not None:
            classes_with_names = [
                f"cls_{class_}({self.idx2labelname_mapping[class_]})"
                for class_ in classes
            ]
        else:
            classes_with_names = classes

        assert (
            len(classes) == confusion_mtrx.shape[1]
        ), f"len(classes) = {len(classes)} confusion matrix shape {confusion_mtrx.shape}"

        header = [f"{class_}" for class_ in classes]
        header.insert(0, "pred(cols)/true(rows)")

        confusion_mtrx = pd.DataFrame(confusion_mtrx)
        confusion_mtrx.insert(0, "class_name", classes_with_names)

        self.msg_printer.table(
            data=confusion_mtrx.values.tolist(), header=header, divider=True
        )

    def calc_metric(
        self,
        lines: List[Line],
        labels: List[Label],
        model_forward_dict: Dict[str, Any],
    ) -> None:
        """Updates the values being tracked for calculating the metric.

        For precision, recall and F-measure, we update the true positives,
        false positives and false negatives of the different classes being
        tracked.

        Parameters
        ----------
        lines : List[Line]
            A list of lines.
        labels : List[Label]
            A list of labels. This has to be the label used for
            classification. Refer to the documentation of ``Label`` for more
            information.
        model_forward_dict : Dict[str, Any]
            The dictionary obtained after a forward pass. The forward pass is
            expected to produce ``normalized_probs``, usually of the size
            ``[batch_size, num_classes]``.
        """
        normalized_probs = model_forward_dict[self.normalized_probs_namespace]

        labels_tensor = []
        for label in labels:
            tokens = label.tokens[self.label_namespace]
            tokens = [tok.text for tok in tokens]
            numericalized_instance = self.label_numericalizer.numericalize_instance(
                instance=tokens
            )
            labels_tensor.extend(numericalized_instance)

        labels_tensor = torch.LongTensor(labels_tensor)
        labels_tensor = labels_tensor.view(-1, 1)
        labels_mask = torch.zeros_like(labels_tensor).type(torch.ByteTensor)

        normalized_probs = normalized_probs.cpu()

        assert normalized_probs.ndimension() == 2, self.msg_printer.fail(
            "The predicted probs should "
            "have 2 dimensions. The probs "
            "that you passed have shape "
            "{0}".format(normalized_probs.size())
        )

        assert labels_tensor.ndimension() == 2, self.msg_printer.fail(
            "The labels should have 2 dimensions. "
            "The labels that you passed have shape "
            "{0}".format(labels_tensor.size())
        )

        # TODO: for now k=1, change it to different number of ks
        top_probs, top_indices = normalized_probs.topk(k=1, dim=1)

        # convert to 1d numpy
        top_indices_numpy = top_indices.cpu().numpy().tolist()

        # convert labels to 1 dimension
        true_labels_numpy = labels_tensor.cpu().numpy().tolist()

        labels_mask = labels_mask.tolist()

        (
            confusion_mtrx,
            classes,
        ) = self.classification_metrics_utils.get_confusion_matrix_and_labels(
            true_tag_indices=true_labels_numpy,
            predicted_tag_indices=top_indices_numpy,
            true_masked_label_indices=labels_mask,
        )

        # This seems to be the correct way to calculate tps, fps and fns.
        # See https://stackoverflow.com/a/43331484/2704763 (checked 18 July 2019).

        # calculate tps
        tps = np.around(np.diag(confusion_mtrx), decimals=4)

        # calculate fps
        fps = np.around(np.sum(confusion_mtrx, axis=0) - tps, decimals=4)

        # calculate fns
        fns = np.around(np.sum(confusion_mtrx, axis=1) - tps, decimals=4)

        tps = tps.tolist()
        fps = fps.tolist()
        fns = fns.tolist()

        class_tps_mapping = dict(zip(classes, tps))
        class_fps_mapping = dict(zip(classes, fps))
        class_fns_mapping = dict(zip(classes, fns))

        self.tp_counter = merge_dictionaries_with_sum(
            self.tp_counter, class_tps_mapping
        )
        self.fp_counter = merge_dictionaries_with_sum(
            self.fp_counter, class_fps_mapping
        )
        self.fn_counter = merge_dictionaries_with_sum(
            self.fn_counter, class_fns_mapping
        )

    def get_metric(self) -> Dict[str, Any]:
        """Returns the different values being tracked to calculate precision,
        recall and F-measure.

        Returns
        -------
        Dict[str, Any]
            A dictionary with the following key-value pairs for every
            namespace:

            precision: Dict[str, float]
                The precision for the different classes
            recall: Dict[str, float]
                The recall values for the different classes
            fscore: Dict[str, float]
                The fscore values for the different classes
            num_tp: Dict[str, int]
                The number of true positives for the different classes
            num_fp: Dict[str, int]
                The number of false positives for the different classes
            num_fn: Dict[str, int]
                The number of false negatives for the different classes
            macro_precision: float
                The macro precision value considering all the different classes
            macro_recall: float
                The macro recall value considering all the different classes
            macro_fscore: float
                The macro fscore value considering all the different classes
            micro_precision: float
                The micro precision value considering all the different classes
            micro_recall: float
                The micro recall value considering all the different classes
            micro_fscore: float
                The micro fscore value considering all the different classes
        """
        (
            precision_dict,
            recall_dict,
            fscore_dict,
        ) = self.classification_metrics_utils.get_prf_from_counters(
            tp_counter=self.tp_counter,
            fp_counter=self.fp_counter,
            fn_counter=self.fn_counter,
        )

        # For a detailed discussion on micro and macro scores, please follow
        # https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin

        # micro scores
        (
            micro_precision,
            micro_recall,
            micro_fscore,
        ) = self.classification_metrics_utils.get_micro_prf_from_counters(
            tp_counter=self.tp_counter,
            fp_counter=self.fp_counter,
            fn_counter=self.fn_counter,
        )

        # macro scores
        (
            macro_precision,
            macro_recall,
            macro_fscore,
        ) = self.classification_metrics_utils.get_macro_prf_from_prf_dicts(
            precision_dict=precision_dict,
            recall_dict=recall_dict,
            fscore_dict=fscore_dict,
        )

        metric = {
            self.label_namespace: {
                "precision": precision_dict,
                "recall": recall_dict,
                "fscore": fscore_dict,
                "num_tp": self.tp_counter,
                "num_fp": self.fp_counter,
                "num_fn": self.fn_counter,
                "macro_precision": macro_precision,
                "macro_recall": macro_recall,
                "macro_fscore": macro_fscore,
                "micro_precision": micro_precision,
                "micro_recall": micro_recall,
                "micro_fscore": micro_fscore,
            }
        }
        return metric

    def reset(self) -> None:
        """Resets all the counters.

        Resets the ``tp_counter`` (true positives), ``fp_counter``
        (false positives), ``fn_counter`` (false negatives) and
        ``tn_counter`` (true negatives).
        """
        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}

    def report_metrics(self, report_type="wasabi"):
        """Reports metrics in a printable format.

        Parameters
        ----------
        report_type : str
            Select one of ``[wasabi, paper]``.
            If ``wasabi``, returns a printable table that represents the
            precision, recall and F-measure for the different classes.
        """
        if report_type == "wasabi":
            table = self.classification_metrics_utils.generate_table_report_from_counters(
                tp_counter=self.tp_counter,
                fp_counter=self.fp_counter,
                fn_counter=self.fn_counter,
            )
            return {self.label_namespace: table}
def top_prediction_errors(
    recognizer: EntityRecognizer,
    data: List[Example],
    labels: List[str] = None,
    n: int = None,
    k: int = None,
    exclude_fp: bool = False,
    exclude_fn: bool = False,
    verbose: bool = False,
) -> List[PredictionError]:
    """Get a sorted list of examples your model is worst at predicting.

    Args:
        recognizer (EntityRecognizer): An instance of EntityRecognizer
        data (List[Example]): List of annotated Examples
        labels (List[str], optional): List of labels to get errors for.
            Defaults to the labels property of `recognizer`.
        n (int, optional): If set, only use the top n examples from data.
        k (int, optional): If set, return the top k prediction errors,
            otherwise the whole list.
        exclude_fp (bool, optional): Flag to exclude False Positive errors.
        exclude_fn (bool, optional): Flag to exclude False Negative errors.
        verbose (bool, optional): Show verbose output.

    Returns:
        List[PredictionError]: List of Prediction Errors your model is making,
            sorted by the spans your model has the most trouble with.
    """
    labels_ = labels or recognizer.labels
    if n is not None:
        data = data[:n]
    n_examples = len(data)
    texts = (e.text for e in data)
    anns = (e.spans for e in data)

    errors = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))  # type: ignore
    error_examples: DefaultDict[str, List[PredictionErrorExamplePair]] = defaultdict(list)
    n_errors = 0

    for orig_example, pred_example, ann in zip(data, recognizer.predict(texts), anns):
        if k is not None and n_errors > k:
            break

        pred_error_example_pair = PredictionErrorExamplePair(
            original=orig_example, predicted=pred_example
        )

        cand = set([(s.start, s.end, s.label) for s in pred_example.spans])
        gold = set([(s.start, s.end, s.label) for s in ann])

        fp_diff = cand - gold
        fn_diff = gold - cand

        seen = set()

        if fp_diff and not exclude_fp:
            for fp in fp_diff:
                gold_ent = None
                for ge in gold:
                    if fp[0] == ge[0] and fp[1] == ge[1]:
                        gold_ent = ge
                        break
                if gold_ent:
                    start, end, label = gold_ent
                    text = pred_example.text[start:end]
                    false_label = fp[2]
                    errors[label][text][false_label] += 1
                    error_examples[f"{text}||{label}||{false_label}"].append(
                        pred_error_example_pair
                    )
                else:
                    start, end, false_label = fp
                    text = pred_example.text[start:end]
                    errors[NONE][text][false_label] += 1
                    error_examples[f"{text}||{NONE}||{false_label}"].append(
                        pred_error_example_pair
                    )
                n_errors += 1
                seen.add((start, end))

        if fn_diff and not exclude_fn:
            for fn in fn_diff:
                start, end, label = fn
                if (start, end) not in seen:
                    text = pred_example.text[start:end]
                    errors[label][text][NONE] += 1
                    error_examples[f"{text}||{label}||{NONE}"].append(
                        pred_error_example_pair
                    )
                    n_errors += 1

    ranked_errors_map: Dict[str, PredictionError] = {}

    for label, errors_per_label in errors.items():
        for error_text, error_labels in errors_per_label.items():
            for error_label, count in error_labels.items():
                pe_hash = f"{error_text}||{label}||{error_label}"
                ranked_errors_map[pe_hash] = PredictionError(
                    text=error_text,
                    true_label=label,
                    pred_label=error_label,
                    count=count,
                    examples=error_examples[f"{error_text}||{label}||{error_label}"],
                )

    ranked_errors: List[PredictionError] = sorted(
        list(ranked_errors_map.values()),
        key=lambda error: error.count,
        reverse=True,  # type: ignore
    )
    error_texts = set()
    for re in ranked_errors:
        if re.examples:
            for e in re.examples:
                error_texts.add(e.original.text)

    error_rate = round(len(error_texts) / len(data), 2)
    if verbose:
        error_summary = {
            "N Examples": len(data),
            "N Errors": len(ranked_errors),
            "N Error Examples": len(error_texts),
            "Error Rate": error_rate,
        }
        msg = Printer()
        msg.divider("Error Analysis")
        msg.table(error_summary)

    return ranked_errors