def parse(self, text, verbose=False):
    # Tokenise the text with spaCy and break it into chunks of at most
    # MAX_WORDS tokens for the model.
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    chunks = break_into_chunks(doc, max_words=self.MAX_WORDS)
    tokens = [[token.text for token in chunk] for chunk in chunks]

    # Predict labels for the tokens and pair each token with its prediction.
    preds = self.drp.predict(tokens, load_weights=True)
    flat_predictions = list(itertools.chain.from_iterable(preds))[0]
    flat_X = list(itertools.chain.from_iterable(tokens))
    rows = [i for i in zip(flat_X, flat_predictions)]

    if verbose:
        msg.divider("Token Results")
        header = ("token", "label")
        aligns = ("r", "l")
        formatted = wasabi.table(rows, header=header, divider=True, aligns=aligns)
        print(formatted)

    out = rows

    return out
def print_stats(self):
    num_instances = self.num_instances
    formatted = self.label_stats_table
    self.msg_printer.divider(f"Label Stats for Parscit {self.dataset_type} dataset")
    print(formatted)

    # Print a randomly chosen tagged instance as a sanity check
    random_int = np.random.randint(0, num_instances, size=1)[0]
    random_instance = self.word_instances[random_int]
    random_label = self.labels[random_int].split()
    assert len(random_instance) == len(random_label)
    self.msg_printer.divider(
        f"Random Instance from Parscit {self.dataset_type.capitalize()} Dataset"
    )
    tagged_string = self.tag_visualizer.visualize_tokens(
        random_instance, random_label
    )
    print(tagged_string)

    num_instances = len(self)
    other_stats_header = ["", "Value"]
    rows = [
        ("Num Instances", num_instances),
        ("Longest Instance Length", self.instance_max_len),
    ]
    other_stats_table = wasabi.table(
        data=rows, header=other_stats_header, divider=True
    )
    self.msg_printer.divider(f"Other stats for Parscit {self.dataset_type} dataset")
    print(other_stats_table)
def split_parse(self, text, return_tokens=False, verbose=False):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    chunks = break_into_chunks(doc, max_words=self.MAX_WORDS)
    tokens = [[token.text for token in chunk] for chunk in chunks]

    preds = self.drp.predict(tokens, load_weights=True)

    # If return_tokens is set, return the labelled tokens
    if return_tokens:
        flat_preds_list = list(map(itertools.chain.from_iterable, preds))
        flat_X = list(itertools.chain.from_iterable(tokens))
        rows = [i for i in zip(*[flat_X] + flat_preds_list)]

        if verbose:
            msg.divider("Token Results")
            header = tuple(["token"] + ["label"] * len(flat_preds_list))
            aligns = tuple(["r"] + ["l"] * len(flat_preds_list))
            formatted = wasabi.table(
                rows, header=header, divider=True, aligns=aligns
            )
            print(formatted)

        out = rows
    else:
        # Otherwise return references with their component attributes
        # (author, title, year) in a JSON-style format: a list of lists,
        # one per reference, where each inner list holds the
        # (token, attribute) predictions:
        # [[(token, attribute), ..., (token, attribute)], ..., [(token, attribute), ...]]
        references_components = tokens_to_reference_lists(
            tokens, spans=preds[1], components=preds[0]
        )

        if verbose:
            msg.divider("Results")
            if references_components:
                msg.good(f"Found {len(references_components)} references.")
                msg.info("Printing found references:")
                for ref in references_components:
                    msg.text(ref["Reference"], icon="check", spaced=True)
            else:
                msg.fail("Failed to find any references.")

        out = references_components

    return out
def split(self, text, return_tokens=False, verbose=False):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    chunks = break_into_chunks(doc, max_words=self.MAX_WORDS)
    tokens = [[token.text for token in chunk] for chunk in chunks]

    preds = self.drp.predict(tokens, load_weights=True)

    # If return_tokens is set, return the labelled tokens
    if return_tokens:
        flat_predictions = list(itertools.chain.from_iterable(preds))[0]
        flat_X = list(itertools.chain.from_iterable(tokens))
        rows = [i for i in zip(flat_X, flat_predictions)]

        if verbose:
            msg.divider("Token Results")
            header = ("token", "label")
            aligns = ("r", "l")
            formatted = wasabi.table(
                rows, header=header, divider=True, aligns=aligns
            )
            print(formatted)

        out = rows
    else:
        # Otherwise convert the tokens into references and return them
        refs = tokens_to_references(tokens, preds[0])

        if verbose:
            msg.divider("Results")
            if refs:
                msg.good(f"Found {len(refs)} references.")
                msg.info("Printing found references:")
                for ref in refs:
                    msg.text(ref, icon="check", spaced=True)
            else:
                msg.fail("Failed to find any references.")

        out = refs

    return out
def __init__(
    self, name: str, X: Any, Y: Any, errors: List[Dict[str, Any]] = []
) -> None:
    """Custom error for validating inputs / outputs at runtime."""
    message = f"Data validation error in '{name}'"
    type_info = f"X: {type(X)} Y: {type(Y)}"
    data = []
    for error in errors:
        err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
        data.append((err_loc, error.get("msg")))
    result = [message, type_info, table(data)]
    ValueError.__init__(self, "\n\n" + "\n".join(result))
def print_stats(self) -> None:
    orig_vocab_len = self.get_orig_vocab_len()
    vocab_len = self.get_vocab_len()
    N = 5
    top_n = self.get_topn_frequent_words(n=N)
    data = [
        ("Original vocab length", orig_vocab_len),
        ("Clipped vocab length", vocab_len),
        ("Top {0} words".format(N), top_n),
    ]
    header = ("Stats Description", "#")
    table_string = wasabi.table(data=data, header=header, divider=True)
    self.msg_printer.divider("VOCAB STATS")
    print(table_string)
def _get_label_stats_table(self):
    all_labels = []
    for label in self.labels:
        all_labels.extend(label.split())

    labels_stats = dict(collections.Counter(all_labels))
    classes = list(set(labels_stats.keys()))
    classes = sorted(classes)

    header = ["label index", "label name", "count"]
    classname2idx = self.wrapped_cls.get_classname2idx()
    rows = [
        (classname2idx[class_], class_, labels_stats[class_]) for class_ in classes
    ]
    formatted = wasabi.table(data=rows, header=header, divider=True)
    return formatted
def __init__(
    self,
    config: Union[Config, Dict[str, Dict[str, Any]]],
    errors: List[Dict[str, Any]],
    message: str = "Config validation error",
    element: str = "",
) -> None:
    """Custom error for validating configs."""
    data = []
    for error in errors:
        err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
        if element:
            err_loc = f"{element} -> {err_loc}"
        data.append((err_loc, error.get("msg")))
    result = [message, table(data), f"{config}"]
    ValueError.__init__(self, "\n\n" + "\n".join(result))
def _format(self) -> str:
    """Format the error message."""
    loc_divider = "->"
    data = []
    for error in self.errors:
        err_loc = f" {loc_divider} ".join([str(p) for p in error.get("loc", [])])
        if self.parent:
            err_loc = f"{self.parent} {loc_divider} {err_loc}"
        data.append((err_loc, error.get("msg")))
    result = []
    if self.title:
        result.append(self.title)
    if self.desc:
        result.append(self.desc)
    if data:
        result.append(table(data))
    if self.config and self.show_config:
        result.append(f"{self.config}")
    return "\n\n" + "\n".join(result)
def debug_config(
    config_path: Path,
    *,
    overrides: Dict[str, Any] = {},
    show_funcs: bool = False,
    show_vars: bool = False,
):
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(config)
        config = nlp.config.interpolate()
    msg.divider("Config validation for [initialize]")
    with show_validation_error(config_path):
        T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    msg.divider("Config validation for [training]")
    with show_validation_error(config_path):
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        dot_names = [T["train_corpus"], T["dev_corpus"]]
        util.resolve_dot_names(config, dot_names)
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        head = ("Variable", "Value")
        msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)
    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": f"{func['file']} (line {func['line_no']})",
            }
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())
def report_metrics(self, report_type: str = "wasabi") -> Any:
    reports = {}
    if report_type == "wasabi":
        for namespace in [self.words_namespace]:
            metric = self.get_metric()[namespace]
            rouge_1 = metric["rouge_1"]
            rouge_2 = metric["rouge_2"]
            rouge_l = metric["rouge_l"]

            # build table
            header_row = ["Metric", "Value"]
            rows = [
                ("Rouge_1", rouge_1),
                ("Rouge_2", rouge_2),
                ("Rouge_l", rouge_l),
            ]
            table = wasabi.table(rows, header=header_row, divider=True)
            reports[namespace] = table

    return reports
def report_metrics(self, report_type: str = "wasabi") -> Any:
    reports = {}
    if report_type == "wasabi":
        for namespace in self.label_namespaces:
            metric = self.get_metric()[namespace]
            acc = metric["accuracy"]
            precision = metric["precision"]
            recall = metric["recall"]
            fscore = metric["fscore"]

            # build table
            header_row = ["Metric", "Value"]
            rows = [
                ("Acc", acc),
                ("Precision", precision),
                ("Recall", recall),
                ("Fscore", fscore),
            ]
            table = wasabi.table(rows, header=header_row, divider=True)
            reports[namespace] = table

    return reports
def __repr__(self) -> str:
    return table(self.data, header=["Parameter", "Value"], divider=True)
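# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not from the original source): the __repr__
# above relies on wasabi.table rendering an iterable of (name, value) pairs
# as a two-column table. The parameter names and values below are made up
# purely for the demo.
# ---------------------------------------------------------------------------
import wasabi

example_data = [("embedding_dim", 300), ("dropout", 0.5)]
print(wasabi.table(example_data, header=["Parameter", "Value"], divider=True))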
def evaluate_model(
    approach,
    model_path,
    data_path,
    label_binarizer_path,
    threshold,
    split_data=True,
    results_path=None,
    sparse_y=False,
    parameters=None,
):
    with open(label_binarizer_path, "rb") as f:
        label_binarizer = pickle.loads(f.read())

    if split_data:
        print(
            "Warning: Data will be split in the same way as during training. "
            "If you don't want that, set split_data=False"
        )
        _, X_test, _, Y_test = load_train_test_data(data_path, label_binarizer)
    else:
        X_test, Y_test, _ = load_data(data_path, label_binarizer)

    # Some models (e.g. MeshXLinear) need to know the parameters beforehand,
    # to know which load function to use
    model = load_model(approach, model_path, parameters=parameters)

    if sparse_y:
        # Assumed to return the probability matrix for the sparse case
        Y_pred_proba = predict_sparse_probs(model, X_test)
    else:
        Y_pred_proba = model.predict_proba(X_test)

    if type(threshold) != list:
        threshold = [threshold]

    # Print the header once, then stream one row of results per threshold
    widths = (12, 5, 5, 5)
    header = ["Threshold", "P", "R", "F1"]
    print(table([], header, divider=True, widths=widths))

    results = []
    for th in threshold:
        Y_pred = Y_pred_proba > th
        p, r, f1, _ = precision_recall_fscore_support(Y_test, Y_pred, average="micro")
        result = {
            "threshold": f"{th:.2f}",
            "precision": f"{p:.2f}",
            "recall": f"{r:.2f}",
            "f1": f"{f1:.2f}",
        }
        results.append(result)
        row_data = (
            result["threshold"],
            result["precision"],
            result["recall"],
            result["f1"],
        )
        print(row(row_data, widths=widths))

    if results_path:
        with open(results_path, "w") as f:
            f.write(json.dumps(results))
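# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not from the original source): evaluate_model
# above streams its results table row by row, printing the header first with
# wasabi.table on an empty data list and then each row with wasabi.row using
# the same column widths. The threshold/score values below are placeholders,
# not real evaluation results.
# ---------------------------------------------------------------------------
from wasabi import row, table

widths = (12, 5, 5, 5)
print(table([], header=["Threshold", "P", "R", "F1"], divider=True, widths=widths))
for th, p, r, f1 in [(0.30, 0.61, 0.55, 0.58), (0.50, 0.72, 0.48, 0.58)]:
    print(row((f"{th:.2f}", f"{p:.2f}", f"{r:.2f}", f"{f1:.2f}"), widths=widths))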
def generate_table_report_from_counters(
    self,
    tp_counter: Dict[int, int],
    fp_counter: Dict[int, int],
    fn_counter: Dict[int, int],
    idx2labelname_mapping: Dict[int, str] = None,
) -> str:
    """Return a table representation of precision, recall and F-measure.

    Parameters
    ----------
    tp_counter : Dict[int, int]
        The mapping between class index and true positive count
    fp_counter : Dict[int, int]
        The mapping between class index and false positive count
    fn_counter : Dict[int, int]
        The mapping between class index and false negative count
    idx2labelname_mapping : Dict[int, str]
        The mapping between class index and label name

    Returns
    -------
    str
        A string representing the table of precision, recall and F-measure
        for every class in the dataset
    """
    precision_dict, recall_dict, fscore_dict = self.get_prf_from_counters(
        tp_counter=tp_counter, fp_counter=fp_counter, fn_counter=fn_counter
    )
    micro_precision, micro_recall, micro_fscore = self.get_micro_prf_from_counters(
        tp_counter=tp_counter, fp_counter=fp_counter, fn_counter=fn_counter
    )
    macro_precision, macro_recall, macro_fscore = self.get_macro_prf_from_prf_dicts(
        precision_dict=precision_dict,
        recall_dict=recall_dict,
        fscore_dict=fscore_dict,
    )

    classes = precision_dict.keys()
    classes = sorted(classes)

    if idx2labelname_mapping is None:
        idx2labelname_mapping = {class_num: class_num for class_num in classes}

    header_row = [" ", "Precision", "Recall", "F_measure"]
    rows = []
    for class_num in classes:
        p = precision_dict[class_num]
        r = recall_dict[class_num]
        f = fscore_dict[class_num]
        rows.append(
            (f"cls_{class_num} ({idx2labelname_mapping[int(class_num)]})", p, r, f)
        )

    rows.append(["-"] * 4)
    rows.append(["Macro", macro_precision, macro_recall, macro_fscore])
    rows.append(["Micro", micro_precision, micro_recall, micro_fscore])

    return wasabi.table(rows, header=header_row, divider=True)
def print_stats(self):
    num_instances = len(self.word_instances)
    len_instances = [len(instance) for instance in self.word_instances]
    max_len_instance = max(len_instances)
    index_max_instance = len_instances.index(max_len_instance)

    all_task_labels = []
    all_process_labels = []
    all_material_labels = []
    for idx in range(num_instances):
        iter_dict = self[idx]
        labels = iter_dict["label"]
        task_labels, process_labels, material_labels = torch.chunk(
            labels, chunks=3, dim=0
        )
        all_task_labels.extend(task_labels.cpu().tolist())
        all_process_labels.extend(process_labels.cpu().tolist())
        all_material_labels.extend(material_labels.cpu().tolist())

    all_labels = {
        "Task": all_task_labels,
        "Process": all_process_labels,
        "Material": all_material_labels,
    }

    for entity_type in self.entity_types:
        label_stats = dict(collections.Counter(all_labels[entity_type]))
        classes = list(set(label_stats.keys()))
        classes = sorted(classes)
        header = ["label index", "label name", "count"]
        rows = [
            (class_, self.idx2classnames[class_], label_stats[class_])
            for class_ in classes
        ]
        formatted = wasabi.table(data=rows, header=header, divider=True)
        self.msg_printer.divider(
            f"Label Stats for Science IE {self.dataset_type} dataset with Entity Type {entity_type}"
        )
        print(formatted)

    # Visualize the longest instance, tagged with each of the three label types
    random_instance = self.word_instances[index_max_instance]
    random_label = self.labels[index_max_instance].split()
    random_task_label = [label.split(":")[0] for label in random_label]
    random_process_label = [label.split(":")[1] for label in random_label]
    random_material_label = [label.split(":")[2] for label in random_label]
    assert len(random_instance) == len(random_label)

    self.msg_printer.divider(
        f"Random Instance from ScienceIE {self.dataset_type.capitalize()} Dataset"
    )
    self.msg_printer.text(title="Task Labels")
    tagged_string = self.tag_visualizer.visualize_tokens(
        random_instance, random_task_label
    )
    print(tagged_string)

    self.msg_printer.text(title="Process Labels")
    tagged_string = self.tag_visualizer.visualize_tokens(
        random_instance, random_process_label
    )
    print(tagged_string)

    self.msg_printer.text(title="Material Labels")
    tagged_string = self.tag_visualizer.visualize_tokens(
        random_instance, random_material_label
    )
    print(tagged_string)

    num_instances = len(self)
    other_stats_header = ["", "Value"]
    rows = [
        ("Num Instances", num_instances),
        ("Longest Instance Length", max_len_instance),
    ]
    other_stats_table = wasabi.table(
        data=rows, header=other_stats_header, divider=True
    )
    self.msg_printer.divider(
        f"Other stats for ScienceIE {self.dataset_type} dataset"
    )
    print(other_stats_table)
def prodigy_to_tsv(input_files, output_file, respect_lines, respect_docs, line_limit=250):
    """
    Convert token-annotated jsonl to token-annotated tsv ready for use in the
    deep_reference_parser model.

    Will combine annotations from two jsonl files containing the same docs and
    the same tokens by comparing the "_input_hash" and token texts. If they
    are compatible, the output file will contain both labels ready for use in
    a multi-task model, for example:

        token        label label
        ------------ ----- -----
        References   o     o
        1            o     o
        .            o     o
        WHO          title b-r
        treatment    title i-r
        guidelines   title i-r
        for          title i-r
        drug         title i-r
        -            title i-r
        resistant    title i-r
        tuberculosis title i-r
        ,            title i-r
        2016         title i-r

    Multiple files must be passed as a comma separated list, e.g.

    python -m deep_reference_parser.prodigy prodigy_to_tsv file1.jsonl,file2.jsonl out.tsv
    """
    input_files = input_files.split(",")

    msg.info(f"Loading annotations from {len(input_files)} datasets")
    msg.info(f"Respect line endings: {respect_lines}")
    msg.info(f"Respect doc endings: {respect_docs}")
    msg.info(f"Target example length (n tokens): {line_limit}")

    # Read the input_files. Note the use of map here, because we don't know
    # how many sets of annotations are being passed in the list. It could be
    # 2, but in future it may be more.
    annotated_data = list(map(read_jsonl, input_files))

    # Sort the docs so that they are in the same order before converting to
    # token label pairs.
    tlp = TokenLabelPairs(
        respect_doc_endings=respect_docs,
        respect_line_endings=respect_lines,
        line_limit=line_limit,
    )
    pairs_list = tlp.run(annotated_data)

    write_tsv(pairs_list, output_file)

    # Print out the first ten rows as a sense check
    msg.divider("Example output")
    header = ["token"] + ["label"] * len(annotated_data)
    aligns = ["r"] + ["l"] * len(annotated_data)
    formatted = table(
        pairs_list[0:ROWS_TO_PRINT], header=header, divider=True, aligns=aligns
    )
    print(formatted)

    msg.good(f"Wrote token/label pairs to {output_file}")