Example 1
    def parse(self, text, verbose=False):

        nlp = en_core_web_sm.load()
        doc = nlp(text)
        chunks = break_into_chunks(doc, max_words=self.MAX_WORDS)
        tokens = [[token.text for token in chunk] for chunk in chunks]

        preds = self.drp.predict(tokens, load_weights=True)

        flat_predictions = list(itertools.chain.from_iterable(preds))[0]
        flat_X = list(itertools.chain.from_iterable(tokens))
        rows = [i for i in zip(flat_X, flat_predictions)]

        if verbose:

            msg.divider("Token Results")

            header = ("token", "label")
            aligns = ("r", "l")
            formatted = wasabi.table(rows,
                                     header=header,
                                     divider=True,
                                     aligns=aligns)
            print(formatted)

        out = rows

        return out
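
The table printed by the verbose branch comes straight from wasabi.table. A minimal standalone sketch of that call, with made-up (token, label) pairs standing in for the model output:

import wasabi

# Hypothetical (token, label) pairs in place of the model predictions above
rows = [("Smith", "author"), (",", "author"), ("J.", "author"), ("2019", "year")]

formatted = wasabi.table(
    rows,
    header=("token", "label"),
    divider=True,        # rule between header and body
    aligns=("r", "l"),   # right-align tokens, left-align labels
)
print(formatted)
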
Example 2
    def print_stats(self):
        num_instances = self.num_instances
        formatted = self.label_stats_table
        self.msg_printer.divider(f"Label Stats for Parscit {self.dataset_type} dataset")
        print(formatted)

        # print some other stats
        random_int = np.random.randint(0, num_instances, size=1)[0]
        random_instance = self.word_instances[random_int]
        random_label = self.labels[random_int].split()
        assert len(random_instance) == len(random_label)
        self.msg_printer.divider(
            f"Random Instance from Parscit {self.dataset_type.capitalize()} Dataset"
        )
        tagged_string = self.tag_visualizer.visualize_tokens(
            random_instance, random_label
        )
        print(tagged_string)

        num_instances = len(self)
        other_stats_header = ["", "Value"]
        rows = [
            ("Num Instances", num_instances),
            ("Longest Instance Length", self.instance_max_len),
        ]

        other_stats_table = wasabi.table(
            data=rows, header=other_stats_header, divider=True
        )
        self.msg_printer.divider(f"Other stats for Parscit {self.dataset_type} dataset")
        print(other_stats_table)
Example 3
    def split_parse(self, text, return_tokens=False, verbose=False):

        nlp = en_core_web_sm.load()
        doc = nlp(text)
        chunks = break_into_chunks(doc, max_words=self.MAX_WORDS)
        tokens = [[token.text for token in chunk] for chunk in chunks]

        preds = self.drp.predict(tokens, load_weights=True)

        # If return_tokens is True, return the labelled tokens

        if return_tokens:

            flat_preds_list = list(map(itertools.chain.from_iterable, preds))
            flat_X = list(itertools.chain.from_iterable(tokens))
            rows = [i for i in zip(*[flat_X] + flat_preds_list)]

            if verbose:

                msg.divider("Token Results")

                header = tuple(["token"] + ["label"] * len(flat_preds_list))
                aligns = tuple(["r"] + ["l"] * len(flat_preds_list))
                formatted = wasabi.table(rows,
                                         header=header,
                                         divider=True,
                                         aligns=aligns)
                print(formatted)

            out = rows

        else:

            # Return references with attributes (author, title, year)
            # in json format.
            # List of lists for each reference - each reference list contains all token attributes predictions
            # [[(token, attribute), ... , (token, attribute)], ..., [(token, attribute), ...]]

            references_components = tokens_to_reference_lists(
                tokens, spans=preds[1], components=preds[0])
            if verbose:

                msg.divider("Results")

                if references_components:

                    msg.good(f"Found {len(references_components)} references.")
                    msg.info("Printing found references:")

                    for ref in references_components:
                        msg.text(ref['Reference'], icon="check", spaced=True)

                else:

                    msg.fail("Failed to find any references.")

            out = references_components

        return out
Example 4
    def split(self, text, return_tokens=False, verbose=False):

        nlp = en_core_web_sm.load()
        doc = nlp(text)
        chunks = break_into_chunks(doc, max_words=self.MAX_WORDS)
        tokens = [[token.text for token in chunk] for chunk in chunks]

        preds = self.drp.predict(tokens, load_weights=True)

        # If return_tokens is True, return the labelled tokens

        if return_tokens:

            flat_predictions = list(itertools.chain.from_iterable(preds))[0]
            flat_X = list(itertools.chain.from_iterable(tokens))
            rows = [i for i in zip(flat_X, flat_predictions)]

            if verbose:

                msg.divider("Token Results")

                header = ("token", "label")
                aligns = ("r", "l")
                formatted = wasabi.table(rows,
                                         header=header,
                                         divider=True,
                                         aligns=aligns)
                print(formatted)

            out = rows

        else:

            # Otherwise convert the tokens into references and return

            refs = tokens_to_references(tokens, preds[0])

            if verbose:

                msg.divider("Results")

                if refs:

                    msg.good(f"Found {len(refs)} references.")
                    msg.info("Printing found references:")

                    for ref in refs:
                        msg.text(ref, icon="check", spaced=True)

                else:

                    msg.fail("Failed to find any references.")

            out = refs

        return out
Example 5
    def __init__(
        self, name: str, X: Any, Y: Any, errors: List[Dict[str, Any]] = []
    ) -> None:
        """Custom error for validating inputs / outputs at runtime."""
        message = f"Data validation error in '{name}'"
        type_info = f"X: {type(X)} Y: {type(Y)}"
        data = []
        for error in errors:
            err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
            data.append((err_loc, error.get("msg")))
        result = [message, type_info, table(data)]
        ValueError.__init__(self, "\n\n" + "\n".join(result))
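
The class this __init__ belongs to subclasses ValueError and folds a wasabi table into the exception message. A standalone sketch of the same idea, using invented pydantic-style error dicts (each with a "loc" path and a "msg"):

from wasabi import table

# Invented validation errors shaped like pydantic's ValidationError.errors()
errors = [
    {"loc": ("Y",), "msg": "none is not an allowed value"},
    {"loc": ("X", 0, "length"), "msg": "value is not a valid integer"},
]

data = [(" -> ".join(str(p) for p in e["loc"]), e["msg"]) for e in errors]
message = "Data validation error in 'train'"
print(ValueError("\n\n" + "\n".join([message, table(data)])))
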
Example 6
    def print_stats(self) -> None:
        orig_vocab_len = self.get_orig_vocab_len()
        vocab_len = self.get_vocab_len()
        N = 5
        top_n = self.get_topn_frequent_words(n=N)

        data = [
            ("Original vocab length", orig_vocab_len),
            ("Clipped vocab length", vocab_len),
            ("Top {0} words".format(N), top_n),
        ]
        header = ("Stats Description", "#")
        table_string = wasabi.table(data=data, header=header, divider=True)
        self.msg_printer.divider("VOCAB STATS")
        print(table_string)
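
self.msg_printer in these dataset classes is presumably a wasabi.Printer instance; the divider-then-table pattern can be reproduced standalone (the vocabulary figures below are invented):

import wasabi

msg_printer = wasabi.Printer()

data = [
    ("Original vocab length", 48923),   # invented figures
    ("Clipped vocab length", 20000),
    ("Top 5 words", "the, of, and, in, to"),
]
table_string = wasabi.table(data=data, header=("Stats Description", "#"), divider=True)
msg_printer.divider("VOCAB STATS")
print(table_string)
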
Example 7
    def _get_label_stats_table(self):
        all_labels = []
        for label in self.labels:
            all_labels.extend(label.split())

        labels_stats = dict(collections.Counter(all_labels))
        classes = list(set(labels_stats.keys()))
        classes = sorted(classes)
        header = ["label index", "label name", "count"]
        classname2idx = self.wrapped_cls.get_classname2idx()
        rows = [
            (classname2idx[class_], class_, labels_stats[class_]) for class_ in classes
        ]
        formatted = wasabi.table(data=rows, header=header, divider=True)
        return formatted
Example 8
    def __init__(
        self,
        config: Union[Config, Dict[str, Dict[str, Any]]],
        errors: List[Dict[str, Any]],
        message: str = "Config validation error",
        element: str = "",
    ) -> None:
        """Custom error for validating configs."""
        data = []
        for error in errors:
            err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
            if element:
                err_loc = f"{element} -> {err_loc}"
            data.append((err_loc, error.get("msg")))
        result = [message, table(data), f"{config}"]
        ValueError.__init__(self, "\n\n" + "\n".join(result))
Example 9
    def _format(self) -> str:
        """Format the error message."""
        loc_divider = "->"
        data = []
        for error in self.errors:
            err_loc = f" {loc_divider} ".join([str(p) for p in error.get("loc", [])])
            if self.parent:
                err_loc = f"{self.parent} {loc_divider} {err_loc}"
            data.append((err_loc, error.get("msg")))
        result = []
        if self.title:
            result.append(self.title)
        if self.desc:
            result.append(self.desc)
        if data:
            result.append(table(data))
        if self.config and self.show_config:
            result.append(f"{self.config}")
        return "\n\n" + "\n".join(result)
Example 10
def debug_config(
    config_path: Path,
    *,
    overrides: Dict[str, Any] = {},
    show_funcs: bool = False,
    show_vars: bool = False,
):
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(config)
        config = nlp.config.interpolate()
    msg.divider("Config validation for [initialize]")
    with show_validation_error(config_path):
        T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    msg.divider("Config validation for [training]")
    with show_validation_error(config_path):
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        dot_names = [T["train_corpus"], T["dev_corpus"]]
        util.resolve_dot_names(config, dot_names)
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        head = ("Variable", "Value")
        msg.table(variables,
                  header=head,
                  divider=True,
                  widths=(41, 34),
                  spacing=2)
    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": f"{func['file']} (line {func['line_no']})",
            }
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())
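
msg.table here is wasabi's Printer.table, which prints the formatted table directly instead of returning it as a string. A minimal sketch of the variables table, with invented variable/value pairs in place of get_variables(config):

from wasabi import msg

# Invented variable/value pairs standing in for get_variables(config)
variables = [
    ("${paths.train}", "corpus/train.spacy"),
    ("${paths.dev}", "corpus/dev.spacy"),
]

msg.divider(f"Variables ({len(variables)})")
msg.table(variables, header=("Variable", "Value"), divider=True, widths=(41, 34), spacing=2)
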
Example 11
    def report_metrics(self, report_type: str = "wasabi") -> Any:
        reports = {}
        if report_type == "wasabi":
            for namespace in [self.words_namespace]:
                metric = self.get_metric()[namespace]
                rouge_1 = metric["rouge_1"]
                rouge_2 = metric["rouge_2"]
                rouge_l = metric["rouge_l"]

                # build table
                header_row = ["Metric", "Value"]
                rows = [
                    ("Rouge_1", rouge_1),
                    ("Rouge_2", rouge_2),
                    ("Rouge_l", rouge_l),
                ]

                table = wasabi.table(rows, header=header_row, divider=True)
                reports[namespace] = table

        return reports
Example 12
    def report_metrics(self, report_type: str = "wasabi") -> Any:
        reports = {}
        if report_type == "wasabi":
            for namespace in self.label_namespaces:
                metric = self.get_metric()[namespace]
                acc = metric["accuracy"]
                precision = metric["precision"]
                recall = metric["recall"]
                fscore = metric["fscore"]

                # build table
                header_row = ["Metric", "Value"]
                rows = [
                    ("Acc", acc),
                    ("Precision", precision),
                    ("Recall", recall),
                    ("Fscore", fscore),
                ]

                table = wasabi.table(rows, header=header_row, divider=True)
                reports[namespace] = table

        return reports
Example 13
    def __repr__(self) -> str:
        return table(self.data, header=["Parameter", "Value"], divider=True)
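
Returning a wasabi table from __repr__ makes the object render as a grid whenever it is printed or inspected in a REPL. A minimal sketch, assuming self.data holds (parameter, value) pairs (the class name and values below are made up):

from wasabi import table


class ParamSummary:
    """Hypothetical container whose repr renders as a wasabi table."""

    def __init__(self, data):
        self.data = data  # list of (parameter, value) pairs

    def __repr__(self) -> str:
        return table(self.data, header=["Parameter", "Value"], divider=True)


print(ParamSummary([("learning_rate", 0.001), ("batch_size", 32)]))
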
Example 14
def evaluate_model(
    approach,
    model_path,
    data_path,
    label_binarizer_path,
    threshold,
    split_data=True,
    results_path=None,
    sparse_y=False,
    parameters=None,
):
    with open(label_binarizer_path, "rb") as f:
        label_binarizer = pickle.loads(f.read())

    if split_data:
        print(
            "Warning: Data will be split in the same way as train. If you don't want that you set split_data=False"
        )
        _, X_test, _, Y_test = load_train_test_data(data_path, label_binarizer)
    else:
        X_test, Y_test, _ = load_data(data_path, label_binarizer)

    # Some models (e.g. MeshXLinear) need to know the parameters beforehand, to know which
    # Load function to use

    model = load_model(approach, model_path, parameters=parameters)

    if sparse_y:
        Y_pred_proba = predict_sparse_probs(model, X_test)
    else:
        Y_pred_proba = model.predict_proba(X_test)

    if type(threshold) != list:
        threshold = [threshold]

    widths = (12, 5, 5, 5)
    header = ["Threshold", "P", "R", "F1"]
    print(table([], header, divider=True, widths=widths))

    results = []
    for th in threshold:
        Y_pred = Y_pred_proba > th
        p, r, f1, _ = precision_recall_fscore_support(Y_test,
                                                      Y_pred,
                                                      average="micro")
        result = {
            "threshold": f"{th:.2f}",
            "precision": f"{p:.2f}",
            "recall": f"{r:.2f}",
            "f1": f"{f1:.2f}",
        }
        results.append(result)

        row_data = (
            result["threshold"],
            result["precision"],
            result["recall"],
            result["f1"],
        )
        print(row(row_data, widths=widths))

    if results_path:
        with open(results_path, "w") as f:
            f.write(json.dumps(results))
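
The threshold loop above first prints a header-only table and then emits one formatted line per threshold with wasabi's row helper, so results stream out as they are computed. A minimal sketch with dummy metric values in place of precision_recall_fscore_support:

from wasabi import row, table

widths = (12, 5, 5, 5)
print(table([], header=["Threshold", "P", "R", "F1"], divider=True, widths=widths))

# Dummy precision/recall/F1 values for two thresholds
for th, (p, r, f1) in [(0.3, (0.61, 0.70, 0.65)), (0.5, (0.72, 0.58, 0.64))]:
    print(row((f"{th:.2f}", f"{p:.2f}", f"{r:.2f}", f"{f1:.2f}"), widths=widths))
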
Example 15
    def generate_table_report_from_counters(
        self,
        tp_counter: Dict[int, int],
        fp_counter: Dict[int, int],
        fn_counter: Dict[int, int],
        idx2labelname_mapping: Dict[int, str] = None,
    ) -> str:
        """ Returns a table representation for Precision Recall and FMeasure

        Parameters
        ----------
        tp_counter : Dict[int, int]
            The mapping between class index and true positive count
        fp_counter : Dict[int, int]
            The mapping between class index and false positive count
        fn_counter : Dict[int, int]
            The mapping between class index and false negative count
        idx2labelname_mapping: Dict[int, str]
            The mapping between idx and label name

        Returns
        -------
        str
            Returns a string representing the table of precision recall and fmeasure
            for every class in the dataset

        """
        precision_dict, recall_dict, fscore_dict = self.get_prf_from_counters(
            tp_counter=tp_counter,
            fp_counter=fp_counter,
            fn_counter=fn_counter)
        micro_precision, micro_recall, micro_fscore = self.get_micro_prf_from_counters(
            tp_counter=tp_counter,
            fp_counter=fp_counter,
            fn_counter=fn_counter)
        macro_precision, macro_recall, macro_fscore = self.get_macro_prf_from_prf_dicts(
            precision_dict=precision_dict,
            recall_dict=recall_dict,
            fscore_dict=fscore_dict,
        )

        classes = precision_dict.keys()
        classes = sorted(classes)

        if idx2labelname_mapping is None:
            idx2labelname_mapping = {
                class_num: class_num for class_num in classes
            }

        header_row = [" ", "Precision", "Recall", "F_measure"]
        rows = []
        for class_num in classes:
            p = precision_dict[class_num]
            r = recall_dict[class_num]
            f = fscore_dict[class_num]
            rows.append(
                (f"cls_{class_num} ({idx2labelname_mapping[int(class_num)]})",
                 p, r, f))

        rows.append(["-"] * 4)
        rows.append(["Macro", macro_precision, macro_recall, macro_fscore])
        rows.append(["Micro", micro_precision, micro_recall, micro_fscore])

        return wasabi.table(rows, header=header_row, divider=True)
Example 16
    def print_stats(self):
        num_instances = len(self.word_instances)
        len_instances = [len(instance) for instance in self.word_instances]
        max_len_instance = max(len_instances)
        index_max_instance = len_instances.index(max_len_instance)

        all_task_labels = []
        all_process_labels = []
        all_material_labels = []
        for idx in range(num_instances):
            iter_dict = self[idx]
            labels = iter_dict["label"]
            task_labels, process_labels, material_labels = torch.chunk(
                labels, chunks=3, dim=0
            )

            all_task_labels.extend(task_labels.cpu().tolist())
            all_process_labels.extend(process_labels.cpu().tolist())
            all_material_labels.extend(material_labels.cpu().tolist())

        all_labels = {
            "Task": all_task_labels,
            "Process": all_process_labels,
            "Material": all_material_labels,
        }

        for entity_type in self.entity_types:
            label_stats = dict(collections.Counter(all_labels[entity_type]))
            classes = list(set(label_stats.keys()))
            classes = sorted(classes)
            header = ["label index", "label name", "count"]
            rows = [
                (class_, self.idx2classnames[class_], label_stats[class_])
                for class_ in classes
            ]
            formatted = wasabi.table(data=rows, header=header, divider=True)
            self.msg_printer.divider(
                f"Label Stats for Science IE {self.dataset_type} dataset with Entity Type {entity_type}"
            )
            print(formatted)

        # print some other stats
        random_instance = self.word_instances[index_max_instance]
        random_label = self.labels[index_max_instance].split()
        random_task_label = [label.split(":")[0] for label in random_label]
        random_process_label = [label.split(":")[1] for label in random_label]
        random_material_label = [label.split(":")[2] for label in random_label]
        assert len(random_instance) == len(random_label)
        self.msg_printer.divider(
            f"Random Instance from Parscit {self.dataset_type.capitalize()} Dataset"
        )
        self.msg_printer.text(title="Task Labels")
        tagged_string = self.tag_visualizer.visualize_tokens(
            random_instance, random_task_label
        )
        print(tagged_string)

        self.msg_printer.text(title="Process Labels")
        tagged_string = self.tag_visualizer.visualize_tokens(
            random_instance, random_process_label
        )
        print(tagged_string)

        self.msg_printer.text(title="Material Labels")
        tagged_string = self.tag_visualizer.visualize_tokens(
            random_instance, random_material_label
        )
        print(tagged_string)

        num_instances = len(self)
        other_stats_header = ["", "Value"]
        rows = [
            ("Num Instances", num_instances),
            ("Longest Instance Length", max_len_instance),
        ]

        other_stats_table = wasabi.table(
            data=rows, header=other_stats_header, divider=True
        )
        self.msg_printer.divider(
            f"Other stats for ScienceIE {self.dataset_type} dataset"
        )
        print(other_stats_table)
Example 17
def prodigy_to_tsv(input_files,
                   output_file,
                   respect_lines,
                   respect_docs,
                   line_limit=250):
    """
    Convert token annotated jsonl to token annotated tsv ready for use in the
    deep_reference_parser model.

    Will combine annotations from two jsonl files containing the same docs and
    the same tokens by comparing the "_input_hash" and token texts. If they are
    compatible, the output file will contain both labels ready for use in a
    multi-task model, for example:

           token   label   label
    ------------   -----   -----
      References   o       o
               1   o       o
               .   o       o
             WHO   title   b-r
       treatment   title   i-r
      guidelines   title   i-r
             for   title   i-r
            drug   title   i-r
               -   title   i-r
       resistant   title   i-r
    tuberculosis   title   i-r
               ,   title   i-r
            2016   title   i-r

    Multiple files must be passed as a comma separated list e.g.

    python -m deep_reference_parser.prodigy prodigy_to_tsv file1.jsonl,file2.jsonl out.tsv

    """

    input_files = input_files.split(",")

    msg.info(f"Loading annotations from {len(input_files)} datasets")
    msg.info(f"Respect line endings: {respect_lines}")
    msg.info(f"Respect doc endings: {respect_docs}")
    msg.info(f"Target example length (n tokens): {line_limit}")

    # Read the input_files. Note the use of map here, because we don't know
    # how many sets of annotations are being passed in the list. It could be 2
    # but in future it may be more.

    annotated_data = list(map(read_jsonl, input_files))

    # Sort the docs so that they are in the same order before converting to
    # token label pairs.

    tlp = TokenLabelPairs(
        respect_doc_endings=respect_docs,
        respect_line_endings=respect_lines,
        line_limit=line_limit,
    )

    pairs_list = tlp.run(annotated_data)

    write_tsv(pairs_list, output_file)

    # Print out the first ten rows as a sense check

    msg.divider("Example output")
    header = ["token"] + ["label"] * len(annotated_data)
    aligns = ["r"] + ["l"] * len(annotated_data)
    formatted = table(pairs_list[0:ROWS_TO_PRINT],
                      header=header,
                      divider=True,
                      aligns=aligns)
    print(formatted)

    msg.good(f"Wrote token/label pairs to {output_file}")