Example #1
 def test_different_file_diff(self):
     """Tests that when two different files are used in the difference method, the output is a list with more than
     one value."""
     ann_1 = Annotations(self.ann_path_1)
     ann_2 = Annotations(self.ann_path_2)
     result = ann_1.difference(ann_2)
     self.assertGreater(len(result), 0)
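
A minimal usage sketch of difference(), assuming set-difference semantics consistent with the tests above; the file paths and the import path are placeholders, not part of the original example.

from medacy.data.annotations import Annotations  # assumed import path

gold = Annotations('/path/to/file_1.ann')    # hypothetical ann files
other = Annotations('/path/to/file_2.ann')

# Entities in `gold` that are not present in `other`; empty when both hold the same annotations
missing = gold.difference(other)
print(len(missing))
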
Example #2
    def test_overlays_annotations(self):
        """
        Tests that this pipeline component adds the correct labels.
        Note that this only tests that at least one instance of each label is overlayed because the number of tokens
        that receive the label varies based on the tokenizer.
        """

        sample_file = sample_dataset.data_files[0]
        txt_file_path = sample_file.txt_path
        ann_file_path = sample_file.ann_path

        with open(txt_file_path) as f:
            text = f.read()
        doc: Doc = self.nlp(text)

        doc.set_extension('file_name', default=None, force=True)
        doc._.file_name = txt_file_path
        doc.set_extension('gold_annotation_file', default=None, force=True)
        doc._.gold_annotation_file = ann_file_path

        ann = Annotations(ann_file_path)
        labels = ann.get_labels()

        gold_annotator = GoldAnnotatorOverlayer(self.nlp, list(labels))

        doc = gold_annotator(doc)

        overlayed_labels = {t._.gold_label for t in doc}
        overlayed_labels.remove('O')

        self.assertSetEqual(overlayed_labels, labels)
Example #3
    def init_from_doc(cls, doc):
        """
        Creates a list of Entities for all entity annotations in a document.
        :param doc: can be a DataFile or str of a file path
        :return: a list of Entities
        """
        if isinstance(doc, DataFile):
            ann = Annotations(doc.ann_path, doc.txt_path)
        elif isinstance(doc, str):
            ann = Annotations(doc)
        else:
            raise ValueError(f"'doc'' must be DataFile or str, but is '{type(doc)}'")

        entities = []

        for ent in ann:
            # Entities are a tuple of (label, start, end, text)
            new_ent = cls(
                tag=ent[0],
                start=ent[1],
                end=ent[2],
                text=ent[3]
            )
            entities.append(new_ent)

        return entities
Example #4
    def compute_ambiguity(self, dataset):
        """
        Finds occurrences of spans from 'dataset' that intersect with a span from this dataset but do not share that
        span's label. If 'dataset' comprises a model's predictions, this method provides a strong indicator
        of a model's inability to disambiguate between entities. For a full analysis, compute a confusion matrix.

        :param dataset: a Dataset object containing a predicted version of this dataset.
        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
        """
        if not isinstance(dataset, Dataset):
            raise ValueError("dataset must be instance of Dataset")

        # verify files are consistent
        diff = set(file.ann_path.split(os.sep)[-1] for file in self) - set(file.ann_path.split(os.sep)[-1] for file in dataset)
        if diff:
            raise ValueError("Dataset of predictions is missing the files: " + str(list(diff)))

        # Dictionary storing ambiguity over dataset
        ambiguity_dict = {}

        for gold_data_file in self:
            prediction_iter = iter(dataset)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)


        return ambiguity_dict
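
A minimal usage sketch for the Dataset-level compute_ambiguity(), with placeholder directories and an assumed import path; it relies only on the pairing-by-file-name behavior shown above.

from medacy.data.dataset import Dataset  # assumed import path

gold = Dataset('/path/to/gold_dir')        # gold-standard .ann/.txt files
predicted = Dataset('/path/to/pred_dir')   # predictions over the same file names

ambiguity = gold.compute_ambiguity(predicted)
for file_name, file_ambiguity in ambiguity.items():
    print(file_name, file_ambiguity)
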
Example #5
    def test_or(self):
        """
        Tests that the pipe operator correctly merges two Annotations and retains the source text path of
        the left operand
        """
        tup_1 = ('Object', 66, 77, 'this is some text')
        tup_2 = ('Entity', 44, 77, 'I love NER')
        tup_3 = ('Thingy', 66, 188, 'this is some sample text')
        file_name = 'some_file'

        ann_1 = Annotations([tup_1, tup_2], source_text_path=file_name)
        ann_2 = Annotations([tup_3])

        for a in [ann_1, ann_2]:
            self._test_is_sorted(a)

        # Test __or__
        result = ann_1 | ann_2
        expected = {tup_1, tup_2, tup_3}
        actual = set(result)
        self.assertSetEqual(actual, expected)
        self.assertEqual(file_name, result.source_text_path)
        self._test_is_sorted(result)

        # Test __ior__
        ann_1 |= ann_2
        actual = set(ann_1)
        self.assertSetEqual(actual, expected)
        self._test_is_sorted(ann_1)
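
A sketch of the merge behavior the test above verifies; the tuples use the (label, start, end, text) layout seen throughout these examples, and the import path is an assumption.

from medacy.data.annotations import Annotations  # assumed import path

left = Annotations([('Object', 66, 77, 'this is some text')], source_text_path='some_file')
right = Annotations([('Entity', 44, 77, 'I love NER')])

merged = left | right                  # union of both entity sets
print(set(merged))                     # both entity tuples
print(merged.source_text_path)         # 'some_file', kept from the left operand
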
Example #6
 def test_ann_conversions(self):
     """Tests converting and un-converting a valid Annotations object to an ANN file."""
     self.maxDiff = None
     annotations = Annotations(self.ann_path_1)
     temp_path = os.path.join(self.test_dir, "intermediary.ann")
     annotations.to_ann(write_location=temp_path)
     annotations2 = Annotations(temp_path)
     self.assertListEqual(annotations.annotations, annotations2.annotations)
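
A sketch of the same round trip outside the test harness; the paths and import are placeholders.

import os
from medacy.data.annotations import Annotations  # assumed import path

ann = Annotations('/path/to/example.ann')              # hypothetical source file
temp_path = os.path.join('/tmp', 'intermediary.ann')
ann.to_ann(write_location=temp_path)                   # write back out in ANN format

reloaded = Annotations(temp_path)
assert ann.annotations == reloaded.annotations         # the entity tuples survive the round trip
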
Example #7
 def generate_annotations(self):
     """Generates Annotation objects for all the files in this Dataset"""
     for file in self:
         if file.ann_path is not None:
             yield Annotations(file.ann_path,
                               source_text_path=file.txt_path)
         else:
             yield Annotations([])
Example #8
    def __call__(self, doc):
        """
        Overlays entity annotations over tokens in a Doc object. Requires that the Doc has the custom
        'gold_annotation_file' and 'file_name' extensions.
        :param doc: a spaCy Doc object.
        :return: the same Doc object, but it now has 'gold_label' annotations.
        """

        if hasattr(doc._, 'file_name'):
            logging.debug("%s: Called GoldAnnotator Component", doc._.file_name)

        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            # print document tokenization
            for token in doc:
                logging.debug(str(token))

        # check if gold annotation file path has been set.
        if not hasattr(doc._, 'gold_annotation_file'):
            logging.warning("No extension doc._.gold_annotation_file is present; it will not be possible to fit a model with this Doc")
            return doc

        gold_annotations = Annotations(doc._.gold_annotation_file)

        for e_label, e_start, e_end, _ in gold_annotations.get_entity_annotations():
            if e_start > e_end:
                logging.critical("%s: Broken annotation - start is greater than end: (%i,%i,%s)",
                                 doc._.file_name, e_start, e_end, e_label)
                continue
            span = doc.char_span(e_start, e_end)

            if span is None:
                self.failed_overlay_count += 1
                self.failed_identifying_span_count += 1
                logging.warning("%s: Number of failed annotation overlays with current tokenizer: %i (%i,%i,%s)",
                                doc._.file_name, self.failed_overlay_count, e_start, e_end, e_label)

            fixed_span = self.find_span(e_start, e_end, doc)
            if fixed_span is not None:
                if span is None:
                    logging.warning("%s: Fixed span (%i,%i,%s) into: %s",
                                    doc._.file_name, e_start, e_end, e_label, fixed_span.text)
                    self.failed_identifying_span_count -= 1
                for token in fixed_span:
                    if e_label in self.labels or not self.labels:
                        token._.set('gold_label', e_label)

            else:  # the annotation could not be fixed and will be ignored; this hurts evaluation
                logging.warning("%s: Could not fix annotation: (%i,%i,%s)", doc._.file_name, e_start, e_end, e_label)
                logging.warning("%s: Total Failed Annotations: %i", doc._.file_name, self.failed_identifying_span_count)

        if self.failed_overlay_count > .3 * len(gold_annotations):
            logging.warning("%s: Annotations may mis-aligned as more than 30 percent failed to overlay: %s",
                            doc._.file_name, doc._.gold_annotation_file)

        return doc
Example #9
    def test_cross_validate_create_groundtruth_predictions(self):
        """
        Tests that during cross validation, the medaCy groundtruth (that is, the version of the training dataset
        used by medaCy) is written as well as the predictions that are created for each fold
        """
        model = Model(self.pipeline)
        model.cross_validate(self.dataset,
                             num_folds=2,
                             prediction_directory=self.prediction_directory_3,
                             groundtruth_directory=self.groundtruth_directory)

        prediction_dataset = Dataset(self.prediction_directory_3)
        groundtruth_dataset = Dataset(self.groundtruth_directory)

        for d in [prediction_dataset, groundtruth_dataset]:
            self.assertIsInstance(d, Dataset)

        original_file_names = {d.file_name for d in self.dataset}
        prediction_file_names = {d.file_name for d in prediction_dataset}
        groundtruth_file_names = {d.file_name for d in groundtruth_dataset}

        for n in [prediction_file_names, groundtruth_file_names]:
            self.assertSetEqual(n, original_file_names)

        # Container for all Annotations in all files in all folds
        all_anns_all_folds_actual = Annotations([])

        # Test that fold groundtruth is written to file
        for fold_name in ["fold_1", "fold_2"]:
            fold_dataset = Dataset(groundtruth_dataset.data_directory /
                                   fold_name)
            for d in fold_dataset:
                fold_ann = Annotations(d.ann_path)
                groundtruth_ann = groundtruth_dataset[d.file_name]
                # Test that the entities in the fold groundtruth are a subset of the whole for that file
                self.assertTrue(set(fold_ann) <= set(groundtruth_ann))
                all_anns_all_folds_actual |= fold_ann

        # Container for all annotations pulled directly from the groundtruth dataset
        all_groundtruth_tuples = Annotations([])
        for ann in groundtruth_dataset.generate_annotations():
            all_groundtruth_tuples |= ann

        expected = set(all_groundtruth_tuples)
        actual = set(all_anns_all_folds_actual)
        self.assertSetEqual(expected, actual)
Example #10
 def __getitem__(self, item):
     """
     Creates and returns the Annotations object with the given file name, else raises FileNotFoundError;
     useful for getting Annotations objects from parallel Datasets
     :param item: the name of the file to be represented (not including the extension or parent directories)
     :return: an Annotations object
     """
     path = os.path.join(self.data_directory, item + '.ann')
     return Annotations(path)
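
A usage sketch of the indexing shown above, with a hypothetical directory and file name and an assumed import path.

from medacy.data.dataset import Dataset  # assumed import path

dataset = Dataset('/path/to/ann_directory')   # hypothetical directory of .ann files
ann = dataset['report_1']                     # loads report_1.ann; FileNotFoundError if absent
print(ann.annotations[:3])
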
Example #11
    def compute_confusion_matrix(self, other, leniency=0):
        """
        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `other` serves
        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction directory
        outputted by a model and then passing it into this method.

        :param other: a Dataset object containing a predicted version of this dataset.
        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
        :return: a two-element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as entities[j] in 'other' matrix[i][j] times
        """
        if not isinstance(other, Dataset):
            raise ValueError("other must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in other}
        if diff:
            raise ValueError(
                f"Dataset of predictions is missing the files: {repr(diff)}")

        # sort entities in ascending order by count.
        entities = [
            key for key, _ in sorted(self.compute_counts().items(),
                                     key=lambda x: x[1])
        ]
        confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]

        for gold_data_file in self:
            prediction_iter = iter(other)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(
                pred_annotation, entities, leniency=leniency)
            for i in range(len(confusion_matrix)):
                for j in range(len(confusion_matrix)):
                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]

        return entities, confusion_matrix
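
A usage sketch for the Dataset-level confusion matrix, with placeholder directories and an assumed import path; the row/column meaning follows the docstring above.

from medacy.data.dataset import Dataset  # assumed import path

gold = Dataset('/path/to/gold_dir')
predicted = Dataset('/path/to/pred_dir')

entities, matrix = gold.compute_confusion_matrix(predicted, leniency=0.1)
for label, row in zip(entities, matrix):
    # row[j] counts how often `label` in the gold data was predicted as entities[j]
    print(label, row)
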
Example #12
def sequence_to_ann(X: List[FeatureTuple], y: List[str],
                    file_names: Iterable[str]) -> Dict[str, Annotations]:
    """
    Creates a dictionary of document-level Annotations objects for a given sequence
    :param X: A list of sentence level zipped (features, indices, document_name) tuples
    :param y: A list of sentence-level lists of tags
    :param file_names: A list of file names that are used by these sequences
    :return: A dictionary mapping txt file names (the whole path) to their Annotations objects, where the
    Annotations are constructed from the X and y data given here.
    """
    # Flatten the nested structures into parallel lists
    anns = {filename: Annotations([]) for filename in file_names}
    tuples_by_doc = {filename: [] for filename in file_names}
    document_indices = []
    span_indices = []

    for sequence in X:
        document_indices += [sequence.file_name] * len(sequence.features)
        span_indices.extend(sequence.indices)

    groundtruth = [element for sentence in y for element in sentence]

    # Map the predicted sequences to their corresponding documents
    i = 0

    while i < len(groundtruth):
        if groundtruth[i] == 'O':
            i += 1
            continue

        entity = groundtruth[i]
        document = document_indices[i]
        first_start, first_end = span_indices[i]
        # Ensure that consecutive tokens with the same label are merged
        while i < len(groundtruth) - 1 and groundtruth[i + 1] == entity:
            i += 1  # If inside entity, keep incrementing

        last_start, last_end = span_indices[i]
        tuples_by_doc[document].append((entity, first_start, last_end))
        i += 1

    # Create the Annotations objects
    for file_name, tups in tuples_by_doc.items():
        ann_tups = []
        with open(file_name) as f:
            text = f.read()
        for tup in tups:
            entity, start, end = tup
            ent_text = text[start:end]
            new_tup = EntTuple(entity, start, end, ent_text)
            ann_tups.append(new_tup)
        anns[file_name].annotations = ann_tups

    return anns
Example #13
 def test_confusion_matrix(self):
     """Tests that compute_confusion_matrix() returns a matrix with one row and one column per entity."""
     ann_1 = Annotations(self.ann_path_1)
     ann_2 = Annotations(self.ann_path_2)
     ann_1.add_entity(*ann_2.annotations[0])
     self.assertEqual(
         len(ann_1.compute_confusion_matrix(ann_2, self.entities)[0]),
         len(self.entities))
     self.assertEqual(
         len(ann_1.compute_confusion_matrix(ann_2, self.entities)),
         len(self.entities))
Example #14
    def test_predict(self):
        """Test predict and fit functions."""
        model = SpacyModel()
        model.fit(dataset=self.dataset, iterations=1)

        model.predict(self.dataset,
                      prediction_directory=self.prediction_directory)

        second_ann_file = "%s.ann" % self.dataset.all_data_files[0].file_name
        annotations = Annotations(
            os.path.join(self.prediction_directory, second_ann_file))
        self.assertIsInstance(annotations, Annotations)
Example #15
    def predict_directory(self, data_directory, prediction_directory):
        """
        Predicts over all txt files in a directory using every Model. Note that this method spends a lot of time
        on file IO because each txt file is opened as many times as there are models.
        :param data_directory: Path to a directory of text files to predict over
        :param prediction_directory: a directory to write predictions to
        :return: a Dataset of the predictions
        """
        if not os.path.isdir(data_directory):
            raise ValueError(
                f"'data_directory' must be an existing directory, but is '{repr(data_directory)}'"
            )
        if not os.path.isdir(prediction_directory):
            raise ValueError(
                f"'prediction_directory' must be a directory, but is '{repr(prediction_directory)}'"
            )

        # Get all the txt files in the input directory
        txt_files = [
            f for f in os.listdir(data_directory) if f.endswith('.txt')
        ]
        # Create a dictionary of empty Annotations objects to store the predictions
        annotation_dict = {
            f: Annotations([], source_text_path=f)
            for f in txt_files
        }

        for model in self:
            for file_name in txt_files:
                file_path = os.path.join(data_directory, file_name)
                with open(file_path) as f:
                    text = f.read()
                this_annotations = annotation_dict[file_name]
                resulting_annotations = model.predict(text)
                # Merge the two Annotations together and store them back in the dictionary
                annotation_dict[
                    file_name] = this_annotations | resulting_annotations

        # Create the new Dataset directory
        for path, ann in annotation_dict.items():
            # Get the name of the output ann file
            path = os.path.join(data_directory, path)
            base_name = os.path.basename(path)[:-4]
            output_ann = os.path.join(prediction_directory, base_name + '.ann')
            output_txt = os.path.join(prediction_directory, base_name + '.txt')

            # Write the ann file
            ann.to_ann(output_ann)
            # Copy the txt file
            copyfile(path, output_txt)

        return Dataset(prediction_directory)
Example #16
 def test_intersection(self):
     """Tests that intersection() returns the set of entity tuples present in both Annotations objects."""
     ann_1 = Annotations(self.ann_path_1)
     ann_2 = Annotations(self.ann_path_2)
     ann_1.add_entity(*ann_2.annotations[0])
     ann_1.add_entity(*ann_2.annotations[1])
     expected = {ann_2.annotations[0], ann_2.annotations[1]}
     actual = ann_1.intersection(ann_2)
     self.assertSetEqual(actual, expected)
Example #17
 def test_compute_ambiguity(self):
     ann_1 = Annotations(self.ann_path_1)
     ann_1_copy = Annotations(self.ann_path_1)
     ambiguity = ann_1.compute_ambiguity(ann_1_copy)
     # The number of overlapping spans for the selected ann file is known to be 25
     self.assertEqual(25, len(ambiguity))
     # Manually introduce ambiguity by changing the name of an entity in the copy
     first_tuple = ann_1_copy.annotations[0]
     ann_1_copy.annotations[0] = ('different_name', first_tuple[1], first_tuple[2], first_tuple[3])
     ambiguity = ann_1.compute_ambiguity(ann_1_copy)
     # See if this increased the ambiguity score by one
     self.assertEqual(26, len(ambiguity))
Example #18
    def compute_ambiguity(self, dataset):
        """
        Finds occurrences of spans from 'dataset' that intersect with a span from this dataset but do not share that
        span's label. If 'dataset' comprises a model's predictions, this method provides a strong indicator
        of a model's inability to disambiguate between entities. For a full analysis, compute a confusion matrix.

        :param dataset: a Dataset object containing a predicted version of this dataset.
        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
        """
        if not isinstance(dataset, Dataset):
            raise ValueError("dataset must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in dataset}
        if diff:
            raise ValueError(
                f"Dataset of predictions is missing the files: {repr(diff)}")

        # Dictionary storing ambiguity over dataset
        ambiguity_dict = {}

        for gold_data_file in self:
            prediction_iter = iter(dataset)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ambiguity_dict[str(
                gold_data_file)] = gold_annotation.compute_ambiguity(
                    pred_annotation)

        return ambiguity_dict
Example #19
    def _predict_document(self, doc):
        """
        Generates an Annotations object containing the predictions of the model over the corresponding document. The passed document
        is assumed to be annotated by the same pipeline utilized when training the model.
        :param doc: A spacy document
        :return: an Annotations object containing the model predictions
        """

        feature_extractor = self.pipeline.get_feature_extractor()

        features, indices = feature_extractor.get_features_with_span_indices(
            doc)
        predictions = self.model.predict(features)
        predictions = [
            element for sentence in predictions for element in sentence
        ]  # flatten 2d list
        span_indices = [
            element for sentence in indices for element in sentence
        ]  # parallel array containing indices
        annotations = []

        i = 0
        while i < len(predictions):
            if predictions[i] == 'O':
                i += 1
                continue

            entity = predictions[i]
            first_start, first_end = span_indices[i]

            # Ensure that consecutive tokens with the same label are merged
            while i < len(predictions) - 1 and predictions[i + 1] == entity:
                i += 1  # If inside entity, keep incrementing

            last_start, last_end = span_indices[i]
            labeled_text = doc.text[first_start:last_end]
            new_ent = EntTuple(entity, first_start, last_end, labeled_text)
            annotations.append(new_ent)

            logging.debug(
                f"{doc._.file_name}: Predicted {entity} at ({first_start}, {last_end}) {labeled_text}"
            )

            i += 1

        return Annotations(annotations)
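
A standalone sketch (not medaCy code) of the merging loop above, using made-up labels and span indices to show how runs of identically labeled tokens collapse into single entity spans.

predictions = ['O', 'Drug', 'Drug', 'O', 'Dose']                 # made-up per-token labels
span_indices = [(0, 3), (4, 9), (10, 17), (18, 20), (21, 26)]    # made-up per-token (start, end) offsets

entities = []
i = 0
while i < len(predictions):
    if predictions[i] == 'O':
        i += 1
        continue
    label = predictions[i]
    first_start, _ = span_indices[i]
    # Absorb consecutive tokens that carry the same label
    while i < len(predictions) - 1 and predictions[i + 1] == label:
        i += 1
    _, last_end = span_indices[i]
    entities.append((label, first_start, last_end))
    i += 1

print(entities)  # [('Drug', 4, 17), ('Dose', 21, 26)]
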
Example #20
    def test_init_tuples(self):
        """Tests the creation of individual annotation tuples, including ones with non-contiguous spans"""
        temp_path = os.path.join(self.test_dir, 'tuples.ann')

        samples = [
            ("T1\tObject 66 77\tthis is some text\n", ('Object', 66, 77, 'this is some text')),
            ("T2\tEntity 44 55;66 77\tI love NER\n", ('Entity', 44, 77, 'I love NER')),
            ("T3\tThingy 66 77;88 99;100 188\tthis is some sample text\n", ('Thingy', 66, 188, 'this is some sample text'))
        ]

        for string, expected in samples:
            with open(temp_path, 'w') as f:
                f.write(string)

            resulting_ann = Annotations(temp_path)
            actual = resulting_ann.annotations[0]
            self.assertTupleEqual(actual, expected)
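
A standalone sketch (not medaCy's parser) of how a discontinuous BRAT entity line can be collapsed to its outermost offsets, matching the expected tuples in the test above.

def collapse_brat_line(line):
    """Collapse a (possibly discontinuous) BRAT entity line to (label, min_start, max_end, text)."""
    _, type_and_spans, text = line.rstrip('\n').split('\t')
    label, span_part = type_and_spans.split(' ', 1)
    offsets = [int(n) for pair in span_part.split(';') for n in pair.split()]
    return label, min(offsets), max(offsets), text

print(collapse_brat_line("T2\tEntity 44 55;66 77\tI love NER\n"))
# ('Entity', 44, 77, 'I love NER')
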
Example #21
def predict_document(model, doc, medacy_pipeline):
    """
    Generates an Annotations object containing the predictions of the given model over the corresponding document. The passed document
    is assumed to be annotated by the same pipeline utilized when training the model.
    :param model: A loaded medaCy NER model
    :param doc: A spacy document
    :param medacy_pipeline: An instance of a medacy pipeline
    :return: an Annotations object containing the model predictions
    """
    #assert isinstance(feature_extractor, FeatureExtractor), "feature_extractor must be an instance of FeatureExtractor"

    feature_extractor = medacy_pipeline.get_feature_extractor()

    features, indices = feature_extractor.get_features_with_span_indices(doc)
    predictions = model.predict(features)
    predictions = [
        element for sentence in predictions for element in sentence
    ]  # flatten 2d list
    span_indices = [
        element for sentence in indices for element in sentence
    ]  # parallel array containing indices
    annotations = []

    i = 0
    while i < len(predictions):
        if predictions[i] == "O":
            i += 1
            continue
        entity = predictions[i]
        first_start, first_end = span_indices[i]
        # Ensure that consecutive tokens with the same label are merged
        while i < len(predictions) - 1 and predictions[i + 1] == entity:
            i += 1  # If inside entity, keep incrementing
        last_start, last_end = span_indices[i]

        labeled_text = doc.text[first_start:last_end]

        logging.debug("%s: Predicted %s at (%i, %i) %s", doc._.file_name,
                      entity, first_start, last_end,
                      labeled_text.replace('\n', ''))

        annotations.append((entity, first_start, last_end, labeled_text))
        i += 1

    return Annotations(annotations)
Example #22
def construct_annotations_from_tuples(doc, predictions):
    """
    Converts predictions mapped to a document into an Annotations object
    :param doc: SpaCy doc corresponding to predictions
    :param predictions: List of tuples containing (entity, start offset, end offset)
    :return: Annotations Object representing predicted entities for the given doc
    """
    predictions = sorted(predictions, key=lambda x: x[1])
    annotations = []

    for prediction in predictions:
        if len(prediction) == 3:
            (entity, start, end) = prediction
            labeled_text = doc.text[start:end]
        elif len(prediction) == 4:
            (entity, start, end, labeled_text) = prediction
        else:
            raise ValueError("Incorrect prediction length.")

        annotations.append((entity, start, end, labeled_text))

    return Annotations(annotations)
Example #23
def calculate_document_overlap(data_file):
    """Counts and prints pairs of overlapping entity spans within a single DataFile's annotations."""
    already_matched = []

    print(data_file.txt_path)
    ann = Annotations(data_file.ann_path)
    counts = Counter()

    for a, b in product(ann, ann):

        if a is b or {a, b} in already_matched:
            continue

        already_matched.append({a, b})

        a_tag, a_start, a_end, a_text = a
        b_tag, b_start, b_end, b_text = b

        left_cut = a_start < b_start < a_end < b_end
        right_cut = b_start < a_start < b_end < a_end
        a_inside = b_start < a_start < a_end < b_end
        b_inside = a_start < b_start < b_end < a_end

        if left_cut:
            print(f"Leftside cutoff: {a}, {b}")
        elif right_cut:
            print(f"Rightside cutoff: {a}, {b}")
        elif a_inside:
            print(f"A inside B: {a}, {b}")
        elif b_inside:
            print(f"B inside A: {a}, {b}")

        if any([left_cut, right_cut, a_inside, b_inside]):
            counts[(a_tag, b_tag)] += 1

    print(counts)
    return counts
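
A standalone sketch (not medaCy code) of the four partial-overlap cases classified above, with made-up offsets.

def overlap_kind(a_start, a_end, b_start, b_end):
    if a_start < b_start < a_end < b_end:
        return 'leftside cutoff'      # A starts first, B runs past A's end
    if b_start < a_start < b_end < a_end:
        return 'rightside cutoff'     # B starts first, A runs past B's end
    if b_start < a_start < a_end < b_end:
        return 'A inside B'
    if a_start < b_start < b_end < a_end:
        return 'B inside A'
    return None                       # disjoint, touching, or identical spans

print(overlap_kind(0, 10, 5, 15))     # leftside cutoff
print(overlap_kind(5, 15, 0, 10))     # rightside cutoff
print(overlap_kind(5, 8, 0, 10))      # A inside B
print(overlap_kind(0, 20, 30, 40))    # None
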
Example #24
def write_ann_dicts(
        output_dir: Path,
        dict_list: List[Dict[str, Annotations]]) -> Dict[str, Annotations]:
    """
    Merges a list of dicts of Annotations into one dict representing all the individual ann files and writes the
    ann data for both the individual fold Annotations and the combined ones to file.
    :param output_dir: Path object of the output directory (a subdirectory is made for each fold)
    :param dict_list: a list of file_name: Annotations dictionaries
    :return: the merged Annotations dict
    """
    file_names = set()
    for d in dict_list:
        file_names |= set(d.keys())

    all_annotations_dict = {
        filename: Annotations([])
        for filename in file_names
    }
    for i, fold_dict in enumerate(dict_list, 1):
        fold_dir = output_dir / f"fold_{i}"
        os.mkdir(fold_dir)
        for file_name, ann in fold_dict.items():
            # Write the Annotations from the individual fold to file;
            # note that this is written to fold_dir, which is a subdirectory of output_dir
            ann.to_ann(fold_dir /
                       (os.path.basename(file_name).rstrip("txt") + "ann"))
            # Merge the Annotations from the fold into the inter-fold Annotations
            all_annotations_dict[file_name] |= ann

    # Write the Annotations that are the combination of all folds to file
    for file_name, ann in all_annotations_dict.items():
        output_file_path = output_dir / (
            os.path.basename(file_name).rstrip("txt") + "ann")
        ann.to_ann(output_file_path)

    return all_annotations_dict
Example #25
 def test_init_from_invalid_ann(self):
     """Tests initialization from invalid annotation file"""
     with self.assertRaises(FileNotFoundError):
         Annotations("not_a_file_path")
Example #26
 def test_init_from_ann_file(self):
     """Tests initialization from valid ann file"""
     ann = Annotations(self.ann_path_1)
     self.assertIsNotNone(ann.annotations)
Example #27
 def generate_annotations(self):
     """Generates Annotation objects for all the files in this Dataset"""
     for file in self.get_data_files():
         yield Annotations(file.ann_path, source_text_path=file.txt_path)
Example #28
 def test_difference(self):
     """Tests that when a given Annotations object uses the diff() method with another Annotations object created
     from the same source file, that it returns an empty list."""
     ann = Annotations(self.ann_path_1)
     result = ann.difference(ann)
     self.assertFalse(result)
Example #29
 def test_compute_counts(self):
     ann_1 = Annotations(self.ann_path_1)
     self.assertIsInstance(ann_1.compute_counts(), dict)
Example #30
 def test_init_from_ann_file(self):
     """Tests initialization from valid ann file"""
     ann = Annotations(self.ann_path_1)
     self._test_is_sorted(ann)