def evaluate_quotes(self, fandom_fname, fic_representation, save=True, exact_match=True):
        """ Evaluate quotes for a fic.
            Args:
                save: save AnnotatedSpan quote objects in a pickled file in a tmp directory
        """
        # Quote extraction evaluation
        # Load gold quote spans
        gold = Annotation(self.quote_settings.gold_dirpath, fandom_fname, file_ext=self.quote_settings.gold_ext, fic_csv_dirpath=self.fic_csv_dirpath)
        gold.extract_annotated_spans()

        # Load predicted quote spans
        fic_representation.extract_quotes(
            save_dirpath=self.quote_settings.preds_outpath, 
            coref_from=self.coref_from)

        # Get scores
        quote_scores, quote_groups = scorer.quote_scores(fic_representation.quotes, gold.annotations, exact_match=exact_match)
        print('\tQuote extraction results:')
        for key in ['extraction_f1', 'extraction_precision', 'extraction_recall']:
            print(f'\t\t{key}: {quote_scores[key]: .2%}')
        print('\tQuote attribution results:')
        for key in ['attribution_f1', 'attribution_precision', 'attribution_recall']:
            print(f'\t\t{key}: {quote_scores[key]: .2%}')
        print()
        return quote_scores, quote_groups
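
# scorer.quote_scores is not shown in this excerpt. As a rough, hypothetical
# sketch (not the project's implementation), exact-match extraction scoring
# reduces to set precision/recall/F1 over span identifiers such as
# (chapter, start_token, end_token) tuples:
def exact_match_extraction_scores(predicted_spans, gold_spans):
    predicted, gold = set(predicted_spans), set(gold_spans)
    true_positives = len(predicted & gold)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(gold) if gold else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return {'extraction_precision': precision,
            'extraction_recall': recall,
            'extraction_f1': f1}

# Example: one predicted span matches one of two gold spans exactly.
# exact_match_extraction_scores({(1, 5, 9)}, {(1, 5, 9), (2, 0, 3)})
# -> precision 1.0, recall 0.5, f1 ~0.67
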
    def modify_coref_files(self,
                           coref_annotations_dirpath,
                           coref_annotations_ext,
                           annotation_type='gold'):
        """ Changes coref tokens to specified external annotations in 
            self.token_data.
            Saves out to {token_output_dirpath}_gold_coref/token_fpath.
            Returns the suffix added to dirpaths.
        """
        # Load externally annotated mentions, place in self.character_mentions
        annotation = Annotation(coref_annotations_dirpath,
                                self.fandom_fname,
                                file_ext=coref_annotations_ext,
                                fic_csv_dirpath=self.fic_csv_dirpath,
                                annotation_type=annotation_type)
        annotation.extract_annotated_spans()
        self.character_mentions = annotation.annotations

        # Modify coref <tags> in CSV
        self.modify_coref_tags(
            annotation.annotations)  # Modifies self.coref_fic

        # Save out
        modify_text = f'_{annotation_type}_coref'
        self.coref_output_dirpath = (
            self.coref_output_dirpath.rstrip('/') + modify_text)
        self.save_coref_csv()

        # Modify coref characters file
        self.coref_chars_output_dirpath = (
            self.coref_chars_output_dirpath.rstrip('/') + modify_text)
        self.save_characters_file()

        return modify_text
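
# The output-directory suffixing above (rstrip('/') followed by concatenation)
# renames the directory itself rather than nesting a new one under it; a tiny
# standalone illustration with a made-up path:
coref_output_dirpath = 'output/coref/'
print(coref_output_dirpath.rstrip('/') + '_gold_coref')  # output/coref_gold_coref
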
    def modify_quote_spans(self, quote_annotations_dirpath,
                           quote_annotations_ext):
        """ Modifies quote marks so that the pipeline will recognized
            gold quotes as quote spans """
        # Load gold quote extractions
        gold = Annotation(quote_annotations_dirpath,
                          self.fandom_fname,
                          file_ext=quote_annotations_ext,
                          fic_csv_dirpath=self.fic_csv_dirpath)
        gold.extract_annotated_spans()

        # Modify CSV text_tokenized
        self.modify_quote_marks(gold.annotations)  # Modifies self.coref_fic

        # Save out
        modify_text = '_gold_quotes'
        self.coref_output_dirpath = (
            self.coref_output_dirpath.rstrip('/') + modify_text)
        self.save_coref_csv()

        # Change characters file path, too
        self.coref_chars_output_dirpath = (
            self.coref_chars_output_dirpath.rstrip('/') + modify_text)

        return modify_text
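
# The quote detector that later consumes these marks is not shown here. As a
# rough, hypothetical sketch of the idea, a detector that pairs opening and
# closing quote tokens would recover exactly the spans wrapped above:
def find_quote_spans(tokens, open_mark='``', close_mark="''"):
    # Return (start, end) token indices for each quoted span.
    spans, start = [], None
    for i, token in enumerate(tokens):
        if token == open_mark and start is None:
            start = i
        elif token == close_mark and start is not None:
            spans.append((start, i))
            start = None
    return spans

# find_quote_spans(['He', 'said', ',', '``', 'hello', "''", '.']) -> [(3, 5)]
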
    def modify_quote_tokens(self,
                            original_tokenization_dirpath=None,
                            quote_annotations_dirpath=None,
                            quote_annotations_ext=None,
                            change_to='gold'):
        """ Changes quote tokens so BookNLP will recognize them in certain ways.
            Args:
                change_to:
                    'gold': Change to gold quote extractions
                    'match': Replace quotes with smart quotes to match a tokens file produced without whitespace tokenization
                    'strict': Change curly double quotes in existing BookNLP tokens to `` and '' using a dictionary
        """
        if change_to == 'gold':
            # Load gold quote extractions
            gold = Annotation(quote_annotations_dirpath,
                              self.fandom_fname,
                              file_ext=quote_annotations_ext,
                              fic_csv_dirpath=self.fic_csv_dirpath)
            gold.extract_annotated_spans()

            # Clear existing quotes, since they might have been modified after whitespace tokenization
            self.clear_quotes()

            # Add gold quote spans in
            for span in gold.annotations:
                self.add_quote_span(span)

            # Change output dirpath for later saving (after replacing gold coref)
            self.modified_token_output_dirpath = (
                self.modified_token_output_dirpath.rstrip('/') + '_gold_quotes')

        elif change_to == 'match':
            original_tokens = load_tokens_file(
                os.path.join(self.original_tokenization_dirpath,
                             self.fandom_fname + self.token_file_ext))
            self.token_data = match_quotes(original_tokens, self.token_data)

            # Save out
            save_tokens_file(self.token_data, self.modified_token_fpath)

        elif change_to == 'strict':
            quote_changes = {
                "“": "``",
                "”": "''",
            }
            self.token_data['normalizedWord'] = self.token_data[
                'normalizedWord'].map(lambda x: quote_changes.get(x, x))
            self.token_data['lemma'] = self.token_data['lemma'].map(
                lambda x: quote_changes.get(x, x))

            # Save out
            self.token_data.to_csv(self.modified_token_fpath,
                                   sep='\t',
                                   quoting=csv.QUOTE_NONE,
                                   index=False)
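
# A standalone illustration of the 'strict' replacement above: a token-level
# dict.get mapping over a pandas column leaves every token that is not a key
# of quote_changes untouched (pandas and the column name match the method
# above; the data here is made up):
import pandas as pd

quote_changes = {'“': '``', '”': "''"}
tokens = pd.DataFrame({'normalizedWord': ['“', 'Hi', '”', '.']})
tokens['normalizedWord'] = tokens['normalizedWord'].map(
    lambda x: quote_changes.get(x, x))
print(tokens['normalizedWord'].tolist())  # ['``', 'Hi', "''", '.']
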
    def load_fic_spans(self, fandom_fname, gold_dirpath, baseline_dirpath,
                       experimental_dirpath, gold_annotations_ext):
        """ Load quote or coref predictions and gold spans for a fic.
            Returns gold_spans, baseline_spans, experimental_spans
        """
        gold_annotation = Annotation(gold_dirpath,
                                     fandom_fname,
                                     file_ext=gold_annotations_ext,
                                     fic_csv_dirpath=self.fic_csv_dirpath)
        gold_annotation.extract_annotated_spans()
        gold_spans = gold_annotation.annotations
        baseline_spans = utils.load_pickle(baseline_dirpath, fandom_fname)
        experimental_spans = utils.load_pickle(experimental_dirpath,
                                               fandom_fname)

        return gold_spans, baseline_spans, experimental_spans
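
# utils.load_pickle is not shown in this excerpt. A minimal sketch, assuming
# it simply joins the directory and fic name and unpickles the file (the
# '.pkl' extension is an assumption):
import os
import pickle

def load_pickle(dirpath, fandom_fname, ext='.pkl'):
    with open(os.path.join(dirpath, fandom_fname + ext), 'rb') as f:
        return pickle.load(f)
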
    def modify_coref_tokens(self, coref_annotations_dirpath,
                            coref_annotations_ext):
        """ Changes coref tokens to gold annotations in self.token_data.
            Saves out to {token_output_dirpath}_gold_coref/token_fpath
        """
        # Load gold mentions
        gold = Annotation(coref_annotations_dirpath,
                          self.fandom_fname,
                          file_ext=coref_annotations_ext,
                          fic_csv_dirpath=self.fic_csv_dirpath)
        gold.extract_annotated_spans()

        # Build character name -> id dictionary for gold characters
        # (ids are assigned arbitrarily, in order of first mention)
        self.char_name2id = defaultdict(lambda: len(self.char_name2id))
        #self.char_name2id = {charname: len(self.char_name2id) for charname in sorted(gold.annotations_set)}

        # Clear existing character coref annotations
        self.token_data['characterId'] = -1

        # Modify original tokens file
        for span in gold.annotations:
            self.modify_coref_span(span)

        # Renumber BookNLP's own token IDs for re-running on modified output
        self.renumber_token_ids()

        # Save out
        self.modified_token_output_dirpath = (
            self.modified_token_output_dirpath.rstrip('/') + '_gold_coref')
        if not os.path.exists(self.modified_token_output_dirpath):
            os.mkdir(self.modified_token_output_dirpath)
        self.modified_token_fpath = os.path.join(
            self.modified_token_output_dirpath,
            f'{self.fandom_fname}{self.token_file_ext}')
        self.token_data.to_csv(self.modified_token_fpath,
                               sep='\t',
                               quoting=csv.QUOTE_NONE,
                               index=False)
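
# How the char_name2id trick above works: because the default factory returns
# the dictionary's current size, looking up an unseen name assigns it the next
# integer id, while repeated names keep their first id (the names here are
# made up):
from collections import defaultdict

char_name2id = defaultdict(lambda: len(char_name2id))
for name in ['Harry', 'Hermione', 'Harry']:
    char_name2id[name]
print(dict(char_name2id))  # {'Harry': 0, 'Hermione': 1}
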
    def evaluate_coref(self, fandom_fname, fic_representation, save=True):
        """ Evaluate coref for a fic.
            Args:
                save: save AnnotatedSpan objects in a pickled file in a tmp directory
        """
        # Load gold mentions
        gold = Annotation(self.coref_settings.gold_dirpath, fandom_fname, 
            file_ext=self.coref_settings.gold_ext, 
            fic_csv_dirpath=self.fic_csv_dirpath)
        gold.extract_annotated_spans()

        # Load predicted mentions
        fic_representation.extract_character_mentions(
            save_dirpath=self.coref_settings.preds_outpath)

        # Get scores
        coref_scores = scorer.coref_scores(
            fic_representation.character_mentions, gold.annotations, exact_match=True)
        print('\tCoref results:')
        for key in ['lea_f1', 'lea_precision', 'lea_recall']:
            print(f'\t\t{key}: {coref_scores[key]: .2%}')
        print()
        return coref_scores
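
# scorer.coref_scores is not shown in this excerpt. The lea_* keys suggest the
# LEA metric (Moosavi & Strube, 2016); as a rough, hypothetical sketch of how
# it is commonly implemented (not the project's scorer), each entity is scored
# by the fraction of its coreference links that are resolved, weighted by
# entity size:
def _links(n):
    return n * (n - 1) // 2

def _resolution(entity, others):
    # Singletons get a self-link, counted as resolved if the mention appears
    # in any entity on the other side.
    if len(entity) == 1:
        return 1.0 if any(entity <= other for other in others) else 0.0
    return sum(_links(len(entity & other)) for other in others) / _links(len(entity))

def lea_scores(key_entities, response_entities):
    # key_entities / response_entities: lists of sets of mention identifiers.
    def weighted(golds, others):
        total = sum(len(e) for e in golds)
        return (sum(len(e) * _resolution(e, others) for e in golds) / total
                if total else 0.0)
    recall = weighted(key_entities, response_entities)
    precision = weighted(response_entities, key_entities)
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return {'lea_precision': precision, 'lea_recall': recall, 'lea_f1': f1}
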