def evaluate_quotes(self, fandom_fname, fic_representation, save=True, exact_match=True):
    """ Evaluate quote extraction and attribution for a fic against gold spans.

    Args:
        fandom_fname: fandom filename identifying the fic
        fic_representation: object holding predicted quotes (populated via extract_quotes)
        save: save AnnotatedSpan quote objects in a pickled file in a tmp directory
        exact_match: require exact span matches when scoring

    Returns:
        (quote_scores, quote_groups) from scorer.quote_scores
    """
    # Load gold quote spans for this fic
    gold = Annotation(self.quote_settings.gold_dirpath, fandom_fname,
                      file_ext=self.quote_settings.gold_ext,
                      fic_csv_dirpath=self.fic_csv_dirpath)
    gold.extract_annotated_spans()

    # Extract predicted quote spans (also saves them out)
    fic_representation.extract_quotes(
        save_dirpath=self.quote_settings.preds_outpath,
        coref_from=self.coref_from)

    # Score predictions against gold
    quote_scores, quote_groups = scorer.quote_scores(
        fic_representation.quotes, gold.annotations, exact_match=exact_match)

    # Report both metric families with identical formatting
    report_sections = (
        ('\tQuote extraction results:', 'extraction'),
        ('\tQuote attribution results:', 'attribution'),
    )
    for header, prefix in report_sections:
        print(header)
        for metric in ('f1', 'precision', 'recall'):
            key = f'{prefix}_{metric}'
            print(f'\t\t{key}: {quote_scores[key]: .2%}')
    print()
    return quote_scores, quote_groups
def modify_coref_files(self, coref_annotations_dirpath, coref_annotations_ext, annotation_type='gold'):
    """ Replace coref tokens with externally annotated mentions.

    Loads annotations into self.character_mentions, rewrites the coref
    <tags> in the CSV, and saves the modified coref CSV and characters
    file to dirpaths suffixed with '_{annotation_type}_coref'.

    Args:
        coref_annotations_dirpath: dirpath containing the external annotations
        coref_annotations_ext: file extension of the annotation files
        annotation_type: label used in the output dirpath suffix (default 'gold')

    Returns:
        The suffix appended to the output dirpaths.
    """
    # Load the external mentions and stash them on the instance
    ann = Annotation(coref_annotations_dirpath, self.fandom_fname,
                     file_ext=coref_annotations_ext,
                     fic_csv_dirpath=self.fic_csv_dirpath,
                     annotation_type=annotation_type)
    ann.extract_annotated_spans()
    self.character_mentions = ann.annotations

    # Rewrite coref <tags> in the CSV (modifies self.coref_fic in place)
    self.modify_coref_tags(ann.annotations)

    # Save the rewritten coref CSV under the suffixed dirpath
    modify_text = f'_{annotation_type}_coref'
    self.coref_output_dirpath = self.coref_output_dirpath.rstrip('/') + modify_text
    self.save_coref_csv()

    # Save the characters file under a matching suffixed dirpath
    self.coref_chars_output_dirpath = self.coref_chars_output_dirpath.rstrip('/') + modify_text
    self.save_characters_file()

    return modify_text
def modify_quote_spans(self, quote_annotations_dirpath, quote_annotations_ext):
    """ Modify quote marks so the pipeline will recognize gold quotes as quote spans.

    Loads gold quote annotations, rewrites quote marks in the CSV's
    text_tokenized, and saves the coref CSV to a dirpath suffixed with
    '_gold_quotes'. The characters dirpath is re-pointed to match but
    is not saved here.

    Args:
        quote_annotations_dirpath: dirpath containing gold quote annotations
        quote_annotations_ext: file extension of the annotation files

    Returns:
        The suffix appended to the output dirpaths.
    """
    # Load the gold quote extractions
    gold_annotation = Annotation(quote_annotations_dirpath, self.fandom_fname,
                                 file_ext=quote_annotations_ext,
                                 fic_csv_dirpath=self.fic_csv_dirpath)
    gold_annotation.extract_annotated_spans()

    # Rewrite quote marks in text_tokenized (modifies self.coref_fic in place)
    self.modify_quote_marks(gold_annotation.annotations)

    # Save the coref CSV under the suffixed dirpath
    modify_text = '_gold_quotes'
    self.coref_output_dirpath = self.coref_output_dirpath.rstrip('/') + modify_text
    self.save_coref_csv()

    # Keep the characters dirpath consistent with the coref dirpath
    self.coref_chars_output_dirpath = self.coref_chars_output_dirpath.rstrip('/') + modify_text

    return modify_text
def modify_quote_tokens(self, original_tokenization_dirpath=None, quote_annotations_dirpath=None, quote_annotations_ext=None, change_to='gold'):
    """ Changes quote tokens so BookNLP will recognize them in certain ways.

    Args:
        original_tokenization_dirpath: dirpath of the tokens file produced
            without whitespace tokenization (used only when change_to='match');
            falls back to self.original_tokenization_dirpath when None
        quote_annotations_dirpath: dirpath of gold quote annotations
            (used only when change_to='gold')
        quote_annotations_ext: file extension of the gold annotation files
        change_to:
            'gold': Change to gold quote extractions
            'match': Replace quotes with smart quotes to match a tokens file done without whitespace tokenization
            'strict': Change existing BookNLP quotes using a dictionary. Single quotes to ` and ', double quotes to `` and ''
    """
    if change_to == 'gold':
        # Load gold quote extractions
        gold = Annotation(quote_annotations_dirpath, self.fandom_fname,
                          file_ext=quote_annotations_ext,
                          fic_csv_dirpath=self.fic_csv_dirpath)
        gold.extract_annotated_spans()

        # Clear existing quotes, since might have been modified after whitespace tokenization
        self.clear_quotes()

        # Add gold quote spans in
        for span in gold.annotations:
            self.add_quote_span(span)

        # Change output dirpath for later saving (after replace gold coref)
        self.modified_token_output_dirpath = self.modified_token_output_dirpath.rstrip('/') + '_gold_quotes'

    elif change_to == 'match':
        # Bug fix: the original_tokenization_dirpath argument was accepted but
        # silently ignored; honor it, falling back to the instance attribute.
        tokens_dirpath = (original_tokenization_dirpath
                          if original_tokenization_dirpath is not None
                          else self.original_tokenization_dirpath)
        original_tokens = load_tokens_file(
            os.path.join(tokens_dirpath, self.fandom_fname + self.token_file_ext))
        self.token_data = match_quotes(original_tokens, self.token_data)

        # Save out
        save_tokens_file(self.token_data, self.modified_token_fpath)

    elif change_to == 'strict':
        # NOTE(review): only smart double quotes are mapped here, though the
        # docstring also mentions single quotes — confirm intended scope.
        quote_changes = {
            "“": "``",
            "”": "''",
        }
        for col in ('normalizedWord', 'lemma'):
            self.token_data[col] = self.token_data[col].map(
                lambda x: quote_changes.get(x, x))

        # Save out (removed a leftover pdb.set_trace() debugger breakpoint
        # that would hang any non-interactive run)
        self.token_data.to_csv(self.modified_token_fpath,
                               sep='\t', quoting=csv.QUOTE_NONE, index=False)
def load_fic_spans(self, fandom_fname, gold_dirpath, baseline_dirpath, experimental_dirpath, gold_annotations_ext):
    """ Load quote or coref predictions and gold spans for a fic.

    Args:
        fandom_fname: fandom filename identifying the fic
        gold_dirpath: dirpath of gold annotations
        baseline_dirpath: dirpath of pickled baseline predictions
        experimental_dirpath: dirpath of pickled experimental predictions
        gold_annotations_ext: file extension of the gold annotation files

    Returns:
        (gold_spans, baseline_spans, experimental_spans)
    """
    # Gold spans come from annotation files
    annotation = Annotation(gold_dirpath, fandom_fname,
                            file_ext=gold_annotations_ext,
                            fic_csv_dirpath=self.fic_csv_dirpath)
    annotation.extract_annotated_spans()

    # Predictions come from pickled span files
    return (annotation.annotations,
            utils.load_pickle(baseline_dirpath, fandom_fname),
            utils.load_pickle(experimental_dirpath, fandom_fname))
def modify_coref_tokens(self, coref_annotations_dirpath, coref_annotations_ext):
    """ Changes coref tokens to gold annotations in self.token_data.
        Saves out to {token_output_dirpath}_gold_coref/token_fpath.

    Args:
        coref_annotations_dirpath: dirpath containing gold coref annotations
        coref_annotations_ext: file extension of the annotation files
    """
    # Load gold mentions
    gold = Annotation(coref_annotations_dirpath, self.fandom_fname,
                      file_ext=coref_annotations_ext,
                      fic_csv_dirpath=self.fic_csv_dirpath)
    gold.extract_annotated_spans()

    # Build character name to id dictionary for gold characters
    # (ids assigned arbitrarily, in first-seen order)
    self.char_name2id = defaultdict(lambda: len(self.char_name2id))

    # Clear existing character coref annotations
    self.token_data['characterId'] = -1

    # Modify original tokens file with the gold spans
    for span in gold.annotations:
        self.modify_coref_span(span)

    # Renumber BookNLP's own token IDs for re-running on modified output
    self.renumber_token_ids()

    # Save out. Use makedirs(exist_ok=True) instead of the racy
    # exists()+mkdir pair: no TOCTOU window, and missing parent
    # directories no longer raise FileNotFoundError.
    self.modified_token_output_dirpath = self.modified_token_output_dirpath.rstrip('/') + '_gold_coref'
    os.makedirs(self.modified_token_output_dirpath, exist_ok=True)
    self.modified_token_fpath = os.path.join(
        self.modified_token_output_dirpath,
        f'{self.fandom_fname}{self.token_file_ext}')
    self.token_data.to_csv(self.modified_token_fpath,
                           sep='\t', quoting=csv.QUOTE_NONE, index=False)
def evaluate_coref(self, fandom_fname, fic_representation, save=True):
    """ Evaluate character coreference for a fic against gold mentions.

    Args:
        fandom_fname: fandom filename identifying the fic
        fic_representation: object holding predicted mentions
            (populated via extract_character_mentions)
        save: save AnnotatedSpan objects in a pickled file in a tmp directory

    Returns:
        coref_scores dict from scorer.coref_scores
    """
    # Load gold mentions for this fic
    gold = Annotation(self.coref_settings.gold_dirpath, fandom_fname,
                      file_ext=self.coref_settings.gold_ext,
                      fic_csv_dirpath=self.fic_csv_dirpath)
    gold.extract_annotated_spans()

    # Extract predicted mentions (also saves them out)
    fic_representation.extract_character_mentions(
        save_dirpath=self.coref_settings.preds_outpath)

    # Score predictions against gold with exact span matching
    coref_scores = scorer.coref_scores(
        fic_representation.character_mentions, gold.annotations,
        exact_match=True)

    # Report LEA metrics
    print('\tCoref results:')
    for metric_key in ('lea_f1', 'lea_precision', 'lea_recall'):
        print(f'\t\t{metric_key}: {coref_scores[metric_key]: .2%}')
    print()

    return coref_scores