from pathlib import Path
from typing import List, Optional, Type, Union

from torch.utils.data import Dataset

import flair
from flair.data import DataPair, DataPoint, Sentence, Token
from flair.datasets import ColumnCorpus
from flair.training_utils import Result

# NOTE: `_iter_dataset` (a small private Flair helper that yields the data
# points of a Dataset) is assumed to be in scope for all snippets below.


def store_embeddings(data_points: Union[List[DataPoint], Dataset], storage_mode: str):
    if isinstance(data_points, Dataset):
        data_points = list(_iter_dataset(data_points))

    # if memory mode option 'none' delete everything
    if storage_mode == "none":
        delete_keys = None

    # else delete only dynamic embeddings (otherwise autograd will keep everything in memory)
    else:
        # find out which ones are dynamic embeddings
        delete_keys = []
        data_point = data_points[0]

        if isinstance(data_point, Sentence):
            first_token = data_point[0]
            for name, vector in first_token._embeddings.items():
                if vector.requires_grad:
                    delete_keys.append(name)

        for name, vector in data_point._embeddings.items():
            if vector.requires_grad:
                delete_keys.append(name)

    # delete the identified dynamic embeddings (a value of None deletes everything)
    for data_point in data_points:
        data_point.clear_embeddings(delete_keys)

    # memory management - option 1: send everything to CPU (pin to memory if we train on GPU)
    if storage_mode == "cpu":
        pin_memory = str(flair.device) != "cpu"
        for data_point in data_points:
            data_point.to("cpu", pin_memory=pin_memory)

    # record current embedding storage mode to allow optimization (for instance in FlairEmbeddings class)
    flair.embedding_storage_mode = storage_mode
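
# Minimal usage sketch (not from the source): a training loop would typically
# call store_embeddings after each batch so that gradient-carrying ("dynamic")
# embeddings are released instead of accumulating autograd history. The toy
# sentences below are assumptions for illustration.
sentences = [Sentence("Berlin is a city ."), Sentence("I love Paris .")]
# ... embed the sentences / run the forward pass here ...
store_embeddings(sentences, storage_mode="cpu")   # keep static embeddings, but move them off the GPU
store_embeddings(sentences, storage_mode="none")  # or: drop all stored embeddings
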
def evaluate(
    self,
    data_points: Union[List[Sentence], Dataset],
    gold_label_type: str,
    out_path: Optional[Union[str, Path]] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 1,  # unnecessary, but trainer.train calls evaluate with this parameter
    num_workers: Optional[int] = 8,
    **kwargs,
) -> Result:
    if isinstance(data_points, Dataset):
        data_points = list(_iter_dataset(data_points))

    # dispatch to the regression or the classification evaluation routine
    if self.regression:
        return self.evaluate_regression(
            sentences=data_points,
            out_path=out_path,
            embedding_storage_mode=embedding_storage_mode,
        )

    return self.evaluate_classification(
        sentences=data_points,
        out_path=out_path,
        embedding_storage_mode=embedding_storage_mode,
    )
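
# Hedged usage sketch (not from the source): calling evaluate on a trained
# model. The checkpoint path, the label type, and the existence of a `corpus`
# object with a test split are assumptions for illustration.
from flair.models import TextClassifier

model = TextClassifier.load("path/to/model.pt")  # hypothetical checkpoint
result = model.evaluate(corpus.test, gold_label_type="class", mini_batch_size=32)
print(result.main_score)
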
def test_sanity_not_too_many_entities(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    n_entities_per_sentence = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        n_entities_per_sentence.append(len(entities))
    avg_entities_per_sentence = sum(n_entities_per_sentence) / len(n_entities_per_sentence)

    assert avg_entities_per_sentence <= 5
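
# Sketch (an assumption, not from the source): these sanity checks take a
# corpus class as an argument, so they are naturally driven by pytest
# parametrization. WNUT_17 is just one example of a corpus that could be
# plugged in; the same pattern applies to the other sanity tests below.
import pytest
from flair.datasets import WNUT_17

@pytest.mark.parametrize("CorpusType", [WNUT_17])
def test_entity_count_sanity(CorpusType):
    test_sanity_not_too_many_entities(CorpusType)
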
def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
    folder_path = Path(folder_path)
    folder_path = folder_path / "WNLI.tsv"

    with open(folder_path, mode="w") as tsv_file:
        tsv_file.write("index\tprediction\n")
        datapoint: DataPair
        for index, datapoint in enumerate(_iter_dataset(self.eval_dataset)):
            tsv_file.write(str(index) + "\t" + datapoint.get_labels("entailment")[0].value + "\n")
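
# Hedged usage sketch (assumed workflow): after predicting labels on the eval
# split, write the GLUE submission file. GLUE_WNLI is taken here as the Flair
# dataset class carrying this method (treat as an assumption); the output
# folder is arbitrary. The same pattern applies to the other writers below.
from flair.datasets import GLUE_WNLI

corpus = GLUE_WNLI()
# ... run classifier.predict(...) over corpus.eval_dataset here ...
corpus.tsv_from_eval_dataset("glue_submission/")
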
def test_sanity_no_long_entities(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    longest_entity: List[str] = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        for entity in entities:
            if len(entity.tokens) > len(longest_entity):
                longest_entity = [t.text for t in entity.tokens]

    assert len(longest_entity) < 10, " ".join(longest_entity)
def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
    folder_path = Path(folder_path)
    folder_path = folder_path / "MRPC.tsv"

    with open(folder_path, mode="w") as tsv_file:
        tsv_file.write("index\tprediction\n")
        datapoint: DataPair
        for index, datapoint in enumerate(_iter_dataset(self.test)):
            label = datapoint.get_labels("paraphrase")[0].value
            tsv_file.write(str(index) + "\t" + label + "\n")
def test_sanity_no_unmatched_parentheses(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    unbalanced_entities = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        for entity in entities:
            entity_text = "".join(t.text for t in entity.tokens)
            if not has_balanced_parantheses(entity_text):
                unbalanced_entities.append(entity_text)

    assert unbalanced_entities == []
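
# `has_balanced_parantheses` is used above but not shown here; a minimal
# sketch of what such a check could look like (an assumption, not the source
# implementation - the identifier's spelling is kept as the test uses it):
def has_balanced_parantheses(text: str) -> bool:
    depth = 0
    for char in text:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        if depth < 0:  # a closing parenthesis appeared before its opener
            return False
    return depth == 0
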
def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
    folder_path = Path(folder_path)
    glue_eval_tsv = "MNLI-m.tsv" if self.evaluate_on_matched else "MNLI-mm.tsv"
    folder_path = folder_path / glue_eval_tsv

    with open(folder_path, mode="w") as tsv_file:
        tsv_file.write("index\tprediction\n")
        datapoint: DataPair
        for index, datapoint in enumerate(_iter_dataset(self.eval_dataset)):
            label = datapoint.get_labels("textual_entailment")[0].value
            tsv_file.write(str(index) + "\t" + label + "\n")
def test_sanity_not_starting_with_minus(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    entities_starting_with_minus = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        for entity in entities:
            if str(entity.tokens[0].text).startswith("-"):
                entities_starting_with_minus.append(" ".join(t.text for t in entity.tokens))

    assert len(entities_starting_with_minus) == 0, "|".join(entities_starting_with_minus)
import json


def jsonl_from_eval_dataset(self, folder_path: Union[str, Path]):
    folder_path = Path(folder_path)
    folder_path = folder_path / "RTE.jsonl"

    with open(folder_path, mode="w") as jsonl_file:
        datapoint: DataPair
        for index, datapoint in enumerate(_iter_dataset(self.eval_dataset)):
            entry = {
                "idx": index,
                "label": datapoint.get_labels("textual_entailment")[0].value,
            }
            # serialize with json.dumps: str(entry) would emit a single-quoted
            # Python repr, which is not valid JSON Lines
            jsonl_file.write(json.dumps(entry) + "\n")
def test_sanity_no_repeating_Bs(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    longest_repeat_tokens: List[Token] = []
    repeat_tokens: List[Token] = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        for token in sentence.tokens:
            # "B-" and "S-" tags both open a new span in BIOES-style schemes
            if token.get_labels()[0].value.startswith("B") or token.get_labels()[0].value.startswith("S"):
                repeat_tokens.append(token)
            else:
                if len(repeat_tokens) > len(longest_repeat_tokens):
                    longest_repeat_tokens = repeat_tokens
                repeat_tokens = []

    assert len(longest_repeat_tokens) < 4
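
# Hedged illustration of the rule the test encodes, on plain label strings
# (toy data, not from the source): a long uninterrupted run of span-starts
# usually signals broken column data.
labels = ["B-PER", "B-PER", "S-ORG", "B-LOC", "O"]
run, longest = 0, 0
for value in labels:
    if value.startswith(("B", "S")):
        run += 1
        longest = max(longest, run)
    else:
        run = 0
assert longest == 4  # a run of four span-starts would trip the sanity check
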
def store_embeddings(
    data_points: Union[List[DataPoint], Dataset],
    storage_mode: str,
    dynamic_embeddings: Optional[List[str]] = None,
):
    if isinstance(data_points, Dataset):
        data_points = list(_iter_dataset(data_points))

    # if memory mode option 'none' delete everything
    if storage_mode == "none":
        dynamic_embeddings = None

    # if dynamic embedding keys not passed, identify them automatically
    elif not dynamic_embeddings:
        dynamic_embeddings = identify_dynamic_embeddings(data_points[0])

    # always delete dynamic embeddings
    for data_point in data_points:
        data_point.clear_embeddings(dynamic_embeddings)

    # if storage mode is "cpu", send everything to CPU (pin to memory if we train on GPU)
    if storage_mode == "cpu":
        pin_memory = str(flair.device) != "cpu"
        for data_point in data_points:
            data_point.to("cpu", pin_memory=pin_memory)
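
# `identify_dynamic_embeddings` is referenced above but not shown. A minimal
# sketch of what it plausibly does, reconstructed from the inline logic that
# the earlier store_embeddings version carried (treat as an assumption, not
# the exact Flair implementation):
def identify_dynamic_embeddings(data_point: DataPoint) -> List[str]:
    dynamic_embeddings = []

    # token-level embeddings: checking the first token mirrors the earlier
    # inline logic, which assumed all tokens carry the same embedding names
    if isinstance(data_point, Sentence):
        first_token = data_point[0]
        for name, vector in first_token._embeddings.items():
            if vector.requires_grad:  # gradient-carrying embeddings are "dynamic"
                dynamic_embeddings.append(name)

    # data-point-level embeddings (e.g. document embeddings)
    for name, vector in data_point._embeddings.items():
        if vector.requires_grad:
            dynamic_embeddings.append(name)

    return dynamic_embeddings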