def to_input_samples(self, fold: Optional[str] = None) -> List[InputSample]:
    """Read every matching file and convert its docs to ``InputSample`` objects.

    Files are discovered with ``self.files_path.glob(self.glob_pattern)``.
    Each selected file is read as UTF-8 text and handed to
    ``conll_ner_to_docs``; every doc it yields is turned into a sample via
    ``InputSample.from_spacy_doc``.

    Args:
        fold: Optional substring filter on the file name. When given (and
            non-empty), files whose name does not contain it are skipped.

    Returns:
        The samples from all selected files, in the order the files were
        yielded by the glob and the docs were yielded by the converter.

    Raises:
        FileNotFoundError: If no file matched both the glob pattern and the
            ``fold`` filter.
    """
    files_found = False
    input_samples = []

    for file_path in self.files_path.glob(self.glob_pattern):
        # Substring match on the file name; an empty/None fold disables it.
        if fold and fold not in file_path.name:
            continue

        files_found = True

        # One read replaces the readlines() + "".join round trip.
        text = file_path.read_text(encoding="utf-8")

        output_docs = conll_ner_to_docs(input_data=text, n_sents=None, no_print=True)
        for doc in tqdm(output_docs, f"Processing doc for file {file_path.name}"):
            input_samples.append(InputSample.from_spacy_doc(doc=doc))

    # Checked after the loop: a file that yields zero docs still counts as found.
    if not files_found:
        raise FileNotFoundError(
            f"No files found for pattern {self.glob_pattern} and fold {fold}"
        )

    return input_samples
def test_from_spacy_doc():
    """Check that ``InputSample.from_spacy_doc`` exposes spans and tags.

    A short sentence is run through the ``en_core_web_sm`` pipeline; the
    resulting sample's first span must be a PERSON, and the sample must carry
    seven tags with ``U-PERSON`` in the sixth position and ``O`` elsewhere.
    """
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline("Nice to meet you Mr. Perkins.")

    input_sample = InputSample.from_spacy_doc(parsed)

    first_span = input_sample.spans[0]
    assert first_span.entity_type == "PERSON"

    expected_tags = ["O", "O", "O", "O", "O", "U-PERSON", "O"]
    assert input_sample.tags == expected_tags