def load(self):
    """Load and clean the reports.

    Reads the CSV at ``self.reports_path`` with ``header=None`` and the
    column name ``REPORTS``, then builds a BioC collection holding one
    document per report. Each report is cleaned with ``self.clean`` and
    wrapped in a document whose id is the report's position in the list.
    When ``self.extract_impression`` is truthy the document is section-split
    and handed to ``self.extract_impression_from_passages`` (presumably
    trimming it to the Impression section -- confirm against that method).
    Finally the document is passed through ``self.splitter.split_doc``.

    On success the raw report list is stored in ``self.reports`` and the
    collection in ``self.collection``. Neither attribute is assigned if any
    document fails the single-passage check.

    Raises:
        AssertionError: If a processed document does not contain exactly
            one passage.
    """
    collection = bioc.BioCCollection()
    reports = pd.read_csv(
        self.reports_path, header=None, names=[REPORTS]
    )[REPORTS].tolist()

    for i, report in enumerate(reports):
        clean_report = self.clean(report)
        document = text2bioc.text2document(str(i), clean_report)

        if self.extract_impression:
            document = section_split.split_document(document)
            self.extract_impression_from_passages(document)

        split_document = self.splitter.split_doc(document)

        # Raised explicitly instead of via ``assert`` so the check still
        # runs under ``python -O``. AssertionError is kept so callers see
        # the same exception type and message as before.
        if len(split_document.passages) != 1:
            raise AssertionError(
                'Each document must have a single passage, '
                'the Impression section.'
            )

        collection.add_document(split_document)

    self.reports = reports
    self.collection = collection
def get_bioc_collection(df):
    """Build a BioC collection from the ``"Report Impression"`` column.

    Each entry of ``df["Report Impression"]`` becomes one BioC document
    whose id is the entry's enumeration index as a string. Every document
    is run through a freshly constructed ``NegBioSSplitter`` before it is
    added to the collection.

    Args:
        df: Object indexable by ``"Report Impression"`` that yields the
            report texts (presumably a pandas DataFrame -- confirm with
            callers).

    Returns:
        The populated ``bioc.BioCCollection``.
    """
    bioc_collection = bioc.BioCCollection()
    ssplitter = NegBioSSplitter()

    for index, impression_text in enumerate(df["Report Impression"]):
        raw_doc = text2bioc.text2document(str(index), impression_text)
        bioc_collection.add_document(ssplitter.split_doc(raw_doc))

    return bioc_collection
def load(self):
    """Build ``self.collection`` from the items of ``self.report``.

    Each item of ``self.report`` is cleaned with ``self.clean``, wrapped in
    a BioC document whose id is the item's position as a string, and passed
    through ``self.splitter.split_doc``. The resulting documents are
    gathered into a new ``bioc.BioCCollection``.

    ``self.collection`` is assigned only after every item has been
    processed successfully.

    Raises:
        AssertionError: If a processed document does not contain exactly
            one passage.
    """
    collection = bioc.BioCCollection()

    for i, sentence in enumerate(self.report):
        clean_report = self.clean(sentence)
        document = text2bioc.text2document(str(i), clean_report)
        split_document = self.splitter.split_doc(document)

        # Raised explicitly instead of via ``assert`` so the check still
        # runs under ``python -O``. AssertionError is kept so callers see
        # the same exception type and message as before.
        if len(split_document.passages) != 1:
            raise AssertionError(
                'Each document must have a single passage, '
                'the Impression section.'
            )

        collection.add_document(split_document)

    self.collection = collection
def prep_collection(self):
    """Apply splitter and create bioc collection.

    Iterates over ``self.reports``. Each report is cleaned with
    ``self.clean`` and wrapped in a BioC document whose id is the report's
    position as a string. When ``self.extract_impression`` is truthy the
    document is section-split and handed to
    ``self.extract_impression_from_passages`` (presumably trimming it to
    the Impression section -- confirm against that method). The document is
    then passed through ``self.splitter.split_doc`` and added to a new
    ``bioc.BioCCollection``.

    ``self.collection`` is assigned only after every report has been
    processed successfully.

    Raises:
        AssertionError: If a processed document does not contain exactly
            one passage.
    """
    collection = bioc.BioCCollection()

    for i, report in enumerate(self.reports):
        clean_report = self.clean(report)
        document = text2bioc.text2document(str(i), clean_report)

        if self.extract_impression:
            document = section_split.split_document(document)
            self.extract_impression_from_passages(document)

        split_document = self.splitter.split_doc(document)

        # Raised explicitly instead of via ``assert`` so the check still
        # runs under ``python -O``. AssertionError is kept so callers see
        # the same exception type and message as before.
        if len(split_document.passages) != 1:
            raise AssertionError(
                'Each document must have a single passage, '
                'the Impression section.'
            )

        collection.add_document(split_document)

    self.collection = collection