def BioC_Converter(Inputfile, Outputfile, Originalfile):
    # Collect the original passage texts, keyed by document and passage index.
    tiabs = {}
    with open(Originalfile, 'r', encoding='utf8') as file_Originalfile:
        collection = bioc.load(file_Originalfile)
        document_count = 0
        for document in collection.documents:
            passage_count = 0
            for passage in document.passages:
                if document_count not in tiabs:
                    tiabs[document_count] = {}
                tiabs[document_count][passage_count] = passage.text
                passage_count += 1
            document_count += 1

    # Copy the original text back into the annotated collection and re-derive
    # each annotation's text from its offsets.
    with open(Outputfile, 'w', encoding='utf8') as file_Outputfile:
        with open(Inputfile, 'r', encoding='utf8') as file_Inputfile:
            collection = bioc.load(file_Inputfile)
            document_count = 0
            for document in collection.documents:
                passage_count = 0
                for passage in document.passages:
                    passage.text = tiabs[document_count][passage_count]
                    for annotation in passage.annotations:
                        start = annotation.locations[0].offset
                        last = start + annotation.locations[0].length
                        annotation.text = tiabs[document_count][passage_count][start:last]
                    passage_count += 1
                document_count += 1
            bioc.dump(collection, file_Outputfile, pretty_print=False)

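# Hedged usage sketch for BioC_Converter above; the three file names are
# illustrative assumptions, not paths from the original project. The call
# copies the passage text of original.xml into annotated.xml and writes the
# result to restored.xml.
BioC_Converter('annotated.xml', 'restored.xml', 'original.xml')
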
def test_dump():
    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp, BioCFileType.BIOC_JSON)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    assert_everything(collection)

def test_dump(self):
    with open(self.src) as fp:
        collection = bioc.load(fp)
    tmp = tempfile.NamedTemporaryFile()
    with open(tmp.name, 'w') as fp:
        bioc.dump(collection, fp)
    with open(tmp.name) as fp:
        collection = bioc.load(fp)
    self.__test_collection(collection)

def create_prediction(source, dest, phrases_file, verbose=True):
    """
    Args:
        source: a list of source pathnames
        dest: output file name
        phrases_file: phrase pathname
    """
    with open(phrases_file) as fp:
        phrases = yaml.load(fp, yaml.FullLoader)
    total_findings = list(phrases.keys())

    rows = []
    cnt = collections.Counter()
    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose, unit='col'):
        with open(pathname, encoding='utf8') as fp:
            collection = bioc.load(fp)
        for doc in collection.documents:
            label_dict = aggregate(doc)
            # if doc.id == 's50351835':
            #     print(label_dict)
            label_vec = dict_to_vec(label_dict, total_findings)
            findings = collections.OrderedDict()
            findings['id'] = str(doc.id)
            for i, f in enumerate(total_findings):
                findings[f] = label_vec[i]
            rows.append(findings)

    rows = sorted(rows, key=lambda x: x['id'])
    row_df = pd.DataFrame(rows)
    row_df.to_csv(dest, index=None, float_format='%1.0f')
    logging.debug(cnt)

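# Hedged usage sketch for create_prediction above; the glob pattern, output CSV
# name, and phrases file are illustrative assumptions, not paths from the project.
import glob

create_prediction(sorted(glob.glob('bioc/*.xml')),
                  dest='predictions.csv',
                  phrases_file='phrases.yml')
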
def split(source, *, prefix: str, num_doc: int, additional_suffix: str = '.xml',
          suffix_length: int = 2):
    path_format = prefix + '{:0' + str(suffix_length) + 'x}' + additional_suffix
    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    newc = bioc.BioCCollection()
    newc.infons = collection.infons
    newc.source = collection.source
    newc.version = collection.version
    newc.standalone = collection.standalone

    i = 0
    for doc in tqdm.tqdm(collection.documents):
        newc.add_document(doc)
        if len(newc.documents) == num_doc:
            dst = path_format.format(i)
            with open(dst, 'w', encoding='utf8') as fp:
                bioc.dump(newc, fp)
            del newc.documents[:]
            i += 1
    if newc.documents:
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(newc, fp)

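# Hedged usage sketch for split above; the file names are illustrative
# assumptions. This would write chunks such as part00.xml, part01.xml, ...
# (the suffix is hexadecimal, per path_format) with up to 100 documents each.
split('big_collection.xml', prefix='part', num_doc=100)
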
def xml_decoder(self):
    """Load an XML BioC file and return its JSON serialization in a variable called output."""
    with open("foodbase/FoodBase_curated.xml", "r") as xml_file:
        collection = bioc.load(xml_file)
        output = bioc.dumps(collection, BioCFileType.BIOC_JSON, indent=2)
    return output

def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):
    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf8') as fout:
            collection = bioc.load(fin)
            for document in collection.documents:
                for passage in document.passages:
                    tag_result = bioTag(passage.text, biotag_dic, nn_model,
                                        onlyLongest=para_set['onlyLongest'],
                                        abbrRecog=para_set['abbrRecog'],
                                        Threshold=para_set['ML_Threshold'])
                    mention_num = 0
                    for ele in tag_result:
                        bioc_note = bioc.BioCAnnotation()
                        bioc_note.id = str(mention_num)
                        mention_num += 1
                        bioc_note.infons['identifier'] = ele[2]
                        bioc_note.infons['type'] = "Phenotype"
                        bioc_note.infons['score'] = ele[3]
                        start = int(ele[0])
                        last = int(ele[1])
                        loc = bioc.BioCLocation(offset=str(start), length=str(last - start))
                        bioc_note.locations.append(loc)
                        bioc_note.text = passage.text[start:last]
                        passage.annotations.append(bioc_note)
            bioc.dump(collection, fout, pretty_print=True)

def read_word_based_annotations(infile, textfile, prediction_file=False):
    with open(infile, 'r') as fin:
        try:
            collection = bioc.load(fin)
        except:
            logging.error('BioC file {0} not well formed'.format(infile))
            raise
    with open(textfile, 'r') as fin:
        text = fin.read()

    assert len(collection.documents) != 0, \
        "Each document should be encoded in its own collection"

    annotations, seen_anns = {}, set()
    for passage in collection.documents[0].passages:
        for annotation in passage.annotations:
            assert annotation.id not in seen_anns, \
                'Duplicate annotation id found. Please verify {0}'.format(annotation.id)
            seen_anns.add(annotation.id)  # record the id so later duplicates trigger the assert
            if annotation.infons['type'] in ANNOTATION_TYPE:
                word_annotations = split_annotations(annotation.id,
                                                     annotation.infons['type'],
                                                     annotation.locations[0].offset,
                                                     annotation.locations[0].length,
                                                     text)
                for word_id, word_annotation in word_annotations:
                    annotations[word_id] = word_annotation
    annotation_map = {ann: key for key, ann in annotations.items()}
    return annotations, annotation_map

def test_chexpert_extractor():
    extractor = RegExExtractor(__tests_dir / 'data/patterns/chexpert_phrases.yml',
                               'CheXpert labeler')
    dir = get_example_dir()
    with open(dir / '1.chexpert.xml') as fp:
        c = bioc.load(fp)

    actual_documents = c.documents
    expected_documents = []
    for doc in actual_documents:
        doc = copy.deepcopy(doc)
        for p in doc.passages:
            del p.annotations[:]
        expected_documents.append(doc)

    for expected_doc, actual_doc in zip(expected_documents, actual_documents):
        extractor(expected_doc)
        expected_anns = sorted(list(bioc.annotations(expected_doc, bioc.PASSAGE)),
                               key=lambda a: a.total_span.offset)
        actual_anns = sorted(list(bioc.annotations(actual_doc, bioc.PASSAGE)),
                             key=lambda a: a.total_span.offset)
        assert len(expected_anns) == len(actual_anns), \
            '{} vs {}'.format(len(expected_anns), len(actual_anns))
        for expected_ann, actual_ann in zip(expected_anns, actual_anns):
            assert expected_ann.total_span == actual_ann.total_span
            for k in ['observation', 'annotator']:
                assert expected_ann.infons[k] == actual_ann.infons[k]

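# Small hedged sketch of the bioc.annotations(...) helper used in the test
# above: it iterates over annotations at a chosen level (here, passage level).
# The file name is an illustrative assumption.
with open('example.xml', encoding='utf8') as fp:
    collection = bioc.load(fp)
for doc in collection.documents:
    for ann in bioc.annotations(doc, bioc.PASSAGE):
        print(doc.id, ann.total_span.offset, ann.text)
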
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/patterns/neg_patterns.txt')
    argv = get_absolute_path(argv, '--uncertainty-patterns',
                             'negbio/patterns/uncertainty_patterns.txt')

    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    if argv['--cuis'] == 'None':
        cuis = None
    else:
        cuis = read_cuis(argv['--cuis'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis)

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)

def test_toJSON():
    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    obj = toJSON(collection)
    assert obj['documents'][0]['id'] == '1'

    with pytest.raises(TypeError):
        toJSON({})

def test_dump():
    collection = _get_collection()
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp)
    assert_everything(collection)

def add_text(objs: List[OneFigure], bioc_dir):
    for obj in objs:
        pmcid = obj.pmcid
        with open(bioc_dir / f'{pmcid}.xml', encoding='utf8') as fp:
            collection = bioc.load(fp)
        for doc in collection.documents:
            get_figure_caption(obj, doc)
    return objs

def loadDataFromBioC(filename, ignoreEntities=[]):
    with open(filename, 'r') as fp:
        collection = bioc.load(fp)
    assert isinstance(collection, bioc.BioCCollection)

    parsed = []
    for document in collection.documents:
        parsed += convertBiocDocToKindredDocs(document)
    return parsed

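# Hedged usage sketch for loadDataFromBioC above; the file name is an
# illustrative assumption, and the returned list holds whatever
# convertBiocDocToKindredDocs produces for each BioC document.
docs = loadDataFromBioC('corpus.bioc.xml')
print(len(docs))
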
def add_text(objs: List[OneArticle], bioc_dir):
    for obj in objs:
        pmcid = obj.pmcid
        with open(bioc_dir / f'{pmcid}.xml', encoding='utf8') as fp:
            collection = bioc.load(fp)
        for doc in collection.documents:
            # split sentences
            # doc = split_sentences(doc)
            for figure in obj.figures.values():
                get_figure_caption(figure, doc)
    return objs

def parse(self, filename, output_dir=tempfile.mkdtemp()):
    # Note: the default output_dir is created once, when the function is defined.
    # Read file and do preliminary pre-processing to form rows of records
    data_rows = []
    with open(filename, 'r') as fp:
        collection = bioc.load(fp)
        for doc in collection.documents:
            rows_x = self.convert_bioc_document_to_rows(doc)
            data_rows.extend(rows_x)
    # subset
    # data_rows = data_rows[1:100]
    return data_rows

def test_text_to_collection_file():
    text = 'No pneumothorax.'
    input = tempfile.mktemp()
    with open(input, 'w') as fp:
        fp.write(text)

    output = tempfile.mktemp()
    text_to_collection_file(output, input)
    with open(output) as fp:
        c = bioc.load(fp)
    assert c.documents[0].passages[0].text == text

def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--mention_phrases_dir',
                             'negbio/chexpert/phrases/mention')
    argv = get_absolute_path(argv, '--unmention_phrases_dir',
                             'negbio/chexpert/phrases/unmention')
    argv = get_absolute_path(argv, '--pre-negation-uncertainty-patterns',
                             'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
    argv = get_absolute_path(argv, '--post-negation-uncertainty-patterns',
                             'negbio/chexpert/patterns/post_negation_uncertainty.txt')
    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/chexpert/patterns/negation.txt')

    # chexpert
    loader = NegBioLoader()
    extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
                                Path(argv['--unmention_phrases_dir']),
                                verbose=argv['--verbose'])
    neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'],
                                    argv['--neg-patterns'],
                                    argv['--post-negation-uncertainty-patterns'])
    aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep,
             neg_detector, aggregator, verbose=argv['--verbose'])

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)

def test_scan_collection():
    filenames = create_collections()
    output_dir = tempfile.mkdtemp()
    os.rmdir(output_dir)

    p = NegBioPipeline([('fake', FakePipe())])
    p.scan(source=filenames, directory=output_dir, suffix='.xml')

    for filename in filenames:
        filename = os.path.join(output_dir, os.path.basename(filename))
        with open(filename) as fp:
            c = bioc.load(fp)
        for doc in c.documents:
            assert doc.infons['fake']

def test_BioCXMLDocumentWriter_file():
    collection = _get_collection()
    tmp = tempfile.mktemp()
    with bioc.BioCXMLDocumentWriter(tmp) as writer:
        writer.write_collection_info(collection)
        for document in collection.documents:
            writer.write_document(document)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp)
    assert_everything(collection)

def test_split_file(tmp_path):
    total_doc = 230
    n = 7
    c = get_collection(total_doc)
    source = tmp_path / 'foo.xml'
    with open(source, 'w') as fp:
        bioc.dump(c, fp)

    split.split_file(source, prefix=str(tmp_path), num_doc=n)

    # Every full chunk should contain exactly n documents.
    for i in range(int(total_doc / n)):
        source = str(tmp_path) + '{:02x}.xml'.format(i)
        with open(source) as fp:
            subc = bioc.load(fp)
        assert len(subc.documents) == n

    # The last chunk holds the remainder, if any.
    last_n = int(math.ceil(total_doc / n))
    if last_n > int(total_doc / n):
        source = str(tmp_path) + '{:02x}.xml'.format(last_n - 1)
        with open(source) as fp:
            subc = bioc.load(fp)
        assert len(subc.documents) == total_doc % n

def _process_pubtator_files(files: List[Path], q: mp.Queue, pickle_path: Path):
    for file in files:
        partial_index = {}
        with file.open() as f:
            collection = bioc.load(f)
        for i, document in enumerate(collection.documents):
            pmid, is_fulltext = get_pmid(document)
            partial_index[pmid] = (file.name, i, is_fulltext)
        q.put(partial_index)
        if pickle_path is not None:
            with open(pickle_path / file.with_suffix(".pkl").name, "wb") as f:
                pickle.dump(collection, f)

def get_figure_link(biocfile) -> List[str]:
    try:
        with open(biocfile, 'r', encoding='utf8') as fp:
            c = bioc.load(fp)
    except:
        return []

    figures = []
    for doc in c.documents:
        for p in doc.passages:
            if len(p.text) == 0:
                continue
            p.text = p.text.replace('\n', ' ')
            if 'file' in p.infons and 'type' in p.infons and p.infons['type'] in FIG_PASSAGE:
                figures.append(p.infons["file"])
    return figures

def __load_collection_xml(bioc_xml: str, is_file: bool = True):
    """Load an XML BioC collection and return a bioc collection object.

    :param bioc_xml: a str path to a BioC file or a BioC XML string
    :param is_file: if True, bioc_xml is a path; otherwise it is a string
    :returns: a bioc collection object
    """
    if is_file:
        with open(bioc_xml, 'r') as fp:
            collection = bioc.load(fp)
        return collection
    else:
        collection = bioc.loads(bioc_xml)
        return collection

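# Hedged usage sketch for __load_collection_xml above; 'collection.xml' is an
# assumed path. With is_file=False the argument would instead be parsed as a
# raw BioC XML string via bioc.loads.
collection = __load_collection_xml('collection.xml')
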
def split_file(source, *, prefix: str, num_doc: int, additional_suffix: str = '.xml',
               suffix_length: int = 2):
    path_format = prefix + '{:0' + str(suffix_length) + 'x}' + additional_suffix
    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)
    for i, subc in tqdm.tqdm(enumerate(itersplit(collection, num_doc))):
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(subc, fp)

def get_figure_link(pmc, bioc_file):
    with open(bioc_file, encoding='utf8') as fp:
        c = bioc.load(fp)

    figures = []
    for doc in c.documents:
        for p in doc.passages:
            if len(p.text) == 0:
                continue
            p.text = p.text.replace('\n', ' ')
            if 'file' in p.infons and 'type' in p.infons and p.infons['type'] in FIG_PASSAGE:
                url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{p.infons["file"]}'
                caption = p.text.replace('\n', ' ')
                f = Figure(pmc, url, caption)
                figures.append(f)
    return figures

def test_level():
    with pytest.raises(ValueError):
        BioCJsonIterWriter(io.StringIO(), level=-1)

    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)

    with pytest.raises(ValueError):
        writer = BioCJsonIterWriter(io.StringIO(), level=bioc.SENTENCE)
        writer.write(collection.documents[0])
    with pytest.raises(ValueError):
        writer = BioCJsonIterWriter(io.StringIO(), level=bioc.PASSAGE)
        writer.write(collection.documents[0])
    with pytest.raises(ValueError):
        writer = BioCJsonIterWriter(io.StringIO(), level=bioc.DOCUMENT)
        writer.write(collection.documents[0].passages[0])

def get_figure_text(src1, src2, dest, history_file, bioc_dir):
    df1 = pd.read_csv(src1, dtype=str)
    df2 = pd.read_csv(src2, dtype=str)
    df = pd.concat([df1, df2], axis=0)

    figures = create_figures(df, history_file=history_file)

    docs = {}  # type: Dict[str, bioc.BioCDocument]
    for figure in figures:
        pmcid = figure.pmcid
        if pmcid not in docs:
            src = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            with open(src) as fp:
                collection = bioc.load(fp)
            docs[pmcid] = collection.documents[0]
        add_text(figure, docs[figure.pmcid])

    with open(dest, 'w', encoding='utf8') as fp:
        objs = [f.to_dict() for f in figures]
        json.dump(objs, fp, indent=2)

def scan_document(*_, **kwargs):
    """
    Scan each document in a list of BioC source files, apply fn, and print to directory.

    Args:
        kwargs:
            source(list): a list of source pathnames
            directory(str): output directory
            suffix(str): suffix of the output file names
            fn: fn should expect the following arguments in this given order:
                sequence1 sequence2 ... non_sequence1 non_sequence2 ...
            verbose(boolean):
    """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', [])

    if not os.path.exists(directory):
        os.makedirs(directory)

    def catch(document, non_sequences):
        try:
            return fn(document, *non_sequences)
        except:
            logging.exception('Cannot process %s', document.id)

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with io.open(pathname, encoding='utf8') as fp:
            collection = bioc.load(fp)
        collection.documents = [
            catch(doc, non_sequences) for doc in collection.documents
        ]
        with io.open(dstname, 'w', encoding='utf8') as fp:
            bioc.dump(collection, fp)

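# Hedged usage sketch for scan_document above; the file list, output directory,
# and the per-document function are illustrative assumptions.
def mark_processed(document):
    document.infons['processed'] = 'true'
    return document

scan_document(source=['a.xml', 'b.xml'],
              directory='out',
              suffix='.processed.xml',
              fn=mark_processed,
              non_sequences=[])
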
def read_prediction(infile):
    with open(infile, 'r') as fin:
        try:
            collection = bioc.load(fin)
        except:
            logging.error('BioC file {0} not well formed'.format(infile))
            raise

    assert len(collection.documents) != 0, \
        "Each document should be encoded in its own collection"

    annotations, relations = {}, {}
    for passage in collection.documents[0].passages:
        for annotation in passage.annotations:
            assert annotation.id not in annotations, \
                'Duplicate annotation id found. Please verify {0}'.format(annotation.id)
            if annotation.infons['type'] in ANNOTATION_TYPE:
                annotations[annotation.id] = (annotation.infons['type'],
                                              annotation.locations[0].offset,
                                              annotation.locations[0].length)

    for passage in collection.documents[0].passages:
        for relation in passage.relations:
            assert relation.id not in relations, \
                'Duplicate relation id found. Please verify {0}'.format(relation.id)
            if relation.infons['type'] in RELATION_TYPE:
                if (relation.nodes[0].refid in annotations
                        and relation.nodes[1].refid in annotations):
                    annotation1 = annotations[relation.nodes[0].refid]
                    annotation2 = annotations[relation.nodes[1].refid]
                    relations[relation.id] = (relation.infons['type'],
                                              annotation1, annotation2)
                else:
                    # Disregarding relations that have illegal annotation ids.
                    logging.debug(
                        'Disregarding relation id {0} from file {1} because '
                        'annotation entries are not valid'.format(relation.id, infile))
    return annotations, relations

def scan_collection(*_, **kwargs):
    """
    Scan each collection in a list of BioC source files, apply fn, and print to directory.

    Args:
        kwargs:
            source(list): a list of source pathnames
            directory(str): output directory
            suffix(str): suffix of the output file names
            fn: fn should expect the following arguments in this given order:
                sequence1 sequence2 ... non_sequence1 non_sequence2 ...
            verbose(boolean):
    """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', None)
    if non_sequences is None:
        non_sequences = []

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with open(pathname) as fp:
            collection = bioc.load(fp)
        try:
            args = [collection] + non_sequences
            fn(*args)
        except:
            logging.exception('Cannot process %s', collection.source)
        with open(dstname, 'w') as fp:
            bioc.dump(collection, fp)

def test_validate(self):
    with open(self.src) as fp:
        collection = bioc.load(fp)
    bioc.validate(collection)

def test_dumps(self):
    with open(self.src) as fp:
        collection = bioc.load(fp)
    s = bioc.dumps(collection)
    collection = bioc.loads(s)
    self.__test_collection(collection)

def test_load(self):
    with open(self.src) as fp:
        collection = bioc.load(fp)
    self.__test_collection(collection)