def get_documents_from_api(self, pmids):
    """Yield documents for ``pmids`` chunk by chunk, serving cached ones first.

    Documents already present in ``self._document_cache`` are yielded first,
    in lists of at most ``self.CHUNK_SIZE`` items. The remaining ids are
    requested from the PubTator BioC-XML export endpoint in chunks of the
    same size; each downloaded chunk is yielded and then handed to
    ``self.cache_documents``.

    A progress bar is only created when more than ``self.CHUNK_SIZE`` ids are
    requested, so every progress update has to tolerate its absence.

    :param pmids: iterable of PubMed ids; they are joined with ``","`` for
        the request, so they must be strings
    :return: generator yielding lists of documents
    """
    service_root = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml"
    pmids = list(pmids)

    if len(pmids) > self.CHUNK_SIZE:
        pbar = tqdm(desc="Reading", total=len(pmids))
    else:
        pbar = None

    try:
        cached_pmids = [i for i in pmids if i in self._document_cache]
        for pmid_chunk in chunks(cached_pmids, self.CHUNK_SIZE):
            # pbar is None for small requests; updating it unguarded raised
            # AttributeError as soon as a single pmid was cached.
            if pbar is not None:
                pbar.update(len(pmid_chunk))
            yield [self._document_cache[i] for i in pmid_chunk]

        # Set membership keeps this filter linear in the number of pmids.
        cached_lookup = set(cached_pmids)
        uncached_pmids = [i for i in pmids if i not in cached_lookup]

        # The PMCID mapping is switched off, so the mapping stays empty and
        # every uncached id is fetched by its PMID.
        # pmid_to_pmcid = self.maybe_map_to_pmcid(uncached_pmids)
        pmid_to_pmcid = {}

        pmids_to_retrieve = [
            i for i in uncached_pmids if i not in pmid_to_pmcid
        ]
        pmcids_to_retrieve = [
            pmid_to_pmcid[i] for i in uncached_pmids if i in pmid_to_pmcid
        ]

        for pmid_chunk in chunks(pmids_to_retrieve, self.CHUNK_SIZE):
            result = requests.get(
                service_root,
                params={
                    "pmids": ",".join(pmid_chunk),
                    "concepts": "gene,chemical",
                },
            )
            collection = bioc.loads(result.content.decode())
            yield collection.documents
            if pbar is not None:
                pbar.update(len(pmid_chunk))
            self.cache_documents(collection.documents)

        for pmcid_chunk in chunks(pmcids_to_retrieve, self.CHUNK_SIZE):
            result = requests.get(
                service_root,
                params={
                    "pmcids": ",".join(pmcid_chunk),
                    "concepts": "gene",
                },
            )
            collection = bioc.loads(result.content.decode())
            yield collection.documents
            if pbar is not None:
                pbar.update(len(pmcid_chunk))
            self.cache_documents(collection.documents)
    finally:
        # Also runs when the consumer abandons the generator early, so the
        # progress bar is always released.
        if pbar is not None:
            pbar.close()
def __decode(self, response):
    """
    Turn the encoded API response into a collection from which the tag
    information can be read.

    :param response: encoded response payload; it is decoded with ``ENCODING``
    :return: the collection built by ``bioc.loads`` from the decoded text
    """
    return bioc.loads(response.decode(ENCODING), ENCODING)
def test_BioCXMLDocumentWriter_io():
    """Write a collection document by document into memory and parse it back."""
    original = _get_collection()
    buffer = io.BytesIO()

    xml_writer = bioc.BioCXMLDocumentWriter(buffer)
    xml_writer.write_collection_info(original)
    for doc in original.documents:
        xml_writer.write_document(doc)
    xml_writer.close()

    # The writer produced bytes; loads() is given the decoded text.
    xml_text = buffer.getvalue().decode('utf-8')
    assert_everything(bioc.loads(xml_text))
def __load_collection_xml(bioc_xml: str, is_file: bool = True):
    """Load a BioC XML collection from a file path or from an XML string.

    :param bioc_xml: path to a BioC file, or the BioC XML text itself
    :param is_file: if True ``bioc_xml`` is opened as a path, otherwise it is
        parsed directly as a string
    :returns: the BioC collection object
    """
    if not is_file:
        return bioc.loads(bioc_xml)

    with open(bioc_xml, 'r') as xml_file:
        return bioc.load(xml_file)
def get_documents(self, pmids: List[str]):
    """Load the documents for ``pmids`` that are present in the local index.

    Ids missing from ``self.index`` are skipped without error. Each index
    entry supplies a file name (position 0) and a document position
    (position 1); the document is rebuilt by concatenating the first line of
    that file, line ``doc_idx + 1`` and the last line, then parsed with
    ``bioc.loads``. Presumably the first and last lines are the collection's
    opening and closing markup; confirm against the file format.

    :param pmids: ids to look up
    :return: a list holding a single list with the loaded documents
    """
    available_pmids = [i for i in pmids if i in self.index]

    documents = []
    # Remember the most recently read file so that consecutive ids stored in
    # the same file do not trigger a full re-read each time. Only one file is
    # held at once, so peak memory matches a plain per-id read.
    loaded_file = None
    lines = None
    for requested_pmid in available_pmids:
        entry = self.index[requested_pmid]
        file = entry[0]
        doc_idx = entry[1]

        if file != loaded_file:
            with open(self.path / file) as f:
                lines = f.readlines()
            loaded_file = file

        document = bioc.loads(lines[0] + lines[doc_idx + 1] + lines[-1]).documents[0]

        # Sanity check on the index: the parsed document must carry one of
        # the requested ids.
        document_pmid = get_pmid(document)[0]
        assert document_pmid in pmids
        documents.append(document)

    return [documents]
def _get_docs(self, pmids, q):
    """Load the documents for ``pmids`` and put the resulting list on ``q``.

    The time spent reading files and the time spent decoding documents are
    accumulated separately and printed before the result is queued.

    :param pmids: ids to load; each one must be a key of ``self.index``
    :param q: object with a ``put`` method that receives the document list
    """
    t_read = 0
    t_decode = 0
    docs = []

    for requested in tqdm(pmids, desc="Sending"):
        file_name, position = self.index[requested][0], self.index[requested][1]

        read_start = time()
        with open(self.path / file_name) as handle:
            file_lines = handle.readlines()
        t_read += time() - read_start

        decode_start = time()
        # First line + the document's own line + last line form one parseable unit.
        snippet = file_lines[0] + file_lines[position + 1] + file_lines[-1]
        document = bioc.loads(snippet).documents[0]
        t_decode += time() - decode_start

        found_pmid = get_pmid(document)[0]
        assert found_pmid in pmids
        docs.append(document)

    print(f"t_read: {t_read}s")
    print(f"t_decode: {t_decode}s")
    q.put(docs)
def test_dumps(self):
    """Check that the source collection survives a dumps/loads round trip."""
    with open(self.src) as source:
        loaded = bioc.load(source)
    round_tripped = bioc.loads(bioc.dumps(loaded))
    self.__test_collection(round_tripped)
def test_loads(self):
    """Check that ``bioc.loads`` parses the full text of the source file."""
    with open(self.src) as source:
        text = source.read()
    self.__test_collection(bioc.loads(text))
def get_bioc_file(filename):
    """Read a UTF-8 encoded BioC file and return the documents it contains.

    :param filename: path of the file to read
    :return: the ``documents`` of the collection parsed by ``bioc.loads``
    """
    # Built-in open() with an explicit encoding replaces the legacy
    # codecs.open(); newline='' disables newline translation so the text
    # reaches bioc.loads() exactly as stored on disk.
    with open(filename, 'r', encoding='UTF-8', newline='') as fp:
        data = fp.read()
    collection = bioc.loads(data)
    return collection.documents
def test_loads():
    """Parse the text of the file named by ``file`` and validate the result."""
    with open(file, encoding='utf8') as source:
        text = source.read()
    assert_everything(bioc.loads(text))
def test_dumps():
    """Round-trip the file named by ``file`` through BioC JSON dumps/loads."""
    json_type = BioCFileType.BIOC_JSON
    with open(file, encoding='utf8') as source:
        loaded = bioc.load(source, json_type)
    serialized = bioc.dumps(loaded, json_type)
    assert_everything(bioc.loads(serialized, json_type))
def test_dumps():
    """Serialize the sample collection, parse it back and validate the result."""
    serialized = bioc.dumps(_get_collection())
    assert_everything(bioc.loads(serialized))