Ejemplo n.º 1
0
    def get_documents_from_api(self, pmids):
        """Yield BioC documents for ``pmids``, one list per chunk.

        Documents already held in ``self._document_cache`` are yielded first,
        grouped by ``chunks(..., self.CHUNK_SIZE)``. Every remaining PMID is
        requested from the PubTator BioC-XML export endpoint; each response is
        parsed with ``bioc.loads``, its documents are yielded, and afterwards
        they are handed to ``self.cache_documents``.

        A progress bar is created only when more than ``self.CHUNK_SIZE``
        PMIDs are requested, and it is closed when the generator finishes or
        is closed early.

        :param pmids: iterable of PMIDs; they are joined with ``","`` to build
            the request, so they are expected to be strings.
        :return: generator yielding one list of documents per chunk.
        """
        service_root = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml"
        pmids = list(pmids)

        # Small requests get no progress bar, so every later use has to cope
        # with ``pbar`` being None.
        if len(pmids) > self.CHUNK_SIZE:
            pbar = tqdm(desc="Reading", total=len(pmids))
        else:
            pbar = None

        try:
            cached_pmids = [i for i in pmids if i in self._document_cache]

            for pmid_chunk in chunks(cached_pmids, self.CHUNK_SIZE):
                # The unguarded update used to raise AttributeError whenever a
                # small request (no progress bar) hit the cache.
                if pbar is not None:
                    pbar.update(len(pmid_chunk))
                yield [self._document_cache[i] for i in pmid_chunk]

            # Set lookup keeps the cached/uncached split linear in len(pmids).
            cached_lookup = set(cached_pmids)
            uncached_pmids = [i for i in pmids if i not in cached_lookup]

            # The PMID -> PMCID mapping is currently disabled, so every
            # uncached id is fetched by PMID and the PMCID loop below has
            # nothing to iterate over.
            # pmid_to_pmcid = self.maybe_map_to_pmcid(uncached_pmids)
            pmid_to_pmcid = {}

            pmids_to_retrieve = [
                i for i in uncached_pmids if i not in pmid_to_pmcid
            ]
            pmcids_to_retrieve = [
                pmid_to_pmcid[i] for i in uncached_pmids if i in pmid_to_pmcid
            ]

            for pmid_chunk in chunks(pmids_to_retrieve, self.CHUNK_SIZE):
                result = requests.get(service_root,
                                      params={
                                          "pmids": ",".join(pmid_chunk),
                                          "concepts": "gene,chemical"
                                      })
                collection = bioc.loads(result.content.decode())
                yield collection.documents
                if pbar is not None:
                    pbar.update(len(pmid_chunk))
                self.cache_documents(collection.documents)

            for pmcid_chunk in chunks(pmcids_to_retrieve, self.CHUNK_SIZE):
                result = requests.get(
                    service_root,
                    params={
                        "pmcids": ",".join(pmcid_chunk),
                        "concepts": "gene"
                    },
                )
                collection = bioc.loads(result.content.decode())
                yield collection.documents
                if pbar is not None:
                    pbar.update(len(pmcid_chunk))
                self.cache_documents(collection.documents)
        finally:
            # Release the progress bar even if the consumer stops early.
            if pbar is not None:
                pbar.close()
Ejemplo n.º 2
0
 def __decode(self, response):
     """
     Turn a raw API response into a BioC collection.

     The response is decoded with ``ENCODING`` and the resulting text is
     passed to ``bioc.loads`` together with ``ENCODING``.
     :param response: object exposing ``decode`` (e.g. the response bytes)
     :return: the collection produced by ``bioc.loads``
     """
     return bioc.loads(response.decode(ENCODING), ENCODING)
Ejemplo n.º 3
0
def test_BioCXMLDocumentWriter_io():
    """Write a collection through BioCXMLDocumentWriter into memory and re-read it."""
    original = _get_collection()

    buffer = io.BytesIO()
    xml_writer = bioc.BioCXMLDocumentWriter(buffer)
    xml_writer.write_collection_info(original)
    for doc in original.documents:
        xml_writer.write_document(doc)
    xml_writer.close()

    # Parse what was written and run the shared checks on the result.
    xml_text = buffer.getvalue().decode('utf-8')
    assert_everything(bioc.loads(xml_text))
Ejemplo n.º 4
0
    def __load_collection_xml(bioc_xml: str, is_file: bool = True):
        """Load a BioC XML collection from a file path or from an XML string.

        :param bioc_xml: path to a BioC file, or the BioC XML content itself
        :param is_file: if True, ``bioc_xml`` is opened as a path and parsed
            with ``bioc.load``; otherwise it is parsed directly with
            ``bioc.loads``
        :returns: the bioc collection object produced by the parser
        """
        if not is_file:
            return bioc.loads(bioc_xml)

        with open(bioc_xml, 'r') as handle:
            return bioc.load(handle)
Ejemplo n.º 5
0
    def get_documents(self, pmids: List[str]):
        """Load the locally indexed BioC documents for the given PMIDs.

        PMIDs that are not keys of ``self.index`` are skipped. For every
        remaining PMID the index entry supplies a file name (position 0) and a
        document position (position 1). The document is rebuilt from the
        file's first line, line ``doc_idx + 1`` and its last line, then parsed
        with ``bioc.loads``.

        :param pmids: PMIDs to look up.
        :return: a one-element list wrapping the list of loaded documents
            (empty inner list when no PMID is indexed).
        :raises AssertionError: if a loaded document reports a PMID that is
            not among ``pmids``.
        """
        available_pmids = [i for i in pmids if i in self.index]
        requested = set(pmids)

        documents = []
        # Remember the lines of the most recently read file so that
        # consecutive PMIDs stored in the same file do not re-read it.
        last_file = None
        lines = None
        for pmid in available_pmids:
            entry = self.index[pmid]
            file = entry[0]
            doc_idx = entry[1]
            if lines is None or file != last_file:
                with open(self.path / file) as f:
                    lines = f.readlines()
                last_file = file

            # First and last line are presumably the collection header and
            # closing tag, with one document per line in between -- verify
            # against the code that builds the index.
            document = bioc.loads(lines[0] + lines[doc_idx + 1] +
                                  lines[-1]).documents[0]
            # Sanity check: the indexed position must yield a requested PMID.
            loaded_pmid = get_pmid(document)[0]
            assert loaded_pmid in requested
            documents.append(document)

        return [documents]
Ejemplo n.º 6
0
 def _get_docs(self, pmids, q):
     """Load the indexed documents for ``pmids`` and put them on ``q`` as one list.

     Each PMID is looked up in ``self.index`` to obtain a file name and a
     document position. The document is rebuilt from the file's first line,
     line ``doc_idx + 1`` and its last line, then parsed with ``bioc.loads``.
     The accumulated read and decode times are printed before the list is
     put on ``q``.

     :param pmids: PMIDs to load; each one must be a key of ``self.index``
     :param q: object with a ``put`` method that receives the document list
     """
     loaded = []
     t_read = 0
     t_decode = 0
     for pmid in tqdm(pmids, desc="Sending"):
         entry = self.index[pmid]
         file, doc_idx = entry[0], entry[1]
         read_start = time()
         with open(self.path / file) as handle:
             lines = handle.readlines()
             t_read += time() - read_start

             decode_start = time()
             xml = lines[0] + lines[doc_idx + 1] + lines[-1]
             document = bioc.loads(xml).documents[0]
             t_decode += time() - decode_start

             # Sanity check: the indexed position must yield a requested PMID.
             found_pmid = get_pmid(document)[0]
             assert found_pmid in pmids
             loaded.append(document)
     print(f"t_read: {t_read}s")
     print(f"t_decode: {t_decode}s")
     q.put(loaded)
Ejemplo n.º 7
0
 def test_dumps(self):
     """Check that a collection loaded from ``self.src`` survives dumps/loads."""
     with open(self.src) as source:
         loaded = bioc.load(source)
     round_tripped = bioc.loads(bioc.dumps(loaded))
     self.__test_collection(round_tripped)
Ejemplo n.º 8
0
 def test_loads(self):
     """Parse the text of ``self.src`` with ``bioc.loads`` and validate it."""
     with open(self.src) as source:
         text = source.read()
     self.__test_collection(bioc.loads(text))
Ejemplo n.º 9
0
def get_bioc_file(filename):
    """Read ``filename`` as UTF-8 and return the documents of its BioC collection."""
    with codecs.open(filename, 'r', 'UTF-8') as handle:
        content = handle.read()
    # Parsing does not need the file handle, so it happens after the read.
    return bioc.loads(content).documents
Ejemplo n.º 10
0
def test_loads():
    """Parse the fixture's text with ``bioc.loads`` and run the shared checks."""
    with open(file, encoding='utf8') as source:
        text = source.read()
    assert_everything(bioc.loads(text))
Ejemplo n.º 11
0
def test_dumps():
    """Round-trip the JSON fixture through ``bioc.dumps`` and ``bioc.loads``."""
    json_type = BioCFileType.BIOC_JSON
    with open(file, encoding='utf8') as source:
        loaded = bioc.load(source, json_type)
    serialized = bioc.dumps(loaded, json_type)
    assert_everything(bioc.loads(serialized, json_type))
Ejemplo n.º 12
0
def test_dumps():
    """Serialize the sample collection and check that it parses back intact."""
    serialized = bioc.dumps(_get_collection())
    assert_everything(bioc.loads(serialized))