Python PubMedFetch Examples, fetchers.pubmed.PubMedFetch Python Examples

Example #1

0

Show file

    def create_identifiers(self):
        """
        Create new PubMed identifiers for any PMIDs which are not already in
        our database.

        The candidate PMIDs are read from the ``'added'`` key of the JSON
        document stored in ``self.results``. IDs which already have an
        ``Identifiers`` row with ``database=1`` are skipped; the remainder are
        fetched through ``PubMedFetch`` in blocks of at most 1000 IDs, and
        each block is written with a single ``bulk_create`` call.
        """
        new_ids = json.loads(self.results)['added']
        existing_pmids = list(
            Identifiers.objects.filter(
                database=1, unique_id__in=new_ids).values_list('unique_id',
                                                               flat=True))
        ids_to_add = list(set(new_ids) - set(existing_pmids))
        ids_to_add_len = len(ids_to_add)

        # Step through the list with an integer block size; this produces the
        # same [start, end) windows as ceil()-based float arithmetic, and
        # ``range`` is available on both Python 2 and Python 3.
        block_size = 1000
        logging.debug("{c} IDs to be added".format(c=ids_to_add_len))
        for start_index in range(0, ids_to_add_len, block_size):
            end_index = min(start_index + block_size, ids_to_add_len)
            logging.debug("Building from {s} to {e}".format(s=start_index,
                                                            e=end_index))
            fetch = PubMedFetch(id_list=ids_to_add[start_index:end_index],
                                retmax=block_size)
            # One Identifiers instance per fetched record; the full record is
            # kept as a JSON string in ``content``.
            identifiers = [
                Identifiers(unique_id=item['PMID'],
                            database=1,
                            content=json.dumps(item))
                for item in fetch.get_content()
            ]
            Identifiers.objects.bulk_create(identifiers)

Example #2

0

Show file

 def test_collective_author(self):
     # The record fetched here carries a collective author rather than an
     # individual one; check what ends up under 'authors_short'.
     self.ids = [21860499]
     fetcher = PubMedFetch(id_list=self.ids)
     self.fetch = fetcher
     fetcher.get_content()
     first_record = self.fetch.content[0]
     self.assertEqual(first_record['authors_short'],
                      u'National Toxicology Program')

Example #3

0

Show file

 def test_utf8(self):
     # These ids have UTF-8 text in the abstract; make sure we can import and
     # the abstract field captures this value.
     self.ids = [23878845, 16080930]
     self.fetch = PubMedFetch(id_list=self.ids)
     self.fetch.get_content()
     # Assert that a unicode value (Greek small letter alpha) exists in the
     # text. assertIn reports both the character and the abstract on failure,
     # unlike assertTrue(... > -1) which only says "False is not true".
     self.assertIn(u'\u03b1', self.fetch.content[0]['abstract'])

Example #4

0

Show file

 def test_structured_abstract(self):
     """
     Some abstracts are structured in the XML; make sure HAWC imports them.
     Example record: http://www.ncbi.nlm.nih.gov/pubmed/21813367
     """
     # Expected abstract for the record: each labelled section is wrapped in
     # a span, and sections are separated by <br>.
     expected_abstract = u"""<span class='abstract_label'>BACKGROUND: </span>People living or working in eastern Ohio and western West Virginia have been exposed to perfluorooctanoic acid (PFOA) released by DuPont Washington Works facilities.<br><span class='abstract_label'>OBJECTIVES: </span>Our objective was to estimate historical PFOA exposures and serum concentrations experienced by 45,276 non-occupationally exposed participants in the C8 Health Project who consented to share their residential histories and a 2005-2006 serum PFOA measurement.<br><span class='abstract_label'>METHODS: </span>We estimated annual PFOA exposure rates for each individual based on predicted calibrated water concentrations and predicted air concentrations using an environmental fate and transport model, individual residential histories, and maps of public water supply networks. We coupled individual exposure estimates with a one-compartment absorption, distribution, metabolism, and excretion (ADME) model to estimate time-dependent serum concentrations.<br><span class='abstract_label'>RESULTS: </span>For all participants (n = 45,276), predicted and observed median serum concentrations in 2005-2006 are 14.2 and 24.3 ppb, respectively [Spearman's rank correlation coefficient (r(s)) = 0.67]. For participants who provided daily public well water consumption rate and who had the same residence and workplace in one of six municipal water districts for 5 years before the serum sample (n = 1,074), predicted and observed median serum concentrations in 2005-2006 are 32.2 and 40.0 ppb, respectively (r(s) = 0.82).<br><span class='abstract_label'>CONCLUSIONS: </span>Serum PFOA concentrations predicted by linked exposure and ADME models correlated well with observed 2005-2006 human serum concentrations for C8 Health Project participants. These individualized retrospective exposure and serum estimates are being used in a variety of epidemiologic studies being conducted in this region."""
     self.ids = (21813367, )
     fetcher = PubMedFetch(id_list=self.ids)
     self.fetch = fetcher
     fetcher.get_content()
     first_record = self.fetch.content[0]
     self.assertEqual(first_record['abstract'], expected_abstract)

Example #5

0

Show file

File: tests.py Project: ashyhanov/hawc

 def test_utf8(self):
     # These ids have UTF-8 text in the abstract; make sure we can import and
     # the abstract field captures this value.
     self.ids = [23878845, 16080930]
     self.fetch = PubMedFetch(id_list=self.ids)
     self.fetch.get_content()
     # Membership assertion instead of assertTrue(find(...) > -1): same
     # pass/fail condition, but a failure shows the missing character and the
     # abstract that was searched.
     self.assertIn(u'\u03b1', self.fetch.content[0]['abstract'])

Example #6

0

Show file

File: tests.py Project: ashyhanov/hawc

 def test_structured_abstract(self):
     """
     Abstracts may carry structure in the XML; verify HAWC imports such a
     record. See: http://www.ncbi.nlm.nih.gov/pubmed/21813367
     """
     self.ids = (21813367, )
     self.fetch = PubMedFetch(id_list=self.ids)
     self.fetch.get_content()
     # Labelled sections are expected as spans, joined with <br>.
     wanted = u"""<span class='abstract_label'>BACKGROUND: </span>People living or working in eastern Ohio and western West Virginia have been exposed to perfluorooctanoic acid (PFOA) released by DuPont Washington Works facilities.<br><span class='abstract_label'>OBJECTIVES: </span>Our objective was to estimate historical PFOA exposures and serum concentrations experienced by 45,276 non-occupationally exposed participants in the C8 Health Project who consented to share their residential histories and a 2005-2006 serum PFOA measurement.<br><span class='abstract_label'>METHODS: </span>We estimated annual PFOA exposure rates for each individual based on predicted calibrated water concentrations and predicted air concentrations using an environmental fate and transport model, individual residential histories, and maps of public water supply networks. We coupled individual exposure estimates with a one-compartment absorption, distribution, metabolism, and excretion (ADME) model to estimate time-dependent serum concentrations.<br><span class='abstract_label'>RESULTS: </span>For all participants (n = 45,276), predicted and observed median serum concentrations in 2005-2006 are 14.2 and 24.3 ppb, respectively [Spearman's rank correlation coefficient (r(s)) = 0.67]. For participants who provided daily public well water consumption rate and who had the same residence and workplace in one of six municipal water districts for 5 years before the serum sample (n = 1,074), predicted and observed median serum concentrations in 2005-2006 are 32.2 and 40.0 ppb, respectively (r(s) = 0.82).<br><span class='abstract_label'>CONCLUSIONS: </span>Serum PFOA concentrations predicted by linked exposure and ADME models correlated well with observed 2005-2006 human serum concentrations for C8 Health Project participants. These individualized retrospective exposure and serum estimates are being used in a variety of epidemiologic studies being conducted in this region."""
     actual = self.fetch.content[0]['abstract']
     self.assertEqual(actual, wanted)

Example #7

0

Show file

File: models.py Project: JoshAddington/hawc

    def get_pubmed_identifiers(cls, ids):
        """
        Return a queryset of identifiers, one for each PubMed ID. Either get
        or create an identifier, whatever is required.

        ``ids`` is an iterable of PubMed IDs. IDs which have no
        ``Identifiers`` row with ``database=PUBMED`` are fetched through
        ``PubMedFetch`` and saved one at a time before the final query runs.
        """
        # Filter IDs which need to be imported
        idents = list(
            Identifiers.objects.filter(database=PUBMED, unique_id__in=ids).values_list("unique_id", flat=True)
        )
        need_import = tuple(set(ids) - set(idents))

        # Grab Pubmed objects
        fetch = PubMedFetch(need_import)

        # Save new Identifier objects
        for item in fetch.get_content():
            # No ``encoding`` argument: "utf-8" is already the default on
            # Python 2, and Python 3's json.dumps rejects the keyword.
            ident = Identifiers(unique_id=item["PMID"], database=PUBMED, content=json.dumps(item))
            ident.save()
            idents.append(ident.unique_id)

        return Identifiers.objects.filter(database=PUBMED, unique_id__in=idents)

Example #8

0

Show file

File: models.py Project: JoshAddington/hawc

    def create_identifiers(self):
        """
        Create new PubMed identifiers for any PMIDs which are not already in
        our database.

        Candidate PMIDs come from the ``"added"`` key of the JSON document in
        ``self.results``. Those already stored with ``database=PUBMED`` are
        skipped; the rest are fetched through ``PubMedFetch`` in blocks of at
        most 1000 IDs and each block is saved with one ``bulk_create`` call.
        """
        new_ids = json.loads(self.results)["added"]
        existing_pmids = list(
            Identifiers.objects.filter(database=PUBMED, unique_id__in=new_ids).values_list("unique_id", flat=True)
        )
        ids_to_add = list(set(new_ids) - set(existing_pmids))
        ids_to_add_len = len(ids_to_add)

        # Integer block size with a stepped range: same windows as the
        # float/ceil computation, without relying on Python 2's xrange.
        block_size = 1000
        logging.debug("{0} IDs to be added".format(ids_to_add_len))
        for start_index in range(0, ids_to_add_len, block_size):
            end_index = min(start_index + block_size, ids_to_add_len)
            logging.debug("Building from {0} to {1}".format(start_index, end_index))
            fetch = PubMedFetch(id_list=ids_to_add[start_index:end_index], retmax=block_size)
            # Build every row for this block first, then insert them together.
            identifiers = [
                Identifiers(unique_id=item["PMID"], database=PUBMED, content=json.dumps(item))
                for item in fetch.get_content()
            ]
            Identifiers.objects.bulk_create(identifiers)

Example #9

0

Show file

    def get_pubmed_identifiers(cls, ids):
        """
        Return a queryset of identifiers, one for each PubMed ID. Either get
        or create an identifier, whatever is required.

        ``ids`` is an iterable of PubMed IDs. Any ID without an existing
        ``Identifiers`` row (``database=1``) is fetched through
        ``PubMedFetch`` and saved before the final queryset is built.
        """
        # Filter IDs which need to be imported
        idents = list(Identifiers.objects
            .filter(database=1, unique_id__in=ids)
            .values_list('unique_id', flat=True))
        need_import = tuple(set(ids) - set(idents))

        # Grab Pubmed objects
        fetch = PubMedFetch(need_import)

        # Save new Identifier objects
        for item in fetch.get_content():
            # json.dumps is called without ``encoding``: the keyword does not
            # exist on Python 3 and 'utf-8' is the Python 2 default anyway.
            ident = Identifiers(unique_id=item['PMID'],
                                database=1,
                                content=json.dumps(item))
            ident.save()
            idents.append(ident.unique_id)

        return Identifiers.objects.filter(database=1, unique_id__in=idents)

Example #10

0

Show file

 def test_multiquery(self):
     # With retmax=3 the test expects the fetch to be split into two
     # requests, while the combined content still passes _results_check.
     fetcher = PubMedFetch(id_list=self.ids, retmax=3)
     self.fetch = fetcher
     fetcher.get_content()
     request_total = self.fetch.request_count
     self.assertEqual(request_total, 2)
     self._results_check()

Example #11

0

Show file

 def test_standard_query(self):
     # Without an explicit retmax the test expects a single request.
     fetcher = PubMedFetch(id_list=self.ids)
     self.fetch = fetcher
     fetcher.get_content()
     request_total = self.fetch.request_count
     self.assertEqual(request_total, 1)
     self._results_check()

Example #12

0

Show file

class PubMedFetchTest(TestCase):
    """
    Make sure that a PubMed fetch returns the expected number of records,
    and that all IDs are identical to what was expected. Example from the
    PubMed quickstart guide here:

        http://www.ncbi.nlm.nih.gov/books/NBK25500/

    """
    def setUp(self):
        # Default PMIDs used by the query tests; some tests overwrite
        # self.ids before fetching.
        self.ids = [19008416, 18927361, 18787170, 18487186, 18239126, 18239125]

    def test_standard_query(self):
        # Without an explicit retmax the test expects a single request.
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        self.assertEqual(self.fetch.request_count, 1)
        self._results_check()

    def test_multiquery(self):
        # Six IDs with retmax=3: the test expects exactly two requests.
        self.fetch = PubMedFetch(id_list=self.ids, retmax=3)
        self.fetch.get_content()
        self.assertEqual(self.fetch.request_count, 2)
        self._results_check()

    def test_utf8(self):
        # These ids have UTF-8 text in the abstract; make sure we can import
        # and the abstract field captures this value.
        self.ids = [23878845, 16080930]
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        # Assert that a unicode value exists in the text. assertIn gives a
        # readable failure message, unlike assertTrue(find(...) > -1).
        self.assertIn(u'\u03b1', self.fetch.content[0]['abstract'])

    def test_collective_author(self):
        # this doesn't have an individual author but rather a collective author
        self.ids = [21860499]
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        self.assertEqual(self.fetch.content[0]['authors_short'],
                         u'National Toxicology Program')

    def test_structured_abstract(self):
        """
        Some abstracts have structure in XML; make sure HAWC can import these.
        For example: http://www.ncbi.nlm.nih.gov/pubmed/21813367
        """
        self.ids = (21813367, )
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        abstract_text = u"""<span class='abstract_label'>BACKGROUND: </span>People living or working in eastern Ohio and western West Virginia have been exposed to perfluorooctanoic acid (PFOA) released by DuPont Washington Works facilities.<br><span class='abstract_label'>OBJECTIVES: </span>Our objective was to estimate historical PFOA exposures and serum concentrations experienced by 45,276 non-occupationally exposed participants in the C8 Health Project who consented to share their residential histories and a 2005-2006 serum PFOA measurement.<br><span class='abstract_label'>METHODS: </span>We estimated annual PFOA exposure rates for each individual based on predicted calibrated water concentrations and predicted air concentrations using an environmental fate and transport model, individual residential histories, and maps of public water supply networks. We coupled individual exposure estimates with a one-compartment absorption, distribution, metabolism, and excretion (ADME) model to estimate time-dependent serum concentrations.<br><span class='abstract_label'>RESULTS: </span>For all participants (n = 45,276), predicted and observed median serum concentrations in 2005-2006 are 14.2 and 24.3 ppb, respectively [Spearman's rank correlation coefficient (r(s)) = 0.67]. For participants who provided daily public well water consumption rate and who had the same residence and workplace in one of six municipal water districts for 5 years before the serum sample (n = 1,074), predicted and observed median serum concentrations in 2005-2006 are 32.2 and 40.0 ppb, respectively (r(s) = 0.82).<br><span class='abstract_label'>CONCLUSIONS: </span>Serum PFOA concentrations predicted by linked exposure and ADME models correlated well with observed 2005-2006 human serum concentrations for C8 Health Project participants. These individualized retrospective exposure and serum estimates are being used in a variety of epidemiologic studies being conducted in this region."""
        self.assertEqual(self.fetch.content[0]['abstract'], abstract_text)

    def _results_check(self):
        # Shared assertions for the tests which fetch the setUp IDs: record
        # count, PMIDs, citations and short author strings, all compared in
        # the order of self.ids.
        self.assertEqual(len(self.fetch.content), 6)
        self.assertListEqual([item['PMID'] for item in self.fetch.content],
                             self.ids)

        citations = [
            "Science 2008; 322 (5908):1695-9", "Science 2008; 322 (5900):357",
            "Science 2008; 321 (5895):1499-502",
            "Science 2008; 320 (5878):903-9", "Science 2008; 319 (5863):620-4",
            "Science 2008; 319 (5863):617-20"
        ]
        self.assertListEqual([item['citation'] for item in self.fetch.content],
                             citations)

        authors_short = [
            "Varambally S et al.", "Couzin J", "Mao JH et al.",
            "Bromberg KD et al.", "Schlabach MR et al.", "Silva JM et al."
        ]
        self.assertListEqual(
            [item['authors_short'] for item in self.fetch.content],
            authors_short)

Example #13

0

Show file

File: tests.py Project: ashyhanov/hawc

 def test_collective_author(self):
     # The fetched record has a collective author, not an individual one;
     # verify the short author string of the first result.
     self.ids = [21860499]
     self.fetch = PubMedFetch(id_list=self.ids)
     self.fetch.get_content()
     expected_author = u'National Toxicology Program'
     actual_author = self.fetch.content[0]['authors_short']
     self.assertEqual(actual_author, expected_author)

Example #14

0

Show file

File: tests.py Project: ashyhanov/hawc

 def test_multiquery(self):
     # Limit each request to three records; the test expects two requests
     # in total and the usual combined results.
     fetch_kwargs = dict(id_list=self.ids, retmax=3)
     self.fetch = PubMedFetch(**fetch_kwargs)
     self.fetch.get_content()
     expected_requests = 2
     self.assertEqual(self.fetch.request_count, expected_requests)
     self._results_check()

Example #15

0

Show file

File: tests.py Project: ashyhanov/hawc

 def test_standard_query(self):
     # Default fetch: the test expects one request and the usual results.
     fetch_kwargs = dict(id_list=self.ids)
     self.fetch = PubMedFetch(**fetch_kwargs)
     self.fetch.get_content()
     expected_requests = 1
     self.assertEqual(self.fetch.request_count, expected_requests)
     self._results_check()

Example #16

0

Show file

File: tests.py Project: ashyhanov/hawc

class PubMedFetchTest(TestCase):
    """
    Make sure that a PubMed fetch returns the expected number of records,
    and that all IDs are identical to what was expected. Example from the
    PubMed quickstart guide here:

        http://www.ncbi.nlm.nih.gov/books/NBK25500/

    """
    def setUp(self):
        # Default PMIDs for the query tests; individual tests may replace
        # self.ids before fetching.
        self.ids = [19008416, 18927361, 18787170, 18487186, 18239126, 18239125]

    def test_standard_query(self):
        # Without an explicit retmax the test expects a single request.
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        self.assertEqual(self.fetch.request_count, 1)
        self._results_check()

    def test_multiquery(self):
        # Six IDs with retmax=3: the test expects exactly two requests.
        self.fetch = PubMedFetch(id_list=self.ids, retmax=3)
        self.fetch.get_content()
        self.assertEqual(self.fetch.request_count, 2)
        self._results_check()

    def test_utf8(self):
        # These ids have UTF-8 text in the abstract; make sure we can import
        # and the abstract field captures this value.
        self.ids = [23878845, 16080930]
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        # Assert that a unicode value exists in the text; assertIn reports
        # the character and the searched abstract when it fails.
        self.assertIn(u'\u03b1', self.fetch.content[0]['abstract'])

    def test_collective_author(self):
        # this doesn't have an individual author but rather a collective author
        self.ids = [21860499]
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        self.assertEqual(self.fetch.content[0]['authors_short'], u'National Toxicology Program')

    def test_structured_abstract(self):
        """
        Some abstracts have structure in XML; make sure HAWC can import these.
        For example: http://www.ncbi.nlm.nih.gov/pubmed/21813367
        """
        self.ids = (21813367, )
        self.fetch = PubMedFetch(id_list=self.ids)
        self.fetch.get_content()
        abstract_text = u"""<span class='abstract_label'>BACKGROUND: </span>People living or working in eastern Ohio and western West Virginia have been exposed to perfluorooctanoic acid (PFOA) released by DuPont Washington Works facilities.<br><span class='abstract_label'>OBJECTIVES: </span>Our objective was to estimate historical PFOA exposures and serum concentrations experienced by 45,276 non-occupationally exposed participants in the C8 Health Project who consented to share their residential histories and a 2005-2006 serum PFOA measurement.<br><span class='abstract_label'>METHODS: </span>We estimated annual PFOA exposure rates for each individual based on predicted calibrated water concentrations and predicted air concentrations using an environmental fate and transport model, individual residential histories, and maps of public water supply networks. We coupled individual exposure estimates with a one-compartment absorption, distribution, metabolism, and excretion (ADME) model to estimate time-dependent serum concentrations.<br><span class='abstract_label'>RESULTS: </span>For all participants (n = 45,276), predicted and observed median serum concentrations in 2005-2006 are 14.2 and 24.3 ppb, respectively [Spearman's rank correlation coefficient (r(s)) = 0.67]. For participants who provided daily public well water consumption rate and who had the same residence and workplace in one of six municipal water districts for 5 years before the serum sample (n = 1,074), predicted and observed median serum concentrations in 2005-2006 are 32.2 and 40.0 ppb, respectively (r(s) = 0.82).<br><span class='abstract_label'>CONCLUSIONS: </span>Serum PFOA concentrations predicted by linked exposure and ADME models correlated well with observed 2005-2006 human serum concentrations for C8 Health Project participants. These individualized retrospective exposure and serum estimates are being used in a variety of epidemiologic studies being conducted in this region."""
        self.assertEqual(self.fetch.content[0]['abstract'], abstract_text)

    def _results_check(self):
        # Shared assertions for the tests which fetch the setUp IDs: record
        # count, PMIDs, citations and short author strings, all compared in
        # the order of self.ids.
        self.assertEqual(len(self.fetch.content), 6)
        self.assertListEqual([item['PMID'] for item in self.fetch.content], self.ids)

        citations = ["Science 2008; 322 (5908):1695-9",
                     "Science 2008; 322 (5900):357",
                     "Science 2008; 321 (5895):1499-502",
                     "Science 2008; 320 (5878):903-9",
                     "Science 2008; 319 (5863):620-4",
                     "Science 2008; 319 (5863):617-20"]
        self.assertListEqual([item['citation'] for item in self.fetch.content], citations)

        authors_short = ["Varambally S et al.",
                         "Couzin J",
                         "Mao JH et al.",
                         "Bromberg KD et al.",
                         "Schlabach MR et al.",
                         "Silva JM et al."]
        self.assertListEqual([item['authors_short'] for item in self.fetch.content], authors_short)