def test_build_fetch():
    """Test the build_fetch() method from URLS()."""

    urls = URLS(db='pubmed', retmax='500', field='id', retmode='xml')
    urls.build_fetch(['db', 'retmode'])

    assert urls.fetch
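
# A minimal, standalone sketch of the URL assembly that build_fetch() is assumed
# to perform: join the efetch base URL with only the selected settings. The URLS
# class itself is not reproduced here; 'build_fetch_url' and 'EFETCH_BASE' are
# hypothetical names used only for illustration (the efetch endpoint itself is
# the documented NCBI E-utilities URL).

EFETCH_BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

def build_fetch_url(settings, use):
    """Build an efetch URL from a dict of settings, keeping only the keys in `use`."""

    args = '&'.join(key + '=' + settings[key] for key in use if settings.get(key))
    return EFETCH_BASE + '?' + args

# Example, mirroring the settings used in the test above:
#   build_fetch_url({'db': 'pubmed', 'retmax': '500', 'field': 'id', 'retmode': 'xml'},
#                   ['db', 'retmode'])
#   -> 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml'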
def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
    """Search through pubmed for all abstracts referring to a given ERP.

    The scraping does an exact word search for the ERP term given.
    It then loops through all the articles found for that term.
    For each article, it pulls the title, year and word data.

    Notes
    -----
    - Pulls data using the hierarchical tag structure that organizes the articles.
    - Initially, the procedure was to pull all tags of a certain type.
      For example: extract all 'DateCreated' tags. This procedure fails
      (or badly organizes data) when an article is missing a particular tag.
      Now: take advantage of the hierarchy, looping through each article tag.
      From there, pull out the data, if available. This way, cases of
      missing data can be handled.
    """

    # Set date of when data was collected
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    if use_hist:
        hist_val = 'y'
    else:
        hist_val = 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    self.get_db_info(urls.info)

    # Loop through all the erps
    for ind, lab in enumerate(self.labels):

        # Print out status
        print('Scraping words for: ', lab)

        # Initialize object to store data for current erp papers
        cur_erp = ERPData(lab, self.erps[ind])

        # Set up search terms - add exclusions, if there are any
        if self.exclusions[ind][0]:
            term_arg = comb_terms(self.erps[ind], 'or') + \
                comb_terms(self.exclusions[ind], 'not')
        else:
            term_arg = comb_terms(self.erps[ind], 'or')

        # Create the url for the erp search term
        url = urls.search + term_arg

        # Get page and parse
        page = self.req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize batch settings
            ret_start = 0
            ret_max = 100

            # Get the result count and the history tokens from the search page
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Update History
            cur_erp.update_history('Start Scrape')

            # Fetch articles in batches until all results have been collected
            while ret_start < count:

                # Build the fetch URL for the current batch of results
                art_url = urls.fetch + '&WebEnv=' + web_env + \
                    '&query_key=' + query_key + \
                    '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.findAll('PubmedArticle')

                # Loop through each article, extracting relevant information
                for art in articles:

                    # Get ID of current article
                    new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')

                    # Extract and add all relevant info from current article to ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

                # Advance to the next batch of results
                ret_start += ret_max

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page
            art_url = urls.fetch + '&id=' + ids_str
            art_page = self.req.get_url(art_url)
            art_page_soup = BeautifulSoup(art_page.content, "xml")

            # Pull out articles
            articles = art_page_soup.findAll('PubmedArticle')

            # Update History
            cur_erp.update_history('Start Scrape')

            # Loop through each article, extracting relevant information
            for art_ind, art in enumerate(articles):

                # Get ID of current article
                new_id = int(ids[art_ind].text)

                # Extract and add all relevant info from current article to ERPData object
                cur_erp = self.extract_add_info(cur_erp, new_id, art)

        # Check consistency of extracted results
        cur_erp.check_results()
        cur_erp.update_history('End Scrape')

        # Save out and clear data
        cur_erp.save_n_clear()

        # Add the object with current erp data to results list
        self.add_results(cur_erp)

    # Set Requester object as finished being used
    self.req.close()
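
# A minimal, self-contained sketch of the E-utilities history workflow that the
# use_hist branch above relies on, written against the public esearch/efetch
# endpoints directly instead of the project's URLS and Requester objects. The
# search term '"N400"' and the batch size are illustrative only; WebEnv,
# query_key, retstart and retmax are the documented E-utilities parameters.

import requests
from bs4 import BeautifulSoup

EUTILS_BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

# Step 1: esearch with usehistory=y stores the result set on the NCBI server
# and returns a WebEnv token and QueryKey that later requests can refer to.
search = requests.get(EUTILS_BASE + 'esearch.fcgi',
                      params={'db': 'pubmed', 'term': '"N400"', 'field': 'TIAB',
                              'usehistory': 'y', 'retmode': 'xml'})
search_soup = BeautifulSoup(search.content, 'lxml')
count = int(search_soup.find('count').text)
web_env = search_soup.find('webenv').text
query_key = search_soup.find('querykey').text

# Step 2: efetch the stored results in batches of ret_max, advancing retstart
# after each batch until the full result count has been covered.
ret_start, ret_max = 0, 100
while ret_start < count:
    fetch = requests.get(EUTILS_BASE + 'efetch.fcgi',
                         params={'db': 'pubmed', 'retmode': 'xml',
                                 'WebEnv': web_env, 'query_key': query_key,
                                 'retstart': ret_start, 'retmax': ret_max})
    articles = BeautifulSoup(fetch.content, 'xml').find_all('PubmedArticle')
    # ... extract title / year / word data from each article here ...
    ret_start += ret_max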