# Initialize a requester object
req = Requester()

###################################################################################################
# Set the minimum wait time between requests
req.set_wait_time(0.5)

###################################################################################################
# Use the Requester object to request some web pages
for url in ['https://www.google.com', 'https://www.yahoo.com', 'https://duckduckgo.com']:
    page = req.request_url(url)
    print('Collecting web page \t {} \t got status code \t {}'.format(
        page.url, page.status_code))

###################################################################################################
# Check details of the requester object
req.check()

###################################################################################################
# Get information from the requester object as a dictionary
print(req.as_dict())

###################################################################################################
# URLs Object
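#
# The `collect_words` function defined below builds its requests with an EUtils URLs object.
# The following is a minimal sketch of using that object directly; the specific settings here
# (db='pubmed', retmax=5, the 'brain' search term) are illustrative assumptions only, and the
# example assumes EUtils is already imported, as with the other objects used in this file.

# Initialize an EUtils URLs object with some desired settings
urls = EUtils(db='pubmed', usehistory='n', retmax=5, retmode='xml', field='TIAB')

# Build the URL for the search utility, specifying which settings to include in it
urls.build_url('search', settings=['db', 'usehistory', 'retmax', 'retmode', 'field'])

# Check the built URL, filling in an example search term
print(urls.get_url('search', settings={'term': 'brain'}))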
def collect_words(terms, inclusions=None, exclusions=None, db='pubmed', retmax=None,
                  field='TIAB', usehistory=False, api_key=None, save_and_clear=False,
                  logging=None, directory=None, verbose=False):
    """Collect text data and metadata from EUtils using specified search term(s).

    Parameters
    ----------
    terms : list of list of str
        Search terms.
    inclusions : list of list of str, optional
        Inclusion words for search terms.
    exclusions : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    retmax : int, optional
        Maximum number of articles to return.
    field : str, optional, default: 'TIAB'
        Field to search for the term within.
        Defaults to 'TIAB', which is Title/Abstract.
    usehistory : bool, optional, default: False
        Whether to use EUtils history, storing results on their server.
    api_key : str, optional
        An API key for an NCBI account.
    save_and_clear : bool, optional, default: False
        Whether to save words data to disk per term as it goes, instead of holding in memory.
    logging : {None, 'print', 'store', 'file'}, optional
        What kind of logging, if any, to do for requested URLs.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.

    Returns
    -------
    results : list of Articles
        Results from collecting data for each term.
    meta_data : MetaData
        Metadata from the data collection.

    Notes
    -----
    The collection does an exact word search for each given term.
    It then loops through all the articles found for that term.
    For each article, it pulls and saves out data (including title, abstract, authors, etc.),
    using the hierarchical tag structure that organizes the articles.
    """

    # Get EUtils URLs object, with desired settings, and build required utility URLs
    urls = EUtils(db=db, usehistory='y' if usehistory else 'n', retmax=retmax,
                  retmode='xml', field=field, api_key=api_key)
    urls.build_url('info', settings=['db'])
    urls.build_url('search', settings=['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_url('fetch', settings=['db', 'retmode'])

    # Initialize results, meta data & requester
    results = []
    meta_data = MetaData()
    req = Requester(wait_time=get_wait_time(urls.authenticated),
                    logging=logging, directory=directory)

    # Get current information about the database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Check inclusions & exclusions, filling in empty lists if not provided
    inclusions = inclusions if inclusions else [[]] * len(terms)
    exclusions = exclusions if exclusions else [[]] * len(terms)

    # Loop through all the terms
    for search, incl, excl in zip(terms, inclusions, exclusions):

        # Collect term information and make the search term argument
        term = Term(search[0], search, incl, excl)
        term_arg = mk_term(term)

        if verbose:
            print('Collecting data for: ', term.label)

        # Initialize object to store data for the current term's articles
        arts = Articles(term)

        # Request the search page for the current term
        url = urls.get_url('search', settings={'term': term_arg})
        page = req.request_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        if usehistory:

            # Get the number of articles, and the keys needed to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # If retmax was not provided, default to collecting all available articles
            retmax_use = int(retmax) if retmax else count

            # Loop through, collecting article data, using history
            ret_start_it = 0
            while ret_start_it < count:

                # Set the number of articles per iteration (the retmax per call)
                #   This defaults to 100, but is set lower if fewer are needed to reach retmax
                ret_end_it = min(100, retmax_use - ret_start_it)
                # Get the article page, collect data, and update the position
                url_settings = {'WebEnv': web_env, 'query_key': query_key,
                                'retstart': str(ret_start_it), 'retmax': str(ret_end_it)}
                art_url = urls.get_url('fetch', settings=url_settings)
                arts = get_articles(req, art_url, arts)
                ret_start_it += ret_end_it

                # Stop if the requested maximum number of articles has been reached
                if ret_start_it >= retmax_use:
                    break

        # Without using history
        else:

            ids = page_soup.find_all('id')
            art_url = urls.get_url('fetch', settings={'id': ids_to_str(ids)})
            arts = get_articles(req, art_url, arts)

        arts._check_results()

        if save_and_clear:
            arts.save_and_clear(directory=directory)
        results.append(arts)

    meta_data.add_requester(req)

    return results, meta_data
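###################################################################################################
# Example usage of `collect_words`
#
# A minimal sketch of calling `collect_words` directly, using only the parameters documented in
# its docstring. The search terms, inclusions, exclusions, and retmax value below are illustrative
# assumptions only, and running this performs live requests against the EUtils API.

terms = [['brain'], ['cognition']]
inclusions = [['biology'], []]
exclusions = [['body'], []]

results, meta_data = collect_words(terms, inclusions, exclusions,
                                   db='pubmed', retmax=5, usehistory=False, verbose=True)

# `results` is a list of Articles objects, one per search term,
#   and `meta_data` is a MetaData object describing the collection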