def collect_words(terms, inclusions=None, exclusions=None, db='pubmed', retmax=None,
                  field='TIAB', usehistory=False, api_key=None, save_and_clear=False,
                  logging=None, directory=None, verbose=False):
    """Collect text data and metadata from EUtils using specified search term(s).

    Parameters
    ----------
    terms : list of list of str
        Search terms.
    inclusions : list of list of str, optional
        Inclusion words for search terms.
        If not provided (or empty), no inclusion words are used for any term.
    exclusions : list of list of str, optional
        Exclusion words for search terms.
        If not provided (or empty), no exclusion words are used for any term.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    retmax : int, optional
        Maximum number of articles to return.
        When using history and no value is given, all articles reported
        by the search are collected.
    field : str, optional, default: 'TIAB'
        Field to search for term within.
        Defaults to 'TIAB', which is Title/Abstract.
    usehistory : bool, optional, default: False
        Whether to use EUtils history, storing results on their server.
    api_key : str, optional
        An API key for a NCBI account.
    save_and_clear : bool, optional, default: False
        Whether to save words data to disk per term as it goes, instead of holding in memory.
    logging : {None, 'print', 'store', 'file'}
        What kind of logging, if any, to do for requested URLs.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.

    Returns
    -------
    results : list of Articles
        Results from collecting data for each term.
    meta_data : MetaData
        Meta data from the data collection.

    Notes
    -----
    The collection does an exact word search for the term given.
    It then loops through all the articles found for that term.

    For each article, it pulls and saves out data (including title, abstract, authors, etc),
    using the hierarchical tag structure that organizes the articles.

    When using history, articles are fetched in batches of at most 100 per request.
    """

    # Get EUtils URLS object, with desired settings, and build required utility URLs
    urls = EUtils(db=db, usehistory='y' if usehistory else 'n', retmax=retmax,
                  retmode='xml', field=field, api_key=api_key)

    urls.build_url('info', settings=['db'])
    urls.build_url('search', settings=['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_url('fetch', settings=['db', 'retmode'])

    # Initialize results, meta data & requester
    results = []
    meta_data = MetaData()
    req = Requester(wait_time=get_wait_time(urls.authenticated),
                    logging=logging, directory=directory)

    # Get current information about database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Check inclusions & exclusions
    #   Each term gets its own empty list, so that the lists stored with the
    #   results are not aliases of one shared object
    inclusions = inclusions if inclusions else [[] for _ in terms]
    exclusions = exclusions if exclusions else [[] for _ in terms]

    # Loop through all the terms
    for search, incl, excl in zip(terms, inclusions, exclusions):

        # Collect term information and make search term argument
        term = Term(search[0], search, incl, excl)
        term_arg = mk_term(term)

        if verbose:
            print('Collecting data for: ', term.label)

        # Initialize object to store data for current term articles
        arts = Articles(term)

        # Request web page
        url = urls.get_url('search', settings={'term': term_arg})
        page = req.request_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        if usehistory:

            # Get number of articles, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # The collection limit is retmax if given, otherwise everything that was found
            #   Without this fallback, a missing retmax would fail on `int(None)`
            limit = count if retmax is None else int(retmax)

            # Loop through, collecting article data, using history
            ret_start_it = 0
            while ret_start_it < count and ret_start_it < limit:

                # Set the number of articles per iteration (the ret_max per call)
                #  This defaults to 100, but will set to less if fewer needed to reach the limit
                ret_end_it = min(100, limit - ret_start_it)

                # Get article page, collect data, update position
                url_settings = {
                    'WebEnv': web_env,
                    'query_key': query_key,
                    'retstart': str(ret_start_it),
                    'retmax': str(ret_end_it),
                }
                art_url = urls.get_url('fetch', settings=url_settings)
                arts = get_articles(req, art_url, arts)
                ret_start_it += ret_end_it

        # Without using history
        else:

            ids = page_soup.find_all('id')
            art_url = urls.get_url('fetch', settings={'id': ids_to_str(ids)})
            arts = get_articles(req, art_url, arts)

        arts._check_results()

        if save_and_clear:
            arts.save_and_clear(directory=directory)
        results.append(arts)

    meta_data.add_requester(req)

    return results, meta_data
def collect_counts(terms_a, inclusions_a=None, exclusions_a=None,
                   terms_b=None, inclusions_b=None, exclusions_b=None,
                   db='pubmed', field='TIAB', api_key=None,
                   logging=None, directory=None, verbose=False):
    """Collect count and term co-occurrence data from EUtils.

    Parameters
    ----------
    terms_a : list of list of str
        Search terms.
    inclusions_a : list of list of str, optional
        Inclusion words for search terms.
    exclusions_a : list of list of str, optional
        Exclusion words for search terms.
    terms_b : list of list of str, optional
        Secondary list of search terms.
        If not provided, co-occurrences are collected within `terms_a`.
    inclusions_b : list of list of str, optional
        Inclusion words for secondary list of search terms.
    exclusions_b : list of list of str, optional
        Exclusion words for secondary list of search terms.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    field : str, optional, default: 'TIAB'
        Field to search for term within.
        Defaults to 'TIAB', which is Title/Abstract.
    api_key : str, optional
        An API key for a NCBI account.
    logging : {None, 'print', 'store', 'file'}, optional
        What kind of logging, if any, to do for requested URLs.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.

    Returns
    -------
    co_occurences : 2d array
        The numbers of articles found for each combination of terms.
        If only `terms_a` is given, the diagonal (a term with itself) is set to zero.
    counts : 1d array or list of 1d array
        Number of articles for each term independently.
        A single array if only `terms_a` is given, otherwise a list of
        two arrays, for `terms_a` and `terms_b` respectively.
    meta_data : MetaData
        Meta data from the data collection.

    Notes
    -----
    The collection does an exact word search for two terms.

    The page returned by the EUtils search includes a 'count' field.
    This field contains the number of articles with both terms. This is extracted.

    Examples
    --------
    Collect counts and co-occurrences for a single set of two search terms:

    >>> coocs, counts, meta_data = collect_counts([['frontal lobe'], ['temporal lobe']])

    Collect counts and co-occurrences for two sets of search terms:

    >>> coocs, counts, meta_data = collect_counts(terms_a=[['frontal lobe'], ['temporal lobe']],
    ...                                           terms_b=[['attention'], ['perception']])
    """

    # Get e-utils URLS object. Set retmax as 0, since not using UIDs for counts
    urls = EUtils(db=db, retmax='0', field=field, retmode='xml', api_key=api_key)
    urls.build_url('info', settings=['db'])
    urls.build_url('search', settings=['db', 'retmax', 'retmode', 'field'])

    # Initialize meta data & requester
    meta_data = MetaData()
    req = Requester(wait_time=get_wait_time(urls.authenticated),
                    logging=logging, directory=directory)

    # Sort out terms: with no second list, compare the first list against itself
    n_terms_a = len(terms_a)
    square = not terms_b
    if square:
        terms_b, inclusions_b, exclusions_b = terms_a, inclusions_a, exclusions_a
    n_terms_b = len(terms_b)

    # Check inclusions & exclusions, giving each term its own empty list by default
    inclusions_a = inclusions_a if inclusions_a else [[] for _ in range(n_terms_a)]
    inclusions_b = inclusions_b if inclusions_b else [[] for _ in range(n_terms_b)]
    exclusions_a = exclusions_a if exclusions_a else [[] for _ in range(n_terms_a)]
    exclusions_b = exclusions_b if exclusions_b else [[] for _ in range(n_terms_b)]

    # Initialize count variables to the correct length, using -1 to mark 'not yet collected'
    counts_a = np.full([n_terms_a], -1, dtype=int)
    counts_b = np.full([n_terms_b], -1, dtype=int)

    # Initialize right size matrices to store co-occurence data
    co_occurences = np.full([n_terms_a, n_terms_b], -1, dtype=int)

    # Set diagonal to zero if square (term co-occurrence with itself)
    if square:
        np.fill_diagonal(co_occurences, 0)

    # Get current information about database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Make the term arguments for list-B once, as they are the same for every list-A term
    term_b_args = [mk_term(Term(search_b[0], search_b, incl_b, excl_b))
                   for search_b, incl_b, excl_b in zip(terms_b, inclusions_b, exclusions_b)]

    # Loop through each term (list-A)
    for a_ind, (search_a, incl_a, excl_a) in enumerate(zip(terms_a, inclusions_a, exclusions_a)):

        # Make term arguments
        term_a = Term(search_a[0], search_a, incl_a, excl_a)
        term_a_arg = mk_term(term_a)

        if verbose:
            print('Running counts for: ', term_a.label)

        # Get number of results for current term search
        url = urls.get_url('search', settings={'term' : term_a_arg})
        counts_a[a_ind] = get_count(req, url)

        # For each term in list a, loop through each term in list b
        for b_ind, term_b_arg in enumerate(term_b_args):

            # Skip collections of equivalent term combinations - if single term list
            #  This will skip the diagonal row, and any combinations already collected
            if square and co_occurences[a_ind, b_ind] != -1:
                continue

            # Get number of results for the list-B term on its own
            #   This does not depend on the list-A term, so it is only requested
            #   on the first pass, rather than once per list-A term
            if not square and a_ind == 0:
                url = urls.get_url('search', settings={'term' : term_b_arg})
                counts_b[b_ind] = get_count(req, url)

            # Get number of results for combination of terms
            full_term_arg = join(term_a_arg, term_b_arg, 'AND')
            url = urls.get_url('search', settings={'term' : full_term_arg})
            count = get_count(req, url)

            # Store the result, mirroring it across the diagonal if square
            co_occurences[a_ind, b_ind] = count
            if square:
                co_occurences[b_ind, a_ind] = count

    counts = counts_a if square else [counts_a, counts_b]

    meta_data.add_requester(req)

    return co_occurences, counts, meta_data