def get_coauthors(self):
    """Retrieve basic information about co-authors.

    Returns
    -------
    coauthors : list of namedtuples
        Namedtuples in the form (surname, given_name, id, areas,
        affiliation_id, name, city, country), where areas is a list of
        subject area codes joined by "; ".

    Note: These information will not be cached and are slow for large
    coauthor groups.
    """
    # Get number of authors to search for.  Pass accept='json' here too,
    # for consistency with the chunked requests below (the original first
    # request omitted it).
    res = cache_file(url=self.coauthor_link, accept='json')
    data = loads(res.text)['search-results']
    N = int(data.get('opensearch:totalResults', 0))
    # Store information in namedtuples
    fields = 'surname given_name id areas affiliation_id name city country'
    coauth = namedtuple('Coauthor', fields)
    coauthors = []
    # Iterate over search results in chunks of 25 results
    count = 0
    while count < N:
        params = {'start': count, 'count': 25}
        res = cache_file(url=self.coauthor_link, params=params,
                         accept='json')
        data = loads(res.text)['search-results'].get('entry', [])
        # Extract information for each coauthor
        for entry in data:
            aff = entry.get('affiliation-current', {})
            try:
                areas = [a['$'] for a in entry.get('subject-area', [])]
            except TypeError:  # Only one subject area given (dict, not list)
                areas = [entry['subject-area']['$']]
            new = coauth(
                surname=entry['preferred-name']['surname'],
                given_name=entry['preferred-name'].get('given-name'),
                id=entry['dc:identifier'].split(':')[-1],
                areas='; '.join(areas),
                name=aff.get('affiliation-name'),
                affiliation_id=aff.get('affiliation-id'),
                city=aff.get('affiliation-city'),
                country=aff.get('affiliation-country'))
            coauthors.append(new)
        count += 25
    return coauthors
def _parse(res, params, n, api, **kwds): """Auxiliary function to download results and parse json.""" cursor = "cursor" in params if not cursor: start = params["start"] if n == 0: return "" _json = res.get('search-results', {}).get('entry', []) # Download the remaining information in chunks while n > 0: n -= params["count"] if cursor: pointer = res['search-results']['cursor'].get('@next') params.update({'cursor': pointer}) else: start += params["count"] params.update({'start': start}) res = cache_file(url=SEARCH_URL[api], params=params, **kwds).json() _json.extend(res.get('search-results', {}).get('entry', [])) return _json
def __init__(self, query, api, refresh, view='STANDARD', count=200,
             max_entries=5000, cursor=False, download=True, **kwds):
    """Class intended as superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.

    api : str
        The name of the Scopus API to be accessed.  Allowed values:
        AffiliationSearch, AuthorSearch, ScopusSearch.

    refresh : bool
        Whether to refresh the cached file if it exists or not.

    view : str
        The view of the file that should be downloaded.

    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having less results.

    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        To skip this check, set `max_entries` to `None`.

    cursor : bool (optional, default=False)
        Whether to use the cursor in order to iterate over all search
        results without limit on the number of the results.  In contrast
        to `start` parameter, the `cursor` parameter does not allow users
        to obtain partial results.

    download : bool (optional, default=True)
        Whether to download results (if they have not been cached) or not.

    kwds : key-value parings, optional
        Keywords passed on to requests header.  Must contain fields and
        values specified in the respective API specification.

    Raises
    ------
    ScopusQueryError
        If the number of search results exceeds max_entries.

    ValueError
        If the api parameter is an invalid entry.
    """
    # Validate api upfront so an invalid value raises the documented
    # ValueError instead of a KeyError from SEARCH_URL[api] later on.
    if api not in SEARCH_URL:
        raise ValueError("api parameter must be one of " +
                         ", ".join(SEARCH_URL))
    # Read the file contents if file exists and we are not refreshing,
    # otherwise download query anew and cache file
    fname = md5(query.encode('utf8')).hexdigest()
    qfile = join(get_folder(api, view), fname)
    if not refresh and exists(qfile):
        # Cached file stores one json object per line
        with open(qfile, "rb") as f:
            self._json = [loads(line) for line in f.readlines()]
        self._n = len(self._json)
    else:
        # Set query parameters
        params = {'query': query, 'count': count, 'view': view}
        if cursor:
            params.update({'cursor': '*'})
        else:
            params.update({'start': 0})
        # Download results
        res = cache_file(url=SEARCH_URL[api], params=params, **kwds).json()
        n = int(res['search-results'].get('opensearch:totalResults', 0))
        self._n = n
        # Guard against max_entries=None: the docstring allows it to skip
        # the check, but the original `n > max_entries` would raise a
        # TypeError when comparing an int with None.
        if not cursor and max_entries is not None and n > max_entries:
            # Stop if there are too many results
            text = ('Found {} matches. Set max_entries to a higher '
                    'number, change your query ({}) or set '
                    'subscription=True'.format(n, query))
            raise ScopusQueryError(text)
        if download:
            self._json = _parse(res, params, n, api, **kwds)
            # Finally write out the file
            with open(qfile, 'wb') as f:
                for item in self._json:
                    f.write('{}\n'.format(dumps(item)).encode('utf-8'))
        else:
            # Assures that properties will not result in an error
            self._json = []
    self._view = view