def get_coauthors(self): """Return list of coauthors, their scopus-id and research areas.""" url = self.xml.find('coredata/link[@rel="coauthor-search"]').get( 'href') xml = download(url=url).text.encode('utf-8') xml = ET.fromstring(xml) coauthors = [] N = int(get_encoded_text(xml, 'opensearch:totalResults') or 0) AUTHOR = namedtuple('Author', ['name', 'scopus_id', 'affiliation', 'categories']) count = 0 while count < N: params = {'start': count, 'count': 25} xml = download(url=url, params=params).text.encode('utf-8') xml = ET.fromstring(xml) for entry in xml.findall('atom:entry', ns): given_name = get_encoded_text( entry, 'atom:preferred-name/atom:given-name') surname = get_encoded_text(entry, 'atom:preferred-name/atom:surname') coauthor_name = '{0} {1}'.format(given_name, surname) scopus_id = get_encoded_text(entry, 'dc:identifier').replace( 'AUTHOR_ID:', '') affiliation = get_encoded_text( entry, 'atom:affiliation-current/atom:affiliation-name') # get categories for this author s = ', '.join([ '{0} ({1})'.format(subject.text, subject.attrib['frequency']) for subject in entry.findall('atom:subject-area', ns) ]) coauthors += [AUTHOR(coauthor_name, scopus_id, affiliation, s)] count += 25 return coauthors
def get_coauthors(self): """Retrieves basic information about co-authors as a list of namedtuples in the form (surname, given_name, id, areas, affiliation_id, name, city, country), where areas is a list of subject area codes joined by "; ". Note: These information will not be cached and are slow for large coauthor groups. """ # Get number of authors to search for res = download(url=self.coauthor_link, accept='json') data = loads(res.text)['search-results'] N = int(data.get('opensearch:totalResults', 0)) # Store information in namedtuples fields = 'surname given_name id areas affiliation_id name city country' coauth = namedtuple('Coauthor', fields) coauthors = [] # Iterate over search results in chunks of 25 results count = 0 while count < N: params = {'start': count, 'count': 25} res = download(url=self.coauthor_link, params=params, accept='json') data = loads(res.text)['search-results'].get('entry', []) # Extract information for each coauthor for entry in data: aff = entry.get('affiliation-current', {}) try: areas = [a['$'] for a in entry.get('subject-area', [])] except TypeError: # Only one subject area given areas = [entry['subject-area']['$']] new = coauth( surname=entry['preferred-name']['surname'], given_name=entry['preferred-name'].get('given-name'), id=entry['dc:identifier'].split(':')[-1], areas='; '.join(areas), affiliation_id=aff.get('affiliation-id'), name=aff.get('affiliation-name'), city=aff.get('affiliation-city'), country=aff.get('affiliation-country')) coauthors.append(new) count += 25 return coauthors
def get_coauthors(self): """Return list of coauthors, their scopus-id and research areas.""" url = self.xml.find('coredata/link[@rel="coauthor-search"]').get('href') xml = download(url=url).text.encode('utf-8') xml = ET.fromstring(xml) coauthors = [] N = int(get_encoded_text(xml, 'opensearch:totalResults') or 0) AUTHOR = namedtuple('Author', ['name', 'scopus_id', 'affiliation', 'categories']) count = 0 while count < N: params = {'start': count, 'count': 25} xml = download(url=url, params=params).text.encode('utf-8') xml = ET.fromstring(xml) for entry in xml.findall('atom:entry', ns): given_name = get_encoded_text(entry, 'atom:preferred-name/atom:given-name') surname = get_encoded_text(entry, 'atom:preferred-name/atom:surname') coauthor_name = u'{0} {1}'.format(given_name, surname) scopus_id = get_encoded_text(entry, 'dc:identifier').replace('AUTHOR_ID:', '') affiliation = get_encoded_text(entry, 'atom:affiliation-current/atom:affiliation-name') # get categories for this author s = u', '.join(['{0} ({1})'.format(subject.text, subject.attrib['frequency']) for subject in entry.findall('atom:subject-area', ns)]) coauthors += [AUTHOR(coauthor_name, scopus_id, affiliation, s)] count += 25 return coauthors
def get_coauthors(self): """Retrieves basic information about co-authors as a list of namedtuples in the form (surname, given_name, id, areas, affiliation_id, name, city, country), where areas is a list of subject area codes joined by "; ". Note: These information will not be cached and are slow for large coauthor groups. """ # Get number of authors to search for res = download(url=self.coauthor_link, accept='json') data = loads(res.text)['search-results'] N = int(data.get('opensearch:totalResults', 0)) # Store information in namedtuples fields = 'surname given_name id areas affiliation_id name city country' coauth = namedtuple('Coauthor', fields) coauthors = [] # Iterate over search results in chunks of 25 results count = 0 while count < N: params = {'start': count, 'count': 25} res = download(url=self.coauthor_link, params=params, accept='json') data = loads(res.text)['search-results'].get('entry', []) # Extract information for each coauthor for entry in data: aff = entry.get('affiliation-current', {}) try: areas = [a['$'] for a in entry.get('subject-area', [])] except TypeError: # Only one subject area given areas = [entry['subject-area']['$']] new = coauth(surname=entry['preferred-name']['surname'], given_name=entry['preferred-name'].get('given-name'), id=entry['dc:identifier'].split(':')[-1], areas='; '.join(areas), name=aff.get('affiliation-name'), affiliation_id=aff.get('affiliation-id'), city=aff.get('affiliation-city'), country=aff.get('affiliation-country')) coauthors.append(new) count += 25 return coauthors
def _parse(res, params, n, api, **kwds):
    """Auxiliary function to download results and parse json."""
    cursor = "cursor" in params
    if not cursor:
        start = params["start"]
    if n == 0:
        return ""
    _json = res.get('search-results', {}).get('entry', [])
    # Download the remaining information in chunks
    while n > 0:
        n -= params["count"]
        if cursor:
            pointer = res['search-results']['cursor'].get('@next')
            params.update({'cursor': pointer})
        else:
            start += params["count"]
            params.update({'start': start})
        res = download(url=URL[api], params=params, accept="json",
                       **kwds).json()
        _json.extend(res.get('search-results', {}).get('entry', []))
    return _json

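# A self-contained sketch of the offset-based paging that _parse() performs
# (the cursor branch instead follows the '@next' token returned with each
# page). The fake_download helper stands in for the real Scopus API and is
# purely illustrative.
def fake_download(params, total=55):
    """Return one page of dummy entries for the requested offset."""
    start, count = params['start'], params['count']
    entries = [{'id': i} for i in range(start, min(start + count, total))]
    return {'search-results': {'entry': entries}}

def collect_all(total=55, count=25):
    """Page through a result set `count` items at a time."""
    items, start = [], 0
    while start < total:
        page = fake_download({'start': start, 'count': count}, total)
        items.extend(page['search-results']['entry'])
        start += count
    return items

print(len(collect_all()))  # 55 entries collected across three pages
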
def __init__(self, query, api, refresh, count=200, start=0,
             max_entries=5000, view='STANDARD', cursor=False, **kwds):
    """Class intended as superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.
    api : str
        The name of the Scopus API to be accessed.  Allowed values:
        AffiliationSearch, AuthorSearch, ScopusSearch.
    refresh : bool
        Whether to refresh the cached file if it exists or not.
    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having fewer results.
    start : int (optional, default=0)
        DEPRECATED! The entry number of the first search item to start
        with.
    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        To skip this check, set `max_entries` to `None`.
    view : str (optional, default=STANDARD)
        The view of the file that should be downloaded.  Will not take
        effect for already cached files.
    cursor : str (optional, default=False)
        Whether to use the cursor in order to iterate over all search
        results without limit on the number of the results.  In contrast
        to the `start` parameter, the `cursor` parameter does not allow
        users to obtain partial results.
    kwds : key-value pairings, optional
        Keywords passed on to requests header.  Must contain fields
        and values specified in the respective API specification.

    Raises
    ------
    ScopusQueryError
        If the number of search results exceeds max_entries.
    ValueError
        If the api parameter is an invalid entry.
    """
    # Checks
    if api not in URL:
        raise ValueError('api parameter must be one of ' +
                         ', '.join(URL.keys()))
    if not config.has_section('Directories'):
        create_config()
    if start != 0:
        text = "Parameter start is deprecated and will be removed "\
               "in scopus 1.6."
        warn(text, UserWarning)

    # Read the file contents if file exists and we are not refreshing,
    # otherwise download query anew and cache file
    qfile = join(config.get('Directories', api),
                 md5(query.encode('utf8')).hexdigest())
    if not refresh and exists(qfile):
        with open(qfile, "rb") as f:
            self._json = [loads(line) for line in f.readlines()]
    else:
        # Get a count of how many things to retrieve from first chunk
        params = {'query': query, 'count': count, 'view': view}
        if cursor:
            params.update({'cursor': '*'})
        else:
            params.update({'start': 0})
        res = download(url=URL[api], params=params, accept="json",
                       **kwds).json()
        n = int(res['search-results'].get('opensearch:totalResults', 0))
        if not cursor and n > max_entries:
            # Stop if there are too many results
            text = ('Found {} matches. Set max_entries to a higher '
                    'number, change your query ({}) or set '
                    'subscription=True'.format(n, query))
            raise ScopusQueryError(text)
        self._json = res.get('search-results', {}).get('entry', [])
        if n == 0:
            self._json = ""
        # Download the remaining information in chunks
        while n > 0:
            n -= count
            params.update({'count': count})
            if cursor:
                pointer = res['search-results']['cursor'].get('@next')
                params.update({'cursor': pointer})
            else:
                start += count
                params.update({'start': start})
            res = download(url=URL[api], params=params, accept="json",
                           **kwds).json()
            self._json.extend(
                res.get('search-results', {}).get('entry', []))
        # Finally write out the file
        with open(qfile, 'wb') as f:
            for item in self._json:
                f.write('{}\n'.format(dumps(item)).encode('utf-8'))

def __init__(self, query, api, refresh, count=200, start=0,
             max_entries=5000, view='STANDARD'):
    """Class intended as superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.
    api : str
        The name of the Scopus API to be accessed.  Allowed values:
        AffiliationSearch, AuthorSearch, ScopusSearch.
    refresh : bool
        Whether to refresh the cached file if it exists or not.
    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having fewer results.
    start : int (optional, default=0)
        The entry number of the first search item to start with.
    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        The Scopus Search Engine does not allow more than 5000 entries.
    view : str (optional, default=STANDARD)
        The view of the file that should be downloaded.  Will not take
        effect for already cached files.  Allowed values: STANDARD,
        COMPLETE.  Note: Only the ScopusSearch API additionally uses
        view COMPLETE.

    Raises
    ------
    ScopusQueryError
        If the number of search results exceeds max_entries.
    ValueError
        If the api parameter or view parameter is an invalid entry.
    """
    # Checks
    if api not in URL:
        raise ValueError('api parameter must be one of ' +
                         ', '.join(URL.keys()))
    allowed_views = ('STANDARD', 'COMPLETE')
    if view not in allowed_views:
        raise ValueError('view parameter must be one of ' +
                         ', '.join(allowed_views))
    if not config.has_section('Directories'):
        create_config()

    # Read the file contents if file exists and we are not refreshing,
    # otherwise download query anew and cache file
    qfile = join(config.get('Directories', api),
                 md5(query.encode('utf8')).hexdigest())
    if not refresh and exists(qfile):
        with open(qfile, "rb") as f:
            self._json = [loads(line) for line in f.readlines()]
    else:
        # Get a count of how many things to retrieve from first chunk
        params = {'query': query, 'count': count, 'start': 0, 'view': view}
        res = download(url=URL[api], params=params, accept="json").json()
        n = int(res['search-results'].get('opensearch:totalResults', 0))
        if n > max_entries:
            # Stop if there are too many results
            text = ('Found {} matches. Set max_entries to a higher '
                    'number or change your query ({})'.format(n, query))
            raise ScopusQueryError(text)
        self._json = res.get('search-results', {}).get('entry', [])
        if n == 0:
            self._json = ""
        # Download the remaining information in chunks
        while n > 0:
            n -= count
            start += count
            params.update({'count': count, 'start': start})
            res = download(url=URL[api], params=params,
                           accept="json").json()
            self._json.extend(res.get('search-results', {}).get('entry', []))
        # Finally write out the file
        with open(qfile, 'wb') as f:
            for item in self._json:
                f.write('{}\n'.format(dumps(item)).encode('utf-8'))

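# A minimal sketch of how a concrete search class might build on the
# superclass __init__ above. It assumes that superclass is importable as
# `Search`; the subclass name, the example query and the `results` property
# are illustrative assumptions, not the library's actual API.
class DemoAuthorSearch(Search):
    def __init__(self, query, refresh=False):
        # Delegate caching, chunked downloading and error handling upstream
        super().__init__(query, api='AuthorSearch', refresh=refresh)

    @property
    def results(self):
        """Raw JSON entries collected by the superclass."""
        return self._json

s = DemoAuthorSearch('AUTHLAST(Einstein) AND AUTHFIRST(Albert)')
print(len(s.results))
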
def __init__(self, query, fields='eid', count=200, start=0,
             max_entries=5000, refresh=False):
    """Class to search a query, and retrieve a list of EIDs as results.

    Parameters
    ----------
    query : str
        A string of the query.
    fields : str (optional, default='eid')
        The fields you want returned.  Allowed fields are specified in
        https://dev.elsevier.com/guides/ScopusSearchViews.htm.  Since
        currently only EIDs are stored, this parameter is being kept for
        later use only.
    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having fewer results.
    start : int (optional, default=0)
        The entry number of the first search item to start with.
    refresh : bool (optional, default=False)
        Whether to refresh the cached file if it exists or not.
    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        The Scopus Search Engine does not allow more than 5000 entries.

    Raises
    ------
    Exception
        If the number of search results exceeds max_entries.

    Notes
    -----
    XML results are cached in ~/.scopus/search/{query}.

    The EIDs are stored as a property named EIDS.
    """
    qfile = os.path.join(
        SCOPUS_SEARCH_DIR,
        # We need to remove / in a DOI here so we can save
        # it as a file.
        query.replace('/', '_slash_'))

    if os.path.exists(qfile) and not refresh:
        with open(qfile) as f:
            self._EIDS = [eid for eid in f.read().strip().split('\n')
                          if eid]
    else:
        # No cached file exists, or we are refreshing.
        # First, we get a count of how many things to retrieve
        url = 'https://api.elsevier.com/content/search/scopus'
        params = {'query': query, 'field': fields, 'count': 0, 'start': 0,
                  'date': '2017-2018'}
        xml = download(url=url, params=params).text.encode('utf-8')
        results = ET.fromstring(xml)
        N = results.find('opensearch:totalResults', ns)
        try:
            N = int(N.text)
        except:
            N = 0
        if N > max_entries:
            raise Exception(('N = {}. '
                             'Set max_entries to a higher number or '
                             'change your query ({})').format(N, query))

        self._EIDS = []
        while N > 0:
            params = {'query': query, 'fields': fields, 'count': count,
                      'start': start, 'date': '2017-2018'}
            resp = download(url=url, params=params, accept="json")
            results = resp.json()

            if 'entry' in results.get('search-results', []):
                self._EIDS += [str(r['eid'])
                               for r in results['search-results']['entry']]

            start += count
            N -= count

        with open(qfile, 'wb') as f:
            for eid in self.EIDS:
                f.write('{}\n'.format(eid).encode('utf-8'))

def __init__(self, query, fields='eid', count=200, start=0,
             refresh=False, max_entries=1000):
    """Class to search a query, and retrieve a list of EIDs as results.

    Parameters
    ----------
    query : str
        A string of the query.
    fields : str (optional, default='eid')
        The list of fields you want returned.
    count : int (optional, default=200)
        The number of entries to be displayed.
    start : int (optional, default=0)
        The entry number of the first search item to start with.
    refresh : bool (optional, default=False)
        Whether to refresh the cached file if it exists or not.
    max_entries : int (optional, default=1000)
        Raise error when the number of results is beyond this number.

    Notes
    -----
    XML results are cached in ~/.scopus/search/{query}.

    The EIDs are stored as a property.
    """
    self.query = query
    qfile = os.path.join(
        SCOPUS_SEARCH_DIR,
        # We need to remove / in a DOI here so we can save
        # it as a file.
        query.replace('/', '_slash_'))

    if os.path.exists(qfile) and not refresh:
        with open(qfile) as f:
            self._EIDS = [eid for eid in f.read().strip().split('\n')
                          if eid]
    else:
        # No cached file exists, or we are refreshing.
        # First, we get a count of how many things to retrieve
        url = 'http://api.elsevier.com/content/search/scopus'
        params = {'query': query, 'field': fields, 'count': 0, 'start': 0}
        xml = download(url=url, params=params).text.encode('utf-8')
        results = ET.fromstring(xml)
        N = results.find('opensearch:totalResults', ns)
        try:
            N = int(N.text)
        except:
            N = 0
        if N > max_entries:
            raise Exception(('N = {}. '
                             'Set max_entries to a higher number or '
                             'change your query ({})').format(N, query))

        self._EIDS = []
        while N > 0:
            params = {'query': query, 'fields': fields, 'count': count,
                      'start': start}
            resp = download(url=url, params=params, accept="json")
            results = resp.json()

            if 'entry' in results.get('search-results', []):
                self._EIDS += [str(r['eid'])
                               for r in results['search-results']['entry']]

            start += count
            N -= count

        with open(qfile, 'wb') as f:
            for eid in self.EIDS:
                f.write('{}\n'.format(eid).encode('utf-8'))

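# Hypothetical usage of the EID-based search above. The class name
# `ScopusSearch` is an assumption; the `EIDS` property name is taken from the
# docstring's Notes and the `self.EIDS` reference in the code.
s = ScopusSearch('TITLE-ABS-KEY(photovoltaics)', refresh=False)
print(len(s.EIDS))   # number of matching documents
print(s.EIDS[:3])    # EID strings of the form '2-s2.0-...'
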
def __init__(self, query, filepath, url, refresh, count=200, start=0,
             max_entries=5000, view='STANDARD'):
    """Class intended as superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.
    filepath : str
        The complete filepath and -name of the cached file.
    url : str
        The API access point.
    refresh : bool
        Whether to refresh the cached file if it exists or not.
    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having fewer results.
    start : int (optional, default=0)
        The entry number of the first search item to start with.
    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        The Scopus Search Engine does not allow more than 5000 entries.
    view : str (optional, default=STANDARD)
        The view of the file that should be downloaded.  Will not take
        effect for already cached files.  Allowed values: STANDARD,
        COMPLETE.  Note: Only the Scopus search API additionally uses
        view COMPLETE.

    Raises
    ------
    Exception
        If the number of search results exceeds max_entries.
    ValueError
        If the view parameter contains an invalid entry.
    """
    allowed_views = ('STANDARD', 'COMPLETE')
    if view not in allowed_views:
        raise ValueError('view parameter must be one of ' +
                         ', '.join(allowed_views))

    # Read the file contents if it exists and we are not refreshing
    if not refresh and exists(filepath):
        self._json = []
        with open(filepath) as f:
            for r in f.readlines():
                self._json.append(loads(r))
    # Download file if cached file doesn't exist or we are refreshing
    else:
        # First, get a count of how many things to retrieve
        params = {'query': query, 'count': 0, 'start': 0, 'view': view}
        res = get_content(filepath, url=url, refresh=refresh, params=params,
                          accept='json')
        data = loads(res.decode('utf-8'))['search-results']
        N = int(data.get('opensearch:totalResults', 0))
        if N > max_entries:
            raise Exception(('Found {} matches. '
                             'Set max_entries to a higher number or '
                             'change your query ({})').format(N, query))
        # Then download the information in chunks
        self._json = []
        while N > 0:
            params.update({'count': count, 'start': start})
            res = download(url=url, params=params, accept="json")
            results = res.json()
            if 'entry' in results.get('search-results', []):
                for r in results['search-results']['entry']:
                    self._json.append({f: r[f] for f in r.keys()})
            start += count
            N -= count
        # Finally write out the file
        with open(filepath, 'wb') as f:
            for item in self._json:
                f.write('{}\n'.format(dumps(item)).encode('utf-8'))

def __init__(self, query, api, refresh, count=200, start=0,
             max_entries=5000, view='STANDARD', cursor=False,
             download_results=True, **kwds):
    """Class intended as superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.
    api : str
        The name of the Scopus API to be accessed.  Allowed values:
        AffiliationSearch, AuthorSearch, ScopusSearch.
    refresh : bool
        Whether to refresh the cached file if it exists or not.
    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having fewer results.
    start : int (optional, default=0)
        DEPRECATED! The entry number of the first search item to start
        with.
    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        To skip this check, set `max_entries` to `None`.
    view : str (optional, default=STANDARD)
        The view of the file that should be downloaded.  Will not take
        effect for already cached files.
    cursor : str (optional, default=False)
        Whether to use the cursor in order to iterate over all search
        results without limit on the number of the results.  In contrast
        to the `start` parameter, the `cursor` parameter does not allow
        users to obtain partial results.
    download_results : bool (optional, default=True)
        Whether to download results (if they have not been cached) or not.
    kwds : key-value pairings, optional
        Keywords passed on to requests header.  Must contain fields
        and values specified in the respective API specification.

    Raises
    ------
    ScopusQueryError
        If the number of search results exceeds max_entries.
    ValueError
        If the api parameter is an invalid entry.
    """
    # Checks
    if api not in URL:
        raise ValueError('api parameter must be one of ' +
                         ', '.join(URL.keys()))
    if not config.has_section('Directories'):
        create_config()
    if start != 0:
        text = "Parameter start is deprecated and will be removed "\
               "in scopus 1.6."
        warn(text, UserWarning)

    # Read the file contents if file exists and we are not refreshing,
    # otherwise download query anew and cache file
    qfile = join(config.get('Directories', api),
                 md5(query.encode('utf8')).hexdigest())
    if not refresh and exists(qfile):
        with open(qfile, "rb") as f:
            self._json = [loads(line) for line in f.readlines()]
        self._n = n = len(self._json)
    else:
        # Set query parameters
        params = {'query': query, 'count': count, 'view': view}
        if cursor:
            params.update({'cursor': '*'})
        else:
            params.update({'start': 0})
        # Download results
        res = download(url=URL[api], params=params, accept="json",
                       **kwds).json()
        n = int(res['search-results'].get('opensearch:totalResults', 0))
        self._n = n
        if not cursor and n > max_entries:
            # Stop if there are too many results
            text = ('Found {} matches. Set max_entries to a higher '
                    'number, change your query ({}) or set '
                    'subscription=True'.format(n, query))
            raise ScopusQueryError(text)
        if download_results:
            self._json = _parse(res, params, n, api, **kwds)
            # Finally write out the file
            with open(qfile, 'wb') as f:
                for item in self._json:
                    f.write('{}\n'.format(dumps(item)).encode('utf-8'))

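# A sketch of the download_results=False path added above: fetch only the
# first page to learn the result count without paging through everything.
# It assumes the superclass is importable as `Search`; the subclass and the
# `get_results_size` helper are illustrative assumptions.
class SizeOnlySearch(Search):
    def __init__(self, query, api, refresh=False):
        super().__init__(query, api=api, refresh=refresh,
                         download_results=False)

    def get_results_size(self):
        """Number of matches Scopus reports for the query."""
        return self._n

s = SizeOnlySearch('TITLE("machine learning")', api='ScopusSearch')
print(s.get_results_size())
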
def __init__(self, query, filepath, url, refresh, count=200, start=0,
             max_entries=5000):
    """Class intended for use as a superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.
    filepath : str
        The complete filepath and -name of the cached file.
    url : str
        The API access point.
    refresh : bool
        Whether to refresh the cached file if it exists or not.
    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having fewer results.
    start : int (optional, default=0)
        The entry number of the first search item to start with.
    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        The Scopus Search Engine does not allow more than 5000 entries.

    Raises
    ------
    Exception
        If the number of search results exceeds max_entries.
    """
    # Read the file contents if it exists and we are not refreshing.
    if not refresh and exists(filepath):
        self._json = []
        with open(filepath) as f:
            for r in f.readlines():
                self._json.append(loads(r))
    # If cached file doesn't exist, or we are refreshing, download file.
    else:
        # First, we get a count of how many things to retrieve.
        params = {'query': query, 'count': 0, 'start': 0}
        res = get_content(filepath, url=url, refresh=refresh, params=params,
                          accept='json')
        data = loads(res.decode('utf-8'))['search-results']
        N = int(data.get('opensearch:totalResults', 0))
        if N > max_entries:
            raise Exception(('Found {} matches. '
                             'Set max_entries to a higher number or '
                             'change your query ({})').format(N, query))
        # Then we download the information in chunks.
        self._json = []
        while N > 0:
            params = {'query': query, 'count': count, 'start': start}
            resp = download(url=url, params=params, accept="json")
            results = resp.json()
            if 'entry' in results.get('search-results', []):
                for r in results['search-results']['entry']:
                    self._json.append({f: r[f] for f in r.keys()})
            start += count
            N -= count
        # Finally write out the file.
        with open(filepath, 'wb') as f:
            for author in self._json:
                f.write('{}\n'.format(dumps(author)).encode('utf-8'))