def get_coauthors(self):
    """Retrieves basic information about co-authors as a list of
    namedtuples in the form
    (surname, given_name, id, areas, affiliation_id, name, city, country),
    where areas is a string of subject areas joined by "; ".
    Note: This information is not cached, and retrieval is slow for
    large coauthor groups.
    """
    # Get the number of authors to search for
    res = cache_file(url=self.coauthor_link, accept='json')
    data = loads(res.text)['search-results']
    N = int(data.get('opensearch:totalResults', 0))
    # Store the information in namedtuples
    fields = 'surname given_name id areas affiliation_id name city country'
    coauth = namedtuple('Coauthor', fields)
    coauthors = []
    # Iterate over the search results in chunks of 25
    count = 0
    while count < N:
        params = {'start': count, 'count': 25}
        res = cache_file(url=self.coauthor_link, params=params,
                         accept='json')
        data = loads(res.text)['search-results'].get('entry', [])
        # Extract the information for each coauthor
        for entry in data:
            aff = entry.get('affiliation-current', {})
            try:
                areas = [a['$'] for a in entry.get('subject-area', [])]
            except TypeError:  # Only one subject area given
                areas = [entry['subject-area']['$']]
            new = coauth(
                surname=entry['preferred-name']['surname'],
                given_name=entry['preferred-name'].get('given-name'),
                id=entry['dc:identifier'].split(':')[-1],
                areas='; '.join(areas),
                name=aff.get('affiliation-name'),
                affiliation_id=aff.get('affiliation-id'),
                city=aff.get('affiliation-city'),
                country=aff.get('affiliation-country'))
            coauthors.append(new)
        count += 25
    return coauthors
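A short usage sketch for the method above. It is illustrative only: it assumes get_coauthors() is exposed on the scopus/pybliometrics AuthorRetrieval class, and the author ID is a placeholder.

# Hedged usage sketch -- assumes get_coauthors() is a method of
# scopus.AuthorRetrieval; the author ID below is a placeholder.
from scopus import AuthorRetrieval

au = AuthorRetrieval("7004212771")
for co in au.get_coauthors():
    # Each entry is a Coauthor namedtuple; the affiliation fields may
    # be None when no current affiliation is listed.
    print(co.surname, co.given_name, co.id, co.country)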
Example #2
def _parse(res, params, n, api, **kwds):
    """Auxiliary function to download remaining results and parse the JSON."""
    cursor = "cursor" in params
    if not cursor:
        start = params["start"]
    if n == 0:
        return []
    _json = res.get('search-results', {}).get('entry', [])
    # The first chunk of results is already contained in res
    n -= params["count"]
    # Download the remaining information in chunks
    while n > 0:
        if cursor:
            pointer = res['search-results']['cursor'].get('@next')
            params.update({'cursor': pointer})
        else:
            start += params["count"]
            params.update({'start': start})
        res = cache_file(url=SEARCH_URL[api], params=params, **kwds).json()
        _json.extend(res.get('search-results', {}).get('entry', []))
        n -= params["count"]
    return _json
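The chunked paging can be exercised offline with a stubbed downloader. Everything below (the fake response class, the page data, the FakeAPI name) is invented for illustration; cache_file and SEARCH_URL are stand-ins for the module's real helpers, so this sketch only works when pasted into the same namespace in which _parse is defined.

# Offline sketch: stub out cache_file and SEARCH_URL, then let _parse
# page through n=5 results delivered in chunks of 2 + 2 + 1.
class FakeResponse:
    def __init__(self, entries):
        self._payload = {'search-results': {'entry': entries}}

    def json(self):
        return self._payload

pages = iter([[{'id': 3}, {'id': 4}], [{'id': 5}]])

def cache_file(url, params, **kwds):  # stand-in for the real helper
    return FakeResponse(next(pages))

SEARCH_URL = {'FakeAPI': 'https://example.com/search'}

first = {'search-results': {'entry': [{'id': 1}, {'id': 2}]}}
results = _parse(first, {'start': 0, 'count': 2}, n=5, api='FakeAPI')
assert [e['id'] for e in results] == [1, 2, 3, 4, 5]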
Example #3
    def __init__(self,
                 query,
                 api,
                 refresh,
                 view='STANDARD',
                 count=200,
                 max_entries=5000,
                 cursor=False,
                 download=True,
                 **kwds):
        """Class intended as superclass to perform a search query.

        Parameters
        ----------
        query : str
            A string of the query.

        api : str
            The name of the Scopus API to be accessed.  Allowed values:
            AffiliationSearch, AuthorSearch, ScopusSearch.

        refresh : bool
            Whether to refresh the cached file if it exists.

        view : str
            The view of the file that should be downloaded.

        count : int (optional, default=200)
            The number of entries to be retrieved at once.  A smaller number
            means more queries, each of which returns fewer results.

        max_entries : int (optional, default=5000)
            Raise an error when the number of search results exceeds this
            number.  To skip this check, set `max_entries` to `None`.

        cursor : bool (optional, default=False)
            Whether to use a cursor in order to iterate over all search
            results without a limit on the number of results.  In contrast
            to the `start` parameter, the `cursor` parameter does not allow
            users to obtain partial results.

        download : bool (optional, default=True)
            Whether to download results (if they have not been cached) or not.

        kwds : key-value pairings, optional
            Keywords passed on to the requests header.  Must contain fields
            and values specified in the respective API specification.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds max_entries.

        ValueError
            If the api parameter is an invalid entry.
        """
        # Read the file contents if file exists and we are not refreshing,
        # otherwise download query anew and cache file
        fname = md5(query.encode('utf8')).hexdigest()
        qfile = join(get_folder(api, view), fname)
        if not refresh and exists(qfile):
            with open(qfile, "rb") as f:
                self._json = [loads(line) for line in f.readlines()]
            self._n = len(self._json)
        else:
            # Set query parameters
            params = {'query': query, 'count': count, 'view': view}
            if cursor:
                params.update({'cursor': '*'})
            else:
                params.update({'start': 0})
            # Download results
            res = cache_file(url=SEARCH_URL[api], params=params, **kwds).json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            self._n = n
            # Stop if there are too many results
            if not cursor and max_entries is not None and n > max_entries:
                text = ('Found {} matches. Set max_entries to a higher '
                        'number, change your query ({}) or set '
                        'subscription=True'.format(n, query))
                raise ScopusQueryError(text)
            if download:
                self._json = _parse(res, params, n, api, **kwds)
                # Finally write out the file
                with open(qfile, 'wb') as f:
                    for item in self._json:
                        f.write('{}\n'.format(dumps(item)).encode('utf-8'))
            else:
                # Ensure that accessing properties will not raise an error
                self._json = []
        self._view = view
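A hypothetical minimal subclass illustrates the intended use. The superclass name Search and the example query are assumptions; they follow the pybliometrics package this code appears to come from, but neither is shown above.

# Hypothetical subclass sketch -- assumes the superclass above is named
# Search; a valid API key and network access would be required to run it.
class AuthorSearch(Search):
    def __init__(self, query, refresh=False):
        super().__init__(query, api="AuthorSearch", refresh=refresh)

    @property
    def authors(self):
        # Entries are plain dicts parsed from the cached JSON lines
        return self._json

s = AuthorSearch("AUTHLAST(Selten) AND AUTHFIRST(Reinhard)")
print(s._n, "matching author profiles")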