Beispiel #1
0
    def __init__(self, aff_id, refresh=False):
        """Retrieve and hold the Scopus record of a single affiliation.

        Parameters
        ----------
        aff_id : str or int
            The Scopus Affiliation ID.  An Elsevier EID (i.e., of the form
            10-s2.0-nnnnnnnn) is accepted as well; only the part after the
            last hyphen is used.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.
            Handed to get_content() unchanged.

        Notes
        -----
        The cache file is named after the numeric ID and is placed in the
        directory configured as 'ContentAffiliationRetrieval' in the
        'Directories' section of the configuration.  The decoded
        'affiliation-retrieval-response' object is stored in self._json.
        """
        # Reduce the input to the bare numeric ID.  The int() round trip
        # drops leading zeros and raises ValueError for non-numeric input.
        numeric_id = str(int(str(aff_id).rsplit('-', 1)[-1]))

        cache_dir = config.get('Directories', 'ContentAffiliationRetrieval')
        cache_file = join(cache_dir, numeric_id)
        endpoint = ('https://api.elsevier.com/content/affiliation/'
                    'affiliation_id/{}')

        payload = get_content(cache_file, url=endpoint.format(numeric_id),
                              refresh=refresh, accept='json')
        document = loads(payload.decode('utf-8'))
        self._json = document['affiliation-retrieval-response']
Beispiel #2
0
    def __init__(self, aff_id, refresh=False):
        """Load the XML record of a Scopus affiliation (deprecated class).

        Parameters
        ----------
        aff_id : str or int
            The Scopus Affiliation ID.  An Elsevier EID (i.e., of the form
            10-s2.0-nnnnnnnn) is accepted as well; only the part after the
            last hyphen is used.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Notes
        -----
        The cache file is SCOPUS_AFFILIATION_DIR/{aff_id}, with aff_id
        reduced to its numeric part.  The parsed XML root is stored in
        self.xml.
        """
        # Warn while the 'Affiliation' option in section 'Warnings' is
        # truthy, then set that option to '0'.
        if config.getboolean('Warnings', 'Affiliation'):
            template = config.get('Warnings', 'Text')
            warnings.warn(template.format('ContentAffiliationRetrieval'),
                          DeprecationWarning)
            config.set('Warnings', 'Affiliation', '0')

        # Bare numeric ID; int() raises ValueError for non-numeric input.
        numeric_id = str(int(str(aff_id).rsplit('-', 1)[-1]))
        cache_file = os.path.join(SCOPUS_AFFILIATION_DIR, numeric_id)
        endpoint = ('https://api.elsevier.com/content/affiliation/'
                    'affiliation_id/{}')

        raw = get_content(cache_file, url=endpoint.format(numeric_id),
                          refresh=refresh)
        self.xml = ET.fromstring(raw)
Beispiel #3
0
    def __init__(self, query, refresh=False):
        """Run a Scopus search for a query via Search.__init__.

        Parameters
        ----------
        query : str
            A string of the query.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Raises
        ------
        Exception
            Per the original documentation, if the number of search
            results exceeds 5000 (the max_entries value handed to
            Search.__init__).

        Notes
        -----
        The cache file is named after the md5 hex digest of the UTF-8
        encoded query and is placed in the directory configured as
        'ScopusSearch' in the 'Directories' section of the configuration.

        The COMPLETE view is used to access more fields, see
        https://dev.elsevier.com/guides/ScopusSearchViews.htm.
        """
        self.query = query

        cache_dir = config.get('Directories', 'ScopusSearch')
        digest = hashlib.md5(query.encode('utf8')).hexdigest()
        cache_file = join(cache_dir, digest)

        # Delegate the actual search to the superclass; results are
        # requested 25 at a time in the COMPLETE view.
        Search.__init__(self, query, cache_file,
                        'https://api.elsevier.com/content/search/scopus',
                        refresh, max_entries=5000, count=25, start=0,
                        view='COMPLETE')
    def __init__(self, aff_id, refresh=False):
        """Fetch a Scopus affiliation as XML (deprecated class).

        Parameters
        ----------
        aff_id : str or int
            The Scopus Affiliation ID, either plain or as an Elsevier EID
            (i.e., in the form 10-s2.0-nnnnnnnn).  Everything up to the
            last hyphen is discarded.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Notes
        -----
        The file is cached as SCOPUS_AFFILIATION_DIR/{aff_id}, where aff_id
        is the numeric part of the ID.  The parsed XML root ends up in
        self.xml.
        """
        warn_enabled = config.getboolean('Warnings', 'Affiliation')
        if warn_enabled:
            # Issue the deprecation notice and set the config option to '0'.
            message = config.get('Warnings', 'Text').format(
                'ContentAffiliationRetrieval')
            warnings.warn(message, DeprecationWarning)
            config.set('Warnings', 'Affiliation', '0')

        # Keep only the trailing numeric part; int() rejects anything else.
        tail = str(aff_id).split('-')[-1]
        aff_id = str(int(tail))

        cache_path = os.path.join(SCOPUS_AFFILIATION_DIR, aff_id)
        url = 'https://api.elsevier.com/content/affiliation/' \
              'affiliation_id/{}'.format(aff_id)

        content = get_content(cache_path, url=url, refresh=refresh)
        self.xml = ET.fromstring(content)
Beispiel #5
0
    def __init__(self, EID, view='META_ABS', refresh=False):
        """Class to represent the results from a Scopus abstract.

        Parameters
        ----------
        EID : str
            The Scopus ID (EID) of an abstract.

        view : str (optional, default=META_ABS)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files. Supported values: META, META_ABS,
            FULL, where FULL includes all information of META_ABS view and
            META_ABS includes all information of the META view.  See
            https://dev.elsevier.com/guides/AbstractRetrievalViews.htm
            for details.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Raises
        ------
        ValueError
            If view is not one of META, META_ABS, FULL.

        Exception
            If the root element of the retrieved document is
            'service-error'.

        Notes
        -----
        The cache file is SCOPUS_XML_DIR/{EID}.
        """
        # Warn while the 'Abstract' option in section 'Warnings' is truthy,
        # then set that option to '0'.
        if config.getboolean('Warnings', 'Abstract'):
            text = config.get('Warnings', 'Text').format('AbstractRetrieval')
            warnings.warn(text, DeprecationWarning)
            config.set('Warnings', 'Abstract', '0')
        allowed_views = ('META', 'META_ABS', 'FULL')
        if view not in allowed_views:
            raise ValueError('view parameter must be one of ' +
                             ', '.join(allowed_views))

        # Get file content
        qfile = os.path.join(SCOPUS_XML_DIR, EID)
        url = "https://api.elsevier.com/content/abstract/eid/{}".format(EID)
        params = {'view': view}
        self.xml = ET.fromstring(
            get_content(qfile, url=url, refresh=refresh, params=params))
        # Remove default namespace if present.  Element.iter() is used
        # because Element.getiterator() was removed in Python 3.9.
        remove = u'{http://www.elsevier.com/xml/svapi/abstract/dtd}'
        nsl = len(remove)
        for elem in self.xml.iter():
            if elem.tag.startswith(remove):
                elem.tag = elem.tag[nsl:]

        if self.xml.tag == 'service-error':
            raise Exception('\n{0}\n{1}'.format(EID, self.xml))

        self.coredata = self.xml.find('coredata', ns)
        self.items = self.xml.find('item', ns)
Beispiel #6
0
    def __init__(self, EID, view='META_ABS', refresh=False):
        """Class to represent the results from a Scopus abstract.

        Parameters
        ----------
        EID : str
            The Scopus ID (EID) of an abstract.

        view : str (optional, default=META_ABS)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files. Supported values: META, META_ABS,
            FULL, where FULL includes all information of META_ABS view and
            META_ABS includes all information of the META view.  See
            https://dev.elsevier.com/guides/AbstractRetrievalViews.htm
            for details.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Raises
        ------
        ValueError
            If the view parameter is not one of the supported values.

        Exception
            If the parsed document's root tag is 'service-error'.

        Notes
        -----
        The file is cached as SCOPUS_XML_DIR/{EID}.  The parsed root is
        stored in self.xml, its 'coredata' and 'item' children in
        self.coredata and self.items.
        """
        # One-time deprecation notice, controlled by the 'Abstract' option
        # in the 'Warnings' section (set to '0' after warning).
        if config.getboolean('Warnings', 'Abstract'):
            text = config.get('Warnings', 'Text').format('AbstractRetrieval')
            warnings.warn(text, DeprecationWarning)
            config.set('Warnings', 'Abstract', '0')
        allowed_views = ('META', 'META_ABS', 'FULL')
        if view not in allowed_views:
            raise ValueError('view parameter must be one of ' +
                             ', '.join(allowed_views))

        # Get file content
        qfile = os.path.join(SCOPUS_XML_DIR, EID)
        url = "https://api.elsevier.com/content/abstract/eid/{}".format(EID)
        params = {'view': view}
        self.xml = ET.fromstring(get_content(qfile, url=url, refresh=refresh,
                                             params=params))
        # Remove default namespace if present.  Iterate with Element.iter():
        # the former Element.getiterator() no longer exists as of Python 3.9.
        remove = u'{http://www.elsevier.com/xml/svapi/abstract/dtd}'
        nsl = len(remove)
        for elem in self.xml.iter():
            if elem.tag.startswith(remove):
                elem.tag = elem.tag[nsl:]

        if self.xml.tag == 'service-error':
            raise Exception('\n{0}\n{1}'.format(EID, self.xml))

        self.coredata = self.xml.find('coredata', ns)
        self.items = self.xml.find('item', ns)
Beispiel #7
0
    def __init__(self, eid, start, end=None, refresh=False):
        """Class to represent the results from a Scopus Citation Overview.
        See https://api.elsevier.com/documentation/guides/AbstractCitationViews.htm.

        Parameters
        ----------
        eid : str
            The EID of the abstract.

        start : str or int
            The first year for which the citation count should be loaded

        end : str or int (optional, default=None)
            The last year for which the citation count should be loaded.
            If None, the current year at the time of the call is used.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Raises
        ------
        ValueError
            If start or end cannot be converted with int().

        Notes
        -----
        The cache file is named after eid and placed in the directory
        configured as 'CitationOverview' in the 'Directories' section.
        Your API Key needs to be approved by Elsevier to access this view.
        """
        # Resolve the default here rather than in the signature: a default
        # of datetime.now().year would be evaluated only once, when the
        # function is defined, and go stale in long-running processes.
        if end is None:
            end = datetime.now().year

        # Get file content
        scopus_id = eid.split('0-')[-1]
        qfile = join(config.get('Directories', 'CitationOverview'), eid)
        url = "https://api.elsevier.com/content/abstract/citations/{}".format(
            scopus_id)
        params = {'scopus_id': scopus_id, 'date': '{}-{}'.format(start, end)}
        res = get_content(qfile,
                          url=url,
                          refresh=refresh,
                          params=params,
                          accept='json')
        self._data = loads(res.decode('utf-8'))['abstract-citations-response']

        self._start = int(start)
        self._end = int(end)

        # citeInfoMatrix: keep the first entry and strip everything up to
        # the first colon from each key.
        matrix = self._data['citeInfoMatrix']['citeInfoMatrixXML'][
            'citationMatrix']['citeInfo'][0]
        self._citeInfoMatrix = {
            k.split(":", 1)[-1]: v for k, v in matrix.items()}
        # identifier-legend: same key treatment for the first identifier.
        legend = self._data['identifier-legend']['identifier'][0]
        self._identifierlegend = {
            k.split(":", 1)[-1]: v for k, v in legend.items()}
        # citeColumnTotalXML
        self._citeColumnTotalXML = self._data['citeColumnTotalXML']  # not used
Beispiel #8
0
    def __init__(self, query, count=200, start=0, max_entries=5000,
                 refresh=False):
        """Run an author search for a query via Search.__init__.

        Parameters
        ----------
        query : str
            A string of the query, e.g. "authlast(Einstein) and
            authfirst(Albert)".

        count : int (optional, default=200)
            The number of entries to be displayed at once.  A smaller number
            means more queries with each query having less results.

        start : int (optional, default=0)
            The entry number of the first search item to start with.

        max_entries : int (optional, default=5000)
            Raise error when the number of results is beyond this number.
            The Scopus Search Engine does not allow more than 5000 entries.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        Raises
        ------
        Exception
            Per the original documentation, if the number of search
            results exceeds max_entries.

        Notes
        -----
        The cache file is named after the md5 hex digest of the UTF-8
        encoded query and is placed in the directory configured as
        'AuthorSearch' in the 'Directories' section of the configuration.

        Per the original documentation, the results are stored as a
        property named authors.
        """
        self.query = query

        cache_dir = config.get('Directories', 'AuthorSearch')
        digest = hashlib.md5(query.encode('utf8')).hexdigest()
        cache_file = join(cache_dir, digest)

        # All arguments are handed over positionally, in the order the
        # superclass constructor is called with here.
        Search.__init__(self, query, cache_file,
                        'https://api.elsevier.com/content/search/author',
                        refresh, count, start, max_entries)
Beispiel #9
0
    def __init__(self, author_id, refresh=False):
        """Retrieve a Scopus author profile by its ID.

        Parameters
        ----------
        author_id : str or int
            The ID of the author to search for.  Optionally expressed
            as an Elsevier EID (i.e., in the form 9-s2.0-nnnnnnnn); only
            the part after the last hyphen is used.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file (if it exists) or not.

        Notes
        -----
        The cache file is named after the numeric author ID and placed in
        the directory configured as 'AuthorRetrieval' in the 'Directories'
        section of the configuration.

        If the 'author-retrieval-response' payload cannot be indexed with
        0 (KeyError), a UserWarning listing the IDs found under
        'alias' -> 'prism:url' is emitted and the payload is kept as is.
        """
        # Bare numeric ID; int() raises ValueError for non-numeric input.
        self._id = str(int(str(author_id).rsplit('-', 1)[-1]))

        cache_dir = config.get('Directories', 'AuthorRetrieval')
        cache_file = join(cache_dir, self._id)
        url = 'https://api.elsevier.com/content/author/author_id/{}'.format(
            self._id)
        request_params = {'author_id': self._id, 'view': 'ENHANCED'}

        payload = get_content(cache_file, url=url, refresh=refresh,
                              accept='json', params=request_params)
        document = loads(payload.decode('utf-8'))
        self._json = document['author-retrieval-response']
        try:
            self._json = self._json[0]
        except KeyError:
            # Indexing a mapping with 0 lands here; the warning text below
            # describes this as a merged author profile.
            aliases = self._json['alias']['prism:url']
            if not isinstance(aliases, list):
                aliases = [aliases]
            alias = ', '.join(entry['$'].split(':')[-1] for entry in aliases)
            template = ('Author profile with ID {} has been merged and the '
                        'main profile is now one of {}.  Please update your '
                        'records manually.  Functionality of this object is '
                        'reduced.')
            warn(template.format(author_id, alias), UserWarning)
Beispiel #10
0
    def __init__(self, author_id, refresh=False, refresh_aff=False, level=1):
        """Load the XML record of a Scopus author (deprecated class).

        Parameters
        ----------
        author_id : str or int
            The ID of the author to search for.  Optionally expressed
            as an Elsevier EID (i.e., in the form 9-s2.0-nnnnnnnn); only
            the part after the last hyphen is used.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file (if it exists) or not.

        refresh_aff : bool (optional, default=False)
            Whether to refresh the cached corresponding affiliation views
            (if they exist) or not.  Not referenced in this constructor.

        level : int (optional, default=1)
            Stored as self.level.  Per the original documentation, the
            number of * to print in property __str__.

        Notes
        -----
        The cache file is SCOPUS_AUTHOR_DIR/{author_id}, with author_id
        reduced to its numeric part.  The parsed XML root is stored in
        self.xml.
        """
        # Warn while the 'Author' option in section 'Warnings' is truthy,
        # then set that option to '0'.
        if config.getboolean('Warnings', 'Author'):
            template = config.get('Warnings', 'Text')
            warnings.warn(template.format('AuthorRetrieval'),
                          DeprecationWarning)
            config.set('Warnings', 'Author', '0')

        # Bare numeric ID; int() raises ValueError for non-numeric input.
        numeric_id = str(int(str(author_id).rsplit('-', 1)[-1]))
        self.level = level

        cache_file = os.path.join(SCOPUS_AUTHOR_DIR, numeric_id)
        url = 'https://api.elsevier.com/content/author/author_id/{}'.format(
            numeric_id)
        request_params = {'author_id': numeric_id, 'view': 'ENHANCED'}

        raw = get_content(cache_file, url=url, refresh=refresh,
                          params=request_params)
        self.xml = ET.fromstring(raw)
Beispiel #11
0
    def __init__(self, author_id, refresh=False, refresh_aff=False, level=1):
        """Fetch a Scopus author profile as XML (deprecated class).

        Parameters
        ----------
        author_id : str or int
            The ID of the author to search for, either plain or as an
            Elsevier EID (i.e., in the form 9-s2.0-nnnnnnnn).  Everything
            up to the last hyphen is discarded.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file (if it exists) or not.

        refresh_aff : bool (optional, default=False)
            Whether to refresh the cached corresponding affiliation views
            (if they exist) or not.  This constructor does not read it.

        level : int (optional, default=1)
            Kept in self.level.  Per the original documentation, the
            number of * to print in property __str__.

        Notes
        -----
        The file is cached as SCOPUS_AUTHOR_DIR/{author_id}, where
        author_id is the numeric part of the ID.  The parsed XML root ends
        up in self.xml.
        """
        warn_enabled = config.getboolean('Warnings', 'Author')
        if warn_enabled:
            # Issue the deprecation notice and set the config option to '0'.
            message = config.get('Warnings', 'Text').format('AuthorRetrieval')
            warnings.warn(message, DeprecationWarning)
            config.set('Warnings', 'Author', '0')

        # Keep only the trailing numeric part; int() rejects anything else.
        tail = str(author_id).split('-')[-1]
        author_id = str(int(tail))
        self.level = level

        cache_path = os.path.join(SCOPUS_AUTHOR_DIR, author_id)
        url = 'https://api.elsevier.com/content/author/' \
              'author_id/{}'.format(author_id)
        query_params = {'view': 'ENHANCED', 'author_id': author_id}

        content = get_content(cache_path, url=url, refresh=refresh,
                              params=query_params)
        self.xml = ET.fromstring(content)
Beispiel #12
0
    def __init__(self, query, api, refresh, count=200, start=0,
                 max_entries=5000, view='STANDARD'):
        """Class intended as superclass to perform a search query.

        Parameters
        ----------
        query : str
            A string of the query.

        api : str
            The name of the Scopus API to be accessed; must be a key of
            URL.  Per the original documentation the allowed values are
            AffiliationSearch, AuthorSearch, ScopusSearch.

        refresh : bool
            Whether to refresh the cached file if it exists or not.

        count : int (optional, default=200)
            The number of entries to be displayed at once.  A smaller number
            means more queries with each query having less results.

        start : int (optional, default=0)
            The entry number of the first search item to start with.

        max_entries : int (optional, default=5000)
            Raise error when the number of results is beyond this number.
            The Scopus Search Engine does not allow more than 5000 entries.

        view : str (optional, default=STANDARD)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files.  Allowed values: STANDARD,
            COMPLETE.
            Note: Only the ScopusSearch API additionally uses view COMPLETE.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds max_entries.

        ValueError
            If the api parameter or view parameter is an invalid entry.

        Notes
        -----
        Results are stored in self._json and cached, one JSON document per
        line, in a file named after the md5 hex digest of the query inside
        the directory configured for `api` in the 'Directories' section.
        """
        # Checks
        if api not in URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(URL.keys()))
        allowed_views = ('STANDARD', 'COMPLETE')
        if view not in allowed_views:
            raise ValueError('view parameter must be one of ' +
                             ', '.join(allowed_views))
        if not config.has_section('Directories'):
            create_config()

        # Read the file contents if file exists and we are not refreshing,
        # otherwise download query anew and cache file
        qfile = join(config.get('Directories', api),
                     md5(query.encode('utf8')).hexdigest())
        if not refresh and exists(qfile):
            with open(qfile, "rb") as f:
                self._json = [loads(line) for line in f]
        else:
            # First chunk: begins at the requested `start` (previously the
            # first request was hard-coded to 0, so a non-zero `start` was
            # ignored here and then skipped a range of results) and also
            # tells us how many results exist in total.
            params = {'query': query, 'count': count, 'start': start,
                      'view': view}
            res = download(url=URL[api], params=params, accept="json").json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            if n > max_entries:  # Stop if there are too many results
                text = ('Found {} matches. Set max_entries to a higher '
                        'number or change your query ({})'.format(n, query))
                raise ScopusQueryError(text)
            self._json = res.get('search-results', {}).get('entry', [])
            if n == 0:
                # Kept as an empty string for compatibility with code that
                # relies on the historical value for "no results".
                self._json = ""
            # Download the remaining information in chunks.  Only offsets
            # below the total are requested, so no superfluous request is
            # sent after the last result has been fetched.
            start += count
            while start < n:
                params.update({'count': count, 'start': start})
                res = download(url=URL[api], params=params,
                               accept="json").json()
                self._json.extend(
                    res.get('search-results', {}).get('entry', []))
                start += count
            # Finally write out the file
            with open(qfile, 'wb') as f:
                for item in self._json:
                    f.write('{}\n'.format(dumps(item)).encode('utf-8'))
Beispiel #13
0
    def __init__(self, EID, view='META_ABS', refresh=False, id_type=None):
        """Retrieve the record of a Scopus abstract as JSON.

        Parameters
        ----------
        EID : str
            The Scopus ID (EID) of an abstract.  Converted with str().

        view : str (optional, default=META_ABS)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files. Allowed values: META, META_ABS,
            FULL, where FULL includes all information of META_ABS view and
            META_ABS includes all information of the META view.  See
            https://dev.elsevier.com/guides/AbstractRetrievalViews.htm
            for details.

        refresh : bool (optional, default=False)
            Whether to refresh the cached file if it exists or not.

        id_type: str (optional, default=None)
            The type of used ID. Allowed values: None, 'eid','pii',
            'scopus_id','pubmed_id','doi'. If the value is None, the ID
            type is obtained from detect_id_type(EID).  Other values
            manually set the ID type.

        Raises
        ------
        ValueError
            If the view or id_type parameter contains an invalid entry.

        Notes
        -----
        The cache file is placed in the directory configured as
        'AbstractRetrieval' in the 'Directories' section and is named
        after the ID with every '/' replaced by '_'.

        DOI always contains '/' symbol, which is a path separator in some
        operating systems so '/' has to be replaced in the filename for
        caching.
        """
        EID = str(EID)

        view_options = ('META', 'META_ABS', 'FULL')
        if view not in view_options:
            raise ValueError('view parameter must be one of ' +
                             ', '.join(view_options))

        if id_type is not None:
            id_options = ('eid', 'pii', 'scopus_id', 'pubmed_id', 'doi')
            if id_type not in id_options:
                raise ValueError('id_type parameter must be one of ' +
                                 ', '.join(id_options))
        else:
            id_type = detect_id_type(EID)

        cache_dir = config.get('Directories', 'AbstractRetrieval')
        cache_file = join(cache_dir, EID.replace('/', '_'))
        url = "https://api.elsevier.com/content/abstract/{}/{}".format(
            id_type, EID)
        payload = get_content(cache_file, url=url, refresh=refresh,
                              accept='json', params={'view': view})
        document = loads(payload.decode('utf-8'))
        self._json = document['abstracts-retrieval-response']

        # Shortcuts into item -> bibrecord; absent keys fall back to {}.
        bibrecord = self._json.get('item', {}).get('bibrecord', {})
        self._head = bibrecord.get('head', {})
        self._tail = bibrecord.get('tail', {})
        if self._tail is None:
            # 'tail' may be present but null; normalise to an empty dict.
            self._tail = {}

        source = self._head.get('source', {})
        extra_info = source.get('additional-srcinfo', {})
        self._confinfo = extra_info.get('conferenceinfo', {})
Beispiel #14
0
    def __init__(self, identifier, api, refresh, id_type=None, view=None,
                 date=None):
        """Class intended as superclass to perform retrievals.

        Parameters
        ----------
        identifier : str or int
            The ID of the item to retrieve.  It is converted with str(),
            appended to the API URL and used (with '/' replaced by '_')
            as the name of the cache file.

        api : str
            The name of the Scopus API to be accessed; must be a key of
            URL.  Per the original documentation the allowed values are
            AbstractRetrieval, AuthorRetrieval, CitationOverview,
            ContentAffiliationRetrieval.

        refresh : bool
            Whether to refresh the cached file if it exists or not.

        id_type : str (optional, default=None)
            The type of used ID.
            Note: Only used, and then required, for the AbstractRetrieval
            API, where it becomes part of the URL.

        view : str (optional, default=None)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files.  Sent as the 'view' request
            parameter; for the AuthorRetrieval API it is always replaced
            by 'ENHANCED'.

        date : str (optional, default=None)
            A string combining two years with a hyphen for which citations
            should be looked up for.
            Note: Will only take effect for the CitationOverview API.

        Raises
        ------
        ValueError
            If the api parameter is an invalid entry.
        """
        # Checks
        if api not in URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(URL.keys()))
        if not config.has_section('Directories'):
            create_config()

        # The identifier is documented as "str or int", but everything
        # below uses string operations (concatenation, split, replace).
        identifier = str(identifier)

        # Construct parameters
        url = URL[api]
        if api == "AbstractRetrieval":
            url += id_type + "/"
        elif api == "AuthorRetrieval":
            view = 'ENHANCED'
        params = {'view': view}
        if api == 'CitationOverview':
            # Only the part after the last '0-' is sent as scopus_id.
            params.update({'date': date,
                           'scopus_id': identifier.split('0-')[-1]})
        url += identifier

        # Parse file contents
        qfile = join(config.get('Directories', api),
                     identifier.replace('/', '_'))
        res = get_content(qfile, refresh, url=url, accept='json',
                          params=params)
        self._json = loads(res.decode('utf-8'))
Beispiel #15
0
    def __init__(self,
                 identifier,
                 api,
                 refresh,
                 id_type=None,
                 view=None,
                 date=None):
        """Class intended as superclass to perform retrievals.

        Parameters
        ----------
        identifier : str or int
            The ID of the item to retrieve.  Converted with str() before
            it is appended to the API URL and turned into the cache file
            name ('/' replaced by '_').

        api : str
            The name of the Scopus API to be accessed; must be a key of
            URL.  Per the original documentation the allowed values are
            AbstractRetrieval, AuthorRetrieval, CitationOverview,
            ContentAffiliationRetrieval.

        refresh : bool
            Whether to refresh the cached file if it exists or not.

        id_type : str (optional, default=None)
            The type of used ID.
            Note: Only used, and then required, for the AbstractRetrieval
            API, where it is inserted into the URL.

        view : str (optional, default=None)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files.  Sent as the 'view' request
            parameter; overridden with 'ENHANCED' for the AuthorRetrieval
            API.

        date : str (optional, default=None)
            A string combining two years with a hyphen for which citations
            should be looked up for.
            Note: Will only take effect for the CitationOverview API.

        Raises
        ------
        ValueError
            If the api parameter is an invalid entry.
        """
        # Checks
        if api not in URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(URL.keys()))
        if not config.has_section('Directories'):
            create_config()

        # Integer identifiers are allowed by the documented contract, yet
        # the code below concatenates, splits and replaces on the value,
        # so normalise it to a string first.
        identifier = str(identifier)

        # Construct parameters
        url = URL[api]
        if api == "AbstractRetrieval":
            url += id_type + "/"
        elif api == "AuthorRetrieval":
            view = 'ENHANCED'
        params = {'view': view}
        if api == 'CitationOverview':
            # Only the part after the last '0-' is sent as scopus_id.
            params.update({
                'date': date,
                'scopus_id': identifier.split('0-')[-1]
            })
        url += identifier

        # Parse file contents
        qfile = join(config.get('Directories', api),
                     identifier.replace('/', '_'))
        res = get_content(qfile,
                          refresh,
                          url=url,
                          accept='json',
                          params=params)
        self._json = loads(res.decode('utf-8'))
Beispiel #16
0
    def __init__(self,
                 query,
                 api,
                 refresh,
                 count=200,
                 start=0,
                 max_entries=5000,
                 view='STANDARD',
                 cursor=False,
                 **kwds):
        """Class intended as superclass to perform a search query.

        Parameters
        ----------
        query : str
            A string of the query.

        api : str
            The name of the Scopus API to be accessed; must be a key of
            URL.  Per the original documentation the allowed values are
            AffiliationSearch, AuthorSearch, ScopusSearch.

        refresh : bool
            Whether to refresh the cached file if it exists or not.

        count : int (optional, default=200)
            The number of entries to be displayed at once.  A smaller number
            means more queries with each query having less results.

        start : int (optional, default=0)
            DEPRECATED! The entry number of the first search item
            to start with.  A UserWarning is emitted for any value other
            than 0.

        max_entries : int (optional, default=5000)
            Raise error when the number of results is beyond this number.
            To skip this check, set `max_entries` to `None`.  The check is
            also skipped when `cursor` is used.

        view : str (optional, default=STANDARD)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files.

        cursor : bool (optional, default=False)
            Whether to use the cursor in order to iterate over all search
            results without limit on the number of the results.  In contrast
            to `start` parameter, the `cursor` parameter does not allow users
            to obtain partial results.

        kwds : key-value pairings, optional
            Keywords passed on to download().  Per the original
            documentation they must contain fields and values specified
            in the respective API specification.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds max_entries.

        ValueError
            If the api parameter is an invalid entry.

        Notes
        -----
        Results are stored in self._json and cached, one JSON document per
        line, in a file named after the md5 hex digest of the query inside
        the directory configured for `api` in the 'Directories' section.
        """
        # Checks
        if api not in URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(URL.keys()))
        if not config.has_section('Directories'):
            create_config()
        if start != 0:
            text = "Parameter start is deprecated and will be removed "\
                   "in scopus 1.6."
            warn(text, UserWarning)

        # Read the file contents if file exists and we are not refreshing,
        # otherwise download query anew and cache file
        qfile = join(config.get('Directories', api),
                     md5(query.encode('utf8')).hexdigest())
        if not refresh and exists(qfile):
            with open(qfile, "rb") as f:
                self._json = [loads(line) for line in f]
        else:
            # Get a count of how many things to retrieve from first chunk
            params = {'query': query, 'count': count, 'view': view}
            if cursor:
                params.update({'cursor': '*'})
            else:
                params.update({'start': 0})
            res = download(url=URL[api], params=params, accept="json",
                           **kwds).json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            # Stop if there are too many results.  max_entries=None is the
            # documented way to skip the check; comparing an int with None
            # would raise TypeError on Python 3, hence the explicit test.
            too_many = max_entries is not None and n > max_entries
            if not cursor and too_many:
                text = ('Found {} matches. Set max_entries to a higher '
                        'number, change your query ({}) or set '
                        'subscription=True'.format(n, query))
                raise ScopusQueryError(text)
            self._json = res.get('search-results', {}).get('entry', [])
            if n == 0:
                # Kept as an empty string for compatibility with code that
                # relies on the historical value for "no results".
                self._json = ""
            # Download the remaining information in chunks.  The first
            # chunk already covered `count` results, so account for it up
            # front; otherwise one superfluous request is sent after the
            # last result has been fetched.
            remaining = n - count
            while remaining > 0:
                remaining -= count
                params.update({'count': count})
                if cursor:
                    # Continue from the position the previous response
                    # reported under 'cursor' -> '@next'.
                    pointer = res['search-results']['cursor'].get('@next')
                    params.update({'cursor': pointer})
                else:
                    start += count
                    params.update({'start': start})
                res = download(url=URL[api],
                               params=params,
                               accept="json",
                               **kwds).json()
                self._json.extend(
                    res.get('search-results', {}).get('entry', []))
            # Finally write out the file
            with open(qfile, 'wb') as f:
                for item in self._json:
                    f.write('{}\n'.format(dumps(item)).encode('utf-8'))
Beispiel #17
0
    def __init__(self, query, api, refresh, count=200, start=0,
                 max_entries=5000, view='STANDARD', cursor=False,
                 download_results=True, **kwds):
        """Class intended as superclass to perform a search query.

        Parameters
        ----------
        query : str
            A string of the query.

        api : str
            The name of the Scopus API to be accessed.  Must be one of
            the keys of `URL`, e.g. AffiliationSearch, AuthorSearch,
            ScopusSearch.

        refresh : bool
            Whether to refresh the cached file if it exists or not.

        count : int (optional, default=200)
            The number of entries to be displayed at once.  A smaller number
            means more queries with each query having less results.

        start : int (optional, default=0)
            DEPRECATED! The entry number of the first search item
            to start with.  A non-zero value only triggers a warning;
            the first request always starts at entry 0.

        max_entries : int or None (optional, default=5000)
            Raise error when the number of results is beyond this number.
            To skip this check, set `max_entries` to `None`.  The check is
            also skipped when `cursor` is truthy, and it is never performed
            when the results are read from the cache.

        view : str (optional, default=STANDARD)
            The view of the file that should be downloaded.  Will not take
            effect for already cached files.

        cursor : bool (optional, default=False)
            Whether to use the cursor in order to iterate over all search
            results without limit on the number of the results.  In contrast
            to `start` parameter, the `cursor` parameter does not allow users
            to obtain partial results.

        download_results : bool (optional, default=True)
            Whether to download results (if they have not been cached) or not.

        kwds : key-value pairings, optional
            Keywords passed on to `download` and `_parse`.  Must contain
            fields and values specified in the respective API specification.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds max_entries.

        ValueError
            If the api parameter is an invalid entry.

        Notes
        -----
        The cache file lives in the directory configured for `api` in the
        'Directories' section of the config, and is named after the MD5
        hex digest of the query.  It holds one JSON document per line.
        """
        # Checks
        if api not in URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(URL.keys()))
        if not config.has_section('Directories'):
            create_config()
        if start != 0:
            text = "Parameter start is deprecated and will be removed "\
                   "in scopus 1.6."
            warn(text, UserWarning)

        # Read the file contents if file exists and we are not refreshing,
        # otherwise download query anew and cache file
        qfile = join(config.get('Directories', api),
                     md5(query.encode('utf8')).hexdigest())
        if not refresh and exists(qfile):
            with open(qfile, "rb") as f:
                # One JSON document per line, as written further below
                self._json = [loads(line) for line in f]
            self._n = len(self._json)
        else:
            # Set query parameters
            params = {'query': query, 'count': count, 'view': view}
            if cursor:
                params.update({'cursor': '*'})
            else:
                params.update({'start': 0})
            # Download the first chunk, which also reports the total
            # number of matches
            res = download(url=URL[api], params=params, accept="json",
                           **kwds).json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            self._n = n
            # Stop if there are too many results.  `max_entries=None`
            # disables the check; comparing an int against None would
            # otherwise raise TypeError on Python 3.
            too_many = max_entries is not None and n > max_entries
            if not cursor and too_many:
                text = ('Found {} matches. Set max_entries to a higher '
                        'number, change your query ({}) or set '
                        'subscription=True'.format(n, query))
                raise ScopusQueryError(text)
            # NOTE: when download_results is False, self._json is left
            # unset and nothing is written to the cache.
            if download_results:
                self._json = _parse(res, params, n, api, **kwds)
                # Finally write out the file
                with open(qfile, 'wb') as f:
                    for item in self._json:
                        f.write('{}\n'.format(dumps(item)).encode('utf-8'))