Beispiel #1
0
    def institution(self):
        """
        The name, country and identifier of the first usable affiliation
        associated with this researcher.

        Employments are examined before educations, each in the order in
        which the profile lists them (presumably most recent first -- to be
        confirmed against the ORCID API). The first affiliation that has
        both a name and a country is returned.

        :returns: a dict with the keys ``identifier`` (may be ``None``),
            ``name`` and ``country``, or ``None`` when no affiliation has
            both a name and a country
        """
        employments = jpath('activities-summary/employments/employment-summary',
                            self.json,
                            default=[])
        educations = jpath('activities-summary/educations/education-summary',
                           self.json,
                           default=[])
        # Build a fresh list rather than extending ``employments`` in place:
        # the list handed back by jpath may be the one stored in
        # ``self.json``, and ``+=`` would then append the educations to the
        # profile data on every call.
        affiliations = list(employments) + list(educations)

        for affiliation in affiliations:
            disamb = jpath('organization/disambiguated-organization',
                           affiliation,
                           default={})
            source = disamb.get('disambiguation-source')
            inst_id = disamb.get('disambiguated-organization-identifier')
            name = jpath('organization/name', affiliation)
            country = jpath('organization/address/country', affiliation)
            identifier = None
            # Ringgold identifiers are deliberately skipped, see
            # https://github.com/ORCID/ORCID-Source/issues/3297
            if source and inst_id and source.lower() != 'ringgold':
                # Identifier format: "<lowercased source>-<identifier>"
                identifier = u'{}-{}'.format(source.lower(), inst_id)

            if name and country:
                return {
                    'identifier': identifier,
                    'name': name,
                    'country': country,
                }
        return None
Beispiel #2
0
 def homepage(self):
     """
     Return a URL for this researcher, if the profile lists any.

     The first researcher URL whose name contains "home" or "personal"
     (case-insensitively) wins; otherwise the first listed URL is used.
     ``None`` is returned when the profile lists no URL.
     """
     entries = jpath('orcid-profile/orcid-bio/researcher-urls/researcher-url', self.json, default=[])
     for entry in entries:
         link = jpath('url/value', entry)
         label = jpath('url-name/value', entry)
         if label is None:
             continue
         lowered = label.lower()
         if 'home' in lowered or 'personal' in lowered:
             return urlize(link)
     # No URL was labelled as a homepage: fall back to the first one
     if len(entries) == 0:
         return None
     return urlize(jpath('url/value', entries[0])) or None
Beispiel #3
0
 def name(self):
     """
     Return the researcher's name as found on the ORCID profile.

     When the profile has a "credit name", it is parsed with
     parse_comma_name and returned. Otherwise the given and family names
     are each normalized and returned as a (given, family) pair, with
     missing parts treated as empty strings.
     """
     details = jpath('orcid-profile/orcid-bio/personal-details', self.json)
     credit = jpath('credit-name/value', details)
     if credit is None:
         given = normalize_name_words(jpath('given-names/value', details, ''))
         family = normalize_name_words(jpath('family-name/value', details, ''))
         return (given, family)
     return parse_comma_name(credit)
Beispiel #4
0
 def name(self):
     """
     Return the researcher's name as found on the ORCID profile.

     A non-empty "credit name" is parsed with parse_comma_name and
     returned. Otherwise the given and family names are each normalized
     and returned as a (given, family) pair, with missing parts treated
     as empty strings.
     """
     name_record = jpath('person/name', self.json)
     credit = jpath('credit-name/value', name_record)
     if not credit:
         given = normalize_name_words(
             jpath('given-names/value', name_record, ''))
         family = normalize_name_words(
             jpath('family-name/value', name_record, ''))
         return (given, family)
     return parse_comma_name(credit)
Beispiel #5
0
 def homepage(self):
     """
     Return a URL for this researcher, if the profile lists any.

     The first researcher URL whose name contains "home" or "personal"
     (case-insensitively) wins; otherwise the first listed URL is used.
     ``None`` is returned when the profile lists no URL.
     """
     entries = jpath('person/researcher-urls/researcher-url',
                     self.json,
                     default=[])
     for entry in entries:
         link = jpath('url/value', entry)
         label = jpath('url-name', entry)
         if label is None:
             continue
         lowered = label.lower()
         if 'home' in lowered or 'personal' in lowered:
             return urlize(link)
     # No URL was labelled as a homepage: fall back to the first one
     if len(entries) == 0:
         return None
     return urlize(jpath('url/value', entries[0])) or None
Beispiel #6
0
    def fetch_all_records(self, filters=None, cursor="*"):
        """
        Iterates over the Crossref ``/works`` records matching the given
        filters, following the cursor returned with each page of results.

        This is a generator: pages of 100 rows are requested one at a time
        as the results are consumed. Iteration stops when a page has no
        items or when no next cursor is returned.

        :param filters: filters as specified by the REST API (as a dictionary
            mapping filter names to string values)
        :param cursor: the initial cursor where to start the fetching
            (useful to resume failed ingestions)
        :returns: a generator yielding the items of each page
        :raises MetadataSourceException: if the request fails, if the
            response cannot be decoded, or if the response status is
            ``failed``
        """
        if filters is None:
            filters = {}
        params = {}
        if filters:
            params['filter'] = ','.join(k + ":" + v
                                        for k, v in filters.items())

        rows = 100
        next_cursor = cursor
        while next_cursor:
            params['rows'] = rows
            params['cursor'] = next_cursor
            params['mailto'] = settings.CROSSREF_MAILTO

            # Only the request and the decoding of its body are guarded, so
            # that errors raised while the caller consumes the items are
            # not reported as fetch errors.
            try:
                r = make_crossref_call('/works', params=params)
                r.raise_for_status()
                js = r.json()
            except ValueError as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nInvalid response.\n'
                    + 'Parameters were: %s\nJSON parser error was: %s' %
                    (urlencode(params), str(e))) from e
            except requests.exceptions.RequestException as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nError was: ' +
                    str(e)) from e

            if js['status'] == 'failed':
                raise MetadataSourceException(
                    'Querying Crossref with {} failed.'.format(r.url))

            items = jpath('message/items', js, default=[])
            if not items:
                # An empty page marks the end of the result set
                break
            for item in items:
                yield item

            next_cursor = jpath('message/next-cursor', js)
            logger.info(
                "Next cursor: {}".format(next_cursor))  # to ease recovery
Beispiel #7
0
 def other_names(self):
     """
     Returns the list of other names listed on the ORCiD profile.
     This includes the (given,family) name if a credit name was defined.

     :returns: a list of names: the normalized (given, family) pair when a
         credit name exists, followed by each "other name" that has a
         value, parsed with parse_comma_name
     """
     name_item = jpath('orcid-profile/orcid-bio/personal-details', self.json)
     names = []
     credit_name = jpath('credit-name/value', name_item)
     if credit_name is not None:
         # Default to '' so that a missing given or family name is
         # normalized as an empty string instead of being passed on as None.
         names.append((
             normalize_name_words(jpath('given-names/value', name_item, '')),
             normalize_name_words(jpath('family-name/value', name_item, ''))))
     other_names = jpath('other-names/other-name', name_item, default=[])
     for name in other_names:
         val = name.get('value')
         if val is not None:
             names.append(parse_comma_name(val))
     return names
Beispiel #8
0
 def other_names(self):
     """
     Collect the alternative names found on the ORCiD profile.

     When a credit name is defined, the normalized (given, family) pair
     comes first. Every "other name" entry that has a ``content`` is then
     parsed with parse_comma_name and appended.
     """
     person = jpath('person', self.json)
     collected = []
     if jpath('name/credit-name/value', person) is not None:
         given = normalize_name_words(
             jpath('name/given-names/value', person, ''))
         family = normalize_name_words(
             jpath('name/family-name/value', person, ''))
         collected.append((given, family))
     for entry in jpath('other-names/other-name', person, default=[]):
         content = entry.get('content')
         if content is None:
             continue
         collected.append(parse_comma_name(content))
     return collected
Beispiel #9
0
    def fetch_all_records(self, filters=None, cursor="*"):
        """
        Iterates over the Crossref ``/works`` records matching the given
        filters, following the cursor returned with each page of results.

        This is a generator: pages of 100 rows are requested one at a time
        as the results are consumed. Iteration stops when a page has no
        items or when no next cursor is returned.

        :param filters: filters as specified by the REST API (as a dictionary
            mapping filter names to string values)
        :param cursor: the initial cursor where to start the fetching
            (useful to resume failed ingestions)
        :returns: a generator yielding the items of each page
        :raises MetadataSourceException: if the request fails, if the
            response cannot be decoded, or if the response status is
            ``failed``
        """
        if filters is None:
            filters = {}
        params = {}
        if filters:
            params['filter'] = ','.join(k + ":" + v for k, v in filters.items())

        rows = 100
        next_cursor = cursor
        while next_cursor:
            params['rows'] = rows
            params['cursor'] = next_cursor
            params['mailto'] = settings.CROSSREF_MAILTO

            # Only the request and the decoding of its body are guarded, so
            # that errors raised while the caller consumes the items are
            # not reported as fetch errors.
            try:
                r = make_crossref_call('/works', params=params)
                r.raise_for_status()
                js = r.json()
            except ValueError as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nInvalid response.\n' +
                    'Parameters were: %s\nJSON parser error was: %s' % (urlencode(params), str(e))) from e
            except requests.exceptions.RequestException as e:
                raise MetadataSourceException('Error while fetching CrossRef results:\nError was: ' + str(e)) from e

            if js['status'] == 'failed':
                raise MetadataSourceException(
                    'Querying Crossref with {} failed.'.format(r.url))

            items = jpath('message/items', js, default=[])
            if not items:
                # An empty page marks the end of the result set
                break
            for item in items:
                yield item

            next_cursor = jpath('message/next-cursor', js)
            logger.info("Next cursor: {}".format(next_cursor))  # to ease recovery
Beispiel #10
0
 def dois(self):
     """
     Collect the DOIs listed among the external identifiers of this work.

     Only identifiers whose type is exactly 'DOI' are considered, and only
     those for which to_doi returns a truthy value are kept.
     """
     found = []
     identifiers = self.j(
         'work-external-identifiers/work-external-identifier', [])
     for identifier in identifiers:
         if identifier.get('work-external-identifier-type') != 'DOI':
             continue
         candidate = to_doi(
             jpath('work-external-identifier-id/value', identifier))
         if not candidate:
             continue
         # The paper could be created from CrossRef metadata at this point,
         # but for now only the DOI is stored so that the metadata can be
         # fetched by batch later.
         found.append(candidate)
     return found
Beispiel #11
0
 def other_names(self):
     """
     Collect the alternative names found on the ORCiD profile.

     When a credit name is defined, the normalized (given, family) pair
     comes first. Every "other name" entry that has a ``value`` is then
     parsed with parse_comma_name and appended.
     """
     details = jpath('orcid-profile/orcid-bio/personal-details',
                     self.json)
     collected = []
     if jpath('credit-name/value', details) is not None:
         given = normalize_name_words(
             jpath('given-names/value', details, ''))
         family = normalize_name_words(
             jpath('family-name/value', details, ''))
         collected.append((given, family))
     for entry in jpath('other-names/other-name', details, default=[]):
         value = entry.get('value')
         if value is None:
             continue
         collected.append(parse_comma_name(value))
     return collected
Beispiel #12
0
 def doi(self):
     """
     Return the DOI of this publication, or ``None`` if there is none.

     The first external id of type 'doi' with the relationship 'SELF' and
     a non-empty value that to_doi accepts is returned.
     """
     for entry in jpath('external-ids/external-id', self.json, []):
         if entry.get('external-id-type') != 'doi':
             continue
         if entry.get('external-id-relationship') != 'SELF':
             continue
         raw_value = entry.get('external-id-value')
         if not raw_value:
             continue
         parsed = to_doi(raw_value)
         if parsed:
             return parsed
     return None
Beispiel #13
0
    def fetch_batch(cls, dois):
        """
        Fetch the papers for a list of DOIs from CrossRef, in batches.

        :params dois: List of DOIs
        :returns: a list with one entry per given DOI, in the same order:
            the paper recorded for that DOI, or ``None``. Lookups use the
            lowercased DOI.
        """
        # CrossRef only accepts certain characters in a DOI; the others are
        # removed to get better matching
        dois = [cls.remove_unapproved_characters(doi) for doi in dois]
        # Every requested DOI starts out as None and is overridden once a
        # paper has been built for it
        papers = dict.fromkeys(doi.lower() for doi in dois)
        # DOIs rejected by the comma filter are not requested and therefore
        # stay None (presumably because the filter values are comma-joined)
        pending = cls._filter_dois_by_comma(dois)

        request_headers = {'User-Agent': settings.CROSSREF_USER_AGENT}
        endpoint = 'https://api.crossref.org/works'
        session = requests.Session()

        while len(pending) > 0:
            batch, pending = (pending[:cls.batch_length],
                              pending[cls.batch_length:])
            query = {
                'filter': ','.join('doi:{}'.format(doi) for doi in batch),
                'mailto': settings.CROSSREF_MAILTO,
                'rows': cls.batch_length,
            }
            try:
                response = request_retry(
                    endpoint,
                    params=query,
                    headers=request_headers,
                    session=session,
                    retries=0,  # There is probably a user waiting
                )
            except requests.exceptions.RequestException as error:
                # The service could not be reached: skip this batch
                logger.info(error)
                continue
            for item in jpath('message/items', response.json(), []):
                try:
                    paper = cls.to_paper(item)
                except CiteprocError:
                    logger.debug(item)
                    continue
                papers[paper.get_doi()] = paper

        return [papers.get(doi.lower(), None) for doi in dois]
Beispiel #14
0
    def search_for_dois_incrementally(
            self,
            query,
            filters=None,
            max_batches=max_crossref_batches_per_researcher):
        """
        Searches for DOIs for the given query and yields their metadata as it finds them.

        This is a generator: result pages of 20 rows are requested one at a
        time as the results are consumed, until a page comes back empty or
        ``max_batches`` pages have been fetched.

        :param query: the search query to pass to CrossRef
        :param filters: filters as specified by the REST API (a dictionary
            mapping filter names to string values)
        :param max_batches: maximum number of queries to send to CrossRef
            (a falsy value means no limit)
        :returns: a generator yielding the items of each result page
        :raises MetadataSourceException: if the request fails or the
            response is not valid JSON
        """
        params = {}
        if query:
            params['query'] = query
        if filters:
            params['filter'] = ','.join(
                k + ":" + v for k, v in filters.items())

        count = 0
        rows = 20
        offset = 0
        url = 'http://api.crossref.org/works'
        while not max_batches or count < max_batches:
            params['rows'] = rows
            params['offset'] = offset

            # Only the request and the decoding of its body are guarded, so
            # that errors raised while the caller consumes the items are
            # not reported as fetch errors.
            try:
                r = requests.get(url, params=params)
                print("CROSSREF: " + r.url)
                js = r.json()
            except ValueError as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nInvalid response.\n'
                    +
                    'URL was: %s\nParameters were: %s\nJSON parser error was: %s'
                    % (url, urlencode(params), e))
            except requests.exceptions.RequestException as e:
                # Report the URL that was requested
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nUnable to open the URL: '
                    + url + '\nError was: ' + str(e))

            items = jpath('message/items', js, default=[])
            if not items:
                # An empty page marks the end of the results
                break
            for item in items:
                yield item

            offset += rows
            count += 1
Beispiel #15
0
    def search_for_dois_incrementally(self, query, filters=None, max_batches=max_crossref_batches_per_researcher):
        """
        Searches for DOIs for the given query and yields their metadata as it finds them.

        This is a generator: result pages of 20 rows are requested one at a
        time as the results are consumed, until a page comes back empty or
        ``max_batches`` pages have been fetched.

        :param query: the search query to pass to CrossRef
        :param filters: filters as specified by the REST API (a dictionary
            mapping filter names to string values)
        :param max_batches: maximum number of queries to send to CrossRef
            (a falsy value means no limit)
        :returns: a generator yielding the items of each result page
        :raises MetadataSourceException: if the request fails or the
            response is not valid JSON
        """
        params = {}
        if query:
            params['query'] = query
        if filters:
            params['filter'] = ','.join(k + ":" + v for k, v in filters.items())

        count = 0
        rows = 20
        offset = 0
        url = 'http://api.crossref.org/works'
        while not max_batches or count < max_batches:
            params['rows'] = rows
            params['offset'] = offset

            # Only the request and the decoding of its body are guarded, so
            # that errors raised while the caller consumes the items are
            # not reported as fetch errors.
            try:
                r = requests.get(url, params=params)
                print("CROSSREF: " + r.url)
                js = r.json()
            except ValueError as e:
                raise MetadataSourceException('Error while fetching CrossRef results:\nInvalid response.\n' +
                        'URL was: %s\nParameters were: %s\nJSON parser error was: %s' % (url, urlencode(params), e))
            except requests.exceptions.RequestException as e:
                # Report the URL that was requested
                raise MetadataSourceException('Error while fetching CrossRef results:\nUnable to open the URL: ' +
                        url + '\nError was: ' + str(e))

            items = jpath('message/items', js, default=[])
            if not items:
                # An empty page marks the end of the results
                break
            for item in items:
                yield item

            offset += rows
            count += 1
Beispiel #16
0
 def title(self):
     """
     Return the title of this publication, read from the
     ``title/title/value`` path of its JSON (the original author notes
     that it is always provided).
     """
     publication_title = jpath('title/title/value', self.json)
     return publication_title
Beispiel #17
0
    def _fetch_day(cls, day):
        """
        Fetches a whole day from CrossRef

        All works whose update date is ``day`` are requested page by page,
        following the cursor returned with each response, and every item is
        passed to ``cls.to_paper``. Items that raise ``CiteprocError`` or
        ``ValueError`` are logged and skipped.

        :param day: the day to fetch; only its ``isoformat()`` is used
        :returns: nothing; progress and totals are written to the log
        """
        filters = {
            'from-update-date': day.isoformat(),
            'until-update-date': day.isoformat(),
        }
        params = {
            'filter':
            ','.join('{}:{}'.format(key, value)
                     for key, value in filters.items()),
            'rows':
            cls.rows,
            'mailto':
            settings.CROSSREF_MAILTO,
        }
        url = 'https://api.crossref.org/works'
        headers = {
            'User-Agent': settings.CROSSREF_USER_AGENT,
        }

        s = requests.Session()
        cursor = '*'
        total_results = 0
        loop_runs = 0
        new_papers = 0
        while cursor:
            params['cursor'] = cursor
            r = request_retry(
                url,
                params=params,
                headers=headers,
                session=s,
            )
            # Decode the body once per response instead of once per lookup
            js = r.json()
            if cursor == '*':
                # First page: remember how many results to expect in total
                total_results = jpath('message/total-results', js, 0)
                logger.info('Fetch for day: {}, number results: {}'.format(
                    day.isoformat(), total_results))
            cursor = jpath('message/next-cursor', js)
            items = jpath('message/items', js, [])
            if len(items) == 0:
                # An empty page ends the loop even if a cursor was returned
                cursor = False
            else:
                for item in items:
                    try:
                        cls.to_paper(item)
                    except CiteprocError:
                        logger.debug(item)
                    except ValueError as e:
                        logger.exception(e)
                        logger.info(item)
                    else:
                        new_papers += 1
            # Emit a status line every ``cls.emit_status_every`` pages
            loop_runs += 1
            if loop_runs % cls.emit_status_every == 0:
                # The last page may be partial, so never report a negative
                # number of remaining papers
                remaining = max(0, total_results - loop_runs * cls.rows)
                logger.info('Parsed another {} papers. {} more to go'.format(
                    cls.rows * cls.emit_status_every, remaining))

        logger.info(
            'For day {} have {} paper been added or updated out of {}.'.format(
                day.isoformat(), new_papers, total_results))
Beispiel #18
0
    def fetch_orcid_records(self,
                            orcid_identifier,
                            profile=None,
                            use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.

        This is a generator, so nothing is done (not even the validation of
        the identifier) until it is iterated. Papers built directly from the
        ORCiD metadata are yielded first. When ``use_doi`` is set, works
        that carry DOIs are put aside and resolved afterwards in one go via
        ``fetch_metadata_from_dois``. The incremental CrossRef search is
        currently disabled (commented out below).

        As side effects, ``self.researcher`` is set from the profile and
        ``warn_user_of_ignored_papers`` is called with the papers that were
        ignored.

        :param orcid_identifier: the ORCiD whose publications are fetched
        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        :raises MetadataSourceException: if the identifier is not a valid ORCiD
        """
        cr_api = CrossRefAPI()

        # Cleanup iD:
        orcid_id = validate_orcid(orcid_identifier)
        if orcid_id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(orcid_id=orcid_id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            # Without a profile there is nothing to yield
            print(e)
            return

        # As we have fetched the profile, let's update the Researcher
        self.researcher = Researcher.get_or_create_by_orcid(orcid_identifier,
                                                            profile.json,
                                                            update=True)
        if not self.researcher:
            return

        # Reference name
        ref_name = profile.name
        # curl -H "Accept: application/orcid+json"
        # 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
        dois = []  # list of DOIs to fetch
        ignored_papers = [
        ]  # list of ignored papers due to incomplete metadata

        # Fetch publications (1st attempt with ORCiD data)
        pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                     profile, [])
        for pub in pubs:
            data_paper = ORCIDDataPaper.from_orcid_metadata(
                ref_name, orcid_id, pub, stop_if_dois_exists=use_doi)
            if not data_paper:
                continue

            if data_paper.dois and use_doi:  # We want to batch it rather than manually do it.
                dois.extend(data_paper.dois)
                continue

            # If the paper is skipped due to invalid metadata.
            # We first try to reconcile it with local researcher author name.
            # Then, we consider it missed.
            if data_paper.skipped:
                data_paper = self.reconcile_paper(
                    ref_name,
                    orcid_id,
                    pub,
                    overrides={
                        'authors': [(self.researcher.name.first,
                                     self.researcher.name.last)]
                    })
                if data_paper.skipped:
                    print('%s is skipped due to incorrect metadata (%s)' %
                          (data_paper, data_paper.skip_reason))

                    ignored_papers.append(data_paper.as_dict())
                    continue

            yield self.create_paper(data_paper)

        # 2nd attempt with DOIs and CrossRef
        if use_doi:
            # Let's grab papers from CrossRef
            #for success, paper_or_metadata in self.fetch_crossref_incrementally(cr_api, orcid_id):
            #    if success:
            #        yield paper_or_metadata
            #    else:
            #        ignored_papers.append(paper_or_metadata)
            #        print('This metadata (%s) yields no paper.' %
            #              (unicode(paper_or_metadata)))

            # Let's grab papers with DOIs found in our ORCiD profile.
            # FIXME(RaitoBezarius): if we fail here, we should get back the pub
            # and yield it.
            for success, paper_or_metadata in self.fetch_metadata_from_dois(
                    cr_api, ref_name, orcid_id, dois):
                if success:
                    yield paper_or_metadata
                else:
                    ignored_papers.append(paper_or_metadata)
                    print('This metadata (%s) yields no paper.' %
                          (paper_or_metadata))

        self.warn_user_of_ignored_papers(ignored_papers)
        if ignored_papers:
            print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
Beispiel #19
0
 def j(self, path, default=None):
     """
     Look up ``path`` in ``self._pub`` with jpath, returning ``default``
     as the fallback value.
     """
     source = self._pub
     return jpath(path, source, default)
Beispiel #20
0
 def get_contrib(js):
     """
     Build a dict with the ``orcid`` and ``name`` of a contributor, read
     from the ``contributor-orcid`` and ``credit-name/value`` paths of
     ``js``.
     """
     contributor = {}
     contributor['orcid'] = jpath('contributor-orcid', js)
     contributor['name'] = jpath('credit-name/value', js)
     return contributor
Beispiel #21
0
 def test_jpath(self):
     """
     Check jpath on a missing key (with and without a default), a
     top-level key, a nested key and a key whose value is a dict.
     """
     nested = {'a': {'b': 7}, 'c': None}
     cases = [
         (('awesome', {}), None),
         (('awesome', {}, 41), 41),
         (('a', {'a': 'b'}, 41), 'b'),
         (('a/b', nested, 41), 7),
         (('a', nested, 41), {'b': 7}),
     ]
     for arguments, expected in cases:
         self.assertEqual(jpath(*arguments), expected)
Beispiel #22
0
 def j(self, path, default=None):
     """
     Look up ``path`` in ``self.json`` with jpath, returning ``default``
     as the fallback value.
     """
     source = self.json
     return jpath(path, source, default)