def delete_work_for_orcid(orcid):
     """Delete from ORCID the works listed in the works summary of `orcid`.

     For every entry of the summary's "group" list, the put-code of its first
     "work-summary" item is read and that work is deleted through the client.

     Args:
         orcid: the ORCID identifier whose works are deleted.
     """
     token = TestOrcidPusherBase._oauth_token(orcid)
     orcid_client = OrcidClient(token, orcid)
     summary = orcid_client.get_all_works_summary()
     for group in summary.get("group", []):
         # Only the first work-summary of each group is looked at.
         first_summary = group["work-summary"][0]
         orcid_client.delete_work(first_summary["put-code"])
 def test_get_putcodes_for_source_source_client_id_none(self):
     """The works summary of the test ORCID holds 90 putcodes for the source."""
     client = OrcidClient('mytoken', '0000-0002-4490-1930')
     response = client.get_all_works_summary()
     # Fail early if the service answered with an error.
     response.raise_for_result()
     source_putcodes = response.get_putcodes_for_source_iter(
         '0000-0001-8607-8906')
     found = [putcode for putcode in source_putcodes]
     assert len(found) == 90
class OrcidPutcodeGetter(object):
    """Retrieve from the ORCID API the putcodes of an author's Inspire works."""

    def __init__(self, orcid, oauth_token):
        """
        Args:
            orcid: ORCID identifier of the author.
            oauth_token: token used to build the ORCID client.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(oauth_token, orcid)
        credentials = current_app.config['ORCID_APP_CREDENTIALS']
        self.source_client_id_path = credentials['consumer_key']

    def get_all_inspire_putcodes(self):
        """
        Yield `(putcode, url)` for every work of the given ORCID whose url
        matches INSPIRE_WORK_URL_REGEX.

        Nothing is yielded when the works summary holds no putcodes.
        """
        all_putcodes = self._get_all_putcodes()
        if all_putcodes:
            for putcode, url in self._get_urls_for_putcodes(all_putcodes):
                # Skip the works that do not belong to Inspire.
                if not INSPIRE_WORK_URL_REGEX.match(url):
                    continue
                yield putcode, url

    def _get_all_putcodes(self):
        """
        Return the list of putcodes found in the works summary for the
        configured source client id.

        Raises:
            exceptions.InputDataInvalidException: when the summary response
                raises a BaseOrcidClientJsonException.
        """
        summary = self.client.get_all_works_summary()
        utils.log_service_response(
            logger, summary, 'in OrcidPutcodeGetter works summary')
        try:
            summary.raise_for_result()
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        source_putcodes = summary.get_putcodes_for_source(
            self.source_client_id_path)
        return list(source_putcodes)

    def _get_urls_for_putcodes(self, putcodes):
        """
        Return an iterable of the putcodes-and-urls of all the bulk works
        details responses for `putcodes`.

        Raises:
            exceptions.InputDataInvalidException: when a details response
                raises a BaseOrcidClientJsonException.
        """
        # get_bulk_works_details_iter() can be very expensive for an author
        # with many works (if each work also has many *contributors*): e.g.
        # for an ATLAS author with ~750 works, 8 calls would be performed
        # with a total data transfer > 0.5 Gb.
        all_pairs = []
        for details in self.client.get_bulk_works_details_iter(putcodes):
            utils.log_service_response(
                logger, details, 'in OrcidPutcodeGetter works details')
            try:
                details.raise_for_result()
            except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
                raise exceptions.InputDataInvalidException(from_exc=exc)

            page_pairs = details.get_putcodes_and_urls()
            all_pairs = itertools.chain(all_pairs, page_pairs)
        return all_pairs
 def __init__(self, orcid, recid, oauth_token,
              do_fail_if_duplicated_identifier=False,
              record_db_version=None):
     """
     Store the push parameters, load the record and build cache and client.

     Args:
         orcid: ORCID identifier of the author.
         recid: recid of the record to push.
         oauth_token: token used to build the ORCID client.
         do_fail_if_duplicated_identifier: stored as-is on the instance.
         record_db_version: stored as-is on the instance.
     """
     self.orcid = orcid
     self.recid = recid
     self.oauth_token = oauth_token
     self.do_fail_if_duplicated_identifier = do_fail_if_duplicated_identifier
     self.record_db_version = record_db_version
     # NOTE(review): presumably _get_inspire_record() reads the attributes
     # set above, so it is kept after them -- confirm against the method.
     self.inspire_record = self._get_inspire_record()
     self.cache = OrcidCache(orcid, recid)
     self.lock_name = 'orcid:{}'.format(orcid)
     self.client = OrcidClient(oauth_token, orcid)
     self.converter = None
    def __init__(self, orcid, recid, oauth_token):
        """
        Store the push parameters, load the 'lit' record and build cache and
        client.

        Args:
            orcid: ORCID identifier of the author.
            recid: recid of the literature record to push.
            oauth_token: token used to build the ORCID client.

        Raises:
            exceptions.RecordNotFoundException: when get_db_record() raises
                RecordGetterError for the given recid.
        """
        self.orcid = orcid
        self.recid = recid
        self.oauth_token = oauth_token

        try:
            record = get_db_record('lit', recid)
        except RecordGetterError as exc:
            message = 'recid={} not found for pid_type=lit'.format(recid)
            raise exceptions.RecordNotFoundException(message, from_exc=exc)
        self.inspire_record = record

        self.cache = OrcidCache(orcid, recid)
        self.lock_name = 'orcid:{}'.format(orcid)
        self.client = OrcidClient(oauth_token, orcid)
        # No xml has been generated yet.
        self.xml_element = None
# Example #6
# 0
 def __init__(
     self,
     orcid,
     recid,
     oauth_token,
     pushing_duplicated_identifier=False,
     record_db_version=None,
 ):
     """
     Store the push parameters, load the record and build cache and client.

     Args:
         orcid: ORCID identifier of the author.
         recid: recid of the record to push; stored as a string.
         oauth_token: token used to build the ORCID client.
         pushing_duplicated_identifier: stored as-is on the instance.
         record_db_version: stored as-is on the instance.
     """
     self.orcid = orcid
     self.recid = str(recid)
     self.oauth_token = oauth_token
     self.pushing_duplicated_identifier = pushing_duplicated_identifier
     self.record_db_version = record_db_version
     # NOTE(review): presumably _get_inspire_record() reads the attributes
     # set above, so it is kept after them -- confirm against the method.
     self.inspire_record = self._get_inspire_record()
     # The cache receives the recid as passed in, not the str() version.
     self.cache = OrcidCache(orcid, recid)
     self.lock_name = "orcid:{}".format(orcid)
     self.client = OrcidClient(oauth_token, orcid)
     self.converter = None
     self.cached_author_putcodes = {}
# Example #7
# 0
def get_putcode_for_work(orcid, token, recid):
    """
    Return the putcode of the ORCID work whose url ends with `/<recid>`.

    Only the works belonging to the source configured as
    `orcid-api.consumer_key` are considered.

    Args:
        orcid: ORCID identifier of the author.
        token: token used to build the ORCID client.
        recid: record id looked for at the end of the work url.

    Returns:
        The matching putcode, or None when the source has no putcodes or no
        url matches.
    """
    client = OrcidClient(token, orcid)
    summary = client.get_all_works_summary()
    summary.raise_for_result()
    consumer_key = config.get('orcid-api', 'consumer_key')
    source_putcodes = list(summary.get_putcodes_for_source_iter(consumer_key))

    if not source_putcodes:
        return None

    # TODO: this has to be simplified when we push recids as external
    # identifier (thus just the get_all_works_summary() call is required to
    # match recids with putcodes).
    url_suffix = '/{}'.format(recid)
    for details in client.get_bulk_works_details_iter(source_putcodes):
        details.raise_for_result()
        for putcode, url in details.get_putcodes_and_urls_iter():
            if url.endswith(url_suffix):
                return putcode
    return None
 def __init__(self, orcid, recid, oauth_token,
              do_fail_if_duplicated_identifier=False, record_db_version=None):
     """
     Store the push parameters, load the record and build cache and client.

     Args:
         orcid: ORCID identifier of the author.
         recid: recid of the record to push.
         oauth_token: token used to build the ORCID client.
         do_fail_if_duplicated_identifier: stored as-is on the instance.
         record_db_version: stored as-is on the instance.
     """
     self.orcid = orcid
     self.recid = recid
     self.oauth_token = oauth_token
     self.do_fail_if_duplicated_identifier = do_fail_if_duplicated_identifier
     self.record_db_version = record_db_version
     # NOTE(review): presumably _get_inspire_record() reads the attributes
     # set above, so it is kept after them -- confirm against the method.
     self.inspire_record = self._get_inspire_record()
     self.cache = OrcidCache(orcid, recid)
     self.lock_name = 'orcid:{}'.format(orcid)
     self.client = OrcidClient(oauth_token, orcid)
     self.converter = None
# Example #9
# 0
class OrcidPutcodeGetter(object):
    """Look up, through the ORCID API, the putcodes and recids of an author's works."""

    def __init__(self, orcid, oauth_token):
        """
        Args:
            orcid: ORCID identifier of the author.
            oauth_token: token used to build the ORCID client.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(oauth_token, orcid)
        credentials = current_app.config["ORCID_APP_CREDENTIALS"]
        self.source_client_id_path = credentials["consumer_key"]

    def get_all_inspire_putcodes_and_recids_iter(self):
        """
        Query ORCID api and yield `(putcode, recid)` for all the works of the
        given ORCID that belong to the configured source.

        Pairs whose recid is already present in the works summary are yielded
        first; the remaining putcodes are resolved through the works details.
        """
        summary = self._get_all_works_summary()
        # `pairs` is a list like: [('43326850', 20), ('43255490', None)]
        pairs = list(
            summary.get_putcodes_and_recids_for_source_iter(
                self.source_client_id_path))

        resolved = []
        unresolved_putcodes = []
        for pair in pairs:
            if pair[1]:
                resolved.append(pair)
            else:
                unresolved_putcodes.append(pair[0])

        for putcode, recid in resolved:
            yield putcode, recid

        if unresolved_putcodes:
            for putcode, recid in self._get_putcodes_and_recids_iter(
                    unresolved_putcodes):
                yield putcode, recid

    def _get_all_works_summary(self):
        """
        Query ORCID api for the works summary of the given ORCID and return
        the response.

        Raises:
            exceptions.TokenInvalidDeletedException: when the token is
                invalid, mismatched or has wrong permissions; the access
                token is deleted before raising.
            exceptions.InputDataInvalidException: for any other
                BaseOrcidClientJsonException.
        """
        summary = self.client.get_all_works_summary()
        LOGGER.info("Get ORCID work summary",
                    response=summary,
                    orcid=self.orcid)
        try:
            summary.raise_for_result()
        except (
                orcid_client_exceptions.TokenInvalidException,
                orcid_client_exceptions.TokenMismatchException,
                orcid_client_exceptions.TokenWithWrongPermissionException,
        ):
            LOGGER.info(
                "OrcidPutcodeGetter: deleting Orcid push access",
                token=self.oauth_token,
                orcid=self.orcid,
            )
            push_access_tokens.delete_access_token(self.oauth_token,
                                                   self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        return summary

    def _get_putcodes_and_recids_iter(self, putcodes):
        """
        Yield `(putcode, recid)` for the given putcodes, using the url found
        in the works details. Works with a non-Inspire url, or with a url the
        recid cannot be parsed from, are skipped.
        """
        for putcode, url in self._get_urls_for_putcodes_iter(putcodes):
            # Filter out putcodes that do not belong to Inspire.
            if not INSPIRE_WORK_URL_REGEX.match(url):
                continue
            recid = PidStoreBase.get_pid_from_record_uri(url)[1]
            if recid:
                yield putcode, recid
            else:
                LOGGER.error(
                    "OrcidPutcodeGetter: cannot parse recid from url",
                    url=url,
                    orcid=self.orcid,
                )

    def _get_urls_for_putcodes_iter(self, putcodes):
        """
        Return an iterable of the putcodes-and-urls of all the bulk works
        details responses for `putcodes`.

        Raises:
            exceptions.InputDataInvalidException: when a details response
                raises a BaseOrcidClientJsonException.
        """
        # `get_bulk_works_details_iter()` can be expensive for an author with
        # many works (if each work also has many *contributors*): e.g. for an
        # ATLAS author with ~750 works (each of them with many authors), 8
        # calls would be performed with a total data transfer > 0.5 Gb.
        all_pairs = []
        for details in self.client.get_bulk_works_details_iter(putcodes):
            # Note: this log can be large. Consider removing it when this part
            # is considered mature.
            LOGGER.info("ORCID work details",
                        response=details,
                        orcid=self.orcid)
            try:
                details.raise_for_result()
            except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
                raise exceptions.InputDataInvalidException(from_exc=exc)

            page_pairs = details.get_putcodes_and_urls_iter()
            all_pairs = itertools.chain(all_pairs, page_pairs)
        return all_pairs

    def get_putcodes_and_recids_by_identifiers_iter(self, identifiers):
        """
        Yield putcode and recid of the works matched by the external
        identifiers; a pair is yielded once per matching identifier.
        Note: external identifiers of type 'other-id' are skipped.

        Args:
            identifiers (List[inspirehep.orcid.converter.ExternalIdentifier]):
                list of all external identifiers added after the xml conversion.
        """
        summary = self._get_all_works_summary()
        works = summary.get_putcodes_and_external_identifiers_iter()
        for putcode, work_ids in works:
            # work_ids is a list like:
            #   [
            #       {'external-id-relationship': 'SELF',
            #        'external-id-type': 'other-id',
            #        'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
            #        'external-id-value': '20'
            #       },...
            #   ]

            # The recid is resolved for every work, before any matching.
            recid = self._get_recid_for_work(work_ids, str(putcode))

            for work_id in work_ids:
                id_type = work_id.get("external-id-type")
                # We are interested only in doi, arxiv, isbns.
                is_relevant_type = bool(id_type) and id_type.lower() != "other-id"
                if not is_relevant_type:
                    continue
                id_value = work_id.get("external-id-value")
                if not id_value:
                    continue

                candidate = ExternalIdentifier(id_type, id_value)
                if candidate in identifiers:
                    yield putcode, recid

    def _get_recid_for_work(self, external_identifiers, putcode):
        """
        Get the recid for a work given its external identifiers and putcode.
        The recid is taken from the first 'other-id' external identifier with
        an Inspire url and a value; otherwise the works details are queried.

        Args:
            external_identifiers (List[Dict]): a list like:
               [
                   {'external-id-relationship': 'SELF',
                    'external-id-type': 'other-id',
                    'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
                    'external-id-value': '20'
                   },...
               ]
            putcode: putcode of the given work.

        Returns: the Inspire recid matching the work, or None if not found.
        """
        for identifier in external_identifiers:
            id_type = identifier.get("external-id-type")
            is_other_id = bool(id_type) and id_type.lower() == "other-id"
            if not is_other_id:
                continue

            id_url = inspire_service_orcid_utils.smartget(
                identifier, "external-id-url.value", "")
            if re.match(r".*inspire.*", id_url, re.I):
                id_value = identifier.get("external-id-value")
                if id_value:
                    # recid found.
                    return id_value

        # The recid was not found in the external_identifiers.
        # Thus we call get_bulk_works_details_iter().
        matches = list(self._get_putcodes_and_recids_iter([putcode]))
        return matches[0][1] if matches else None
class OrcidPusher(object):
    """
    Push one Inspire literature record, as a work, to the ORCID profile of
    one author. An OrcidCache is used to skip the push when the work content
    has not changed.
    """

    def __init__(self, orcid, recid, oauth_token):
        """
        Args:
            orcid: ORCID identifier of the author.
            recid: recid of the literature record to push.
            oauth_token: token used to build the ORCID client.

        Raises:
            exceptions.RecordNotFoundException: when get_db_record() raises
                RecordGetterError for the given recid.
        """
        self.orcid = orcid
        self.recid = recid
        self.oauth_token = oauth_token

        try:
            self.inspire_record = get_db_record('lit', recid)
        except RecordGetterError as exc:
            raise exceptions.RecordNotFoundException(
                'recid={} not found for pid_type=lit'.format(self.recid),
                from_exc=exc)

        self.cache = OrcidCache(orcid, recid)
        self.lock_name = 'orcid:{}'.format(self.orcid)
        self.client = OrcidClient(self.oauth_token, self.orcid)
        # Set by push() right before the work is POSTed/PUT.
        self.xml_element = None

    @time_execution
    def push(self):
        """
        Push the record to ORCID unless the cache says it has not changed.

        Returns:
            The putcode of the work: the cached one on a cache hit, otherwise
            the one used for the POST/PUT.

        Raises:
            exceptions.InputDataInvalidException: when the ORCID API rejects
                the request.
            exceptions.PutcodeNotFoundInOrcidException: when the POST failed
                because the work already exists, but no putcode for this
                recid is found in ORCID.
        """
        putcode = self.cache.read_work_putcode()
        if not self.cache.has_work_content_changed(self.inspire_record):
            logger.info(
                'OrcidPusher cache hit for recid={} and orcid={}'.format(
                    self.recid, self.orcid))
            return putcode
        logger.info('OrcidPusher cache miss for recid={} and orcid={}'.format(
            self.recid, self.orcid))

        self.xml_element = OrcidConverter(
            record=self.inspire_record,
            url_pattern=current_app.config['LEGACY_RECORD_URL_PATTERN'],
            put_code=putcode,
        ).get_xml(do_add_bibtex_citation=True)

        try:
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.WorkAlreadyExistentException:
            # We POSTed the record as new work, but it failed because the work
            # already exists (identified by the external identifiers).
            # This means we do not have the putcode, thus we cache all
            # author's putcodes and PUT the work again.
            putcode = self._cache_all_author_putcodes()
            self._post_or_put_work(putcode)

        self.cache.write_work_putcode(putcode, self.inspire_record)
        return putcode

    @time_execution
    def _post_or_put_work(self, putcode=None):
        """
        POST the work (when `putcode` is falsy) or PUT it (otherwise).

        Returns:
            The putcode found in the response.

        Raises:
            orcid_client_exceptions.WorkAlreadyExistentException: re-raised
                untouched, so that push() can recover from it.
            exceptions.InputDataInvalidException: for any other
                BaseOrcidClientJsonException.
        """
        # Note: if putcode is None, then it's a POST (it means the work is new).
        # Otherwise a PUT (it means the work already exists and it has the given
        # putcode).

        # ORCID API allows 1 POST/PUT only for the same orcid at the same time.
        # Using `distributed_lock` to achieve this.
        with distributed_lock(self.lock_name, blocking=True):
            if putcode:
                response = self.client.put_updated_work(
                    self.xml_element, putcode)
            else:
                response = self.client.post_new_work(self.xml_element)

        utils.log_service_response(
            logger, response, 'in OrcidPusher for recid={}'.format(self.recid))
        try:
            response.raise_for_result()
            putcode = response['putcode']
        except orcid_client_exceptions.WorkAlreadyExistentException:  # Only raisable by a POST.
            raise
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        return putcode

    @time_execution
    def _cache_all_author_putcodes(self):
        """
        Fetch all the Inspire putcodes of the author from ORCID, write each
        of them in cache and return the putcode matching this recid.

        Raises:
            exceptions.InputDataInvalidException: from the putcode getter.
            exceptions.PutcodeNotFoundInOrcidException: when none of the
                fetched works matches this recid.
            exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes: when
                the putcode cannot be read back from cache.
        """
        logger.info(
            'New OrcidPusher cache all author putcodes for orcid={}'.format(
                self.orcid))
        putcode_getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)
        putcodes_urls = list(putcode_getter.get_all_inspire_putcodes(
        ))  # Can raise exceptions.InputDataInvalidException.

        putcode = None
        for fetched_putcode, fetched_url in putcodes_urls:
            fetched_recid = get_pid_from_record_uri(fetched_url)[1]

            if not fetched_recid:
                logger.error(
                    'OrcidPusher cache all author putcodes: cannot parse recid from url={} for orcid={}'
                    .format(fetched_url, self.orcid))
                continue

            # The recid parsed from the url is compared as a string.
            if fetched_recid == str(self.recid):
                putcode = fetched_putcode
            cache = OrcidCache(self.orcid, fetched_recid)
            cache.write_work_putcode(fetched_putcode)

        if not putcode:
            raise exceptions.PutcodeNotFoundInOrcidException(
                'No putcode was found in ORCID API for orcid={} and recid={}.'
                ' And the POST has previously failed for the same recid because'
                ' the work had already existed'.format(self.orcid, self.recid))

        # Ensure the putcode is actually in cache.
        # Note: this step is not really necessary and it can be skipped, but
        # at this moment it helps isolate a potential issue.
        if not self.cache.read_work_putcode():
            # Use the local `putcode`: the instance has no `putcode`
            # attribute, so `self.putcode` would raise AttributeError and
            # hide the intended exception.
            raise exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes(
                'No putcode={} found in cache for recid={} after having'
                ' cached all author putcodes for orcid={}'.format(
                    putcode, self.recid, self.orcid))

        return putcode
# Example #11
# 0
class OrcidPusher(object):
    """
    Push (or delete) one Inspire literature record, as a work, in the ORCID
    profile of one author. An OrcidCache is used to skip the push when the
    work content has not changed.
    """

    def __init__(
        self,
        orcid,
        recid,
        oauth_token,
        do_fail_if_duplicated_identifier=False,
        record_db_version=None,
    ):
        """
        Args:
            orcid: ORCID identifier of the author.
            recid: recid of the literature record to push.
            oauth_token: token used to build the ORCID client.
            do_fail_if_duplicated_identifier: when true, a clash on external
                identifiers raises DuplicatedExternalIdentifierPusherException
                instead of pushing the clashing work.
            record_db_version: when given, the record loaded from the db must
                have at least this version_id.

        Raises:
            exceptions.RecordNotFoundException: when the record is not found.
            exceptions.StaleRecordDBVersionException: when the record in the
                db is older than `record_db_version`.
        """
        self.orcid = orcid
        self.recid = recid
        self.oauth_token = oauth_token
        self.do_fail_if_duplicated_identifier = do_fail_if_duplicated_identifier
        self.record_db_version = record_db_version
        # Reads self.recid and self.record_db_version, set right above.
        self.inspire_record = self._get_inspire_record()
        self.cache = OrcidCache(orcid, recid)
        self.lock_name = "orcid:{}".format(self.orcid)
        self.client = OrcidClient(self.oauth_token, self.orcid)
        # Set by push() right before the work is POSTed/PUT.
        self.converter = None

    @time_execution
    def _get_inspire_record(self):
        """
        Load the literature record for self.recid.

        Raises:
            exceptions.RecordNotFoundException: when the pid does not exist.
            exceptions.StaleRecordDBVersionException: when
                self.record_db_version is set and the loaded record has a
                lower version_id.
        """
        try:
            inspire_record = LiteratureRecord.get_record_by_pid_value(self.recid)
        except PIDDoesNotExistError as exc:
            raise exceptions.RecordNotFoundException(
                "recid={} not found for pid_type=lit".format(self.recid), from_exc=exc
            )

        # If the record_db_version was given, then ensure we are about to push
        # the right record version.
        # This check is related to the fact the orcid push at this moment is
        # triggered by the signal after_record_update (which happens after a
        # InspireRecord.commit()). This is not the actual commit to the db which
        # might happen at a later stage or not at all.
        # Note that connecting to the proper SQLAlchemy signal would also
        # have issues: https://github.com/mitsuhiko/flask-sqlalchemy/issues/645
        if (
            self.record_db_version
            and inspire_record.model.version_id < self.record_db_version
        ):
            raise exceptions.StaleRecordDBVersionException(
                "Requested push for db version={}, but actual record db"
                " version={}".format(
                    self.record_db_version, inspire_record.model.version_id
                )
            )
        return inspire_record

    @property
    def _do_force_cache_miss(self):
        """
        Hook to force a cache miss. This can be leveraged in feature tests.

        True when a private note of the record has the value
        "orcid-push-force-cache-miss".
        """
        for note in self.inspire_record.get("_private_notes", []):
            if note.get("value") == "orcid-push-force-cache-miss":
                LOGGER.debug(
                    "OrcidPusher force cache miss", recid=self.recid, orcid=self.orcid
                )
                return True
        return False

    @property
    def _is_record_deleted(self):
        """
        True when the record is flagged as deleted, or when a private note
        of the record has the value "orcid-push-force-delete".
        """
        # Hook to force a delete. This can be leveraged in feature tests.
        for note in self.inspire_record.get("_private_notes", []):
            if note.get("value") == "orcid-push-force-delete":
                LOGGER.debug(
                    "OrcidPusher force delete", recid=self.recid, orcid=self.orcid
                )
                return True
        return self.inspire_record.get("deleted", False)

    @time_execution
    def push(self):
        """
        Push the record to ORCID, or delete its work when the record is
        deleted.

        Returns:
            The putcode of the work (the cached one on a cache hit), or None
            when the work has been deleted.

        Raises:
            exceptions.DuplicatedExternalIdentifierPusherException: on a
                clash of external identifiers when
                do_fail_if_duplicated_identifier is set.
            exceptions.PutcodeNotFoundInOrcidException: when the work already
                exists in ORCID but no putcode is found for this recid.
            exceptions.TokenInvalidDeletedException: when the token is not
                valid; the access token is deleted before raising.
            exceptions.InputDataInvalidException: for any other
                BaseOrcidClientJsonException raised by the first attempt.
        """
        putcode = None
        if not self._do_force_cache_miss:
            putcode = self.cache.read_work_putcode()
            if not self._is_record_deleted and not self.cache.has_work_content_changed(
                self.inspire_record
            ):
                LOGGER.debug(
                    "OrcidPusher cache hit", recid=self.recid, orcid=self.orcid
                )
                return putcode
        LOGGER.debug("OrcidPusher cache miss", recid=self.recid, orcid=self.orcid)

        # If the record is deleted, then delete it.
        if self._is_record_deleted:
            self._delete_work(putcode)
            return None

        self.converter = OrcidConverter(
            record=self.inspire_record,
            url_pattern=current_app.config["LEGACY_RECORD_URL_PATTERN"],
            put_code=putcode,
        )

        try:
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.WorkAlreadyExistsException:
            # We POSTed the record as new work, but it failed because the work
            # already exists (identified by the external identifiers).
            # First push a fresh version of the clashing work(s) and retry.
            # If the work still exists, we do not have its putcode: cache all
            # author's putcodes and PUT the work again.
            try:
                if self.do_fail_if_duplicated_identifier:
                    raise exceptions.DuplicatedExternalIdentifierPusherException
                self._push_work_with_clashing_identifier()
                putcode = self._post_or_put_work(putcode)
            except orcid_client_exceptions.WorkAlreadyExistsException:
                putcode = self._cache_all_author_putcodes()
                if not putcode:
                    msg = (
                        "No putcode was found in ORCID API for orcid={} and recid={}."
                        " And the POST has previously failed for the same recid because"
                        " the work had already existed".format(self.orcid, self.recid)
                    )
                    raise exceptions.PutcodeNotFoundInOrcidException(msg)
                putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.DuplicatedExternalIdentifierException:
            # We PUT a record changing its identifier, but there is another work
            # in ORCID with the same identifier. We need to find out the recid
            # of the clashing work in ORCID and push a fresh version of that
            # record.
            # This scenario might be triggered by a merge of 2 records in Inspire.
            if self.do_fail_if_duplicated_identifier:
                raise exceptions.DuplicatedExternalIdentifierPusherException
            self._push_work_with_clashing_identifier()
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.PutcodeNotFoundPutException:
            # The putcode we PUT to is unknown to ORCID: forget it and POST
            # the work as new, with a converter built without put_code.
            self.cache.delete_work_putcode()
            self.converter = OrcidConverter(
                record=self.inspire_record,
                url_pattern=current_app.config["LEGACY_RECORD_URL_PATTERN"],
                put_code=None,
            )
            putcode = self._post_or_put_work()
        except (
            orcid_client_exceptions.TokenInvalidException,
            orcid_client_exceptions.TokenMismatchException,
            orcid_client_exceptions.TokenWithWrongPermissionException,
        ):
            LOGGER.info(
                "Deleting Orcid push access", token=self.oauth_token, orcid=self.orcid
            )
            push_access_tokens.delete_access_token(self.oauth_token, self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

        self.cache.write_work_putcode(putcode, self.inspire_record)
        return putcode

    @time_execution
    def _post_or_put_work(self, putcode=None):
        """
        POST the work (when `putcode` is falsy) or PUT it (otherwise) and
        return the putcode found in the response.

        Any exception raised by `response.raise_for_result()` is propagated
        untouched: push() is in charge of handling them.
        """
        # Note: if putcode is None, then it's a POST (it means the work is new).
        # Otherwise a PUT (it means the work already exists and it has the given
        # putcode).

        xml_element = self.converter.get_xml(do_add_bibtex_citation=True)
        # ORCID API allows 1 non-idempotent call only for the same orcid at
        # the same time. Using `distributed_lock` to achieve this.
        with utils.distributed_lock(self.lock_name, blocking=True):
            if putcode:
                response = self.client.put_updated_work(xml_element, putcode)
            else:
                response = self.client.post_new_work(xml_element)
        LOGGER.info("POST/PUT ORCID work", response=response, recid=self.recid)
        response.raise_for_result()
        return response["putcode"]

    @time_execution
    def _cache_all_author_putcodes(self):
        """
        Fetch all the Inspire putcodes and recids of the author from ORCID,
        write each putcode in cache and return the one matching this recid.

        Returns:
            The putcode (as int) of the work matching this recid, or None
            when no fetched work matches.

        Raises:
            exceptions.InputDataInvalidException: from the putcode getter.
            exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes: when a
                putcode was found but cannot be read back from cache.
        """
        LOGGER.debug("New OrcidPusher cache all author putcodes", orcid=self.orcid)
        putcode_getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)
        putcodes_recids = list(
            putcode_getter.get_all_inspire_putcodes_and_recids_iter()
        )  # Can raise exceptions.InputDataInvalidException.

        putcode = None
        for fetched_putcode, fetched_recid in putcodes_recids:
            if fetched_recid == str(self.recid):
                putcode = int(fetched_putcode)
            cache = OrcidCache(self.orcid, fetched_recid)
            cache.write_work_putcode(fetched_putcode)

        # Ensure the putcode is actually in cache.
        # Note: this step is not really necessary and it can be skipped, but
        # at this moment it helps isolate a potential issue.
        if putcode and not self.cache.read_work_putcode():
            # Use the local `putcode`: the instance has no `putcode`
            # attribute, so `self.putcode` would raise AttributeError and
            # hide the intended exception.
            raise exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes(
                "No putcode={} found in cache for recid={} after having"
                " cached all author putcodes for orcid={}".format(
                    putcode, self.recid, self.orcid
                )
            )

        return putcode

    @time_execution
    def _delete_work(self, putcode=None):
        """
        Delete the work from ORCID and its putcode from cache.

        When `putcode` is falsy it is looked up by caching all the author's
        putcodes; if it is still not found, nothing is done.

        Raises:
            exceptions.InputDataInvalidException: when the deletion fails for
                a reason other than the putcode not being found.
        """
        putcode = putcode or self._cache_all_author_putcodes()
        if not putcode:
            # Such recid does not exists (anymore?) in ORCID API.
            return

        # ORCID API allows 1 non-idempotent call only for the same orcid at
        # the same time. Using `distributed_lock` to achieve this.
        with utils.distributed_lock(self.lock_name, blocking=True):
            response = self.client.delete_work(putcode)
        try:
            response.raise_for_result()
        except orcid_client_exceptions.PutcodeNotFoundDeleteException:
            # Such putcode does not exists (anymore?) in orcid.
            pass
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

        self.cache.delete_work_putcode()

    @time_execution
    def _push_work_with_clashing_identifier(self):
        """
        Push a fresh version of every other record whose ORCID work shares an
        external identifier with the work being pushed.

        The orcid_push task is applied through
        `utils.apply_celery_task_with_retry` for each clashing recid.
        """
        putcode_getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)

        ids = self.converter.added_external_identifiers
        for (
            putcode,
            recid,
        ) in putcode_getter.get_putcodes_and_recids_by_identifiers_iter(ids):

            if not putcode or not recid:
                continue
            # Skip the record being pushed. Compare as strings, like
            # `_cache_all_author_putcodes` does: the recid coming from ORCID
            # and self.recid may have different types (str vs int), in which
            # case a plain `==` would never match.
            if str(recid) == str(self.recid):
                continue
            # Local import to avoid import error.
            from inspirehep.orcid import tasks

            max_retries = 3
            # Execute the orcid_push Celery task synchronously.
            backoff = lambda retry_count: [30, 2 * 60, 7 * 60][  # noqa: E731
                retry_count % max_retries
            ]
            utils.apply_celery_task_with_retry(
                tasks.orcid_push,
                kwargs={
                    "orcid": self.orcid,
                    "rec_id": recid,
                    "oauth_token": self.oauth_token,
                    # Set `do_fail_if_duplicated_identifier` to avoid an
                    # infinite recursive calls chain.
                    "kwargs_to_pusher": dict(
                        do_fail_if_duplicated_identifier=True,
                        record_db_version=self.record_db_version,
                    ),
                },
                max_retries=max_retries,
                countdown=backoff,
                time_limit=10 * 60,
            )
# Example #12
# 0
def delete_work(orcid, token, recid):
    """
    Delete from ORCID the work matching `recid`, if a putcode is found for it.

    Args:
        orcid: ORCID identifier of the author.
        token: token used to build the ORCID client.
        recid: record id of the work to delete.
    """
    putcode = get_putcode_for_work(orcid, token, recid)
    if not putcode:
        # No putcode was found for this recid: nothing to delete.
        return
    OrcidClient(token, orcid).delete_work(putcode)
 def setup(self):
     """
     Prepare the fixture: a fixed list of putcodes, the ORCID of an ATLAS
     author, an oauth token and an OrcidClient built from them.
     """
     # Putcodes (as strings) used by the tests.
     self.putcodes = [
         '43326850', '43255490', '43183518', '43857637', '43257979',
         '43938460', '43553536', '43846642', '43869107', '43466717',
         '43880082', '43852910', '44762573', '44762737', '44762744',
         '44762721', '44762617', '43257122', '43861964', '43938538',
         '43606530', '43855125', '44762615', '44762741', '43554289',
         '44762570', '44762735', '44762597', '43859780', '43941962',
         '43856818', '43938515', '43864453', '43875319', '43935537',
         '43467792', '44077351', '43554306', '44472652', '43911727',
         '43922432', '43916436', '43907796', '43924927', '43923874',
         '43938553', '43938542', '43878004', '43935695', '43881622',
         '43935569', '44231173', '43880802', '43938523', '43938458',
         '43935897', '43919253', '43918420', '43938697', '43920855',
         '43933388', '43942717', '43910178', '44515789', '43882441',
         '43935355', '43935418', '43935500', '43929711', '43935348',
         '43938613', '43919864', '43885354', '43935660', '43882622',
         '43935419', '43935519', '43942195', '43935682', '43949957',
         '43941870', '43938614', '43938644', '43941852', '43935478',
         '43937005', '44216033', '43948457', '43942230', '43938670',
         '43935725', '43942117', '43935577', '44227246', '43942042',
         '44219584', '43942229', '43942467', '43935574', '43461438',
         '43939244', '43942225', '43942110', '44218042', '44236863',
         '43942221', '43935690', '43938687', '43942306', '43326714',
         '43935600', '43935671', '43935595', '44229237', '43942579',
         '43935727', '43939389', '43935714', '44232896', '44227649',
         '43935744', '43938719', '43938710', '43942556', '44237648',
         '44226428', '43938991', '44236016', '43935746', '44236622',
         '43938809', '44234262', '43942562', '43939267', '43935804',
         '43935814', '44235446', '44238589', '43476255', '44238117',
         '43942245', '43935831', '44255508', '43935773', '43935525',
         '43349513', '43939364', '43942333', '44259358', '43334280',
         '43935879', '43474664', '43942483', '43868647', '43942582',
         '44269186', '43935857', '43939273', '44265932', '43328661',
         '43939436', '44575020', '44252784', '43473085', '43935955',
         '43329599', '43474084', '43942511', '43935852', '43325385',
         '43935788', '43942608', '43935829', '43942738', '43935875',
         '43939367', '44274797', '43328989', '43474829', '43942339',
         '43330602', '43939455', '43939372', '43943050', '43351389',
         '43328159', '43329373', '43935762', '43939467', '43943007',
         '43476291', '44272682', '43478322', '43343506', '43483181',
         '43347500', '43333264', '43858017', '43473511', '43332255',
         '43476010', '43350059', '44251364', '43475852', '43353967',
         '43849619', '43819343', '43339682', '43348858', '43333748',
         '44217143', '44232508', '43822751', '43939441', '43339402',
         '44284285', '43478099', '43356509', '43942969', '43348252',
         '43483990', '43936102', '43939877', '43935994', '44575015',
         '43939643', '44285709', '43352429', '43942965', '43364988',
         '44265579', '43939719', '43940213', '43368521', '43939725',
         '43361294', '43936167', '43293661', '43362128', '43940188',
         '43358238', '43936143', '44283137', '44284877', '43356836',
         '43939941', '44293857', '43363375', '43361159', '43365921',
         '43939949', '43941280', '43368183', '44291548', '43360300',
         '43366583', '43936275', '43370435', '43939860', '43361521',
         '43936314', '43942905', '43942981', '43292406', '43367691',
         '44317462'
     ]  # noqa: E501
     self.orcid = '0000-0002-6665-4934'  # ATLAS author.
     try:
         # Pick the token from settings_local.py first.
         # NOTE(review): if OAUTH_TOKENS exists but has no entry for this
         # orcid, .get() presumably yields None rather than 'mytoken' --
         # confirm this is intended.
         self.oauth_token = inspire_service_orcid.conf.settings.OAUTH_TOKENS.get(
             self.orcid)
     except AttributeError:
         # The attribute lookup above failed: fall back to a dummy token.
         self.oauth_token = 'mytoken'
     self.client = OrcidClient(self.oauth_token, self.orcid)
class OrcidPusher(object):
    """
    Push one Inspire literature record to the ORCID profile of one author.

    The putcode returned by ORCID is stored through an ``OrcidCache`` built
    for this orcid and recid, so that a record whose content did not change
    is not pushed again.
    """

    def __init__(self, orcid, recid, oauth_token,
                 do_fail_if_duplicated_identifier=False, record_db_version=None):
        """
        Note: the record is fetched from the db right away, thus the
        constructor can raise the same exceptions as `_get_inspire_record()`.

        Args:
            orcid: ORCID of the author the work is pushed for.
            recid: recid of the Inspire literature record to push.
            oauth_token: token used to build the `OrcidClient`.
            do_fail_if_duplicated_identifier (bool): if True, a duplicated
                external identifier error makes `push()` raise
                `DuplicatedExternalIdentifierPusherException` instead of
                pushing the clashing work first.
            record_db_version: if given, the minimum db `version_id` the
                fetched record must have.
        """
        self.orcid = orcid
        self.recid = recid
        self.oauth_token = oauth_token
        self.do_fail_if_duplicated_identifier = do_fail_if_duplicated_identifier
        self.record_db_version = record_db_version
        self.inspire_record = self._get_inspire_record()
        self.cache = OrcidCache(orcid, recid)
        self.lock_name = 'orcid:{}'.format(self.orcid)
        self.client = OrcidClient(self.oauth_token, self.orcid)
        # Built in `push()`, only when the record actually has to be sent.
        self.converter = None

    @time_execution
    def _get_inspire_record(self):
        """
        Fetch the literature record with recid `self.recid` from the db.

        Raises:
            exceptions.RecordNotFoundException: the record does not exist.
            exceptions.StaleRecordDBVersionException: `record_db_version` was
                given and the record in the db has an older `version_id`.
        """
        try:
            inspire_record = get_db_record('lit', self.recid)
        except RecordGetterError as exc:
            raise exceptions.RecordNotFoundException(
                'recid={} not found for pid_type=lit'.format(self.recid),
                from_exc=exc)

        # If the record_db_version was given, then ensure we are about to push
        # the right record version.
        # This check is related to the fact the orcid push at this moment is
        # triggered by the signal after_record_update (which happens after a
        # InspireRecord.commit()). This is not the actual commit to the db which
        # might happen at a later stage or not at all.
        # Note that connecting to the proper SQLAlchemy signal would also
        # have issues: https://github.com/mitsuhiko/flask-sqlalchemy/issues/645
        if self.record_db_version and inspire_record.model.version_id < self.record_db_version:
            raise exceptions.StaleRecordDBVersionException(
                'Requested push for db version={}, but actual record db'
                ' version={}'.format(self.record_db_version, inspire_record.model.version_id)
            )
        return inspire_record

    def _has_private_note(self, value):
        """
        Tell whether the record has a `_private_notes` item with the given
        value.
        """
        return any(
            note.get('value') == value
            for note in self.inspire_record.get('_private_notes', []))

    @property
    def _do_force_cache_miss(self):
        """
        Hook to force a cache miss. This can be leveraged in feature tests.
        """
        if self._has_private_note('orcid-push-force-cache-miss'):
            logger.info('OrcidPusher force cache miss for recid={} and orcid={}'.format(
                self.recid, self.orcid))
            return True
        return False

    @property
    def _is_record_deleted(self):
        """
        True if the record is flagged as deleted or a delete is forced by a
        private note. The latter is a hook that can be leveraged in feature
        tests.
        """
        if self._has_private_note('orcid-push-force-delete'):
            logger.info('OrcidPusher force delete for recid={} and orcid={}'.format(
                self.recid, self.orcid))
            return True
        return self.inspire_record.get('deleted', False)

    @time_execution
    def push(self):
        """
        Push the record to ORCID: POST it as new work, PUT it as updated work
        or delete it, depending on the cache content and on the record.

        Returns:
            the putcode of the work, or None when the work was deleted.

        Raises:
            exceptions.PutcodeNotFoundInOrcidException
            exceptions.DuplicatedExternalIdentifierPusherException
            exceptions.TokenInvalidDeletedException
            exceptions.InputDataInvalidException
        """
        putcode = None
        if not self._do_force_cache_miss:
            putcode = self.cache.read_work_putcode()
            if not self._is_record_deleted and \
                    not self.cache.has_work_content_changed(self.inspire_record):
                logger.info('OrcidPusher cache hit for recid={} and orcid={}'.format(
                    self.recid, self.orcid))
                return putcode
        logger.info('OrcidPusher cache miss for recid={} and orcid={}'.format(
            self.recid, self.orcid))

        # If the record is deleted, then delete it.
        if self._is_record_deleted:
            self._delete_work(putcode)
            return None

        self.converter = OrcidConverter(
            record=self.inspire_record,
            url_pattern=current_app.config['LEGACY_RECORD_URL_PATTERN'],
            put_code=putcode,
        )

        try:
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.WorkAlreadyExistsException:
            # We POSTed the record as new work, but it failed because the work
            # already exists (identified by the external identifiers).
            # This means we do not have the putcode, thus we cache all
            # author's putcodes and PUT the work again.
            putcode = self._cache_all_author_putcodes()
            if not putcode:
                msg = 'No putcode was found in ORCID API for orcid={} and recid={}.'\
                    ' And the POST has previously failed for the same recid because'\
                    ' the work had already existed'.format(self.orcid, self.recid)
                raise exceptions.PutcodeNotFoundInOrcidException(msg)
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.DuplicatedExternalIdentifierException:
            # We PUT a record changing its identifier, but there is another work
            # in ORCID with the same identifier. We need to find out the recid
            # of the clashing work in ORCID and push a fresh version of that
            # record.
            # This scenario might be triggered by a merge of 2 records in Inspire.
            if self.do_fail_if_duplicated_identifier:
                raise exceptions.DuplicatedExternalIdentifierPusherException
            self._push_work_with_clashing_identifier()
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.PutcodeNotFoundPutException:
            # The cached putcode is not valid: drop it and POST the work as new.
            self.cache.delete_work_putcode()
            putcode = self._post_or_put_work()
        except (orcid_client_exceptions.TokenInvalidException,
                orcid_client_exceptions.TokenMismatchException,
                orcid_client_exceptions.TokenWithWrongPermissionException):
            # NOTE(review): the access token is written to the log in clear
            # text here; confirm this is acceptable.
            logger.info('Deleting Orcid push access token={} for orcid={}'.format(
                self.oauth_token, self.orcid))
            push_access_tokens.delete_access_token(self.oauth_token, self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

        self.cache.write_work_putcode(putcode, self.inspire_record)
        return putcode

    @time_execution
    def _post_or_put_work(self, putcode=None):
        """
        Send the converted record to ORCID and return the resulting putcode.

        Note: if putcode is None, then it's a POST (it means the work is new).
        Otherwise a PUT (it means the work already exists and it has the given
        putcode).
        Exceptions raised by `response.raise_for_result()` are not handled
        here, they are handled by `push()`.
        """
        xml_element = self.converter.get_xml(do_add_bibtex_citation=True)
        # ORCID API allows 1 non-idempotent call only for the same orcid at
        # the same time. Using `distributed_lock` to achieve this.
        with distributed_lock(self.lock_name, blocking=True):
            if putcode:
                response = self.client.put_updated_work(xml_element, putcode)
            else:
                response = self.client.post_new_work(xml_element)

        utils.log_service_response(logger, response, 'in OrcidPusher for recid={}'.format(self.recid))
        response.raise_for_result()
        return response['putcode']

    @time_execution
    def _cache_all_author_putcodes(self):
        """
        Fetch all the author's Inspire putcodes from ORCID and write each of
        them in cache.

        Returns:
            int: the putcode matching `self.recid`, or None if no fetched work
            has such recid.

        Raises:
            exceptions.InputDataInvalidException: raised by the putcode getter.
            exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes: the
                putcode was found in ORCID but it cannot be read back from
                the cache.
        """
        logger.info('New OrcidPusher cache all author putcodes for orcid={}'.format(self.orcid))
        putcode_getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)
        putcodes_recids = list(putcode_getter.get_all_inspire_putcodes_and_recids_iter())  # Can raise exceptions.InputDataInvalidException.

        putcode = None
        for fetched_putcode, fetched_recid in putcodes_recids:
            if fetched_recid == str(self.recid):
                putcode = int(fetched_putcode)
            cache = OrcidCache(self.orcid, fetched_recid)
            cache.write_work_putcode(fetched_putcode)

        # Ensure the putcode is actually in cache.
        # Note: this step is not really necessary and it can be skipped, but
        # at this moment it helps isolate a potential issue.
        if putcode and not self.cache.read_work_putcode():
            # Use the local `putcode`: the pusher has no `putcode` attribute,
            # reading it would raise AttributeError and hide this exception.
            raise exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes(
                'No putcode={} found in cache for recid={} after having'
                ' cached all author putcodes for orcid={}'.format(
                    putcode, self.recid, self.orcid))

        return putcode

    @time_execution
    def _delete_work(self, putcode=None):
        """
        Delete the work from ORCID and its putcode from the cache.

        If no putcode is given, it is looked up by caching all the author's
        putcodes. If it cannot be found, nothing is done.

        Raises:
            exceptions.InputDataInvalidException: ORCID returned an error
                other than "putcode not found".
        """
        putcode = putcode or self._cache_all_author_putcodes()
        if not putcode:
            # Such recid does not exists (anymore?) in ORCID API.
            return

        # ORCID API allows 1 non-idempotent call only for the same orcid at
        # the same time. Using `distributed_lock` to achieve this.
        with distributed_lock(self.lock_name, blocking=True):
            response = self.client.delete_work(putcode)
        try:
            response.raise_for_result()
        except orcid_client_exceptions.PutcodeNotFoundDeleteException:
            # Such putcode does not exists (anymore?) in orcid.
            pass
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

        self.cache.delete_work_putcode()

    @time_execution
    def _push_work_with_clashing_identifier(self):
        """
        Push again every work in ORCID that shares an external identifier with
        the record just converted, by running the `orcid_push` task for the
        recid of each clashing work.
        """
        putcode_getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)

        max_retries = 3
        # Countdown before each retry (presumably seconds: 30s, 2min, 7min).
        countdowns = [30, 2 * 60, 7 * 60]

        def backoff(retry_count):
            return countdowns[retry_count % len(countdowns)]

        ids = self.converter.added_external_identifiers
        for putcode, recid in putcode_getter.get_putcodes_and_recids_by_identifiers_iter(ids):

            if not putcode or not recid:
                continue
            # Local import to avoid import error.
            from inspirehep.modules.orcid import tasks
            # Execute the orcid_push Celery task synchronously.
            utils.apply_celery_task_with_retry(
                tasks.orcid_push,
                kwargs={
                    'orcid': self.orcid,
                    'rec_id': recid,
                    'oauth_token': self.oauth_token,
                    # Set `do_fail_if_duplicated_identifier` to avoid an
                    # infinite recursive calls chain.
                    'kwargs_to_pusher': dict(
                        do_fail_if_duplicated_identifier=True,
                        record_db_version=self.record_db_version)
                },
                max_retries=max_retries,
                countdown=backoff,
                time_limit=10 * 60,
            )
class OrcidPutcodeGetter(object):
    """
    Query ORCID api to find the putcodes (and the matching Inspire recids) of
    the works of a given ORCID.
    """

    def __init__(self, orcid, oauth_token):
        """
        Args:
            orcid: ORCID whose works are queried.
            oauth_token: token used to build the `OrcidClient`.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(self.oauth_token, self.orcid)
        self.source_client_id_path = current_app.config['ORCID_APP_CREDENTIALS'][
            'consumer_key']

    def get_all_inspire_putcodes_and_recids_iter(self):
        """
        Query ORCID api and get all the Inspire putcodes for the given ORCID.

        Yields:
            (putcode, recid) pairs: first the works whose recid is embedded in
            the works summary, then the works whose recid had to be fetched
            from the works details.
        """
        summary_response = self._get_all_works_summary()
        # `putcodes_recids` is a list like: [('43326850', 20), ('43255490', None)]
        putcodes_recids = list(summary_response.get_putcodes_and_recids_for_source_iter(
            self.source_client_id_path))
        putcodes_with_recids = [x for x in putcodes_recids if x[1]]
        putcodes_without_recids = [x[0] for x in putcodes_recids if not x[1]]

        for putcode, recid in putcodes_with_recids:
            yield putcode, recid

        if not putcodes_without_recids:
            # Nothing left to resolve: skip the works details call.
            return

        for putcode, recid in self._get_putcodes_and_recids_iter(putcodes_without_recids):
            yield putcode, recid

    def _get_all_works_summary(self):
        """
        Query ORCID api and get all the putcodes with their embedded recids
        for the given ORCID.
        An embedded recid is a recid written as external-identifier.

        Raises:
            exceptions.TokenInvalidDeletedException: the token was rejected by
                ORCID and it has been deleted.
            exceptions.InputDataInvalidException: any other client error.
        """
        response = self.client.get_all_works_summary()
        utils.log_service_response(logger, response, 'in OrcidPutcodeGetter works summary')
        try:
            response.raise_for_result()
        except (orcid_client_exceptions.TokenInvalidException,
                orcid_client_exceptions.TokenMismatchException,
                orcid_client_exceptions.TokenWithWrongPermissionException):
            # NOTE(review): the access token is written to the log in clear
            # text here; confirm this is acceptable.
            logger.info('OrcidPutcodeGetter: deleting Orcid push access token={} for orcid={}'.format(
                self.oauth_token, self.orcid))
            push_access_tokens.delete_access_token(self.oauth_token, self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        return response

    def _get_putcodes_and_recids_iter(self, putcodes):
        """
        Yield (putcode, recid) for each of the given putcodes whose work url
        is an Inspire url. The recid is parsed from the url.
        """
        for putcode, url in self._get_urls_for_putcodes_iter(putcodes):
            # Filter out putcodes that do not belong to Inspire.
            if INSPIRE_WORK_URL_REGEX.match(url):
                recid = get_pid_from_record_uri(url)[1]
                if not recid:
                    logger.error('OrcidPutcodeGetter: cannot parse recid from url={} for orcid={}'.format(
                        url, self.orcid))
                    continue
                yield putcode, recid

    def _get_urls_for_putcodes_iter(self, putcodes):
        """
        Fetch the works details for the given putcodes and return an iterable
        of (putcode, url). All the responses are fetched and checked before
        returning.

        Raises:
            exceptions.InputDataInvalidException: a response was an error.
        """
        # The call `get_bulk_works_details_iter()` can be expensive for an
        # author with many works (if each work also has many *contributors*).
        # Fi. for an ATLAS author with ~750 works (each of them with many
        # authors), 8 calls would be performed with a total data transfer > 0.5 Gb.
        chained = []
        for response in self.client.get_bulk_works_details_iter(putcodes):
            # Note: this log can be large. Consider removing it when this part
            # is considered mature.
            utils.log_service_response(logger, response, 'in OrcidPutcodeGetter works details')
            try:
                response.raise_for_result()
            except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
                raise exceptions.InputDataInvalidException(from_exc=exc)

            chained = itertools.chain(chained, response.get_putcodes_and_urls_iter())
        return chained

    def get_putcodes_and_recids_by_identifiers_iter(self, identifiers):
        """
        Yield putcode and recid for each work matched by the external
        identifiers.
        Note: external identifiers of type 'other-id' are skipped.
        Note: a work is yielded once for each of its matching identifiers and
        the recid can be None if it could not be found.

        Args:
            identifiers (List[inspirehep.modules.orcid.converter.ExternalIdentifier]):
                list of all external identifiers added after the xml conversion.
        """
        summary_response = self._get_all_works_summary()
        for putcode, ids in summary_response.get_putcodes_and_external_identifiers_iter():
            # ids is a list like:
            #   [
            #       {'external-id-relationship': 'SELF',
            #        'external-id-type': 'other-id',
            #        'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
            #        'external-id-value': '20'
            #       },...
            #   ]

            # The recid is resolved only once a matching identifier is found,
            # because resolving it can require an additional call to the
            # ORCID api (see `_get_recid_for_work()`).
            is_recid_resolved = False
            recid = None

            for identifier in ids:
                id_type = identifier.get('external-id-type')
                # We are interested only in doi, arxiv, isbns.
                if not id_type or id_type.lower() == 'other-id':
                    continue
                id_value = identifier.get('external-id-value')
                if not id_value:
                    continue

                if ExternalIdentifier(id_type, id_value) not in identifiers:
                    continue

                if not is_recid_resolved:
                    recid = self._get_recid_for_work(ids, str(putcode))
                    is_recid_resolved = True
                yield putcode, recid

    def _get_recid_for_work(self, external_identifiers, putcode):
        """
        Get the recid for a work given its external identifiers and putcode.
        The recid might be in the external identifiers or the works details
        might be fetched to find it.

        Args:
            external_identifiers (List[Dict]): a list like:
               [
                   {'external-id-relationship': 'SELF',
                    'external-id-type': 'other-id',
                    'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
                    'external-id-value': '20'
                   },...
               ]
            putcode: putcode of the given work.

        Returns: the Inspire recid matching the work, or None if not found.
        """
        for identifier in external_identifiers:
            id_type = identifier.get('external-id-type')
            if not id_type or id_type.lower() != 'other-id':
                continue

            id_url = inspire_service_orcid_utils.smartget(identifier, 'external-id-url.value', '')
            if not re.match(r'.*inspire.*', id_url, re.I):
                continue

            id_value = identifier.get('external-id-value')
            if not id_value:
                continue

            # recid found.
            return id_value

        # The recid was not found in the external_identifiers.
        # Thus we call get_bulk_works_details_iter().
        putcodes_recid = list(self._get_putcodes_and_recids_iter([putcode]))

        if putcodes_recid:
            return putcodes_recid[0][1]
        return None
 def client(self):
     """
     Store the oauth token on the instance and return an OrcidClient built
     with it. A token configured in settings_local.py wins over the
     placeholder 'mytoken'.
     """
     configured_tokens = getattr(
         inspire_service_orcid.conf.settings, 'OAUTH_TOKENS', {})
     self.oauth_token = configured_tokens.get(self.orcid, 'mytoken')
     return OrcidClient(self.oauth_token, self.orcid)
 def test_invalid_token(self):
     """Deleting a work with the token 'invalidtoken' must fail."""
     bad_token_client = OrcidClient('invalidtoken', self.orcid)
     delete_response = bad_token_client.delete_work(self.putcode)
     # The response must map to the dedicated token exception...
     with pytest.raises(exceptions.TokenInvalidException):
         delete_response.raise_for_result()
     # ...and must not be flagged as successful.
     assert not delete_response.ok
 def test_invalid_token(self):
     """Posting a new work with the token 'invalidtoken' must fail."""
     bad_token_client = OrcidClient('invalidtoken', self.orcid)
     post_response = bad_token_client.post_new_work(self.xml_element)
     # The response must map to the dedicated token exception...
     with pytest.raises(exceptions.TokenInvalidException):
         post_response.raise_for_result()
     # ...and must not be flagged as successful.
     assert not post_response.ok
 def test_invalid_token(self):
     """Getting the works summary with the token 'invalidtoken' must fail."""
     bad_token_client = OrcidClient('invalidtoken', self.orcid)
     summary_response = bad_token_client.get_all_works_summary()
     # The response must map to the dedicated token exception...
     with pytest.raises(exceptions.TokenInvalidException):
         summary_response.raise_for_result()
     # ...and must not be flagged as successful.
     assert not summary_response.ok
 def orcid_client(self):
     """Return a new OrcidClient built from this object's token and orcid."""
     token, orcid = self.oauth_token, self.orcid
     return OrcidClient(token, orcid)
 def __init__(self, orcid, oauth_token):
     """
     Store the credentials, build the OrcidClient and read the consumer key
     of the ORCID app from the Flask config.
     """
     self.orcid = orcid
     self.oauth_token = oauth_token
     self.client = OrcidClient(self.oauth_token, self.orcid)
     app_credentials = current_app.config['ORCID_APP_CREDENTIALS']
     self.source_client_id_path = app_credentials['consumer_key']
class TestGenerateGetBulkWorksDetails(object):
    """Tests for `OrcidClient.get_bulk_works_details_iter()`."""

    def setup(self):
        """Prepare the putcodes, the orcid, the token and the client."""
        self.putcodes = [
            '43326850', '43255490', '43183518', '43857637', '43257979',
            '43938460', '43553536', '43846642', '43869107', '43466717',
            '43880082', '43852910', '44762573', '44762737', '44762744',
            '44762721', '44762617', '43257122', '43861964', '43938538',
            '43606530', '43855125', '44762615', '44762741', '43554289',
            '44762570', '44762735', '44762597', '43859780', '43941962',
            '43856818', '43938515', '43864453', '43875319', '43935537',
            '43467792', '44077351', '43554306', '44472652', '43911727',
            '43922432', '43916436', '43907796', '43924927', '43923874',
            '43938553', '43938542', '43878004', '43935695', '43881622',
            '43935569', '44231173', '43880802', '43938523', '43938458',
            '43935897', '43919253', '43918420', '43938697', '43920855',
            '43933388', '43942717', '43910178', '44515789', '43882441',
            '43935355', '43935418', '43935500', '43929711', '43935348',
            '43938613', '43919864', '43885354', '43935660', '43882622',
            '43935419', '43935519', '43942195', '43935682', '43949957',
            '43941870', '43938614', '43938644', '43941852', '43935478',
            '43937005', '44216033', '43948457', '43942230', '43938670',
            '43935725', '43942117', '43935577', '44227246', '43942042',
            '44219584', '43942229', '43942467', '43935574', '43461438',
            '43939244', '43942225', '43942110', '44218042', '44236863',
            '43942221', '43935690', '43938687', '43942306', '43326714',
            '43935600', '43935671', '43935595', '44229237', '43942579',
            '43935727', '43939389', '43935714', '44232896', '44227649',
            '43935744', '43938719', '43938710', '43942556', '44237648',
            '44226428', '43938991', '44236016', '43935746', '44236622',
            '43938809', '44234262', '43942562', '43939267', '43935804',
            '43935814', '44235446', '44238589', '43476255', '44238117',
            '43942245', '43935831', '44255508', '43935773', '43935525',
            '43349513', '43939364', '43942333', '44259358', '43334280',
            '43935879', '43474664', '43942483', '43868647', '43942582',
            '44269186', '43935857', '43939273', '44265932', '43328661',
            '43939436', '44575020', '44252784', '43473085', '43935955',
            '43329599', '43474084', '43942511', '43935852', '43325385',
            '43935788', '43942608', '43935829', '43942738', '43935875',
            '43939367', '44274797', '43328989', '43474829', '43942339',
            '43330602', '43939455', '43939372', '43943050', '43351389',
            '43328159', '43329373', '43935762', '43939467', '43943007',
            '43476291', '44272682', '43478322', '43343506', '43483181',
            '43347500', '43333264', '43858017', '43473511', '43332255',
            '43476010', '43350059', '44251364', '43475852', '43353967',
            '43849619', '43819343', '43339682', '43348858', '43333748',
            '44217143', '44232508', '43822751', '43939441', '43339402',
            '44284285', '43478099', '43356509', '43942969', '43348252',
            '43483990', '43936102', '43939877', '43935994', '44575015',
            '43939643', '44285709', '43352429', '43942965', '43364988',
            '44265579', '43939719', '43940213', '43368521', '43939725',
            '43361294', '43936167', '43293661', '43362128', '43940188',
            '43358238', '43936143', '44283137', '44284877', '43356836',
            '43939941', '44293857', '43363375', '43361159', '43365921',
            '43939949', '43941280', '43368183', '44291548', '43360300',
            '43366583', '43936275', '43370435', '43939860', '43361521',
            '43936314', '43942905', '43942981', '43292406', '43367691',
            '44317462'
        ]  # noqa: E501
        self.orcid = '0000-0002-6665-4934'  # ATLAS author.
        try:
            # Pick the token from settings_local.py first.
            # Fall back to 'mytoken' also when OAUTH_TOKENS is defined but it
            # has no entry for this orcid (the token would be None otherwise).
            self.oauth_token = inspire_service_orcid.conf.settings.OAUTH_TOKENS.get(
                self.orcid, 'mytoken')
        except AttributeError:
            self.oauth_token = 'mytoken'
        self.client = OrcidClient(self.oauth_token, self.orcid)

    def test_happy_flow(self):
        """Each response is ok and its first and last works are among the requested putcodes."""
        for response in self.client.get_bulk_works_details_iter(self.putcodes):
            response.raise_for_result()
            assert response.ok
            assert str(
                response['bulk'][0]['work']['put-code']) in self.putcodes
            assert str(
                response['bulk'][-1]['work']['put-code']) in self.putcodes

    def test_too_many_putcodes(self):
        """
        With the client's per-request limit patched to 101, a request for 101
        putcodes must raise ExceedMaxNumberOfPutCodesException.
        """
        from inspire_service_orcid import client
        with mock.patch.object(client,
                               'MAX_PUTCODES_PER_WORKS_DETAILS_REQUEST', 101):
            for response in self.client.get_bulk_works_details_iter(
                [str(x) for x in range(101)]):
                with pytest.raises(
                        exceptions.ExceedMaxNumberOfPutCodesException):
                    response.raise_for_result()

    def test_get_putcodes_and_urls(self):
        """The (putcode, url) pairs can be extracted from each response."""
        for response in self.client.get_bulk_works_details_iter(self.putcodes):
            response.raise_for_result()
            assert response.ok
            putcodes_and_urls = list(response.get_putcodes_and_urls_iter())
            # Note: the recorded cassette returns the same result for each for loop.
            assert putcodes_and_urls[0] == (
                '43183518', 'http://inspirehep.net/record/1665234')
            assert putcodes_and_urls[-1] == (
                '44227246', 'http://inspirehep.net/record/1515025')

    def test_single_work_error(self):
        """Of the two requested putcodes, only '51496313' yields a (putcode, url) pair."""
        self.putcodes = ['51540408', '51496313']
        result = []
        for response in self.client.get_bulk_works_details_iter(self.putcodes):
            response.raise_for_result()
            assert response.ok
            result.extend(response.get_putcodes_and_urls_iter())

        assert result == [('51496313',
                           'http://inspireheptest.cern.ch/record/20')]
 def __init__(self, orcid, oauth_token):
     """
     Keep the orcid and the token, create the OrcidClient for them and take
     the consumer key of the ORCID app from the Flask config.
     """
     self.orcid, self.oauth_token = orcid, oauth_token
     self.client = OrcidClient(self.oauth_token, self.orcid)
     orcid_app_credentials = current_app.config['ORCID_APP_CREDENTIALS']
     self.source_client_id_path = orcid_app_credentials['consumer_key']
Exemple #24
0
class OrcidPusher(object):
    def __init__(
        self,
        orcid,
        recid,
        oauth_token,
        pushing_duplicated_identifier=False,
        record_db_version=None,
    ):
        """Set up the state needed to push one record to one ORCID account.

        Args:
            orcid: ORCID identifier the work is pushed to.
            recid: record identifier; stored on the instance as a string.
            oauth_token: token handed to ``OrcidClient``.
            pushing_duplicated_identifier: when True, ``push()`` raises
                instead of trying to resolve a clashing identifier again.
            record_db_version: if given, the loaded record must not have a
                lower db version (checked by ``_get_inspire_record()``).

        Raises:
            Whatever ``_get_inspire_record()`` raises: the record is loaded
            eagerly here.
        """
        # Plain state first.
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.recid = str(recid)
        self.record_db_version = record_db_version
        self.pushing_duplicated_identifier = pushing_duplicated_identifier
        self.converter = None
        self.cached_author_putcodes = {}
        self.lock_name = "orcid:{}".format(orcid)

        # Then the collaborators. Loading the record needs ``self.recid`` and
        # ``self.record_db_version`` to be already set.
        self.inspire_record = self._get_inspire_record()
        self.cache = OrcidCache(orcid, recid)
        self.client = OrcidClient(oauth_token, orcid)

    @time_execution
    def _get_inspire_record(self):
        """Load the literature record identified by ``self.recid``.

        Returns:
            The record returned by
            ``LiteratureRecord.get_record_by_pid_value``.

        Raises:
            exceptions.RecordNotFoundException: the pid does not exist.
            exceptions.StaleRecordDBVersionException: ``record_db_version``
                was given and the loaded record has a lower db version.
        """
        try:
            record = LiteratureRecord.get_record_by_pid_value(self.recid)
        except PIDDoesNotExistError as exc:
            raise exceptions.RecordNotFoundException(
                "recid={} not found for pid_type=lit".format(self.recid),
                from_exc=exc)

        requested_version = self.record_db_version
        if not requested_version:
            # No specific version was requested: nothing to verify.
            return record

        # A record_db_version was given: ensure we are about to push the
        # right record version.
        # The orcid push is at this moment triggered by the signal
        # after_record_update (which happens after a InspireRecord.commit()).
        # That is not the actual commit to the db, which might happen at a
        # later stage or not at all.
        # Note that connecting to the proper SQLAlchemy signal would also
        # have issues: https://github.com/mitsuhiko/flask-sqlalchemy/issues/645
        if record.model.version_id < requested_version:
            raise exceptions.StaleRecordDBVersionException(
                "Requested push for db version={}, but actual record db"
                " version={}".format(requested_version,
                                     record.model.version_id))
        return record

    @property
    def _do_force_cache_miss(self):
        """
        Tell whether the record asks to bypass the cache.

        True when one of the record's ``_private_notes`` has the value
        ``orcid-push-force-cache-miss``. Meant as a hook for feature tests.
        """
        notes = self.inspire_record.get("_private_notes", [])
        forced = any(
            note.get("value") == "orcid-push-force-cache-miss"
            for note in notes)
        if forced:
            LOGGER.debug("OrcidPusher force cache miss",
                         recid=self.recid,
                         orcid=self.orcid)
        return forced

    @property
    def _is_record_deleted(self):
        """
        Tell whether the record has to be treated as deleted.

        A private note with the value ``orcid-push-force-delete`` forces a
        delete (hook for feature tests); otherwise the record's ``deleted``
        field decides, defaulting to False.
        """
        notes = self.inspire_record.get("_private_notes", [])
        if any(note.get("value") == "orcid-push-force-delete"
               for note in notes):
            LOGGER.debug("OrcidPusher force delete",
                         recid=self.recid,
                         orcid=self.orcid)
            return True
        return self.inspire_record.get("deleted", False)

    @time_execution  # noqa: C901
    def push(self):
        """Push the record to ORCID: create, update or delete the work.

        Unless a cache miss is forced, the cached putcode is read and the push
        is skipped when the record is not deleted and its content has not
        changed. A deleted record has its work deleted. Otherwise the record
        is converted and POSTed (no putcode) or PUT (known putcode), with
        recovery paths for the ORCID client errors handled below.

        Returns:
            The putcode of the work in ORCID: the cached one on a cache hit,
            the one returned by ORCID after a POST/PUT. None when the record
            is deleted.

        Raises:
            exceptions.DuplicatedExternalIdentifierPusherException: a work
                with a clashing identifier could not be resolved.
            exceptions.TokenInvalidDeletedException: ORCID rejected the token;
                the stored access token has been deleted.
            exceptions.InputDataInvalidException: any other ORCID client
                error derived from ``BaseOrcidClientJsonException``.
        """
        putcode = None
        if not self._do_force_cache_miss:
            putcode = self.cache.read_work_putcode()
            if not self._is_record_deleted and not self.cache.has_work_content_changed(
                    self.inspire_record):
                LOGGER.debug("OrcidPusher cache hit",
                             recid=self.recid,
                             orcid=self.orcid)
                return putcode
        LOGGER.debug("OrcidPusher cache miss",
                     recid=self.recid,
                     orcid=self.orcid)

        # If the record is deleted, then delete it.
        if self._is_record_deleted:
            self._delete_work(putcode)
            return None

        self.converter = OrcidConverter(
            record=self.inspire_record,
            url_pattern=current_app.config["LEGACY_RECORD_URL_PATTERN"],
            put_code=putcode,
        )

        try:
            putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.WorkAlreadyExistsException:
            # We POSTed the record as new work, but it failed because
            # a work with the same identifier is already in ORCID.
            # This can mean two things:
            # 1. the record itself is already in ORCID, but we don't have the putcode;
            # 2. a different record with the same external identifier is already in ORCID.
            # We first try to fix 1. by caching all author's putcodes and PUT the work again.
            # If the putcode wasn't found we are probably facing case 2.
            # so we try to push once again works with clashing identifiers
            # to update them and resolve the potential conflict.
            if self.pushing_duplicated_identifier:
                raise exceptions.DuplicatedExternalIdentifierPusherException
            putcode = self._cache_all_author_putcodes()
            if not putcode:
                try:
                    self._push_work_with_clashing_identifier()
                    putcode = self._post_or_put_work(putcode)
                except orcid_client_exceptions.WorkAlreadyExistsException:
                    # The PUT/POST failed despite pushing works with clashing identifiers
                    # and we can't do anything about this.
                    raise exceptions.DuplicatedExternalIdentifierPusherException
            else:
                # Keep the putcode reported by ORCID, as the first attempt
                # above does.
                putcode = self._post_or_put_work(putcode)
        except orcid_client_exceptions.DuplicatedExternalIdentifierException:
            # We PUT a record changing its identifier, but there is another work
            # in ORCID with the same identifier. We need to find out the recid
            # of the clashing work in ORCID and push a fresh version of that
            # record.
            # This scenario might be triggered by a merge of 2 records in Inspire.
            if not self.pushing_duplicated_identifier:
                self._push_work_with_clashing_identifier()
            # Raised exception will cause retry of celery task
            raise exceptions.DuplicatedExternalIdentifierPusherException
        except orcid_client_exceptions.PutcodeNotFoundPutException:
            # We try to push the work with invalid putcode, so we delete
            # its putcode and push it without any putcode.
            # If it turns out that the record already exists
            # in ORCID we search for the putcode by caching
            # all author's putcodes and PUT the work again.
            self.cache.delete_work_putcode()
            self.converter = OrcidConverter(
                record=self.inspire_record,
                url_pattern=current_app.config["LEGACY_RECORD_URL_PATTERN"],
                put_code=None,
            )
            putcode = self._cache_all_author_putcodes()
            # When no putcode was found the work is POSTed as new: the putcode
            # assigned by ORCID must be kept, otherwise None would be written
            # to the cache and returned.
            putcode = self._post_or_put_work(putcode)
        except (
                orcid_client_exceptions.TokenInvalidException,
                orcid_client_exceptions.TokenMismatchException,
                orcid_client_exceptions.TokenWithWrongPermissionException,
        ):
            # NOTE(review): this writes the raw OAuth token to the log;
            # consider redacting it.
            LOGGER.info("Deleting Orcid push access",
                        token=self.oauth_token,
                        orcid=self.orcid)
            push_access_tokens.delete_access_token(self.oauth_token,
                                                   self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

        self.cache.write_work_putcode(putcode, self.inspire_record)
        return putcode

    @time_execution
    def _post_or_put_work(self, putcode=None):
        """Send the converted work to ORCID and return its putcode.

        Args:
            putcode: falsy for a new work (POST); otherwise the putcode of
                the existing work to update (PUT).

        Returns:
            The ``"putcode"`` entry of the ORCID response.

        Raises:
            Whatever ``response.raise_for_result()`` raises.
        """
        work_xml = self.converter.get_xml(do_add_bibtex_citation=True)

        # ORCID API allows 1 non-idempotent call only for the same orcid at
        # the same time. Using `distributed_lock` to achieve this; only the
        # API call itself is made while holding the lock.
        with utils.distributed_lock(self.lock_name, blocking=True):
            response = (
                self.client.put_updated_work(work_xml, putcode)
                if putcode
                else self.client.post_new_work(work_xml)
            )

        LOGGER.info("POST/PUT ORCID work", recid=self.recid)
        response.raise_for_result()
        return response["putcode"]

    def _delete_works_with_duplicated_putcodes(self, cached_putcodes_recids):
        unique_recids_putcodes = {}
        for fetched_putcode, fetched_recid in cached_putcodes_recids:
            if fetched_recid in unique_recids_putcodes:
                self._delete_work(fetched_putcode)
            else:
                unique_recids_putcodes[fetched_recid] = fetched_putcode
        return unique_recids_putcodes

    @time_execution
    def _cache_all_author_putcodes(self):
        """Cache the putcodes of all the author's works and return ours.

        On the first call the (putcode, recid) pairs are fetched through
        ``OrcidPutcodeGetter``, works with a duplicated recid are deleted and
        the resulting recid -> putcode mapping is kept in
        ``self.cached_author_putcodes``. Later calls reuse that mapping.
        Every putcode of the mapping is written to the ``OrcidCache`` of its
        recid.

        Returns:
            The putcode (as ``int``) of the work matching ``self.recid``, or
            None when the author has no such work.

        Raises:
            exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes: a
                putcode was found for ``self.recid`` but it cannot be read
                back from the cache.
        """
        LOGGER.debug("New OrcidPusher cache all author putcodes",
                     orcid=self.orcid)

        if not self.cached_author_putcodes:
            putcode_getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)
            putcodes_recids = list(
                putcode_getter.get_all_inspire_putcodes_and_recids_iter())
            self.cached_author_putcodes = (
                self._delete_works_with_duplicated_putcodes(putcodes_recids))

        putcode = None
        for fetched_recid, fetched_putcode in (
                self.cached_author_putcodes.items()):
            if fetched_recid == self.recid:
                putcode = int(fetched_putcode)
            cache = OrcidCache(self.orcid, fetched_recid)
            cache.write_work_putcode(fetched_putcode)

        # Ensure the putcode is actually in cache.
        # Note: this step is not really necessary and it can be skipped, but
        # at this moment it helps isolate a potential issue.
        if putcode and not self.cache.read_work_putcode():
            # Format the local ``putcode``: the instance has no ``putcode``
            # attribute, so reading one would raise AttributeError instead of
            # the intended exception.
            raise exceptions.PutcodeNotFoundInCacheAfterCachingAllPutcodes(
                "No putcode={} found in cache for recid={} after having"
                " cached all author putcodes for orcid={}".format(
                    putcode, self.recid, self.orcid))

        return putcode

    @time_execution
    def _delete_work(self, putcode=None):
        """Delete a work from ORCID and drop this record's cached putcode.

        Args:
            putcode: putcode of the work to delete. When falsy it is looked
                up with ``_cache_all_author_putcodes()``.

        Raises:
            exceptions.InputDataInvalidException: ORCID answered with an
                error other than "putcode not found".
        """
        if not putcode:
            putcode = self._cache_all_author_putcodes()
        if not putcode:
            # Such recid does not exists (anymore?) in ORCID API.
            return

        # ORCID API allows 1 non-idempotent call only for the same orcid at
        # the same time. Using `distributed_lock` to achieve this.
        with utils.distributed_lock(self.lock_name, blocking=True):
            response = self.client.delete_work(putcode)

        try:
            response.raise_for_result()
        except orcid_client_exceptions.PutcodeNotFoundDeleteException:
            # Such putcode does not exists (anymore?) in orcid: the work is
            # already gone, which is the outcome we wanted.
            pass
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

        self.cache.delete_work_putcode()

    @time_execution
    def _push_work_with_clashing_identifier(self):
        """Push again the other records sharing our new external identifiers.

        The (putcode, recid) pairs matching the converter's added external
        identifiers are fetched, works with a duplicated recid are deleted,
        and ``tasks.orcid_push`` is called for every remaining recid other
        than ``self.recid`` with ``pushing_duplicated_identifier=True``.
        """
        getter = OrcidPutcodeGetter(self.orcid, self.oauth_token)
        clashing_pairs = getter.get_putcodes_and_recids_by_identifiers_iter(
            self.converter.added_external_identifiers)
        putcode_by_recid = self._delete_works_with_duplicated_putcodes(
            clashing_pairs)

        for recid, putcode in putcode_by_recid.items():
            # Skip incomplete entries and the record being pushed right now.
            if not putcode or not recid or recid == self.recid:
                continue

            # Local import to avoid import error.
            from inspirehep.orcid import tasks

            push_kwargs = {
                "pushing_duplicated_identifier": True,
                "record_db_version": self.record_db_version,
            }
            tasks.orcid_push(self.orcid, recid, self.oauth_token, push_kwargs)