Example #1
    def fetch(self, orcid_id, instance=settings.ORCID_BASE_DOMAIN):
        """
        Fetches the profile by id using the public API.

        :param orcid_id: the ORCID identifier to fetch
        :param instance: the domain name of the instance to use (orcid.org or sandbox.orcid.org)
        """
        if instance not in ['orcid.org', 'sandbox.orcid.org']:
            raise ValueError('Unexpected instance')
        try:
            headers = {'Accept': 'application/orcid+json'}
            profile_req = requests.get('http://pub.%s/v1.2/%s/orcid-profile' %
                                       (instance, orcid_id),
                                       headers=headers)
            parsed = profile_req.json()
            if parsed.get('orcid-profile') is None:
                # TEMPORARY: also check from the sandbox
                if instance == 'orcid.org':
                    return self.fetch(orcid_id, instance='sandbox.orcid.org')
                raise ValueError
            self.json = parsed
        except (requests.exceptions.HTTPError, ValueError):
            raise MetadataSourceException('The ORCiD %s could not be found' %
                                          orcid_id)
        except TypeError:
            raise MetadataSourceException(
                'The ORCiD %s returned invalid JSON.' % orcid_id)
Example #2
def urlopen_retry(url, **kwargs):  # kwargs: data, timeout, retries, delay, backoff, headers
    data = kwargs.get('data', None)
    timeout = kwargs.get('timeout', 10)
    retries = kwargs.get('retries', 3)
    delay = kwargs.get('delay', 5)
    backoff = kwargs.get('backoff', 2)
    headers = kwargs.get('headers', {})
    try:
        r = requests.get(url,
                         params=data,
                         timeout=timeout,
                         headers=headers,
                         allow_redirects=True)
        return r.text
    except requests.exceptions.Timeout as e:
        if retries <= 0:
            raise MetadataSourceException('Timeout: ' + str(e))
    except requests.exceptions.ConnectionError as e:
        if retries <= 0:
            raise MetadataSourceException('Connection error: ' + str(e))
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException('Request error: ' + str(e))

    print "Retrying in " + str(delay) + " seconds..."
    print "URL: " + url
    sleep(delay)
    return urlopen_retry(url,
                         data=data,
                         timeout=timeout,
                         retries=retries - 1,
                         delay=delay * backoff,
                         backoff=backoff)
Example #3
    def perform_romeo_query(self, search_terms):
        search_terms = search_terms.copy()
        if self.api_key:
            search_terms['ak'] = self.api_key

        # Perform the query
        try:
            req = requests.get(self.base_url, params=search_terms, timeout=20)
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException('Error while querying RoMEO.\n' +
                                          'URL was: '+self.base_url+'\n' +
                                          'Parameters were: '+str(search_terms)+'\n' +
                                          'Error is: '+str(e))

        # Parse it
        try:
            parser = ET.XMLParser(encoding='ISO-8859-1')
            root = ET.parse(BytesIO(req.content), parser)
        except ET.ParseError as e:
            raise MetadataSourceException('RoMEO returned an invalid XML response.\n' +
                                          'URL was: '+self.base_url+'\n' +
                                          'Parameters were: '+str(search_terms)+'\n' +
                                          'Error is: '+str(e))

        return root
Example #4
def perform_romeo_query(search_terms):
    search_terms = search_terms.copy()
    if ROMEO_API_KEY:
        search_terms['ak'] = ROMEO_API_KEY
    base_url = 'http://' + ROMEO_API_DOMAIN + '/romeo/api29.php'

    # Perform the query
    try:
        response = urlopen_retry(base_url, data=search_terms).encode('utf-8')
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException('Error while querying RoMEO.\n' +
                                      'URL was: ' + base_url + '\n' +
                                      'Parameters were: ' + str(search_terms) +
                                      '\n' + 'Error is: ' + str(e))

    # Parse it
    try:
        parser = ET.XMLParser(encoding='utf-8')
        root = ET.parse(BytesIO(response), parser)
    except ET.ParseError as e:
        with open('/tmp/romeo_response.xml', 'w') as f:
            f.write(response)
            f.write('\n')
        raise MetadataSourceException(
            'RoMEO returned an invalid XML response, dumped at /tmp/romeo_response.xml\n'
            + 'URL was: ' + base_url + '\n' + 'Parameters were: ' +
            str(search_terms) + '\n' + 'Error is: ' + str(e))

    return root
Example #5
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass
Example #6
def fetch_dois_by_batch(doi_list):
    """
    Fetch a list of DOIs by batch (useful when refreshing the list of publications
    of a given researcher, as the records have most likely already been cached
    by the proxy)
    """
    def results_list_to_dict(results):
        dct = {}
        for item in results:
            if item and 'DOI' in item:
                dct[item['DOI']] = item
        return dct

    if len(doi_list) == 0:
        return []
    elif len(doi_list) > nb_dois_per_batch:
        first_dois = fetch_dois_by_batch(doi_list[:nb_dois_per_batch])
        last_dois = fetch_dois_by_batch(doi_list[nb_dois_per_batch:])
        return first_dois + last_dois

    # Given how we are joining the DOIs, they cannot contain commas
    params = {
        'filter':
        ','.join(['doi:' + doi for doi in doi_list if ',' not in doi]),
        'mailto': settings.CROSSREF_MAILTO
    }
    req = None
    try:
        # First we fetch dois by batch from CrossRef. That's fast, but only
        # works for CrossRef DOIs
        req = make_crossref_call('/works', params=params)
        req.raise_for_status()
        results = req.json()['message'].get('items', [])
        dct = results_list_to_dict(results)

        # Some DOIs might not be in the results list, because they are issued by other organizations
        # We fetch them using our proxy (cached content negotiation)
        missing_dois = list(set(doi_list) - set(dct.keys()))
        if missing_dois:
            req = requests.post(
                'https://' + settings.DOI_PROXY_DOMAIN + '/batch',
                {'dois': json.dumps(missing_dois)})
            req.raise_for_status()
            missing_dois_dct = results_list_to_dict(req.json())
            dct.update(missing_dois_dct)

        result = [dct.get(doi) for doi in doi_list]
        return result
    except RequestException as e:
        raise MetadataSourceException('Connecting to the DOI proxy at ' +
                                      req.url + ' failed: ' + str(e))
    except ValueError as e:
        raise MetadataSourceException(
            'Invalid JSON returned by the DOI proxy: ' + str(e))
    except KeyError:
        return []
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException(
            'Failed to retrieve batch metadata from the proxy: ' + str(e))
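Because the function splits oversized lists into chunks of nb_dois_per_batch and returns results positionally aligned with the input (with None for DOIs that could not be resolved), a caller typically pairs the input list back with the output. A minimal usage sketch, assuming fetch_dois_by_batch is importable from this module (the import path is hypothetical):

from backend.crossref import fetch_dois_by_batch  # hypothetical import path

dois = ['10.1000/182', '10.1000/xyz123']  # placeholder DOIs
records = fetch_dois_by_batch(dois)

# Results keep the order of the input list; unresolved DOIs come back as None.
for doi, metadata in zip(dois, records):
    if metadata is None:
        print('no metadata found for %s' % doi)
    else:
        print('%s -> %s' % (doi, metadata.get('title')))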
Example #7
    def search_for_dois_incrementally(
            self,
            query,
            filters={},
            max_batches=max_crossref_batches_per_researcher):
        """
        Searches for DOIs for the given query and yields their metadata as it finds them.

        :param query: the search query to pass to CrossRef
        :param filters: filters as specified by the REST API
        :param max_batches: maximum number of queries to send to CrossRef
        """
        params = {}
        if query:
            params['query'] = query
        if filters:
            params['filter'] = ','.join(
                map(lambda (k, v): k + ":" + v, filters.items()))

        count = 0
        rows = 20
        offset = 0
        while not max_batches or count < max_batches:
            url = 'http://api.crossref.org/works'
            params['rows'] = rows
            params['offset'] = offset

            try:
                r = requests.get(url, params=params)
                print "CROSSREF: " + r.url
                js = r.json()
                found = False
                for item in jpath('message/items', js, default=[]):
                    found = True
                    yield item
                if not found:
                    break
            except ValueError as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nInvalid response.\n'
                    +
                    'URL was: %s\nParameters were: %s\nJSON parser error was: %s'
                    % (url, urlencode(params), unicode(e)))
            except requests.exceptions.RequestException as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nUnable to open the URL: '
                    + url + '\nError was: ' + str(e))

            offset += rows
            count += 1
Example #8
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass
Example #9
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary containing at least one of these fields: 'issn', 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(
            map(lambda x: x in allowed_fields, (key for key in search_terms))):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) + ' but the dictionary I got is ' +
                         str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' + 'Terms were: ' +
            str(search_terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass
Example #10
    def fetch_all_records(self, filters=None, cursor="*"):
        """
        Fetches all Crossref records from their API, starting at a given date.

        :param filters: filters as specified by the REST API (as a dictionary)
        :param cursor: the initial cursor where to start the fetching
            (useful to resume failed ingestions)
        """
        if filters is None:
            filters = {}
        params = {}
        if filters:
            params['filter'] = ','.join(k + ":" + v
                                        for k, v in list(filters.items()))

        rows = 100
        next_cursor = cursor
        while next_cursor:
            params['rows'] = rows
            params['cursor'] = next_cursor
            params['mailto'] = settings.CROSSREF_MAILTO

            try:
                r = make_crossref_call('/works', params=params)
                r.raise_for_status()
                js = r.json()
                if js['status'] == 'failed':
                    raise MetadataSourceException(
                        'Querying Crossref with {} failed.'.format(r.url))
                found = False
                for item in jpath('message/items', js, default=[]):
                    found = True
                    yield item
                if not found:
                    break
                next_cursor = jpath('message/next-cursor', js)
                logger.info(
                    "Next cursor: {}".format(next_cursor))  # to ease recovery
            except ValueError as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nInvalid response.\n'
                    + 'Parameters were: %s\nJSON parser error was: %s' %
                    (urlencode(params), str(e)))
            except requests.exceptions.RequestException as e:
                raise MetadataSourceException(
                    'Error while fetching CrossRef results:\nError was: ' +
                    str(e))
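Since each successful page logs the next-cursor value, an interrupted harvest can be resumed by passing the last logged cursor back in instead of the default "*". A minimal consumption sketch (the instantiation of the source object and the record handler are hypothetical; only fetch_all_records itself comes from the example above):

# source is an instance of the class that defines fetch_all_records above
filters = {'from-update-date': '2018-01-01'}  # filter name as documented by Crossref's REST API

for item in source.fetch_all_records(filters=filters):
    handle_record(item)  # hypothetical per-record handler

# After a crash, restart from the cursor last printed by the logger:
# for item in source.fetch_all_records(filters=filters, cursor='<last logged cursor>'):
#     handle_record(item)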
Example #11
def request_retry(url, **kwargs):
    """
    Retries a request, with throttling and exponential back-off.
    
    :param url: the URL to fetch
    :param data: the GET parameters
    :param headers: the HTTP headers
    :param timeout: the number of seconds to wait before declaring that an individual request timed out (default 10)
    :param retries: the number of times to retry a query (default 3)
    :param delay: the minimum delay between requests (default 5)
    :param backoff: the multiple used when raising the delay after an unsuccessful query (default 2)
    """
    data = kwargs.get('data', None)
    timeout = kwargs.get('timeout', 10)
    retries = kwargs.get('retries', 3)
    delay = kwargs.get('delay', 5)
    backoff = kwargs.get('backoff', 2)
    headers = kwargs.get('headers', {})
    try:
        r = requests.get(url,
                         params=data,
                         timeout=timeout,
                         headers=headers,
                         allow_redirects=True)
        r.raise_for_status()
        return r
    except requests.exceptions.Timeout as e:
        if retries <= 0:
            raise MetadataSourceException('Timeout: '+str(e))
    except requests.exceptions.ConnectionError as e:
        if retries <= 0:
            raise MetadataSourceException('Connection error: '+str(e))
    except requests.exceptions.RequestException as e:
        if retries <= 0:
            raise MetadataSourceException('Request error: '+str(e))

    logger.info("Retrying in "+str(delay)+" seconds with url "+url)
    sleep(delay)
    return request_retry(url,
                         data=data,
                         timeout=timeout,
                         retries=retries-1,
                         delay=delay*backoff,
                         backoff=backoff)
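The docstring above spells out the back-off behaviour: each failed attempt waits delay seconds, the delay is then multiplied by backoff, and once retries reaches zero a MetadataSourceException is raised instead of retrying. A minimal usage sketch (the import path is an assumption, not part of the example):

from backend.utils import request_retry, MetadataSourceException  # hypothetical import path

try:
    # Up to four attempts in total, with pauses of 5, 10 and 20 seconds between them.
    r = request_retry('https://api.crossref.org/works',
                      data={'rows': 5},
                      headers={'User-Agent': 'example-client'},
                      timeout=10,
                      retries=3,
                      delay=5,
                      backoff=2)
    print(r.status_code)
except MetadataSourceException as e:
    print('giving up: %s' % e)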
Example #12
    def fetch(self):
        """
        Fetches the profile by id using the public API.
        This only fetches the summaries; subsequent requests will be made for works.
        """
        try:
            parsed = self.request_element('')
            if parsed.get('orcid-identifier') is None:
                # TEMPORARY: also check from the sandbox
                if self.instance == 'orcid.org':
                    self.instance = 'sandbox.orcid.org'
                    return self.fetch()
                raise ValueError
            self.json = parsed
        except (requests.exceptions.HTTPError, ValueError):
            raise MetadataSourceException(
                'The ORCiD {id} could not be found'.format(id=self.id))
        except TypeError:
            raise MetadataSourceException(
                'The ORCiD {id} returned invalid JSON.'.format(id=self.id))
Example #13
def fetch_zotero_by_DOI(doi):
    """
    Fetch Zotero metadata for a given DOI.
    Works only with the doi_cache proxy.
    """
    try:
        request = requests.get('http://'+DOI_PROXY_DOMAIN+'/zotero/'+doi)
        return request.json()
    except ValueError as e:
        raise MetadataSourceException('Error while fetching Zotero metadata:\nInvalid JSON response.\n'+
                'Error: '+str(e))
Example #14
    def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
        """
        Creates (or returns an existing) researcher from its ORCID id.

        :param profile: an :class:`OrcidProfile` object if it has already been fetched
                        from the API (otherwise we will fetch it ourselves)
        :param user: a user to associate with the profile.
        :returns: a :class:`Researcher` if everything went well, raises MetadataSourceException otherwise
        """
        researcher = None
        if orcid is None:
            raise MetadataSourceException('Invalid ORCID id')
        try:
            researcher = Researcher.objects.get(orcid=orcid)
        except Researcher.DoesNotExist:
            if profile is None:
                profile = OrcidProfile(id=orcid)
            else:
                profile = OrcidProfile(json=profile)
            name = profile.name
            homepage = profile.homepage
            email = profile.email
            researcher = Researcher.create_by_name(name[0],
                                                   name[1],
                                                   orcid=orcid,
                                                   user=user,
                                                   homepage=homepage,
                                                   email=email)

            # Ensure that extra info is added.
            save = False
            for kw, val in [('homepage', homepage), ('orcid', orcid),
                            ('email', email)]:
                if not researcher.__dict__[kw] and val:
                    researcher.__dict__[kw] = val
                    save = True
            if save:
                researcher.save()

            for variant in profile.other_names:
                confidence = name_similarity(variant, variant)
                name = Name.lookup_name(variant)
                researcher.add_name_variant(name, confidence)

        return researcher
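The classmethod is intended to be idempotent: a second call with the same ORCID returns the existing Researcher instead of creating a duplicate, and extra profile fields (homepage, email) are only filled in when they are missing. A minimal usage sketch (the import path is an assumption; the iD below is the public example iD from ORCID's documentation):

from papers.models import Researcher  # hypothetical import path

researcher = Researcher.get_or_create_by_orcid('0000-0002-1825-0097')

# Calling it again returns the same database record rather than creating a duplicate.
assert Researcher.get_or_create_by_orcid('0000-0002-1825-0097').pk == researcher.pk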
Example #15
def fetch_metadata_by_DOI(doi):
    """
    Fetch the metadata for a single DOI.
    This is supported by the standard proxy, doi.org,
    as well as more advanced proxies such as doi_cache
    """
    if doi is None:
        return
    addheaders = {'Accept': 'application/citeproc+json'}
    try:
        request = 'http://'+DOI_PROXY_DOMAIN+'/'+doi
        response = urlopen_retry(request,
                                 timeout=crossref_timeout,
                                 headers=addheaders,
                                 retries=0)
        parsed = json.loads(response)
        return parsed
    except ValueError as e:
        raise MetadataSourceException('Error while fetching DOI metadata:\nInvalid JSON response.\n' +
                                      'Error: '+str(e))
Example #16
    def fetch_orcid_records(self,
                            orcid_identifier,
                            profile=None,
                            use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.
        It also fetches such papers from the CrossRef search interface.

        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        """
        cr_api = CrossRefAPI()

        # Cleanup iD:
        orcid_id = validate_orcid(orcid_identifier)
        if orcid_id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(orcid_id=orcid_id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            print(e)
            return

        # As we have fetched the profile, let's update the Researcher
        self.researcher = Researcher.get_or_create_by_orcid(orcid_identifier,
                                                            profile.json,
                                                            update=True)
        if not self.researcher:
            return

        # Reference name
        ref_name = profile.name
        ignored_papers = []  # list of ignored papers due to incomplete metadata

        # Get summary publications and separate them in two classes:
        # - the ones with DOIs, that we will fetch with CrossRef
        dois_and_putcodes = []  # list of (DOIs,putcode) to fetch
        # - the ones without: we will fetch ORCID's metadata about them
        #   and try to create a paper with what they provide
        put_codes = []
        for summary in profile.work_summaries:
            if summary.doi and use_doi:
                dois_and_putcodes.append((summary.doi, summary.put_code))
            else:
                put_codes.append(summary.put_code)

        # 1st attempt with DOIs and CrossRef
        if use_doi:
            # Let's grab papers with DOIs found in our ORCiD profile.
            dois = [doi for doi, put_code in dois_and_putcodes]
            for idx, (success, paper_or_metadata) in enumerate(
                    self.fetch_metadata_from_dois(cr_api, ref_name, orcid_id,
                                                  dois)):
                if success:
                    yield paper_or_metadata
                else:
                    put_codes.append(dois_and_putcodes[idx][1])

        # 2nd attempt with ORCID's own crappy metadata
        works = profile.fetch_works(put_codes)
        for work in works:
            if not work:
                continue

            # If the paper is skipped due to invalid metadata.
            # We first try to reconcile it with local researcher author name.
            # Then, we consider it missed.
            if work.skipped:
                print(work.json)
                print(work.skip_reason)
                print('work skipped due to incorrect metadata (%s)' %
                      (work.skip_reason))

                ignored_papers.append(work.as_dict())
                continue

            yield self.create_paper(work)

        self.warn_user_of_ignored_papers(ignored_papers)
        if ignored_papers:
            print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
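Because fetch_orcid_records is a generator, papers are yielded as soon as their metadata is resolved: first the DOI-backed summaries via CrossRef, then the remaining put-codes via ORCID's own metadata. A minimal consumption sketch (the instantiation of the paper source is hypothetical; only the method comes from the example above):

# source is an instance of the ORCID paper source defining fetch_orcid_records above
for paper in source.fetch_orcid_records('0000-0002-1825-0097'):
    print(paper)

# Skipping the per-DOI CrossRef lookups relies on ORCID's own metadata only, which is faster:
for paper in source.fetch_orcid_records('0000-0002-1825-0097', use_doi=False):
    print(paper)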
Example #17
    def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.
        It also fetches such papers from the CrossRef search interface.

        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        """
        crps = CrossRefPaperSource(self.ccf)

        # Cleanup iD:
        orcid_id = validate_orcid(orcid_identifier)
        if orcid_id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(id=orcid_id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            print(e)
            return

        # Reference name
        ref_name = profile.name
        # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
        dois = [] # list of DOIs to fetch
        ignored_papers = [] # list of ignored papers due to incomplete metadata

        # Fetch publications (1st attempt with ORCiD data)
        pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work', profile, [])
        for pub in pubs:
            data_paper = ORCIDDataPaper.from_orcid_metadata(
                ref_name,
                orcid_id,
                pub,
                stop_if_dois_exists=use_doi
            )

            if data_paper.dois and use_doi: # We want to batch it rather than manually do it.
                dois.extend(data_paper.dois)
                continue

            # If the paper is skipped due to invalid metadata.
            # We first try to reconcile it with local researcher author name.
            # Then, we consider it missed.
            if data_paper.skipped:
                print ('%s is skipped due to incorrect metadata (%s)' % (data_paper, data_paper.skip_reason))

                print ('Trying to reconcile it with local researcher.')
                data_paper = self.reconcile_paper(
                    ref_name,
                    orcid_id,
                    pub,
                    overrides={
                        'authors': [(self.researcher.name.first, self.researcher.name.last)]
                    }
                )
                if data_paper.skipped:
                    ignored_papers.append(data_paper.as_dict())
                    continue

            yield self.create_paper(data_paper)

        # 2nd attempt with DOIs and CrossRef
        if use_doi:
            # Let's grab papers from CrossRef
            for success, paper_or_metadata in self.fetch_crossref_incrementally(crps, orcid_id):
                if success:
                    yield paper_or_metadata
                else:
                    ignored_papers.append(paper_or_metadata)
                    print ('This metadata (%s) yields no paper.' % (paper_or_metadata))

            # Let's grab papers with DOIs found in our ORCiD profile.
            # FIXME(RaitoBezarius): if we fail here, we should get back the pub and yield it.
            for success, paper_or_metadata in self.fetch_metadata_from_dois(crps, ref_name, orcid_id, dois):
                if success:
                    yield paper_or_metadata
                else:
                    ignored_papers.append(paper_or_metadata)
                    print ('This metadata (%s) yields no paper.' % (paper_or_metadata))
       
        self.warn_user_of_ignored_papers(ignored_papers)
        if ignored_papers:
            print ('Warning: Total ignored papers: %d' % (len(ignored_papers)))
Example #18
    def fetch_orcid_records(self, id, profile=None, use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.
        It also fetches such papers from the CrossRef search interface.

        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        """
        crps = CrossRefPaperSource(self.ccf)

        # Cleanup iD:
        id = validate_orcid(id)
        if id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(id=id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            print e
            return

        # Reference name
        ref_name = profile.name
        # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
        dois = []  # list of DOIs to fetch
        papers = []  # list of papers created
        records_found = 0  # how many records did we successfully import from the profile?

        # Fetch publications
        pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                     profile, [])
        for pub in pubs:

            def j(path, default=None):
                return jpath(path, pub, default)

            # DOI
            doi = None
            for extid in j(
                    'work-external-identifiers/work-external-identifier', []):
                if extid.get('work-external-identifier-type') == 'DOI':
                    doi = to_doi(
                        jpath('work-external-identifier-id/value', extid))
                    if doi:
                        # If a DOI is available, create the paper using metadata from CrossRef.
                        # We don't do it yet, we only store the DOI, so that we can fetch them
                        # by batch later.
                        dois.append(doi)

            if doi and use_doi:
                continue

            # Extract information from ORCiD

            # Title
            title = j('work-title/title/value')
            if title is None:
                print "Warning: Skipping ORCID publication: no title"

            # Type
            doctype = orcid_to_doctype(j('work-type', 'other'))

            # Contributors (ignored for now as they are very often not present)
            def get_contrib(js):
                return {
                    'orcid': jpath('contributor-orcid', js),
                    'name': jpath('credit-name/value', js),
                }

            contributors = map(get_contrib,
                               j('work-contributors/contributor', []))

            author_names = filter(lambda x: x is not None,
                                  map(lambda x: x['name'], contributors))
            authors = map(parse_comma_name, author_names)
            pubdate = None
            # ORCiD internal id
            identifier = j('put-code')
            affiliations = map(lambda x: x['orcid'], contributors)
            # Pubdate
            year = parse_int(j('publication-date/year/value'), 1970)
            month = parse_int(j('publication-date/month/value'), 01)
            day = parse_int(j('publication-date/day/value'), 01)
            pubdate = None
            try:
                pubdate = date(year=year, month=01, day=01)
                pubdate = date(year=year, month=month, day=01)
                pubdate = date(year=year, month=month, day=day)
            except ValueError:
                if pubdate is None:
                    print "Invalid publication date in ORCID publication, skipping"
                    continue

            # Citation type: metadata format
            citation_format = j('work-citation/work-citation-type')
            print citation_format
            bibtex = j('work-citation/citation')

            if bibtex is not None:
                try:
                    entry = parse_bibtex(bibtex)

                    if entry.get('author', []) == []:
                        print "Warning: Skipping ORCID publication: no authors."
                        print j('work-citation/citation')
                    if not authors:
                        authors = entry['author']
                except ValueError:
                    pass

            affiliations = affiliate_author_with_orcid(
                ref_name, id, authors, initial_affiliations=affiliations)

            authors = map(name_lookup_cache.lookup, authors)

            if not authors:
                print "No authors found, skipping"
                continue

            # Create paper:
            paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                     affiliations)

            record = BareOaiRecord(source=orcid_oai_source,
                                   identifier=identifier,
                                   splash_url='http://orcid.org/' + id,
                                   pubtype=doctype)

            paper.add_oairecord(record)
            yield paper

        if use_doi:
            for metadata in crps.search_for_dois_incrementally(
                    '', {'orcid': id}):
                try:
                    paper = crps.save_doi_metadata(metadata)
                    if paper:
                        yield paper
                except ValueError as e:
                    print "Saving CrossRef record from ORCID failed: %s" % unicode(
                        e)

            # Now we add the DOIs found in the ORCID profile.
            doi_metadata = fetch_dois(dois)
            for metadata in doi_metadata:
                try:
                    authors = map(convert_to_name_pair, metadata['author'])
                    affiliations = affiliate_author_with_orcid(
                        ref_name, id, authors)
                    paper = crps.save_doi_metadata(metadata, affiliations)
                    if not paper:
                        continue
                    record = BareOaiRecord(source=orcid_oai_source,
                                           identifier='orcid:' + id + ':' +
                                           metadata['DOI'],
                                           splash_url='http://orcid.org/' + id,
                                           pubtype=paper.doctype)
                    paper.add_oairecord(record)
                    yield paper
                except (KeyError, ValueError, TypeError):
                    pass
Example #19
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.')

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.')

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.')
Example #20
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary containing at least one of these fields: 'issn', 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or in
    # the API
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])

    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) + ' but the dictionary I got is ' +
                         str(terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        if len(terms[key]) > 256:
            return None

    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal

    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))

    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' + 'Terms were: ' +
            unicode(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass

    # Now we may have additional info, so it's worth trying again in the model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal

    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using the
    # <romeopub>
    publisher_desc = publishers[0]

    publisher = get_or_create_publisher(publisher_desc)

    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result
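In practice a caller passes either an issn or a jtitle (the two keys in allowed_fields); the function first checks the local model, then queries RoMEO, and saves the resulting Journal. A minimal usage sketch, assuming fetch_journal is importable from the module above (import path hypothetical):

from backend.romeo import fetch_journal  # hypothetical import path

# Look up by ISSN first; this is the most reliable key.
journal = fetch_journal({'issn': '0028-0836'})
if journal is None:
    # Fall back to a title search with the less restrictive matching mode used above.
    journal = fetch_journal({'jtitle': 'Nature'}, matching_mode='contains')

if journal:
    print(journal.title, journal.issn)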
Example #21
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.\n' + 'URL was: ' +
            request)

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.\n' + 'URL was: ' +
            request)

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
Example #22
    def get_or_create_publisher(self, romeo_xml_description):
        """
        Retrieves from the model, or creates into the model,
        the publisher corresponding to the <publisher> description
        from RoMEO.

        If the data from RoMEO is fresher than what we have
        in cache, we update our model.
        """
        xml = romeo_xml_description
        romeo_id = None
        try:
            romeo_id = xml.attrib['id']
        except KeyError:
            raise MetadataSourceException('RoMEO did not provide a publisher id.')

        romeo_parent_id = None
        try:
            romeo_parent_id = xml.attrib['parentid']
        except KeyError:
            pass

        name = None
        try:
            raw_name = xml.findall('./name')[0].text.strip()
            name = fromstring(kill_html(sanitize_html(raw_name))).text
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the publisher\'s name.')

        alias = None
        try:
            alias = nstrip(xml.findall('./alias')[0].text)
            if alias:
                alias = fromstring(kill_html(sanitize_html(alias))).text
        except (KeyError, IndexError):
            pass

        last_update = self._get_romeo_date(xml, './dateupdated')

        # Check if we already have it.
        # Sadly the romeo_id is not unique (as publishers imported from doaj
        # all get the same id, so we have to use the name too).
        matches = None
        if re.match(r'\d+', romeo_id): # numeric ids are unambiguous
            matches = Publisher.objects.filter(romeo_id=romeo_id)
        elif alias:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
        else:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
        if matches:
            first_match = matches[0]
            if first_match.last_updated is not None and first_match.last_updated >= last_update:
                return matches[0]

        # Otherwise, create it
        url = None
        try:
            url = nstrip(xml.findall('./homeurl')[0].text)
        except (KeyError, IndexError):
            pass

        preprint = None
        try:
            preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the preprint policy.')

        postprint = None
        try:
            postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the postprint policy.')

        pdfversion = None
        try:
            pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the pdf archiving policy.')

        # Compute OA status of the publisher
        status = 'UNK'

        if not matches:
            publisher = Publisher()
        else:
            publisher = matches[0]

        publisher.name = name
        publisher.alias = alias
        publisher.url = url
        publisher.preprint = preprint
        publisher.postprint = postprint
        publisher.pdfversion = pdfversion
        publisher.romeo_id = romeo_id
        publisher.romeo_parent_id = romeo_parent_id
        publisher.oa_status = status
        publisher.last_updated = last_update
        publisher.save()

        if matches:
            publisher.publishercopyrightlink_set.all().delete()
            publisher.publisherrestrictiondetail_set.all().delete()
            publisher.publishercondition_set.all().delete()

        # Add the conditions, restrictions, and copyright
        for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
            self.add_restriction(restriction, 'preprint', publisher)

        for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
            self.add_restriction(restriction, 'postprint', publisher)

        for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
            self.add_restriction(restriction, 'pdfversion', publisher)

        for condition in xml.findall('./conditions/condition'):
            if condition.text:
                c = PublisherCondition(publisher=publisher,
                                       text=condition.text.strip())
                c.save()

        # Update the publisher status
        publisher.oa_status = publisher.classify_oa_status()
        publisher.save(update_fields=['oa_status'])

        # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
        # adequate task

        for link in xml.findall('./copyrightlinks/copyrightlink'):
            text = None
            url = None
            texts = link.findall('./copyrightlinktext')
            if texts:
                text = nstrip(texts[0].text)
            urls = link.findall('./copyrightlinkurl')
            if urls:
                url = nstrip(urls[0].text)
            if url and text:
                cplink = PublisherCopyrightLink(
                    text=text, url=url[:1024], publisher=publisher)
                cplink.save()

        return publisher