Ejemplo n.º 1
0
    def test_oa_policy_select_function(self):
        """Smoke-test oa_policy.oa_for_license for a known and an unknown license."""
        # Per the module's own notes, oa_for_license is a convenience wrapper
        # around oa_from_rights that picks the license out of the licenses
        # module. Whether the rights definitions themselves are correct is
        # left to the plugin content tests; only the wrapper's lookup
        # behaviour is exercised here.

        # A license type that is expected to exist. The returned value is
        # deliberately discarded: the only requirement is that the lookup
        # completes without raising.
        oa_policy.oa_for_license('cc-by')

        # A license type that is not defined must surface as a KeyError.
        with self.assertRaises(KeyError):
            oa_policy.oa_for_license('should_not_exist')
Ejemplo n.º 2
0
    def license_detect(self, record):
        """
        To respond to the provider identifier: http://elife.elifesciences.org

        Determine the licence conditions of the eLife article and append the
        resulting license object(s) to record['bibjson']['license'] (note
        the US spelling).

        :param record: dict-like request record. The DOI is read from
        record['provider']['doi']; detected licenses are appended to the
        list at record['bibjson']['license'] (created if missing).

        :return: None. The record is left untouched when it carries no DOI,
        when the response cannot be parsed as XML, or when none of the
        license mappings match.
        """

        # List of licensing statements to look for on this publisher's pages.
        # In eLife's case they take the form of {xpath string: meaning object}
        # since we're not scraping HTML, we're using an XML API.
        # meaning['type'] identifies the license (see licenses.py)
        # and meaning['version'] identifies the license version (if available)
        elife_license_mappings = [
            {'//license[@xlink:href="http://creativecommons.org/licenses/by/3.0/" and @license-type="open-access"]': 
                {
                    'type': 'cc-by', 'version':'3.0',
                    # also declare some properties which override info about this license in the licenses list (see licenses module)
                    'url': 'http://creativecommons.org/licenses/by/3.0/'
                }
            }
        ]

        # 1. get DOI from record object
        doi = record['provider'].get('doi')
        if not doi:
            # nothing to query without a DOI
            return

        # 2. query elife XML api
        url = 'http://elife.elifesciences.org/elife-source-xml/' + doi
        response = requests.get(url)

        try:
            # Parse the raw response bytes. response.text is already decoded
            # text, so calling .decode() on it forces an implicit ASCII
            # re-encode that fails on any non-ASCII character; the XML parser
            # can work out the encoding from the bytes itself.
            xml = etree.fromstring(response.content)
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            # Without a parsed document there is nothing to inspect. Falling
            # through would hit an unbound `xml` and raise NameError.
            return

        # process the XML response
        namespaces = {'xlink': 'http://www.w3.org/1999/xlink'}

        for mapping in elife_license_mappings:
            # each mapping is a single-entry dict: {xpath: meaning}
            xpath = list(mapping)[0]
            meaning = mapping[xpath]

            if len(xml.xpath(xpath, namespaces=namespaces)) == 0:
                continue

            lic_type = meaning['type']

            # license identified, now use that to construct the license object
            license = deepcopy(LICENSES[lic_type])
            license['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license.setdefault('version', '')
            license.setdefault('description', '')
            license.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license.update(meaning)

            # add provenance information to the license object
            license['provenance'] = {
                'handler': self._short_name,
                'handler_version': self.__version__,
                'date': datetime.strftime(datetime.now(), config.date_format),
                'source': url,
                'agent': config.agent,
                'category': 'xml_api',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': 'License decided by querying the eLife XML API at ' + url
            }

            record['bibjson'].setdefault('license', [])
            record['bibjson']['license'].append(license)
Ejemplo n.º 3
0
    def simple_extract(self, lic_statements, record, url,
            first_match=False):
        """
        Generic code which looks for a particular string in a given web
        page (URL), determines the licence conditions of the article and
        populates the record['bibjson']['license'] (note the US
        spelling) field.

        The URL it analyses, the statements it looks for and the
        resulting licenses are passed in. This is not a plugin for a
        particular publisher - it just contains (allows re-use) the
        logic that any "dumb string matching" plugin would use.

        :param lic_statements: licensing statements to look for on this
        publisher's pages. An iterable of single-entry dicts of the form
        {statement: meaning} where meaning['type'] identifies the
        license (see licenses.py) and meaning['version'] identifies the
        license version (if available).
        See a publisher plugin for an example, e.g. bmc.py

        :param record: a request for the OAG status of an article, see
        OAG docs for more info. Matching licenses are appended to
        record['bibjson']['license'].

        :param url: source url of the item to be fetched. This is where
        the HTML page that's going to be scraped is expected to reside.

        :param first_match: stop trying license statements if one of
        them is found at the target url. By default, this code will try
        out all supplied license statements and simply add multiple
        'license' objects to the record it's been passed. If you want
        "first successful match only" behaviour, set this to True.

        :return: None; the record is modified in place.
        """

        # get content
        r = requests.get(url)
        content = self.normalise_string(r.content)

        # see if one of the licensing statements is in content
        # and populate record with appropriate license info
        for statement_mapping in lic_statements:
            # get the statement string itself - always the first key of the dict
            # mapping statements to licensing info
            statement = list(statement_mapping)[0]

            # use a modified version of the license statement for
            # comparison - one which has been subjected to the same
            # normalisation as the incoming content (whitespace,
            # lowercasing etc.)
            cmp_statement = self.normalise_string(statement)

            # An empty statement is a substring of every page ('' in s is
            # always True), so it must never count as a match.
            if not cmp_statement:
                continue

            if cmp_statement not in content:
                continue

            # okay, statement found on the page -> get license type
            meaning = statement_mapping[statement]
            lic_type = meaning['type']

            # license identified, now use that to construct the license object
            license = deepcopy(LICENSES[lic_type])
            license['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license.setdefault('version', '')
            license.setdefault('description', '')
            license.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license.update(meaning)

            # add provenance information to the license object
            license['provenance'] = {
                'date': datetime.strftime(datetime.now(), config.date_format),
                'source': url,
                'agent': config.agent,
                'category': 'page_scrape',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': self.gen_provenance_description(url, statement),
                'handler': self._short_name,  # the name of the plugin processing this record
                'handler_version': self.__version__  # version of the plugin processing this record
            }

            record['bibjson'].setdefault('license', [])
            record['bibjson']['license'].append(license)

            if first_match:
                break
Ejemplo n.º 4
0
    def simple_extract(self, lic_statements, record, url, first_match=False, content='', handler=''):
        """
        Generic code which looks for a particular string in a given web
        page (URL), determines the licence conditions of the article and
        records every license it finds on the record via
        record.add_license_object().

        The URL it analyses, the statements it looks for and the
        resulting licenses are passed in. This is not a plugin for a
        particular publisher - it just contains (allows re-use) the
        logic that any "dumb string matching" plugin would use.

        :param lic_statements: licensing statements to look for on this
        publisher's pages. An iterable of single-entry dicts of the form
        {statement: meaning} where meaning['type'] identifies the
        license (see licenses.py) and meaning['version'] identifies the
        license version (if available).
        See a publisher plugin for an example, e.g. bmc.py

        :param record: a request for the OAG status of an article, see
        OAG docs for more info.

        :param url: source url of the item to be fetched. This is where
        the HTML page that's going to be scraped is expected to reside.

        :param first_match: stop trying license statements if one of
        them is found at the target url. By default, this code will try
        out all supplied license statements and simply add multiple
        'license' objects to the record it's been passed. If you want
        "first successful match only" behaviour, set this to True.

        :param content: page content to analyse. When empty (the
        default) the content is fetched from url instead.

        :param handler: handler name to store in the provenance
        information; defaults to this plugin's _short_name.

        :raises PluginException: when the content has to be fetched and
        the HTTP status of the response is not OK.

        :return: None; the record is modified in place.
        """
        if not handler:
            handler = self._short_name  # can't put it in the method signature above, self is unresolved

        if not content:
            # get content from the web unless it's being passed into this method
            r, content, source_size = util.http_stream_get(url)
            if r.status_code != requests.codes.ok:
                raise PluginException(PluginException.HTTP, "could not retrieve content from " + url + " - " + str(r.status_code))
        else:
            source_size = len(content)

        content = self.normalise_string(content)

        if not content:
            return

        # Compare text with text: byte strings are decoded once, up front,
        # rather than on every loop iteration.
        if isinstance(content, bytes):
            content = content.decode('utf-8', 'replace')

        # HTML-stripped variant of the content. Computed lazily (only if a
        # statement fails the raw comparison) and kept separate from
        # `content`, so that every statement is compared against the same
        # text no matter in which order the statements are tried.
        stripped_content = None

        # see if one of the licensing statements is in content
        # and populate record with appropriate license info
        for statement_mapping in lic_statements:
            # get the statement string itself - always the first key of the dict
            # mapping statements to licensing info
            statement = list(statement_mapping)[0]

            # use a modified version of the license statement for
            # comparison - one which has been subjected to the same
            # normalisation as the incoming content (whitespace,
            # lowercasing etc.)
            cmp_statement = self.normalise_string(statement)
            # do not try to match empty statements, will always result in a match
            if not cmp_statement:
                continue

            if isinstance(cmp_statement, bytes):
                cmp_statement = cmp_statement.decode('utf-8', 'replace')

            match = cmp_statement in content

            if not match:
                # second chance: compare with the HTML taken out of both sides
                cmp_statement = self.strip_html(cmp_statement)
                if not cmp_statement:
                    # nothing left of the statement after the html stripping;
                    # '' in 'string' == True, so this would be a false positive
                    continue
                if stripped_content is None:
                    stripped_content = self.strip_html(content)
                match = cmp_statement in stripped_content

            if not match:
                continue

            # okay, statement found on the page -> get license type
            meaning = statement_mapping[statement]
            lic_type = meaning['type']

            # license identified, now use that to construct the license object
            license = deepcopy(LICENSES[lic_type])
            license['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license.setdefault('version', '')
            license.setdefault('description', '')
            license.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license.update(meaning)

            # add provenance information to the license object
            license['provenance'] = {
                'date': datetime.strftime(datetime.now(), config.date_format),
                'source': url,
                "source_size" : source_size,
                'agent': config.agent,
                'category': 'page_scrape',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': self.gen_provenance_description(url, statement),
                'handler': handler,  # the name of the plugin processing this record
                'handler_version': self.__version__  # version of the plugin processing this record
            }

            record.add_license_object(license)

            if first_match:
                break
Ejemplo n.º 5
0
    def license_detect(self, record):
        """
        Determine the licence of an article by querying the Elsevier XML
        API with the record's DOI, and add the resulting license object
        to the record via record.add_license_object().

        The response is checked for an open access flag
        (openaccessArticle) and for a license URL
        (openaccessUserLicense). A Creative Commons URL is turned into a
        'cc-...' license type; if no known license type can be derived
        but the article is flagged as open access, 'free-to-read' is
        recorded instead. If neither applies, no license is added.

        :param record: record object exposing provider_doi and
        add_license_object().

        :return: the tuple (self._short_name, self.__version__), or None
        when the API response could not be parsed as XML.
        """
        # 1. get DOI from record object
        doi = record.provider_doi  # it MUST HAVE the canonical DOI prefix, "doi:" or "DOI:"
        if not doi:
            return (self._short_name, self.__version__)

        # 2. query Elsevier XML api
        url = 'http://api.elsevier.com/content/article/' + doi
        response = requests.get(url)

        # determine the size of the request
        # (we ignore the content-length header, and just always use the number of bytes that we
        # calculate ourselves)
        source_size = len(bytes(response.content))

        response.encoding = 'utf-8'
        content = response.text
        if isinstance(content, bytes):
            content = content.decode('utf-8', 'replace')

        try:
            xml = etree.fromstring(content)
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            return None  # no point in doing anything else

        # process the XML response
        namespaces = {'elsevierapi': 'http://www.elsevier.com/xml/svapi/article/dtd'}

        # is it open access at all?
        # case insensitive search for the value "true" in the relevant element
        xpath_oa = "//elsevierapi:openaccessArticle//text()[contains(translate(., 'EURT', 'eurt'), 'true')]"
        it_is_oa = len(xml.xpath(xpath_oa, namespaces=namespaces)) > 0

        # now try to get the license too
        lic_type = None
        lic_version = None
        url_to_record = None

        xpath_license_extract = '//elsevierapi:openaccessUserLicense'
        elements = xml.xpath(xpath_license_extract, namespaces=namespaces)
        license_url = elements[0].text if len(elements) > 0 else None

        if license_url:
            urlparts = self.clean_url(license_url).split('/')
            # expected layout: creativecommons.org/<section>/<type>[/<version>]
            if urlparts[0] == 'creativecommons.org' and len(urlparts) > 2:
                candidate = 'cc-' + urlparts[2]
                # The candidate comes straight from the publisher's response,
                # so only accept it if the licenses list actually knows it.
                # An unknown or empty type would otherwise raise KeyError
                # when the license object is built below.
                if candidate in LICENSES:
                    lic_type = candidate
                    # we know what the license is, i.e. "a success",
                    # so we can use the URL *they* specified
                    url_to_record = license_url
                    if len(urlparts) > 3:
                        lic_version = urlparts[3]
                    # otherwise: we know which CC license but not which
                    # version - that's OK, just don't assert a version

        if it_is_oa and not lic_type:
            # Elsevier says the article's OA but we could not determine a
            # (known) license at all
            lic_type = 'free-to-read'

        if lic_type:
            # what this plugin has learned about the license
            meaning = {'type': lic_type}
            if lic_version:
                meaning['version'] = lic_version
            if url_to_record:
                meaning['url'] = url_to_record

            # license identified, now use that to construct the license object
            license = deepcopy(LICENSES[lic_type])
            license['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license.setdefault('version', '')
            license.setdefault('description', '')
            license.setdefault('jurisdiction', '')

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license.update(meaning)

            # add provenance information to the license object
            license['provenance'] = {
                'handler': self._short_name,
                'handler_version': self.__version__,
                'date': datetime.strftime(datetime.now(), config.date_format),
                'source': url,
                "source_size" : source_size,
                'agent': config.agent,
                'category': 'xml_api',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': 'License decided by querying the Elsevier XML API at ' + url
            }

            record.add_license_object(license)

        return (self._short_name, self.__version__)
Ejemplo n.º 6
0
    def license_detect(self, record):
        """
        To respond to the provider identifier: http://elife.elifesciences.org

        Determine the licence conditions of the eLife article by querying
        the eLife XML API with the record's DOI, and add every license
        found to the record via record.add_license_object().

        :param record: record object exposing doi_without_prefix and
        add_license_object().

        :return: the tuple (self._short_name, self.__version__), or None
        when the API response could not be parsed as XML.
        """

        # List of licensing statements to look for on this publisher's pages.
        # In eLife's case they take the form of {xpath string: meaning object}
        # since we're not scraping HTML, we're using an XML API.
        # meaning['type'] identifies the license (see licenses.py)
        # and meaning['version'] identifies the license version (if available)
        elife_license_mappings = self._license_mappings

        # 1. get DOI from record object
        doi = record.doi_without_prefix  # it MUST NOT HAVE the canonical DOI prefix, "doi:" or "DOI:"
        if not doi:
            return (self._short_name, self.__version__)

        # 2. query elife XML api
        url = 'http://elife.elifesciences.org/elife-source-xml/' + doi
        response = requests.get(url)

        # determine the size of the request
        # (we ignore the content-length header, and just always use the number of bytes that we
        # calculate ourselves)
        raw = response.content
        source_size = len(bytes(raw))

        try:
            # Parse the raw response bytes. response.text is already decoded
            # text, so calling .decode() on it forces an implicit ASCII
            # re-encode that fails on any non-ASCII character (the "ignore"
            # error handler does not apply to that step); the XML parser can
            # work out the encoding from the bytes itself.
            xml = etree.fromstring(raw)
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            return None  # no point in doing anything else

        # process the XML response
        namespaces = {'xlink': 'http://www.w3.org/1999/xlink'}

        for mapping in elife_license_mappings:
            # each mapping is expected to be a single-entry dict: {xpath: meaning}
            xpath = list(mapping)[0]
            meaning = mapping[xpath]

            if len(xml.xpath(xpath, namespaces=namespaces)) == 0:
                continue

            lic_type = meaning['type']

            # license identified, now use that to construct the license object
            license = deepcopy(LICENSES[lic_type])
            license['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license.setdefault('version', '')
            license.setdefault('description', '')
            license.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license.update(meaning)

            # add provenance information to the license object
            license['provenance'] = {
                'handler': self._short_name,
                'handler_version': self.__version__,
                'date': datetime.strftime(datetime.now(), config.date_format),
                'source': url,
                "source_size" : source_size,
                'agent': config.agent,
                'category': 'xml_api',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': 'License decided by querying the eLife XML API at ' + url
            }

            record.add_license_object(license)

        return (self._short_name, self.__version__)