def test_oa_policy_select_function(self):
    """
    Check that oa_policy.oa_for_license behaves as a lookup wrapper.

    oa_for_license is essentially a convenience wrapper around
    oa_from_rights which just selects the appropriate license from the
    licenses module. The correctness of the rights definitions is not
    tested here (that is up to plugin content tests); only the wrapper's
    lookup behaviour is.
    """
    # A license which should exist and should be OA. The returned value is
    # irrelevant - this only checks that the lookup raises no exception.
    oa_policy.oa_for_license('cc-by')

    # An unknown license identifier must surface as a KeyError.
    with self.assertRaises(KeyError):
        oa_policy.oa_for_license('should_not_exist')
def license_detect(self, record):
    """
    To respond to the provider identifier: http://elife.elifesciences.org

    Determine the licence conditions of an eLife article by querying the
    eLife XML API with the record's DOI, and append one license object to
    record['bibjson']['license'] (note the US spelling) for every license
    mapping whose XPath expression matches the returned document.

    :param record: a request for the OAG status of an article. The DOI is
        read from record['provider']['doi']; detected licenses are appended
        to record['bibjson']['license'].
    :return: None. The record is left untouched when it carries no DOI or
        when the API response cannot be parsed as XML.
    """
    # List of licensing statements to look for on this publisher's pages.
    # In eLife's case they take the form of {xpath string: meaning object}
    # since we're not scraping HTML, we're using an XML API.
    # meaning['type'] identifies the license (see licenses.py)
    # and meaning['version'] identifies the license version (if available)
    elife_license_mappings = [
        {
            '//license[@xlink:href="http://creativecommons.org/licenses/by/3.0/" and @license-type="open-access"]': {
                'type': 'cc-by',
                'version': '3.0',
                # also declare some properties which override info about
                # this license in the licenses list (see licenses module)
                'url': 'http://creativecommons.org/licenses/by/3.0/',
            }
        }
    ]

    # 1. get DOI from record object
    doi = record['provider'].get('doi')
    if not doi:
        return

    # 2. query elife XML api
    url = 'http://elife.elifesciences.org/elife-source-xml/' + doi
    response = requests.get(url)

    try:
        # Parse the raw bytes: calling .decode() on the already-decoded
        # response.text fails for non-ASCII documents, and lxml rejects
        # unicode input that carries an XML encoding declaration.
        xml = etree.fromstring(response.content)
    except Exception as e:
        log.error("Error parsing the XML from " + url)
        log.error(e)
        # Without a parsed document there is nothing to inspect; carrying
        # on would only fail on the unbound `xml` name.
        return

    # 3. process the XML response
    namespaces = {'xlink': 'http://www.w3.org/1999/xlink'}

    for mapping in elife_license_mappings:
        # every mapping is a single-entry dict: {xpath: meaning}
        xpath = next(iter(mapping))
        meaning = mapping[xpath]

        if not xml.xpath(xpath, namespaces=namespaces):
            continue

        lic_type = meaning['type']

        # license identified, now use that to construct the license object
        license_obj = deepcopy(LICENSES[lic_type])
        license_obj['open_access'] = oa_policy.oa_for_license(lic_type)

        # set some defaults which have to be there, even if empty
        license_obj.setdefault('version', '')
        license_obj.setdefault('description', '')
        license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

        # Copy over all information about the license from the license
        # statement mapping. In essence, transfer the knowledge of the
        # publisher plugin authors to the license object.
        # Consequence: Values coming from the publisher plugin overwrite
        # values specified in the licenses module.
        license_obj.update(meaning)

        # add provenance information to the license object
        license_obj['provenance'] = {
            'handler': self._short_name,
            'handler_version': self.__version__,
            'date': datetime.now().strftime(config.date_format),
            'source': url,
            'agent': config.agent,
            'category': 'xml_api',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
            'description': 'License decided by querying the eLife XML API at ' + url,
        }

        record['bibjson'].setdefault('license', [])
        record['bibjson']['license'].append(license_obj)
def simple_extract(self, lic_statements, record, url, first_match=False):
    """
    Generic code which looks for a particular string in a given web page (URL),
    determines the licence conditions of the article and populates
    the record['bibjson']['license'] (note the US spelling) field.

    The URL it analyses, the statements it looks for and the resulting licenses
    are passed in. This is not a plugin for a particular publisher - it just
    contains (allows re-use) the logic that any "dumb string matching" plugin
    would use.

    :param lic_statements: licensing statements to look for on this publisher's
    pages. Take the form of {statement: meaning}
    where meaning['type'] identifies the license (see licenses.py)
    and meaning['version'] identifies the license version (if available)
    See a publisher plugin for an example, e.g. bmc.py
    :param record: a request for the OAG status of an article, see OAG docs for
    more info.
    :param url: source url of the item to be fetched. This is where the HTML
    page that's going to be scraped is expected to reside.
    :param first_match: stop trying license statements if one of them is found
    at the target url. By default, this code will try out all supplied
    license statements and simply add multiple 'license' objects to the
    record it's been passed. If you want "first successful match only"
    behaviour, set this to True.
    :return: None; matching licenses are appended to
    record['bibjson']['license'].
    """
    # get content
    r = requests.get(url)
    content = self.normalise_string(r.content)

    # see if one of the licensing statements is in content
    # and populate record with appropriate license info
    for statement_mapping in lic_statements:
        # get the statement string itself - always the first key of the dict
        # mapping statements to licensing info
        statement = next(iter(statement_mapping))
        meaning = statement_mapping[statement]

        # use a modified version of the license statement for
        # comparison - one which has been subjected to the same
        # normalisation as the incoming content (whitespace,
        # lowercasing etc.)
        cmp_statement = self.normalise_string(statement)

        # An empty statement is contained in every string ('' in 'x' is
        # True), so it would be reported as a match on every page.
        if not cmp_statement:
            continue

        if cmp_statement not in content:
            continue

        # okay, statement found on the page -> get license type
        lic_type = meaning['type']

        # license identified, now use that to construct the license object
        license_obj = deepcopy(LICENSES[lic_type])
        license_obj['open_access'] = oa_policy.oa_for_license(lic_type)

        # set some defaults which have to be there, even if empty
        license_obj.setdefault('version', '')
        license_obj.setdefault('description', '')
        license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

        # Copy over all information about the license from the license
        # statement mapping. In essence, transfer the knowledge of the
        # publisher plugin authors to the license object.
        # Consequence: Values coming from the publisher plugin overwrite
        # values specified in the licenses module.
        license_obj.update(meaning)

        # add provenance information to the license object
        license_obj['provenance'] = {
            'date': datetime.now().strftime(config.date_format),
            'source': url,
            'agent': config.agent,
            'category': 'page_scrape',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
            'description': self.gen_provenance_description(url, statement),
            'handler': self._short_name,  # the name of the plugin processing this record
            'handler_version': self.__version__,  # version of the plugin processing this record
        }

        record['bibjson'].setdefault('license', [])
        record['bibjson']['license'].append(license_obj)

        if first_match:
            break
def simple_extract(self, lic_statements, record, url, first_match=False, content='', handler=''):
    """
    Generic code which looks for a particular string in a given web page (URL),
    determines the licence conditions of the article and records them on
    the passed-in record via record.add_license_object().

    The URL it analyses, the statements it looks for and the resulting licenses
    are passed in. This is not a plugin for a particular publisher - it just
    contains (allows re-use) the logic that any "dumb string matching" plugin
    would use.

    Each statement is first compared against the normalised page content as
    is; if that fails, both the statement and the content are passed through
    self.strip_html() and compared again.

    :param lic_statements: licensing statements to look for on this publisher's
    pages. Take the form of {statement: meaning}
    where meaning['type'] identifies the license (see licenses.py)
    and meaning['version'] identifies the license version (if available)
    See a publisher plugin for an example, e.g. bmc.py
    :param record: a request for the OAG status of an article, see OAG docs for
    more info.
    :param url: source url of the item to be fetched. This is where the HTML
    page that's going to be scraped is expected to reside.
    :param first_match: stop trying license statements if one of them is found
    at the target url. By default, this code will try out all supplied
    license statements and simply add multiple 'license' objects to the
    record it's been passed. If you want "first successful match only"
    behaviour, set this to True.
    :param content: page content to analyse. When empty (the default) the
    content is fetched from url.
    :param handler: plugin name to record in the provenance. Defaults to
    self._short_name when empty.
    :raises PluginException: if the content has to be fetched and the HTTP
    status of the response is not OK.
    :return: None
    """
    if not handler:
        handler = self._short_name  # can't put it in the method signature above, self is unresolved

    if not content:
        # get content from the web unless it's being passed into this method
        r, content, source_size = util.http_stream_get(url)
        if r.status_code != requests.codes.ok:
            raise PluginException(PluginException.HTTP, "could not retrieve content from " + url + " - " + str(r.status_code))
    else:
        source_size = len(content)

    content = self.normalise_string(content)
    if not content:
        return

    # Compare text with text: decode byte strings once, up front, rather
    # than on every iteration of the loop below.
    if isinstance(content, bytes):
        content = content.decode('utf-8', 'replace')

    # HTML-stripped copy of the content, computed lazily on the first miss.
    # It is kept separate from `content` so that every statement is still
    # tried against the unstripped page first.
    stripped_content = None

    # see if one of the licensing statements is in content
    # and populate record with appropriate license info
    for statement_mapping in lic_statements:
        # get the statement string itself - always the first key of the dict
        # mapping statements to licensing info
        statement = next(iter(statement_mapping))
        meaning = statement_mapping[statement]

        # use a modified version of the license statement for
        # comparison - one which has been subjected to the same
        # normalisation as the incoming content (whitespace,
        # lowercasing etc.)
        cmp_statement = self.normalise_string(statement)

        # do not try to match empty statements, will always result in a match
        if not cmp_statement:
            continue

        if isinstance(cmp_statement, bytes):
            cmp_statement = cmp_statement.decode('utf-8', 'replace')

        match = cmp_statement in content

        if not match:
            stripped_statement = self.strip_html(cmp_statement)
            # if there's nothing left of the statement after the html
            # stripping, skip it - otherwise '' in 'string' == True!
            # so lots of false positives
            if not stripped_statement:
                continue
            if stripped_content is None:
                stripped_content = self.strip_html(content)
            match = stripped_statement in stripped_content

        if not match:
            continue

        # okay, statement found on the page -> get license type
        lic_type = meaning['type']

        # license identified, now use that to construct the license object
        license_obj = deepcopy(LICENSES[lic_type])
        license_obj['open_access'] = oa_policy.oa_for_license(lic_type)

        # set some defaults which have to be there, even if empty
        license_obj.setdefault('version', '')
        license_obj.setdefault('description', '')
        license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

        # Copy over all information about the license from the license
        # statement mapping. In essence, transfer the knowledge of the
        # publisher plugin authors to the license object.
        # Consequence: Values coming from the publisher plugin overwrite
        # values specified in the licenses module.
        license_obj.update(meaning)

        # add provenance information to the license object
        license_obj['provenance'] = {
            'date': datetime.now().strftime(config.date_format),
            'source': url,
            "source_size": source_size,
            'agent': config.agent,
            'category': 'page_scrape',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
            'description': self.gen_provenance_description(url, statement),
            'handler': handler,  # the name of the plugin processing this record
            'handler_version': self.__version__,  # version of the plugin processing this record
        }

        record.add_license_object(license_obj)

        if first_match:
            break
def license_detect(self, record):
    """
    Determine the licence of an Elsevier article via the Elsevier XML API.

    The article document is fetched from
    http://api.elsevier.com/content/article/<doi>. If it names a Creative
    Commons license URL whose license type is known to the licenses module,
    that license (plus its version, when present in the URL) is recorded.
    Otherwise, if the document flags the article as open access, a
    'free-to-read' license is recorded. Licenses are stored through
    record.add_license_object().

    :param record: a request for the OAG status of an article; the DOI is
        read from record.provider_doi.
    :return: the (plugin short name, plugin version) tuple, or None when the
        API response cannot be parsed as XML.
    """
    # 1. get DOI from record object
    doi = record.provider_doi  # it MUST HAVE the canonical DOI prefix, "doi:" or "DOI:"

    if doi:
        # 2. query Elsevier XML api
        url = 'http://api.elsevier.com/content/article/' + doi
        response = requests.get(url)

        # determine the size of the request
        # (we ignore the content-length header, and just always use the
        # number of bytes that we calculate ourselves)
        source_size = len(response.content)

        response.encoding = 'utf-8'
        content = response.text
        # safeguard: make sure the parser is handed text, not a byte string
        if isinstance(content, bytes):
            content = content.decode('utf-8', 'replace')

        try:
            xml = etree.fromstring(content)
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            return None  # no point in doing anything else

        # 3. process the XML response
        namespaces = {'elsevierapi': 'http://www.elsevier.com/xml/svapi/article/dtd'}

        # is it open access at all?
        # case insensitive search for the value "true" in the relevant element
        xpath_oa = "//elsevierapi:openaccessArticle//text()[contains(translate(., 'EURT', 'eurt'), 'true')]"
        it_is_oa = len(xml.xpath(xpath_oa, namespaces=namespaces)) > 0

        # now try to get the license too
        lic_type = None
        lic_version = None
        url_to_record = None

        xpath_license_extract = '//elsevierapi:openaccessUserLicense'
        elements = xml.xpath(xpath_license_extract, namespaces=namespaces)
        license_url = elements[0].text if len(elements) > 0 else None

        if license_url:
            # NOTE(review): presumably clean_url strips the scheme so that
            # the host is the first '/'-separated part - confirm against
            # clean_url.
            urlparts = self.clean_url(license_url).split('/')
            if urlparts[0] == 'creativecommons.org' and len(urlparts) > 2:
                candidate = 'cc-' + urlparts[2]
                # Only accept license types the licenses module knows about;
                # an empty or unexpected URL segment (e.g. 'cc-') would
                # otherwise blow up with a KeyError on the lookup below.
                if candidate in LICENSES:
                    lic_type = candidate
                    # we know what the license is, i.e. "a success",
                    # so we can use the URL *they* specified
                    url_to_record = license_url
                    # we may know which CC license but not which version -
                    # that's OK, just don't assert a version then
                    if len(urlparts) > 3:
                        lic_version = urlparts[3]

        if it_is_oa and not lic_type:
            # Elsevier says the article's OA but we could not determine a
            # (usable) license at all
            lic_type = 'free-to-read'

        if lic_type:
            meaning = {'type': lic_type}
            if lic_version:
                meaning['version'] = lic_version
            if url_to_record:
                meaning['url'] = url_to_record

            # license identified, now use that to construct the license object
            license_obj = deepcopy(LICENSES[lic_type])
            license_obj['open_access'] = oa_policy.oa_for_license(lic_type)

            # set some defaults which have to be there, even if empty
            license_obj.setdefault('version', '')
            license_obj.setdefault('description', '')
            license_obj.setdefault('jurisdiction', '')

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license_obj.update(meaning)

            # add provenance information to the license object
            license_obj['provenance'] = {
                'handler': self._short_name,
                'handler_version': self.__version__,
                'date': datetime.now().strftime(config.date_format),
                'source': url,
                "source_size": source_size,
                'agent': config.agent,
                'category': 'xml_api',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': 'License decided by querying the Elsevier XML API at ' + url,
            }

            record.add_license_object(license_obj)

    # NOTE(review): assumed to sit at function level (returned with or
    # without a DOI) - the original indentation was not recoverable.
    return (self._short_name, self.__version__)
def license_detect(self, record):
    """
    To respond to the provider identifier: http://elife.elifesciences.org

    Determine the licence conditions of an eLife article by querying the
    eLife XML API with the record's DOI, and store one license object on
    the record (via record.add_license_object) for every entry of
    self._license_mappings whose XPath expression matches the returned
    document.

    :param record: a request for the OAG status of an article; the DOI is
        read from record.doi_without_prefix.
    :return: the (plugin short name, plugin version) tuple, or None when the
        API response cannot be parsed as XML.
    """
    # List of licensing statements to look for on this publisher's pages.
    # In eLife's case they take the form of {xpath string: meaning object}
    # since we're not scraping HTML, we're using an XML API.
    # meaning['type'] identifies the license (see licenses.py)
    # and meaning['version'] identifies the license version (if available)
    elife_license_mappings = self._license_mappings

    # 1. get DOI from record object
    doi = record.doi_without_prefix  # it MUST NOT HAVE the canonical DOI prefix, "doi:" or "DOI:"

    if doi:
        # 2. query elife XML api
        url = 'http://elife.elifesciences.org/elife-source-xml/' + doi
        response = requests.get(url)

        # determine the size of the request
        # (we ignore the content-length header, and just always use the
        # number of bytes that we calculate ourselves)
        source_size = len(response.content)

        try:
            # Parse the raw bytes: calling .decode() on the already-decoded
            # response.text fails for non-ASCII documents (the "ignore"
            # handler does not cover the implicit re-encoding step), and
            # lxml rejects unicode input with an XML encoding declaration.
            xml = etree.fromstring(response.content)
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            return None  # no point in doing anything else

        # 3. process the XML response
        namespaces = {'xlink': 'http://www.w3.org/1999/xlink'}

        for mapping in elife_license_mappings:
            # every mapping is a single-entry dict: {xpath: meaning}
            xpath = next(iter(mapping))
            meaning = mapping[xpath]

            if not xml.xpath(xpath, namespaces=namespaces):
                continue

            lic_type = meaning['type']

            # license identified, now use that to construct the license object
            license_obj = deepcopy(LICENSES[lic_type])
            license_obj['open_access'] = oa_policy.oa_for_license(lic_type)

            # set some defaults which have to be there, even if empty
            license_obj.setdefault('version', '')
            license_obj.setdefault('description', '')
            license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license_obj.update(meaning)

            # add provenance information to the license object
            license_obj['provenance'] = {
                'handler': self._short_name,
                'handler_version': self.__version__,
                'date': datetime.now().strftime(config.date_format),
                'source': url,
                "source_size": source_size,
                'agent': config.agent,
                'category': 'xml_api',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': 'License decided by querying the eLife XML API at ' + url,
            }

            record.add_license_object(license_obj)

    # NOTE(review): assumed to sit at function level (returned with or
    # without a DOI) - the original indentation was not recoverable.
    return (self._short_name, self.__version__)