def update(self, path):
    """Populate this record from an example-namespace XML file.

    Sets ``self.id`` (prefixed ``oai:example-``), ``self.modified``,
    ``self.deleted``, ``self.metadata`` (Dublin-Core-style lists) and
    ``self.sets`` from the document at *path*.  An optional
    ``//x:access`` value of ``public``/``private`` adds the matching
    set and a ``rights`` statement.
    """
    doc = etree.parse(path)
    xpath = XPath(doc, nsmap={'x': 'http://example.org/data'})
    self.root = doc.getroot()

    # Renamed from `id` so the builtin is not shadowed.
    record_id = xpath.string('//x:id')
    self.id = 'oai:example-%s' % record_id
    self.modified = xpath.date('//x:modified')
    self.deleted = False

    author_data = []
    # XPath positional predicates are 1-based, hence enumerate(..., 1).
    # The element itself is unused; only its position drives the queries.
    for num, _author in enumerate(xpath('//x:author'), 1):
        first = xpath.string('//x:author[%d]/x:givenName' % num)
        sur = xpath.string('//x:author[%d]/x:familyName' % num)
        name = u'%s %s' % (first, sur)
        author_data.append({'name': [name],
                            'surname': [sur],
                            'firstname': [first],
                            'role': [u'aut']})

    self.metadata = {
        'identifier': [u'http://example.org/data/%s' % record_id],
        'title': [xpath.string('//x:title')],
        'subject': xpath.strings('//x:subject'),
        'description': [xpath.string('//x:abstract')],
        'creator': [d['name'][0] for d in author_data],
        'author_data': author_data,
        'language': [u'en'],
        'date': [xpath.string('//x:issued')]
    }
    self.sets = {
        u'example': {
            u'name': u'example',
            u'description': u'An Example Set'
        }
    }

    access = xpath.string('//x:access')
    if access == 'public':
        self.sets[u'public'] = {
            u'name': u'public',
            u'description': u'Public access'
        }
        self.metadata['rights'] = [u'open access']
    elif access == 'private':
        self.sets[u'private'] = {
            u'name': u'private',
            u'description': u'Private access'
        }
        self.metadata['rights'] = [u'restricted access']
def update(self, path):
    """Load record state from the example XML document at *path*.

    Fills ``self.id``, ``self.modified``, ``self.deleted``,
    ``self.metadata`` and ``self.sets``; a ``//x:access`` value of
    ``public`` or ``private`` contributes an extra set and a
    ``rights`` entry.
    """
    doc = etree.parse(path)
    xpath = XPath(doc, nsmap={'x': 'http://example.org/data'})
    self.root = doc.getroot()

    # `identifier`, not `id`: avoid shadowing the builtin.
    identifier = xpath.string('//x:id')
    self.id = 'oai:example-%s' % identifier
    self.modified = xpath.date('//x:modified')
    self.deleted = False

    # Authors are addressed positionally; XPath positions start at 1.
    author_count = len(xpath('//x:author'))
    author_data = []
    for num in range(1, author_count + 1):
        first = xpath.string('//x:author[%d]/x:givenName' % num)
        sur = xpath.string('//x:author[%d]/x:familyName' % num)
        author_data.append({
            'name': [u'%s %s' % (first, sur)],
            'surname': [sur],
            'firstname': [first],
            'role': [u'aut'],
        })

    self.metadata = {
        'identifier': [u'http://example.org/data/%s' % identifier],
        'title': [xpath.string('//x:title')],
        'subject': xpath.strings('//x:subject'),
        'description': [xpath.string('//x:abstract')],
        'creator': [d['name'][0] for d in author_data],
        'author_data': author_data,
        'language': [u'en'],
        'date': [xpath.string('//x:issued')],
    }
    self.sets = {
        u'example': {u'name': u'example',
                     u'description': u'An Example Set'},
    }

    access = xpath.string('//x:access')
    if access == 'public':
        self.sets[u'public'] = {u'name': u'public',
                                u'description': u'Public access'}
        self.metadata['rights'] = [u'open access']
    elif access == 'private':
        self.sets[u'private'] = {u'name': u'private',
                                 u'description': u'Private access'}
        self.metadata['rights'] = [u'restricted access']
def update(self, path):
    """Populate this record from a Yoda metadata XML file at *path*.

    The parsed tree is first stripped of namespaces so every XPath
    query below can be written namespace-free.  Extracts the DOI,
    modification time, creators/contributors, funding references,
    related datapackages, DataCite-specific fields, dates, rights,
    subjects and geographic/temporal coverage into ``self.metadata``.

    Logs a warning and returns early when the file cannot be parsed or
    when the DOI is missing.  Assumes ``self.metadata`` is already a
    dict: individual keys are updated, the dict is not rebuilt.
    """
    try:
        tree = etree.parse(path)
        # Select all element nodes that carry a namespace.
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in tree.xpath(query):
            # Replace the element name with its local name, i.e. strip
            # the namespace so later queries need no prefixes.
            element.tag = etree.QName(element).localname
    # NOTE(review): confirm `etree` exposes ParseError here; lxml's
    # primary parse exception is XMLSyntaxError.
    except etree.ParseError:
        log = get_moai_log()
        # BUG FIX: the original passed a %-style placeholder to
        # str.format() ("%s".format(path)) and logged a literal "%s".
        log.warning("Failed to parse %s" % path)
        return

    xpath = XPath(tree, nsmap={})

    # DOI (local renamed from `id` to avoid shadowing the builtin).
    doi = xpath.string(
        "/metadata/System/Persistent_Identifier_Datapackage"
        "[Identifier_Scheme='DOI']/Identifier"
    )
    if not doi:
        log = get_moai_log()
        log.warning(
            "Missing Persistent Identifier (DOI) of Datapackage in " + path)
        return

    self.id = 'oai:%s' % doi
    self.metadata['identifier'] = [doi]

    # Last modified
    last_modified = xpath.string('//Last_Modified_Date')
    if not last_modified:
        log = get_moai_log()
        # BUG FIX: same %s/.format() mix-up as above.
        log.warning("Missing Last Modified Time in %s" % path)
        # Fall back to "yesterday" so the record still counts as
        # recently modified.
        self.modified = datetime.now() - timedelta(days=1)
    else:
        # Parse the date/time part, then fold a numeric UTC offset into
        # the naive datetime by hand.
        ret = datetime.strptime(last_modified[0:19], '%Y-%m-%dT%H:%M:%S')
        if len(last_modified) > 19:
            # NOTE(review): assumes an offset of the form +HHMM/-HHMM
            # (no colon); "+HH:MM" would make int() raise -- confirm
            # the producer's timestamp format.
            if last_modified[19] == '+':
                ret -= timedelta(hours=int(last_modified[20:22]),
                                 minutes=int(last_modified[22:]))
            elif last_modified[19] == '-':
                ret += timedelta(hours=int(last_modified[20:22]),
                                 minutes=int(last_modified[22:]))
        self.modified = ret

    # Creators and contributors
    author_data = []
    creators = xpath.strings('//Creator/Name')
    if creators:
        self.metadata['creator'] = creators
        for creator in creators:
            author_data.append({u"name": creator, u"role": [u"auth"]})

    contributors = xpath.strings('//Contributor/Name')
    if contributors:
        self.metadata['contributor'] = contributors
        for contributor in contributors:
            author_data.append({u"name": contributor, u"role": [u"cont"]})

    self.metadata["author_data"] = author_data

    # Funding references
    fundingRefs = []
    funders = xpath('//Funding_Reference')
    if len(funders):
        for funder in funders:
            funderDict = {}
            # find() results hoisted into locals so each element is
            # looked up once instead of twice.
            name_el = funder.find('Funder_Name')
            if name_el is not None:
                funderDict["name"] = name_el.text
            award_el = funder.find('Properties/Award_Number')
            if award_el is not None:
                funderDict["awardNumber"] = award_el.text
            fundingRefs.append(funderDict)
    self.metadata["fundingReferences"] = fundingRefs

    # Related datapackages, i.e. related identifiers
    relatedIdentifiers = []
    packages = xpath('//Related_Datapackage')
    if len(packages):
        for package in packages:
            relatedDict = {}
            title_el = package.find('Properties/Title')
            if title_el is not None:
                relatedDict["title"] = title_el.text
            scheme_el = package.find(
                'Properties/Persistent_Identifier/Identifier_Scheme')
            if scheme_el is not None:
                relatedDict["relatedIdentifierScheme"] = scheme_el.text
            ident_el = package.find(
                'Properties/Persistent_Identifier/Identifier')
            if ident_el is not None:
                relatedDict["relatedIdentifier"] = ident_el.text
            relation_el = package.find('Relation_Type')
            if relation_el is not None:
                relatedDict["relationType"] = relation_el.text
            relatedIdentifiers.append(relatedDict)
    self.metadata["relatedIdentifiers"] = relatedIdentifiers

    # Contributors datacite - yoda contributor can hold n idf/idf_schemes.
    # Does datacite?
    dataciteContributors = []
    dcContributors = xpath('//Contributor')
    if len(dcContributors):
        for contrib in dcContributors:
            contribDict = {}
            name_el = contrib.find('Name')
            if name_el is not None:
                contribDict["name"] = name_el.text
            type_el = contrib.find('Properties/Contributor_Type')
            if type_el is not None:
                contribDict["type"] = type_el.text
            properties = contrib.find('Properties')
            if properties is not None:
                affiliations = []
                personIdentifiers = []
                for child in properties:
                    if child.tag == 'Affiliation':
                        affiliations.append(child.text)
                    elif child.tag == 'Person_Identifier':
                        nameIdentifier = ''
                        nameIdentifierScheme = ''
                        # list(child) replaces the deprecated
                        # getchildren().
                        for piChild in list(child):
                            if piChild.tag == 'Name_Identifier':
                                nameIdentifier = piChild.text
                            elif piChild.tag == 'Name_Identifier_Scheme':
                                nameIdentifierScheme = piChild.text
                        personIdentifiers.append(
                            {nameIdentifierScheme: nameIdentifier})
                contribDict['affiliation'] = affiliations
                contribDict['name_identifiers'] = personIdentifiers
            dataciteContributors.append(contribDict)
    self.metadata['dataciteContributors'] = dataciteContributors

    # Creators datacite - yoda creators can hold n idf/idf_schemes.
    # Does datacite?
    dataciteCreators = []
    dcCreators = xpath('//Creator')
    if len(dcCreators):
        for creator in dcCreators:
            creatorDict = {}
            name_el = creator.find('Name')
            if name_el is not None:
                creatorDict['name'] = name_el.text
            properties = creator.find('Properties')
            if properties is not None:
                affiliations = []
                personIdentifiers = []
                for child in properties:
                    if child.tag == 'Affiliation':
                        affiliations.append(child.text)
                    elif child.tag == 'Person_Identifier':
                        nameIdentifier = ''
                        nameIdentifierScheme = ''
                        for piChild in list(child):
                            if piChild.tag == 'Name_Identifier':
                                nameIdentifier = piChild.text
                            elif piChild.tag == 'Name_Identifier_Scheme':
                                nameIdentifierScheme = piChild.text
                        personIdentifiers.append(
                            {nameIdentifierScheme: nameIdentifier})
                creatorDict['affiliation'] = affiliations
                creatorDict['name_identifiers'] = personIdentifiers
            dataciteCreators.append(creatorDict)
    self.metadata['dataciteCreators'] = dataciteCreators

    title = xpath.string('//Title')
    if title:
        self.metadata['title'] = [title]

    # Add collection name
    collectionName = xpath.string('//Collection_Name')
    if collectionName:
        self.metadata['collectionName'] = [collectionName]

    description = xpath.string('//Description')
    if description:
        self.metadata['description'] = [description]

    language = xpath.string('//Language')
    if language:
        # Keep only the first two characters (presumably an ISO 639-1
        # code such as "en" from "en - English") -- TODO confirm.
        self.metadata['language'] = [language[0:2]]
    elif title:
        self.metadata['language'] = ['en']

    version = xpath.string('//Version')
    if version:
        self.metadata['version'] = version

    # Dates - handling dublin core
    datesinxml = [
        xpath.string('//Publication_Date'),
        xpath.string('//Embargo_End_Date')
    ]
    dates = [d for d in datesinxml if d]
    if dates:
        self.metadata['date'] = dates

    # Dates - handling datacite (values truncated to YYYY-MM-DD)
    dataciteDates = {}
    updated = xpath.string('//System/Last_Modified_Date')
    if updated:
        dataciteDates['Updated'] = updated[0:10]
    available = xpath.string('//Embargo_End_Date')
    if available:
        dataciteDates['Available'] = available[0:10]

    # NOTE(review): a dead, fully commented-out block that handled
    # flexible embargo dates (YYYY / YYYY-MM / YYYY-MM-DD) in the test
    # schema was removed here together with its unused `embargo`
    # variable; see version history if it needs to be revived.

    start = xpath.string('//Collected/Start_Date')
    end = xpath.string('//Collected/End_Date')
    # BUG FIX: the original tested `is not None`, which produced a
    # bogus "/" entry when XPath.string returned empty strings for
    # missing elements; require both values to be non-empty.
    if start and end:
        dataciteDates['Collected'] = start + '/' + end
    self.metadata['dataciteDates'] = dataciteDates

    # Year of publication.  Guarded: slicing a missing (None) date
    # raised TypeError in the original.
    publication_date = xpath.string('//Publication_Date')
    self.metadata['publicationYear'] = (publication_date or '')[0:4]

    # Rights.
    # License_URL is used here.  This is actually wrong -> must be
    # License_URI.  Left unchanged, as the consequences of this data
    # suddenly being present cannot be overseen without proper testing.
    # For datacite License_URI is required; therefore it is added as an
    # extra key/value pair below.
    rightsinxml = [
        xpath.string('//License'),
        xpath.string('//System/License_URL')
    ]
    rights = [r for r in rightsinxml if r]
    if rights:
        self.metadata['rights'] = rights

    # License URL - specifically for datacite
    rightsLicenseURI = xpath.string('//System/License_URI')
    if rightsLicenseURI:
        self.metadata['rightsLicenseURL'] = rightsLicenseURI

    accessRestriction = xpath.string('//Data_Access_Restriction')
    if accessRestriction:
        if accessRestriction.startswith('Open'):
            self.metadata['accessRights'] = 'Open Access'
            self.metadata['accessRightsURI'] = \
                'info:eu-repo/semantics/openAccess'
        elif accessRestriction.startswith('Restricted'):
            self.metadata['accessRights'] = 'Restricted Access'
            self.metadata['accessRightsURI'] = \
                'info:eu-repo/semantics/restrictedAccess'
        elif accessRestriction.startswith('Closed'):
            self.metadata['accessRights'] = 'Closed Access'
            self.metadata['accessRightsURI'] = \
                'info:eu-repo/semantics/closedAccess'

    subjectinxml = xpath.strings('//Discipline') + xpath.strings('//Tag')
    subject = [s for s in subjectinxml if s]
    if subject:
        self.metadata['subject'] = subject

    # Datacite will handle tags and disciplines differently - both will
    # fall under Subjects
    self.metadata['dataciteDisciplines'] = xpath.strings('//Discipline')
    self.metadata['dataciteTags'] = xpath.strings('//Tag')

    locations = xpath.strings('//Covered_Geolocation_Place')
    # Extra field, as there's a conflict with the coverage field below.
    self.metadata['dataciteLocations'] = locations

    geoLocation = xpath.strings('//geoLocation')
    westBoundLongitudes = xpath.strings('//geoLocation/westBoundLongitude')
    eastBoundLongitudes = xpath.strings('//geoLocation/eastBoundLongitude')
    southBoundLatitudes = xpath.strings('//geoLocation/southBoundLatitude')
    northBoundLatitudes = xpath.strings('//geoLocation/northBoundLatitude')

    # Bounding box: left,bottom,right,top
    boxes = []
    for west, south, east, north in zip(westBoundLongitudes,
                                        southBoundLatitudes,
                                        eastBoundLongitudes,
                                        northBoundLatitudes):
        boxes.append(",".join([west, south, east, north]))

    perioddates = [
        xpath.string('//Covered_Period/Start_Date'),
        xpath.string('//Covered_Period/End_Date')
    ]
    period = "/".join([d for d in perioddates if d])

    if period and geoLocation:
        coverage = locations + [period] + boxes
    elif geoLocation:
        coverage = locations + boxes
    elif period:
        coverage = locations + [period]
    else:
        coverage = locations
    if coverage:
        self.metadata['coverage'] = coverage

    # dataType now fed by yoda-metadata.xml instead of being hardcoded
    # as 'Dataset' in datacite.py
    dataType = xpath.string('//Data_Type')
    if dataType:
        self.metadata['dataType'] = dataType
def test_namespaces(self):
    """A prefix declared in nsmap resolves queries against a document
    whose elements live in a default namespace."""
    xml = '<doc xmlns="urn:spam"><string>Spam!</string></doc>'
    tree = etree.fromstring(xml)
    finder = XPath(tree, nsmap={'spam': 'urn:spam'})
    result = finder.string('//spam:string')
    self.assertEqual(result, 'Spam!')
def test_identify(self):
    """The Identify verb reports the configured repository name."""
    response = urllib.request.urlopen('http://test?verb=Identify')
    body = response.read()
    tree = etree.fromstring(body)
    ns = {"oai": "http://www.openarchives.org/OAI/2.0/"}
    finder = XPath(tree, nsmap=ns)
    self.assertEqual(finder.string('//oai:repositoryName'), 'Test Server')
def test_namespaces(self):
    """A prefix declared in nsmap resolves queries against a document
    in a default namespace."""
    doc = etree.fromstring(
        '<doc xmlns="urn:spam"><string>Spam!</string></doc>')
    xpath = XPath(doc, nsmap={'spam': 'urn:spam'})
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(xpath.string('//spam:string'), u'Spam!')
def test_identify(self):
    """The Identify verb reports the configured repository name."""
    xml = urllib2.urlopen('http://test?verb=Identify').read()
    doc = etree.fromstring(xml)
    xpath = XPath(
        doc, nsmap={"oai": "http://www.openarchives.org/OAI/2.0/"})
    # assertEquals is a deprecated alias; assertEqual is the supported
    # name on both Python 2.7 and 3.
    self.assertEqual(xpath.string('//oai:repositoryName'), u'Test Server')
    # NOTE(review): urllib2 is Python 2 only; migrate to urllib.request
    # when this module moves to Python 3 (a ported twin of this test
    # already exists elsewhere in the project).