def dcmsRecordToHtml(record):
    """
    Converts a DataCite Metadata Scheme <http://schema.datacite.org/>
    record to an XHTML table.

    'record' is the record as an unencoded string.  Returns the
    serialized table, or None if anything goes wrong: the record
    cannot be parsed, the stylesheet transform fails, or the
    transform output does not start with a <table> element.
    """
    try:
        r = lxml.etree.tostring(
            _stylesheet(util.parseXmlString(record)), encoding=unicode)
        # An explicit test rather than an assert, so that the check is
        # still performed when Python runs with -O.
        if not r.startswith("<table"):
            return None
        return r
    except Exception:
        # Deliberately best-effort: every ordinary failure maps to None.
        # 'Exception' (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
def kmlPolygonToDatacite(kml):
    """
    Converts a polygon defined in a KML
    <http://www.opengeospatial.org/standards/kml> version 2.2 or 2.3
    document to a DataCite 4.0 <geoLocationPolygon> element.

    'kml' is the KML document; it is handed directly to
    util.parseXmlString.

    The return is a pair (lxml.etree.Element, [warning, ...]) if
    successful or a string error message if not.  The conversion fails
    for the usual reasons (malformed KML, etc.) but also if the
    document defines more than one geometry or does not define a
    polygon.  Polygon holes and non-zero altitude coordinates are
    ignored and result in warnings.
    """
    # A document that cannot be parsed is reported to the caller as an
    # error string rather than as a raised exception.
    try:
        root = util.parseXmlString(kml)
    except Exception, e:
        return "XML parse error: " + util.formatException(e)
    # NOTE(review): only the parsing step is visible in this excerpt;
    # the geometry extraction and the successful (element, warnings)
    # return described in the docstring are not shown here -- confirm
    # against the complete function.
def crossrefToDatacite(record, overrides=None):
    """
    Converts a Crossref Deposit Schema
    <http://help.crossref.org/deposit_schema> document to a DataCite
    Metadata Scheme <http://schema.datacite.org/> record.

    'record' is the Crossref document as a string.  'overrides' is an
    optional dictionary of individual metadata element names (e.g.,
    "datacite.title") and values that override the conversion values
    that would normally be drawn from the input document; omitting it
    (or passing None) means no overrides.

    Returns the serialized DataCite record.  Throws an exception on
    error.
    """
    # None stands in for "no overrides" so that no mutable object is
    # shared between calls as a default argument.
    if overrides is None:
        overrides = {}
    # Values are passed to the transform as XSLT string parameters, so
    # each one must be wrapped (and thereby quoted) by strparam.
    params = {k: lxml.etree.XSLT.strparam(v) for k, v in overrides.items()}
    return lxml.etree.tostring(
        _crossrefTransform(util.parseXmlString(record), **params),
        encoding=unicode)
def _dataciteFirstText(root, path, ns):
    # Returns the text (per _text) of the first node matching 'path',
    # or None if nothing matches.
    nodes = root.xpath(path, namespaces=ns)
    if len(nodes) > 0:
        return _text(nodes[0])
    return None


def _parseDataciteRecord(record):
    # Builds a KernelMetadata object from a DataCite XML record string.
    # Raises an exception if the record cannot be parsed or its root
    # element is not recognized.
    root = util.parseXmlString(record)
    m = _rootTagRE.match(root.tag)
    if m is None:
        raise ValueError("unrecognized DataCite root element")
    ns = {"N": m.group(1)}
    # Concatenate all creators.
    names = [
        _text(n) for n in root.xpath(
            "N:creators/N:creator/N:creatorName", namespaces=ns)]
    creator = " ; ".join(t for t in names if t is not None)
    if creator == "":
        creator = None
    # Take the first title only.
    title = _dataciteFirstText(root, "N:titles/N:title", ns)
    publisher = _dataciteFirstText(root, "N:publisher", ns)
    date = _dataciteFirstText(root, "N:publicationYear", ns)
    # The resource type is "general" or "general/specific"; without a
    # non-empty resourceTypeGeneral attribute there is no type at all.
    resourceType = None
    nodes = root.xpath("N:resourceType", namespaces=ns)
    if len(nodes) > 0:
        general = nodes[0].attrib.get("resourceTypeGeneral", "").strip()
        if general != "":
            resourceType = general
            specific = _text(nodes[0])
            if specific is not None:
                resourceType += "/" + specific
    return KernelMetadata(creator, title, publisher, date, resourceType)


def _mapDatacite(metadata):
    """
    Maps DataCite metadata to a KernelMetadata object.

    If 'metadata' holds a non-empty "datacite" value (as retrieved by
    _get), that value is parsed as a DataCite XML record and the
    creator(s), first title, publisher, publication year and resource
    type are drawn from it.  If there is no such value, or if the
    record cannot be processed for any reason, the result of
    _mapDataciteItemized(metadata) is returned instead.
    """
    record = _get(metadata, "datacite")
    if not record:
        return _mapDataciteItemized(metadata)
    try:
        return _parseDataciteRecord(record)
    except Exception:
        # Best effort: any problem with the XML record falls back to
        # the itemized elements.  'Exception' (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return _mapDataciteItemized(metadata)
def upgradeDcmsRecord(record, parseString=True, returnString=True):
    """
    Converts a DataCite Metadata Scheme <http://schema.datacite.org/>
    record to the latest version of the schema (currently, version 4).

    'record' is an unencoded Unicode string if 'parseString' is true,
    or a root lxml.etree.Element object if not; in the latter case the
    supplied tree is modified in place.  If 'returnString' is true, the
    record is returned as an unencoded Unicode string, in which case
    the record has no XML declaration.  Otherwise, an
    lxml.etree.Element object is returned.  In both cases, the root
    element's xsi:schemaLocation attribute is set or added as
    necessary.

    Raises AssertionError if the record contains more than one
    resourceType element.
    """
    if parseString:
        root = util.parseXmlString(record)
    else:
        root = record
    root.attrib[
        "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = (
        "http://datacite.org/schema/kernel-4 " +
        "http://schema.datacite.org/meta/kernel-4/metadata.xsd")

    def result():
        if returnString:
            return lxml.etree.tostring(root, encoding=unicode)
        return root

    m = _schemaVersionRE.match(root.tag)
    if m.group(1) == "4":
        # Nothing to do.
        return result()

    def q(elementName):
        return "{http://datacite.org/schema/kernel-4}" + elementName

    def changeNamespace(node):
        # Comments and processing instructions have non-string tags and
        # carry no namespace, so only real elements are renamed.
        if isinstance(node.tag, basestring):
            # The order is important here: parent before children.
            node.tag = q(node.tag.split("}")[1])
            for child in node:
                changeNamespace(child)

    changeNamespace(root)
    ns = {"N": "http://datacite.org/schema/kernel-4"}

    # Resource type is required as of version 4.
    e = root.xpath("//N:resourceType", namespaces=ns)
    # Raised explicitly (rather than asserted) so that the check is not
    # stripped under -O, which would let a default resourceType be
    # appended to a record that already has several.
    if len(e) > 1:
        raise AssertionError(
            "record contains more than one resourceType element")
    if len(e) == 1:
        if e[0].attrib["resourceTypeGeneral"] == "Film":
            e[0].attrib["resourceTypeGeneral"] = "Audiovisual"
    else:
        e = lxml.etree.SubElement(root, q("resourceType"))
        e.attrib["resourceTypeGeneral"] = "Other"
        e.text = "(:unav)"

    # There's no way to assign new types to start and end dates, so just
    # delete them.
    for e in root.xpath("//N:date", namespaces=ns):
        if e.attrib["dateType"] in ["StartDate", "EndDate"]:
            e.getparent().remove(e)
    for e in root.xpath("//N:dates", namespaces=ns):
        if len(e) == 0:
            e.getparent().remove(e)

    # The contributor type "Funder" went away in version 4.
    for e in root.xpath(
            "//N:contributor[@contributorType='Funder']", namespaces=ns):
        fr = root.xpath("//N:fundingReferences", namespaces=ns)
        if len(fr) > 0:
            fr = fr[0]
        else:
            fr = lxml.etree.SubElement(root, q("fundingReferences"))
        for n in e.xpath("N:contributorName", namespaces=ns):
            lxml.etree.SubElement(
                lxml.etree.SubElement(fr, q("fundingReference")),
                q("funderName")).text = n.text
        e.getparent().remove(e)
    for e in root.xpath("//N:contributors", namespaces=ns):
        if len(e) == 0:
            e.getparent().remove(e)

    # Geometry changes in version 4: whitespace-separated coordinate
    # text becomes one child element per coordinate.
    def restructureCoordinates(elementName, children):
        # 'children' lists (child element name, index into the old
        # coordinate list) in the order the children are to be created.
        for e in root.xpath("//N:" + elementName, namespaces=ns):
            if len(e) == 0:
                # An empty element has no text at all (None); treat it
                # like any other malformed value instead of crashing.
                coords = (e.text or "").split()
                if len(coords) == len(children):
                    for name, i in children:
                        lxml.etree.SubElement(e, q(name)).text = coords[i]
                    e.text = None
                else:
                    # Should never happen.
                    e.getparent().remove(e)

    # Old order: latitude longitude.
    restructureCoordinates("geoLocationPoint", [
        ("pointLongitude", 1),
        ("pointLatitude", 0)])
    # Old order: south west north east.
    restructureCoordinates("geoLocationBox", [
        ("westBoundLongitude", 1),
        ("eastBoundLongitude", 3),
        ("southBoundLatitude", 0),
        ("northBoundLatitude", 2)])

    lxml.etree.cleanup_namespaces(root)
    return result()
def dataciteXmlToFormElements(document):
    """
    Flattens a DataCite XML record into a dictionary of form elements.

    The record is first upgraded to the latest schema version.  Only
    content is retained: comments and the like are dropped, values are
    stripped of surrounding whitespace, and empty element and
    attribute values are omitted.

    Keys are built from the names along the path to each element or
    attribute.  For example, the schemeURI attribute in

      <resource>
        <creators>
          <creator>...</creator>
          <creator>
            <nameIdentifier schemeURI="...">

    is stored under the key

      creators-creator-1-nameIdentifier_0-schemeURI

    Indexing rules:

    - Children of the containers named in _repeatableElementContainers
      are numbered, with the number joined by a hyphen.
    - Children named in _numberedElementContainers (nameIdentifier in
      the example above) are numbered per element name, with the number
      joined by an underscore.
    - Any other repeated element has its values concatenated with
      " ; " (e.g., contributor affiliations).

    The content of an indexed element is keyed with the element name
    repeated at the end, as in:

      alternateIdentifiers-alternateIdentifier-0-alternateIdentifier
      alternateIdentifiers-alternateIdentifier-1-alternateIdentifier

    <br> elements in descriptions are replaced with newlines, and a
    geoLocationPolygon is stored in the form produced by
    geometry_util.datacitePolygonToInternal.

    Returns the result of passing the flattened dictionary through
    _separateByFormType.
    """
    document = datacite.upgradeDcmsRecord(document)
    elements = {}

    def localName(tag):
        # Strips the "{namespace}" prefix from a qualified tag.
        return tag.split("}")[1]

    def elementChildren(node):
        # Element children only; comments etc. are skipped.
        return list(node.iterchildren(lxml.etree.Element))

    def directText(node):
        # The node's own text plus the tails of all of its children.
        pieces = [node.text or ""]
        pieces.extend(child.tail or "" for child in node.iterchildren())
        return "".join(pieces)

    def visit(parentPath, node, position=None, joiner="-"):
        name = localName(node.tag)
        if parentPath == "":
            nodePath = name
        else:
            nodePath = "%s-%s" % (parentPath, name)
        # Content of an indexed element is keyed with the element name
        # echoed after the index.
        contentKey = nodePath
        if position is not None:
            nodePath += "%s%d" % (joiner, position)
            contentKey = "%s-%s" % (nodePath, name)

        for attrName, attrValue in node.attrib.items():
            attrValue = attrValue.strip()
            if attrValue != "":
                elements["%s-%s" % (nodePath, attrName)] = attrValue

        if name in _repeatableElementContainers:
            for i, child in enumerate(elementChildren(node)):
                visit(nodePath, child, i)
            return

        if name in _numberedElementContainers:
            # One independent counter per numbered child element name.
            counters = dict.fromkeys(_numberedElementContainers[name], -1)
            for child in elementChildren(node):
                childName = localName(child.tag)
                if childName in counters:
                    counters[childName] += 1
                    visit(nodePath, child, counters[childName], joiner="_")
                else:
                    visit(nodePath, child)
            return

        if name == "description":
            # The only mixed-content element type in the schema; <br>'s
            # get replaced with newlines.
            parts = [node.text or ""]
            for child in node.iterchildren():
                if (isinstance(child.tag, basestring) and
                        localName(child.tag) == "br"):
                    parts.append("\n")
                parts.append(child.tail or "")
            text = "".join(parts).strip()
            if text != "":
                elements[contentKey] = text
            return

        if name == "geoLocationPolygon":
            elements[contentKey] = \
                geometry_util.datacitePolygonToInternal(node)
            return

        children = elementChildren(node)
        if len(children) > 0:
            for child in children:
                visit(nodePath, child)
            return

        text = directText(node).strip()
        if text == "":
            return
        if contentKey in elements:
            # Repeatable elements not explicitly handled have their
            # content concatenated.
            elements[contentKey] += " ; " + text
        else:
            elements[contentKey] = text

    root = util.parseXmlString(document)
    for child in elementChildren(root):
        visit("", child)
    return _separateByFormType(elements)