Python parseXmlStringの例、util.parseXmlString Pythonの例

コード例 #1

0

ファイルを表示

ファイル: datacite.py プロジェクト: HEG-INCIPIT/ARKetype

def dcmsRecordToHtml(record):
    """
    Converts a DataCite Metadata Scheme <http://schema.datacite.org/>
    record to an XHTML table.  The record should be unencoded.

    The record is parsed with util.parseXmlString, passed through the
    module's _stylesheet transform, and serialized to a Unicode string.
    Returns that string, or None if any step raises an exception or if
    the serialized output does not begin with "<table".
    """
    try:
        r = lxml.etree.tostring(_stylesheet(util.parseXmlString(record)),
                                encoding=unicode)
    except Exception:
        # Best-effort conversion: parse and transform failures are
        # reported to the caller as None.  Catching Exception (rather
        # than using a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        return None
    # Explicit check instead of an assert: asserts are stripped when
    # Python runs with -O, which would let non-table output through.
    if not r.startswith("<table"):
        return None
    return r

コード例 #2

0

ファイルを表示

def kmlPolygonToDatacite(kml):
    """
    Converts a polygon defined in a KML
    <http://www.opengeospatial.org/standards/kml> version 2.2 or 2.3
    document to a DataCite 4.0 <geoLocationPolygon> element.  The return
    is a pair (lxml.etree.Element, [warning, ...]) if successful or a
    string error message if not.  The conversion fails for the usual
    reasons (malformed KML, etc.) but also if the document defines more
    than one geometry or does not define a polygon.  Polygon holes and
    non-zero altitude coordinates are ignored and result in warnings.
    """
    # NOTE(review): only the parsing step is present in this excerpt; the
    # conversion described in the docstring above is not shown here.
    try:
        root = util.parseXmlString(kml)
    except Exception as e:
        # "except ... as" is accepted by Python 2.6+ and Python 3, unlike
        # the comma form.  A parse failure is reported as an error string
        # rather than raised.
        return "XML parse error: " + util.formatException(e)

コード例 #3

0

ファイルを表示

ファイル: datacite.py プロジェクト: HEG-INCIPIT/ARKetype

def crossrefToDatacite(record, overrides=None):
    """
    Converts a Crossref Deposit Schema
    <http://help.crossref.org/deposit_schema> document to a DataCite
    Metadata Scheme <http://schema.datacite.org/> record.

    'record' is parsed with util.parseXmlString.  'overrides' is an
    optional dictionary of individual metadata element names (e.g.,
    "datacite.title") and values that override the conversion values
    that would normally be drawn from the input document; each value is
    passed to the transform as an XSLT string parameter.  Omitting it,
    or passing None or an empty dictionary, applies no overrides.

    Returns the transformed record serialized as a Unicode string.
    Throws an exception on error.
    """
    # A None default avoids sharing one mutable dictionary object across
    # calls.
    params = {}
    if overrides:
        for name, value in overrides.items():
            # strparam quotes the value so the transform treats it as a
            # literal string rather than as an XPath expression.
            params[name] = lxml.etree.XSLT.strparam(value)
    transformed = _crossrefTransform(util.parseXmlString(record), **params)
    return lxml.etree.tostring(transformed, encoding=unicode)

コード例 #4

0

ファイルを表示

def _mapDatacite(metadata):
    """
    Maps DataCite metadata to a KernelMetadata object.

    If _get(metadata, "datacite") yields a true value, that value is
    parsed as an XML record and the kernel fields are drawn from it:

      creator    all creators/creator/creatorName texts joined by " ; "
                 (None if there are none)
      title      the first titles/title only
      publisher  the first publisher
      date       the first publicationYear
      type       the stripped resourceTypeGeneral attribute of the first
                 resourceType, followed by "/" and the element text when
                 _text returns one (None if the attribute is missing or
                 blank)

    Element lookups use the namespace captured by _rootTagRE from the
    root tag.  If there is no such record, if the root tag is not
    recognized, or if anything goes wrong while processing the record,
    the result of _mapDataciteItemized(metadata) is returned instead.
    """
    record = _get(metadata, "datacite")
    if not record:
        return _mapDataciteItemized(metadata)
    try:
        root = util.parseXmlString(record)
        m = _rootTagRE.match(root.tag)
        if m is None:
            # Unrecognized root element: fall back to itemized metadata.
            return _mapDataciteItemized(metadata)
        ns = {"N": m.group(1)}

        def firstText(path):
            # Text of the first node matching 'path', or None if no match.
            nodes = root.xpath(path, namespaces=ns)
            if len(nodes) > 0:
                return _text(nodes[0])
            return None

        # Concatenate all creators.
        names = [
            _text(n)
            for n in root.xpath("N:creators/N:creator/N:creatorName",
                                namespaces=ns)
        ]
        creator = " ; ".join(name for name in names if name is not None)
        if creator == "":
            creator = None
        # Take the first title only.
        title = firstText("N:titles/N:title")
        publisher = firstText("N:publisher")
        date = firstText("N:publicationYear")
        # Named resourceType to avoid shadowing the builtin 'type'.
        resourceType = None
        l = root.xpath("N:resourceType", namespaces=ns)
        if len(l) > 0:
            general = l[0].attrib.get("resourceTypeGeneral", "").strip()
            if general != "":
                resourceType = general
                specific = _text(l[0])
                if specific is not None:
                    resourceType += "/" + specific
        return KernelMetadata(creator, title, publisher, date, resourceType)
    except Exception:
        # Any failure while reading the record falls back to the itemized
        # mapping.  Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return _mapDataciteItemized(metadata)

コード例 #5

0

ファイルを表示

ファイル: datacite.py プロジェクト: HEG-INCIPIT/ARKetype

def upgradeDcmsRecord(record, parseString=True, returnString=True):
    """
    Converts a DataCite Metadata Scheme <http://schema.datacite.org/>
    record (supplied as an unencoded Unicode string if 'parseString' is
    true, or a root lxml.etree.Element object if not) to the latest
    version of the schema (currently, version 4).  If 'returnString' is
    true, the record is returned as an unencoded Unicode string, in
    which case the record has no XML declaration.  Otherwise, an
    lxml.etree.Element object is returned.  In both cases, the root
    element's xsi:schemaLocation attribute is set or added as necessary.

    When an Element is supplied it is modified in place.  A record whose
    root tag already indicates version 4 (per _schemaVersionRE) only has
    its schemaLocation set.  Otherwise the following changes are made:

      - every element is moved into the kernel-4 namespace;
      - a resourceType of "Other"/"(:unav)" is added if none is present,
        and the general type "Film" becomes "Audiovisual";
      - dates of type StartDate and EndDate are deleted, as is any
        <dates> element left without children;
      - contributors of type "Funder" are turned into fundingReference
        elements, and any <contributors> element left without children
        is deleted;
      - geoLocationPoint and geoLocationBox text content is split into
        the version 4 child elements; elements whose text does not have
        the expected number of coordinates (including empty elements)
        are deleted.

    Raises AssertionError if the record has more than one resourceType
    element.
    """
    if parseString:
        root = util.parseXmlString(record)
    else:
        root = record
    root.attrib[
        "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = (
            "http://datacite.org/schema/kernel-4 " +
            "http://schema.datacite.org/meta/kernel-4/metadata.xsd")
    m = _schemaVersionRE.match(root.tag)
    if m.group(1) == "4":
        # Nothing to do.
        if returnString:
            return lxml.etree.tostring(root, encoding=unicode)
        else:
            return root

    def q(elementName):
        # Qualifies a local element name with the kernel-4 namespace.
        return "{http://datacite.org/schema/kernel-4}" + elementName

    def changeNamespace(node):
        # Recursively retags 'node' and its descendants into kernel-4;
        # comment nodes are left alone.
        if node.tag is not lxml.etree.Comment:
            # The order is important here: parent before children.
            node.tag = q(node.tag.split("}")[1])
            for child in node:
                changeNamespace(child)

    changeNamespace(root)
    ns = {"N": "http://datacite.org/schema/kernel-4"}
    # Resource type is required as of version 4.
    e = root.xpath("//N:resourceType", namespaces=ns)
    assert len(e) <= 1
    if len(e) == 1:
        if e[0].attrib["resourceTypeGeneral"] == "Film":
            e[0].attrib["resourceTypeGeneral"] = "Audiovisual"
    else:
        e = lxml.etree.SubElement(root, q("resourceType"))
        e.attrib["resourceTypeGeneral"] = "Other"
        e.text = "(:unav)"
    # There's no way to assign new types to start and end dates, so just
    # delete them.
    for e in root.xpath("//N:date", namespaces=ns):
        if e.attrib["dateType"] in ["StartDate", "EndDate"]:
            e.getparent().remove(e)
    for e in root.xpath("//N:dates", namespaces=ns):
        if len(e) == 0:
            e.getparent().remove(e)
    # The contributor type "Funder" went away in version 4.
    for e in root.xpath("//N:contributor[@contributorType='Funder']",
                        namespaces=ns):
        # Reuse an existing fundingReferences container if there is one;
        # otherwise create it under the root.
        fr = root.xpath("//N:fundingReferences", namespaces=ns)
        if len(fr) > 0:
            fr = fr[0]
        else:
            fr = lxml.etree.SubElement(root, q("fundingReferences"))
        # One fundingReference/funderName per contributor name.
        for n in e.xpath("N:contributorName", namespaces=ns):
            lxml.etree.SubElement(
                lxml.etree.SubElement(fr, q("fundingReference")),
                q("funderName")).text = n.text
        e.getparent().remove(e)
    for e in root.xpath("//N:contributors", namespaces=ns):
        if len(e) == 0:
            e.getparent().remove(e)
    # Geometry changes in version 4.
    for e in root.xpath("//N:geoLocationPoint", namespaces=ns):
        # Only elements without children still carry coordinates as text.
        if len(e) == 0:
            # An empty element has text None; treat it as having no
            # coordinates so that it is removed below instead of raising
            # AttributeError.
            coords = (e.text or "").split()
            if len(coords) == 2:
                # The first value becomes the latitude, the second the
                # longitude.
                lxml.etree.SubElement(e, q("pointLongitude")).text = coords[1]
                lxml.etree.SubElement(e, q("pointLatitude")).text = coords[0]
                e.text = None
            else:
                # Should never happen.
                e.getparent().remove(e)
    for e in root.xpath("//N:geoLocationBox", namespaces=ns):
        if len(e) == 0:
            # As above, tolerate an empty element.
            coords = (e.text or "").split()
            if len(coords) == 4:
                # The four values are used, in order, as south latitude,
                # west longitude, north latitude, east longitude.
                lxml.etree.SubElement(e,
                                      q("westBoundLongitude")).text = coords[1]
                lxml.etree.SubElement(e,
                                      q("eastBoundLongitude")).text = coords[3]
                lxml.etree.SubElement(e,
                                      q("southBoundLatitude")).text = coords[0]
                lxml.etree.SubElement(e,
                                      q("northBoundLatitude")).text = coords[2]
                e.text = None
            else:
                # Should never happen.
                e.getparent().remove(e)
    # Drop namespace declarations left unused after retagging.
    lxml.etree.cleanup_namespaces(root)
    if returnString:
        return lxml.etree.tostring(root, encoding=unicode)
    else:
        return root

コード例 #6

0

ファイルを表示

ファイル: datacite_xml.py プロジェクト: HEG-INCIPIT/ARKetype

def dataciteXmlToFormElements(document):
    """
    Flattens a DataCite XML record into a dictionary of form elements.

    The record is first upgraded with datacite.upgradeDcmsRecord and
    then parsed.  Comments and other non-content nodes are dropped,
    values are whitespace-stripped, and empty element and attribute
    values are omitted.  Keys are built from the dash-joined path of
    element (and attribute) names, so that in

      <resource>
        <creators>
          <creator>...</creator>
          <creator>
            <nameIdentifier schemeURI="...">

    the schemeURI attribute is stored under

      creators-creator-1-nameIdentifier_0-schemeURI

    Indexing rules:

      - Children of the containers listed in _repeatableElementContainers
        are numbered with "-<n>".
      - Child tags listed for a container in _numberedElementContainers
        (nameIdentifier above) are numbered with "_<n>", counted
        separately per tag.
      - The content key of any indexed element repeats the element name
        as a final component, e.g.

          alternateIdentifiers-alternateIdentifier-0-alternateIdentifier
          alternateIdentifiers-alternateIdentifier-1-alternateIdentifier

      - Other repeated elements share one key; their values are
        concatenated with " ; ".

    In description elements each <br> becomes a newline.
    geoLocationPolygon elements are converted with
    geometry_util.datacitePolygonToInternal.  The collected dictionary
    is passed through _separateByFormType and that result is returned.
    """
    upgraded = datacite.upgradeDcmsRecord(document)
    elements = {}

    def localName(tag):
        # "{namespace}name" -> "name"
        return tag.split("}")[1]

    def elementChildren(node):
        # Element children only; comments and the like are skipped.
        return list(node.iterchildren(lxml.etree.Element))

    def ownText(node):
        # The node's leading text plus the tail text of every child.
        pieces = [node.text or ""]
        pieces.extend(child.tail or "" for child in node.iterchildren())
        return "".join(pieces)

    def visit(path, node, index=None, separator="-"):
        tag = localName(node.tag)
        here = tag if path == "" else "%s-%s" % (path, tag)
        if index is None:
            contentKey = here
        else:
            # Indexed elements echo their own name in the content key.
            here += "%s%d" % (separator, index)
            contentKey = "%s-%s" % (here, tag)

        for name, raw in node.attrib.items():
            value = raw.strip()
            if value != "":
                elements["%s-%s" % (here, name)] = value

        if tag in _repeatableElementContainers:
            for position, child in enumerate(elementChildren(node)):
                visit(here, child, position)
            return

        if tag in _numberedElementContainers:
            # One running counter per numbered child tag.
            counters = dict.fromkeys(_numberedElementContainers[tag], -1)
            for child in elementChildren(node):
                childTag = localName(child.tag)
                if childTag in counters:
                    counters[childTag] += 1
                    visit(here, child, counters[childTag], separator="_")
                else:
                    visit(here, child)
            return

        if tag == "description":
            # The only mixed-content element type in the schema; each <br>
            # contributes a newline ahead of its tail text.
            pieces = [node.text or ""]
            for child in node.iterchildren():
                if (isinstance(child.tag, basestring)
                        and localName(child.tag) == "br"):
                    pieces.append("\n")
                pieces.append(child.tail or "")
            value = "".join(pieces).strip()
            if value != "":
                elements[contentKey] = value
            return

        if tag == "geoLocationPolygon":
            elements[contentKey] = geometry_util.datacitePolygonToInternal(
                node)
            return

        children = elementChildren(node)
        if children:
            for child in children:
                visit(here, child)
            return

        value = ownText(node).strip()
        if value == "":
            return
        if contentKey in elements:
            # Repeatable elements not explicitly handled have their
            # content concatenated.
            elements[contentKey] += " ; " + value
        else:
            elements[contentKey] = value

    root = util.parseXmlString(upgraded)
    for child in elementChildren(root):
        visit("", child)
    return _separateByFormType(elements)