コード例 #1
0
def parseMetadataCSV(job, metadataCSVFilePath):
    """
    Parses the metadata.csv into a dict with entries for each file.

    The first row is the header; its first column (the filename column) is
    dropped and the remaining header values are stripped of whitespace.
    Every following non-empty row becomes one entry keyed on its first
    column, with a single trailing "/" removed. The entry's values are built
    with archivematicaFunctions.OrderedListsDict from (header, value) pairs,
    so repeated headers are expected to accumulate into a list of values.
    Cells beyond the header length (or headers beyond the row length) are
    dropped, because header and row are paired with zip().

    Example CSV:
    Filename,dc.title,dc.type,dc.type,Other metadata
    objects/foo.jpg,Foo,Photograph,Still Image,Taken on a sunny day
    objects/bar/,Bar,Photograph,Still Image,All taken on a rainy day

    Produces:
    {
        'objects/foo.jpg': OrderedDict(dc.title=[Foo], dc.type=[Photograph, Still Image], Other metadata=[Taken on a sunny day])
        'objects/bar': OrderedDict(dc.title=[Bar], dc.type=[Photograph, Still Image], Other metadata=[All taken on a rainy day])
    }

    If the same filename appears more than once with different values, the
    later row wins and a warning is written to stderr through job.pyprint.
    An empty file (no header row) yields an empty OrderedDict.

    :param job: Object whose pyprint() method is used to report overwritten entries
    :param metadataCSVFilePath: Path to the metadata CSV to parse
    :return: {<filename>: OrderedDict(<metadata name>: [<metadata value>]) }
    """
    # Insertion-ordered from the start so entries keep the CSV row order.
    metadata = collections.OrderedDict()
    # newline="" hands line-ending handling to the csv module, which keeps
    # support for unusual newlines, like \r. Text mode is required because
    # csv.reader only accepts str rows on Python 3, and the "U" mode flag no
    # longer exists.
    with open(metadataCSVFilePath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        # Parse first row as header; an empty file has none.
        header_row = next(reader, None)
        if header_row is None:
            return metadata
        # Strip filename column, strip whitespace from header values
        header = [h.strip() for h in header_row[1:]]
        # Parse data
        for row in reader:
            if not row:
                # Blank line in the CSV
                continue
            entry_name = row[0]
            if entry_name.endswith("/"):
                # Directories are keyed without their trailing slash
                entry_name = entry_name[:-1]
            # Strip file/dir name from values
            values = archivematicaFunctions.OrderedListsDict(zip(header, row[1:]))
            if entry_name in metadata and metadata[entry_name] != values:
                job.pyprint(
                    "Metadata for",
                    entry_name,
                    "being overwritten. Old:",
                    metadata[entry_name],
                    "New:",
                    values,
                    file=sys.stderr,
                )
            metadata[entry_name] = values

    return metadata
コード例 #2
0
def parseDmdSec(dmdSec, label='[Placeholder title]'):
    """
    Parses a dmdSec into a dict with child tag names and their values

    Every element below the dmdSec's mets:xmlData contributes one value (its
    text, or '' when it has none) under its namespace-stripped tag name. The
    'dublincore' wrapper element itself is skipped. When the dmdSec wraps
    Dublin Core metadata (MDTYPE="DC") that contains no title element, the
    placeholder title is inserted first.

    :param dmdSec: dmdSec element (expected to be an lxml element), or None
    :param label: Default title if not provided. Required by CONTENTdm
    :returns: Dict of {<child element tag>: [<value>, ...]}
    """
    # If the dmdSec object is empty (i.e, no DC metadata has been assigned
    # in the dashboard, and there was no metadata.csv or other metadata file
    # in the transfer), return a placeholder title.
    if dmdSec is None:
        return collections.OrderedDict([('title', [label])])
    elementsDict = archivematicaFunctions.OrderedListsDict()

    # xpath() returns a list of matching attribute values, so membership (not
    # equality with a string) is the correct test for a DC mdWrap.
    mdTypes = dmdSec.xpath('mets:mdWrap/@MDTYPE', namespaces=ns.NSMAP)
    isDublinCore = 'DC' in mdTypes

    # Collect (tag name, text) pairs for all descendants of xmlData first, so
    # the title check below sees exactly the keys that will be returned.
    entries = []
    xmldata = dmdSec.find('.//mets:xmlData', namespaces=ns.NSMAP)
    if xmldata is not None:  # e.g. a dmdSec without wrapped XML
        for element in xmldata.iterdescendants():
            tagname = element.tag
            # Comments and processing instructions expose a factory function
            # as their tag rather than a name; they carry no metadata.
            if callable(tagname):
                continue
            # Strip namespace prefix
            # TODO can tag names be unicode?
            tagname = re.sub(r'{\S+}', '', tagname)  # \S = non whitespace
            if tagname in ('dublincore', ):
                continue
            entries.append((tagname, element.text or ''))

    # If we are dealing with Dublin Core metadata, check to see if there is a
    # title (required by CONTENTdm). If not, assign a placeholder title. The
    # check uses the namespace-stripped name because that is the key callers
    # see, whichever namespace the title element was serialized in.
    if isDublinCore and not any(name == 'title' for name, _ in entries):
        elementsDict['title'] = label

    # Key is the element's tag name, value is a list of the element's text
    for name, text in entries:
        elementsDict[name] = text  # OrderedListsDict appends to lists as needed

    return collections.OrderedDict(elementsDict)