def create_div(divs, parent, filesec, attributes, path=''):
    """Recursively populate a structMap div from a directory structure.

    Every key of ``divs`` is joined to ``path``. A key whose joined path is
    listed in ``attributes["filelist"]`` is handled as a file: an fptr
    element is created for it and, when ``add_file_div`` returns a div for
    the file, that div is used instead of the bare fptr. Any other key is
    handled as a directory: a div element is created for it and the
    function recurses into ``divs[key]``.

    Children are appended to ``parent`` in three groups: bare fptr elements
    first, then the divs returned by ``add_file_div``, then the directory
    divs.

    :divs: Current level of the directory structure walkthrough; its keys
           are file or directory names
    :parent: Parent element in structMap
    :filesec: filesec element, passed on to ``get_fileid``
    :attributes: Mapping from which the following keys are read here (the
                 whole mapping is also passed on to ``add_file_div``):
                 all_amd_refs: Administrative metadata references
                 all_dmd_refs: Descriptive metadata references
                 filelist: Collection of digital object file paths
                 structmap_type: Structmap type
                 file_ids: File identifiers, passed to ``get_fileid``
    :path: Current path in directory structure walkthrough
    :returns: ``None``
    """
    plain_fptrs = []
    file_divs = []
    directory_divs = []
    physical = attributes["structmap_type"] == 'Directory-physical'

    for name in divs:
        node_path = os.path.join(path, name)

        if node_path in attributes["filelist"]:
            # A file: create the fptr and, if available, its wrapping div
            file_id = get_fileid(filesec, node_path, attributes['file_ids'])
            fptr = mets.fptr(file_id)
            file_div = add_file_div(node_path, fptr, attributes)
            if file_div is None:
                plain_fptrs.append(fptr)
            else:
                file_divs.append(file_div)
            continue

        # A directory: create a div and descend into it
        amd_refs = get_md_references(attributes["all_amd_refs"],
                                     directory=node_path)
        dmd_refs = get_md_references(attributes["all_dmd_refs"],
                                     directory=node_path)
        if physical:
            div_kwargs = {'type_attr': 'directory', 'label': name}
        else:
            div_kwargs = {'type_attr': name}
        directory_div = mets.div(dmdid=dmd_refs, admid=amd_refs,
                                 **div_kwargs)
        directory_divs.append(directory_div)
        create_div(divs[name], directory_div, filesec, attributes,
                   node_path)

    # fptr elements first, then file divs, then directory divs
    for child in plain_fptrs + file_divs + directory_divs:
        parent.append(child)
def create_structmap(workspace, filesec, filelist, type_attr=None,
                     root_type=None):
    """Build a METS element tree that contains the structural map.

    A root div is created with the metadata references found for
    directory ``'.'``; ``create_div`` then fills it based on the structure
    that ``div_structure`` derives from ``filelist``.

    :param workspace: Directory from which the metadata references are
                      searched
    :param filesec: fileSec element
    :param filelist: Sorted list of digital objects (file paths)
    :param type_attr: TYPE attribute of structMap element. The value
                      ``'Directory-physical'`` gives the root div the type
                      ``'directory'`` and the label ``'.'``.
    :param root_type: TYPE attribute of root div element for other
                      structmap types; a falsy value means ``'directory'``
    :returns: ElementTree whose root is a mets element wrapping the
              structMap element
    """
    amd_refs = get_md_references(workspace, directory='.')
    dmd_refs = get_md_references(workspace, directory='.', ref_type='dmd')

    if type_attr == 'Directory-physical':
        root_kwargs = {'type_attr': 'directory', 'label': '.'}
    else:
        root_kwargs = {'type_attr': root_type or 'directory'}
    root_div = mets.div(dmdid=dmd_refs, admid=amd_refs, **root_kwargs)

    structmap = mets.structmap(type_attr=type_attr)
    structmap.append(root_div)

    create_div(workspace, div_structure(filelist), root_div, filesec,
               filelist, type_attr=type_attr)

    mets_root = mets.mets(child_elements=[structmap])
    ET.cleanup_namespaces(mets_root)
    return ET.ElementTree(mets_root)
def create_div(workspace, divs, parent, filesec, filelist, path='',
               type_attr=None):
    """Recursively populate a structMap div from a directory structure.

    Every key of ``divs`` is joined to ``path``. A key whose joined path is
    found in ``filelist`` is handled as a file: an fptr element is created
    for it and, when ``add_file_properties`` returns a div for the file,
    that div is used instead of the bare fptr. Any other key is handled as
    a directory: a div element is created for it and the function recurses
    into ``divs[key]``.

    Children are appended to ``parent`` in three groups: bare fptr elements
    first, then the divs returned by ``add_file_properties``, then the
    directory divs.

    :param workspace: Workspace path
    :param divs: Current level of the directory structure walkthrough;
                 its keys are file or directory names
    :param parent: Parent element in structMap
    :param filesec: filesec element
    :param filelist: Sorted list of digital objects (file paths)
    :param path: Current path in directory structure walkthrough
    :param type_attr: Structmap type
    :returns: ``None``
    """
    plain_fptrs = []
    file_divs = []
    directory_divs = []

    for name in divs:
        node_path = os.path.join(path, name)

        if node_path in filelist:
            # A file: create the fptr and, if available, its wrapping div
            fptr = mets.fptr(get_fileid(filesec, node_path))
            file_div = add_file_properties(workspace, node_path, fptr)
            if file_div is None:
                plain_fptrs.append(fptr)
            else:
                file_divs.append(file_div)
            continue

        # A directory: create a div and descend into it
        amd_refs = get_md_references(workspace, directory=node_path)
        dmd_refs = get_md_references(workspace, directory=node_path,
                                     ref_type='dmd')
        if type_attr == 'Directory-physical':
            div_kwargs = {'type_attr': 'directory', 'label': name}
        else:
            div_kwargs = {'type_attr': name}
        directory_div = mets.div(dmdid=dmd_refs, admid=amd_refs,
                                 **div_kwargs)
        directory_divs.append(directory_div)
        create_div(workspace, divs[name], directory_div, filesec, filelist,
                   node_path, type_attr)

    # fptr elements first, then file divs, then directory divs
    for child in plain_fptrs + file_divs + directory_divs:
        parent.append(child)
def main(arguments=None):
    """The main method for compile_structmap.

    Builds a fileSec document and a structMap document and writes them to
    ``filesec.xml`` and ``structmap.xml`` in the workspace directory. When
    ``args.dmdsec_struct`` is ``'ead3'`` the structMap is built with
    ``create_ead3_structmap``; otherwise it is built from the workspace
    contents with ``create_structmap``.

    :arguments: Command line arguments, passed to ``parse_arguments``
    :returns: 0
    """
    args = parse_arguments(arguments)

    structmap = mets.structmap(type_attr=args.type_attr)
    mets_structmap = mets.mets(child_elements=[structmap])
    filegrp = mets.filegrp()
    filesec = mets.filesec(child_elements=[filegrp])
    mets_filesec = mets.mets(child_elements=[filesec])

    _, dmdsec_id = ids_for_files(args.workspace, None, 'dmdsec.xml',
                                 dash_count=0)

    if args.dmdsec_struct == 'ead3':
        container_div = mets.div(type_attr='logical')
        structmap.append(container_div)
        create_ead3_structmap(args.dmdsec_loc, args.workspace,
                              container_div, filegrp, dmdsec_id)
    else:
        amdids = get_links_event_agent(args.workspace, None)
        container_div = mets.div(type_attr='directory', dmdid=dmdsec_id,
                                 admid=amdids)
        structmap.append(container_div)
        divs = div_structure(args.workspace)
        create_structmap(args.workspace, divs, container_div, filegrp)

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid syntax on Python 3.
    if args.stdout:
        print(h.serialize(mets_filesec))
        print(h.serialize(mets_structmap))

    output_sm_file = os.path.join(args.workspace, 'structmap.xml')
    output_fs_file = os.path.join(args.workspace, 'filesec.xml')

    # Both output files live in the same directory, so one check suffices.
    output_dir = os.path.dirname(output_sm_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(output_sm_file, 'w+') as outfile:
        outfile.write(h.serialize(mets_structmap))

    with open(output_fs_file, 'w+') as outfile:
        outfile.write(h.serialize(mets_filesec))

    print("compile_structmap created files: %s %s" % (output_sm_file,
                                                      output_fs_file))

    return 0
def add_file_properties(workspace, path, fptr):
    """Create a div element with file properties.

    The administrative metadata references of the file are looked up with
    ``get_md_references``. For each reference the file
    ``<reference without its first character>-scraper.pkl`` is searched in
    ``workspace`` and the first one that exists is unpickled. A div is
    created only when the first item of the unpickled data has a
    ``'properties'`` entry that contains the key ``'order'``.

    :param workspace: Workspace path where the ``*-scraper.pkl`` files are
                      searched
    :param path: File path
    :param fptr: Element fptr for file
    :returns: Div element (type ``'file'``, with the order taken from the
              properties and ``fptr`` as its child) or None
    """
    pkl_name = None
    for amdref in get_md_references(workspace, path=path):
        candidate = os.path.join(workspace,
                                 '{}-scraper.pkl'.format(amdref[1:]))
        if os.path.isfile(candidate):
            pkl_name = candidate
            break

    if pkl_name is None:
        return None

    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # safe as long as the workspace contents are trusted -- TODO confirm
    # that nothing untrusted can write *-scraper.pkl files there.
    with open(pkl_name, 'rb') as pkl_file:
        file_metadata_dict = pickle.load(pkl_file)

    first_stream = file_metadata_dict[0]
    if 'properties' not in first_stream:
        return None

    properties = first_stream['properties']
    if 'order' not in properties:
        return None

    div_el = mets.div(type_attr='file', order=properties['order'])
    div_el.append(fptr)
    return div_el
def create_ead3_structmap(descfile, workspace, structmap, filegrp,
                          dmdsec_id):
    """Create structmap based on ead3 descriptive metadata structure.

    An ``archdesc`` div is created and labelled with the ``@otherlevel``
    attribute of the EAD3 archdesc element, or with its ``@level``
    attribute when ``@otherlevel`` is missing. If the archdesc has a dsc
    element, ``ead3_c_div`` is called for every child of the dsc elements
    of the document. The last two characters of the child's tag name are
    passed as ``cnum`` when the name is longer than one character,
    otherwise ``None``. Finally the archdesc div is appended to
    ``structmap``.

    :descfile: EAD3 descriptive metadata file
    :workspace: Workspace path
    :structmap: Element to which the archdesc div is appended
    :filegrp: fileGrp element
    :dmdsec_id: Value for the DMDID reference of the archdesc div
    :returns: ``None``
    """
    root = ET.parse(descfile).getroot()

    other_levels = root.xpath("//ead3:archdesc/@otherlevel",
                              namespaces=NAMESPACES)
    if other_levels:
        level = other_levels[0]
    else:
        level = root.xpath("//ead3:archdesc/@level",
                           namespaces=NAMESPACES)[0]

    amdids = get_links_event_agent(workspace, None)
    archdesc_div = mets.div(type_attr='archdesc', label=level,
                            dmdid=dmdsec_id, admid=amdids)

    if root.xpath("//ead3:archdesc/ead3:dsc", namespaces=NAMESPACES):
        for component in root.xpath("//ead3:dsc/*", namespaces=NAMESPACES):
            tag_name = ET.QName(component.tag).localname
            # E.g. tag "c01" gives cnum "01", plain "c" gives None
            cnum = str(tag_name)[-2:] if len(tag_name) > 1 else None
            ead3_c_div(component, archdesc_div, filegrp, workspace,
                       cnum=cnum)

    structmap.append(archdesc_div)
def ead3_c_div(parent, structmap, filegrp, workspace, filelist):
    """Create div elements based on ead3 c elements.

    The tag name of the EAD3 element is put into @type of the new div and
    the @otherlevel or @level attribute of the element into @label; the
    tag name is used as label when neither attribute exists. Child
    elements whose tag name is listed in ``ALLOWED_C_SUBS`` are processed
    recursively. The hrefs returned by ``collect_dao_hrefs`` are handed to
    ``add_fptrs_div_ead`` and the div it returns is appended to
    ``structmap``.

    :parent: Element to follow in EAD3
    :structmap: Div element in structmap to which the new div is appended
    :filegrp: fileGrp element
    :workspace: Workspace path
    :filelist: Sorted list of digital objects (file paths)
    :returns: ``None``
    """
    levels = parent.xpath("./@otherlevel | ./@level", namespaces=NAMESPACES)
    tag_name = ET.QName(parent.tag).localname
    label = levels[0] if levels else tag_name

    component_div = mets.div(type_attr=tag_name, label=label)

    for child in parent.findall("./*"):
        if ET.QName(child.tag).localname in ALLOWED_C_SUBS:
            ead3_c_div(child, component_div, filegrp, workspace, filelist)

    component_div = add_fptrs_div_ead(
        c_div=component_div,
        hrefs=collect_dao_hrefs(parent),
        filelist=filelist,
        filegrp=filegrp,
        workspace=workspace)

    structmap.append(component_div)
def create_structmap(workspace, divs, structmap, filegrp, path=''):
    """Create structmap based on directory structure.

    A key of ``divs`` that ends with ``-techmd.xml`` denotes a file: the
    suffix is stripped, the file is added to the fileSec with
    ``add_file_to_filesec`` and an fptr element is created for it. Any
    other key denotes a directory: a div element is created for it and
    the function recurses into ``divs[key]``.

    The fptr elements are appended to ``structmap`` first, then the divs.

    :workspace: Workspace path
    :divs: Current level of the directory structure walkthrough
    :structmap: Element to which the fptr and div elements are appended
    :filegrp: fileGrp element
    :path: Current (encoded) path in directory structure walkthrough
    :returns: ``None``
    """
    techmd_suffix = '-techmd.xml'
    file_pointers = []
    child_divs = []

    for entry in divs:
        is_file = entry.endswith(techmd_suffix)
        name = entry[:-len(techmd_suffix)] if is_file else entry

        entry_path = encode_path(os.path.join(decode_path(path), name))
        amdids = get_links_event_agent(workspace, entry_path)

        if is_file:
            fileid = add_file_to_filesec(workspace, entry_path, filegrp,
                                         amdids)
            file_pointers.append(mets.fptr(fileid))
            continue

        _, dmdsec_id = ids_for_files(workspace, entry_path, 'dmdsec.xml')
        child_div = mets.div(type_attr=name, dmdid=dmdsec_id, admid=amdids)
        child_divs.append(child_div)
        create_structmap(workspace, divs[entry], child_div, filegrp,
                         entry_path)

    # fptr elements first, then div elements
    for child in file_pointers + child_divs:
        structmap.append(child)
def create_ead3_structmap(filegrp, attributes):
    """Create structmap based on ead3 descriptive metadata structure.

    The result has the layout structMap > div (type ``logical``) > div
    (type ``archdesc``). The archdesc div is labelled with the first
    @otherlevel / @level attribute found on the EAD3 archdesc element, or
    with ``'archdesc'`` when there is none. If the archdesc has a dsc
    element, ``ead3_c_div`` is called for every child of the dsc elements
    whose tag name is listed in ``ALLOWED_C_SUBS``.

    :filegrp: fileGrp element
    :attributes: Mapping from which the following keys are read here (the
                 whole mapping is also passed on to ``ead3_c_div``):
                 all_amd_refs: Administrative metadata references
                 all_dmd_refs: Descriptive metadata references
                 dmdsec_loc: EAD3 descriptive metadata file
                 structmap_type: TYPE attribute of structMap element
    :returns: ElementTree whose root is a mets element wrapping the
              structMap element
    """
    structmap = mets.structmap(type_attr=attributes["structmap_type"])
    logical_div = mets.div(type_attr='logical')

    root = ET.parse(attributes["dmdsec_loc"]).getroot()
    levels = root.xpath(("//ead3:archdesc/@otherlevel | "
                         "//ead3:archdesc/@level"), namespaces=NAMESPACES)
    label = levels[0] if levels else 'archdesc'

    amdids = get_md_references(attributes["all_amd_refs"], directory='.')
    dmdids = get_md_references(attributes["all_dmd_refs"], directory='.')
    archdesc_div = mets.div(type_attr='archdesc', label=label,
                            dmdid=dmdids, admid=amdids)

    if root.xpath("//ead3:archdesc/ead3:dsc", namespaces=NAMESPACES):
        for component in root.xpath("//ead3:dsc/*", namespaces=NAMESPACES):
            if ET.QName(component.tag).localname in ALLOWED_C_SUBS:
                ead3_c_div(component, archdesc_div, filegrp, attributes)

    logical_div.append(archdesc_div)
    structmap.append(logical_div)

    mets_root = mets.mets(child_elements=[structmap])
    ET.cleanup_namespaces(mets_root)
    return ET.ElementTree(mets_root)
def create_ead3_structmap(descfile, workspace, filegrp, filelist,
                          type_attr):
    """Create structmap based on ead3 descriptive metadata structure.

    The result has the layout structMap > div (type ``logical``) > div
    (type ``archdesc``). The archdesc div is labelled with the first
    @otherlevel / @level attribute found on the EAD3 archdesc element, or
    with ``'archdesc'`` when there is none. If the archdesc has a dsc
    element, ``ead3_c_div`` is called for every child of the dsc elements
    whose tag name is listed in ``ALLOWED_C_SUBS``.

    :descfile: EAD3 descriptive metadata file
    :workspace: Workspace path
    :filegrp: fileGrp element
    :filelist: Sorted list of digital objects (file paths)
    :type_attr: TYPE attribute of structMap element
    :returns: ElementTree whose root is a mets element wrapping the
              structMap element
    """
    structmap = mets.structmap(type_attr=type_attr)
    logical_div = mets.div(type_attr='logical')

    root = ET.parse(descfile).getroot()
    levels = root.xpath(("//ead3:archdesc/@otherlevel | "
                         "//ead3:archdesc/@level"), namespaces=NAMESPACES)
    label = levels[0] if levels else 'archdesc'

    amdids = get_md_references(workspace, directory='.')
    dmdids = get_md_references(workspace, directory='.', ref_type='dmd')
    archdesc_div = mets.div(type_attr='archdesc', label=label,
                            dmdid=dmdids, admid=amdids)

    if root.xpath("//ead3:archdesc/ead3:dsc", namespaces=NAMESPACES):
        for component in root.xpath("//ead3:dsc/*", namespaces=NAMESPACES):
            if ET.QName(component.tag).localname in ALLOWED_C_SUBS:
                ead3_c_div(component, archdesc_div, filegrp, workspace,
                           filelist)

    logical_div.append(archdesc_div)
    structmap.append(logical_div)

    mets_root = mets.mets(child_elements=[structmap])
    ET.cleanup_namespaces(mets_root)
    return ET.ElementTree(mets_root)
def ead3_c_div(parent, structmap, filegrp, workspace, cnum=None):
    """Create div elements based on ead3 c elements.

    Fptr elements are created based on ead dao elements. The @otherlevel
    attribute of the EAD3 element (or @level when @otherlevel is missing)
    is put into @label of the new div. The @type of the div is ``'c'``
    followed by ``cnum``, or plain ``'c'`` when ``cnum`` is not given.

    Child elements named ``c`` or ``c01`` ... ``c12`` are processed
    recursively, one numbering level deeper. For every dao or daoset
    element found directly under the did element of ``parent``, the
    referenced file is added to the fileSec and an fptr is appended to the
    div; for a daoset only the href of its first dao is used.

    :parent: Element to follow in EAD3
    :structmap: Div element in structmap to which the new div is appended
    :filegrp: fileGrp element
    :workspace: Workspace path
    :cnum: Two-digit numbering level of ``parent`` as a string (e.g.
           ``'01'``), or None for unnumbered c elements
    :returns: ``None``
    :raises IndexError: If ``parent`` has neither @otherlevel nor @level
    :raises ValueError: If ``cnum`` is given but is not numeric
    """
    allowed_c_subs = [
        'c', 'c01', 'c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09',
        'c10', 'c11', 'c12'
    ]

    other_levels = parent.xpath("./@otherlevel")
    if other_levels:
        level = other_levels[0]
    else:
        level = parent.xpath("./@level")[0]

    if cnum:
        c_div = mets.div(type_attr=('c' + str(cnum)), label=level)
        # Zero-pad to two digits. Plain '0' + str(n) turned '09' into
        # '010', i.e. TYPE "c010" instead of "c10" for deeper levels.
        cnum_sub = '{:02d}'.format(int(cnum) + 1)
    else:
        c_div = mets.div(type_attr='c', label=level)
        cnum_sub = None

    for elem in parent.findall("./*"):
        if ET.QName(elem.tag).localname in allowed_c_subs:
            ead3_c_div(elem, c_div, filegrp, workspace, cnum=cnum_sub)

    for files in parent.xpath("./ead3:did/*", namespaces=NAMESPACES):
        tag_name = ET.QName(files.tag).localname
        if tag_name not in ['dao', 'daoset']:
            continue

        if tag_name == 'daoset':
            # Only the first dao of a daoset is linked
            href = files.xpath("./ead3:dao/@href",
                               namespaces=NAMESPACES)[0]
        else:
            href = files.xpath("./@href")[0]
        tech_file = encode_path(href)

        amdids = get_links_event_agent(workspace, tech_file)
        fileid = add_file_to_filesec(workspace, tech_file, filegrp, amdids)
        c_div.append(mets.fptr(fileid=fileid))

    structmap.append(c_div)
def create_structmap(filesec, **attributes):
    """Build a METS element tree that contains the structural map.

    The given keyword arguments are first run through
    ``_attribute_values`` and ``get_reference_lists``. A root div is then
    created with the metadata references found for directory ``'.'`` and
    ``create_div`` fills it based on the structure that ``div_structure``
    derives from the file list.

    :filesec: fileSec element
    :attributes: Keyword arguments. After the preprocessing above the
                 following keys are read here (the whole mapping is also
                 passed on to ``create_div``):
                 all_amd_refs: Administrative metadata references
                 all_dmd_refs: Descriptive metadata references
                 filelist: Sorted list of digital objects (file paths)
                 structmap_type: TYPE attribute of structMap element;
                                 ``'Directory-physical'`` gives the root
                                 div the type ``'directory'`` and the
                                 label ``'.'``
                 root_type: TYPE attribute of root div element for other
                            structmap types
    :returns: ElementTree whose root is a mets element wrapping the
              structMap element
    """
    attributes = get_reference_lists(**_attribute_values(attributes))
    structmap_type = attributes["structmap_type"]

    amd_refs = get_md_references(attributes["all_amd_refs"], directory='.')
    dmd_refs = get_md_references(attributes["all_dmd_refs"], directory='.')

    if structmap_type == 'Directory-physical':
        root_kwargs = {'type_attr': 'directory', 'label': '.'}
    else:
        root_kwargs = {'type_attr': attributes["root_type"]}
    root_div = mets.div(dmdid=dmd_refs, admid=amd_refs, **root_kwargs)

    structmap = mets.structmap(type_attr=structmap_type)
    structmap.append(root_div)

    create_div(div_structure(attributes["filelist"]), root_div, filesec,
               attributes)

    mets_root = mets.mets(child_elements=[structmap])
    ET.cleanup_namespaces(mets_root)
    return ET.ElementTree(mets_root)
def run(self):
    """Create a METS document that contains logical structural map.

    The DMDID of the first grandchild of the root of ``structmap.xml``
    (read from ``self.sip_creation_path``) is reused for the wrapper div
    of the logical structmap. The wrapper div gets one child div per file
    category returned by ``self.find_file_categories()`` and each category
    div gets one fptr per file. According to the original documentation
    the categories are based on dataset metadata retrieved from Metax.

    The serialized document is written to ``self.output()``.

    :returns: ``None``
    """
    # The physical structmap has been generated earlier; read it from file
    physical_path = os.path.join(self.sip_creation_path, 'structmap.xml')
    physical_root = ET.parse(physical_path).getroot()

    # The dmdsec id is taken from the physical structmap
    dmdsec_id = physical_root[0][0].attrib['DMDID']

    provenance_ids = self.get_provenance_ids()

    # Empty logical structmap inside a mets root element
    logical_structmap = mets.structmap(type_attr='Fairdata-logical')
    mets_structmap = mets.mets(child_elements=[logical_structmap])

    # One div per category, one fptr per file of the category
    categories = self.find_file_categories()
    wrapper_div = mets.div(type_attr='logical',
                           dmdid=[dmdsec_id],
                           admid=provenance_ids)
    for category in categories:
        category_div = mets.div(type_attr=category)
        for filename in categories.get(category):
            encoded_name = encode_path(filename, safe='/')
            category_div.append(mets.fptr(self.get_fileid(encoded_name)))
        wrapper_div.append(category_div)
    logical_structmap.append(wrapper_div)

    serialized = h.serialize(mets_structmap)
    with self.output().open('wb') as output:
        output.write(serialized)
def ead3_c_div(parent, structmap, filegrp, workspace, filelist):
    """Create div elements based on ead3 c elements.

    The tag name of the EAD3 element is put into @type of the new div and
    the @otherlevel or @level attribute of the element into @label; the
    tag name is used as label when neither attribute exists. Child
    elements whose tag name is listed in ``ALLOWED_C_SUBS`` are processed
    recursively.

    Fptr elements are created based on the dao elements (also those
    inside a daoset) found directly under the did element of ``parent``.
    A leading ``/`` is removed from each href and the first path in
    ``filelist`` that contains the href as a substring is added to the
    fileSec.

    :parent: Element to follow in EAD3
    :structmap: Div element in structmap to which the new div is appended
    :filegrp: fileGrp element
    :workspace: Workspace path
    :filelist: Sorted list of digital objects (file paths)
    :returns: ``None``
    """
    levels = parent.xpath("./@otherlevel | ./@level", namespaces=NAMESPACES)
    tag_name = ET.QName(parent.tag).localname
    label = levels[0] if levels else tag_name

    component_div = mets.div(type_attr=tag_name, label=label)

    for child in parent.findall("./*"):
        if ET.QName(child.tag).localname in ALLOWED_C_SUBS:
            ead3_c_div(child, component_div, filegrp, workspace, filelist)

    # Collect hrefs of all digital archival objects of this component
    hrefs = []
    for did_child in parent.xpath("./ead3:did/*", namespaces=NAMESPACES):
        child_tag = ET.QName(did_child.tag).localname
        if child_tag == 'daoset':
            hrefs.extend(did_child.xpath("./ead3:dao/@href",
                                         namespaces=NAMESPACES))
        elif child_tag == 'dao':
            hrefs.append(did_child.xpath("./@href")[0])

    for href in hrefs:
        relative = href[1:] if href.startswith('/') else href
        matching_files = [candidate for candidate in filelist
                          if relative in candidate]
        fileid = add_file_to_filesec(workspace, matching_files[0], filegrp)
        component_div.append(mets.fptr(fileid=fileid))

    structmap.append(component_div)
def add_file_div(workspace, path, fptr, type_attr='file'):
    """Create a div element with file properties.

    The properties of the file are fetched with ``file_properties``. A div
    is created only when the properties contain the key ``'order'``; its
    value becomes the order of the div and ``fptr`` is appended to the
    div.

    :param workspace: Workspace path, passed to ``file_properties``
    :param path: File path
    :param fptr: Element fptr for file
    :param type_attr: The TYPE attribute value for the div
    :returns: Div element with properties or None
    """
    properties = file_properties(workspace, path)
    if not properties or 'order' not in properties:
        return None

    ordered_div = mets.div(type_attr=type_attr, order=properties['order'])
    ordered_div.append(fptr)
    return ordered_div
def add_file_div(path, fptr, attributes, type_attr='file'):
    """Create a div element with file properties.

    The properties of the file are fetched with ``file_properties``. A div
    is created only when the properties contain the key ``'order'``; its
    value becomes the order of the div and ``fptr`` is appended to the
    div.

    :path: File path
    :fptr: Element fptr for file
    :attributes: Mapping passed on to ``file_properties``. According to
                 the original documentation it holds the keys:
                 all_amd_refs: XML element tree of administrative
                               metadata references
                 workspace: Workspace path
    :type_attr: The TYPE attribute value for the div
    :returns: Div element with properties or None
    """
    properties = file_properties(path, attributes)
    if not properties or 'order' not in properties:
        return None

    ordered_div = mets.div(type_attr=type_attr, order=properties['order'])
    ordered_div.append(fptr)
    return ordered_div
def fix_1_4_mets(root):
    """Migrates from catalog version 1.4 or 1.4.1 to newer by writing the
    following changes into the mets file:

    - adds the @MDTYPEVERSION attribute to all mets:mdWrap elements
    - writes charset from textMD to premis:formatName for text files
    - moves MIX metadata blocks from the premis:objectCharacteristicsExtension
      metadata to an own techMD metadata block
    - adds a new div as parent div if structmap has several child divs
    - sets METSRIGHTS as OTHERMDTYPE

    The tree is modified in place. As a side effect the ``textmd`` prefix
    is registered in the module-level ``NAMESPACES`` mapping.

    :root: Root element of the METS document; the first amdSec and the
           first structMap found directly under it are processed
    :returns: The root element after the changes
    :raises IndexError: If the document has no amdSec or no structMap, or
                        if a rightsMD has no mdWrap
    """
    # Registered globally so that later xpath calls can use the prefix.
    NAMESPACES['textmd'] = 'http://www.kdk.fi/standards/textmd'

    # Add MDTYPEVERSION to every mdWrap under amdSec/* and dmdSec.
    for elem in root.xpath(
            "./mets:amdSec/*/mets:mdWrap | ./mets:dmdSec/"
            "mets:mdWrap", namespaces=NAMESPACES):
        mdtype = elem.get('MDTYPE')
        # For MDTYPE "OTHER" the actual type is given in OTHERMDTYPE.
        if mdtype == 'OTHER':
            mdtype = elem.get('OTHERMDTYPE')
        # NOTE(review): presumably MDTYPEVERSIONS is a dict of default
        # versions; a type missing from it would raise here -- confirm.
        version = MDTYPEVERSIONS[mdtype]

        # MODS version has to comply with version given in MODS metadata
        # If missing, use the default value already given.
        if mdtype == "MODS":
            mods_version = elem.xpath("./mets:xmlData/mods:mods/@version",
                                      namespaces=NAMESPACES)
            if mods_version and mods_version[0].strip():
                version = mods_version[0].strip()

        elem.set('MDTYPEVERSION', version)

    root = set_charset_from_textmd(root)

    # Hand every mix:mix block found inside a PREMIS
    # objectCharacteristicsExtension to move_mix.
    for premis_mix in root.xpath(
            './mets:amdSec/mets:techMD/mets:mdWrap/mets:xmlData/premis:object/'
            'premis:objectCharacteristics/'
            'premis:objectCharacteristicsExtension/mix:mix',
            namespaces=NAMESPACES):
        root = move_mix(root, premis_mix)

    # Re-order the children of the first amdSec: copy each child, remove
    # the original, sort the copies with mets.order and append them back.
    list_amdsec = []
    mets_amdsec = root.xpath('./mets:amdSec', namespaces=NAMESPACES)[0]
    # NOTE(review): children are removed while the parent is being
    # iterated. This relies on the iterator of the element implementation
    # coping with removal of the current child -- confirm.
    for elem in mets_amdsec:
        list_amdsec.append(copy.deepcopy(elem))
        mets_amdsec.remove(elem)
    list_amdsec.sort(key=mets.order)
    for elem in list_amdsec:
        mets_amdsec.append(elem)

    # If the structMap has more than one direct div, move all of its
    # children under a new div of type "WRAPPER".
    structmap = root.xpath('./mets:structMap', namespaces=NAMESPACES)[0]
    if len(root.xpath('./mets:structMap/mets:div',
                      namespaces=NAMESPACES)) > 1:
        div_elements = []
        # Iterates over every child of the structMap, not only the divs.
        for div in structmap:
            div_elements.append(div)
            div.getparent().remove(div)
        div1 = mets.div(type_attr='WRAPPER', div_elements=div_elements)
        structmap.append(div1)

    # Express METSRIGHTS as MDTYPE "OTHER" + OTHERMDTYPE "METSRIGHTS".
    # Only the first mdWrap of each rightsMD is inspected.
    for rightsmd in root.xpath('./mets:amdSec/mets:rightsMD',
                               namespaces=NAMESPACES):
        mdwrap = rightsmd.xpath("./mets:mdWrap", namespaces=NAMESPACES)[0]
        if mdwrap.get('MDTYPE') == 'METSRIGHTS':
            mdwrap.set('MDTYPE', 'OTHER')
            mdwrap.set('OTHERMDTYPE', 'METSRIGHTS')
            mdwrap.set('MDTYPEVERSION', MDTYPEVERSIONS['METSRIGHTS'])

    return root