def readRootMetsFromIP(tarFile):
    mets_info_entries = [
        member for member in tarFile.getmembers()
        if re.match(mets_entry_pattern, member.name)
    ]
    if len(mets_info_entries) == 1:
        logger.info("Root METS file found in container file")
        root_mets_file_entry = mets_info_entries[0].name
        root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
        root_mets_content = read_textfile_from_tar(tarFile, root_mets_file_entry)
        root_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))
        return (root_mets, root_mets_file_entry_base_dir)
    return None
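
# Illustrative usage sketch for readRootMetsFromIP (not called anywhere in this
# module; the container path argument is a placeholder supplied by the caller).
def _example_read_root_mets(container_path):
    with tarfile.open(container_path, 'r') as t:
        result = readRootMetsFromIP(t)
        if result is None:
            # no unique root METS entry was found in the container
            return None
        root_mets, base_dir = result
        # OBJID identifies the package; base_dir anchors relative entry paths
        return root_mets.get('OBJID'), base_dir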
def validate_and_detect_ips(ips_root_dir, user_id, task=None, task_log=None):
    # task parameter is provided by huey context
    task_log.log("Loading information packages from user directory: %s" % ips_root_dir)
    # TODO: implement validation and detection of ips
    u = User.objects.get(id=user_id)
    # delete all user records first
    ret = DetectedInformationPackage.objects.filter(user=u).delete()  # noqa
    # read tar files
    all_files_in_user_dir = list_files_in_dir(ips_root_dir)
    container_files_in_user_dir = [
        f for f in all_files_in_user_dir
        if re.match(input_file_filter_regex, f)
    ]
    task_log.log("There are %d container files in the user's directory"
                 % len(container_files_in_user_dir))
    for file_in_user_dir in container_files_in_user_dir:
        task_log.log(html_bold("Container file: %s" % file_in_user_dir))
        mime_str = str(get_mime_type(file_in_user_dir))
        if mime_str not in supported_processing_mime_types:
            task_log.log("File skipped: Mime type not supported: %s" % mime_str)
            continue
        try:
            object_path = os.path.join(ips_root_dir, file_in_user_dir)
            t = tarfile.open(object_path, 'r')
            mets_info_entries = [
                member for member in t.getmembers()
                if re.match(mets_entry_pattern, member.name)
            ]
            if len(mets_info_entries) == 1:
                task_log.log("Root METS file found in container file")
                root_mets_file_entry = mets_info_entries[0].name
                root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
                root_mets_content = read_textfile_from_tar(t, root_mets_file_entry)
                parsed_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))
                package_mets_schema_location = get_schema_location_by_regex(parsed_mets, r".*/METS/.*")
                mets_schema_locations = get_schema_locations(parsed_mets)
                mets_schema_locations_container_entries = [
                    os.path.join(root_mets_file_entry_base_dir, loc)
                    for loc in mets_schema_locations.values()
                ]
                # extract schema files to local user directory
                extract_container_entries(t, mets_schema_locations_container_entries,
                                          "%s/schemas" % ips_root_dir)
                if package_mets_schema_location:
                    if not package_mets_schema_location.startswith("schemas/"):
                        task_log.error("schema files must be included in the 'schemas' directory")
                    else:
                        # resolve the schema path only once a valid location is known
                        mets_schema_path = os.path.join(ips_root_dir, extracted_schemas_directory,
                                                        root_mets_file_entry_base_dir,
                                                        package_mets_schema_location)
                        xmlval = XmlValidation()
                        try:
                            parsed_mets_schema = ET.parse(mets_schema_path)
                            validation_result = xmlval.validate_XML(parsed_mets, parsed_mets_schema)
                            # parentheses required: "%" binds tighter than the conditional expression
                            task_log.success("The root METS file is %s"
                                             % ("valid" if validation_result.valid else "invalid"))
                            if len(validation_result.err) > 0:
                                task_log.log("Errors: %s" % ", ".join(validation_result.err))
                        except Exception as e:
                            task_log.error("Errors occurred when validating root METS file:")
                            task_log.log("%s" % str(e))
                else:
                    task_log.warning("Root METS file not validated because no METS schema location was provided")
                # create record only if OBJID is available
                if 'OBJID' in parsed_mets.attrib:
                    obj_id = parsed_mets.attrib['OBJID']
                    task_log.log("Object ID: %s" % obj_id)
                    # objects.create() already persists the record; no extra save() needed
                    DetectedInformationPackage.objects.create(
                        task_id=task.id,
                        information_package=obj_id,
                        title="First IP",
                        ip_base_dir=root_mets_file_entry_base_dir,
                        ip_filename=file_in_user_dir,
                        user=u,
                        selected=False
                    )
                else:
                    task_log.log("No object ID defined!")
            t.close()
        except tarfile.TarError:
            task_log.error("Error reading tar file: %s" % file_in_user_dir)
    task_log.log("Finished detecting information packages")
    return True
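
# For reference, a minimal sketch of what read_textfile_from_tar is assumed to
# do (the real helper is defined elsewhere in the project): extract a single
# entry from an open tarfile and decode it as text.
def _read_textfile_from_tar_sketch(tar_file, entry_name, encoding='utf-8'):
    extracted = tar_file.extractfile(entry_name)
    if extracted is None:
        # the entry exists but is not a regular file (e.g. a directory)
        raise KeyError("Entry is not a regular file: %s" % entry_name)
    return extracted.read().decode(encoding)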
def get_basic_metadata(request, file_path):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return {}
    selected_ip = vars['selected_ip']
    archive_file_path = os.path.join(user_data_path, selected_ip.ip_filename)
    t = tarfile.open(archive_file_path, 'r')
    file_content = read_textfile_from_tar(t, file_path)
    t.close()
    tmp_file_path = "/tmp/%s" % randomword(10)
    res_events = []
    try:
        title = ""
        date = ""
        # write the extracted metadata file to disk so it can be parsed by path
        with open(tmp_file_path, 'w') as tmp_file:
            tmp_file.write(file_content)
        if fnmatch.fnmatch(file_path, metadata_file_pattern_ead):
            pead = ParsedEad("/tmp", tmp_file_path)
            dao_elements = pead.get_dao_elements()
            unit_titles = []
            unit_dates = []
            for dao_elm in dao_elements:
                unit_titles.append(pead._first_md_val_ancpath(dao_elm, "unittitle"))
                unit_dates.append(pead._first_md_val_ancpath(dao_elm, "unitdate"))
            title = unit_titles[0]
            date = unit_dates[0]
        elif fnmatch.fnmatch(file_path, metadata_file_pattern_premis):
            structure = get_ip_structure(request)
            logical_view = search(structure, "logical_view_data")
            events = search(logical_view, "events")
            for event in events:
                if len(event):
                    res_events.append({
                        'type': event[0]['type'],
                        'datetime': event[0]['datetime'],
                        'agent': event[0]['linking_agent_id']['value']
                    })
            title = "Root PREMIS"
            date = "20.09.2017"
        md_type = ead_md_type if fnmatch.fnmatch(file_path, metadata_file_pattern_ead) \
            else premis_md_type if fnmatch.fnmatch(file_path, metadata_file_pattern_premis) \
            else "Other"
        return JsonResponse({
            'success': True,
            'type': md_type,
            'title': title,
            'date': date,
            'events': res_events,
            'file_path': file_path
        }, status=200)
    except Exception as error:
        logger.exception(error)
        return JsonResponse({'success': False, 'error': str(error)}, status=500)
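
# The "/tmp/%s" % randomword(10) pattern above can collide and leaks files on
# error; a tempfile-based variant would avoid both. This is a sketch only and
# is not wired into get_basic_metadata.
def _write_metadata_to_tempfile(file_content):
    import tempfile
    # delete=False so a parser (e.g. ParsedEad) can reopen the file by name;
    # the caller is responsible for os.unlink() when done.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.xml', delete=False) as tmp:
        tmp.write(file_content)
        return tmp.name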
def get_ip_overview_context(request):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return {}
    object_path = os.path.join(user_data_path, vars['selected_ip'].ip_filename)
    tarFile = tarfile.open(object_path, 'r')
    mets_info_entries = [
        member for member in tarFile.getmembers()
        if re.match(mets_entry_pattern, member.name)
    ]
    if len(mets_info_entries) != 1:
        logger.error("No unique root METS file found in container file")
        return {}
    logger.info("Root METS file found in container file")
    root_mets_file_entry = mets_info_entries[0].name
    root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
    root_mets_content = read_textfile_from_tar(tarFile, root_mets_file_entry)
    root_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))
    # collect schema file names from the physical structMap
    all_schemas = []
    for root_structMap in root_mets.iter('{http://www.loc.gov/METS/}structMap'):
        if root_structMap.get('TYPE') == 'PHYSICAL':
            for div in root_structMap.find('{http://www.loc.gov/METS/}div'):
                label = div.get('LABEL')
                if label == 'schemas':
                    schemas = get_schemas_section(div, root_mets, root_mets_file_entry_base_dir)
                    all_schemas += [schema['text'] for schema in schemas['nodes']]
                    continue
    overview = {}
    total_size = 0
    total_number_content_files = 0
    content_mime_types = []
    representations = []
    overview['object_id'] = root_mets.attrib['OBJID']
    # the package title is taken from the EAD file, if present
    ead_info_entries = [
        member for member in tarFile.getmembers()
        if re.match(ead_entry_pattern, member.name)
    ]
    if len(ead_info_entries) == 1:
        logger.info("EAD file found in container file")
        root_ead_file_entry = ead_info_entries[0].name
        root_ead_content = read_textfile_from_tar(tarFile, root_ead_file_entry)
        root_ead = ET.fromstring(bytes(root_ead_content, 'utf-8'))
        found = [
            element.text for element in root_ead.iter(
                '{http://ead3.archivists.org/schema/}titleproper')
        ]
        overview['title'] = found[0] if found else "Unknown"
    else:
        overview['title'] = "Unknown. EAD file missing."
    # aggregate per-representation statistics from the representation METS files
    for root_fileGrp in root_mets.iter('{http://www.loc.gov/METS/}fileGrp'):
        if root_fileGrp.attrib['USE'] == 'representations':
            for root_file in root_fileGrp.iter('{http://www.loc.gov/METS/}file'):
                FLocat = root_file.find('{http://www.loc.gov/METS/}FLocat')
                rep_mets_file_entry = FLocat.get("{http://www.w3.org/1999/xlink}href")
                # hrefs are relative ("./representations/..."): drop the leading
                # dot and anchor at the root METS base directory
                rep_mets_file_entry = root_mets_file_entry_base_dir + rep_mets_file_entry.lstrip('.')
                rep_mets_content = read_textfile_from_tar(tarFile, rep_mets_file_entry)
                rep_mets = ET.fromstring(bytes(rep_mets_content, 'utf-8'))
                representation = {
                    'identifier': rep_mets.get('OBJID'),
                    'label': get_representation_label_for_id(root_mets, root_file.get('ID')),
                }
                for rep_fileGrp in rep_mets.iter('{http://www.loc.gov/METS/}fileGrp'):
                    for rep_file in rep_fileGrp.iter('{http://www.loc.gov/METS/}file'):
                        content_mime_types.append(rep_file.get('MIMETYPE'))
                        total_size += int(rep_file.get('SIZE'))
                        total_number_content_files += 1
                representations.append(representation)
    overview['representations'] = representations
    total_number_representations = len(representations)
    overview['stats'] = {
        "total_size": total_size,
        "total_number_content_files": total_number_content_files,
        "total_number_representations": total_number_representations,
        "schemas": ','.join(all_schemas),
        "content_mime_types": ", ".join(list(set(content_mime_types))),
    }
    tarFile.close()
    return overview
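
# Shape of the dict returned by get_ip_overview_context (values illustrative):
#
# {
#     'object_id': 'urn:uuid:...',
#     'title': 'Example fonds title',
#     'representations': [{'identifier': '...', 'label': '...'}, ...],
#     'stats': {
#         'total_size': 12345,
#         'total_number_content_files': 42,
#         'total_number_representations': 2,
#         'schemas': 'mets.xsd,xlink.xsd',
#         'content_mime_types': 'image/tiff, application/pdf',
#     },
# }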
def get_representation_section(div, root_mets, tarFile, root_mets_file_entry_base_dir):
    # root_mets may be the (element, base_dir) tuple returned by readRootMetsFromIP
    if isinstance(root_mets, tuple):
        e = root_mets[0]
    else:
        e = root_mets
    fileSec = next(e.iter('{http://www.loc.gov/METS/}fileSec'))
    rep_node = {}
    for fptr in div.iter('{http://www.loc.gov/METS/}fptr'):
        file_name = fileSec_get_file_for_id(fileSec, fptr.get('FILEID'))
        if file_name is not None and file_name.endswith('METS.xml'):
            rep_mets_file_entry = root_mets_file_entry_base_dir + file_name.lstrip('.')
            rep_mets_content = read_textfile_from_tar(tarFile, rep_mets_file_entry)
            rep_mets = ET.fromstring(bytes(rep_mets_content, 'utf-8'))
            rep_node = {
                "icon": "fa fa-ibox fa-fw",
                "text": rep_mets.get('OBJID'),
                "nodes": []
            }
            for rep_structMap in rep_mets.iter('{http://www.loc.gov/METS/}structMap'):
                if rep_structMap.get('TYPE') == 'PHYSICAL':
                    for rep_div in rep_structMap.find('{http://www.loc.gov/METS/}div'):
                        label = rep_div.get('LABEL')
                        if label == 'data':
                            data = get_data_section(rep_div, rep_mets,
                                                    os.path.dirname(rep_mets_file_entry))
                            rep_node['nodes'].append(data)
                            continue
                        if label == 'schemas':
                            schemas = get_schemas_section(rep_div, rep_mets,
                                                          os.path.dirname(rep_mets_file_entry))
                            rep_node['nodes'].append(schemas)
                            continue
                        if label == 'metadata':
                            metadata = get_metadata_section(rep_div, rep_mets,
                                                            os.path.dirname(rep_mets_file_entry))
                            # Read premis and create premis entry
                            for entry in metadata['nodes']:
                                if entry['text'].endswith('premis.xml'):
                                    premis_file_entry = os.path.dirname(rep_mets_file_entry) \
                                        + entry['text'].lstrip('.')
                                    rep_premis_content = read_textfile_from_tar(tarFile, premis_file_entry)
                                    rep_premis = ET.fromstring(bytes(rep_premis_content, 'utf-8'))
                                    metadata['premis'] = {
                                        'representations': getRepresentations(rep_premis),
                                        'events': getPemisEvents(rep_premis)
                                    }
                            rep_node['nodes'].append(metadata)
                            continue
    return rep_node
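
# Sketch of the fileSec_get_file_for_id helper used above (the real helper is
# assumed to live elsewhere in the project): resolve a structMap FILEID to the
# FLocat href of the matching <mets:file> element in the fileSec.
def _file_sec_get_file_for_id_sketch(file_sec, file_id):
    for mets_file in file_sec.iter('{http://www.loc.gov/METS/}file'):
        if mets_file.get('ID') == file_id:
            flocat = mets_file.find('{http://www.loc.gov/METS/}FLocat')
            if flocat is not None:
                return flocat.get('{http://www.w3.org/1999/xlink}href')
    return None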