def ead_editor(self, request, pk=None):
    ip = self.get_object()

    try:
        structure = ip.get_profile('sip').structure
    except AttributeError:
        return Response("No SIP profile for IP created yet", status=status.HTTP_400_BAD_REQUEST)

    ead_dir, ead_name = find_destination("archival_description_file", structure)
    if ead_name is None:
        return Response("No EAD file for IP found", status=status.HTTP_404_NOT_FOUND)

    xmlfile = os.path.join(ip.object_path, ead_dir, ead_name)

    if request.method == 'GET':
        try:
            with open(xmlfile) as f:
                s = f.read()
            return Response({"data": s})
        except IOError:
            # file does not exist yet, create it empty
            open(xmlfile, 'a').close()
            return Response({"data": ""})

    # non-GET: write the posted content to the EAD file
    content = request.POST.get("content", '')
    with open(xmlfile, "w") as f:
        f.write(str(content))

    return Response("Content written to %s" % xmlfile)
def get_content_type_file(self):
    ctsdir, ctsfile = find_destination('content_type_specification', self.get_structure())
    if ctsdir is None:
        return None

    return parseContent(os.path.join(ctsdir, ctsfile), fill_specification_data(ip=self))
def CompareRepresentationXMLFiles(self):
    Validation.objects.filter(task=self.get_processtask()).delete()

    ip = InformationPackage.objects.get(pk=self.ip)
    reps_path, reps_dir = find_destination("representations", ip.get_structure(), ip.object_path)
    if reps_path is None:
        return None

    representations_dir = os.path.join(reps_path, reps_dir)

    for p in find_pointers(os.path.join(ip.object_path, ip.content_mets_path)):
        rep_mets_path = p.path
        rep_mets_path = os.path.join(ip.object_path, rep_mets_path)
        rep_path = os.path.relpath(rep_mets_path, representations_dir)
        rep_path = PurePath(rep_path).parts[0]

        rep_premis_path = get_premis_ref(etree.parse(rep_mets_path)).path
        rep_premis_path = os.path.join(representations_dir, rep_path, rep_premis_path)

        validator = XMLComparisonValidator(
            context=rep_premis_path,
            options={
                'rootdir': os.path.join(representations_dir, rep_path),
                'representation': rep_path,
                'recursive': False,
            },
            task=self.get_processtask(),
            ip=self.ip,
            responsible=ip.responsible,
        )
        validator.validate(rep_mets_path)

    msg = "All XML files in the representations have the same set of files"
    self.create_success_event(msg)
def run(self):
    Validation.objects.filter(task=self.get_processtask()).delete()

    ip = InformationPackage.objects.get(pk=self.ip)
    reps_path, reps_dir = find_destination("representations", ip.get_structure(), ip.object_path)
    if reps_path is None:
        return None

    representations_dir = os.path.join(reps_path, reps_dir)

    for p in find_pointers(ip.content_mets_path):
        rep_mets_path = p.path
        rep_mets_path = os.path.join(ip.object_path, rep_mets_path)
        rep_path = os.path.relpath(rep_mets_path, representations_dir)
        rep_path = PurePath(rep_path).parts[0]

        rep_premis_path = get_premis_ref(etree.parse(rep_mets_path)).path
        rep_premis_path = os.path.join(representations_dir, rep_path, rep_premis_path)

        validator = XMLComparisonValidator(
            context=rep_premis_path,
            options={
                'rootdir': os.path.join(representations_dir, rep_path),
                'representation': rep_path,
            },
            task=self.get_processtask(),
            ip=self.ip,
            responsible=ip.responsible,
        )
        validator.validate(rep_mets_path)
def get_premis_file_path(self):
    premis_dir, premis_name = find_destination("preservation_description_file", self.get_structure())
    if premis_dir is not None:
        path = os.path.join(premis_dir, premis_name)
        path = parseContent(path, fill_specification_data(ip=self))
    else:
        path = 'metadata/premis.xml'

    return normalize_path(os.path.join(self.object_path, path))
def get_content_mets_file_path(self):
    mets_dir, mets_name = find_destination("mets_file", self.get_structure())
    if mets_dir is not None:
        path = os.path.join(mets_dir, mets_name)
        path = parseContent(path, fill_specification_data(ip=self))
    else:
        path = 'mets.xml'

    return normalize_path(os.path.join(self.object_path, path))
def run(self, verify=True):
    ip = self.get_information_package()
    ip_profile_type = ip.get_package_type_display().lower()
    ip_profile = ip.get_profile_rel(ip_profile_type).profile
    structure = ip.get_structure()
    rootdir = ip.object_path

    specifications = [ip_profile.specification, get_event_spec()]
    premis_profile_rel = ip.get_profile_rel('preservation_metadata')
    if premis_profile_rel is not None:
        specifications.append(premis_profile_rel.profile.specification)

    self.logger.debug(u'Downloading schemas')
    for spec in specifications:
        schema_preserve_loc = spec.get('-schemaPreservationLocation', 'xsd_files')

        if schema_preserve_loc and structure:
            reldir, _ = find_destination(schema_preserve_loc, structure)
            dirname = os.path.join(rootdir, reldir)
        else:
            dirname = rootdir

        for schema in spec.get('-schemasToPreserve', []):
            dst = os.path.join(dirname, os.path.basename(schema))
            self.logger.info(u'Downloading schema from {} to {}'.format(schema, dst))

            try:
                r = requests.get(schema, stream=True, verify=verify)
                r.raise_for_status()
                with open(dst, 'wb') as f:
                    for chunk in r:
                        f.write(chunk)
            except Exception:
                self.logger.exception(u'Download of schema failed: {}'.format(schema))
                try:
                    self.logger.debug(u'Deleting downloaded file if it exists: {}'.format(dst))
                    os.remove(dst)
                except OSError as e:
                    if e.errno != errno.ENOENT:
                        self.logger.exception(u'Failed to delete downloaded file: {}'.format(dst))
                        raise
                else:
                    self.logger.info(u'Deleted downloaded file: {}'.format(dst))
                raise
            else:
                self.logger.info(u'Downloaded schema to {}'.format(dst))
        else:
            self.logger.info(u'No schemas to download')
def transform(self, path):
    # move all dirs and files (except those specified in IP profile) to content
    structure = self.ip.get_structure()
    content_dir, content_name = find_destination('content', structure)
    content_path = os.path.join(self.ip.object_path, content_dir, content_name)

    reserved = [x['use'] for x in structure if 'use' in x]
    for f in os.listdir(path):
        if f not in reserved:
            shutil.move(os.path.join(path, f), content_path)
def get_events_file_path(self, from_container=False):
    if not from_container and os.path.isfile(self.object_path):
        return os.path.splitext(self.object_path)[0] + '_ipevents.xml'

    ip_profile = self.get_profile(self.get_package_type_display().lower())
    structure = ip_profile.structure
    events_dir, events_file = find_destination('events_file', structure)
    if events_dir is not None:
        full_path = os.path.join(events_dir, events_file)
        return normalize_path(parseContent(full_path, fill_specification_data(ip=self)))

    return 'ipevents.xml'
def forward(apps, schema_editor):
    InformationPackage = apps.get_model("ip", "InformationPackage")
    db_alias = schema_editor.connection.alias

    for ip in InformationPackage.objects.using(db_alias).filter(
            package_type=IP.AIP, sip_objid='').iterator():
        ip.sip_objid = ip.object_identifier_value

        if ip.state in ('Prepared', 'Receiving'):
            ip.sip_path = ip.sip_objid
        else:
            structure = get_structure(ip)
            content_dir, content_name = find_destination('content', structure)
            content_path = os.path.join(content_dir, content_name)
            ip.sip_path = normalize_path(os.path.join(content_path, ip.sip_objid))

        ip.save()
def download_schemas(ip, logger, verify):
    ip_profile_type = ip.get_package_type_display().lower()
    ip_profile = ip.get_profile_rel(ip_profile_type).profile
    structure = ip.get_structure()
    rootdir = ip.object_path

    specifications = [ip_profile.specification, get_event_spec()]
    premis_profile_rel = ip.get_profile_rel('preservation_metadata')
    if premis_profile_rel is not None:
        specifications.append(premis_profile_rel.profile.specification)

    for spec in specifications:
        schema_preserve_loc = spec.get('-schemaPreservationLocation', 'xsd_files')

        if schema_preserve_loc and structure:
            reldir, _ = find_destination(schema_preserve_loc, structure)
            dirname = os.path.join(rootdir, reldir)
        else:
            dirname = rootdir

        for schema in spec.get('-schemasToPreserve', []):
            download_schema(dirname, logger, schema, verify)
def run(self, template=None, dirname=None, structure=[], root=""): schemaPreserveLoc = template.get('-schemaPreservationLocation') if schemaPreserveLoc and structure: dirname, _ = find_destination(schemaPreserveLoc, structure) dirname = os.path.join(root, dirname) for schema in template.get('-schemasToPreserve', []): dst = os.path.join(dirname, os.path.basename(schema)) t = ProcessTask.objects.create( name="ESSArch_Core.tasks.DownloadFile", params={ 'src': schema, 'dst': dst }, processstep_id=self.step, processstep_pos=self.step_pos, responsible_id=self.responsible, information_package_id=self.ip, ) t.run().get()
def preserve_new_generation(new_ip):
    generate_premis = new_ip.profile_locked('preservation_metadata')
    has_representations = find_destination(
        "representations",
        new_ip.get_structure(),
        new_ip.object_path,
    )[1] is not None

    # remove existing premis and mets paths:
    mets_path = os.path.join(new_ip.object_path, new_ip.get_content_mets_file_path())
    try:
        os.remove(mets_path)
    except FileNotFoundError:
        pass

    events_file = os.path.join(new_ip.object_path, new_ip.get_events_file_path())
    try:
        os.remove(events_file)
    except FileNotFoundError:
        pass

    if generate_premis:
        premis_profile_data = new_ip.get_profile_data('preservation_metadata')
        data = fill_specification_data(premis_profile_data, ip=new_ip)
        premis_path = parseContent(new_ip.get_premis_file_path(), data)
        full_premis_path = os.path.join(new_ip.object_path, premis_path)
        try:
            os.remove(full_premis_path)
        except FileNotFoundError:
            pass

    workflow = [
        {
            "step": True,
            "name": "Generate AIP",
            "children": [
                {
                    "name": "ESSArch_Core.ip.tasks.DownloadSchemas",
                    "label": "Download Schemas",
                },
                {
                    "step": True,
                    "name": "Create Log File",
                    "children": [
                        {
                            "name": "ESSArch_Core.ip.tasks.GenerateEventsXML",
                            "label": "Generate events xml file",
                        },
                        {
                            "name": "ESSArch_Core.tasks.AppendEvents",
                            "label": "Add events to xml file",
                        },
                        {
                            "name": "ESSArch_Core.ip.tasks.AddPremisIPObjectElementToEventsFile",
                            "label": "Add premis IP object to xml file",
                        },
                    ]
                },
                {
                    "name": "ESSArch_Core.ip.tasks.GenerateContentMetadata",
                    "label": "Generate contentmetadata",
                },
            ]
        },
        {
            "step": True,
            "name": "Validate AIP",
            "children": [
                {
                    "name": "ESSArch_Core.tasks.ValidateXMLFile",
                    "label": "Validate content-mets",
                    "params": {
                        "xml_filename": "{{_CONTENT_METS_PATH}}",
                    }
                },
                {
                    "name": "ESSArch_Core.tasks.ValidateXMLFile",
                    "if": generate_premis,
                    "label": "Validate premis",
                    "params": {
                        "xml_filename": "{{_PREMIS_PATH}}",
                    }
                },
                {
                    "name": "ESSArch_Core.tasks.ValidateLogicalPhysicalRepresentation",
                    "label": "Diff-check against content-mets",
                    "args": ["{{_OBJPATH}}", "{{_CONTENT_METS_PATH}}"],
                },
                {
                    "name": "ESSArch_Core.tasks.CompareXMLFiles",
                    "if": generate_premis,
                    "label": "Compare premis and content-mets",
                    "args": ["{{_PREMIS_PATH}}", "{{_CONTENT_METS_PATH}}"],
                    "params": {'recursive': False},
                },
                {
                    "name": "ESSArch_Core.tasks.CompareRepresentationXMLFiles",
                    "if": has_representations and generate_premis,
                    "label": "Compare representation premis and mets",
                },
            ]
        },
        {
            "name": "ESSArch_Core.tasks.UpdateIPSizeAndCount",
            "label": "Update IP size and file count",
        },
    ]

    workflow += new_ip.create_preservation_workflow()
    workflow = create_workflow(workflow, new_ip, name='Preserve Information Package', eager=True)
    workflow.run()
def fill_specification_data(data=None, sa=None, ip=None, ignore=None):
    from ESSArch_Core.profiles.models import ProfileIP

    data = data or {}
    ignore = ignore or []
    data = LazyDict(data)

    if sa:
        data.update(_fill_sa_specification_data(sa))

    if ip:
        if not sa and ip.submission_agreement is not None:
            sa = ip.submission_agreement
            data.update(_fill_sa_specification_data(sa))

        if ip.submission_agreement_data is not None:
            for k, v in ip.submission_agreement_data.data.items():
                data['SA_{}'.format(k)] = v

        data['_OBJID'] = ip.object_identifier_value
        data['_OBJUUID'] = str(ip.pk)
        data['_OBJLABEL'] = ip.label
        data['_OBJPATH'] = ip.object_path

        try:
            structure = ip.get_structure()
            content_dir, content_name = find_destination('content', structure)
            data['_CONTENTPATH'] = PurePath(ip.object_path).joinpath(content_dir, content_name).as_posix()
        except (ProfileIP.DoesNotExist, TypeError):
            data['_CONTENTPATH'] = ip.object_path

        data['_INNER_IP_OBJID'] = ip.sip_objid
        data['_INNER_IP_PATH'] = ip.sip_path
        data['_STARTDATE'] = ip.start_date
        data['_ENDDATE'] = ip.end_date
        data['_INFORMATIONCLASS'] = ip.information_class

        if '_CTS_PATH' not in ignore:
            data['_CTS_PATH'] = (ip.get_content_type_file,)
        if '_CTS_SCHEMA_PATH' not in ignore:
            data['_CTS_SCHEMA_PATH'] = (ip.get_content_type_schema_file,)

        data['_CONTENT_METS_PATH'] = os.path.join(ip.object_path, ip.content_mets_path)
        data['_CONTENT_METS_CREATE_DATE'] = ip.content_mets_create_date
        data['_CONTENT_METS_SIZE'] = ip.content_mets_size
        data['_CONTENT_METS_DIGEST_ALGORITHM'] = ip.get_content_mets_digest_algorithm_display()
        data['_CONTENT_METS_DIGEST'] = ip.content_mets_digest

        data['_PACKAGE_METS_PATH'] = ip.package_mets_path
        data['_PACKAGE_METS_CREATE_DATE'] = ip.package_mets_create_date
        data['_PACKAGE_METS_SIZE'] = ip.package_mets_size
        data['_PACKAGE_METS_DIGEST_ALGORITHM'] = ip.get_package_mets_digest_algorithm_display()
        data['_PACKAGE_METS_DIGEST'] = ip.package_mets_digest

        data['_TEMP_CONTAINER_PATH'] = (ip.get_temp_container_path,)
        data['_TEMP_METS_PATH'] = (ip.get_temp_container_xml_path,)
        data['_TEMP_AIC_METS_PATH'] = (ip.get_temp_container_aic_xml_path,) if ip.aic else None

        if ip.get_package_type_display() in ['SIP', 'DIP', 'AIP']:
            data['_PREMIS_PATH'] = os.path.join(
                ip.object_path, ip.get_premis_file_path()) if ip.get_premis_file_path() else None

        data['allow_unknown_file_types'] = (ip.get_allow_unknown_file_types,)
        data['_IP_CONTAINER_FORMAT'] = (ip.get_container_format,)
        data['_IP_PACKAGE_TYPE'] = ip.get_package_type_display()

        if ip.policy is not None:
            data['_POLICYUUID'] = ip.policy.pk
            data['_POLICYID'] = ip.policy.policy_id
            data['_POLICYNAME'] = ip.policy.policy_name
            data['POLICY_INGEST_PATH'] = ip.policy.ingest_path.value
        else:
            try:
                transfer_project_data = ip.get_profile_data('transfer_project')
                data['_POLICYUUID'] = transfer_project_data.get('storage_policy_uuid')
                data['_POLICYID'] = transfer_project_data.get('storage_policy_id')
                data['_POLICYNAME'] = transfer_project_data.get('storage_policy_name')
            except ObjectDoesNotExist:
                pass

        data['_AGENTS'] = (_get_agents, ip,)

        profile_ids = zip(
            lowercase_profile_types,
            ["_PROFILE_" + x.upper().replace(' ', '_') + "_ID" for x in profile_types],
        )
        for (profile_type, key) in profile_ids:
            data[key] = (_get_profile_id_by_type, profile_type, ip)

    for p in Parameter.objects.iterator():
        data['_PARAMETER_%s' % p.entity.upper()] = p.value

    for p in Path.objects.iterator():
        data['_PATH_%s' % p.entity.upper()] = p.value

    return data
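# Assumed convention behind the lazy values above: several keys are stored as
# (callable, *args) tuples so that expensive lookups (profiles, agents, CTS
# paths) only run when a template actually references the key. A minimal
# LazyDict sketch under that assumption; the real ESSArch class may differ.
class LazyDict(dict):
    def __getitem__(self, key):
        value = dict.__getitem__(self, key)
        if isinstance(value, tuple) and value and callable(value[0]):
            # resolve (func, *args) on first access and cache the result
            value = value[0](*value[1:])
            dict.__setitem__(self, key, value)
        return value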
def test_find_destination(self):
    structure = [
        {
            'type': 'file',
            'name': 'mets.xml',
            'use': 'mets_file',
        },
        {
            'type': 'folder',
            'name': 'content',
            'use': 'content',
            'children': [
                {
                    'type': 'file',
                    'name': 'metadata.xml',
                    'use': 'content_type_specification',
                },
                {
                    'type': 'file',
                    'name': 'metadata.xsd',
                    'use': 'content_type_specification_schema',
                },
            ],
        },
        {
            'type': 'folder',
            'name': 'metadata',
            'use': 'metadata',
            'children': [
                {
                    'type': 'file',
                    'name': 'xsd_files',
                    'use': 'xsd_files',
                },
                {
                    'type': 'file',
                    'name': 'premis.xml',
                    'use': 'preservation_description_file',
                },
                {
                    'type': 'file',
                    'name': 'ead.xml',
                    'use': 'archival_description_file',
                },
                {
                    'type': 'file',
                    'name': 'eac.xml',
                    'use': 'authoritive_information_file',
                },
            ],
        },
    ]

    tests = (
        ('mets_file', ('', 'mets.xml')),
        ('xsd_files', ('metadata', 'xsd_files')),
        ('preservation_description_file', ('metadata', 'premis.xml')),
        ('foo', (None, None)),
    )

    for value, expected in tests:
        with self.subTest(value=value):
            self.assertEqual(find_destination(value, structure), expected)
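# Inferred helper sketch: the snippets in this file all call
# find_destination(use, structure, prefix='') and expect a (directory, name)
# tuple, with ('', name) for items at the structure root and (None, None) when
# no item carries the given 'use'. This is a minimal reconstruction consistent
# with the test above, not ESSArch's actual implementation.
def find_destination(use, structure, path=''):
    for item in structure:
        if item.get('use') == use:
            return path, item['name']

        if item.get('type') == 'folder':
            # recurse into subfolders, extending the accumulated path
            found = find_destination(use, item.get('children', []), os.path.join(path, item['name']))
            if found != (None, None):
                return found

    return None, None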
def _run(self):
    def get_information_packages():
        return self.rule.information_packages.filter(
            active=True,
        ).exclude(
            conversion_job_entries__job=self,
        )

    ips = get_information_packages()

    for ip in ips.order_by('-cached').iterator():  # convert cached IPs first
        run_cached_ip(ip)

        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value, ip.object_identifier_value)

        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        dstdir = os.path.join(policy.cache_storage.value, new_ip.object_identifier_value)
        new_ip.object_path = dstdir
        new_ip.save()

        aip_profile = new_ip.get_profile_rel('aip').profile
        aip_profile_data = new_ip.get_profile_data('aip')

        mets_dir, mets_name = find_destination("mets_file", aip_profile.structure)
        mets_path = os.path.join(srcdir, mets_dir, mets_name)

        # copy files to new generation
        shutil.copytree(srcdir, dstdir)

        # convert files specified in rule
        for pattern, spec in self.rule.specification.items():
            target = spec['target']
            tool = spec['tool']

            for path in iglob(dstdir + '/' + pattern):
                if os.path.isdir(path):
                    for root, dirs, files in walk(path):
                        rel = os.path.relpath(root, dstdir)

                        for f in files:
                            fpath = os.path.join(root, f)
                            job_entry = ConversionJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                old_document=os.path.join(rel, f),
                            )
                            convert_file(fpath, target)
                            os.remove(fpath)

                            job_entry.new_document = os.path.splitext(job_entry.old_document)[0] + '.' + target
                            job_entry.end_date = timezone.now()
                            job_entry.tool = tool
                            job_entry.save()

                elif os.path.isfile(path):
                    rel = os.path.relpath(path, dstdir)
                    job_entry = ConversionJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        old_document=rel,
                    )
                    convert_file(path, target)
                    os.remove(path)

                    job_entry.new_document = os.path.splitext(job_entry.old_document)[0] + '.' + target
                    job_entry.end_date = timezone.now()
                    job_entry.tool = tool
                    job_entry.save()

        # preserve new generation
        preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip, mets_path, new_ip, policy)
def preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip, mets_path, new_ip, policy):
    sa = new_ip.submission_agreement

    try:
        os.remove(mets_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise

    files_to_create = OrderedDict()

    try:
        premis_profile = new_ip.get_profile_rel('preservation_metadata').profile
        premis_profile_data = ip.get_profile_data('preservation_metadata')
    except ProfileIP.DoesNotExist:
        pass
    else:
        premis_dir, premis_name = find_destination("preservation_description_file", aip_profile.structure)
        premis_path = os.path.join(dstdir, premis_dir, premis_name)

        try:
            os.remove(premis_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        files_to_create[premis_path] = {
            'spec': premis_profile.specification,
            'data': fill_specification_data(premis_profile_data, ip=new_ip, sa=sa),
        }

    files_to_create[mets_path] = {
        'spec': aip_profile.specification,
        'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
    }

    t = ProcessTask.objects.create(
        name='ESSArch_Core.tasks.GenerateXML',
        params={
            'filesToCreate': files_to_create,
            'folderToParse': dstdir,
        },
        responsible=new_ip.responsible,
        information_package=new_ip,
    )
    t.run().get()

    dsttar = dstdir + '.tar'
    dstxml = dstdir + '.xml'
    objid = new_ip.object_identifier_value

    with tarfile.open(dsttar, 'w') as tar:
        for root, dirs, files in walk(dstdir):
            rel = os.path.relpath(root, dstdir)

            for d in dirs:
                src = os.path.join(root, d)
                arc = os.path.join(objid, rel, d)
                arc = os.path.normpath(arc)
                index_path(new_ip, src)
                tar.add(src, arc, recursive=False)

            for f in files:
                src = os.path.join(root, f)
                index_path(new_ip, src)
                tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

    algorithm = policy.get_checksum_algorithm_display()
    checksum = calculate_checksum(dsttar, algorithm=algorithm)

    info = fill_specification_data(new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(dsttar)).isoformat()

    aip_desc_profile = new_ip.get_profile('aip_description')
    files_to_create = {
        dstxml: {
            'spec': aip_desc_profile.specification,
            'data': info,
        }
    }

    ProcessTask.objects.create(
        name="ESSArch_Core.tasks.GenerateXML",
        params={
            "filesToCreate": files_to_create,
            "folderToParse": dsttar,
            "extra_paths_to_parse": [mets_path],
            "algorithm": algorithm,
        },
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    InformationPackage.objects.filter(pk=new_ip.pk).update(
        message_digest=checksum,
        message_digest_algorithm=policy.checksum_algorithm,
    )

    ProcessTask.objects.create(
        name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    t = ProcessTask.objects.create(
        name='workflow.tasks.StoreAIP',
        information_package=new_ip,
        responsible=new_ip.responsible,
    )
    t.run()
def _run_archive_object(self):
    def get_information_packages():
        return self.rule.information_packages.filter(
            Q(Q(appraisal_date__lte=timezone.now()) | Q(appraisal_date__isnull=True)),
            active=True,
        ).exclude(
            appraisal_job_entries__job=self,
        )

    ips = get_information_packages()
    logger.info('Running appraisal job {} on {} information packages'.format(self.pk, ips.count()))

    for ip in ips.order_by('-cached').iterator():  # run cached IPs first
        run_cached_ip(ip)

        # inactivate old generations
        InformationPackage.objects.filter(
            aic=ip.aic, generation__lte=ip.generation).update(active=False)

        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value, ip.object_identifier_value)

        if not self.rule.specification:
            # register all files
            for root, dirs, files in walk(srcdir):
                rel = os.path.relpath(root, srcdir)

                for f in files:
                    job_entry = AppraisalJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        document=os.path.join(rel, f),
                    )
                    job_entry.end_date = timezone.now()
                    job_entry.save()
        else:
            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
            dstdir = os.path.join(policy.cache_storage.value, new_ip.object_identifier_value)
            new_ip.object_path = dstdir
            new_ip.save()

            aip_profile = new_ip.get_profile_rel('aip').profile
            aip_profile_data = new_ip.get_profile_data('aip')

            mets_dir, mets_name = find_destination("mets_file", aip_profile.structure)
            mets_path = os.path.join(srcdir, mets_dir, mets_name)

            # copy files to new generation
            shutil.copytree(srcdir, dstdir)

            # delete files specified in rule
            for pattern in self.rule.specification:
                for path in iglob(dstdir + '/' + pattern):
                    if os.path.isdir(path):
                        for root, dirs, files in walk(path):
                            rel = os.path.relpath(root, dstdir)

                            for f in files:
                                fpath = os.path.join(root, f)
                                job_entry = AppraisalJobEntry.objects.create(
                                    job=self,
                                    start_date=timezone.now(),
                                    ip=ip,
                                    document=os.path.join(rel, f),
                                )
                                os.remove(fpath)
                                job_entry.end_date = timezone.now()
                                job_entry.save()

                    elif os.path.isfile(path):
                        rel = os.path.relpath(path, dstdir)
                        job_entry = AppraisalJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            document=rel,
                        )
                        os.remove(path)
                        job_entry.end_date = timezone.now()
                        job_entry.save()

            # preserve new generation
            preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip, mets_path, new_ip, policy)
def run(self, purpose=None, delete_sip=False):
    self.logger.debug('Receiving SIP')
    aip = InformationPackage.objects.get(pk=self.ip)
    algorithm = aip.get_checksum_algorithm()
    container = aip.object_path
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()

    xml = aip.package_mets_path
    aip.package_mets_create_date = timestamp_to_datetime(creation_date(xml)).isoformat()
    aip.package_mets_size = os.path.getsize(xml)
    aip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    aip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)
    aip.generation = 0

    aic = InformationPackage.objects.create(
        package_type=InformationPackage.AIC,
        responsible=aip.responsible,
        label=aip.label,
        start_date=aip.start_date,
        end_date=aip.end_date,
    )

    old_sip_path = aip.object_path
    aip.aic = aic
    aip_dir = os.path.join(aip.policy.ingest_path.value, objid)
    aip.object_path = aip_dir
    try:
        os.makedirs(aip_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    aip.save()

    dst_path, dst_name = find_destination('sip', aip.get_profile('aip').structure, aip.object_path)
    if dst_path is None:
        dst_path, dst_name = find_destination('content', aip.get_profile('aip').structure, aip.object_path)

    dst_name, = self.parse_params(dst_name)
    dst = os.path.join(dst_path, dst_name)

    sip_profile = aip.submission_agreement.profile_sip

    try:
        shutil.rmtree(dst)
    except FileNotFoundError:
        pass

    if aip.policy.receive_extract_sip:
        temp = Path.objects.cached('entity', 'temp', 'value')
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            self.logger.debug('Extracting {} to {}'.format(container, tmpdir))
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError('Invalid container type: {}'.format(container))

            dst = os.path.join(dst, '')
            try:
                os.makedirs(dst)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            self.logger.debug('Moving content of {} to {}'.format(tmpsrc, dst))
            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), dst)

            self.logger.debug('Deleting {}'.format(tmpdir))

        aip.sip_path = os.path.relpath(dst, aip.object_path)
    else:
        self.logger.debug('Copying {} to {}'.format(container, dst))
        shutil.copy2(container, dst)
        aip.sip_path = os.path.relpath(os.path.join(dst, os.path.basename(container)), aip.object_path)

    sip_mets_dir, sip_mets_file = find_destination('mets_file', sip_profile.structure, aip.sip_path)
    if os.path.isfile(aip.sip_path):
        sip_mets_data = parse_mets(
            open_file(
                os.path.join(aip.object_path, sip_mets_dir, sip_mets_file),
                container=aip.sip_path,
                container_prefix=aip.object_identifier_value,
            )
        )
    else:
        sip_mets_data = parse_mets(
            open_file(os.path.join(aip.object_path, sip_mets_dir, sip_mets_file))
        )

    # prefix all SIP data
    sip_mets_data = {f'SIP_{k.upper()}': v for k, v in sip_mets_data.items()}

    aip_profile_rel_data = aip.get_profile_rel('aip').data
    aip_profile_rel_data.data.update(sip_mets_data)
    aip_profile_rel_data.save()

    if delete_sip:
        delete_path(old_sip_path)
        delete_path(pathlib.Path(old_sip_path).with_suffix('.xml'))

    self.logger.debug('sip_path set to {}'.format(aip.sip_path))
    aip.save()
def ReceiveSIP(self, purpose=None, delete_sip=False):
    logger = logging.getLogger('essarch.workflow.tasks.ReceiveSIP')
    logger.debug('Receiving SIP')
    ip = self.get_information_package()
    algorithm = ip.get_checksum_algorithm()
    container = ip.object_path
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()

    xml = ip.package_mets_path
    ip.package_mets_create_date = timestamp_to_datetime(creation_date(xml)).isoformat()
    ip.package_mets_size = os.path.getsize(xml)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    ip.object_path = os.path.join(ip.policy.ingest_path.value, ip.object_identifier_value)
    ip.save()

    sip_dst_path, sip_dst_name = find_destination('sip', ip.get_structure(), ip.object_path)
    if sip_dst_path is None:
        sip_dst_path, sip_dst_name = find_destination('content', ip.get_structure(), ip.object_path)

    sip_dst_name, = self.parse_params(sip_dst_name)
    sip_dst = os.path.join(sip_dst_path, sip_dst_name)

    if ip.policy.receive_extract_sip:
        # remove any existing directory from previous attempts
        delete_path(sip_dst)

        temp = Path.objects.get(entity='temp').value
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            logger.debug('Extracting {} to {}'.format(container, tmpdir))
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError('Invalid container type: {}'.format(container))

            sip_dst = os.path.join(sip_dst, '')
            os.makedirs(sip_dst)

            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            logger.debug('Moving content of {} to {}'.format(tmpsrc, sip_dst))
            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), sip_dst)

            logger.debug('Deleting {}'.format(tmpdir))
    else:
        logger.debug('Copying {} to {}'.format(container, sip_dst))
        shutil.copy2(container, sip_dst)

    ip.sip_path = os.path.relpath(sip_dst, ip.object_path)
    ip.save()
    self.create_success_event("Received SIP")
    return sip_dst
def _run(self):
    def get_information_packages(job):
        return self.rule.information_packages.filter(
            active=True,
        ).exclude(
            conversion_job_entries__job=self,
        )

    ips = get_information_packages(self)

    for ip in ips.order_by('-cached').iterator():  # convert cached IPs first
        while not ip.cached:
            with allow_join_result():
                t, created = ProcessTask.objects.get_or_create(
                    name='workflow.tasks.CacheAIP',
                    information_package=ip,
                    defaults={'responsible': ip.responsible, 'eager': False},
                )

                if not created:
                    t.run()

            time.sleep(10)
            ip.refresh_from_db()

        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value, ip.object_identifier_value)

        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        dstdir = os.path.join(policy.cache_storage.value, new_ip.object_identifier_value)
        new_ip.object_path = dstdir
        new_ip.save()

        aip_profile = new_ip.get_profile_rel('aip').profile
        aip_profile_data = new_ip.get_profile_data('aip')

        mets_dir, mets_name = find_destination("mets_file", aip_profile.structure)
        mets_path = os.path.join(srcdir, mets_dir, mets_name)
        mets_tree = etree.parse(mets_path)

        # copy files to new generation
        shutil.copytree(srcdir, dstdir)

        # convert files specified in rule
        for pattern, spec in six.iteritems(self.rule.specification):
            target = spec['target']
            tool = spec['tool']

            for path in iglob(dstdir + '/' + pattern):
                if os.path.isdir(path):
                    for root, dirs, files in walk(path):
                        rel = os.path.relpath(root, dstdir)

                        for f in files:
                            fpath = os.path.join(root, f)
                            job_entry = ConversionJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                old_document=os.path.join(rel, f),
                            )
                            convert_file(fpath, target)
                            os.remove(fpath)

                            job_entry.new_document = os.path.splitext(job_entry.old_document)[0] + '.' + target
                            job_entry.end_date = timezone.now()
                            job_entry.tool = tool
                            job_entry.save()

                elif os.path.isfile(path):
                    rel = os.path.relpath(path, dstdir)
                    job_entry = ConversionJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        old_document=rel,
                    )
                    convert_file(path, target)
                    os.remove(path)

                    job_entry.new_document = os.path.splitext(job_entry.old_document)[0] + '.' + target
                    job_entry.end_date = timezone.now()
                    job_entry.tool = tool
                    job_entry.save()

        # preserve new generation
        sa = new_ip.submission_agreement

        try:
            os.remove(mets_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        filesToCreate = OrderedDict()

        try:
            premis_profile = new_ip.get_profile_rel('preservation_metadata').profile
            premis_profile_data = ip.get_profile_data('preservation_metadata')
        except ProfileIP.DoesNotExist:
            pass
        else:
            premis_dir, premis_name = find_destination("preservation_description_file", aip_profile.structure)
            premis_path = os.path.join(dstdir, premis_dir, premis_name)

            try:
                os.remove(premis_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

            filesToCreate[premis_path] = {
                'spec': premis_profile.specification,
                'data': fill_specification_data(premis_profile_data, ip=new_ip, sa=sa),
            }

        filesToCreate[mets_path] = {
            'spec': aip_profile.specification,
            'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
        }

        t = ProcessTask.objects.create(
            name='ESSArch_Core.tasks.GenerateXML',
            params={
                'filesToCreate': filesToCreate,
                'folderToParse': dstdir,
            },
            responsible=new_ip.responsible,
            information_package=new_ip,
        )
        t.run().get()

        dsttar = dstdir + '.tar'
        dstxml = dstdir + '.xml'
        objid = new_ip.object_identifier_value

        with tarfile.open(dsttar, 'w') as tar:
            for root, dirs, files in walk(dstdir):
                rel = os.path.relpath(root, dstdir)

                for d in dirs:
                    src = os.path.join(root, d)
                    arc = os.path.join(objid, rel, d)
                    arc = os.path.normpath(arc)
                    index_path(new_ip, src)
                    tar.add(src, arc, recursive=False)

                for f in files:
                    src = os.path.join(root, f)
                    index_path(new_ip, src)
                    tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

        algorithm = policy.get_checksum_algorithm_display()
        checksum = calculate_checksum(dsttar, algorithm=algorithm)

        info = fill_specification_data(new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
        info["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(dsttar)).isoformat()

        aip_desc_profile = new_ip.get_profile('aip_description')
        filesToCreate = {
            dstxml: {
                'spec': aip_desc_profile.specification,
                'data': info,
            }
        }

        ProcessTask.objects.create(
            name="ESSArch_Core.tasks.GenerateXML",
            params={
                "filesToCreate": filesToCreate,
                "folderToParse": dsttar,
                "extra_paths_to_parse": [mets_path],
                "algorithm": algorithm,
            },
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()

        InformationPackage.objects.filter(pk=new_ip.pk).update(
            message_digest=checksum,
            message_digest_algorithm=policy.checksum_algorithm,
        )

        ProcessTask.objects.create(
            name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()

        t = ProcessTask.objects.create(
            name='workflow.tasks.StoreAIP',
            information_package=new_ip,
            responsible=new_ip.responsible,
        )
        t.run()
def fill_specification_data(data=None, sa=None, ip=None):
    data = data or {}

    if sa:
        data['_SA_ID'] = str(sa.pk)
        data['_SA_NAME'] = sa.name

    if ip:
        if not sa and ip.submission_agreement is not None:
            sa = ip.submission_agreement
            data['_SA_ID'] = str(sa.pk)
            data['_SA_NAME'] = sa.name

        data['_OBJID'] = ip.object_identifier_value
        data['_OBJUUID'] = str(ip.pk)
        data['_OBJLABEL'] = ip.label
        data['_OBJPATH'] = ip.object_path
        data['_INNER_IP_OBJID'] = ip.sip_objid
        data['_INNER_IP_PATH'] = ip.sip_path
        data['_STARTDATE'] = ip.start_date
        data['_ENDDATE'] = ip.end_date
        data['_INFORMATIONCLASS'] = ip.information_class

        data['_CONTENT_METS_PATH'] = ip.content_mets_path
        data['_CONTENT_METS_CREATE_DATE'] = ip.content_mets_create_date
        data['_CONTENT_METS_SIZE'] = ip.content_mets_size
        data['_CONTENT_METS_DIGEST_ALGORITHM'] = ip.get_content_mets_digest_algorithm_display()
        data['_CONTENT_METS_DIGEST'] = ip.content_mets_digest

        data['_PACKAGE_METS_PATH'] = ip.package_mets_path
        data['_PACKAGE_METS_CREATE_DATE'] = ip.package_mets_create_date
        data['_PACKAGE_METS_SIZE'] = ip.package_mets_size
        data['_PACKAGE_METS_DIGEST_ALGORITHM'] = ip.get_package_mets_digest_algorithm_display()
        data['_PACKAGE_METS_DIGEST'] = ip.package_mets_digest

        if ip.get_package_type_display() in ['SIP', 'AIP']:
            ip_profile = ip.get_profile(ip.get_package_type_display().lower())
            if ip_profile is not None:
                premis_dir, premis_file = find_destination(
                    "preservation_description_file", ip_profile.structure)
                if premis_dir is not None and premis_file is not None:
                    data['_PREMIS_PATH'] = os.path.join(ip.object_path, premis_dir, premis_file)

        data['allow_unknown_file_types'] = ip.get_profile_data(
            ip.get_package_type_display().lower()).get('allow_unknown_file_types', False)

        try:
            # do we have a transfer project profile?
            ip.get_profile('transfer_project')
        except AttributeError:
            container = 'TAR'
        else:
            container = ip.get_container_format()
        data['_IP_CONTAINER_FORMAT'] = container.upper()

        if ip.policy is not None:
            data['_POLICYUUID'] = ip.policy.pk
            data['_POLICYID'] = ip.policy.policy_id
            data['_POLICYNAME'] = ip.policy.policy_name
            data['POLICY_INGEST_PATH'] = ip.policy.ingest_path.value
        else:
            try:
                # do we have a transfer project profile?
                ip.get_profile('transfer_project')
            except AttributeError:
                pass
            else:
                transfer_project_data = ip.get_profile_data('transfer_project')
                data['_POLICYUUID'] = transfer_project_data.get('archive_policy_uuid')
                data['_POLICYID'] = transfer_project_data.get('archive_policy_id')
                data['_POLICYNAME'] = transfer_project_data.get('archive_policy_name')

        data['_AGENTS'] = {}
        for a in ip.agents.all():
            agent = {
                '_AGENTS_NAME': a.name,
                '_AGENTS_NOTES': [{'_AGENTS_NOTE': n.note} for n in a.notes.all()],
            }

            if a.other_role:
                agent['_AGENTS_ROLE'] = 'OTHER'
                agent['_AGENTS_OTHERROLE'] = a.role
            else:
                agent['_AGENTS_ROLE'] = a.role

            if a.other_type:
                agent['_AGENTS_TYPE'] = 'OTHER'
                agent['_AGENTS_OTHERTYPE'] = a.type
            else:
                agent['_AGENTS_TYPE'] = a.type

            agent_key = '{role}_{type}'.format(role=a.role.upper(), type=a.type.upper())
            data['_AGENTS'][agent_key] = agent

    profile_ids = zip(
        [x.lower().replace(' ', '_') for x in profile_types],
        ["_PROFILE_" + x.upper().replace(' ', '_') + "_ID" for x in profile_types],
    )
    for (profile_type, key) in profile_ids:
        try:
            data[key] = str(ip.get_profile(profile_type).pk)
        except AttributeError:
            pass

    for p in Parameter.objects.iterator():
        data['_PARAMETER_%s' % p.entity.upper()] = p.value

    for p in Path.objects.iterator():
        data['_PATH_%s' % p.entity.upper()] = p.value

    without_underscores = _remove_leading_underscores(data)
    data.update(without_underscores)

    return data