def parse_file(filepath, fid, relpath=None, algorithm='SHA-256', rootdir='', provided_data=None):
    if not relpath:
        relpath = filepath

    if provided_data is None:
        provided_data = {}

    relpath = win_to_posix(relpath)

    fileinfo = {
        'FName': os.path.basename(relpath),
        'FExtension': os.path.splitext(relpath)[1][1:],
        'FDir': rootdir,
        'FParentDir': os.path.basename(os.path.dirname(filepath)),
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': fid.get_mimetype(filepath),
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'ESSArch',
        'FIDType': 'UUID',
    }

    # We only do heavy computations if their values aren't included in
    # provided_data
    if 'FCreated' not in provided_data:
        timestamp = creation_date(filepath)
        createdate = timestamp_to_datetime(timestamp)
        fileinfo['FCreated'] = createdate.isoformat()

    if 'FChecksum' not in provided_data:
        fileinfo['FChecksum'] = checksum.calculate_checksum(filepath, algorithm)

    if 'FEncrypted' not in provided_data:
        fileinfo['FEncrypted'] = fid.identify_file_encryption(filepath)

    if any(x not in provided_data for x in ['FFormatName', 'FFormatVersion', 'FFormatRegistryKey']):
        (format_name, format_version, format_registry_key) = fid.identify_file_format(filepath)
        fileinfo['FFormatName'] = format_name
        fileinfo['FFormatVersion'] = format_version
        fileinfo['FFormatRegistryKey'] = format_registry_key

    for key, value in provided_data.items():
        fileinfo[key] = value

    return fileinfo
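# Usage sketch (not from the source): passing precomputed values in
# provided_data lets parse_file skip the expensive hashing and encryption
# checks. The FormatIdentifier instance, paths, and digest below are
# assumptions for illustration only.
fid = FormatIdentifier()
fileinfo = parse_file(
    '/data/ip1/content/report.pdf',
    fid,
    relpath='content/report.pdf',
    rootdir='ip1',
    provided_data={
        'FChecksum': 'precomputed-digest',  # skips calculate_checksum
        'FEncrypted': False,                # skips identify_file_encryption
    },
)
assert fileinfo['FName'] == 'report.pdf'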
def run(self):
    ip = self.get_information_package()
    mets_path = ip.get_content_mets_file_path()

    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)

    files_to_create = {
        mets_path: {
            'spec': profile_rel.profile.specification,
            'data': fill_specification_data(profile_data, ip=ip),
        }
    }

    algorithm = ip.get_checksum_algorithm()
    generator = XMLGenerator()
    generator.generate(files_to_create, folderToParse=ip.object_path, algorithm=algorithm)

    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(creation_date(mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(mets_path, algorithm=algorithm)
    ip.save()
def generate_content_mets(ip):
    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)

    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)

    files_to_create = {
        full_mets_path: {
            'spec': profile_rel.profile.specification,
            'data': fill_specification_data(profile_data, ip=ip),
        }
    }

    algorithm = ip.get_checksum_algorithm()
    allow_unknown_file_types = ip.get_allow_unknown_file_types()
    allow_encrypted_files = ip.get_allow_encrypted_files()
    generator = XMLGenerator(
        allow_unknown_file_types=allow_unknown_file_types,
        allow_encrypted_files=allow_encrypted_files,
    )
    generator.generate(files_to_create, folderToParse=ip.object_path, algorithm=algorithm)

    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path, algorithm=algorithm)
    ip.save()
def generate_package_mets(ip):
    sa = ip.submission_agreement

    if ip.package_type == InformationPackage.SIP:
        profile_type = 'submit_description'
    elif ip.package_type == InformationPackage.AIP:
        profile_type = 'aip_description'
    else:
        raise ValueError(
            'Cannot create package mets for IP of type {package_type}'.format(
                package_type=ip.package_type
            )
        )

    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)
    xmlpath = os.path.splitext(ip.object_path)[0] + '.xml'

    data = fill_specification_data(profile_data, ip=ip, sa=sa)
    data["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(ip.object_path)).isoformat()

    files_to_create = {
        xmlpath: {
            'spec': profile_rel.profile.specification,
            'data': data,
        }
    }

    algorithm = ip.get_checksum_algorithm()
    generator = XMLGenerator()
    generator.generate(files_to_create, folderToParse=ip.object_path, algorithm=algorithm)

    ip.package_mets_path = normalize_path(xmlpath)
    ip.package_mets_create_date = timestamp_to_datetime(creation_date(xmlpath)).isoformat()
    ip.package_mets_size = os.path.getsize(xmlpath)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xmlpath, algorithm=algorithm)
    ip.save()
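# Illustrative sketch of the xmlpath derivation above, with an assumed
# container path: the package METS is written next to the container,
# swapping the container extension for '.xml'.
import os

object_path = '/data/uuid-1234.tar'  # hypothetical ip.object_path
xmlpath = os.path.splitext(object_path)[0] + '.xml'
assert xmlpath == '/data/uuid-1234.xml'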
def copy_file_remotely(src, dst, requests_session, block_size=DEFAULT_BLOCK_SIZE):
    fsize = os.stat(src).st_size
    idx = 0

    time_start = time.time()

    upload_id = copy_chunk_remotely(src, dst, idx * block_size,
                                    requests_session=requests_session,
                                    file_size=fsize, block_size=block_size)
    idx += 1

    while idx * block_size <= fsize:
        copy_chunk_remotely(src, dst, idx * block_size,
                            requests_session=requests_session,
                            file_size=fsize, block_size=block_size,
                            upload_id=upload_id)
        idx += 1

    md5 = calculate_checksum(src, algorithm='MD5', block_size=block_size)

    completion_url = dst.rstrip('/') + '_complete/'

    m = MultipartEncoder(
        fields={
            'path': os.path.basename(src),
            'upload_id': upload_id,
            'md5': md5,
            'dst': requests_session.params.get('dst'),
        }
    )
    headers = {'Content-Type': m.content_type}

    _send_completion_request(requests_session, completion_url, m, headers)

    time_end = time.time()
    time_elapsed = time_end - time_start
    fsize_mb = fsize / MB
    try:
        mb_per_sec = fsize_mb / time_elapsed
    except ZeroDivisionError:
        mb_per_sec = fsize_mb

    logger.info('Copied {} ({} MB) to {} at {} MB/Sec ({} sec)'.format(
        src, fsize_mb, dst, mb_per_sec, time_elapsed))
def copy_file_remotely(src, dst, requests_session, block_size=DEFAULT_BLOCK_SIZE):
    fsize = os.stat(src).st_size
    idx = 0

    time_start = time.time()

    upload_id = copy_chunk_remotely(src, dst, idx * block_size,
                                    requests_session=requests_session,
                                    file_size=fsize, block_size=block_size)
    idx += 1

    while idx * block_size <= fsize:
        copy_chunk_remotely(src, dst, idx * block_size,
                            requests_session=requests_session,
                            file_size=fsize, block_size=block_size,
                            upload_id=upload_id)
        idx += 1

    md5 = calculate_checksum(src, algorithm='MD5', block_size=block_size)

    completion_url = dst.rstrip('/') + '_complete/'

    m = MultipartEncoder(
        fields={
            'path': os.path.basename(src),
            'upload_id': upload_id,
            'md5': md5,
            'dst': requests_session.params.get('dst'),
        }
    )
    headers = {'Content-Type': m.content_type}

    @retry(retry=retry_if_exception_type(RequestException), reraise=True,
           stop=stop_after_attempt(5), wait=wait_fixed(60),
           before_sleep=before_sleep_log(logger, logging.DEBUG))
    def send_completion_request():
        response = requests_session.post(completion_url, data=m, headers=headers, timeout=60)
        response.raise_for_status()

    send_completion_request()

    time_end = time.time()
    time_elapsed = time_end - time_start
    fsize_mb = fsize / MB
    try:
        mb_per_sec = fsize_mb / time_elapsed
    except ZeroDivisionError:
        mb_per_sec = fsize_mb

    logger.info('Copied {} ({} MB) to {} at {} MB/Sec ({} sec)'.format(
        src, fsize_mb, dst, mb_per_sec, time_elapsed))
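# Hedged usage sketch: the endpoint URL, credentials, and the 'dst'
# session param are assumptions, not from the source. The function reads
# requests_session.params.get('dst') when building the completion
# request, so params must be set on the session beforehand.
import requests

session = requests.Session()
session.auth = ('user', 'password')   # hypothetical credentials
session.params = {'dst': 'receipts'}  # forwarded in the completion request
copy_file_remotely('/data/ip1.tar', 'https://remote.example/api/files/', session)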
def generate_content_metadata(ip):
    files_to_create = {}

    generate_premis = ip.profile_locked('preservation_metadata')
    if generate_premis:
        premis_profile_type = 'preservation_metadata'
        premis_profile_rel = ip.get_profile_rel(premis_profile_type)
        premis_profile_data = ip.get_profile_data(premis_profile_type)
        data = fill_specification_data(premis_profile_data, ip=ip)
        premis_path = parseContent(ip.get_premis_file_path(), data)
        full_premis_path = os.path.join(ip.object_path, premis_path)
        files_to_create[full_premis_path] = {
            'spec': premis_profile_rel.profile.specification,
            'data': data,
        }

    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)

    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)
    files_to_create[full_mets_path] = {
        'spec': profile_rel.profile.specification,
        'data': fill_specification_data(profile_data, ip=ip),
    }

    parsed_files = profile_rel.data.parsed_files
    extra_paths_to_parse = profile_rel.data.extra_paths_to_parse

    algorithm = ip.get_checksum_algorithm()
    allow_unknown_file_types = ip.get_allow_unknown_file_types()
    allow_encrypted_files = ip.get_allow_encrypted_files()
    generator = XMLGenerator(
        allow_unknown_file_types=allow_unknown_file_types,
        allow_encrypted_files=allow_encrypted_files,
    )
    generator.generate(files_to_create, folderToParse=ip.object_path, algorithm=algorithm,
                       parsed_files=parsed_files, extra_paths_to_parse=extra_paths_to_parse)

    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path, algorithm=algorithm)
    ip.save()
def validate(self, filepath, expected=None):
    logger.debug('Validating checksum of %s' % filepath)
    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        }
    )

    expected = self.options['expected'].format(**self.data)

    if self.context == 'checksum_str':
        checksum = expected.lower()
    elif self.context == 'checksum_file':
        with open(expected, 'r') as checksum_file:
            checksum = checksum_file.read().strip()
    elif self.context == 'xml_file':
        xml_el, _ = find_file(filepath, xmlfile=expected)
        checksum = xml_el.checksum

    passed = False
    try:
        actual_checksum = calculate_checksum(filepath, algorithm=self.algorithm, block_size=self.block_size)
        if actual_checksum != checksum:
            raise ValidationError("checksum for %s is not valid (%s != %s)" % (
                filepath, checksum, actual_checksum
            ))
        passed = True
    except Exception:
        val_obj.message = traceback.format_exc()
        raise
    else:
        message = 'Successfully validated checksum of %s' % filepath
        val_obj.message = message
        logger.info(message)
    finally:
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])
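# Summary of the three 'context' modes the branches above accept; the
# option shapes are read off the code, everything else about validator
# construction is out of scope here:
#   checksum_str  - options['expected'] is the digest string itself
#   checksum_file - options['expected'] is a path to a file holding the digest
#   xml_file      - options['expected'] is an XML (e.g. METS) file in which
#                   find_file() locates the entry carrying the digest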
def copy_file_remotely(src, dst, requests_session=None, block_size=65536):
    file_size = os.stat(src).st_size
    idx = 0

    upload_id = copy_chunk(src, dst, idx * block_size, file_size,
                           requests_session=requests_session, block_size=block_size)
    idx += 1

    while idx * block_size <= file_size:
        copy_chunk(src, dst, idx * block_size, requests_session=requests_session,
                   file_size=file_size, block_size=block_size, upload_id=upload_id)
        idx += 1

    md5 = calculate_checksum(src, algorithm='MD5', block_size=block_size)

    completion_url = dst.rstrip('/') + '_complete/'

    m = MultipartEncoder(fields={
        'path': os.path.basename(src),
        'upload_id': upload_id,
        'md5': md5,
    })
    headers = {'Content-Type': m.content_type}

    @retry(stop_max_attempt_number=5, wait_fixed=60000)
    def send_completion_request():
        response = requests_session.post(completion_url, data=m, headers=headers)
        response.raise_for_status()

    send_completion_request()
def preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip, mets_path, new_ip, policy):
    sa = new_ip.submission_agreement

    try:
        os.remove(mets_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise

    files_to_create = OrderedDict()

    try:
        premis_profile = new_ip.get_profile_rel('preservation_metadata').profile
        premis_profile_data = ip.get_profile_data('preservation_metadata')
    except ProfileIP.DoesNotExist:
        pass
    else:
        premis_dir, premis_name = find_destination("preservation_description_file", aip_profile.structure)
        premis_path = os.path.join(dstdir, premis_dir, premis_name)

        try:
            os.remove(premis_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        files_to_create[premis_path] = {
            'spec': premis_profile.specification,
            'data': fill_specification_data(premis_profile_data, ip=new_ip, sa=sa),
        }

    files_to_create[mets_path] = {
        'spec': aip_profile.specification,
        'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
    }

    t = ProcessTask.objects.create(
        name='ESSArch_Core.tasks.GenerateXML',
        params={
            'filesToCreate': files_to_create,
            'folderToParse': dstdir,
        },
        responsible=new_ip.responsible,
        information_package=new_ip,
    )
    t.run().get()

    dsttar = dstdir + '.tar'
    dstxml = dstdir + '.xml'

    objid = new_ip.object_identifier_value

    with tarfile.open(dsttar, 'w') as tar:
        for root, dirs, files in walk(dstdir):
            rel = os.path.relpath(root, dstdir)

            for d in dirs:
                src = os.path.join(root, d)
                arc = os.path.join(objid, rel, d)
                arc = os.path.normpath(arc)
                index_path(new_ip, src)
                tar.add(src, arc, recursive=False)

            for f in files:
                src = os.path.join(root, f)
                index_path(new_ip, src)
                tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

    algorithm = policy.get_checksum_algorithm_display()
    checksum = calculate_checksum(dsttar, algorithm=algorithm)

    info = fill_specification_data(new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(dsttar)).isoformat()

    aip_desc_profile = new_ip.get_profile('aip_description')
    files_to_create = {
        dstxml: {
            'spec': aip_desc_profile.specification,
            'data': info,
        }
    }

    ProcessTask.objects.create(
        name="ESSArch_Core.tasks.GenerateXML",
        params={
            "filesToCreate": files_to_create,
            "folderToParse": dsttar,
            "extra_paths_to_parse": [mets_path],
            "algorithm": algorithm,
        },
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    InformationPackage.objects.filter(pk=new_ip.pk).update(
        message_digest=checksum,
        message_digest_algorithm=policy.checksum_algorithm,
    )

    ProcessTask.objects.create(
        name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    t = ProcessTask.objects.create(
        name='workflow.tasks.StoreAIP',
        information_package=new_ip,
        responsible=new_ip.responsible,
    )
    t.run()
def _get_checksum(self, input_file, relpath=None):
    path = relpath or input_file
    algorithm = self.checksum_algorithms.get(path) or self.default_algorithm
    return calculate_checksum(input_file, algorithm=algorithm)
def _get_checksum(self, input_file):
    algorithm = self.checksum_algorithms.get(input_file, self.default_algorithm)
    return calculate_checksum(input_file, algorithm=algorithm)
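# The two _get_checksum variants above fall back differently when the
# stored value is falsy; the values here are illustrative, the semantics
# are plain Python:
algorithms = {'a.bin': ''}  # falsy algorithm stored for this path
assert (algorithms.get('a.bin') or 'SHA-256') == 'SHA-256'    # `or` replaces falsy values
assert algorithms.get('a.bin', 'SHA-256') == ''               # dict default keeps them
assert algorithms.get('missing.bin', 'SHA-256') == 'SHA-256'  # both handle absent keys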
def run(self, purpose=None, delete_sip=False):
    self.logger.debug('Receiving SIP')
    aip = InformationPackage.objects.get(pk=self.ip)
    algorithm = aip.get_checksum_algorithm()
    container = aip.object_path
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()

    xml = aip.package_mets_path
    aip.package_mets_create_date = timestamp_to_datetime(creation_date(xml)).isoformat()
    aip.package_mets_size = os.path.getsize(xml)
    aip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    aip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    aip.generation = 0
    aic = InformationPackage.objects.create(
        package_type=InformationPackage.AIC,
        responsible=aip.responsible,
        label=aip.label,
        start_date=aip.start_date,
        end_date=aip.end_date,
    )
    old_sip_path = aip.object_path
    aip.aic = aic
    aip_dir = os.path.join(aip.policy.ingest_path.value, objid)
    aip.object_path = aip_dir
    try:
        os.makedirs(aip_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    aip.save()

    dst_path, dst_name = find_destination('sip', aip.get_profile('aip').structure, aip.object_path)
    if dst_path is None:
        dst_path, dst_name = find_destination('content', aip.get_profile('aip').structure, aip.object_path)

    dst_name, = self.parse_params(dst_name)
    dst = os.path.join(dst_path, dst_name)

    sip_profile = aip.submission_agreement.profile_sip

    try:
        shutil.rmtree(dst)
    except FileNotFoundError:
        pass

    if aip.policy.receive_extract_sip:
        temp = Path.objects.cached('entity', 'temp', 'value')
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            self.logger.debug('Extracting {} to {}'.format(container, tmpdir))
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError('Invalid container type: {}'.format(container))

            dst = os.path.join(dst, '')
            try:
                os.makedirs(dst)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            self.logger.debug('Moving content of {} to {}'.format(tmpsrc, dst))
            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), dst)
            self.logger.debug('Deleting {}'.format(tmpdir))

        aip.sip_path = os.path.relpath(dst, aip.object_path)
    else:
        self.logger.debug('Copying {} to {}'.format(container, dst))
        shutil.copy2(container, dst)
        aip.sip_path = os.path.relpath(os.path.join(dst, os.path.basename(container)), aip.object_path)

    sip_mets_dir, sip_mets_file = find_destination('mets_file', sip_profile.structure, aip.sip_path)
    if os.path.isfile(aip.sip_path):
        sip_mets_data = parse_mets(
            open_file(
                os.path.join(aip.object_path, sip_mets_dir, sip_mets_file),
                container=aip.sip_path,
                container_prefix=aip.object_identifier_value,
            )
        )
    else:
        sip_mets_data = parse_mets(
            open_file(os.path.join(aip.object_path, sip_mets_dir, sip_mets_file))
        )

    # prefix all SIP data
    sip_mets_data = {f'SIP_{k.upper()}': v for k, v in sip_mets_data.items()}

    aip_profile_rel_data = aip.get_profile_rel('aip').data
    aip_profile_rel_data.data.update(sip_mets_data)
    aip_profile_rel_data.save()

    if delete_sip:
        delete_path(old_sip_path)
        delete_path(pathlib.Path(old_sip_path).with_suffix('.xml'))

    self.logger.debug('sip_path set to {}'.format(aip.sip_path))
    aip.save()
def ReceiveSIP(self, purpose=None, delete_sip=False):
    logger = logging.getLogger('essarch.workflow.tasks.ReceiveSIP')
    logger.debug('Receiving SIP')
    ip = self.get_information_package()
    algorithm = ip.get_checksum_algorithm()
    container = ip.object_path
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()

    xml = ip.package_mets_path
    ip.package_mets_create_date = timestamp_to_datetime(creation_date(xml)).isoformat()
    ip.package_mets_size = os.path.getsize(xml)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    ip.object_path = os.path.join(ip.policy.ingest_path.value, ip.object_identifier_value)
    ip.save()

    sip_dst_path, sip_dst_name = find_destination('sip', ip.get_structure(), ip.object_path)
    if sip_dst_path is None:
        sip_dst_path, sip_dst_name = find_destination('content', ip.get_structure(), ip.object_path)

    sip_dst_name, = self.parse_params(sip_dst_name)
    sip_dst = os.path.join(sip_dst_path, sip_dst_name)

    if ip.policy.receive_extract_sip:
        # remove any existing directory from previous attempts
        delete_path(sip_dst)

        temp = Path.objects.get(entity='temp').value
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            logger.debug('Extracting {} to {}'.format(container, tmpdir))
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError('Invalid container type: {}'.format(container))

            sip_dst = os.path.join(sip_dst, '')
            os.makedirs(sip_dst)

            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            logger.debug('Moving content of {} to {}'.format(tmpsrc, sip_dst))
            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), sip_dst)
            logger.debug('Deleting {}'.format(tmpdir))
    else:
        logger.debug('Copying {} to {}'.format(container, sip_dst))
        shutil.copy2(container, sip_dst)

    ip.sip_path = os.path.relpath(sip_dst, ip.object_path)
    ip.save()
    self.create_success_event("Received SIP")
    return sip_dst
def _run(self):
    def get_information_packages(job):
        return self.rule.information_packages.filter(
            active=True,
        ).exclude(
            conversion_job_entries__job=self,
        )

    ips = get_information_packages(self)

    for ip in ips.order_by('-cached').iterator():  # convert cached IPs first
        while not ip.cached:
            with allow_join_result():
                t, created = ProcessTask.objects.get_or_create(
                    name='workflow.tasks.CacheAIP',
                    information_package=ip,
                    defaults={'responsible': ip.responsible, 'eager': False},
                )

                if not created:
                    t.run()

            time.sleep(10)
            ip.refresh_from_db()

        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value, ip.object_identifier_value)

        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)

        dstdir = os.path.join(policy.cache_storage.value, new_ip.object_identifier_value)

        new_ip.object_path = dstdir
        new_ip.save()

        aip_profile = new_ip.get_profile_rel('aip').profile
        aip_profile_data = new_ip.get_profile_data('aip')

        mets_dir, mets_name = find_destination("mets_file", aip_profile.structure)
        mets_path = os.path.join(srcdir, mets_dir, mets_name)
        mets_tree = etree.parse(mets_path)

        # copy files to new generation
        shutil.copytree(srcdir, dstdir)

        # convert files specified in rule
        for pattern, spec in six.iteritems(self.rule.specification):
            target = spec['target']
            tool = spec['tool']

            for path in iglob(dstdir + '/' + pattern):
                if os.path.isdir(path):
                    for root, dirs, files in walk(path):
                        rel = os.path.relpath(root, dstdir)

                        for f in files:
                            fpath = os.path.join(root, f)
                            job_entry = ConversionJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                old_document=os.path.join(rel, f),
                            )
                            convert_file(fpath, target)
                            os.remove(fpath)

                            job_entry.new_document = os.path.splitext(job_entry.old_document)[0] + '.' + target
                            job_entry.end_date = timezone.now()
                            job_entry.tool = tool
                            job_entry.save()

                elif os.path.isfile(path):
                    rel = os.path.relpath(path, dstdir)

                    job_entry = ConversionJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        old_document=rel,
                    )
                    convert_file(path, target)
                    os.remove(path)

                    job_entry.new_document = os.path.splitext(job_entry.old_document)[0] + '.' + target
                    job_entry.end_date = timezone.now()
                    job_entry.tool = tool
                    job_entry.save()

        # preserve new generation
        sa = new_ip.submission_agreement

        try:
            os.remove(mets_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        filesToCreate = OrderedDict()

        try:
            premis_profile = new_ip.get_profile_rel('preservation_metadata').profile
            premis_profile_data = ip.get_profile_data('preservation_metadata')
        except ProfileIP.DoesNotExist:
            pass
        else:
            premis_dir, premis_name = find_destination("preservation_description_file", aip_profile.structure)
            premis_path = os.path.join(dstdir, premis_dir, premis_name)

            try:
                os.remove(premis_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

            filesToCreate[premis_path] = {
                'spec': premis_profile.specification,
                'data': fill_specification_data(premis_profile_data, ip=new_ip, sa=sa),
            }

        filesToCreate[mets_path] = {
            'spec': aip_profile.specification,
            'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
        }

        t = ProcessTask.objects.create(
            name='ESSArch_Core.tasks.GenerateXML',
            params={
                'filesToCreate': filesToCreate,
                'folderToParse': dstdir,
            },
            responsible=new_ip.responsible,
            information_package=new_ip,
        )
        t.run().get()

        dsttar = dstdir + '.tar'
        dstxml = dstdir + '.xml'

        objid = new_ip.object_identifier_value

        with tarfile.open(dsttar, 'w') as tar:
            for root, dirs, files in walk(dstdir):
                rel = os.path.relpath(root, dstdir)

                for d in dirs:
                    src = os.path.join(root, d)
                    arc = os.path.join(objid, rel, d)
                    arc = os.path.normpath(arc)
                    index_path(new_ip, src)
                    tar.add(src, arc, recursive=False)

                for f in files:
                    src = os.path.join(root, f)
                    index_path(new_ip, src)
                    tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

        algorithm = policy.get_checksum_algorithm_display()
        checksum = calculate_checksum(dsttar, algorithm=algorithm)

        info = fill_specification_data(new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
        info["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(dsttar)).isoformat()

        aip_desc_profile = new_ip.get_profile('aip_description')
        filesToCreate = {
            dstxml: {
                'spec': aip_desc_profile.specification,
                'data': info,
            }
        }

        ProcessTask.objects.create(
            name="ESSArch_Core.tasks.GenerateXML",
            params={
                "filesToCreate": filesToCreate,
                "folderToParse": dsttar,
                "extra_paths_to_parse": [mets_path],
                "algorithm": algorithm,
            },
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()

        InformationPackage.objects.filter(pk=new_ip.pk).update(
            message_digest=checksum,
            message_digest_algorithm=policy.checksum_algorithm,
        )

        ProcessTask.objects.create(
            name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()

        t = ProcessTask.objects.create(
            name='workflow.tasks.StoreAIP',
            information_package=new_ip,
            responsible=new_ip.responsible,
        )
        t.run()
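# Hedged sketch of the conversion rule specification consumed by the
# loop above; the pattern, target, and tool values are assumptions
# inferred from how spec['target'] and spec['tool'] are read:
# self.rule.specification = {
#     'documents/*.doc': {'target': 'pdf', 'tool': 'libreoffice'},
# }
# Each glob match is converted in place, the original file is removed,
# and a ConversionJobEntry records the old and new document paths.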