def generate_package_mets(ip):
    """Create the package-level METS descriptor next to the IP container.

    Writes ``<container>.xml`` using the submit/AIP description profile and
    records its path, creation date, size, digest algorithm and digest on
    the IP before saving it.

    Raises:
        ValueError: if the IP is neither a SIP nor an AIP.
    """
    sa = ip.submission_agreement
    # Map package type to the profile used for the package description.
    profile_types = {
        InformationPackage.SIP: 'submit_description',
        InformationPackage.AIP: 'aip_description',
    }
    if ip.package_type not in profile_types:
        raise ValueError(
            'Cannot create package mets for IP of type {package_type}'.format(
                package_type=ip.package_type
            )
        )
    profile_type = profile_types[ip.package_type]

    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)

    # The descriptor lives beside the container, with an .xml suffix.
    xmlpath = os.path.splitext(ip.object_path)[0] + '.xml'

    data = fill_specification_data(profile_data, ip=ip, sa=sa)
    data["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(ip.object_path)).isoformat()

    algorithm = ip.get_checksum_algorithm()
    XMLGenerator().generate(
        {xmlpath: {'spec': profile_rel.profile.specification, 'data': data}},
        folderToParse=ip.object_path,
        algorithm=algorithm,
    )

    # Persist the metadata about the generated descriptor on the IP.
    ip.package_mets_path = normalize_path(xmlpath)
    ip.package_mets_create_date = timestamp_to_datetime(creation_date(xmlpath)).isoformat()
    ip.package_mets_size = os.path.getsize(xmlpath)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xmlpath, algorithm=algorithm)
    ip.save()
def test_list_multiple_files_in_folder(self):
    """list_files on a directory reports every regular file inside it."""
    archive_path = self.create_archive_file('tar')
    self.ip.object_path = archive_path
    self.ip.save()

    regular_files = [
        name for name in os.listdir(self.textdir)
        if os.path.isfile(os.path.join(self.textdir, name))
    ]
    expected_entries = [
        {
            'type': 'file',
            'name': name,
            'size': 1,
            'modified': timestamp_to_datetime(
                os.stat(os.path.join(self.textdir, name)).st_mtime),
        }
        for name in regular_files
    ]

    entries = self.ip.list_files(path=self.textdir)
    self.assertCountEqual(entries, expected_entries)
    self.assertEqual(len(entries), 3)
def parse_file(filepath, fid, relpath=None, algorithm='SHA-256', rootdir='', provided_data=None):
    """Build the file-element dictionary used when generating METS/PREMIS XML.

    Args:
        filepath: Absolute path of the file on disk.
        fid: Format identifier object (mimetype/format/encryption detection).
        relpath: Path recorded in the XML; defaults to ``filepath``.
        algorithm: Checksum algorithm name stored as ``FChecksumType``.
        rootdir: Value stored as ``FDir``.
        provided_data: Pre-computed values; any key present here skips the
            corresponding (potentially expensive) computation and overrides
            the computed value in the result.

    Returns:
        dict with the F* keys expected by the XML generator.
    """
    if provided_data is None:
        provided_data = {}
    relpath = win_to_posix(relpath or filepath)

    fileinfo = {
        'FName': os.path.basename(relpath),
        'FExtension': os.path.splitext(relpath)[1][1:],
        'FDir': rootdir,
        'FParentDir': os.path.basename(os.path.dirname(filepath)),
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': fid.get_mimetype(filepath),
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'ESSArch',
        'FIDType': 'UUID',
    }

    # Heavy computations run only when the caller did not supply the value.
    if 'FCreated' not in provided_data:
        fileinfo['FCreated'] = timestamp_to_datetime(creation_date(filepath)).isoformat()
    if 'FChecksum' not in provided_data:
        fileinfo['FChecksum'] = checksum.calculate_checksum(filepath, algorithm)
    if 'FEncrypted' not in provided_data:
        fileinfo['FEncrypted'] = fid.identify_file_encryption(filepath)

    format_keys = ('FFormatName', 'FFormatVersion', 'FFormatRegistryKey')
    if any(key not in provided_data for key in format_keys):
        (fileinfo['FFormatName'],
         fileinfo['FFormatVersion'],
         fileinfo['FFormatRegistryKey']) = fid.identify_file_format(filepath)

    # Caller-supplied values always win over computed ones.
    fileinfo.update(provided_data)
    return fileinfo
def index_document(ip, filepath, id):
    """Index a document from the given IP into Elasticsearch.

    Reads the whole file, base64-encodes it for the ``ingest_attachment``
    pipeline, and saves a ``File`` document with the given id.

    Returns:
        The saved ``File`` document.
    """
    # NOTE: parameter name `id` shadows the builtin but is part of the
    # public interface (keyword callers), so it is kept.
    with open(filepath, 'rb') as f:
        raw = f.read()
    encoded_content = base64.b64encode(raw).decode("ascii")

    filename = os.path.basename(filepath)
    extension = os.path.splitext(filename)[1][1:]

    # href is the document's directory relative to the IP root; the IP root
    # itself is represented as the empty string.
    href = normalize_path(os.path.relpath(os.path.dirname(filepath), ip.object_path))
    if href == '.':
        href = ''

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    doc = File(
        _id=id,
        name=filename,
        type="document",
        filename=filename,
        extension=extension,
        href=href,
        ip=str(ip.pk),
        data=encoded_content,
        size=size,
        modified=modified,
        current_version=True,
    )
    doc.save(pipeline='ingest_attachment')
    return doc
def generate_content_mets(ip):
    """Generate the content METS file inside the IP directory.

    The METS location comes from the IP's profile; after generation the
    relative path, creation date, size and digest are stored on the IP.
    """
    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)

    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)
    algorithm = ip.get_checksum_algorithm()

    generator = XMLGenerator(
        allow_unknown_file_types=ip.get_allow_unknown_file_types(),
        allow_encrypted_files=ip.get_allow_encrypted_files(),
    )
    generator.generate(
        {
            full_mets_path: {
                'spec': profile_rel.profile.specification,
                'data': fill_specification_data(profile_data, ip=ip),
            },
        },
        folderToParse=ip.object_path,
        algorithm=algorithm,
    )

    # The IP stores the path relative to its root, but all file metadata is
    # taken from the absolute path.
    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path, algorithm=algorithm)
    ip.save()
def run(self):
    """Generate the content METS file for the task's IP and record its
    path, creation date, size and checksum on the IP.

    Bug fix: ``get_content_mets_file_path()`` returns a path relative to
    the IP root (the sibling ``generate_content_mets`` joins it with
    ``ip.object_path``), but this method previously passed the relative
    path directly to the XML generator and to ``creation_date`` /
    ``os.path.getsize`` / ``calculate_checksum``, so it resolved against
    the process CWD instead of the IP directory. The file is now created
    and inspected through the absolute path, while the relative path is
    still what gets stored on the IP, matching ``generate_content_mets``.
    """
    ip = self.get_information_package()
    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)

    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)

    files_to_create = {
        full_mets_path: {
            'spec': profile_rel.profile.specification,
            'data': fill_specification_data(profile_data, ip=ip),
        }
    }
    algorithm = ip.get_checksum_algorithm()
    generator = XMLGenerator()
    generator.generate(files_to_create, folderToParse=ip.object_path, algorithm=algorithm)

    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(
        creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path, algorithm=algorithm)
    ip.save()
def index_document(tag_version, filepath):
    """Index the file behind *tag_version* into Elasticsearch.

    Attaches the base64-encoded file content for the ``ingest_attachment``
    pipeline and fills the tag version's custom fields with file metadata.

    Returns:
        Tuple of (saved File document, tag_version).

    Raises:
        ElasticsearchException: re-raised after logging when indexing fails.
    """
    with open(filepath, 'rb') as f:
        raw = f.read()
    encoded_content = base64.b64encode(raw).decode("ascii")

    ip = tag_version.tag.information_package
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)

    # href is relative to the IP root; the root itself maps to ''.
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    if href == '.':
        href = ''

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
    }

    doc = File.from_obj(tag_version)
    doc.data = encoded_content
    try:
        doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise
    return doc, tag_version
def parse_document(self, ip, rootdir, document, act, parent):
    """Build (but do not persist) tag/structure objects and an Elasticsearch
    bulk-action dict for one 'Bilaga' (attachment) document element.

    Returns:
        (tag, tag_version, tag_repr, d_dict) — unsaved Tag/TagVersion/
        TagStructure instances plus the ES bulk dict with the
        ingest_attachment pipeline set.
    """
    # NOTE(review): `id` shadows the builtin; kept to preserve behavior.
    id = str(uuid.uuid4())
    name = document.get("Namn")
    desc = document.get("Beskrivning")
    # Resolve the document link against the IP (preferred) or rootdir;
    # falls back to the raw 'Lank' value when both are None.
    filepath = document.get('Lank')
    if ip is not None:
        filepath = os.path.join(ip.object_path, ip.sip_path, document.get('Lank'))
    elif rootdir is not None:
        filepath = os.path.join(rootdir, document.get('Lank'))
    # NOTE(review): relpath against a None rootdir would raise here —
    # presumably callers always pass rootdir; confirm.
    href = os.path.dirname(os.path.relpath(filepath, rootdir))
    href = '' if href == '.' else href
    filename = os.path.basename(filepath)
    ext = os.path.splitext(filepath)[1][1:]
    with open(filepath, 'rb') as f:
        content = f.read()
    # Base64 payload for the ES ingest_attachment pipeline.
    encoded_content = base64.b64encode(content).decode("ascii")
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)
    d = File(
        _id=id,
        name=name,
        type='Bilaga',
        archive=act.archive,
        desc=desc,
        filename=filename,
        href=href,
        extension=ext,
        data=encoded_content,
        size=size,
        modified=modified,
        current_version=True,
        ip=act.ip,
        task_id=str(self.task.pk),
    )
    # Unsaved model instances; the caller is responsible for persisting.
    tag = Tag(information_package=ip, task=self.task)
    tag_version = TagVersion(pk=d.meta.id, tag=tag,
                             elastic_index=d._index._name,
                             name=d.name, type=d.type, reference_code='')
    # MPTT fields (tree_id/lft/rght/level) are filled manually; lft/rght/level
    # of 0 are placeholders — presumably rebuilt later. TODO confirm.
    tag_repr = TagStructure(
        tag=tag,
        parent=parent,
        structure=parent.structure,
        tree_id=parent.tree_id,
        lft=0,
        rght=0,
        level=0,
    )
    self.indexed_files.append(filepath)
    d_dict = d.to_dict(include_meta=True)
    d_dict['pipeline'] = 'ingest_attachment'
    return tag, tag_version, tag_repr, d_dict
def test_list_folder(self):
    """An empty subdirectory is listed as a single 'dir' entry of size 0."""
    path = tempfile.mkdtemp(dir=self.datadir)
    expected = [{
        'type': 'dir',
        'name': os.path.basename(path),
        'size': 0,
        'modified': timestamp_to_datetime(os.stat(path).st_mtime),
    }]
    self.assertEqual(self.ip.list_files(), expected)
def parse_document(self, ip, rootdir, document, act, parent, archive):
    """Persist tag/structure rows for one 'Bilaga' (attachment) document and
    build the Elasticsearch bulk-action dict for its file content.

    Unlike the in-memory variant, this one creates the Tag/TagVersion/
    TagStructure rows immediately via the ORM.

    Returns:
        (tag, tag_version, tag_repr, d_dict) — saved model instances plus
        the ES bulk dict with the ingest_attachment pipeline set.
    """
    # NOTE(review): `id` shadows the builtin; kept to preserve behavior.
    id = str(uuid.uuid4())
    name = document.get("Namn")
    desc = document.get("Beskrivning")
    # Documents live under a 'content' subdirectory of the IP/rootdir.
    filepath = os.path.join('content', document.get('Lank'))
    if ip is not None:
        filepath = os.path.join(ip.object_path, ip.sip_path, 'content', document.get('Lank'))
    elif rootdir is not None:
        filepath = os.path.join(rootdir, 'content', document.get('Lank'))
    href = os.path.dirname(os.path.relpath(filepath, rootdir))
    href = '' if href == '.' else href
    filename = os.path.basename(filepath)
    ext = os.path.splitext(filepath)[1][1:]
    encoded_content = get_encoded_content_from_file(filepath)
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)
    custom_fields = {
        'filename': filename,
        'href': href,
        'extension': ext,
        'size': size,
        'modified': modified,
    }
    tag = Tag.objects.create(information_package=ip, task=self.task)
    tag_version_type, _ = TagVersionType.objects.get_or_create(name='Bilaga')
    tag_version = TagVersion.objects.create(
        pk=id,
        tag=tag,
        elastic_index='document',
        name=name,
        description=desc,
        type=tag_version_type,
        reference_code='',
        custom_fields=custom_fields,
    )
    tag_repr = TagStructure.objects.create(
        tag=tag,
        parent=parent,
        structure=parent.structure,
    )
    self.indexed_files.append(filepath)
    d = File.from_obj(tag_version, archive)
    d.data = encoded_content
    d_dict = d.to_dict(include_meta=True)
    d_dict['pipeline'] = 'ingest_attachment'
    return tag, tag_version, tag_repr, d_dict
def test_list_file(self):
    """An empty file is listed as a single 'file' entry of size 0."""
    fd, path = tempfile.mkstemp(dir=self.datadir)
    os.close(fd)
    expected = [{
        'type': 'file',
        'name': os.path.basename(path),
        'size': 0,
        'modified': timestamp_to_datetime(os.stat(path).st_mtime),
    }]
    self.assertEqual(self.ip.list_files(), expected)
def test_list_folder_content(self):
    """Listing a subdirectory returns the file it contains."""
    folder = tempfile.mkdtemp(dir=self.datadir)
    fd, filepath = tempfile.mkstemp(dir=folder)
    os.close(fd)

    stat = os.stat(filepath)
    expected = [{
        'type': 'file',
        'name': os.path.basename(filepath),
        'size': stat.st_size,
        'modified': timestamp_to_datetime(stat.st_mtime),
    }]
    self.assertEqual(self.ip.list_files(path=folder), expected)
def test_list_root_folder_when_xml_exists_with_no_params(self):
    """Root listing of a container IP shows the archive and its sidecar XML."""
    archive_path = self.create_archive_file('tar')
    xml_path = self.create_mets_xml_file('archive_file.xml')
    self.ip.object_path = archive_path
    self.ip.save()

    def file_entry(p):
        # Expected entry shape produced by list_files for a regular file.
        return {
            'type': 'file',
            'name': os.path.basename(p),
            'size': os.path.getsize(p),
            'modified': timestamp_to_datetime(os.stat(p).st_mtime),
        }

    entries = self.ip.list_files(path='')
    self.assertEqual(entries, [file_entry(archive_path), file_entry(xml_path)])
def generate_content_metadata(ip):
    """Generate the content metadata files (optional PREMIS + content METS)
    inside the IP and record the METS file's metadata on the IP.

    PREMIS is only generated when the 'preservation_metadata' profile is
    locked on the IP.
    """
    files_to_create = {}
    generate_premis = ip.profile_locked('preservation_metadata')
    if generate_premis:
        premis_profile_type = 'preservation_metadata'
        premis_profile_rel = ip.get_profile_rel(premis_profile_type)
        premis_profile_data = ip.get_profile_data(premis_profile_type)
        data = fill_specification_data(premis_profile_data, ip=ip)
        # The PREMIS path in the profile may contain template variables,
        # resolved here against the filled specification data.
        premis_path = parseContent(ip.get_premis_file_path(), data)
        full_premis_path = os.path.join(ip.object_path, premis_path)
        files_to_create[full_premis_path] = {
            'spec': premis_profile_rel.profile.specification,
            'data': data,
        }
    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)
    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)
    files_to_create[full_mets_path] = {
        'spec': profile_rel.profile.specification,
        'data': fill_specification_data(profile_data, ip=ip),
    }
    parsed_files = profile_rel.data.parsed_files
    extra_paths_to_parse = profile_rel.data.extra_paths_to_parse
    algorithm = ip.get_checksum_algorithm()
    allow_unknown_file_types = ip.get_allow_unknown_file_types()
    allow_encrypted_files = ip.get_allow_encrypted_files()
    generator = XMLGenerator(
        allow_unknown_file_types=allow_unknown_file_types,
        allow_encrypted_files=allow_encrypted_files,
    )
    generator.generate(files_to_create, folderToParse=ip.object_path,
                       algorithm=algorithm, parsed_files=parsed_files,
                       extra_paths_to_parse=extra_paths_to_parse)
    # Only the METS metadata is recorded on the IP; the relative path is
    # stored while file stats come from the absolute path.
    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(
        creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path, algorithm=algorithm)
    ip.save()
def index_document(tag_version, filepath):
    """Index the file behind *tag_version*, optionally attaching its content.

    File formats listed in settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT
    are indexed with metadata only; all other files additionally get their
    base64-encoded content run through the ingest_attachment pipeline.

    Returns:
        Tuple of (saved File document, tag_version).

    Raises:
        ElasticsearchException: re-raised after logging when indexing fails.
    """
    exclude_file_format_from_indexing_content = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT
    fid = FormatIdentifier()
    format_name, format_version, format_registry_key = fid.identify_file_format(filepath)
    index_file_content = format_registry_key not in exclude_file_format_from_indexing_content

    ip = tag_version.tag.information_package
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)

    # href is relative to the IP root; the root itself maps to ''.
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    if href == '.':
        href = ''

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
        'formatname': format_name,
        'formatversion': format_version,
        'formatkey': format_registry_key,
    }

    doc = File.from_obj(tag_version)
    try:
        if index_file_content:
            with open(filepath, 'rb') as f:
                content = f.read()
            doc.data = base64.b64encode(content).decode("ascii")
            doc.save(pipeline='ingest_attachment')
        else:
            logger.debug('Skip to index file content for {}'.format(filepath))
            doc.save()
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise
    return doc, tag_version
def identify_ip(self, request):
    """Identify an unidentified container in the ingest reception area.

    Generates a submit-description XML (from templates/SDTemplate.json)
    next to the container, using a freshly minted object identifier.

    Returns:
        400 response if the container does not exist, otherwise a status
        message naming the created XML file.
    """
    fname = request.data.get('label')
    spec_data = request.data.get('specification_data', {})
    uip = Path.objects.get(entity="path_ingest_unidentified").value
    container_file = os.path.join(uip, fname)
    if not os.path.isfile(container_file):
        return Response(
            {'status': '%s does not exist' % container_file},
            status=status.HTTP_400_BAD_REQUEST
        )
    # NOTE(review): file handle is never closed — consider a `with` block.
    spec = json.loads(open(
        os.path.join(settings.BASE_DIR, 'templates/SDTemplate.json')
    ).read())
    ip_id = uuid.uuid4()
    # NOTE(review): `unicode` and the u'' literal are Python 2 only — this
    # block predates a py3 port; confirm the module's target version.
    spec_data['_OBJID'] = unicode(ip_id)
    # LABEL is required in the request's specification_data; pop() raises
    # KeyError when missing.
    spec_data['_OBJLABEL'] = spec_data.pop('LABEL')
    spec_data['_IP_CREATEDATE'] = timestamp_to_datetime(
        creation_date(container_file)
    ).isoformat()
    infoxml = u'%s.xml' % unicode(ip_id)
    infoxml = os.path.join(uip, infoxml)
    # Generate the XML synchronously in this request.
    ProcessTask(
        name='preingest.tasks.GenerateXML',
        params={
            'info': spec_data,
            'filesToCreate': {
                infoxml: spec
            },
            'folderToParse': container_file,
        },
    ).run_eagerly()
    return Response({'status': 'Identified IP, created %s' % infoxml})
def files(self, path=''):
    """Serve a listing or the content of a file within this IP.

    Handles both container IPs (tar/zip on disk, browsed without
    extraction) and directory IPs. Returns either a DRF ``Response`` with
    directory entries or an ``HttpResponse`` streaming file content
    (inline when the mimetype is known, attachment otherwise).

    Raises:
        exceptions.NotFound: unknown path / member.
        exceptions.ParseError: path escaping the IP root.
    """
    # Reset the mimetypes module state and load only the project-defined
    # mimetype definitions.
    mimetypes.suffix_map = {}
    mimetypes.encodings_map = {}
    mimetypes.types_map = {}
    mimetypes.common_types = {}
    mimetypes_file = Path.objects.get(
        entity="path_mimetypes_definitionfile").value
    mimetypes.init(files=[mimetypes_file])
    mtypes = mimetypes.types_map
    # NOTE(review): appears unused in this method — confirm before removal.
    MAX_FILE_SIZE = 100000000  # 100 MB
    if os.path.isfile(self.object_path):
        # Container IP: the object path is a tar/zip file; a sidecar XML
        # with the same stem may exist next to it.
        container = self.object_path
        xml = os.path.splitext(self.object_path)[0] + '.xml'
        if path.startswith(os.path.basename(container)):
            fullpath = os.path.join(os.path.dirname(container), path)
            if tarfile.is_tarfile(container):
                with tarfile.open(container) as tar:
                    if fullpath == container:
                        # Listing the container root: one entry per member file.
                        entries = []
                        for member in tar.getmembers():
                            if not member.isfile():
                                continue
                            entries.append({
                                "name": member.name,
                                "type": 'file',
                                "size": member.size,
                                "modified": timestamp_to_datetime(member.mtime),
                            })
                        return Response(entries)
                    else:
                        # Serve a single member; subpath is relative to the
                        # container basename prefix.
                        subpath = fullpath[len(container) + 1:]
                        try:
                            member = tar.getmember(subpath)
                            if not member.isfile():
                                raise exceptions.NotFound
                            f = tar.extractfile(member)
                            content_type = mtypes.get(
                                os.path.splitext(subpath)[1])
                            response = HttpResponse(
                                f.read(), content_type=content_type)
                            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(f.name)
                            if content_type is None:
                                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(f.name)
                            return response
                        except KeyError:
                            raise exceptions.NotFound
            elif zipfile.is_zipfile(container):
                with zipfile.ZipFile(container) as zipf:
                    if fullpath == container:
                        entries = []
                        for member in zipf.filelist:
                            # Directory members end with '/'.
                            if member.filename.endswith('/'):
                                continue
                            entries.append({
                                "name": member.filename,
                                "type": 'file',
                                "size": member.file_size,
                                "modified": datetime.datetime(*member.date_time),
                            })
                        return Response(entries)
                    else:
                        subpath = fullpath[len(container) + 1:]
                        try:
                            f = zipf.open(subpath)
                            content_type = mtypes.get(
                                os.path.splitext(subpath)[1])
                            response = HttpResponse(
                                f.read(), content_type=content_type)
                            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(f.name)
                            if content_type is None:
                                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(f.name)
                            return response
                        except KeyError:
                            raise exceptions.NotFound
            # Not a recognized archive: serve the path as a plain file.
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            response = HttpResponse(open(fullpath).read(),
                                    content_type=content_type)
            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(fullpath)
            if content_type is None:
                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(fullpath)
            return response
        elif os.path.isfile(xml) and path == os.path.basename(xml):
            # Serve the sidecar XML next to the container.
            fullpath = os.path.join(os.path.dirname(container), path)
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            response = HttpResponse(open(fullpath).read(),
                                    content_type=content_type)
            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(fullpath)
            if content_type is None:
                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(fullpath)
            return response
        elif path == '':
            # Root listing for a container IP: the container and, when
            # present, its sidecar XML.
            entries = []
            entries.append({
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified": timestamp_to_datetime(os.path.getmtime(container)),
            })
            if os.path.isfile(xml):
                entries.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                })
            return Response(entries)
        elif path is not None:
            raise exceptions.NotFound
    # Directory IP: resolve the path inside the IP root and either stream a
    # file or list the directory contents.
    entries = []
    fullpath = os.path.join(self.object_path, path)
    if not in_directory(fullpath, self.object_path):
        raise exceptions.ParseError('Illegal path %s' % path)
    if not os.path.exists(fullpath):
        raise exceptions.NotFound
    if os.path.isfile(fullpath):
        content_type = mtypes.get(os.path.splitext(fullpath)[1])
        response = HttpResponse(open(fullpath).read(),
                                content_type=content_type)
        response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(fullpath)
        if content_type is None:
            response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(fullpath)
        return response
    for entry in get_files_and_dirs(fullpath):
        entry_type = "dir" if entry.is_dir() else "file"
        if entry_type == 'file' and re.search(
                r'\_\d+$', entry.name) is not None:  # file chunk
            continue
        size, _ = get_tree_size_and_count(entry.path)
        entries.append({
            "name": os.path.basename(entry.path),
            "type": entry_type,
            "size": size,
            "modified": timestamp_to_datetime(entry.stat().st_mtime),
        })
    sorted_entries = sorted(entries, key=itemgetter('name'))
    return Response(sorted_entries)
def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256', rootdir=''):
    """Collect metadata for a single file via checksum/format subtasks.

    Spawns CalculateChecksum and IdentifyFileFormat tasks (bulk-created,
    then run), and returns the F* metadata dict used by the XML generator.
    """
    if not relpath:
        relpath = filepath
    relpath = win_to_posix(relpath)
    createdate = timestamp_to_datetime(creation_date(filepath))

    checksum_task = ProcessTask(
        name="ESSArch_Core.tasks.CalculateChecksum",
        params={"filename": filepath, "algorithm": algorithm},
        processstep_id=self.step,
        responsible_id=self.responsible,
        information_package_id=self.ip,
    )
    fileformat_task = ProcessTask(
        name="ESSArch_Core.tasks.IdentifyFileFormat",
        params={"filename": filepath},
        processstep_id=self.step,
        responsible_id=self.responsible,
        information_package_id=self.ip,
    )
    ProcessTask.objects.bulk_create([checksum_task, fileformat_task])

    checksum = checksum_task.run().get()
    self.set_progress(50, total=100)
    format_name, format_version, format_registry_key = fileformat_task.run().get()

    return {
        'FName': os.path.basename(relpath),
        'FDir': rootdir,
        'FChecksum': checksum,
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': mimetype,
        'FCreated': createdate.isoformat(),
        'FFormatName': format_name,
        'FFormatVersion': format_version,
        'FFormatRegistryKey': format_registry_key,
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'hashlib',
        'FLocationType': 'URI',
        'FIDType': 'UUID',
    }
def _run(self):
    """Run this conversion job: for each active IP matched by the rule,
    create a new generation, convert matching files, regenerate metadata,
    repackage as tar + description XML, and hand off to StoreAIP.
    """
    def get_information_packages(job):
        # Active IPs under the rule that this job has not yet converted.
        return self.rule.information_packages.filter(
            active=True,
        ).exclude(conversion_job_entries__job=self,)
    ips = get_information_packages(self)
    for ip in ips.order_by(
            '-cached').iterator():  # convert cached IPs first
        # Block until the IP is cached, (re)triggering CacheAIP as needed.
        while not ip.cached:
            with allow_join_result():
                t, created = ProcessTask.objects.get_or_create(
                    name='workflow.tasks.CacheAIP',
                    information_package=ip,
                    defaults={
                        'responsible': ip.responsible,
                        'eager': False
                    })
                if not created:
                    t.run()
            time.sleep(10)
            ip.refresh_from_db()
        policy = ip.policy
        srcdir = os.path.join(policy.cache_storage.value,
                              ip.object_identifier_value)
        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        dstdir = os.path.join(policy.cache_storage.value,
                              new_ip.object_identifier_value)
        new_ip.object_path = dstdir
        new_ip.save()
        aip_profile = new_ip.get_profile_rel('aip').profile
        aip_profile_data = new_ip.get_profile_data('aip')
        mets_dir, mets_name = find_destination("mets_file",
                                               aip_profile.structure)
        mets_path = os.path.join(srcdir, mets_dir, mets_name)
        # NOTE(review): mets_tree appears unused below — confirm.
        mets_tree = etree.parse(mets_path)
        # copy files to new generation
        shutil.copytree(srcdir, dstdir)
        # convert files specified in rule
        for pattern, spec in six.iteritems(self.rule.specification):
            target = spec['target']
            tool = spec['tool']
            for path in iglob(dstdir + '/' + pattern):
                if os.path.isdir(path):
                    # Convert every file in the matched directory tree.
                    for root, dirs, files in walk(path):
                        rel = os.path.relpath(root, dstdir)
                        for f in files:
                            fpath = os.path.join(root, f)
                            job_entry = ConversionJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                old_document=os.path.join(rel, f))
                            convert_file(fpath, target)
                            os.remove(fpath)
                            job_entry.new_document = os.path.splitext(
                                job_entry.old_document)[0] + '.' + target
                            job_entry.end_date = timezone.now()
                            job_entry.tool = tool
                            job_entry.save()
                elif os.path.isfile(path):
                    rel = os.path.relpath(path, dstdir)
                    job_entry = ConversionJobEntry.objects.create(
                        job=self,
                        start_date=timezone.now(),
                        ip=ip,
                        old_document=rel,
                    )
                    convert_file(path, target)
                    os.remove(path)
                    job_entry.new_document = os.path.splitext(
                        job_entry.old_document)[0] + '.' + target
                    job_entry.end_date = timezone.now()
                    job_entry.tool = tool
                    job_entry.save()
        # preserve new generation
        sa = new_ip.submission_agreement
        # The source-generation METS is stale after conversion; remove it
        # (ignoring an already-missing file) before regenerating.
        try:
            os.remove(mets_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
        filesToCreate = OrderedDict()
        # PREMIS is optional: only regenerated when the profile exists.
        try:
            premis_profile = new_ip.get_profile_rel(
                'preservation_metadata').profile
            premis_profile_data = ip.get_profile_data(
                'preservation_metadata')
        except ProfileIP.DoesNotExist:
            pass
        else:
            premis_dir, premis_name = find_destination(
                "preservation_description_file", aip_profile.structure)
            premis_path = os.path.join(dstdir, premis_dir, premis_name)
            try:
                os.remove(premis_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
            filesToCreate[premis_path] = {
                'spec': premis_profile.specification,
                'data': fill_specification_data(premis_profile_data,
                                                ip=new_ip, sa=sa),
            }
        filesToCreate[mets_path] = {
            'spec': aip_profile.specification,
            'data': fill_specification_data(aip_profile_data,
                                            ip=new_ip, sa=sa),
        }
        t = ProcessTask.objects.create(
            name='ESSArch_Core.tasks.GenerateXML',
            params={
                'filesToCreate': filesToCreate,
                'folderToParse': dstdir,
            },
            responsible=new_ip.responsible,
            information_package=new_ip,
        )
        t.run().get()
        dsttar = dstdir + '.tar'
        dstxml = dstdir + '.xml'
        objid = new_ip.object_identifier_value
        # Repackage the converted generation as a tar, indexing each path.
        with tarfile.open(dsttar, 'w') as tar:
            for root, dirs, files in walk(dstdir):
                rel = os.path.relpath(root, dstdir)
                for d in dirs:
                    src = os.path.join(root, d)
                    arc = os.path.join(objid, rel, d)
                    arc = os.path.normpath(arc)
                    index_path(new_ip, src)
                    tar.add(src, arc, recursive=False)
                for f in files:
                    src = os.path.join(root, f)
                    index_path(new_ip, src)
                    tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))
        algorithm = policy.get_checksum_algorithm_display()
        checksum = calculate_checksum(dsttar, algorithm=algorithm)
        # Generate the AIP description XML for the tar.
        info = fill_specification_data(
            new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
        info["_IP_CREATEDATE"] = timestamp_to_datetime(
            creation_date(dsttar)).isoformat()
        aip_desc_profile = new_ip.get_profile('aip_description')
        filesToCreate = {
            dstxml: {
                'spec': aip_desc_profile.specification,
                'data': info
            }
        }
        ProcessTask.objects.create(
            name="ESSArch_Core.tasks.GenerateXML",
            params={
                "filesToCreate": filesToCreate,
                "folderToParse": dsttar,
                "extra_paths_to_parse": [mets_path],
                "algorithm": algorithm,
            },
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()
        InformationPackage.objects.filter(pk=new_ip.pk).update(
            message_digest=checksum,
            message_digest_algorithm=policy.checksum_algorithm,
        )
        ProcessTask.objects.create(
            name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
            information_package=new_ip,
            responsible=new_ip.responsible,
        ).run().get()
        t = ProcessTask.objects.create(
            name='workflow.tasks.StoreAIP',
            information_package=new_ip,
            responsible=new_ip.responsible,
        )
        t.run()
def submit(self, request, pk=None):
    """
    Submits the specified information package

    Args:
        pk: The primary key (id) of the information package to submit

    Returns:
        None
    """
    ip = self.get_object()
    # NOTE(review): CamelCase attributes (State, SubmissionAgreement,
    # ObjectPath) suggest a legacy model version — confirm against the
    # current InformationPackage model.
    if ip.State != "Created":
        raise ValueError(
            "The IP (%s) is in the state '%s' but should be 'Created'" % (pk, ip.State)
        )
    # Which validation tasks to schedule, per the request payload.
    validators = request.data.get('validators', {})
    validate_xml_file = validators.get('validate_xml_file', False)
    validate_file_format = validators.get('validate_file_format', False)
    validate_integrity = validators.get('validate_integrity', False)
    validate_logical_physical_representation = validators.get('validate_logical_physical_representation', False)
    step = ProcessStep.objects.create(
        name="Submit SIP",
        information_package=ip
    )
    step.tasks.add(ProcessTask.objects.create(
        name="preingest.tasks.UpdateIPStatus",
        params={
            "ip": ip,
            "status": "Submitting",
        },
        processstep_pos=0,
        log=EventIP,
        information_package=ip,
        responsible=self.request.user,
    ))
    reception = Path.objects.get(entity="path_preingest_reception").value
    sd_profile = ip.get_profile('submit_description')
    container_format = ip.get_container_format()
    container_file = os.path.join(reception, str(ip.pk) + ".%s" % container_format.lower())
    sa = ip.SubmissionAgreement
    info = sd_profile.fill_specification_data(sa, ip)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(container_file)).isoformat()
    infoxml = os.path.join(reception, str(ip.pk) + ".xml")
    # NOTE(review): other call sites nest {'spec': ..., 'data': ...} per
    # file; here the spec is mapped directly — presumably the older
    # GenerateXML signature with a separate 'info' param. Confirm.
    filesToCreate = {
        infoxml: sd_profile.specification
    }
    step.tasks.add(ProcessTask.objects.create(
        name="preingest.tasks.GenerateXML",
        params={
            "info": info,
            "filesToCreate": filesToCreate,
            "folderToParse": container_file,
            "algorithm": ip.get_checksum_algorithm(),
        },
        processstep_pos=10,
        log=EventIP,
        information_package=ip,
        responsible=self.request.user,
    ))
    if validate_xml_file:
        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.ValidateXMLFile",
                params={
                    "xml_filename": infoxml
                },
                processstep_pos=14,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            )
        )
    if validate_file_format or validate_integrity:
        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.ValidateFiles",
                params={
                    "ip": ip,
                    "rootdir": reception,
                    "xmlfile": infoxml,
                    "validate_fileformat": validate_file_format,
                    "validate_integrity": validate_integrity,
                },
                processstep_pos=15,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            )
        )
    if validate_logical_physical_representation:
        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.ValidateLogicalPhysicalRepresentation",
                params={
                    "files": [os.path.basename(ip.ObjectPath)],
                    "xmlfile": infoxml,
                },
                processstep_pos=16,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            )
        )
    step.tasks.add(ProcessTask.objects.create(
        name="preingest.tasks.SubmitSIP",
        params={
            "ip": ip
        },
        processstep_pos=20,
        log=EventIP,
        information_package=ip,
        responsible=self.request.user,
    ))
    # Optionally notify the configured recipient with the container attached.
    if ip.get_email_recipient():
        recipients = [ip.get_email_recipient()]
        subject = request.data.get('subject')
        body = request.data.get('body')
        attachments = [ip.ObjectPath]
        step.tasks.add(ProcessTask.objects.create(
            name="ESSArch_Core.tasks.SendEmail",
            params={
                'sender': self.request.user.email,
                'recipients': recipients,
                'subject': subject,
                'body': body,
                'attachments': attachments
            },
            processstep_pos=25,
            information_package=ip,
            responsible=self.request.user
        ))
    step.tasks.add(ProcessTask.objects.create(
        name="preingest.tasks.UpdateIPStatus",
        params={
            "ip": ip,
            "status": "Submitted"
        },
        processstep_pos=30,
        log=EventIP,
        information_package=ip,
        responsible=self.request.user,
    ))
    step.save()
    step.run()
    return Response({'status': 'submitting ip'})
def list_files(self, path=''):
    """List entries under *path* within this IP.

    Handles three cases: browsing inside a tar/zip container IP, listing
    the root of a container IP (container + optional sidecar XML), and
    listing a plain directory IP.

    Returns:
        list of dicts with keys name/type/size/modified.
    """
    fullpath = os.path.join(self.object_path, path).rstrip('/')
    # Case 1: the requested path IS the container file — list its members.
    if os.path.basename(self.object_path) == path and os.path.isfile(
            self.object_path):
        if tarfile.is_tarfile(self.object_path):
            with tarfile.open(self.object_path) as tar:
                entries = []
                for member in tar.getmembers():
                    if not member.isfile():
                        continue
                    entries.append({
                        "name": member.name,
                        "type": 'file',
                        "size": member.size,
                        "modified": timestamp_to_datetime(member.mtime),
                    })
                return entries
        elif zipfile.is_zipfile(self.object_path) and os.path.splitext(
                self.object_path)[1] == '.zip':
            with zipfile.ZipFile(self.object_path) as zipf:
                entries = []
                for member in zipf.filelist:
                    # Directory members end with '/'.
                    if member.filename.endswith('/'):
                        continue
                    entries.append({
                        "name": member.filename,
                        "type": 'file',
                        "size": member.file_size,
                        "modified": datetime(*member.date_time),
                    })
                return entries
    # Case 2: root of a container IP — the container and, when present,
    # its sidecar XML.
    if os.path.isfile(self.object_path) and not path:
        container = self.object_path
        xml = os.path.splitext(container)[0] + '.xml'
        entries = [{
            "name": os.path.basename(container),
            "type": 'file',
            "size": os.path.getsize(container),
            "modified": timestamp_to_datetime(os.path.getmtime(container)),
        }]
        if os.path.isfile(xml):
            entries.append({
                "name": os.path.basename(xml),
                "type": 'file',
                "size": os.path.getsize(xml),
                "modified": timestamp_to_datetime(os.path.getmtime(xml)),
            })
        return entries
    # Case 3: plain directory — list children sorted by name.
    entries = []
    for entry in sorted(get_files_and_dirs(fullpath), key=lambda x: x.name):
        entry_type = "dir" if entry.is_dir() else "file"
        size, _ = get_tree_size_and_count(entry.path)
        entries.append({
            "name": os.path.basename(entry.path),
            "type": entry_type,
            "size": size,
            "modified": timestamp_to_datetime(entry.stat().st_mtime),
        })
    return entries
def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256'):
    """Collect metadata for a single file via checksum/format subtasks
    (legacy variant using self.taskobj plumbing and run_eagerly).

    Returns:
        dict with the F* keys expected by the XML generator.
    """
    if not relpath:
        relpath = filepath
    relpath = win_to_posix(relpath)
    timestamp = creation_date(filepath)
    createdate = timestamp_to_datetime(timestamp)
    checksum_task = ProcessTask.objects.create(
        name="preingest.tasks.CalculateChecksum",
        params={
            "filename": filepath,
            "algorithm": algorithm
        }
    )
    fileformat_task = ProcessTask.objects.create(
        name="preingest.tasks.IdentifyFileFormat",
        params={
            "filename": filepath,
        }
    )
    # Propagate this task's context onto both subtasks.
    checksum_task.log = self.taskobj.log
    checksum_task.information_package = self.taskobj.information_package
    checksum_task.responsible = self.taskobj.responsible
    fileformat_task.log = self.taskobj.log
    fileformat_task.information_package = self.taskobj.information_package
    fileformat_task.responsible = self.taskobj.responsible
    # NOTE(review): the attributes above are set unconditionally while the
    # processstep is guarded by a None check — confirm taskobj can be None.
    if self.taskobj is not None and self.taskobj.processstep is not None:
        checksum_task.processstep = self.taskobj.processstep
        fileformat_task.processstep = self.taskobj.processstep
    checksum_task.save()
    fileformat_task.save()
    checksum = checksum_task.run_eagerly()
    self.set_progress(50, total=100)
    fileformat = fileformat_task.run_eagerly()
    fileinfo = {
        'FName': os.path.basename(relpath),
        'FChecksum': checksum,
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': mimetype,
        'FCreated': createdate.isoformat(),
        'FFormatName': fileformat,
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'hashlib',
        'FLocationType': 'URI',
        'FIDType': 'UUID',
    }
    self.set_progress(100, total=100)
    return fileinfo
def preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip, mets_path, new_ip, policy):
    """Package and preserve a new generation of an IP.

    Regenerates the METS (and, when configured, PREMIS) metadata for
    ``new_ip``, packs ``dstdir`` into ``<dstdir>.tar``, generates the AIP
    description XML next to it, records the tar checksum on the IP and
    finally kicks off the StoreAIP workflow task.
    """
    sa = new_ip.submission_agreement

    # Remove any METS left over from a previous attempt; a missing file is fine.
    try:
        os.remove(mets_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise

    files_to_create = OrderedDict()

    # PREMIS is optional: only generated when a preservation_metadata
    # profile is attached to the IP.
    try:
        premis_profile = new_ip.get_profile_rel(
            'preservation_metadata').profile
        # NOTE(review): profile comes from new_ip but its data from the old
        # ip — looks intentional (carry data over to the new generation),
        # but confirm it is not a typo.
        premis_profile_data = ip.get_profile_data('preservation_metadata')
    except ProfileIP.DoesNotExist:
        pass
    else:
        premis_dir, premis_name = find_destination(
            "preservation_description_file", aip_profile.structure)
        premis_path = os.path.join(dstdir, premis_dir, premis_name)
        # Same idempotency treatment as the METS above.
        try:
            os.remove(premis_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
        files_to_create[premis_path] = {
            'spec': premis_profile.specification,
            'data': fill_specification_data(premis_profile_data, ip=new_ip, sa=sa),
        }

    files_to_create[mets_path] = {
        'spec': aip_profile.specification,
        'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
    }

    # Generate the metadata files synchronously (.get() blocks on the result).
    t = ProcessTask.objects.create(
        name='ESSArch_Core.tasks.GenerateXML',
        params={
            'filesToCreate': files_to_create,
            'folderToParse': dstdir,
        },
        responsible=new_ip.responsible,
        information_package=new_ip,
    )
    t.run().get()

    dsttar = dstdir + '.tar'
    dstxml = dstdir + '.xml'
    objid = new_ip.object_identifier_value

    # Pack the IP directory into a tar, rooting all archive paths under the
    # object identifier, and index every path as we go.
    with tarfile.open(dsttar, 'w') as tar:
        for root, dirs, files in walk(dstdir):
            rel = os.path.relpath(root, dstdir)
            for d in dirs:
                src = os.path.join(root, d)
                arc = os.path.join(objid, rel, d)
                arc = os.path.normpath(arc)
                index_path(new_ip, src)
                # recursive=False: members are added one by one by the walk.
                tar.add(src, arc, recursive=False)
            for f in files:
                src = os.path.join(root, f)
                index_path(new_ip, src)
                tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

    algorithm = policy.get_checksum_algorithm_display()
    checksum = calculate_checksum(dsttar, algorithm=algorithm)

    # Build the AIP description XML describing the tar (plus the METS file).
    info = fill_specification_data(new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(
        creation_date(dsttar)).isoformat()

    aip_desc_profile = new_ip.get_profile('aip_description')
    files_to_create = {
        dstxml: {
            'spec': aip_desc_profile.specification,
            'data': info
        }
    }

    ProcessTask.objects.create(
        name="ESSArch_Core.tasks.GenerateXML",
        params={
            "filesToCreate": files_to_create,
            "folderToParse": dsttar,
            "extra_paths_to_parse": [mets_path],
            "algorithm": algorithm,
        },
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    # Persist the container digest without touching other fields.
    InformationPackage.objects.filter(pk=new_ip.pk).update(
        message_digest=checksum,
        message_digest_algorithm=policy.checksum_algorithm,
    )

    ProcessTask.objects.create(
        name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    # Fire-and-forget: StoreAIP is started but not waited on (no .get()).
    t = ProcessTask.objects.create(
        name='workflow.tasks.StoreAIP',
        information_package=new_ip,
        responsible=new_ip.responsible,
    )
    t.run()
def run(self, purpose=None, delete_sip=False):
    """Receive a SIP into the ingest area as generation 0 of a new AIC.

    Records METS metadata on the AIP, creates the parent AIC, moves/extracts
    the container into the policy ingest path according to the AIP profile
    structure, merges the SIP METS data into the AIP profile data and
    optionally deletes the original SIP (container + sidecar XML).
    """
    self.logger.debug('Receiving SIP')
    aip = InformationPackage.objects.get(pk=self.ip)
    algorithm = aip.get_checksum_algorithm()
    container = aip.object_path
    # objid is the container filename without extension; the extension
    # decides how (and whether) it can be extracted.
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()

    # Record metadata about the package METS sidecar file.
    xml = aip.package_mets_path
    aip.package_mets_create_date = timestamp_to_datetime(
        creation_date(xml)).isoformat()
    aip.package_mets_size = os.path.getsize(xml)
    aip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    aip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    # This AIP becomes generation 0 under a freshly created AIC.
    aip.generation = 0
    aic = InformationPackage.objects.create(
        package_type=InformationPackage.AIC,
        responsible=aip.responsible,
        label=aip.label,
        start_date=aip.start_date,
        end_date=aip.end_date)
    old_sip_path = aip.object_path
    aip.aic = aic
    aip_dir = os.path.join(aip.policy.ingest_path.value, objid)
    aip.object_path = aip_dir
    # Idempotent mkdir: an already existing directory is fine.
    try:
        os.makedirs(aip_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    aip.save()

    # Resolve where the SIP content goes inside the AIP structure;
    # fall back to the 'content' slot when no 'sip' slot is defined.
    dst_path, dst_name = find_destination('sip',
                                          aip.get_profile('aip').structure,
                                          aip.object_path)
    if dst_path is None:
        dst_path, dst_name = find_destination(
            'content', aip.get_profile('aip').structure, aip.object_path)
    dst_name, = self.parse_params(dst_name)
    dst = os.path.join(dst_path, dst_name)

    sip_profile = aip.submission_agreement.profile_sip

    # Clear any leftovers from a previous (failed) receive attempt.
    try:
        shutil.rmtree(dst)
    except FileNotFoundError:
        pass

    if aip.policy.receive_extract_sip:
        temp = Path.objects.cached('entity', 'temp', 'value')
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            self.logger.debug('Extracting {} to {}'.format(
                container, tmpdir))
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    # First member name is used below to detect a
                    # single-root archive.
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError(
                    'Invalid container type: {}'.format(container))

            # Trailing separator so dst is treated as a directory.
            dst = os.path.join(dst, '')
            try:
                os.makedirs(dst)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            # If the archive had a single top-level directory, descend into
            # it so its contents (not the wrapper dir) land in dst.
            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(
                    tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            self.logger.debug('Moving content of {} to {}'.format(
                tmpsrc, dst))
            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), dst)
            self.logger.debug('Deleting {}'.format(tmpdir))
        aip.sip_path = os.path.relpath(dst, aip.object_path)
    else:
        # Keep the SIP packaged: just copy the container into place.
        self.logger.debug('Copying {} to {}'.format(container, dst))
        shutil.copy2(container, dst)
        aip.sip_path = os.path.relpath(
            os.path.join(dst, os.path.basename(container)), aip.object_path)

    # Read the SIP's own METS, either from inside the (still packaged)
    # container or from the extracted directory tree.
    sip_mets_dir, sip_mets_file = find_destination('mets_file',
                                                   sip_profile.structure,
                                                   aip.sip_path)
    if os.path.isfile(aip.sip_path):
        sip_mets_data = parse_mets(
            open_file(
                os.path.join(aip.object_path, sip_mets_dir, sip_mets_file),
                container=aip.sip_path,
                container_prefix=aip.object_identifier_value,
            ))
    else:
        sip_mets_data = parse_mets(
            open_file(
                os.path.join(aip.object_path, sip_mets_dir, sip_mets_file)))

    # prefix all SIP data
    sip_mets_data = {
        f'SIP_{k.upper()}': v
        for k, v in sip_mets_data.items()
    }

    # Merge the SIP METS data into the AIP profile data.
    aip_profile_rel_data = aip.get_profile_rel('aip').data
    aip_profile_rel_data.data.update(sip_mets_data)
    aip_profile_rel_data.save()

    if delete_sip:
        delete_path(old_sip_path)
        # Also remove the SIP's sidecar XML next to the old container.
        delete_path(pathlib.Path(old_sip_path).with_suffix('.xml'))

    self.logger.debug('sip_path set to {}'.format(aip.sip_path))
    aip.save()
def submit(self, request, pk=None):
    """
    Submits the specified information package

    Args:
        pk: The primary key (id) of the information package to submit

    Returns:
        None
    """
    ip = self.get_object()

    # Only IPs in state 'Created' may be submitted.
    if ip.State != "Created":
        raise ValueError(
            "The IP (%s) is in the state '%s' but should be 'Created'" %
            (pk, ip.State))

    # Optional validators, toggled per-request by the caller.
    validators = request.data.get('validators', {})
    validate_xml_file = validators.get('validate_xml_file', False)
    validate_file_format = validators.get('validate_file_format', False)
    validate_integrity = validators.get('validate_integrity', False)
    validate_logical_physical_representation = validators.get(
        'validate_logical_physical_representation', False)

    # Build the submission workflow: tasks are ordered by processstep_pos.
    step = ProcessStep.objects.create(name="Submit SIP",
                                      information_package=ip)

    # pos 0: mark the IP as being submitted.
    step.tasks.add(
        ProcessTask.objects.create(
            name="preingest.tasks.UpdateIPStatus",
            params={
                "ip": ip,
                "status": "Submitting",
            },
            processstep_pos=0,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

    reception = Path.objects.get(entity="path_preingest_reception").value

    sd_profile = ip.get_profile('submit_description')

    container_format = ip.get_container_format()
    container_file = os.path.join(
        reception,
        str(ip.pk) + ".%s" % container_format.lower())

    sa = ip.SubmissionAgreement

    info = sd_profile.fill_specification_data(sa, ip)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(
        creation_date(container_file)).isoformat()

    infoxml = os.path.join(reception, str(ip.pk) + ".xml")

    filesToCreate = {infoxml: sd_profile.specification}

    # pos 10: generate the submit-description XML for the container.
    step.tasks.add(
        ProcessTask.objects.create(
            name="preingest.tasks.GenerateXML",
            params={
                "info": info,
                "filesToCreate": filesToCreate,
                "folderToParse": container_file,
                "algorithm": ip.get_checksum_algorithm(),
            },
            processstep_pos=10,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

    # pos 14: optional XML schema validation of the generated file.
    if validate_xml_file:
        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.ValidateXMLFile",
                params={"xml_filename": infoxml},
                processstep_pos=14,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

    # pos 15: optional file format and/or checksum validation.
    if validate_file_format or validate_integrity:
        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.ValidateFiles",
                params={
                    "ip": ip,
                    "rootdir": reception,
                    "xmlfile": infoxml,
                    "validate_fileformat": validate_file_format,
                    "validate_integrity": validate_integrity,
                },
                processstep_pos=15,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

    # pos 16: optional check that XML and on-disk content agree.
    if validate_logical_physical_representation:
        step.tasks.add(
            ProcessTask.objects.create(
                name=
                "preingest.tasks.ValidateLogicalPhysicalRepresentation",
                params={
                    "files": [os.path.basename(ip.ObjectPath)],
                    "xmlfile": infoxml,
                },
                processstep_pos=16,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

    # pos 20: the actual submission.
    step.tasks.add(
        ProcessTask.objects.create(
            name="preingest.tasks.SubmitSIP",
            params={"ip": ip},
            processstep_pos=20,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

    # pos 25: optional notification email with the container attached.
    if ip.get_email_recipient():
        recipients = [ip.get_email_recipient()]
        subject = request.data.get('subject')
        body = request.data.get('body')
        attachments = [ip.ObjectPath]
        step.tasks.add(
            ProcessTask.objects.create(name="ESSArch_Core.tasks.SendEmail",
                                       params={
                                           'sender': self.request.user.email,
                                           'recipients': recipients,
                                           'subject': subject,
                                           'body': body,
                                           'attachments': attachments
                                       },
                                       processstep_pos=25,
                                       information_package=ip,
                                       responsible=self.request.user))

    # pos 30: mark the IP as submitted.
    step.tasks.add(
        ProcessTask.objects.create(
            name="preingest.tasks.UpdateIPStatus",
            params={
                "ip": ip,
                "status": "Submitted"
            },
            processstep_pos=30,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

    step.save()
    # Asynchronous: responds immediately while the workflow runs.
    step.run()

    return Response({'status': 'submitting ip'})
def list(self, request):
    """List IPs visible at reception.

    Combines three sources: parsed reception/unidentified XML files not yet
    present in the database, bare containers in the unidentified directory
    without a describing XML, and IPs currently in state 'Receiving'.
    Supports the ``ordering`` query parameter and link-header pagination.
    """
    reception_dir = Path.objects.get(entity="path_ingest_reception").value
    unidentified_dir = Path.objects.get(entity="path_ingest_unidentified").value

    results = []

    # XML files in either directory describe identified packages; include
    # those that are not already registered in the database.
    xml_candidates = (
        glob.glob(os.path.join(reception_dir, "*.xml")) +
        glob.glob(os.path.join(unidentified_dir, "*.xml"))
    )
    for xmlfile in xml_candidates:
        if not os.path.isfile(xmlfile):
            continue
        srcdir = unidentified_dir if xmlfile.startswith(unidentified_dir) else reception_dir
        parsed = self.parseFile(xmlfile, srcdir)
        if not InformationPackage.objects.filter(id=parsed['id']).exists():
            results.append(parsed)

    # Containers in the unidentified directory are listed as 'Unidentified'
    # unless some XML's FLocat href already points at them.
    containers = (
        glob.glob(os.path.join(unidentified_dir, "*.tar")) +
        glob.glob(os.path.join(unidentified_dir, "*.zip"))
    )
    for container_file in containers:
        entry = {
            'Label': os.path.basename(container_file),
            'CreateDate': str(timestamp_to_datetime(creation_date(container_file)).isoformat()),
            'State': 'Unidentified',
            'status': 0,
            'step_state': celery_states.SUCCESS,
        }

        described_by_xml = False
        for xmlfile in glob.glob(os.path.join(unidentified_dir, "*.xml")):
            if not os.path.isfile(xmlfile):
                continue
            root = etree.parse(xmlfile).getroot()
            flocat = root.xpath('.//*[local-name()="FLocat"]')[0]
            if entry['Label'] == get_value_from_path(flocat, "@href").split('file:///')[1]:
                described_by_xml = True
                break

        if not described_by_xml:
            results.append(entry)

    # IPs currently being received, serialized from the database.
    receiving = InformationPackage.objects.filter(State='Receiving').prefetch_related(
        Prefetch('profileip_set', to_attr='profiles'),
    )
    serializer = InformationPackageSerializer(
        data=receiving, many=True, context={'request': request}
    )
    serializer.is_valid()
    results.extend(serializer.data)

    # Optional client-side ordering; silently ignored when the key is
    # missing from the entries.
    try:
        ordering = request.query_params.get('ordering', '')
        descending = ordering.startswith('-')
        ordering = remove_prefix(ordering, '-')
        results = sorted(results, key=lambda item: item[ordering], reverse=descending)
    except KeyError:
        pass

    paginator = LinkHeaderPagination()
    page = paginator.paginate_queryset(results, request)
    if page is not None:
        return paginator.get_paginated_response(page)

    return Response(results)
def ReceiveSIP(self, purpose=None, delete_sip=False):
    """Receive a SIP into the ingest area.

    Records METS sidecar metadata on the IP, moves its object path under
    the policy ingest path and either extracts the container into the
    structure's 'sip'/'content' slot or copies it there unchanged,
    depending on ``policy.receive_extract_sip``.

    Returns the destination path of the received SIP content.
    """
    logger = logging.getLogger('essarch.workflow.tasks.ReceiveSIP')
    logger.debug('Receiving SIP')
    ip = self.get_information_package()
    algorithm = ip.get_checksum_algorithm()
    container = ip.object_path
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()

    # Record metadata about the package METS sidecar file.
    xml = ip.package_mets_path
    ip.package_mets_create_date = timestamp_to_datetime(
        creation_date(xml)).isoformat()
    ip.package_mets_size = os.path.getsize(xml)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    # Relocate the IP under the policy's ingest path.
    ip.object_path = os.path.join(ip.policy.ingest_path.value,
                                  ip.object_identifier_value)
    ip.save()

    # Resolve where the SIP content goes; fall back to the 'content' slot
    # when the structure defines no 'sip' slot.
    sip_dst_path, sip_dst_name = find_destination('sip', ip.get_structure(),
                                                  ip.object_path)
    if sip_dst_path is None:
        sip_dst_path, sip_dst_name = find_destination('content',
                                                      ip.get_structure(),
                                                      ip.object_path)
    sip_dst_name, = self.parse_params(sip_dst_name)
    sip_dst = os.path.join(sip_dst_path, sip_dst_name)

    if ip.policy.receive_extract_sip:
        # remove any existing directory from previous attempts
        delete_path(sip_dst)

        temp = Path.objects.get(entity='temp').value
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            logger.debug('Extracting {} to {}'.format(container, tmpdir))
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    # First member name, used below to detect a
                    # single-root archive.
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError(
                    'Invalid container type: {}'.format(container))

            # Trailing separator so sip_dst is treated as a directory.
            sip_dst = os.path.join(sip_dst, '')
            os.makedirs(sip_dst)

            # If the archive had a single top-level directory, descend into
            # it so its contents (not the wrapper dir) land in sip_dst.
            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(
                    tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            logger.debug('Moving content of {} to {}'.format(tmpsrc, sip_dst))
            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), sip_dst)
            logger.debug('Deleting {}'.format(tmpdir))
    else:
        # Keep the SIP packaged: just copy the container into place.
        logger.debug('Copying {} to {}'.format(container, sip_dst))
        shutil.copy2(container, sip_dst)

    ip.sip_path = os.path.relpath(sip_dst, ip.object_path)
    ip.save()
    self.create_success_event("Received SIP")
    return sip_dst
def list_files(self, path=''):
    """Return directory-style entries for ``path`` within this IP.

    Covers three layouts: ``path`` naming the container file itself (lists
    the archive members), a packaged IP at its root (lists the container
    plus its sidecar ``.xml`` if present), and an ordinary directory on
    disk. Each entry is a dict with ``name``, ``type``, ``size`` and
    ``modified`` keys.
    """
    fullpath = os.path.join(self.object_path, path).rstrip('/')

    # Requested path is the container file itself: list archive members.
    if os.path.isfile(self.object_path) and os.path.basename(self.object_path) == path:
        if tarfile.is_tarfile(self.object_path):
            with tarfile.open(self.object_path) as tar:
                return [
                    {
                        "name": m.name,
                        "type": 'file',
                        "size": m.size,
                        "modified": timestamp_to_datetime(m.mtime),
                    }
                    for m in tar.getmembers()
                    if m.isfile()
                ]
        elif zipfile.is_zipfile(self.object_path) and os.path.splitext(self.object_path)[1] == '.zip':
            with zipfile.ZipFile(self.object_path) as zipf:
                # Directory entries in a zip end with '/'; skip them.
                return [
                    {
                        "name": m.filename,
                        "type": 'file',
                        "size": m.file_size,
                        "modified": datetime(*m.date_time),
                    }
                    for m in zipf.filelist
                    if not m.filename.endswith('/')
                ]

    # Root of a packaged IP: show the container and its sidecar METS XML.
    if os.path.isfile(self.object_path) and not path:
        container = self.object_path
        listing = [{
            "name": os.path.basename(container),
            "type": 'file',
            "size": os.path.getsize(container),
            "modified": timestamp_to_datetime(os.path.getmtime(container)),
        }]
        sidecar = os.path.splitext(container)[0] + '.xml'
        if os.path.isfile(sidecar):
            listing.append({
                "name": os.path.basename(sidecar),
                "type": 'file',
                "size": os.path.getsize(sidecar),
                "modified": timestamp_to_datetime(os.path.getmtime(sidecar)),
            })
        return listing

    # Plain directory on disk.
    listing = []
    for item in sorted(get_files_and_dirs(fullpath), key=lambda e: e.name):
        try:
            size, _ = get_tree_size_and_count(item.path)
            listing.append({
                "name": os.path.basename(item.path),
                "type": "dir" if item.is_dir() else "file",
                "size": size,
                "modified": timestamp_to_datetime(item.stat().st_mtime),
            })
        except OSError as exc:
            # Entries (e.g. temporary upload files) may be deleted while we
            # gather their metadata; skip those, re-raise anything else.
            if exc.errno == errno.ENOENT:
                continue
            raise
    return listing