def find_files_in_path_not_in_external_dirs(fid, path, external, algorithm, rootdir=""):
    """Walk *path* and parse every file that is not inside an external directory.

    Args:
        fid: file-format identifier passed through to ``parse_file``.
        path: root directory to walk.
        external: iterable of tuples whose second element is an external
            directory path (relative to *path*) to be skipped.
        algorithm: checksum algorithm passed through to ``parse_file``.
        rootdir: optional root passed through to ``parse_file``.

    Returns:
        list: one ``parse_file`` result per non-external file.
    """
    external_dirs = [e[1] for e in external]
    files = []
    for root, _dirnames, filenames in walk(path):
        for fname in filenames:
            filepath = os.path.join(root, fname)
            relpath = os.path.relpath(filepath, path)
            # any() short-circuits on the first matching external dir,
            # unlike the previous flag-loop which always scanned them all.
            if any(in_directory(relpath, e) for e in external_dirs):
                continue
            fileinfo = parse_file(filepath, fid, relpath,
                                  algorithm=algorithm, rootdir=rootdir)
            files.append(fileinfo)
    return files
def Action(self, tool, pattern, rootdir, options, purpose=None):
    """Run the named action tool over content of this task's information package.

    Args:
        tool: name of the ``ActionTool`` to run (resolved via the ORM below).
        pattern: glob pattern (file-processing tools) or a single relative
            path (whole-package tools) under *rootdir*.
        rootdir: package root directory; files outside it are rejected.
        options: passed through to ``tool.run``.
        purpose: free-text purpose recorded in start/done events.
    """
    def _convert(path, rootdir, tool, options):
        # Run the tool on one file, record a success EventIP for it and
        # optionally delete the original afterwards.
        tool.run(path, rootdir, options)
        relpath = PurePath(path).relative_to(rootdir).as_posix()
        EventIP.objects.create(
            eventType_id=50750,
            eventOutcome=EventIP.SUCCESS,
            eventOutcomeDetailNote='{type} {relpath}'.format(
                type=tool.type.capitalize(), relpath=relpath),
            linkingObjectIdentifierValue=str(
                self.get_information_package().pk),
            # NOTE(review): a User instance (not a pk/username string) is
            # assigned here — confirm EventIP expects the object itself.
            linkingAgentIdentifierValue=User.objects.get(pk=self.responsible))
        if tool.delete_original:
            os.remove(path)

    ip = self.get_information_package()
    # Resolve the tool name to its ActionTool record; shadows the parameter.
    tool = ActionTool.objects.get(name=tool)
    msg = '{type} job started, purpose: {purpose}'.format(
        type=tool.type.capitalize(), purpose=purpose)
    self.create_success_event(msg)

    if tool.file_processing:
        # Per-file tool: expand the (case-insensitive) glob under rootdir.
        for path in iglob(rootdir + '/' + pattern, case_sensitive=False):
            if not in_directory(path, rootdir):
                # Guard against patterns escaping the package directory.
                raise ValueError(
                    'Invalid file-pattern accessing files outside of package')

            if os.path.isdir(path):
                # A matched directory is processed recursively, file by file.
                for root, _dirs, files in os.walk(path):
                    for f in files:
                        fpath = os.path.join(root, f)
                        _convert(fpath, rootdir, tool, options)
            else:
                _convert(path, rootdir, tool, options)
    else:
        # Whole-package tool: pattern is treated as one relative path,
        # not expanded as a glob.
        filepath = os.path.join(rootdir, pattern)
        tool.run(filepath, rootdir, options)
        if tool.delete_original:
            os.remove(filepath)

    Notification.objects.create(message='{type} job done for "{ip}"'.format(
        type=tool.type.capitalize(), ip=ip.object_identifier_value),
        level=logging.INFO, user_id=self.responsible, refresh=True)

    msg = '{type} job done, purpose: {purpose}'.format(
        type=tool.type.capitalize(), purpose=purpose)
    self.create_success_event(msg)
def _run(self):
    """Run this conversion job: for each information package, create a new
    generation, extract it to a temp dir, convert the files matched by the
    job's specification, then preserve the new generation."""
    self.delete_event_type = EventType.objects.get(eventType=50750)
    ips = self.information_packages
    tmpdir = Path.objects.get(entity='temp').value

    for ip in ips.iterator():
        # Pick the fastest readable storage copy of the package.
        storage_obj: Optional[StorageObject] = ip.storage.readable(
        ).fastest().first()
        if storage_obj is None:
            raise NoReadableStorage

        # Work on a fresh generation so the original stays untouched.
        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        new_ip_tmpdir = os.path.join(tmpdir, new_ip.object_identifier_value)
        storage_obj.read(new_ip_tmpdir, None, extract=True)
        new_ip.object_path = new_ip_tmpdir
        new_ip.save()

        # convert files specified in rule
        for pattern, spec in self.specification.items():
            tool = ActionTool.objects.get(name=spec['tool'])
            options = spec['options']

            for path in iglob(new_ip_tmpdir + '/' + pattern,
                              case_sensitive=False):
                if not in_directory(path, new_ip_tmpdir):
                    # Guard against patterns escaping the package directory.
                    raise ValueError(
                        'Invalid file-pattern accessing files outside of package'
                    )

                if os.path.isdir(path):
                    # Matched a directory: convert every file beneath it.
                    for root, _dirs, files in walk(path):
                        for f in files:
                            fpath = os.path.join(root, f)
                            self.convert(ip, fpath, new_ip_tmpdir, tool,
                                         options, new_ip)
                else:
                    self.convert(ip, path, new_ip_tmpdir, tool, options,
                                 new_ip)

        # Preserve the converted generation (synchronous celery call).
        with allow_join_result():
            preserve_new_generation(new_ip)
def _run(self):
    """Run this appraisal job.

    Three phases, all driven by ORM/storage side effects:
    1. Record job entries for the job's non-document tags.
    2. For each selected information package: either appraise the whole
       package (no file pattern) or delete only the matched files in a new
       generation and preserve it.
    3. Handle packages that only hold this job's document tags: new
       generation, drop the document tags, preserve.
    """
    self.delete_event_type = EventType.objects.get(eventType=50710)

    # Phase 1: one job entry per non-document tag version.
    entries = []
    for t in self.tags.select_related('current_version').exclude(
            current_version__elastic_index='document').all():
        entries.append(
            AppraisalJobEntry(
                job=self,
                start_date=timezone.now(),
                end_date=timezone.now(),
                component=t.current_version,
            ))
    AppraisalJobEntry.objects.bulk_create(entries)

    ips = self.information_packages
    logger.info(
        'Running appraisal job {} on {} information packages'.format(
            self.pk, ips.count()))

    delete_packages = getattr(settings, 'DELETE_PACKAGES_ON_APPRAISAL', False)
    tmpdir = Path.objects.get(entity='temp').value

    # Phase 2: appraise each selected package.
    for ip in ips.iterator():
        # Pick the fastest readable storage copy of the package.
        storage_obj: Optional[StorageObject] = ip.storage.readable(
        ).fastest().first()
        if storage_obj is None:
            raise NoReadableStorage

        if not self.package_file_pattern:
            # Whole-package appraisal: extract and record every file.
            ip_tmpdir = os.path.join(tmpdir, ip.object_identifier_value)
            os.makedirs(ip_tmpdir, exist_ok=True)
            storage_obj.read(ip_tmpdir, None, extract=True)

            # register all files
            job_entry_start_date = timezone.now()
            job_entry_end_date = timezone.now()
            job_entries = []
            for root, _dirs, files in walk(ip_tmpdir):
                for f in files:
                    rel = PurePath(os.path.join(
                        root, f)).relative_to(ip_tmpdir).as_posix()
                    job_entries.append(
                        AppraisalJobEntry(
                            job=self,
                            start_date=job_entry_start_date,
                            end_date=job_entry_end_date,
                            ip=ip,
                            document=rel,
                        ))
                    # One deletion event per registered file.
                    EventIP.objects.create(
                        eventType=self.delete_event_type,
                        eventOutcome=EventIP.SUCCESS,
                        eventOutcomeDetailNote='Deleted {}'.format(rel),
                        linkingObjectIdentifierValue=ip.
                        object_identifier_value,
                    )
            AppraisalJobEntry.objects.bulk_create(job_entries)

            if delete_packages:
                # Physically remove every storage copy, then the IP record.
                for storage_obj in ip.storage.all():
                    storage_obj.delete_files()
                ip.delete()
            else:
                # inactivate old generations
                InformationPackage.objects.filter(
                    aic=ip.aic, generation__lte=ip.generation).update(
                        active=False, last_changed_local=timezone.now())
        else:
            # Partial appraisal: delete only matched files inside a new
            # generation, leaving the original generation untouched.
            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
            new_ip_tmpdir = os.path.join(tmpdir,
                                         new_ip.object_identifier_value)
            storage_obj.read(new_ip_tmpdir, None, extract=True)
            new_ip.object_path = new_ip_tmpdir
            new_ip.save()

            # delete files specified in rule
            for pattern in cast(List[str], self.package_file_pattern):
                for path in iglob(new_ip_tmpdir + '/' + pattern,
                                  case_sensitive=False):
                    if not in_directory(path, new_ip_tmpdir):
                        # Guard against patterns escaping the package dir.
                        raise ValueError(
                            'Invalid file-pattern accessing files outside of package'
                        )

                    if os.path.isdir(path):
                        # Delete every file beneath a matched directory,
                        # then remove the directory tree itself.
                        for root, _dirs, files in walk(path):
                            for f in files:
                                rel = PurePath(os.path.join(
                                    root, f)).relative_to(
                                        new_ip_tmpdir).as_posix()
                                self.delete_file(ip, os.path.join(root, f),
                                                 rel, new_ip)
                        shutil.rmtree(path)
                    else:
                        rel = PurePath(path).relative_to(
                            new_ip_tmpdir).as_posix()
                        self.delete_file(ip, path, rel, new_ip)

            self.delete_document_tags(ip, new_ip, new_ip_tmpdir)

            # Preserve the appraised generation (synchronous celery call).
            with allow_join_result():
                preserve_new_generation(new_ip)

            # Move non-document tags over to the new generation.
            ip.tags.exclude(
                current_version__elastic_index='document',
            ).update(
                information_package=new_ip)

            if delete_packages:
                for storage_obj in ip.storage.all():
                    storage_obj.delete_files()
                ip.delete()
            else:
                # inactivate old generations
                InformationPackage.objects.filter(
                    aic=ip.aic,
                    generation__lte=ip.generation).update(active=False)

            ip.tags.filter(
                current_version__elastic_index='document').delete()

    # Phase 3: packages not selected for this job that only contain this
    # job's document tags.
    document_tag_ips = InformationPackage.objects.exclude(
        appraisal_jobs=self).filter(
        tags__appraisal_jobs=self,
        tags__current_version__elastic_index='document',
    ).distinct()

    for ip in document_tag_ips.iterator():
        storage_obj: Optional[StorageObject] = ip.storage.readable(
        ).fastest().first()
        if storage_obj is None:
            raise NoReadableStorage

        new_ip = ip.create_new_generation(ip.state, ip.responsible, None)
        new_ip_tmpdir = os.path.join(tmpdir, new_ip.object_identifier_value)
        storage_obj.read(new_ip_tmpdir, None, extract=True)
        new_ip.object_path = new_ip_tmpdir
        new_ip.save()

        self.delete_document_tags(ip, new_ip, new_ip_tmpdir)

        with allow_join_result():
            preserve_new_generation(new_ip)

        ip.tags.exclude(
            current_version__elastic_index='document',
        ).update(
            information_package=new_ip)

        if delete_packages:
            for storage_obj in ip.storage.all():
                storage_obj.delete_files()
            ip.delete()
        else:
            # inactivate old generations
            InformationPackage.objects.filter(
                aic=ip.aic,
                generation__lte=ip.generation).update(active=False)

        ip.tags.filter(
            current_version__elastic_index='document').delete()

    # Finally drop all tags attached to this job.
    self.tags.all().delete()
def files(self, path=''):
    """Serve *path* from this information package.

    If the package is stored as a single container file (tar/zip), the
    container is listed or the requested member is served; the sidecar
    ``<container>.xml`` file is also served/listed. Otherwise *path* is
    resolved inside the package directory and either the file content or a
    name-sorted directory listing is returned.

    Args:
        path: path relative to the package (default '' lists the root).

    Returns:
        rest_framework ``Response`` (listings) or Django ``HttpResponse``
        (file content).

    Raises:
        exceptions.NotFound: missing path/member.
        exceptions.ParseError: path escapes the package directory.
    """
    # Rebuild the mimetype tables from the project-configured definition
    # file only. NOTE(review): this mutates module-global state in
    # `mimetypes` for the whole process — confirm that is intended.
    mimetypes.suffix_map = {}
    mimetypes.encodings_map = {}
    mimetypes.types_map = {}
    mimetypes.common_types = {}
    mimetypes_file = Path.objects.get(
        entity="path_mimetypes_definitionfile").value
    mimetypes.init(files=[mimetypes_file])
    mtypes = mimetypes.types_map

    MAX_FILE_SIZE = 100000000  # 100 MB (unused in this method; kept as-is)

    def _file_response(data, name, content_type):
        # Build a content response; files with an unknown content type are
        # offered as downloads ('attachment') instead of shown inline.
        response = HttpResponse(data, content_type=content_type)
        disposition = 'attachment' if content_type is None else 'inline'
        response['Content-Disposition'] = '%s; filename="%s"' % (
            disposition, os.path.basename(name))
        return response

    def _read_file(fullpath):
        # fix: read in binary mode with a context manager. The original
        # used open(fullpath).read() — leaking the handle and decoding as
        # text, which breaks on non-UTF-8/binary content.
        with open(fullpath, 'rb') as f:
            return f.read()

    if os.path.isfile(self.object_path):
        container = self.object_path
        xml = os.path.splitext(self.object_path)[0] + '.xml'

        if path.startswith(os.path.basename(container)):
            fullpath = os.path.join(os.path.dirname(container), path)

            if tarfile.is_tarfile(container):
                with tarfile.open(container) as tar:
                    if fullpath == container:
                        # List the regular files inside the tar archive.
                        entries = []
                        for member in tar.getmembers():
                            if not member.isfile():
                                continue
                            entries.append({
                                "name": member.name,
                                "type": 'file',
                                "size": member.size,
                                "modified":
                                    timestamp_to_datetime(member.mtime),
                            })
                        return Response(entries)

                    subpath = fullpath[len(container) + 1:]
                    try:
                        member = tar.getmember(subpath)
                    except KeyError:
                        # Member does not exist in the archive.
                        raise exceptions.NotFound
                    if not member.isfile():
                        raise exceptions.NotFound

                    content_type = mtypes.get(os.path.splitext(subpath)[1])
                    # fix: close the extracted member instead of leaking it.
                    with tar.extractfile(member) as f:
                        return _file_response(f.read(), f.name, content_type)

            elif zipfile.is_zipfile(container):
                with zipfile.ZipFile(container) as zipf:
                    if fullpath == container:
                        # List the file entries inside the zip archive.
                        entries = []
                        for member in zipf.filelist:
                            if member.filename.endswith('/'):
                                continue  # directory entry
                            entries.append({
                                "name": member.filename,
                                "type": 'file',
                                "size": member.file_size,
                                "modified":
                                    datetime.datetime(*member.date_time),
                            })
                        return Response(entries)

                    subpath = fullpath[len(container) + 1:]
                    content_type = mtypes.get(os.path.splitext(subpath)[1])
                    try:
                        # fix: close the member file instead of leaking it.
                        with zipf.open(subpath) as f:
                            return _file_response(f.read(), f.name,
                                                  content_type)
                    except KeyError:
                        raise exceptions.NotFound

            # Container is neither tar nor zip: serve the path from disk.
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            return _file_response(_read_file(fullpath), fullpath,
                                  content_type)

        elif os.path.isfile(xml) and path == os.path.basename(xml):
            # Serve the sidecar metadata XML next to the container.
            fullpath = os.path.join(os.path.dirname(container), path)
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            return _file_response(_read_file(fullpath), fullpath,
                                  content_type)

        elif path == '':
            # Root listing: the container and, if present, its XML file.
            entries = [{
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified":
                    timestamp_to_datetime(os.path.getmtime(container)),
            }]
            if os.path.isfile(xml):
                entries.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified":
                        timestamp_to_datetime(os.path.getmtime(xml)),
                })
            return Response(entries)

        elif path is not None:
            # Any other path cannot exist inside a container package.
            raise exceptions.NotFound

    # Package is an extracted directory tree.
    entries = []
    fullpath = os.path.join(self.object_path, path)

    if not in_directory(fullpath, self.object_path):
        raise exceptions.ParseError('Illegal path %s' % path)
    if not os.path.exists(fullpath):
        raise exceptions.NotFound

    if os.path.isfile(fullpath):
        content_type = mtypes.get(os.path.splitext(fullpath)[1])
        return _file_response(_read_file(fullpath), fullpath, content_type)

    for entry in get_files_and_dirs(fullpath):
        entry_type = "dir" if entry.is_dir() else "file"
        if entry_type == 'file' and re.search(
                r'\_\d+$', entry.name) is not None:
            # file chunk (name ends in _<digits>): skip partial uploads
            continue
        size, _ = get_tree_size_and_count(entry.path)
        entries.append({
            "name": os.path.basename(entry.path),
            "type": entry_type,
            "size": size,
            "modified": timestamp_to_datetime(entry.stat().st_mtime),
        })

    sorted_entries = sorted(entries, key=itemgetter('name'))
    return Response(sorted_entries)
def files(self, request, pk=None):
    """List, download, create or delete files inside an information package.

    GET delegates to ``ip.get_path_response``; DELETE removes a file or
    directory; POST creates an empty file or a directory. Modifications are
    only allowed while the IP state is 'Prepared' or 'Uploading'.
    """
    ip = self.get_object()

    if request.method not in permissions.SAFE_METHODS:
        # Content may only be modified before the package is finalized.
        if ip.state not in ['Prepared', 'Uploading']:
            raise exceptions.ParseError(
                "Cannot delete or add content of an IP that is not in 'Prepared' or 'Uploading' state"
            )

    if request.method == 'DELETE':
        try:
            path = request.data['path']
        except KeyError:
            return Response('Path parameter missing',
                            status=status.HTTP_400_BAD_REQUEST)

        root = ip.object_path
        fullpath = os.path.join(root, path)
        # Reject paths that escape the package root.
        if not in_directory(fullpath, root):
            raise exceptions.ParseError('Illegal path %s' % path)

        try:
            # Try directory removal first; fall back to file removal below.
            shutil.rmtree(fullpath)
        except OSError as e:
            if e.errno == errno.ENOENT:
                raise exceptions.NotFound('Path does not exist')
            if e.errno != errno.ENOTDIR:
                raise
            # Path is a regular file, not a directory: remove it directly.
            os.remove(fullpath)
        return Response(status=status.HTTP_204_NO_CONTENT)

    if request.method == 'POST':
        try:
            path = request.data['path']
        except KeyError:
            return Response('Path parameter missing',
                            status=status.HTTP_400_BAD_REQUEST)
        try:
            pathtype = request.data['type']
        except KeyError:
            return Response('Type parameter missing',
                            status=status.HTTP_400_BAD_REQUEST)

        root = ip.object_path
        fullpath = os.path.join(root, path)
        # Reject paths that escape the package root.
        if not in_directory(fullpath, root):
            raise exceptions.ParseError('Illegal path %s' % path)

        if pathtype == 'dir':
            try:
                os.makedirs(fullpath)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    raise exceptions.ParseError(
                        'Directory %s already exists' % path)
                raise
        elif pathtype == 'file':
            # Create an empty file ("touch").
            open(fullpath, 'a').close()
        else:
            return Response('Type must be either "file" or "dir"',
                            status=status.HTTP_400_BAD_REQUEST)
        return Response(path, status=status.HTTP_201_CREATED)

    # GET: serve a file or directory listing, optionally forcing download.
    path = request.query_params.get('path', '').rstrip('/')
    download = request.query_params.get('download', False)
    return ip.get_path_response(path, request, force_download=download,
                                paginator=self.paginator)
def validate_path(self, path):
    """Validate that *path* stays inside this package.

    A path is legal when it resolves inside ``self.object_path`` or is
    exactly the package's sidecar ``.xml`` file.

    Raises:
        exceptions.ValidationError: if the path escapes the package.
    """
    fullpath = os.path.join(self.object_path, path)
    xmlpath = os.path.splitext(self.object_path)[0] + '.xml'
    if not in_directory(fullpath, self.object_path) and fullpath != xmlpath:
        # fix: the original used '{s}'.format(path), which raises
        # KeyError: 's' (named field, positional argument) instead of
        # producing the intended error message.
        raise exceptions.ValidationError('Illegal path: {}'.format(path))