def index_document(tag_version, filepath):
    with open(filepath, 'rb') as f:
        content = f.read()

    ip = tag_version.tag.information_package
    encoded_content = base64.b64encode(content).decode("ascii")
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
    }

    doc = File.from_obj(tag_version)
    doc.data = encoded_content
    try:
        doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise

    return doc, tag_version

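# A self-contained sketch of the encoding step above: Elasticsearch's
# ingest-attachment pipeline expects the file body as a base64 ASCII string,
# which is what index_document() passes via doc.data. The path is hypothetical.
import base64

def encode_for_ingest_attachment(filepath):
    with open(filepath, 'rb') as f:
        return base64.b64encode(f.read()).decode('ascii')

# data = encode_for_ingest_attachment('/tmp/example.pdf')  # hypothetical path
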
def enough_space_available(dst: str, src: str, raise_exception: bool = False) -> bool:
    """
    Tells if there is enough space available at path dst for src to be copied there

    :param dst: Destination
    :param src: Path to be copied
    :param raise_exception: Raise NoSpaceLeftError if set to True and enough space is not available
    :return: True if src can be copied to dst, else False
    """
    src_size, _ = get_tree_size_and_count(src)
    dst_free_space = shutil.disk_usage(dst).free

    # A plain comparison instead of assert: asserts are stripped under
    # "python -O", which would silently disable the check.
    if src_size > dst_free_space:
        if raise_exception:
            raise NoSpaceLeftError(
                f'Not enough space available for {src} at {dst}')
        return False
    return True

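# Usage sketch for enough_space_available(): note the argument order (dst before
# src) and that raise_exception=True turns the False return into NoSpaceLeftError.
# The helper and paths below are hypothetical.
import os
import shutil

def copy_if_space(src, dst):
    # Raises NoSpaceLeftError instead of copying partially when dst is too small.
    if enough_space_available(dst, src, raise_exception=True):
        shutil.copytree(src, os.path.join(dst, os.path.basename(src)))
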
def index_document(ip, filepath, id):
    with open(filepath, 'rb') as f:
        content = f.read()

    encoded_content = base64.b64encode(content).decode("ascii")
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filename)[1][1:]
    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    doc = File(
        _id=id,
        name=filename,
        type="document",
        filename=filename,
        extension=extension,
        href=href,
        ip=str(ip.pk),
        data=encoded_content,
        size=size,
        modified=modified,
        current_version=True,
    )
    doc.save(pipeline='ingest_attachment')
    return doc

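# The href normalization shared by the indexing functions here, in isolation:
# the file's directory relative to the package root, with '.' collapsed to ''
# for files at the root. Example paths are hypothetical.
import os

def relative_href(filepath, root):
    href = os.path.relpath(os.path.dirname(filepath), root)
    return '' if href == '.' else href

# relative_href('/ips/ip1/content/doc.pdf', '/ips/ip1')  -> 'content'
# relative_href('/ips/ip1/doc.pdf', '/ips/ip1')          -> ''
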
def parse_document(self, ip, rootdir, document, act, parent):
    id = str(uuid.uuid4())
    # "Namn" = name, "Beskrivning" = description, "Lank" = link in the source metadata
    name = document.get("Namn")
    desc = document.get("Beskrivning")

    filepath = document.get('Lank')
    if ip is not None:
        filepath = os.path.join(ip.object_path, ip.sip_path, document.get('Lank'))
    elif rootdir is not None:
        filepath = os.path.join(rootdir, document.get('Lank'))

    href = os.path.dirname(os.path.relpath(filepath, rootdir))
    href = '' if href == '.' else href
    filename = os.path.basename(filepath)
    ext = os.path.splitext(filepath)[1][1:]

    with open(filepath, 'rb') as f:
        content = f.read()
        encoded_content = base64.b64encode(content).decode("ascii")

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    d = File(
        _id=id,
        name=name,
        type='Bilaga',  # "Bilaga" = attachment
        archive=act.archive,
        desc=desc,
        filename=filename,
        href=href,
        extension=ext,
        data=encoded_content,
        size=size,
        modified=modified,
        current_version=True,
        ip=act.ip,
        task_id=str(self.task.pk),
    )
    tag = Tag(information_package=ip, task=self.task)
    tag_version = TagVersion(
        pk=d.meta.id,
        tag=tag,
        elastic_index=d._index._name,
        name=d.name,
        type=d.type,
        reference_code='',
    )
    tag_repr = TagStructure(
        tag=tag,
        parent=parent,
        structure=parent.structure,
        tree_id=parent.tree_id,
        lft=0,   # placeholder tree fields, presumably rebuilt later
        rght=0,
        level=0,
    )

    self.indexed_files.append(filepath)

    d_dict = d.to_dict(include_meta=True)
    d_dict['pipeline'] = 'ingest_attachment'

    return tag, tag_version, tag_repr, d_dict

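# Why parse_document() returns a plain dict with a 'pipeline' key: the dicts are
# presumably collected and indexed in bulk, where elasticsearch-py's helpers
# forward per-action parameters such as the ingest pipeline. A minimal sketch;
# the client setup and the surrounding collection loop are assumptions.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def bulk_index(actions):
    es = Elasticsearch()  # assumed default localhost:9200 client
    # Each action dict (from File.to_dict(include_meta=True)) may carry its own
    # 'pipeline' key, e.g. 'ingest_attachment'.
    return bulk(es, actions)
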
def run(self, ip=None):
    path = InformationPackage.objects.values_list('object_path', flat=True).get(pk=ip)
    size, count = get_tree_size_and_count(path)
    InformationPackage.objects.filter(pk=ip).update(object_size=size, object_num_items=count)
    return size, count

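# A hedged guess at what get_tree_size_and_count() returns for the update above:
# total size in bytes and number of files under a path. An illustrative stdlib
# version only, not the project's implementation.
import os

def tree_size_and_count(path):
    if os.path.isfile(path):
        return os.path.getsize(path), 1
    size, count = 0, 0
    for entry in os.scandir(path):
        entry_size, entry_count = tree_size_and_count(entry.path)
        size += entry_size
        count += entry_count
    return size, count
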
def parse_document(self, ip, rootdir, document, act, parent, archive):
    id = str(uuid.uuid4())
    name = document.get("Namn")
    desc = document.get("Beskrivning")

    filepath = os.path.join('content', document.get('Lank'))
    if ip is not None:
        filepath = os.path.join(ip.object_path, ip.sip_path, 'content', document.get('Lank'))
    elif rootdir is not None:
        filepath = os.path.join(rootdir, 'content', document.get('Lank'))

    href = os.path.dirname(os.path.relpath(filepath, rootdir))
    href = '' if href == '.' else href
    filename = os.path.basename(filepath)
    ext = os.path.splitext(filepath)[1][1:]

    encoded_content = get_encoded_content_from_file(filepath)
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    custom_fields = {
        'filename': filename,
        'href': href,
        'extension': ext,
        'size': size,
        'modified': modified,
    }

    tag = Tag.objects.create(information_package=ip, task=self.task)
    tag_version_type, _ = TagVersionType.objects.get_or_create(name='Bilaga')  # "Bilaga" = attachment
    tag_version = TagVersion.objects.create(
        pk=id,
        tag=tag,
        elastic_index='document',
        name=name,
        description=desc,
        type=tag_version_type,
        reference_code='',
        custom_fields=custom_fields,
    )
    tag_repr = TagStructure.objects.create(
        tag=tag,
        parent=parent,
        structure=parent.structure,
    )

    self.indexed_files.append(filepath)

    d = File.from_obj(tag_version, archive)
    d.data = encoded_content
    d_dict = d.to_dict(include_meta=True)
    d_dict['pipeline'] = 'ingest_attachment'

    return tag, tag_version, tag_repr, d_dict

def UpdateIPSizeAndCount(self):
    ip = self.ip
    path = InformationPackage.objects.values_list('object_path', flat=True).get(pk=ip)
    size, count = get_tree_size_and_count(path)
    InformationPackage.objects.filter(pk=ip).update(
        object_size=size,
        object_num_items=count,
        last_changed_local=timezone.now(),
    )
    msg = "Updated size and count of IP"
    self.create_success_event(msg)
    return size, count

def index_document(tag_version, filepath):
    exclude_file_format_from_indexing_content = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT

    fid = FormatIdentifier()
    (format_name, format_version, format_registry_key) = fid.identify_file_format(filepath)
    # Only index the file's content when its identified format is not excluded.
    index_file_content = format_registry_key not in exclude_file_format_from_indexing_content

    ip = tag_version.tag.information_package
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
        'formatname': format_name,
        'formatversion': format_version,
        'formatkey': format_registry_key,
    }

    doc = File.from_obj(tag_version)

    try:
        if index_file_content:
            with open(filepath, 'rb') as f:
                content = f.read()
            doc.data = base64.b64encode(content).decode("ascii")
            doc.save(pipeline='ingest_attachment')
        else:
            logger.debug('Skipping file content indexing for {}'.format(filepath))
            doc.save()
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise

    return doc, tag_version

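# The content-indexing switch above, in isolation: files whose identified format
# registry key (e.g. a PRONOM PUID) is listed in settings keep their metadata
# indexed but skip content extraction. The keys below are illustrative only.
EXCLUDED_FORMAT_KEYS = {'fmt/199', 'x-fmt/418'}  # hypothetical exclusions

def should_index_content(format_registry_key):
    return format_registry_key not in EXCLUDED_FORMAT_KEYS
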
def files(self, path=''):
    mimetypes.suffix_map = {}
    mimetypes.encodings_map = {}
    mimetypes.types_map = {}
    mimetypes.common_types = {}
    mimetypes_file = Path.objects.get(entity="path_mimetypes_definitionfile").value
    mimetypes.init(files=[mimetypes_file])
    mtypes = mimetypes.types_map

    MAX_FILE_SIZE = 100000000  # 100 MB

    if os.path.isfile(self.object_path):
        container = self.object_path
        xml = os.path.splitext(self.object_path)[0] + '.xml'

        if path.startswith(os.path.basename(container)):
            fullpath = os.path.join(os.path.dirname(container), path)

            if tarfile.is_tarfile(container):
                with tarfile.open(container) as tar:
                    if fullpath == container:
                        entries = []
                        for member in tar.getmembers():
                            if not member.isfile():
                                continue
                            entries.append({
                                "name": member.name,
                                "type": 'file',
                                "size": member.size,
                                "modified": timestamp_to_datetime(member.mtime),
                            })
                        return Response(entries)
                    else:
                        subpath = fullpath[len(container) + 1:]
                        try:
                            member = tar.getmember(subpath)
                            if not member.isfile():
                                raise exceptions.NotFound

                            f = tar.extractfile(member)
                            content_type = mtypes.get(os.path.splitext(subpath)[1])
                            response = HttpResponse(f.read(), content_type=content_type)
                            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(f.name)
                            if content_type is None:
                                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(f.name)
                            return response
                        except KeyError:
                            raise exceptions.NotFound

            elif zipfile.is_zipfile(container):
                with zipfile.ZipFile(container) as zipf:
                    if fullpath == container:
                        entries = []
                        for member in zipf.filelist:
                            if member.filename.endswith('/'):
                                continue
                            entries.append({
                                "name": member.filename,
                                "type": 'file',
                                "size": member.file_size,
                                "modified": datetime.datetime(*member.date_time),
                            })
                        return Response(entries)
                    else:
                        subpath = fullpath[len(container) + 1:]
                        try:
                            f = zipf.open(subpath)
                            content_type = mtypes.get(os.path.splitext(subpath)[1])
                            response = HttpResponse(f.read(), content_type=content_type)
                            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(f.name)
                            if content_type is None:
                                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(f.name)
                            return response
                        except KeyError:
                            raise exceptions.NotFound

            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            response = HttpResponse(open(fullpath).read(), content_type=content_type)
            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(fullpath)
            if content_type is None:
                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(fullpath)
            return response

        elif os.path.isfile(xml) and path == os.path.basename(xml):
            fullpath = os.path.join(os.path.dirname(container), path)
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            response = HttpResponse(open(fullpath).read(), content_type=content_type)
            response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(fullpath)
            if content_type is None:
                response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(fullpath)
            return response

        elif path == '':
            entries = []
            entries.append({
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified": timestamp_to_datetime(os.path.getmtime(container)),
            })
            if os.path.isfile(xml):
                entries.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                })
            return Response(entries)

        elif path is not None:
            raise exceptions.NotFound

    entries = []
    fullpath = os.path.join(self.object_path, path)

    if not in_directory(fullpath, self.object_path):
        raise exceptions.ParseError('Illegal path %s' % path)

    if not os.path.exists(fullpath):
        raise exceptions.NotFound

    if os.path.isfile(fullpath):
        content_type = mtypes.get(os.path.splitext(fullpath)[1])
        response = HttpResponse(open(fullpath).read(), content_type=content_type)
        response['Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(fullpath)
        if content_type is None:
            response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(fullpath)
        return response

    for entry in get_files_and_dirs(fullpath):
        entry_type = "dir" if entry.is_dir() else "file"

        if entry_type == 'file' and re.search(r'\_\d+$', entry.name) is not None:
            # file chunk
            continue

        size, _ = get_tree_size_and_count(entry.path)
        entries.append({
            "name": os.path.basename(entry.path),
            "type": entry_type,
            "size": size,
            "modified": timestamp_to_datetime(entry.stat().st_mtime),
        })

    sorted_entries = sorted(entries, key=itemgetter('name'))
    return Response(sorted_entries)

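# A self-contained sketch of the tar branch above: look up a member by its path
# inside the archive, refuse non-regular files, and read its bytes. Archive and
# member paths are hypothetical.
import tarfile

def read_tar_member(container, subpath):
    with tarfile.open(container) as tar:
        member = tar.getmember(subpath)  # raises KeyError if missing
        if not member.isfile():
            raise ValueError('%s is not a regular file' % subpath)
        return tar.extractfile(member).read()

# data = read_tar_member('/tmp/ip.tar', 'content/doc.pdf')  # hypothetical
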
def ObjectSizeAndNum(self):
    return get_tree_size_and_count(self.ObjectPath)

def list_files(self, path=''):
    fullpath = os.path.join(self.object_path, path).rstrip('/')

    if os.path.basename(self.object_path) == path and os.path.isfile(self.object_path):
        if tarfile.is_tarfile(self.object_path):
            with tarfile.open(self.object_path) as tar:
                entries = []
                for member in tar.getmembers():
                    if not member.isfile():
                        continue
                    entries.append({
                        "name": member.name,
                        "type": 'file',
                        "size": member.size,
                        "modified": timestamp_to_datetime(member.mtime),
                    })
                return entries

        elif zipfile.is_zipfile(self.object_path) and os.path.splitext(self.object_path)[1] == '.zip':
            with zipfile.ZipFile(self.object_path) as zipf:
                entries = []
                for member in zipf.filelist:
                    if member.filename.endswith('/'):
                        continue
                    entries.append({
                        "name": member.filename,
                        "type": 'file',
                        "size": member.file_size,
                        "modified": datetime(*member.date_time),
                    })
                return entries

    if os.path.isfile(self.object_path) and not path:
        container = self.object_path
        xml = os.path.splitext(container)[0] + '.xml'
        entries = [{
            "name": os.path.basename(container),
            "type": 'file',
            "size": os.path.getsize(container),
            "modified": timestamp_to_datetime(os.path.getmtime(container)),
        }]
        if os.path.isfile(xml):
            entries.append({
                "name": os.path.basename(xml),
                "type": 'file',
                "size": os.path.getsize(xml),
                "modified": timestamp_to_datetime(os.path.getmtime(xml)),
            })
        return entries

    entries = []
    for entry in sorted(get_files_and_dirs(fullpath), key=lambda x: x.name):
        entry_type = "dir" if entry.is_dir() else "file"
        size, _ = get_tree_size_and_count(entry.path)
        entries.append({
            "name": os.path.basename(entry.path),
            "type": entry_type,
            "size": size,
            "modified": timestamp_to_datetime(entry.stat().st_mtime),
        })
    return entries

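# A stdlib sketch of the zip branch above: ZipInfo.date_time is a
# (year, month, day, hour, minute, second) tuple that unpacks directly into
# datetime(). The archive path is hypothetical.
from datetime import datetime
import zipfile

def list_zip_files(container):
    with zipfile.ZipFile(container) as zipf:
        return [
            {'name': m.filename, 'size': m.file_size, 'modified': datetime(*m.date_time)}
            for m in zipf.filelist
            if not m.filename.endswith('/')  # skip directory entries
        ]
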
def list_files(self, path=''):
    fullpath = os.path.join(self.object_path, path).rstrip('/')

    if os.path.basename(self.object_path) == path and os.path.isfile(self.object_path):
        if tarfile.is_tarfile(self.object_path):
            with tarfile.open(self.object_path) as tar:
                entries = []
                for member in tar.getmembers():
                    if not member.isfile():
                        continue
                    entries.append({
                        "name": member.name,
                        "type": 'file',
                        "size": member.size,
                        "modified": timestamp_to_datetime(member.mtime),
                    })
                return entries

        elif zipfile.is_zipfile(self.object_path) and os.path.splitext(self.object_path)[1] == '.zip':
            with zipfile.ZipFile(self.object_path) as zipf:
                entries = []
                for member in zipf.filelist:
                    if member.filename.endswith('/'):
                        continue
                    entries.append({
                        "name": member.filename,
                        "type": 'file',
                        "size": member.file_size,
                        "modified": datetime(*member.date_time),
                    })
                return entries

    if os.path.isfile(self.object_path) and not path:
        container = self.object_path
        xml = os.path.splitext(container)[0] + '.xml'
        entries = [{
            "name": os.path.basename(container),
            "type": 'file',
            "size": os.path.getsize(container),
            "modified": timestamp_to_datetime(os.path.getmtime(container)),
        }]
        if os.path.isfile(xml):
            entries.append({
                "name": os.path.basename(xml),
                "type": 'file',
                "size": os.path.getsize(xml),
                "modified": timestamp_to_datetime(os.path.getmtime(xml)),
            })
        return entries

    entries = []
    for entry in sorted(get_files_and_dirs(fullpath), key=lambda x: x.name):
        try:
            entry_type = "dir" if entry.is_dir() else "file"
            size, _ = get_tree_size_and_count(entry.path)
            entries.append({
                "name": os.path.basename(entry.path),
                "type": entry_type,
                "size": size,
                "modified": timestamp_to_datetime(entry.stat().st_mtime),
            })
        except OSError as e:
            # The entry might be deleted (e.g. a temporary upload file) while we
            # gather its metadata; if so, skip it. Any other error is re-raised.
            if e.errno != errno.ENOENT:
                raise
    return entries

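# The race guarded against above, in isolation: an entry can vanish between
# scandir() listing it and stat() being called on it. FileNotFoundError is the
# ENOENT subclass of OSError, so catching it matches the errno check.
import os

def safe_mtime(path):
    try:
        return os.stat(path).st_mtime
    except FileNotFoundError:  # entry deleted mid-listing; treat as absent
        return None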