Example #1
def index_document(tag_version, filepath):
    with open(filepath, 'rb') as f:
        content = f.read()

    ip = tag_version.tag.information_package
    encoded_content = base64.b64encode(content).decode("ascii")
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
    }

    doc = File.from_obj(tag_version)
    doc.data = encoded_content

    try:
        doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise
    return doc, tag_version
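
All of these examples rely on a helper named get_tree_size_and_count whose implementation is not shown here. As a rough, hypothetical sketch (assuming it returns a (total_size_in_bytes, file_count) tuple for either a single file or a whole directory tree):

import os

def get_tree_size_and_count(path):
    # Hypothetical sketch, not the project's actual implementation:
    # return the total size in bytes and the number of files under `path`.
    if os.path.isfile(path):
        return os.path.getsize(path), 1

    total_size = 0
    file_count = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            fpath = os.path.join(root, name)
            total_size += os.path.getsize(fpath)
            file_count += 1
    return total_size, file_count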
Example #2
def enough_space_available(dst: str,
                           src: str,
                           raise_exception: bool = False) -> bool:
    """
    Tells if there is enough space available at
    path dst for src to be copied there

    :param src: Path to be copied
    :param dst: Destination
    :param raise_exception: Raises exception if set to true and enough space
        is not available
    :return: True if src can be copied to dst, else False
    """

    src_size, _ = get_tree_size_and_count(src)
    dst_free_space = shutil.disk_usage(dst).free

    if src_size > dst_free_space:
        if raise_exception:
            raise NoSpaceLeftError(
                f'Not enough space available for {src} at {dst}')

        return False

    return True
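
A short usage sketch for the helper above; the paths are placeholders and NoSpaceLeftError is assumed to be defined in the same codebase:

import shutil

# Return-value style: check first, then copy.
if enough_space_available('/mnt/archive', '/tmp/ingest/package'):
    shutil.copytree('/tmp/ingest/package', '/mnt/archive/package')

# Exception style: let the helper raise NoSpaceLeftError instead of returning False.
enough_space_available('/mnt/archive', '/tmp/ingest/package', raise_exception=True)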
Example #3
def index_document(ip, filepath, id):
    with open(filepath, 'rb') as f:
        content = f.read()

    encoded_content = base64.b64encode(content).decode("ascii")
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filename)[1][1:]
    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    doc = File(_id=id,
               name=filename,
               type="document",
               filename=filename,
               extension=extension,
               href=href,
               ip=str(ip.pk),
               data=encoded_content,
               size=size,
               modified=modified,
               current_version=True)
    doc.save(pipeline='ingest_attachment')
    return doc
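
A hypothetical call to the function above, assuming ip is an InformationPackage instance whose object_path contains the file being indexed (the path and filename are placeholders):

import os
import uuid

filepath = os.path.join(ip.object_path, 'content/report.pdf')  # placeholder path
doc = index_document(ip, filepath, str(uuid.uuid4()))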
Example #4
    def parse_document(self, ip, rootdir, document, act, parent):
        id = str(uuid.uuid4())
        name = document.get("Namn")
        desc = document.get("Beskrivning")

        filepath = document.get('Lank')
        if ip is not None:
            filepath = os.path.join(ip.object_path, ip.sip_path, document.get('Lank'))
        elif rootdir is not None:
            filepath = os.path.join(rootdir, document.get('Lank'))

        href = os.path.dirname(os.path.relpath(filepath, rootdir))
        href = '' if href == '.' else href
        filename = os.path.basename(filepath)
        ext = os.path.splitext(filepath)[1][1:]

        with open(filepath, 'rb') as f:
            content = f.read()
            encoded_content = base64.b64encode(content).decode("ascii")

        size, _ = get_tree_size_and_count(filepath)
        modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

        d = File(
            _id=id,
            name=name,
            type='Bilaga',
            archive=act.archive,
            desc=desc,
            filename=filename,
            href=href,
            extension=ext,
            data=encoded_content,
            size=size,
            modified=modified,
            current_version=True,
            ip=act.ip,
            task_id=str(self.task.pk),
        )

        tag = Tag(information_package=ip, task=self.task)
        tag_version = TagVersion(pk=d.meta.id, tag=tag,
                                 elastic_index=d._index._name,
                                 name=d.name, type=d.type,
                                 reference_code='')
        tag_repr = TagStructure(
            tag=tag,
            parent=parent,
            structure=parent.structure,
            tree_id=parent.tree_id,
            lft=0,
            rght=0,
            level=0,
        )
        self.indexed_files.append(filepath)

        d_dict = d.to_dict(include_meta=True)
        d_dict['pipeline'] = 'ingest_attachment'
        return tag, tag_version, tag_repr, d_dict
Example #5
    def run(self, ip=None):
        path = InformationPackage.objects.values_list('object_path',
                                                      flat=True).get(pk=ip)
        size, count = get_tree_size_and_count(path)

        InformationPackage.objects.filter(pk=ip).update(object_size=size,
                                                        object_num_items=count)

        return size, count
Example #6
    def parse_document(self, ip, rootdir, document, act, parent, archive):
        id = str(uuid.uuid4())
        name = document.get("Namn")
        desc = document.get("Beskrivning")

        filepath = os.path.join('content', document.get('Lank'))
        if ip is not None:
            filepath = os.path.join(ip.object_path, ip.sip_path, 'content',
                                    document.get('Lank'))
        elif rootdir is not None:
            filepath = os.path.join(rootdir, 'content', document.get('Lank'))

        href = os.path.dirname(os.path.relpath(filepath, rootdir))
        href = '' if href == '.' else href
        filename = os.path.basename(filepath)
        ext = os.path.splitext(filepath)[1][1:]

        encoded_content = get_encoded_content_from_file(filepath)

        size, _ = get_tree_size_and_count(filepath)
        modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

        custom_fields = {
            'filename': filename,
            'href': href,
            'extension': ext,
            'size': size,
            'modified': modified,
        }

        tag = Tag.objects.create(information_package=ip, task=self.task)
        tag_version_type, _ = TagVersionType.objects.get_or_create(
            name='Bilaga')
        tag_version = TagVersion.objects.create(
            pk=id,
            tag=tag,
            elastic_index='document',
            name=name,
            description=desc,
            type=tag_version_type,
            reference_code='',
            custom_fields=custom_fields,
        )
        tag_repr = TagStructure.objects.create(
            tag=tag,
            parent=parent,
            structure=parent.structure,
        )
        self.indexed_files.append(filepath)

        d = File.from_obj(tag_version, archive)
        d.data = encoded_content
        d_dict = d.to_dict(include_meta=True)
        d_dict['pipeline'] = 'ingest_attachment'
        return tag, tag_version, tag_repr, d_dict
Example #7
def UpdateIPSizeAndCount(self):
    ip = self.ip
    path = InformationPackage.objects.values_list('object_path', flat=True).get(pk=ip)
    size, count = get_tree_size_and_count(path)

    InformationPackage.objects.filter(pk=ip).update(
        object_size=size, object_num_items=count,
        last_changed_local=timezone.now(),
    )

    msg = "Updated size and count of IP"
    self.create_success_event(msg)

    return size, count
Example #8
def index_document(tag_version, filepath):
    exclude_file_format_from_indexing_content = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT

    fid = FormatIdentifier()
    (format_name, format_version, format_registry_key) = fid.identify_file_format(filepath)
    index_file_content = format_registry_key not in exclude_file_format_from_indexing_content

    ip = tag_version.tag.information_package
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
        'formatname': format_name,
        'formatversion': format_version,
        'formatkey': format_registry_key,
    }

    doc = File.from_obj(tag_version)

    try:
        if index_file_content:
            with open(filepath, 'rb') as f:
                content = f.read()
            doc.data = base64.b64encode(content).decode("ascii")
            doc.save(pipeline='ingest_attachment')
        else:
            logger.debug('Skipping file content indexing for {}'.format(filepath))
            doc.save()
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise
    return doc, tag_version
Example #9
    def files(self, path=''):
        mimetypes.suffix_map = {}
        mimetypes.encodings_map = {}
        mimetypes.types_map = {}
        mimetypes.common_types = {}
        mimetypes_file = Path.objects.get(
            entity="path_mimetypes_definitionfile").value
        mimetypes.init(files=[mimetypes_file])
        mtypes = mimetypes.types_map

        MAX_FILE_SIZE = 100000000  # 100 MB

        if os.path.isfile(self.object_path):
            container = self.object_path
            xml = os.path.splitext(self.object_path)[0] + '.xml'

            if path.startswith(os.path.basename(container)):
                fullpath = os.path.join(os.path.dirname(container), path)

                if tarfile.is_tarfile(container):
                    with tarfile.open(container) as tar:
                        if fullpath == container:
                            entries = []
                            for member in tar.getmembers():
                                if not member.isfile():
                                    continue

                                entries.append({
                                    "name": member.name,
                                    "type": 'file',
                                    "size": member.size,
                                    "modified": timestamp_to_datetime(member.mtime),
                                })
                            return Response(entries)
                        else:
                            subpath = fullpath[len(container) + 1:]
                            try:
                                member = tar.getmember(subpath)

                                if not member.isfile():
                                    raise exceptions.NotFound

                                f = tar.extractfile(member)
                                content_type = mtypes.get(
                                    os.path.splitext(subpath)[1])
                                response = HttpResponse(
                                    f.read(), content_type=content_type)
                                response['Content-Disposition'] = (
                                    'inline; filename="%s"' % os.path.basename(f.name))
                                if content_type is None:
                                    response['Content-Disposition'] = (
                                        'attachment; filename="%s"' % os.path.basename(f.name))
                                return response
                            except KeyError:
                                raise exceptions.NotFound

                elif zipfile.is_zipfile(container):
                    with zipfile.ZipFile(container) as zipf:
                        if fullpath == container:
                            entries = []
                            for member in zipf.filelist:
                                if member.filename.endswith('/'):
                                    continue

                                entries.append({
                                    "name": member.filename,
                                    "type": 'file',
                                    "size": member.file_size,
                                    "modified": datetime.datetime(*member.date_time),
                                })
                            return Response(entries)
                        else:
                            subpath = fullpath[len(container) + 1:]
                            try:
                                f = zipf.open(subpath)
                                content_type = mtypes.get(
                                    os.path.splitext(subpath)[1])
                                response = HttpResponse(
                                    f.read(), content_type=content_type)
                                response['Content-Disposition'] = (
                                    'inline; filename="%s"' % os.path.basename(f.name))
                                if content_type is None:
                                    response['Content-Disposition'] = (
                                        'attachment; filename="%s"' % os.path.basename(f.name))
                                return response
                            except KeyError:
                                raise exceptions.NotFound

                content_type = mtypes.get(os.path.splitext(fullpath)[1])
                with open(fullpath, 'rb') as f:
                    response = HttpResponse(f.read(), content_type=content_type)
                response['Content-Disposition'] = (
                    'inline; filename="%s"' % os.path.basename(fullpath))
                if content_type is None:
                    response['Content-Disposition'] = (
                        'attachment; filename="%s"' % os.path.basename(fullpath))
                return response
            elif os.path.isfile(xml) and path == os.path.basename(xml):
                fullpath = os.path.join(os.path.dirname(container), path)
                content_type = mtypes.get(os.path.splitext(fullpath)[1])
                with open(fullpath, 'rb') as f:
                    response = HttpResponse(f.read(), content_type=content_type)
                response['Content-Disposition'] = (
                    'inline; filename="%s"' % os.path.basename(fullpath))
                if content_type is None:
                    response['Content-Disposition'] = (
                        'attachment; filename="%s"' % os.path.basename(fullpath))
                return response
            elif path == '':
                entries = []

                entries.append({
                    "name": os.path.basename(container),
                    "type": 'file',
                    "size": os.path.getsize(container),
                    "modified": timestamp_to_datetime(os.path.getmtime(container)),
                })

                if os.path.isfile(xml):
                    entries.append({
                        "name": os.path.basename(xml),
                        "type": 'file',
                        "size": os.path.getsize(xml),
                        "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                    })
                return Response(entries)

            elif path is not None:
                raise exceptions.NotFound

        entries = []
        fullpath = os.path.join(self.object_path, path)

        if not in_directory(fullpath, self.object_path):
            raise exceptions.ParseError('Illegal path %s' % path)

        if not os.path.exists(fullpath):
            raise exceptions.NotFound

        if os.path.isfile(fullpath):
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            with open(fullpath, 'rb') as f:
                response = HttpResponse(f.read(), content_type=content_type)
            response['Content-Disposition'] = (
                'inline; filename="%s"' % os.path.basename(fullpath))
            if content_type is None:
                response['Content-Disposition'] = (
                    'attachment; filename="%s"' % os.path.basename(fullpath))
            return response

        for entry in get_files_and_dirs(fullpath):
            entry_type = "dir" if entry.is_dir() else "file"

            if entry_type == 'file' and re.search(
                    r'\_\d+$', entry.name) is not None:  # file chunk
                continue

            size, _ = get_tree_size_and_count(entry.path)

            entries.append({
                "name": os.path.basename(entry.path),
                "type": entry_type,
                "size": size,
                "modified": timestamp_to_datetime(entry.stat().st_mtime),
            })

        sorted_entries = sorted(entries, key=itemgetter('name'))
        return Response(sorted_entries)
Example #10
    def ObjectSizeAndNum(self):
        return get_tree_size_and_count(self.ObjectPath)
Example #11
    def list_files(self, path=''):
        fullpath = os.path.join(self.object_path, path).rstrip('/')
        if os.path.basename(self.object_path) == path and os.path.isfile(
                self.object_path):
            if tarfile.is_tarfile(self.object_path):
                with tarfile.open(self.object_path) as tar:
                    entries = []
                    for member in tar.getmembers():
                        if not member.isfile():
                            continue

                        entries.append({
                            "name": member.name,
                            "type": 'file',
                            "size": member.size,
                            "modified": timestamp_to_datetime(member.mtime),
                        })
                    return entries

            elif zipfile.is_zipfile(self.object_path) and os.path.splitext(
                    self.object_path)[1] == '.zip':
                with zipfile.ZipFile(self.object_path) as zipf:
                    entries = []
                    for member in zipf.filelist:
                        if member.filename.endswith('/'):
                            continue

                        entries.append({
                            "name": member.filename,
                            "type": 'file',
                            "size": member.file_size,
                            "modified": datetime(*member.date_time),
                        })
                    return entries

        if os.path.isfile(self.object_path) and not path:
            container = self.object_path
            xml = os.path.splitext(container)[0] + '.xml'
            entries = [{
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified": timestamp_to_datetime(os.path.getmtime(container)),
            }]

            if os.path.isfile(xml):
                entries.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                })

            return entries

        entries = []
        for entry in sorted(get_files_and_dirs(fullpath),
                            key=lambda x: x.name):
            entry_type = "dir" if entry.is_dir() else "file"
            size, _ = get_tree_size_and_count(entry.path)

            entries.append({
                "name": os.path.basename(entry.path),
                "type": entry_type,
                "size": size,
                "modified": timestamp_to_datetime(entry.stat().st_mtime),
            })

        return entries
Example #12
    def list_files(self, path=''):
        fullpath = os.path.join(self.object_path, path).rstrip('/')
        if os.path.basename(self.object_path) == path and os.path.isfile(
                self.object_path):
            if tarfile.is_tarfile(self.object_path):
                with tarfile.open(self.object_path) as tar:
                    entries = []
                    for member in tar.getmembers():
                        if not member.isfile():
                            continue

                        entries.append({
                            "name": member.name,
                            "type": 'file',
                            "size": member.size,
                            "modified": timestamp_to_datetime(member.mtime),
                        })
                    return entries

            elif zipfile.is_zipfile(self.object_path) and os.path.splitext(
                    self.object_path)[1] == '.zip':
                with zipfile.ZipFile(self.object_path) as zipf:
                    entries = []
                    for member in zipf.filelist:
                        if member.filename.endswith('/'):
                            continue

                        entries.append({
                            "name": member.filename,
                            "type": 'file',
                            "size": member.file_size,
                            "modified": datetime(*member.date_time),
                        })
                    return entries

        if os.path.isfile(self.object_path) and not path:
            container = self.object_path
            xml = os.path.splitext(container)[0] + '.xml'
            entries = [{
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified": timestamp_to_datetime(os.path.getmtime(container)),
            }]

            if os.path.isfile(xml):
                entries.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                })

            return entries

        entries = []
        for entry in sorted(get_files_and_dirs(fullpath),
                            key=lambda x: x.name):
            try:
                entry_type = "dir" if entry.is_dir() else "file"
                size, _ = get_tree_size_and_count(entry.path)

                entries.append({
                    "name": os.path.basename(entry.path),
                    "type": entry_type,
                    "size": size,
                    "modified": timestamp_to_datetime(entry.stat().st_mtime),
                })
            except OSError as e:
                # The file might be deleted (e.g. temporary upload files) while we gather
                # additional data; in that case we ignore it, otherwise we re-raise the error.

                if e.errno != errno.ENOENT:
                    raise

        return entries