Ejemplo n.º 1
0
def generate_package_mets(ip):
    """Create the package-level METS file for *ip* next to its container.

    The profile is selected from the package type (SIP uses
    'submit_description', AIP uses 'aip_description'); the XML is written
    beside ``ip.object_path`` and its path, create date, size and checksum
    are recorded on the IP, which is then saved.

    Raises:
        ValueError: if the package type is neither SIP nor AIP.
    """
    sa = ip.submission_agreement

    profile_type_by_package = {
        InformationPackage.SIP: 'submit_description',
        InformationPackage.AIP: 'aip_description',
    }
    if ip.package_type not in profile_type_by_package:
        raise ValueError(
            'Cannot create package mets for IP of type {package_type}'.format(
                package_type=ip.package_type
            )
        )
    profile_type = profile_type_by_package[ip.package_type]

    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)
    xmlpath = os.path.splitext(ip.object_path)[0] + '.xml'

    data = fill_specification_data(profile_data, ip=ip, sa=sa)
    data["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(ip.object_path)).isoformat()

    algorithm = ip.get_checksum_algorithm()
    generator = XMLGenerator()
    generator.generate(
        {xmlpath: {'spec': profile_rel.profile.specification, 'data': data}},
        folderToParse=ip.object_path,
        algorithm=algorithm,
    )

    # Record metadata about the generated METS on the IP.
    ip.package_mets_path = normalize_path(xmlpath)
    ip.package_mets_create_date = timestamp_to_datetime(creation_date(xmlpath)).isoformat()
    ip.package_mets_size = os.path.getsize(xmlpath)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xmlpath, algorithm=algorithm)
    ip.save()
Ejemplo n.º 2
0
    def test_list_multiple_files_in_folder(self):
        """list_files on a folder returns one 'file' entry per regular file."""
        archive_path = self.create_archive_file('tar')
        self.ip.object_path = archive_path
        self.ip.save()

        expected_entries = []
        for name in os.listdir(self.textdir):
            full = os.path.join(self.textdir, name)
            if not os.path.isfile(full):
                continue
            expected_entries.append({
                'type': 'file',
                'name': name,
                'size': 1,
                'modified': timestamp_to_datetime(os.stat(full).st_mtime),
            })

        entries = self.ip.list_files(path=self.textdir)

        self.assertCountEqual(entries, expected_entries)
        self.assertEqual(len(entries), 3)
Ejemplo n.º 3
0
def parse_file(filepath,
               fid,
               relpath=None,
               algorithm='SHA-256',
               rootdir='',
               provided_data=None):
    """Build the METS file-info dictionary for *filepath*.

    Args:
        filepath: path of the file on disk.
        fid: format identifier providing mimetype/format/encryption probes.
        relpath: path used in the 'href'/'FName' fields; defaults to filepath.
        algorithm: checksum algorithm name recorded as 'FChecksumType'.
        rootdir: value recorded as 'FDir'.
        provided_data: pre-computed field values; any key present here
            suppresses the corresponding (expensive) computation and is
            copied into the result, overriding computed values.

    Returns:
        dict with the METS F* keys describing the file.
    """
    if not relpath:
        relpath = filepath
    if provided_data is None:
        provided_data = {}

    relpath = win_to_posix(relpath)

    fileinfo = dict(
        FName=os.path.basename(relpath),
        FExtension=os.path.splitext(relpath)[1][1:],
        FDir=rootdir,
        FParentDir=os.path.basename(os.path.dirname(filepath)),
        FID=str(uuid.uuid4()),
        daotype="borndigital",
        href=relpath,
        FMimetype=fid.get_mimetype(filepath),
        FSize=str(os.path.getsize(filepath)),
        FUse='Datafile',
        FChecksumType=algorithm,
        FLoctype='URL',
        FLinkType='simple',
        FChecksumLib='ESSArch',
        FIDType='UUID',
    )

    # Only perform the heavy computations whose results were not supplied
    # by the caller via provided_data.
    if 'FCreated' not in provided_data:
        fileinfo['FCreated'] = timestamp_to_datetime(
            creation_date(filepath)).isoformat()

    if 'FChecksum' not in provided_data:
        fileinfo['FChecksum'] = checksum.calculate_checksum(
            filepath, algorithm)

    if 'FEncrypted' not in provided_data:
        fileinfo['FEncrypted'] = fid.identify_file_encryption(filepath)

    format_keys = ('FFormatName', 'FFormatVersion', 'FFormatRegistryKey')
    if any(key not in provided_data for key in format_keys):
        fileinfo.update(zip(format_keys, fid.identify_file_format(filepath)))

    # Caller-supplied values win over anything computed above.
    fileinfo.update(provided_data)

    return fileinfo
Ejemplo n.º 4
0
def index_document(ip, filepath, id):
    """Index *filepath* as a File document with the given *id*.

    Reads the file, base64-encodes its content and saves a File document
    through the 'ingest_attachment' pipeline. The href is the file's
    directory relative to the IP's object path ('' for the root).

    Returns:
        The saved File document.
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filename)[1][1:]

    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    if href == '.':
        href = ''

    with open(filepath, 'rb') as f:
        encoded_content = base64.b64encode(f.read()).decode("ascii")

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    doc = File(
        _id=id,
        name=filename,
        type="document",
        filename=filename,
        extension=extension,
        href=href,
        ip=str(ip.pk),
        data=encoded_content,
        size=size,
        modified=modified,
        current_version=True,
    )
    doc.save(pipeline='ingest_attachment')
    return doc
Ejemplo n.º 5
0
def generate_content_mets(ip):
    """Generate the content METS inside *ip*'s object path.

    The profile type is derived from the package type display name. After
    generation, the METS path, create date, size and checksum are stored
    on the IP, which is then saved.
    """
    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)

    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)

    algorithm = ip.get_checksum_algorithm()
    generator = XMLGenerator(
        allow_unknown_file_types=ip.get_allow_unknown_file_types(),
        allow_encrypted_files=ip.get_allow_encrypted_files(),
    )
    generator.generate(
        {
            full_mets_path: {
                'spec': profile_rel.profile.specification,
                'data': fill_specification_data(profile_data, ip=ip),
            },
        },
        folderToParse=ip.object_path,
        algorithm=algorithm,
    )

    # Record metadata about the generated METS on the IP.
    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(
        creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path, algorithm=algorithm)
    ip.save()
Ejemplo n.º 6
0
    def run(self):
        """Generate the content METS for the task's information package and
        record its path, create date, size and checksum on the IP.

        NOTE(review): unlike generate_content_mets(), the value returned by
        get_content_mets_file_path() is used as-is here — it is not joined
        with ip.object_path — both when generating the file and when calling
        os.path.getsize()/calculate_checksum() on it. Confirm the returned
        path is absolute (or resolvable from the process working directory).
        """
        ip = self.get_information_package()
        mets_path = ip.get_content_mets_file_path()
        # Profile type is derived from the package-type display name.
        profile_type = ip.get_package_type_display().lower()
        profile_rel = ip.get_profile_rel(profile_type)
        profile_data = ip.get_profile_data(profile_type)
        files_to_create = {
            mets_path: {
                'spec': profile_rel.profile.specification,
                'data': fill_specification_data(profile_data, ip=ip)
            }
        }
        algorithm = ip.get_checksum_algorithm()

        generator = XMLGenerator()
        generator.generate(files_to_create,
                           folderToParse=ip.object_path,
                           algorithm=algorithm)

        # Persist metadata about the generated METS on the IP.
        ip.content_mets_path = mets_path
        ip.content_mets_create_date = timestamp_to_datetime(
            creation_date(mets_path)).isoformat()
        ip.content_mets_size = os.path.getsize(mets_path)
        ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
            algorithm.upper()]
        ip.content_mets_digest = calculate_checksum(mets_path,
                                                    algorithm=algorithm)
        ip.save()
Ejemplo n.º 7
0
def index_document(tag_version, filepath):
    """Index the file backing *tag_version* into the search index.

    Fills tag_version.custom_fields with file metadata, builds a File
    document from it, attaches the base64-encoded content and saves it via
    the 'ingest_attachment' pipeline. Indexing failures are logged and
    re-raised.

    Returns:
        (doc, tag_version) tuple.
    """
    ip = tag_version.tag.information_package

    with open(filepath, 'rb') as f:
        encoded_content = base64.b64encode(f.read()).decode("ascii")

    dirname = os.path.dirname(filepath)
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    if href == '.':
        href = ''

    size, _ = get_tree_size_and_count(filepath)

    tag_version.custom_fields = {
        'extension': os.path.splitext(tag_version.name)[1][1:],
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': timestamp_to_datetime(os.stat(filepath).st_mtime),
    }

    doc = File.from_obj(tag_version)
    doc.data = encoded_content

    try:
        doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise
    return doc, tag_version
Ejemplo n.º 8
0
    def parse_document(self, ip, rootdir, document, act, parent):
        """Build index/DB objects for one document element.

        Reads the file referenced by the element, creates an (unsaved) File
        search document plus unsaved Tag/TagVersion/TagStructure instances,
        and returns them together with the document's serialized dict.

        Returns:
            (tag, tag_version, tag_repr, d_dict) where d_dict includes the
            'pipeline' key set to 'ingest_attachment'.
        """
        # NOTE: 'id' shadows the builtin; kept for interface stability.
        id = str(uuid.uuid4())
        # Element keys are Swedish: Namn=name, Beskrivning=description,
        # Lank=link (relative file path).
        name = document.get("Namn")
        desc = document.get("Beskrivning")

        # Resolve the file path: prefer the IP's sip_path, then rootdir,
        # otherwise use the link verbatim.
        filepath = document.get('Lank')
        if ip is not None:
            filepath = os.path.join(ip.object_path, ip.sip_path, document.get('Lank'))
        elif rootdir is not None:
            filepath = os.path.join(rootdir, document.get('Lank'))

        href = os.path.dirname(os.path.relpath(filepath, rootdir))
        href = '' if href == '.' else href
        filename = os.path.basename(filepath)
        ext = os.path.splitext(filepath)[1][1:]

        with open(filepath, 'rb') as f:
            content = f.read()
            encoded_content = base64.b64encode(content).decode("ascii")

        size, _ = get_tree_size_and_count(filepath)
        modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

        d = File(
            _id=id,
            name=name,
            type='Bilaga',
            archive=act.archive,
            desc=desc,
            filename=filename,
            href=href,
            extension=ext,
            data=encoded_content,
            size=size,
            modified=modified,
            current_version=True,
            ip=act.ip,
            task_id=str(self.task.pk),
        )

        # Unsaved model instances; the caller is responsible for persisting.
        tag = Tag(information_package=ip, task=self.task)
        tag_version = TagVersion(pk=d.meta.id, tag=tag,
                                 elastic_index=d._index._name,
                                 name=d.name, type=d.type,
                                 reference_code='')
        # lft/rght/level are placeholders (0); presumably rebuilt by the
        # tree library on save — TODO confirm.
        tag_repr = TagStructure(
            tag=tag,
            parent=parent,
            structure=parent.structure,
            tree_id=parent.tree_id,
            lft=0,
            rght=0,
            level=0,
        )
        self.indexed_files.append(filepath)

        d_dict = d.to_dict(include_meta=True)
        d_dict['pipeline'] = 'ingest_attachment'
        return tag, tag_version, tag_repr, d_dict
Ejemplo n.º 9
0
 def test_list_folder(self):
     """An empty sub-directory is listed as a single 'dir' entry."""
     path = tempfile.mkdtemp(dir=self.datadir)
     expected = {
         'type': 'dir',
         'name': os.path.basename(path),
         'size': 0,
         'modified': timestamp_to_datetime(os.stat(path).st_mtime),
     }
     self.assertEqual(self.ip.list_files(), [expected])
Ejemplo n.º 10
0
    def parse_document(self, ip, rootdir, document, act, parent, archive):
        """Build index/DB objects for one document element under 'content'.

        Creates persisted Tag/TagVersion/TagStructure rows, then builds an
        (unsaved) File search document from the TagVersion and the archive.

        Returns:
            (tag, tag_version, tag_repr, d_dict) where d_dict includes the
            'pipeline' key set to 'ingest_attachment'.
        """
        # NOTE: 'id' shadows the builtin; kept for interface stability.
        id = str(uuid.uuid4())
        # Element keys are Swedish: Namn=name, Beskrivning=description,
        # Lank=link (relative file path).
        name = document.get("Namn")
        desc = document.get("Beskrivning")

        # Resolve the file path under a 'content' directory: prefer the
        # IP's sip_path, then rootdir, otherwise relative 'content/<link>'.
        filepath = os.path.join('content', document.get('Lank'))
        if ip is not None:
            filepath = os.path.join(ip.object_path, ip.sip_path, 'content',
                                    document.get('Lank'))
        elif rootdir is not None:
            filepath = os.path.join(rootdir, 'content', document.get('Lank'))

        href = os.path.dirname(os.path.relpath(filepath, rootdir))
        href = '' if href == '.' else href
        filename = os.path.basename(filepath)
        ext = os.path.splitext(filepath)[1][1:]

        encoded_content = get_encoded_content_from_file(filepath)

        size, _ = get_tree_size_and_count(filepath)
        modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

        custom_fields = {
            'filename': filename,
            'href': href,
            'extension': ext,
            'size': size,
            'modified': modified,
        }

        # These rows are persisted immediately (unlike the File document,
        # which the caller is expected to save).
        tag = Tag.objects.create(information_package=ip, task=self.task)
        tag_version_type, _ = TagVersionType.objects.get_or_create(
            name='Bilaga')
        tag_version = TagVersion.objects.create(
            pk=id,
            tag=tag,
            elastic_index='document',
            name=name,
            description=desc,
            type=tag_version_type,
            reference_code='',
            custom_fields=custom_fields,
        )
        tag_repr = TagStructure.objects.create(
            tag=tag,
            parent=parent,
            structure=parent.structure,
        )
        self.indexed_files.append(filepath)

        d = File.from_obj(tag_version, archive)
        d.data = encoded_content
        d_dict = d.to_dict(include_meta=True)
        d_dict['pipeline'] = 'ingest_attachment'
        return tag, tag_version, tag_repr, d_dict
Ejemplo n.º 11
0
 def test_list_file(self):
     """A single empty file is listed as a single 'file' entry."""
     fd, path = tempfile.mkstemp(dir=self.datadir)
     os.close(fd)
     expected = {
         'type': 'file',
         'name': os.path.basename(path),
         'size': 0,
         'modified': timestamp_to_datetime(os.stat(path).st_mtime),
     }
     self.assertEqual(self.ip.list_files(), [expected])
Ejemplo n.º 12
0
 def test_list_folder_content(self):
     """Listing a sub-directory returns the file it contains."""
     path = tempfile.mkdtemp(dir=self.datadir)
     fd, filepath = tempfile.mkstemp(dir=path)
     os.close(fd)
     expected = {
         'type': 'file',
         'name': os.path.basename(filepath),
         'size': os.stat(filepath).st_size,
         'modified': timestamp_to_datetime(os.stat(filepath).st_mtime),
     }
     self.assertEqual(self.ip.list_files(path=path), [expected])
Ejemplo n.º 13
0
    def test_list_root_folder_when_xml_exists_with_no_params(self):
        """Root listing of a packaged IP yields the container then its XML."""
        archive_path = self.create_archive_file('tar')
        xml_path = self.create_mets_xml_file('archive_file.xml')
        self.ip.object_path = archive_path
        self.ip.save()

        entries = self.ip.list_files(path='')

        expected = [
            {
                'type': 'file',
                'name': os.path.basename(p),
                'size': os.path.getsize(p),
                'modified': timestamp_to_datetime(os.stat(p).st_mtime),
            }
            for p in (archive_path, xml_path)
        ]
        self.assertEqual(entries, expected)
Ejemplo n.º 14
0
def generate_content_metadata(ip):
    """Generate content metadata (optional PREMIS + content METS) for *ip*.

    If the 'preservation_metadata' profile is locked, a PREMIS file is
    generated alongside the content METS in one XMLGenerator pass. The
    METS path, create date, size and checksum are then recorded on the IP,
    which is saved.
    """
    files_to_create = {}

    # PREMIS is only generated when its profile has been locked on the IP.
    generate_premis = ip.profile_locked('preservation_metadata')
    if generate_premis:
        premis_profile_type = 'preservation_metadata'
        premis_profile_rel = ip.get_profile_rel(premis_profile_type)
        premis_profile_data = ip.get_profile_data(premis_profile_type)
        data = fill_specification_data(premis_profile_data, ip=ip)
        # The PREMIS file path itself may contain template variables.
        premis_path = parseContent(ip.get_premis_file_path(), data)
        full_premis_path = os.path.join(ip.object_path, premis_path)
        files_to_create[full_premis_path] = {
            'spec': premis_profile_rel.profile.specification,
            'data': data,
        }

    mets_path = ip.get_content_mets_file_path()
    full_mets_path = os.path.join(ip.object_path, mets_path)
    # Profile type is derived from the package-type display name.
    profile_type = ip.get_package_type_display().lower()
    profile_rel = ip.get_profile_rel(profile_type)
    profile_data = ip.get_profile_data(profile_type)
    files_to_create[full_mets_path] = {
        'spec': profile_rel.profile.specification,
        'data': fill_specification_data(profile_data, ip=ip),
    }

    # Extra parse settings come from the METS profile's saved data.
    parsed_files = profile_rel.data.parsed_files
    extra_paths_to_parse = profile_rel.data.extra_paths_to_parse
    algorithm = ip.get_checksum_algorithm()
    allow_unknown_file_types = ip.get_allow_unknown_file_types()
    allow_encrypted_files = ip.get_allow_encrypted_files()
    generator = XMLGenerator(
        allow_unknown_file_types=allow_unknown_file_types,
        allow_encrypted_files=allow_encrypted_files,
    )
    generator.generate(files_to_create,
                       folderToParse=ip.object_path,
                       algorithm=algorithm,
                       parsed_files=parsed_files,
                       extra_paths_to_parse=extra_paths_to_parse)

    # Persist metadata about the generated METS on the IP.
    ip.content_mets_path = mets_path
    ip.content_mets_create_date = timestamp_to_datetime(
        creation_date(full_mets_path)).isoformat()
    ip.content_mets_size = os.path.getsize(full_mets_path)
    ip.content_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    ip.content_mets_digest = calculate_checksum(full_mets_path,
                                                algorithm=algorithm)
    ip.save()
Ejemplo n.º 15
0
def index_document(tag_version, filepath):
    """Index the file backing *tag_version*, optionally with its content.

    Identifies the file format first; formats listed in
    settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT are indexed with
    metadata only (no attachment pipeline). Indexing failures are logged
    and re-raised.

    Returns:
        (doc, tag_version) tuple.
    """
    exclude_file_format_from_indexing_content = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT

    fid = FormatIdentifier()
    (format_name, format_version, format_registry_key) = fid.identify_file_format(filepath)
    # Skip the (potentially expensive) attachment pipeline for excluded formats.
    index_file_content = format_registry_key not in exclude_file_format_from_indexing_content

    ip = tag_version.tag.information_package
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)
    # href is the file's directory relative to the IP root ('' for the root).
    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    href = '' if href == '.' else href
    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
        'formatname': format_name,
        'formatversion': format_version,
        'formatkey': format_registry_key,
    }

    doc = File.from_obj(tag_version)

    try:
        if index_file_content:
            with open(filepath, 'rb') as f:
                content = f.read()
            doc.data = base64.b64encode(content).decode("ascii")
            doc.save(pipeline='ingest_attachment')
        else:
            logger.debug('Skip to index file content for {}'.format(filepath))
            doc.save()
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise
    return doc, tag_version
Ejemplo n.º 16
0
    def identify_ip(self, request):
        """Identify an unidentified container file and generate its info XML.

        Expects 'label' (container file name) and optional
        'specification_data' in the request body. Returns 400 when the
        container does not exist under the unidentified-ingest path.
        """
        fname = request.data.get('label')
        spec_data = request.data.get('specification_data', {})

        uip = Path.objects.get(entity="path_ingest_unidentified").value
        container_file = os.path.join(uip, fname)

        if not os.path.isfile(container_file):
            return Response(
                {'status': '%s does not exist' % container_file},
                status=status.HTTP_400_BAD_REQUEST
            )

        # Use a context manager so the template file handle is closed
        # (the previous open(...).read() leaked it).
        with open(os.path.join(settings.BASE_DIR,
                               'templates/SDTemplate.json')) as f:
            spec = json.load(f)

        ip_id = uuid.uuid4()

        spec_data['_OBJID'] = unicode(ip_id)
        # LABEL is required in spec_data; pop() raises KeyError if missing.
        spec_data['_OBJLABEL'] = spec_data.pop('LABEL')
        spec_data['_IP_CREATEDATE'] = timestamp_to_datetime(
            creation_date(container_file)
        ).isoformat()

        infoxml = u'%s.xml' % unicode(ip_id)
        infoxml = os.path.join(uip, infoxml)

        ProcessTask(
            name='preingest.tasks.GenerateXML',
            params={
                'info': spec_data,
                'filesToCreate': {
                    infoxml: spec
                },
                'folderToParse': container_file,
            },
        ).run_eagerly()

        return Response({'status': 'Identified IP, created %s' % infoxml})
Ejemplo n.º 17
0
    def files(self, path=''):
        """List or serve files belonging to this IP.

        Dispatches on the state of self.object_path:

        * object_path is a container file (tar/zip): list the container's
          members, serve a member, serve the sidecar XML, or list the
          container + XML at the root, depending on *path*.
        * object_path is a directory: serve a file inside it or return a
          sorted directory listing.

        Returns a DRF Response with entry dicts for listings, or an
        HttpResponse with the file content for downloads.

        NOTE(review): MAX_FILE_SIZE is defined but never used in this
        method — confirm whether a size guard was intended.
        NOTE(review): the open(fullpath).read() calls below use text mode
        and never close the handle — candidates for 'rb' + context manager.
        """
        # Reset the global mimetypes tables and load only the definitions
        # from the configured definition file.
        mimetypes.suffix_map = {}
        mimetypes.encodings_map = {}
        mimetypes.types_map = {}
        mimetypes.common_types = {}
        mimetypes_file = Path.objects.get(
            entity="path_mimetypes_definitionfile").value
        mimetypes.init(files=[mimetypes_file])
        mtypes = mimetypes.types_map

        MAX_FILE_SIZE = 100000000  # 100 MB

        # Case 1: the IP is packaged into a single container file.
        if os.path.isfile(self.object_path):
            container = self.object_path
            xml = os.path.splitext(self.object_path)[0] + '.xml'

            # Path points at (or into) the container itself.
            if path.startswith(os.path.basename(container)):
                fullpath = os.path.join(os.path.dirname(container), path)

                if tarfile.is_tarfile(container):
                    with tarfile.open(container) as tar:
                        if fullpath == container:
                            # List all regular-file members of the tar.
                            entries = []
                            for member in tar.getmembers():
                                if not member.isfile():
                                    continue

                                entries.append({
                                    "name":
                                    member.name,
                                    "type":
                                    'file',
                                    "size":
                                    member.size,
                                    "modified":
                                    timestamp_to_datetime(member.mtime),
                                })
                            return Response(entries)
                        else:
                            # Serve one member from inside the tar.
                            subpath = fullpath[len(container) + 1:]
                            try:
                                member = tar.getmember(subpath)

                                if not member.isfile():
                                    raise exceptions.NotFound

                                f = tar.extractfile(member)
                                content_type = mtypes.get(
                                    os.path.splitext(subpath)[1])
                                response = HttpResponse(
                                    f.read(), content_type=content_type)
                                response[
                                    'Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(
                                        f.name)
                                if content_type is None:
                                    # Unknown type: force download instead of inline display.
                                    response[
                                        'Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(
                                            f.name)
                                return response
                            except KeyError:
                                raise exceptions.NotFound

                elif zipfile.is_zipfile(container):
                    with zipfile.ZipFile(container) as zipf:
                        if fullpath == container:
                            # List all file entries of the zip (skip directories).
                            entries = []
                            for member in zipf.filelist:
                                if member.filename.endswith('/'):
                                    continue

                                entries.append({
                                    "name":
                                    member.filename,
                                    "type":
                                    'file',
                                    "size":
                                    member.file_size,
                                    "modified":
                                    datetime.datetime(*member.date_time),
                                })
                            return Response(entries)
                        else:
                            # Serve one member from inside the zip.
                            subpath = fullpath[len(container) + 1:]
                            try:
                                f = zipf.open(subpath)
                                content_type = mtypes.get(
                                    os.path.splitext(subpath)[1])
                                response = HttpResponse(
                                    f.read(), content_type=content_type)
                                response[
                                    'Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(
                                        f.name)
                                if content_type is None:
                                    response[
                                        'Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(
                                            f.name)
                                return response
                            except KeyError:
                                raise exceptions.NotFound

                # Not a tar or zip: serve the container file itself.
                content_type = mtypes.get(os.path.splitext(fullpath)[1])
                response = HttpResponse(open(fullpath).read(),
                                        content_type=content_type)
                response[
                    'Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(
                        fullpath)
                if content_type is None:
                    response[
                        'Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(
                            fullpath)
                return response
            # Path points at the sidecar METS XML next to the container.
            elif os.path.isfile(xml) and path == os.path.basename(xml):
                fullpath = os.path.join(os.path.dirname(container), path)
                content_type = mtypes.get(os.path.splitext(fullpath)[1])
                response = HttpResponse(open(fullpath).read(),
                                        content_type=content_type)
                response[
                    'Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(
                        fullpath)
                if content_type is None:
                    response[
                        'Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(
                            fullpath)
                return response
            # Root listing: the container plus the XML when it exists.
            elif path == '':
                entries = []

                entries.append({
                    "name":
                    os.path.basename(container),
                    "type":
                    'file',
                    "size":
                    os.path.getsize(container),
                    "modified":
                    timestamp_to_datetime(os.path.getmtime(container)),
                })

                if os.path.isfile(xml):
                    entries.append({
                        "name":
                        os.path.basename(xml),
                        "type":
                        'file',
                        "size":
                        os.path.getsize(xml),
                        "modified":
                        timestamp_to_datetime(os.path.getmtime(xml)),
                    })
                return Response(entries)

            elif path is not None:
                raise exceptions.NotFound

        # Case 2: the IP is an extracted directory tree.
        entries = []
        fullpath = os.path.join(self.object_path, path)

        # Reject path traversal outside the IP's directory.
        if not in_directory(fullpath, self.object_path):
            raise exceptions.ParseError('Illegal path %s' % path)

        if not os.path.exists(fullpath):
            raise exceptions.NotFound

        if os.path.isfile(fullpath):
            content_type = mtypes.get(os.path.splitext(fullpath)[1])
            response = HttpResponse(open(fullpath).read(),
                                    content_type=content_type)
            response[
                'Content-Disposition'] = 'inline; filename="%s"' % os.path.basename(
                    fullpath)
            if content_type is None:
                response[
                    'Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(
                        fullpath)
            return response

        # Directory listing, sorted by name.
        for entry in get_files_and_dirs(fullpath):
            entry_type = "dir" if entry.is_dir() else "file"

            # Skip partial upload chunks named like "name_123".
            if entry_type == 'file' and re.search(
                    r'\_\d+$', entry.name) is not None:  # file chunk
                continue

            size, _ = get_tree_size_and_count(entry.path)

            entries.append({
                "name":
                os.path.basename(entry.path),
                "type":
                entry_type,
                "size":
                size,
                "modified":
                timestamp_to_datetime(entry.stat().st_mtime),
            })

        sorted_entries = sorted(entries, key=itemgetter('name'))
        return Response(sorted_entries)
Ejemplo n.º 18
0
    def run(self,
            filepath=None,
            mimetype=None,
            relpath=None,
            algorithm='SHA-256',
            rootdir=''):
        """Compute and return the METS file-info dict for *filepath*.

        Spawns two sub-tasks — checksum calculation and file-format
        identification — runs them synchronously, and assembles the
        resulting F* metadata dictionary.

        Args:
            filepath: path of the file on disk.
            mimetype: value recorded as 'FMimetype' (not detected here).
            relpath: path used for 'FName'/'href'; defaults to filepath.
            algorithm: checksum algorithm, recorded as 'FChecksumType'.
            rootdir: value recorded as 'FDir'.

        Returns:
            dict with the METS F* keys describing the file.
        """
        if not relpath:
            relpath = filepath

        relpath = win_to_posix(relpath)

        timestamp = creation_date(filepath)
        createdate = timestamp_to_datetime(timestamp)

        checksum_task = ProcessTask(
            name="ESSArch_Core.tasks.CalculateChecksum",
            params={
                "filename": filepath,
                "algorithm": algorithm
            },
            processstep_id=self.step,
            responsible_id=self.responsible,
            information_package_id=self.ip)

        fileformat_task = ProcessTask(
            name="ESSArch_Core.tasks.IdentifyFileFormat",
            params={
                "filename": filepath,
            },
            processstep_id=self.step,
            responsible_id=self.responsible,
            information_package_id=self.ip)

        # Persist both tasks in one query before running them.
        ProcessTask.objects.bulk_create([checksum_task, fileformat_task])

        checksum = checksum_task.run().get()
        self.set_progress(50, total=100)
        (format_name, format_version,
         format_registry_key) = fileformat_task.run().get()

        fileinfo = {
            'FName': os.path.basename(relpath),
            'FDir': rootdir,
            'FChecksum': checksum,
            'FID': str(uuid.uuid4()),
            'daotype': "borndigital",
            'href': relpath,
            'FMimetype': mimetype,
            'FCreated': createdate.isoformat(),
            'FFormatName': format_name,
            'FFormatVersion': format_version,
            'FFormatRegistryKey': format_registry_key,
            'FSize': str(os.path.getsize(filepath)),
            'FUse': 'Datafile',
            'FChecksumType': algorithm,
            'FLoctype': 'URL',
            'FLinkType': 'simple',
            'FChecksumLib': 'hashlib',
            'FLocationType': 'URI',
            'FIDType': 'UUID',
        }

        return fileinfo
Ejemplo n.º 19
0
    def _run(self):
        """Convert files in every IP matched by this job's conversion rule.

        For each active information package attached to ``self.rule`` that has
        not yet been processed by this job: wait until the IP is cached,
        create a new generation, copy the cached content, convert the files
        matching the rule's glob patterns, regenerate the METS (and PREMIS,
        when configured) metadata, pack the result into a tar and hand the
        new generation over to preservation.
        """
        def get_information_packages(job):
            # Active IPs attached to the rule that this job has not converted
            # yet. NOTE(review): the ``job`` parameter is unused; ``self`` is
            # used instead — confirm that is intended.
            return self.rule.information_packages.filter(
                active=True, ).exclude(conversion_job_entries__job=self, )

        ips = get_information_packages(self)

        for ip in ips.order_by(
                '-cached').iterator():  # convert cached IPs first
            # Block until the IP is available in cache storage, requesting a
            # caching task if one does not exist yet.
            while not ip.cached:
                with allow_join_result():
                    t, created = ProcessTask.objects.get_or_create(
                        name='workflow.tasks.CacheAIP',
                        information_package=ip,
                        defaults={
                            'responsible': ip.responsible,
                            'eager': False
                        })

                    # NOTE(review): the task is run only when it already
                    # existed (``not created``); the usual pattern runs it
                    # when it was just created — confirm this is intended.
                    if not created:
                        t.run()

                time.sleep(10)  # poll interval while waiting for the cache
                ip.refresh_from_db()

            policy = ip.policy
            srcdir = os.path.join(policy.cache_storage.value,
                                  ip.object_identifier_value)

            # The converted content becomes a new generation of the IP.
            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)

            dstdir = os.path.join(policy.cache_storage.value,
                                  new_ip.object_identifier_value)

            new_ip.object_path = dstdir
            new_ip.save()

            aip_profile = new_ip.get_profile_rel('aip').profile
            aip_profile_data = new_ip.get_profile_data('aip')

            # METS location inside the source generation.
            mets_dir, mets_name = find_destination("mets_file",
                                                   aip_profile.structure)
            mets_path = os.path.join(srcdir, mets_dir, mets_name)

            # NOTE(review): parsed but never used — confirm whether the tree
            # is still needed or this line can be removed.
            mets_tree = etree.parse(mets_path)

            # copy files to new generation
            shutil.copytree(srcdir, dstdir)

            # convert files specified in rule
            for pattern, spec in six.iteritems(self.rule.specification):
                target = spec['target']
                tool = spec['tool']

                for path in iglob(dstdir + '/' + pattern):
                    if os.path.isdir(path):
                        # Matched a directory: convert every file in its tree,
                        # recording one job entry per converted file.
                        for root, dirs, files in walk(path):
                            rel = os.path.relpath(root, dstdir)

                            for f in files:
                                fpath = os.path.join(root, f)
                                job_entry = ConversionJobEntry.objects.create(
                                    job=self,
                                    start_date=timezone.now(),
                                    ip=ip,
                                    old_document=os.path.join(rel, f))
                                convert_file(fpath, target)

                                # The original is replaced by the converted file.
                                os.remove(fpath)

                                job_entry.new_document = os.path.splitext(
                                    job_entry.old_document)[0] + '.' + target
                                job_entry.end_date = timezone.now()
                                job_entry.tool = tool
                                job_entry.save()

                    elif os.path.isfile(path):
                        rel = os.path.relpath(path, dstdir)

                        job_entry = ConversionJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            old_document=rel,
                        )
                        convert_file(path, target)

                        # The original is replaced by the converted file.
                        os.remove(path)

                        job_entry.new_document = os.path.splitext(
                            job_entry.old_document)[0] + '.' + target
                        job_entry.end_date = timezone.now()
                        job_entry.tool = tool
                        job_entry.save()

            # preserve new generation
            sa = new_ip.submission_agreement

            # The METS copied from the source generation is stale; delete it
            # (ignoring "file not found") so it can be regenerated below.
            try:
                os.remove(mets_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

            filesToCreate = OrderedDict()

            try:
                premis_profile = new_ip.get_profile_rel(
                    'preservation_metadata').profile
                # NOTE(review): profile data is read from the old ``ip`` while
                # the profile comes from ``new_ip`` — confirm this is intended.
                premis_profile_data = ip.get_profile_data(
                    'preservation_metadata')
            except ProfileIP.DoesNotExist:
                # No PREMIS profile configured; skip preservation metadata.
                pass
            else:
                premis_dir, premis_name = find_destination(
                    "preservation_description_file", aip_profile.structure)
                premis_path = os.path.join(dstdir, premis_dir, premis_name)

                try:
                    os.remove(premis_path)
                except OSError as e:
                    if e.errno != errno.ENOENT:
                        raise

                filesToCreate[premis_path] = {
                    'spec':
                    premis_profile.specification,
                    'data':
                    fill_specification_data(premis_profile_data,
                                            ip=new_ip,
                                            sa=sa),
                }

            # NOTE(review): ``mets_path`` points into ``srcdir`` (the old
            # generation), not ``dstdir`` — confirm the regenerated METS is
            # meant to be written there.
            filesToCreate[mets_path] = {
                'spec':
                aip_profile.specification,
                'data':
                fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
            }

            t = ProcessTask.objects.create(
                name='ESSArch_Core.tasks.GenerateXML',
                params={
                    'filesToCreate': filesToCreate,
                    'folderToParse': dstdir,
                },
                responsible=new_ip.responsible,
                information_package=new_ip,
            )
            t.run().get()

            dsttar = dstdir + '.tar'
            dstxml = dstdir + '.xml'

            objid = new_ip.object_identifier_value

            # Pack the new generation into a tar, indexing every path as we go.
            with tarfile.open(dsttar, 'w') as tar:
                for root, dirs, files in walk(dstdir):
                    rel = os.path.relpath(root, dstdir)
                    for d in dirs:
                        src = os.path.join(root, d)
                        arc = os.path.join(objid, rel, d)
                        arc = os.path.normpath(arc)
                        index_path(new_ip, src)
                        tar.add(src, arc, recursive=False)

                    for f in files:
                        src = os.path.join(root, f)
                        index_path(new_ip, src)
                        tar.add(src,
                                os.path.normpath(os.path.join(objid, rel, f)))

            algorithm = policy.get_checksum_algorithm_display()
            checksum = calculate_checksum(dsttar, algorithm=algorithm)

            # Generate the package description XML next to the tar.
            info = fill_specification_data(
                new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
            info["_IP_CREATEDATE"] = timestamp_to_datetime(
                creation_date(dsttar)).isoformat()

            aip_desc_profile = new_ip.get_profile('aip_description')
            filesToCreate = {
                dstxml: {
                    'spec': aip_desc_profile.specification,
                    'data': info
                }
            }

            ProcessTask.objects.create(
                name="ESSArch_Core.tasks.GenerateXML",
                params={
                    "filesToCreate": filesToCreate,
                    "folderToParse": dsttar,
                    "extra_paths_to_parse": [mets_path],
                    "algorithm": algorithm,
                },
                information_package=new_ip,
                responsible=new_ip.responsible,
            ).run().get()

            # Persist the tar's checksum on the new generation without
            # triggering model save() side effects.
            InformationPackage.objects.filter(pk=new_ip.pk).update(
                message_digest=checksum,
                message_digest_algorithm=policy.checksum_algorithm,
            )

            ProcessTask.objects.create(
                name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
                information_package=new_ip,
                responsible=new_ip.responsible,
            ).run().get()

            # Hand the new AIP over to storage; run() is not awaited here.
            t = ProcessTask.objects.create(
                name='workflow.tasks.StoreAIP',
                information_package=new_ip,
                responsible=new_ip.responsible,
            )

            t.run()
Ejemplo n.º 20
0
    def submit(self, request, pk=None):
        """
        Submits the specified information package.

        Builds a "Submit SIP" process step consisting of: a status update to
        "Submitting", generation of the submit-description XML, the optional
        validations requested in ``request.data['validators']``, the actual
        SIP submission, an optional notification e-mail, and a final status
        update to "Submitted". The step is then run asynchronously.

        Args:
            pk: The primary key (id) of the information package to submit

        Returns:
            Response confirming the submission has been started.

        Raises:
            ValueError: If the IP is not in the 'Created' state.
        """

        ip = self.get_object()

        # Only freshly created IPs may be submitted.
        if ip.State != "Created":
            raise ValueError(
                "The IP (%s) is in the state '%s' but should be 'Created'" % (pk, ip.State)
            )

        # Which optional validation tasks the client asked for.
        validators = request.data.get('validators', {})

        validate_xml_file = validators.get('validate_xml_file', False)
        validate_file_format = validators.get('validate_file_format', False)
        validate_integrity = validators.get('validate_integrity', False)
        validate_logical_physical_representation = validators.get('validate_logical_physical_representation', False)

        step = ProcessStep.objects.create(
            name="Submit SIP",
            information_package=ip
        )

        # Tasks are ordered by processstep_pos; gaps leave room for the
        # optional validation tasks (14-16).
        step.tasks.add(ProcessTask.objects.create(
            name="preingest.tasks.UpdateIPStatus",
            params={
                "ip": ip,
                "status": "Submitting",
            },
            processstep_pos=0,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

        reception = Path.objects.get(entity="path_preingest_reception").value

        sd_profile = ip.get_profile('submit_description')

        # The container already resides in the reception area, named by pk.
        container_format = ip.get_container_format()
        container_file = os.path.join(reception, str(ip.pk) + ".%s" % container_format.lower())

        sa = ip.SubmissionAgreement

        info = sd_profile.fill_specification_data(sa, ip)
        info["_IP_CREATEDATE"] = timestamp_to_datetime(creation_date(container_file)).isoformat()

        # Submit-description XML is generated next to the container.
        infoxml = os.path.join(reception, str(ip.pk) + ".xml")

        filesToCreate = {
            infoxml: sd_profile.specification
        }

        step.tasks.add(ProcessTask.objects.create(
            name="preingest.tasks.GenerateXML",
            params={
                "info": info,
                "filesToCreate": filesToCreate,
                "folderToParse": container_file,
                "algorithm": ip.get_checksum_algorithm(),
            },
            processstep_pos=10,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

        if validate_xml_file:
            step.tasks.add(
                ProcessTask.objects.create(
                    name="preingest.tasks.ValidateXMLFile",
                    params={
                        "xml_filename": infoxml
                    },
                    processstep_pos=14,
                    log=EventIP,
                    information_package=ip,
                    responsible=self.request.user,
                )
            )

        if validate_file_format or validate_integrity:
            step.tasks.add(
                ProcessTask.objects.create(
                    name="preingest.tasks.ValidateFiles",
                    params={
                        "ip": ip,
                        "rootdir": reception,
                        "xmlfile": infoxml,
                        "validate_fileformat": validate_file_format,
                        "validate_integrity": validate_integrity,
                    },
                    processstep_pos=15,
                    log=EventIP,
                    information_package=ip,
                    responsible=self.request.user,
                )
            )

        if validate_logical_physical_representation:
            step.tasks.add(
                ProcessTask.objects.create(
                    name="preingest.tasks.ValidateLogicalPhysicalRepresentation",
                    params={
                        "files": [os.path.basename(ip.ObjectPath)],
                        "xmlfile": infoxml,
                    },
                    processstep_pos=16,
                    log=EventIP,
                    information_package=ip,
                    responsible=self.request.user,
                )
            )

        step.tasks.add(ProcessTask.objects.create(
            name="preingest.tasks.SubmitSIP",
            params={
                "ip": ip
            },
            processstep_pos=20,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

        # Optionally notify the configured recipient with the container attached.
        if ip.get_email_recipient():
            recipients = [ip.get_email_recipient()]
            subject = request.data.get('subject')
            body = request.data.get('body')

            attachments = [ip.ObjectPath]

            step.tasks.add(ProcessTask.objects.create(
                name="ESSArch_Core.tasks.SendEmail",
                params={
                    'sender': self.request.user.email,
                    'recipients': recipients,
                    'subject': subject,
                    'body': body,
                    'attachments': attachments
                },
                processstep_pos=25,
                information_package=ip,
                responsible=self.request.user
            ))

        step.tasks.add(ProcessTask.objects.create(
            name="preingest.tasks.UpdateIPStatus",
            params={
                "ip": ip,
                "status": "Submitted"
            },
            processstep_pos=30,
            log=EventIP,
            information_package=ip,
            responsible=self.request.user,
        ))

        step.save()
        step.run()

        return Response({'status': 'submitting ip'})
Ejemplo n.º 21
0
    def list_files(self, path=''):
        """List entries at *path* within this information package.

        When *path* names the package's own container file, the members of
        that tar/zip archive are listed. When the package is a file and no
        path is given, the container itself (plus its sibling ``.xml`` file,
        if any) is listed. Otherwise the directory at *path* is listed.

        Each entry is a dict with ``name``, ``type`` ('file' or 'dir'),
        ``size`` and ``modified`` keys.
        """
        fullpath = os.path.join(self.object_path, path).rstrip('/')

        # The requested path is the container file itself: list its members.
        if path == os.path.basename(self.object_path) and os.path.isfile(
                self.object_path):
            if tarfile.is_tarfile(self.object_path):
                with tarfile.open(self.object_path) as tar:
                    return [
                        {
                            "name": member.name,
                            "type": 'file',
                            "size": member.size,
                            "modified": timestamp_to_datetime(member.mtime),
                        }
                        for member in tar.getmembers() if member.isfile()
                    ]

            elif zipfile.is_zipfile(self.object_path) and os.path.splitext(
                    self.object_path)[1] == '.zip':
                with zipfile.ZipFile(self.object_path) as zipf:
                    return [
                        {
                            "name": member.filename,
                            "type": 'file',
                            "size": member.file_size,
                            "modified": datetime(*member.date_time),
                        }
                        for member in zipf.filelist
                        if not member.filename.endswith('/')
                    ]

        # Packed IP at its root: show the container and its metadata XML.
        if os.path.isfile(self.object_path) and not path:
            container = self.object_path
            listing = [{
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified": timestamp_to_datetime(os.path.getmtime(container)),
            }]

            xml = os.path.splitext(container)[0] + '.xml'
            if os.path.isfile(xml):
                listing.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                })

            return listing

        # Plain directory: list children sorted by name, with tree sizes.
        listing = []
        for child in sorted(get_files_and_dirs(fullpath),
                            key=lambda c: c.name):
            total_size, _ = get_tree_size_and_count(child.path)
            listing.append({
                "name": os.path.basename(child.path),
                "type": "dir" if child.is_dir() else "file",
                "size": total_size,
                "modified": timestamp_to_datetime(child.stat().st_mtime),
            })

        return listing
Ejemplo n.º 22
0
    def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256'):
        """Collect metadata about *filepath* for inclusion in package XML.

        Spawns two subtasks — checksum calculation and file-format
        identification — runs them eagerly, and assembles a ``fileinfo``
        dict with the results plus basic file attributes.

        Args:
            filepath: Absolute path of the file to describe.
            mimetype: MIME type to record for the file.
            relpath: Path recorded in the metadata; defaults to *filepath*.
            algorithm: Checksum algorithm name passed to the checksum task.

        Returns:
            dict with FName/FChecksum/FFormatName/... keys describing the file.
        """
        if not relpath:
            relpath = filepath

        relpath = win_to_posix(relpath)

        timestamp = creation_date(filepath)
        createdate = timestamp_to_datetime(timestamp)

        checksum_task = ProcessTask.objects.create(
            name="preingest.tasks.CalculateChecksum",
            params={
                "filename": filepath,
                "algorithm": algorithm
            }
        )

        fileformat_task = ProcessTask.objects.create(
            name="preingest.tasks.IdentifyFileFormat",
            params={
                "filename": filepath,
            }
        )

        # BUG FIX: the original dereferenced self.taskobj (log,
        # information_package, responsible) unconditionally and only then
        # checked it for None, so the guard could never take effect.
        # Propagate the parent task's context only when one actually exists.
        if self.taskobj is not None:
            for subtask in (checksum_task, fileformat_task):
                subtask.log = self.taskobj.log
                subtask.information_package = self.taskobj.information_package
                subtask.responsible = self.taskobj.responsible

                if self.taskobj.processstep is not None:
                    subtask.processstep = self.taskobj.processstep

        checksum_task.save()
        fileformat_task.save()

        checksum = checksum_task.run_eagerly()
        self.set_progress(50, total=100)
        fileformat = fileformat_task.run_eagerly()

        fileinfo = {
            'FName': os.path.basename(relpath),
            'FChecksum': checksum,
            'FID': str(uuid.uuid4()),
            'daotype': "borndigital",
            'href': relpath,
            'FMimetype': mimetype,
            'FCreated': createdate.isoformat(),
            'FFormatName': fileformat,
            'FSize': str(os.path.getsize(filepath)),
            'FUse': 'Datafile',
            'FChecksumType': algorithm,
            'FLoctype': 'URL',
            'FLinkType': 'simple',
            'FChecksumLib': 'hashlib',
            'FLocationType': 'URI',
            'FIDType': 'UUID',
        }

        self.set_progress(100, total=100)

        return fileinfo
Ejemplo n.º 23
0
def preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip,
                            mets_path, new_ip, policy):
    """Regenerate metadata for a new IP generation, pack it and preserve it.

    Removes the stale METS (and PREMIS, when configured) copied from the old
    generation, regenerates them from the profiles, tars the content of
    ``dstdir``, records the tar's checksum, generates the package description
    XML, and finally hands the new generation over to ``StoreAIP``.

    Args:
        aip_profile: AIP profile of the new generation.
        aip_profile_data: Data dict for ``aip_profile``.
        dstdir: Directory holding the new generation's content.
        ip: The source (old) information package.
        mets_path: Path of the METS file to regenerate.
        new_ip: The new generation being preserved.
        policy: Archival policy supplying the checksum algorithm.
    """
    sa = new_ip.submission_agreement

    # Remove the stale METS (ignoring "file not found"); it is regenerated below.
    try:
        os.remove(mets_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise

    files_to_create = OrderedDict()

    try:
        premis_profile = new_ip.get_profile_rel(
            'preservation_metadata').profile
        # NOTE(review): profile data is read from the old ``ip`` while the
        # profile comes from ``new_ip`` — confirm this is intended.
        premis_profile_data = ip.get_profile_data('preservation_metadata')
    except ProfileIP.DoesNotExist:
        # No PREMIS profile configured; skip preservation metadata.
        pass
    else:
        premis_dir, premis_name = find_destination(
            "preservation_description_file", aip_profile.structure)
        premis_path = os.path.join(dstdir, premis_dir, premis_name)

        try:
            os.remove(premis_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        files_to_create[premis_path] = {
            'spec': premis_profile.specification,
            'data': fill_specification_data(premis_profile_data,
                                            ip=new_ip,
                                            sa=sa),
        }

    files_to_create[mets_path] = {
        'spec': aip_profile.specification,
        'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
    }

    t = ProcessTask.objects.create(
        name='ESSArch_Core.tasks.GenerateXML',
        params={
            'filesToCreate': files_to_create,
            'folderToParse': dstdir,
        },
        responsible=new_ip.responsible,
        information_package=new_ip,
    )
    t.run().get()

    dsttar = dstdir + '.tar'
    dstxml = dstdir + '.xml'

    objid = new_ip.object_identifier_value

    # Pack the new generation into a tar, indexing every path as we go.
    with tarfile.open(dsttar, 'w') as tar:
        for root, dirs, files in walk(dstdir):
            rel = os.path.relpath(root, dstdir)
            for d in dirs:
                src = os.path.join(root, d)
                arc = os.path.join(objid, rel, d)
                arc = os.path.normpath(arc)
                index_path(new_ip, src)
                tar.add(src, arc, recursive=False)

            for f in files:
                src = os.path.join(root, f)
                index_path(new_ip, src)
                tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

    algorithm = policy.get_checksum_algorithm_display()
    checksum = calculate_checksum(dsttar, algorithm=algorithm)

    # Generate the package description XML next to the tar.
    info = fill_specification_data(new_ip.get_profile_data('aip_description'),
                                   ip=new_ip,
                                   sa=sa)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(
        creation_date(dsttar)).isoformat()

    aip_desc_profile = new_ip.get_profile('aip_description')
    files_to_create = {
        dstxml: {
            'spec': aip_desc_profile.specification,
            'data': info
        }
    }

    ProcessTask.objects.create(
        name="ESSArch_Core.tasks.GenerateXML",
        params={
            "filesToCreate": files_to_create,
            "folderToParse": dsttar,
            "extra_paths_to_parse": [mets_path],
            "algorithm": algorithm,
        },
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    # Persist the tar's checksum without triggering model save() side effects.
    InformationPackage.objects.filter(pk=new_ip.pk).update(
        message_digest=checksum,
        message_digest_algorithm=policy.checksum_algorithm,
    )

    ProcessTask.objects.create(
        name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    # Hand the new AIP over to storage; run() is not awaited here.
    t = ProcessTask.objects.create(
        name='workflow.tasks.StoreAIP',
        information_package=new_ip,
        responsible=new_ip.responsible,
    )

    t.run()
Ejemplo n.º 24
0
    def run(self, purpose=None, delete_sip=False):
        """Receive a SIP container and turn it into an AIP inside a new AIC.

        Records package-METS metadata on the AIP, creates the enclosing AIC,
        moves/extracts the SIP content into the AIP structure according to
        the 'aip' profile, merges the SIP's METS data (prefixed ``SIP_``)
        into the AIP profile data, and optionally deletes the original SIP.

        Args:
            purpose: Unused here. NOTE(review): confirm whether it is
                consumed elsewhere (e.g. event logging) or can be dropped.
            delete_sip: When True, delete the original SIP container and its
                sibling ``.xml`` after receiving.
        """
        self.logger.debug('Receiving SIP')
        aip = InformationPackage.objects.get(pk=self.ip)
        algorithm = aip.get_checksum_algorithm()
        container = aip.object_path
        objid, container_type = os.path.splitext(os.path.basename(container))
        container_type = container_type.lower()
        xml = aip.package_mets_path
        # Record metadata about the package METS file on the AIP.
        aip.package_mets_create_date = timestamp_to_datetime(
            creation_date(xml)).isoformat()
        aip.package_mets_size = os.path.getsize(xml)
        aip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
            algorithm.upper()]
        aip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)
        aip.generation = 0
        # Every AIP lives inside an Archival Information Collection (AIC).
        aic = InformationPackage.objects.create(
            package_type=InformationPackage.AIC,
            responsible=aip.responsible,
            label=aip.label,
            start_date=aip.start_date,
            end_date=aip.end_date)
        old_sip_path = aip.object_path
        aip.aic = aic
        aip_dir = os.path.join(aip.policy.ingest_path.value, objid)
        aip.object_path = aip_dir
        try:
            os.makedirs(aip_dir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        aip.save()

        # Where within the AIP structure the SIP content should land;
        # fall back to the 'content' destination when 'sip' is absent.
        dst_path, dst_name = find_destination('sip',
                                              aip.get_profile('aip').structure,
                                              aip.object_path)
        if dst_path is None:
            dst_path, dst_name = find_destination(
                'content',
                aip.get_profile('aip').structure, aip.object_path)

        dst_name, = self.parse_params(dst_name)
        dst = os.path.join(dst_path, dst_name)

        sip_profile = aip.submission_agreement.profile_sip

        # Remove leftovers from any previous receive attempt.
        try:
            shutil.rmtree(dst)
        except FileNotFoundError:
            pass

        if aip.policy.receive_extract_sip:
            # Extract the container into a temp dir, then move its content
            # (unwrapping a single top-level directory, if present) into dst.
            temp = Path.objects.cached('entity', 'temp', 'value')
            with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
                self.logger.debug('Extracting {} to {}'.format(
                    container, tmpdir))
                if container_type == '.tar':
                    with tarfile.open(container) as tar:
                        root_member_name = tar.getnames()[0]
                        tar.extractall(tmpdir)
                elif container_type == '.zip':
                    with zipfile.ZipFile(container) as zipf:
                        root_member_name = zipf.namelist()[0]
                        zipf.extractall(tmpdir)
                else:
                    raise ValueError(
                        'Invalid container type: {}'.format(container))

                dst = os.path.join(dst, '')
                try:
                    os.makedirs(dst)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise

                # If the archive wrapped everything in a single root
                # directory, move that directory's content instead.
                tmpsrc = tmpdir
                if len(os.listdir(tmpdir)) == 1 and os.listdir(
                        tmpdir)[0] == root_member_name:
                    new_tmpsrc = os.path.join(tmpdir, root_member_name)
                    if os.path.isdir(new_tmpsrc):
                        tmpsrc = new_tmpsrc

                self.logger.debug('Moving content of {} to {}'.format(
                    tmpsrc, dst))

                for f in os.listdir(tmpsrc):
                    shutil.move(os.path.join(tmpsrc, f), dst)

                self.logger.debug('Deleting {}'.format(tmpdir))

            aip.sip_path = os.path.relpath(dst, aip.object_path)
        else:
            # Keep the SIP packed: copy the container as-is into the AIP.
            self.logger.debug('Copying {} to {}'.format(container, dst))
            shutil.copy2(container, dst)
            aip.sip_path = os.path.relpath(
                os.path.join(dst, os.path.basename(container)),
                aip.object_path)

        # Locate and parse the SIP's own METS file (inside the container
        # when the SIP was kept packed).
        sip_mets_dir, sip_mets_file = find_destination('mets_file',
                                                       sip_profile.structure,
                                                       aip.sip_path)
        if os.path.isfile(aip.sip_path):
            sip_mets_data = parse_mets(
                open_file(
                    os.path.join(aip.object_path, sip_mets_dir, sip_mets_file),
                    container=aip.sip_path,
                    container_prefix=aip.object_identifier_value,
                ))
        else:
            sip_mets_data = parse_mets(
                open_file(
                    os.path.join(aip.object_path, sip_mets_dir,
                                 sip_mets_file)))

        # prefix all SIP data
        sip_mets_data = {
            f'SIP_{k.upper()}': v
            for k, v in sip_mets_data.items()
        }

        # Merge the SIP metadata into the AIP profile data.
        aip_profile_rel_data = aip.get_profile_rel('aip').data
        aip_profile_rel_data.data.update(sip_mets_data)
        aip_profile_rel_data.save()

        if delete_sip:
            delete_path(old_sip_path)
            delete_path(pathlib.Path(old_sip_path).with_suffix('.xml'))

        self.logger.debug('sip_path set to {}'.format(aip.sip_path))
        aip.save()
Ejemplo n.º 25
0
    def submit(self, request, pk=None):
        """
        Submits the specified information package.

        Builds a "Submit SIP" process step consisting of: a status update to
        "Submitting", generation of the submit-description XML, the optional
        validations requested in ``request.data['validators']``, the actual
        SIP submission, an optional notification e-mail, and a final status
        update to "Submitted". The step is then run asynchronously.

        Args:
            pk: The primary key (id) of the information package to submit

        Returns:
            Response confirming the submission has been started.

        Raises:
            ValueError: If the IP is not in the 'Created' state.
        """

        ip = self.get_object()

        # Only freshly created IPs may be submitted.
        if ip.State != "Created":
            raise ValueError(
                "The IP (%s) is in the state '%s' but should be 'Created'" %
                (pk, ip.State))

        # Which optional validation tasks the client asked for.
        validators = request.data.get('validators', {})

        validate_xml_file = validators.get('validate_xml_file', False)
        validate_file_format = validators.get('validate_file_format', False)
        validate_integrity = validators.get('validate_integrity', False)
        validate_logical_physical_representation = validators.get(
            'validate_logical_physical_representation', False)

        step = ProcessStep.objects.create(name="Submit SIP",
                                          information_package=ip)

        # Tasks are ordered by processstep_pos; gaps leave room for the
        # optional validation tasks (14-16).
        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.UpdateIPStatus",
                params={
                    "ip": ip,
                    "status": "Submitting",
                },
                processstep_pos=0,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

        reception = Path.objects.get(entity="path_preingest_reception").value

        sd_profile = ip.get_profile('submit_description')

        # The container already resides in the reception area, named by pk.
        container_format = ip.get_container_format()
        container_file = os.path.join(
            reception,
            str(ip.pk) + ".%s" % container_format.lower())

        sa = ip.SubmissionAgreement

        info = sd_profile.fill_specification_data(sa, ip)
        info["_IP_CREATEDATE"] = timestamp_to_datetime(
            creation_date(container_file)).isoformat()

        # Submit-description XML is generated next to the container.
        infoxml = os.path.join(reception, str(ip.pk) + ".xml")

        filesToCreate = {infoxml: sd_profile.specification}

        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.GenerateXML",
                params={
                    "info": info,
                    "filesToCreate": filesToCreate,
                    "folderToParse": container_file,
                    "algorithm": ip.get_checksum_algorithm(),
                },
                processstep_pos=10,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

        if validate_xml_file:
            step.tasks.add(
                ProcessTask.objects.create(
                    name="preingest.tasks.ValidateXMLFile",
                    params={"xml_filename": infoxml},
                    processstep_pos=14,
                    log=EventIP,
                    information_package=ip,
                    responsible=self.request.user,
                ))

        if validate_file_format or validate_integrity:
            step.tasks.add(
                ProcessTask.objects.create(
                    name="preingest.tasks.ValidateFiles",
                    params={
                        "ip": ip,
                        "rootdir": reception,
                        "xmlfile": infoxml,
                        "validate_fileformat": validate_file_format,
                        "validate_integrity": validate_integrity,
                    },
                    processstep_pos=15,
                    log=EventIP,
                    information_package=ip,
                    responsible=self.request.user,
                ))

        if validate_logical_physical_representation:
            step.tasks.add(
                ProcessTask.objects.create(
                    name=
                    "preingest.tasks.ValidateLogicalPhysicalRepresentation",
                    params={
                        "files": [os.path.basename(ip.ObjectPath)],
                        "xmlfile": infoxml,
                    },
                    processstep_pos=16,
                    log=EventIP,
                    information_package=ip,
                    responsible=self.request.user,
                ))

        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.SubmitSIP",
                params={"ip": ip},
                processstep_pos=20,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

        # Optionally notify the configured recipient with the container attached.
        if ip.get_email_recipient():
            recipients = [ip.get_email_recipient()]
            subject = request.data.get('subject')
            body = request.data.get('body')

            attachments = [ip.ObjectPath]

            step.tasks.add(
                ProcessTask.objects.create(name="ESSArch_Core.tasks.SendEmail",
                                           params={
                                               'sender':
                                               self.request.user.email,
                                               'recipients': recipients,
                                               'subject': subject,
                                               'body': body,
                                               'attachments': attachments
                                           },
                                           processstep_pos=25,
                                           information_package=ip,
                                           responsible=self.request.user))

        step.tasks.add(
            ProcessTask.objects.create(
                name="preingest.tasks.UpdateIPStatus",
                params={
                    "ip": ip,
                    "status": "Submitted"
                },
                processstep_pos=30,
                log=EventIP,
                information_package=ip,
                responsible=self.request.user,
            ))

        step.save()
        step.run()

        return Response({'status': 'submitting ip'})
Ejemplo n.º 26
0
    def list(self, request):
        """List packages in the ingest reception and unidentified areas.

        Combines three sources: XML-described packages on disk that are not
        yet in the database, unidentified tar/zip containers that no XML
        refers to, and database IPs in the 'Receiving' state. The merged
        list is optionally sorted by the ``ordering`` query parameter and
        paginated.
        """
        reception = Path.objects.get(entity="path_ingest_reception").value
        uip = Path.objects.get(entity="path_ingest_unidentified").value
        ips = []

        # XML-described packages on disk that are not yet in the database.
        for xmlfile in glob.glob(os.path.join(reception, "*.xml")) + glob.glob(os.path.join(uip, "*.xml")):
            if os.path.isfile(xmlfile):
                if xmlfile.startswith(uip):
                    srcdir = uip
                else:
                    srcdir = reception

                ip = self.parseFile(xmlfile, srcdir)
                if not InformationPackage.objects.filter(id=ip['id']).exists():
                    ips.append(ip)

        # Containers in the unidentified area with no XML referring to them.
        for container_file in glob.glob(os.path.join(uip, "*.tar")) + glob.glob(os.path.join(uip, "*.zip")):
            ip = {
                'Label': os.path.basename(container_file),
                'CreateDate': str(timestamp_to_datetime(creation_date(container_file)).isoformat()),
                'State': 'Unidentified',
                'status': 0,
                'step_state': celery_states.SUCCESS,
            }

            include = True

            # Skip the container if any XML's FLocat href points at it.
            for xmlfile in glob.glob(os.path.join(uip, "*.xml")):
                if os.path.isfile(xmlfile):
                    doc = etree.parse(xmlfile)
                    root = doc.getroot()

                    el = root.xpath('.//*[local-name()="%s"]' % "FLocat")[0]
                    if ip['Label'] == get_value_from_path(el, "@href").split('file:///')[1]:
                        include = False
                        break

            if include:
                ips.append(ip)

        # Database IPs currently being received.
        from_db = InformationPackage.objects.filter(State='Receiving').prefetch_related(
            Prefetch('profileip_set', to_attr='profiles'),
        )
        serializer = InformationPackageSerializer(
            data=from_db, many=True, context={'request': request}
        )
        serializer.is_valid()
        ips.extend(serializer.data)

        # Best-effort sort: an unknown ordering key (KeyError) is ignored.
        try:
            ordering = request.query_params.get('ordering', '')
            reverse = ordering.startswith('-')
            ordering = remove_prefix(ordering, '-')
            ips = sorted(ips, key=lambda k: k[ordering], reverse=reverse)
        except KeyError:
            pass

        paginator = LinkHeaderPagination()
        page = paginator.paginate_queryset(ips, request)
        if page is not None:
            return paginator.get_paginated_response(page)

        return Response(ips)
Ejemplo n.º 27
0
def ReceiveSIP(self, purpose=None, delete_sip=False):
    """Receive a submitted SIP container into the ingest area.

    Records the package METS metadata (create date, size, digest) on the
    information package, moves its ``object_path`` under the policy's ingest
    path, and then either extracts the container into the structure's
    'sip' (or fallback 'content') destination or copies the container file
    there verbatim, depending on ``policy.receive_extract_sip``.

    Returns the destination path of the received SIP content.

    NOTE(review): ``purpose`` and ``delete_sip`` are not referenced anywhere
    in this body — presumably consumed by a decorator/base task or simply
    unused; confirm before relying on them.
    """
    logger = logging.getLogger('essarch.workflow.tasks.ReceiveSIP')
    logger.debug('Receiving SIP')
    ip = self.get_information_package()
    algorithm = ip.get_checksum_algorithm()
    container = ip.object_path
    # objid is the container filename without extension; the extension
    # (lowercased) selects the extraction method below.
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()
    xml = ip.package_mets_path
    # Record metadata about the package METS file alongside the IP.
    ip.package_mets_create_date = timestamp_to_datetime(
        creation_date(xml)).isoformat()
    ip.package_mets_size = os.path.getsize(xml)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    # Repoint the IP at its final location under the ingest path; this must
    # happen before resolving the SIP destination, which is relative to it.
    ip.object_path = os.path.join(ip.policy.ingest_path.value,
                                  ip.object_identifier_value)
    ip.save()

    # Resolve where inside the IP structure the SIP content goes; fall back
    # to the 'content' destination when no 'sip' destination is defined.
    sip_dst_path, sip_dst_name = find_destination('sip', ip.get_structure(),
                                                  ip.object_path)
    if sip_dst_path is None:
        sip_dst_path, sip_dst_name = find_destination('content',
                                                      ip.get_structure(),
                                                      ip.object_path)

    # parse_params returns a sequence; the trailing comma unpacks the single
    # resolved value.
    sip_dst_name, = self.parse_params(sip_dst_name)
    sip_dst = os.path.join(sip_dst_path, sip_dst_name)

    if ip.policy.receive_extract_sip:
        # remove any existing directory from previous attempts
        delete_path(sip_dst)

        temp = Path.objects.get(entity='temp').value
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            logger.debug('Extracting {} to {}'.format(container, tmpdir))
            # NOTE(review): getnames()[0]/namelist()[0] raises IndexError on
            # an empty archive, and extractall() without member validation is
            # exposed to archive path traversal for untrusted containers —
            # consider the tarfile extraction-filter API; TODO confirm threat
            # model.
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError(
                    'Invalid container type: {}'.format(container))

            # Trailing separator ensures shutil.move treats sip_dst as a
            # directory destination.
            sip_dst = os.path.join(sip_dst, '')
            os.makedirs(sip_dst)

            # If the archive wrapped everything in a single root directory,
            # move that directory's contents rather than the wrapper itself.
            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(
                    tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            logger.debug('Moving content of {} to {}'.format(tmpsrc, sip_dst))

            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), sip_dst)

            logger.debug('Deleting {}'.format(tmpdir))
    else:
        # Non-extracting policy: keep the container file as-is.
        logger.debug('Copying {} to {}'.format(container, sip_dst))
        shutil.copy2(container, sip_dst)

    # Store the SIP location relative to the IP root.
    ip.sip_path = os.path.relpath(sip_dst, ip.object_path)
    ip.save()
    self.create_success_event("Received SIP")
    return sip_dst
Ejemplo n.º 28
0
    def list_files(self, path=''):
        """List the entries of *path* inside this information package.

        Three cases, checked in order:
        1. *path* names the package's own container file: list the file
           members stored inside the tar/zip archive.
        2. The package is a file and *path* is empty: list the container
           itself plus its sibling metadata XML, when present.
        3. Otherwise: list the directory *path* on disk.

        Each entry is a dict with ``name``, ``type`` ('file'/'dir'),
        ``size`` and ``modified`` keys.
        """
        fullpath = os.path.join(self.object_path, path).rstrip('/')

        # Case 1: the requested path is the container file itself — peek
        # inside the archive instead of at the filesystem.
        if os.path.basename(self.object_path) == path and os.path.isfile(
                self.object_path):
            if tarfile.is_tarfile(self.object_path):
                with tarfile.open(self.object_path) as tar:
                    return [
                        {
                            "name": member.name,
                            "type": 'file',
                            "size": member.size,
                            "modified": timestamp_to_datetime(member.mtime),
                        }
                        for member in tar.getmembers()
                        if member.isfile()
                    ]

            elif zipfile.is_zipfile(self.object_path) and os.path.splitext(
                    self.object_path)[1] == '.zip':
                with zipfile.ZipFile(self.object_path) as zipf:
                    # Directory members end with '/'; only files are listed.
                    return [
                        {
                            "name": member.filename,
                            "type": 'file',
                            "size": member.file_size,
                            "modified": datetime(*member.date_time),
                        }
                        for member in zipf.filelist
                        if not member.filename.endswith('/')
                    ]

        # Case 2: packaged IP with no sub-path — show the container and, if
        # it exists, the metadata XML sitting next to it.
        if os.path.isfile(self.object_path) and not path:
            container = self.object_path
            listing = [{
                "name": os.path.basename(container),
                "type": 'file',
                "size": os.path.getsize(container),
                "modified": timestamp_to_datetime(os.path.getmtime(container)),
            }]

            xml = os.path.splitext(container)[0] + '.xml'
            if os.path.isfile(xml):
                listing.append({
                    "name": os.path.basename(xml),
                    "type": 'file',
                    "size": os.path.getsize(xml),
                    "modified": timestamp_to_datetime(os.path.getmtime(xml)),
                })

            return listing

        # Case 3: plain directory listing, sorted by entry name.
        listing = []
        for entry in sorted(get_files_and_dirs(fullpath),
                            key=lambda e: e.name):
            try:
                kind = "dir" if entry.is_dir() else "file"
                size, _ = get_tree_size_and_count(entry.path)

                listing.append({
                    "name": os.path.basename(entry.path),
                    "type": kind,
                    "size": size,
                    "modified": timestamp_to_datetime(entry.stat().st_mtime),
                })
            except OSError as e:
                # the file might be deleted (e.g. temporary upload files) while we get additional data,
                # if they are we ignore them. If there is another error, we raise it

                if e.errno != errno.ENOENT:
                    raise

        return listing