Ejemplo n.º 1
0
    def ead_editor(self, request, pk=None):
        """Read (GET) or overwrite (any other method) the IP's EAD file.

        Resolves the EAD location from the SIP profile structure; returns
        400 when no SIP profile exists and 404 when the structure defines
        no archival description file.
        """
        ip = self.get_object()
        try:
            structure = ip.get_profile('sip').structure
        except AttributeError:
            return Response("No SIP profile for IP created yet",
                            status=status.HTTP_400_BAD_REQUEST)

        ead_dir, ead_name = find_destination("archival_description_file",
                                             structure)
        if ead_name is None:
            return Response("No EAD file for IP found",
                            status=status.HTTP_404_NOT_FOUND)

        xmlfile = os.path.join(ip.object_path, ead_dir, ead_name)

        if request.method == 'GET':
            try:
                with open(xmlfile) as f:
                    return Response({"data": f.read()})
            except IOError:
                # Missing file: create it empty so later writes succeed,
                # and report empty content to the caller.
                open(xmlfile, 'a').close()
                return Response({"data": ""})

        # Write path: persist the posted content to the EAD file.
        content = request.POST.get("content", '')
        with open(xmlfile, "w") as f:
            f.write(str(content))
            return Response("Content written to %s" % xmlfile)
Ejemplo n.º 2
0
 def get_content_type_file(self):
     """Return the parsed path of the content type specification file,
     or ``None`` when the structure does not define one."""
     destination = find_destination('content_type_specification',
                                    self.get_structure())
     if destination[0] is None:
         return None
     spec_data = fill_specification_data(ip=self)
     return parseContent(os.path.join(*destination), spec_data)
Ejemplo n.º 3
0
def CompareRepresentationXMLFiles(self):
    """Validate that every representation's PREMIS file agrees with its METS.

    Clears previous validation results for this task, then for each METS
    pointer found in the IP's content METS runs an XMLComparisonValidator
    on the representation's PREMIS vs METS, and records a success event.
    """
    # Drop results from earlier runs so only fresh validations remain.
    Validation.objects.filter(task=self.get_processtask()).delete()
    ip = InformationPackage.objects.get(pk=self.ip)

    reps_path, reps_dir = find_destination("representations", ip.get_structure(), ip.object_path)
    if reps_path is None:
        # Structure defines no representations: nothing to compare.
        return None

    representations_dir = os.path.join(reps_path, reps_dir)

    for p in find_pointers(os.path.join(ip.object_path, ip.content_mets_path)):
        rep_mets_path = p.path
        rep_mets_path = os.path.join(ip.object_path, rep_mets_path)
        # The first path component relative to the representations dir is
        # the representation's own directory name.
        rep_path = os.path.relpath(rep_mets_path, representations_dir)
        rep_path = PurePath(rep_path).parts[0]

        # Locate the representation's PREMIS file via the reference in
        # its METS document.
        rep_premis_path = get_premis_ref(etree.parse(rep_mets_path)).path
        rep_premis_path = os.path.join(representations_dir, rep_path, rep_premis_path)

        validator = XMLComparisonValidator(
            context=rep_premis_path,
            options={
                'rootdir': os.path.join(representations_dir, rep_path),
                'representation': rep_path,
                'recursive': False,
            },
            task=self.get_processtask(),
            ip=self.ip,
            responsible=ip.responsible,
        )
        validator.validate(rep_mets_path)

    msg = "All XML files in the representations have the same set of files"
    self.create_success_event(msg)
Ejemplo n.º 4
0
    def run(self):
        """Compare each representation's PREMIS file against its METS file.

        Deletes this task's previous validation results, then validates
        every representation referenced by the IP's content METS with an
        XMLComparisonValidator.
        """
        Validation.objects.filter(task=self.get_processtask()).delete()
        ip = InformationPackage.objects.get(pk=self.ip)

        reps_path, reps_dir = find_destination("representations", ip.get_structure(), ip.object_path)
        if reps_path is None:
            # Structure defines no representations: nothing to validate.
            return None

        representations_dir = os.path.join(reps_path, reps_dir)

        # NOTE(review): content_mets_path is passed as-is here, while the
        # sibling task joins it onto ip.object_path first — confirm it is
        # absolute (or relative to the right directory) in this code path.
        for p in find_pointers(ip.content_mets_path):
            rep_mets_path = p.path
            rep_mets_path = os.path.join(ip.object_path, rep_mets_path)
            # First component relative to representations_dir is the
            # representation's directory name.
            rep_path = os.path.relpath(rep_mets_path, representations_dir)
            rep_path = PurePath(rep_path).parts[0]

            rep_premis_path = get_premis_ref(etree.parse(rep_mets_path)).path
            rep_premis_path = os.path.join(representations_dir, rep_path, rep_premis_path)

            validator = XMLComparisonValidator(
                context=rep_premis_path,
                options={
                    'rootdir': os.path.join(representations_dir, rep_path),
                    'representation': rep_path,
                },
                task=self.get_processtask(),
                ip=self.ip,
                responsible=ip.responsible,
            )
            validator.validate(rep_mets_path)
Ejemplo n.º 5
0
    def get_premis_file_path(self):
        """Return the normalized absolute path of this IP's PREMIS file."""
        premis_dir, premis_name = find_destination(
            "preservation_description_file", self.get_structure())
        if premis_dir is None:
            # Structure defines no PREMIS location; use the conventional one.
            path = 'metadata/premis.xml'
        else:
            path = parseContent(os.path.join(premis_dir, premis_name),
                                fill_specification_data(ip=self))

        return normalize_path(os.path.join(self.object_path, path))
Ejemplo n.º 6
0
    def get_content_mets_file_path(self):
        """Return the normalized absolute path of this IP's content METS."""
        mets_dir, mets_name = find_destination("mets_file",
                                               self.get_structure())
        if mets_dir is None:
            # No METS location in the structure; default to the package root.
            path = 'mets.xml'
        else:
            path = parseContent(os.path.join(mets_dir, mets_name),
                                fill_specification_data(ip=self))

        return normalize_path(os.path.join(self.object_path, path))
Ejemplo n.º 7
0
    def run(self, verify=True):
        """Download the XML schemas referenced by the IP's profiles.

        Collects the specifications of the IP's package-type profile, the
        event spec and (when present) the preservation_metadata profile,
        resolves each one's schema preservation directory and downloads
        every listed schema into it. A failed download is cleaned up and
        re-raised.

        Args:
            verify: Passed to requests for TLS certificate verification.

        Fix: the original used a ``for``/``else`` with no ``break``, so the
        'No schemas to download' message was logged on every run, even
        after successful downloads. We now count downloads and only log it
        when nothing was fetched.
        """
        ip = self.get_information_package()
        ip_profile_type = ip.get_package_type_display().lower()
        ip_profile = ip.get_profile_rel(ip_profile_type).profile
        structure = ip.get_structure()
        rootdir = ip.object_path

        specifications = [ip_profile.specification, get_event_spec()]
        premis_profile_rel = ip.get_profile_rel('preservation_metadata')
        if premis_profile_rel is not None:
            specifications.append(premis_profile_rel.profile.specification)

        self.logger.debug(u'Downloading schemas')
        downloaded = 0
        for spec in specifications:
            schema_preserve_loc = spec.get('-schemaPreservationLocation',
                                           'xsd_files')
            if schema_preserve_loc and structure:
                reldir, _ = find_destination(schema_preserve_loc, structure)
                dirname = os.path.join(rootdir, reldir)
            else:
                dirname = rootdir

            for schema in spec.get('-schemasToPreserve', []):
                dst = os.path.join(dirname, os.path.basename(schema))
                self.logger.info(u'Downloading schema from {} to {}'.format(
                    schema, dst))
                try:
                    r = requests.get(schema, stream=True, verify=verify)
                    r.raise_for_status()
                    with open(dst, 'wb') as f:
                        for chunk in r:
                            f.write(chunk)
                except Exception:
                    self.logger.exception(
                        u'Download of schema failed: {}'.format(schema))
                    # Remove any partial file so a broken schema is never
                    # left behind in the package.
                    try:
                        self.logger.debug(
                            u'Deleting downloaded file if it exists: {}'.
                            format(dst))
                        os.remove(dst)
                    except OSError as e:
                        if e.errno != errno.ENOENT:
                            self.logger.exception(
                                u'Failed to delete downloaded file: {}'.format(
                                    dst))
                            raise
                    else:
                        self.logger.info(
                            u'Deleted downloaded file: {}'.format(dst))
                    raise
                else:
                    self.logger.info(u'Downloaded schema to {}'.format(dst))
                    downloaded += 1

        if not downloaded:
            self.logger.info(u'No schemas to download')
Ejemplo n.º 8
0
    def transform(self, path):
        """Move everything in ``path`` that the IP profile does not reserve
        into the content directory."""
        structure = self.ip.get_structure()
        content_dir, content_name = find_destination('content', structure)
        content_path = os.path.join(self.ip.object_path, content_dir,
                                    content_name)

        # NOTE(review): entries are compared against the structure's 'use'
        # values rather than its 'name's — confirm this is intentional.
        reserved = {entry['use'] for entry in structure if 'use' in entry}
        for entry in os.listdir(path):
            if entry in reserved:
                continue
            shutil.move(os.path.join(path, entry), content_path)
Ejemplo n.º 9
0
    def get_events_file_path(self, from_container=False):
        """Return the path of the events XML file for this IP.

        For a packaged IP (object_path is a file) the events file sits
        next to the container; otherwise it is resolved from the profile
        structure, falling back to 'ipevents.xml'.
        """
        if not from_container and os.path.isfile(self.object_path):
            return os.path.splitext(self.object_path)[0] + '_ipevents.xml'

        profile_type = self.get_package_type_display().lower()
        structure = self.get_profile(profile_type).structure

        events_dir, events_file = find_destination('events_file', structure)
        if events_dir is None:
            return 'ipevents.xml'

        return normalize_path(
            parseContent(os.path.join(events_dir, events_file),
                         fill_specification_data(ip=self)))
def forward(apps, schema_editor):
    """Backfill ``sip_objid``/``sip_path`` on AIPs that predate the fields."""
    InformationPackage = apps.get_model("ip", "InformationPackage")
    db_alias = schema_editor.connection.alias

    aips = InformationPackage.objects.using(db_alias).filter(
        package_type=IP.AIP, sip_objid='')
    for ip in aips.iterator():
        ip.sip_objid = ip.object_identifier_value
        if ip.state not in ('Prepared', 'Receiving'):
            # The SIP lives inside the content directory once received.
            structure = get_structure(ip)
            content_dir, content_name = find_destination('content', structure)
            ip.sip_path = normalize_path(
                os.path.join(content_dir, content_name, ip.sip_objid))
        else:
            ip.sip_path = ip.sip_objid
        ip.save()
Ejemplo n.º 11
0
def download_schemas(ip, logger, verify):
    """Download every schema referenced by the IP's profile specifications.

    Gathers the package-type profile spec, the event spec and (when set)
    the preservation_metadata profile spec, resolves each spec's schema
    preservation directory and delegates each download to download_schema.
    """
    profile_type = ip.get_package_type_display().lower()
    ip_profile = ip.get_profile_rel(profile_type).profile
    structure = ip.get_structure()
    rootdir = ip.object_path

    specifications = [ip_profile.specification, get_event_spec()]
    premis_rel = ip.get_profile_rel('preservation_metadata')
    if premis_rel is not None:
        specifications.append(premis_rel.profile.specification)

    for spec in specifications:
        preserve_loc = spec.get('-schemaPreservationLocation', 'xsd_files')
        dirname = rootdir
        if preserve_loc and structure:
            reldir, _ = find_destination(preserve_loc, structure)
            dirname = os.path.join(rootdir, reldir)

        for schema in spec.get('-schemasToPreserve', []):
            download_schema(dirname, logger, schema, verify)
Ejemplo n.º 12
0
    def run(self, template=None, dirname=None, structure=None, root=""):
        """Queue a DownloadFile task for every schema the template preserves.

        Args:
            template: Profile specification dict; reads the optional
                '-schemaPreservationLocation' and '-schemasToPreserve' keys.
            dirname: Fallback destination directory for the schemas.
            structure: Profile structure used to resolve the preservation
                location; defaults to none (empty structure).
            root: Root directory the resolved destination is joined onto.

        Fix: the original declared ``structure=[]`` — a mutable default
        argument shared across calls. A ``None`` sentinel is equivalent
        here (the value is only truth-tested) and removes the hazard.
        """
        if structure is None:
            structure = []

        schemaPreserveLoc = template.get('-schemaPreservationLocation')

        if schemaPreserveLoc and structure:
            dirname, _ = find_destination(schemaPreserveLoc, structure)
            dirname = os.path.join(root, dirname)

        for schema in template.get('-schemasToPreserve', []):
            dst = os.path.join(dirname, os.path.basename(schema))

            t = ProcessTask.objects.create(
                name="ESSArch_Core.tasks.DownloadFile",
                params={
                    'src': schema,
                    'dst': dst
                },
                processstep_id=self.step,
                processstep_pos=self.step_pos,
                responsible_id=self.responsible,
                information_package_id=self.ip,
            )

            t.run().get()
Ejemplo n.º 13
0
def preserve_new_generation(new_ip):
    """Build and eagerly run the preservation workflow for a new generation.

    Deletes any previously generated content METS, events and (when the
    preservation_metadata profile is locked) PREMIS files so the workflow
    regenerates them, then assembles a generate/validate/preserve workflow
    and runs it.
    """
    generate_premis = new_ip.profile_locked('preservation_metadata')
    # find_destination returns (dir, name); a non-None name means the
    # structure actually defines a representations entry.
    has_representations = find_destination(
        "representations",
        new_ip.get_structure(),
        new_ip.object_path,
    )[1] is not None

    # remove existing premis and mets paths:
    mets_path = os.path.join(new_ip.object_path,
                             new_ip.get_content_mets_file_path())
    try:
        os.remove(mets_path)
    except FileNotFoundError:
        pass

    events_file = os.path.join(new_ip.object_path,
                               new_ip.get_events_file_path())
    try:
        os.remove(events_file)
    except FileNotFoundError:
        pass

    if generate_premis:
        premis_profile_data = new_ip.get_profile_data('preservation_metadata')
        data = fill_specification_data(premis_profile_data, ip=new_ip)
        premis_path = parseContent(new_ip.get_premis_file_path(), data)
        full_premis_path = os.path.join(new_ip.object_path, premis_path)
        try:
            os.remove(full_premis_path)
        except FileNotFoundError:
            pass

    # Workflow spec consumed by create_workflow below; premis-related and
    # representation-related steps are switched on via their "if" keys.
    workflow = [
        {
            "step":
            True,
            "name":
            "Generate AIP",
            "children": [
                {
                    "name": "ESSArch_Core.ip.tasks.DownloadSchemas",
                    "label": "Download Schemas",
                },
                {
                    "step":
                    True,
                    "name":
                    "Create Log File",
                    "children": [
                        {
                            "name": "ESSArch_Core.ip.tasks.GenerateEventsXML",
                            "label": "Generate events xml file",
                        },
                        {
                            "name": "ESSArch_Core.tasks.AppendEvents",
                            "label": "Add events to xml file",
                        },
                        {
                            "name":
                            "ESSArch_Core.ip.tasks.AddPremisIPObjectElementToEventsFile",
                            "label": "Add premis IP object to xml file",
                        },
                    ]
                },
                {
                    "name": "ESSArch_Core.ip.tasks.GenerateContentMetadata",
                    "label": "Generate contentmetadata",
                },
            ]
        },
        {
            "step":
            True,
            "name":
            "Validate AIP",
            "children": [{
                "name": "ESSArch_Core.tasks.ValidateXMLFile",
                "label": "Validate content-mets",
                "params": {
                    "xml_filename": "{{_CONTENT_METS_PATH}}",
                }
            }, {
                "name": "ESSArch_Core.tasks.ValidateXMLFile",
                "if": generate_premis,
                "label": "Validate premis",
                "params": {
                    "xml_filename": "{{_PREMIS_PATH}}",
                }
            }, {
                "name":
                "ESSArch_Core.tasks.ValidateLogicalPhysicalRepresentation",
                "label": "Diff-check against content-mets",
                "args": ["{{_OBJPATH}}", "{{_CONTENT_METS_PATH}}"],
            }, {
                "name": "ESSArch_Core.tasks.CompareXMLFiles",
                "if": generate_premis,
                "label": "Compare premis and content-mets",
                "args": ["{{_PREMIS_PATH}}", "{{_CONTENT_METS_PATH}}"],
                "params": {
                    'recursive': False
                },
            }, {
                "name": "ESSArch_Core.tasks.CompareRepresentationXMLFiles",
                "if": has_representations and generate_premis,
                "label": "Compare representation premis and mets",
            }]
        },
        {
            "name": "ESSArch_Core.tasks.UpdateIPSizeAndCount",
            "label": "Update IP size and file count",
        },
    ]

    # Append the IP's own preservation steps and execute synchronously.
    workflow += new_ip.create_preservation_workflow()
    workflow = create_workflow(workflow,
                               new_ip,
                               name='Preserve Information Package',
                               eager=True)
    workflow.run()
Ejemplo n.º 14
0
def fill_specification_data(data=None, sa=None, ip=None, ignore=None):
    """Build the variable dict used when rendering profile specifications.

    Merges submission-agreement fields, information-package attributes,
    profile ids and the global Parameter/Path entities into a LazyDict.
    Several values are stored as tuples of a callable (plus optional
    arguments) — presumably resolved lazily by LazyDict on access; confirm
    against the LazyDict implementation.

    Args:
        data: Initial mapping to extend; wrapped in a LazyDict.
        sa: Submission agreement whose fields are included.
        ip: Information package whose fields are included.
        ignore: Keys to leave out (honored for _CTS_PATH/_CTS_SCHEMA_PATH).

    Returns:
        The populated LazyDict.
    """
    from ESSArch_Core.profiles.models import ProfileIP

    data = data or {}
    ignore = ignore or []

    data = LazyDict(data)

    if sa:
        data.update(_fill_sa_specification_data(sa))

    if ip:
        # Prefer an explicitly given SA; otherwise use the IP's own.
        if not sa and ip.submission_agreement is not None:
            sa = ip.submission_agreement
            data.update(_fill_sa_specification_data(sa))

        if ip.submission_agreement_data is not None:
            for k, v in ip.submission_agreement_data.data.items():
                data['SA_{}'.format(k)] = v

        data['_OBJID'] = ip.object_identifier_value
        data['_OBJUUID'] = str(ip.pk)
        data['_OBJLABEL'] = ip.label
        data['_OBJPATH'] = ip.object_path

        # Content path resolved from the profile structure; falls back to
        # the object path when no profile/structure is available.
        try:
            structure = ip.get_structure()
            content_dir, content_name = find_destination('content', structure)
            data['_CONTENTPATH'] = PurePath(ip.object_path).joinpath(
                content_dir, content_name).as_posix()
        except (ProfileIP.DoesNotExist, TypeError):
            data['_CONTENTPATH'] = ip.object_path

        data['_INNER_IP_OBJID'] = ip.sip_objid
        data['_INNER_IP_PATH'] = ip.sip_path
        data['_STARTDATE'] = ip.start_date
        data['_ENDDATE'] = ip.end_date
        data['_INFORMATIONCLASS'] = ip.information_class

        if '_CTS_PATH' not in ignore:
            data['_CTS_PATH'] = (ip.get_content_type_file, )
        if '_CTS_SCHEMA_PATH' not in ignore:
            data['_CTS_SCHEMA_PATH'] = (ip.get_content_type_schema_file, )

        data['_CONTENT_METS_PATH'] = os.path.join(ip.object_path,
                                                  ip.content_mets_path)
        data['_CONTENT_METS_CREATE_DATE'] = ip.content_mets_create_date
        data['_CONTENT_METS_SIZE'] = ip.content_mets_size
        data[
            '_CONTENT_METS_DIGEST_ALGORITHM'] = ip.get_content_mets_digest_algorithm_display(
            )
        data['_CONTENT_METS_DIGEST'] = ip.content_mets_digest

        data['_PACKAGE_METS_PATH'] = ip.package_mets_path
        data['_PACKAGE_METS_CREATE_DATE'] = ip.package_mets_create_date
        data['_PACKAGE_METS_SIZE'] = ip.package_mets_size
        data[
            '_PACKAGE_METS_DIGEST_ALGORITHM'] = ip.get_package_mets_digest_algorithm_display(
            )
        data['_PACKAGE_METS_DIGEST'] = ip.package_mets_digest

        data['_TEMP_CONTAINER_PATH'] = (ip.get_temp_container_path, )
        data['_TEMP_METS_PATH'] = (ip.get_temp_container_xml_path, )
        data['_TEMP_AIC_METS_PATH'] = (
            ip.get_temp_container_aic_xml_path, ) if ip.aic else None

        if ip.get_package_type_display() in ['SIP', 'DIP', 'AIP']:
            data['_PREMIS_PATH'] = os.path.join(
                ip.object_path, ip.get_premis_file_path(
                )) if ip.get_premis_file_path() else None
            data['allow_unknown_file_types'] = (
                ip.get_allow_unknown_file_types, )

        data['_IP_CONTAINER_FORMAT'] = (ip.get_container_format, )
        data['_IP_PACKAGE_TYPE'] = ip.get_package_type_display()

        if ip.policy is not None:
            data['_POLICYUUID'] = ip.policy.pk
            data['_POLICYID'] = ip.policy.policy_id
            data['_POLICYNAME'] = ip.policy.policy_name
            data['POLICY_INGEST_PATH'] = ip.policy.ingest_path.value
        else:
            # No policy attached: fall back to the transfer_project
            # profile's storage policy fields, if that profile exists.
            try:
                transfer_project_data = ip.get_profile_data('transfer_project')
                data['_POLICYUUID'] = transfer_project_data.get(
                    'storage_policy_uuid')
                data['_POLICYID'] = transfer_project_data.get(
                    'storage_policy_id')
                data['_POLICYNAME'] = transfer_project_data.get(
                    'storage_policy_name')
            except ObjectDoesNotExist:
                pass

        data['_AGENTS'] = (
            _get_agents,
            ip,
        )

        # One "_PROFILE_<TYPE>_ID" key per profile type.
        profile_ids = zip(lowercase_profile_types, [
            "_PROFILE_" + x.upper().replace(' ', '_') + "_ID"
            for x in profile_types
        ])

        for (profile_type, key) in profile_ids:
            data[key] = (_get_profile_id_by_type, profile_type, ip)

    for p in Parameter.objects.iterator():
        data['_PARAMETER_%s' % p.entity.upper()] = p.value

    for p in Path.objects.iterator():
        data['_PATH_%s' % p.entity.upper()] = p.value

    return data
Ejemplo n.º 15
0
    def test_find_destination(self):
        """find_destination resolves a 'use' key to a (dir, name) pair,
        returning (None, None) for unknown uses."""
        structure = [
            {'type': 'file', 'name': 'mets.xml', 'use': 'mets_file'},
            {
                'type': 'folder',
                'name': 'content',
                'use': 'content',
                'children': [
                    {'type': 'file', 'name': 'metadata.xml',
                     'use': 'content_type_specification'},
                    {'type': 'file', 'name': 'metadata.xsd',
                     'use': 'content_type_specification_schema'},
                ],
            },
            {
                'type': 'folder',
                'name': 'metadata',
                'use': 'metadata',
                'children': [
                    {'type': 'file', 'name': 'xsd_files',
                     'use': 'xsd_files'},
                    {'type': 'file', 'name': 'premis.xml',
                     'use': 'preservation_description_file'},
                    {'type': 'file', 'name': 'ead.xml',
                     'use': 'archival_description_file'},
                    {'type': 'file', 'name': 'eac.xml',
                     'use': 'authoritive_information_file'},
                ],
            },
        ]

        cases = (
            ('mets_file', ('', 'mets.xml')),
            ('xsd_files', ('metadata', 'xsd_files')),
            ('preservation_description_file', ('metadata', 'premis.xml')),
            ('foo', (None, None)),
        )

        for value, expected in cases:
            with self.subTest(value=value):
                self.assertEqual(find_destination(value, structure), expected)
Ejemplo n.º 16
0
    def _run(self):
        """Convert files in each active IP according to this job's rule.

        For every matching IP a new generation is created in the cache,
        files are copied over, every file matching a rule pattern is
        converted in place (original removed) and recorded as a
        ConversionJobEntry, and the new generation is preserved.
        """
        def get_information_packages():
            # Active IPs in the rule that this job has not processed yet.
            return self.rule.information_packages.filter(
                active=True, ).exclude(conversion_job_entries__job=self, )

        ips = get_information_packages()

        for ip in ips.order_by(
                '-cached').iterator():  # convert cached IPs first
            run_cached_ip(ip)

            policy = ip.policy
            srcdir = os.path.join(policy.cache_storage.value,
                                  ip.object_identifier_value)

            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)

            dstdir = os.path.join(policy.cache_storage.value,
                                  new_ip.object_identifier_value)

            new_ip.object_path = dstdir
            new_ip.save()

            aip_profile = new_ip.get_profile_rel('aip').profile
            aip_profile_data = new_ip.get_profile_data('aip')

            # METS path resolved in the *source* dir; passed on so the
            # preserve step can regenerate it for the new generation.
            mets_dir, mets_name = find_destination("mets_file",
                                                   aip_profile.structure)
            mets_path = os.path.join(srcdir, mets_dir, mets_name)

            # copy files to new generation
            shutil.copytree(srcdir, dstdir)

            # convert files specified in rule
            for pattern, spec in self.rule.specification.items():
                target = spec['target']
                tool = spec['tool']

                for path in iglob(dstdir + '/' + pattern):
                    if os.path.isdir(path):
                        # Directory match: convert every file underneath it.
                        for root, dirs, files in walk(path):
                            rel = os.path.relpath(root, dstdir)

                            for f in files:
                                fpath = os.path.join(root, f)
                                job_entry = ConversionJobEntry.objects.create(
                                    job=self,
                                    start_date=timezone.now(),
                                    ip=ip,
                                    old_document=os.path.join(rel, f))
                                convert_file(fpath, target)

                                # Original is replaced by the converted file.
                                os.remove(fpath)

                                job_entry.new_document = os.path.splitext(
                                    job_entry.old_document)[0] + '.' + target
                                job_entry.end_date = timezone.now()
                                job_entry.tool = tool
                                job_entry.save()

                    elif os.path.isfile(path):
                        rel = os.path.relpath(path, dstdir)

                        job_entry = ConversionJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            old_document=rel,
                        )
                        convert_file(path, target)

                        os.remove(path)

                        job_entry.new_document = os.path.splitext(
                            job_entry.old_document)[0] + '.' + target
                        job_entry.end_date = timezone.now()
                        job_entry.tool = tool
                        job_entry.save()

            # preserve new generation
            preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip,
                                    mets_path, new_ip, policy)
Ejemplo n.º 17
0
def preserve_new_generation(aip_profile, aip_profile_data, dstdir, ip,
                            mets_path, new_ip, policy):
    """Generate metadata for, package and store a new IP generation.

    Regenerates the METS (and PREMIS, when a preservation_metadata profile
    exists) for the new generation, tars the directory while indexing each
    path, generates the AIP description XML, updates the checksum/size on
    the IP record and finally schedules the StoreAIP task.
    """
    sa = new_ip.submission_agreement

    # Remove the copied-over METS so it is regenerated below; a missing
    # file (ENOENT) is fine.
    try:
        os.remove(mets_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise

    files_to_create = OrderedDict()

    try:
        premis_profile = new_ip.get_profile_rel(
            'preservation_metadata').profile
        premis_profile_data = ip.get_profile_data('preservation_metadata')
    except ProfileIP.DoesNotExist:
        # No preservation_metadata profile: skip PREMIS generation.
        pass
    else:
        premis_dir, premis_name = find_destination(
            "preservation_description_file", aip_profile.structure)
        premis_path = os.path.join(dstdir, premis_dir, premis_name)

        try:
            os.remove(premis_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        files_to_create[premis_path] = {
            'spec': premis_profile.specification,
            'data': fill_specification_data(premis_profile_data,
                                            ip=new_ip,
                                            sa=sa),
        }

    files_to_create[mets_path] = {
        'spec': aip_profile.specification,
        'data': fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
    }

    # Synchronously regenerate PREMIS (if any) and METS.
    t = ProcessTask.objects.create(
        name='ESSArch_Core.tasks.GenerateXML',
        params={
            'filesToCreate': files_to_create,
            'folderToParse': dstdir,
        },
        responsible=new_ip.responsible,
        information_package=new_ip,
    )
    t.run().get()

    dsttar = dstdir + '.tar'
    dstxml = dstdir + '.xml'

    objid = new_ip.object_identifier_value

    # Tar the generation; every entry is prefixed with the OBJID and
    # indexed as it is added.
    with tarfile.open(dsttar, 'w') as tar:
        for root, dirs, files in walk(dstdir):
            rel = os.path.relpath(root, dstdir)
            for d in dirs:
                src = os.path.join(root, d)
                arc = os.path.join(objid, rel, d)
                arc = os.path.normpath(arc)
                index_path(new_ip, src)
                tar.add(src, arc, recursive=False)

            for f in files:
                src = os.path.join(root, f)
                index_path(new_ip, src)
                tar.add(src, os.path.normpath(os.path.join(objid, rel, f)))

    algorithm = policy.get_checksum_algorithm_display()
    checksum = calculate_checksum(dsttar, algorithm=algorithm)

    info = fill_specification_data(new_ip.get_profile_data('aip_description'),
                                   ip=new_ip,
                                   sa=sa)
    info["_IP_CREATEDATE"] = timestamp_to_datetime(
        creation_date(dsttar)).isoformat()

    aip_desc_profile = new_ip.get_profile('aip_description')
    files_to_create = {
        dstxml: {
            'spec': aip_desc_profile.specification,
            'data': info
        }
    }

    # Synchronously generate the AIP description XML for the container.
    ProcessTask.objects.create(
        name="ESSArch_Core.tasks.GenerateXML",
        params={
            "filesToCreate": files_to_create,
            "folderToParse": dsttar,
            "extra_paths_to_parse": [mets_path],
            "algorithm": algorithm,
        },
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    InformationPackage.objects.filter(pk=new_ip.pk).update(
        message_digest=checksum,
        message_digest_algorithm=policy.checksum_algorithm,
    )

    ProcessTask.objects.create(
        name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
        information_package=new_ip,
        responsible=new_ip.responsible,
    ).run().get()

    # StoreAIP is started without waiting for its result.
    t = ProcessTask.objects.create(
        name='workflow.tasks.StoreAIP',
        information_package=new_ip,
        responsible=new_ip.responsible,
    )

    t.run()
Ejemplo n.º 18
0
    def _run_archive_object(self):
        """Apply this appraisal job's rule to each matching archived IP.

        Without a specification the job only registers every file of the
        IP. With a specification it creates a new generation, deletes the
        files matching the rule's patterns, records each deletion as an
        AppraisalJobEntry and preserves the new generation. Older
        generations are inactivated either way.
        """
        def get_information_packages():
            # Active IPs whose appraisal date has passed (or is unset) and
            # that this job has not processed yet.
            return self.rule.information_packages.filter(
                Q(
                    Q(appraisal_date__lte=timezone.now())
                    | Q(appraisal_date__isnull=True)),
                active=True,
            ).exclude(appraisal_job_entries__job=self, )

        ips = get_information_packages()
        logger.info(
            'Running appraisal job {} on {} information packages'.format(
                self.pk, ips.count()))

        for ip in ips.order_by('-cached').iterator():  # run cached IPs first
            run_cached_ip(ip)

            # inactivate old generations
            InformationPackage.objects.filter(
                aic=ip.aic, generation__lte=ip.generation).update(active=False)

            policy = ip.policy
            srcdir = os.path.join(policy.cache_storage.value,
                                  ip.object_identifier_value)

            if not self.rule.specification:
                # register all files
                for root, dirs, files in walk(srcdir):
                    rel = os.path.relpath(root, srcdir)

                    for f in files:
                        job_entry = AppraisalJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            document=os.path.join(rel, f))
                        job_entry.end_date = timezone.now()
                        job_entry.save()
            else:
                new_ip = ip.create_new_generation(ip.state, ip.responsible,
                                                  None)

                dstdir = os.path.join(policy.cache_storage.value,
                                      new_ip.object_identifier_value)

                new_ip.object_path = dstdir
                new_ip.save()

                aip_profile = new_ip.get_profile_rel('aip').profile
                aip_profile_data = new_ip.get_profile_data('aip')

                # METS path resolved in the *source* dir; handed to the
                # preserve step so it can be regenerated.
                mets_dir, mets_name = find_destination("mets_file",
                                                       aip_profile.structure)
                mets_path = os.path.join(srcdir, mets_dir, mets_name)

                # copy files to new generation
                shutil.copytree(srcdir, dstdir)

                # delete files specified in rule
                for pattern in self.rule.specification:
                    for path in iglob(dstdir + '/' + pattern):
                        if os.path.isdir(path):
                            # Directory match: delete every file underneath.
                            for root, dirs, files in walk(path):
                                rel = os.path.relpath(root, dstdir)

                                for f in files:
                                    fpath = os.path.join(root, f)
                                    job_entry = AppraisalJobEntry.objects.create(
                                        job=self,
                                        start_date=timezone.now(),
                                        ip=ip,
                                        document=os.path.join(rel, f))
                                    os.remove(fpath)
                                    job_entry.end_date = timezone.now()
                                    job_entry.save()

                        elif os.path.isfile(path):
                            rel = os.path.relpath(path, dstdir)

                            job_entry = AppraisalJobEntry.objects.create(
                                job=self,
                                start_date=timezone.now(),
                                ip=ip,
                                document=rel,
                            )
                            os.remove(path)
                            job_entry.end_date = timezone.now()
                            job_entry.save()

                # preserve new generation
                preserve_new_generation(aip_profile, aip_profile_data, dstdir,
                                        ip, mets_path, new_ip, policy)
Ejemplo n.º 19
0
    def run(self, purpose=None, delete_sip=False):
        """Receive a SIP container into a newly created AIP (generation 0).

        Records package-METS metadata on the AIP, creates the parent AIC,
        moves the AIP under the policy's ingest path, extracts or copies the
        SIP container into the AIP structure (depending on
        ``policy.receive_extract_sip``), merges the SIP's METS data into the
        AIP profile data, and optionally deletes the original SIP files.

        Args:
            purpose: Unused in this method — presumably part of the task
                interface; TODO confirm against the task base class.
            delete_sip: When True, delete the original SIP container and its
                sibling ``.xml`` file after receiving.
        """
        self.logger.debug('Receiving SIP')
        aip = InformationPackage.objects.get(pk=self.ip)
        algorithm = aip.get_checksum_algorithm()
        container = aip.object_path
        # objid = container file name without extension; extension decides
        # how (and whether) the container can be unpacked below.
        objid, container_type = os.path.splitext(os.path.basename(container))
        container_type = container_type.lower()
        xml = aip.package_mets_path
        # Record creation date, size and checksum of the package METS file.
        aip.package_mets_create_date = timestamp_to_datetime(
            creation_date(xml)).isoformat()
        aip.package_mets_size = os.path.getsize(xml)
        aip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
            algorithm.upper()]
        aip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)
        # This AIP is the first generation under a freshly created AIC.
        aip.generation = 0
        aic = InformationPackage.objects.create(
            package_type=InformationPackage.AIC,
            responsible=aip.responsible,
            label=aip.label,
            start_date=aip.start_date,
            end_date=aip.end_date)
        # Remember the original container location before repointing
        # object_path at the ingest area (needed for delete_sip below).
        old_sip_path = aip.object_path
        aip.aic = aic
        aip_dir = os.path.join(aip.policy.ingest_path.value, objid)
        aip.object_path = aip_dir
        try:
            os.makedirs(aip_dir)
        except OSError as e:
            # Already-existing ingest dir is fine; anything else is fatal.
            if e.errno != errno.EEXIST:
                raise

        aip.save()

        # Locate where the SIP content goes inside the AIP structure;
        # fall back to the 'content' destination if no 'sip' entry exists.
        dst_path, dst_name = find_destination('sip',
                                              aip.get_profile('aip').structure,
                                              aip.object_path)
        if dst_path is None:
            dst_path, dst_name = find_destination(
                'content',
                aip.get_profile('aip').structure, aip.object_path)

        # parse_params returns a sequence; the trailing comma unpacks the
        # single expected value.
        dst_name, = self.parse_params(dst_name)
        dst = os.path.join(dst_path, dst_name)

        sip_profile = aip.submission_agreement.profile_sip

        # Remove any leftover destination from a previous attempt.
        try:
            shutil.rmtree(dst)
        except FileNotFoundError:
            pass

        if aip.policy.receive_extract_sip:
            # Unpack the container into a temp dir, then move its content
            # into place inside the AIP.
            temp = Path.objects.cached('entity', 'temp', 'value')
            with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
                self.logger.debug('Extracting {} to {}'.format(
                    container, tmpdir))
                # NOTE(review): extractall() performs no member filtering and
                # so trusts archive-internal paths — confirm containers only
                # come from trusted sources.
                if container_type == '.tar':
                    with tarfile.open(container) as tar:
                        root_member_name = tar.getnames()[0]
                        tar.extractall(tmpdir)
                elif container_type == '.zip':
                    with zipfile.ZipFile(container) as zipf:
                        root_member_name = zipf.namelist()[0]
                        zipf.extractall(tmpdir)
                else:
                    raise ValueError(
                        'Invalid container type: {}'.format(container))

                # Trailing separator ensures dst is treated as a directory.
                dst = os.path.join(dst, '')
                try:
                    os.makedirs(dst)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise

                # If the archive wraps everything in a single root folder,
                # move that folder's content rather than the folder itself.
                tmpsrc = tmpdir
                if len(os.listdir(tmpdir)) == 1 and os.listdir(
                        tmpdir)[0] == root_member_name:
                    new_tmpsrc = os.path.join(tmpdir, root_member_name)
                    if os.path.isdir(new_tmpsrc):
                        tmpsrc = new_tmpsrc

                self.logger.debug('Moving content of {} to {}'.format(
                    tmpsrc, dst))

                for f in os.listdir(tmpsrc):
                    shutil.move(os.path.join(tmpsrc, f), dst)

                self.logger.debug('Deleting {}'.format(tmpdir))

            # sip_path points at the extracted directory, relative to the AIP.
            aip.sip_path = os.path.relpath(dst, aip.object_path)
        else:
            # Keep the container packed: copy it as-is into the AIP.
            self.logger.debug('Copying {} to {}'.format(container, dst))
            shutil.copy2(container, dst)
            aip.sip_path = os.path.relpath(
                os.path.join(dst, os.path.basename(container)),
                aip.object_path)

        # Parse the SIP's own METS file; when sip_path is a file (packed
        # container) open_file is told to look inside the container.
        sip_mets_dir, sip_mets_file = find_destination('mets_file',
                                                       sip_profile.structure,
                                                       aip.sip_path)
        if os.path.isfile(aip.sip_path):
            sip_mets_data = parse_mets(
                open_file(
                    os.path.join(aip.object_path, sip_mets_dir, sip_mets_file),
                    container=aip.sip_path,
                    container_prefix=aip.object_identifier_value,
                ))
        else:
            sip_mets_data = parse_mets(
                open_file(
                    os.path.join(aip.object_path, sip_mets_dir,
                                 sip_mets_file)))

        # prefix all SIP data
        sip_mets_data = {
            f'SIP_{k.upper()}': v
            for k, v in sip_mets_data.items()
        }

        # Merge the prefixed SIP METS variables into the AIP profile data.
        aip_profile_rel_data = aip.get_profile_rel('aip').data
        aip_profile_rel_data.data.update(sip_mets_data)
        aip_profile_rel_data.save()

        if delete_sip:
            # Remove the original container and its sibling METS .xml file.
            delete_path(old_sip_path)
            delete_path(pathlib.Path(old_sip_path).with_suffix('.xml'))

        self.logger.debug('sip_path set to {}'.format(aip.sip_path))
        aip.save()
Ejemplo n.º 20
0
def ReceiveSIP(self, purpose=None, delete_sip=False):
    """Receive a SIP container into the policy's ingest area.

    Records package-METS metadata on the IP, moves ``object_path`` under the
    ingest path, then either extracts the container into the IP structure
    (``policy.receive_extract_sip``) or copies it as-is. Returns the
    destination path of the received SIP.

    Args:
        purpose: Unused in this task — TODO confirm against the task
            interface.
        delete_sip: Unused here, unlike the method variant of this task —
            NOTE(review): confirm whether SIP deletion was intentionally
            dropped from this implementation.
    """
    logger = logging.getLogger('essarch.workflow.tasks.ReceiveSIP')
    logger.debug('Receiving SIP')
    ip = self.get_information_package()
    algorithm = ip.get_checksum_algorithm()
    container = ip.object_path
    # objid = container file name without extension; the extension decides
    # how the container is unpacked below.
    objid, container_type = os.path.splitext(os.path.basename(container))
    container_type = container_type.lower()
    xml = ip.package_mets_path
    # Record creation date, size and checksum of the package METS file.
    ip.package_mets_create_date = timestamp_to_datetime(
        creation_date(xml)).isoformat()
    ip.package_mets_size = os.path.getsize(xml)
    ip.package_mets_digest_algorithm = MESSAGE_DIGEST_ALGORITHM_CHOICES_DICT[
        algorithm.upper()]
    ip.package_mets_digest = calculate_checksum(xml, algorithm=algorithm)

    # Repoint the IP at its ingest-area directory.
    ip.object_path = os.path.join(ip.policy.ingest_path.value,
                                  ip.object_identifier_value)
    ip.save()

    # Locate where the SIP content goes inside the IP structure; fall back
    # to the 'content' destination when no 'sip' entry exists.
    sip_dst_path, sip_dst_name = find_destination('sip', ip.get_structure(),
                                                  ip.object_path)
    if sip_dst_path is None:
        sip_dst_path, sip_dst_name = find_destination('content',
                                                      ip.get_structure(),
                                                      ip.object_path)

    # parse_params returns a sequence; the trailing comma unpacks the single
    # expected value.
    sip_dst_name, = self.parse_params(sip_dst_name)
    sip_dst = os.path.join(sip_dst_path, sip_dst_name)

    if ip.policy.receive_extract_sip:
        # remove any existing directory from previous attempts
        delete_path(sip_dst)

        temp = Path.objects.get(entity='temp').value
        with tempfile.TemporaryDirectory(dir=temp) as tmpdir:
            logger.debug('Extracting {} to {}'.format(container, tmpdir))
            # NOTE(review): extractall() performs no member filtering and so
            # trusts archive-internal paths — confirm containers only come
            # from trusted sources.
            if container_type == '.tar':
                with tarfile.open(container) as tar:
                    root_member_name = tar.getnames()[0]
                    tar.extractall(tmpdir)
            elif container_type == '.zip':
                with zipfile.ZipFile(container) as zipf:
                    root_member_name = zipf.namelist()[0]
                    zipf.extractall(tmpdir)
            else:
                raise ValueError(
                    'Invalid container type: {}'.format(container))

            # Trailing separator ensures sip_dst is treated as a directory.
            sip_dst = os.path.join(sip_dst, '')
            os.makedirs(sip_dst)

            # If the archive wraps everything in a single root folder, move
            # that folder's content rather than the folder itself.
            tmpsrc = tmpdir
            if len(os.listdir(tmpdir)) == 1 and os.listdir(
                    tmpdir)[0] == root_member_name:
                new_tmpsrc = os.path.join(tmpdir, root_member_name)
                if os.path.isdir(new_tmpsrc):
                    tmpsrc = new_tmpsrc

            logger.debug('Moving content of {} to {}'.format(tmpsrc, sip_dst))

            for f in os.listdir(tmpsrc):
                shutil.move(os.path.join(tmpsrc, f), sip_dst)

            logger.debug('Deleting {}'.format(tmpdir))
    else:
        # Keep the container packed: copy it as-is into the IP.
        logger.debug('Copying {} to {}'.format(container, sip_dst))
        shutil.copy2(container, sip_dst)

    ip.sip_path = os.path.relpath(sip_dst, ip.object_path)
    ip.save()
    self.create_success_event("Received SIP")
    return sip_dst
Ejemplo n.º 21
0
    def _run(self):
        """Run this conversion job on all matching information packages.

        For each active IP not yet handled by this job: ensure it is cached
        (triggering ``CacheAIP`` and polling until done), create a new
        generation, copy the cached files over, convert the files matched by
        the rule's glob patterns (logging a ``ConversionJobEntry`` per file),
        regenerate the METS/PREMIS metadata, tar the new generation, and
        hand it off to ``StoreAIP`` for preservation.
        """
        def get_information_packages(job):
            # NOTE(review): `job` is unused; the query closes over `self`
            # instead — confirm whether the parameter was meant to be used.
            return self.rule.information_packages.filter(
                active=True, ).exclude(conversion_job_entries__job=self, )

        ips = get_information_packages(self)

        for ip in ips.order_by(
                '-cached').iterator():  # convert cached IPs first
            # Block (polling every 10s) until the IP is cached, starting a
            # CacheAIP task if one does not already exist for it.
            while not ip.cached:
                with allow_join_result():
                    t, created = ProcessTask.objects.get_or_create(
                        name='workflow.tasks.CacheAIP',
                        information_package=ip,
                        defaults={
                            'responsible': ip.responsible,
                            'eager': False
                        })

                    if not created:
                        t.run()

                time.sleep(10)
                ip.refresh_from_db()

            policy = ip.policy
            srcdir = os.path.join(policy.cache_storage.value,
                                  ip.object_identifier_value)

            new_ip = ip.create_new_generation(ip.state, ip.responsible, None)

            dstdir = os.path.join(policy.cache_storage.value,
                                  new_ip.object_identifier_value)

            new_ip.object_path = dstdir
            new_ip.save()

            aip_profile = new_ip.get_profile_rel('aip').profile
            aip_profile_data = new_ip.get_profile_data('aip')

            # NOTE(review): mets_path is rooted in srcdir (the OLD
            # generation's cache dir), and is later removed and regenerated
            # at that location — confirm this is intentional.
            mets_dir, mets_name = find_destination("mets_file",
                                                   aip_profile.structure)
            mets_path = os.path.join(srcdir, mets_dir, mets_name)

            # Parsed but never read below; presumably kept so an unparsable
            # METS aborts the job early — TODO confirm.
            mets_tree = etree.parse(mets_path)

            # copy files to new generation
            shutil.copytree(srcdir, dstdir)

            # convert files specified in rule
            # (was six.iteritems(); the file already relies on Python 3
            # f-strings, so plain dict.items() is used instead)
            for pattern, spec in self.rule.specification.items():
                target = spec['target']
                tool = spec['tool']

                for path in iglob(dstdir + '/' + pattern):
                    if os.path.isdir(path):
                        # Convert every file under a matched directory.
                        for root, dirs, files in walk(path):
                            rel = os.path.relpath(root, dstdir)

                            for f in files:
                                fpath = os.path.join(root, f)
                                job_entry = ConversionJobEntry.objects.create(
                                    job=self,
                                    start_date=timezone.now(),
                                    ip=ip,
                                    old_document=os.path.join(rel, f))
                                convert_file(fpath, target)

                                # The converted file replaces the original.
                                os.remove(fpath)

                                job_entry.new_document = os.path.splitext(
                                    job_entry.old_document)[0] + '.' + target
                                job_entry.end_date = timezone.now()
                                job_entry.tool = tool
                                job_entry.save()

                    elif os.path.isfile(path):
                        rel = os.path.relpath(path, dstdir)

                        job_entry = ConversionJobEntry.objects.create(
                            job=self,
                            start_date=timezone.now(),
                            ip=ip,
                            old_document=rel,
                        )
                        convert_file(path, target)

                        os.remove(path)

                        job_entry.new_document = os.path.splitext(
                            job_entry.old_document)[0] + '.' + target
                        job_entry.end_date = timezone.now()
                        job_entry.tool = tool
                        job_entry.save()

            # preserve new generation
            sa = new_ip.submission_agreement

            # Remove the stale METS so GenerateXML can recreate it; a
            # missing file is fine.
            try:
                os.remove(mets_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

            filesToCreate = OrderedDict()

            # If a preservation-metadata (PREMIS) profile exists, schedule
            # its file for regeneration too.
            try:
                premis_profile = new_ip.get_profile_rel(
                    'preservation_metadata').profile
                # NOTE(review): reads profile data from `ip`, not `new_ip`
                # — confirm intentional.
                premis_profile_data = ip.get_profile_data(
                    'preservation_metadata')
            except ProfileIP.DoesNotExist:
                pass
            else:
                premis_dir, premis_name = find_destination(
                    "preservation_description_file", aip_profile.structure)
                premis_path = os.path.join(dstdir, premis_dir, premis_name)

                try:
                    os.remove(premis_path)
                except OSError as e:
                    if e.errno != errno.ENOENT:
                        raise

                filesToCreate[premis_path] = {
                    'spec':
                    premis_profile.specification,
                    'data':
                    fill_specification_data(premis_profile_data,
                                            ip=new_ip,
                                            sa=sa),
                }

            filesToCreate[mets_path] = {
                'spec':
                aip_profile.specification,
                'data':
                fill_specification_data(aip_profile_data, ip=new_ip, sa=sa),
            }

            # Regenerate METS (and PREMIS) synchronously.
            t = ProcessTask.objects.create(
                name='ESSArch_Core.tasks.GenerateXML',
                params={
                    'filesToCreate': filesToCreate,
                    'folderToParse': dstdir,
                },
                responsible=new_ip.responsible,
                information_package=new_ip,
            )
            t.run().get()

            dsttar = dstdir + '.tar'
            dstxml = dstdir + '.xml'

            objid = new_ip.object_identifier_value

            # Tar the new generation, indexing every path as it is added;
            # archive entries are rooted at the object identifier.
            with tarfile.open(dsttar, 'w') as tar:
                for root, dirs, files in walk(dstdir):
                    rel = os.path.relpath(root, dstdir)
                    for d in dirs:
                        src = os.path.join(root, d)
                        arc = os.path.join(objid, rel, d)
                        arc = os.path.normpath(arc)
                        index_path(new_ip, src)
                        tar.add(src, arc, recursive=False)

                    for f in files:
                        src = os.path.join(root, f)
                        index_path(new_ip, src)
                        tar.add(src,
                                os.path.normpath(os.path.join(objid, rel, f)))

            algorithm = policy.get_checksum_algorithm_display()
            checksum = calculate_checksum(dsttar, algorithm=algorithm)

            # Build the AIP-description XML next to the tar.
            info = fill_specification_data(
                new_ip.get_profile_data('aip_description'), ip=new_ip, sa=sa)
            info["_IP_CREATEDATE"] = timestamp_to_datetime(
                creation_date(dsttar)).isoformat()

            aip_desc_profile = new_ip.get_profile('aip_description')
            filesToCreate = {
                dstxml: {
                    'spec': aip_desc_profile.specification,
                    'data': info
                }
            }

            ProcessTask.objects.create(
                name="ESSArch_Core.tasks.GenerateXML",
                params={
                    "filesToCreate": filesToCreate,
                    "folderToParse": dsttar,
                    "extra_paths_to_parse": [mets_path],
                    "algorithm": algorithm,
                },
                information_package=new_ip,
                responsible=new_ip.responsible,
            ).run().get()

            # Record the tar's checksum on the new generation.
            InformationPackage.objects.filter(pk=new_ip.pk).update(
                message_digest=checksum,
                message_digest_algorithm=policy.checksum_algorithm,
            )

            ProcessTask.objects.create(
                name='ESSArch_Core.tasks.UpdateIPSizeAndCount',
                information_package=new_ip,
                responsible=new_ip.responsible,
            ).run().get()

            # Hand the new generation off for preservation (async).
            t = ProcessTask.objects.create(
                name='workflow.tasks.StoreAIP',
                information_package=new_ip,
                responsible=new_ip.responsible,
            )

            t.run()
Ejemplo n.º 22
0
def fill_specification_data(data=None, sa=None, ip=None):
    """Assemble the variable dictionary used when rendering XML from a
    profile specification.

    Starting from ``data`` (a new dict when omitted or falsy), merges in
    submission-agreement fields, information-package fields (identifiers,
    METS metadata, container format, policy, agents, profile ids) and all
    global ``Parameter``/``Path`` entries. Every underscore-prefixed key is
    finally mirrored without the prefix.

    Args:
        data: Optional base dict to populate.
        sa: Optional submission agreement.
        ip: Optional information package.

    Returns:
        The populated dict.
    """
    data = data or {}

    if sa:
        data['_SA_ID'] = str(sa.pk)
        data['_SA_NAME'] = sa.name

    if ip:
        # Fall back to the IP's own submission agreement when none was given.
        if not sa and ip.submission_agreement is not None:
            sa = ip.submission_agreement
            data['_SA_ID'] = str(sa.pk)
            data['_SA_NAME'] = sa.name

        data.update({
            '_OBJID': ip.object_identifier_value,
            '_OBJUUID': str(ip.pk),
            '_OBJLABEL': ip.label,
            '_OBJPATH': ip.object_path,
            '_INNER_IP_OBJID': ip.sip_objid,
            '_INNER_IP_PATH': ip.sip_path,
            '_STARTDATE': ip.start_date,
            '_ENDDATE': ip.end_date,
            '_INFORMATIONCLASS': ip.information_class,
            '_CONTENT_METS_PATH': ip.content_mets_path,
            '_CONTENT_METS_CREATE_DATE': ip.content_mets_create_date,
            '_CONTENT_METS_SIZE': ip.content_mets_size,
            '_CONTENT_METS_DIGEST_ALGORITHM':
                ip.get_content_mets_digest_algorithm_display(),
            '_CONTENT_METS_DIGEST': ip.content_mets_digest,
            '_PACKAGE_METS_PATH': ip.package_mets_path,
            '_PACKAGE_METS_CREATE_DATE': ip.package_mets_create_date,
            '_PACKAGE_METS_SIZE': ip.package_mets_size,
            '_PACKAGE_METS_DIGEST_ALGORITHM':
                ip.get_package_mets_digest_algorithm_display(),
            '_PACKAGE_METS_DIGEST': ip.package_mets_digest,
        })

        package_type = ip.get_package_type_display()
        if package_type in ['SIP', 'AIP']:
            profile_key = package_type.lower()
            ip_profile = ip.get_profile(profile_key)
            if ip_profile is not None:
                premis_dir, premis_file = find_destination(
                    "preservation_description_file", ip_profile.structure)
                if premis_dir is not None and premis_file is not None:
                    data['_PREMIS_PATH'] = os.path.join(
                        ip.object_path, premis_dir, premis_file)
            data['allow_unknown_file_types'] = ip.get_profile_data(
                profile_key).get('allow_unknown_file_types', False)

        try:
            # do we have a transfer project profile?
            ip.get_profile('transfer_project')
        except AttributeError:
            container = 'TAR'
        else:
            container = ip.get_container_format()

        data['_IP_CONTAINER_FORMAT'] = container.upper()

        if ip.policy is not None:
            data['_POLICYUUID'] = ip.policy.pk
            data['_POLICYID'] = ip.policy.policy_id
            data['_POLICYNAME'] = ip.policy.policy_name
            data['POLICY_INGEST_PATH'] = ip.policy.ingest_path.value
        else:
            try:
                # do we have a transfer project profile?
                ip.get_profile('transfer_project')
            except AttributeError:
                pass
            else:
                tp_data = ip.get_profile_data('transfer_project')
                data['_POLICYUUID'] = tp_data.get('archive_policy_uuid')
                data['_POLICYID'] = tp_data.get('archive_policy_id')
                data['_POLICYNAME'] = tp_data.get('archive_policy_name')

        # One entry per agent, keyed by "<ROLE>_<TYPE>"; 'OTHER' roles/types
        # carry the concrete value in a separate OTHERROLE/OTHERTYPE field.
        agents = {}
        for agent in ip.agents.all():
            entry = {
                '_AGENTS_NAME': agent.name,
                '_AGENTS_NOTES': [{'_AGENTS_NOTE': note.note}
                                  for note in agent.notes.all()],
            }

            if agent.other_role:
                entry['_AGENTS_ROLE'] = 'OTHER'
                entry['_AGENTS_OTHERROLE'] = agent.role
            else:
                entry['_AGENTS_ROLE'] = agent.role

            if agent.other_type:
                entry['_AGENTS_TYPE'] = 'OTHER'
                entry['_AGENTS_OTHERTYPE'] = agent.type
            else:
                entry['_AGENTS_TYPE'] = agent.type

            agent_key = '{role}_{type}'.format(role=agent.role.upper(),
                                               type=agent.type.upper())
            agents[agent_key] = entry
        data['_AGENTS'] = agents

        # Expose the primary key of each attached profile type; profiles
        # that are missing (get_profile returns None) are simply skipped.
        for profile_type in profile_types:
            id_key = "_PROFILE_" + profile_type.upper().replace(' ',
                                                                '_') + "_ID"
            try:
                data[id_key] = str(
                    ip.get_profile(
                        profile_type.lower().replace(' ', '_')).pk)
            except AttributeError:
                pass

    for parameter in Parameter.objects.iterator():
        data['_PARAMETER_%s' % parameter.entity.upper()] = parameter.value

    for path in Path.objects.iterator():
        data['_PATH_%s' % path.entity.upper()] = path.value

    # Mirror every underscore-prefixed key without its leading underscore.
    data.update(_remove_leading_underscores(data))
    return data