Exemple #1
0
    def run(self, ip=None, xmlfile=None, validate_fileformat=True, validate_integrity=True, rootdir=None):
        """
        Create and run a parallel "Validate Files" step for files listed in an XML file.

        Every element in ``xmlfile`` whose local name is a key of
        ``settings.FILE_ELEMENTS`` is inspected. For each one, a file format
        task and/or a checksum task is added to the step, depending on the
        flags and on whether the element carries the relevant value.

        Args:
            ip: information package attached to the created tasks; its
                ``ObjectPath`` is used as root directory when ``rootdir``
                is None
            xmlfile: XML file to read the file elements from
            validate_fileformat: queue file format tasks when true
            validate_integrity: queue checksum tasks when true
            rootdir: directory the file paths are joined onto

        Returns:
            The result of running the created step.
        """
        step = ProcessStep.objects.create(
            name="Validate Files",
            parallel=True,
            parent_step=self.taskobj.processstep
        )

        if validate_fileformat or validate_integrity:
            if rootdir is None:
                rootdir = ip.ObjectPath

            doc = etree.ElementTree(file=xmlfile)

            # keyword arguments shared by every task created below
            task_defaults = {
                "log": self.taskobj.log,
                "information_package": ip,
                "responsible": self.taskobj.responsible,
            }

            for elname, props in settings.FILE_ELEMENTS.items():
                for f in doc.xpath('.//*[local-name()="%s"]' % elname):
                    fpath = get_value_from_path(f, props["path"])

                    if not fpath:
                        # Without a path there is no file to validate, and
                        # joining None onto rootdir would raise.
                        continue

                    fpath = remove_prefix(fpath, props.get("pathprefix", ""))
                    filename = os.path.join(rootdir, fpath)

                    fformat = get_value_from_path(f, props.get("format"))
                    checksum = get_value_from_path(f, props.get("checksum"))
                    algorithm = get_value_from_path(f, props.get("checksumtype"))

                    if validate_fileformat and fformat is not None:
                        step.tasks.add(ProcessTask.objects.create(
                            name=self.fileformat_task,
                            params={
                                "filename": filename,
                                "fileformat": fformat,
                            },
                            **task_defaults
                        ))

                    if validate_integrity and checksum is not None:
                        step.tasks.add(ProcessTask.objects.create(
                            name=self.checksum_task,
                            params={
                                "filename": filename,
                                "checksum": checksum,
                                "algorithm": algorithm,
                            },
                            **task_defaults
                        ))

        self.taskobj.log = None
        self.taskobj.save(update_fields=['log'])
        self.set_progress(100, total=100)

        with allow_join_result():
            return step.run().get()
Exemple #2
0
    def path(self, path):
        """
        Store the normalized path in ``self._path``.

        When ``path`` is given it is normalized as is. When it is None the
        path is looked up on ``self.el`` using the ``path`` entries of
        ``self.props`` (first one yielding a value wins), the longest
        matching ``pathprefix`` is removed, the leading segment is dropped
        if ``path_includes_root`` is set, and leading slashes/spaces are
        stripped before normalizing.
        """
        if path is not None:
            self._path = normalize_path(path)
            return

        candidates = self.props.get('path', [''])
        if isinstance(candidates, str):
            candidates = [candidates]
        self.paths = candidates

        resolved = None
        for candidate in self.paths:
            resolved = get_value_from_path(self.el, candidate)
            if resolved is not None:
                break

        self.path_prefix = self.props.get('pathprefix', [])
        # try longer prefixes first so the most specific one is removed
        longest_first = sorted(self.path_prefix, key=len, reverse=True)
        for prefix in longest_first:
            stripped = remove_prefix(resolved, prefix)
            if stripped != resolved:
                resolved = stripped
                break

        if self.props.get('path_includes_root', False):
            head, sep, tail = resolved.partition('/')
            resolved = tail if sep else head

        self._path = normalize_path(resolved.lstrip('/ '))
Exemple #3
0
    def __init__(self, el, props, path=None, rootdir=None):
        '''
        Read the file attributes described by ``props`` from ``el``.

        args:
            el: lxml.etree._Element
            props: 'dict with properties from FILE_ELEMENTS'
            path: value assigned to ``self.path``
            rootdir: accepted but not used by this initializer
        '''

        self.el = el
        self.props = props
        self.path = path

        # checksum and checksum type are stored lower-cased when present
        checksum = get_value_from_path(el, props.get('checksum', ''))
        if checksum is not None:
            checksum = checksum.lower()
        self.checksum = checksum

        checksum_type = get_value_from_path(el, props.get('checksumtype', ''))
        if checksum_type is not None:
            checksum_type = checksum_type.lower()
        self.checksum_type = checksum_type

        size = get_value_from_path(el, props.get('size', ''))
        if size is not None:
            size = int(size)
        self.size = size

        self.format = get_value_from_path(el, props.get('format', ''))
Exemple #4
0
    def __init__(self, el, props, path=None, rootdir=None):
        '''
        Read the file attributes described by ``props`` from ``el``.

        When ``path`` is None the path is looked up on ``el`` using the
        ``path`` entries of ``props`` (first one yielding a value wins),
        the longest matching ``pathprefix`` is removed, the leading segment
        is dropped if ``path_includes_root`` is set and leading
        slashes/spaces are stripped.

        args:
            el: lxml.etree._Element
            props: 'dict with properties from FILE_ELEMENTS'
            path: explicit path, used as is when not None
            rootdir: accepted but not used by this initializer
        '''

        self.path = path
        if path is None:
            candidates = props.get('path', [''])
            if isinstance(candidates, six.string_types):
                candidates = [candidates]
            self.paths = candidates

            resolved = None
            for candidate in candidates:
                resolved = get_value_from_path(el, candidate)
                if resolved is not None:
                    break

            self.path_prefix = props.get('pathprefix', [])
            # longest prefixes first so the most specific one is removed
            for prefix in sorted(self.path_prefix, key=len, reverse=True):
                stripped = remove_prefix(resolved, prefix)
                if stripped != resolved:
                    resolved = stripped
                    break

            if props.get('path_includes_root', False):
                resolved = resolved.split('/', 1)[-1]

            self.path = resolved.lstrip('/ ')

        # checksum and checksum type are stored lower-cased when present
        checksum = get_value_from_path(el, props.get('checksum', ''))
        if checksum is not None:
            checksum = checksum.lower()
        self.checksum = checksum

        checksum_type = get_value_from_path(el, props.get('checksumtype', ''))
        if checksum_type is not None:
            checksum_type = checksum_type.lower()
        self.checksum_type = checksum_type

        size = get_value_from_path(el, props.get('size', ''))
        self.size = None if size is None else int(size)

        self.format = get_value_from_path(el, props.get('format', ''))
Exemple #5
0
def get_objectpath(el):
    """
    Return the object path referenced by the first ``FLocat`` below ``el``.

    The ``href`` value of the first element whose local name is ``FLocat``
    is returned with everything up to and including ``file:///`` removed.
    If the value does not contain ``file:///`` it is returned unchanged.

    Returns None when there is no ``FLocat`` element or when no ``href``
    value could be read from it.
    """
    try:
        e = el.xpath('.//*[local-name()="%s"]' % "FLocat")[0]
        val = get_value_from_path(e, "@href")
    except IndexError:
        return None

    if val is None:
        # no href to split; previously this raised AttributeError
        return None

    parts = val.split('file:///')
    return parts[1] if len(parts) > 1 else val
Exemple #6
0
    def __init__(self, el, props):
        '''
        Read path, checksum, checksum type and format of a file element.

        The path has its longest matching ``pathprefix`` removed and its
        leading slashes/spaces stripped.

        args:
            el: lxml.etree._Element
            props: 'dict with properties from FILE_ELEMENTS'
        '''

        resolved = get_value_from_path(el, props.get('path', ''))

        self.path_prefix = props.get('pathprefix', [])
        # longest prefixes first so the most specific one is removed
        by_length_desc = sorted(self.path_prefix, key=len, reverse=True)
        for prefix in by_length_desc:
            stripped = remove_prefix(resolved, prefix)
            if stripped != resolved:
                resolved = stripped
                break

        self.path = resolved.lstrip('/ ')

        self.checksum = get_value_from_path(el, props.get('checksum', ''))
        self.checksum_type = get_value_from_path(el, props.get('checksumtype', ''))

        self.format = get_value_from_path(el, props.get('format', ''))
Exemple #7
0
    def run(self, dirname=None, files=None, files_reldir=None, xmlfile=None):
        """
        Check that the files listed in an XML file match the files on disk.

        The logical set is built from the paths of all elements in
        ``xmlfile`` whose local name is a key of ``settings.FILE_ELEMENTS``.
        The physical set is built from every file below ``dirname`` (except
        the XML file itself) plus the entries of ``files``.

        Args:
            dirname: directory to walk for physical files, optional
            files: additional physical files, optional
            files_reldir: when set, entries of ``files`` are made relative
                to this directory
            xmlfile: XML file describing the logical files

        Returns:
            "Success" when both sets are equal.

        Raises:
            AssertionError: when the sets differ.
        """
        if dirname:
            xmlrelpath = os.path.relpath(xmlfile, dirname)
            xmlrelpath = remove_prefix(xmlrelpath, "./")
        else:
            xmlrelpath = xmlfile

        doc = etree.ElementTree(file=xmlfile)

        logical_files = set()
        physical_files = set()

        for elname, props in settings.FILE_ELEMENTS.items():
            for el in doc.xpath('.//*[local-name()="%s"]' % elname):
                filename = get_value_from_path(el, props["path"])

                if filename:
                    filename = remove_prefix(filename, props.get("pathprefix", ""))
                    logical_files.add(filename)

        if dirname:
            # Normalize the XML path the same way as the walked files so the
            # two can be compared as full relative paths.
            xml_posix = win_to_posix(xmlrelpath)

            for dirpath, _dirs, filenames in os.walk(dirname):
                reldir = os.path.relpath(dirpath, dirname)

                for name in filenames:
                    relfile = os.path.join(reldir, name)
                    relfile = win_to_posix(relfile)
                    relfile = remove_prefix(relfile, "./")

                    # Compare the relative path, not the bare file name:
                    # the XML file may live in a subdirectory, and another
                    # file may share its base name.
                    if relfile != xml_posix:
                        physical_files.add(relfile)

        for f in files or ():
            if files_reldir:
                f = os.path.relpath(f, files_reldir)
            physical_files.add(f)

        # Raised explicitly (rather than with an assert statement) so the
        # check is not stripped when Python runs with -O.
        if logical_files != physical_files:
            raise AssertionError("the logical representation differs from the physical")

        self.set_progress(100, total=100)
        return "Success"
Exemple #8
0
    def destroy(self, request, pk=None):
        """
        Remove a received package from disk and, if present, from the database.

        ``<pk>.xml`` is looked up in the reception directory first and in
        the unidentified directory second. When found, the object referenced
        by its first ``FLocat`` href is deleted (directory tree or single
        file), followed by every file whose name starts with the XML file's
        name without extension.

        Returns:
            The parent class' ``destroy`` response when an
            ``InformationPackage`` with this pk exists, otherwise an empty
            204 response.
        """
        reception = Path.objects.get(entity="path_ingest_reception").value
        uip = Path.objects.get(entity="path_ingest_unidentified").value

        xmlfile = os.path.join(reception, "%s.xml" % pk)
        srcdir = reception

        if not os.path.isfile(xmlfile):
            xmlfile = os.path.join(uip, "%s.xml" % pk)
            srcdir = uip

        if os.path.isfile(xmlfile):
            doc = etree.parse(xmlfile)
            root = doc.getroot()

            el = root.xpath('.//*[local-name()="%s"]' % "FLocat")[0]
            objpath = get_value_from_path(el, "@href").split('file:///')[1]
            # NOTE(review): objpath comes straight from the XML file and is
            # not checked to stay below srcdir - confirm the file is trusted.
            path = os.path.join(srcdir, objpath)

            try:
                shutil.rmtree(path)
            except OSError as e:
                if e.errno not in (errno.ENOENT, errno.ENOTDIR):
                    raise

                # Not a directory tree: fall back to removing a plain file.
                try:
                    os.remove(path)
                except OSError as remove_error:
                    # An object that is already gone is not an error when
                    # deleting; anything else still is.
                    if remove_error.errno != errno.ENOENT:
                        raise
            finally:
                # always clean up the XML file and its siblings
                for fl in glob.glob(os.path.splitext(xmlfile)[0] + "*"):
                    os.remove(fl)

        if InformationPackage.objects.filter(pk=pk).exists():
            return super(InformationPackageViewSet, self).destroy(request, pk=pk)

        return Response(status=status.HTTP_204_NO_CONTENT)
Exemple #9
0
 def test_get_value_from_path_when_attribute_is_missing(self):
     """A path naming an attribute that does not exist yields None."""
     root_xml = objectify.fromstring(self.get_simple_xml())
     value = get_value_from_path(root_xml, "anmerkningar@non_existing_attr")
     self.assertEqual(value, None)
Exemple #10
0
 def test_get_value_from_path_when_path_is_none(self):
     """Passing None as the path yields None."""
     root_xml = objectify.fromstring(self.get_simple_xml())
     value = get_value_from_path(root_xml, None)
     self.assertEqual(value, None)
Exemple #11
0
def parse_submit_description(xmlfile, srcdir=''):
    """
    Parse a METS submit description into a dict describing the package.

    Args:
        xmlfile: path of the XML file to parse
        srcdir: directory the object path found in the file is joined onto

    Returns:
        dict with the keys ``id``, ``object_identifier_value``, ``label``,
        ``information_class``, ``altrecordids``, ``start_date``,
        ``end_date``, ``reference_codes`` and ``agents``. The keys
        ``create_date``/``entry_date``, ``object_path``/``object_size``,
        ``system_version`` and ``system_type`` are only present when the
        corresponding data exists in the file.

    Raises:
        ValueError: when the root element's local name is not ``mets``.
    """
    ip = {}
    doc = etree.parse(xmlfile)
    root = doc.getroot()

    if root.xpath('local-name()').lower() != 'mets':
        raise ValueError('%s is not a valid mets file' % xmlfile)

    try:
        # try getting objid with prefix
        ip['id'] = root.attrib['OBJID'].split(':')[1]
    except IndexError:
        # no prefix, try getting objid without prefix
        ip['id'] = root.attrib['OBJID']
    except KeyError:
        # no objid available, use the name of the xml file
        ip['id'] = os.path.splitext(os.path.basename(xmlfile))[0]

    ip['object_identifier_value'] = ip['id']
    ip['label'] = root.get('LABEL', '')

    try:
        ip['create_date'] = root.find("{*}metsHdr").get('CREATEDATE')
        ip['entry_date'] = ip['create_date']
    except AttributeError:
        # no metsHdr element, leave the dates out
        pass

    objpath = get_objectpath(root)

    if objpath:
        ip['object_path'] = os.path.join(srcdir, objpath)
        ip['object_size'] = os.stat(ip['object_path']).st_size

    ip['information_class'] = get_value_from_path(root, '@INFORMATIONCLASS')

    ip['altrecordids'] = get_altrecordids(root)

    ip['start_date'] = ip['altrecordids'].get('STARTDATE', [None])[0]
    ip['end_date'] = ip['altrecordids'].get('ENDDATE', [None])[0]

    codes = ip['altrecordids'].get('REFERENCECODE', [])
    ip['reference_codes'] = [parse_reference_code(code) for code in codes]

    if ip['information_class'] is None:
        # fall back to the alternative record ids
        try:
            ip['information_class'] = ip['altrecordids'].get(
                'INFORMATIONCLASS')[0]
        except TypeError:
            ip['information_class'] = None

    try:
        # keep the first whitespace separated token made up of digits only
        ip['information_class'] = [
            int(s) for s in ip['information_class'].split() if s.isdigit()
        ][0]
    except (KeyError, AttributeError):
        if ip['information_class'] is not None:
            raise

    ip['agents'] = {}
    for a in get_agents(root):
        other_role = a.get("ROLE") == 'OTHER'
        other_type = a.get("TYPE") == 'OTHER'
        agent_role = a.get("OTHERROLE") if other_role else a.get("ROLE")
        agent_type = a.get("OTHERTYPE") if other_type else a.get("TYPE")
        name = a.xpath('*[local-name()="name"]')[0].text
        notes = [n.text for n in a.xpath('*[local-name()="note"]')]

        agent_key = '{role}_{type}'.format(role=agent_role, type=agent_type)
        ip['agents'][agent_key] = {'name': name, 'notes': notes}

    # The first two notes of the software agent hold version and type.
    # Stored as plain values: the original trailing commas turned each of
    # them into a 1-tuple.
    for key, note_index in (('system_version', 0), ('system_type', 1)):
        try:
            ip[key] = get_agent(root,
                                ROLE='ARCHIVIST',
                                TYPE='OTHER',
                                OTHERTYPE='SOFTWARE')['notes'][note_index]
        except IndexError:
            pass

    return ip
Exemple #12
0
    def list(self, request):
        """
        List packages available in the reception and unidentified directories.

        The result combines, in this order:

        * one entry per ``*.xml`` file in either directory whose id has no
          ``InformationPackage`` in the database yet
        * one entry per ``*.tar``/``*.zip`` file in the unidentified
          directory that is not referenced by the ``FLocat`` href of any
          XML file in that directory
        * the serialized ``InformationPackage`` objects in state
          ``Receiving``

        The entries are sorted by the ``ordering`` query parameter (a
        leading ``-`` reverses the order) when every entry has that key,
        and paginated.
        """
        reception = Path.objects.get(entity="path_ingest_reception").value
        uip = Path.objects.get(entity="path_ingest_unidentified").value
        ips = []

        uip_xmlfiles = glob.glob(os.path.join(uip, "*.xml"))

        for xmlfile in glob.glob(os.path.join(reception, "*.xml")) + uip_xmlfiles:
            if os.path.isfile(xmlfile):
                srcdir = uip if xmlfile.startswith(uip) else reception

                ip = self.parseFile(xmlfile, srcdir)
                if not InformationPackage.objects.filter(id=ip['id']).exists():
                    ips.append(ip)

        container_files = glob.glob(os.path.join(uip, "*.tar")) + glob.glob(os.path.join(uip, "*.zip"))

        # Collect the objects referenced by the XML files once, instead of
        # re-parsing every XML file for every container.
        described = set()
        if container_files:
            for xmlfile in uip_xmlfiles:
                if os.path.isfile(xmlfile):
                    root = etree.parse(xmlfile).getroot()
                    el = root.xpath('.//*[local-name()="%s"]' % "FLocat")[0]
                    described.add(get_value_from_path(el, "@href").split('file:///')[1])

        for container_file in container_files:
            label = os.path.basename(container_file)

            if label in described:
                # already listed through its XML file
                continue

            ips.append({
                'Label': label,
                'CreateDate': str(timestamp_to_datetime(creation_date(container_file)).isoformat()),
                'State': 'Unidentified',
                'status': 0,
                'step_state': celery_states.SUCCESS,
            })

        from_db = InformationPackage.objects.filter(State='Receiving').prefetch_related(
            Prefetch('profileip_set', to_attr='profiles'),
        )
        serializer = InformationPackageSerializer(
            data=from_db, many=True, context={'request': request}
        )
        serializer.is_valid()
        ips.extend(serializer.data)

        try:
            ordering = request.query_params.get('ordering', '')
            reverse = ordering.startswith('-')
            ordering = remove_prefix(ordering, '-')
            ips = sorted(ips, key=lambda k: k[ordering], reverse=reverse)
        except KeyError:
            # at least one entry lacks the requested key, keep the order
            pass

        paginator = LinkHeaderPagination()
        page = paginator.paginate_queryset(ips, request)
        if page is not None:
            return paginator.get_paginated_response(page)

        return Response(ips)
Exemple #13
0
 def get_objectpath(self, el):
     """
     Return the object path referenced by the first ``FLocat`` below ``el``.

     The ``href`` value of the first element whose local name is ``FLocat``
     is returned with everything up to and including ``file:///`` removed.
     If the value does not contain ``file:///`` it is returned unchanged.

     Returns None when there is no ``FLocat`` element or when no ``href``
     value could be read from it.
     """
     matches = el.xpath('.//*[local-name()="%s"]' % "FLocat")
     if not matches:
         return None

     val = get_value_from_path(matches[0], "@href")
     if val is None:
         return None

     parts = val.split('file:///')
     return parts[1] if len(parts) > 1 else val
Exemple #14
0
def parse_submit_description(xmlfile, srcdir=''):
    """
    Parse a submit description XML file into a dict describing the package.

    Args:
        xmlfile: path of the XML file to parse
        srcdir: directory the object path found in the file is joined onto

    Returns:
        dict with the keys ``id``, ``object_identifier_value``, ``label``,
        ``create_date``, ``information_class`` (0 when none could be
        determined), ``altrecordids`` and ``reference_codes``. The keys
        ``object_path``/``object_size`` and the agent related keys
        (``archivist_organization``, ``creator_organization``,
        ``system_name``, ``system_version``, ...) are only present when the
        corresponding data exists in the file.
    """
    ip = {}
    doc = etree.parse(xmlfile)
    root = doc.getroot()

    objid = root.get('OBJID')
    try:
        # strip the prefix ("prefix:id")
        ip['id'] = objid.split(':')[1]
    except (IndexError, AttributeError):
        # no prefix (IndexError) or no OBJID at all (AttributeError)
        ip['id'] = objid

    ip['object_identifier_value'] = ip['id']
    ip['label'] = root.get('LABEL')
    ip['create_date'] = root.find("{*}metsHdr").get('CREATEDATE')

    objpath = get_objectpath(root)

    if objpath:
        ip['object_path'] = os.path.join(srcdir, objpath)
        ip['object_size'] = os.stat(ip['object_path']).st_size

    ip['information_class'] = get_value_from_path(root, '@INFORMATIONCLASS')

    ip['altrecordids'] = get_altrecordids(root)

    codes = ip['altrecordids'].get('REFERENCECODE', [])
    ip['reference_codes'] = [parse_reference_code(code) for code in codes]

    if ip['information_class'] is None:
        # fall back to the alternative record ids
        try:
            ip['information_class'] = ip['altrecordids'].get('INFORMATIONCLASS')[0]
        except (TypeError, IndexError):
            ip['information_class'] = None

    try:
        # keep the first whitespace separated token made up of digits only
        ip['information_class'] = [int(s) for s in ip['information_class'].split() if s.isdigit()][0]
    except (KeyError, AttributeError, IndexError):
        # missing value, or a value without any numeric token
        ip['information_class'] = 0

    # A failed agent lookup surfaces as TypeError when subscripting the
    # result; the matching key is then left out.
    try:
        ip['archivist_organization'] = {
            'name': get_agent(root, ROLE='ARCHIVIST', TYPE='ORGANIZATION')['name']
        }
    except TypeError:
        pass

    software_agent = dict(ROLE='ARCHIVIST', TYPE='OTHER', OTHERTYPE='SOFTWARE')

    # result key -> attributes identifying the agent whose name is stored
    agent_names = (
        ('creator_organization', dict(ROLE='CREATOR', TYPE='ORGANIZATION')),
        ('submitter_organization', dict(ROLE='OTHER', OTHERROLE='SUBMITTER', TYPE='ORGANIZATION')),
        ('submitter_individual', dict(ROLE='OTHER', OTHERROLE='SUBMITTER', TYPE='INDIVIDUAL')),
        ('producer_organization', dict(ROLE='OTHER', OTHERROLE='PRODUCER', TYPE='ORGANIZATION')),
        ('producer_individual', dict(ROLE='OTHER', OTHERROLE='PRODUCER', TYPE='INDIVIDUAL')),
        ('ipowner_organization', dict(ROLE='IPOWNER', TYPE='ORGANIZATION')),
        ('preservation_organization', dict(ROLE='PRESERVATION', TYPE='ORGANIZATION')),
        ('system_name', software_agent),
    )

    for key, criteria in agent_names:
        try:
            ip[key] = get_agent(root, **criteria)['name']
        except TypeError:
            pass

    # The first two notes of the software agent hold version and type.
    # Stored as plain values: the original trailing commas turned each of
    # them into a 1-tuple. IndexError covers an agent with too few notes.
    for key, note_index in (('system_version', 0), ('system_type', 1)):
        try:
            ip[key] = get_agent(root, **software_agent)['notes'][note_index]
        except (TypeError, IndexError):
            pass

    return ip