def run(self, ip=None, xmlfile=None, validate_fileformat=True, validate_integrity=True, rootdir=None):
    '''
    Create a parallel "Validate Files" step holding one task per requested
    validation of every file element found in xmlfile, then run the step.

    args:
        ip: information package attached to every created task; its
            ObjectPath is used as rootdir when rootdir is None
        xmlfile: XML file to read the file elements from
        validate_fileformat: create a file format task for every element
            that has a format value
        validate_integrity: create a checksum task for every element that
            has a checksum value
        rootdir: directory the paths found in xmlfile are joined onto

    returns:
        the result of running the created step
    '''
    step = ProcessStep.objects.create(
        name="Validate Files",
        parallel=True,
        parent_step=self.taskobj.processstep
    )

    def add_task(task_name, params):
        # every validation task shares the log, package and responsible
        # user of the task that spawned it
        step.tasks.add(ProcessTask.objects.create(
            name=task_name,
            params=params,
            log=self.taskobj.log,
            information_package=ip,
            responsible=self.taskobj.responsible,
        ))

    if validate_fileformat or validate_integrity:
        if rootdir is None:
            rootdir = ip.ObjectPath

        doc = etree.ElementTree(file=xmlfile)

        # .items() is available on both Python 2 and 3, .iteritems() is not
        for elname, props in settings.FILE_ELEMENTS.items():
            for el in doc.xpath('.//*[local-name()="%s"]' % elname):
                fpath = get_value_from_path(el, props["path"])

                if not fpath:
                    # without a path there is no file name to hand to the
                    # validation tasks; joining None onto rootdir would fail
                    continue

                fpath = remove_prefix(fpath, props.get("pathprefix", ""))
                filename = os.path.join(rootdir, fpath)

                fformat = get_value_from_path(el, props.get("format"))
                checksum = get_value_from_path(el, props.get("checksum"))
                algorithm = get_value_from_path(el, props.get("checksumtype"))

                if validate_fileformat and fformat is not None:
                    add_task(self.fileformat_task, {
                        "filename": filename,
                        "fileformat": fformat,
                    })

                if validate_integrity and checksum is not None:
                    add_task(self.checksum_task, {
                        "filename": filename,
                        "checksum": checksum,
                        "algorithm": algorithm,
                    })

    # the log has been handed over to the created tasks, detach it from
    # this task before reporting completion
    self.taskobj.log = None
    self.taskobj.save(update_fields=['log'])
    self.set_progress(100, total=100)

    with allow_join_result():
        return step.run().get()
def path(self, path):
    '''
    Resolve, clean up and store the path of this file element.

    args:
        path: explicit path, or None to look the path up on self.el using
              the 'path' entry (one expression or a list of expressions)
              of self.props

    The longest matching entry of the 'pathprefix' property is removed,
    the first path segment is dropped when 'path_includes_root' is set,
    leading slashes and spaces are stripped and the result of
    normalize_path is stored in self._path.
    '''
    if path is None:
        candidates = self.props.get('path', [''])
        if isinstance(candidates, str):
            candidates = [candidates]
        self.paths = candidates

        # the first expression that yields a value wins
        for expression in candidates:
            path = get_value_from_path(self.el, expression)
            if path is not None:
                break

    self.path_prefix = self.props.get('pathprefix', [])
    longest_first = sorted(self.path_prefix, key=len, reverse=True)
    for candidate in longest_first:
        stripped = remove_prefix(path, candidate)
        if stripped != path:
            # only one prefix is ever removed
            path = stripped
            break

    if self.props.get('path_includes_root', False):
        segments = path.split('/', 1)
        path = segments[-1]

    self._path = normalize_path(path.lstrip('/ '))
def __init__(self, el, props, path=None, rootdir=None):
    '''
    Read the file properties described by props from el.

    args:
        el: lxml.etree._Element
        props: 'dict with properties from FILE_ELEMENTS'
        path: value assigned to self.path
        rootdir: accepted but not used here
    '''
    self.el = el
    self.props = props
    # el and props are stored before path is assigned
    self.path = path

    def read(key):
        # value of the property named key, None when it cannot be found
        return get_value_from_path(el, props.get(key, ''))

    checksum = read('checksum')
    self.checksum = None if checksum is None else checksum.lower()

    checksum_type = read('checksumtype')
    self.checksum_type = None if checksum_type is None else checksum_type.lower()

    size = read('size')
    self.size = None if size is None else int(size)

    self.format = read('format')
def __init__(self, el, props, path=None, rootdir=None):
    '''
    Read the path and the file properties described by props from el.

    args:
        el: lxml.etree._Element
        props: 'dict with properties from FILE_ELEMENTS'
        path: explicit path, looked up on el when None
        rootdir: accepted but not used here
    '''
    def read(key):
        # value of the property named key, None when it cannot be found
        return get_value_from_path(el, props.get(key, ''))

    resolved = path
    if resolved is None:
        expressions = props.get('path', [''])
        if isinstance(expressions, six.string_types):
            expressions = [expressions]
        self.paths = expressions

        # the first expression that yields a value wins
        for expression in expressions:
            resolved = get_value_from_path(el, expression)
            if resolved is not None:
                break

    self.path_prefix = props.get('pathprefix', [])
    for candidate in sorted(self.path_prefix, key=len, reverse=True):
        trimmed = remove_prefix(resolved, candidate)
        if trimmed != resolved:
            # only the longest matching prefix is removed
            resolved = trimmed
            break

    if props.get('path_includes_root', False):
        resolved = resolved.split('/', 1)[-1]

    self.path = resolved.lstrip('/ ')

    checksum = read('checksum')
    self.checksum = None if checksum is None else checksum.lower()

    checksum_type = read('checksumtype')
    self.checksum_type = None if checksum_type is None else checksum_type.lower()

    size = read('size')
    self.size = None if size is None else int(size)

    self.format = read('format')
def get_objectpath(el):
    '''
    Return the object path referenced by the first FLocat element below el.

    args:
        el: element providing xpath(), searched for FLocat descendants

    returns:
        the part of the href attribute following 'file:///', the whole
        href when it does not contain that marker, or None when there is
        no FLocat element or it has no href value
    '''
    flocats = el.xpath('.//*[local-name()="%s"]' % "FLocat")
    if not flocats:
        return None

    href = get_value_from_path(flocats[0], "@href")
    if href is None:
        # no href to take a path from; splitting None would raise
        return None

    parts = href.split('file:///')
    return parts[1] if len(parts) > 1 else href
def __init__(self, el, props):
    '''
    Read the path and the file properties described by props from el.

    args:
        el: lxml.etree._Element
        props: 'dict with properties from FILE_ELEMENTS'
    '''
    def read(key):
        # value of the property named key as found on el
        return get_value_from_path(el, props.get(key, ''))

    located = read('path')

    self.path_prefix = props.get('pathprefix', [])
    for candidate in sorted(self.path_prefix, key=len, reverse=True):
        trimmed = remove_prefix(located, candidate)
        if trimmed != located:
            # only the longest matching prefix is removed
            located = trimmed
            break

    self.path = located.lstrip('/ ')

    self.checksum = read('checksum')
    self.checksum_type = read('checksumtype')
    self.format = read('format')
def run(self, dirname=None, files=None, files_reldir=None, xmlfile=None):
    '''
    Verify that the files described in xmlfile are exactly the files that
    physically exist.

    args:
        dirname: directory walked recursively to collect physical files;
                 xmlfile itself is left out of the comparison
        files: additional physical files
        files_reldir: when given, every entry of files is made relative
                      to this directory
        xmlfile: XML file listing the logical files

    returns:
        "Success" when both sets are equal

    raises:
        AssertionError: when the logical and physical sets differ
    '''
    if dirname:
        # normalised the same way as the walked files below so that the
        # two can be compared
        xmlrelpath = os.path.relpath(xmlfile, dirname)
        xmlrelpath = win_to_posix(xmlrelpath)
        xmlrelpath = remove_prefix(xmlrelpath, "./")
    else:
        xmlrelpath = xmlfile

    doc = etree.ElementTree(file=xmlfile)

    logical_files = set()
    physical_files = set()

    # .items() is available on both Python 2 and 3, .iteritems() is not
    for elname, props in settings.FILE_ELEMENTS.items():
        for el in doc.xpath('.//*[local-name()="%s"]' % elname):
            filename = get_value_from_path(el, props["path"])

            if filename:
                filename = remove_prefix(filename, props.get("pathprefix", ""))
                logical_files.add(filename)

    if dirname:
        for current_dir, _dirs, filenames in os.walk(dirname):
            reldir = os.path.relpath(current_dir, dirname)

            for name in filenames:
                relfile = os.path.join(reldir, name)
                relfile = win_to_posix(relfile)
                relfile = remove_prefix(relfile, "./")

                # compare relative paths, not bare names: the XML file may
                # live in a subdirectory and other directories may hold a
                # file with the same name
                if relfile != xmlrelpath:
                    physical_files.add(relfile)

    for f in files or ():
        if files_reldir:
            f = os.path.relpath(f, files_reldir)
        physical_files.add(f)

    # raised explicitly so the check survives running with -O
    if logical_files != physical_files:
        raise AssertionError("the logical representation differs from the physical")

    self.set_progress(100, total=100)
    return "Success"
def destroy(self, request, pk=None):
    '''
    Remove the received object and the XML files belonging to pk from the
    reception (or unidentified) directory, then delete the information
    package with that pk if one exists.

    args:
        request: the incoming request, forwarded to the parent destroy
        pk: name (without extension) of the XML file describing the
            object, also used to look up the information package

    returns:
        the parent destroy response when an information package with this
        pk exists, otherwise an empty 204 response
    '''
    reception = Path.objects.get(entity="path_ingest_reception").value
    uip = Path.objects.get(entity="path_ingest_unidentified").value

    # NOTE(review): pk and the href read from the XML are joined into
    # paths that get deleted without any validation visible here -
    # confirm they are validated upstream
    srcdir = reception
    xmlfile = os.path.join(reception, "%s.xml" % pk)

    if not os.path.isfile(xmlfile):
        srcdir = uip
        xmlfile = os.path.join(uip, "%s.xml" % pk)

    if os.path.isfile(xmlfile):
        root = etree.parse(xmlfile).getroot()
        flocats = root.xpath('.//*[local-name()="%s"]' % "FLocat")
        href = get_value_from_path(flocats[0], "@href") if flocats else None

        objpath = None
        if href:
            parts = href.split('file:///')
            objpath = parts[1] if len(parts) > 1 else href

        try:
            # an empty object path would make the join point at srcdir
            # itself, so only a non-empty one is removed
            if objpath:
                path = os.path.join(srcdir, objpath)

                try:
                    shutil.rmtree(path)
                except OSError as e:
                    if e.errno not in (errno.ENOENT, errno.ENOTDIR):
                        raise

                    # not a directory: remove it as a file, and accept
                    # that it may already be gone
                    try:
                        os.remove(path)
                    except OSError as remove_error:
                        if remove_error.errno != errno.ENOENT:
                            raise
        finally:
            # NOTE(review): the pattern also matches files of any other
            # package whose name starts with pk - confirm names cannot
            # overlap like that
            for fl in glob.glob(os.path.splitext(xmlfile)[0] + "*"):
                os.remove(fl)

    if InformationPackage.objects.filter(pk=pk).exists():
        return super(InformationPackageViewSet, self).destroy(request, pk=pk)

    return Response(status=status.HTTP_204_NO_CONTENT)
def test_get_value_from_path_when_attribute_is_missing(self):
    '''
    The path "anmerkningar@non_existing_attr" on the simple XML document
    is expected to give None.
    '''
    root_xml = objectify.fromstring(self.get_simple_xml())

    value = get_value_from_path(root_xml, "anmerkningar@non_existing_attr")

    self.assertEqual(value, None)
def test_get_value_from_path_when_path_is_none(self):
    '''
    Passing None as the path on the simple XML document is expected to
    give None.
    '''
    root_xml = objectify.fromstring(self.get_simple_xml())

    value = get_value_from_path(root_xml, None)

    self.assertEqual(value, None)
def parse_submit_description(xmlfile, srcdir=''):
    '''
    Parse a mets submit description into a dict describing the
    information package.

    args:
        xmlfile: the mets file to parse
        srcdir: directory the object path found in the file is joined onto

    returns:
        dict with the values found in the file; keys whose value cannot
        be found (create/entry date, object path and size, system version
        and type) are left out

    raises:
        ValueError: when the root element of xmlfile is not a mets element
    '''
    ip = {}
    doc = etree.parse(xmlfile)
    root = doc.getroot()

    if root.xpath('local-name()').lower() != 'mets':
        raise ValueError('%s is not a valid mets file' % xmlfile)

    try:
        # try getting objid with prefix
        ip['id'] = root.attrib['OBJID'].split(':')[1]
    except IndexError:
        # no prefix, try getting objid without prefix
        ip['id'] = root.attrib['OBJID']
    except KeyError:
        # no objid available, use the name of the xml file
        ip['id'] = os.path.splitext(os.path.basename(xmlfile))[0]

    ip['object_identifier_value'] = ip['id']
    ip['label'] = root.get('LABEL', '')

    try:
        ip['create_date'] = root.find("{*}metsHdr").get('CREATEDATE')
        ip['entry_date'] = ip['create_date']
    except AttributeError:
        # no metsHdr element
        pass

    objpath = get_objectpath(root)

    if objpath:
        ip['object_path'] = os.path.join(srcdir, objpath)
        ip['object_size'] = os.stat(ip['object_path']).st_size

    ip['information_class'] = get_value_from_path(root, '@INFORMATIONCLASS')

    ip['altrecordids'] = get_altrecordids(root)

    ip['start_date'] = ip['altrecordids'].get('STARTDATE', [None])[0]
    ip['end_date'] = ip['altrecordids'].get('ENDDATE', [None])[0]

    codes = ip['altrecordids'].get('REFERENCECODE', [])
    ip['reference_codes'] = [parse_reference_code(code) for code in codes]

    if ip['information_class'] is None:
        try:
            ip['information_class'] = ip['altrecordids'].get(
                'INFORMATIONCLASS')[0]
        except TypeError:
            ip['information_class'] = None

    try:
        # keep the first whitespace separated token made up of digits
        ip['information_class'] = [
            int(s) for s in ip['information_class'].split() if s.isdigit()
        ][0]
    except (KeyError, AttributeError):
        if ip['information_class'] is not None:
            raise

    ip['agents'] = {}
    for a in get_agents(root):
        other_role = a.get("ROLE") == 'OTHER'
        other_type = a.get("TYPE") == 'OTHER'
        agent_role = a.get("OTHERROLE") if other_role else a.get("ROLE")
        agent_type = a.get("OTHERTYPE") if other_type else a.get("TYPE")
        name = a.xpath('*[local-name()="name"]')[0].text
        notes = [n.text for n in a.xpath('*[local-name()="note"]')]
        ip['agents']['{role}_{type}'.format(role=agent_role, type=agent_type)] = {
            'name': name,
            'notes': notes
        }

    # the first two notes of the archivist software agent hold the system
    # version and the system type
    for key, note_index in (('system_version', 0), ('system_type', 1)):
        try:
            # stored as the note itself, not wrapped in a 1-tuple
            ip[key] = get_agent(
                root, ROLE='ARCHIVIST', TYPE='OTHER', OTHERTYPE='SOFTWARE'
            )['notes'][note_index]
        except (IndexError, TypeError):
            # the agent has too few notes, or no usable agent was returned
            pass

    return ip
def list(self, request):
    '''
    List the packages waiting in reception: those described by XML files
    in the reception and unidentified directories that are not yet in the
    database, container files in the unidentified directory that no XML
    file there refers to, and database packages in state 'Receiving'.

    args:
        request: the incoming request; its 'ordering' query parameter
                 names the key to sort by, a leading '-' reverses the order

    returns:
        a paginated response when the paginator produces a page,
        otherwise a response holding the complete list
    '''
    reception = Path.objects.get(entity="path_ingest_reception").value
    uip = Path.objects.get(entity="path_ingest_unidentified").value
    ips = []

    # each file is paired with the directory it was globbed from; guessing
    # the directory from the file name breaks when one directory path is a
    # string prefix of the other
    for srcdir in (reception, uip):
        for xmlfile in glob.glob(os.path.join(srcdir, "*.xml")):
            if not os.path.isfile(xmlfile):
                continue

            ip = self.parseFile(xmlfile, srcdir)
            if not InformationPackage.objects.filter(id=ip['id']).exists():
                ips.append(ip)

    container_files = (
        glob.glob(os.path.join(uip, "*.tar")) +
        glob.glob(os.path.join(uip, "*.zip"))
    )

    # names of the objects referred to by the XML files in the
    # unidentified directory, collected once for all container files
    described = set()
    if container_files:
        for xmlfile in glob.glob(os.path.join(uip, "*.xml")):
            if not os.path.isfile(xmlfile):
                continue

            root = etree.parse(xmlfile).getroot()
            flocats = root.xpath('.//*[local-name()="%s"]' % "FLocat")
            if not flocats:
                continue

            href = get_value_from_path(flocats[0], "@href")
            if href is None:
                continue

            parts = href.split('file:///')
            described.add(parts[1] if len(parts) > 1 else href)

    for container_file in container_files:
        label = os.path.basename(container_file)

        if label in described:
            # already listed through its XML file
            continue

        ips.append({
            'Label': label,
            'CreateDate': str(timestamp_to_datetime(creation_date(container_file)).isoformat()),
            'State': 'Unidentified',
            'status': 0,
            'step_state': celery_states.SUCCESS,
        })

    from_db = InformationPackage.objects.filter(State='Receiving').prefetch_related(
        Prefetch('profileip_set', to_attr='profiles'),
    )
    serializer = InformationPackageSerializer(
        data=from_db, many=True, context={'request': request}
    )
    serializer.is_valid()
    ips.extend(serializer.data)

    try:
        ordering = request.query_params.get('ordering', '')
        reverse = ordering.startswith('-')
        ordering = remove_prefix(ordering, '-')
        ips = sorted(ips, key=lambda k: k[ordering], reverse=reverse)
    except KeyError:
        # an entry lacks the requested key, leave the list unsorted
        pass

    paginator = LinkHeaderPagination()
    page = paginator.paginate_queryset(ips, request)
    if page is not None:
        return paginator.get_paginated_response(page)

    return Response(ips)
def get_objectpath(self, el):
    '''
    Return the object path referenced by the first FLocat element below el.

    args:
        el: element providing xpath(), searched for FLocat descendants

    returns:
        the part of the href attribute following 'file:///', the whole
        href when it does not contain that marker, or None when there is
        no FLocat element or it has no href value
    '''
    flocats = el.xpath('.//*[local-name()="%s"]' % "FLocat")
    if not flocats:
        # indexing the empty result would raise before anything could be
        # returned
        return None

    href = get_value_from_path(flocats[0], "@href")
    if href is None:
        return None

    parts = href.split('file:///')
    return parts[1] if len(parts) > 1 else href
def parse_submit_description(xmlfile, srcdir=''):
    '''
    Parse a submit description into a dict describing the information
    package.

    args:
        xmlfile: the XML file to parse
        srcdir: directory the object path found in the file is joined onto

    returns:
        dict with the values found in the file; the agent related keys
        and the object path and size are left out when they cannot be
        found
    '''
    ip = {}
    doc = etree.parse(xmlfile)
    root = doc.getroot()

    try:
        # objid with prefix
        ip['id'] = root.get('OBJID').split(':')[1]
    except (AttributeError, IndexError):
        # no objid at all, or no prefix to split off
        ip['id'] = root.get('OBJID')

    ip['object_identifier_value'] = ip['id']
    ip['label'] = root.get('LABEL')
    ip['create_date'] = root.find("{*}metsHdr").get('CREATEDATE')

    objpath = get_objectpath(root)

    if objpath:
        ip['object_path'] = os.path.join(srcdir, objpath)
        ip['object_size'] = os.stat(ip['object_path']).st_size

    ip['information_class'] = get_value_from_path(root, '@INFORMATIONCLASS')

    ip['altrecordids'] = get_altrecordids(root)

    codes = ip['altrecordids'].get('REFERENCECODE', [])
    ip['reference_codes'] = [parse_reference_code(code) for code in codes]

    if ip['information_class'] is None:
        try:
            ip['information_class'] = ip['altrecordids'].get('INFORMATIONCLASS')[0]
        except TypeError:
            ip['information_class'] = None

    try:
        # keep the first whitespace separated token made up of digits
        ip['information_class'] = [
            int(s) for s in ip['information_class'].split() if s.isdigit()
        ][0]
    except (KeyError, AttributeError, IndexError):
        # no value, or no digits in it
        ip['information_class'] = 0

    try:
        ip['archivist_organization'] = {
            'name': get_agent(root, ROLE='ARCHIVIST', TYPE='ORGANIZATION')['name']
        }
    except TypeError:
        pass

    software = {'ROLE': 'ARCHIVIST', 'TYPE': 'OTHER', 'OTHERTYPE': 'SOFTWARE'}

    # key in the result -> attributes identifying the agent whose name
    # is stored under that key
    agent_names = (
        ('creator_organization', {'ROLE': 'CREATOR', 'TYPE': 'ORGANIZATION'}),
        ('submitter_organization', {'ROLE': 'OTHER', 'OTHERROLE': 'SUBMITTER', 'TYPE': 'ORGANIZATION'}),
        ('submitter_individual', {'ROLE': 'OTHER', 'OTHERROLE': 'SUBMITTER', 'TYPE': 'INDIVIDUAL'}),
        ('producer_organization', {'ROLE': 'OTHER', 'OTHERROLE': 'PRODUCER', 'TYPE': 'ORGANIZATION'}),
        ('producer_individual', {'ROLE': 'OTHER', 'OTHERROLE': 'PRODUCER', 'TYPE': 'INDIVIDUAL'}),
        ('ipowner_organization', {'ROLE': 'IPOWNER', 'TYPE': 'ORGANIZATION'}),
        ('preservation_organization', {'ROLE': 'PRESERVATION', 'TYPE': 'ORGANIZATION'}),
        ('system_name', software),
    )

    for key, criteria in agent_names:
        try:
            ip[key] = get_agent(root, **criteria)['name']
        except TypeError:
            # no usable agent was returned
            pass

    # the first two notes of the archivist software agent hold the system
    # version and the system type
    for key, note_index in (('system_version', 0), ('system_type', 1)):
        try:
            # stored as the note itself, not wrapped in a 1-tuple
            ip[key] = get_agent(root, **software)['notes'][note_index]
        except (TypeError, IndexError):
            # no usable agent was returned, or it has too few notes
            pass

    return ip