Exemple #1
0
    def in_valid_paths(self, root, path, valid_paths):
        """Check that ``path`` is matched by one of the ``valid_paths`` patterns.

        String entries are used as standalone glob patterns. Non-string
        entries are used as groups of patterns: when ``path`` matches one
        pattern of a group, each wildcard match is substituted for the first
        ``*`` of the other patterns in that group and the resulting paths
        must be existing files.

        Args:
            root: Directory that the paths in error messages are made
                relative to.
            path: Path of the file to check; compared against the normalized
                glob results.
            valid_paths: Glob patterns (strings) and groups of glob patterns
                (iterables of strings).

        Returns:
            True when ``path`` matches a pattern and, for grouped patterns,
            every related file exists.

        Raises:
            ValidationError: If a related file is missing, or if ``path``
                matches none of the patterns.
        """
        single_patterns = [p for p in valid_paths if isinstance(p, str)]
        pattern_groups = [p for p in valid_paths if not isinstance(p, str)]

        for pattern in single_patterns:
            if path in [normalize_path(hit) for hit in glob(pattern)]:
                return True

        for group in pattern_groups:
            for pattern in group:
                for candidate, wildcard_hits in iglob(pattern,
                                                      with_matches=True):
                    candidate = normalize_path(candidate)
                    if candidate != path:
                        continue

                    # The file matched this pattern; every other pattern of
                    # the group must resolve to an existing file as well.
                    for hit in map(normalize_path, wildcard_hits):
                        for sibling in group:
                            if sibling == candidate:
                                continue

                            expanded = sibling.replace('*', hit, 1)
                            if os.path.isfile(expanded):
                                continue

                            rel_path = normalize_path(
                                os.path.relpath(path, root))
                            rel_related_path = normalize_path(
                                os.path.relpath(expanded, root))
                            raise ValidationError(
                                '{file} missing related file {related}'.format(
                                    file=rel_path, related=rel_related_path))

                    return True

        raise ValidationError('{file} is not allowed'.format(file=path))
Exemple #2
0
    def validate(self, filepath, expected=None):
        """Validate the identified format of ``filepath`` against ``expected``.

        A ``Validation`` row is created before identification starts and is
        completed (time done, passed flag, message) whether the validation
        succeeds or fails.

        Args:
            filepath: Path of the file passed to
                ``self.fid.identify_file_format``.
            expected: A ``(name, version, registry key)`` triple. Falsy
                elements are not compared, but at least one element must not
                be ``None``.

        Raises:
            ValueError: If ``expected`` is ``None`` or all of its elements
                are ``None``.
            ValidationError: If an expected value differs from the value
                identified for the file.
        """
        required_msg = (
            'At least one of name, version and registry key is required')

        # Reject unusable arguments before any logging or database work, so a
        # missing ``expected`` gives the same ValueError as an empty triple
        # instead of a TypeError from tuple unpacking.
        if expected is None:
            raise ValueError(required_msg)

        name, version, reg_key = expected
        if all(f is None for f in (name, version, reg_key)):
            raise ValueError(required_msg)

        logger.debug('Validating format of %s' % filepath)

        val_obj = Validation.objects.create(filename=filepath,
                                            time_started=timezone.now(),
                                            validator=self.__class__.__name__,
                                            required=self.required,
                                            task=self.task,
                                            information_package=self.ip,
                                            responsible=self.responsible,
                                            specification={
                                                'context': self.context,
                                                'options': self.options,
                                            })

        passed = False
        try:
            actual_name, actual_version, actual_reg_key = self.fid.identify_file_format(
                filepath)
            if name and name != actual_name:
                raise ValidationError(
                    "format name for {} is not valid, ({} != {})".format(
                        filepath, name, actual_name))
            if version and version != actual_version:
                raise ValidationError(
                    "format version for {} is not valid, ({} != {})".format(
                        filepath, version, actual_version))
            if reg_key and reg_key != actual_reg_key:
                raise ValidationError(
                    "format registry key for {} is not valid, ({} != {})".
                    format(filepath, reg_key, actual_reg_key))

            passed = True
        except Exception:
            # Record the traceback for every failure, including errors raised
            # by the format identifier itself, so the saved row is never left
            # without a message.
            val_obj.message = traceback.format_exc()
            raise
        else:
            message = 'Successfully validated format of %s' % filepath
            val_obj.message = message
            logger.info(message)
        finally:
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
Exemple #3
0
    def validate(self, filepath, expected=None):
        """Run Mediaconch on ``filepath`` and record the outcome.

        A ``Validation`` row is created up front and completed (time done,
        passed flag, message) whether the validation succeeds or fails.

        Args:
            filepath: Path of the file passed to ``run_mediaconch``.
            expected: Not used by this validator.

        Returns:
            The Mediaconch report as serialized by ``etree.tostring`` with an
            XML declaration and UTF-8 encoding.

        Raises:
            ValidationError: If Mediaconch exits with a non-zero return code,
                or if ``get_outcome`` reports that the file did not pass.
        """
        logger.debug("Validating %s with Mediaconch" % filepath)

        specification = {
            'context': self.context,
            'options': self.options,
        }
        record = Validation.objects.create(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification=specification,
        )

        passed = False
        try:
            report, stderr, exit_code = run_mediaconch(filepath,
                                                       policy=self.context)

            if exit_code:
                failure = "Mediaconch validation of %s failed, %s" % (
                    filepath, stderr)
                logger.warning(failure)
                raise ValidationError(stderr)

            report_root = etree.XML(
                report, parser=etree.XMLParser(remove_blank_text=True))

            passed = get_outcome(report_root)
            message = etree.tostring(report_root,
                                     xml_declaration=True,
                                     encoding='UTF-8')

            if not passed:
                failure = "Mediaconch validation of %s failed, %s" % (
                    filepath, message)
                logger.warning(failure)
                raise ValidationError(message)
        except Exception:
            record.message = traceback.format_exc()
            raise
        else:
            record.message = message
            logger.info("Successful Mediaconch validation of %s" % filepath)
        finally:
            # Runs on success and on failure, so the row is always completed.
            record.time_done = timezone.now()
            record.passed = passed
            record.save(update_fields=['time_done', 'passed', 'message'])

        return message
Exemple #4
0
    def validate_folder(self, path, node):
        """Validate the files found under ``path`` against the rules in ``node``.

        Args:
            path: Directory that is walked.
            node: Mapping with the optional keys ``valid_paths`` (glob
                patterns or groups of glob patterns, relative to ``path``),
                ``allow_empty`` (defaults to True) and ``required_files``.
                Patterns and required files are formatted with ``self.data``.
                ``node`` is not modified.

        Raises:
            ValidationError: If a file is rejected by ``in_valid_paths`` and
                is not accepted by ``update_required_files`` either, if the
                folder contains no files while ``allow_empty`` is false, or
                if entries remain in the required-files list after the walk.
        """
        allow_empty = node.get('allow_empty', True)
        required_files = [
            normalize_path(req.format(**self.data))
            for req in node.get('required_files', [])
        ]

        def resolve(pattern):
            # Anchor the pattern at ``path`` and fill in the template values.
            return normalize_path(
                os.path.join(path, pattern).format(**self.data))

        # Build resolved copies instead of writing the results back into
        # ``node``: the specification belongs to the caller, and mutating it
        # would make a later run see already-resolved paths.
        valid_paths = [
            resolve(valid) if isinstance(valid, str)
            else [resolve(nested) for nested in valid]
            for valid in node.get('valid_paths', [])
        ]

        file_count = 0
        for root, _dirs, files in walk(path):
            rel_root = os.path.relpath(root, path)
            for f in files:
                file_count += 1
                if valid_paths:
                    try:
                        self.in_valid_paths(
                            path, normalize_path(os.path.join(root, f)),
                            valid_paths)
                    except ValidationError as validation_exc:
                        # A file outside the valid paths is still accepted
                        # when update_required_files accepts it.
                        try:
                            self.update_required_files(
                                rel_root, f, required_files)
                        except ValueError:
                            raise validation_exc

                if required_files:
                    try:
                        self.update_required_files(rel_root, f,
                                                   required_files)
                    except ValueError:
                        # Deliberately ignored: presumably the file is simply
                        # not one of the outstanding required files
                        # (TODO confirm against update_required_files).
                        pass

        if not allow_empty and file_count == 0:
            raise ValidationError(
                '{path} is not allowed to be empty'.format(path=path))

        if required_files:
            raise ValidationError('Missing {files} in {path}'.format(
                files=','.join(required_files), path=path))
Exemple #5
0
    def validate(self, filepath, expected=None):
        """Check that ``filepath`` parses as XML and record the result.

        Args:
            filepath: Path of the file passed to ``etree.parse``.
            expected: Not used by this validator.

        Raises:
            ValidationError: If parsing raises ``etree.XMLSyntaxError``. One
                failed ``Validation`` row is stored per entry in the
                exception's error log.
            Exception: Any other error raised while parsing is stored as a
                failed ``Validation`` row and re-raised unchanged.
        """
        logger.debug('Validating syntax of {xml}'.format(xml=filepath))

        etree.clear_error_log()
        started = timezone.now()

        def shared_fields():
            # Fields that are identical on every row written by this method.
            return {
                'validator': self.__class__.__name__,
                'filename': filepath,
                'time_started': started,
                'information_package_id': self.ip,
                'task': self.task,
            }

        try:
            etree.parse(filepath)
        except etree.XMLSyntaxError as syntax_error:
            failure = 'Syntax validation of {xml} failed'.format(xml=filepath)
            logger.exception(failure)
            finished = timezone.now()
            rows = [
                Validation(
                    passed=False,
                    message='{line}: {msg}'.format(line=entry.line,
                                                   msg=entry.message),
                    time_done=finished,
                    **shared_fields()
                )
                for entry in syntax_error.error_log
            ]

            Validation.objects.bulk_create(rows, 100)
            raise ValidationError(failure,
                                  errors=[row.message for row in rows])
        except Exception as unexpected:
            logger.exception(
                'Unknown error during syntax validation of {xml}'.format(
                    xml=filepath))
            finished = timezone.now()
            Validation.objects.create(
                passed=False,
                message=str(unexpected),
                time_done=finished,
                **shared_fields()
            )
            raise

        Validation.objects.create(
            passed=True,
            time_done=timezone.now(),
            **shared_fields()
        )
        logger.info(
            "Successful syntax validation of {xml}".format(xml=filepath))
Exemple #6
0
    def validate(self, path, expected=None):
        """Compare the files found for ``path`` with the context XML file.

        The checksum registered for ``path`` itself (if any) is taken out of
        ``self.deleted`` and ``self.present`` while the files are validated
        and is put back afterwards.

        Args:
            path: Path passed to ``find_files`` and ``find_pointers``.
            expected: Not used by this validator.

        Raises:
            ValidationError: If any file was counted as added, changed,
                renamed or deleted.
        """
        xmlfile = self.context
        self._reset_dicts()
        self._reset_counters()
        logger.debug(u'Validating {path} against {xml}'.format(path=path,
                                                               xml=xmlfile))
        results = []
        own_checksum = self.checksums.get(path)

        if own_checksum:
            try:
                self._pop_checksum_dict(self.deleted, own_checksum, path)
                self._pop_checksum_dict(self.present, own_checksum, path)
            except (KeyError, ValueError):
                pass

        # The XML file itself and every pointer found under ``path`` are
        # excluded from the comparison.
        ignored = [os.path.relpath(xmlfile, self.rootdir)]
        ignored += [pointer.path for pointer in find_pointers(path)]
        ignored = [normalize_path(entry) for entry in ignored]

        for found in find_files(path, rootdir=self.rootdir,
                                skip_files=ignored):
            if found not in self.exclude:
                results.append(self._validate(found))

        delete_count = self._validate_deleted_files(results)
        self._validate_present_files(results)

        if own_checksum:
            for registry in (self.deleted, self.present):
                registry.setdefault(own_checksum, []).append(path)

        results = [entry for entry in results if entry is not None]
        Validation.objects.bulk_create(results, batch_size=100)

        if delete_count + self.added + self.changed + self.renamed > 0:
            template = (
                'Comparison of {path} against {xml} failed: '
                '{cfmd} confirmed, {a} added, {c} changed, {r} renamed, {d} deleted'
            )
            msg = template.format(path=path,
                                  xml=self.context,
                                  cfmd=self.confirmed,
                                  a=self.added,
                                  c=self.changed,
                                  r=self.renamed,
                                  d=delete_count)
            logger.warning(msg)
            raise ValidationError(msg)

        logger.info(u"Successful comparison of {path} against {xml}".format(
            path=path, xml=self.context))
Exemple #7
0
    def validate(self, filepath, expected=None):
        """Validate the checksum of ``filepath``.

        The expected checksum comes from ``self.options['expected']``
        (formatted with ``self.data``) and is interpreted according to
        ``self.context``:

        * ``'checksum_str'``: the value itself, lower-cased.
        * ``'checksum_file'``: the stripped contents of the file at that path.
        * ``'xml_file'``: the ``checksum`` attribute of the element returned
          by ``find_file(filepath, xmlfile=...)``.

        A ``Validation`` row is created up front and completed (time done,
        passed flag, message) whether the validation succeeds or fails.

        Args:
            filepath: Path of the file whose checksum is calculated.
            expected: Ignored; the expected value is always read from
                ``self.options``.

        Raises:
            ValueError: If ``self.context`` is not one of the supported
                values.
            ValidationError: If the calculated checksum differs from the
                expected one.
        """
        logger.debug('Validating checksum of %s' % filepath)
        val_obj = Validation.objects.create(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification={
                'context': self.context,
                'options': self.options,
            }
        )

        passed = False
        try:
            # The expected value is resolved inside the try block so that a
            # missing option, an unreadable checksum file or a failed XML
            # lookup is recorded on the row instead of leaving it unfinished.
            expected = self.options['expected'].format(**self.data)

            if self.context == 'checksum_str':
                checksum = expected.lower()
            elif self.context == 'checksum_file':
                with open(expected, 'r') as checksum_file:
                    checksum = checksum_file.read().strip()
            elif self.context == 'xml_file':
                xml_el, _ = find_file(filepath, xmlfile=expected)
                checksum = xml_el.checksum
            else:
                raise ValueError(
                    'Unsupported checksum context: {}'.format(self.context))

            actual_checksum = calculate_checksum(filepath, algorithm=self.algorithm, block_size=self.block_size)
            if actual_checksum != checksum:
                raise ValidationError("checksum for %s is not valid (%s != %s)" % (
                    filepath, checksum, actual_checksum
                ))
            passed = True
        except Exception:
            val_obj.message = traceback.format_exc()
            raise
        else:
            message = 'Successfully validated checksum of %s' % filepath
            val_obj.message = message
            logger.info(message)
        finally:
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
Exemple #8
0
    def validate(self, filepath, expected=None):
        """Validate the base name of ``filepath`` against a regular expression.

        The ``Validation`` row is built in memory and saved once, after the
        check has finished.

        Args:
            filepath: Path whose base name is checked.
            expected: Pattern passed to ``re.search``. When ``None``,
                ``DEFAULT_EXPECTED_FILE`` is used if ``filepath`` is an
                existing file and ``DEFAULT_EXPECTED_DIR`` otherwise.

        Raises:
            ValidationError: If the base name does not match the pattern.
        """
        logger.debug('Validating filename of %s' % filepath)

        specification = {
            'context': self.context,
            'options': self.options,
        }
        record = Validation(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification=specification,
        )

        passed = False
        try:
            pattern = expected
            if pattern is None:
                if os.path.isfile(filepath):
                    pattern = DEFAULT_EXPECTED_FILE
                else:
                    pattern = DEFAULT_EXPECTED_DIR

            basename = os.path.basename(filepath)
            if re.search(pattern, basename) is None:
                message = "Filename validation of {} failed, it does not match {}".format(
                    filepath, pattern)
                logger.warning(message)
                raise ValidationError(message)

            passed = True
        except Exception:
            record.message = traceback.format_exc()
            raise
        else:
            record.message = 'Successfully validated filename of {}'.format(
                filepath)
            logger.info(record.message)
        finally:
            record.time_done = timezone.now()
            record.passed = passed
            record.save()
Exemple #9
0
    def validate(self, filepath, expected=None):
        """Validate the fixed-width file at ``filepath`` against ``expected``.

        ``self.errors`` and ``self.warnings`` are reset before
        ``self._validate`` is run.

        Args:
            filepath: Path of the file passed to ``self._validate``.
            expected: Field definitions passed to ``self._validate``;
                required.

        Raises:
            ValueError: If ``expected`` is ``None``.
            ValidationError: If ``self.errors`` is non-empty after
                ``self._validate`` has run.
        """
        # Check the arguments first so that an unusable call fails before any
        # logging or validation work is done.
        if expected is None:
            raise ValueError('Expected fields not provided')

        logger.debug('Validating fixed-width format of %s' % filepath)

        encoding = self.options.get('encoding', 'utf-8')
        filler = self.options.get('filler', ' ')

        self.errors = []
        self.warnings = 0
        self._validate(filepath, expected, encoding, filler)

        if self.errors:
            msg = 'Fixed-width validation of {} failed with {} error(s)'.format(
                filepath, len(self.errors))
            logger.error(msg)
            raise ValidationError(msg, errors=self.errors)

        logger.info('Successful fixed-width validation of {}'.format(filepath))
Exemple #10
0
    def validate(self, path, expected=None):
        """Diff-check ``path`` against the context XML file.

        When ``path`` is a directory every file below it is passed to
        ``self._validate``, except paths listed in ``self.exclude`` and the
        XML file itself. Otherwise ``path`` itself is validated.

        Args:
            path: File or directory to check.
            expected: Not used by this validator.

        Raises:
            ValidationError: If any file was counted as added, changed,
                renamed or deleted.
        """
        xmlfile = self.context
        self._reset_dicts()
        self._reset_counters()
        logger.debug('Validating {path} against {xml}'.format(path=path,
                                                              xml=xmlfile))
        results = []

        if not os.path.isdir(path):
            results.append(self._validate(path))
        else:
            for root, _dirs, files in walk(path):
                for name in files:
                    candidate = normalize_path(os.path.join(root, name))
                    if candidate not in self.exclude and candidate != xmlfile:
                        results.append(self._validate(candidate))

        delete_count = self._validate_deleted_files(results)
        self._validate_present_files(results)

        results = [entry for entry in results if entry is not None]
        Validation.objects.bulk_create(results, batch_size=100)

        if delete_count + self.added + self.changed + self.renamed > 0:
            template = (
                'Diff-check validation of {path} against {xml} failed: '
                '{cfmd} confirmed, {a} added, {c} changed, {r} renamed, {d} deleted'
            )
            msg = template.format(path=path,
                                  xml=self.context,
                                  cfmd=self.confirmed,
                                  a=self.added,
                                  c=self.changed,
                                  r=self.renamed,
                                  d=delete_count)
            logger.warning(msg)
            raise ValidationError(msg)

        logger.info(
            "Successful diff-check validation of {path} against {xml}".format(
                path=path, xml=self.context))
Exemple #11
0
    def validate(self, filepath, expected=None, encoding=None):
        """Validate the CSV file at ``filepath``.

        Args:
            filepath: Path of the file passed to ``self._validate``.
            expected: Not used by this validator.
            encoding: Passed through to ``self._validate``.

        Raises:
            KeyError: If ``self.options`` has no ``'column_number'`` entry.
            ValidationError: If ``self._validate`` returns any errors.
            Exception: Any error raised by ``self._validate`` is logged and
                re-raised unchanged.
        """
        logger.debug('Validating csv: %s' % filepath)
        time_started = timezone.now()

        column_number = self.options['column_number']
        delimiter = self.options.get('delimiter', ',')

        try:
            errors = self._validate(filepath, column_number, delimiter,
                                    encoding)
        except Exception:
            logger.exception(
                'Unknown error occurred when validating {}'.format(filepath))
            raise

        error_count = len(errors)
        if error_count > 0:
            msg = 'CSV validation of {} failed with {} error(s)'.format(
                filepath, error_count)
            logger.error(msg)
            raise ValidationError(msg, errors=errors)

        # Only a successful run is recorded here.
        message = 'Successfully validated csv: {}'.format(filepath)
        time_done = timezone.now()
        specification = {
            'context': self.context,
            'options': self.options,
        }
        Validation.objects.create(
            filename=filepath,
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            passed=True,
            message=message,
            time_started=time_started,
            time_done=time_done,
            specification=specification,
        )
        logger.info(message)
Exemple #12
0
    def validate(self, filepath, expected=None):
        """Validate whether ``filepath`` is encrypted as ``expected``.

        The result of ``self.is_file_encrypted`` is compared with
        ``expected``; a result of ``None`` is always accepted.

        Args:
            filepath: Path passed to ``self.is_file_encrypted``.
            expected: The encryption state the file should have.

        Raises:
            ValidationError: If the detected state is not ``None`` and
                differs from ``expected``.
        """
        logger.debug('Validating encryption of %s' % filepath)
        detected = self.is_file_encrypted(filepath)

        specification = {
            'context': self.context,
            'options': self.options,
        }
        record = Validation.objects.create(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification=specification,
        )

        passed = False
        try:
            if detected is not None and detected != expected:
                template = (
                    "{} is expected to be encrypted"
                    if expected is True
                    else "{} is not expected to be encrypted"
                )
                raise ValidationError(template.format(filepath))

            passed = True
        except ValidationError:
            record.message = traceback.format_exc()
            raise
        else:
            message = 'Successfully validated encryption of %s' % filepath
            record.message = message
            logger.info(message)
        finally:
            record.time_done = timezone.now()
            record.passed = passed
            record.save(update_fields=['time_done', 'passed', 'message'])
    def validate(self, filepath):
        """Validate that ``filepath`` does not match ``REPEATED_PATTERN``.

        The ``Validation`` row is built in memory and saved once, after the
        check has finished.

        Args:
            filepath: Path that is searched with ``REPEATED_PATTERN``.

        Raises:
            ValidationError: If ``REPEATED_PATTERN`` is found in
                ``filepath``.
        """
        logger.debug('Validating extension of %s' % filepath)

        specification = {
            'context': self.context,
            'options': self.options,
        }
        record = Validation(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification=specification,
        )

        passed = False
        try:
            if re.search(REPEATED_PATTERN, filepath) is not None:
                message = "Extension validation of {} failed, repeated extensions found".format(filepath)
                logger.warning(message)
                raise ValidationError(message)

            passed = True
        except Exception:
            record.message = traceback.format_exc()
            raise
        else:
            record.message = 'Successfully validated extension of {}'.format(filepath)
            logger.info(record.message)
        finally:
            record.time_done = timezone.now()
            record.passed = passed
            record.save()
Exemple #14
0
    def validate(self, filepath):
        """Validate the record digests of the WARC file at ``filepath``.

        Every record is read with ``ArchiveIterator(check_digests=True)`` and
        the state of its digest checker is inspected. A ``Validation`` row is
        created up front and completed (time done, passed flag, message)
        whether the validation succeeds or fails.

        Args:
            filepath: Path of the WARC file to read.

        Raises:
            ValidationError: If a record's digest check fails, or if
                ``ArchiveLoadFailed`` is raised while reading the archive.
            Exception: Any other error (for example from opening the file)
                is recorded as a failed validation and re-raised unchanged.
        """
        logger.info(f'Validating {filepath} with Warcio')

        # Start pessimistic: the row may only be marked as passed after every
        # record has been read. Otherwise an unexpected error (e.g. the file
        # cannot be opened) would be stored as a successful validation.
        passed = False
        message = ''
        val_obj = Validation.objects.create(filename=filepath,
                                            time_started=timezone.now(),
                                            validator=self.__class__.__name__,
                                            required=self.required,
                                            task=self.task,
                                            information_package=self.ip,
                                            responsible=self.responsible,
                                            specification={
                                                'context': self.context,
                                                'options': self.options,
                                            })

        try:
            with open(filepath, 'rb') as stream:
                it = ArchiveIterator(stream, check_digests=True)
                for record in it:
                    digest_present = (
                        record.rec_headers.get_header('WARC-Payload-Digest')
                        or record.rec_headers.get_header('WARC-Block-Digest'))

                    # Consume the record before looking at the digest
                    # checker; presumably the digest is only verified once
                    # the content has been read (TODO confirm).
                    _read_entire_stream(record.content_stream())

                    d_msg = None

                    rec_id = record.rec_headers.get_header('WARC-Record-ID')
                    rec_type = record.rec_headers.get_header('WARC-Type')
                    rec_offset = it.get_record_offset()

                    if record.digest_checker.passed is False:
                        message = record.digest_checker.problems
                        raise ValidationError(message)

                    elif record.digest_checker.passed is True:
                        d_msg = 'digest pass'
                    elif record.digest_checker.passed is None:
                        if digest_present and rec_type == 'revisit':
                            d_msg = 'digest present but not checked (revisit)'
                        elif digest_present:  # pragma: no cover
                            # should not happen
                            d_msg = 'digest present but not checked'
                        else:
                            d_msg = 'no digest to check'

                    if d_msg:
                        logger.debug(
                            f'offset {rec_offset} WARC-Record-ID {rec_id} {rec_type} ({d_msg})'
                        )

            passed = True
            message = f'Successfully validated warc {filepath}'

        except ValidationError:
            # Digest failure: ``message`` already holds the reported
            # problems, so keep it instead of replacing it with a traceback.
            raise

        except ArchiveLoadFailed as e:
            logger.warning(f'Warcio validation of {filepath} failed')
            message = f'<pre>{traceback.format_exc()}</pre>'
            raise ValidationError(
                f'saw exception ArchiveLoadFailed: {str(e).rstrip()}') from e

        except Exception:
            message = f'<pre>{traceback.format_exc()}</pre>'
            raise

        finally:
            val_obj.message = message
            logger.info(message)
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
Exemple #15
0
    def validate(self, filepath, expected=None):
        """Validate ``filepath`` with ``validate_against_schema`` and record the result.

        ``Validation`` rows are stored with the file name relative to the
        ``rootdir`` option.

        Args:
            filepath: Path of the XML file to validate.
            expected: Not used by this validator.

        Raises:
            ValidationError: If validation raises ``etree.DocumentInvalid``.
                One failed ``Validation`` row is stored per entry in the
                exception's error log.
            Exception: Any other error raised during validation is stored as
                a failed ``Validation`` row and re-raised unchanged.
        """
        if not self.context:
            logger.debug('Validating schema of {xml}'.format(xml=filepath))
        else:
            logger.debug('Validating schema of {xml} against {schema}'.format(
                xml=filepath, schema=self.context))

        rootdir = self.options.get('rootdir')
        etree.clear_error_log()
        started = timezone.now()
        relpath = os.path.relpath(filepath, rootdir)

        def shared_fields():
            # Fields that are identical on every row written by this method.
            return {
                'validator': self.__class__.__name__,
                'filename': relpath,
                'time_started': started,
                'information_package_id': self.ip,
                'task': self.task,
            }

        try:
            validate_against_schema(filepath, self.context, rootdir)
        except etree.DocumentInvalid as invalid:
            msg = 'Schema validation of {xml} failed'.format(xml=filepath)
            logger.exception(msg)
            finished = timezone.now()
            rows = [
                Validation(
                    passed=False,
                    message='{line}: {msg}'.format(line=entry.line,
                                                   msg=entry.message),
                    time_done=finished,
                    **shared_fields()
                )
                for entry in invalid.error_log
            ]

            Validation.objects.bulk_create(rows, 100)
            raise ValidationError(msg,
                                  errors=[row.message for row in rows])
        except Exception as unexpected:
            msg = 'Unknown error during schema validation of {xml}'.format(
                xml=filepath)
            logger.exception(msg)
            finished = timezone.now()
            Validation.objects.create(
                passed=False,
                message=str(unexpected),
                time_done=finished,
                **shared_fields()
            )
            raise

        Validation.objects.create(
            passed=True,
            time_done=timezone.now(),
            **shared_fields()
        )
        logger.info(
            "Successful schema validation of {xml}".format(xml=filepath))