def in_valid_paths(self, root, path, valid_paths):
    """Check that ``path`` is permitted by one of the ``valid_paths`` entries.

    String entries are treated as glob patterns: ``path`` is accepted as soon
    as it appears among the normalized results of one of them. Non-string
    entries are treated as groups of related patterns: when ``path`` is found
    through one pattern of a group, every other pattern of that group (with
    its first ``*`` replaced by each match reported by ``iglob``) must exist
    as a regular file.

    Args:
        root: Directory used to build relative paths for error messages.
        path: Path to check; compared against normalized glob results.
        valid_paths: Iterable of pattern strings and/or groups of patterns.

    Returns:
        True when ``path`` is allowed.

    Raises:
        ValidationError: If a related file is missing, or if no entry
            allows ``path``.
    """
    single_patterns = [entry for entry in valid_paths if isinstance(entry, str)]
    for pattern in single_patterns:
        if path in [normalize_path(hit) for hit in glob(pattern)]:
            return True

    pattern_groups = [entry for entry in valid_paths if not isinstance(entry, str)]
    for group in pattern_groups:
        for pattern in group:
            for candidate, wildcard_hits in iglob(pattern, with_matches=True):
                candidate = normalize_path(candidate)
                if candidate != path:
                    continue

                # The file itself is allowed; now make sure each companion
                # pattern in the same group resolves to an existing file.
                for hit in map(normalize_path, wildcard_hits):
                    for sibling in group:
                        if sibling == candidate:
                            continue

                        sibling = sibling.replace('*', hit, 1)
                        if os.path.isfile(sibling):
                            continue

                        shown_path = normalize_path(os.path.relpath(path, root))
                        shown_sibling = normalize_path(
                            os.path.relpath(sibling, root))
                        raise ValidationError(
                            '{file} missing related file {related}'.format(
                                file=shown_path, related=shown_sibling))

                return True

    raise ValidationError('{file} is not allowed'.format(file=path))
def validate(self, filepath, expected=None):
    """Validate the identified file format of ``filepath``.

    The format is identified through ``self.fid.identify_file_format`` and
    each non-empty expected value is compared with the identified one. The
    outcome is stored on a ``Validation`` row created up front.

    Args:
        filepath: Path of the file to identify.
        expected: Iterable ``(name, version, registry_key)``. At least one
            element must be not ``None``.

    Raises:
        ValueError: If ``expected`` is ``None`` or all of its elements are
            ``None``.
        ValidationError: If an expected value differs from the identified
            one.
    """
    logger.debug('Validating format of %s' % filepath)

    # The default ``expected=None`` used to fail with an unpacking TypeError;
    # route it to the descriptive ValueError below instead.
    if expected is None:
        expected = (None, None, None)
    name, version, reg_key = expected

    if name is None and version is None and reg_key is None:
        raise ValueError(
            'At least one of name, version and registry key is required')

    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )

    passed = False
    try:
        actual_name, actual_version, actual_reg_key = self.fid.identify_file_format(
            filepath)

        if name and name != actual_name:
            raise ValidationError(
                "format name for {} is not valid, ({} !={})".format(
                    filepath, name, actual_name))

        if version and version != actual_version:
            raise ValidationError(
                "format version for {} is not valid, ({} != {})".format(
                    filepath, version, actual_version))

        if reg_key and reg_key != actual_reg_key:
            raise ValidationError(
                "format registry key for {} is not valid, ({} != {})".format(
                    filepath, reg_key, actual_reg_key))

        passed = True
    except Exception:
        # Record every failure (including identification errors), not only
        # ValidationError, so the stored row explains why it did not pass.
        val_obj.message = traceback.format_exc()
        raise
    else:
        # Previously reported "checksum" here, copied from another validator.
        message = 'Successfully validated format of %s' % filepath
        val_obj.message = message
        logger.info(message)
    finally:
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])
def validate(self, filepath, expected=None):
    """Run Mediaconch on ``filepath`` using ``self.context`` as the policy.

    A ``Validation`` row is created before the run and updated afterwards
    with the outcome and either the serialized report or a traceback.

    Args:
        filepath: Path of the file handed to ``run_mediaconch``.
        expected: Unused by this validator.

    Returns:
        The report serialized by ``etree.tostring`` when validation passes.

    Raises:
        ValidationError: If Mediaconch exits with a non-zero return code or
            ``get_outcome`` reports a falsy outcome for the parsed report.
    """
    logger.debug("Validating %s with Mediaconch" % filepath)

    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )

    passed = False
    try:
        stdout, stderr, exit_code = run_mediaconch(filepath, policy=self.context)
        if exit_code:
            logger.warning("Mediaconch validation of %s failed, %s" % (filepath, stderr))
            raise ValidationError(stderr)

        xml_parser = etree.XMLParser(remove_blank_text=True)
        report = etree.XML(stdout, parser=xml_parser)

        passed = get_outcome(report)
        message = etree.tostring(report, xml_declaration=True, encoding='UTF-8')

        if not passed:
            logger.warning("Mediaconch validation of %s failed, %s" % (filepath, message))
            raise ValidationError(message)
    except Exception:
        val_obj.message = traceback.format_exc()
        raise
    else:
        val_obj.message = message
        logger.info("Successful Mediaconch validation of %s" % filepath)
    finally:
        # Always persist the outcome, whether the run passed or raised.
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])

    return message
def validate_folder(self, path, node):
    """Validate the files below ``path`` against the rules in ``node``.

    ``node`` is read for three keys:

    * ``valid_paths`` (default ``[]``): patterns, or groups of related
      patterns, relative to ``path``.
    * ``allow_empty`` (default ``True``): whether zero files is acceptable.
    * ``required_files`` (default ``[]``): files that must be present.

    All patterns and required files are formatted with ``self.data``.

    Args:
        path: Directory to walk.
        node: Mapping holding the rules described above. It is not modified.

    Raises:
        ValidationError: If a file is neither allowed nor required, if a
            related file is missing, if the folder is empty while
            ``allow_empty`` is false, or if required files remain unmatched.
    """
    def _resolve(pattern):
        return normalize_path(os.path.join(path, pattern).format(**self.data))

    # Build fresh lists instead of rewriting node['valid_paths'] in place:
    # mutating the caller's tree meant a second run re-joined and
    # re-formatted paths that were already resolved.
    valid_paths = [
        _resolve(valid) if isinstance(valid, str)
        else [_resolve(nested) for nested in valid]
        for valid in node.get('valid_paths', [])
    ]
    allow_empty = node.get('allow_empty', True)
    required_files = [
        normalize_path(req.format(**self.data))
        for req in node.get('required_files', [])
    ]
    file_count = 0

    for root, dirs, files in walk(path):
        rel_root = os.path.relpath(root, path)
        for f in files:
            file_count += 1

            if valid_paths:
                try:
                    self.in_valid_paths(
                        path, normalize_path(os.path.join(root, f)), valid_paths)
                except ValidationError as validation_exc:
                    # A file outside valid_paths is still acceptable when it
                    # is one of the required files.
                    try:
                        self.update_required_files(rel_root, f, required_files)
                    except ValueError:
                        raise validation_exc

            if required_files:
                # NOTE(review): update_required_files presumably removes the
                # matched entry and raises ValueError when the file is not
                # listed - confirm against its definition.
                try:
                    self.update_required_files(rel_root, f, required_files)
                except ValueError:
                    pass

    if not allow_empty and file_count == 0:
        raise ValidationError(
            '{path} is not allowed to be empty'.format(path=path))

    if required_files:
        raise ValidationError('Missing {files} in {path}'.format(
            files=','.join(required_files), path=path))
def validate(self, filepath, expected=None):
    """Check that ``filepath`` parses as XML.

    On a syntax error, one failed ``Validation`` row is stored per entry in
    the exception's error log. On any other error a single failed row is
    stored. On success a single passed row is stored.

    Args:
        filepath: Path of the XML file to parse.
        expected: Unused by this validator.

    Raises:
        ValidationError: If the file has XML syntax errors; ``errors``
            carries the per-line messages.
        Exception: Any other error raised while parsing is re-raised.
    """
    logger.debug('Validating syntax of {xml}'.format(xml=filepath))

    etree.clear_error_log()
    started = timezone.now()
    validator_name = self.__class__.__name__

    try:
        etree.parse(filepath)
    except etree.XMLSyntaxError as e:
        msg = 'Syntax validation of {xml} failed'.format(xml=filepath)
        logger.exception(msg)

        done = timezone.now()
        validation_objs = [
            Validation(
                passed=False,
                validator=validator_name,
                filename=filepath,
                message='{line}: {msg}'.format(line=error.line, msg=error.message),
                time_started=started,
                time_done=done,
                information_package_id=self.ip,
                task=self.task,
            )
            for error in e.error_log
        ]

        Validation.objects.bulk_create(validation_objs, 100)
        raise ValidationError(msg, errors=[o.message for o in validation_objs])
    except Exception as e:
        logger.exception(
            'Unknown error during syntax validation of {xml}'.format(xml=filepath))
        done = timezone.now()
        Validation.objects.create(
            passed=False,
            validator=validator_name,
            filename=filepath,
            message=str(e),
            time_started=started,
            time_done=done,
            information_package_id=self.ip,
            task=self.task,
        )
        raise

    Validation.objects.create(
        passed=True,
        validator=validator_name,
        filename=filepath,
        time_started=started,
        time_done=timezone.now(),
        information_package_id=self.ip,
        task=self.task,
    )
    logger.info("Successful syntax validation of {xml}".format(xml=filepath))
def validate(self, path, expected=None):
    """Compare the files found for ``path`` with the XML file in ``self.context``.

    Internal dictionaries and counters are reset, every file returned by
    ``find_files`` (except excluded ones) is run through ``self._validate``,
    and the resulting ``Validation`` objects are bulk-created. The XML file
    itself and any pointers found by ``find_pointers`` are skipped.

    Args:
        path: Path to validate.
        expected: Unused by this validator.

    Raises:
        ValidationError: If any file was added, changed, renamed or deleted.
    """
    xmlfile = self.context
    results = []
    self._reset_dicts()
    self._reset_counters()
    logger.debug(u'Validating {path} against {xml}'.format(path=path, xml=xmlfile))

    own_checksum = self.checksums.get(path)

    if own_checksum:
        # Temporarily take ``path`` out of the bookkeeping dicts; it is put
        # back once the deleted/present checks below have run.
        try:
            self._pop_checksum_dict(self.deleted, own_checksum, path)
            self._pop_checksum_dict(self.present, own_checksum, path)
        except (KeyError, ValueError):
            pass

    raw_skip = [os.path.relpath(xmlfile, self.rootdir)]
    raw_skip.extend([pointer.path for pointer in find_pointers(path)])
    skip_files = [normalize_path(entry) for entry in raw_skip]

    for found in find_files(path, rootdir=self.rootdir, skip_files=skip_files):
        if found not in self.exclude:
            results.append(self._validate(found))

    delete_count = self._validate_deleted_files(results)
    self._validate_present_files(results)

    if own_checksum:
        self.deleted.setdefault(own_checksum, []).append(path)
        self.present.setdefault(own_checksum, []).append(path)

    results = [obj for obj in results if obj is not None]
    Validation.objects.bulk_create(results, batch_size=100)

    difference_count = delete_count + self.added + self.changed + self.renamed
    if difference_count > 0:
        msg = (
            'Comparison of {path} against {xml} failed: '
            '{cfmd} confirmed, {a} added, {c} changed, {r} renamed, {d} deleted'
        ).format(
            path=path,
            xml=self.context,
            cfmd=self.confirmed,
            a=self.added,
            c=self.changed,
            r=self.renamed,
            d=delete_count,
        )
        logger.warning(msg)
        raise ValidationError(msg)

    logger.info(u"Successful comparison of {path} against {xml}".format(
        path=path, xml=self.context))
def validate(self, filepath, expected=None):
    """Validate the checksum of ``filepath``.

    The expected checksum comes from ``self.options['expected']`` (formatted
    with ``self.data``) and is interpreted according to ``self.context``:

    * ``'checksum_str'``: the value itself, lower-cased.
    * ``'checksum_file'``: the stripped content of the file it names.
    * ``'xml_file'``: the ``checksum`` of the element that ``find_file``
      returns for ``filepath`` in the XML file it names.

    The outcome is stored on a ``Validation`` row created up front.

    Args:
        filepath: Path of the file whose checksum is calculated.
        expected: Ignored; the expected value is taken from the options.

    Raises:
        ValueError: If ``self.context`` is not one of the supported values.
        ValidationError: If the calculated checksum differs from the
            expected one.
    """
    logger.debug('Validating checksum of %s' % filepath)
    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        }
    )

    passed = False
    try:
        # Resolve the expected checksum inside the try block so that a
        # missing option, unreadable checksum file or failed XML lookup is
        # recorded on val_obj instead of leaving the row unfinished.
        expected = self.options['expected'].format(**self.data)

        if self.context == 'checksum_str':
            checksum = expected.lower()
        elif self.context == 'checksum_file':
            with open(expected, 'r') as checksum_file:
                checksum = checksum_file.read().strip()
        elif self.context == 'xml_file':
            xml_el, _ = find_file(filepath, xmlfile=expected)
            checksum = xml_el.checksum
        else:
            # Previously fell through and failed later with an unbound
            # ``checksum`` variable.
            raise ValueError(
                'Unsupported checksum context: {}'.format(self.context))

        actual_checksum = calculate_checksum(
            filepath, algorithm=self.algorithm, block_size=self.block_size)
        if actual_checksum != checksum:
            raise ValidationError("checksum for %s is not valid (%s != %s)" % (
                filepath, checksum, actual_checksum
            ))
        passed = True
    except Exception:
        val_obj.message = traceback.format_exc()
        raise
    else:
        message = 'Successfully validated checksum of %s' % filepath
        val_obj.message = message
        logger.info(message)
    finally:
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])
def validate(self, filepath, expected=None):
    """Validate the base name of ``filepath`` against a regular expression.

    When ``expected`` is not given, ``DEFAULT_EXPECTED_FILE`` is used for
    existing regular files and ``DEFAULT_EXPECTED_DIR`` otherwise. The
    outcome is saved on a new ``Validation`` object.

    Args:
        filepath: Path whose base name is checked.
        expected: Optional pattern passed to ``re.search``.

    Raises:
        ValidationError: If the base name does not match the pattern.
    """
    logger.debug('Validating filename of %s' % filepath)

    val_obj = Validation(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )

    passed = False
    try:
        if expected is None:
            expected = (
                DEFAULT_EXPECTED_FILE if os.path.isfile(filepath)
                else DEFAULT_EXPECTED_DIR
            )

        basename = os.path.basename(filepath)
        if re.search(expected, basename) is None:
            message = "Filename validation of {} failed, it does not match {}".format(
                filepath, expected)
            logger.warning(message)
            raise ValidationError(message)

        passed = True
    except Exception:
        val_obj.message = traceback.format_exc()
        raise
    else:
        val_obj.message = 'Successfully validated filename of {}'.format(filepath)
        logger.info(val_obj.message)
    finally:
        # The object was only instantiated above; this is its first save.
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save()
def validate(self, filepath, expected=None):
    """Validate ``filepath`` as a fixed-width file.

    ``self.errors`` and ``self.warnings`` are reset, then the check itself
    is delegated to ``self._validate``.

    Args:
        filepath: Path of the file to validate.
        expected: Field specification passed on to ``self._validate``.
            Required.

    Options read from ``self.options``:
        encoding: Defaults to ``'utf-8'``.
        filler: Defaults to a single space.

    Raises:
        ValueError: If ``expected`` is ``None``.
        ValidationError: If ``self.errors`` is non-empty after validation;
            ``errors`` carries the collected entries.
    """
    # The message used to say "filename", copied from the filename validator.
    logger.debug('Validating fixed-width format of %s' % filepath)

    if expected is None:
        raise ValueError('Expected fields not provided')

    encoding = self.options.get('encoding', 'utf-8')
    filler = self.options.get('filler', ' ')

    self.errors = []
    self.warnings = 0
    self._validate(filepath, expected, encoding, filler)

    if self.errors:
        msg = 'Fixed-width validation of {} failed with {} error(s)'.format(
            filepath, len(self.errors))
        logger.error(msg)
        raise ValidationError(msg, errors=self.errors)

    logger.info('Successful fixed-width validation of {}'.format(filepath))
def validate(self, path, expected=None):
    """Diff-check ``path`` against the XML file in ``self.context``.

    Internal dictionaries and counters are reset first. If ``path`` is a
    directory, every file below it is run through ``self._validate`` except
    excluded files and the XML file itself; otherwise ``path`` alone is
    validated. The resulting ``Validation`` objects are bulk-created.

    Args:
        path: File or directory to check.
        expected: Unused by this validator.

    Raises:
        ValidationError: If any file was added, changed, renamed or deleted.
    """
    xmlfile = self.context
    results = []
    self._reset_dicts()
    self._reset_counters()
    logger.debug('Validating {path} against {xml}'.format(path=path, xml=xmlfile))

    if not os.path.isdir(path):
        results.append(self._validate(path))
    else:
        for root, _dirs, files in walk(path):
            for name in files:
                filepath = normalize_path(os.path.join(root, name))
                if filepath not in self.exclude and filepath != xmlfile:
                    results.append(self._validate(filepath))

    delete_count = self._validate_deleted_files(results)
    self._validate_present_files(results)

    results = [obj for obj in results if obj is not None]
    Validation.objects.bulk_create(results, batch_size=100)

    difference_count = delete_count + self.added + self.changed + self.renamed
    if difference_count > 0:
        msg = (
            'Diff-check validation of {path} against {xml} failed: '
            '{cfmd} confirmed, {a} added, {c} changed, {r} renamed, {d} deleted'
        ).format(
            path=path,
            xml=self.context,
            cfmd=self.confirmed,
            a=self.added,
            c=self.changed,
            r=self.renamed,
            d=delete_count,
        )
        logger.warning(msg)
        raise ValidationError(msg)

    logger.info(
        "Successful diff-check validation of {path} against {xml}".format(
            path=path, xml=self.context))
def validate(self, filepath, expected=None, encoding=None):
    """Validate ``filepath`` as a CSV file.

    The check is delegated to ``self._validate``, which returns a collection
    of errors. A passed ``Validation`` row is created only when that
    collection is empty.

    Args:
        filepath: Path of the CSV file.
        expected: Unused by this validator.
        encoding: Passed through to ``self._validate``.

    Options read from ``self.options``:
        column_number: Required.
        delimiter: Defaults to ``','``.

    Raises:
        ValidationError: If ``self._validate`` reports one or more errors.
        Exception: Any error raised by ``self._validate`` is logged and
            re-raised.
    """
    logger.debug('Validating csv: %s' % filepath)
    time_started = timezone.now()

    column_number = self.options['column_number']
    delimiter = self.options.get('delimiter', ',')

    try:
        errors = self._validate(filepath, column_number, delimiter, encoding)
    except Exception:
        logger.exception(
            'Unknown error occurred when validating {}'.format(filepath))
        raise

    error_count = len(errors)
    if error_count > 0:
        msg = 'CSV validation of {} failed with {} error(s)'.format(
            filepath, error_count)
        logger.error(msg)
        raise ValidationError(msg, errors=errors)

    message = 'Successfully validated csv: {}'.format(filepath)
    time_done = timezone.now()
    Validation.objects.create(
        filename=filepath,
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        passed=True,
        message=message,
        time_started=time_started,
        time_done=time_done,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )
    logger.info(message)
def validate(self, filepath, expected=None):
    """Validate whether ``filepath`` is encrypted as ``expected``.

    The result of ``self.is_file_encrypted`` is compared with ``expected``.
    A ``None`` result is accepted without comparison. The outcome is stored
    on a ``Validation`` row.

    Args:
        filepath: Path of the file to inspect.
        expected: Expected encryption state.

    Raises:
        ValidationError: If the detected state is not ``None`` and differs
            from ``expected``.
    """
    logger.debug('Validating encryption of %s' % filepath)

    # Detection runs before the row is created, so time_started does not
    # include it.
    result = self.is_file_encrypted(filepath)

    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )

    passed = False
    try:
        mismatch = result is not None and result != expected
        if mismatch:
            expected_msg = (
                "{} is expected to be encrypted" if expected is True
                else "{} is not expected to be encrypted"
            )
            raise ValidationError(expected_msg.format(filepath))

        passed = True
    except ValidationError:
        val_obj.message = traceback.format_exc()
        raise
    else:
        message = 'Successfully validated encryption of %s' % filepath
        val_obj.message = message
        logger.info(message)
    finally:
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])
def validate(self, filepath, expected=None):
    """Validate that ``filepath`` has no repeated extensions.

    The path is searched with ``REPEATED_PATTERN``; a match fails the
    validation. The outcome is saved on a new ``Validation`` object.

    Args:
        filepath: Path to check.
        expected: Unused. Accepted so this validator can be called with the
            same ``validate(filepath, expected)`` signature as the other
            validators.

    Raises:
        ValidationError: If ``REPEATED_PATTERN`` matches ``filepath``.
    """
    logger.debug('Validating extension of %s' % filepath)

    val_obj = Validation(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        }
    )

    passed = False
    try:
        if re.search(REPEATED_PATTERN, filepath):
            message = "Extension validation of {} failed, repeated extensions found".format(filepath)
            logger.warning(message)
            raise ValidationError(message)

        passed = True
    except Exception:
        val_obj.message = traceback.format_exc()
        raise
    else:
        val_obj.message = 'Successfully validated extension of {}'.format(filepath)
        logger.info(val_obj.message)
    finally:
        # The object was only instantiated above; this is its first save.
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save()
def validate(self, filepath, expected=None):
    """Validate the WARC file at ``filepath`` with Warcio digest checking.

    Every record is read in full through an ``ArchiveIterator`` created with
    ``check_digests=True``. The outcome is stored on a ``Validation`` row
    created up front.

    Args:
        filepath: Path of the WARC file.
        expected: Unused. Accepted so this validator can be called with the
            same ``validate(filepath, expected)`` signature as the other
            validators.

    Raises:
        ValidationError: If a record's digest check fails, or if Warcio
            raises ``ArchiveLoadFailed`` while reading the archive.
        Exception: Any other error (e.g. the file cannot be opened) is
            recorded as a failed validation and re-raised.
    """
    logger.info(f'Validating {filepath} with Warcio')

    # Start pessimistic: the row is only marked as passed after every record
    # has been read. Previously ``passed`` started as True, so an unexpected
    # exception was stored as a successful validation.
    passed = False
    message = ''

    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )

    try:
        with open(filepath, 'rb') as stream:
            it = ArchiveIterator(stream, check_digests=True)
            for record in it:
                digest_present = (
                    record.rec_headers.get_header('WARC-Payload-Digest') or
                    record.rec_headers.get_header('WARC-Block-Digest'))

                # The digest checker result is read only after the whole
                # record content has been consumed.
                _read_entire_stream(record.content_stream())

                d_msg = None
                rec_id = record.rec_headers.get_header('WARC-Record-ID')
                rec_type = record.rec_headers.get_header('WARC-Type')
                rec_offset = it.get_record_offset()

                if record.digest_checker.passed is False:
                    message = record.digest_checker.problems
                    raise ValidationError(message)
                elif record.digest_checker.passed is True:
                    d_msg = 'digest pass'
                elif record.digest_checker.passed is None:
                    if digest_present and rec_type == 'revisit':
                        d_msg = 'digest present but not checked (revisit)'
                    elif digest_present:  # pragma: no cover
                        # should not happen
                        d_msg = 'digest present but not checked'
                    else:
                        d_msg = 'no digest to check'

                if d_msg:
                    logger.debug(
                        f'offset {rec_offset} WARC-Record-ID {rec_id} {rec_type} ({d_msg})'
                    )

        passed = True
        message = f'Successfully validated warc {filepath}'
    except ValidationError:
        # Digest failure: ``message`` already holds the reported problems.
        raise
    except ArchiveLoadFailed as e:
        logger.warning(f'Warcio validation of {filepath} failed')
        message = f'<pre>{traceback.format_exc()}</pre>'
        raise ValidationError(
            f'saw exception ArchiveLoadFailed: {str(e).rstrip()}') from e
    except Exception:
        logger.warning(f'Warcio validation of {filepath} failed')
        message = f'<pre>{traceback.format_exc()}</pre>'
        raise
    finally:
        val_obj.message = message
        logger.info(message)
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])
def validate(self, filepath, expected=None):
    """Validate the XML file ``filepath`` against a schema.

    The check is delegated to ``validate_against_schema`` with
    ``self.context`` and the ``rootdir`` option. ``Validation`` rows are
    stored with the file name relative to ``rootdir``: one failed row per
    error-log entry when the document is invalid, a single failed row on
    any other error, and a single passed row on success.

    Args:
        filepath: Path of the XML file.
        expected: Unused by this validator.

    Raises:
        ValidationError: If the document is invalid; ``errors`` carries the
            per-line messages.
        Exception: Any other error raised during validation is re-raised.
    """
    if not self.context:
        logger.debug('Validating schema of {xml}'.format(xml=filepath))
    else:
        logger.debug('Validating schema of {xml} against {schema}'.format(
            xml=filepath, schema=self.context))

    rootdir = self.options.get('rootdir')
    etree.clear_error_log()
    started = timezone.now()
    relpath = os.path.relpath(filepath, rootdir)
    validator_name = self.__class__.__name__

    try:
        validate_against_schema(filepath, self.context, rootdir)
    except etree.DocumentInvalid as e:
        msg = 'Schema validation of {xml} failed'.format(xml=filepath)
        logger.exception(msg)

        done = timezone.now()
        validation_objs = [
            Validation(
                passed=False,
                validator=validator_name,
                filename=relpath,
                message='{line}: {msg}'.format(line=error.line, msg=error.message),
                time_started=started,
                time_done=done,
                information_package_id=self.ip,
                task=self.task,
            )
            for error in e.error_log
        ]

        Validation.objects.bulk_create(validation_objs, 100)
        raise ValidationError(msg, errors=[o.message for o in validation_objs])
    except Exception as e:
        msg = 'Unknown error during schema validation of {xml}'.format(
            xml=filepath)
        logger.exception(msg)
        done = timezone.now()
        Validation.objects.create(
            passed=False,
            validator=validator_name,
            filename=relpath,
            message=str(e),
            time_started=started,
            time_done=done,
            information_package_id=self.ip,
            task=self.task,
        )
        raise

    Validation.objects.create(
        passed=True,
        validator=validator_name,
        filename=relpath,
        time_started=started,
        time_done=timezone.now(),
        information_package_id=self.ip,
        task=self.task,
    )
    logger.info("Successful schema validation of {xml}".format(xml=filepath))