class FormatValidator(BaseValidator):
    """
    Validates the format of a file against the given ``context``.

    The file is identified with a ``FormatIdentifier`` and the result is
    compared with the expected format name, version and registry key.
    Every call to :meth:`validate` is recorded as a ``Validation`` object.
    """

    def __init__(self, *args, **kwargs):
        """
        Set up the validator and its format identifier.

        The ``allow_unknown_file_types`` option (default ``False``) is read
        from ``self.options`` and forwarded to the ``FormatIdentifier``.
        """
        super(FormatValidator, self).__init__(*args, **kwargs)

        allow_unknown = self.options.get('allow_unknown_file_types', False)
        self.fid = FormatIdentifier(allow_unknown_file_types=allow_unknown)

    def validate(self, filepath, expected=None):
        """
        Validate the format of ``filepath`` against ``expected``.

        Args:
            filepath: Path of the file to identify.
            expected: A ``(name, version, registry key)`` triple. Falsy
                entries are not compared, but at least one entry must not
                be ``None``.

        Raises:
            ValueError: If ``expected`` is ``None`` or all of its entries
                are ``None``.
            ValidationError: If an expected value differs from the
                identified one. The traceback is stored as the message of
                the ``Validation`` object before the error is re-raised.
        """
        logger.debug('Validating format of %s', filepath)

        # Treat an omitted ``expected`` like an empty triple so the caller
        # gets the descriptive ValueError below instead of an unpacking
        # TypeError.
        if expected is None:
            expected = (None, None, None)
        name, version, reg_key = expected

        if name is None and version is None and reg_key is None:
            raise ValueError(
                'At least one of name, version and registry key is required')

        val_obj = Validation.objects.create(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification={
                'context': self.context,
                'options': self.options,
            },
        )

        passed = False
        try:
            actual_name, actual_version, actual_reg_key = self.fid.identify_file_format(
                filepath)

            # Compared in this order; the first mismatch aborts validation.
            checks = (
                ('name', name, actual_name),
                ('version', version, actual_version),
                ('registry key', reg_key, actual_reg_key),
            )
            for label, wanted, actual in checks:
                if wanted and wanted != actual:
                    raise ValidationError(
                        "format {} for {} is not valid, ({} != {})".format(
                            label, filepath, wanted, actual))

            passed = True
        except ValidationError:
            val_obj.message = traceback.format_exc()
            raise
        else:
            message = 'Successfully validated format of %s' % filepath
            val_obj.message = message
            logger.info(message)
        finally:
            # Always persist the outcome, also when an exception other than
            # ValidationError propagates (``passed`` is then still False).
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
def setUp(self):
    """Write a small temporary ``.txt`` file and store its identified format."""
    self.content = b'test file'

    # delete=False keeps the file on disk after close(); the registered
    # cleanup removes it once the test is done.
    tmp = tempfile.NamedTemporaryFile(suffix='.txt', delete=False)
    self.test_file = tmp
    self.addCleanup(os.remove, tmp.name)

    tmp.write(self.content)
    tmp.seek(0)
    tmp.close()

    identifier = FormatIdentifier()
    self.expected = identifier.identify_file_format(tmp.name)
def index_document(tag_version, filepath):
    """
    Build and save a ``File`` document for ``tag_version`` from ``filepath``.

    The identified format, together with path, size and modification time,
    is stored in ``tag_version.custom_fields``. Unless the format registry
    key is listed in ``settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT``,
    the base64-encoded file content is attached and the document is saved
    through the ``ingest_attachment`` pipeline; otherwise it is saved
    without content.

    Returns:
        A ``(doc, tag_version)`` tuple.

    Raises:
        ElasticsearchException: Logged and re-raised if saving fails.
    """
    excluded_keys = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT

    identified = FormatIdentifier().identify_file_format(filepath)
    fmt_name, fmt_version, fmt_key = identified
    with_content = fmt_key not in excluded_keys

    package = tag_version.tag.information_package
    suffix = os.path.splitext(tag_version.name)[1]
    parent_dir = os.path.dirname(filepath)

    # Directory relative to the package root; the root itself becomes ''.
    rel_dir = normalize_path(os.path.relpath(parent_dir, package.object_path))
    if rel_dir == '.':
        rel_dir = ''

    byte_size, _count = get_tree_size_and_count(filepath)
    mtime = timestamp_to_datetime(os.stat(filepath).st_mtime)

    fields = {}
    fields['extension'] = suffix[1:]  # drop the leading dot
    fields['dirname'] = parent_dir
    fields['href'] = rel_dir
    fields['filename'] = tag_version.name
    fields['size'] = byte_size
    fields['modified'] = mtime
    fields['formatname'] = fmt_name
    fields['formatversion'] = fmt_version
    fields['formatkey'] = fmt_key
    tag_version.custom_fields = fields

    doc = File.from_obj(tag_version)

    try:
        if not with_content:
            logger.debug('Skip to index file content for {}'.format(filepath))
            doc.save()
        else:
            with open(filepath, 'rb') as stream:
                raw = stream.read()
            doc.data = base64.b64encode(raw).decode("ascii")
            doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise

    return doc, tag_version
def validate_file_format(filename, format_name, format_registry_key, format_version):
    """
    Validates the format of the given file

    The file is identified with a ``FormatIdentifier`` and each expected
    value that is truthy is compared with the identified one; falsy
    expected values are skipped.

    Args:
        filename: Path of the file to identify.
        format_name: Expected format name.
        format_registry_key: Expected format registry key.
        format_version: Expected format version.

    Returns:
        The string ``"Success"`` when every requested comparison matches.

    Raises:
        AssertionError: On the first mismatch, checked in the order name,
            version, registry key.
    """
    fid = FormatIdentifier()
    actual_format_name, actual_format_version, actual_format_registry_key = fid.identify_file_format(
        filename)

    # AssertionError is raised explicitly rather than via ``assert`` so the
    # checks are not stripped when Python runs with -O.
    if format_name and actual_format_name != format_name:
        raise AssertionError(
            "format name for %s is not valid, (%s != %s)" % (
                filename, format_name, actual_format_name))

    if format_version and actual_format_version != format_version:
        raise AssertionError(
            "format version for %s is not valid" % (filename,))

    if format_registry_key and actual_format_registry_key != format_registry_key:
        raise AssertionError(
            "format registry key for %s is not valid" % (filename,))

    return "Success"
def cli(path):
    # Identify the format of the file at ``path`` and echo the result.
    click.echo(FormatIdentifier().identify_file_format(path))