def upload_asset_to_path_hook(instance, filename=None):
    '''Compute the multihash of an uploaded asset file and return its S3 upload path

    The file is read chunk by chunk to build a sha2-256 digest. The resulting multihash is
    stored on the instance (checksum_multihash) and the plain hex digest is attached to the
    file storage as a temporary attribute.

    Args:
        instance: Asset
            Asset instance
        filename: string
            file name of the uploaded asset (only used in the log messages)

    Returns:
        Asset file path to use on S3
    '''
    logger.debug('Start computing asset file %s multihash', filename)
    started_at = time.time()

    # Hash the file incrementally so it never has to be held in memory as a whole
    hasher = hashlib.sha256()
    for data in instance.file.chunks(settings.UPLOAD_FILE_CHUNK_SIZE):
        hasher.update(data)

    encoded = multihash.encode(hasher.digest(), 'sha2-256')
    mhash = multihash.to_hex_string(encoded)

    # set the hash to the storage to use it for upload signing, this temporary attribute is
    # then used by storages.S3Storage to set the MetaData.sha256
    storage = instance.file.storage
    setattr(storage, '_tmp_sha256', hasher.hexdigest())

    elapsed = time.time() - started_at
    logger.debug(
        'Set uploaded file %s multihash %s to checksum:multihash; computation done in %ss',
        filename,
        mhash,
        elapsed
    )

    instance.checksum_multihash = mhash
    return get_asset_path(instance.item, instance.name)
def get_sha256_multihash(content):
    '''Compute the sha2-256 multihash of some bytes

    Args:
        content: bytes
            content to hash

    Returns:
        hex string of the sha2-256 multihash of content
    '''
    sha256_digest = hashlib.sha256(content).digest()
    encoded = multihash.encode(sha256_digest, 'sha2-256')
    return multihash.to_hex_string(encoded)
def decode(value):
    """
    Decode HEX multi hash.

    A CID is built from ``value``; the multihash it carries is decoded and
    the digest part is returned as a hex string.

    :param bytes value: an encoded content
    :return: the decoded content
    :rtype: str
    """
    content_id = make_cid(value)
    decoded = multihash.decode(content_id.multihash)
    return multihash.to_hex_string(decoded.digest)
def create_multihash_string(digest, hash_code):
    '''Build the hex string multihash of an already computed digest

    Args:
        digest: string
            digest to wrap into a multihash
        hash_code: string | int
            hash code (sha2-256, md5, ...)

    Returns:
        string
            multihash string
    '''
    encoded = multihash.encode(digest, hash_code)
    return multihash.to_hex_string(encoded)
def _validate_asset_file_checksum(href, expected_multihash, asset_multihash):
    '''Validate the multihash of an asset file against the expected checksum:multihash

    Args:
        href: string
            asset href, only used in the log and error messages
        expected_multihash: string
            hex string multihash to compare against (checksum:multihash attribute)
        asset_multihash: decoded multihash
            object exposing the `name` and `digest` attributes read below

    Raises:
        serializers.ValidationError: when the hash function names differ, or when the
            two decoded multihashes are not equal
    '''
    expected_multihash = multihash.decode(from_hex_string(expected_multihash))

    # The hex digests are needed by every message below, compute them only once
    asset_digest = to_hex_string(asset_multihash.digest)
    expected_digest = to_hex_string(expected_multihash.digest)

    logger.debug(
        'Validate asset file checksum at %s with multihash %s/%s (from headers), expected %s/%s '
        '(from checksum:multihash attribute)',
        href,
        asset_digest,
        asset_multihash.name,
        expected_digest,
        expected_multihash.name
    )

    # NOTE(review): assumes `_` is Django's gettext/gettext_lazy alias - confirm. The messages
    # use named placeholders interpolated AFTER the translation lookup: an f-string would be
    # evaluated before the lookup, so the message id would never match a catalog entry.
    if asset_multihash.name != expected_multihash.name:
        logger.error(
            'Asset at href %s, with multihash name=%s digest=%s, doesn\'t match the expected '
            'multihash name=%s digest=%s defined in checksum:multihash attribute',
            href,
            asset_multihash.name,
            asset_digest,
            expected_multihash.name,
            expected_digest
        )
        raise serializers.ValidationError({
            'href': _(
                "Asset at href %(href)s has a %(asset_name)s multihash while a "
                "%(expected_name)s multihash is defined in the checksum:multihash "
                "attribute"
            ) % {
                'href': href,
                'asset_name': asset_multihash.name,
                'expected_name': expected_multihash.name,
            }
        })

    if asset_multihash != expected_multihash:
        logger.error(
            'Asset at href %s, with multihash name=%s digest=%s, doesn\'t match the '
            'checksum:multihash value name=%s digest=%s',
            href,
            asset_multihash.name,
            asset_digest,
            expected_multihash.name,
            expected_digest
        )
        raise serializers.ValidationError({
            'href': _(
                "Asset at href %(href)s with %(asset_name)s hash "
                "%(asset_digest)s doesn't match the "
                "checksum:multihash %(expected_digest)s"
            ) % {
                'href': href,
                'asset_name': asset_multihash.name,
                'asset_digest': asset_digest,
                'expected_digest': expected_digest,
            }
        })
def encode(codec, value):
    """
    Encode a content hash.

    The value is encoded by the profile looked up for ``codec``, prefixed
    with the codec and returned as a hex string.

    :param str codec: a codec of a content hash
    :param str value: a value of a content hash
    :return: the resulting content hash
    :rtype: str
    """
    encoded = get_profile(codec).encode(value)
    prefixed = multicodec.add_prefix(codec, encoded)
    return multihash.to_hex_string(prefixed)
def multihash_checksum_sha256(file_path: Union[str, BytesIO]):
    """Generate the checksum multihash.

    This method follows the spec `multihash <https://github.com/multiformats/multihash>`_.
    The digest comes from ``check_sum`` and is encoded with the multihash code `0x12`
    (`sha256`) and the length `0x20` (32).

    See more in https://github.com/multiformats/py-multihash/blob/master/multihash/constants.py#L4

    Args:
        file_path (str|BytesIo): Path to the file

    Returns:
        A string-like hash in hexadecimal
    """
    sha2_256_code = 0x12
    digest_length = 0x20

    encoded = _multihash.encode(
        digest=check_sum(file_path),
        code=sha2_256_code,
        length=digest_length,
    )
    return _multihash.to_hex_string(encoded)
def create_multipart_upload(self, key, asset, checksum_multihash):
    '''Create a multi part upload on the backend

    The digest of the checksum multihash is sent as `sha256` metadata of the upload.

    Args:
        key: string
            key on the S3 backend for which we want to create a multipart upload
        asset: Asset
            Asset metadata model associated with the S3 backend key
        checksum_multihash: string
            Checksum multihash (must be sha256) of the future file to be uploaded

    Returns: string
        Upload Id of the created multipart upload
    '''
    parsed_checksum = parse_multihash(checksum_multihash)
    sha256 = to_hex_string(parsed_checksum.digest)
    cache_control = f'public, max-age={settings.STORAGE_ASSETS_CACHE_SECONDS}'
    log_extra = {
        'collection': asset.item.collection.name,
        'item': asset.item.name,
        'asset': asset.name
    }

    response = self.call_s3_api(
        self.s3.create_multipart_upload,
        Bucket=settings.AWS_STORAGE_BUCKET_NAME,
        Key=key,
        Metadata={'sha256': sha256},
        CacheControl=cache_control,
        ContentType=asset.media_type,
        log_extra=log_extra
    )

    upload_id = response['UploadId']
    logger.info(
        'S3 Multipart upload successfully created: upload_id=%s',
        upload_id,
        extra={
            's3_response': response,
            'upload_id': upload_id,
            'asset': asset.name
        }
    )
    return upload_id
def test_to_hex_string_invalid_type(self, value):
    """ to_hex_string: a TypeError with the expected message is raised for invalid types """
    expected_message = 'multihash should be bytes'
    with pytest.raises(TypeError) as raised:
        to_hex_string(value)
    # The exception info is only populated once the context manager has exited
    assert expected_message in str(raised.value)
def test_to_hex_string_valid(self, value):
    """ to_hex_string: the result equals the hexlified multihash for all valid cases """
    hash_code = value['encoding']['code']
    digest = bytes.fromhex(value['hex'])
    encoded = encode(digest, hash_code)
    assert to_hex_string(encoded) == hexlify(encoded).decode()
import hashlib

import multihash

from topo_processor.file_system.get_fs import get_fs

CHUNK_SIZE = 1024 * 1024  # 1MB


async def multihash_as_hex(path: str) -> str:
    """Return the sha2-256 multihash of the file at ``path`` as a hex string.

    The file is opened through the file system returned by ``get_fs(path)``
    and hashed in reads of at most ``CHUNK_SIZE`` bytes.

    NOTE: the body contains no ``await``; the reads are plain blocking calls.
    """
    sha256 = hashlib.sha256()
    with get_fs(path).open(path, "rb") as stream:
        while True:
            block = stream.read(CHUNK_SIZE)
            if not block:
                # A falsy read result marks the end of the file
                break
            sha256.update(block)
    encoded = multihash.encode(sha256.digest(), "sha2-256")
    return multihash.to_hex_string(encoded)
def test_invalid_to_hex_string(self):
    # In actual fact, type checking should catch this for us: a str is passed
    # where the function is exercised with bytes elsewhere.
    not_bytes = 'hello world'
    with pytest.raises(TypeError):
        multihash.to_hex_string(not_bytes)
def test_valid_to_hex_string(self, valid):
    # Every valid case must round-trip: the hex string of the encoded
    # multihash equals the hexlified buffer.
    for fixture in valid:
        hash_code = fixture['encoding']['code']
        digest = bytes.fromhex(fixture['hex'])
        encoded = multihash.encode(digest, hash_code)
        expected = hexlify(encoded).decode()
        assert multihash.to_hex_string(encoded) == expected