def test_fileinstance_copy_contents(app, db, dummy_location):
    """Test copy contents."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    # Create source and set data.
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), default_location=dummy_location.uri)
    db.session.commit()

    # Create destination - and use it to copy_contents from another object.
    dst = FileInstance.create()
    assert dst.size == 0
    assert dst.uri is None
    db.session.commit()

    # Copy contents
    dst.copy_contents(
        src, progress_callback=callback, default_location=dummy_location.uri)
    db.session.commit()
    assert dst.size == src.size
    assert dst.checksum == src.checksum
    assert dst.uri != src.uri
    assert counter['called']

    # Read data
    fp = dst.storage().open()
    assert data == fp.read()
    fp.close()
def test_fileinstance_copy_contents(app, db, dummy_location):
    """Test copy contents."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    # Create source and set data.
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Create destination - and use it to copy_contents from another object.
    dst = FileInstance.create()
    assert dst.size == 0
    assert dst.uri is None
    db.session.commit()

    # Copy contents
    dst.copy_contents(
        src, progress_callback=callback, location=dummy_location)
    db.session.commit()
    assert dst.size == src.size
    assert dst.checksum == src.checksum
    assert dst.uri != src.uri
    assert counter['called']

    # Read data
    fp = dst.storage().open()
    assert data == fp.read()
    fp.close()
def test_object_restore(app, db, dummy_location):
    """Restore object."""
    f1 = FileInstance(uri="f1", size=1, checksum="mychecksum")
    f2 = FileInstance(uri="f2", size=2, checksum="mychecksum2")
    db.session.add(f1)
    db.session.add(f2)
    b1 = Bucket.create()

    obj1 = ObjectVersion.create(b1, "test").set_file(f1)
    ObjectVersion.create(b1, "test").set_file(f2)
    obj_deleted = ObjectVersion.delete(b1, "test")
    db.session.commit()

    assert ObjectVersion.query.count() == 3

    # Cannot restore a deleted version.
    pytest.raises(InvalidOperationError, obj_deleted.restore)

    # Restore first version
    obj_new = obj1.restore()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert obj_new.is_head is True
    assert obj_new.version_id != obj1.version_id
    assert obj_new.key == obj1.key
    assert obj_new.file_id == obj1.file_id
    assert obj_new.bucket == obj1.bucket
def test_fileinstance_get(app, db, dummy_location):
    """Test fileinstance get."""
    f = FileInstance.create()
    db.session.commit()

    # Get existing file.
    assert FileInstance.get(f.id) is not None

    # Non-existing files return None.
    assert FileInstance.get(uuid.uuid4()) is None
def test_fileinstance_get_by_uri(app, db, dummy_location):
    """Test file get by uri."""
    f = FileInstance.create()
    f.uri = "LICENSE"
    db.session.commit()

    assert FileInstance.get_by_uri("LICENSE") is not None
    assert FileInstance.get_by_uri("NOTVALID") is None
    pytest.raises(AssertionError, FileInstance.get_by_uri, None)
def create_b2safe_file(external_pids, bucket):
    """Create a FileInstance which contains a PID in its uri."""
    validate_schema(external_pids, {
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'ePIC_PID': {'type': 'string'},
                'key': {'type': 'string'}
            },
            'additionalProperties': False,
            'required': ['ePIC_PID', 'key']
        }
    })

    keys_list = [e['key'] for e in external_pids]
    keys_set = set(keys_list)
    if len(keys_list) != len(keys_set):
        raise InvalidDepositError([
            FieldError('external_pids',
                       'Field external_pids contains duplicate keys.')
        ])

    for external_pid in external_pids:
        if not external_pid['ePIC_PID'].startswith('http://hdl.handle.net/'):
            external_pid['ePIC_PID'] = 'http://hdl.handle.net/' + \
                external_pid['ePIC_PID']
        if external_pid['key'].startswith('/'):
            raise InvalidDepositError([
                FieldError('external_pids',
                           'File key cannot start with a "/".')
            ])
        try:
            # Create the file instance if it does not already exist
            file_instance = FileInstance.get_by_uri(external_pid['ePIC_PID'])
            if file_instance is None:
                file_instance = FileInstance.create()
                file_instance.set_uri(
                    external_pid['ePIC_PID'], 1, 0, storage_class='B')
            assert file_instance.storage_class == 'B'
            # Add the file to the bucket if it is not already in it
            current_version = ObjectVersion.get(bucket, external_pid['key'])
            if not current_version or \
                    current_version.file_id != file_instance.id:
                ObjectVersion.create(bucket, external_pid['key'],
                                     file_instance.id)
        except IntegrityError:
            raise InvalidDepositError(
                [FieldError('external_pids', 'File URI already exists.')])
def _get_frames(cls, master_video):
    """Get frames."""
    return [
        FileInstance.get(f['file_id']).uri
        for f in CDSVideosFilesIterator.get_video_frames(
            master_file=master_video)
    ]
def upload(self, pid=None, *args, **kwargs):
    """Upload action for file/repository."""
    with UpdateDepositPermission(self).require(403):
        data = request.get_json()
        fileinfo = self._construct_fileinfo(data['url'], data['type'])
        if request:
            _, record = request.view_args.get('pid_value').data
            record_id = str(record.id)
            filename = fileinfo['filename']
            obj = ObjectVersion.create(
                bucket=record.files.bucket, key=filename)
            obj.file = FileInstance.create()
            record.files.flush()
            record.files[filename]['source_url'] = data['url']

            if data['type'] == 'url':
                if data['url'].startswith(
                        ('https://github', 'https://gitlab.cern.ch',
                         'root://')):
                    download_url.delay(record_id, data['url'], fileinfo)
                else:
                    raise FileUploadError(
                        'Please provide a valid file url.')
            else:
                if data['url'].startswith(
                        ('https://github', 'https://gitlab.cern.ch')):
                    download_repo.delay(record_id, data['url'], filename)
                else:
                    raise FileUploadError(
                        'Please provide a valid repository url.')
        return self
def handle_record_files(data, bucket, files, skip_files):
    """Handles record files."""
    for file in files:
        if skip_files:
            break
        assert 'uri' in file
        assert 'size' in file
        assert 'checksum' in file

        try:
            f = FileInstance.create()
            filename = file.get("uri").split('/')[-1:][0]
            f.set_uri(file.get("uri"), file.get("size"),
                      file.get("checksum"))
            obj = ObjectVersion.create(bucket, filename, _file_id=f.id)
            file.update({
                'bucket': str(obj.bucket_id),
                'checksum': obj.file.checksum,
                'key': obj.key,
                'version_id': str(obj.version_id),
            })
        except Exception as e:
            click.echo('Recid {0} file {1} could not be loaded due '
                       'to {2}.'.format(data.get('recid'), filename,
                                        str(e)))
            continue
def save_file(self, content, filename, size, failed=False):
    """Save file with given content in the deposit bucket.

    If downloading the content failed, the file is still created,
    but tagged `failed`.

    :param content: stream
    :param filename: name that the file will be saved with
    :param size: size of content
    :param failed: whether downloading the content failed
    """
    obj = ObjectVersion.create(bucket=self.files.bucket, key=filename)
    obj.file = FileInstance.create()
    self.files.flush()

    if not failed:
        self.files[filename].file.set_contents(
            content,
            default_location=self.files.bucket.location.uri,
            size=size)
        print('File {} saved ({}b).\n'.format(filename, size))
    else:
        ObjectVersionTag.create(object_version=obj, key='status',
                                value='failed')
        print('File {} not saved.\n'.format(filename))

    self.files.flush()
    db.session.commit()
    return obj
def test_b2share_storage_with_pid(base_app, app, tmp_location, login_user,
                                  test_users):
    """Check that the storage class will redirect pid files."""
    pid = 'http://hdl.handle.net/11304/74c66f0b-f814-4202-9dcb-4889ba9b1047'
    with app.app_context():
        # Disable access control for this test
        tmp_location = Location.query.first()
        with db.session.begin_nested():
            bucket = Bucket.create(tmp_location, storage_class='B')
            pid_file = FileInstance.create()
            pid_file.set_uri(pid, 1, 0, storage_class='B')
            ObjectVersion.create(bucket, 'test.txt', pid_file.id)
        db.session.commit()
        url = url_for('invenio_files_rest.object_api',
                      bucket_id=bucket.id,
                      key='test.txt')
    try:
        with app.app_context():
            permission = current_files_rest.permission_factory
            current_files_rest.permission_factory = allow_all
            # Check that accessing the file redirects to the PID
            with app.test_client() as client:
                resp = client.get(url)
                assert resp.headers['Location'] == pid
                assert resp.status_code == 302
    finally:
        with app.app_context():
            current_files_rest.permission_factory = permission
def delete_record(self, fileinstance_id, record_uuid):
    """Delete a record.

    :param fileinstance_id: The file instance id.
    :param record_uuid: The record's uuid.
    """
    # Get the FileInstance object.
    file_instance = FileInstance.get(fileinstance_id)
    # Get the uri of the file, pointing into the data directory.
    uri = file_instance.uri
    # Build the path to delete by storing the index of the folder "data".
    i = uri.find('data')

    # Remove the record indexing, the record and the file instance.
    recind = RecordIndexer()
    recind.delete_by_id(record_uuid=record_uuid)
    self.delete_bucket()
    FileInstance.query.filter_by(id=fileinstance_id).delete()
    PersistentIdentifier.query.filter_by(object_uuid=record_uuid).delete()
    db.session.commit()

    # Remove the file on disk and the folder containing it.
    # The full path is /home/<user>/.local/share/virtualenvs/
    # fare-platform-<code>/var/instance/data/<f1>/<f2>/<bucketid>/<filename>.
    # Having stored the index of the folder "data", which holds all the
    # records, the path is trimmed at "<f1>/", a folder whose name is 2
    # characters long: 8 is added to the index "i" because that is the
    # number of characters needed to complete the path up to "<f1>/".
    shutil.rmtree(uri[:i + 8])

    current_app.logger.info("Deleted file= " + self['title'] +
                            ", by user= " + current_user.email)
def test_pyfilesystemstorage(app, db, dummy_location):
    """Test pyfs storage."""
    # Create bucket and object
    with db.session.begin_nested():
        b1 = Bucket.create()
        obj = ObjectVersion.create(b1, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    counter = dict(size=0)

    def callback(total, size):
        counter['size'] = size

    data = b("this is some content")
    stream = BytesIO(data)
    loc, size, checksum = storage.save(stream, progress_callback=callback)

    # Verify checksum, size and location.
    m = hashlib.md5()
    m.update(data)
    assert "md5:{0}".format(m.hexdigest()) == checksum

    assert size == len(data)
    assert loc == join(
        dummy_location.uri,
        str(obj.file.id),
        "data")
def test_pyfilesystemstorage_make_path():
    """Test path for files."""
    fi = FileInstance.create()
    fi.id = uuid.uuid5(uuid.NAMESPACE_DNS, 'Testing-')
    fs = PyFilesystemStorage(fi, base_uri='Base')

    assert 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path()
    assert 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 1)
    assert 'Base/4/5/6/29316-6e69-5006-82ba-1ee2f18df5b2' == \
        fs.make_path(3, 1)
    assert 'Base/456/29316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 3)

    # If length is 0, it should take the default value.
    assert 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 0)
    # If dimensions are 0, it should take the default value.
    assert 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(0, 1)

    # Length of each partition is too long.
    with pytest.raises(AssertionError):
        fs.make_path(1, 50)
    # Number of partitions is too high.
    with pytest.raises(AssertionError):
        fs.make_path(50, 1)
    # Both values produce the exception.
    with pytest.raises(AssertionError):
        fs.make_path(50, 50)
def test_fileinstance_set_contents(app, db, dummy_location):
    """Test file instance set contents."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    f = FileInstance.create()
    db.session.commit()
    assert f.readable is False
    assert f.writable is True
    data = BytesIO(b("test file instance set contents"))
    f.set_contents(
        data, default_location=dummy_location.uri,
        progress_callback=callback)
    db.session.commit()
    assert f.readable is True
    assert f.writable is False
    assert counter['called']

    pytest.raises(
        ValueError,
        f.set_contents,
        BytesIO(b("different content")),
        location=dummy_location,
    )
def test_object_relink_all(app, db, dummy_location):
    """Test relinking files."""
    b1 = Bucket.create()
    obj1 = ObjectVersion.create(
        b1, "relink-test", stream=BytesIO(b('relinkthis')))
    ObjectVersion.create(b1, "do-not-touch", stream=BytesIO(b('na')))
    b1.snapshot()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert FileInstance.query.count() == 2

    fnew = FileInstance.create()
    fnew.copy_contents(obj1.file, default_location=b1.location.uri)
    db.session.commit()

    fold = obj1.file

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 2
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 0

    ObjectVersion.relink_all(obj1.file, fnew)
    db.session.commit()

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 0
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 2
def test_object_relink_all(app, db, dummy_location):
    """Test relinking files."""
    b1 = Bucket.create()
    obj1 = ObjectVersion.create(
        b1, "relink-test", stream=BytesIO(b('relinkthis')))
    ObjectVersion.create(b1, "do-not-touch", stream=BytesIO(b('na')))
    b1.snapshot()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert FileInstance.query.count() == 2

    fnew = FileInstance.create()
    fnew.copy_contents(obj1.file, location=b1.location)
    db.session.commit()

    fold = obj1.file

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 2
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 0

    ObjectVersion.relink_all(obj1.file, fnew)
    db.session.commit()

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 0
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 2
def datasets(skip_files):
    """Load demo datasets records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from cernopendata.modules.records.minters.datasetid import \
        cernopendata_datasetid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/datasets-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/datasets')
    datasets_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in datasets_json:
        click.echo('Loading datasets from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                # (TOFIX) Remove if statement in production
                # as every dataset record should have a doi
                if data.get('doi', None):
                    cernopendata_datasetid_minter(id, data)
                else:
                    cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_fileinstance_create(app, db, dummy_location):
    """Test file instance create."""
    f = FileInstance.create()
    assert f.id
    assert f.readable is False
    assert f.writable is True
    assert f.uri is None
    assert f.size == 0
    assert f.checksum is None
    assert f.last_check_at is None
    assert f.last_check is None
    db.session.commit()

    # Check unique constraint on URI with none values.
    f = FileInstance.create()
    f = FileInstance.create()
    db.session.commit()
def data_policies(skip_files):
    """Load demo Data Policy records."""
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_records_files.api import Record
    from invenio_records.models import RecordMetadata

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/data-policies-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    data_policies_json = glob.glob(os.path.join(data, '*.json'))

    for filename in data_policies_json:
        click.echo('Loading data-policies from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_storage_interface():
    """Test storage interface."""
    f = FileInstance.create()
    s = Storage(f)

    pytest.raises(NotImplementedError, s.open)
    pytest.raises(NotImplementedError, s.send_file)
    pytest.raises(NotImplementedError, s.save, None)
    pytest.raises(NotImplementedError, s.compute_checksum, None)
def test_object_set_file(app, db, dummy_location):
    """Test object set file."""
    b = Bucket.create()
    f = FileInstance(uri="f1", size=1, checksum="mychecksum")
    obj = ObjectVersion.create(b, "test").set_file(f)
    db.session.commit()
    assert obj.file == f

    assert pytest.raises(FileInstanceAlreadySetError, obj.set_file, f)
def test_pyfs_send_file_fail(app, db, dummy_location):
    """Test send file."""
    f = FileInstance.create()
    f.set_contents(BytesIO(b("test")), location=dummy_location)

    with patch('invenio_files_rest.storage.send_stream') as send_stream:
        send_stream.side_effect = OSError(errno.EPERM, "Permission problem")
        with app.test_request_context():
            pytest.raises(StorageError, f.send_file)
def test_sip_file_model(db):
    """Test the SIPFile model."""
    sip1 = SIP.create('json', '{}')
    file1 = FileInstance.create()
    sipfile1 = SIPFile(sip_id=sip1.id, filepath="foobar.zip",
                       file_id=file1.id)
    db.session.add(sipfile1)
    db.session.commit()
    assert SIP.query.count() == 1
    assert SIPFile.query.count() == 1
def test_publish_process_files(api_app, db, location):
    """Test _process_files changing master tags on bucket snapshots."""
    deposit = CDSDeposit.create(
        dict(
            date='1/2/3',
            category='cat',
            type='type',
            title=dict(title='title'),
            report_number=['1234'],
            videos=[]),
        bucket_location='videos')
    # Deposit has no files, so _process_files must yield None.
    with deposit._process_files(None, dict()) as data:
        assert data is None
    bucket = deposit.files.bucket
    master_obj = ObjectVersion.create(
        bucket=bucket,
        key='master',
        _file_id=FileInstance.create())
    number_of_slaves = 10
    for i in range(number_of_slaves):
        slave_obj = ObjectVersion.create(
            bucket=bucket,
            key='slave{}.mp4'.format(i + 1),
            _file_id=FileInstance.create())
        ObjectVersionTag.create(slave_obj, 'master', master_obj.version_id)
        ObjectVersionTag.create(slave_obj, 'media_type', 'video')
        ObjectVersionTag.create(slave_obj, 'context_type', 'subformat')
    assert Bucket.query.count() == 1
    with deposit._process_files(None, dict()):
        # The snapshot bucket must have been created.
        assert Bucket.query.count() == 2
        for bucket in Bucket.query.all():
            master_version = [
                str(obj.version_id) for obj in bucket.objects
                if 'master' not in obj.get_tags()][0]
            # The master of each slave must be in the same bucket.
            for obj in bucket.objects:
                if str(obj.version_id) != master_version:
                    assert obj.get_tags()['master'] == master_version
                    assert obj.get_tags()['media_type'] == 'video'
                    assert obj.get_tags()['context_type'] == 'subformat'
def test_fileinstance_copy_contents_invalid(app, db, dummy_location):
    """Test invalid copy contents."""
    # Source not readable
    src = FileInstance.create()
    dst = FileInstance.create()
    pytest.raises(ValueError, dst.copy_contents, src)

    # Create valid source
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Destination not writable
    dst.writable = False
    pytest.raises(ValueError, dst.copy_contents, src)

    # Size is not 0
    dst.writable = True
    dst.size = 1
    pytest.raises(ValueError, dst.copy_contents, src)
def test_fileinstance_copy_contents_invalid(app, db, dummy_location):
    """Test invalid copy contents."""
    # Source not readable
    src = FileInstance.create()
    dst = FileInstance.create()
    pytest.raises(ValueError, dst.copy_contents, src)

    # Create valid source
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), default_location=dummy_location.uri)
    db.session.commit()

    # Destination not writable
    dst.writable = False
    pytest.raises(ValueError, dst.copy_contents, src)

    # Size is not 0
    dst.writable = True
    dst.size = 1
    pytest.raises(ValueError, dst.copy_contents, src)
def test_pyfilesystemstorage_checksum_fail(app, db, dummy_location):
    """Test fixity problems."""
    # Raise an error during checksum calculation
    def callback(total, size):
        raise OSError(errno.EPERM, "Permission")

    f = FileInstance.create()
    f.set_contents(BytesIO(b("test")), location=dummy_location)

    pytest.raises(
        StorageError,
        PyFilesystemStorage(f).compute_checksum,
        progress_callback=callback)
def _verify_file_and_symlink(record, file_id, filename):
    # verify uploaded file exists
    uploaded_file = FileInstance.get(file_id)
    uploaded_file_path = uploaded_file.uri
    assert os.path.exists(uploaded_file_path)
    # verify symlink exists
    symlink_path = _get_symlink_path(record, filename)
    assert os.path.exists(symlink_path)
    # verify symlink points to the correct file
    assert os.path.realpath(symlink_path) == os.path.realpath(
        uploaded_file_path)
def test_object_version_tags(app, db, dummy_location):
    """Test object version tags."""
    f = FileInstance(uri="f1", size=1, checksum="mychecksum")
    db.session.add(f)
    db.session.commit()
    b = Bucket.create()
    obj1 = ObjectVersion.create(b, "test").set_file(f)
    ObjectVersionTag.create(obj1, "mykey", "testvalue")
    ObjectVersionTag.create(obj1, "another_key", "another value")
    db.session.commit()

    # Duplicate key
    pytest.raises(
        IntegrityError, ObjectVersionTag.create, obj1, "mykey", "newvalue")

    # Test get
    assert ObjectVersionTag.query.count() == 2
    assert ObjectVersionTag.get(obj1, "mykey").value == "testvalue"
    assert ObjectVersionTag.get_value(obj1.version_id, "another_key") \
        == "another value"
    assert ObjectVersionTag.get_value(obj1, "invalid") is None

    # Test delete
    ObjectVersionTag.delete(obj1, "mykey")
    assert ObjectVersionTag.query.count() == 1
    ObjectVersionTag.delete(obj1, "invalid")
    assert ObjectVersionTag.query.count() == 1

    # Create or update
    ObjectVersionTag.create_or_update(obj1, "another_key", "newval")
    ObjectVersionTag.create_or_update(obj1.version_id, "newkey", "testval")
    db.session.commit()
    assert ObjectVersionTag.get_value(obj1, "another_key") == "newval"
    assert ObjectVersionTag.get_value(obj1, "newkey") == "testval"

    # Get tags as dictionary
    assert obj1.get_tags() == dict(another_key="newval", newkey="testval")
    obj2 = ObjectVersion.create(b, 'test2')
    assert obj2.get_tags() == dict()

    # Copy object version
    obj_copy = obj1.copy()
    db.session.commit()
    assert obj_copy.get_tags() == dict(another_key="newval",
                                       newkey="testval")
    assert ObjectVersionTag.query.count() == 4

    # Cascade delete
    ObjectVersion.query.delete()
    db.session.commit()
    assert ObjectVersionTag.query.count() == 0
def create_file(self, bucket, f):
    """Create a single file."""
    # Ensure that the file instance gets created with the same ID as is
    # used in the REST API.
    fileinstance = FileInstance(
        id=f['id'],
        writable=True,
        readable=False,
        size=0,
    )
    db.session.add(fileinstance)
    fileinstance.set_uri(f['uri'], f['size'], f['checksum'])
    obj = ObjectVersion.create(bucket, f['key']).set_file(fileinstance)

    return (dict(
        bucket=str(obj.bucket.id),
        key=obj.key,
        checksum=obj.file.checksum,
        size=obj.file.size,
        version_id=str(obj.version_id),
        type=f['type'],
    ), fileinstance)
def test_fileinstance_send_file(app, db, dummy_location):
    """Test file instance send file."""
    f = FileInstance.create()
    # File not readable
    pytest.raises(FileInstanceUnreadableError, f.send_file)

    # Write data
    data = b("test file instance set contents")
    f.set_contents(BytesIO(data), default_location=dummy_location.uri)
    db.session.commit()

    # Send data
    with app.test_request_context():
        res = f.send_file('test.txt')
        assert int(res.headers['Content-Length']) == len(data)
def test_fileinstance_send_file(app, db, dummy_location):
    """Test file instance send file."""
    f = FileInstance.create()
    # File not readable
    pytest.raises(ValueError, f.send_file)

    # Write data
    data = b("test file instance set contents")
    f.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Send data
    with app.test_request_context():
        res = f.send_file()
        assert int(res.headers['Content-Length']) == len(data)
def software(skip_files):
    """Load demo software records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.softid import \
        cernopendata_softid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/software-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/software')
    software_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in software_json:
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_softid_minter(id, data)
                record = Record.create(data, id_=id)
                record['$schema'] = schema

                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def create_file(self, bucket, key, file_versions):
    """Create a single file with all versions."""
    objs = []
    for file_ver in file_versions:
        f = FileInstance.create().set_uri(
            file_ver['full_path'],
            file_ver['size'],
            'md5:{0}'.format(file_ver['checksum']),
        )
        obj = ObjectVersion.create(bucket, key).set_file(f)
        obj.created = arrow.get(
            file_ver['creation_date']).datetime.replace(tzinfo=None)
        objs.append(obj)

    # Set head version
    db.session.commit()
    return objs[-1]
def loaddemofiles(source, force=False):
    """Load demo files."""
    s = stat(source)

    with open(source, 'rb') as fp:
        m = hashlib.md5()
        m.update(fp.read())
        checksum = "md5:{0}".format(m.hexdigest())

    # Create a file instance
    with db.session.begin_nested():
        f = FileInstance.create()
        f.set_uri(source, s.st_size, checksum)

    # Replace all objects' associated files.
    ObjectVersion.query.update({ObjectVersion.file_id: str(f.id)})
    db.session.commit()
def delete_file_instance(obj: ObjectVersion):
    """Delete file on filesystem and mark as not readable."""
    current_app.logger.debug(f"Delete file instance: {str(obj)}")
    if obj.file_id:
        f = FileInstance.get(str(obj.file_id))  # type: FileInstance
        is_readable = f.readable
        # Mark file not readable
        f.readable = False
        # Remove the file on disk
        if is_readable:
            f.storage().delete()
        db.session.commit()
def test_transfer_rsync(app, db, location):
    """Test factories.transfer_rsync function."""
    # config
    app.config['SIPSTORE_ARCHIVER_DIRECTORY_BUILDER'] = \
        'helpers:archive_directory_builder'
    app.config['SIPSTORE_ARCHIVER_METADATA_TYPES'] = ['test']
    # SIP
    sip = SIP.create()
    # SIPMetadataType
    mtype = SIPMetadataType(title='Test', name='test', format='json')
    db.session.add(mtype)
    # SIPMetadata
    mcontent = {'title': 'title', 'author': 'me'}
    meth = SIPMetadata(sip=sip, type=mtype, content=json.dumps(mcontent))
    db.session.add(meth)
    # SIPFile
    f = FileInstance.create()
    fcontent = b'weighted companion cube\n'
    f.set_contents(BytesIO(fcontent), default_location=location.uri)
    sfile = SIPFile(sip=sip, file=f, filepath='portal.txt')
    db.session.add(sfile)
    db.session.commit()

    # EXPORT
    folder = path.join(location.uri, 'lulz')
    params = {
        'server': '',
        'user': '',
        'destination': folder,
        'args': '-az'
    }
    factories.transfer_rsync(sip.id, params)

    # TEST
    assert not path.exists(path.join(location.uri, 'test'))
    assert path.isdir(folder)
    assert path.isdir(path.join(folder, 'files'))
    assert path.isfile(path.join(folder, 'files', 'portal.txt'))
    assert path.isdir(path.join(folder, 'metadata'))
    assert path.isfile(path.join(folder, 'metadata', 'test.json'))
    with open(path.join(folder, 'files', 'portal.txt'), 'rb') as fp:
        assert fp.read() == fcontent
    with open(path.join(folder, 'metadata', 'test.json'), 'r') as fp:
        assert json.loads(fp.read()) == mcontent
def process_x_cap_files(record, x_cap_files):
    """Process files, update record."""
    result = []
    old_keys = set(record.files.keys)
    used_keys = set()

    # Download new files.
    urls = {
        error.url
        for error in x_cap_files
        if error.condition and error.url
    }
    for url in urls:
        if url not in record.files:
            result.append(url)
            obj = ObjectVersion.create(bucket=record.files.bucket, key=url)
            obj.file = FileInstance.create()
            record.files.flush()
            record.files[url]['source_url'] = url

    # Update file key for external URLs.
    for error in x_cap_files:
        if error.url:
            error.update_file_key(error.url)

    # Calculate references.
    keyfunc = attrgetter('file_key')
    for key, errors in groupby(sorted(x_cap_files, key=keyfunc), keyfunc):
        if key is None:
            continue
        refs = extract_refs_from_errors(errors)
        if refs:
            used_keys.add(key)
            record.files[key]['refs'] = refs

    for key in old_keys - used_keys:
        record.files[key]['refs'] = []

    return result
def test_pyfilesystemstorage(app, db, dummy_location):
    """Test pyfs storage."""
    # Create bucket and object
    with db.session.begin_nested():
        b1 = Bucket.create()
        obj = ObjectVersion.create(b1, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    counter = dict(size=0)

    def callback(total, size):
        counter['size'] = size

    def test_file_save(data, **kwargs):
        stream = BytesIO(data)
        loc, size, checksum = storage.save(
            stream, progress_callback=callback, **kwargs)

        # Verify checksum, size and location.
        m = hashlib.md5()
        m.update(data)
        assert "md5:{0}".format(m.hexdigest()) == checksum

        assert size == len(data)
        assert loc == join(
            dummy_location.uri,
            str(obj.file.id)[0:2],
            str(obj.file.id)[2:],
            'data')

    data = b("this is some content")
    # Test without size
    test_file_save(data)
    # Test with correct size
    test_file_save(data, size=len(data))
    # Test with wrong sizes
    with pytest.raises(UnexpectedFileSizeError):
        test_file_save(data, size=len(data) - 1)
    with pytest.raises(UnexpectedFileSizeError):
        test_file_save(data, size=len(data) + 1)
def upload_to_zenodo(bucket_id, filename):
    """Upload code to zenodo."""
    zenodo_server_url = current_app.config.get('ZENODO_SERVER_URL')
    params = {"access_token": current_app.config.get('ZENODO_ACCESS_TOKEN')}
    filename = filename + '.tar.gz'

    r = requests.post(zenodo_server_url, params=params, json={})

    file_obj = ObjectVersion.get(bucket_id, filename)
    file = FileInstance.get(file_obj.file_id)

    bucket_url = r.json()['links']['bucket']
    with open(file.uri, 'rb') as fp:
        response = requests.put(
            bucket_url + '/{}'.format(filename),
            data=fp,
            params=params,
        )

    return jsonify({"status": response.status_code})
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs."""
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db
    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])
    recid = None

    if sips:
        recids = [int(sip['metadata']['recid']) for sip in sips]
        if len(set(recids)) > 1:
            logger.error('Multiple recids ({recids}) found in deposit '
                         '{depid}.'.format(recids=recids,
                                           depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value, list(set(recids)))
        elif recids:
            # If only one recid
            recid = recids[0]

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()

    for file_ in files:
        fi = FileInstance.create()
        fi.set_uri(file_['path'], file_['size'], file_['checksum'])
        ov = ObjectVersion.create(buc, file_['name'], _file_id=fi.id)
        file_meta = dict(
            bucket=str(buc.id),
            key=file_['name'],
            checksum=file_['checksum'],
            size=file_['size'],
            version_id=str(ov.version_id),
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    for idx, sip in enumerate(sips):
        agent = None
        user_id = None
        if sip['agents']:
            agent = dict(
                ip_address=sip['agents'][0].get('ip_address', ""),
                email=sip['agents'][0].get('email_address', ""),
            )
            user_id = sip['agents'][0]['user_id']
        content = sip['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format, content, user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exist.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            raise DepositSIPUserDoesNotExist(dep_pid.pid_value, user_id)

        # If recid was found, attach it to the SIP.
        # TODO: This always uses the first recid, as we quit if multiple
        # recids are found in the sips information.
        if recid:
            try:
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=recid)
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'deposit {depid} does not exist.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                raise DepositRecidDoesNotExist(dep_pid.pid_value, recid)
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    db.session.commit()
    return deposit
def sips(db, locations, sip_metadata_types):
    """Fixture for SIP objects sharing multiple files.

    The SIPs share six files in the following way:

        SIP-1: File1
        SIP-2: File1, File2
        SIP-3: File2 (renamed on SIPFile, but same FileInstance), File3
        SIP-4: File4, File5, File6
        SIP-5: no files (metadata-only changes)
    """
    # A SIP with agent info
    sip1 = SIP.create(agent={
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1'
    })
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached, should not be archived
    # (see conftest configuration).
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = FileInstance.create()
    file1.set_contents(BytesIO(b('test')),
                       default_location=locations['default'].uri)
    sip1file1 = SIPFile(sip_id=sip1.id, filepath="foobar.txt",
                        file_id=file1.id)
    db_.session.add(sip1file1)

    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = FileInstance.create()
    file2.set_contents(BytesIO(b'test-second'),
                       default_location=locations['default'].uri)
    sip2file1 = SIPFile(sip_id=sip2.id, filepath="foobar.txt",
                        file_id=file1.id)
    sip2file2 = SIPFile(sip_id=sip2.id, filepath="foobar2.txt",
                        file_id=file2.id)
    db_.session.add(sip2file1)
    db_.session.add(sip2file2)

    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = FileInstance.create()
    file3.set_contents(BytesIO(b'test-third'),
                       default_location=locations['default'].uri)
    sip3file2 = SIPFile(sip_id=sip3.id, filepath="foobar2-renamed.txt",
                        file_id=file2.id)
    sip3file3 = SIPFile(sip_id=sip3.id, filepath="foobar3.txt",
                        file_id=file3.id)
    db_.session.add(sip3file2)
    db_.session.add(sip3file3)

    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = FileInstance.create()
    file4.set_contents(BytesIO('test-fourth żółć'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file5 = FileInstance.create()
    file5.set_contents(BytesIO('test-fifth ąęćźə'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file6 = FileInstance.create()
    file6.set_contents(BytesIO('test-sixth π'.encode('utf-8')),
                       default_location=locations['default'].uri)
    sip5file4 = SIPFile(sip_id=sip4.id, filepath="../../foobar.txt",
                        file_id=file4.id)
    sip5file5 = SIPFile(sip_id=sip4.id,
                        filepath="http://maliciouswebsite.com/hack.js",
                        file_id=file5.id)
    sip5file6 = SIPFile(sip_id=sip4.id, filepath="łóżźćąę.dat",
                        file_id=file6.id)
    db_.session.add(sip5file4)
    db_.session.add(sip5file5)
    db_.session.add(sip5file6)

    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]
def test_fileinstance_validation(app, db, dummy_location):
    """Test validating the FileInstance."""
    f = FileInstance.create()
    f.set_uri('x' * 255, 1000, 1000)  # Should not raise
    pytest.raises(ValueError, f.set_uri, 'x' * 256, 1000, 1000)