def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')

    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s', src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug(' generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')

        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }
        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {
                '_id': f['container_id'],
                f['prefix'] + '.name': f['fileinfo']['name'],
                f['prefix'] + '.hash': f['fileinfo']['hash']
            },
            {'$set': update_set})

        if not updated_doc:
            log.info('Probably the following file has been updated during the migration '
                     'and its hash is changed, cleaning up from the new filesystem')
            target_fs.remove_file(file_id, f_new_path)
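# NOTE: buffer_copy and CHUNK_SIZE are referenced above but not defined in this
# excerpt. A minimal sketch, under the assumption that buffer_copy simply streams
# fixed-size chunks between two already-open file objects (names and chunk size
# here are illustrative, not from the original code):
CHUNK_SIZE = 2 ** 20  # assumed 1 MiB chunks


def buffer_copy(src_fileobj, dst_fileobj, chunk_size=CHUNK_SIZE):
    """Hypothetical helper: copy src_fileobj to dst_fileobj in chunk_size pieces."""
    while True:
        chunk = src_fileobj.read(chunk_size)
        if not chunk:
            break
        dst_fileobj.write(chunk)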
def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form):
    # Create a project
    session_id = data_builder.create_session()

    files = []

    # Create a UUID file
    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))
    file_info = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]
    file_id_1 = file_info['_id']
    url_1 = '/sessions/' + session_id + '/files/' + file_name_1
    files.append((session_id, file_name_1, url_1, util.path_from_uuid(file_id_1),
                  str(file_info['provider_id']), file_id_1))

    yield files

    # Clean up, get the files
    files = api_db['sessions'].find_one({'_id': ObjectId(session_id)})['files']
    # Delete the files, but the session still exists in the DB with now-missing data
    for f in files:
        try:
            source_fs = get_provider(f['provider_id']).storage_plugin
            source_fs.remove_file(f['_id'], None)
        except:
            pass
def legacy_cas_file(as_admin, api_db, data_builder, randstr, file_form):
    """Yield legacy CAS file"""
    project = data_builder.create_project()
    file_name = '%s.csv' % randstr()
    file_content = randstr()
    as_admin.post('/projects/' + project + '/files',
                  files=file_form((file_name, file_content)))

    file_info = api_db['projects'].find_one({'files.name': file_name})['files'][0]
    file_id = file_info['_id']
    file_hash = file_info['hash']

    # verify cas backward compatibility
    api_db['projects'].find_one_and_update(
        {'files.name': file_name},
        {'$unset': {'files.$._id': ''}}
    )

    file_path = unicode(util.path_from_hash(file_hash))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.exists(target_dir):
        config.local_fs.makedirs(target_dir)
    fs.move.move_file(src_fs=config.fs, src_path=util.path_from_uuid(file_id),
                      dst_fs=config.local_fs, dst_path=file_path)

    yield (project, file_name, file_content)

    # clean up
    config.local_fs.remove(file_path)
    config.local_fs.removetree(target_dir)
    api_db['projects'].delete_one({'_id': project})
def move_file(src_storage, src_id, dst_storage, dst_path):
    src_path = util.path_from_uuid(src_id)
    target_dir = fs.path.dirname(dst_path)
    with src_storage.storage_plugin.open(src_id, src_path, 'rb') as src_fp, \
         dst_storage.storage_plugin.open(None, dst_path, 'wb') as dst_fp:
        shutil.copyfileobj(src_fp, dst_fp)
    src_storage.storage_plugin.remove_file(src_id, src_path)
def migrate_file(f):
    file_id = f['fileinfo'].get('_id', '')

    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s', src_fs)

            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.move.copy_file(src_fs=src_fs, src_path=file_path,
                              dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        log.debug(' generated uuid: %s', file_id)
        f_old_path = util.path_from_hash(f['fileinfo']['hash'])
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')

        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.move.copy_file(src_fs=local_fs, src_path=f_old_path,
                          dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            f['prefix'] + '.$.modified': datetime.datetime.utcnow(),
            f['prefix'] + '.$._id': file_id
        }
        # Update the file with the newly generated UUID
        updated_doc = db[f['container']].find_one_and_update(
            {'_id': f['container_id'],
             f['prefix'] + '.name': f['fileinfo']['name'],
             f['prefix'] + '.hash': f['fileinfo']['hash']},
            {'$set': update_set}
        )

        if not updated_doc:
            log.info('Probably the following file has been updated during the migration '
                     'and its hash is changed, cleaning up from the new filesystem')
            target_fs.remove(f_new_path)
def move_file(src_id, dst_storage, dst_path):
    dst_fs = dst_storage.get_fs()
    src_path = util.path_from_uuid(src_id)
    target_dir = fs.path.dirname(dst_path)
    if not dst_fs.exists(target_dir):
        dst_fs.makedirs(target_dir)
    with config.primary_storage.open(src_id, src_path, 'rb') as src_fp, \
         dst_fs.open(dst_path, 'wb') as dst_fp:
        shutil.copyfileobj(src_fp, dst_fp)
    config.primary_storage.remove_file(src_id, src_path)
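# Hypothetical usage of the move_file helper above, mirroring how the fixtures
# below call it: the file is streamed from config.primary_storage to the
# destination storage's filesystem and then removed from primary storage.
# `some_file_id` and `some_file_hash` are illustrative names, not from the
# original code:
#
#   move_file(some_file_id, config.local_fs,
#             unicode(util.path_from_hash(some_file_hash)))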
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')

    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.get_file_info(file_id, file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s', src_fs)

            old_file = src_fs.open(file_id, file_path, 'rb')
            new_file = target_fs.open(file_id, file_path, 'wb')
            buffer_copy(old_file, new_file, CHUNK_SIZE)
            old_file.close()
            new_file.close()
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')

        old_file = local_fs.open(None, f_old_path, 'rb')
        new_file = target_fs.open(file_id, f_new_path, 'wb')
        buffer_copy(old_file, new_file, CHUNK_SIZE)
        old_file.close()
        new_file.close()

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }
        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update({'_id': f['gear_id']}, {'$set': update_set})
def test_file_replaced_handling(files_to_migrate, migrate_storage, as_admin,
                                file_form, api_db, mocker, caplog):
    origin_find_one_and_update = pymongo.collection.Collection.find_one_and_update

    def mocked(*args, **kwargs):
        self = args[0]
        filter = args[1]
        update = args[2]

        as_admin.post('/sessions/' + session_id + '/files',
                      files=file_form((file_name_1, 'new_content')))

        return origin_find_one_and_update(self, filter, update)

    with mocker.mock_module.patch.object(pymongo.collection.Collection,
                                         'find_one_and_update', mocked):
        # get file stored by hash in legacy storage
        (session_id, file_name_1, url_1, file_path_1) = files_to_migrate[0]
        # get file stored by uuid in legacy storage
        (_, file_name_2, url_2, file_path_2) = files_to_migrate[1]

        # run the migration
        migrate_storage.main('--containers')

        file_1_id = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]['_id']
        file_2_id = api_db['sessions'].find_one({'files.name': file_name_2})['files'][1]['_id']

        assert config.primary_storage.get_file_info(
            file_1_id, util.path_from_uuid(file_1_id)) is not None
        assert config.primary_storage.get_file_info(
            file_2_id, util.path_from_uuid(file_2_id)) is not None

    assert any(
        log.message == 'Probably the following file has been updated during the migration '
                       'and its hash is changed, cleaning up from the new filesystem'
        for log in caplog.records)
def migrate_gear_files(f):
    file_id = f['exchange'].get('rootfs-id', '')

    if file_id:
        file_path = util.path_from_uuid(file_id)
        if not target_fs.isfile(file_path):
            log.debug(' file already has id field, just copy to target storage')
            src_fs = get_src_fs_by_file_path(file_path)
            log.debug(' file found in %s', src_fs)

            dst_dir = fs.path.dirname(file_path)
            target_fs.makedirs(dst_dir, recreate=True)
            fs.move.copy_file(src_fs=src_fs, src_path=file_path,
                              dst_fs=target_fs, dst_path=file_path)
        else:
            log.debug(' file is already present in target storage, skipping')
    else:
        file_id = str(uuid.uuid4())
        file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-')
        f_old_path = util.path_from_hash(file_hash)
        log.debug(' file old path: %s', f_old_path)
        f_new_path = util.path_from_uuid(file_id)
        log.debug(' file new path: %s', f_new_path)
        log.debug(' copy file to target storage')

        dst_dir = fs.path.dirname(f_new_path)
        target_fs.makedirs(dst_dir, recreate=True)
        fs.move.copy_file(src_fs=local_fs, src_path=f_old_path,
                          dst_fs=target_fs, dst_path=f_new_path)

        update_set = {
            'modified': datetime.datetime.utcnow(),
            'exchange.rootfs-id': file_id
        }
        # Update the gear with the newly generated UUID
        db['gears'].find_one_and_update(
            {'_id': f['gear_id']},
            {'$set': update_set}
        )
def cleanup_files(remove_all, origins, project_id, job_phi):
    log.info('Cleanup deleted container (projects, acquisitions, sessions, collections, analyses) files...')

    deleted_date_cutoff = datetime.datetime.now() - datetime.timedelta(hours=72)
    container_ids = []

    for container in cont_names:
        log.info("Cleaning up %s" % container)

        pipeline = [
            {
                "$match": {
                    "$or": [
                        {"files.deleted": {"$lte": deleted_date_cutoff}},
                        {"deleted": {"$lte": deleted_date_cutoff}}
                    ]
                }
            },
            {
                "$project": {
                    "files": {
                        "$ifNull": [
                            {
                                "$filter": {
                                    "input": "$files",
                                    "as": "item",
                                    "cond": {
                                        "$or": [
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {"$lte": ["$$item.deleted", deleted_date_cutoff]},
                                                    {"$ifNull": ["$$item.deleted", False]}
                                                ]
                                            },
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {"$lte": ["$deleted", deleted_date_cutoff]},
                                                    {"$ifNull": ["$deleted", False]}
                                                ]
                                            }
                                        ]
                                    }
                                }
                            },
                            []
                        ]
                    },
                    "deleted": 1
                }
            }
        ]

        if project_id:
            # Use the id field or parents.project field to filter results
            # instead of date of deletion
            project_filter = {
                '$or': [
                    {'_id': bson.ObjectId(project_id)},
                    {'parents.project': bson.ObjectId(project_id)}
                ]
            }
            # We don't care about time of deletion for single-project snipes
            pipeline[0]['$match'].pop('$or')
            deleted_filter = {
                '$or': [
                    {'files.deleted': {'$exists': True}},
                    {'deleted': {'$exists': True}}
                ]
            }
            pipeline[0]['$match']['$and'] = [deleted_filter, project_filter]
            pipeline[1]['$project'] = {'files': 1, 'deleted': 1}

        cursor = db.get_collection(container).aggregate(pipeline)

        job_operations = []
        job_log_operations = []
        jobs_modified = 0
        job_logs_deleted = 0

        for document in cursor:
            document_deleted = False
            if project_id and job_phi:
                # Append the container id to the list to purge jobs of phi
                container_ids.append(document['_id'])
                if document.get('deleted'):
                    # if the document is deleted, remove it from the database
                    # since it might have phi from engine uploads
                    # NOTE: we only do this if job-phi is also set so that we can, if needed,
                    # go back and delete the job phi
                    response = db.get_collection(container).delete_one({'_id': document['_id']})
                    document_deleted = response.deleted_count == 1

            for i, f in enumerate(document.get('files', [])):
                if not remove_all and f['origin']['type'] not in origins:
                    log.debug(' skipping %s/%s/%s since it was uploaded by %s',
                              container, document['_id'], f['name'], f['origin']['type'])
                    continue

                log.debug(' file marked to delete: %s, parent marked to delete: %s',
                          f.get('deleted', False), document.get('deleted', False))
                log.debug(' removing %s/%s/%s', container, document['_id'], f['name'])

                if f.get('_id'):
                    uuid_path = util.path_from_uuid(f['_id'])
                    if fs.get_file_info(f['_id'], uuid_path):
                        log.debug(' removing from %s', fs)
                        fs.remove_file(f['_id'], uuid_path)

                    if not document_deleted:
                        # only need to remove the file from the database
                        # if the document wasn't already removed
                        log.debug(' removing from database')
                        update_result = db.get_collection(container).update_one(
                            {'_id': document['_id']},
                            {'$pull': {'files': {'_id': f['_id']}}}
                        )
                        if not update_result.modified_count == 1:
                            log.error(' couldn\'t remove file from database')
                            exit(1)

            if len(container_ids) == 100:
                # Chunk job lookups to the jobs belonging to 100 containers at a time
                job_operations, job_log_operations = generate_job_operations(container_ids)
                result = execute_job_operations(job_operations, job_log_operations)
                jobs_modified += result[0]
                job_logs_deleted += result[1]
                container_ids = []

        if container_ids:
            # process the remaining (< 100) containers
            job_operations, job_log_operations = generate_job_operations(container_ids)
            result = execute_job_operations(job_operations, job_log_operations)
            jobs_modified += result[0]
            job_logs_deleted += result[1]
            container_ids = []

        log.debug('Purged phi from %s jobs, and removed %s job logs', jobs_modified, job_logs_deleted)
def test_cleanup_single_project(data_builder, default_payload, randstr, file_form,
                                as_admin, as_drone, api_db, cleanup_deleted):
    project_id = data_builder.create_project()
    session_id = data_builder.create_session()
    acquisition_id = data_builder.create_acquisition()

    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]
    file_id_1 = file_info['_id']

    # Create ad-hoc analysis
    r = as_admin.post('/sessions/' + session_id + '/analyses', json={
        'label': 'offline',
        'inputs': [{'type': 'session', 'id': session_id, 'name': file_name_1}]
    })
    assert r.ok
    analysis = r.json()['_id']

    # get the ticket
    r = as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                     params={'ticket': ''})
    assert r.ok
    ticket = r.json()['ticket']

    # download the file
    assert as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                        params={'ticket': ticket}).ok

    # run a job
    gear_doc = default_payload['gear']['gear']
    gear_doc['inputs'] = {'dicom': {'base': 'file'}}
    gear = data_builder.create_gear(gear=gear_doc)

    job_data = {
        'gear_id': gear,
        'inputs': {
            'dicom': {
                'type': 'session',
                'id': session_id,
                'name': file_name_1
            }
        },
        'config': {'two-digit multiple of ten': 20},
        'destination': {
            'type': 'acquisition',
            'id': acquisition_id
        },
        'tags': ['test-tag']
    }

    # add job with explicit destination
    r = as_admin.post('/jobs/add', json=job_data)
    assert r.ok
    job_id = r.json()['_id']

    # start job (adds logs)
    r = as_admin.get('/jobs/next')
    assert r.ok

    # prepare completion (send success status before engine upload)
    r = as_drone.post('/jobs/' + job_id + '/prepare-complete')
    assert r.ok

    # verify that job ticket has been created
    job_ticket = api_db.job_tickets.find_one({'job': job_id})
    assert job_ticket['timestamp']

    produced_metadata = {
        'project': {
            'label': 'engine project',
            'info': {'test': 'p'}
        },
        'session': {
            'label': 'engine session',
            'subject': {'code': 'engine subject', 'sex': 'male', 'age': 86400},
            'info': {'test': 's'}
        },
        'acquisition': {
            'label': 'engine acquisition',
            'timestamp': '2016-06-20T21:57:36+00:00',
            'info': {'test': 'a'},
            'files': [{
                'name': 'result.txt',
                'type': 'text',
                'info': {'test': 'f0'}
            }]
        }
    }

    # engine upload
    r = as_drone.post('/engine',
                      params={'level': 'acquisition', 'id': acquisition_id,
                              'job': job_id, 'job_ticket': job_ticket['_id']},
                      files=file_form('result.txt', meta=produced_metadata))
    assert r.ok

    # Make sure produced metadata and logs exist
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # Try cleaning undeleted project
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project', project_id, '--job-phi')

    # Make sure file is still there
    assert config.primary_storage.get_file_info(file_id_1, util.path_from_uuid(file_id_1))

    # Make sure job phi is still there
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # delete the project
    r = as_admin.delete('/projects/' + project_id)
    assert r.ok

    # Run cleanup again
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project', project_id, '--job-phi')

    # Make sure file is not there
    assert not config.primary_storage.get_file_info(file_id_1, util.path_from_uuid(file_id_1))

    # Check job phi
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert not job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert not r.json().get('logs')

    assert not api_db.projects.find_one({'_id': ObjectId(project_id)})
    assert not api_db.subjects.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.sessions.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.acquisitions.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.analyses.find_one({'parents.project': ObjectId(project_id)})
def gears_to_migrate(api_db, as_admin, randstr, file_form):
    def gen_gear_meta(gear_name):
        return {'gear': {
            "version": '0.0.1',
            "config": {},
            "name": gear_name,
            "inputs": {
                "file": {
                    "base": "file",
                    "description": "Any image."
                }
            },
            "maintainer": "Test",
            "description": "Test",
            "license": "Other",
            "author": "Test",
            "url": "http://example.example",
            "label": "Test Gear",
            "flywheel": "0",
            "source": "http://example.example"
        }}

    gears = []

    gear_name_1 = randstr()
    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()

    r = as_admin.post('/gears/temp',
                      files=file_form((file_name, file_content),
                                      meta=gen_gear_meta(gear_name_1)))
    gear_id_1 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_1)
    gear_json_1 = r.json()
    file_hash__1 = 'v0-' + gear_json_1['exchange']['rootfs-hash'].replace(':', '-')
    file_id_1 = gear_json_1['exchange']['rootfs-id']

    file_path = unicode(util.path_from_hash(file_hash__1))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs.get_fs().makedirs(target_dir)

    move_file(file_id_1, config.local_fs, file_path)

    api_db['gears'].find_one_and_update(
        {'_id': ObjectId(gear_id_1)},
        {'$unset': {'exchange.rootfs-id': ''}}
    )

    gears.append((gear_id_1, file_path))

    gear_name_2 = randstr()
    file_name = '%s.tar.gz' % randstr()
    file_content = randstr()

    r = as_admin.post('/gears/temp',
                      files=file_form((file_name, file_content),
                                      meta=gen_gear_meta(gear_name_2)))
    gear_id_2 = r.json()['_id']

    r = as_admin.get('/gears/' + gear_id_2)
    gear_json_2 = r.json()
    file_id_2 = gear_json_2['exchange']['rootfs-id']

    file_path = unicode(util.path_from_uuid(file_id_2))
    target_dir = fs.path.dirname(file_path)
    if not config.local_fs.get_fs().exists(target_dir):
        config.local_fs._fs.makedirs(target_dir)

    move_file(file_id_2, config.local_fs, file_path)

    gears.append((gear_id_2, file_path))

    yield gears

    # clean up
    gear_json_1 = api_db['gears'].find_one({'_id': ObjectId(gear_id_1)})
    gear_json_2 = api_db['gears'].find_one({'_id': ObjectId(gear_id_2)})
    files_to_delete = []
    files_to_delete.append(util.path_from_uuid(gear_json_1['exchange'].get('rootfs-id', '')))
    files_to_delete.append(util.path_from_uuid(gear_json_1['exchange'].get('rootfs-hash', '')))
    files_to_delete.append(util.path_from_uuid(gear_json_2['exchange'].get('rootfs-id', '')))

    for f_path in files_to_delete:
        try:
            config.primary_storage.remove_file(None, f_path)
        except:
            pass

    api_db['gears'].delete_one({'_id': ObjectId(gear_id_1)})
    api_db['gears'].delete_one({'_id': ObjectId(gear_id_2)})
def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form):
    # Create a project
    session_id = data_builder.create_session()

    files = []

    # Create a CAS file
    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))
    file_info = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]
    file_id_1 = file_info['_id']
    file_hash_1 = file_info['hash']
    url_1 = '/sessions/' + session_id + '/files/' + file_name_1
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$unset': {'files.$._id': ''}}
    )
    move_file_to_legacy(file_id_1, util.path_from_hash(file_hash_1))
    files.append((session_id, file_name_1, url_1, util.path_from_hash(file_hash_1)))

    # Create a UUID file
    file_name_2 = '%s.csv' % randstr()
    file_content_2 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_2, file_content_2)))
    file_info = api_db['sessions'].find_one({'files.name': file_name_2})['files'][1]
    file_id_2 = file_info['_id']
    url_2 = '/sessions/' + session_id + '/files/' + file_name_2
    move_file_to_legacy(file_id_2, util.path_from_uuid(file_id_2))
    files.append((session_id, file_name_2, url_2, util.path_from_uuid(file_id_2)))

    ### Temp fix for 3-way split storages, see api.config.local_fs2 for details
    # Create a UUID file in legacy/v1 for testing 3-way split storage
    file_name_3 = '%s.csv' % randstr()
    file_content_3 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_3, file_content_3)))
    file_info = api_db['sessions'].find_one({'files.name': file_name_3})['files'][2]
    file_id_3 = file_info['_id']
    url_3 = '/sessions/' + session_id + '/files/' + file_name_3
    move_file_to_legacy2(file_id_3, util.path_from_uuid(file_id_3))
    files.append((session_id, file_name_3, url_3, util.path_from_uuid(file_id_3)))
    ###

    yield files

    # Clean up, get the files
    files = api_db['sessions'].find_one({'_id': ObjectId(session_id)})['files']
    # Delete the files
    for f in files:
        try:
            config.primary_storage.remove_file(f['_id'], util.path_from_uuid(f['_id']))
        except:
            pass
def test_download_k(data_builder, file_form, as_admin, as_root, api_db, legacy_cas_file):
    project = data_builder.create_project(label='project1')
    session = data_builder.create_session(label='session1', project=project)
    session2 = data_builder.create_session(label='session1', project=project)
    session3 = data_builder.create_session(label='session1', project=project)
    session4 = data_builder.create_session(label='session/1', project=project)
    acquisition = data_builder.create_acquisition(session=session)
    acquisition2 = data_builder.create_acquisition(session=session2)
    acquisition3 = data_builder.create_acquisition(session=session3)
    acquisition4 = data_builder.create_acquisition(session=session4)

    # upload the same file to each container created and use different tags to
    # facilitate download filter tests:
    # acquisition: [], session: ['plus'], project: ['plus', 'minus']
    file_name = 'test.csv'
    as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv'}))
    as_admin.post('/acquisitions/' + acquisition2 + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv'}))
    as_admin.post('/acquisitions/' + acquisition3 + '/files', files=file_form(
        'test.txt', meta={'name': file_name, 'type': 'text'}))
    as_admin.post('/acquisitions/' + acquisition4 + '/files', files=file_form(
        'test.txt', meta={'name': file_name, 'type': 'text'}))
    as_admin.post('/sessions/' + session + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv', 'tags': ['plus']}))
    as_admin.post('/projects/' + project + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv', 'tags': ['plus', 'minus']}))

    # also upload a deleted file to make sure it doesn't show up
    as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form(
        file_name, meta={'name': 'deleted_' + file_name, 'type': 'csv'}))
    r = as_admin.delete('/acquisitions/' + acquisition + '/files/deleted_' + file_name)
    assert r.ok

    missing_object_id = '000000000000000000000000'

    # Try to download w/ nonexistent ticket
    r = as_admin.get('/download', params={'ticket': missing_object_id})
    assert r.status_code == 404

    # Retrieve a ticket for a batch download as superuser
    r = as_root.post('/download', json={
        'optional': False,
        'filters': [{'tags': {'-': ['minus']}}],
        'nodes': [
            {'level': 'project', '_id': project},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_root.get('/download', params={'ticket': ticket})
    assert r.ok

    # Retrieve a ticket for a batch download
    r = as_admin.post('/download', json={
        'optional': False,
        'filters': [{'tags': {'-': ['minus']}}],
        'nodes': [
            {'level': 'project', '_id': project},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify a single file in tar with correct file name
    found_second_session = False
    found_third_session = False
    found_fourth_session = False
    for tarinfo in tar:
        assert os.path.basename(tarinfo.name) == file_name
        if 'session1_0' in str(tarinfo.name):
            found_second_session = True
        if 'session1_1' in str(tarinfo.name):
            found_third_session = True
        if 'session1_2' in str(tarinfo.name):
            found_fourth_session = True
    assert found_second_session
    assert found_third_session
    assert found_fourth_session

    tar.close()

    # Download one session with many acquisitions and make sure they are in the same subject folder
    acquisition3 = data_builder.create_acquisition(session=session)
    r = as_admin.post('/acquisitions/' + acquisition3 + '/files', files=file_form(
        file_name, meta={'name': file_name, 'type': 'csv'}))
    assert r.ok

    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'acquisition', '_id': acquisition},
            {'level': 'acquisition', '_id': acquisition3},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify a single file in tar with correct file name
    found_second_session = False
    for tarinfo in tar:
        assert os.path.basename(tarinfo.name) == file_name
        if 'session1_0' in str(tarinfo.name):
            found_second_session = True
    assert not found_second_session

    tar.close()

    # Try to perform the download from a different IP
    update_result = api_db.downloads.update_one(
        {'_id': ticket},
        {'$set': {'ip': '255.255.255.255'}})
    assert update_result.modified_count == 1

    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.status_code == 400

    # Try to retrieve a ticket referencing nonexistent containers
    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'project', '_id': missing_object_id},
            {'level': 'session', '_id': missing_object_id},
            {'level': 'acquisition', '_id': missing_object_id},
        ]
    })
    assert r.status_code == 404

    # Try to retrieve ticket for bulk download w/ invalid container name
    # (not project|session|acquisition)
    r = as_admin.post('/download', params={'bulk': 'true'}, json={
        'files': [{'container_name': 'subject', 'container_id': missing_object_id,
                   'filename': 'nosuch.csv'}]
    })
    assert r.status_code == 400

    # Try to retrieve ticket for bulk download referencing nonexistent file
    r = as_admin.post('/download', params={'bulk': 'true'}, json={
        'files': [{'container_name': 'project', 'container_id': project,
                   'filename': 'nosuch.csv'}]
    })
    assert r.status_code == 404

    # Retrieve ticket for bulk download
    r = as_admin.post('/download', params={'bulk': 'true'}, json={
        'files': [{'container_name': 'project', 'container_id': project,
                   'filename': file_name}]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download using symlinks
    r = as_admin.get('/download', params={'ticket': ticket, 'symlinks': 'true'})
    assert r.ok

    # test legacy cas file handling
    (project_legacy, file_name_legacy, file_content) = legacy_cas_file
    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'project', '_id': project_legacy},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # Verify a single file in tar with correct file name
    for tarinfo in tar:
        assert os.path.basename(tarinfo.name) == file_name_legacy
    tar.close()

    # test missing file handling
    file_id = api_db.acquisitions.find_one({'_id': ObjectId(acquisition)})['files'][0]['_id']
    config.fs.remove(util.path_from_uuid(file_id))

    r = as_admin.post('/download', json={
        'optional': False,
        'nodes': [
            {'level': 'acquisition', '_id': acquisition},
            {'level': 'acquisition', '_id': acquisition3},
        ]
    })
    assert r.ok
    ticket = r.json()['ticket']

    # Perform the download
    r = as_admin.get('/download', params={'ticket': ticket})
    assert r.ok

    tar_file = cStringIO.StringIO(r.content)
    tar = tarfile.open(mode="r", fileobj=tar_file)

    # it contains two files, one of them flagged as missing
    tarinfo_list = list(tar)
    assert len(tarinfo_list) == 2
    assert len([tarinfo for tarinfo in tarinfo_list if tarinfo.name.endswith('.MISSING')]) == 1

    tar.close()
def cleanup_files(remove_all, origins):
    log.info('Cleanup deleted container (projects, acquisitions, sessions, collections, analyses) files...')

    d = datetime.datetime.now() - datetime.timedelta(hours=72)

    for container in cont_names:
        log.info("Cleaning up %s" % container)
        cursor = db.get_collection(container).aggregate([
            {
                "$match": {
                    "$or": [
                        {"files.deleted": {"$lte": d}},
                        {"deleted": {"$lte": d}}
                    ]
                }
            },
            {
                "$project": {
                    "files": {
                        "$ifNull": [
                            {
                                "$filter": {
                                    "input": "$files",
                                    "as": "item",
                                    "cond": {
                                        "$or": [
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {"$lte": ["$$item.deleted", d]},
                                                    {"$ifNull": ["$$item.deleted", False]}
                                                ]
                                            },
                                            {
                                                "$and": [
                                                    # $lte returns true if the deleted field does not exist
                                                    {"$lte": ["$deleted", d]},
                                                    {"$ifNull": ["$deleted", False]}
                                                ]
                                            }
                                        ]
                                    }
                                }
                            },
                            []
                        ]
                    },
                    "deleted": 1
                }
            }
        ])

        for document in cursor:
            for i, f in enumerate(document.get('files', [])):
                if not remove_all and f['origin']['type'] not in origins:
                    log.debug(' skipping %s/%s/%s since it was uploaded by %s',
                              container, document['_id'], f['name'], f['origin']['type'])
                    continue

                log.debug(' file marked to delete: %s, parent marked to delete: %s',
                          f.get('deleted', False), document.get('deleted', False))
                log.debug(' removing %s/%s/%s', container, document['_id'], f['name'])

                if f.get('_id'):
                    uuid_path = util.path_from_uuid(f['_id'])
                    if fs.exists(uuid_path):
                        log.debug(' removing from %s', fs)
                        fs.remove(uuid_path)

                    log.debug(' removing from database')
                    updated_doc = db.get_collection(container).update(
                        {'_id': document['_id']},
                        {'$pull': {'files': {'_id': f['_id']}}})
                    if not updated_doc['nModified']:
                        log.error(' couldn\'t remove file from database')
                        exit(1)
def test_cleanup_deleted_files(data_builder, randstr, file_form, as_admin, api_db,
                               cleanup_deleted, with_site_settings):
    project = data_builder.create_project(providers={'storage': 'deadbeefdeadbeefdeadbeef'})
    subject = data_builder.create_subject(project=project, code='deleted_files_test')
    session_id = data_builder.create_session()

    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    # get the ticket
    r = as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                     params={'ticket': ''})
    assert r.ok
    ticket = r.json()['ticket']

    # download the file
    assert as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                        params={'ticket': ticket}).ok

    # Test that the file won't be deleted if it was deleted in the last 72 hours
    d = datetime.datetime.now() - datetime.timedelta(hours=70)
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$set': {'files.$.deleted': d}}
    )

    file_info = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]
    file_id_1 = file_info['_id']

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # TODO: we will have to be sure we get the same provider when we move to multi provider support
    storage_service = StorageProviderService()
    storage = storage_service.determine_provider(None, None, force_site_provider=True)
    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1)) is not None

    # file won't be deleted after 72 hours if the origin is a user
    d = datetime.datetime.now() - datetime.timedelta(hours=73)
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$set': {'files.$.deleted': d}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1)) is not None

    # file deleted after 72 hours if the origin is not a user
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_1},
        {'$set': {'files.$.origin.type': 'device'}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # file removed from the filesystem
    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1)) is None

    # file also removed from the database
    document = api_db['sessions'].find_one({'files.name': file_name_1})
    assert document is None

    # check when the parent container is deleted
    session_id_2 = data_builder.create_session()

    # Upload two test files
    file_name_2 = '%s.csv' % randstr()
    file_content_2 = randstr()
    as_admin.post('/sessions/' + session_id_2 + '/files',
                  files=file_form((file_name_2, file_content_2)))

    file_name_3 = '%s.csv' % randstr()
    file_content_3 = randstr()
    as_admin.post('/sessions/' + session_id_2 + '/files',
                  files=file_form((file_name_3, file_content_3)))

    # Test that the files won't be deleted if the session was deleted in the last 72 hours
    d = datetime.datetime.now() - datetime.timedelta(hours=70)

    # Mark session as deleted
    api_db['sessions'].find_one_and_update(
        {'_id': ObjectId(session_id_2)},
        {'$set': {'deleted': d}}
    )

    file_info = api_db['sessions'].find_one({'files.name': file_name_2})['files'][0]
    file_id_2 = file_info['_id']

    file_info = api_db['sessions'].find_one({'files.name': file_name_3})['files'][1]
    file_id_3 = file_info['_id']

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # files still exist
    assert storage.storage_plugin.get_file_info(file_id_2, util.path_from_uuid(file_id_2)) is not None
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is not None

    # files won't be deleted after 72 hours if the origin is a user
    d = datetime.datetime.now() - datetime.timedelta(hours=73)
    api_db['sessions'].find_one_and_update(
        {'_id': ObjectId(session_id_2)},
        {'$set': {'deleted': d}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    assert storage.storage_plugin.get_file_info(file_id_2, util.path_from_uuid(file_id_2)) is not None
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is not None

    # file deleted after 72 hours if the origin is not a user
    api_db['sessions'].find_one_and_update(
        {'files.name': file_name_2},
        {'$set': {'files.$.origin.type': 'device'}}
    )

    cleanup_deleted.main('--log-level', 'DEBUG', '--reaper')

    # first file removed from the filesystem
    assert storage.storage_plugin.get_file_info(file_id_2, util.path_from_uuid(file_id_2)) is None

    # but the second file is still there
    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is not None

    # upload a file into the first session to see that it is kept when we use the --all flag,
    # while others that are marked for deletion are removed
    file_name_4 = '%s.csv' % randstr()
    file_content_4 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_4, file_content_4)))

    file_info = api_db['sessions'].find_one({'files.name': file_name_4})['files'][0]
    file_id_4 = file_info['_id']

    # with the --all flag we delete every file that is marked for deletion,
    # regardless of origin
    cleanup_deleted.main('--log-level', 'DEBUG', '--all')

    assert storage.storage_plugin.get_file_info(file_id_3, util.path_from_uuid(file_id_3)) is None

    # we keep files which are not marked
    assert storage.storage_plugin.get_file_info(file_id_4, util.path_from_uuid(file_id_4)) is not None

    # Mark the first session as deleted
    api_db['sessions'].find_one_and_update(
        {'_id': ObjectId(session_id)},
        {'$set': {'deleted': d}}
    )

    # now the fourth file will be deleted too
    cleanup_deleted.main('--log-level', 'DEBUG', '--all')

    assert storage.storage_plugin.get_file_info(file_id_4, util.path_from_uuid(file_id_4)) is None
def test_cleanup_single_project(data_builder, default_payload, randstr, file_form,
                                as_admin, as_drone, api_db, cleanup_deleted,
                                with_site_settings, site_gear):
    # Some tests are leaving partial jobs in the db that kill the tests.
    # This is a quick and dirty way to get to a clean state without filtering
    api_db.jobs.remove({})

    # Projects must have a provider for job/gear uploads to work
    storage_service = StorageProviderService()
    storage = storage_service.determine_provider(None, None, force_site_provider=True)
    project_id = data_builder.create_project(providers={'storage': str(storage.provider_id)})
    session_id = data_builder.create_session()
    acquisition_id = data_builder.create_acquisition()

    file_name_1 = '%s.csv' % randstr()
    file_content_1 = randstr()
    as_admin.post('/sessions/' + session_id + '/files',
                  files=file_form((file_name_1, file_content_1)))

    file_info = api_db['sessions'].find_one({'files.name': file_name_1})['files'][0]
    file_id_1 = file_info['_id']

    # Create ad-hoc analysis
    r = as_admin.post('/sessions/' + session_id + '/analyses', json={
        'label': 'offline',
        'inputs': [{'type': 'session', 'id': session_id, 'name': file_name_1}]
    })
    assert r.ok
    analysis = r.json()['_id']

    # get the ticket
    r = as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                     params={'ticket': ''})
    assert r.ok
    ticket = r.json()['ticket']

    # download the file
    assert as_admin.get('/sessions/' + session_id + '/files/' + file_name_1,
                        params={'ticket': ticket}).ok

    # run a job
    import bson
    api_db.gears.update({'_id': bson.ObjectId(site_gear)},
                        {'$set': {'gear.inputs': {'dicom': {'base': 'file'}}}})
    gear = site_gear

    job_data = {
        'gear_id': gear,
        'inputs': {
            'dicom': {
                'type': 'session',
                'id': session_id,
                'name': file_name_1
            }
        },
        'config': {'two-digit multiple of ten': 20},
        'destination': {
            'type': 'acquisition',
            'id': acquisition_id
        },
        'tags': ['test-tag']
    }

    # add job with explicit destination
    r = as_admin.post('/jobs/add', json=job_data)
    assert r.ok
    job_id = r.json()['_id']

    # start job (adds logs)
    r = as_admin.get('/jobs/next')
    assert r.ok

    # prepare completion (send success status before engine upload)
    r = as_drone.post('/jobs/' + job_id + '/prepare-complete')
    assert r.ok

    # verify that job ticket has been created
    job_ticket = api_db.job_tickets.find_one({'job': job_id})
    assert job_ticket['timestamp']

    produced_metadata = {
        'project': {
            'label': 'engine project',
            'info': {'test': 'p'}
        },
        'session': {
            'label': 'engine session',
            'subject': {'code': 'engine subject', 'sex': 'male', 'age': 86400},
            'info': {'test': 's'}
        },
        'acquisition': {
            'label': 'engine acquisition',
            'timestamp': '2016-06-20T21:57:36+00:00',
            'info': {'test': 'a'},
            'files': [{
                'name': 'result.txt',
                'type': 'text',
                'info': {'test': 'f0'}
            }]
        }
    }

    # engine upload
    r = as_drone.post('/engine',
                      params={'level': 'acquisition', 'id': acquisition_id,
                              'job': job_id, 'job_ticket': job_ticket['_id']},
                      files=file_form('result.txt', meta=produced_metadata))
    assert r.ok

    # Make sure produced metadata and logs exist
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # Try cleaning undeleted project
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project', project_id, '--job-phi')

    # Make sure file is still there
    assert storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1))

    # Make sure job phi is still there
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert r.json().get('logs')

    # delete the project
    r = as_admin.delete('/projects/' + project_id)
    assert r.ok

    # Run cleanup again
    cleanup_deleted.main('--log-level', 'DEBUG', '--all', '--project', project_id, '--job-phi')

    # Make sure file is not there
    assert not storage.storage_plugin.get_file_info(file_id_1, util.path_from_uuid(file_id_1))

    # Check job phi
    r = as_admin.get('/jobs/' + job_id)
    assert r.ok
    job = r.json()
    assert not job.get('produced_metadata')

    r = as_admin.get('/jobs/' + job_id + '/logs')
    assert r.ok
    assert not r.json().get('logs')

    assert not api_db.projects.find_one({'_id': ObjectId(project_id)})
    assert not api_db.subjects.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.sessions.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.acquisitions.find_one({'parents.project': ObjectId(project_id)})
    assert not api_db.analyses.find_one({'parents.project': ObjectId(project_id)})