def test_can_parse_upload_package_with_some_duplicates_and_duplicated_tsv_entries(
        self):
    """Parse a baseline package, then a package with duplicates and repeated TSV rows.

    Phase 1 uploads '5p0xMAG_small' and expects a finished upload with 17
    images and no duplicates or broken records.

    Phase 2 uploads '5p0xMAG_small_2_duplicates_and_tsv_duplicates' and
    expects a finished upload with 9 images, 4 reported duplicates (two
    filenames, each listed twice), no broken records, and a grown items
    collection.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    # Phase 1: baseline upload, nothing is expected to be rejected.
    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(DATA_PATH, '5p0xMAG_small')
        self.create_package_from_directory(source_dir, archive)

        first_id = self.upload_package(archive)
        parse_upload_package(first_id, self.db, storage)
        first_doc = upload.get(
            self.db, first_id, with_default_projection=False)

        self.assertEqual('finished', first_doc.state)
        self.assertEqual(17, first_doc.image_count)
        self.assertEqual(0, first_doc.duplicate_image_count)
        self.assertEqual(0, first_doc.broken_record_count)
        self.assertCountEqual([], first_doc.duplicate_filenames)
        self.assertCountEqual([], first_doc.broken_records)

        count_now = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_now)

    count_between = self.db.items.count_documents({})

    # Phase 2: the follow-up package; each duplicated filename is expected
    # to be reported twice.
    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(
            DATA_PATH, '5p0xMAG_small_2_duplicates_and_tsv_duplicates')
        self.create_package_from_directory(source_dir, archive)

        second_id = self.upload_package(archive)
        parse_upload_package(second_id, self.db, storage)
        second_doc = upload.get(
            self.db, second_id, with_default_projection=False)

        duplicate_a = (
            'SPC-EAWAG-5P0X-1543968111037290-9650556340265-000309-002-3712-0-52-40.jpeg')
        duplicate_b = (
            'SPC-EAWAG-5P0X-1543968114038057-9650559340515-000339-001-3536-32-68-92.jpeg')
        expected_duplicates = [
            duplicate_a, duplicate_a, duplicate_b, duplicate_b]

        self.assertEqual('finished', second_doc.state)
        self.assertEqual(9, second_doc.image_count)
        self.assertEqual(4, second_doc.duplicate_image_count)
        self.assertEqual(0, second_doc.broken_record_count)
        self.assertCountEqual(
            expected_duplicates, second_doc.duplicate_filenames)
        self.assertCountEqual([], second_doc.broken_records)

        count_now = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_now)
        # Non-duplicated images from the second package must have been added.
        self.assertNotEqual(count_between, count_now)
def test_can_parse_upload_package_with_duplicates_only(self):
    """Parse a baseline package, then a package made only of duplicates.

    Phase 1 uploads '5p0xMAG_small' and expects a finished upload with 17
    images and no duplicates or broken records.

    Phase 2 uploads '5p0xMAG_3_entries' and expects a finished upload whose
    3 images are all reported as duplicates, leaving the items collection
    count unchanged relative to the end of phase 1.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    # Phase 1: baseline upload.
    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(DATA_PATH, '5p0xMAG_small')
        self.create_package_from_directory(source_dir, archive)

        first_id = self.upload_package(archive)
        parse_upload_package(first_id, self.db, storage)
        first_doc = upload.get(
            self.db, first_id, with_default_projection=False)

        self.assertEqual('finished', first_doc.state)
        self.assertEqual(17, first_doc.image_count)
        self.assertEqual(0, first_doc.duplicate_image_count)
        self.assertEqual(0, first_doc.broken_record_count)
        self.assertCountEqual([], first_doc.duplicate_filenames)
        self.assertCountEqual([], first_doc.broken_records)

        count_now = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_now)

    count_between = self.db.items.count_documents({})

    # Phase 2: every image in this package is expected to be a duplicate.
    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(DATA_PATH, '5p0xMAG_3_entries')
        self.create_package_from_directory(source_dir, archive)

        second_id = self.upload_package(archive)
        parse_upload_package(second_id, self.db, storage)
        second_doc = upload.get(
            self.db, second_id, with_default_projection=False)

        expected_duplicates = [
            'SPC-EAWAG-5P0X-1543968085030435-9650530338104-000049-002-2838-1090-48-32.jpeg',
            'SPC-EAWAG-5P0X-1543968169050193-9650614345087-000889-004-2636-0-100-128.jpeg',
            'SPC-EAWAG-5P0X-1543968172024020-9650617345336-000919-002-1364-290-64-72.jpeg',
        ]

        self.assertEqual('finished', second_doc.state)
        self.assertEqual(3, second_doc.image_count)
        self.assertEqual(3, second_doc.duplicate_image_count)
        self.assertEqual(0, second_doc.broken_record_count)
        self.assertCountEqual(
            expected_duplicates, second_doc.duplicate_filenames)
        self.assertCountEqual([], second_doc.broken_records)

        count_now = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_now)
        # A duplicates-only package must not add any item.
        self.assertEqual(count_between, count_now)
def test_can_parse_upload_package_with_valid_spc_native_tar_format(self):
    """Parse an uncompressed '.tar' package built from '25_feb_upload_example_small'.

    Expects a finished upload with 8 images, no duplicates, no broken
    records, and a changed items collection count.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar')
        source_dir = os.path.join(DATA_PATH, '25_feb_upload_example_small')
        # An empty compression string is passed so a plain tar is requested.
        self.create_package_from_directory(
            source_dir, archive, compression='')

        new_upload_id = self.upload_package(archive)
        parse_upload_package(new_upload_id, self.db, storage)
        parsed = upload.get(
            self.db, new_upload_id, with_default_projection=False)

        self.assertEqual('finished', parsed.state)
        self.assertEqual(8, parsed.image_count)
        self.assertEqual(0, parsed.duplicate_image_count)
        self.assertEqual(0, parsed.broken_record_count)
        self.assertCountEqual([], parsed.duplicate_filenames)
        self.assertCountEqual([], parsed.broken_records)

        count_at_end = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_at_end)
def test_can_parse_upload_package_with_some_fields_as_infs_or_nans(self):
    """Parse '5p0xMAG_small_with_infs_and_nans' and check the broken records.

    Expects a finished upload with 17 images, no duplicates, and exactly
    two broken records identified by filename; the items collection count
    must change.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(
            DATA_PATH, '5p0xMAG_small_with_infs_and_nans')
        self.create_package_from_directory(source_dir, archive)

        new_upload_id = self.upload_package(archive)
        parse_upload_package(new_upload_id, self.db, storage)
        parsed = upload.get(
            self.db, new_upload_id, with_default_projection=False)

        expected_broken = [
            'SPC-EAWAG-5P0X-1543968157067352-9650602344089-000769-002-3546-2354-48-48.jpeg',
            'SPC-EAWAG-5P0X-1543968114038057-9650559340515-000339-001-3536-32-68-92.jpeg',
        ]

        self.assertEqual('finished', parsed.state)
        self.assertEqual(17, parsed.image_count)
        self.assertEqual(0, parsed.duplicate_image_count)
        self.assertEqual(2, parsed.broken_record_count)
        self.assertCountEqual([], parsed.duplicate_filenames)
        self.assertCountEqual(expected_broken, parsed.broken_records)

        count_at_end = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_at_end)
def test_can_parse_upload_package_with_duplicated_fields_filenames_in_tsv(
        self):
    """Parse '5p0xMAG_small_with_tsv_duplicated_filenames' and check duplicates.

    Expects a finished upload with 19 images, 2 of them reported as
    duplicates by filename, and no broken records. The growth of the items
    collection must equal image_count minus duplicate_image_count.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(
            DATA_PATH, '5p0xMAG_small_with_tsv_duplicated_filenames')
        self.create_package_from_directory(source_dir, archive)

        new_upload_id = self.upload_package(archive)
        parse_upload_package(new_upload_id, self.db, storage)
        parsed = upload.get(
            self.db, new_upload_id, with_default_projection=False)

        expected_duplicates = [
            'SPC-EAWAG-5P0X-1543968141051783-9650586342759-000609-002-0-2088-32-84.jpeg',
            'SPC-EAWAG-5P0X-1543968092032969-9650537338686-000119-003-2132-1914-48-48.jpeg',
        ]

        self.assertEqual('finished', parsed.state)
        self.assertEqual(19, parsed.image_count)
        self.assertEqual(2, parsed.duplicate_image_count)
        self.assertEqual(0, parsed.broken_record_count)
        self.assertCountEqual(
            expected_duplicates, parsed.duplicate_filenames)
        self.assertCountEqual([], parsed.broken_records)

        count_at_end = self.db.items.count_documents({})
        self.assertNotEqual(count_at_start, count_at_end)

        # Only the non-duplicated images may have been stored as items.
        stored = count_at_end - count_at_start
        accepted = parsed.image_count - parsed.duplicate_image_count
        self.assertEqual(stored, accepted)
def test_cant_parse_upload_package_with_package_with_empty_tsv_file_and_no_images(
        self):
    """Parsing '5p0xMAG_small_empty_tsv_no_images' must fail without side effects.

    Expects the upload to end in state 'failed', none of the result
    attributes to exist on the upload document, and the items collection
    count to stay unchanged.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, 'package.tar.bz2')
        source_dir = os.path.join(
            DATA_PATH, '5p0xMAG_small_empty_tsv_no_images')
        self.create_package_from_directory(source_dir, archive)

        new_upload_id = self.upload_package(archive)
        parse_upload_package(new_upload_id, self.db, storage)
        parsed = upload.get(
            self.db, new_upload_id, with_default_projection=False)

        self.assertEqual('failed', parsed.state)

        # A failed upload must not expose any parsing result attribute.
        result_attributes = (
            'image_count',
            'duplicate_image_count',
            'broken_record_count',
            'duplicate_filenames',
            'broken_records',
        )
        for attribute in result_attributes:
            with self.assertRaises(AttributeError):
                getattr(parsed, attribute)

        count_at_end = self.db.items.count_documents({})
        self.assertEqual(count_at_start, count_at_end)
def test_cant_parse_upload_package_with_package_that_is_just_a_file(self):
    """Uploading a bare 'features.tsv' instead of an archive must fail.

    Expects the upload to end in state 'failed', none of the result
    attributes to exist on the upload document, and the items collection
    count to stay unchanged.
    """
    storage = self.app.config['storage_client']
    count_at_start = self.db.items.count_documents({})

    # The TSV file itself is uploaded; no package is built from a directory.
    bare_file = os.path.join(DATA_PATH, '5p0xMAG_small', 'features.tsv')
    new_upload_id = self.upload_package(bare_file)
    parse_upload_package(new_upload_id, self.db, storage)
    parsed = upload.get(
        self.db, new_upload_id, with_default_projection=False)

    self.assertEqual('failed', parsed.state)

    # A failed upload must not expose any parsing result attribute.
    result_attributes = (
        'image_count',
        'duplicate_image_count',
        'broken_record_count',
        'duplicate_filenames',
        'broken_records',
    )
    for attribute in result_attributes:
        with self.assertRaises(AttributeError):
            getattr(parsed, attribute)

    count_at_end = self.db.items.count_documents({})
    self.assertEqual(count_at_start, count_at_end)
def get(self, upload_id):
    """Return the shallow serializable form of the upload with ``upload_id``.

    The upload is looked up without the default projection. The result of
    ``invalid_request()`` is returned instead when the lookup raises
    ``InvalidId`` or yields a falsy document.
    """
    database = app.config['db']

    try:
        found = upload.get(
            database, upload_id, with_default_projection=False)
    except InvalidId:
        return invalid_request()

    if not found:
        return invalid_request()
    return found.serializable(shallow=True)
def test_api_can_post_empty_tags_list(self):
    """POSTing an empty tags list must return 204 and store no tags."""
    with self.app.app_context():
        target = copy.deepcopy(DUMMY_UPLOADS[0])
        tags = []
        payload = json.dumps({'tags': tags})

        response = self.client().post(
            f'/upload/{str(target._id)}/tags',
            data=payload,
            headers=self.headers)
        self.assertEqual(response.status_code, 204)

        # Re-read the upload from the database to check what was stored.
        database = self.app.config['db']
        stored = upload.get(
            database, target._id, with_default_projection=False)
        self.assertCountEqual(stored.tags, tags)
def test_api_cant_post_invalid_tags_list(self):
    """Each invalid tags payload must return 400 and leave stored tags untouched.

    The payloads are ``[4]``, ``['valid', 4]``, the bare string
    ``'invalid'`` and ``[False]``. Each one runs in its own ``subTest`` so
    a failure names the offending payload and the remaining payloads are
    still exercised. The stored tags are re-read after every rejected
    request and compared with the fixture's tags.
    """
    with self.app.app_context():
        upload_doc = copy.deepcopy(DUMMY_UPLOADS[3])
        invalid_tags = [[4], ['valid', 4], 'invalid', [False]]

        # Loop-invariant values are resolved once.
        tags_url = f'/upload/{str(upload_doc._id)}/tags'
        db = self.app.config['db']

        for tags_list in invalid_tags:
            with self.subTest(tags=tags_list):
                request_data = json.dumps({'tags': tags_list})

                res = self.client().post(
                    tags_url, data=request_data, headers=self.headers)
                self.assertEqual(res.status_code, 400)

                # A rejected request must not have modified the document.
                upload_after = upload.get(
                    db, upload_doc._id, with_default_projection=False)
                self.assertCountEqual(upload_after.tags, upload_doc.tags)