def parse_upload(upload_id):
    db = get_db_from_env()
    storage_client = get_storage_client_from_env()
    try:
        parse_upload_package(upload_id, db, storage_client)
    except:
        logger.error(traceback.format_exc())
    def test_can_parse_upload_package_with_some_fields_missing(self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(DATA_PATH,
                                     '5p0xMAG_small_with_missing_fields')
            package_path = os.path.join(tmpdirname, 'package.tar.bz2')
            self.create_package_from_directory(data_path, package_path)
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('finished', upload_doc.state)
            self.assertEqual(17, upload_doc.image_count)
            self.assertEqual(0, upload_doc.duplicate_image_count)
            self.assertEqual(1, upload_doc.broken_record_count)
            self.assertCountEqual([], upload_doc.duplicate_filenames)
            self.assertCountEqual([
                'SPC-EAWAG-5P0X-1543968111037290-9650556340265-000309-002-3712-0-52-40.jpeg'
            ], upload_doc.broken_records)

            items_after = self.db.items.count_documents({})
            self.assertNotEqual(items_before, items_after)
    def test_can_parse_upload_package_with_valid_spc_native_tar_format(self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(DATA_PATH, '25_feb_upload_example_small')
            package_path = os.path.join(tmpdirname, 'package.tar')
            self.create_package_from_directory(data_path,
                                               package_path,
                                               compression='')
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('finished', upload_doc.state)
            self.assertEqual(8, upload_doc.image_count)
            self.assertEqual(0, upload_doc.duplicate_image_count)
            self.assertEqual(0, upload_doc.broken_record_count)
            self.assertCountEqual([], upload_doc.duplicate_filenames)
            self.assertCountEqual([], upload_doc.broken_records)

            items_after = self.db.items.count_documents({})
            self.assertNotEqual(items_before, items_after)
    def test_can_parse_upload_package_with_some_fields_as_infs_or_nans(self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(DATA_PATH,
                                     '5p0xMAG_small_with_infs_and_nans')
            package_path = os.path.join(tmpdirname, 'package.tar.bz2')
            self.create_package_from_directory(data_path, package_path)
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('finished', upload_doc.state)
            self.assertEqual(17, upload_doc.image_count)
            self.assertEqual(0, upload_doc.duplicate_image_count)
            self.assertEqual(2, upload_doc.broken_record_count)
            self.assertCountEqual([], upload_doc.duplicate_filenames)
            self.assertCountEqual([
                'SPC-EAWAG-5P0X-1543968157067352-9650602344089-000769-002-3546-2354-48-48.jpeg',
                'SPC-EAWAG-5P0X-1543968114038057-9650559340515-000339-001-3536-32-68-92.jpeg'
            ], upload_doc.broken_records)

            items_after = self.db.items.count_documents({})
            self.assertNotEqual(items_before, items_after)
    def test_can_parse_upload_package_with_duplicated_fields_filenames_in_tsv(
            self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(
                DATA_PATH, '5p0xMAG_small_with_tsv_duplicated_filenames')
            package_path = os.path.join(tmpdirname, 'package.tar.bz2')
            self.create_package_from_directory(data_path, package_path)
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('finished', upload_doc.state)
            self.assertEqual(19, upload_doc.image_count)
            self.assertEqual(2, upload_doc.duplicate_image_count)
            self.assertEqual(0, upload_doc.broken_record_count)
            self.assertCountEqual([
                'SPC-EAWAG-5P0X-1543968141051783-9650586342759-000609-002-0-2088-32-84.jpeg',
                'SPC-EAWAG-5P0X-1543968092032969-9650537338686-000119-003-2132-1914-48-48.jpeg'
            ], upload_doc.duplicate_filenames)
            self.assertCountEqual([], upload_doc.broken_records)

            items_after = self.db.items.count_documents({})
            self.assertNotEqual(items_before, items_after)
            self.assertEqual(
                items_after - items_before,
                upload_doc.image_count - upload_doc.duplicate_image_count)
    def test_cant_parse_upload_package_with_package_with_empty_tsv_file_and_no_images(
            self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(DATA_PATH,
                                     '5p0xMAG_small_empty_tsv_no_images')
            package_path = os.path.join(tmpdirname, 'package.tar.bz2')
            self.create_package_from_directory(data_path, package_path)
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('failed', upload_doc.state)

            with self.assertRaises(AttributeError):
                upload_doc.image_count
            with self.assertRaises(AttributeError):
                upload_doc.duplicate_image_count
            with self.assertRaises(AttributeError):
                upload_doc.broken_record_count
            with self.assertRaises(AttributeError):
                upload_doc.duplicate_filenames
            with self.assertRaises(AttributeError):
                upload_doc.broken_records

            items_after = self.db.items.count_documents({})
            self.assertEqual(items_before, items_after)
    def test_cant_parse_upload_package_with_package_that_is_just_a_file(self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        data_path = os.path.join(DATA_PATH, '5p0xMAG_small', 'features.tsv')
        upload_id = self.upload_package(data_path)
        parse_upload_package(upload_id, self.db, storage_client)

        upload_doc = upload.get(self.db,
                                upload_id,
                                with_default_projection=False)
        self.assertEqual('failed', upload_doc.state)

        with self.assertRaises(AttributeError):
            upload_doc.image_count
        with self.assertRaises(AttributeError):
            upload_doc.duplicate_image_count
        with self.assertRaises(AttributeError):
            upload_doc.broken_record_count
        with self.assertRaises(AttributeError):
            upload_doc.duplicate_filenames
        with self.assertRaises(AttributeError):
            upload_doc.broken_records

        items_after = self.db.items.count_documents({})
        self.assertEqual(items_before, items_after)
    def test_can_parse_upload_package_with_duplicates_only(self):
        storage_client = self.app.config['storage_client']
        items_before = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(DATA_PATH, '5p0xMAG_small')
            package_path = os.path.join(tmpdirname, 'package.tar.bz2')
            self.create_package_from_directory(data_path, package_path)
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('finished', upload_doc.state)
            self.assertEqual(17, upload_doc.image_count)
            self.assertEqual(0, upload_doc.duplicate_image_count)
            self.assertEqual(0, upload_doc.broken_record_count)
            self.assertCountEqual([], upload_doc.duplicate_filenames)
            self.assertCountEqual([], upload_doc.broken_records)

            items_after = self.db.items.count_documents({})
            self.assertNotEqual(items_before, items_after)

        items_inbeetween = self.db.items.count_documents({})

        with tempfile.TemporaryDirectory() as tmpdirname:
            data_path = os.path.join(DATA_PATH, '5p0xMAG_3_entries')
            package_path = os.path.join(tmpdirname, 'package.tar.bz2')
            self.create_package_from_directory(data_path, package_path)
            upload_id = self.upload_package(package_path)
            parse_upload_package(upload_id, self.db, storage_client)

            upload_doc = upload.get(self.db,
                                    upload_id,
                                    with_default_projection=False)
            self.assertEqual('finished', upload_doc.state)
            self.assertEqual(3, upload_doc.image_count)
            self.assertEqual(3, upload_doc.duplicate_image_count)
            self.assertEqual(0, upload_doc.broken_record_count)
            self.assertCountEqual([
                'SPC-EAWAG-5P0X-1543968085030435-9650530338104-000049-002-2838-1090-48-32.jpeg',
                'SPC-EAWAG-5P0X-1543968169050193-9650614345087-000889-004-2636-0-100-128.jpeg',
                'SPC-EAWAG-5P0X-1543968172024020-9650617345336-000919-002-1364-290-64-72.jpeg'
            ], upload_doc.duplicate_filenames)
            self.assertCountEqual([], upload_doc.broken_records)

            items_after = self.db.items.count_documents({})
            self.assertNotEqual(items_before, items_after)
            self.assertEqual(items_inbeetween, items_after)