def test__get_dtype_of_array__unsupported_type_raises_exception(self):
    """A datetime64-typed Series is unsupported and must raise TypeError."""
    unsupported = Series([time() for _ in range(2)], dtype="datetime64[ns]")
    with self.assertRaises(TypeError) as raised:
        get_dtype_of_array(unsupported)
    # The error message should tell the caller why the type was rejected.
    self.assertIn("unsupported", str(raised.exception))
def test__get_dtype_of_array__unordered_integer_categories_return_as_expected(self):
    """An integer-valued categorical Series should map to np.int32."""
    categorical = Series(data=[2, 3, 1, 3, 1, 2], dtype="category")
    self.assertEqual(np.int32, get_dtype_of_array(categorical))
def test__get_dtype_of_array__categories_return_as_expected(self):
    """A string-valued categorical Series should map to the builtin str type."""
    categorical = Series(data=["a", "b", "c"], dtype="category")
    self.assertEqual(str, get_dtype_of_array(categorical))
def test__get_dtype_of_array__supported_dtypes_return_as_expected(self):
    """Each natively supported dtype maps to the expected storage dtype.

    NOTE: the original expectation used ``np.unicode``, which was a pure
    alias for the builtin ``str`` and was removed in NumPy 1.24 (deprecated
    since 1.20).  Using ``str`` directly is equivalent on every NumPy
    version and keeps this test from raising AttributeError on modern NumPy.
    """
    # (input dtype, expected dtype) pairs, in the original order.
    cases = [
        (np.float32, np.float32),
        (np.int32, np.int32),
        (np.bool_, np.uint8),
        (str, str),
    ]
    for index, (source_type, expected_dtype) in enumerate(cases):
        with self.subTest(f"Testing get_dtype_of_array with type {source_type.__name__}", i=index):
            array = Series(data=[], dtype=source_type)
            self.assertEqual(get_dtype_of_array(array), expected_dtype)
def test__get_dtype_of_array__castable_dtypes_return_as_expected(self):
    """64-bit numeric dtypes are expected to be narrowed to 32-bit storage."""
    # (input dtype, expected narrowed dtype) pairs.
    cases = zip([np.float64, np.int64], [np.float32, np.int32])
    for index, (source_type, expected_dtype) in enumerate(cases):
        with self.subTest(f"Testing get_dtype_of_array with castable type {source_type.__name__}", i=index):
            empty_series = Series(data=[], dtype=source_type)
            self.assertEqual(get_dtype_of_array(empty_series), expected_dtype)
def create_dataframe_array(array_name, dataframe):
    """Create an (empty) dense TileDB array shaped to hold *dataframe*.

    One TileDB attribute is declared per dataframe column, and the single
    dense dimension spans the dataframe's row range.  No data is written
    here — only the schema is created at *array_name*.
    """
    # Attempt aggressive compression as many of these dataframes are very
    # repetitive strings, bools and other non-float data.
    compression = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

    column_attrs = []
    for column in dataframe:
        column_attrs.append(
            tiledb.Attr(name=column, dtype=get_dtype_of_array(dataframe[column]), filters=compression)
        )

    row_count = dataframe.shape[0]
    row_dimension = tiledb.Dim(domain=(0, row_count - 1), tile=min(row_count, 1000), dtype=np.uint32)
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dimension),
        sparse=False,
        attrs=column_attrs,
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(array_name, schema)
def write_labels(self, df, data_adaptor):
    """Persist a user's annotation labels and record them in the database.

    Writes *df* (the label dataframe) to a TileDB array under a per-user,
    per-dataset, timestamped URI, then commits an ``Annotation`` row
    pointing at that URI.  An empty *df* skips the TileDB write and stores
    an empty URI.

    Fixes over the original: the mid-function bare string literal (a no-op
    expression statement, not a docstring) is now a real comment, and the
    ``if ...: pass / else:`` branch is inverted — both changes are
    behavior-identical.

    :param df: pandas DataFrame of user label columns; mutated in place
        (columns are cast to TileDB-compatible dtypes — NOTE(review): this
        mutates the caller's frame; confirm callers do not rely on the
        original dtypes).
    :param data_adaptor: dataset adaptor providing location and title.
    """
    auth_user_id = self.get_user_id()
    user_name = self.get_user_name()
    timestamp = time.time()
    dataset_location = data_adaptor.get_location()
    dataset_id = self.db.get_or_create_dataset(dataset_location)
    dataset_name = data_adaptor.get_title()
    user_id = self.db.get_or_create_user(auth_user_id)
    # NOTE: The uri contains the dataset name, user name and a timestamp as
    # a convenience for debugging purposes. People may have the same name
    # and time.time() can be server dependent.
    # See - https://docs.python.org/2/library/time.html#time.time
    # The annotations objects in the database should be used as the source
    # of truth about who an annotation belongs to (for authorization
    # purposes) and what time it was created (for garbage collection).
    uri = f"{self.directory_path}{dataset_name}/{user_name}/{timestamp}"
    # S3 has no directories to create; only local paths need makedirs.
    if not uri.startswith("s3://"):
        os.makedirs(uri, exist_ok=True)
    _, dataframe_schema_type_hints = get_dtypes_and_schemas_of_dataframe(df)
    if not df.empty:
        self.check_category_names(df)
        # convert to tiledb datatypes
        for col in df:
            df[col] = df[col].astype(get_dtype_of_array(df[col]))
        tiledb.from_pandas(uri, df, sparse=True)
    else:
        # Nothing written; record an empty URI so the row is still created.
        uri = ""
    annotation = Annotation(
        tiledb_uri=uri,
        user_id=user_id,
        dataset_id=str(dataset_id),
        schema_hints=json.dumps(dataframe_schema_type_hints),
    )
    self.db.session.add(annotation)
    self.db.session.commit()