Esempio n. 1
0
    def test__get_dtype_of_array__unsupported_type_raises_exception(self):
        """A ``datetime64[ns]`` series must be rejected with a ``TypeError`` mentioning "unsupported"."""
        raw_values = [time() for _ in range(2)]
        datetime_series = Series(raw_values, dtype="datetime64[ns]")

        with self.assertRaises(TypeError) as raised:
            get_dtype_of_array(datetime_series)

        error_text = str(raised.exception)
        self.assertIn("unsupported", error_text)
Esempio n. 2
0
    def test__get_dtype_of_array__unordered_integer_categories_return_as_expected(self):
        """Integer categories supplied out of order are expected to resolve to ``np.int32``."""
        shuffled_integer_categories = Series(data=[2, 3, 1, 3, 1, 2], dtype="category")

        self.assertEqual(np.int32, get_dtype_of_array(shuffled_integer_categories))
Esempio n. 3
0
    def test__get_dtype_of_array__categories_return_as_expected(self):
        """String-valued categories are expected to resolve to the builtin ``str`` type."""
        string_categories = Series(data=["a", "b", "c"], dtype="category")

        self.assertEqual(str, get_dtype_of_array(string_categories))
Esempio n. 4
0
    def test__get_dtype_of_array__supported_dtypes_return_as_expected(self):
        """Check the dtype reported for an empty ``Series`` of each directly supported dtype.

        Every case pairs the dtype used to build the series with the dtype that
        ``get_dtype_of_array`` is expected to report for it. Each pair runs in its own
        sub-test so a single failing dtype does not hide the others.
        """
        # ``str`` replaces ``np.unicode`` in the expectations: that name was only a deprecated alias of the
        # builtin ``str`` (deprecated in NumPy 1.20 and absent from current releases), so the expected value is
        # the same object while the test no longer dies with AttributeError on newer NumPy.
        cases = [
            (np.float32, np.float32),
            (np.int32, np.int32),
            (np.bool_, np.uint8),
            (str, str),
        ]

        for test_type_index, (input_type, expected_dtype) in enumerate(cases):
            with self.subTest(f"Testing get_dtype_of_array with type {input_type.__name__}",
                              i=test_type_index):
                array = Series(data=[], dtype=input_type)
                self.assertEqual(get_dtype_of_array(array), expected_dtype)
Esempio n. 5
0
    def test__get_dtype_of_array__castable_dtypes_return_as_expected(self):
        """64-bit float and integer dtypes are expected to be reported as their 32-bit counterparts."""
        castable_to_expected = [(np.float64, np.float32), (np.int64, np.int32)]

        for position, (castable_type, expected_dtype) in enumerate(castable_to_expected):
            description = f"Testing get_dtype_of_array with castable type {castable_type.__name__}"
            with self.subTest(description, i=position):
                empty_series = Series(data=[], dtype=castable_type)
                self.assertEqual(get_dtype_of_array(empty_series), expected_dtype)
Esempio n. 6
0
 def create_dataframe_array(array_name, dataframe):
     """Create a dense TileDB array at ``array_name`` with one attribute per dataframe column.

     Each attribute takes its dtype from ``get_dtype_of_array`` applied to the column. The single
     dimension spans the row indices ``0 .. len - 1`` and is tiled in chunks of at most 1000 rows.
     Only ``tiledb.DenseArray.create`` is called here; no cell values are written by this function.

     Args:
         array_name: Location passed to ``tiledb.DenseArray.create``.
         dataframe: Source of the column names, column dtypes and row count.
     """
     # Attempt aggressive compression as many of these dataframes are very repetitive strings, bools and
     # other non-float data.
     compression = tiledb.FilterList([tiledb.ZstdFilter(level=22)])

     column_attrs = []
     for column_name in dataframe:
         column_dtype = get_dtype_of_array(dataframe[column_name])
         column_attrs.append(tiledb.Attr(name=column_name, dtype=column_dtype, filters=compression))

     # NOTE(review): with zero rows this yields domain (0, -1) and tile 0 -- confirm callers never pass an
     # empty dataframe.
     row_count = dataframe.shape[0]
     row_dimension = tiledb.Dim(domain=(0, row_count - 1), tile=min(row_count, 1000), dtype=np.uint32)
     row_domain = tiledb.Domain(row_dimension)

     array_schema = tiledb.ArraySchema(
         domain=row_domain,
         sparse=False,
         attrs=column_attrs,
         cell_order="row-major",
         tile_order="row-major",
     )
     tiledb.DenseArray.create(array_name, array_schema)
Esempio n. 7
0
    def write_labels(self, df, data_adaptor):
        """Store the label dataframe ``df`` and record it as an ``Annotation`` in the database.

        A non-empty ``df`` is checked with ``check_category_names``, has every column cast to the dtype
        reported by ``get_dtype_of_array`` and is written to a sparse TileDB array via
        ``tiledb.from_pandas``. An empty ``df`` writes no array and is recorded with an empty uri.
        In both cases an ``Annotation`` row is added to the session and committed.

        Args:
            df: pandas DataFrame with the labels to store. NOTE(review): its columns are re-assigned in
                place during the dtype conversion, so the caller's object is modified -- confirm intended.
            data_adaptor: Supplies ``get_location()`` and ``get_title()`` for the annotated dataset.
        """
        auth_user_id = self.get_user_id()
        user_name = self.get_user_name()
        created_at = time.time()
        dataset_id = self.db.get_or_create_dataset(data_adaptor.get_location())
        dataset_name = data_adaptor.get_title()
        user_id = self.db.get_or_create_user(auth_user_id)

        # NOTE: The uri contains the dataset name, user name and a timestamp as a convenience for debugging
        # purposes. People may have the same name and time.time() can be server dependent.
        # See - https://docs.python.org/2/library/time.html#time.time
        #
        # The annotations objects in the database should be used as the source of truth about who an
        # annotation belongs to (for authorization purposes) and what time it was created (for garbage
        # collection).
        uri = f"{self.directory_path}{dataset_name}/{user_name}/{created_at}"
        if not uri.startswith("s3://"):
            # s3 uris are skipped; any other uri is treated as a local directory path and created.
            os.makedirs(uri, exist_ok=True)

        _, schema_type_hints = get_dtypes_and_schemas_of_dataframe(df)

        if df.empty:
            # Nothing is written for an empty dataframe; the annotation is stored with an empty uri.
            # NOTE(review): for non-s3 uris the directory created above is left behind here -- confirm.
            uri = ""
        else:
            self.check_category_names(df)
            # convert to tiledb datatypes
            for column_name in df:
                df[column_name] = df[column_name].astype(get_dtype_of_array(df[column_name]))
            tiledb.from_pandas(uri, df, sparse=True)

        annotation_record = Annotation(
            tiledb_uri=uri,
            user_id=user_id,
            dataset_id=str(dataset_id),
            schema_hints=json.dumps(schema_type_hints),
        )
        self.db.session.add(annotation_record)
        self.db.session.commit()