def create_sample_table(self):
    """Build a two-column 'dataset' table-metadata object as a test fixture.

    Returns a DataFrameMetadata with an INTEGER `id` column and a
    UINT8 ndarray `data` column of dimensions [2, 2, 3].
    """
    metadata = DataFrameMetadata("dataset", 'dataset')
    id_column = DataFrameColumn("id", ColumnType.INTEGER, False)
    data_column = DataFrameColumn(
        "data", ColumnType.NDARRAY, False, NdArrayType.UINT8, [2, 2, 3])
    metadata.schema = [id_column, data_column]
    return metadata
def test_schema_equality(self):
    """A schema equals itself, and differs from a schema with fewer
    columns or from an object of an unrelated type."""
    name = "foo"
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                        NdArrayType.UINT8, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    full_schema = DataFrameSchema(name, columns)
    partial_schema = DataFrameSchema(name, columns[1:])
    # Deliberately a column object, not a schema, to exercise the
    # cross-type inequality path.
    not_a_schema = DataFrameColumn('foo2', columns)
    self.assertEqual(full_schema, full_schema)
    self.assertNotEqual(full_schema, partial_schema)
    self.assertNotEqual(full_schema, not_a_schema)
def test_create_plan(self):
    """A CreatePlan node exposes its operator type, if-not-exists flag,
    target table, and column list unchanged."""
    target = TableRef(TableInfo('dummy'))
    CatalogManager().reset()
    schema = [
        DataFrameColumn('id', ColumnType.INTEGER),
        DataFrameColumn('name', ColumnType.TEXT, array_dimensions=[50]),
    ]
    plan = CreatePlan(target, schema, False)

    self.assertEqual(plan.opr_type, PlanOprType.CREATE)
    self.assertEqual(plan.if_not_exists, False)
    self.assertEqual(plan.table_ref.table.table_name, "dummy")
    self.assertEqual(plan.column_list[0].name, "id")
    self.assertEqual(plan.column_list[1].name, "name")
def test_df_schema(self):
    """DataFrameSchema must mirror the petastorm schema that SchemaUtils
    derives from the same column list, field by field, and its pyspark
    schema must match the petastorm schema's spark conversion."""
    name = "foo"
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                        NdArrayType.UINT8, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    schema = DataFrameSchema(name, columns)
    reference = SchemaUtils.get_petastorm_schema(name, columns)

    self.assertEqual(schema.name, name)
    self.assertEqual(schema.column_list, columns)
    self.assertEqual(schema.petastorm_schema.fields, reference.fields)
    # Also compare pairwise for a more precise failure message.
    for actual_field, expected_field in zip(schema.petastorm_schema.fields,
                                            reference.fields):
        self.assertEqual(actual_field, expected_field)
    self.assertEqual(schema.pyspark_schema, reference.as_spark_schema())
def test_get_petastorm_column_ndarray(self):
    """Every NdArrayType member must map to the matching numpy dtype in
    the UnischemaField produced by get_petastorm_column."""
    # NOTE: relies on NdArrayType members iterating in the same order as
    # this dtype list.
    numpy_types = [np.int8, np.uint8, np.int16, np.int32, np.int64,
                   np.unicode_, np.bool_, np.float32, np.float64,
                   Decimal, np.str_, np.datetime64]
    name = 'frame_id'
    for ndarray_type, dtype in zip(NdArrayType, numpy_types):
        column = DataFrameColumn(name, ColumnType.NDARRAY, True,
                                 ndarray_type, [10, 10])
        expected = UnischemaField(name, dtype, [10, 10],
                                  NdarrayCodec(), True)
        self.assertEqual(SchemaUtils.get_petastorm_column(column), expected)
def test_get_petastorm_column(self):
    """get_petastorm_column maps scalar ColumnTypes (INTEGER, FLOAT,
    TEXT) to petastorm UnischemaFields with the matching numpy type and
    ScalarCodec, and returns None when the column has no type."""
    col_name = 'frame_id'

    col = DataFrameColumn(col_name, ColumnType.INTEGER, False)
    petastorm_col = UnischemaField(
        col_name, np.int32, (), ScalarCodec(IntegerType()), False)
    self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

    col = DataFrameColumn(col_name, ColumnType.FLOAT, True)
    petastorm_col = UnischemaField(
        col_name, np.float64, (), ScalarCodec(FloatType()), True)
    self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

    col = DataFrameColumn(col_name, ColumnType.TEXT, False)
    petastorm_col = UnischemaField(
        col_name, np.str_, (), ScalarCodec(StringType()), False)
    self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

    # Fix: the original passed [10, 10] as the fourth positional argument,
    # which is the array_type slot (signature: name, type, nullable,
    # array_type, array_dimensions — see the NDARRAY constructions in the
    # sibling tests). Pass it by keyword so the dimensions land where the
    # test intends.
    col = DataFrameColumn(col_name, None, True, array_dimensions=[10, 10])
    self.assertEqual(SchemaUtils.get_petastorm_column(col), None)
def test_create_metadata_should_create_dataset_and_columns(
        self, dcs_mock, ds_mock, initdb_mock):
    """create_metadata must delegate dataset creation to the dataset
    service, tag each column with the new dataset's id, delegate column
    creation to the column service, and return the created dataset."""
    catalog = CatalogManager()
    dataset_name = "name"
    file_url = "file1"
    columns = [DataFrameColumn("c1", ColumnType.INTEGER)]

    actual = catalog.create_metadata(dataset_name, file_url, columns)

    create_dataset = ds_mock.return_value.create_dataset
    create_dataset.assert_called_with(
        dataset_name, file_url, identifier_id='id', is_video=False)
    # Mirror what the implementation does: columns pick up the id of the
    # freshly created dataset before being persisted.
    for column in columns:
        column.metadata_id = create_dataset.return_value.id
    dcs_mock.return_value.create_column.assert_called_with(columns)

    expected = create_dataset.return_value
    expected.schema = dcs_mock.return_value.create_column.return_value
    self.assertEqual(actual, expected)
def create_column_metadata(
    self,
    column_name: str,
    data_type: ColumnType,
    array_type: NdArrayType,
    dimensions: List[int],
) -> DataFrameColumn:
    """Build an in-memory dataframe column object.

    The returned column is NOT committed to the catalog database. To
    persist it, pass it to create_metadata along with the corresponding
    table id.

    Arguments:
        column_name {str} -- name of the column to create
        data_type {ColumnType} -- value type of the column
        array_type {NdArrayType} -- element type when the column holds an ndarray
        dimensions {List[int]} -- dimensions of the column

    Returns:
        DataFrameColumn -- the uncommitted column object
    """
    column = DataFrameColumn(
        column_name,
        data_type,
        array_type=array_type,
        array_dimensions=dimensions,
    )
    return column
def test_raise_exception_when_unkown_array_type(self):
    """Passing a ColumnType where an NdArrayType is expected must make
    get_petastorm_column raise ValueError.

    (Method name keeps the historical 'unkown' spelling; renaming would
    change the public test identifier.)
    """
    bad_column = DataFrameColumn('frame_id', ColumnType.NDARRAY, True,
                                 ColumnType.TEXT, [10, 10])
    with self.assertRaises(ValueError):
        SchemaUtils.get_petastorm_column(bad_column)