Beispiel #1
0
 def create_sample_table(self):
     """Return a DataFrameMetadata fixture with a two-column schema.

     The metadata object is constructed with the string "dataset" for both
     of its positional arguments.  Its schema is then set to an INTEGER
     column named "id" and a UINT8 NDARRAY column named "data" with
     dimensions [2, 2, 3]; both columns pass False as their third
     positional argument.
     """
     metadata = DataFrameMetadata("dataset", 'dataset')
     id_column = DataFrameColumn("id", ColumnType.INTEGER, False)
     data_column = DataFrameColumn(
         "data", ColumnType.NDARRAY, False, NdArrayType.UINT8, [2, 2, 3])
     metadata.schema = [id_column, data_column]
     return metadata
Beispiel #2
0
 def test_schema_equality(self):
     """Check DataFrameSchema equality and inequality.

     A schema must compare equal to itself, and unequal to a schema that
     differs either in its column list or in its name.
     """
     schema_name = "foo"
     column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False)
     column_2 = DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                                NdArrayType.UINT8, [28, 28])
     column_3 = DataFrameColumn("frame_label", ColumnType.INTEGER, False)
     col_list = [column_1, column_2, column_3]
     schema1 = DataFrameSchema(schema_name, col_list)
     # Same name, but the first column is dropped.
     schema2 = DataFrameSchema(schema_name, col_list[1:])
     # Same columns, different name.  This used to build a DataFrameColumn
     # with the column list passed as its type, so the last assertion never
     # compared two schemas.
     schema3 = DataFrameSchema('foo2', col_list)
     self.assertEqual(schema1, schema1)
     self.assertNotEqual(schema1, schema2)
     self.assertNotEqual(schema1, schema3)
Beispiel #3
0
    def test_create_plan(self):
        """Verify that CreatePlan exposes the values it was constructed with.

        Builds a plan for a table called 'dummy' with an INTEGER column 'id'
        and a TEXT column 'name', then checks the operator type, the
        if_not_exists flag, the referenced table name and the column names.
        """
        table_ref = TableRef(TableInfo('dummy'))

        CatalogManager().reset()
        column_defs = [
            DataFrameColumn('id', ColumnType.INTEGER),
            DataFrameColumn('name', ColumnType.TEXT, array_dimensions=[50]),
        ]
        plan = CreatePlan(table_ref, column_defs, False)

        self.assertEqual(plan.opr_type, PlanOprType.CREATE)
        self.assertEqual(plan.if_not_exists, False)
        self.assertEqual(plan.table_ref.table.table_name, "dummy")
        # Column names must come back in the order they were supplied.
        for position, expected_name in enumerate(("id", "name")):
            self.assertEqual(plan.column_list[position].name, expected_name)
Beispiel #4
0
 def test_df_schema(self):
     """Compare a DataFrameSchema against the schema built by SchemaUtils.

     The schema's name and column list must match the constructor inputs,
     and its petastorm fields and pyspark schema must match those derived
     from SchemaUtils.get_petastorm_schema for the same name and columns.
     """
     schema_name = "foo"
     columns = [
         DataFrameColumn("frame_id", ColumnType.INTEGER, False),
         DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                         NdArrayType.UINT8, [28, 28]),
         DataFrameColumn("frame_label", ColumnType.INTEGER, False),
     ]
     schema = DataFrameSchema(schema_name, columns)
     reference = SchemaUtils.get_petastorm_schema(schema_name, columns)

     self.assertEqual(schema.name, schema_name)
     self.assertEqual(schema.column_list, columns)

     # Check the field containers as a whole, then pairwise.
     self.assertEqual(schema.petastorm_schema.fields, reference.fields)
     field_pairs = zip(schema.petastorm_schema.fields, reference.fields)
     for actual_field, reference_field in field_pairs:
         self.assertEqual(actual_field, reference_field)

     self.assertEqual(schema.pyspark_schema, reference.as_spark_schema())
Beispiel #5
0
 def test_get_petastorm_column_ndarray(self):
     """Check the petastorm field produced for each NdArrayType member.

     Each NdArrayType member is paired, in enum iteration order, with the
     numpy/Python type expected in the resulting UnischemaField.  Every
     column is a nullable [10, 10] NDARRAY named 'frame_id'.
     """
     # np.unicode_ was an alias of np.str_ and was removed in NumPy 2.0;
     # np.str_ is the same type and works on both old and new releases.
     expected_type = [np.int8, np.uint8, np.int16, np.int32, np.int64,
                      np.str_, np.bool_, np.float32, np.float64,
                      Decimal, np.str_, np.datetime64]
     col_name = 'frame_id'
     # NOTE: zip stops at the shorter sequence, so members beyond the end
     # of expected_type (if any) are not exercised.
     for array_type, np_type in zip(NdArrayType, expected_type):
         col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                               array_type, [10, 10])
         petastorm_col = UnischemaField(col_name, np_type, [10, 10],
                                        NdarrayCodec(), True)
         self.assertEqual(SchemaUtils.get_petastorm_column(col),
                          petastorm_col)
Beispiel #6
0
    def test_get_petastorm_column(self):
        """Check SchemaUtils.get_petastorm_column for scalar column types.

        INTEGER, FLOAT and TEXT columns must map to a UnischemaField with an
        empty shape, the matching numpy type and a ScalarCodec wrapping the
        matching Spark type.  A column whose type is None must yield None.
        """
        col_name = 'frame_id'
        # (column type, third DataFrameColumn argument / field nullable flag,
        #  expected numpy type, Spark type wrapped by the ScalarCodec)
        scalar_cases = (
            (ColumnType.INTEGER, False, np.int32, IntegerType),
            (ColumnType.FLOAT, True, np.float64, FloatType),
            (ColumnType.TEXT, False, np.str_, StringType),
        )
        for column_type, nullable, numpy_type, spark_type in scalar_cases:
            column = DataFrameColumn(col_name, column_type, nullable)
            expected_field = UnischemaField(
                col_name, numpy_type, (), ScalarCodec(spark_type()), nullable)
            self.assertEqual(
                SchemaUtils.get_petastorm_column(column), expected_field)

        untyped_column = DataFrameColumn(col_name, None, True, [10, 10])
        self.assertEqual(
            SchemaUtils.get_petastorm_column(untyped_column), None)
    def test_create_metadata_should_create_dataset_and_columns(
            self, dcs_mock, ds_mock, initdb_mock):
        """Check that create_metadata creates the dataset and its columns.

        The three mock parameters are presumably injected by patch
        decorators that are not visible in this snippet.  The test asserts
        on the create_dataset and create_column calls made through
        ds_mock and dcs_mock, and that the returned object is the created
        dataset.
        """
        dataset_name, file_url = "name", "file1"
        manager = CatalogManager()

        column_list = [DataFrameColumn("c1", ColumnType.INTEGER)]
        actual = manager.create_metadata(dataset_name, file_url, column_list)

        create_dataset = ds_mock.return_value.create_dataset
        create_column = dcs_mock.return_value.create_column

        create_dataset.assert_called_with(
            dataset_name, file_url, identifier_id='id', is_video=False)

        # Stamp each column with the created dataset's id before checking
        # the arguments passed to create_column.
        for col in column_list:
            col.metadata_id = create_dataset.return_value.id
        create_column.assert_called_with(column_list)

        expected = create_dataset.return_value
        expected.schema = create_column.return_value

        self.assertEqual(actual, expected)
Beispiel #8
0
    def create_column_metadata(
        self,
        column_name: str,
        data_type: ColumnType,
        array_type: NdArrayType,
        dimensions: List[int],
    ) -> DataFrameColumn:
        """Build a DataFrameColumn object describing a single column.

        The object is only constructed here; it is not committed to the
        catalog database.  To persist it in the catalog table, call
        create_metadata with the corresponding table_id.

        Arguments:
            column_name {str} -- name of the column to create
            data_type {ColumnType} -- type of the column
            array_type {NdArrayType} -- element type of the ndarray
            dimensions {List[int]} -- dimensions of the column

        Returns:
            DataFrameColumn -- the newly constructed column object
        """
        column = DataFrameColumn(
            column_name,
            data_type,
            array_type=array_type,
            array_dimensions=dimensions,
        )
        return column
Beispiel #9
0
 def test_raise_exception_when_unkown_array_type(self):
     """Expect ValueError for an NDARRAY column with an invalid array type.

     ColumnType.TEXT is passed in the array-type position, and
     SchemaUtils.get_petastorm_column must raise ValueError for it.
     """
     bad_column = DataFrameColumn(
         'frame_id', ColumnType.NDARRAY, True, ColumnType.TEXT, [10, 10])
     with self.assertRaises(ValueError):
         SchemaUtils.get_petastorm_column(bad_column)