def create_sample_table(self):
    """Build an in-memory DataFrameMetadata fixture named "dataset".

    The fixture carries two columns: an INTEGER ``id`` and a 3-D
    NDARRAY ``data`` column of dimensions [2, 2, 3]. Nothing is
    persisted; the object is returned for use by individual tests.
    """
    metadata = DataFrameMetadata("dataset", 'dataset')
    id_column = DataFrameColumn("id", ColumnType.INTEGER, False)
    data_column = DataFrameColumn("data", ColumnType.NDARRAY, False,
                                  [2, 2, 3])
    metadata.schema = [id_column, data_column]
    return metadata
def test_should_insert_row_in_table(self):
    """End-to-end insert: create the MyVideo table via CreateExecutor,
    parse an INSERT statement, convert it to a physical InsertPlan,
    execute it, and verify the row is present in storage."""
    video_info = TableInfo('MyVideo')
    video_table = TableRef(video_info)
    schema = [
        DataFrameColumn('Frame_ID', ColumnType.INTEGER),
        DataFrameColumn('Frame_Path', ColumnType.TEXT,
                        array_dimensions=50),
    ]
    create_plan = CreatePlan(video_table, schema, False)
    create_executor = CreateExecutor(create_plan)
    url = create_executor.exec()

    insert_query = """INSERT INTO MyVideo (Frame_ID, Frame_Path) VALUES (1, '/mnt/frames/1.png'); """
    statements = Parser().parse(insert_query)
    insert_stmt = statements[0]

    # Build the logical plan from the parsed statement.
    converter = StatementToPlanConvertor()
    converter.visit(insert_stmt)
    logical_plan_node = converter.plan
    print("logical", logical_plan_node)

    # Translate the logical node into a physical insert plan and run it.
    physical_plan = InsertPlan(logical_plan_node.video_catalog_id,
                               logical_plan_node.column_list,
                               logical_plan_node.value_list)
    InsertExecutor(physical_plan).exec()

    # Verify the inserted row landed in storage.
    df = load_dataframe(url)
    self.assertEqual(df.collect()[0][0], 1)
    self.assertEqual(df.collect()[0][1], "'/mnt/frames/1.png'")
def test_df_equality(self):
    """DataFrameColumn equality must account for name, type,
    nullability, array dimensions and metadata id.

    Fixes two defects in the original test: the INTEGER/non-nullable
    case was asserted twice, and the array-dimensions / metadata-id
    cases also changed the column type, so they never isolated the
    attribute under test. Each case below differs from the reference
    column in exactly one attribute.
    """
    df_col = DataFrameColumn('name', ColumnType.TEXT, is_nullable=False)
    self.assertEqual(df_col, df_col)

    # Different name only.
    other = DataFrameColumn('name2', ColumnType.TEXT, is_nullable=False)
    self.assertNotEqual(df_col, other)

    # Different type only.
    other = DataFrameColumn('name', ColumnType.INTEGER, is_nullable=False)
    self.assertNotEqual(df_col, other)

    # Different nullability only.
    other = DataFrameColumn('name', ColumnType.TEXT, is_nullable=True)
    self.assertNotEqual(df_col, other)

    # Different array dimensions only.
    df_col.array_dimensions = [2, 4]
    other = DataFrameColumn('name', ColumnType.TEXT, is_nullable=False,
                            array_dimensions=[1, 2])
    self.assertNotEqual(df_col, other)

    # Different metadata id only.
    df_col.metadata_id = 1
    other = DataFrameColumn('name', ColumnType.TEXT, is_nullable=False,
                            array_dimensions=[2, 4], metadata_id=2)
    self.assertNotEqual(df_col, other)
def test_schema(self):
    """A DataFrameSchema preserves the order of the columns it is
    constructed from."""
    name = "foo"
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    schema = DataFrameSchema(name, columns)
    self.assertEqual(schema.column_list[0].name, "frame_id")
def test_df_column(self):
    """DataFrameColumn accessors reflect constructor arguments and
    subsequent attribute mutations; str() uses the expected format."""
    column = DataFrameColumn('name', ColumnType.TEXT, is_nullable=False)
    column.array_dimensions = [1, 2]
    column.metadata_id = 1

    self.assertEqual(column.array_dimensions, [1, 2])
    self.assertEqual(column.is_nullable, False)
    self.assertEqual(column.name, 'name')
    self.assertEqual(column.type, ColumnType.TEXT)
    self.assertEqual(column.metadata_id, 1)
    # id is only assigned on commit to the catalog database.
    self.assertEqual(column.id, None)
    self.assertEqual(str(column), 'Column: (name, TEXT, False, [1, 2])')
def test_df_metadata(self):
    """DataFrameMetadata exposes its name and file url, and assigning a
    plain column list to .schema compares equal to an explicitly
    constructed DataFrameSchema over the same columns."""
    metadata = DataFrameMetadata('name', 'eva_dataset')
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    expected_schema = DataFrameSchema('name', columns)
    # The raw list is assigned; the property is expected to wrap it.
    metadata.schema = columns

    self.assertEqual(metadata.name, 'name')
    self.assertEqual(metadata.file_url, 'eva_dataset')
    self.assertEqual(metadata.id, None)
    self.assertEqual(metadata.identifier_column, 'id')
    self.assertEqual(metadata.schema, expected_schema)
def test_schema_equality(self):
    """Schemas compare equal only when both name and column list match.

    Bug fix: the third comparand was built as
    ``DataFrameColumn('foo2', col_list)`` — passing a column list where
    a ColumnType belongs. The intent, per the surrounding assertions,
    is a DataFrameSchema with a different name.
    """
    schema_name = "foo"
    column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False)
    column_2 = DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                               [28, 28])
    column_3 = DataFrameColumn("frame_label", ColumnType.INTEGER, False)
    col_list = [column_1, column_2, column_3]

    schema1 = DataFrameSchema(schema_name, col_list)
    schema2 = DataFrameSchema(schema_name, col_list[1:])  # fewer columns
    schema3 = DataFrameSchema('foo2', col_list)           # different name

    self.assertEqual(schema1, schema1)
    self.assertNotEqual(schema1, schema2)
    self.assertNotEqual(schema1, schema3)
def test_create_plan(self):
    """A CreatePlan records operator type, the if-not-exists flag, the
    target table reference and the column definitions."""
    dummy_table = TableRef(TableInfo('dummy'))
    CatalogManager().reset()
    columns = [
        DataFrameColumn('id', ColumnType.INTEGER),
        DataFrameColumn('name', ColumnType.TEXT, array_dimensions=50),
    ]
    plan = CreatePlan(dummy_table, columns, False)

    self.assertEqual(plan.opr_type, PlanOprType.CREATE)
    self.assertEqual(plan.if_not_exists, False)
    self.assertEqual(plan.video_ref.table.table_name, "dummy")
    self.assertEqual(plan.column_list[0].name, "id")
    self.assertEqual(plan.column_list[1].name, "name")
def test_df_metadata_equality(self):
    """DataFrameMetadata equality depends on both the name and the
    assigned schema."""
    def make_columns():
        return [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]

    metadata = DataFrameMetadata('name', 'eva_dataset')
    metadata.schema = make_columns()
    self.assertEqual(metadata, metadata)

    # Same schema shape, different name -> not equal.
    col_list = make_columns()
    metadata1 = DataFrameMetadata('name2', 'eva_dataset')
    metadata1.schema = col_list
    self.assertNotEqual(metadata, metadata1)

    # Same name, different (shorter) schema -> not equal.
    metadata2 = DataFrameMetadata('name2', 'eva_dataset')
    metadata2.schema = col_list[1:]
    self.assertNotEqual(metadata1, metadata2)
def test_create_executor_should_create_table_in_storage(self):
    """Executing a CreatePlan materializes a table in storage with the
    requested columns, in order."""
    dummy_table = TableRef(TableInfo('dummy'))
    columns = [
        DataFrameColumn('id', ColumnType.INTEGER),
        DataFrameColumn('name', ColumnType.TEXT, array_dimensions=50),
    ]
    executor = CreateExecutor(CreatePlan(dummy_table, columns, False))
    url = executor.exec()

    # The created table must exist in storage with exactly our columns.
    df = load_dataframe(url)
    self.assertEqual(2, len(df.columns))
    self.assertEqual(df.columns, ['id', 'name'])
def test_get_petastorm_column(self):
    """Scalar ColumnTypes map to ScalarCodec-backed UnischemaFields;
    an unrecognized (None) type yields None."""
    col_name = 'frame_id'
    cases = [
        (ColumnType.INTEGER, np.int32, IntegerType, False),
        (ColumnType.FLOAT, np.float64, FloatType, True),
        (ColumnType.TEXT, np.str_, StringType, False),
    ]
    for col_type, np_type, spark_type, nullable in cases:
        col = DataFrameColumn(col_name, col_type, nullable)
        expected = UnischemaField(col_name, np_type, (),
                                  ScalarCodec(spark_type()), nullable)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), expected)

    # A column with no recognized type produces no petastorm field.
    col = DataFrameColumn(col_name, None, True, [10, 10])
    self.assertEqual(SchemaUtils.get_petastorm_column(col), None)
def test_df_schema(self):
    """A DataFrameSchema derives its petastorm and pyspark schemas from
    its column list via SchemaUtils."""
    name = "foo"
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    schema = DataFrameSchema(name, columns)
    reference = SchemaUtils.get_petastorm_schema(name, columns)

    self.assertEqual(schema.name, name)
    self.assertEqual(schema.column_list, columns)
    self.assertEqual(schema.petastorm_schema.fields, reference.fields)
    # Compare field-by-field as well for a clearer failure message.
    for actual_field, expected_field in zip(schema.petastorm_schema.fields,
                                            reference.fields):
        self.assertEqual(actual_field, expected_field)
    self.assertEqual(schema.pyspark_schema, reference.as_spark_schema())
def test_get_petastorm_column_ndarray(self):
    """Every NdArrayType maps to its numpy dtype inside an
    NdarrayCodec-backed UnischemaField.

    Fix: ``np.unicode_`` is a deprecated alias of ``np.str_`` and was
    removed in NumPy 2.0; use ``np.str_`` directly (they are the same
    object on NumPy < 2.0, so the expected values are unchanged).
    """
    # Expected dtypes, positionally aligned with NdArrayType's members.
    expected_type = [
        np.int8, np.uint8, np.int16, np.int32, np.int64,
        np.str_, np.bool_, np.float32, np.float64, Decimal,
        np.str_, np.datetime64,
    ]
    col_name = 'frame_id'
    for array_type, np_type in zip(NdArrayType, expected_type):
        col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                              array_type, [10, 10])
        expected_col = UnischemaField(col_name, np_type, [10, 10],
                                      NdarrayCodec(), True)
        self.assertEqual(SchemaUtils.get_petastorm_column(col),
                         expected_col)
def create_column_metadata(
        self, column_name: str, data_type: ColumnType,
        dimensions: List[int]) -> DataFrameColumn:
    """Create an uncommitted DataFrameColumn object.

    The returned column is NOT persisted in the catalog database; to
    commit it, pass it to create_metadata with the owning table id.

    Arguments:
        column_name {str} -- name of the column to create
        data_type {ColumnType} -- data type of the column
        dimensions {List[int]} -- array dimensions of the column

    Returns:
        DataFrameColumn -- the in-memory column object
    """
    return DataFrameColumn(column_name, data_type,
                           array_dimensions=dimensions)
def get_table_bindings(self, database_name: str, table_name: str,
                       column_names: List[str]) -> Tuple[int, List[int]]:
    """
    This method fetches bindings for strings
    :param database_name: currently not in use
    :param table_name: the table that is being referred to
    :param column_names: the column names of the table for which
     bindings are required
    :return: returns metadata_id of the table and a list of column ids
    """
    metadata_id = DataFrameMetadata.get_id_from_name(table_name)
    column_ids = []
    # Column lookup is skipped entirely when no column names are given;
    # the caller then receives an empty id list.
    if column_names is not None:
        column_ids = DataFrameColumn.get_id_from_metadata_id_and_name_in(
            metadata_id, column_names)
    return metadata_id, column_ids
def get_metadata(self, metadata_id: int,
                 col_id_list: List[int] = None) -> DataFrameMetadata:
    """
    This method returns the metadata object given a metadata_id,
    when requested by the executor. It will further be used by
    storage engine for retrieving the dataframe.
    :param metadata_id: metadata id of the table
    :param col_id_list: optional column ids; when given, the returned
     metadata's schema is rebuilt over just those columns
    :return: the DataFrameMetadata object
    """
    metadata = DataFrameMetadata.get(metadata_id)
    if col_id_list is not None:
        # NOTE(review): arguments are passed as (col_id_list, metadata_id);
        # confirm this matches get_by_metadata_id_and_id_in's parameter
        # order, since the method name suggests metadata_id comes first.
        df_columns = DataFrameColumn.get_by_metadata_id_and_id_in(
            col_id_list, metadata_id)
        metadata.set_schema(
            DataFrameSchema(metadata.get_name(), df_columns))
    return metadata
def test_dataset_by_name_should_return_name_of_model(
        self, dcs_mock, ds_mock, initdb_mock):
    """dataset_by_name delegates the lookup to the dataset service and
    returns the service's result unchanged (tests
    dataset_by_name in df_service.py via mocks)."""
    catalog = CatalogManager()
    file_url = "file1"
    set_name = "test_name"
    columns = [DataFrameColumn("column", ColumnType.INTEGER)]
    catalog.create_metadata(set_name, file_url, columns)
    # Mirror what create_metadata does: bind columns to the dataset id.
    for column in columns:
        column.metadata_id = \
            ds_mock.return_value.create_dataset.return_value.id

    actual = catalog._dataset_service.dataset_by_name(set_name)

    ds_mock.return_value.dataset_by_name.assert_called_with(set_name)
    expected = ds_mock.return_value.dataset_by_name.return_value
    self.assertEqual(expected, actual)
def test_create_metadata_should_create_dataset_and_columns(
        self, dcs_mock, ds_mock, initdb_mock):
    """create_metadata must create the dataset through the dataset
    service, bind the columns to it via the column service, and return
    the dataset with its schema set to the created columns."""
    catalog = CatalogManager()
    file_url = "file1"
    dataset_name = "name"
    columns = [DataFrameColumn("c1", ColumnType.INTEGER)]

    actual = catalog.create_metadata(dataset_name, file_url, columns)

    ds_mock.return_value.create_dataset.assert_called_with(
        dataset_name, file_url, identifier_id='id')
    # The columns handed to the column service carry the dataset's id.
    for column in columns:
        column.metadata_id = \
            ds_mock.return_value.create_dataset.return_value.id
    dcs_mock.return_value.create_column.assert_called_with(columns)

    expected = ds_mock.return_value.create_dataset.return_value
    expected.schema = dcs_mock.return_value.create_column.return_value
    self.assertEqual(actual, expected)
def test_raise_exception_when_unkown_array_type(self):
    """An NDARRAY column whose array type is not a valid NdArrayType
    (here a ColumnType is passed instead) must raise ValueError.

    NOTE(review): "unkown" in the method name is a typo for "unknown";
    kept as-is to preserve the test's public name.
    """
    bad_column = DataFrameColumn('frame_id', ColumnType.NDARRAY, True,
                                 ColumnType.TEXT, [10, 10])
    self.assertRaises(ValueError,
                      SchemaUtils.get_petastorm_column, bad_column)