def create_sample_table(self):
    """Build the metadata object for a small sample table.

    The metadata is created with the name "dataset" and 'dataset' as its
    second constructor argument, and its schema is set to a list of two
    columns: an INTEGER column "id" and an NDARRAY column "data" built
    with ``[2, 2, 3]`` as its fourth positional argument.

    Returns:
        The populated DataFrameMetadata instance.
    """
    id_column = DataFrameColumn("id", ColumnType.INTEGER, False)
    data_column = DataFrameColumn(
        "data", ColumnType.NDARRAY, False, [2, 2, 3])

    sample_metadata = DataFrameMetadata("dataset", 'dataset')
    sample_metadata.schema = [id_column, data_column]
    return sample_metadata
Exemple #2
0
    def test_should_insert_row_in_table(self):
        """Insert one row into a freshly created table and read it back.

        The test creates the ``MyVideo`` table with ``CreateExecutor``,
        parses an ``INSERT`` statement, converts it to a plan, runs that
        plan with ``InsertExecutor`` and finally checks the stored row.
        """
        dummy_info = TableInfo('MyVideo')
        dummy_table = TableRef(dummy_info)

        columns = [
            DataFrameColumn('Frame_ID', ColumnType.INTEGER),
            DataFrameColumn('Frame_Path', ColumnType.TEXT, array_dimensions=50)
        ]
        plan_node = CreatePlan(dummy_table, columns, False)

        createExec = CreateExecutor(plan_node)
        # The value returned by exec() is what load_dataframe() receives
        # at the end of the test.
        url = createExec.exec()

        parser = Parser()
        insert_query = """INSERT INTO MyVideo (Frame_ID, Frame_Path)
                                    VALUES    (1, '/mnt/frames/1.png');
                        """

        eva_statement_list = parser.parse(insert_query)
        insert_stmt = eva_statement_list[0]
        convertor = StatementToPlanConvertor()
        convertor.visit(insert_stmt)
        logical_plan_node = convertor.plan
        phy_plan_node = InsertPlan(logical_plan_node.video_catalog_id,
                                   logical_plan_node.column_list,
                                   logical_plan_node.value_list)

        insertExec = InsertExecutor(phy_plan_node)
        insertExec.exec()

        # Test that the row has been added to our storage. The rows are
        # collected once and reused for both assertions.
        df = load_dataframe(url)
        first_row = df.collect()[0]
        self.assertEqual(first_row[0], 1)
        # The expected path includes the single quotes of the SQL literal.
        self.assertEqual(first_row[1], "'/mnt/frames/1.png'")
Exemple #3
0
    def test_df_equality(self):
        """Check ``==`` / ``!=`` between DataFrameColumn instances.

        A column must equal itself and must differ from columns built with
        other constructor arguments.
        """
        df_col = DataFrameColumn('name', ColumnType.TEXT, is_nullable=False)
        self.assertEqual(df_col, df_col)

        # Different name.
        df_col1 = DataFrameColumn('name2', ColumnType.TEXT, is_nullable=False)
        self.assertNotEqual(df_col, df_col1)

        # Different type.
        df_col1 = DataFrameColumn('name',
                                  ColumnType.INTEGER,
                                  is_nullable=False)
        self.assertNotEqual(df_col, df_col1)

        # NOTE(review): from here on df_col1 is INTEGER while df_col is
        # TEXT, so each comparison below varies more than one argument at
        # a time. Confirm whether the nullable / array_dimensions /
        # metadata_id checks were meant to keep the type identical.
        df_col1 = DataFrameColumn('name', ColumnType.INTEGER, is_nullable=True)
        self.assertNotEqual(df_col, df_col1)

        # Different array dimensions.
        df_col.array_dimensions = [2, 4]
        df_col1 = DataFrameColumn('name',
                                  ColumnType.INTEGER,
                                  is_nullable=False,
                                  array_dimensions=[1, 2])
        self.assertNotEqual(df_col, df_col1)

        # Different metadata id.
        df_col.metadata_id = 1
        df_col1 = DataFrameColumn('name',
                                  ColumnType.INTEGER,
                                  is_nullable=False,
                                  array_dimensions=[2, 4],
                                  metadata_id=2)
        self.assertNotEqual(df_col, df_col1)
Exemple #4
0
    def test_schema(self):
        """The first entry of ``column_list`` is the first column passed."""
        columns = [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]

        schema = DataFrameSchema("foo", columns)

        first_column = schema.column_list[0]
        self.assertEqual(first_column.name, "frame_id")
Exemple #5
0
 def test_df_column(self):
     """Check attribute access and ``str()`` output of a DataFrameColumn."""
     column = DataFrameColumn('name', ColumnType.TEXT, is_nullable=False)
     column.array_dimensions = [1, 2]
     column.metadata_id = 1

     # Attributes assigned after construction.
     self.assertEqual(column.array_dimensions, [1, 2])

     # Values passed to the constructor.
     self.assertEqual(column.is_nullable, False)
     self.assertEqual(column.name, 'name')
     self.assertEqual(column.type, ColumnType.TEXT)

     self.assertEqual(column.metadata_id, 1)
     # No id was ever assigned to this column.
     self.assertEqual(column.id, None)

     expected_text = 'Column: (name, TEXT, False, [1, 2])'
     self.assertEqual(str(column), expected_text)
Exemple #6
0
    def test_df_metadata(self):
        """Check the attributes exposed by a DataFrameMetadata object.

        The schema is assigned as a plain list of columns and is expected
        to compare equal to a DataFrameSchema built from the same name and
        the same list.
        """
        columns = [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]
        expected_schema = DataFrameSchema('name', columns)

        metadata = DataFrameMetadata('name', 'eva_dataset')
        metadata.schema = columns

        self.assertEqual(metadata.name, 'name')
        self.assertEqual(metadata.file_url, 'eva_dataset')
        self.assertEqual(metadata.id, None)
        self.assertEqual(metadata.identifier_column, 'id')
        self.assertEqual(metadata.schema, expected_schema)
Exemple #7
0
 def test_schema_equality(self):
     """Check ``==`` / ``!=`` between DataFrameSchema instances.

     A schema must equal itself, differ from a schema with the same name
     but fewer columns, and differ from a schema with the same columns
     but another name.
     """
     schema_name = "foo"
     column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False)
     column_2 = DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                                [28, 28])
     column_3 = DataFrameColumn("frame_label", ColumnType.INTEGER, False)
     col_list = [column_1, column_2, column_3]
     schema1 = DataFrameSchema(schema_name, col_list)
     # Same name, column list without its first entry.
     schema2 = DataFrameSchema(schema_name, col_list[1:])
     # Same columns, different name. This has to be a DataFrameSchema so
     # that the last assertion compares two schemas rather than a schema
     # with an object of another class.
     schema3 = DataFrameSchema('foo2', col_list)
     self.assertEqual(schema1, schema1)
     self.assertNotEqual(schema1, schema2)
     self.assertNotEqual(schema1, schema3)
Exemple #8
0
    def test_create_plan(self):
        """A CreatePlan exposes the values it was constructed with."""
        table_ref = TableRef(TableInfo('dummy'))

        CatalogManager().reset()
        id_column = DataFrameColumn('id', ColumnType.INTEGER)
        name_column = DataFrameColumn('name', ColumnType.TEXT,
                                      array_dimensions=50)
        plan = CreatePlan(table_ref, [id_column, name_column], False)

        self.assertEqual(plan.opr_type, PlanOprType.CREATE)
        self.assertEqual(plan.if_not_exists, False)
        self.assertEqual(plan.video_ref.table.table_name, "dummy")

        # Columns are read back by position.
        self.assertEqual(plan.column_list[0].name, "id")
        self.assertEqual(plan.column_list[1].name, "name")
Exemple #9
0
    def test_df_metadata_equality(self):
        """Check ``==`` / ``!=`` between DataFrameMetadata instances."""
        # A metadata object equals itself.
        base = DataFrameMetadata('name', 'eva_dataset')
        base.schema = [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]
        self.assertEqual(base, base)

        # Built with another name and freshly created columns.
        renamed = DataFrameMetadata('name2', 'eva_dataset')
        renamed_columns = [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]
        renamed.schema = renamed_columns
        self.assertNotEqual(base, renamed)

        # Same constructor arguments as ``renamed`` but the schema drops
        # the first column.
        shortened = DataFrameMetadata('name2', 'eva_dataset')
        shortened.schema = renamed_columns[1:]
        self.assertNotEqual(renamed, shortened)
Exemple #10
0
    def test_create_executor_should_create_table_in_storage(self):
        """Running a CreatePlan yields a loadable table with both columns."""
        table_ref = TableRef(TableInfo('dummy'))
        id_column = DataFrameColumn('id', ColumnType.INTEGER)
        name_column = DataFrameColumn('name', ColumnType.TEXT,
                                      array_dimensions=50)
        create_plan = CreatePlan(table_ref, [id_column, name_column], False)

        # exec() returns the value that load_dataframe() is given below.
        url = CreateExecutor(create_plan).exec()

        # Verify that the table was created in our storage.
        stored = load_dataframe(url)
        self.assertEqual(2, len(stored.columns))
        self.assertEqual(stored.columns, ['id', 'name'])
Exemple #11
0
    def test_get_petastorm_column(self):
        """Check ``SchemaUtils.get_petastorm_column`` for scalar columns.

        INTEGER, FLOAT and TEXT columns are expected to produce a
        ``UnischemaField`` with the matching numpy type and scalar codec;
        a column whose type is ``None`` is expected to produce ``None``.
        """
        col_name = 'frame_id'

        # (column type, nullable flag, numpy type, spark type for the codec)
        scalar_cases = [
            (ColumnType.INTEGER, False, np.int32, IntegerType()),
            (ColumnType.FLOAT, True, np.float64, FloatType()),
            (ColumnType.TEXT, False, np.str_, StringType()),
        ]
        for column_type, nullable, numpy_type, spark_type in scalar_cases:
            col = DataFrameColumn(col_name, column_type, nullable)
            expected_field = UnischemaField(col_name, numpy_type, (),
                                            ScalarCodec(spark_type), nullable)
            self.assertEqual(SchemaUtils.get_petastorm_column(col),
                             expected_field)

        untyped_col = DataFrameColumn(col_name, None, True, [10, 10])
        self.assertEqual(SchemaUtils.get_petastorm_column(untyped_col), None)
Exemple #12
0
 def test_df_schema(self):
     """A DataFrameSchema exposes its name, columns and derived schemas.

     The petastorm and pyspark schemas are compared against the ones
     obtained from ``SchemaUtils.get_petastorm_schema`` for the same name
     and column list.
     """
     schema_name = "foo"
     col_list = [
         DataFrameColumn("frame_id", ColumnType.INTEGER, False),
         DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
         DataFrameColumn("frame_label", ColumnType.INTEGER, False),
     ]
     schema = DataFrameSchema(schema_name, col_list)
     expected_schema = SchemaUtils.get_petastorm_schema(
         schema_name, col_list)

     self.assertEqual(schema.name, schema_name)
     self.assertEqual(schema.column_list, col_list)

     # Compare the field containers as a whole, then entry by entry.
     self.assertEqual(schema.petastorm_schema.fields,
                      expected_schema.fields)
     field_pairs = zip(schema.petastorm_schema.fields,
                       expected_schema.fields)
     for actual_field, expected_field in field_pairs:
         self.assertEqual(actual_field, expected_field)

     expected_spark_schema = expected_schema.as_spark_schema()
     self.assertEqual(schema.pyspark_schema, expected_spark_schema)
Exemple #13
0
 def test_get_petastorm_column_ndarray(self):
     """Check ``get_petastorm_column`` for NDARRAY columns.

     Every ``NdArrayType`` member is paired with the numpy (or Decimal)
     type the resulting ``UnischemaField`` is expected to carry.
     """
     # np.str_ replaces the np.unicode_ alias, which NumPy 2.0 removed;
     # in NumPy 1.x both names refer to the same type.
     expected_type = [
         np.int8, np.uint8, np.int16, np.int32, np.int64, np.str_,
         np.bool_, np.float32, np.float64, Decimal, np.str_, np.datetime64
     ]
     col_name = 'frame_id'
     # NOTE(review): zip() stops at the shorter input, so this assumes
     # expected_type has one entry per NdArrayType member, in iteration
     # order -- confirm against the enum definition.
     for array_type, np_type in zip(NdArrayType, expected_type):
         # subTest makes a failure report name the offending array type
         # and lets the remaining types still be checked.
         with self.subTest(array_type=array_type):
             col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                                   array_type, [10, 10])
             petastorm_col = UnischemaField(col_name, np_type, [10, 10],
                                            NdarrayCodec(), True)
             self.assertEqual(SchemaUtils.get_petastorm_column(col),
                              petastorm_col)
Exemple #14
0
    def create_column_metadata(
            self, column_name: str, data_type: ColumnType,
            dimensions: List[int]):
        """Build a dataframe column object for the given column.

        The object is only constructed and returned here; this function
        makes no other call. To commit it into the catalog table, call
        create_metadata with the corresponding table_id.

        Arguments:
            column_name {str} -- name of the column to create
            data_type {ColumnType} -- type of the column
            dimensions {List[int]} -- passed on as the column's
                array_dimensions

        Returns:
            DataFrameColumn -- the newly constructed column object
        """
        new_column = DataFrameColumn(
            column_name,
            data_type,
            array_dimensions=dimensions,
        )
        return new_column
Exemple #15
0
    def get_table_bindings(self, database_name: str, table_name: str,
                           column_names: List[str]) -> Tuple[int, List[int]]:
        """
        Resolve a table name and its column names to catalog ids.
        :param database_name: currently not in use
        :param table_name: name of the table being referred to
        :param column_names: names of the table's columns for which
        bindings are required; may be None
        :return: the metadata_id of the table and a list of column ids
        (an empty list when column_names is None)
        """
        table_id = DataFrameMetadata.get_id_from_name(table_name)

        # Without column names there is nothing to look up.
        if column_names is None:
            return table_id, []

        bound_ids = DataFrameColumn.get_id_from_metadata_id_and_name_in(
            table_id, column_names)
        return table_id, bound_ids
Exemple #16
0
 def get_metadata(self,
                  metadata_id: int,
                  col_id_list: List[int] = None) -> DataFrameMetadata:
     """
     Fetch the metadata object for a metadata_id, as requested by the
     executor. The storage engine later uses it to retrieve the
     dataframe.
     :param metadata_id: metadata id of the table
     :param col_id_list: optional ids of the table's columns; when given,
     the returned metadata gets a schema built from those columns
     :return: the metadata object looked up by metadata_id
     """
     table_metadata = DataFrameMetadata.get(metadata_id)

     # No column ids requested: hand the metadata back untouched.
     if col_id_list is None:
         return table_metadata

     selected_columns = DataFrameColumn.get_by_metadata_id_and_id_in(
         col_id_list, metadata_id)
     restricted_schema = DataFrameSchema(table_metadata.get_name(),
                                         selected_columns)
     table_metadata.set_schema(restricted_schema)
     return table_metadata
    def test_dataset_by_name_should_return_name_of_model(
            self, dcs_mock, ds_mock, initdb_mock):
        """Exercise dataset_by_name (df_service.py) through the catalog.

        The catalog's dataset service must be called with the dataset
        name, and the value it returns must be the mock's return value.
        """
        set_name = "test_name"
        file_url = "file1"
        dataset_service_mock = ds_mock.return_value

        catalog = CatalogManager()
        columns = [DataFrameColumn("column", ColumnType.INTEGER)]
        catalog.create_metadata(set_name, file_url, columns)

        created_dataset_id = \
            dataset_service_mock.create_dataset.return_value.id
        for column in columns:
            column.metadata_id = created_dataset_id

        real = catalog._dataset_service.dataset_by_name(set_name)
        dataset_service_mock.dataset_by_name.assert_called_with(set_name)

        expected = dataset_service_mock.dataset_by_name.return_value
        self.assertEqual(expected, real)
Exemple #18
0
    def test_create_metadata_should_create_dataset_and_columns(
            self, dcs_mock, ds_mock, initdb_mock):
        """create_metadata must call both mocked services.

        The dataset service is expected to receive the name, the file url
        and ``identifier_id='id'``; the column service is expected to
        receive the column list.
        """
        dataset_name = "name"
        file_url = "file1"
        dataset_service_mock = ds_mock.return_value
        column_service_mock = dcs_mock.return_value

        catalog = CatalogManager()
        columns = [DataFrameColumn("c1", ColumnType.INTEGER)]
        actual = catalog.create_metadata(dataset_name, file_url, columns)

        dataset_service_mock.create_dataset.assert_called_with(
            dataset_name, file_url, identifier_id='id')

        # Give the local columns the mocked dataset's id before checking
        # the create_column call.
        created_dataset_id = \
            dataset_service_mock.create_dataset.return_value.id
        for column in columns:
            column.metadata_id = created_dataset_id
        column_service_mock.create_column.assert_called_with(columns)

        expected = dataset_service_mock.create_dataset.return_value
        expected.schema = column_service_mock.create_column.return_value

        self.assertEqual(actual, expected)
Exemple #19
0
 def test_raise_exception_when_unkown_array_type(self):
     """get_petastorm_column raises ValueError for this NDARRAY column.

     The column is built with ``ColumnType.TEXT`` as its fourth
     positional argument.
     """
     bad_column = DataFrameColumn('frame_id', ColumnType.NDARRAY, True,
                                  ColumnType.TEXT, [10, 10])
     with self.assertRaises(ValueError):
         SchemaUtils.get_petastorm_column(bad_column)