def test_by_dataset_id_and_empty_col_ids_should_query_correctly(
            self, mocked):
        return_val = [[1], [2], [3]]
        mocked.query.filter \
            .return_value.all.return_value = return_val

        service = DatasetColumnService()
        metadata_id = 123
        column_ids = None
        actual = service.columns_by_id_and_dataset_id(metadata_id, column_ids)
        mocked.query.filter.assert_called_with(
            mocked._metadata_id == metadata_id)
        self.assertEqual(actual, return_val)
    def test_column_by_metadata_id_and_names_should_query_correctly(
            self, mocked):
        mocked.query.with_entities.return_value.filter \
            .return_value.all.return_value = [[1], [2], [3]]

        service = DatasetColumnService()
        metadata_id = 123
        column_names = ["a", "b"]
        actual = service.columns_by_dataset_id_and_names(
            metadata_id, column_names)
        mocked.query.with_entities.assert_called_with(mocked._id)
        mocked.query.with_entities.return_value.filter.assert_called_with(
            mocked._metadata_id == metadata_id, mocked._name.in_(column_names))
        expected = [1, 2, 3]

        self.assertEqual(actual, expected)
Beispiel #3
0
 def __init__(self):
     self._dataset_service = DatasetService()
     self._column_service = DatasetColumnService()
     self._udf_service = UdfService()
     self._udf_io_service = UdfIOService()
Beispiel #4
0
class CatalogManager(object):
    _instance = None
    _catalog = None
    _catalog_dictionary = {}

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(CatalogManager, cls).__new__(cls)

            cls._instance.bootstrap_catalog()

        return cls._instance

    def __init__(self):
        self._dataset_service = DatasetService()
        self._column_service = DatasetColumnService()
        self._udf_service = UdfService()
        self._udf_io_service = UdfIOService()

    def bootstrap_catalog(self):
        """Bootstraps catalog.

        This method runs all tasks required for using catalog. Currently,
        it includes only one task ie. initializing database. It creates the
        catalog database and tables if they do not exist.

        """
        LoggingManager().log("Bootstrapping catalog", LoggingLevel.INFO)
        init_db()

    def create_metadata(self, name: str, file_url: str,
                        column_list: List[DataFrameColumn]) -> \
            DataFrameMetadata:
        """Creates metadata object when called by create executor.

        Creates a metadata object and column objects and persists them in
        database. Sets the schema field of the metadata object.

        Args:
            name: name of the dataset/video to which this metdata corresponds
            file_url: #todo
            column_list: list of columns

        Returns:
            The persisted DataFrameMetadata object with the id field populated.
        """

        metadata = self._dataset_service.create_dataset(name, file_url)
        for column in column_list:
            column.metadata_id = metadata.id
        column_list = self._column_service.create_column(column_list)
        metadata.schema = column_list
        return metadata

    def get_table_bindings(self, database_name: str, table_name: str,
                           column_names: List[str] = None) -> Tuple[int,
                                                                    List[int]]:
        """This method fetches bindings for strings.

        Args:
            database_name: currently not in use
            table_name: the table that is being referred to
            column_names: the column names of the table for which
           bindings are required

        Returns:
            returns metadata_id of table and a list of column ids
        """

        metadata_id = self._dataset_service.dataset_by_name(table_name)
        column_ids = []
        if column_names is not None:
            if not isinstance(column_names, list):
                LoggingManager().log(
                    "CatalogManager::get_table_binding() expected list",
                    LoggingLevel.WARNING)
            column_ids = self._column_service.columns_by_dataset_id_and_names(
                metadata_id,
                column_names)
        return metadata_id, column_ids

    def get_metadata(self, metadata_id: int,
                     col_id_list: List[int] = None) -> DataFrameMetadata:
        """This method returns the metadata object given a metadata_id,
        when requested by the executor.

        Args:
            metadata_id: metadata id of the table
            col_id_list: optional column ids of the table referred.
                         If none we all the columns are required

        Returns:
            metadata object with all the details of video/dataset
        """
        metadata = self._dataset_service.dataset_by_id(metadata_id)
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            metadata_id, col_id_list)
        metadata.schema = df_columns
        return metadata

    def get_column_types(self, table_metadata_id: int,
                         col_id_list: List[int]) -> List[ColumnType]:
        """
        This method consumes the input table_id and the input column_id_list
        and
        returns a list of ColumnType for each provided column_id.

        Arguments:
            table_metadata_id {int} -- [metadata_id of the table]
            col_id_list {List[int]} -- [metadata ids of the columns; If list
            = None, return type for all columns in the table]

        Returns:
            List[ColumnType] -- [list of required column type for each input
            column]
        """
        metadata = self._dataset_service.dataset_by_id(table_metadata_id)
        col_types = []
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            metadata.id, col_id_list
        )
        for col in df_columns:
            col_types.append(col.type)

        return col_types

    def get_column_ids(self, table_metadata_id: int) -> List[int]:
        """
        This method returns all the column_ids associated with the given
        table_metadata_id

        Arguments:
            table_metadata_id {int} -- [table metadata id for which columns
            are required]

        Returns:
            List[int] -- [list of columns ids for this table]
        """

        col_ids = []
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            table_metadata_id)
        for col in df_columns:
            col_ids.append(col[0])

        return col_ids

    def create_column_metadata(
            self, column_name: str, data_type: ColumnType,
            dimensions: List[int]):
        """Create a dataframe column object this column.
        This function won't commit this object in the catalog database.
        If you want to commit it into catalog table call create_metadata with
        corresponding table_id

        Arguments:
            column_name {str} -- column name to be created
            data_type {ColumnType} -- type of column created
            dimensions {List[int]} -- dimensions of the column created
        """
        return DataFrameColumn(column_name, data_type,
                               array_dimensions=dimensions)

    def get_dataset_metadata(self, database_name: str, dataset_name: str,
                             column_names: List[str] = None) -> \
            DataFrameMetadata:
        """
        Returns the Dataset metadata for the given dataset name
        Arguments:
            dataset_name (str): name of the dataset

        Returns:
            DataFrameMetadata
        """

        return self._dataset_service.dataset_object_by_name(
            database_name, dataset_name)

    def udf_io(
            self, io_name: str, data_type: ColumnType,
            dimensions: List[int], is_input: bool):
        """Constructs an in memory udf_io object with given info.
        This function won't commit this object in the catalog database.
        If you want to commit it into catalog call create_udf with
        corresponding udf_id and io list

        Arguments:
            name(str): io name to be created
            data_type(ColumnType): type of io created
            dimensions(List[int]):dimensions of the io created
            is_input(bool): whether a input or output, if true it is an input
        """
        return UdfIO(io_name, data_type,
                     array_dimensions=dimensions, is_input=is_input)

    def create_udf(self, name: str, impl_file_path: str,
                   type: str, udf_io_list: List[UdfIO]) -> UdfMetadata:
        """Creates an udf metadata object and udf_io objects and persists them 
        in database.

        Arguments:
            name(str): name of the udf to which this metdata corresponds
            impl_file_path(str): implementation path of the udf, 
                                 relative to src/udf
            type(str): what kind of udf operator like classification, 
                                                        detection etc
            udf_io_list(List[UdfIO]): input/output info of this udf

        Returns:
            The persisted UdfMetadata object with the id field populated.
        """

        metadata = self._udf_service.create_udf(name, impl_file_path, type)
        for udf_io in udf_io_list:
            udf_io.udf_id = metadata.id
        self._udf_io_service.add_udf_io(udf_io_list)
        return metadata
 def test_failure_delete_columns(self):
     mocks = [MagicMock() for i in range(1)]
     mocks[0].side_effect = Exception()
     service = DatasetColumnService()
     actual = service.delete_column(mocks)
     self.assertEqual(actual, None)
 def test_delete_all_columns(self):
     mocks = [MagicMock() for i in range(5)]
     service = DatasetColumnService()
     service.delete_column(mocks)
     for mock in mocks:
         mock.delete.assert_called_once()
 def test_create_should_create_all_columns(self):
     mocks = [MagicMock() for i in range(5)]
     service = DatasetColumnService()
     service.create_column(mocks)
     for mock in mocks:
         mock.save.assert_called_once()
Beispiel #8
0
class CatalogManager(object):
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(CatalogManager, cls).__new__(cls)

            cls._instance._bootstrap_catalog()

        return cls._instance

    def __init__(self):
        self._dataset_service = DatasetService()
        self._column_service = DatasetColumnService()
        self._udf_service = UdfService()
        self._udf_io_service = UdfIOService()

    def reset(self):
        """
        This method resets the state of the singleton instance.
        It should drop the catalog table and reinitialize all the member
        variables and services.
        """
        self._shutdown_catalog()
        self._bootstrap_catalog()
        self.__init__()

    def _bootstrap_catalog(self):
        """Bootstraps catalog.

        This method runs all tasks required for using catalog. Currently,
        it includes only one task ie. initializing database. It creates the
        catalog database and tables if they do not exist.

        """
        LoggingManager().log("Bootstrapping catalog", LoggingLevel.INFO)
        init_db()

    def _shutdown_catalog(self):
        """
        This method is responsible for gracefully shutting the
        catalog manager. Currently, it includes dropping the catalog database
        """
        LoggingManager().log("Shutting catalog", LoggingLevel.INFO)
        drop_db()

    def create_metadata(self, name: str, file_url: str,
                        column_list: List[DataFrameColumn],
                        identifier_column='id') -> \
            DataFrameMetadata:
        """Creates metadata object

        Creates a metadata object and column objects and persists them in
        database. Sets the schema field of the metadata object.

        Args:
            name: name of the dataset/video to which this metdata corresponds
            file_url: #todo
            column_list: list of columns
            identifier_column (str):  A unique identifier column for each row

        Returns:
            The persisted DataFrameMetadata object with the id field populated.
        """

        metadata = self._dataset_service.create_dataset(
            name, file_url, identifier_id=identifier_column)
        for column in column_list:
            column.metadata_id = metadata.id
        column_list = self._column_service.create_column(column_list)
        metadata.schema = column_list
        return metadata

    def get_table_bindings(
            self,
            database_name: str,
            table_name: str,
            column_names: List[str] = None) -> Tuple[int, List[int]]:
        """This method fetches bindings for strings.

        Args:
            database_name: currently not in use
            table_name: the table that is being referred to
            column_names: the column names of the table for which
           bindings are required

        Returns:
            returns metadata_id of table and a list of column ids
        """

        metadata_id = self._dataset_service.dataset_by_name(table_name)
        column_ids = []
        if column_names is not None:
            if not isinstance(column_names, list):
                LoggingManager().log(
                    "CatalogManager::get_table_binding() expected list",
                    LoggingLevel.WARNING)
            column_ids = self._column_service.columns_by_dataset_id_and_names(
                metadata_id, column_names)
        return metadata_id, column_ids

    def get_metadata(self,
                     metadata_id: int,
                     col_id_list: List[int] = None) -> DataFrameMetadata:
        """This method returns the metadata object given a metadata_id.

        Arguments:
            metadata_id: metadata id of the table
            col_id_list: optional column ids of the table referred.
                         If none all the columns are required

        Returns:
            metadata object with all the details of video/dataset
        """
        metadata = self._dataset_service.dataset_by_id(metadata_id)
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            metadata_id, col_id_list)
        metadata.schema = df_columns
        return metadata

    def get_column_types(self, table_metadata_id: int,
                         col_id_list: List[int]) -> List[ColumnType]:
        """
        This method consumes the input table_id and the input column_id_list
        and
        returns a list of ColumnType for each provided column_id.

        Arguments:
            table_metadata_id {int} -- [metadata_id of the table]
            col_id_list {List[int]} -- [metadata ids of the columns; If list
            = None, return type for all columns in the table]

        Returns:
            List[ColumnType] -- [list of required column type for each input
            column]
        """
        metadata = self._dataset_service.dataset_by_id(table_metadata_id)
        col_types = []
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            metadata.id, col_id_list)
        for col in df_columns:
            col_types.append(col.type)

        return col_types

    def get_column_ids(self, table_metadata_id: int) -> List[int]:
        """
        This method returns all the column_ids associated with the given
        table_metadata_id

        Arguments:
            table_metadata_id {int} -- [table metadata id for which columns
            are required]

        Returns:
            List[int] -- [list of columns ids for this table]
        """

        col_ids = []
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            table_metadata_id)
        for col in df_columns:
            col_ids.append(col[0])

        return col_ids

    def create_column_metadata(self, column_name: str, data_type: ColumnType,
                               array_type: NdArrayType,
                               dimensions: List[int]) -> DataFrameColumn:
        """Create a dataframe column object this column.
        This function won't commit this object in the catalog database.
        If you want to commit it into catalog table call create_metadata with
        corresponding table_id

        Arguments:
            column_name {str} -- column name to be created
            data_type {ColumnType} -- type of column created
            array_type {NdArrayType} -- type of ndarray
            dimensions {List[int]} -- dimensions of the column created
        """
        return DataFrameColumn(column_name,
                               data_type,
                               array_type=array_type,
                               array_dimensions=dimensions)

    def get_dataset_metadata(self, database_name: str, dataset_name: str) -> \
            DataFrameMetadata:
        """
        Returns the Dataset metadata for the given dataset name
        Arguments:
            dataset_name (str): name of the dataset

        Returns:
            DataFrameMetadata
        """

        metadata = self._dataset_service.dataset_object_by_name(
            database_name, dataset_name)
        if metadata is None:
            return None
        # we are forced to set schema every time metadata is fetched
        # ToDo: maybe keep schema as a part of persistent metadata object
        df_columns = self._column_service.columns_by_id_and_dataset_id(
            metadata.id, None)
        metadata.schema = df_columns
        return metadata

    def udf_io(self, io_name: str, data_type: ColumnType,
               array_type: NdArrayType, dimensions: List[int], is_input: bool):
        """Constructs an in memory udf_io object with given info.
        This function won't commit this object in the catalog database.
        If you want to commit it into catalog call create_udf with
        corresponding udf_id and io list

        Arguments:
            name(str): io name to be created
            data_type(ColumnType): type of io created
            array_type(NdArrayType): type of array content
            dimensions(List[int]):dimensions of the io created
            is_input(bool): whether a input or output, if true it is an input
        """
        return UdfIO(io_name,
                     data_type,
                     array_type=array_type,
                     array_dimensions=dimensions,
                     is_input=is_input)

    def create_udf(self, name: str, impl_file_path: str, type: str,
                   udf_io_list: List[UdfIO]) -> UdfMetadata:
        """Creates an udf metadata object and udf_io objects and persists them
        in database.

        Arguments:
            name(str): name of the udf to which this metdata corresponds
            impl_file_path(str): implementation path of the udf,
                                 relative to src/udf
            type(str): what kind of udf operator like classification,
                                                        detection etc
            udf_io_list(List[UdfIO]): input/output info of this udf

        Returns:
            The persisted UdfMetadata object with the id field populated.
        """

        metadata = self._udf_service.create_udf(name, impl_file_path, type)
        for udf_io in udf_io_list:
            udf_io.udf_id = metadata.id
        self._udf_io_service.add_udf_io(udf_io_list)
        return metadata

    def get_udf_by_name(self, name: str) -> UdfMetadata:
        """
        Get the UDF information based on name.

        Arguments:
             name (str): name of the UDF

        Returns:
            UdfMetadata object
        """
        return self._udf_service.udf_by_name(name)

    def delete_metadata(self, table_name: str) -> bool:
        """
        This method deletes the table along with its columns from df_metadata
        and df_columns respectively

        Arguments:
           table_name: table name to be deleted.

        Returns:
           True if successfully deleted else False
        """
        metadata_id = self._dataset_service.dataset_by_name(table_name)
        return self._dataset_service.delete_dataset_by_id(metadata_id)

    def delete_udf(self, udf_name: str) -> bool:
        """
        This method drops the udf entry from the catalog

        Arguments:
           udf_name: udf name to be dropped.

        Returns:
           True if successfully deleted else False
        """
        return self._udf_service.delete_udf_by_name(udf_name)

    def get_udf_io_by_name(self, udf_io_name: str) -> UdfIO:
        """Returns the catalog object for the input udfio name

        Args:
            udf_io_name (str): name to query the UDFIO catalog table

        Returns:
            UdfIO: catalog object found
        """
        return self._udf_io_service.udf_io_by_name(udf_io_name)