Example #1
    def __init__(self, path_to_db_files, path_to_output_db_file=None):
        """Initialize V1BaseDBHandler.

        Args:
            path_to_db_files (list): list of DB file paths with extension '.arrow'
            path_to_output_db_file (str): path to the output DB file for saving

        """
        if isinstance(path_to_db_files, str):
            path_to_db_files = [path_to_db_files]

        # Check file extension
        for file in path_to_db_files:
            if not file.endswith('.arrow'):
                raise ValueError('DB file path must end with ".arrow"')

        self.path_to_db_file = path_to_db_files[0]
        if path_to_output_db_file is not None:
            self.path_to_db_file = path_to_output_db_file
        self._cursor = 0
        self.logger = logging.getLogger(__name__)

        # load config
        self._config = load_config(self.__version__)
        self.columns = self._config[self.df_name]['columns']

        # initialize dataframe
        try:
            self._df = vaex.open_many(path_to_db_files)
        except IOError:
            self._df = None
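
A minimal usage sketch for the constructor above (hypothetical; the file names are placeholders):

# Assumes V1BaseDBHandler is importable and the '.arrow' files exist.
handler = V1BaseDBHandler(path_to_db_files=['db_1.arrow', 'db_2.arrow'],
                          path_to_output_db_file='merged.arrow')
# A single string is also accepted and wrapped into a list internally:
handler = V1BaseDBHandler(path_to_db_files='db_1.arrow')
# Any path not ending in '.arrow' raises ValueError.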
Example #2
    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=False,
                 **kwargs):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)
            df_name (str): dataframe (table in DB) name (if None, class default value will be used)
            read_on_init (bool): if True, dataframe will be read from database on initialization

        """
        super(BaseDBHandler, self).__init__()
        self.logger = logging.getLogger(__name__)
        self._cursor = 0
        self._data = {}
        self._uuids_to_remove = []
        self._count_total = 0
        if df_name is not None:
            self.df_name = df_name

        # Load config
        config = load_config(self.__version__)
        try:
            self._config = ConfigDict(config['db']['df_class'][self._df_class])
        except KeyError:
            self._config = ConfigDict()

        # Initialize deepmerger
        self._merger = Merger(
            # pass in a list of tuple, with the
            # strategies you are looking to apply
            # to each type.
            [(dict, ['merge']),
             (list, [_deepmerge_append_list_unique, 'append'])],
            # next, choose the fallback strategies,
            # applied to all other types:
            ["override"],
            # finally, choose the strategies in
            # the case where the types conflict:
            ["override"])

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username,
                                db_password)
        self._load_config_from_db()

        # Fetch table
        if read_on_init:
            self.read()
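
A minimal usage sketch (hypothetical argument values; in the v4 design shown in Example #9, instantiating BaseDBHandler directly dispatches to a registered subclass via `db_class`):

# All connection arguments fall back to config-file defaults when None.
handler = BaseDBHandler(db_class='meta',
                        db_engine='tinydb',
                        df_name='meta_df',
                        read_on_init=True)  # fetch the table immediately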
Example #3
    def default_config(cls):
        """Return default configurations.

        Returns:
            (AttrDict): default configurations

        """
        return load_config(cls.__version__)
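
Because `default_config` is a classmethod, the defaults can be inspected without instantiating a handler (a sketch; the attribute path assumes the v2 config layout used in Example #11):

defaults = BaseDBHandler.default_config()
print(defaults.sql.base)  # default connection settings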
Example #4
class TimeSeriesDBHandler(BaseDBHandler):
    """Time series database handler."""

    __version__ = 'v2'
    db_defaults = load_config(__version__).sql.time_series_defaults
    _df_name = 'time_series_df'
    _columns = None

    def read(self, *args, **kwargs):
        """Read data from SQL.

        Args:
            *args: args
            **kwargs: kwargs

        Returns:
            (pandas.DataFrame): table

        """
        # Read table from DB
        if self._config.current_db['engine'] in ['influxdb']:
            super().read(*args, **kwargs)
        else:
            super().read(*args, **kwargs, index_col='timestamp')

        # Post-process
        try:
            df = self.df
            df.reset_index(inplace=True)
            df.rename(columns={'index': 'timestamp'}, inplace=True)
            df['timestamp'] = df['timestamp'].astype(int) / 10**9
        except Exception as e:
            self.logger.warning(
                'An error occurred in post-process: {}'.format(e))
            df = self._initialize_df()

        self.df = df

    def save(self, df=None, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): dataframe to save (if None, self.df will be saved)

        """
        dataframe: pd.DataFrame = df if df is not None else self.df

        # Convert timestamp (float) to datetime
        assert 'timestamp' in dataframe.columns
        df_to_save = dataframe.copy()
        df_to_save['timestamp'] = pd.to_datetime(
            df_to_save['timestamp'].astype(float), unit='s')
        df_to_save.set_index('timestamp', inplace=True)

        if self._config.current_db['engine'] in ['influxdb']:
            super().save(df=df_to_save)
        else:
            super().save(df=df_to_save, index=True)
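
A hypothetical round-trip with this handler, illustrating the float-seconds convention for the 'timestamp' column (a sketch; assumes a reachable database configured via the defaults):

import pandas as pd

handler = TimeSeriesDBHandler(read_on_init=False)
handler.df = pd.DataFrame({
    'timestamp': [1600000000.0, 1600000001.0],  # UNIX seconds (float)
    'value': [1.0, 2.0],
})
handler.save()  # 'timestamp' converted to datetime and used as the index
handler.read()  # read back; 'timestamp' restored as float seconds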
Example #5
class TimeSeriesDBHandler(_BaseDBHandler):
    """Time series database handler."""

    __version__ = 'v3'
    db_defaults = load_config(__version__).sql.time_series
    _df_class = 'time_series_df'
    _df_name = 'time_series_df'
    _columns = None

    def _initialize_engine(self,
                           db_engine=None,
                           db_host=None,
                           db_name=None,
                           db_username=None,
                           db_password=None):
        """Initialize DB engine."""
        # Load settings from environment variables
        engine = db_engine if db_engine is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_ENGINE', None)
        username = db_username if db_username is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_USERNAME', None)
        password = db_password if db_password is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_PASSWORD', None)
        host = db_host if db_host is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_HOST', None)
        database = db_name if db_name is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_DATABASE', None)

        super()._initialize_engine(engine, host, database, username, password)

    def save(self, df=None, remove_duplicates=False, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): DataFrame to save (if None, self.df will be saved)
            remove_duplicates (bool): if True, duplicated rows will be removed
            **kwargs: kwargs for function `pandas.dataframe.to_sql`

        """
        dataframe = df.copy() if df is not None else self.df.copy()
        dataframe = dataframe[~dataframe.index.duplicated(keep='last')]

        if 'index' not in kwargs.keys():
            kwargs.update({'index': True})
        self._prepare_columns()

        # TODO: alter column 'uuid_in_df' as primary key or wait for the PR below
        # TODO: Use `if_exists=upsert_overwrite` after the following PR is merged
        # https://github.com/pandas-dev/pandas/pull/29636
        dataframe.to_sql(self.df_name,
                         self._engine,
                         if_exists='append',
                         dtype={'uuid_in_df': VARCHAR(32)},
                         method=_replace_into,
                         **kwargs)
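
Since `_initialize_engine` falls back to environment variables when the constructor arguments are None, the v3 handler can be configured entirely through the environment (a sketch with placeholder values):

import os

os.environ['PYDTK_TIME_SERIES_DB_ENGINE'] = 'timescaledb'
os.environ['PYDTK_TIME_SERIES_DB_HOST'] = 'localhost:5432'
os.environ['PYDTK_TIME_SERIES_DB_USERNAME'] = 'user'
os.environ['PYDTK_TIME_SERIES_DB_PASSWORD'] = 'password'
os.environ['PYDTK_TIME_SERIES_DB_DATABASE'] = 'mydb'

handler = TimeSeriesDBHandler()  # picks up the variables above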
Example #6
    def _get_handler(cls, db_class, db_engine=None):
        """Returns an appropriate handler.

        Args:
            db_class (str): database class (e.g. 'meta')
            db_engine (str): database engine (e.g. 'tinydb')

        Returns:
            (handler): database handler object

        """
        # Load default config
        config = load_config(cls.__version__)

        # Check if the db_class is available
        if db_class not in DB_HANDLERS.keys():
            raise ValueError('Unsupported db_class: {}'.format(db_class))

        # Get db_engine from environment variable if not specified
        if db_engine is None:
            db_engine = os.environ.get(
                'PYDTK_{}_DB_ENGINE'.format(db_class.upper()), None)

        # Get the default engine if not specified
        if db_engine is None:
            try:
                db_defaults = getattr(config.db.connection, db_class)
                db_engine = db_defaults.engine
            except (ValueError, AttributeError):
                raise ValueError(
                    'Could not find the default value for `db_engine`')

        # Check if the corresponding handler is registered
        if db_engine not in DB_HANDLERS[db_class].keys():
            raise ValueError('Unsupported db_engine: {}'.format(db_engine))

        # Get a DB-handler supporting the engine
        return DB_HANDLERS[db_class][db_engine]
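
The engine is resolved in order: explicit argument, then the PYDTK_<CLASS>_DB_ENGINE environment variable, then the config default. A sketch (normally `_get_handler` is invoked internally by `__new__`, as in Example #9; the `DBHandler` name follows the import in Example #8):

import os

cls_explicit = DBHandler._get_handler('meta', db_engine='tinydb')  # argument wins

os.environ['PYDTK_META_DB_ENGINE'] = 'tinydb'
cls_from_env = DBHandler._get_handler('meta')  # environment variable

del os.environ['PYDTK_META_DB_ENGINE']
cls_default = DBHandler._get_handler('meta')  # config.db.connection.meta.engine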
Example #7
    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=True):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)
            df_name (str): dataframe (table in DB) name (if None, class default value will be used)
            read_on_init (bool): if True, dataframe will be read from database on initialization

        """
        self._cursor = 0
        self.logger = logging.getLogger(__name__)
        if df_name is not None:
            self.df_name = df_name

        # Load config
        self._config = load_config(self.__version__)

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username,
                                db_password)

        # Initialize dataframe
        self.df = self._initialize_df()

        # Fetch table
        if read_on_init:
            self.read()
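
Note that this v2 constructor defaults `read_on_init` to True, while the v4 constructor (Example #2) defaults to False. To construct without hitting the database immediately (a sketch):

handler = BaseDBHandler(df_name='my_df', read_on_init=False)
handler.read()  # fetch explicitly later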
Example #8
    def generate(target: str,
                 from_file: str = None,
                 template: str = None,
                 database_id: str = 'default',
                 record_id: str = None,
                 content: str = 'content',
                 base_dir: str = None):
        """Generate template or metadata from a model or a file.

        Args:
            target (str): What to generate ('template' or 'metadata')
            from_file (str): File path to generate metadata from
            template (str): JSON file to use as a template of metadata
            database_id (str): Database ID
            record_id (str): Record ID
            content (str): Content (key of dict `contents`)
            base_dir (str): Base directory

        """
        assert target in ['template', 'metadata'], \
            'Target must be either "template" or "metadata"'

        from pydtk.db import DBHandler
        from pydtk.bin.make_meta import _get_timestamps_info, _get_contents_info
        from pydtk.utils.utils import load_config

        default_config = load_config('v4').bin.make_meta
        data = {k: None for k in default_config['common_item'].keys()}

        db_handler = DBHandler(db_class='meta', database_id=database_id)
        config = db_handler.config

        if target == 'template':
            data.update({column['name']: None for column in config['columns']})

            if template:
                data.update(template)

        elif target == 'metadata':
            if template:
                with open(template, 'r') as f:
                    template_data = f.read()
                data.update(json.loads(template_data))

            if from_file is None:
                raise ValueError(
                    'Please specify a file to generate metadata from using option "--from-file"'
                )

            path = os.path.abspath(from_file)
            if base_dir:
                path = os.path.relpath(from_file, base_dir)

            data["path"] = path
            try:
                data["contents"] = _get_contents_info(from_file)
            except NotImplementedError:
                pass
            try:
                data["start_timestamp"], data[
                    "end_timestamp"] = _get_timestamps_info(from_file)
            except NotImplementedError:
                pass

        # Post process
        if record_id is not None:
            data['record_id'] = record_id
        if content is not None:
            # TODO: Specify content prefix
            pass

        # Display
        print(json.dumps(data, indent=4, default=_default_json_handler))
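
A hypothetical direct invocation (in practice `generate` is exposed as a CLI sub-command; the paths and IDs are placeholders):

generate(target='metadata',
         from_file='/data/records/sample.bag',
         template='template.json',     # optional metadata template
         record_id='sample-001')
# target='template' instead prints a skeleton with every configured
# column set to None.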
Example #9
class BaseDBHandler(object):
    """Base handler for db."""

    __version__ = 'v4'
    db_defaults = load_config(__version__).db.connection.base
    _df_class = 'base_df'
    _config = AttrDict()
    _df_name = 'base_df'
    _columns = None

    def __new__(cls,
                db_class: str = None,
                db_engine: str = None,
                **kwargs) -> object:
        """Create object.

        Args:
            db_class (str): database class (e.g. 'meta')
            db_engine (str): database engine (e.g. 'sqlite')
            **kwargs: DB-handler specific arguments

        Returns:
            (object): the corresponding handler object

        """
        if cls is BaseDBHandler:
            handler = cls._get_handler(db_class, db_engine)
            return super(BaseDBHandler, cls).__new__(handler)
        else:
            return super(BaseDBHandler, cls).__new__(cls)

    @classmethod
    def _get_handler(cls, db_class, db_engine=None):
        """Returns an appropriate handler.

        Args:
            db_class (str): database class (e.g. 'meta')
            db_engine (str): database engine (e.g. 'tinydb')

        Returns:
            (handler): database handler object

        """
        # Load default config
        config = load_config(cls.__version__)

        # Check if the db_class is available
        if db_class not in DB_HANDLERS.keys():
            raise ValueError('Unsupported db_class: {}'.format(db_class))

        # Get db_engine from environment variable if not specified
        if db_engine is None:
            db_engine = os.environ.get(
                'PYDTK_{}_DB_ENGINE'.format(db_class.upper()), None)

        # Get the default engine if not specified
        if db_engine is None:
            try:
                db_defaults = getattr(config.db.connection, db_class)
                db_engine = db_defaults.engine
            except (ValueError, AttributeError):
                raise ValueError(
                    'Could not find the default value for `db_engine`')

        # Check if the corresponding handler is registered
        if db_engine not in DB_HANDLERS[db_class].keys():
            raise ValueError('Unsupported db_engine: {}'.format(db_engine))

        # Get a DB-handler supporting the engine
        return DB_HANDLERS[db_class][db_engine]

    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=False,
                 **kwargs):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)
            df_name (str): dataframe (table in DB) name (if None, class default value will be used)
            read_on_init (bool): if True, dataframe will be read from database on initialization

        """
        super(BaseDBHandler, self).__init__()
        self.logger = logging.getLogger(__name__)
        self._cursor = 0
        self._data = {}
        self._uuids_to_remove = []
        self._count_total = 0
        if df_name is not None:
            self.df_name = df_name

        # Load config
        config = load_config(self.__version__)
        try:
            self._config = ConfigDict(config['db']['df_class'][self._df_class])
        except KeyError:
            self._config = ConfigDict()

        # Initialize deepmerger
        self._merger = Merger(
            # pass in a list of tuple, with the
            # strategies you are looking to apply
            # to each type.
            [(dict, ['merge']),
             (list, [_deepmerge_append_list_unique, 'append'])],
            # next, choose the fallback strategies,
            # applied to all other types:
            ["override"],
            # finally, choose the strategies in
            # the case where the types conflict:
            ["override"])

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username,
                                db_password)
        self._load_config_from_db()

        # Fetch table
        if read_on_init:
            self.read()

    def __len__(self):
        """Return number of recoreds."""
        return len(self.data)

    def __iter__(self):
        """Return iterator."""
        return self

    def __next__(self):
        """Return the next item."""
        if self._cursor >= len(self):
            self._cursor = 0
            raise StopIteration()

        # Grab data
        data = self[self._cursor]

        # Increment
        self._cursor += 1

        return data

    def __getitem__(self, idx, remove_internal_columns=True):
        """Return data at index idx.

        Args:
            idx (int): index of the target data
            remove_internal_columns (bool): if True, internal columns are removed

        Returns:
            (dict): data

        """
        # Copy so that internal columns are not deleted from the stored record
        data = self.data[idx].copy()

        # Delete internal columns
        if remove_internal_columns:
            if '_uuid' in data.keys():
                del data['_uuid']
            if '_creation_time' in data.keys():
                del data['_creation_time']

        return data

    def _initialize_engine(self,
                           db_engine=None,
                           db_host=None,
                           db_name=None,
                           db_username=None,
                           db_password=None):
        """Initialize DB engine.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)

        """
        if db_engine is None:
            db_engine = self.db_defaults.engine
        if db_host is None:
            db_host = self.db_defaults.host
            if db_username is None:
                db_username = self.db_defaults.username
            if db_password is None:
                db_password = self.db_defaults.password

        self._db_engine = db_engine

        if db_engine in DB_ENGINES.keys():
            self._db = DB_ENGINES[db_engine].connect(
                db_host=db_host,
                db_name=db_name,
                db_username=db_username,
                db_password=db_password,
                collection_name=self._df_name,
                handler=self,
            )
            self._config_db = DB_ENGINES[db_engine].connect(
                db_host=db_host,
                db_name=db_name,
                db_username=db_username,
                db_password=db_password,
                collection_name='--config--{}'.format(self._df_name),
                handler=self,
            )
        else:
            raise ValueError("Unsupported engine: {}".format(db_engine))

    def _load_config_from_db(self):
        """Load configs from DB."""
        if self._db_engine not in DB_ENGINES.keys():
            return
        try:
            candidates = DB_ENGINES[self._db_engine].read(self._config_db,
                                                          handler=self)
            if len(candidates) > 0:
                if isinstance(candidates[0][0], dict):
                    self._config = ConfigDict(candidates[0][0])
                else:
                    raise TypeError('Unexpected type')
        except Exception as e:
            logging.warning('Failed to load configs from DB: {}'.format(
                str(e)))

    def _save_config_to_db(self):
        """Save configs to DB."""
        if self._db_engine not in DB_ENGINES.keys():
            return
        try:
            config = dict(self._config)
            config.update({'_uuid': '__config__'})
            config = [config]
            DB_ENGINES[self._db_engine].upsert(self._config_db,
                                               data=config,
                                               handler=self)
        except Exception as e:
            logging.warning('Failed to save configs to DB: {}'.format(str(e)))

    def _get_uuid_from_item(self, data_in):
        """Return UUID of the given item.

        Args:
            data_in (dict or pandas.Series): dict or Series containing data

        Returns:
            (str): UUID

        """
        hash_target_columns = \
            self._config['index_columns'] if 'index_columns' in self._config.keys() else []

        item = data_in
        if isinstance(item, pd.Series):
            item = data_in.to_dict()

        pre_hash = ''.join([
            '{:.09f}'.format(item[column]) if isinstance(item[column], float)
            else str(item[column].keys()) if isinstance(item[column], dict)
            else str(item[column])
            for column in hash_target_columns if column in item.keys()
        ])
        pre_hash = pre_hash.encode('utf-8')
        uuid = hashlib.md5(pre_hash).hexdigest()
        return uuid

    def _read(self, **kwargs):
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            func = DB_ENGINES[self._db_engine].read
            available_args = set(inspect.signature(func).parameters.keys())
            unavailable_args = set([k for k, v in kwargs.items() if v is not None]) \
                .difference(available_args)
            if len(unavailable_args) > 0:
                logging.warning(
                    'DB-engine "{0}" does not support args: {1}'.format(
                        self._db_engine, list(unavailable_args)))
            return func(self._db, handler=self, **kwargs)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(
                self._db_engine))

    def read(self,
             df_name=None,
             query=None,
             pql=None,
             where=None,
             group_by=None,
             order_by=None,
             limit=None,
             offset=None,
             **kwargs):
        """Read data from SQL.

        Args:
            df_name (str): Dataframe name to read
            query (str SQL query or SQLAlchemy Selectable): query to select items
            pql (PQL): Python-Query-Language to select items
            where (str): query string for filtering items
            group_by (str): column name to group
            order_by (str): column name to sort by
            limit (int): number of items to return per page
            offset (int): offset of cursor
            **kwargs: kwargs for function `pandas.read_sql_query`
                      or `influxdb.DataFrameClient.query`

        """
        if df_name is not None:
            self.df_name = df_name

        data, self._count_total = self._read(query=query,
                                             pql=pql,
                                             where=where,
                                             group_by=group_by,
                                             order_by=order_by,
                                             limit=limit,
                                             offset=offset,
                                             **kwargs)

        # Check if UUID exists in each value
        for value in data:
            if '_uuid' not in value.keys():
                raise ValueError('"_uuid" not found in data')

        self._data = {record['_uuid']: record for record in data}

    def _upsert(self, data):
        """Upsert data to DB.

        Args:
            data (list): data to save

        """
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            DB_ENGINES[self._db_engine].upsert(self._db, data)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(
                self._db_engine))

    def save(self, **kwargs):
        """Save data to DB."""
        self._remove(self._uuids_to_remove)
        self._uuids_to_remove = []
        self._upsert(list(self._data.values()))
        self._save_config_to_db()

    def _remove(self, uuids):
        """Remove data from DB.

        Args:
            uuids (list): list of unique IDs (str)

        """
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            DB_ENGINES[self._db_engine].remove(self._db, uuids)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(
                self._db_engine))

    def _drop_table(self, name):
        """Drop table from DB.

        Args:
            name (str): Name of table (collection)

        """
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            DB_ENGINES[self._db_engine].drop_table(self._db, name)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(
                self._db_engine))

    def add_data(self, data_in, strategy='overwrite', **kwargs):
        """Add data to db.

        Args:
            data_in (dict): a dict containing data
            strategy (str): 'merge' or 'overwrite'

        """
        assert strategy in ['merge', 'overwrite'], 'Unknown strategy.'

        data = deepcopy(data_in)
        if '_uuid' not in data.keys() or '_creation_time' not in data.keys():
            # Add _uuid and _creation_time
            data['_uuid'] = self._get_uuid_from_item(data)
            data['_creation_time'] = datetime.now().timestamp()

        if data['_uuid'] in self._data.keys():
            if strategy == 'merge':
                base_data = self._data[data['_uuid']]
                data = self._merger.merge(base_data, data)

        # Add new columns (keys) to config
        columns = self._config['columns'] \
            if 'columns' in self._config.keys() else []
        columns_existing = [c['name'] for c in columns]
        columns_in_data = list(data_in.keys())
        new_columns = []
        for new_column in set(columns_in_data).difference(columns_existing):
            name = new_column
            dtype = type(data_in[new_column]).__name__
            aggregation = 'first'
            new_columns.append({
                'name': name,
                'dtype': dtype,
                'aggregation': aggregation,
                'display_name': name,
            })
        columns += new_columns
        self._config['columns'] = columns

        self._data.update({data['_uuid']: data})

    def remove_data(self, data):
        """Remove data-record from DB.

        Args:
            data (dict): a dict containing the target data or at least a '_uuid' key

        """
        if '_uuid' in data.keys():
            uuid = data['_uuid']
        else:
            uuid = self._get_uuid_from_item(data)

        # Remove from in-memory data
        if uuid in self._data.keys():
            del self._data[uuid]

        # Remove from DB on saving
        self._uuids_to_remove.append(uuid)

    def _df_from_dicts(self, dicts):
        """Create a DataFrame from a list of dicts.

        Args:
            dicts (list): list of dicts

        Returns:
            (pd.DataFrame): a data-frame

        """
        columns = self._config['columns'] \
            if 'columns' in self._config.keys() else []
        df = pd.concat(
            [
                pd.Series(name=c['name'],
                          dtype=dtype_string_to_dtype_object(c['dtype']))
                for c in columns
                if c['name'] != '_uuid' and c['name'] != '_creation_time'
            ]  # noqa: E501
            + [
                pd.Series(name='_uuid', dtype=str),
                pd.Series(name='_creation_time', dtype=float)
            ],
            axis=1)
        df.set_index('_uuid', inplace=True)
        df = pd.concat([df, pd.DataFrame.from_records(dicts)])
        return df

    def _to_display_names(self, df, inplace=False):
        """Rename columns to display-names.

        Args:
            df (pd.DataFrame): A data-frame
            inplace (bool): If True, df will be modified in-place.

        Returns:
            (pd.DataFrame): A data-frame with renamed columns if inplace=False, otherwise None.

        """
        column_renames = {
            c['name']: c['display_name']
            for c in self._config['columns'] if isinstance(c, dict)
            and 'name' in c.keys() and 'display_name' in c.keys()
        }
        return df.rename(columns=column_renames, inplace=inplace)

    @property
    def data(self):
        """Return data.

        Returns:
            (list): list of dicts

        """
        return list(self._data.values())

    @data.setter
    def data(self, data):
        """Setter for self.data.

        Args:
            data (list): new-data

        """
        assert isinstance(data, list)
        for value in data:
            self.add_data(value)

    @property
    def columns(self):
        """Return columns of DF."""
        df = self._df_from_dicts(self.data)
        return df.columns.tolist()

    @property
    def df(self):
        """Return df."""
        df = self._df_from_dicts(self.data)
        df = self._to_display_names(df)
        return df

    @property
    def count_total(self):
        """Return total number of rows."""
        return self._count_total

    @property
    def config(self):
        """Return config."""
        return self._config
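
A hypothetical end-to-end flow with the v4 handler (the engine and values are placeholders):

handler = BaseDBHandler(db_class='meta', db_engine='tinydb')
handler.add_data({'record_id': 'abc', 'path': '/data/abc.bag'})  # assigns '_uuid'
handler.save()  # removes queued deletions, upserts data, persists the config

handler.read(limit=10, offset=0)
for record in handler:        # iteration strips the internal columns
    print(record)
print(handler.count_total)    # total rows matching the last read()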
Example #10
#!/usr/bin/env python3

import argparse
import json
import logging
import os

from collections import defaultdict
from pydtk.io.reader import BaseFileReader
from pydtk.models import MetaDataModel
from pydtk.utils.utils import load_config, smart_open

config = load_config('v4').bin.make_meta


def make_meta_interactively(template=None):
    """Make metadata with the interactive command."""
    if template is None:
        template = defaultdict(dict)
    meta = defaultdict(dict)
    for key in config.common_item.keys():
        if key in template.keys():
            meta[key] = \
                str(input(f"{config.common_item[key]} [{template[key]}]: ") or template[key])
        else:
            meta[key] = input(f"{config.common_item[key]}: ")
    return meta


def make_meta(file, template=None):
    """Make metadata with a template."""
Example #11
class BaseDBHandler(object):
    """Handler for db."""

    __version__ = 'v2'
    db_defaults = load_config(__version__).sql.base
    _config = AttrDict()
    _df_name = 'base_df'
    _columns = None

    @classmethod
    def default_config(cls):
        """Return default configurations.

        Returns:
            (AttrDict): default configurations

        """
        return load_config(cls.__version__)

    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=True):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)
            df_name (str): dataframe (table in DB) name (if None, class default value will be used)
            read_on_init (bool): if True, dataframe will be read from database on initialization

        """
        self._cursor = 0
        self.logger = logging.getLogger(__name__)
        if df_name is not None:
            self.df_name = df_name

        # Load config
        self._config = load_config(self.__version__)

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username,
                                db_password)

        # Initialize dataframe
        self.df = self._initialize_df()

        # Fetch table
        if read_on_init:
            self.read()

    def __iter__(self):
        """Return iterator."""
        return self

    def __next__(self):
        """Return the next item."""
        if self._cursor >= len(self.df):
            self._cursor = 0
            raise StopIteration()

        # Grab data
        data = self.df.take([self._cursor]).to_dict(orient='records')[0]

        # Delete internal column
        if 'uuid_in_df' in data.keys():
            del data['uuid_in_df']

        # Post-processes
        data = deserialize_dict_1d(data)

        # Increment
        self._cursor += 1

        return data

    def __len__(self):
        """Return number of records."""
        return len(self.df)

    def _initialize_engine(self,
                           db_engine=None,
                           db_host=None,
                           db_name=None,
                           db_username=None,
                           db_password=None):
        """Initialize DB engine."""
        # Parse
        engine = db_engine if db_engine is not None else self.db_defaults.engine
        username = db_username if db_username is not None else self.db_defaults.username
        password = db_password if db_password is not None else self.db_defaults.password
        host = db_host if db_host is not None else self.db_defaults.host
        database = db_name if db_name is not None else self.db_defaults.database

        # Substitute
        self._config.current_db = {
            'engine': engine,
            'host': host,
            'username': username,
            'password': password,
            'database': database,
        }

        # Connect
        if engine in ['influxdb']:
            if not _extra_supports['influxdb']:
                raise ImportError('Module `influxdb` could not be imported')
            hostname = host
            hostport = 8086
            if ':' in host:
                hostname, hostport = host.split(':')
            self._engine = DataFrameClient(hostname, hostport, username,
                                           password, database)
        else:
            engine = 'postgresql' if engine == 'timescaledb' \
                else 'mysql' if engine == 'mariadb' \
                else engine
            username_and_password = '' \
                if all([username == '', password == '']) \
                else '{0}:{1}@'.format(username, password)
            self._engine = sqlalchemy.create_engine('{0}://{1}{2}{3}'.format(
                engine, username_and_password if engine != 'sqlite' else '',
                '/{0}'.format(host) if engine == 'sqlite' else host,
                '' if engine == 'sqlite' else '/' + database),
                                                    echo=False)
            self._session = scoped_session(sessionmaker(bind=self._engine))

    def _initialize_df(self):
        """Initialize DF."""
        df = pd.concat([
            pd.Series(name=c['name'],
                      dtype=dtype_string_to_dtype_object(c['dtype']))
            for c in self.columns
        ] + [pd.Series(name='uuid_in_df', dtype=str)],
                       axis=1)
        return df

    def _get_column_names(self):
        """Return column names."""
        return [c['name'] for c in self.columns]

    def _get_sql_columns(self):
        return [sql.column(c) for c in self._get_column_names()]

    def _get_uuid_from_item(self, data_in):
        """Return UUID of the given item.

        Args:
            data_in (dict or pandas.Series): dict or Series containing data

        Returns:
            (str): UUID

        """
        item = data_in
        if isinstance(item, pd.Series):
            item = data_in.to_dict()

        pre_hash = ''.join([
            '{:.09f}'.format(item[c['name']]) if isinstance(
                item[c['name']], float) else str(item[c['name']])
            for c in self.columns if c['name'] in item.keys()
        ])
        pre_hash = pre_hash.encode('utf-8')
        uuid = hashlib.md5(pre_hash).hexdigest()
        return uuid

    def _preprocess_list_of_dicts(self, data_in):
        """Preprocess list of dicts.

        Args:
            data_in (list): list of dicts containing data

        Returns:
            (dict): dict of lists = listed dict

        """
        data = deepcopy(data_in)

        # Serialize (convert list to str)
        self.logger.info('(Preprocess) Serializing...')
        for item in tqdm(data, desc='Serialization', leave=False):
            item = serialize_dict_1d(item)

            # Add df_uuid
            item['uuid_in_df'] = self._get_uuid_from_item(item)

            # Add missing columns
            for column in self.columns:
                column_name, column_dtype = column['name'], column['dtype']
                if column_name not in item.keys():
                    item.update({column_name: None})
                else:
                    dtype_obj = dtype_string_to_dtype_object(column_dtype)
                    if dtype_obj is None:
                        continue
                    if item[column_name] is not None \
                            and not isinstance(item[column_name], dtype_obj):
                        try:
                            item[column_name] = dtype_obj(item[column_name])
                        except ValueError:
                            item[column_name] = np.nan

        # Convert dict to listed dict
        self.logger.info('(Preprocess) Converting...')
        data = dicts_to_listed_dict_2d(data)

        return data

    def _append_listed_dict_to_df(self, data, check_unique=False):
        """Append pre-processed dict to self._df.

        Args:
            data (dict): data to add
            check_unique (bool): if True, it will be checked that the data is unique in the db

        """
        self.df = pd.concat([self.df, pd.DataFrame.from_dict(data)],
                            sort=False)
        if check_unique:
            self.df.drop_duplicates('uuid_in_df', inplace=True)

    def add_data(self, data_in, **kwargs):
        """Add data to db.

        Args:
            data_in (dict): a dict containing data

        """
        self.add_list_of_data([data_in], **kwargs)

    def add_list_of_data(self, data_in, **kwargs):
        """Add list of data to db.

        Args:
            data_in (list): a list of dicts containing data

        """
        data = self._preprocess_list_of_dicts(data_in)
        self.logger.info('Adding data to DB...')
        self._append_listed_dict_to_df(data, **kwargs)
        self.logger.info('Successfully finished adding data to DB')

    def read(self,
             df_name=None,
             query=None,
             where=None,
             order_by=None,
             **kwargs):
        """Read data from SQL.

        Args:
            df_name (str): Dataframe name to read
            query (str SQL query or SQLAlchemy Selectable): query to select items
            where (str): query string for filtering items
            order_by (str): column name to sort by
            **kwargs: kwargs for function `pandas.read_sql_query`
                      or `influxdb.DataFrameClient.query`

        """
        if df_name is not None:
            self.df_name = df_name

        # Create a query
        q = query
        if q is None:
            # Check if the table exists in the DB
            if self._config.current_db['engine'] in ['influxdb']:
                if self.df_name not in [
                        entry['name']
                        for entry in self._engine.get_list_measurements()
                ]:
                    self.df = self._initialize_df()
                    return
            else:
                if not self._engine.dialect.has_table(self._engine,
                                                      self.df_name):
                    self.df = self._initialize_df()
                    return

            # Create a sub-query for extracting unique records
            sub_q = sql.select('*', from_obj=sql.table(self.df_name))
            if self._config.current_db['engine'] in ['mysql', 'sqlite']:
                sub_q = sub_q.group_by(sql.column('uuid_in_df'))
            elif self._config.current_db['engine'] in [
                    'postgresql', 'timescaledb'
            ]:
                sub_q = sub_q.distinct(sql.column('uuid_in_df'))

            # Create a query
            if self._config.current_db['engine'] in ['influxdb']:
                q = sub_q
            else:
                q = sql.select('*', from_obj=sub_q.alias('temp'))
            if where is not None:
                q = q.where(sql.text(where))
            if order_by is not None:
                q = q.order_by(sql.text(order_by))

        # Read table from DB
        try:
            if self._config.current_db['engine'] in ['influxdb']:
                df = list(self._engine.query(str(q), **kwargs).values())[0]
            else:
                df = pd.read_sql_query(q, self._engine, **kwargs)
        except Exception as e:
            self.logger.warning(
                'Could not execute SQL statement: "{0}" (reason: {1})'.format(
                    str(q), str(e)))
            df = self._initialize_df()

        self.df = df

    def save(self, df=None, remove_duplicates=False, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): DataFrame to save (if None, self.df will be saved)
            remove_duplicates (bool): if True, duplicated rows will be removed
            **kwargs: kwargs for function `pandas.dataframe.to_sql`
                      or `influxdb.DataFrameClient.write_points`

        """
        dataframe = df if df is not None else self.df
        dataframe.drop_duplicates('uuid_in_df', inplace=True)

        if self._config.current_db['engine'] in ['influxdb']:
            self._engine.write_points(dataframe, self.df_name, **kwargs)
        else:
            if 'index' not in kwargs.keys():
                kwargs.update({'index': False})
            dataframe.to_sql(self.df_name,
                             self._engine,
                             if_exists='append',
                             **kwargs)

        if remove_duplicates:
            if self._config.current_db['engine'] in ['influxdb']:
                logging.warning(
                    'Option "remove duplicates" is not supported yet.')
                return

            # Create temporal table
            # Suffix with UNIX-epoch seconds; strftime('%s') is not portable
            temp_table_name = '{0}_{1}'.format(
                self.df_name, int(datetime.now().timestamp()))
            self._initialize_df().to_sql(temp_table_name,
                                         self._engine,
                                         index=False)

            # Select unique rows and insert into the temporal table
            if self._config.current_db['engine'] in ['mysql', 'sqlite']:
                select = 'select * from "{0}" group by uuid_in_df'.format(
                    self.df_name)
            elif self._config.current_db['engine'] in [
                    'postgresql', 'timescaledb'
            ]:
                select = 'select distinct * from "{0}"'.format(self.df_name)
            else:
                raise ValueError('Unsupported engine: {}'.format(
                    self._engine.name))
            q = 'insert into "{0}" {1}'.format(temp_table_name, select)
            self._engine.execute(q)

            # Drop deprecated table if exists
            if self._engine.dialect.has_table(self._engine,
                                              self.df_name + '_deprecated'):
                self._engine.execute('drop table "{0}"'.format(self.df_name +
                                                               '_deprecated'))

            # Deprecate the original table
            self._engine.execute('alter table "{0}" rename to "{1}"'.format(
                self.df_name, self.df_name + '_deprecated'))

            # Rename the temporal table
            self._engine.execute('alter table "{0}" rename to "{1}"'.format(
                temp_table_name, self.df_name))

            # Drop the deprecated table
            self._engine.execute('drop table "{0}"'.format(self.df_name +
                                                           '_deprecated'))

    @property
    def df(self):
        """Return df."""
        return self._df

    @df.setter
    def df(self, value):
        """Setter for self.df."""
        if not isinstance(value, pd.DataFrame):
            raise ValueError('Only pandas dataframe is accepted.')

        # Set columns based on the given DF
        if len(self.columns) == 0 and len(value) > 0:
            self.columns = [{
                'name': c,
                'dtype': 'none'
            } for c in value.columns.to_list()]

        # Add column 'uuid_in_df'
        if 'uuid_in_df' not in value.columns.to_list():
            value['uuid_in_df'] = value.apply(
                lambda x: self._get_uuid_from_item(x), axis=1)
            self.columns += [{'name': 'uuid_in_df', 'dtype': 'str'}]

        self._df = value

    @property
    def df_name(self):
        """Return df_name."""
        return self._df_name

    @df_name.setter
    def df_name(self, value):
        """Setter for self.df_name."""
        self._df_name = value

    @property
    def columns(self):
        """Return columns of DF."""
        if self._columns is not None:
            return self._columns
        try:
            return self._config[self.df_name]['columns']
        except KeyError:
            return []

    @columns.setter
    def columns(self, value):
        """Set self.columns."""
        if not isinstance(value, list):
            raise ValueError(
                'Columns must be a list of dicts with keys "name" and "dtype".'
            )
        if len(value) < 1:
            raise ValueError('At least one item must be in the list.')
        if not isinstance(value[0], dict) \
                or "name" not in value[0].keys() \
                or "dtype" not in value[0].keys():
            raise ValueError(
                'Columns must be a list of dicts with keys "name" and "dtype".'
            )

        try:
            _ = pd.concat([
                pd.Series(name=c['name'],
                          dtype=dtype_string_to_dtype_object(c['dtype']))
                for c in value
            ] + [pd.Series(name='uuid_in_df', dtype=str)],
                          axis=1)
        except KeyError as e:
            raise ValueError('Unrecognized value: {}'.format(str(e)))

        self._columns = value
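
A hypothetical usage of the v2 handler and its `columns` setter (the SQLite path and values are placeholders):

handler = BaseDBHandler(db_engine='sqlite',
                        db_host='/tmp/test.sqlite3',
                        read_on_init=False)
handler.columns = [{'name': 'record_id', 'dtype': 'str'},
                   {'name': 'value', 'dtype': 'float'}]
handler.add_data({'record_id': 'abc', 'value': 1.0})
handler.save(remove_duplicates=True)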