def __init__(self, path_to_db_files, path_to_output_db_file=None):
    """Initialize V1BaseDBHandler.

    Args:
        path_to_db_files (list): list of db-files with extension '.arrow'
        path_to_output_db_file (str): output database for saving

    """
    if isinstance(path_to_db_files, str):
        path_to_db_files = [path_to_db_files]

    # Check file extension
    for file in path_to_db_files:
        if not file.endswith('.arrow'):
            raise ValueError('DB file path must end with ".arrow"')

    self.path_to_db_file = path_to_db_files[0]
    if path_to_output_db_file is not None:
        self.path_to_db_file = path_to_output_db_file
    self._cursor = 0
    self.logger = logging.getLogger(__name__)

    # Load config
    self._config = load_config(self.__version__)
    self.columns = self._config[self.df_name]['columns']

    # Initialize dataframe
    try:
        self._df = vaex.open_many(path_to_db_files)
    except IOError:
        self._df = None
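
# --- Usage sketch (added; not part of the original source) ----------------
# Assumption: this __init__ belongs to a class exposed as `V1BaseDBHandler`,
# as the docstring suggests; the import path and '.arrow' file names below
# are hypothetical placeholders.
#
# from pydtk.db import V1BaseDBHandler  # hypothetical import path
#
# handler = V1BaseDBHandler(path_to_db_files='./database.arrow')
# print(handler.columns)          # column spec loaded from the v1 config
# print(handler.path_to_db_file)  # './database.arrow' (also the save target)
#
# Passing a single string works because the constructor wraps it into a
# one-element list before validating the '.arrow' extension.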
class TimeSeriesDBHandler(BaseDBHandler):
    """Time series database handler."""

    __version__ = 'v2'
    db_defaults = load_config(__version__).sql.time_series_defaults
    _df_name = 'time_series_df'
    _columns = None

    def read(self, *args, **kwargs):
        """Read data from SQL.

        Args:
            *args: args for `BaseDBHandler.read`
            **kwargs: kwargs for `BaseDBHandler.read`

        """
        # Read table from DB
        if self._config.current_db['engine'] in ['influxdb']:
            super().read(*args, **kwargs)
        else:
            super().read(*args, **kwargs, index_col='timestamp')

        # Post-process
        try:
            df = self.df
            df.reset_index(inplace=True)
            df.rename(columns={'index': 'timestamp'}, inplace=True)
            df['timestamp'] = df['timestamp'].astype(int) / 10**9
        except Exception as e:
            self.logger.warning('An error occurred in post-process: {}'.format(e))
            df = self._initialize_df()

        self.df = df

    def save(self, df=None, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): dataframe to save (if None, self.df will be saved)

        """
        dataframe: pd.DataFrame = df if df is not None else self.df

        # Convert timestamp (float) to datetime
        assert 'timestamp' in dataframe.columns
        df_to_save = dataframe.copy()
        df_to_save['timestamp'] = pd.to_datetime(
            df_to_save['timestamp'].astype(float), unit='s')
        df_to_save.set_index('timestamp', inplace=True)

        if self._config.current_db['engine'] in ['influxdb']:
            super().save(df=df_to_save)
        else:
            super().save(df=df_to_save, index=True)
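
# --- Usage sketch (added; not part of the original source) ----------------
# The v2 time-series handler stores 'timestamp' as a datetime index in the DB
# and exposes it as float epoch seconds in memory. The sqlite path and record
# values below are illustrative placeholders.
import pandas as pd

handler = TimeSeriesDBHandler(db_engine='sqlite', db_host='./test.db',
                              read_on_init=False)
handler.df = pd.DataFrame({'timestamp': [1600000000.0, 1600000001.0],
                           'value': [1.0, 2.0]})
handler.save()  # converts float seconds -> datetime index before writing
handler.read()  # converts the datetime index back to float seconds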
class TimeSeriesDBHandler(_BaseDBHandler):
    """Time series database handler."""

    __version__ = 'v3'
    db_defaults = load_config(__version__).sql.time_series
    _df_class = 'time_series_df'
    _df_name = 'time_series_df'
    _columns = None

    def _initialize_engine(self, db_engine=None, db_host=None, db_name=None,
                           db_username=None, db_password=None):
        """Initialize DB engine."""
        # Load settings from environment variables
        engine = db_engine if db_engine is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_ENGINE', None)
        username = db_username if db_username is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_USERNAME', None)
        password = db_password if db_password is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_PASSWORD', None)
        host = db_host if db_host is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_HOST', None)
        database = db_name if db_name is not None \
            else os.environ.get('PYDTK_TIME_SERIES_DB_DATABASE', None)

        super()._initialize_engine(engine, host, database, username, password)

    def save(self, df=None, remove_duplicates=False, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): DataFrame to save (if None, self.df will be saved)
            remove_duplicates (bool): if True, duplicated rows will be removed
            **kwargs: kwargs for function `pandas.dataframe.to_sql`

        """
        dataframe = df.copy() if df is not None else self.df.copy()
        dataframe = dataframe[~dataframe.index.duplicated(keep='last')]
        if 'index' not in kwargs.keys():
            kwargs.update({'index': True})

        self._prepare_columns()

        # TODO: alter column 'uuid_in_df' as primary key or wait for the PR below
        # TODO: Use `if_exists=upsert_overwrite` after the following PR is merged
        #       https://github.com/pandas-dev/pandas/pull/29636
        dataframe.to_sql(self.df_name,
                         self._engine,
                         if_exists='append',
                         dtype={'uuid_in_df': VARCHAR(32)},
                         method=_replace_into,
                         **kwargs)
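
# --- Usage sketch (added; not part of the original source) ----------------
# The v3 handler resolves connection settings from `PYDTK_TIME_SERIES_DB_*`
# environment variables when no explicit arguments are given. The values
# below are illustrative, and this assumes the v3 `_BaseDBHandler` accepts
# the same no-argument construction pattern as the other versions.
import os

os.environ['PYDTK_TIME_SERIES_DB_ENGINE'] = 'timescaledb'
os.environ['PYDTK_TIME_SERIES_DB_HOST'] = 'localhost:5432'
os.environ['PYDTK_TIME_SERIES_DB_USERNAME'] = 'pydtk'
os.environ['PYDTK_TIME_SERIES_DB_PASSWORD'] = 'secret'
os.environ['PYDTK_TIME_SERIES_DB_DATABASE'] = 'timeseries'

handler = TimeSeriesDBHandler()  # picks up the environment variables above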
def generate(target: str,
             from_file: str = None,
             template: str = None,
             database_id: str = 'default',
             record_id: str = None,
             content: str = 'content',
             base_dir: str = None):
    """Generate template or metadata from a model or a file.

    Args:
        target (str): What to generate ('template' or 'metadata')
        from_file (str): File path to generate metadata from
        template (str): JSON file to use as a template of metadata
        database_id (str): Database ID
        record_id (str): Record ID
        content (str): Content (key of dict `contents`)
        base_dir (str): Base directory

    """
    assert target in ['template', 'metadata'], \
        'Target must be either "template" or "metadata"'

    from pydtk.db import DBHandler
    from pydtk.bin.make_meta import _get_timestamps_info, _get_contents_info
    from pydtk.utils.utils import load_config

    default_config = load_config('v4').bin.make_meta
    data = {k: None for k in default_config['common_item'].keys()}

    db_handler = DBHandler(db_class='meta', database_id=database_id)
    config = db_handler.config

    if target == 'template':
        data.update({column['name']: None for column in config['columns']})
        if template:
            data.update(template)
    elif target == 'metadata':
        if template:
            with open(template, 'r') as f:
                data.update(json.loads(f.read()))
        if from_file is None:
            raise ValueError(
                'Please specify a file to generate metadata from using option "--from-file"')
        path = os.path.abspath(from_file)
        if base_dir:
            path = os.path.relpath(from_file, base_dir)
        data["path"] = path
        try:
            data["contents"] = _get_contents_info(from_file)
        except NotImplementedError:
            pass
        try:
            data["start_timestamp"], data["end_timestamp"] = \
                _get_timestamps_info(from_file)
        except NotImplementedError:
            pass

    # Post process
    if record_id is not None:
        data['record_id'] = record_id
    if content is not None:
        # TODO: Specify content prefix
        pass

    # Display
    print(json.dumps(data, indent=4, default=_default_json_handler))
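
# --- Usage sketch (added; not part of the original source) ----------------
# `generate` prints JSON to stdout, so it can be called programmatically or
# wired into a CLI. The subcommand form below assumes such a wrapper exists
# (the wiring is not shown in this excerpt), and the file paths are
# hypothetical placeholders.
#
#   $ pydtk generate metadata --from-file ./data/sample.bag --base-dir ./data
#
generate(target='metadata', from_file='./data/sample.bag', base_dir='./data')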
class BaseDBHandler(object):
    """Base handler for db."""

    __version__ = 'v4'
    db_defaults = load_config(__version__).db.connection.base
    _df_class = 'base_df'
    _config = AttrDict()
    _df_name = 'base_df'
    _columns = None

    def __new__(cls, db_class: str = None, db_engine: str = None, **kwargs) -> object:
        """Create object.

        Args:
            db_class (str): database class (e.g. 'meta')
            db_engine (str): database engine (e.g. 'sqlite')
            **kwargs: DB-handler specific arguments

        Returns:
            (object): the corresponding handler object

        """
        if cls is BaseDBHandler:
            handler = cls._get_handler(db_class, db_engine)
            return super(BaseDBHandler, cls).__new__(handler)
        else:
            return super(BaseDBHandler, cls).__new__(cls)

    @classmethod
    def _get_handler(cls, db_class, db_engine=None):
        """Return an appropriate handler.

        Args:
            db_class (str): database class (e.g. 'meta')
            db_engine (str): database engine (e.g. 'tinydb')

        Returns:
            (handler): database handler object

        """
        # Load default config
        config = load_config(cls.__version__)

        # Check if the db_class is available
        if db_class not in DB_HANDLERS.keys():
            raise ValueError('Unsupported db_class: {}'.format(db_class))

        # Get db_engine from environment variable if not specified
        if db_engine is None:
            db_engine = os.environ.get(
                'PYDTK_{}_DB_ENGINE'.format(db_class.upper()), None)

        # Get the default engine if not specified
        if db_engine is None:
            try:
                db_defaults = getattr(config.db.connection, db_class)
                db_engine = db_defaults.engine
            except (ValueError, AttributeError):
                raise ValueError('Could not find the default value for `db_engine`')

        # Check if the corresponding handler is registered
        if db_engine not in DB_HANDLERS[db_class].keys():
            raise ValueError('Unsupported db_engine: {}'.format(db_engine))

        # Get a DB-handler supporting the engine
        return DB_HANDLERS[db_class][db_engine]

    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=False,
                 **kwargs):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)
            df_name (str): dataframe (table in DB) name (if None, class default value will be used)
            read_on_init (bool): if True, dataframe will be read from database on initialization

        """
        super(BaseDBHandler, self).__init__()
        self.logger = logging.getLogger(__name__)
        self._cursor = 0
        self._data = {}
        self._uuids_to_remove = []
        self._count_total = 0
        if df_name is not None:
            self.df_name = df_name

        # Load config
        config = load_config(self.__version__)
        try:
            self._config = ConfigDict(config['db']['df_class'][self._df_class])
        except KeyError:
            self._config = ConfigDict()

        # Initialize deepmerger
        self._merger = Merger(
            # Pass in a list of tuples with the strategies
            # to apply to each type
            [(dict, ['merge']), (list, [_deepmerge_append_list_unique, 'append'])],
            # Fallback strategies, applied to all other types
            ["override"],
            # Strategies in the case where the types conflict
            ["override"])

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username, db_password)
        self._load_config_from_db()

        # Fetch table
        if read_on_init:
            self.read()

    def __len__(self):
        """Return number of records."""
        return len(self.data)

    def __iter__(self):
        """Return iterator."""
        return self

    def __next__(self):
        """Return the next item."""
        if self._cursor >= len(self):
            self._cursor = 0
            raise StopIteration()

        # Grab data
        data = self[self._cursor]

        # Increment
        self._cursor += 1

        return data

    def __getitem__(self, idx, remove_internal_columns=True):
        """Return data at index idx.

        Args:
            idx (int): index of the target data
            remove_internal_columns (bool): if True, internal columns are removed

        Returns:
            (dict): data

        """
        data = self.data[idx]

        # Delete internal columns
        if remove_internal_columns:
            if '_uuid' in data.keys():
                del data['_uuid']
            if '_creation_time' in data.keys():
                del data['_creation_time']

        return data

    def _initialize_engine(self, db_engine=None, db_host=None, db_name=None,
                           db_username=None, db_password=None):
        """Initialize DB engine.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)

        """
        if db_engine is None:
            db_engine = self.db_defaults.engine
        if db_host is None:
            db_host = self.db_defaults.host
        if db_username is None:
            db_username = self.db_defaults.username
        if db_password is None:
            db_password = self.db_defaults.password

        self._db_engine = db_engine
        if db_engine in DB_ENGINES.keys():
            self._db = DB_ENGINES[db_engine].connect(
                db_host=db_host,
                db_name=db_name,
                db_username=db_username,
                db_password=db_password,
                collection_name=self._df_name,
                handler=self,
            )
            self._config_db = DB_ENGINES[db_engine].connect(
                db_host=db_host,
                db_name=db_name,
                db_username=db_username,
                db_password=db_password,
                collection_name='--config--{}'.format(self._df_name),
                handler=self,
            )
        else:
            raise ValueError("Unsupported engine: {}".format(db_engine))

    def _load_config_from_db(self):
        """Load configs from DB."""
        if self._db_engine not in DB_ENGINES.keys():
            return
        try:
            candidates = DB_ENGINES[self._db_engine].read(self._config_db, handler=self)
            if len(candidates) > 0:
                if isinstance(candidates[0][0], dict):
                    self._config = ConfigDict(candidates[0][0])
                else:
                    raise TypeError('Unexpected type')
        except Exception as e:
            logging.warning('Failed to load configs from DB: {}'.format(str(e)))

    def _save_config_to_db(self):
        """Save configs to DB."""
        if self._db_engine not in DB_ENGINES.keys():
            return
        try:
            config = dict(self._config)
            config.update({'_uuid': '__config__'})
            config = [config]
            DB_ENGINES[self._db_engine].upsert(self._config_db, data=config, handler=self)
        except Exception as e:
            logging.warning('Failed to save configs to DB: {}'.format(str(e)))

    def _get_uuid_from_item(self, data_in):
        """Return UUID of the given item.

        Args:
            data_in (dict or pandas.Series): dict or Series containing data

        Returns:
            (str): UUID

        """
        hash_target_columns = \
            self._config['index_columns'] if 'index_columns' in self._config.keys() else []

        item = data_in
        if isinstance(item, pd.Series):
            item = data_in.to_dict()

        pre_hash = ''.join([
            '{:.09f}'.format(item[column]) if isinstance(item[column], float)
            else str(item[column].keys()) if isinstance(item[column], dict)
            else str(item[column])
            for column in hash_target_columns if column in item.keys()
        ])
        pre_hash = pre_hash.encode('utf-8')
        uuid = hashlib.md5(pre_hash).hexdigest()

        return uuid

    def _read(self, **kwargs):
        """Read data through the engine-specific read function."""
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            func = DB_ENGINES[self._db_engine].read
            available_args = set(inspect.signature(func).parameters.keys())
            unavailable_args = set([k for k, v in kwargs.items() if v is not None]) \
                .difference(available_args)
            if len(unavailable_args) > 0:
                logging.warning('DB-engine "{0}" does not support args: {1}'.format(
                    self._db_engine, list(unavailable_args)))
            return func(self._db, handler=self, **kwargs)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(self._db_engine))

    def read(self,
             df_name=None,
             query=None,
             pql=None,
             where=None,
             group_by=None,
             order_by=None,
             limit=None,
             offset=None,
             **kwargs):
        """Read data from DB.

        Args:
            df_name (str): Dataframe name to read
            query (str SQL query or SQLAlchemy Selectable): query to select items
            pql (PQL): Python-Query-Language to select items
            where (str): query string for filtering items
            group_by (str): column name to group by
            order_by (str): column name to sort by
            limit (int): number of items to return per page
            offset (int): offset of cursor
            **kwargs: kwargs for the engine-specific read function

        """
        if df_name is not None:
            self.df_name = df_name

        data, self._count_total = self._read(query=query,
                                             pql=pql,
                                             where=where,
                                             group_by=group_by,
                                             order_by=order_by,
                                             limit=limit,
                                             offset=offset,
                                             **kwargs)

        # Check if UUID exists in each value
        for value in data:
            if '_uuid' not in value.keys():
                raise ValueError('"_uuid" not found in data')

        self._data = {record['_uuid']: record for record in data}

    def _upsert(self, data):
        """Upsert data to DB.

        Args:
            data (list): data to save

        """
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            DB_ENGINES[self._db_engine].upsert(self._db, data)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(self._db_engine))

    def save(self, **kwargs):
        """Save data to DB."""
        self._remove(self._uuids_to_remove)
        self._uuids_to_remove = []
        self._upsert(list(self._data.values()))
        self._save_config_to_db()

    def _remove(self, uuids):
        """Remove data from DB.

        Args:
            uuids (list): a list of unique IDs

        """
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            DB_ENGINES[self._db_engine].remove(self._db, uuids)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(self._db_engine))

    def _drop_table(self, name):
        """Drop table from DB.

        Args:
            name (str): Name of table (collection)

        """
        if self._db_engine is None:
            raise DatabaseNotInitializedError()
        elif self._db_engine in DB_ENGINES.keys():
            DB_ENGINES[self._db_engine].drop_table(self._db, name)
        else:
            raise ValueError('Unsupported DB engine: {}'.format(self._db_engine))

    def add_data(self, data_in, strategy='overwrite', **kwargs):
        """Add data to db.

        Args:
            data_in (dict): a dict containing data
            strategy (str): 'merge' or 'overwrite'

        """
        assert strategy in ['merge', 'overwrite'], 'Unknown strategy.'

        data = deepcopy(data_in)

        if '_uuid' not in data.keys() or '_creation_time' not in data.keys():
            # Add _uuid and _creation_time
            data['_uuid'] = self._get_uuid_from_item(data)
            data['_creation_time'] = datetime.now().timestamp()

        if data['_uuid'] in self._data.keys():
            if strategy == 'merge':
                base_data = self._data[data['_uuid']]
                data = self._merger.merge(base_data, data)

        # Add new columns (keys) to config
        columns = self._config['columns'] if 'columns' in self._config.keys() else []
        columns_existing = [c['name'] for c in columns]
        columns_in_data = list(data_in.keys())
        new_columns = []
        for new_column in set(columns_in_data).difference(columns_existing):
            name = new_column
            dtype = type(data_in[new_column]).__name__
            aggregation = 'first'
            new_columns.append({
                'name': name,
                'dtype': dtype,
                'aggregation': aggregation,
                'display_name': name,
            })
        columns += new_columns
        self._config['columns'] = columns

        self._data.update({data['_uuid']: data})

    def remove_data(self, data):
        """Remove data-record from DB.

        Args:
            data (dict): a dict containing the target data or '_uuid' in keys

        """
        if '_uuid' in data.keys():
            uuid = data['_uuid']
        else:
            uuid = self._get_uuid_from_item(data)

        # Remove from in-memory data
        if uuid in self._data.keys():
            del self._data[uuid]

        # Remove from DB on saving
        self._uuids_to_remove.append(uuid)

    def _df_from_dicts(self, dicts):
        """Create a DataFrame from a list of dicts.

        Args:
            dicts (list): list of dicts

        Returns:
            (pd.DataFrame): a data-frame

        """
        columns = self._config['columns'] if 'columns' in self._config.keys() else []
        df = pd.concat(
            [
                pd.Series(name=c['name'],
                          dtype=dtype_string_to_dtype_object(c['dtype']))
                for c in columns
                if c['name'] != '_uuid' and c['name'] != '_creation_time'
            ]
            + [
                pd.Series(name='_uuid', dtype=str),
                pd.Series(name='_creation_time', dtype=float)
            ],
            axis=1)
        df.set_index('_uuid', inplace=True)
        df = pd.concat([df, pd.DataFrame.from_records(dicts)])

        return df

    def _to_display_names(self, df, inplace=False):
        """Rename columns to display-names.

        Args:
            df (pd.DataFrame): a data-frame
            inplace (bool): if True, df will be modified in-place

        Returns:
            (pd.DataFrame): a data-frame with renamed columns if inplace=False, otherwise None

        """
        column_renames = {
            c['name']: c['display_name']
            for c in self._config['columns']
            if isinstance(c, dict) and 'name' in c.keys() and 'display_name' in c.keys()
        }
        return df.rename(columns=column_renames, inplace=inplace)

    @property
    def data(self):
        """Return data.

        Returns:
            (list): list of dicts

        """
        return list(self._data.values())

    @data.setter
    def data(self, data):
        """Setter for self.data.

        Args:
            data (list): new data

        """
        assert isinstance(data, list)
        for value in data:
            self.add_data(value)

    @property
    def columns(self):
        """Return columns of DF."""
        df = self._df_from_dicts(self.data)
        return df.columns.tolist()

    @property
    def df(self):
        """Return df."""
        df = self._df_from_dicts(self.data)
        df = self._to_display_names(df)
        return df

    @property
    def count_total(self):
        """Return total number of rows."""
        return self._count_total

    @property
    def config(self):
        """Return config."""
        return self._config
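
# --- Usage sketch (added; not part of the original source) ----------------
# Typical v4 life-cycle: records are staged in memory via add_data(),
# identified by an MD5 hash over the configured index columns, and persisted
# with save(). The engine, file path, record fields, and PQL query string
# below are illustrative assumptions, not values from this excerpt.
handler = BaseDBHandler(db_class='meta', db_engine='tinydb', db_host='./db.json')
handler.add_data({'record_id': 'abc', 'path': '/data/sample.bag'})
handler.save()  # upserts staged records and persists the column config

handler.read(pql='record_id == "abc"')
for record in handler:
    print(record)  # internal '_uuid' / '_creation_time' keys are stripped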
#!/usr/bin/env python3

import argparse
import json
import logging
import os
from collections import defaultdict

from pydtk.io.reader import BaseFileReader
from pydtk.models import MetaDataModel
from pydtk.utils.utils import load_config, smart_open

config = load_config('v4').bin.make_meta


def make_meta_interactively(template=None):
    """Make metadata with the interactive command."""
    if template is None:
        template = defaultdict(dict)
    meta = defaultdict(dict)
    for key in config.common_item.keys():
        if key in template.keys():
            meta[key] = \
                str(input(f"{config.common_item[key]} [{template[key]}]: ") or template[key])
        else:
            meta[key] = input(f"{config.common_item[key]}: ")
    return meta


def make_meta(file, template=None):
    """Make metadata with a template."""
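
# --- Usage sketch (added; not part of the original source) ----------------
# `make_meta_interactively` shows each template value as the prompt default,
# and an empty answer keeps it. The keys and prompt labels below are
# illustrative; the real ones come from `config.common_item`.
#
#   >>> meta = make_meta_interactively(template={'record_id': 'test_record'})
#   Record ID [test_record]:   <- press Enter to keep 'test_record'
#   Description:               <- a typed value is stored as-is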
class BaseDBHandler(object):
    """Handler for db."""

    __version__ = 'v2'
    db_defaults = load_config(__version__).sql.base
    _config = AttrDict()
    _df_name = 'base_df'
    _columns = None

    @classmethod
    def default_config(cls):
        """Return default configurations.

        Returns:
            (AttrDict): default configurations

        """
        return load_config(cls.__version__)

    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=True):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config file will be used)
            db_host (str): database HOST (if None, the one in the config file will be used)
            db_name (str): database name (if None, the one in the config file will be used)
            db_username (str): username (if None, the one in the config file will be used)
            db_password (str): password (if None, the one in the config file will be used)
            df_name (str): dataframe (table in DB) name (if None, class default value will be used)
            read_on_init (bool): if True, dataframe will be read from database on initialization

        """
        self._cursor = 0
        self.logger = logging.getLogger(__name__)
        if df_name is not None:
            self.df_name = df_name

        # Load config
        self._config = load_config(self.__version__)

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username, db_password)

        # Initialize dataframe
        self.df = self._initialize_df()

        # Fetch table
        if read_on_init:
            self.read()

    def __iter__(self):
        """Return iterator."""
        return self

    def __next__(self):
        """Return the next item."""
        if self._cursor >= len(self.df):
            self._cursor = 0
            raise StopIteration()

        # Grab data
        data = self.df.take([self._cursor]).to_dict(orient='records')[0]

        # Delete internal column
        if 'uuid_in_df' in data.keys():
            del data['uuid_in_df']

        # Post-processes
        data = deserialize_dict_1d(data)

        # Increment
        self._cursor += 1

        return data

    def __len__(self):
        """Return number of records."""
        return len(self.df)

    def _initialize_engine(self, db_engine=None, db_host=None, db_name=None,
                           db_username=None, db_password=None):
        """Initialize DB engine."""
        # Parse
        engine = db_engine if db_engine is not None else self.db_defaults.engine
        username = db_username if db_username is not None else self.db_defaults.username
        password = db_password if db_password is not None else self.db_defaults.password
        host = db_host if db_host is not None else self.db_defaults.host
        database = db_name if db_name is not None else self.db_defaults.database

        # Substitute
        self._config.current_db = {
            'engine': engine,
            'host': host,
            'username': username,
            'password': password,
            'database': database,
        }

        # Connect
        if engine in ['influxdb']:
            if not _extra_supports['influxdb']:
                raise ImportError('Module `influxdb` could not be imported')
            hostname = host
            hostport = 8086
            if ':' in host:
                hostname, hostport = host.split(':')
            self._engine = DataFrameClient(hostname, hostport, username, password, database)
        else:
            engine = 'postgresql' if engine == 'timescaledb' \
                else 'mysql' if engine == 'mariadb' \
                else engine
            username_and_password = '' \
                if all([username == '', password == '']) \
                else '{0}:{1}@'.format(username, password)
            self._engine = sqlalchemy.create_engine(
                '{0}://{1}{2}{3}'.format(
                    engine,
                    username_and_password if engine != 'sqlite' else '',
                    '/{0}'.format(host) if engine == 'sqlite' else host,
                    '' if engine == 'sqlite' else '/' + database),
                echo=False)
            self._session = scoped_session(sessionmaker(bind=self._engine))

    def _initialize_df(self):
        """Initialize DF."""
        df = pd.concat(
            [
                pd.Series(name=c['name'],
                          dtype=dtype_string_to_dtype_object(c['dtype']))
                for c in self.columns
            ] + [pd.Series(name='uuid_in_df', dtype=str)],
            axis=1)
        return df

    def _get_column_names(self):
        """Return column names."""
        return [c['name'] for c in self.columns]

    def _get_sql_columns(self):
        """Return SQLAlchemy column objects for all columns."""
        return [sql.column(c) for c in self._get_column_names()]

    def _get_uuid_from_item(self, data_in):
        """Return UUID of the given item.

        Args:
            data_in (dict or pandas.Series): dict or Series containing data

        Returns:
            (str): UUID

        """
        item = data_in
        if isinstance(item, pd.Series):
            item = data_in.to_dict()

        pre_hash = ''.join([
            '{:.09f}'.format(item[c['name']])
            if isinstance(item[c['name']], float)
            else str(item[c['name']])
            for c in self.columns if c['name'] in item.keys()
        ])
        pre_hash = pre_hash.encode('utf-8')
        uuid = hashlib.md5(pre_hash).hexdigest()

        return uuid

    def _preprocess_list_of_dicts(self, data_in):
        """Preprocess list of dicts.

        Args:
            data_in (list): list of dicts containing data

        Returns:
            (dict): dict of lists = listed dict

        """
        data = deepcopy(data_in)

        # Serialize (convert list to str)
        self.logger.info('(Preprocess) Serializing...')
        for item in tqdm(data, desc='Serialization', leave=False):
            item = serialize_dict_1d(item)

            # Add df_uuid
            item['uuid_in_df'] = self._get_uuid_from_item(item)

            # Add missing columns
            for column in self.columns:
                column_name, column_dtype = column['name'], column['dtype']
                if column_name not in item.keys():
                    item.update({column_name: None})
                else:
                    dtype_obj = dtype_string_to_dtype_object(column_dtype)
                    if dtype_obj is None:
                        continue
                    if item[column_name] is not None \
                            and not isinstance(item[column_name], dtype_obj):
                        try:
                            item[column_name] = dtype_obj(item[column_name])
                        except ValueError:
                            item[column_name] = np.nan

        # Convert dicts to a listed dict
        self.logger.info('(Preprocess) Converting...')
        data = dicts_to_listed_dict_2d(data)

        return data

    def _append_listed_dict_to_df(self, data, check_unique=False):
        """Append pre-processed dict to self._df.

        Args:
            data (dict): data to add
            check_unique (bool): if True, it will be checked that the data is unique in the db

        """
        self.df = pd.concat([self.df, pd.DataFrame.from_dict(data)], sort=False)
        if check_unique:
            self.df.drop_duplicates('uuid_in_df', inplace=True)

    def add_data(self, data_in, **kwargs):
        """Add data to db.

        Args:
            data_in (dict): a dict containing data

        """
        self.add_list_of_data([data_in], **kwargs)

    def add_list_of_data(self, data_in, **kwargs):
        """Add list of data to db.

        Args:
            data_in (list): a list of dicts containing data

        """
        data = self._preprocess_list_of_dicts(data_in)
        self.logger.info('Adding data to DB...')
        self._append_listed_dict_to_df(data, **kwargs)
        self.logger.info('Successfully finished adding data to DB')

    def read(self, df_name=None, query=None, where=None, order_by=None, **kwargs):
        """Read data from SQL.

        Args:
            df_name (str): Dataframe name to read
            query (str SQL query or SQLAlchemy Selectable): query to select items
            where (str): query string for filtering items
            order_by (str): column name to sort by
            **kwargs: kwargs for function `pandas.read_sql_query`
                      or `influxdb.DataFrameClient.query`

        """
        if df_name is not None:
            self.df_name = df_name

        # Create a query
        q = query
        if q is None:
            # Check if the table exists in the DB
            if self._config.current_db['engine'] in ['influxdb']:
                if self.df_name not in [
                        entry['name'] for entry in self._engine.get_list_measurements()
                ]:
                    self.df = self._initialize_df()
                    return
            else:
                if not self._engine.dialect.has_table(self._engine, self.df_name):
                    self.df = self._initialize_df()
                    return

            # Create a sub-query for extracting unique records
            sub_q = sql.select('*', from_obj=sql.table(self.df_name))
            if self._config.current_db['engine'] in ['mysql', 'sqlite']:
                sub_q = sub_q.group_by(sql.column('uuid_in_df'))
            elif self._config.current_db['engine'] in ['postgresql', 'timescaledb']:
                sub_q = sub_q.distinct(sql.column('uuid_in_df'))

            # Create a query
            if self._config.current_db['engine'] in ['influxdb']:
                q = sub_q
            else:
                q = sql.select('*', from_obj=sub_q.alias('temp'))
            if where is not None:
                q = q.where(sql.text(where))
            if order_by is not None:
                q = q.order_by(sql.text(order_by))

        # Read table from DB
        try:
            if self._config.current_db['engine'] in ['influxdb']:
                df = list(self._engine.query(str(q), **kwargs).values())[0]
            else:
                df = pd.read_sql_query(q, self._engine, **kwargs)
        except Exception as e:
            self.logger.warning(
                'Could not execute SQL statement: "{0}" (reason: {1})'.format(str(q), str(e)))
            df = self._initialize_df()

        self.df = df

    def save(self, df=None, remove_duplicates=False, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): DataFrame to save (if None, self.df will be saved)
            remove_duplicates (bool): if True, duplicated rows will be removed
            **kwargs: kwargs for function `pandas.dataframe.to_sql`
                      or `influxdb.DataFrameClient.write_points`

        """
        dataframe = df if df is not None else self.df
        dataframe.drop_duplicates('uuid_in_df', inplace=True)

        if self._config.current_db['engine'] in ['influxdb']:
            self._engine.write_points(dataframe, self.df_name, **kwargs)
        else:
            if 'index' not in kwargs.keys():
                kwargs.update({'index': False})
            dataframe.to_sql(self.df_name, self._engine, if_exists='append', **kwargs)

        if remove_duplicates:
            if self._config.current_db['engine'] in ['influxdb']:
                logging.warning('Option "remove_duplicates" is not supported yet.')
                return

            # Create a temporal table
            temp_table_name = self.df_name + '_' + datetime.now().strftime('%s')
            self._initialize_df().to_sql(temp_table_name, self._engine, index=False)

            # Select unique rows and insert them into the temporal table
            if self._config.current_db['engine'] in ['mysql', 'sqlite']:
                select = 'select * from "{0}" group by uuid_in_df'.format(self.df_name)
            elif self._config.current_db['engine'] in ['postgresql', 'timescaledb']:
                select = 'select distinct * from "{0}"'.format(self.df_name)
            else:
                raise ValueError('Unsupported engine: {}'.format(self._engine.name))
            q = 'insert into "{0}" {1}'.format(temp_table_name, select)
            self._engine.execute(q)

            # Drop the deprecated table if it exists
            if self._engine.dialect.has_table(self._engine, self.df_name + '_deprecated'):
                self._engine.execute('drop table "{0}"'.format(self.df_name + '_deprecated'))

            # Deprecate the original table
            self._engine.execute('alter table "{0}" rename to "{1}"'.format(
                self.df_name, self.df_name + '_deprecated'))

            # Rename the temporal table
            self._engine.execute('alter table "{0}" rename to "{1}"'.format(
                temp_table_name, self.df_name))

            # Drop the deprecated table
            self._engine.execute('drop table "{0}"'.format(self.df_name + '_deprecated'))

    @property
    def df(self):
        """Return df."""
        return self._df

    @df.setter
    def df(self, value):
        """Setter for self.df."""
        if not isinstance(value, pd.DataFrame):
            raise ValueError('Only pandas dataframe is accepted.')

        # Set columns based on the given DF
        if len(self.columns) == 0 and len(value) > 0:
            self.columns = [{'name': c, 'dtype': 'none'} for c in value.columns.to_list()]

        # Add column 'uuid_in_df'
        if 'uuid_in_df' not in value.columns.to_list():
            value['uuid_in_df'] = value.apply(lambda x: self._get_uuid_from_item(x), axis=1)
            self.columns += [{'name': 'uuid_in_df', 'dtype': 'str'}]

        self._df = value

    @property
    def df_name(self):
        """Return df_name."""
        return self._df_name

    @df_name.setter
    def df_name(self, value):
        """Setter for self.df_name."""
        self._df_name = value

    @property
    def columns(self):
        """Return columns of DF."""
        if self._columns is not None:
            return self._columns
        try:
            return self._config[self.df_name]['columns']
        except KeyError:
            return []

    @columns.setter
    def columns(self, value):
        """Set self.columns."""
        if not isinstance(value, list):
            raise ValueError('Columns must be a list of dicts with keys "name" and "dtype".')
        if len(value) < 1:
            raise ValueError('At least one item must be in the list.')
        if not isinstance(value[0], dict) \
                or "name" not in value[0].keys() \
                or "dtype" not in value[0].keys():
            raise ValueError('Columns must be a list of dicts with keys "name" and "dtype".')
        try:
            _ = pd.concat(
                [
                    pd.Series(name=c['name'],
                              dtype=dtype_string_to_dtype_object(c['dtype']))
                    for c in value
                ] + [pd.Series(name='uuid_in_df', dtype=str)],
                axis=1)
        except KeyError as e:
            raise ValueError('Unrecognized value: {}'.format(str(e)))
        self._columns = value