Example #1
    def delete_data_in_ch(self):
        client = Client(**CLICKHOUSE_DB_INFO)
        delete_data_last = {}
        if not self.delete_data_list:
            self.time_cost_last_delete = 0
        else:
            logger.info(
                "begin grouping primary_key_values by table")
            for i in self.delete_data_list:
                delete_data_last.setdefault(i['table_name'],
                                            []).append(i['primary_key_values'])
            logger.info("group each table's data into one delete sql")
            for table_name, values in delete_data_last.items():
                sql_cur = {}
                for v in values:
                    for k, vv in v.items():
                        sql_cur.setdefault(k, []).append(vv)

                last_delete = ' and '.join(
                    ['%s in %s' % (k, tuple(v)) for k, v in sql_cur.items()])
                exec_sql = "alter table %s delete where %s ;" % (table_name,
                                                                 last_delete)
                begin_delete_time = int(time.time())
                logger.info("begin execute: exec_sql:%s" % exec_sql)
                client.execute(exec_sql)
                logger.info("end execute")
                # Disconnecting inside the loop is safe but wasteful: clickhouse-driver
                # transparently reconnects on the next execute() call.
                client.disconnect()
                self.time_cost_last_delete = int(
                    time.time()) - begin_delete_time
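For reference, a sketch of the input this method expects and the statement it ends up issuing, using a hypothetical `db.orders` table with a composite primary key:

# Hypothetical input: one entry per row to delete, keyed by table and primary-key values.
delete_data_list = [
    {'table_name': 'db.orders', 'primary_key_values': {'order_id': 1, 'shop_id': 7}},
    {'table_name': 'db.orders', 'primary_key_values': {'order_id': 2, 'shop_id': 7}},
]
# After grouping, the loop above would execute roughly:
#   alter table db.orders delete where order_id in (1, 2) and shop_id in (7, 7) ;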
    def push(self, client: Client = None, key: str = None, close: bool = False):
        """Return a client to the pool for reuse.

        Args:
            client: The client to return.
            key: If known, the key of the client.
            close: Close the client instead of adding back to pool.

        """

        self._lock.acquire()
        try:
            if self.closed:
                raise ChPoolError("pool closed")
            if key is None:
                key = self._rused.get(id(client))
                if key is None:
                    raise ChPoolError("trying to put unkeyed client")
            if len(self._pool) < self.connections_min and not close:
                # TODO: verify connection still valid
                if client.connection.connected:
                    self._pool.append(client)
            else:
                client.disconnect()

            # ensure thread doesn't put connection back once the pool is closed
            if not self.closed or key in self._used:
                del self._used[key]
                del self._rused[id(client)]
        finally:
            self._lock.release()
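For context, a minimal sketch of how such a pool is typically driven; apart from push() itself, the class and method names below are assumptions based on this snippet, not the library's documented API:

pool = ChPool(connections_min=2, connections_max=10, host='localhost')  # assumed constructor
client = pool.pull()             # borrow a client (the assumed counterpart of push)
try:
    client.execute('SELECT 1')
finally:
    pool.push(client)            # return it so the connection can be reused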
class DBConnection:
    def __init__(self, clickhouse_host=None, clickhouse_name=None, dbuser=None,
                 dbpass=None, db_query=None):

        self.db_query = db_query

        self.clickhouse_client = Client(
            host=clickhouse_host, database=clickhouse_name, user=dbuser,
            password=dbpass
        )

        try_conn = self._connection_test()
        if try_conn:
            # _connection_test() returns an error message string on failure; raising a
            # plain string is invalid in Python 3, so wrap it in an exception.
            raise ConnectionError(try_conn)

        self.connection_status = False

    def make_connection(self):
        try:
            self.clickhouse_client.connection.connect()
        except Exception as error:
            logger.error(error)
            return False

        self.connection_status = True

        return True

    def _connection_test(self):
        try:
            self.clickhouse_client.connection.connect()
        except errors.Error as error:
            logger.error(exception_to_logger(error))
            return "Error of database connection."

        self.clickhouse_client.disconnect()
        return None

    def send_request(self, trace=False):
        response_status = False

        try:
            if isinstance(self.db_query, tuple):
                response = self.clickhouse_client.execute(
                    self.db_query[0], self.db_query[1]
                )
            else:
                response = self.clickhouse_client.execute(self.db_query)
            response_status = True
        except Exception as error:
            logger.error(exception_to_logger(error))
            response = json_str_error("Error in sql query!")
            if trace:
                print_statusbar([(error, 'd',)])

        return response_status, response

    def disconnect(self):
        self.clickhouse_client.disconnect()
        self.connection_status = False
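A short usage sketch of this wrapper; the connection values and query are placeholders:

db = DBConnection(clickhouse_host='localhost', clickhouse_name='default',
                  dbuser='default', dbpass='', db_query='SELECT 1')
ok, rows = db.send_request()
if ok:
    print(rows)   # e.g. [(1,)]
db.disconnect()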
Example #4
def insert_to_ch(schema, table, res, start_time=None, end_time=None):
    if not res:
        logger.info(f"None Result, pass: {schema}.{table} {start_time}, {end_time}")
        return
    ind = random.randint(0, len(CH_SERVERS) - 1)
    host, port = CH_SERVERS[ind][0], CH_SERVERS[ind][1]

    if end_time:
        # str.rstrip strips a set of characters rather than a suffix, so rstrip('_local')
        # would also eat trailing 'l', 'o', 'c' or 'a' characters; strip the suffix instead.
        distributed_table = table[:-len('_local')] if table.endswith('_local') else table
        client = Client(host=host, port=port, user=CH_USER, password=CH_PASS)
        s = client.execute(f"select count(*) cnt from "
                           f"{schema}.{distributed_table} where end_time=toDateTime('{end_time}')")
        s = s[0][0]
        client.disconnect()
    else:
        s = 0

    if not s:
        sql = f'INSERT INTO {schema}.{table}(' \
              f'end_time, invertal, gtid, transaction_spend_time, transaction_size, single_statement_affected_rows) VALUES'
        client = Client(host=host, port=port, user=CH_USER, password=CH_PASS)
        client.execute(sql, res)
        logger.info(f"Success: {schema}.{table} {start_time}, {end_time}")
        client.disconnect()
    else:
        logger.info(f'Exists, pass: {schema}.{table} {start_time}, {end_time}')
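The `res` rows must line up with the column list of the INSERT above; a sketch with made-up values and a hypothetical table name:

from datetime import datetime

res = [
    (datetime(2023, 1, 1, 0, 5),  # end_time
     '5m',                        # invertal (column name as spelled in the target table)
     'uuid:1-100',                # gtid
     12,                          # transaction_spend_time
     4096,                        # transaction_size
     30),                         # single_statement_affected_rows
]
insert_to_ch('mysql_monitor', 'demo_table_local', res,
             start_time='2023-01-01 00:00:00', end_time='2023-01-01 00:05:00')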
def clickhouse():
    client = Client(
        settings.CLICKHOUSE_HOST,
        user=settings.CLICKHOUSE_USER,
        password=settings.CLICKHOUSE_PASSWORD,
    )
    client.execute(f'DROP DATABASE IF EXISTS {settings.CLICKHOUSE_NAME}')
    client.execute(f'CREATE DATABASE {settings.CLICKHOUSE_NAME}')
    client.disconnect()
def insert_partition(rows):
    assert isinstance(rows, list), rows
    assert isinstance(rows[0], tuple), rows[0]
    assert len(rows[0]) == 2, rows[0]
    client = Client('localhost')
    try:
        client.execute('INSERT INTO hashes (data, hash) VALUES', rows)
    finally:
        # The original only disconnected on failure, which leaked the connection on
        # success and silently swallowed the error; always release the connection.
        client.disconnect()
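A usage sketch matching the assertions above; the `hashes` table and its `data`/`hash` columns come from the INSERT statement:

rows = [('hello', '5d41402abc4b2a76b9719d911017c592'),
        ('world', '7d793037a0760186574b0282f2f435e7')]
insert_partition(rows)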
Example #7
class ClientWrapper:
    """Context manager to allow use of ClickHouse connections in with clause"""
    def __init__(self, *args, **kwargs):
        self.client = Client(*args, **kwargs)

    def __enter__(self):
        return self.client

    def __exit__(self, *args):
        """Disconnect from server to up free socket"""
        self.client.disconnect()
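Typical use of the context manager; the host and query are placeholders:

with ClientWrapper('localhost') as client:
    print(client.execute('SELECT version()'))
# __exit__ releases the socket even if the query raises.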
Example #8
    def check_clickhouse_status(self):
        sql = "show databases"
        try:
            client = Client(**CLICKHOUSE_DB_INFO)
            client.execute(sql)
            client.disconnect()
            messages = "clickhouse status is ok"
            logger.info(messages)
            return True
        except Exception as error:
            messages = "clickhouse can not connect: %s" % (str(error))
            logger.error(messages)
            return False
Example #9
    def get_ch_column_type(self, db, table):
        """ :returns like :
        {'id': 'UInt32', 'report_id': 'Int32', 'cost_type': 'Int16', 'cost_name': 'String', 'amount': 'Int32', 'actual_amount': 'Int32'}
        """
        sql = "select name,type from system.columns where database='{0}' and table='{1}'".format(
            db, table)
        try:
            client = Client(**CLICKHOUSE_DB_INFO)
            ret = client.execute(sql)
            client.disconnect()
            return {d[0]: d[1] for d in ret}
        except Exception as error:
            messages = "failed to get column types from clickhouse. %s" % (str(error))
            logger.error(messages)
            alarm(alarm_cnf, title=self.alarm_title, messages=messages)
            exit(1)
    def test_insert_block_size(self):
        client = Client(self.host,
                        self.port,
                        self.database,
                        self.user,
                        self.password,
                        settings={'insert_block_size': 1})

        with self.create_table('a UInt8'):
            data = [(x, ) for x in range(4)]
            client.execute('INSERT INTO test (a) VALUES', data)

            query = 'SELECT * FROM test'
            inserted = self.emit_cli(query)
            self.assertEqual(inserted, '0\n1\n2\n3\n')
            inserted = client.execute(query)
            self.assertEqual(inserted, data)

        client.disconnect()
Example #11
def dbms_sync_clickhouse_schema(row):
    ignored_schemas = ('_temporary_and_external_tables', 'system', 'default')
    query = f"select name from system.databases where name not in {ignored_schemas}"
    config = {
        'host': row.host,
        'port': row.port,
        'database': 'default',
        'connect_timeout': 5,
        'send_receive_timeout': 5,
    }
    # The user must already be created in ClickHouse
    config.update(REOMOTE_USER)
    cnx = Client(**config)
    result = cnx.execute(query)
    for i in result:
        schema = i[0]
        models.DbSchemas.objects.update_or_create(cid_id=row.id,
                                                  schema=schema,
                                                  defaults={'schema': schema})
    cnx.disconnect()
Example #12
    def check_table_exists_on_clickhouse(self):
        """
        检查mysql需要同步的表在clickhouse里面是否存在
        :return:
        """
        client = Client(**CLICKHOUSE_DB_INFO)
        not_in_clickhouse_tables = []
        if not self.rsnyc_tables:
            messages = "no tables to sync "
            logger.error(messages)
            return False
        else:
            try:
                for i in self.rsnyc_tables:
                    table_schema = i['table_schema']
                    table_name = i['table_name']
                    check_sql = """select count(*) countnum from  system.tables where database= '%s' and name='%s';""" % (
                        table_schema, table_name)
                    countnum = client.execute(check_sql)
                    if countnum[0][0] == 0:
                        not_in_clickhouse_tables.append(table_schema + '.' +
                                                        table_name)
            except Exception as error:
                messages = "clickhouse failed: %s" % (str(error))
                logger.error(messages)
            finally:
                if client:
                    client.disconnect()

            if not_in_clickhouse_tables:
                messages = "tables: %s not in clickhouse" % (
                    ','.join(not_in_clickhouse_tables))
                logger.error(messages)
                return False
            else:
                messages = "tables all in clickhouse"
                logger.info(messages)
                return True
Example #13
def get_most_time_consuming_transaction(start_datetime, end_datetime, prefix):
    start = start_datetime.strftime('%Y-%m-%d %H:%M:%S')
    end = end_datetime.strftime('%Y-%m-%d %H:%M:%S')
    # NOTE: `interval` used below is not defined in this function; it is presumably a module-level global.
    sql = f'''
        SELECT 
            toDateTime('{end}'),
            '{str(interval)}',
            gtid, 
            max(execute_time) - min(execute_time) AS transaction_spend_time, 
            (max(toUInt32(binlog_pos)) - min(toUInt32(binlog_pos))) + argMax(toUInt32(single_statement_size), toUInt32(binlog_pos)) AS transaction_size, 
            sum(single_statement_affected_rows) AS single_statement_affected_rows
        FROM mysql_monitor.{prefix}_binlog
        WHERE (execute_time >= '{start}') AND (execute_time < '{end}')
        GROUP BY gtid
        ORDER BY transaction_spend_time DESC
        LIMIT 1
    '''
    ind = random.randint(0, len(CH_SERVERS) - 1)
    host, port = CH_SERVERS[ind][0], CH_SERVERS[ind][1]
    client = Client(host=host, port=port, user=CH_USER, password=CH_PASS, settings=SETTINGS)
    res = client.execute(sql)
    client.disconnect()
    return res
Example #14
class ClickhouseClient:
    """
    clickhouse封装
    """
    def __init__(self, host: str, port: int, database: str, user: str,
                 password: str) -> None:
        self.conn = Client(host=host,
                           port=port,
                           database=database,
                           user=user,
                           password=password)

    def close(self):
        self.conn.disconnect()

    def query(self, query: str, params: Any = None) -> List[str]:
        """
        Run a query and return each row as a JSON-encoded dict.
        :param query: the query to execute
        :param params: query parameters
        :return: List[str] (one JSON string per row)
        """
        try:
            data = self.conn.execute_iter(query,
                                          params,
                                          with_column_types=True)
            columns = [column[0] for column in next(data)]
            temp = []
            for row in data:
                temp.append(
                    json.dumps(dict(zip(columns, [value for value in row])),
                               cls=DateEncoder))
        except Exception as e:
            raise e
        finally:
            self.close()
        return temp
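A usage sketch with placeholder connection values; query() returns one JSON string per row:

ch = ClickhouseClient(host='localhost', port=9000, database='default',
                      user='default', password='')
for row_json in ch.query('SELECT number FROM system.numbers LIMIT 3'):
    print(row_json)   # e.g. {"number": 0}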
Example #15
def insert_data(data: list):
    client = Client('localhost')
    client.execute("Insert into test.php_local (col1, col2, col3) VALUES ",
                   data)
    client.disconnect()
Example #16
class GeneralSocialDemoRatingAdmixer(generics.ListAPIView, ParamsHandler):
    renderer_classes = (r.BrowsableAPIRenderer, r.JSONRenderer,
                        AdmixerRenderer)
    serializer_class = SocialDemoRatingAdmixerSerializer
    permission_classes = (permissions.IsAuthenticatedOrReadOnly,
                          IsRequestsToSocialDemoAllow)
    pagination_class = ConfiguredPageNumberPagination
    admixer_values_list = ('platform', 'browser', 'region', 'age', 'gender',
                           'income', 'uniques', 'views')

    def _chunks(self, l, n):
        return [l[i:i + n] for i in range(0, len(l), n)]

    def _convert(self, tup):
        di = {}
        for a, b in tup:
            di.setdefault(a, []).append(b)
        return di

    def _query_admixer_data(self, batch_ids, start_date, end_date):
        ids = ",".join("'%s'" % item for item in batch_ids)

        query = 'select UrlId, Platform, Browser, Country, Age, Gender, Income, count(distinct IntVisKey), Sum(Views)' \
                'from admixer.UrlStat ' \
                'where UrlId in (%s) and Date >= \'%s\' and Date <= \'%s\' ' \
                'Group by UrlId, Platform, Browser, Country, Age, Gender, Income' % (ids, start_date, end_date)

        response = self._client.execute(query)
        results = dict(
            zip(self.admixer_values_list, [[], [], [], [], [], [], 0, 0]))
        for row in response:
            item = dict(zip(self.admixer_values_list, row[1:]))
            for key in self.admixer_values_list[:-2]:
                results[key].append(item[key])
            for key in self.admixer_values_list[len(self.admixer_values_list) -
                                                2:]:
                results[key] += (item[key])

        logger.info("Received %d records from ClickHouse", len(results))

        return results

    def list(self, request, *args, **kwargs):

        params = self.handle_request_params(request)

        aggregator = params.pop("aggregator", None)

        if not aggregator:
            return Response()

        try:
            end_date = params.pop("posted_date__lte")
            start_date = params.pop("posted_date__gte")
        except KeyError as e:
            end_date = datetime.strptime(settings.DEFAULT_TO_DATE, "%Y-%m-%d")
            start_date = datetime.strptime(settings.DEFAULT_FROM_DATE,
                                           "%Y-%m-%d")

        publications = Publication.objects.filter(**params).values_list(
            aggregator, "shukachpublication__shukach_id")
        l_part = self._convert(publications)

        self._client = Client(settings.CLICKHOUSE_HOST,
                              database=settings.CLICKHOUSE_DB,
                              user=settings.CLICKHOUSE_USER,
                              password=settings.CLICKHOUSE_PASSWORD)

        total = len(publications)
        current = 0
        queryset = []

        for agg, ids in l_part.items():
            results = dict(
                zip(self.admixer_values_list, [[], [], [], [], [], [], 0, 0]))
            for batch_ids in self._chunks(ids, 10000):
                logger.info("Sent %d ids" % len(batch_ids))
                item = self._query_admixer_data(batch_ids, start_date,
                                                end_date)
                for key in self.admixer_values_list:
                    results[key] += item[key]
                current += len(batch_ids)
                logger.info("Processed: %d/%d" % (current, total))
            queryset.append({
                "aggregator": agg,
                "views": results["views"],
                "platforms": dict(Counter(results["platform"])),
                "browsers": dict(Counter(results["browser"])),
                "regions": dict(Counter(results["region"])),
                "age_groups": dict(Counter(results["age"])),
                "gender_groups": dict(Counter(results["gender"])),
                "income_groups": dict(Counter(results["income"])),
                "uniques": results["uniques"]
            })

        self._client.disconnect()

        page = self.paginate_queryset(queryset)
        if page is not None:
            return self.get_paginated_response(queryset)
        return Response(queryset)
        if write_to_db == 0:
            print(jsonrow)
        if counter == 1:
            kafka_df = pd.DataFrame({'Message': jsonrow}, index=np.arange(1))
        else:
            # DataFrame.append returns a new frame (and was removed in pandas 2.0);
            # the original discarded the result, so the rows were never accumulated.
            kafka_df = kafka_df.append(pd.DataFrame({'Message': jsonrow}, index=np.arange(1)))
        if (counter % mod_factor) == 0:
            print('Printing Dataframe')
            print(kafka_df)
            if write_to_db == 0:
                sys.exit()
            if write_to_db == 1:
                try:
                    clk_haus_client = Client('localhost', settings=clk_settings, connect_timeout=60, send_receive_timeout=900, sync_request_timeout=120)
                    clk_haus_client.execute(insert_clause, kafka_df.to_dict('records'))
                    clk_haus_client.disconnect()
                except Exception as e:
                    print(str(e))
            kafka_array = []
            kafka_df = pd.DataFrame()
            counter = 0
        counter = counter + 1
except Exception as e:
    print("Outer Exception")
    print(str(e))
    sys.exit()
except KeyboardInterrupt:
    sys.exit()
Example #18
class ClickHouse(object):
    """
    ClickHouse is a facade pattern class based on clickhouse-driver python API for ClickHouse DBMS
    It defines at a higher-level useful commands and adds to this API tracing/debug functionality and
    improved output format with Pandas dataframes.
    """
    def __init__(self, host, port, user, password, database, trace=0):
        self._client = 'ClickHouse'
        self._host = host
        self._port = port
        self._user = user
        self._password = password
        self._database = database
        self._api = None
        self._trace = trace
        self._last_query = None
        # clickhouse-driver last query execution statistics variables
        (self._lastquery_id, self._resultset_rows, self._elapsed,
         self._processed_rows, self._processed_bytes,
         self._total_rows) = [None, None, None, None, None, None]

        if host == 'localhost':
            self._host = '127.0.0.1'
        # Create connection with clickhouse driver API
        try:
            self._api = Client(host=host,
                               database=database,
                               port=port,
                               user=user,
                               password=password)
            self._api.execute('SHOW TABLES')
        except Exception:
            raise DBConnectionFailed(
                f'Connection to ClickHouse failed. Check connection parameters'
            )

    @property
    def last_query(self):
        return self._last_query

    @property
    def last_query_stats(self):
        return [
            self._lastquery_id, self._resultset_rows, self._elapsed,
            self._processed_rows, self._processed_bytes, self._total_rows
        ]

    def sql(self,
            sql='',
            cols=None,
            index=None,
            split=True,
            auto=False,
            params=None,
            columnar=False,
            qid=None,
            execute=True):
        """
        This method is calling clickhouse-driver execute() method to execute sql query
        Connection has already been established.
        :param sql: ClickHouse SQL query string that will be sent to the server
        :param cols: pandas dataframe columns
        :param index: pandas dataframe index columns
        :param split: either split the columns string argument or leave it as-is
        :param auto: if True, try to extract the columns from SQL SELECT , , , FROM and pass them to `cols`
        :param params: clickhouse-driver execute() parameters
        :param columnar: if specified the result will be returned in column-oriented form. Defaults to row-like form.
        :param qid: query identifier. If no query id specified ClickHouse server will generate it
        :param execute: execute SQL commands only if execute=True

        :return: pandas dataframe
        """
        # Initialization stage
        tuples = ()
        self._last_query = sql
        self._lastquery_id = qid
        (self._elapsed, self._resultset_rows, self._processed_rows,
         self._processed_bytes, self._total_rows) = [0, 0, 0, 0, 0]

        # clickhouse-driver execution of sql statement
        # TODO: 1. choose the output format, i.e. display tuples, pandas dataframe, dictionary, etc...
        #
        # TODO: 2. paging with a generator e.g. gen = (row for row in cql.execute('SELECT * from FloatOnSSD_SRC')
        # TODO: 2. and clickhouse-driver streaming results, i.e. execute_iter command
        if execute:
            tuples = self._api.execute(query=sql,
                                       params=params,
                                       columnar=columnar,
                                       query_id=qid)
            self._elapsed = self._api.last_query.elapsed
            # Avoid AttributeError: 'NoneType' object has no attribute 'rows' in clickhouse-driver
            try:
                self._resultset_rows = self._api.last_query.profile_info.rows
                self._processed_rows = self._api.last_query.progress.rows
                self._processed_bytes = self._api.last_query.progress.bytes
                self._total_rows = self._api.last_query.progress.total_rows
            except AttributeError:
                pass

        # Transform tuples to pandas dataframe
        # Start measuring elapsed time for transforming python tuples to pandas dataframe
        t_start = time.perf_counter()
        if auto:
            pos1 = sql.find('SELECT') + 7
            pos2 = sql.find('FROM', pos1)
            cols = sql[pos1:pos2 - 1]

        if cols and split:
            cols = cols.split(', ')
        if index and split:
            index = index.split(', ')
        try:
            result = ETL.get_dataframe(tuples, cols, index)
        except Exception:
            print(sql)
            raise PandasError(
                f'Failed to construct Pandas dataframe, check query and parameters'
            )
        # End measuring elapsed time for pandas dataframe transformation
        t_end = time.perf_counter()

        # Debug info
        if self._trace > 2:
            print(
                f'QueryID:{qid}\nLatency for pandas dataframe transformation : {round(t_end-t_start, 3)} sec.'
            )
        if self._trace > 1:
            print(
                f'{self._last_query}\n╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌'
            )
        if self._trace > 0:
            lqs = self.last_query_stats
            print(
                f'QueryID:{lqs[0]}\nElapsed: {round(lqs[2], 3)} sec',
                f'{lqs[1]} rows in set.\nProcessed: {lqs[3]} rows, {round(lqs[4]/1048576, 3)} MB',
                '\n___________________________________________________________________________'
            )

        if result.empty:
            if self._trace > 0:
                print(
                    'Done.\n╚═══════════════════════════════════════════════════════════════════════╝'
                )
            return None
        else:
            return result

    def cmd(self,
            cmd,
            dbhost=None,
            dbport=None,
            dbuser=None,
            dbpassword=None,
            db=None,
            table=None,
            engine=None,
            partkey=None,
            skey=None,
            settings=None,
            aggr=False,
            group_by=None,
            heading=None,
            fields=None,
            projection='*',
            where=None,
            hb2=None,
            source=None,
            ha2=None,
            fullpath=None,
            sql=None,
            active=True,
            limit=None,
            execute=True):
        """
        Basically this is a wrapper method that constructs sql statements,
        `sql` method executes these statements
        :param cmd: command controls the type of result to return. Default is to return the result set from query
        :param dbhost: mysql host
        :param dbport: mysql port
        :param dbuser: mysql database user
        :param dbpassword: mysql database user password
        :param db: database name
        :param table: table name
        :param engine: the type of clickhouse engine
        :param partkey: partition key
        :param skey: sorting key
        :param settings: clickhouse engine settings
        :param aggr: aggregate results
        :param group_by: SQL GROUP BY construct
        :param heading: list of field names paired with clickhouse data types
               ( 'fld1_name dtype1', 'fld2_name dtype2'... )
        :param fields: list of field names, used in pandas dataframe and in insert command
        :param projection: list of columns from a table, used in select command
        :param where: SQL WHERE construct in select command
        :param source: clickhouse file format, mysql, odbc, etc...
        :param fullpath: fullpath of the flat file used to read data from (relative to ClickHouse user_files_path)
        :param sql: SQL query
        :param active: select only active parts
        :param hb2: select parts with a specific hb2 dimension (hb2 is the dim2 of the Entity/ASET key)
               default hb2='%'
        :param ha2: attribute dimension that is used in insert command (ImportedDataResourceWithRightJoin)
        :param limit: SQL limit
        :param execute: Execute the command only if execute=True

        Each one of the following commands takes specific parameters, `execute` is common for all of them:
        `tables`    : db, engine, table, group_by

        `columns`   : db, table, aggr

        `parts`     : db, table, hb2, active

        `optimize`  : table

        `mutations` : table, limit

        `query_log` :

        `create`    : table, heading, engine, partkey, skey, settings

        `select`    : dbhost, dbport, dbuser, dbpassword, db, table,
                      source, fullpath, heading, fields, where, projection, limit

        `insert`    : table, fields, source, ha2

        :return: query result set in a pandas dataframe
        """
        # Initialize local variables
        #

        # ------------------------------------------------------------------------------------
        # check cmd argument
        # ------------------------------------------------------------------------------------
        if cmd not in cmd_types:
            raise InvalidCmdOperation(
                f'Invalid command operation: failed with parameter cmd={cmd}')
        # ---------------------------------------------------------------------------------------
        # parsing cmd argument
        # ---------------------------------------------------------------------------------------
        if cmd == 'parts' and table is not None and db is not None:
            activepart = None
            # return information about parts of MergeTree tables
            if active is None:
                activepart = None
            elif active:
                activepart = 1
            elif not active:
                activepart = 0

            sel = f'\nSELECT table, database, partition_id, name, active, marks, rows,'
            sel += '\n        min_block_number as min_blk, max_block_number AS max_blk,'
            sel += '\n        level, toDecimal32(primary_key_bytes_in_memory/1024, 3) AS pk_mem'
            frm = 'FROM system.parts'
            if hb2 is None:
                wh = f'''WHERE database='{db}' AND table = '{table}' '''
            else:
                wh = f'''WHERE database='{db}' AND table = '{table}' AND partition_id LIKE '{hb2}-%' '''
            if active is not None:
                wh += f''' AND active = {activepart}'''
            ordname = f'ORDER BY name'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh, order=ordname)
            # execute SQL query
            cols = 'table, db, PID, name, active, marks, rows, min_blk, max_blk, level, pk_mem (KB)'
            return self.sql(query,
                            cols,
                            index='PID',
                            qid='Parts Command',
                            execute=execute)
        elif cmd == 'mutations' and table is not None:
            # mutations allows changing or deleting lots of rows in a table
            # return information about mutations of MergeTree tables and their progress
            sel = f'\nSELECT table, command, create_time, block_numbers.number AS blk, parts_to_do AS parts, is_done,'
            sel += '\n        latest_failed_part AS failed_at, latest_fail_time AS failed_time'
            frm = f'FROM system.mutations'
            wh = f'WHERE table = \'{table}\''
            ordtime = f'ORDER BY create_time DESC'
            if limit:
                lim = f'LIMIT {limit}'
            else:
                lim = None
            if group_by:
                grp = f'GROUP BY {group_by}'
            else:
                grp = None
            # construct query
            query = sql_construct(select=sel,
                                  frm=frm,
                                  where=wh,
                                  group_by=grp,
                                  order=ordtime,
                                  limit=lim)
            # execute SQL query
            cols = 'table, command, created_at, blk, parts, is_done, failed_at, failed_time'
            self.sql('system flush logs', qid='flush logs', execute=execute)
            return self.sql(query,
                            cols,
                            qid='Mutation Information Command',
                            execute=execute)
        elif cmd == 'optimize' and table is not None:
            # construct query
            query = f'OPTIMIZE TABLE {table} FINAL'
            # execute SQL query
            return self.sql(query,
                            qid='Optimize Engine Command',
                            execute=execute)
        elif cmd == 'query_log':
            # metadata for queries logged in the ClickHouse table with log_queries=1 setting
            sel = f'SELECT query_id AS id, user, client_hostname as host, client_name as client,'
            sel += '\n        result_rows AS in_set, toDecimal32(query_duration_ms / 1000, 3) AS sec,'
            sel += '\n        toDecimal32(memory_usage/1048576, 3) AS MEM_MB,'
            sel += '\n        read_rows as R_Rows, toDecimal32(read_bytes / 1048576, 3) AS R_MB,'
            sel += '\n        written_rows AS W_Rows, toDecimal32(written_bytes/1048576, 3) AS W_MB, query'
            frm = f'FROM system.query_log'
            wh = f'WHERE (type = 2) AND (query NOT LIKE \'%query_duration_ms%\')'
            ordtime = f'ORDER BY event_time DESC'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh, order=ordtime)
            # execute SQL query
            cols = 'id, user, host, client, in_set, sec, MEM_MB, R_Rows, R_MB, W_Rows, W_MB, query'
            self.sql('system flush logs',
                     qid='flush logs command',
                     execute=execute)
            return self.sql(query,
                            cols,
                            qid='Query Log Command',
                            index='id',
                            execute=execute)
        elif cmd == 'tables':
            # Contains metadata of each table that the server knows about. Detached tables are not shown
            sel = f'SELECT database as db, engine, name as table, '
            sel += '\n        partition_key as partkey, sorting_key as skey, primary_key as pkey'
            frm = f'FROM system.tables'
            wh = f'WHERE db=\'{db}\' '
            if table:
                wh += f'AND table like \'%{table}%\''
            if engine:
                wh += f'AND engine=\'{engine}\''
            ordengine = f'ORDER BY db, engine'
            # construct query
            query = sql_construct(select=sel,
                                  frm=frm,
                                  where=wh,
                                  order=ordengine)
            if group_by:
                sel = f'SELECT {group_by}, groupArray(table) AS tables'
                frm = f'FROM ( \n{query} )'
                grp = f'GROUP BY {group_by}'
                query = sql_construct(select=sel, frm=frm, group_by=grp)
                cols = 'engine, tables'
            else:
                cols = 'db, engine, table, partkey, skey, pkey'
            # execute SQL query
            return self.sql(query,
                            cols,
                            qid='Table Engines Metadata Command',
                            execute=execute)
        elif cmd == 'columns':
            # information about the columns in a table.
            sel = f'SELECT name, comment, type,'
            sel += '\n       toDecimal32(data_compressed_bytes/1048576, 3) as Compressed_MB,'
            sel += '\n       toDecimal32(data_uncompressed_bytes/1048576, 3) as Uncompressed_MB,'
            sel += '\n       toDecimal32(marks_bytes/1024, 3) as marks_KB'
            frm = 'FROM system.columns'
            wh = f'WHERE database=\'{db}\' AND table=\'{table}\''
            ordtype = f'ORDER BY type DESC, Compressed_MB DESC'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh, order=ordtype)
            if aggr:
                sel = f'SELECT any(\'{db}\') as db, any(\'{table}\') as table,'
                sel += f'\n      sum(Compressed_MB) as total_compressed_MB,'
                sel += f'\n      sum(Uncompressed_MB) as total_uncompressed_MB, sum(marks_KB) as total_marks_KB,'
                sel += f'\n      argMin(name, Compressed_MB) as min_column, min(Compressed_MB) as min_MB,'
                sel += f'\n      argMax(name, Compressed_MB) as max_column, max(Compressed_MB) as max_MB,'
                sel += f'\n     avg(Compressed_MB) as avg_MB'
                frm = f'FROM ( \n{query} )'
                query = sql_construct(select=sel, frm=frm)
                cols = 'db, table, total_compressed_MB, total_uncompressed_MB, total_marks_KB, '
                cols += 'min_column, min_MB, max_column, max_MB, avg_MB'
            else:
                cols = 'name, comment, type, Compressed_MB, Uncompressed_MB, marks_KB'
            # execute SQL query
            return self.sql(query,
                            cols,
                            qid='Table Columns Metadata Command',
                            execute=execute)
        elif cmd == 'create':
            # Check engine passed
            if engine not in engine_types:
                raise InvalidEngine(
                    f'Invalid TRIADB engine: failed with parameter engine={engine}'
                )
            # join list of strings to ', \n' separated string
            structure = ', \n'.join(heading)
            query = f'CREATE TABLE {table}'
            query += f'({structure})'
            query += f'\nENGINE = {engine}()'
            query += f'\nPARTITION BY {partkey}'
            query += f'\nORDER BY {skey}'
            query += f'\nSETTINGS {settings}'

            # execute SQL query
            self.sql(f'DROP TABLE IF EXISTS {table}',
                     qid=f'Drop Table {table}',
                     execute=execute)
            return self.sql(query,
                            qid=f'Create Engine {engine} Command',
                            execute=execute)
        elif cmd == 'select':
            if source not in source_types:
                raise InvalidSourceType(
                    f'Invalid TRIADB source type: failed with parameter source={source}'
                )
            query = None
            result = None

            # join list of strings to ', \n' separated string
            if source in ['CSVWithNames', 'TabSeparatedWithNames', 'MySQL']:
                sel = f'SELECT {projection}'
                frm = ''
                if source in ['CSVWithNames', 'TabSeparatedWithNames']:
                    structure = ', '.join(heading)
                    frm = f'FROM file('
                    frm += f"\n   '{fullpath}',"
                    frm += f"\n   '{source}',"
                    frm += f"\n   '{structure}'"
                    frm += f'\n)'
                elif source == 'MySQL':
                    frm = f"FROM mysql('{dbhost}:{dbport}', '{db}', '{table}', '{dbuser}', '{dbpassword}')"

                if where:
                    wh = f'WHERE {where}'
                else:
                    wh = None

                if limit:
                    lim = f'LIMIT {limit}'
                else:
                    lim = None
                # construct query
                query = sql_construct(select=sel, frm=frm, where=wh, limit=lim)
                # execute SQL query
                result = self.sql(query,
                                  qid='Select rows from flat file Command',
                                  cols=fields,
                                  split=False,
                                  execute=execute)
            elif source == 'ImportedDataResource':
                structure = ', '.join(heading)
                sel = f'SELECT {structure}'
                frm = f'FROM {table} '
                grp = 'GROUP BY val'
                hav = 'HAVING isNotNull(val)'
                ordval = 'ORDER BY val'
                # construct query
                query = sql_construct(select=sel,
                                      frm=frm,
                                      group_by=grp,
                                      having=hav,
                                      order=ordval)
                # execute SQL query
                result = self.sql(
                    query,
                    qid='Select HyperAtom AdjacencyLists Command',
                    cols=fields,
                    split=False,
                    execute=execute)
            # if query is used in other commands use execute=false to return it
            if execute:
                return result
            else:
                return query
        elif cmd == 'insert':
            if source not in source_types:
                raise InvalidSourceType(
                    f'Invalid TRIADB source type: failed with parameter source={source}'
                )
            sqlid = None
            frm = None
            # Inserting the results of SELECT
            ins = f'INSERT INTO {table}'
            sel = f'SELECT ' + ', '.join(fields)
            # execute SQL query
            if source == 'file':
                # from part
                frm = f'FROM (\n{sql} \n)'
                # query id : insert from file to import data resource
                sqlid = 'InsertFromFile'
            elif source == 'ImportedDataResourceWithRightJoin':
                # from part
                frm = f'FROM {table} \nRIGHT JOIN(\n{sql} \n) AS A USING val\n WHERE ha2={ha2}'
                # query id : insert from imported data resource to load data on data type dictionary engines
                sqlid = 'InsertFromImportedDataResourceWithRightJoin'
            elif source == 'ImportedDataResource':
                # from part
                frm = f'FROM (\n{sql} \n)'
                # query id : insert from imported data resource to load data on data type dictionary engines
                sqlid = 'InsertFromImportedDataResource'
            elif source == 'TableEngine':
                # in that case sql parameter takes the name of the table engine
                # from part
                frm = f'FROM {sql}'
                # query id : insert from imported data resource to load data on data type dictionary engines
                sqlid = 'InsertFromTableEngine'
            elif source == 'DataTypeDictionary':
                # from part (sql variable here is the name of the DataTypeDictionary table)
                frm = f'FROM {sql}'
                # insert from DataTypeDictionary to load data onto HAtom and HLink
                sqlid = 'InsertFromDataTypeDictionary'
            # construct query
            query = ins + f'\n{sel}\n{frm}'
            return self.sql(query, qid=f'{sqlid} Command', execute=execute)
        else:
            raise InvalidCmdOperation(f'Invalid command operation')

    def disconnect(self):
        self._api.disconnect()
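A brief usage sketch of the facade (connection values are placeholders; ETL.get_dataframe and the exception classes come from the surrounding project): sql() returns a pandas dataframe, cmd() wraps common metadata queries.

ch = ClickHouse(host='127.0.0.1', port=9000, user='default', password='',
                database='default', trace=1)
df = ch.sql('SELECT name, engine FROM system.tables LIMIT 5', cols='name, engine')
tables = ch.cmd('tables', db='default')   # dataframe of table metadata
ch.disconnect()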
Example #19
def create_db(db_name, db_host, db_user, db_password):
    client = Client(db_host, user=db_user, password=db_password)
    client.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")
    client.disconnect()
            print(f'Extracting the selected rows of table {left_arc_file_name}')

            #Move the cursor through the compressed table to
            #the start of each row that matches the query.
            #Each new cursor position is counted not from the
            #beginning of the file but from the last remembered
            #position, which in some cases gives an enormous
            #performance gain.
            #Write the selected rows to the output file.
            #Set the flag to indicate that the output file
            #contains non-header rows.
            cur_pointer = 0
            for line_start in res:
                new_pointer = line_start[0]
                left_arc_file_opened.seek(new_pointer - cur_pointer, 1)
                trg_file_opened.write(
                    left_arc_file_opened.readline().decode('UTF-8'))
                cur_pointer = left_arc_file_opened.tell()
                empty_res = False

    #If the indicator flag is still True, there are no
    #intersection/subtraction results for this left table,
    #and only headers made it into the output file.
    #The program deletes such output files.
    if empty_res:
        os.remove(trg_file_path)

client.disconnect()
Example #21
def create_database():
    '''
        Create a ClickHouse database and populate each of
        its tables with information that provides fast
        access to the elements of the corresponding
        compressed source table.
        '''
    ind_dir_path = os.path.normpath(
        input('\nPath to the folder with the archives to index: '))

    trg_dir_path = input('\nPath to the folder for the results: ')

    #For simplicity, the database name is made almost the
    #same as the name of the folder with the indexed files.
    #Connect to ClickHouse and create the client object.
    db_name = f'DBCH{os.path.basename(ind_dir_path)}'
    client = Client('localhost')

    #Check for a database created from the same data
    #during previous runs of the program.
    #If a previous DB is found, the names of the tables
    #and columns stored there, as well as the data types,
    #will be printed.
    if (f'{db_name}', ) in client.execute('SHOW DATABASES'):
        print(f'\nDatabase {db_name} already exists')
        client.execute(f'USE {db_name}')
        tab_names = [
            tup[0] for tup in client.execute('SHOW TABLES')
            if tup[0] != 'header'
        ]
        table_struc = client.execute_iter(f'DESCRIBE TABLE {tab_names[0]}')
        col_names_n_types = {
            tup[0]: tup[1]
            for tup in table_struc if tup[0] != 'line_start'
        }
        print('\nTables of the previously created database:\n', tab_names)
        print(
            '\nTable columns and corresponding data types of the previously created DB:\n',
            col_names_n_types)

        #Since a DB matching the tables of the selected
        #folder was already created earlier, it can be
        #used right away via the frontend.
        #The other option is to recreate the database,
        #for example, to reindex these tables
        #by different columns.
        recreate = input('''\nRecreate the database?
[yes(|y)|no(|n|<enter>)]: ''')
        if recreate in ['yes', 'y']:
            client.execute(f'DROP DATABASE {db_name}')
        elif recreate in ['no', 'n', '']:
            return ind_dir_path, trg_dir_path, db_name, tab_names, col_names_n_types
        else:
            print(f'{recreate} is not a valid option')
            sys.exit()

    ram = int(input('\nAmount of RAM in the computer, GB: ')) * 1e9

    detect_headers = input(
        '''\nDetect the VCF (##) and UCSC (track_name=) headers
of the indexed tables automatically, or will you
specify the number of header lines yourself later?
(A previewer for large files is available in the repository
https://github.com/PlatonB/bioinformatic-python-scripts)
[auto(|a)|manual(|m)]: ''')
    if detect_headers in ['auto', 'a']:
        num_of_unind = None

    elif detect_headers in ['manual', 'm']:
        num_of_unind = input('''\nNumber of lines to skip
at the beginning of each indexed table
(Important! Do not count the tab-separated header row)
(press Enter ==> all lines are processed)
[0(|<enter>)|1|2|...]: ''')
        if num_of_unind == '':
            num_of_unind = 0
        else:
            num_of_unind = int(num_of_unind)
    else:
        print(f'{detect_headers} is not a valid option')
        sys.exit()

    cont, col_info = 'y', {}
    while cont not in ['no', 'n', '']:
        col_name = input('''\nName of the column to index
(The case must match)
[#Chrom|pos|RSID|...]: ''')
        col_name = ''.join(col_name.split('#'))

        data_type = input(
            '''\nDoes the selected column contain integers, decimal numbers or strings?
(examples of a decimal number: 0.05, 2.5e-12)
(examples of a string: X, Y, A/C/G, rs11624464, HLA-DQB1)
[integer(|i)|decimal(|d)|string(|s)]: ''')
        if data_type in ['integer', 'i']:
            data_type = 'Int64'
            tale = None

        elif data_type in ['decimal', 'd']:
            tale = input('''\nHow many digits should be kept after the decimal point?
(press Enter ==> 5)
[...|5(|<enter>)|...|18): ''')
            if tale == '':
                tale = '5'
            #NOTE: the original check `0 > int(tale) > 18` could never be true.
            elif not 0 <= int(tale) < 18:
                print(f'{tale} is not a valid option')
                sys.exit()
            data_type = f'Decimal64({tale})'

        elif data_type in ['string', 's']:
            data_type = 'String'
            tale = None
        else:
            print(f'{data_type} is not a valid option')
            sys.exit()
        col_info[col_name] = [data_type, 'cell_index', tale]

        cont = input('''\nIndex by one more column?
(press Enter ==> no)
[yes(|y)|no(|n|<enter>)]: ''')
        if cont not in ['yes', 'y', 'no', 'n', '']:
            print(f'{cont} is not a valid option')
            sys.exit()

    #Complete the dictionary of names and characteristics
    #of the user-selected columns, built during the user
    #dialog, with a key-value pair describing the column
    #of byte offsets into the archived table.
    col_info['line_start'] = ['Int64']

    #Get the names of the user-specified columns and of
    #the column with the compressed-table offsets.
    col_names = list(col_info.keys())

    #ClickHouse does not index columns, it simply sorts them.
    #Allocate half of the RAM for sorting.
    #If that is not enough, ClickHouse falls back to external
    #sorting: chunks of the column are written to disk, each
    #is sorted, and the chunks are then merged step by step.
    client.execute(f'SET max_bytes_before_external_sort = {int(ram) // 2}')

    #Create the DB and select it as the default database
    #for all subsequent queries.
    client.execute(f'CREATE DATABASE {db_name}')
    client.execute(f'USE {db_name}')

    print('')

    #Process the archives, each of which
    #contains exactly one table.
    arc_file_names = os.listdir(ind_dir_path)
    for arc_file_name in arc_file_names:
        if arc_file_name.startswith('.~lock.'):
            continue
        with gzip.open(os.path.join(ind_dir_path, arc_file_name),
                       mode='rt') as arc_file_opened:

            #Automatically detect and skip (read idly) the
            #headers of tables in the widespread bioinformatics
            #formats VCF and UCSC BED.
            #The last header read (which is the table header
            #row) is kept.
            if num_of_unind is None:
                while True:
                    header_row = process_line(arc_file_opened)
                    if re.match(r'##|track_name=', header_row[0]) is None:
                        break

            #Idly read the number of header lines specified by
            #the user and keep the table header row.
            else:
                for unind_index in range(num_of_unind):
                    arc_file_opened.readline()
                header_row = process_line(arc_file_opened)

            #The program requires all source tables
            #to be structured identically.
            #Identical header rows are taken as proof
            #that this rule is respected.
            #The header of the first processed table is
            #taken as the reference; the headers of the
            #following tables are compared against it.
            if 'common_header_row' not in locals():
                common_header_row = copy.deepcopy(header_row)
            elif header_row != common_header_row:
                print('The headers of the indexed tables do not match')
                sys.exit()

            #The header cells that title the columns chosen
            #by the user will later become the names of the
            #DB columns.
            #Since those names must not contain the #
            #character (a ClickHouse requirement), remove it.
            for header_cell_index in range(len(header_row)):
                if header_row[header_cell_index].find('#') != -1:
                    header_row[header_cell_index] = ''.join(
                        header_row[header_cell_index].split('#'))

            #During the user dialog a dictionary was built
            #with the user-specified names of the future DB
            #columns and the corresponding ClickHouse
            #data types.
            #To every key of that dictionary, except the one
            #describing the row-offset column, add the index
            #of the column name, i.e. its position in the
            #header row.
            #The same indices will also locate the
            #corresponding cells in every row of the
            #source table.
            for col_name in col_names[:-1]:
                col_info[col_name][1] = header_row.index(col_name)

            #For simplicity, name the DB tables after the
            #source files, but without dots and hyphens,
            #since those are not allowed in the names of
            #ClickHouse tables.
            #Table names must not start with a digit either,
            #so prepend a letter prefix to each name.
            tab_name = 'TBL' + arc_file_name.replace('.', 'DOT').replace(
                '-', 'DEFIS')

            #Create the DB table that, once filled, will serve
            #as a guide to the corresponding large
            #gzip-archived table.
            #The name and data type of each DB column are
            #taken from the dictionary built earlier.
            #By default ClickHouse compresses every column
            #with LZ4, which is very fast but barely reduces
            #the size.
            #Apply Zstandard instead, which is optimal in
            #terms of speed and compression ratio.
            client.execute(f'''CREATE TABLE {tab_name}
                                           ({", ".join([col_name + " " + col_ann[0] + " CODEC(ZSTD(22))" for col_name, col_ann in col_info.items()])})
                                           ENGINE = MergeTree()
                                           ORDER BY ({", ".join(col_names[:-1])})'''
                           )

            print(f'Table {tab_name} of the new database is being filled')

            #The data will be fed into the
            #database in one or more batches.
            #To keep the batching under control,
            #the batch size is tracked below.
            #Zero is assigned as the starting
            #value of that counter.
            fragment, fragment_len = [], 0

            #The DB table keeps being filled until
            #the rows of the source table run out.
            while True:

                #A batch size of 100000 rows follows the
                #recommendation from the official
                #ClickHouse documentation.
                if fragment_len == 100000:
                    client.execute(
                        f'''INSERT INTO {tab_name}
                                                           ({", ".join(col_names)})
                                                           VALUES''', fragment)

                    #After the batch is inserted, the list
                    #that holds it is cleared and the size
                    #counter is reset.
                    fragment.clear()
                    fragment_len = 0

                #Get the byte offset of the start of the
                #current row of the source table.
                #Strip the \n and split the row into a list.
                line_start, row = arc_file_opened.tell(), process_line(
                    arc_file_opened)

                #Reading of the source table is finished.
                #Most likely the number of rows in the table
                #is not a multiple of 100000, so by now some
                #more data has accumulated.
                #Write that remainder into the database.
                if row == ['']:
                    if fragment_len > 0:
                        client.execute(
                            f'''INSERT INTO {tab_name}
                                                                   ({", ".join(col_names)})
                                                                   VALUES''',
                            fragment)
                    break

                #Pick out the cells of those compressed-table
                #columns by which indexing is done.
                #Store these cells and the byte offsets of
                #the table rows in a list.
                cells = fetch_cells(row, col_info, line_start)

                #Start a new batch, e.g. after the previous
                #one has been sent to the DB, or extend the
                #current batch.
                fragment.append(dict(zip(col_names, cells)))

                #Either way, increment the
                #batch size counter.
                fragment_len += 1

    #Collect information about the layout of the database.
    #It will be used later by the frontends.
    #Also print this information so that the user can
    #easily find their way around the database when
    #running the frontends.
    tab_names = [tup[0] for tup in client.execute('SHOW TABLES')]
    col_names_n_types = {
        col_name: col_ann[0]
        for col_name, col_ann in col_info.items() if col_name != 'line_start'
    }
    print('\nTables of the new database:\n', tab_names)
    print('\nTable columns and corresponding data types of the new DB:\n',
          col_names_n_types)

    #The header row common to all source tables
    #goes into a separate DB table.
    client.execute('''CREATE TABLE header
                          (header_cells String)
                          ENGINE = TinyLog''')
    client.execute(
        f'''INSERT INTO header
                           (header_cells)
                           VALUES''', [{
            'header_cells': header_cell
        } for header_cell in common_header_row])

    client.disconnect()

    return ind_dir_path, trg_dir_path, db_name, tab_names, col_names_n_types
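create_database relies on two helpers, process_line and fetch_cells, that are not part of this excerpt; a minimal sketch of what they plausibly do, inferred only from how they are called above:

def process_line(file_opened):
    # Read one line, drop the trailing newline and split on tabs
    # (at EOF this yields [''], which the loop above uses as its stop condition).
    return file_opened.readline().rstrip('\n').split('\t')


def fetch_cells(row, col_info, line_start):
    # Pick the indexed cells in col_info order and append the row's byte offset.
    cells = []
    for col_name, col_ann in col_info.items():
        if col_name == 'line_start':
            cells.append(line_start)
        elif col_ann[0] == 'Int64':
            cells.append(int(row[col_ann[1]]))
        elif col_ann[0].startswith('Decimal64'):
            cells.append(round(float(row[col_ann[1]]), int(col_ann[2])))
        else:
            cells.append(row[col_ann[1]])
    return cells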
class ClickHouse:

    def __init__(self):
        self.client = None

    def __enter__(self):
        self.client = Client(
            settings.CLICKHOUSE_HOST,
            user=settings.CLICKHOUSE_USER,
            password=settings.CLICKHOUSE_PASSWORD,
            database=settings.CLICKHOUSE_NAME,
        )

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.client.disconnect()

    def execute(self, query, *args, **kwargs):
        echo = kwargs.pop('echo', None)
        if echo is not None:
            print(query)

        return self.client.execute(query, *args, **kwargs)

    def insert_rows(self, table, fields, rows):
        n = len(rows)
        if n > 0:
            fields = list(fields)
            values = []
            for row in rows:
                values.append(tuple(row[name] for name in fields))

            columns = ', '.join(f'"{name}"' for name in fields)
            self.execute(
                f'INSERT INTO {table} ({columns}) VALUES', values,
                types_check=True,
                #echo=True,
            )

        return n

    def upload(self, name, metadata, fields, rows):
        rows2 = []
        for time, data in rows:
            data = data.copy()
            data['TIMESTAMP'] = int(time.timestamp())
            rows2.append(data)
        rows = rows2

        # Guess the table name
        table_name = metadata.get('table_name')
        if table_name:
            table = f"{name}_{table_name}"
        else:
            table = str(name)

        # Create the table if it does not exist
        # The Replacing engine allows to avoid duplicates. Deduplication is
        # done in the background, so there may be duplicates until the parts
        # are merged.
        self.execute(
            f"CREATE TABLE IF NOT EXISTS {table} ({get_column('TIMESTAMP')}) "
            f"ENGINE = ReplacingMergeTree() ORDER BY TIMESTAMP",
            #echo=True,
        )

        # Get the table columns
        database = settings.CLICKHOUSE_NAME
        cols = set([name for name, in self.execute(
            f"SELECT name FROM system.columns "
            f"WHERE database = '{database}' AND table = '{table}';",
            #echo=True,
        )])

        # Add new columns
        fields = set(fields)
        new = fields - cols
        if new:
            actions = ', '.join(f'ADD COLUMN {get_column(name)}' for name in new)
            self.execute(f'ALTER TABLE {table} {actions};')

        self.insert_rows(table, fields, rows)

        return metadata


    def select(self, table, columns=None, where=None, group_by=None,
               order_by=None, limit=None, limit_by=None, **kwargs):

        if not columns:
            columns = '*'
        else:
            assert type(columns) is list
            columns = ', '.join(columns)

        query = [f'SELECT {columns} FROM {table}']

        clauses = [
            ('WHERE {}', where),
            ('GROUP BY {}', group_by),
            ('ORDER BY {}', order_by),
            ('LIMIT {0[0]} BY {0[1]}', limit_by),
            ('LIMIT {}', limit),
        ]
        for fstr, value in clauses:
            if value not in (None, ''):
                query.append(fstr.format(value))

        query = ' '.join(query)
        return self.execute(query, **kwargs)
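A usage sketch of this helper; the table and column names are made up:

with ClickHouse() as ch:
    rows = ch.select('sensor_env', columns=['TIMESTAMP', 'temperature'],
                     where='temperature > 20', order_by='TIMESTAMP DESC',
                     limit=10, echo=True)
    for ts, temp in rows:
        print(ts, temp)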