def sort(self):
    if self._sorted:
        return self
    if self._type == 'String':
        raise Exception('Sorting Strings is currently not supported in MergeVectors')
    # Range-partition by quantile boundaries so each chunk can be sorted
    # independently; reading the chunks back in boundary order yields a
    # globally sorted vector.
    ranges = [h['histogram'] for h in self.quantiles().exec().to_records()]
    queries = []
    for i, a in enumerate(ranges):
        sql = 'CREATE TABLE {chunk}_ ENGINE = Memory() AS '
        sql += 'SELECT rowNumberInBlock() AS i, v FROM (SELECT v FROM {vector} WHERE '
        if i == 0:
            sql += 'v < {max_bound} ORDER BY v) '
        elif i == len(self._arrays) - 1:
            sql += 'v >= {min_bound} ORDER BY v) '
        else:
            sql += 'v >= {min_bound} AND v < {max_bound} ORDER BY v) '
        sql += 'SETTINGS max_block_size={max_block_size}, max_threads={max_threads}'
        queries.append(
            sql.format(chunk='{chunk}_{idx}'.format(chunk=self._ref, idx=i + 1),
                       max_block_size=round(self._array_chunk_size * 1.2),
                       max_threads=self._max_threads,
                       vector=self._ref,
                       max_bound=a,
                       min_bound=ranges[i - 1]))
    vulkn.Vulkn().scheduler.dispatch(queries)
    self._rebuild_arrays()
    self._sorted = True
    return self
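# A standalone sketch of the range-partitioning idea used by sort() above:
# values are bucketed by quantile boundaries, each bucket is sorted on its own,
# and concatenating the buckets in boundary order gives a global sort. All
# names below are illustrative, not part of MergeVector's API.
import bisect
import statistics

def range_partition_sort(values, num_chunks):
    # Quantile cut points split the data into num_chunks roughly equal ranges.
    bounds = statistics.quantiles(values, n=num_chunks, method='inclusive')
    buckets = [[] for _ in range(num_chunks)]
    for v in values:
        buckets[bisect.bisect_right(bounds, v)].append(v)
    for b in buckets:
        b.sort()   # each bucket sorts independently (in parallel, in principle)
    return [v for b in buckets for v in b]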
def infer_column_types(database, table, columns, sample=0, allow_enums=True):
    tests = []
    results = []
    ctx = vulkn.Vulkn()
    for col in columns:
        r = ctx._conn.select(infer_column_type(database, table, col, sample)).to_records()
        if len(r) > 0:
            tests.append(r[0])
    for i, col in enumerate(tests):
        r = ''
        if tests[i]['recommended_type'] == 'String' and tests[i]['enum'] and allow_enums:
            # Build an Enum from the distinct sampled values, numbered from 1.
            enum = ["'{}'={}".format(st, str(idx + 1))
                    for idx, st in enumerate(sorted(ast.literal_eval(tests[i]['sample'])))]
            r = 'Enum({})'.format(', '.join(enum))
        else:
            r = tests[i]['recommended_type']
        if tests[i]['nullable']:
            r = 'Nullable({})'.format(r)
        if tests[i]['low_cardinality']:
            r = 'LowCardinality({})'.format(r)
        results.append(r)
    return results
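# For illustration, how the Enum branch above renders a sampled value set into
# a ClickHouse Enum type string (the sample below is made up):
import ast
sample = "['pending', 'active', 'closed']"   # shape of tests[i]['sample']
pairs = ["'{}'={}".format(s, i + 1)
         for i, s in enumerate(sorted(ast.literal_eval(sample)))]
print('Enum({})'.format(', '.join(pairs)))
# -> Enum('active'=1, 'closed'=2, 'pending'=3)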
def _create_table(database, table, columns, engine):
    ctx = vulkn.Vulkn()
    column_ddl = ', '.join(['"{}" {}'.format(k, v) for k, v in columns.items()])
    create_ddl = f'CREATE TABLE "{database}"."{table}" ({column_ddl}) ENGINE = {engine}'
    if not ctx.exec(create_ddl):
        raise Exception('Unable to create table')
def _rebuild_arrays(self):
    # Swap the freshly built staging tables ({chunk}_) into place: drop the old
    # chunks first, then rename the staging tables over them.
    ctx = vulkn.Vulkn()
    rename = [f'RENAME TABLE {chunk}_ TO {chunk}' for chunk in self._arrays]
    drop = [f'DROP TABLE IF EXISTS {chunk}' for chunk in self._arrays]
    ctx.scheduler.dispatch(drop)
    ctx.scheduler.dispatch(rename)
def quantiles(self):
    # One quantile boundary per chunk, at evenly spaced fractions of (0, 1].
    cols = ['quantile({})(v)'.format(str((x + 1) / len(self._arrays)))
            for x in range(len(self._arrays))]
    sql = 'SELECT arrayJoin([{quantiles}]) AS histogram FROM {vector}'
    return vulkn.Vulkn().q(sql.format(quantiles=','.join(cols), vector=self._ref))
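# The fractions above split (0, 1] evenly, one boundary per chunk; the final
# fraction is 1.0 (the maximum), so every value falls inside some range:
print([(x + 1) / 4 for x in range(4)])   # with 4 chunks -> [0.25, 0.5, 0.75, 1.0]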
def take(self, num_rows):
    # TODO: This uses too much memory for large values (1 billion..).
    sql = """
        CREATE TABLE {session}_{new_uuid}_{new_chunk} ENGINE=Memory() AS
        SELECT rowNumberInAllBlocks() AS i, v
        FROM (
            WITH (SELECT groupArray(v)
                  FROM (SELECT v FROM {session}_{uuid}_{chunk} ORDER BY i)) AS `#v`
            SELECT `#v`[(number%length(`#v`))+1] AS v FROM numbers_mt(100000000)
        )"""
    r = MergeVector(max_block_size=self._max_block_size,
                    max_threads=self._max_threads,
                    array_chunk_size=self._array_chunk_size)
    r._length = num_rows
    r._ref = '{session}_{uuid}'.format(session=r._session(), uuid=r._uuid)
    arrays = []
    queries = []
    chunk = 0
    chunks = r.chunks(num_rows, self._array_chunk_size)
    for i in range(chunks[0]):
        new_chunk = i + 1
        # Cycle through the source chunks (1-based) when the target needs more
        # chunks than the source has.
        chunk = (i + 1) % len(self._arrays)
        chunk = 1 if chunk == 0 else chunk
        queries.append(
            sql.format(session=r._session(), new_uuid=r._uuid, uuid=self._uuid,
                       new_chunk=new_chunk, chunk=chunk,
                       max_block_size=self._max_block_size,
                       max_threads=self._max_threads))
        arrays.append('{session}_{uuid}_{chunk}'.format(
            session=r._session(), uuid=r._uuid, chunk=new_chunk))
    if (chunks[1] > 0
            or (self._array_chunk_size >= self._length
                and self._array_chunk_size >= num_rows)):
        # Tail chunk: cap the cycled stream at the remaining row count.
        sql += ' LIMIT {length}'.format(length=chunks[1] if chunks[1] > 0 else num_rows)
        arrays.append('{session}_{uuid}_{chunk}'.format(
            session=r._session(), uuid=r._uuid, chunk=len(queries) + 1))
        queries.append(
            sql.format(session=r._session(), new_uuid=r._uuid, uuid=self._uuid,
                       new_chunk=len(queries) + 1,
                       chunk=1 if len(self._arrays) == 1 else (chunk + 1) % len(self._arrays),
                       max_block_size=self._max_block_size,
                       max_threads=self._max_threads))
    ctx = vulkn.Vulkn()
    ctx.scheduler.dispatch(queries)
    ctx.session._cache.extend(arrays)
    ctx.session._cache.append(r._ref)
    r._arrays = arrays
    r._rebuild_ref()
    return r
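# The WITH/SELECT above recycles the source chunk with 1-indexed modular
# lookups; the same idea in plain Python (illustrative only):
def cycle_take(source, num_rows):
    return [source[i % len(source)] for i in range(num_rows)]

# cycle_take([10, 20, 30], 7) -> [10, 20, 30, 10, 20, 30, 10]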
def rand(self, min_value, max_value, length, max_block_size=None, max_threads=2,
         array_chunk_size=None):
    sql = 'CREATE TABLE {session}_{uuid}_{chunk} ENGINE=Memory() AS '
    # Offset the modulo draw by min_value so values land in [min_value, max_value].
    sql += 'SELECT number AS i, {min_value}+toInt64(rand64()%toUInt64({mod})) AS v '
    sql += 'FROM numbers_mt({array_chunk_size}) '
    sql += 'SETTINGS max_block_size={max_block_size}, max_threads={max_threads}'
    # Default to chunks sized for roughly 75% of the available cores.
    array_chunk_size = array_chunk_size or math.ceil(
        length / ((os.cpu_count() * 0.75) - 2))
    if array_chunk_size > 100000000:
        array_chunk_size = 100000000
    max_block_size = max_block_size or array_chunk_size
    uuid = MergeVector.generateUUID()
    r = MergeVector(uuid=uuid, max_block_size=max_block_size,
                    max_threads=max_threads, array_chunk_size=array_chunk_size)
    r._ref = '{session}_{uuid}'.format(session=r._session(), uuid=uuid)
    modsize = 1 + abs(min_value - max_value)
    chunks = r.chunks(length, array_chunk_size)
    queries = []
    arrays = []
    for i in range(chunks[0]):
        chunk = i + 1
        queries.append(
            sql.format(session=r._session(), uuid=uuid, chunk=chunk,
                       min_value=min_value, mod=modsize,
                       array_chunk_size=array_chunk_size,
                       max_block_size=max_block_size, max_threads=max_threads))
        arrays.append('{session}_{uuid}_{chunk}'.format(
            session=r._session(), uuid=uuid, chunk=chunk))
    if chunks[1] > 0:
        arrays.append('{session}_{uuid}_{chunk}'.format(
            session=r._session(), uuid=uuid, chunk=len(queries) + 1))
        queries.append(
            sql.format(session=r._session(), uuid=uuid, chunk=len(queries) + 1,
                       min_value=min_value, mod=modsize,
                       array_chunk_size=chunks[1],
                       max_block_size=max_block_size, max_threads=max_threads))
    ctx = vulkn.Vulkn()
    ctx.scheduler.dispatch(queries)
    ctx.session._cache.extend(arrays)
    ctx.session._cache.append(r._ref)
    r._arrays = arrays
    r._rebuild_ref()
    r._length = length
    return r
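# A sketch of the per-row draw used above: reduce a 64-bit random modulo
# modsize = 1 + abs(min_value - max_value) and shift by min_value. This is
# approximately uniform over [min_value, max_value] (a small modulo bias
# remains whenever modsize does not divide 2**64):
import random

def rand_value(min_value, max_value):
    modsize = 1 + abs(min_value - max_value)
    return min_value + random.getrandbits(64) % modsize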
def _rebuild_ref(self):
    ctx = vulkn.Vulkn()
    parent = ("CREATE TABLE {ref} AS {ref}_1 "
              "ENGINE = Merge('{vulkn_database}', '^{session}_{uuid}_([0-9]*)$')")
    ctx.scheduler.dispatch('DROP TABLE IF EXISTS {ref}'.format(ref=self._ref))
    ctx.scheduler.dispatch(
        parent.format(ref=self._ref,
                      vulkn_database=self._session().split('.')[0],
                      session=self._session().split('.')[1],
                      uuid=self._uuid))
def __new__(cls, database, table=None):
    import vulkn
    ctx = vulkn.Vulkn()
    if table is None:
        if '.' in database:
            database, table = database.split('.')
        else:
            table = database
            database = None
    database = database or ctx._database
    return BaseTableDataTable(ctx, database, table)
def v():
    # Generator-style fixture: yields a Vulkn handle backed by a throwaway
    # local workspace, then tears the workspace down on exit.
    import vulkn
    from vulkn.workspaces import LocalWorkSpace
    ws = LocalWorkSpace(persist=False)
    f = vulkn.Vulkn(host='localhost', port=9001)
    f._port = 9001
    f._reload()
    yield f
    del f
    ws.stop()
def range(self, start_value, end_value, max_block_size=None, max_threads=2,
          array_chunk_size=100000000):
    sql = 'CREATE TABLE {session}_{uuid}_{chunk} ENGINE=Memory() AS '
    # Each chunk holds chunk-local row numbers shifted by the chunk's absolute
    # offset and the start value.
    sql += 'SELECT number AS i, {start_value}+number+{offset} AS v FROM numbers_mt({count}) '
    sql += 'SETTINGS max_block_size={max_block_size}, max_threads={max_threads}'
    queries = []
    arrays = []
    uuid = MergeVector.generateUUID()
    max_block_size = max_block_size or array_chunk_size
    r = MergeVector(uuid=uuid, max_block_size=max_block_size,
                    max_threads=max_threads, array_chunk_size=array_chunk_size)
    r._ref = '{session}_{uuid}'.format(session=r._session(), uuid=uuid)
    length = abs(end_value - start_value)
    chunks = r.chunks(length, array_chunk_size)
    for i in range(chunks[0]):
        chunk = i + 1
        queries.append(
            sql.format(session=r._session(), uuid=uuid, chunk=chunk,
                       start_value=start_value, offset=array_chunk_size * i,
                       count=array_chunk_size,
                       max_block_size=max_block_size, max_threads=max_threads))
        arrays.append('{session}_{uuid}_{chunk}'.format(
            session=r._session(), uuid=uuid, chunk=chunk))
    if chunks[1] > 0:
        # Tail chunk: fewer rows, but offset by the full chunk size.
        arrays.append('{session}_{uuid}_{chunk}'.format(
            session=r._session(), uuid=uuid, chunk=len(queries) + 1))
        queries.append(
            sql.format(session=r._session(), uuid=uuid, chunk=len(queries) + 1,
                       start_value=start_value, offset=array_chunk_size * chunks[0],
                       count=chunks[1],
                       max_block_size=max_block_size, max_threads=max_threads))
    ctx = vulkn.Vulkn()
    ctx.scheduler.dispatch(queries)
    ctx.session._cache.extend(arrays)
    ctx.session._cache.append(r._ref)
    r._arrays = arrays
    r._rebuild_ref()
    r._length = length
    r._sorted = True
    return r
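# The chunk plan used above in miniature, assuming chunks(length, size) returns
# (full_chunk_count, remainder) as the surrounding code suggests (standalone
# sketch; names are illustrative):
def chunk_plan(length, chunk_size):
    full, tail = divmod(length, chunk_size)
    plan = [(i * chunk_size, chunk_size) for i in range(full)]
    if tail:
        plan.append((full * chunk_size, tail))
    return plan   # list of (offset, row_count) per chunk

# chunk_plan(250, 100) -> [(0, 100), (100, 100), (200, 50)]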
def column_list(columns):
    # TODO: Next release. Whole thing is hacky. Needs refactoring.
    from vulkn.types import ArrayVector, ColumnVector
    from vulkn import Vulkn
    ctx = Vulkn()
    rangeCols = []
    selectExprList = []
    q = None
    for idx, column in enumerate(columns):
        if isinstance(column, ArrayVector):
            if hasattr(column, 'vector_name'):
                selectExprList.append('v.{index} AS {name}'.format(
                    index=str(idx + 1), name=column.vector_name))
            else:
                selectExprList.append(
                    'v.{index} AS col{index}'.format(index=str(idx + 1)))
            rangeCols.append("joinGet('{table}', 'v', 1)".format(
                table=str(column._cache_table)))
        if isinstance(column, ColumnVector):
            if hasattr(column, 'vector_name'):
                selectExprList.append('c{index}.v AS {name}'.format(
                    index=str(idx), name=column.vector_name))
            else:
                selectExprList.append(
                    'c{index}.v AS col{index}'.format(index=str(idx)))
            if idx == 0:
                rangeCols.append('FROM ({}) c0'.format(str(column)))
            else:
                rangeCols.append(
                    'ANY LEFT JOIN ({subquery}) c{index} ON (c{prevIndex}.i = c{index}.i)'
                    .format(subquery=str(column), index=str(idx), prevIndex=str(idx - 1)))
    if isinstance(column, ArrayVector):
        arrayCols = [f'v{i+1}' for i, k in enumerate(rangeCols)]
        ctx.session.optimize(final=True)
        q = """WITH arrayJoin(
                arrayMap(
                    ({array_cols}) -> ({array_cols}),
                    {source_cols})) AS v
            SELECT {columns}""".format(array_cols=','.join(arrayCols),
                                       source_cols=',\n'.join(rangeCols),
                                       columns=', '.join(selectExprList))
    if isinstance(column, ColumnVector):
        q = 'SELECT {selectExpr} {rangeCols}'.format(
            selectExpr=', '.join(selectExprList), rangeCols=' '.join(rangeCols))
    return q
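# The ArrayVector branch above fuses N cached arrays row-wise with
# arrayMap((v1, ..., vN) -> (v1, ..., vN), a1, ..., aN) and explodes the
# resulting tuples with arrayJoin; in plain Python that reshaping is zip:
print(list(zip([1, 2, 3], ['a', 'b', 'c'])))   # -> [(1, 'a'), (2, 'b'), (3, 'c')]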
def shuffle(self):
    # Shuffles each chunk independently: rows never cross chunk boundaries.
    sql = 'CREATE TABLE {chunk}_ ENGINE = Memory() AS '
    sql += 'SELECT rowNumberInBlock() AS i, v FROM {chunk} ORDER BY rand() '
    sql += 'SETTINGS max_block_size={max_block_size}, max_threads={max_threads}'
    queries = []
    for a in self._arrays:
        queries.append(
            sql.format(chunk=a, max_block_size=self._max_block_size,
                       max_threads=self._max_threads))
    vulkn.Vulkn().scheduler.dispatch(queries)
    self._rebuild_arrays()
    self._sorted = False
    return self
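# The trade-off in shuffle() above, in miniature: shuffling each chunk in place
# never moves a row between chunks, which is cheap and parallel but weaker than
# a global shuffle (illustrative sketch):
import random

def chunkwise_shuffle(chunks):
    for c in chunks:
        random.shuffle(c)   # per-chunk only; no cross-chunk movement
    return chunks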
def read(self, uri, database, table):
    # TODO: Next release. Hacky. Remove subprocess/cat pipeline.
    v = vulkn.Vulkn()
    header = 'CSVWithNames' if self._options['header'] else 'CSV'
    env = {'LC_ALL': 'C'}
    src = subprocess.Popen(['cat', uri.path], stdout=subprocess.PIPE, env=env,
                           encoding='ascii')
    tgt = ['clickhouse-client', '-A', '-m', '-n']
    tgt += ['--host', v._host, '--port', str(v._port),
            '--user', v._user, '--password', v._password]
    tgt += ['--query', f'INSERT INTO {database}.{table} FORMAT {header}']
    log.debug(str(tgt))
    log.log(LogLevels.SQL, f'INSERT INTO {database}.{table} FORMAT {header}')
    p = subprocess.Popen(tgt, stdin=src.stdout, stdout=subprocess.PIPE, env=env,
                         encoding='ascii')
    src.stdout.close()
    p.communicate()
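# Per the TODO above, the extra `cat` process is avoidable: a file handle can
# feed clickhouse-client's stdin directly. A sketch reusing the in-scope names
# from read() (tgt, env, uri):
#
#   with open(uri.path, encoding='ascii') as src:
#       subprocess.run(tgt, stdin=src, env=env, check=True)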
def cast(self, to_type):
    sql = 'CREATE TABLE {chunk}_ ENGINE = Memory() AS '
    sql += "SELECT i, cast(v, '{type}') AS v FROM {chunk} "
    sql += 'SETTINGS max_block_size={max_block_size}, max_threads={max_threads}'
    toType = to_type if isinstance(to_type, str) else to_type.CAST
    queries = []
    for a in self._arrays:
        queries.append(
            sql.format(chunk=a, type=toType, max_block_size=self._max_block_size,
                       max_threads=self._max_threads))
    vulkn.Vulkn().scheduler.dispatch(queries)
    self._rebuild_arrays()
    self._rebuild_ref()
    self._type = toType
    self._sorted = False
    return self
def fromVector(self, name, columns=(), engine=None, buffer_profile=None, replace=False):
    import vulkn
    database = ''
    table = ''
    ctx = vulkn.Vulkn()
    if isinstance(name, tuple):
        (database, table) = (name[0], name[1])
    if isinstance(name, str):
        (database, table) = name.split('.')
    engine = engine or vulkn.engines.Memory()
    if replace:
        ctx._conn.execute(f'DROP TABLE IF EXISTS {database}.{table}')
    ddl = Table.DDL(database, table, columns, engine, buffer_profile)
    for ddl_query in ddl:
        ctx._conn.execute(ddl_query)
    return BaseTableDataTable(ctx, database, table)
def _session(self):
    return vulkn.Vulkn().session.session_store
def load(self, uri, database, table):
    # TODO: Next release. Use proper vulkn method
    def _create_table(database, table, columns, engine):
        ctx = vulkn.Vulkn()
        column_ddl = ', '.join(['"{}" {}'.format(k, v) for k, v in columns.items()])
        create_ddl = f'CREATE TABLE "{database}"."{table}" ({column_ddl}) ENGINE = {engine}'
        if not ctx.exec(create_ddl):
            raise Exception('Unable to create table')

    schema = self._format._options['schema']
    tmp_db = database
    ctx = vulkn.Vulkn()
    if schema:
        cols = [c._name for c in schema]
        types = [c._col_type for c in schema]
    if schema is None or self._format._options['infer_schema']:
        # Stage a sample as all-String columns, then infer the real types from it.
        cols = self._format.columns(PosixStorage(uri))
        types = ['String'] * len(cols)
        if self._format._options['column_format'] == 'snake_case':
            cols = [snake_case(c) for c in cols]
        sample_engine = self._format._options['sample_engine'] or vulkn.engines.Memory()
        ctx.dropTable(tmp_db, f'tmp_{table}')
        _create_table(tmp_db, f'tmp_{table}', dict(zip(cols, types)), sample_engine)
        # TODO: Next release. Use storage.write method
        sample = self._format.sample(PosixStorage(uri))
        ctx._conn.insert_blob(sample, tmp_db + f'."tmp_{table}"', 'CSV')
        types = infer_column_types(tmp_db, f'tmp_{table}', cols,
                                   self._format._options['sample_size'],
                                   self._format._options['allow_enums'])
        ctx.dropTable(tmp_db, f'tmp_{table}')
    if self._format._options['overwrite']:
        ctx.dropTable(database, table)
    _create_table(database, table, dict(zip(cols, types)),
                  engine=self._format._options['engine'] or vulkn.engines.Memory())
    if self._format._options['infer_schema']:
        # Load via a staging table, then marshal the rows into the typed table.
        sample_engine = self._format._options['sample_engine'] or vulkn.engines.Memory()
        _create_table(tmp_db, f'tmp_{table}', dict(zip(cols, types)), sample_engine)
        # TODO: Next release. Write without cat
        self._format.read(PosixStorage(uri), tmp_db, f'tmp_{table}')
        convert_dml = marshal_columns(tmp_db, f'tmp_{table}', database, table, cols, types)
        if ctx._conn.execute(convert_dml) != 0:
            raise Exception('Unable to load data')
        ctx.dropTable(tmp_db, f'tmp_{table}')
    else:
        # TODO: Next release. Write without cat
        self._format.read(PosixStorage(uri), database, table)
    return BaseTableDataTable(ctx, database, table).select('*')
def stats(self):
    sql = 'SELECT min(v) AS min, max(v) AS max, avg(v) AS avg, median(v) AS median FROM {vector}'
    return vulkn.Vulkn().q(sql.format(vector=self._ref))
#!/usr/bin/env python

# Copyright (c) 2019, Jason Godden <*****@*****.**>
# Copyright (c) 2019, VulknData Pty Ltd
#
# GNU General Public License v3.0 (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
# SPDX-License-Identifier: GPL-3.0-only

import re

import vulkn

vulkn.session.log.setLevel('DEBUG')

v = vulkn.Vulkn(host='localhost', http_port=8123, client='http')

k = None

def get_parameter_count(msg):
    # Parse the min/max argument counts out of a ClickHouse error message.
    if 'requires at least' in msg:
        return (1, None)
    m = re.match(r'.* passed (.*?), should be (.*?) or (.*?)\. \(', msg)
    if m is None:
        m = re.match(r'.* passed (.*?), should be (.*?) \(', msg)
        return (int(m[2]), 0)
    return (int(m[2]), int(m[3]))

all_funcs = v.table('system.functions').select('name').orderBy('is_aggregate', 'name')
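# For reference, the message shapes get_parameter_count() targets are the usual
# ClickHouse argument-count errors (exact wording varies by version; these are
# illustrative):
#
#   "... passed 0, should be 2 or 3. (...)"  -> (2, 3)
#   "... passed 0, should be 1 (...)"        -> (1, 0)
#   "... requires at least one argument ..." -> (1, None)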
#!/usr/bin/env python

# Copyright (c) 2019, Jason Godden <*****@*****.**>
# Copyright (c) 2019, VulknData Pty Ltd
#
# GNU General Public License v3.0 (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
# SPDX-License-Identifier: GPL-3.0-only

import vulkn
from vulkn.workspaces import LocalWorkSpace

ws = LocalWorkSpace(persist=False)
v = vulkn.Vulkn(host='localhost', port=9001)

all_funcs = v.table('system.functions').select('*').orderBy('is_aggregate', 'name')
funcs = all_funcs.where('not is_aggregate').exec().to_records()
agg_funcs = all_funcs.where('is_aggregate').exec().to_records()

for f in funcs:
    print(f"{f['name']},{f['alias_to']}")

#print(agg_funcs)

ws.stop()
#!/usr/bin/env python

# Copyright (c) 2019, Jason Godden <*****@*****.**>
# Copyright (c) 2019, VulknData Pty Ltd
#
# GNU General Public License v3.0 (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
# SPDX-License-Identifier: GPL-3.0-only

import vulkn
from vulkn.workspaces import LocalWorkSpace

ws = LocalWorkSpace(persist=False)
v = vulkn.Vulkn(host='localhost', http_port=8124)

all_funcs = v.table('system.functions').select('name').orderBy('is_aggregate', 'name')
funcs = all_funcs.where('not is_aggregate').exec().to_records()
agg_funcs = all_funcs.where('is_aggregate').exec().to_records()

for f in funcs:
    print(f"{f['name']}")

#print(agg_funcs)

k = {}

# Probe each function's return type by calling it with zero args, then one.
for f in funcs:
    fname = f['name']
    try:
        k[fname] = v.select(f'toTypeName({fname}()) AS t').r[0]['t']
    except:
        try:
            k[fname] = v.select(f'toTypeName({fname}(1)) AS t').r[0]['t']
        except:
            pass
def agg(self, agg_func):
    return vulkn.Vulkn().q('SELECT {agg_func}(v) FROM {vector}'.format(
        agg_func=agg_func, vector=self._ref))
def peek(self):
    sql = 'SELECT * FROM {vector}_1 WHERE i < 20 ORDER BY i LIMIT 20'
    return vulkn.Vulkn().q(sql.format(vector=self._ref))
    sys.exit(1)
sys.exit(0)

ce = None

if args.local:
    args.port = get_next_free_socket('127.0.0.1', list(range(9001, 10000)))
    args.http_port = get_next_free_socket('127.0.0.1', list(range(8124, 8999)))
    ce = LocalWorkSpace(persist=args.persist, name=args.name,
                        workspace=args.workspace, folio=args.folio,
                        port=args.port, http_port=args.http_port)

v = vulkn.Vulkn(host=args.host, port=args.port, user=args.user,
                password=args.password)

vulkn.session.log.setLevel(args.log_level)
vulkn.session.timing = args.timing

tags = [
    "The environmentally friendly real-time analytics engine powered by ClickHouse.",
    "The developer friendly real-time analytics engine powered by ClickHouse.",
    "Stop waiting for your queries to complete and start having fun.",
    "ClickHouse - an analytics database for the 21st century."
]

print(f"""Copyright (C) 2019,2020 Jason Godden / VulknData Pty Ltd.
Добро пожаловать to VULKИ version {VERSION}!""")
ce = None

if args.local:
    args.port = get_next_free_socket('127.0.0.1', list(range(9001, 10000)))
    args.http_port = get_next_free_socket('127.0.0.1', list(range(8124, 8999)))
    ce = LocalWorkSpace(persist=args.persist, name=args.name,
                        workspace=args.workspace, folio=args.folio,
                        port=args.port, http_port=args.http_port, insecure=True)

v = vulkn.Vulkn(host=args.host, port=args.port, http_port=args.http_port,
                user=args.user, password=args.password, client=args.client,
                insecure=args.insecure)

vulkn.session.log.setLevel(args.log_level)
vulkn.session.timing = args.timing

log.debug(args)

tags = [
    "The environmentally friendly real-time analytics engine powered by ClickHouse.",
    "The developer friendly real-time analytics engine powered by ClickHouse.",
    "Stop waiting for your queries to complete and start having fun.",
    "ClickHouse - an analytics database for the 21st century."
]