def __init__(self, spec_dir, benchmark_hosts, result_hosts, log, fail_if, sample_mode):
    self.benchmark_hosts = benchmark_hosts
    self.sample_mode = sample_mode
    self.spec_dir = spec_dir
    self.client = clients.client(benchmark_hosts)
    self.result_client = clients.client(result_hosts)
    self.server_version_info = aio.run(self.client.get_server_version)
    self.server_version = parse_version(self.server_version_info['number'])
    self.log = log
    self.create_result = partial(
        Result,
        version_info=self.server_version_info
    )
    if fail_if:
        self.fail_if = partial(eval_fail_if, fail_if)
    else:
        self.fail_if = lambda x: None
    if result_hosts:
        self.process_result = _result_to_crate(self.log, self.result_client)
    else:
        self.process_result = log.result

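# Hedged sketch (assumption, not from the original source): how the callables
# prepared in __init__ are typically combined for a single benchmark result.
# The result construction and its keyword arguments below are hypothetical.
#
#     result = self.create_result(statement=stmt, runtime_stats=stats.get())
#     self.fail_if(result)         # no-op unless a fail-if expression was given
#     self.process_result(result)  # log locally or store it in the result cluster
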
def test_reindex(self):
    crate_v3 = CrateNode(
        crate_dir=get_crate('3.x.x'),
        keep_data=True,
        settings=self.crate_settings
    )
    self._to_stop.append(crate_v3)
    crate_v3.start()
    with client(crate_v3.http_url) as c:
        aio.run(c.execute, "create table t (x int)")
        args = (
            (1, ),
            (2, ),
            (3, ),
        )
        aio.run(c.execute_many, "insert into t (x) values (?)", args)
    crate_v3.stop()
    self._to_stop.remove(crate_v3)

    crate_v4 = CrateNode(
        crate_dir=get_crate('4.0.3'),
        keep_data=True,
        settings=self.crate_settings
    )
    self._to_stop.append(crate_v4)
    crate_v4.start()
    reindex(crate_v4.http_url)
    with client(crate_v4.http_url) as c:
        result = aio.run(
            c.execute,
            "SELECT version FROM information_schema.tables WHERE table_name = 't'"
        )
        version = result['rows'][0][0]
        self.assertEqual(version, {'upgraded': None, 'created': '4.0.3'})
        cnt = aio.run(c.execute, 'SELECT count(*) FROM t')['rows'][0][0]
        self.assertEqual(cnt, 3)

def teardown(*args):
    try:
        with client(node.http_url) as c:
            # drop everything that setup() created
            aio.run(c.execute, 'drop table x.demo')
            aio.run(c.execute, 'drop table y.demo')
            aio.run(c.execute, 'drop blob table blobtable')
    finally:
        node.stop()

def insert_from_sql(src_uri=None,
                    query=None,
                    fetch_size=100,
                    concurrency=25,
                    table=None,
                    hosts=None,
                    output_fmt=None):
    """Insert data read from another SQL source into table."""
    stats = Stats()
    with clients.client(hosts, concurrency=concurrency) as client:
        f = partial(aio.measure, stats, client.execute_many)
        try:
            aio.run(
                async_insert_from_sql,
                src_uri,
                concurrency,
                query,
                fetch_size,
                table,
                f
            )
        except clients.SqlException as e:
            raise SystemExit(str(e))
    try:
        print(format_stats(stats.get(), output_fmt))
    except KeyError:
        if not stats.sampler.values:
            raise SystemExit('No data read from source')
        raise

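# Hedged usage sketch (assumption, not part of the original module): a direct
# call to insert_from_sql. The source URI, query, table, and host are made up
# for illustration; src_uri is whatever database URI async_insert_from_sql
# understands.
def _example_insert_from_sql():
    insert_from_sql(
        src_uri='postgresql://user:secret@localhost:5432/sourcedb',
        query='SELECT id, name FROM users',
        fetch_size=500,
        concurrency=10,
        table='doc.users',
        hosts='localhost:4200',
    )
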
def setup(*args):
    with client(node.http_url) as c:
        aio.run(
            c.execute,
            'create table x.demo (id int, name string, country string) \
            with (number_of_replicas = 0)'
        )
        aio.run(c.execute, 'create table y.demo (name text) with (number_of_replicas = 0)')
        aio.run(c.execute, 'create blob table blobtable with (number_of_replicas = 0)')

def insert_json(table=None,
                bulk_size=1000,
                concurrency=25,
                hosts=None,
                infile=None,
                output_fmt=None):
    """Insert JSON lines from a file or stdin into a CrateDB cluster.

    If no hosts are specified the statements will be printed.

    Args:
        table: Target table name.
        bulk_size: Bulk size of the insert statements.
        concurrency: Number of operations to run concurrently.
        hosts: hostname:port pairs of the Crate nodes
        infile: File object to read the JSON lines from (a file or stdin).
        output_fmt: Output format of the statistics summary.
    """
    if not hosts:
        return print_only(infile, table)

    queries = (to_insert(table, d) for d in dicts_from_lines(infile))
    bulk_queries = as_bulk_queries(queries, bulk_size)
    print('Executing inserts: bulk_size={} concurrency={}'.format(
        bulk_size, concurrency), file=sys.stderr)

    stats = Stats()
    with clients.client(hosts, concurrency=concurrency) as client:
        f = partial(aio.measure, stats, client.execute_many)
        try:
            aio.run_many(f, bulk_queries, concurrency)
        except clients.SqlException as e:
            raise SystemExit(str(e))
    try:
        print(format_stats(stats.get(), output_fmt))
    except KeyError:
        if not stats.sampler.values:
            raise SystemExit('No data received via stdin')
        raise

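# Hedged usage sketch (assumption, not part of the original module): feeding a
# JSON-lines file to insert_json directly from Python. The file name, table,
# and host below are hypothetical; with hosts=None the insert statements would
# only be printed instead of executed.
def _example_insert_json():
    with open('records.json') as infile:
        insert_json(
            table='doc.demo',
            bulk_size=500,
            hosts='localhost:4200',
            infile=infile,
        )
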
def insert_json(table=None,
                bulk_size=1000,
                concurrency=25,
                hosts=None,
                output_fmt=None):
    """Insert JSON lines read from stdin into a CrateDB cluster.

    If no hosts are specified the statements will be printed.

    Args:
        table: Target table name.
        bulk_size: Bulk size of the insert statements.
        concurrency: Number of operations to run concurrently.
        hosts: hostname:port pairs of the Crate nodes
    """
    if not hosts:
        return print_only(table)

    queries = (to_insert(table, d) for d in dicts_from_stdin())
    bulk_queries = as_bulk_queries(queries, bulk_size)
    print('Executing inserts: bulk_size={} concurrency={}'.format(
        bulk_size, concurrency), file=sys.stderr)

    stats = Stats()
    with clients.client(hosts, concurrency=concurrency) as client:
        f = partial(aio.measure, stats, client.execute_many)
        try:
            aio.run_many(f, bulk_queries, concurrency)
        except clients.SqlException as e:
            raise SystemExit(str(e))
    try:
        print(format_stats(stats.get(), output_fmt))
    except KeyError:
        if not stats.sampler.values:
            raise SystemExit('No data received via stdin')
        raise

def reindex(hosts=None):
    with clients.client(hosts) as client:
        run(_async_reindex, client)

def __init__(self, hosts, concurrency, sample_mode):
    self.concurrency = concurrency
    self.client = client(hosts, concurrency=concurrency)
    self.sampler = get_sampler(sample_mode)

def insert_fake_data(hosts=None,
                     table=None,
                     num_records=1e5,
                     bulk_size=1000,
                     concurrency=25,
                     mapping_file=None):
    """Generate random data and insert it into a table.

    This will read the table schema and then find suitable random data
    providers. Which provider is chosen depends on the column name and
    data type.

    Example:
        A column named `name` will map to the `name` provider.
        A column named `x` of type int will map to `random_int` because
        there is no `x` provider.

    Available providers are listed here:
        https://faker.readthedocs.io/en/latest/providers.html

    Additional providers:
        - auto_inc:
            Returns unique incrementing numbers.
            Automatically used for columns named "id" of type int or long
        - geo_point:
            Returns [<lon>, <lat>]
            Automatically used for columns of type geo_point

    Args:
        hosts: <host>:[<port>] of the Crate node
        table: The table name into which the data should be inserted.
            Either fully qualified: `<schema>.<table>` or just `<table>`
        num_records: Number of records to insert.
            Usually a number but expressions like `1e4` work as well.
        bulk_size: The bulk size of the insert statements.
        concurrency: How many operations to run concurrently.
        mapping_file: A JSON file that defines a mapping from column name
            to fake-factory provider. The format is as follows:
            {
                "column_name": ["provider_with_args", ["arg1", "arg"]],
                "x": ["provider_with_args", ["arg1"]],
                "y": "provider_without_args"
            }
    """
    with clients.client(hosts, concurrency=1) as client:
        schema, table_name = parse_table(table)
        columns = retrieve_columns(client, schema, table_name)
    if not columns:
        sys.exit('Could not find columns for table "{}"'.format(table))
    print('Found schema: ')
    print(json.dumps(columns, sort_keys=True, indent=4))
    mapping = None
    if mapping_file:
        mapping = json.load(mapping_file)
    bulk_size = min(num_records, bulk_size)
    num_inserts = int(math.ceil(num_records / bulk_size))
    gen_row = create_row_generator(columns, mapping)

    stmt = to_insert('"{schema}"."{table_name}"'.format(**locals()), columns)[0]
    print('Using insert statement: ')
    print(stmt)
    print('Will make {} requests with a bulk size of {}'.format(
        num_inserts, bulk_size))

    print('Generating fake data and executing inserts')
    q = asyncio.Queue(maxsize=concurrency)
    with clients.client(hosts, concurrency=concurrency) as client:
        active = [True]

        def stop():
            asyncio.ensure_future(q.put(None))
            active.clear()
            loop.remove_signal_handler(signal.SIGINT)

        if sys.platform != 'win32':
            loop.add_signal_handler(signal.SIGINT, stop)

        bulk_seq = _bulk_size_generator(num_records, bulk_size, active)
        with ThreadPoolExecutor() as e:
            tasks = asyncio.gather(
                _gen_data_and_insert(q, e, client, stmt, gen_row, bulk_seq),
                consume(q, total=num_inserts)
            )
            loop.run_until_complete(tasks)

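# Hedged usage sketch (assumption, not part of the original module): a mapping
# file in the format described in the docstring above, plus a direct call.
# Column names, provider arguments, file name, and host are hypothetical.
#
#   mapping.json:
#   {
#       "name": "name",
#       "x": ["random_int", [0, 100]]
#   }
def _example_insert_fake_data():
    with open('mapping.json') as mapping_file:
        insert_fake_data(
            hosts='localhost:4200',
            table='doc.demo',
            num_records=1e4,
            bulk_size=500,
            concurrency=10,
            mapping_file=mapping_file,
        )
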