def exec_instructions(self, instructions):
    # Execute all setup statements first: those read from statement files,
    # followed by inline statements from the spec.
    filenames = instructions.statement_files
    filenames = (os.path.join(self.spec_dir, i) for i in filenames)
    lines = (line for fn in filenames for line in get_lines(fn))
    statements = itertools.chain(as_statements(lines), instructions.statements)
    for stmt in statements:
        aio.run(self.client.execute, stmt)

    # Load data files: turn each file into bulk insert queries and run
    # them concurrently.
    for data_file in instructions.data_files:
        inserts = as_bulk_queries(self._to_inserts(data_file),
                                  data_file.get('bulk_size', 5000))
        concurrency = data_file.get('concurrency', 25)
        aio.run_many(self.client.execute_many, inserts, concurrency=concurrency)
        if self.client.is_cratedb:
            aio.run(self.client.execute, f"refresh table {data_file['target']}")

    # Data commands stream rows from a subprocess's stdout (one dict per
    # line via dicts_from_lines) and insert them the same way.
    for data_cmd in instructions.data_cmds:
        process = subprocess.Popen(data_cmd['cmd'],
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        target = data_cmd['target']
        dicts = dicts_from_lines(process.stdout)
        inserts = as_bulk_queries((to_insert(target, d) for d in dicts),
                                  data_cmd.get('bulk_size', 5000))
        concurrency = data_cmd.get('concurrency', 25)
        aio.run_many(self.client.execute_many, inserts, concurrency=concurrency)
        if self.client.is_cratedb:
            aio.run(self.client.execute, f"refresh table {target}")
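# Hedged sketch: `as_bulk_queries` above batches an iterable of
# (statement, args) pairs into (statement, bulk_args) requests. A minimal
# stand-in with the same shape might look like this (it assumes pairs with
# identical statements arrive adjacent to each other; the real cr8 helper
# may differ in detail):
import itertools

def _as_bulk_queries_sketch(inserts, bulk_size):
    for stmt, group in itertools.groupby(inserts, key=lambda pair: pair[0]):
        args = (pair[1] for pair in group)
        while True:
            batch = list(itertools.islice(args, bulk_size))
            if not batch:
                break
            yield stmt, batch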
def save_result(result):
    # `table_created` is a shared list used as a once-flag: create the
    # benchmarks table for the first result, then mark it as existing.
    if not table_created:
        aio.run(client.execute, BENCHMARK_TABLE)
        table_created.append(None)
    stmt, args = to_insert('benchmarks', result.as_dict())
    aio.run(client.execute, stmt, args)
    log.result(result)
def insert_data(conn, schema, table, num_rows):
    cols = columns_for_table(conn, schema, table)
    # to_insert returns (statement, args); only the statement is needed
    # here because the rows are generated below.
    stmt, _ = to_insert(f'"{schema}"."{table}"', cols)
    gen_row = create_row_generator(cols)
    c = conn.cursor()
    c.executemany(stmt, [gen_row() for _ in range(num_rows)])
    # Make the inserted rows visible to subsequent queries.
    c.execute(f'REFRESH TABLE "{schema}"."{table}"')
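# Usage sketch for insert_data, assuming the `crate` Python client and an
# illustrative host/schema/table (none of these are part of the original
# module):
#
#   from crate import client
#   conn = client.connect('http://localhost:4200')
#   insert_data(conn, 'doc', 'users', num_rows=1000)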
def _to_inserts(self, data_spec):
    target = data_spec['target']
    source = os.path.join(self.spec_dir, data_spec['source'])
    dicts = dicts_from_lines(get_lines(source))
    return (to_insert(target, d) for d in dicts)
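# Hedged sketch: `to_insert` (used above and throughout this module) maps a
# target table plus a row dict to a parameterized statement and its
# arguments. This stand-in illustrates the contract; the real helper may
# differ in quoting and parameter style.
def _to_insert_sketch(table, d):
    cols = list(d.keys())
    placeholders = ', '.join('?' for _ in cols)
    stmt = f'INSERT INTO {table} ({", ".join(cols)}) VALUES ({placeholders})'
    return stmt, tuple(d[c] for c in cols)

# _to_insert_sketch('doc.users', {'id': 1, 'name': 'Ann'})
# -> ('INSERT INTO doc.users (id, name) VALUES (?, ?)', (1, 'Ann'))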
def insert_fake_data(hosts=None,
                     table=None,
                     num_records=1e5,
                     bulk_size=1000,
                     concurrency=25,
                     mapping_file=None):
    """Generate random data and insert it into a table.

    This will read the table schema and then find suitable random data
    providers. Which provider is chosen depends on the column name and
    data type.

    Example:
        A column named `name` will map to the `name` provider.
        A column named `x` of type int will map to `random_int` because
        there is no `x` provider.

    Available providers are listed here:
    https://faker.readthedocs.io/en/latest/providers.html

    Additional providers:

    - auto_inc: Returns unique incrementing numbers.
      Automatically used for columns named "id" of type int or long.
    - geo_point: Returns [<lon>, <lat>].
      Automatically used for columns of type geo_point.

    Args:
        hosts: <host>:[<port>] of the Crate node
        table: The table name into which the data should be inserted.
            Either fully qualified: `<schema>.<table>` or just `<table>`
        num_records: Number of records to insert.
            Usually a number but expressions like `1e4` work as well.
        bulk_size: The bulk size of the insert statements.
        concurrency: How many operations to run concurrently.
        mapping_file: A JSON file that defines a mapping from column name
            to fake-factory provider. The format is as follows:

            {
                "column_name": ["provider_with_args", ["arg1", "arg2"]],
                "x": ["provider_with_args", ["arg1"]],
                "y": "provider_without_args"
            }
    """
    with clients.client(hosts, concurrency=1) as client:
        schema, table_name = parse_table(table)
        columns = retrieve_columns(client, schema, table_name)
    if not columns:
        sys.exit('Could not find columns for table "{}"'.format(table))
    print('Found schema: ')
    columns_dict = {r.name: r.type_name for r in columns}
    print(json.dumps(columns_dict, sort_keys=True, indent=4))
    mapping = None
    if mapping_file:
        mapping = json.load(mapping_file)
    bulk_size = min(num_records, bulk_size)
    num_inserts = int(math.ceil(num_records / bulk_size))
    gen_row = create_row_generator(columns, mapping)
    stmt = to_insert(f'"{schema}"."{table_name}"', columns_dict)[0]
    print('Using insert statement: ')
    print(stmt)
    print('Will make {} requests with a bulk size of {}'.format(
        num_inserts, bulk_size))
    print('Generating fake data and executing inserts')
    q = asyncio.Queue(maxsize=concurrency)
    with clients.client(hosts, concurrency=concurrency) as client:
        # `active` is a list used as a clearable flag so the SIGINT handler
        # can stop the bulk-size generator; `loop` is assumed to be the
        # module-level asyncio event loop.
        active = [True]

        def stop():
            asyncio.ensure_future(q.put(None))
            active.clear()
            loop.remove_signal_handler(signal.SIGINT)

        if sys.platform != 'win32':
            loop.add_signal_handler(signal.SIGINT, stop)
        bulk_seq = _bulk_size_generator(num_records, bulk_size, active)
        with ThreadPoolExecutor() as e:
            tasks = asyncio.gather(
                _gen_data_and_insert(q, e, client, stmt, gen_row, bulk_seq),
                consume(q, total=num_inserts))
            loop.run_until_complete(tasks)
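# Illustrative mapping file for insert_fake_data, following the JSON format
# documented in the docstring above. Column and provider names are examples
# only, and whether a provider's args are applied positionally is an
# assumption here:
#
#   {
#       "name": "name",
#       "age": ["random_int", [18, 99]]
#   }
#
# insert_fake_data(hosts='localhost:4200', table='doc.users',
#                  num_records=1e4, mapping_file=open('mapping.json'))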