Example #1
    def exec_instructions(self, instructions):
        # Collect SQL statements from the referenced statement files plus any
        # inline statements and execute them one after another.
        filenames = instructions.statement_files
        filenames = (os.path.join(self.spec_dir, i) for i in filenames)
        lines = (line for fn in filenames for line in get_lines(fn))
        statements = itertools.chain(as_statements(lines),
                                     instructions.statements)
        for stmt in statements:
            aio.run(self.client.execute, stmt)

        # Bulk-insert the rows described by each data file, then refresh the
        # target table on CrateDB so the inserted rows become visible.
        for data_file in instructions.data_files:
            inserts = as_bulk_queries(self._to_inserts(data_file),
                                      data_file.get('bulk_size', 5000))
            concurrency = data_file.get('concurrency', 25)
            aio.run_many(self.client.execute_many,
                         inserts,
                         concurrency=concurrency)
            if self.client.is_cratedb:
                aio.run(self.client.execute,
                        f"refresh table {data_file['target']}")

        # Same pattern, but the rows are read from the stdout of an external
        # command.
        for data_cmd in instructions.data_cmds:
            process = subprocess.Popen(data_cmd['cmd'],
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True)
            target = data_cmd['target']
            dicts = dicts_from_lines(process.stdout)
            inserts = as_bulk_queries((to_insert(target, d) for d in dicts),
                                      data_cmd.get('bulk_size', 5000))
            concurrency = data_cmd.get('concurrency', 25)
            aio.run_many(self.client.execute_many,
                         inserts,
                         concurrency=concurrency)
            if self.client.is_cratedb:
                aio.run(self.client.execute, f"refresh table {target}")
Example #2
 def save_result(result):
     # client, table_created, BENCHMARK_TABLE and log come from the enclosing
     # scope; the empty table_created list acts as a run-once flag so the
     # benchmarks table is created only before the first insert.
     if not table_created:
         aio.run(client.execute, BENCHMARK_TABLE)
         table_created.append(None)
     stmt, args = to_insert('benchmarks', result.as_dict())
     aio.run(client.execute, stmt, args)
     log.result(result)
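
aio.run and aio.run_many are small wrappers that these snippets take for granted. A hedged sketch of what such a module could provide, assuming run drives one coroutine function to completion and run_many fans argument tuples out over a bounded number of concurrent calls:

import asyncio

def run(coro_fn, *args, **kwargs):
    # Drive a single coroutine function to completion.
    return asyncio.run(coro_fn(*args, **kwargs))

def run_many(coro_fn, iterable, concurrency=25):
    # Call coro_fn(*args) for every args tuple in iterable, keeping at most
    # `concurrency` calls in flight at a time.
    async def _all():
        sem = asyncio.Semaphore(concurrency)

        async def _one(args):
            async with sem:
                await coro_fn(*args)

        await asyncio.gather(*(_one(args) for args in iterable))

    asyncio.run(_all())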
Example #3
def insert_data(conn, schema, table, num_rows):
    # Generate num_rows random rows for the table's columns, insert them in
    # one executemany call, then refresh the table so the rows are visible.
    cols = columns_for_table(conn, schema, table)
    stmt, args = to_insert(f'"{schema}"."{table}"', cols)
    gen_row = create_row_generator(cols)
    c = conn.cursor()
    c.executemany(stmt, [gen_row() for x in range(num_rows)])
    c.execute(f'REFRESH TABLE "{schema}"."{table}"')
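
create_row_generator is used here and in the last example but not shown. A minimal sketch of the idea, assuming each column record exposes a name attribute, that providers are looked up on a Faker instance by column name, and that random_int is the fallback (the fallback choice and the plain-string mapping handling are assumptions):

from faker import Faker

def create_row_generator(columns, mapping=None):
    # Hypothetical sketch: pick a Faker provider per column, preferring an
    # explicit mapping entry (plain string form only), then the column name,
    # and falling back to random_int when nothing matches.
    fake = Faker()
    mapping = mapping or {}

    def provider_for(col):
        provider_name = mapping.get(col.name, col.name)
        return getattr(fake, provider_name, fake.random_int)

    providers = [provider_for(col) for col in columns]
    return lambda: [provider() for provider in providers]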
Example #4
 def _to_inserts(self, data_spec):
     # Resolve the source file relative to the spec directory and turn each
     # of its records into an (insert statement, args) pair for the target.
     target = data_spec['target']
     source = os.path.join(self.spec_dir, data_spec['source'])
     dicts = dicts_from_lines(get_lines(source))
     return (to_insert(target, d) for d in dicts)
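
dicts_from_lines and get_lines are likewise assumed by these snippets. A plausible sketch, assuming the source files and command output contain one JSON object per line (that format is an assumption):

import json

def get_lines(path):
    # Hypothetical sketch: yield the lines of a text file one by one.
    with open(path, encoding='utf-8') as f:
        yield from f

def dicts_from_lines(lines):
    # Hypothetical sketch: parse every non-empty line as a JSON object.
    for line in lines:
        line = line.strip()
        if line:
            yield json.loads(line)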
Example #5
def insert_fake_data(hosts=None,
                     table=None,
                     num_records=1e5,
                     bulk_size=1000,
                     concurrency=25,
                     mapping_file=None):
    """Generate random data and insert it into a table.

    This will read the table schema and then find suitable random data providers.
    Which provider is chosen depends on the column name and data type.

    Example:

        A column named `name` will map to the `name` provider.
        A column named `x` of type int will map to `random_int` because there
        is no `x` provider.

    Available providers are listed here:
        https://faker.readthedocs.io/en/latest/providers.html

        Additional providers:
        - auto_inc:
            Returns unique incrementing numbers.
            Automatically used for columns named "id" of type int or long.
        - geo_point:
            Returns [<lon>, <lat>].
            Automatically used for columns of type geo_point.

    Args:
        hosts: <host>:[<port>] of the Crate node
        table: The table name into which the data should be inserted.
            Either fully qualified: `<schema>.<table>` or just `<table>`
        num_records: Number of records to insert.
            Usually a number but expressions like `1e4` work as well.
        bulk_size: The bulk size of the insert statements.
        concurrency: How many operations to run concurrently.
        mapping_file: A JSON file that defines a mapping from column name to
            fake-factory provider.
            The format is as follows:
            {
                "column_name": ["provider_with_args", ["arg1", "arg"]],
                "x": ["provider_with_args", ["arg1"]],
                "y": "provider_without_args"
            }
    """
    with clients.client(hosts, concurrency=1) as client:
        schema, table_name = parse_table(table)
        columns = retrieve_columns(client, schema, table_name)
    if not columns:
        sys.exit('Could not find columns for table "{}"'.format(table))
    print('Found schema: ')
    columns_dict = {r.name: r.type_name for r in columns}
    print(json.dumps(columns_dict, sort_keys=True, indent=4))
    mapping = None
    if mapping_file:
        mapping = json.load(mapping_file)

    bulk_size = min(num_records, bulk_size)
    num_inserts = int(math.ceil(num_records / bulk_size))

    gen_row = create_row_generator(columns, mapping)

    stmt = to_insert('"{schema}"."{table_name}"'.format(**locals()),
                     columns_dict)[0]
    print('Using insert statement: ')
    print(stmt)

    print('Will make {} requests with a bulk size of {}'.format(
        num_inserts, bulk_size))

    print('Generating fake data and executing inserts')
    q = asyncio.Queue(maxsize=concurrency)
    # `loop` below refers to the asyncio event loop, assumed to be set up
    # elsewhere in the module.
    with clients.client(hosts, concurrency=concurrency) as client:
        active = [True]  # mutable flag; the SIGINT handler clears it to stop

        def stop():
            # Put a sentinel on the queue so the consumer exits, stop
            # generating further bulks, and restore default SIGINT handling.
            asyncio.ensure_future(q.put(None))
            active.clear()
            loop.remove_signal_handler(signal.SIGINT)

        if sys.platform != 'win32':
            loop.add_signal_handler(signal.SIGINT, stop)
        bulk_seq = _bulk_size_generator(num_records, bulk_size, active)
        with ThreadPoolExecutor() as e:
            tasks = asyncio.gather(
                _gen_data_and_insert(q, e, client, stmt, gen_row, bulk_seq),
                consume(q, total=num_inserts))
            loop.run_until_complete(tasks)
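
A hedged usage sketch of insert_fake_data; the host and table are placeholders, and mapping_file, when used, should be an open file object since it is passed straight to json.load:

# Hypothetical invocation against a local CrateDB node.
insert_fake_data(hosts='localhost:4200',
                 table='doc.users',
                 num_records=1e4,
                 bulk_size=500,
                 concurrency=10)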