Example #1
def connect(self, data_directory):
    path = os.environ.get('IBIS_TEST_SQLITE_DATABASE',
                          data_directory / 'ibis_testing.db')
    path = Path(path)
    if not path.exists():
        pytest.skip('SQLite testing db {} does not exist'.format(path))
    return ibis.sqlite.connect(str(path))
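
A minimal sketch of the same override pattern, standard library only: the
environment variable, when set, replaces a pathlib-built default (the
directory layout here is hypothetical).

import os
from pathlib import Path

data_directory = Path('ci') / 'ibis-testing-data'  # hypothetical layout
path = Path(os.environ.get('IBIS_TEST_SQLITE_DATABASE',
                           data_directory / 'ibis_testing.db'))
print(path, path.exists())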
Example #2
def update(meta, source_path):
    path = Path(meta)

    click.echo('Updating {} recipe...'.format(path.parent))

    content = render(path)
    recipe = ruamel.yaml.round_trip_load(content)

    # update the necessary fields, skip leading 'v' in the version
    recipe['package']['version'] = ibis.__version__[1:]
    recipe['source'] = {'path': source_path}

    # XXX: render() drops the {{ PYTHON }} variable, so restore the script
    recipe['build']['script'] = SCRIPT

    updated_content = ruamel.yaml.round_trip_dump(recipe,
                                                  default_flow_style=False,
                                                  width=sys.maxsize).strip()

    if PY2:
        updated_content = updated_content.decode('utf-8')

    click.echo(updated_content)

    path.write_text(updated_content)
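
A self-contained sketch of the round-trip pattern used above, assuming only
ruamel.yaml: round_trip_load/round_trip_dump preserve comments and key
order, so an edited recipe produces a minimal diff.

import ruamel.yaml

content = """\
package:
  name: ibis-framework  # conda package name
  version: 0.14.0
"""
recipe = ruamel.yaml.round_trip_load(content)
recipe['package']['version'] = '0.15.0'
print(ruamel.yaml.round_trip_dump(recipe, default_flow_style=False))
# The inline comment on the name line survives the edit.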
Example #3
def data_directory():
    root = Path(__file__).absolute().parents[3]

    default = root / 'ci' / 'ibis-testing-data'
    datadir = os.environ.get('IBIS_TEST_DATA_DIRECTORY', default)
    datadir = Path(datadir)

    if not datadir.exists():
        pytest.skip('test data directory not found')

    return datadir
Example #4
def sqlite(database, schema, tables, data_directory, **params):
    database = Path(database)
    data_directory = Path(data_directory)
    logger.info('Initializing SQLite...')

    try:
        database.unlink()
    except OSError:
        pass

    params['database'] = str(database)
    engine = init_database('sqlite', params, schema, recreate=False)
    insert_tables(engine, tables, data_directory)
Example #5
def parquet(tables, data_directory, ignore_missing_dependency, **params):
    try:
        import pyarrow as pa  # noqa: F401
        import pyarrow.parquet as pq  # noqa: F401
    except ImportError:
        msg = 'PyArrow dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    data_directory = Path(data_directory)
    for table, df in read_tables(tables, data_directory):
        if table == 'functional_alltypes':
            schema = pa.schema([
                pa.field('string_col', pa.string()),
                pa.field('date_string_col', pa.string())
            ])
        else:
            schema = None
        arrow_table = pa.Table.from_pandas(df, schema=schema)
        target_path = data_directory / '{}.parquet'.format(table)

        pq.write_table(arrow_table, str(target_path))
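
A reduced sketch of the pandas to Arrow to Parquet round trip above,
assuming pyarrow and pandas are installed (toy frame, hypothetical path):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'string_col': ['a', 'b'], 'double_col': [1.0, 2.0]})
pq.write_table(pa.Table.from_pandas(df), 'example.parquet')
print(pq.read_table('example.parquet').to_pandas())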
Example #6
def mysql(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    click.echo('Initializing MySQL...')
    engine = init_database('mysql+pymysql',
                           params,
                           schema,
                           isolation_level='AUTOCOMMIT')
    insert_tables(engine, tables, data_directory)
Example #7
def mysql(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    logger.info('Initializing MySQL...')
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        engine = init_database('mysql+pymysql', params, schema,
                               isolation_level='AUTOCOMMIT')
    insert_tables(engine, tables, data_directory)
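
The catch_warnings block above scopes the suppression to the noisy driver
setup; a minimal sketch, standard library only:

import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    warnings.warn('noisy driver warning')  # suppressed
warnings.warn('visible again')             # emitted normally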
Example #8
def clone(repo_uri, destination):
    if Path(destination).exists():
        return

    cmd = git['clone', repo_uri, destination]

    cmd(stdout=click.get_binary_stream('stdout'),
        stderr=click.get_binary_stream('stderr'))
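
A sketch of the plumbum idiom above, assuming plumbum and a git binary on
PATH: indexing a command object builds the argv, and calling it runs the
process and returns its stdout.

from plumbum.cmd import git

cmd = git['--version']  # bound command, not yet executed
print(cmd())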
Example #9
def update(meta, source_path):
    path = Path(meta)

    click.echo('\nUpdating {} recipe...'.format(path.parent))

    content = render(path)
    recipe = ruamel.yaml.round_trip_load(content)

    # update the necessary fields, skip leading 'v' in the version
    recipe['package']['version'] = ibis.__version__[1:]
    recipe['source'] = {'path': source_path}

    updated_content = ruamel.yaml.round_trip_dump(recipe,
                                                  default_flow_style=False)

    if PY2:
        updated_content = updated_content.decode('utf-8')

    path.write_text(updated_content)
Example #10
def download(base_url, directory, name):
    directory = Path(directory)
    if not directory.exists():
        directory.mkdir()

    data_url = '{}/{}'.format(base_url, name)
    path = directory / name

    if not path.exists():
        download = curl[data_url, '-o', path, '-L']
        download(stdout=click.get_binary_stream('stdout'),
                 stderr=click.get_binary_stream('stderr'))
    else:
        logger.info('Skipping download: %s already exists', name)

    logger.info('Extracting archive to %s', directory)
    if path.suffix in ('.tar', '.gz', '.bz2', '.xz'):
        # 'r:*' auto-detects the compression, so every suffix checked
        # above extracts, not just gzip
        with tarfile.open(str(path), mode='r:*') as f:
            f.extractall(path=str(directory))
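
A self-contained sketch of the extraction step, standard library only: a
tiny archive is built, then opened with mode='r:*', which auto-detects the
compression.

import tarfile
from pathlib import Path

Path('payload.txt').write_text('hello')
with tarfile.open('demo.tar.gz', 'w:gz') as f:
    f.add('payload.txt')
with tarfile.open('demo.tar.gz', mode='r:*') as f:
    f.extractall(path='demo')
print(Path('demo/payload.txt').read_text())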
Example #11
def deploy(package_location, artifact_directory, architectures):
    artifact_dir = Path(artifact_directory)
    artifact_dir.mkdir(parents=True, exist_ok=True)
    package_loc = Path(package_location)
    assert package_loc.exists(), 'Path {} does not exist'.format(package_loc)

    for architecture in architectures:
        arch_artifact_directory = str(artifact_dir / architecture)
        arch_package_directory = str(package_loc / architecture)
        shutil.copytree(arch_package_directory, arch_artifact_directory)
    cmd = conda['index', artifact_directory]
    cmd(stdout=click.get_binary_stream('stdout'),
        stderr=click.get_binary_stream('stderr'))
Example #12
def clickhouse(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    logger.info('Initializing ClickHouse...')
    engine = init_database('clickhouse+native', params, schema)

    for table, df in read_tables(tables, data_directory):
        if table == 'batting':
            # float nan problem
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        elif table == 'awards_players':
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        insert(engine, table, df)
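
The NaN/None clean-up applied to the 'batting' frame above, as a standalone
sketch assuming only pandas and numpy: float NaNs become 0 (then int), and
missing strings become ''.

import numpy as np
import pandas as pd

df = pd.DataFrame({'stint': [1.0, np.nan], 'team': ['BOS', None]})
float_cols = df.select_dtypes([float]).columns
df[float_cols] = df[float_cols].fillna(0).astype(int)
obj_cols = df.select_dtypes([object]).columns
df[obj_cols] = df[obj_cols].fillna('')
print(df)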
Example #13
def postgres(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    logger.info('Initializing PostgreSQL...')
    engine = init_database('postgresql', params, schema,
                           isolation_level='AUTOCOMMIT')

    query = "COPY {} FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')"
    database = params['database']
    for table in tables:
        src = data_directory / '{}.csv'.format(table)
        load = psql['--host', params['host'], '--port', params['port'],
                    '--username', params['user'], '--dbname', database,
                    '--command', query.format(table)]
        with local.env(PGPASSWORD=params['password']):
            with src.open('r') as f:
                load(stdin=f)

    engine.execute('VACUUM FULL ANALYZE')
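
The local.env context manager above sets PGPASSWORD only for the psql
invocations inside the block; a minimal sketch, assuming plumbum:

from plumbum import local

with local.env(PGPASSWORD='secret'):
    print(local.env['PGPASSWORD'])  # visible to commands run here
print('PGPASSWORD' in local.env)    # back to the outer environment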
Example #14
def bigquery(data_directory, ignore_missing_dependency, **params):
    try:
        import google.api_core.exceptions
        from google.cloud import bigquery
    except ImportError:
        msg = 'google-cloud-bigquery dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
    bqclient = bigquery.Client(project=project_id)

    # Create testing dataset.
    testing_dataset = bqclient.dataset('testing')
    try:
        bqclient.create_dataset(bigquery.Dataset(testing_dataset))
    except google.api_core.exceptions.Conflict:
        pass  # Skip if already created.

    # Set up main data table.
    data_directory = Path(data_directory)
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    functional_alltypes_schema = []
    schema_path = data_directory / 'functional_alltypes_bigquery_schema.json'
    with open(str(schema_path)) as schemafile:
        schema_json = json.load(schemafile)
        for field in schema_json:
            functional_alltypes_schema.append(
                bigquery.SchemaField.from_api_repr(field))
    load_config = bigquery.LoadJobConfig()
    load_config.skip_leading_rows = 1  # skip the header row.
    load_config.schema = functional_alltypes_schema

    # Load main data table.
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes'),
            job_config=load_config).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Load an ingestion time partitioned table.
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        load_config.time_partitioning = bigquery.TimePartitioning()
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes_parted'),
            job_config=load_config).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create a table with complex data types (nested and repeated).
    struct_table_path = data_directory / 'struct_table.avro'
    with open(str(struct_table_path), 'rb') as avrofile:
        load_config = bigquery.LoadJobConfig()
        load_config.source_format = 'AVRO'
        job = bqclient.load_table_from_file(
            avrofile,
            testing_dataset.table('struct_table'),
            job_config=load_config).result()  # wait, so error_result is set

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create empty date-partitioned table.
    date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
    date_table.schema = [
        bigquery.SchemaField('my_date_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    date_table.time_partitioning = bigquery.TimePartitioning(
        field='my_date_parted_col')
    bqclient.create_table(date_table)

    # Create an empty timestamp-partitioned table.
    timestamp_table = bigquery.Table(
        testing_dataset.table('timestamp_column_parted'))
    timestamp_table.schema = [
        bigquery.SchemaField('my_timestamp_parted_col', 'TIMESTAMP'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    timestamp_table.time_partitioning = bigquery.TimePartitioning(
        field='my_timestamp_parted_col')
    bqclient.create_table(timestamp_table)

    # Create a table with a numeric column
    numeric_table = bigquery.Table(testing_dataset.table('numeric_table'))
    numeric_table.schema = [
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('numeric_col', 'NUMERIC'),
    ]
    bqclient.create_table(numeric_table)

    df = pd.read_csv(
        str(data_directory / 'functional_alltypes.csv'),
        usecols=['string_col', 'double_col'],
        header=0,
    )
    with tempfile.NamedTemporaryFile(mode='a+b') as csvfile:
        df.to_csv(csvfile, header=False, index=False)
        csvfile.seek(0)

        load_config = bigquery.LoadJobConfig()
        # the CSV above was written without a header, so skip no rows
        load_config.schema = numeric_table.schema

        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('numeric_table'),
            job_config=load_config).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))
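
A reduced sketch of the schema-loading step near the top, assuming
google-cloud-bigquery is installed (the two fields here are hypothetical):

from google.cloud import bigquery

schema_json = [
    {'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
    {'name': 'string_col', 'type': 'STRING', 'mode': 'NULLABLE'},
]
schema = [bigquery.SchemaField.from_api_repr(field) for field in schema_json]
print(schema)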
Example #15
def mapd(schema, tables, data_directory, **params):
    if sys.version_info.major < 3:
        logger.info('MapD backend is unavailable for Python 2.')
        return

    import pymapd

    data_directory = Path(data_directory)
    reserved_words = ['table', 'year', 'month']

    # connection
    logger.info('Initializing MapD...')
    if params['database'] != 'mapd':
        conn = pymapd.connect(host=params['host'],
                              user=params['user'],
                              password=params['password'],
                              port=params['port'],
                              dbname='mapd')
        stmt = 'CREATE DATABASE {}'.format(params['database'])
        try:
            conn.execute(stmt)
        except Exception:
            logger.exception('MapD DDL statement %r failed', stmt)
        conn.close()

    conn = pymapd.connect(host=params['host'],
                          user=params['user'],
                          password=params['password'],
                          port=params['port'],
                          dbname=params['database'])

    # create tables
    for stmt in filter(None, map(str.strip, schema.read().split(';'))):
        try:
            conn.execute(stmt)
        except Exception:
            logger.exception('MapD DDL statement \n%r\n failed', stmt)

    # import data
    for table, df in read_tables(tables, data_directory):
        if table == 'batting':
            # float nan problem
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)

            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        elif table == 'awards_players':
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')

        # rename fields
        for df_col in df.columns:
            if ' ' in df_col or ':' in df_col:
                column = df_col.replace(' ', '_').replace(':', '_')
            elif df_col in reserved_words:
                column = '{}_'.format(df_col)
            else:
                continue
            df.rename(columns={df_col: column}, inplace=True)
        conn.load_table_columnar(table, df)

    conn.close()
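
The column-renaming rule in the import loop, as a standalone sketch with
hypothetical column names: spaces and colons become underscores, and
reserved words get a trailing underscore.

reserved_words = ['table', 'year', 'month']

def safe_column(name):
    if ' ' in name or ':' in name:
        return name.replace(' ', '_').replace(':', '_')
    if name in reserved_words:
        return name + '_'
    return name

print([safe_column(c) for c in ['year', 'home runs', 'hr:total', 'hits']])
# ['year_', 'home_runs', 'hr_total', 'hits']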
Example #16
def mapd(schema, tables, data_directory, **params):
    if sys.version_info[0] < 3:
        click.echo('[MAPD|EE] MapD backend is unavailable for Python 2.')
        return

    import pymapd

    data_directory = Path(data_directory)
    reserved_words = ['table', 'year', 'month']

    # connection
    click.echo('Initializing MapD...')
    if params['database'] != 'mapd':
        conn = pymapd.connect(host=params['host'],
                              user=params['user'],
                              password=params['password'],
                              port=params['port'],
                              dbname='mapd')
        try:
            conn.execute('CREATE DATABASE {}'.format(params['database']))
        except Exception as e:
            click.echo('[MAPD|WW]{}'.format(e))
        conn.close()

    conn = pymapd.connect(host=params['host'],
                          user=params['user'],
                          password=params['password'],
                          port=params['port'],
                          dbname=params['database'])

    # create tables
    for stmt in schema.read().split(';'):
        stmt = stmt.strip()
        if stmt:
            try:
                conn.execute(stmt)
            except Exception as e:
                click.echo('[MAPD|WW] {}'.format(str(e)))
    click.echo('[MAPD|II] Creating tables ... OK')

    # import data
    click.echo('[MAPD|II] Loading data ...')
    for table, df in read_tables(tables, data_directory):
        if table == 'batting':
            # float nan problem
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        elif table == 'awards_players':
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')

        # rename fields
        for df_col in df.columns:
            if ' ' in df_col or ':' in df_col:
                column = df_col.replace(' ', '_').replace(':', '_')
            elif df_col in reserved_words:
                column = '{}_'.format(df_col)
            else:
                continue
            df.rename(columns={df_col: column}, inplace=True)
        conn.load_table_columnar(table, df)

    conn.close()

    click.echo('[MAPD|II] Done!')
Example #17
import os
import random
import shutil
import sys
import tempfile

import click
import ruamel.yaml

from jinja2 import Environment, FileSystemLoader
from plumbum.cmd import git, conda

import ibis
from ibis.compat import Path, PY2

IBIS_DIR = Path(__file__).parent.parent.absolute()


def render(path):
    env = Environment(loader=FileSystemLoader(str(path.parent)))
    template = env.get_template(path.name)
    return template.render()


@click.group()
def cli():
    pass


default_repo = 'https://github.com/conda-forge/ibis-framework-feedstock'
default_dest = os.path.join(
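
render() above wraps a small Jinja2 pattern: the loader points at the
recipe's parent directory and the template is rendered with no variables.
A throwaway demonstration, assuming only Jinja2 (hypothetical recipe
directory):

from pathlib import Path
from jinja2 import Environment, FileSystemLoader

tmp = Path('recipe_example')  # hypothetical recipe directory
tmp.mkdir(exist_ok=True)
(tmp / 'meta.yaml').write_text('version: {{ 1 + 1 }}\n')

env = Environment(loader=FileSystemLoader(str(tmp)))
print(env.get_template('meta.yaml').render())  # version: 2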
Example #18
import os
import warnings

import click
import six

import pandas as pd
import sqlalchemy as sa

from toolz import dissoc
from plumbum import local
from plumbum.cmd import curl, psql

import ibis
from ibis.compat import Path

SCRIPT_DIR = Path(__file__).parent.absolute()
DATA_DIR = Path(
    os.environ.get('IBIS_TEST_DATA_DIRECTORY',
                   SCRIPT_DIR / 'ibis-testing-data'))

TEST_TABLES = ['functional_alltypes', 'diamonds', 'batting', 'awards_players']

logger = ibis.util.get_logger('datamgr')


def recreate_database(driver, params, **kwargs):
    url = sa.engine.url.URL(driver, **dissoc(params, 'database'))
    engine = sa.create_engine(url, **kwargs)

    with engine.connect() as conn:
        conn.execute('DROP DATABASE IF EXISTS {}'.format(params['database']))
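
A sketch of the URL construction in recreate_database, assuming
SQLAlchemy 1.3-style URLs: toolz.dissoc drops the 'database' key so the
engine connects at the server level before dropping or creating databases
(the credentials here are hypothetical).

import sqlalchemy as sa
from toolz import dissoc

params = {'host': 'localhost', 'username': 'postgres',
          'password': 'secret', 'database': 'ibis_testing'}
url = sa.engine.url.URL('postgresql', **dissoc(params, 'database'))
print(url)  # e.g. postgresql://postgres:secret@localhost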
Example #19
def __init__(self, root):
    self.root = Path(str(root))
    self.dictionary = {}