Example #1
def stream_insert(args):
    """Stream rows from a source query into a target table, batch by batch."""
    (from_db, sql, to_db, tgt_table) = args
    from xutil import get_conn

    # One connection per database; reconnect transparently if the session drops.
    conn_1 = get_conn(from_db, echo=False, reconnect=True)
    conn_2 = get_conn(to_db, echo=False, reconnect=True)

    # Yield the source query in batches so memory usage stays bounded.
    stream = conn_1.stream(sql, yield_batch=True, echo=False)
    for batch_rows in stream:
        conn_2.insert(tgt_table, batch_rows, echo=False)

    # Total number of rows streamed from the source connection.
    return conn_1._stream_counter
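A minimal driver sketch for the example above (the connection profile names, queries, and target tables here are assumptions, not part of the original): because each call opens its own source and target connections, the argument tuples can be fanned out across worker processes.

# Hypothetical usage of stream_insert; profiles 'PG_SOURCE'/'PG_TARGET' are assumed.
from multiprocessing import Pool

jobs = [
    ('PG_SOURCE', 'select * from public.orders', 'PG_TARGET', 'staging.orders'),
    ('PG_SOURCE', 'select * from public.customers', 'PG_TARGET', 'staging.customers'),
]

with Pool(2) as pool:
    counts = pool.map(stream_insert, jobs)  # each worker returns its row count
    print('rows copied per job:', counts)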
Example #2
def test_stream(db, sql):
    from xutil import get_conn
    conn = get_conn(db, echo=False, reconnect=True)
    stream = conn.stream(sql, yield_batch=True, echo=False)
    for batch_rows in stream:
        print('Batch : {}'.format(len(batch_rows)))
    return conn._stream_counter
Example #3
def update_meta(worker: Worker, data_dict):
    """Update the worker's metadata and send results to frontend.

  Args:
    worker: the respective worker
    data_dict: the request payload dictionary
  """
    database = data_dict['database']

    try:
        conn = get_conn(database)
        make_rec = lambda name, rec: store.sqlx(name).ntRec(**dict(
            db_name=database, last_updated=int(time.time()), **rec))

        # meta_tables
        table_data = [
            make_rec('meta_tables', row._asdict())
            for row in conn.get_all_tables()
        ]
        store.sqlx('meta_tables').replace(table_data)

        # meta_columns
        column_data = [
            make_rec('meta_columns', row._asdict())
            for row in conn.get_all_columns()
        ]
        store.sqlx('meta_columns').replace(column_data)

        data = dict(
            id=data_dict['id'],
            payload_type='meta-updated',
            completed=True,
            orig_req=data_dict,
            sid=data_dict['sid'],
        )

    except Exception as E:
        worker.log(E)
        err_msg = get_error_str(E)

        data = dict(
            id=data_dict['id'],
            payload_type='meta-updated',
            completed=False,
            error=err_msg,
            orig_req=data_dict,
            sid=data_dict['sid'],
        )
    finally:
        worker.put_parent_q(data)
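For reference, a request payload carrying the fields this handler actually reads might look like the sketch below (the values are illustrative assumptions; only 'database', 'id', and 'sid' are used here).

# Hypothetical update_meta payload; values are assumptions.
data_dict = dict(
    id='req-001',          # request identifier echoed back to the frontend
    sid='session-abc',     # client session id used to route the response
    database='PG_XENIAL',  # connection profile name passed to get_conn
)
# update_meta(worker, data_dict)  # 'worker' must provide .log() and .put_parent_q()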
Example #4
def get_analysis_sql(worker: Worker, data_dict):
    """Run the specified analysis and send results to frontend.

  Args:
    worker: the respective worker
    data_dict: the request payload dictionary
  """
    database = data_dict['database']

    try:
        conn = get_conn(database)
        if data_dict['analysis'] == 'join-match':
            sql = conn.analyze_join_match(as_sql=True, **data_dict['kwargs'])
        else:
            sql = conn.analyze_fields(analysis=data_dict['analysis'],
                                      table_name=data_dict['table_name'],
                                      fields=data_dict['fields'],
                                      as_sql=True,
                                      **data_dict['kwargs'])

        data = dict(
            id=data_dict['id'],
            payload_type='template-sql',
            sql=sql,
            completed=True,
            orig_req=data_dict,
            sid=data_dict['sid'],
        )

    except Exception as E:
        worker.log(E)
        err_msg = get_error_str(E)

        data = dict(
            id=data_dict['id'],
            payload_type='template-sql',
            sql=None,
            completed=False,
            error=err_msg,
            orig_req=data_dict,
            sid=data_dict['sid'],
        )

    finally:
        worker.put_parent_q(data)
Example #5
def text_db_to_ff():

    ######## PG to Parquet
    from xutil import get_conn
    from xutil.diskio import write_pq, write_pqs, write_csvs

    from s3fs import S3FileSystem
    s3 = S3FileSystem()

    conn = get_conn('PG_XENIAL')
    df_chunks = conn.stream(
        'select * from housing.orange_county_data',
        dtype='dataframe',
    )
    # Do not materialize the stream here (e.g. via list(df_chunks));
    # that would exhaust the generator before write_pqs can consume it below.

    write_pqs(
        '/tmp/housing.orange_county_data',
        df_chunks,
        # partition_cols=['property_zip'],
    )

    write_pqs(
        '/tmp/crypto.bittrex_prices',
        conn.stream(
            'select * from crypto.bittrex_prices',
            dtype='dataframe',
        ))

    write_csvs('/tmp/crypto.bittrex_prices.csv',
               conn.stream('select * from crypto.bittrex_prices'))

    write_pqs('s3://ocral-data-1/housing.landwatch',
              conn.stream(
                  'select * from housing.landwatch',
                  dtype='dataframe',
              ),
              filesystem=s3)

    write_pqs('/tmp/mining.places',
              conn.stream(
                  'select * from mining.places',
                  dtype='dataframe',
              ))
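As a quick sanity check (a sketch, assuming pandas with a Parquet engine is installed and the local dataset above was actually written), the output directory can be read back directly:

# Read one of the Parquet datasets written above; the path is assumed to exist.
import pandas as pd

df = pd.read_parquet('/tmp/mining.places')
print(df.shape)
print(df.dtypes)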
Example #6
def execute_sql(worker: Worker, data_dict):
    "Execute SQL operation"
    log = worker.log

    database = data_dict['database']
    sid = data_dict['sid']
    pid = worker_pid

    conn = get_conn(database)
    if conn.type.lower() == 'spark':
        worker.put_parent_q(
            dict(
                payload_type='spark-url',
                database=database,
                url=conn.sparko.uiWebUrl,
                sid=data_dict['sid'],
            ))

    def start_sql(sql, id, limit, options, sid):
        rows = fields = []
        get_fields = lambda r: r.__fields__ if hasattr(r, '__fields__') else r._fields
        s_t = epoch()
        cache_used = False
        limit = int(options['limit']) if 'limit' in options else limit

        try:

            def exec_sql(sql, limit_def=5000):
                log('\n------------SQL-START------------\n{}\n------------SQL-END------------ \n'
                    .format(sql),
                    color='blue')
                log('LIMIT: ' + str(limit), color='blue')
                cache_used = False
                if sql in worker_sql_cache:
                    for fields, rows in list(worker_sql_cache[sql]['results']):
                        # if limit above limit_def, then refresh
                        if limit > limit_def: break

                        # if limit is same and not a csv call, then refresh
                        if limit == worker_sql_cache[sql]['limit'] and 'csv' not in options:
                            break

                        # if ran more than 10 minutes ago, then refresh
                        if now_minus(minutes=10) > worker_sql_cache[sql]['timestamp']:
                            del worker_sql_cache[sql]
                            break

                        if len(fields) > 0:
                            cache_used = True  # must return data/fields
                            worker_sql_cache[sql]['limit'] = limit
                            log('+Cache Used')

                        yield fields, rows, cache_used

                if not cache_used:
                    worker_sql_cache[sql] = dict(timestamp=now(),
                                                 results=[],
                                                 limit=limit)
                    rows = conn.query(
                        sql.replace('%', '%%'),
                        dtype='tuple',
                        limit=limit if limit > limit_def else limit_def)
                    fields = conn._fields
                    worker_sql_cache[sql]['results'].append((fields, rows))
                    yield fields, rows, cache_used

            if 'meta' in options:
                # get_schemas or
                meta_func = options['meta']
                rows = getattr(conn, meta_func)(**options['kwargs'])
                rows = [tuple(r) for r in rows]
                fields = conn._fields

            elif 'special' in options:
                pass

            else:
                for fields, rows, cache_used in exec_sql(sql):
                    rows = rows[:limit] if len(rows) > limit else rows

            if rows is None: rows = []

            if 'email_address' in options or 'csv' in options:
                file_name = '{}-{}-{}.csv'.format(database, options['name'],
                                                  data_dict['id'])
                file_path = '{}/{}'.format(CSV_FOLDER, file_name)
                write_csv(file_path, fields, rows)
                if os.path.getsize(file_path) > 20 * (1024**2):
                    rc = os.system('gzip -f ' + file_path)
                    file_name = file_name + '.gz' if rc == 0 else file_name
                    file_path = '{}/{}'.format(CSV_FOLDER, file_name)

                url = 'http://{base_url}:{port}/csv/{name}'.format(
                    base_url=socket.gethostname(),
                    port=WEBAPP_PORT,
                    name=file_name,
                )
                options['url'] = url

            if 'email_address' in options:
                subj = 'DbNet -- Result for Query {}'.format(data_dict['id'])
                body_text = 'URL: {url}\n\nROWS: {rows}\n\nSQL:\n{sql}'.format(
                    url=url, rows=len(rows), sql=sql)
                to_address = options['email_address']
                email_template = os.getenv("SMTP_TEMPLATE")
                if 'exchange_server' == email_template:
                    email_func = send_email_exchange
                elif 'outlook' == email_template:
                    email_func = send_from_outlook
                elif 'gmail' == email_template:
                    email_func = send_from_gmail
                else:
                    raise Exception('Email method not implemented!')

                email_func(to_address, subj, body_text)

                if len(rows) > 100:
                    rows = rows[:100]

            e_t = epoch()
            secs = e_t - s_t

            # Add query
            store.sqlx('queries').add(
                task_id=data_dict['id'],
                database=database,
                sql_text=sql,
                exec_date=s_t,
                duration_sec=secs,
                row_count=len(rows),
                limit_val=limit,
                cached=cache_used,
                sql_md5=hashlib.md5(sql.encode('utf-8')).hexdigest(),
                last_updated=epoch(),
            )

            if sql.strip():
                sql_fpath = '{}/{}.{}.sql'.format(SQL_FOLDER, database,
                                                  data_dict['id'])
                sql_text = '-- Completed @ {} in {} seconds.\n\n{}'.format(
                    now_str(), secs, sql)
                write_file(sql_fpath, sql_text)

            # time.sleep(0.5)
            data = dict(
                id=data_dict['id'],
                payload_type='query-data',
                database=database,
                rows=rows,
                headers=fields,
                start_ts=s_t,
                end_ts=e_t,
                execute_time=round(secs, 2),
                completed=True,
                cache_used=cache_used,
                options=options,
                pid=worker_pid,
                orig_req=data_dict,
                sid=sid,
            )

        except Exception as E:
            secs = epoch() - s_t
            err_msg_long = get_exception_message()
            err_msg = get_error_str(E)

            worker.log(E)
            data = dict(id=id,
                        payload_type='query-data',
                        database=database,
                        rows=[],
                        headers=[],
                        execute_time=round(secs, 2),
                        completed=False,
                        error='ERROR:\n' + err_msg,
                        options=options,
                        pid=worker_pid,
                        orig_req=data_dict,
                        sid=sid)

        finally:
            # worker.pipe.send_to_parent(data)
            worker.put_parent_q(data)

    data_dict['limit'] = int(data_dict.get('limit', 500))
    data_dict['options'] = data_dict.get('options', {})
    data_dict['sql'] = data_dict.get('sql', '')

    start_sql(
        data_dict['sql'],
        data_dict['id'],
        data_dict['limit'],
        data_dict['options'],
        data_dict['sid'],
    )
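A request payload for this handler might look like the sketch below (all values are illustrative assumptions; the keys shown are the ones execute_sql and start_sql read):

# Hypothetical execute_sql payload; values are assumptions.
data_dict = dict(
    id='query-042',
    sid='session-abc',
    database='PG_XENIAL',
    sql='select * from housing.orange_county_data',
    limit=500,                    # row cap applied to the returned result set
    options=dict(
        name='oc_data',           # used to build the CSV file name
        csv=True,                 # also write the result to a downloadable CSV
    ),
)
# execute_sql(worker, data_dict)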