def stream_insert(args):
  (from_db, sql, to_db, tgt_table) = args
  from xutil import get_conn

  conn_1 = get_conn(from_db, echo=False, reconnect=True)
  conn_2 = get_conn(to_db, echo=False, reconnect=True)

  stream = conn_1.stream(sql, yield_batch=True, echo=False)
  for batch_rows in stream:
    conn_2.insert(tgt_table, batch_rows, echo=False)

  return conn_1._stream_counter
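# The tuple-style `args` signature suggests stream_insert is meant to be mapped
# over a process pool. A minimal usage sketch; the 'PG_TARGET' profile and the
# job list below are placeholders, not defined in this repo.
def example_stream_insert_pool():
  from multiprocessing import Pool

  jobs = [
    ('PG_XENIAL', 'select * from housing.orange_county_data', 'PG_TARGET',
     'housing.orange_county_data'),
    ('PG_XENIAL', 'select * from crypto.bittrex_prices', 'PG_TARGET',
     'crypto.bittrex_prices'),
  ]
  with Pool(processes=2) as pool:
    # each job opens its own source/target connections inside the worker process
    counters = pool.map(stream_insert, jobs)
  print('Rows streamed per job: {}'.format(counters))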
def test_stream(db, sql):
  from xutil import get_conn

  conn = get_conn(db, echo=False, reconnect=True)

  stream = conn.stream(sql, yield_batch=True, echo=False)
  for batch_rows in stream:
    print('Batch : {}'.format(len(batch_rows)))

  return conn._stream_counter
def update_meta(worker: Worker, data_dict):
  """Update the worker's metadata and send results to frontend.

  Args:
    worker: the respective worker
    data_dict: the request payload dictionary
  """
  database = data_dict['database']
  try:
    conn = get_conn(database)
    make_rec = lambda name, rec: store.sqlx(name).ntRec(**dict(
      db_name=database, last_updated=int(time.time()), **rec))

    # meta_tables
    table_data = [
      make_rec('meta_tables', row._asdict()) for row in conn.get_all_tables()
    ]
    store.sqlx('meta_tables').replace(table_data)

    # meta_columns
    column_data = [
      make_rec('meta_columns', row._asdict())
      for row in conn.get_all_columns()
    ]
    store.sqlx('meta_columns').replace(column_data)

    data = dict(
      id=data_dict['id'],
      payload_type='meta-updated',
      completed=True,
      orig_req=data_dict,
      sid=data_dict['sid'],
    )

  except Exception as E:
    worker.log(E)
    err_msg = get_error_str(E)

    data = dict(
      id=data_dict['id'],
      payload_type='meta-updated',
      completed=False,
      error=err_msg,
      orig_req=data_dict,
      sid=data_dict['sid'],
    )

  finally:
    worker.put_parent_q(data)
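# The only keys update_meta reads from data_dict are 'database', 'id' and 'sid'.
# A minimal, hypothetical request payload (the values below are placeholders):
def example_update_meta(worker: Worker):
  meta_request = dict(
    id='req-001',          # request id, echoed back in the response payload
    sid='session-abc',     # session id used to route the response
    database='PG_XENIAL',  # connection profile name passed to get_conn()
  )
  update_meta(worker, meta_request)  # refreshes meta_tables / meta_columns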
def get_analysis_sql(worker: Worker, data_dict):
  """Run the specified analysis and send results to frontend.

  Args:
    worker: the respective worker
    data_dict: the request payload dictionary
  """
  database = data_dict['database']
  try:
    conn = get_conn(database)
    if data_dict['analysis'] == 'join-match':
      sql = conn.analyze_join_match(as_sql=True, **data_dict['kwargs'])
    else:
      sql = conn.analyze_fields(
        analysis=data_dict['analysis'],
        table_name=data_dict['table_name'],
        fields=data_dict['fields'],
        as_sql=True,
        **data_dict['kwargs'])

    data = dict(
      id=data_dict['id'],
      payload_type='template-sql',
      sql=sql,
      completed=True,
      orig_req=data_dict,
      sid=data_dict['sid'],
    )

  except Exception as E:
    worker.log(E)
    err_msg = get_error_str(E)
    data = dict(
      id=data_dict['id'],
      payload_type='template-sql',
      sql=None,
      completed=False,
      error=err_msg,
      orig_req=data_dict,
      sid=data_dict['sid'],
    )

  finally:
    worker.put_parent_q(data)
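# Sketch of the payload shape get_analysis_sql expects. The 'analysis' template
# name and the field values below are hypothetical; only the keys mirror what
# the function actually reads.
def example_get_analysis_sql(worker: Worker):
  analysis_request = dict(
    id='req-002',
    sid='session-abc',
    database='PG_XENIAL',
    analysis='field_stats',  # hypothetical template; 'join-match' uses kwargs only
    table_name='housing.orange_county_data',
    fields=['property_zip'],
    kwargs={},
  )
  get_analysis_sql(worker, analysis_request)  # rendered SQL comes back on the parent queue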
def text_db_to_ff():
  ######## PG to Parquet
  from xutil import get_conn
  from xutil.diskio import write_pq, write_pqs, write_csvs
  from s3fs import S3FileSystem

  s3 = S3FileSystem()

  conn = get_conn('PG_XENIAL')

  df_chunks = conn.stream(
    'select * from housing.orange_county_data',
    dtype='dataframe',
  )
  dfs = list(df_chunks)

  write_pqs(
    '/tmp/housing.orange_county_data',
    dfs,  # pass the materialized list; the stream generator is already consumed
    # partition_cols=['property_zip'],
  )

  write_pqs(
    '/tmp/crypto.bittrex_prices',
    conn.stream(
      'select * from crypto.bittrex_prices',
      dtype='dataframe',
    ))

  write_csvs('/tmp/crypto.bittrex_prices.csv',
             conn.stream('select * from crypto.bittrex_prices'))

  write_pqs(
    's3://ocral-data-1/housing.landwatch',
    conn.stream(
      'select * from housing.landwatch',
      dtype='dataframe',
    ),
    filesystem=s3)

  write_pqs(
    '/tmp/mining.places',
    conn.stream(
      'select * from mining.places',
      dtype='dataframe',
    ))
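# A quick way to sanity-check the Parquet output written above, assuming pandas
# with a parquet engine (pyarrow or fastparquet) is installed and write_pqs
# produced a file or directory that engine can read:
def example_check_parquet_output():
  import pandas as pd

  df = pd.read_parquet('/tmp/housing.orange_county_data')
  print(df.shape)
  print(df.head())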
def execute_sql(worker: Worker, data_dict):
  "Execute SQL operation"
  log = worker.log
  database = data_dict['database']
  sid = data_dict['sid']
  pid = worker_pid

  conn = get_conn(database)

  if conn.type.lower() == 'spark':
    worker.put_parent_q(
      dict(
        payload_type='spark-url',
        database=database,
        url=conn.sparko.uiWebUrl,
        sid=data_dict['sid'],
      ))

  def start_sql(sql, id, limit, options, sid):
    rows = fields = []
    get_fields = lambda r: r.__fields__ if hasattr(r, '__fields__') else r._fields
    s_t = epoch()
    cache_used = False
    limit = int(options['limit']) if 'limit' in options else limit

    try:

      def exec_sql(sql, limit_def=5000):
        log(
          '\n------------SQL-START------------\n{}\n------------SQL-END------------ \n'
          .format(sql),
          color='blue')
        log('LIMIT: ' + str(limit), color='blue')
        cache_used = False

        if sql in worker_sql_cache:
          for fields, rows in list(worker_sql_cache[sql]['results']):
            # if limit above limit_def, then refresh
            if limit > limit_def:
              break

            # if limit is same and not a csv call, then refresh
            if limit == worker_sql_cache[sql]['limit'] and 'csv' not in options:
              break

            # if ran more than 10 minutes ago, then refresh
            if now_minus(minutes=10) > worker_sql_cache[sql]['timestamp']:
              del worker_sql_cache[sql]
              break

            if len(fields) > 0:
              cache_used = True  # must return data/fields
              worker_sql_cache[sql]['limit'] = limit
              log('+Cache Used')
              yield fields, rows, cache_used

        if not cache_used:
          worker_sql_cache[sql] = dict(timestamp=now(), results=[], limit=limit)

          rows = conn.query(
            sql.replace('%', '%%'),
            dtype='tuple',
            limit=limit if limit > limit_def else limit_def)
          fields = conn._fields
          worker_sql_cache[sql]['results'].append((fields, rows))
          yield fields, rows, cache_used

      if 'meta' in options:
        # get_schemas or
        meta_func = options['meta']
        rows = getattr(conn, meta_func)(**options['kwargs'])
        rows = [tuple(r) for r in rows]
        fields = conn._fields
      elif 'special' in options:
        pass
      else:
        for fields, rows, cache_used in exec_sql(sql):
          fields, rows = fields, rows
        rows = rows[:limit] if len(rows) > limit else rows

      if rows is None:
        rows = []

      if 'email_address' in options or 'csv' in options:
        file_name = '{}-{}-{}.csv'.format(database, options['name'],
                                          data_dict['id'])
        file_path = '{}/{}'.format(CSV_FOLDER, file_name)
        write_csv(file_path, fields, rows)

        if os.path.getsize(file_path) > 20 * (1024**2):
          rc = os.system('gzip -f ' + file_path)
          file_name = file_name + '.gz' if rc == 0 else file_name
          file_path = '{}/{}'.format(CSV_FOLDER, file_name)

        url = 'http://{base_url}:{port}/csv/{name}'.format(
          base_url=socket.gethostname(),
          port=WEBAPP_PORT,
          name=file_name,
        )
        options['url'] = url

        if 'email_address' in options:
          subj = 'DbNet -- Result for Query {}'.format(data_dict['id'])
          body_text = 'URL: {url}\n\nROWS: {rows}\n\nSQL:\n{sql}'.format(
            url=url, rows=len(rows), sql=sql)
          to_address = options['email_address']
          email_template = os.getenv("SMTP_TEMPLATE")

          if 'exchange_server' == email_template:
            email_func = send_email_exchange
          elif 'outlook' == email_template:
            email_func = send_from_outlook
          elif 'gmail' == email_template:
            email_func = send_from_gmail
          else:
            raise Exception('Email method not implemented!')

          email_func(to_address, subj, body_text)

        if len(rows) > 100:
          rows = rows[:100]

      e_t = epoch()
      secs = e_t - s_t

      # Add query
      store.sqlx('queries').add(
        task_id=data_dict['id'],
        database=database,
        sql_text=sql,
        exec_date=s_t,
        duration_sec=secs,
        row_count=len(rows),
        limit_val=limit,
        cached=cache_used,
        sql_md5=hashlib.md5(sql.encode('utf-8')).hexdigest(),
        last_updated=epoch(),
      )

      if sql.strip():
        sql_fpath = '{}/{}.{}.sql'.format(SQL_FOLDER, database,
                                          data_dict['id'])
        sql_text = '-- Completed @ {} in {} seconds.\n\n{}'.format(
          now_str(), secs, sql)
        write_file(sql_fpath, sql_text)

      # time.sleep(0.5)
      data = dict(
        id=data_dict['id'],
        payload_type='query-data',
        database=database,
        rows=rows,
        headers=fields,
        start_ts=s_t,
        end_ts=e_t,
        execute_time=round(secs, 2),
        completed=True,
        cache_used=cache_used,
        options=options,
        pid=worker_pid,
        orig_req=data_dict,
        sid=sid,
      )

    except Exception as E:
      secs = epoch() - s_t
      err_msg_long = get_exception_message()
      err_msg = get_error_str(E)

      worker.log(E)
      data = dict(
        id=id,
        payload_type='query-data',
        database=database,
        rows=[],
        headers=[],
        execute_time=round(secs, 2),
        completed=False,
        error='ERROR:\n' + err_msg,
        options=options,
        pid=worker_pid,
        orig_req=data_dict,
        sid=sid)

    finally:
      # worker.pipe.send_to_parent(data)
      worker.put_parent_q(data)

  data_dict['limit'] = int(data_dict.get('limit', 500))
  data_dict['options'] = data_dict.get('options', {})
  data_dict['sql'] = data_dict.get('sql', '')

  start_sql(
    data_dict['sql'],
    data_dict['id'],
    data_dict['limit'],
    data_dict['options'],
    data_dict['sid'],
  )
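# Illustrative payload for execute_sql; the keys mirror what the function reads
# and the values (ids, profile name, SQL) are placeholders.
def example_execute_sql(worker: Worker):
  query_request = dict(
    id='req-003',
    sid='session-abc',
    database='PG_XENIAL',
    sql='select * from housing.orange_county_data',
    limit=500,   # defaults to 500 when omitted
    options={},  # may carry 'limit', 'meta', 'csv', 'email_address', etc.
  )
  execute_sql(worker, query_request)  # results arrive on the parent queue as 'query-data'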