def write_csv(file_path,
              headers,
              data,
              footer_text=None,
              append=False,
              log=log,
              echo=True,
              file_obj=None,
              encoding="utf8"):
  "Write to CSV, python3 compatible. 'data' must be a list of iterables"
  s_t = now()
  mode = 'a' if append else 'w'
  f = file_obj if file_obj else open(
    file_path, mode, newline='', encoding=encoding)
  w = csv.writer(
    f, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

  if not append:
    w.writerow(headers)

  for row in data:
    w.writerow(row)

  if footer_text:
    f.write(footer_text)

  if not file_obj:
    f.close()

  secs = (now() - s_t).total_seconds()
  rate = round(len(data) / secs, 1)
  if echo:
    log("Wrote {} rows to {} [{} r/s].".format(len(data), file_path, rate))
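# Illustrative usage of write_csv (a sketch, not from the source; the path and rows are assumptions):
#   rows = [(1, 'alice'), (2, 'bob')]
#   write_csv('/tmp/users.csv', ['id', 'name'], rows)
#   write_csv('/tmp/users.csv', ['id', 'name'], [(3, 'carl')], append=True)  # append skips the header row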
def read_csv(file_path, delimiter=',', quotechar='"', mode='r', encoding="utf8"):
  """Read CSV from File"""
  s_t = now()
  with open(file_path, mode, encoding=encoding) as f:
    reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
    headers = [h.replace(' ', '_') for h in next(reader)]
    Data = namedtuple("Data", headers)

    i = 0
    data_rows = []
    try:
      # build namedtuples row by row so the failing line index is available on error
      for i, row in enumerate(map(Data._make, reader)):
        data_rows.append(row)
    except Exception as e:
      print('ERROR at line ' + str(i + 1))
      raise e

  secs = (now() - s_t).total_seconds()
  rate = round(len(data_rows) / secs, 1)
  log("Imported {} rows from {} [{} r/s].".format(
    len(data_rows), file_path, rate))

  return data_rows
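# Illustrative usage of read_csv (a sketch; assumes the file written in the write_csv example above):
#   users = read_csv('/tmp/users.csv')
#   users[0].name  # rows are namedtuples keyed by the header, spaces replaced with underscores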
def transfer_progress(self, transferred, total, unit='B'):
  "Display transfer progress"
  prct = int(100.0 * transferred / total)
  divide = lambda x, y: round(1.0 * x / y, 1)

  if self.last_stat:
    secs = (datetime.datetime.now() - self.last_stat['time']).total_seconds()
    if secs > 2:
      rate = round((transferred - self.last_stat['transferred']) / secs, 1)
      self.last_stat = dict(time=now(), transferred=transferred, rate=rate)
    else:
      rate = self.last_stat['rate']
  else:
    rate = 0
    self.last_stat = dict(time=now(), transferred=transferred, rate=rate)

  if total > 1024**3:
    transferred = divide(transferred, 1024**3)
    total = divide(total, 1024**3)
    unit = 'GB'
    rate = '{} {} / sec'.format(divide(rate, 1024**2), 'MB')
  elif total > 1024**2:
    transferred = divide(transferred, 1024**2)
    total = divide(total, 1024**2)
    unit = 'MB'
    rate = '{} {} / sec'.format(divide(rate, 1024**2), unit)
  elif total > 1024**1:
    transferred = divide(transferred, 1024**1)
    total = divide(total, 1024**1)
    unit = 'KB'
    rate = '{} {} / sec'.format(divide(rate, 1024**1), unit)

  log('+{}% Complete: {} / {} {} @ {}'.format(
    prct, transferred, total, unit, rate))
def write_pq(
    file_path,
    dataf,
    partition_cols=None,
    flavor='spark',
    filesystem=None,
    append=False,
    log=log,
):
  "Write to Parquet, python3 compatible. 'dataf' must be a pandas DataFrame"
  s_t = now()

  if not append and os.path.exists(file_path):
    shutil.rmtree(file_path, ignore_errors=True)

  table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
  counter = table.num_rows
  pq.write_to_dataset(
    table,
    root_path=file_path,
    partition_cols=partition_cols,
    flavor=flavor,
    preserve_index=False,
    filesystem=filesystem,
    use_deprecated_int96_timestamps=True,
    compression='snappy')  # will append; delete folder for overwrite

  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))

  return counter
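# Illustrative usage of write_pq (a sketch; the DataFrame and path below are assumptions):
#   import pandas as pd
#   df = pd.DataFrame({'id': [1, 2], 'grp': ['a', 'b']})
#   write_pq('/tmp/users_pq', df, partition_cols=['grp'])  # writes a partitioned dataset folder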
def reconnect(self, min_tresh=0):
  """Re-connect to database if minute threshold reached"""
  if (now() - self.last_connect).total_seconds() > min_tresh * 60:
    log('Reconnecting to {}...'.format(self.name))
    if self.cursor is not None:
      self.cursor.close()
    self.connect()
    self.last_connect = now()
def read_csvD(file_path,
              delimiter=',',
              quotechar='"',
              date_cols=[],
              date_format=None,
              echo=True,
              recarray=False,
              detect_date=True):
  "Read CSV into a pandas DataFrame"
  # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
  import pandas
  s_t = now()

  # https://stackoverflow.com/questions/17465045/can-pandas-automatically-recognize-dates
  def date_parser(x):
    dd = x
    try:
      # pandas.datetime was removed in pandas 2.x; parse via to_datetime instead
      dd = pandas.to_datetime(str(x), format=date_format)
    except ValueError:
      pass
    return dd

  date_parser = date_parser if date_format else None

  df = pandas.read_csv(
    file_path,
    delimiter=delimiter,
    parse_dates=date_cols,
    date_parser=date_parser,
    infer_datetime_format=detect_date,
  )

  for col in df.columns:
    if not detect_date: continue
    if col in date_cols: continue
    if df[col].dtype == 'object':
      try:
        df[col] = pandas.to_datetime(df[col])
      except ValueError:
        pass

  replace_func = lambda col: re.sub(r'_+', '_', re.sub(r'[\]\[. ]', '_', col))
  df = df.rename(columns={col: replace_func(col) for col in df.columns})

  # convert to records only after the column cleanup (a recarray has no .columns)
  if recarray:
    df = df.to_records()

  secs = (now() - s_t).total_seconds()
  rate = round(len(df) / secs, 1)
  if echo:
    log("Imported {} rows from {} [{} r/s].".format(
      len(df), file_path, rate))

  return df
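# Illustrative usage of read_csvD (a sketch; the path, column name, and format are assumptions):
#   df = read_csvD('/tmp/users.csv', date_cols=['created dt'], date_format='%Y-%m-%d')
#   # column 'created dt' is renamed to 'created_dt' by the bracket/space/dot cleanup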
def write_jsonl(file_path, data, log=log, encoding="utf8"):
  "Write JSONL to File"
  import jsonlines
  s_t = now()

  with open(file_path, 'w', encoding=encoding) as f:
    writer = jsonlines.Writer(f)
    writer.write_all(data)

  counter = len(data)
  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
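# Illustrative usage of write_jsonl (a sketch; the records and path below are assumptions):
#   write_jsonl('/tmp/events.jsonl', [{'id': 1, 'type': 'login'}, {'id': 2, 'type': 'logout'}])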
def read_jsonl(file_path, log=log, encoding="utf8"):
  "Read from JSONL File"
  import jsonlines
  s_t = now()

  with open(file_path, 'r', encoding=encoding) as f:
    reader = jsonlines.Reader(f)
    data = list(reader.iter())

  counter = len(data)
  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Read {} rows from {} [{} r/s].".format(counter, file_path, rate))

  return data
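# Illustrative usage of read_jsonl (a sketch; assumes the file written in the write_jsonl example above):
#   events = read_jsonl('/tmp/events.jsonl')  # returns a list, one parsed JSON value per line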
def start_worker_mon():
  """Starts the Monitoring worker"""
  worker_name = '{}-mon'.format(WORKER_PREFIX)
  worker = Worker(
    worker_name,
    'monitor',
    fn=mon_worker.run,
    kwargs={},
    log=log,
    kill_if_running=True,
    pid_folder=DBNET_FOLDER)
  worker.start()

  log('Monitor Loop PID is {}'.format(worker.pid))

  workers['mon'] = worker
  workers['mon'].put_child_q(dict(name=worker_name, pid=worker.pid))  # add to monitor

  store.sqlx('workers').replace_rec(
    hostname=worker.hostname,
    worker_name=worker.name,
    worker_type=worker.type,
    worker_pid=worker.pid,
    status='RUNNING',
    task_id=-1,
    task_function=worker.fn.__name__,
    task_start_date=now(),
    task_args=jdumps(worker.args),
    task_kwargs=jdumps(worker.kwargs),
    progress=None,
    queue_length=0,
    last_updated=epoch(),
  )

  return worker
def start_worker_webapp():
  """Starts the WebApp worker"""
  worker_name = '{}-webapp'.format(WORKER_PREFIX)
  worker = Worker(
    worker_name,
    'web-app',
    fn=webapp_worker.run,
    log=log,
    kill_if_running=True,
    args=(WEBAPP_HOST, WEBAPP_PORT),
    kwargs={'mon_worker': workers['mon']},
    pid_folder=DBNET_FOLDER)
  worker.start()

  workers['mon'].put_child_q(dict(name=worker_name, pid=worker.pid))  # add to monitor
  workers['webapp'] = worker

  store.sqlx('workers').replace_rec(
    hostname=worker.hostname,
    worker_name=worker.name,
    worker_type=worker.type,
    worker_pid=worker.pid,
    status='RUNNING',
    task_id=-1,
    task_function=worker.fn.__name__,
    task_start_date=now(),
    task_args=jdumps(worker.args),
    task_kwargs=jdumps(worker.kwargs),
    progress=None,
    queue_length=0,
    last_updated=epoch(),
  )

  return worker
def hdfs_put(file_path, hdfs_folder, put=True, rm=True, log=log):
  "Move file from local to HDFS"
  file_size = get_path_size(file_path)
  file_name = file_path.split('/')[-1]
  hdfs_file_path = '{}/{}'.format(hdfs_folder, file_name)
  s_t = now()

  if rm:
    os.system('hdfs dfs -rm -r -f -skipTrash {}'.format(hdfs_file_path))
  if put:
    os.system('hdfs dfs -put -f {} {}'.format(file_path, hdfs_folder))

  secs = (now() - s_t).total_seconds()
  mins = round(secs / 60, 1)
  mb = round(file_size / 1024**2, 1)
  rate = round(file_size / 1024**2 / secs, 1)
  log("Moved data into HDFS in {} mins [{} MB -> {} MB/s]".format(mins, mb, rate))

  return hdfs_file_path
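# Illustrative usage of hdfs_put (a sketch; paths are assumptions and the `hdfs` CLI must be on PATH):
#   hdfs_path = hdfs_put('/tmp/users.csv', '/data/landing')
#   # -> '/data/landing/users.csv' after the remove + put commands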
def __init__(self, conn_dict, profile=None, echo=False):
  "Initiate connection"
  self._cred = struct(conn_dict)
  self._cred.kwargs = conn_dict.get('kwargs', {})
  self.name = self._cred.get('name', None)
  self.username = self._cred.get('username', None)
  self.type = self._cred.type
  self.engine = None
  self._cursor_description = None
  self.profile = profile
  self.batch_size = 10000
  self.fetch_size = 20000
  self.echo = echo
  self.connect()
  self.last_connect = now()

  # Base Template
  template_base_path = '{}/database/templates/base.yaml'.format(get_dir_path())
  self.template_dict = read_yaml(template_base_path)

  # Specific Type Template
  template_path = '{}/database/templates/{}.yaml'.format(get_dir_path(), self.type)
  temp_dict = read_yaml(template_path)

  for key1 in temp_dict:
    # Level 1
    if isinstance(temp_dict[key1], dict):
      if key1 not in self.template_dict:
        self.template_dict[key1] = temp_dict[key1]

      # Level 2
      for key2 in temp_dict[key1]:
        # Always Overwrite
        self.template_dict[key1][key2] = temp_dict[key1][key2]
    else:
      # Level 1 Non-Dict Overwrite
      self.template_dict[key1] = temp_dict[key1]

  self.variables = self._template('variables')

  if os.getenv('PROFILE_YAML'):
    other_vars = get_variables()
    for key in other_vars:
      self.variables[key] = other_vars[key]

  self.tmp_folder = self.variables['tmp_folder']
  self.set_variables()

  if echo:
    log("Connected to {} as {}".format(self._cred.name, self._cred.user))
def write_jsonls(file_path, data, log=log):
  "Stream-write to JSON Lines. 'data' must be an iterable of JSON-serializable rows"
  import jsonlines
  s_t = now()
  l_t = now()
  msg_dlt = 10000
  counter = 0
  counter2 = 0

  with open(file_path, 'wb') as f:
    w = jsonlines.Writer(f)
    for row in data:
      w.write(row)
      counter += 1
      counter2 += 1

      # progress message
      if counter2 % msg_dlt == 0:
        secs_l = (now() - l_t).total_seconds()
        if secs_l >= 20:
          secs = (now() - s_t).total_seconds()
          rate = round(counter2 / secs_l, 1)
          mins = round(secs / 60, 1)
          log("{} min ## Writing to JSON: {} rows @ {} r/s.".format(
            mins, counter, rate))
          l_t = now()
          counter2 = 0

  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
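# Illustrative usage of write_jsonls (a sketch; the generator and path below are assumptions):
#   rows = ({'id': i} for i in range(100000))  # any iterable of JSON-serializable rows
#   write_jsonls('/tmp/big.jsonl', rows)  # logs progress at most every 20s, checked every 10,000 rows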
def write_csvs(file_path,
               data,
               fields=None,
               lineterminator='\n',
               deli=',',
               log=log,
               secs_d=10,
               gzip=False,
               to_str=False,
               encoding="utf8"):
  "Stream-write to CSV, python3 compatible. 'data' must be an iterator of iterables"
  s_t = now()
  l_t = now()
  log_dlt = 100
  counter2 = 0

  f1 = open(file_path, 'w', newline='', encoding=encoding)
  w = csv.writer(
    f1, delimiter=deli, quoting=csv.QUOTE_MINIMAL, lineterminator=lineterminator)
  file_name = file_path.split('/')[-1]

  if fields:
    w.writerow(fields)
  else:
    row = next(data)
    w.writerow(row._fields)  # write headers
    w.writerow(row)  # write 1st row

  counter = 0
  for row in data:
    row = [str(v) for v in row] if to_str else row
    w.writerow(row)
    counter += 1
    counter2 += 1

    # progress message
    if counter2 % log_dlt == 0:
      secs_l = (now() - l_t).total_seconds()
      if secs_l >= secs_d:
        secs = (now() - s_t).total_seconds()
        rate = round(counter2 / secs_l, 1)
        mins = round(secs / 60, 1)
        log("{} min ## Writing to {}: {} rows @ {} r/s.".format(
          mins, file_name, counter, rate))
        l_t = now()
        counter2 = 0

  f1.close()

  if gzip:
    log('Gzipping ' + file_path)
    os.system('gzip -f ' + file_path)
    file_path = file_path + '.gz'

  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))

  return counter
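# Illustrative usage of write_csvs (a sketch; the namedtuple generator and path are assumptions):
#   Rec = namedtuple('Rec', ['id', 'name'])
#   rows = (Rec(i, 'name_{}'.format(i)) for i in range(100000))
#   write_csvs('/tmp/big.csv', rows, gzip=True)  # header from Rec._fields, then shelled out to gzip -> big.csv.gz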
def write_pqs(
    file_path,
    dataf_chunks,
    partition_cols=None,
    flavor='spark',
    append=False,
    filesystem=None,
    log=log,
    secs_d=10,
):
  "Stream-write to Parquet, python3 compatible. 'dataf_chunks' must be an iterable of DataFrames"
  s_t = now()
  l_t = now()
  log_dlt = 100
  counter = 0
  counter2 = 0

  if not append and os.path.exists(file_path):
    shutil.rmtree(file_path, ignore_errors=True)

  file_name = file_path.split('/')[-1]

  for dataf in dataf_chunks:
    table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
    counter += table.num_rows
    counter2 += table.num_rows
    pq.write_to_dataset(
      table,
      root_path=file_path,
      partition_cols=partition_cols,
      flavor=flavor,
      preserve_index=False,
      filesystem=filesystem,
      use_deprecated_int96_timestamps=True,
      compression='snappy')  # will append; delete folder for overwrite

    # progress message
    secs_l = (now() - l_t).total_seconds()
    if secs_l >= secs_d:
      secs = (now() - s_t).total_seconds()
      rate = round(counter2 / secs_l, 1)
      mins = round(secs / 60, 1)
      log("{} min ## Writing to {}: {} rows @ {} r/s.".format(
        mins, file_name, counter, rate))
      l_t = now()
      counter2 = 0

  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))

  return counter
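# Illustrative usage of write_pqs (a sketch; the chunk generator and path below are assumptions):
#   import pandas as pd
#   chunks = (pd.DataFrame({'id': range(i, i + 1000)}) for i in range(0, 10000, 1000))
#   write_pqs('/tmp/big_pq', chunks)  # streams each chunk into the same dataset folder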
def exec_sql(sql, limit_def=5000):
  log('\n------------SQL-START------------\n{}\n------------SQL-END------------\n'
      .format(sql), color='blue')
  log('LIMIT: ' + str(limit), color='blue')
  cache_used = False

  if sql in worker_sql_cache:
    for fields, rows in list(worker_sql_cache[sql]['results']):
      # if limit above limit_def, then refresh
      if limit > limit_def: break

      # if limit is same and not a csv call, then refresh
      if limit == worker_sql_cache[sql]['limit'] and 'csv' not in options:
        break

      # if ran more than 10 minutes ago, then refresh
      if now_minus(minutes=10) > worker_sql_cache[sql]['timestamp']:
        del worker_sql_cache[sql]
        break

      if len(fields) > 0:
        cache_used = True  # must return data/fields
        worker_sql_cache[sql]['limit'] = limit
        log('+Cache Used')
        yield fields, rows, cache_used

  if not cache_used:
    worker_sql_cache[sql] = dict(timestamp=now(), results=[], limit=limit)
    rows = conn.query(
      sql.replace('%', '%%'),
      dtype='tuple',
      limit=limit if limit > limit_def else limit_def)
    fields = conn._fields
    worker_sql_cache[sql]['results'].append((fields, rows))
    yield fields, rows, cache_used
def run(db_prof, conf_queue: Queue, worker: Worker):
  """Launch the database worker and await requests.

  Args:
    db_prof: the db profile
    conf_queue: a multiprocessing Queue
    worker: the respective worker.
  """
  global worker_name, worker_status

  log = worker.log
  worker_name = worker.name
  worker_status = 'IDLE'
  set_worker_idle()
  worker_db_prof = db_prof

  while True:
    try:
      time.sleep(0.05)  # brings down CPU loop usage
    except (KeyboardInterrupt, SystemExit):
      return

    # data_dict = worker.pipe.recv_from_parent(timeout=0)
    data_dict = worker.get_child_q()
    if data_dict:
      conf_data = {'payload_type': 'confirmation'}
      if data_dict['req_type'] in func_map:
        worker_queue.append(data_dict)
        sync_queue()
        conf_data['queued'] = True

        # Add task
        store.sqlx('tasks').add(
          task_id=data_dict['id'],
          function=func_map[data_dict['req_type']].__name__,
          queue_date=now(),
          start_date=None,
          end_date=None,
          args=jdumps([]),
          kwargs=jdumps(data_dict),
          error=None,
          worker_name=worker_name,
          worker_pid=worker_pid,
          last_updated=epoch(),
        )
        log('+({}) Queued task: {}'.format(len(worker_queue), data_dict))

      # Send receipt confirmation?
      # with worker.lock:
      #   worker.pipe.send_to_parent(conf_data)

    if len(worker_queue) and worker_status == 'IDLE':
      data_dict = worker_queue.popleft()
      sync_queue()
      worker_status = 'BUSY'
      func = func_map[data_dict['req_type']]

      # Sync worker
      store.sqlx('workers').update_rec(
        hostname=worker.hostname,
        worker_name=worker.name,
        status=worker_status,
        task_id=data_dict['id'],
        task_function=func.__name__,
        task_start_date=now(),
        task_args=jdumps([]),
        task_kwargs=jdumps(data_dict),
        last_updated=epoch(),
      )

      # Sync task
      store.sqlx('tasks').update_rec(
        task_id=data_dict['id'],
        start_date=now(),
        last_updated=epoch(),
      )

      try:
        error_data = None
        func(worker, data_dict)
      except Exception as E:
        log(E)
        error_data = dict(
          id=data_dict['id'],
          sid=data_dict['sid'],
          payload_type='task-error',
          error=get_error_str(E),
        )
        # worker.pipe.send_to_parent(error_data)
        worker.put_parent_q(error_data)
      finally:
        # Sync worker
        worker_status = 'IDLE'
        set_worker_idle()

        # Sync task
        store.sqlx('tasks').update_rec(
          task_id=data_dict['id'],
          end_date=now(),
          error=jdumps(error_data) if error_data else None,
          last_updated=epoch(),
        )
def get_conn(db,
             dbs=None,
             echo=True,
             reconnect=False,
             use_jdbc=False,
             conn_expire_min=10,
             spark_hive=False) -> DBConn:
  global conns

  dbs = dbs if dbs else get_databases()
  profile = get_profile()
  db_dict = struct(dbs[db])

  if db_dict.type.lower() == 'hive' and spark_hive:
    db_dict.type = 'spark'

  use_jdbc = True if (
    use_jdbc or ('use_jdbc' in db_dict and db_dict['use_jdbc'])) else use_jdbc

  if db in conns and not reconnect:
    if (now() - conns[db].last_connect).total_seconds() / 60 < conn_expire_min:
      return conns[db]

  if use_jdbc:
    log('*USING JDBC for ' + db)
    from .jdbc import JdbcConn
    conn = JdbcConn(db_dict, profile=profile)
  elif db_dict.type.lower() == 'oracle':
    from .oracle import OracleConn
    conn = OracleConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'spark':
    from .spark import SparkConn
    conn = SparkConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'hive':
    from .hive import HiveConn, Beeline
    if 'use_beeline' in db_dict and db_dict.use_beeline:
      conn = Beeline(db_dict, echo=echo)
    else:
      conn = HiveConn(db_dict, echo=echo)
  elif db_dict.type.lower() in ('postgresql', 'redshift'):
    from .postgresql import PostgreSQLConn
    conn = PostgreSQLConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'sqlserver':
    from .sqlserver import SQLServerConn
    conn = SQLServerConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'sqlite':
    from .sqlite import SQLiteConn
    conn = SQLiteConn(db_dict, echo=echo)
  else:
    raise Exception(f'Type {db_dict.type} not handled!')

  conns[db] = conn
  return conn
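# Illustrative usage of get_conn (a sketch; the 'PG1' profile name is an assumption):
#   conn = get_conn('PG1')                  # connection class is picked from the profile's type
#   conn = get_conn('PG1')                  # reused while younger than conn_expire_min
#   conn = get_conn('PG1', reconnect=True)  # forces a fresh connection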