def create(path, mod_name):
    if File.is_path(path):
        if path.exists():
            logger(mod_name).info('found: "{}"'.format(str(path)))
        else:
            logger(mod_name).info('creating folder "{}"'.format(str(path)))
            path.mkdir(parents=True, exist_ok=True)
def _download_zips(cusid, tech, CAT, ymd_dppath, ymd_cfgpath, limit=None,
                   mod_name=__name__):
    ds = customer.get_default_DS_config_all(cusid, tech)
    reCAT = ds[RESTRICT.ZIP_FLT][CAT]
    flpath = ymd_cfgpath.joinpath('files.txt')
    logger(mod_name).info('checking "{}"...'.format(flpath))
    result = set()
    with open(str(flpath), 'r') as fo:
        c = 0
        for p in fo:
            ftppath = pathlib.Path(p.rstrip())
            fn = ftppath.name
            zippath = ymd_dppath.joinpath(fn)
            m = reCAT.match(fn)
            #logger(__name__).debug((ftppath, fn, zippath, reCAT, m))
            if m is not None:
                FTP.download_binary(ds[RESTRICT.PROTOCOL], ds[RESTRICT.HOST],
                                    ds[RESTRICT.PORT], ds[RESTRICT.USER],
                                    ds[RESTRICT.PSWD], ftppath.as_posix(),
                                    str(zippath), mod_name)
                # which capture group holds the LRC depends on the category
                if CAT == RESTRICT.PM:
                    result.add((m.group(3), zippath))
                elif CAT in {PCOFNSRAW, PCOFNGRAW, IMSCSFRAW, IMSHSSRAW,
                             MADNHRRAW, MADODCRAW, IMSDRARAW, XMLNSSRAW,
                             NOKOBWRAW, NOKOMWRAW, NOKIUMRAW}:
                    result.add((m.group(2), zippath))
                else:
                    result.add((m.group(1), zippath))
                if isinstance(limit, int):
                    c = c + 1
                    if c >= limit:
                        break
    return result
def __init__(self, header, outfile, delimiter=',', line_proc=None,
             mod_name=__name__):
    """
    ## header: { column: (lambda(self, line, linum) -> value) }
    ## line: { column: acc }  # a list of columns
    ## line_proc: lambda(header, line, linum) -> line
    """
    if not Util.is_function(line_proc) or Util.get_arity(line_proc) != 3:
        logger(mod_name).error('bad function {} for {}'.format(line_proc, self))
        sys.exit(RESTRICT.INTERRUPT)
    self.header = header
    self.line = None
    self.outfile = outfile
    self.fout = open(str(outfile), 'w')
    self.delimiter = delimiter
    self.line_proc = line_proc
    self.mod_name = mod_name
    self.RECENT_LINE_COUNT = 3
    self.recent_lines = list()
    self._write_header()
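# Usage sketch (hypothetical column names; not part of this module): each
# header entry maps an output column to an extractor lambda, and line_proc
# post-processes the assembled line, matching the contract in the docstring.
#
# ex_header = {
#     'DATETIME': (lambda self, line, linum: line['timestamp']),
#     'VALUE': (lambda self, line, linum: line['raw_value']),
# }
# line_proc = lambda header, line, linum: line
# lineExtr = LineExtractor(ex_header, pathlib.Path('/tmp/out.csv'), ',',
#                          line_proc, __name__)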
def extract(cusid, tech, date):
    if type(date) is datetime.date:
        logger(__name__).info('updating {} FM'.format(str(date)))
        owner_tables = Common.get_owner_tables(cusid, tech, date, _cat, __name__)
        #tblcols = DSColumn.extract_columns(cusid, tech, date, _cat, owner_tables, __name__)
        LRC_ow_ta_columns = DSColumn.get_all_columns(cusid, tech, date, _cat,
                                                     __name__)
        dsconf = customer.get_default_DS_config_all(cusid, tech)
        ei = Common.extract_info(cusid, tech, date, _cat, __name__)
        for i, (LRC, zippath, filename) in enumerate(ei):
            # dry-run variant:
            # Common.perform_extraction(cusid, tech, date, _cat, LRC, zippath,
            #                           filename, None, __name__)
            owner = 'FM'
            tblname = _get_table_name(filename, dsconf[RESTRICT.CSV_FLT][_cat])
            tblcols = LRC_ow_ta_columns[LRC][owner, tblname]
            etl_agent = ETLAgent(tblcols, dict(), delimiter=RESTRICT.DELIMITER,
                                 newline='\r\n', mod_name=__name__)
            outpath = Common.get_default_output_path(cusid, tech, date, _cat,
                                                     filename, __name__)
            Folder.create(outpath.parent, __name__)
            ex_header = _get_li_ex_header(tblcols)
            line_proc = _get_line_proc(LRC, date)
            lineExtr = LineExtractor(ex_header, outpath, RESTRICT.DELIMITER,
                                     line_proc, __name__)
            etl_agent.add(lineExtr, 'default')
            Common.perform_extraction(cusid, tech, date, _cat, LRC, zippath,
                                      filename, etl_agent, __name__)
            etl_agent.clean()
            del etl_agent
def count_src(path):
    p = pathlib.Path(path).resolve()
    count = 0
    if File.is_path(p) and p.exists() and p.is_dir():
        for p1 in p.glob('*'):
            if p1.name != '__pycache__' and p1.is_dir():
                count = count + count_src(p1)
        for py in p.glob('*.py'):
            count1 = 0
            #logger(__name__).debug(py)
            if py.name == 'logging.py':
                continue
            with open(str(py), 'r') as fo:
                for ln in fo:
                    line = ln.strip()
                    # skip blanks, debug logging, docstring fences and comments
                    if line == '' or \
                            line.startswith('logger(__name__).debug(') or \
                            line.startswith('logger(mod_name).debug(') or \
                            line == '"""' or line.startswith('#'):
                        continue
                    count1 = count1 + 1
            logger(__name__).debug('"{}": {}'.format(str(py), count1))
            count = count + count1
    logger(__name__).debug('"{}": {}'.format(str(p), count))
    return count
def approach(self, ifpath):
    if type(self.fin) is not _io.TextIOWrapper and \
            File.is_path(ifpath) and File.exists(ifpath):
        logger(self.mod_name).info('approaching "{}"'.format(str(ifpath)))
        self.infile = ifpath
        if self.newline is None:
            self.fin = open(str(ifpath), 'r')
        else:
            self.fin = open(str(ifpath), 'r', newline=self.newline)
        if not self._is_init():
            logger(self.mod_name).warning(
                'bad ETL agent: agent {} not initialized; skipping it'.format(
                    str(self)))
            return
        self.start_time = datetime.datetime.now()
        self._time_log(0)
        logging.switch_to_progress(self.mod_name)
        reader = csv.DictReader(self.fin, delimiter=RESTRICT.DELIMITER)
        i = -1
        for i, line in enumerate(reader):
            self._push_recent_line(line)
            self._transform(line, linum=i + 1)
            self._data_rate(i)
        logging.switch_to_normal(self.mod_name)
        self.fin.close()
        self._report(i + 1)
        if i > -1:
            self._output()
def _find_data_path(cusid, tech):
    #logger(__name__).debug(get_computed_config(cusid, tech, __name__))
    dppath = get_computed_config(cusid, tech, __name__)[RESTRICT.DATA_PATH]
    if dppath.exists():
        return dppath.joinpath(cusid).joinpath(tech)
    else:
        logger(__name__).warning('bad config: DATA_PATH "{}"'.format(
            str(dppath)))
def clear_working_space(cusid, tech, date):
    if type(date) is datetime.date:
        base = wic.find_config_path().joinpath(
            '{date:%Y%m%d}'.format(date=date))
        logger(__name__).info('cleaning "{}"...'.format(str(base)))
        if base.exists():
            Folder.remove(base, __name__)
        initialize_working_space(cusid, tech, date)
def _inspect_recent_lines(self, linum):
    lastLinum = linum - self.RECENT_LINE_COUNT + 1
    logger(self.mod_name).error('        {}'.format(
        self._serialize_header('\t')))
    for i, ln in enumerate(self.recent_lines):
        logger(self.mod_name).error('line {}: {} ({} column{})'.format(
            lastLinum + i, self._serialize_line(ln, '\t'), len(ln),
            '' if len(ln) < 2 else 's'))
def list_computed_config(cusid, tech, key, mod_name):
    json_data = get_computed_config(cusid, tech, mod_name)
    for k in json_data:
        if File.is_path(json_data[k]):
            if not File.exists(json_data[k]):
                logger(__name__).warning('bad path: "{}"'.format(
                    str(json_data[k])))
            json_data[k] = '{} {}'.format(type(json_data[k]), json_data[k])
    return JSON.to_yaml(json_data, key)
def remove(path, mod_name):
    if File.is_path(path):
        for x in path.glob('*'):
            if x.is_dir():
                remove(x, mod_name)
            else:
                File.remove(x, mod_name)
        path.rmdir()
        logger(mod_name).info('remove: "{}"'.format(path))
def _build_DDL_my_alter(database, tblname, dbcols, cols_order, dbcols_old):
    def _find_prev_column(cols_order, i, dbcols_old):
        i1 = i
        while True:
            i1 = i1 - 1
            c = cols_order[i1]
            if c in dbcols_old:
                return c
    #
    # collect, per column, the ordinal position(s) it has in the new
    # (dbcols) and old (dbcols_old) schema
    states = dict()
    for c in dbcols:
        if c not in states:
            states[c] = list([dbcols[c]['order']])
        else:
            states[c].append(dbcols[c]['order'])
    for c in dbcols_old:
        if c not in states:
            states[c] = list([dbcols_old[c]['order']])
        else:
            states[c].append(dbcols_old[c]['order'])
    modcols = list()
    addcols = list()
    logger(__name__).debug((tblname, len(cols_order)))
    for i in sorted(cols_order):
        c = cols_order[i]
        logger(__name__).debug((i, c))
        if len(states[c]) == 2 and len(set(states[c])) == 1:
            continue  # same position in both schemas: nothing to do
        pos = 'first' if i == 1 else 'after `{column}`'.format(
            column=_find_prev_column(cols_order, i, dbcols_old))
        if len(states[c]) == 1:
            addcols.append('add column `{cname}` {ctype} {desc} {pos}'.format(
                cname=c, ctype=dbcols[c]['column type'],
                desc=_build_column_desc(dbcols[c]['column type']), pos=pos))
        else:
            modcols.append(
                'modify column `{cname}` {ctype} {desc} {pos}'.format(
                    cname=c, ctype=dbcols[c]['column type'],
                    desc=_build_column_desc(dbcols[c]['column type']),
                    pos=pos))
    modcols = ',\n '.join(modcols)
    addcols = ',\n '.join(addcols)
    columns = ',\n '.join([x for x in [modcols, addcols] if x != ''])
    #
    ddlformat = """
alter table `{database}`.`{table}`
{columns}
""".strip()
    ddl = ddlformat.format(database=database, table=tblname, columns=columns)
    return ddl
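# Illustration (hypothetical schemas): if dbcols adds `new_kpi` at position 3
# and moves `old_kpi`, the builder would emit DDL along these lines, with the
# "{desc}" fragment supplied by _build_column_desc:
#
#   alter table `pm_db`.`pm_cell`
#   modify column `old_kpi` bigint <desc> after `cell_id`,
#   add column `new_kpi` double <desc> after `old_kpi`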
def list_config(cusid, tech, key, mod_name):
    path = wic.find_config_file_path()
    logger(__name__).info('show "{}"'.format(path))
    json_data = File.load_JSON(path, mod_name)
    for k in json_data:
        if File.is_path(json_data[k]):
            if not File.exists(json_data[k]):
                logger(__name__).warning('bad path: "{}"'.format(
                    str(json_data[k])))
    return JSON.to_yaml(json_data, key)
def update_columns(tech, date):
    if type(date) is datetime.date:
        logger(__name__).info('updating columns by {}'.format(str(date)))
        ymd = '{:%Y%m%d}'.format(date)
        cfgbase = wic.find_config_path(RESTRICT.CUSTOMER_ID, tech).joinpath(ymd)
        dpbase = wic.find_data_path(RESTRICT.CUSTOMER_ID, tech).joinpath(ymd)
        LRC_z_s = download_zips(RESTRICT.DB, tech, dpbase, cfgbase, __name__)
        put_LRCs(RESTRICT.DB, LRC_z_s, cfgbase, __name__)
        ps = list(cfgbase.joinpath(RESTRICT.DB).glob('LRC*'))
        [p1, p2] = Util.pick(2, ps)
        good, bad = Column.build_columns(p1)
        ugly = Column.patch_columns(bad, p2)
def _get_group(self, line):
    """
    ## line: { column: value }
    """
    group = dict([(c, (self.keys[c])(self, line)) for c in self.keys
                  if c in line])
    if len(group) == len(self.keys):
        return group
    else:
        logger(self.mod_name).warning(
            'bad group: "{}" does not match "{}"'.format(
                group, str(list(self.keys.keys()))))
        return None
def _gen_SQL_list(SQL_gen_proc, owner, tblname, filepath,
                  block_size=RESTRICT.MAX_ALLOWED_PACKET):
    if File.is_path(filepath):
        sqlformat = ('insert into {tblname} ({{columns}}) '
                     'values {{values}};').format(tblname=tblname)
        with open(str(filepath), 'r') as fo:
            reader = csv.DictReader(fo, delimiter=RESTRICT.DELIMITER)
            l, lines = len(sqlformat), list()
            for ln in reader:
                if None in ln:
                    # row has more fields than the header (restkey is None)
                    logger(__name__).debug('bad record: extra field(s)')
                    logger(__name__).debug(ln)
                if [ln[k] for k in ln if ln[k] is None] != []:
                    # row has fewer fields than the header (restval is None)
                    logger(__name__).debug('bad record: missing field(s)')
                    logger(__name__).debug(ln)
                # rough statement length: each value plus quoting/punctuation
                l1 = sum([n for n in map(lambda x: len(x) + 4,
                                         [ln[k] for k in ln])])
                if l + l1 >= block_size:
                    yield SQL_gen_proc(sqlformat, lines, owner, tblname)
                    l, lines = len(sqlformat) + l1, list([ln])
                else:
                    l = l + l1
                    lines.append(ln)
            if lines != list():
                yield SQL_gen_proc(sqlformat, lines, owner, tblname)
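# Usage sketch: stream the yielded INSERT blocks into run_sql() so no single
# statement exceeds block_size. _build_insert is hypothetical; any callable
# with the signature (sqlformat, lines, owner, tblname) -> str fits here.
#
# for sql in _gen_SQL_list(_build_insert, 'PM', 'pm_cell', csvpath):
#     run_sql(conn, sql)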
def find_data_path(cusid=None, tg=None):
    if cusid is None and tg is None:
        try:
            return pathlib.Path(Default.DATA_PATH).resolve()
        except Exception as e:
            logger(__name__).debug('{}: {}'.format(type(e), e))
            logger(__name__).critical('bad DATA_PATH; check configuration')
            sys.exit(Default.INTERRUPT)
    elif tg is None:
        return find_data_path().joinpath(cusid)
    else:
        return find_data_path().joinpath(cusid).joinpath(tg)
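# Call shapes (paths illustrative):
#   find_data_path()               -> <DATA_PATH>
#   find_data_path('cus01')        -> <DATA_PATH>/cus01
#   find_data_path('cus01', 'LTE') -> <DATA_PATH>/cus01/LTE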
def set_config(cusid, tech, key, value, mod_name):
    path = wic.find_DS_config_file_path()
    json_data = File.load_JSON(path, mod_name)
    if key is not None and key.upper() in json_data:
        key1 = key.upper()
        if key1 == 'PORT':
            try:
                json_data[key1] = int(value)
                File.dump_JSON(path, json_data, mod_name)
            except Exception as e:
                logger(__name__).warning('bad value "{}": {} {}'.format(
                    value, type(e), e))
        else:
            json_data[key1] = value
            File.dump_JSON(path, json_data, mod_name)
def _build_agg_sql(database, tblname, tblname_ym, agg_case, select, group_by,
                   agg_spec, rest):
    _agg = _build_agg_columns(agg_case, agg_spec)
    _reststr = ', '.join(['null as {}'.format(x) for x in rest])
    sql1format = ' '.join([
        'create temporary table test_{t}_{a}_{ym}',
        'select {s} from {tym} group by {g} limit 0;',
        'describe test_{t}_{a}_{ym}'
    ])
    sql1 = sql1format.format(t=tblname, a=agg_case, ym=_ym(tblname_ym),
                             s=_build_select(agg_case, select, _agg, _reststr),
                             tym=tblname_ym,
                             g=_build_group_by(agg_case, group_by))
    _cols = _agg.keys()
    if rest == set():
        c1, c2 = (', '.join([select, ', '.join(_cols)]),
                  ', '.join([select, ', '.join([_agg[k] for k in _cols])]))
    else:
        c1, c2 = (', '.join([select, ', '.join(_cols)]),
                  ', '.join([select, _reststr,
                             ', '.join([_agg[k] for k in _cols])]))
    sql2format = ' '.join([
        'truncate {d}_{a}.{t}_{a}_latest;',
        'insert into {d}_{a}.{t}_{a}_latest ({c1})',
        'select {c2} from {tym} group by {g};',
        'insert into {d}_{a}.{t}_{a}_{ym} ({c1})',
        'select {c2} from {d}_{a}.{t}_{a}_latest'
    ])
    sql2 = sql2format.format(d=database, t=tblname, a=agg_case, c1=c1, c2=c2,
                             tym=tblname_ym,
                             g=_build_group_by(agg_case, group_by),
                             ym=_ym(tblname_ym))
    logger(__name__).debug(sql1)
    #logger(__name__).debug(sql2)
    return sql1, sql2
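# Illustration (hypothetical table pm_cell, case 'daily', month 201812):
# sql1 builds and describes a zero-row temporary table to probe the
# aggregated column types; sql2 then refreshes the *_latest table and
# appends the monthly snapshot, roughly:
#
#   truncate db_daily.pm_cell_daily_latest;
#   insert into db_daily.pm_cell_daily_latest (<c1>)
#   select <c2> from pm_cell_201812 group by <group by>;
#   insert into db_daily.pm_cell_daily_201812 (<c1>)
#   select <c2> from db_daily.pm_cell_daily_latest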
def _report(self, linum):
    if linum == 0:
        logger(self.mod_name).info('no line')
    else:
        # lps: lines per second; spl: seconds per line
        lps, spl = self._statistics(linum)
        logger(self.mod_name).info('{:,} line{}'.format(
            linum, '' if linum < 2 else 's'))
        logger(self.mod_name).info('{:,.2f} lps'.format(lps))
        logger(self.mod_name).debug('{:,.6f} spl'.format(spl))
def set_config(cusid, tech, key, value, mod_name):
    if key is not None:
        path = wic.find_config_file_path()
        json_data = File.load_JSON(path, mod_name)
        key1 = key.upper()
        if key1 in json_data:
            if key1.endswith('_PATH'):
                logger(__name__).info('verifying path "{}"...'.format(value))
                try:
                    pathlib.Path(value).resolve()
                    json_data[key1] = value
                except Exception as e:
                    logger(__name__).warning('bad path: {} {}'.format(
                        type(e), e))
            else:
                json_data[key1] = value
            File.dump_JSON(path, json_data, mod_name)
def extract(cusid, tech, date, CAT=_cat):
    if type(date) is datetime.date:
        logger(__name__).info('updating {} PM'.format(str(date)))
        owner_tables = Common.get_owner_tables(cusid, tech, date, CAT, __name__)
        #tblcols = DSColumn.extract_columns(cusid, tech, date, CAT, owner_tables, __name__)
        LRC_ow_ta_columns = DSColumn.get_all_columns(cusid, tech, date, _cat,
                                                     __name__)
        dsconf = customer.get_default_DS_config_all(cusid, tech)
        ei = _extract_PM(cusid, tech, date, CAT)
        for i, (LRC, zippath, filename, prefix, owner, tblname, id_col,
                agg_terms) in enumerate(ei):
            # dry-run variant:
            # Common.perform_extraction(cusid, tech, date, CAT, LRC, zippath,
            #                           filename, None, __name__)
            tblcols = LRC_ow_ta_columns[LRC][owner, tblname]
            etl_agent = ETLAgent(tblcols, dict(), delimiter=RESTRICT.DELIMITER,
                                 newline=None, mod_name=__name__)
            outpath = Common.get_default_output_path(cusid, tech, date, owner,
                                                     filename, __name__)
            Folder.create(outpath.parent, __name__)
            ex_header = _get_li_ex_header(tblcols)
            line_proc = _get_line_proc(prefix, LRC, id_col, date)
            lineExtr = LineExtractor(ex_header, outpath, RESTRICT.DELIMITER,
                                     line_proc, __name__)
            # planned aggregation adapters (not wired up yet):
            #agg_rules = ETLDB.get_agg_rules(cusid, tech, owner, tblname, __name__)
            #daily_agg_proc = _get_daily_agg_proc(prefix, LRC, date, agg_rules['all'])
            #time_agg_proc = ... agg_rules['all']
            #obj_agg_proc = ... agg_rules['all']
            #dailyAgg = Aggregator(ex_header, daOutpath, RESTRICT.DELIMITER, daily_agg_proc, __name__)
            #timeAgg = Aggregator(ex_header, taOutpath, RESTRICT.DELIMITER, time_agg_proc, __name__)
            #objAgg = Aggregator(ex_header, oaOutpath, RESTRICT.DELIMITER, obj_agg_proc, __name__)
            #repAggList = [ Aggregator(ex_header, raOutpath, RESTRICT.DELIMITER, rep_agg_proc, __name__)
            #               for _, (raLevel, raOutpath, rep_agg_proc) in enumerate(repAggList) ]
            etl_agent.add(lineExtr, 'default')
            #etl_agent.add(dailyAgg, 'daily aggregation')
            #etl_agent.add(timeAgg, 'time aggregation')
            #etl_agent.add(objAgg, 'object aggregation')
            #for i, repAgg in enumerate(repAggList):
            #    etl_agent.add(repAgg, 'report aggregation #{}'.format(i + 1))
            Common.perform_extraction(cusid, tech, date, CAT, LRC, zippath,
                                      filename, etl_agent, __name__)
            etl_agent.clean()
            del etl_agent
def run_sql(conn, sql, prefix_tag=None):
    if type(conn) is pymysql.connections.Connection:
        cur = conn.cursor()
        sql1 = re.sub(r'(\r\n|\n)', ' ', sql).strip()
        sql2 = sql1[:512]
        try:
            logger(__name__).debug(
                'run{prefix_tag}: length {sql_len}; "{sql_part}{sql_rest}"'.format(
                    prefix_tag='' if prefix_tag is None else prefix_tag,
                    sql_len=len(sql1),
                    sql_part=sql2,
                    sql_rest='...' if len(sql1) > len(sql2) else ''))
            cur.execute(sql)
            return cur
        except Exception as e:
            logger(__name__).error('bad SQL: {}'.format(e))
            traceback.print_exc()
def get_schema(dbconf, table, mod_name):
    if type(dbconf) is dict:
        logger(mod_name).info("get schema '{}.{}'".format(
            dbconf[DBKey.DB], table))
        sql = ("select column_name, ordinal_position, column_type"
               " from information_schema.columns"
               " where table_schema = '{database}'"
               " and table_name = '{table}'").format(
            database=dbconf[DBKey.DB], table=table)
        #logger(__name__).debug(sql)
        conn = get_connection(dbconf[DBKey.HOST], dbconf[DBKey.PORT],
                              dbconf[DBKey.USER], dbconf[DBKey.PSWD],
                              dbconf[DBKey.DB], dbconf[DBKey.CHARSET],
                              mod_name)
        cur = conn.cursor()
        cur.execute(sql)
        # result: { column_name: {'order': ordinal, 'column type': type} }
        result = dict()
        for cn, order, ct in cur:
            result[cn] = dict([('order', order), ('column type', ct)])
        conn.close()
        logger(mod_name).info('{} column{}'.format(
            len(result), '' if len(result) < 2 else 's'))
        return result
def _partition_agg_columns(dbcols, aggcols_spec, dropping_cols, owner, tblname):
    _aggcols_all = set(aggcols_spec['all'].keys())
    if len(_aggcols_all) == 0:
        logger(__name__).warning(
            'skip {}.{}: no aggregation columns'.format(owner, tblname))
        return None
    _first_col = next(iter(aggcols_spec['all'].values()))
    if _first_col['ID'] == '':
        logger(__name__).warning('skip {}.{}: no mapping ID'.format(
            owner, tblname))
        return None
    result = dict()
    for case in aggcols_spec:
        # drop spec columns that no longer exist in the database table
        for k in (set(aggcols_spec[case].keys()) - dbcols):
            aggcols_spec[case].pop(k)
        first_col = next(iter(aggcols_spec[case].values()))
        aggcols = aggcols_spec[case]
        select = first_col['select']
        group_by = first_col['group']
        restcols = set([x for x in (dbcols - set(aggcols.keys())
                                    - set(group_by.split(','))
                                    - set(select.split(','))
                                    - dropping_cols)
                        if not x.endswith('_ID')])
        result[case] = dict()
        result[case]['aggregation'] = aggcols
        result[case]['select'] = select
        result[case]['group by'] = group_by
        result[case]['rest'] = restcols
    return result
def get_connection(host, port, user, password, database, charset, mod_name):
    logger(mod_name).info('trying to connect to host {}, database {}'.format(
        host, database))
    count = 2
    while True:
        try:
            conn = pymysql.connect(host=host, user=user, passwd=password,
                                   db=database, port=port, charset=charset,
                                   connect_timeout=30, local_infile=True)
            return conn
        except Exception:
            traceback.print_exc()
            if count <= 1:
                raise
            logger(mod_name).info(
                'trying to re-connect to host {}, database {}'.format(
                    host, database))
            count = count - 1
def set_config(cusid, tech, key, value, mod_name):
    path = wic.find_DB_config_file_path()
    json_data = File.load_JSON(path, mod_name)
    if key is not None:
        m = re.match(r'(\w+)[.](\w+)', key)
        if m is None:
            key1 = key.upper()
            if key1 in json_data and type(json_data[key1]) is not dict:
                json_data[key1] = value
                File.dump_JSON(path, json_data, mod_name)
        else:
            key1, key2 = m.group(1).upper(), m.group(2).upper()
            if key1 in json_data and key2 in json_data[key1]:
                if key2 == 'TABLE' and key1 in ['PM', 'CM', 'DC']:
                    logger(__name__).warning(
                        '{}.TABLE: fixed to any table ("*")'.format(key1))
                    json_data[key1][key2] = '*'
                elif key2 == 'PORT':
                    json_data[key1][key2] = int(value)
                else:
                    json_data[key1][key2] = value
                File.dump_JSON(path, json_data, mod_name)
def get_current():
    confpath = Default.CONFIG_PATH.joinpath('{}/current'.format(
        Default.CONFIG_FOLDER))
    cusid, tech = None, None
    line = None
    if confpath.exists():
        with open(str(confpath), 'r') as fo:
            for ln in fo:
                line = pathlib.Path(ln.rstrip())
                break
    if line is None:
        logger(__name__).info('not found: "{}"'.format(str(confpath)))
    elif line.parent.parent == confpath.parent:
        cusid, tech = line.parent.name, line.name
    else:
        logger(__name__).info('bad current: "{}"'.format(str(line)))
        _find_current_candidate(confpath.parent)
    return cusid, tech
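# The 'current' file is expected to hold a single path two levels under the
# config directory, i.e. <config dir>/<cusid>/<tech>; the parent.parent check
# above rejects anything else. Hypothetical example:
#   .../config/cus01/LTE  ->  ('cus01', 'LTE')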
def get_data_set(dbconf, task_name, SQL, rec_proc, mod_name):
    if (type(dbconf) is dict and Util.is_function(rec_proc)
            and Util.get_arity(rec_proc) == 1):
        logger(mod_name).info('fetching data set "{}"'.format(task_name))
        conn = get_connection(dbconf[DBKey.HOST], dbconf[DBKey.PORT],
                              dbconf[DBKey.USER], dbconf[DBKey.PSWD],
                              dbconf[DBKey.DB], dbconf[DBKey.CHARSET],
                              mod_name)
        cur = conn.cursor()
        logger(__name__).debug('SQL: {}'.format(SQL))
        cur.execute(SQL)
        logger(__name__).debug('done')
        i = -1
        for i, x in enumerate(cur):
            rec_proc(x)
        conn.close()
        logger(mod_name).info('{} record{}'.format(
            i + 1, '' if i < 1 else 's'))
        # rec_proc doubles as an accumulator: called without an argument it
        # is expected to return the collected data set
        return rec_proc()
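# Usage sketch (hypothetical, assuming Util.get_arity counts the defaulted
# parameter as arity 1): an accumulator-style rec_proc that collects each
# record and returns the whole set on the final no-argument call.
def _example_collector():
    rows = list()

    def rec_proc(rec=None):
        if rec is not None:
            rows.append(rec)
        return rows

    return rec_proc

# data = get_data_set(dbconf, 'cells', 'select * from cells',
#                     _example_collector(), __name__)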
def extract(cusid, tech, date):
    if type(date) is datetime.date:
        logger(__name__).info('extracting {} CO'.format(str(date)))
        #ymd = '{:%Y%m%d}'.format(date)
        ei = Common.extract_info(cusid, tech, date, _cat, __name__)
        for i, (LRC, zippath, filename) in enumerate(ei):
            # dry-run variant:
            # Common.perform_extraction(cusid, tech, date, _cat, LRC, zippath,
            #                           filename, None, __name__)
            etl_agent = ETLAgent(get_etl_agent_header(cusid, tech),
                                 {'CO_OCV_SYS_VERSION': 'CO_SYS_VERSION'},
                                 delimiter=RESTRICT.DELIMITER,
                                 mod_name=__name__)
            outpath = Common.get_default_output_path(cusid, tech, date, _cat,
                                                     filename, __name__)
            Folder.create(outpath.parent, __name__)
            etl_agent.add(adapter=LineExtractor(
                get_li_ex_header(cusid, tech, LRC, filename), outpath,
                RESTRICT.DELIMITER, get_line_proc(LRC), __name__),
                name='default')
            Common.perform_extraction(cusid, tech, date, _cat, LRC, zippath,
                                      filename, etl_agent, __name__)
            etl_agent.clean()
            del etl_agent