def desc_tmp_table(self, tbl, cols):
    d = {col[0]: col for col in cols}
    pfmtd([dict(Column=d[k][0], Data_type=d[k][1]) for k in sorted(d.keys())], tbl)
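# A minimal usage sketch (hypothetical values): `cols` is expected to be a list of
# (column_name, data_type, ...) tuples, as in the temp-PK load path further below, e.g.
#   self.desc_tmp_table('tmp_PK_G3Lookup',
#                       [('MartModifiedDate', 'timestamp'), ('MD5', 'char(22)')])
# which prints one row per column with its data type via pfmtd().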
def desc_cur(self, cur, colord=True):
    rows = {}
    for col in cur.description:
        # col[1] is a type object; str() yields e.g. "<class 'str'>", so take the quoted name.
        rows[col[0]] = [col[0], str(col[1]).split("'")[1], col[3]]
    out = []
    for k in (sorted(rows.keys()) if colord else rows.keys()):
        row = rows[k]
        d = collections.OrderedDict()
        for x, y in zip(['Column', 'Type', 'Length'], row):
            d[x] = y
        out.append(d)
    pfmtd(out, 'Procedure')
def load_day(a, mon, day=None):
    cli = get_cli(a)
    pars = params[a.table][0].format(**dict(CENTER=a.center,
                                            CLIENT=a.client,
                                            YEAR=a.year,
                                            MONTH=mon,
                                            DAY=day,
                                            EOM=a.day_to,
                                            BUNIT=a.bunit))
    pycli = cli.format(**dict(TABLE=a.table,
                              PARAM_CNT=len([p for p in pars.strip().split(' ') if p]),
                              DUMP='--dump' if a.dump else '--no-dump',
                              LAME_DUCK='-ld %d ' % a.lame_duck))
    cmd = '%s %s' % (pycli, pars)
    if not a.dry:
        pfmtd([dict(Command=os.linesep.join(cmd.split()))], a.table)
        pipe = subprocess.Popen([cmd],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True)
        line = pipe.stdout.readline()
        while line:
            print('OUTPUT:', line.strip())
            line = pipe.stdout.readline()
        line = pipe.stderr.readline()
        while line:
            print('ERROR:', line)
            line = pipe.stderr.readline()
        while pipe.poll() is None:
            print('Waiting...')
            time.sleep(1)
        if pipe.returncode != 0:
            print('returncode = %d' % pipe.returncode)
            e(pipe.returncode)
    else:
        print(cmd)
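# Sketch of the argument object load_day() expects (attribute names are taken from the
# accesses above; the values below are made up): an argparse-style namespace with
#   table, center, client, year, day_to, bunit, dump, lame_duck, dry
# e.g.  a = types.SimpleNamespace(table='trades', center='NY', client='acme',
#                                 year=2020, day_to=31, bunit='fx',
#                                 dump=False, lame_duck=0, dry=True)
#       load_day(a, mon=1, day=15)   # dry run: only prints the assembled command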
def desc_table(self, schema, tbl, col_ord=None):
    stmt = """
        SELECT cname, coltype, nulls, colno, length, in_primary_key as in_pk
        FROM sys.syscolumns
        WHERE creator='%s' AND tname='%s'
        ORDER BY %s""" % (schema, tbl,
                          'cname' if not col_ord else '%s desc' % col_ord)
    #psql(stmt)
    self.cur.execute(stmt)
    out = []
    rows = {row[1 if col_ord else 0]: row for row in self.cur.fetchall()}
    for k in sorted(rows.keys()):
        row = rows[k]
        d = collections.OrderedDict()
        for x, y in zip([col[0] for col in self.cur.description], row):
            d[x] = y
        out.append(d)
    pfmtd(out, '%s.%s' % (schema, tbl))
def desc_table(self, schema, tbl, col_ord=None):
    stmt = """
        SELECT ordinal_position as id, column_name, data_type,
               data_type_length as dt_len, is_nullable as nullable,
               column_default as default
        FROM v_catalog.columns
        WHERE table_schema='%s' AND table_name='%s'
        ORDER BY %s""" % (schema, tbl,
                          'ordinal_position' if not col_ord else '%s desc' % col_ord)
    self.cur.execute(stmt)
    #psql(stmt)
    out = []
    rows = {row[0 if col_ord else 1]: row for row in self.cur.fetchall()}
    for k in sorted(rows.keys()):
        row = rows[k]
        d = OrderedDict()
        for x, y in zip([col[0] for col in self.cur.description], row):
            d[x] = y
        out.append(d)
    pfmtd(out, '%s.%s' % (schema, tbl))
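# Note: v_catalog.columns is Vertica's column catalog, so this desc_table() variant
# appears to target a Vertica connection (the sys.syscolumns variant above appears to
# target Sybase/SAP IQ). Hypothetical call: self.desc_table('dbo', 'MyTable') lists
# columns by ordinal_position; col_ord='column_name' lists them by name, descending.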
def fetch_many(self, chunk_size, source, qname, out, skip_header, terminate_line=False):
    assert chunk_size
    cli = self.cli
    chunk_size = self.cli.lame_duck if self.cli.lame_duck and chunk_size > self.cli.lame_duck else chunk_size
    assert chunk_size
    tf = "%Y-%m-%d.%H_%M_%S"
    current_ts = time.strftime(tf)
    id = 0
    stmt = self.get_query(source, qname)
    log.debug(stmt)
    if not hasattr(self, 'cur') or not self.cur:
        self.cur = self.conn.cursor()
    cur = self.cur
    psql(' \n'.join(stmt.replace(',', ', ').split()), 'Extractor cmd')
    #e()
    cur.execute(stmt)
    cols = [c[0] for c in cur.description]
    total_read = 0
    if 1:
        apx_cmap, apx_cols, apx = cli.get_appendix2()
    header = None
    first = True
    is_apx = []
    start_time = time.time()
    while True:
        print('Elapsed [%d] PRE fetch: %s' % (id, time.time() - start_time))
        start_time = time.time()
        out.data = []
        if self.cli.lame_duck and self.cli.lame_duck <= total_read:
            break
        #decrease chunk size
        if self.cli.lame_duck and self.cli.lame_duck - total_read < chunk_size:
            chunk_size = self.cli.lame_duck - total_read
        fetch_time = time.time()
        rows = cur.fetchmany(chunk_size)
        print('Elapsed [%d] FMANY: %s' % (id, time.time() - fetch_time))
        print(len(rows))
        #e()
        data = []
        append_time = time.time()
        if rows:
            for row in rows:
                d = []
                for x in row:
                    if x is None:
                        d.append(b'')
                        continue
                    if isinstance(x, datetime.date) or isinstance(x, datetime.datetime):
                        d.append(str(x).encode('utf-8'))
                        continue
                    if isinstance(x, int) or isinstance(x, float):
                        d.append(repr(x))
                        continue
                    if sys.version_info[0] < 3:
                        d.append(x)
                    else:
                        d.append(x.encode())
                if apx:
                    #pp(d)
                    #print len(d), len(d+apx.split(cli.csep)), apx
                    #e()
                    cols = cols + apx_cols
                    is_apx = ['N'] * len(d) + ['Y'] * len(apx_cols)
                    d = d + apx.split(cli.csep.decode())
                    # append an empty trailing field only when terminate_line is set
                    data.append(d + ([''] if terminate_line else []))
                    #data.append('^'.join(str(v) for v in d+apx))
                else:
                    if 1:
                        is_apx = ['N'] * len(d)
                        data.append(d)
                        #header = [col[:2] for ]
                    else:
                        assert 3 == 2
                        data.append('^'.join(str(v) for v in d) + os.linesep)
                if first:
                    pfmtd([dict(Col=col, Row=d[i], Appendix=is_apx[i])
                           for i, col in enumerate(cols)], 'First row')
                    first = False
                    #e()
        else:
            break
        out.data = data
        print('Elapsed [%d] APPEND: %s' % (id, time.time() - append_time))
        out.chunk_id, out.current_ts, out.actor = id, current_ts, self.cln
        if not data:
            break
        print('Elapsed [%d] POST fetch: %s' % (id, time.time() - start_time))
        yield out
        id += 1
        total_read += len(data)
def fetch_row(self, cur, source, qname, out, skip_header, terminate_line=False):
    cols = [c[0] for c in cur.description]
    total_read = 0
    cli = self.cli  # assumed binding, mirroring fetch_many(); `cli` was referenced but never bound here
    if 1:
        apx_cmap, apx_cols, apx = cli.get_appendix2()
    header = None
    first = True
    is_apx = []
    start_time = time.time()
    if 1:
        print('Elapsed PRE fetch: %s' % (time.time() - start_time))
        start_time = time.time()
        out.data = []
        fetch_time = time.time()
        row = cur.fetchone()
        rid = 0
        while row:
            #if rows:
            #for row in rows:
            d = []
            for x in row:
                if x is None:
                    d.append(b'')
                    continue
                if isinstance(x, datetime.date) or isinstance(x, datetime.datetime):
                    d.append(str(x).encode('utf-8'))
                    continue
                if isinstance(x, int) or isinstance(x, float):
                    d.append(repr(x))
                    continue
                if sys.version_info[0] < 3:
                    d.append(x)
                else:
                    d.append(x.encode())
            if apx:
                #pp(d)
                #print len(d), len(d+apx.split(cli.csep)), apx
                #e()
                cols = cols + apx_cols
                is_apx = ['N'] * len(d) + ['Y'] * len(apx_cols)
                d = d + apx.split(cli.csep.decode())
                # yield the row, with an empty trailing field only when terminate_line is set
                yield d + ([''] if terminate_line else [])
                #data.append(d+[''] if terminate_line else [])
                #data.append('^'.join(str(v) for v in d+apx))
            else:
                if 1:
                    is_apx = ['N'] * len(d)
                    yield d
                    #data.append(d)
                    #header = [col[:2] for ]
                else:
                    assert 3 == 2
                    yield '^'.join(str(v) for v in d) + os.linesep
                    #data.append('^'.join(str(v) for v in d)+os.linesep)
            if first:
                pfmtd([dict(Col=col, Row=d[i], Appendix=is_apx[i])
                       for i, col in enumerate(cols)], 'First row')
                first = False
                #e()
            row = cur.fetchone()
            rid += 1
        print('Elapsed POST fetch: %s' % (time.time() - start_time))
def describe(self):
    pfmtd([dict(Column=x) for x in self.get_header()],
          'File header: %s' % self.file_name)
def run():
    stats = {}
    total_ins = 0
    term_line = True
    #//validate cols
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        for _trg_class, val in cli.cfg['target'][_source].items() or []:
            cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
            if tcfg.get('accountingDate', None):
                #//set acct_year, acct_mon for new target table naming
                fmt = cli.get_parsed(ckey='accountingDateFmt', cfg=tcfg)
                cli.set_target_table(tcfg=tcfg,
                                     acct_date=cli.get_parsed(ckey='accountingDate', cfg=tcfg),
                                     fmt=fmt)
            _dbname = tcfg["targetDb"]
            toDB = create_writer(aname=_trg_class, app_init=app_init)
            toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
            table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
            toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
            #// validate cols
            cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']]
            tcols = toDB.get_cols()
            t_vs_c = set(tcols) - set(cfg_cols)
            c_vs_t = set(cfg_cols) - set(tcols)
            if t_vs_c:
                pfmtd([dict(c_vs_t=c_vs_t)], 'Config has columns missing in target table.')
                raise Exception('Target table has columns missing in config: %s' % t_vs_c)
            if c_vs_t:
                pfmtd([dict(t_vs_c=t_vs_c)], 'Target table has columns missing in config.')
                raise Exception('Config has columns missing in target table: %s' % c_vs_t)
            toDB.commit_transaction(trans=to_conn)
    #//transfer
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        _dbname = cli.scfg["sourceDb"]
        #// in include/extractor
        fromDB = create_reader(aname=_src_class, app_init=app_init)
        fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
        if 1:  #//Extract to File
            for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
                FileWriter = create_writer(aname=_dmp_class, app_init=app_init)
                fromDB.set_loader(FileWriter)
                cli.dcfg = cli.get_dcfg(_dmp_class)
                for _trg_class, val in cli.cfg['target'][_source].items() or []:
                    cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                    file_ins_cnt = 0
                    FileWriter.open_file(out=dump_file)
                    start_time = time.time()
                    # //if fetch_many is not in IQ - it's in include/extractor/common/Extractor.py
                    for iq_data in fromDB.fetch_many(chunk_size=file_size_rows,
                                                     source=cli.scfg,
                                                     qname='sourceStmt',
                                                     out=InOut(),
                                                     skip_header=0,
                                                     terminate_line=term_line):
                        if 1:
                            if not file_ins_cnt:
                                FileWriter.create_header(file=dump_file,
                                                         header=fromDB.get_header(),
                                                         cfg=cli.dcfg,
                                                         terminate_line=term_line)
                            FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg)
                            file_ins_cnt += len(iq_data.data)
                            FileWriter.terminate(file=dump_file)
                            print(len(iq_data.data))
                            print('Elapsed read/write: %s' % (time.time() - start_time))
                            start_time = time.time()
                    if not file_ins_cnt:  #in case there's no data
                        FileWriter.create_header(file=dump_file,
                                                 header=fromDB.get_header(),
                                                 cfg=cli.dcfg,
                                                 terminate_line=term_line)
                    #else:
                    #    FileWriter.terminate(file=dump_file)
                    FileWriter.close_file(file=dump_file)
                    total_ins += file_ins_cnt
        fromDB.desc_cur(cur=from_conn.cur, colord=False)
        fromDB.commit_transaction(trans=from_conn)
    log.info('Total records saved: %d' % total_ins)
    #// Load to IQ
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)
        if 1:  #//Get the file names
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            DirReader.glob_dir(path=path, out=data_files, ext='*.*')
        if 1:  #//Load to DB
            for _trg_class, val in cli.cfg['target'][_source].items() or []:
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                toDB = create_writer(aname=_trg_class, app_init=app_init)
                toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
                toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
                #// validate cols
                cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']]
                acols = cli.get_alt_cols(scfg)
                tcols = toDB.get_cols()
                fcols_alt = []
                for data_file in data_files.file_names:
                    dataFile = create_reader(aname='File', app_init=app_init,
                                             file_name=data_file, scfg=dir_scfg)
                    dataFile.describe()
                    file_stats[data_file] = dataFile.line_count() - cli.header_size(dir_scfg)
                    fcols_alt = [acols.get(x.decode(), x.decode())
                                 for x in dataFile.get_header(data_file, dir_scfg)]
                    f_vs_c = set(fcols_alt) - set(cfg_cols)
                    c_vs_f = set(cfg_cols) - set(fcols_alt)
                    f_vs_t = set(fcols_alt) - set(tcols)
                    t_vs_f = set(tcols) - set(fcols_alt)
                    if f_vs_c:
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Target table has columns missing in config: %s' % f_vs_c)
                    if c_vs_f:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Config has columns missing in target table: %s' % c_vs_f)
                    if f_vs_t:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Dump file has columns missing in target table: %s' % f_vs_t)
                    if t_vs_f:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        raise Exception('Target table has columns missing in dump file: %s' % t_vs_f)
                #toDB.truncate_table ( table = table )
                toDB.bulk_load(trans=to_conn, file_names=data_files, qname='insertStmt',
                               cfg=(dir_scfg, tcfg), out=insert_stats, header=fcols_alt)
                toDB.commit_transaction(trans=to_conn)
                for k in file_stats.keys():
                    assert file_stats[k], 'Dump file is empty'
                    assert insert_stats[k] not in [-1], 'Insert failed'
                    assert insert_stats[k] == file_stats[k], \
                        'Insert vs file count diff: %s<>%s for file \n%s' % (insert_stats[k], file_stats[k], k)
                if 1:
                    stmt = cli.get_parsed(ckey='afterCountStmt', cfg=tcfg)
                    cur = toDB.exec_query(stmt)
                    after_cnt = cur.fetchall()[0][0]
                    print(after_cnt)
                stats['%s->%s' % (_source, _trg_class)] = st = OrderedDict()
                st['source_cnt'] = total_ins
                st['total_inserted'] = sum(insert_stats.values())
                st['after_count'] = after_cnt
                st['rollback'] = cli.get_parsed(ckey='rollbackStmt', cfg=tcfg)
                st['purge'] = cli.get_parsed(ckey='purgeStmt', cfg=tcfg)
                if 1:  #//validate
                    try:
                        assert st['source_cnt'] == st['total_inserted'], \
                            "source_cnt %s <> total_inserted %s" % (st['source_cnt'], st['total_inserted'])
                        assert st['source_cnt'] == st['after_count'], \
                            "source_cnt %s <> after_count %s" % (st['source_cnt'], st['after_count'])
                    except Exception as ex:
                        del_cnt = toDB.exec_dml(dml=st['rollback'], trans=to_conn, commit=True)
                        log.info('Rolled back recs: %d' % del_cnt)
                        raise
                if 1:  #//purge
                    purge_cnt = toDB.exec_dml(dml=st['purge'], trans=to_conn, commit=True)
                    log.info('Purged old recs: %d' % purge_cnt)
                toDB.commit_transaction(trans=to_conn)
    if 0:
        Email.send_email(**email_args)
def insert_RC_data(self, trans, target, source, stmt, insert_stats, skip_header=0):
    pipe = source.pipe
    skip = str(skip_header).strip()
    if skip_header is not None:
        skip = str(skip_header).strip()
        assert str(skip).strip() in ['0', '1'], \
            'skip_header [%s] should be "0" or "1"' % str(skip).strip()
        if str(skip) == '1':
            pipe.readline()
    assert pipe
    start_time = time.time()
    xref = self.cli.tcfg["columnMap"]
    cols = [v[0] for _, v in xref.items()]
    cli.to_cols = ',\n'.join(cols)
    cli.to_quotes = ','.join([x for x in '?' * len(cols)])
    assert cli.to_cols
    sql = self.get_query(target, stmt)
    #cnxn = pyodbc.connect(conn_str, autocommit=True)
    trans.conn.set_attr(pyodbc.SQL_ATTR_TXN_ISOLATION, pyodbc.SQL_TXN_SERIALIZABLE)
    trans.conn.autocommit = False
    cur = trans.conn.cursor()
    fline = line = pipe.readline()
    self.rows = rows = []
    #pp(xref)
    apx = {x[0]: x[2] for x in xref.values() if len(x) == 3}
    apx = {x: cli.get_parsed(ckey=x, cfg=apx) for x, v in apx.items()}
    #ppe(fline)
    ext_c = list(set(xref.keys()) - set(fline.keys()))
    if ext_c:
        log.warn('Config has extra columns missing in REST')
        #pfmtd([dict(Id=k, DB_ColName=v) for k, v in enumerate(list(sorted(ext_c)))], 'Defaulting these to nulls')
    ext_l = list(set(fline.keys()) - set(xref.keys()))
    if ext_l:
        log.warn('REST has extra columns missing in DB')
        #pfmtd([dict(Id=k, RESR_Col=v) for k, v in enumerate(ext_l)], 'Extra cols in REST')
        #pp(ext_l)
    ignore = ([u'signOffVersion', u'signOffTime', u'RBDate', u'asofDate'] +
              [u'DataSource', u'GPOSMATTol'] +
              [u'CCY', u'DEPolicy', u'Price', u'UnrealizedPnL', u'Fund', u'RawUnrealizedPnL', u'SwapType'] +
              [u'SettlementDate'] +
              [u'BuySell', u'IndependentAmount', u'ConfirmStatus', u'RefEntityName', u'ReferenceOb',
               u'CounterpartyRefID', u'CDSType', u'TerminationDateUnadjusted', u'TerminationDateAdjusted',
               u'StandardRefObligation', u'FixedRate'] +
              [u'MaturityDate', u'StrikePrice', u'IsSpot'] +
              [u'Symbol', u'VolatilityStrike'] +
              [u'Direction', u'MaturityDateUnadjusted', u'TradeCurrency', u'ProductType', u'UnderlyingSecurity'] +
              [u'MaturityTenor', u'PaymentDate', u'CAP_FLOOR', u'MaturityDateAdjusted'] +
              [u'IsElectronicallyConfirmed', u'Classification'] +
              [u'FloatingRateIndex'] + [u'IsTodayResetDate'] +
              [u'FloatRateIndexRec', u'IndexTenorRec', u'IsOldED', u'DayCountFractionPay',
               u'DayCountFractionRec', u'PaymentFrequencyPay', u'PaymentFrequencyRec', u'RollDate'] +
              [u'CCP', u'CCPConfirmRefId'] +
              [u'IndexTenorPay', u'SpreadPay', u'FloatRateIndexPay'] +
              [u'TerminationDate', u'FloatingIndex', u'StartFlow', u'CptyRefID'] +
              [u'Country'] +
              [u'Barrier1Strike', u'Barrier1CCYPair', u'bdi', u'Barrier2Strike', u'Barrier2CCYPair'] +
              [u'PutCall', u'UnderlyingSymbol', u'OptionStyle'] +
              [u'TerminationDateUnderlyingUnadjusted', u'CallPut', u'PayReceive'] +
              [u'TerminationDateUnderlyingAdjusted'] + [u'ProceedsNotional'] +
              [u'ContractType', u'ExecutingAccount'] + [u'SSGClientNote'] + [u'Issuer'])
    while line:
        line.update(apx)
        ext_s = set(line.keys()) - set(xref.keys())
        #pp(ext_s)
        if ext_s - set(ignore):
            pfmtd([dict(Id=k, REST_Col=v)
                   for k, v in enumerate(list(ext_s - set(ignore)))], 'Extra cols in REST/IGNORE')
            pp(list(ext_s - set(ignore)))
            ignore = ignore + list(ext_s - set(ignore))
        #rows.append([str(line[x]) if xref[x][1] in ['varchar'] else float(line[x]) if xref[x][1] in ['varchar'] else line[x] for x in xref if x not in ext])
        line = pipe.readline()
    print(123)
    e()
    chunk = 3
    total = 0
    cid = 0
    psql(sql, 'Insert')
    if not rows:
        raise Exception('No data in REST pipe.')
    else:
        ignore_cols = target["ignoreSourceColumns"]
        if not len(fline) == len(rows[0]) + len(ignore_cols):
            pp(fline)
            pp(rows[0])
            raise Exception('line %s <> row %s not in xref:%s, not in source:%s' %
                            (len(fline), len(rows[0]),
                             set(fline.keys()) - set(xref.keys()),
                             set(xref.keys()) - set(fline.keys())))
        pfmtd([dict(Col=col, Row=rows[0][i])
               for i, col in enumerate([col for col in xref])], 'First row')
    while total < len(rows):
        cur.fast_executemany = True
        data = rows[total:][:chunk]
        #ppe(data)
        cur.executemany(sql, data)
        cur.execute("ROLLBACK")
        trans.conn.rollback()
        ins = len(data)
        total += ins
        cid += 1
        log.info('[{}] [{}] {}: Running: {:,.0f}, Rows: {:,.0f}'.format(
            self.objtype, cid, self.cln, total, ins))
    log.info('[{}]: {}: Inserted: {:,.0f}, To-Schema:{}, To-Table:{}, Skipped: {}, Elapsed: {}'.format(
        self.objtype, self.cln, len(rows), target['targetSchema'], target["targetTable"],
        skip, round((time.time() - start_time), 2)))
    pipe.close()
    insert_stats.inserted_cnt = total
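# A sketch of what the configured insert statement is assumed to look like for this
# writer (the real text comes from self.get_query(target, stmt), i.e. from config):
# a parameterized INSERT whose placeholder list lines up with cli.to_quotes built above,
#   INSERT INTO <targetSchema>.<targetTable> (ColA, ColB, ColC) VALUES (?, ?, ?)
# pyodbc's cur.fast_executemany + cur.executemany(sql, data) then binds each chunk of
# rows to the ?-placeholders as a single batched execute.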
def load_file(self, trans, file_obj, schema, table_name, qname, fmt_cols, cfg,
              skip=0, apx=None, stats=None):
    scfg, tcfg = cfg
    file_name = file_obj.file_name
    assert os.path.isfile(file_name)
    if 1:
        colsep = scfg['columnDelimiter']
        assert colsep
    lcnt = file_obj.line_count(file_name)
    if 1:
        pp(file_obj.cols)
        #cols = ','.join([col.decode() for col in file_obj.cols])
        #pp(cols)
        trans.conn.autocommit = False
        copyfmt = ',\n'.join([
            "%s FORMAT 'hex'" % col[0] if col[0] in fmt_cols else "%s" % col
            for col in file_obj.cols_alt
        ])
        assert os.path.isfile(file_obj.file_name)
        stmt = """
        COPY %s.%s (%s )
        FROM LOCAL '%s' DELIMITER '|' ESCAPE AS '^' NULL ''
        SKIP %d ABORT ON ERROR NO COMMIT
        """ % (schema, table_name, copyfmt, file_obj.file_name, skip)
        try:
            self.desc_table(schema, table_name)
            psql(stmt, 'Load')
            trans.cur.execute(stmt)
        except:
            trans.conn.rollback()
            psql(stmt)
            raise
        accepted, rejected = trans.cur.execute(
            'SELECT GET_NUM_ACCEPTED_ROWS(),GET_NUM_REJECTED_ROWS()').fetchall()[0]
        pfmtd([dict(Line_count=lcnt - skip, Accepted=accepted, Rejected=rejected)],
              'Load stats')
        assert lcnt - skip == accepted
        out = OrderedDict()
        out['table_name'] = table_name
        out['accepted'] = accepted
        out['rejected'] = rejected
        out['linecount'] = lcnt
        out['skip'] = skip
        out['diff'] = lcnt - skip - accepted
        stats[table_name] = out
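# For reference, a hypothetical rendering of the COPY statement built above
# (schema, table and file path are made up; fmt_cols is empty in the caller below,
# so no column gets FORMAT 'hex'):
#   COPY dw.MyTable (ColA,
#   ColB )
#   FROM LOCAL '/data/dump/MyTable.out' DELIMITER '|' ESCAPE AS '^' NULL ''
#   SKIP 1 ABORT ON ERROR NO COMMIT
# GET_NUM_ACCEPTED_ROWS()/GET_NUM_REJECTED_ROWS() are then read on the same session
# to verify the load against the file's line count.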
def run():
    skip = 1
    total_ins = 0
    term_line = True
    #//validate cols
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        for _trg_class, val in cli.cfg['target'][_source].items() or []:
            cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
            _dbname = tcfg["targetDb"]
            toDB = create_writer(aname=_trg_class, app_init=app_init)
            toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
            table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
            toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
            #// validate cols
            cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']]
            tcols = toDB.get_cols()
            t_vs_c = set(tcols) - set(cfg_cols)
            c_vs_t = set(cfg_cols) - set(tcols)
            if t_vs_c:
                pfmtd([dict(c_vs_t=c_vs_t)], 'Config has columns missing in target table.')
                raise Exception('Target table has columns missing in config: %s' % t_vs_c)
            if c_vs_t:
                pfmtd([dict(t_vs_c=t_vs_c)], 'Target table has columns missing in config.')
                raise Exception('Config has columns missing in target table: %s' % c_vs_t)
            toDB.commit_transaction(trans=to_conn)
    #// transfer
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        _dbname = cli.scfg["sourceDb"]
        fromDB = create_reader(aname=_src_class, app_init=app_init)
        fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
        if 1:  #//Extract to Dir
            for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
                FileWriter = create_writer(aname=_dmp_class, app_init=app_init)
                fromDB.set_loader(FileWriter)
                cli.dcfg = cli.get_dcfg(_dmp_class)
                for _trg_class, val in cli.cfg['target'][_source].items() or []:
                    cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                    file_ins_cnt = 0
                    FileWriter.open_file(out=dump_file)
                    for iq_data in fromDB.fetch_many(chunk_size=file_size_rows,
                                                     source=cli.scfg,
                                                     qname='sourceStmt',
                                                     out=InOut(),
                                                     skip_header=0,
                                                     terminate_line=term_line):
                        if not file_ins_cnt:
                            FileWriter.create_header(file=dump_file,
                                                     header=fromDB.get_header(),
                                                     cfg=cli.dcfg,
                                                     terminate_line=term_line)
                        FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg)
                        file_ins_cnt += len(iq_data.data)
                    if not file_ins_cnt:  #in case there's no data
                        FileWriter.create_header(file=dump_file,
                                                 header=fromDB.get_header(),
                                                 cfg=cli.dcfg,
                                                 terminate_line=term_line)
                    FileWriter.close_file(file=dump_file)
                    total_ins += file_ins_cnt
        fromDB.desc_cur(cur=from_conn.cur, colord=False)
        fromDB.commit_transaction(trans=from_conn)
    log.info('Total records saved: %d' % total_ins)
    #// Load to IQ
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)
        if 1:  #//Get the file names
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            DirReader.glob_dir(path=path, out=data_files, ext='*.*')
        if 1:  #//Load to DB
            for _trg_class, val in cli.cfg['target'][_source].items() or []:
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                toDB = create_writer(aname=_trg_class, app_init=app_init)
                toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
                toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=None)
                #// validate cols
                cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']]
                acols = cli.get_alt_cols(scfg)
                tcols = toDB.get_cols()
                fcols_alt = []
                for data_file in data_files.file_names:
                    dataFile = create_reader(aname='File', app_init=app_init,
                                             file_name=data_file, scfg=dir_scfg)
                    dataFile.describe()
                    file_stats[data_file] = dataFile.line_count() - cli.header_size(dir_scfg)
                    fcols_alt = [acols.get(x.decode(), x.decode())
                                 for x in dataFile.get_header(data_file, dir_scfg)]
                    f_vs_c = set(fcols_alt) - set(cfg_cols)
                    c_vs_f = set(cfg_cols) - set(fcols_alt)
                    f_vs_t = set(fcols_alt) - set(tcols)
                    t_vs_f = set(tcols) - set(fcols_alt)
                    if f_vs_c:
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Target table has columns missing in config: %s' % f_vs_c)
                    if c_vs_f:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Config has columns missing in target table: %s' % c_vs_f)
                    if f_vs_t:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Dump file has columns missing in target table: %s' % f_vs_t)
                    if t_vs_f:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        raise Exception('Target table has columns missing in dump file: %s' % t_vs_f)
                if 1:
                    for data_fn in [x for x in data_files.file_names]:
                        dataFile = create_reader(aname="File", app_init=app_init,
                                                 file_name=data_fn, scfg=dir_scfg)
                        dataFile.describe()
                        fileCols = [col.decode() for col in dataFile.get_header_cols()]
                        tbl = tcfg["targetTable"]  #tcfg. os.path.basename(data_fn).split('.')[-2]
                        assert tbl
                        if 1:
                            if 0 and tbl not in do_not_delete:
                                stmt = 'DELETE FROM %s WHERE %s in (SELECT t.%s FROM %s t)' % (
                                    tbl, masterTblCol, masterTblCol, masterTbl)
                                deleted[tbl] = toDB.exec_dml(stmt, trans=to_conn, commit=False)
                                pfmt([[deleted[tbl]]], ['Deleted from %s' % tbl])
                            else:
                                deleted[tbl] = -1
                            if 0:
                                acols = cli.get_alt_cols(scfg)
                                dataFile.cols_alt = [acols.get(x.decode(), x.decode())
                                                     for x in dataFile.cols]
                            else:
                                dataFile.set_alt_cols()
                            missing_cols = list(set(dataFile.cols_alt) - set(tcols))
                            pfmt([(tbl, x) for x in missing_cols], ['Table', 'Missing columns'])
                            schema = tcfg["targetSchema"]
                            if missing_cols:
                                pfmt([[x] for x in missing_cols], ['Columns in Source, but not Target'])
                                to_conn.conn.rollback()
                                toDB.desc_table(schema, tbl)
                                raise Exception('File column %s missing in table "%s".' % (missing_cols, tbl))
                            if 1:
                                apx = {}
                                fmt_cols = []
                                toDB.load_file(trans=to_conn, file_obj=dataFile, schema=schema,
                                               table_name=tbl, qname='insertStmt',
                                               fmt_cols=fmt_cols, cfg=(dir_scfg, tcfg),
                                               skip=skip, apx=apx, stats=stats)
                                loaded[data_fn] = tbl
                            else:
                                not_loaded[data_fn] = tbl
                    else:
                        # for/else: runs once the per-file loop completes without a break
                        if 1:
                            toDB.commit_transaction(trans=to_conn)
                            pfmt([[k] + [deleted[k]] + list(v)[1:]
                                  for k, v in stats.items() if deleted[k] >= 0],
                                 ['Table', 'Deleted', 'Accepted', 'Rejected',
                                  'Line count', 'Skip', 'Diff'],
                                 'Load completed (deleted)'.upper())
                            pfmt([(k, v) for k, v in loaded.items()],
                                 ['Loaded Files', 'Loaded Tables'])
                            pfmt([(k, v) for k, v in not_loaded.items()],
                                 ['Not loaded Files', 'Not loaded Tables'])
                            e()
                if 0:
                    #toDB.truncate_table ( table = table )
                    toDB.bulk_load(trans=to_conn, file_names=data_files, qname='insertStmt',
                                   cfg=(dir_scfg, tcfg), out=insert_stats)
                    for k in file_stats.keys():
                        assert insert_stats[k] == file_stats[k], \
                            'Insert vs file count diff: %s<>%s for file \n%s' % (
                                insert_stats[k], file_stats[k], k)
                    toDB.commit_transaction(trans=to_conn)
    if 0:
        Email.send_email(**email_args)
def run():
    skip = 2
    do_not_load = []
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)
        cli.set_source(_source)
        dir_scfg = cli.get_dcfg(_src_class)
        path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
        ok_files = InOut(file_names=[])
        DirReader.glob_dir(path=path, out=ok_files, ext='*.ok')
        loaded = {}
        for _trg_class, val in cli.cfg['target'][_source].items():
            cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
            _dbname = tcfg["targetDb"]
            toDB = create_writer(aname=_trg_class, app_init=app_init)
            do_not_delete = tcfg['doNotDeleteTables']
            do_not_load = tcfg['doNotLoadTables']
            to_conn = InOut()
            toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
            toSchema = tcfg['targetSchema']
            stmt = 'set search_path to %s' % toSchema
            psql(stmt)
            to_conn.cur.execute(stmt)
            pkstats = {}
            for okfn in ok_files.file_names:
                okFile = create_reader(aname='File', app_init=app_init,
                                       file_name=okfn, scfg=dir_scfg)
                okdir, okname = os.path.splitext(okfn)
                okbn = os.path.basename(okdir)
                out_files = InOut(file_names=[])
                DirReader.glob_dir(path=okdir, out=out_files, ext='*.out')
                #e()
                if 1:  # Check if there are files missing in config
                    ftlist = []
                    for out_fn in out_files.file_names:
                        print(out_fn)
                        ftlist.append(os.path.basename(out_fn).split('.')[1])
                    pfmt([[x] for x in ftlist], ['Files->Tables'])
                    ctables = cli.tcfg['targetTables'].keys()
                    extra_file_tables = list(set(ftlist) - set(ctables))
                    pfmt([[x] for x in extra_file_tables], ['Tables not in config.'])
                    extra_config_tables = list(set(ctables) - set(ftlist))
                    pfmt([[x] for x in extra_config_tables], ['Tables in config but not in file names.'])
                    assert not extra_file_tables, \
                        'Tables %s are not listed in config["targetTables"].' % extra_file_tables
                for outfn in out_files.file_names:  # Master first
                    outFile = create_reader(aname='File', app_init=app_init,
                                            file_name=outfn, scfg=dir_scfg)
                    outbn = os.path.basename(outfn)
                    tbl = outbn.split('.')[1]
                    outTbl = 'tmp_PK_%s' % tbl
                    outCols = outFile.get_header_cols()
                    apxCols = [('MartModifiedDate', 'timestamp'),
                               ('AsOfFrom', 'timestamp'),
                               ('AsOfTo', 'timestamp'),
                               ('MD5', 'char(22)')]
                    outTblCols = toDB.get_create_col_list(outCols, apx=apxCols)
                    toCols = toDB.get_col_types(toSchema, tbl)
                    pp(toCols)
                    toDB.desc_tmp_table(outTbl, outCols + apxCols)
                    do_not_delete.append(outTbl)
                    try:
                        stmt = 'drop table %s' % outTbl
                        to_conn.cur.execute(stmt)
                    except Exception as ex:
                        #raise
                        if not 'Table "%s" does not exist' % outTbl in str(ex):
                            raise
                    psql(outfn)
                    stmt = 'CREATE LOCAL TEMPORARY TABLE %s ( %s )\nON COMMIT PRESERVE ROWS' % (
                        outTbl, ', \n'.join(['%s %s' % tuple(col) for col in toCols]))
                    pfmt([[stmt]], ['Create master temp PK' + outTbl])
                    toDB.exec_ddl(stmt)
                    if 1:  #//Load data into PK table
                        fmt_cols = {}
                        mmDt = okFile.get_value(coords=(0, 0), skip=skip)
                        md5val = (base64.b64encode(hashlib.md5(b'test').digest()))
                        apx = OrderedDict()
                        apx['MartModifiedDate'] = mmDt
                        apx['AsOfFrom'] = mmDt
                        apx['AsOfTo'] = "12/31/9999"
                        apx['MD5'] = ''  #//defined on row level
                        pk_outfn = '%s.pk' % outfn
                        colsep = dir_scfg['columnDelimiter']
                        with open(pk_outfn, 'wb') as pkfh:
                            with open(outfn, 'rb') as outfh:
                                line = outfh.readline().strip()
                                pkfh.write(line + colsep.join(apx.keys()).encode() + os.linesep.encode())
                                line = outfh.readline().strip()
                                apxTypes = colsep.join([col[1] for col in apxCols])
                                pkfh.write(line + apxTypes.encode() + os.linesep.encode())
                                line = outfh.readline().strip()
                                while line:
                                    md5 = (base64.b64encode(hashlib.md5(line.replace(b'|', b'')).digest()))
                                    apx['MD5'] = md5.decode('ascii', 'ignore').strip('=')  #// REDO
                                    pkfh.write(line + colsep.join(apx.values()).encode() + os.linesep.encode())
                                    line = outfh.readline().strip()
                        outPkFile = create_reader(aname='File', app_init=app_init,
                                                  file_name=pk_outfn, scfg=dir_scfg)
                        outPkFile.set_alt_cols()
                        schema = tcfg['targetSchema']
                        toDB.load_grds_file(trans=to_conn, file_obj=outPkFile, schema=schema,
                                            table_name=outTbl, qname='insertStmt',
                                            fmt_cols=fmt_cols, cfg=(dir_scfg, tcfg),
                                            skip=skip, stats=pkstats)
                        loaded[outbn] = outTbl
                        #outPkFile.delete()
                        #pfmtd([pkstats])
                        #e()
            stats = {}
            deleted = {}
            processed = []
            not_processed = []
            for okfn in ok_files.file_names:
                okFile = create_reader(aname='File', app_init=app_init,
                                       file_name=okfn, scfg=dir_scfg)
                okdir, _ = os.path.splitext(okfn)
                okbn = os.path.basename(okdir)
                #e()
                assert os.path.isdir(okdir)
                snap_df = cli.get_dest_folder(okdir)
                if os.path.isdir(snap_df):
                    log.warning('[%s]Destination folder exists: [%s]' % (okdir, snap_df))
                    not_processed.append(okfn)
                    continue
                out_files = InOut(file_names=[])
                DirReader.glob_dir(path=okdir, out=out_files, ext='*.out')
                apx = dict(MartModifiedDate=okFile.get_value(coords=(0, 0), skip=skip))
                #e()
                if 0:
                    g = raw_input("Continue?")
                not_loaded = {}
                for table_name in ftlist:
                    tmpTbl = 'tmp_PK_%s' % table_name
                    toCols = toDB.get_tab_cols(tmpTbl)
                    #pp(toCols)
                    toDB.desc_table(None, tmpTbl)
                    toDB.desc_table(toSchema, table_name)
                    #e()
                    if table_name in ['TxnLookupMap']:
                        tmpCols = ',\n '.join(['tmid.%s' % col[0].decode() for col in toCols])
                        ins = """
                        insert into {0} ( {1} )
                        select distinct {2}
                        from {3} tmid
                        LEFT JOIN {0} ta
                               ON ta.{4} = tmid.{4}
                              AND ta.{5} = tmid.{5}
                              AND ta.{6} = tmid.{6}
                              AND ta.ValidFrom = tmid.ValidFrom
                              and ta.AsOfTo = tmid.AsOfTo
                        where ta.MD5 <> tmid.MD5 OR ta.{4} is NULL
                        """.format(table_name,
                                   ',\n '.join([col[0].decode() for col in toCols]),
                                   tmpCols, tmpTbl,
                                   toCols[0][0].decode(),
                                   toCols[1][0].decode(),
                                   toCols[2][0].decode())
                        psql(ins)
                        inserted = toDB.exec_dml(ins, trans=to_conn, commit=False)
                        pfmtd([dict(Inserted=inserted)])
                    elif table_name in ['G3Lookup', 'GCLookup', 'GISLookup', 'GPSLookup',
                                        'GPXLookup', 'GPosLookup', 'GTxLookup',
                                        'FundToBusinessUnitMap', 'TxEditReason']:
                        tmpCols = ',\n '.join(['tmid.%s' % col[0].decode() for col in toCols])
                        ins = """
                        insert into {0} ( {1} )
                        select distinct {2}
                        from {3} tmid
                        LEFT JOIN {0} ta
                               ON ta.{4} = tmid.{4}
                              AND ta.{5} = tmid.{5}
                              AND ta.AsOfTo = tmid.AsOfTo
                        where ta.MD5 <> tmid.MD5 OR ta.{4} is NULL
                        """.format(table_name,
                                   ',\n '.join([col[0].decode() for col in toCols]),
                                   tmpCols, tmpTbl,
                                   toCols[0][0].decode(),
                                   toCols[1][0].decode())
                        psql(ins)
                        inserted = toDB.exec_dml(ins, trans=to_conn, commit=False)
                        pfmtd([dict(Inserted=inserted)])
                    else:
                        tmpCols = ',\n '.join(['tmid.%s' % col[0].decode() for col in toCols])
                        ins = """
                        insert into {0} ( {1} )
                        select distinct {2}
                        from {3} tmid
                        LEFT JOIN {0} ta
                               ON ta.{4} = tmid.{4}
                              AND ta.AsOfTo = tmid.AsOfTo
                        where ta.MD5 <> tmid.MD5 OR ta.{4} is NULL
                        ;
                        """.format(table_name,
                                   ',\n '.join([col[0].decode() for col in toCols]),
                                   tmpCols, tmpTbl,
                                   toCols[0][0].decode())
                        psql(ins)
                        inserted = toDB.exec_dml(ins, trans=to_conn, commit=False)
                        pfmtd([dict(Inserted=inserted)])
                if 1:
                    toDB.commit_transaction(trans=to_conn)
                    pfmt([[k] + list(v.values())[1:] for k, v in pkstats.items()],
                         ['Table', 'Accepted', 'Rejected', 'Line count', 'Skip', 'Diff'],
                         'Load completed'.upper())
                    pfmt([(k, v) for k, v in loaded.items()],
                         ['Loaded Files', 'Loaded Tables'])
                    pfmt([(k, v) for k, v in not_loaded.items()],
                         ['Not loaded Files', 'Not loaded Tables'])
                assert os.path.isdir(okdir)
                if 0:
                    cli.MoveSnapFolder(okdir)
                processed.append(dict(ProcessedFile=okfn))
                #break;
    if not ok_files.file_names:
        pfmtd([dict(NoFiles='No OK files at working dir: [ %s ]' % cli.pa[0])])
    pfmtd(processed)
    pfmtd(not_processed)
    if 0:
        email_args.update(dict(cli_stats=None))
        Email.send_email(**email_args)
    cli.done()