def run():
    stats = {}
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)
        if 1:  # Get the file names
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            #DirReader.get_files(path=path, out=data_files)
            DirReader.glob_dir(path=path, out=data_files, ext='*.csv')
        if 1:  # Load to DB
            to_conn = InOut()
            for _trg_class, val in cli.cfg['target'][_source].items() or []:
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                #pp(data_files.file_names)
                for data_file in data_files.file_names:
                    dataFile = create_reader(aname='File', app_init=app_init,
                                             file_name=data_file, scfg=dir_scfg)
                    dataFile.describe()
                if 1:
                    toDB = create_writer(aname=_trg_class, app_init=app_init)
                    toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                    toDB.desc_table(schema=tcfg['targetSchema'],
                                    tbl=cli.get_parsed(ckey='targetTable', cfg=tcfg),
                                    col_ord=False)
                    #e()
                    toDB.bulk_load_file(trans=to_conn, file_names=data_files, qname='insertStmt',
                                        cfg=(dir_scfg, tcfg), out=insert_stats)
                    toDB.commit_transaction(trans=to_conn)
                if 0:
                    stats['Dir->%s' % (_dbname)] = st = OrderedDict()
                    st['source_cnt'] = cli.get_src_row_count(DB) if not cli.lame_duck else cli.lame_duck
                    st['total_extracted'] = insert_stats.inserted_cnt
                    st['total_inserted'] = insert_stats.inserted_cnt
    if 0:
        for k, v in stats.items():
            assert v['source_cnt'] == v['total_extracted']
            assert v['source_cnt'] == v['total_inserted']
    if 0:
        email_args.update(dict(cli_stats=None))
        Email.send_email(**email_args)
def run():
    lite_tbl = {}
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)
        if 1:
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            out_files = InOut(file_names=[])
            print(path)
            DirReader.glob_dir(path=path, out=out_files, ext='*.out')
            pp(out_files.file_names)
        for _trg_class, val in cli.cfg['target'][_source].items():
            cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
            _dbname = tcfg["targetDb"]
            toDB = create_writer(aname=_trg_class, app_init=app_init)
            if 1:
                toDB.begin_transaction(env=tcfg['targetDb'], out=lite_conn)
                toDB.bulk_insert(trans=lite_conn, file_names=out_files, qname='insertStmt',
                                 cfg=(dir_scfg, tcfg), create_table=True, strip_line_term=True)
                toDB.commit_transaction(trans=lite_conn)
                lite_tbl[_source] = cli.get_parsed(ckey='targetTable', cfg=tcfg)
    pp(lite_tbl)
def run():
    lite_tbl = {}
    for _source, val in cli.cfg['source'].items():
        _dbname = val["sourceDb"]
        DB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('File', app_init=app_init)
        #data_files.file_names=[]
        if 1:
            cli.set_source(_source)
            DB.set_loader(FileWriter)
            total_ins = 0
            FileWriter.open_file(out=dump_file)
            for iq_data in DB.fetch_many(chunk_size=file_size_rows, source=cli.get_scfg(),
                                         qname='sourceStmt', out=InOut(), skip_header=0):
                if not total_ins:
                    FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=cli.get_dcfg())
                FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.get_dcfg())
                total_ins += len(iq_data.data)
            if not total_ins:  # in case there's no data
                FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=cli.get_dcfg())
            FileWriter.close_file(file=dump_file)
    if 1:
        Email.send_email(**email_args)
def run():
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(_src_class, app_init=app_init)
        if 1:  # Get the file names
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            DirReader.glob_dir(path=path, out=ok_files, ext='*.ok')
            for okfn in ok_files.file_names:
                okdir, _ = os.path.splitext(okfn)
                assert os.path.isdir(okdir)
                OkReader = create_reader("Dir", app_init=app_init)
                DirReader.glob_dir(path=okdir, out=out_files, ext='*.out')
            pp(out_files.file_names)
        if 1:
            for _trg_class, val in cli.cfg['target'][_source].items():
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                toDB = create_writer(_trg_class, app_init=app_init)
                to_conn = InOut()
                for out_fn in out_files.file_names:
                    tbl = os.path.basename(out_fn).split('.')[1]
                    print(tbl)
                    toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                    toDB.load_file(trans=to_conn, file_name=out_fn, table_name=tbl, qname='insertStmt',
                                   cfg=(dir_scfg, tcfg), create_table=True)
                    toDB.commit_transaction(trans=to_conn)
    if 1:
        email_args.update(dict(cli_stats=None))
        Email.send_email(**email_args)
    etl.done()
def run(): for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = val.keys()[0] cli.scfg = cli.get_scfg(_src_class) _dbname = cli.scfg["sourceDb"] fromDB = create_reader(_src_class, app_init=app_init) fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn) for _s3_class, val in cli.cfg['s3'][_source].items() or []: cli.s3cfg = cli.get_s3cfg(_s3_class) S3StreamLoader = create_writer(_s3_class, app_init=app_init) for _trg_class, val in cli.cfg['target'][_source].items() or []: cli.tcfg = tcfg = cli.get_tcfg(_trg_class) TOdb = create_writer(_trg_class, app_init=app_init) fromDB.set_loader(TOdb) cli.proc_config() fromDB.open_stream(dbcfg=cli.scfg, qname='sourceStmt', out=IQ_cursor) S3StreamLoader.load_stream(source=IQ_cursor, skip_header=0, out=s3_file_names) if 1: TOdb.begin_transaction(env=cli.tcfg['targetDb'], out=snow_conn) TOdb.purge_data(trans=snow_conn, stmt='purgeStmt') TOdb.bulk_copy( trans=snow_conn, file_names=s3_file_names, target=cli.tcfg, qname='copyStmt', ) TOdb.commit_transaction(trans=snow_conn) S3StreamLoader.delete_files(file_names=s3_file_names) if 0: IQ.set_loader(Snowflake) IQ.open_stream(dbcfg=cli.scfg, qname='sourceStmt', out=IQ_cursor) S3StreamLoader.load_stream(source=IQ_cursor, skip_header=0, out=s3_file_names) #fromDB.commit_transaction ( trans = from_conn) pp(cli.tcfg) Snowflake.begin_transaction(env=cli.tcfg['targetDb'], out=snow_conn) Snowflake.purge_data(trans=snow_conn, stmt='purgeStmt') Snowflake.bulk_copy( trans=snow_conn, file_names=s3_file_names, target=cli.tcfg, qname='copyStmt', ) Snowflake.commit_transaction(trans=snow_conn) S3StreamLoader.delete_files(file_names=s3_file_names) if 0: Email.send_email(**email_args) cli.done()
def run(): total_ins = 0 for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = val.keys()[0] cli.scfg = cli.get_scfg(_src_class) _dbname = cli.scfg["sourceDb"] DB = create_reader(_dbname, app_init=app_init) FileWriter = create_writer('Dir', app_init=app_init) DB.set_loader(FileWriter) if 1: #Extract to Dir for _dmp_class, val in cli.cfg['dump'][_source].items() or []: cli.dcfg = cli.get_dcfg(_dmp_class) pp(cli.dcfg) file_ins_cnt = 0 FileWriter.open_file(out=dump_file) if 0: IQ.open_stream(dbcfg=cli.scfg, qname='sourceStmt', out=IQ_cursor) S3StreamLoader.load_stream(source=IQ_cursor, skip_header=0, out=s3_file_names) for iq_data in DB.fetch_many(chunk_size=file_size_rows, source=cli.scfg, qname='sourceStmt', out=InOut(), skip_header=0): if not file_ins_cnt: FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=cli.dcfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg) file_ins_cnt += len(iq_data.data) if not file_ins_cnt: #in case there's no data FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=cli.dcfg) FileWriter.close_file(file=dump_file) total_ins += file_ins_cnt log.info('Total records saved: %d' % total_ins) if 1: Email.send_email(**email_args)
def run(): stats = {} for _source, val in cli.cfg['source'].items(): val = cli.cfg['source'][_source] _dbname = val["sourceDb"] DB = create_reader(_dbname, app_init=app_init) if 1: #Load to DB cli.set_source(_source) file_scfg = cli.cfg['dump'][_source] if 1: to_conn = InOut() #file_stats.ins_stats[_dbname]=ins={} for _target, val in cli.cfg['target'][_source].items() or []: tcfg = cli.cfg['target'][_source][_target] _todbname = val["targetDb"] toDB = create_writer(_target, app_init=app_init) rec_delim = '\n' skip_header = 0 #ins[_todbname]=manager.dict() toDB.insert_files(producer=(producer, (cli, _source)), out=file_stats, skip_header=skip_header, rec_delim=rec_delim, cfg=(file_scfg, tcfg), return_dict=return_dict) pp(file_stats.dump_files) extracted_cnt = 0 for fobj in file_stats.dump_files: extracted_cnt += fobj.extracted_cnt print toDB.counter.value() pp(return_dict.values()) stats['%s->%s' % (_dbname, _todbname)] = st = OrderedDict() st['source_cnt'] = insert_stats.source_cnt if not cli.lame_duck else cli.lame_duck st['total_extracted'] = extracted_cnt st['total_inserted'] = toDB.total_ins pp(stats) for k, v in stats.items(): assert v['source_cnt'] == v['total_extracted'] assert v['source_cnt'] == v['total_inserted'] if 1: email_args.update(dict(cli_stats=stats)) Email.send_email(**email_args)
def run():
    lite_tbl = {}
    #pp(cli.cfg['source'].items())
    #e()
    for _source, val in cli.cfg['source'].items():
        _dbname = val["sourceDb"]
        DB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('File', app_init=app_init)
        data_files.file_names = []
        if 1:
            cli.set_source(_source)
            lite_scfg, lite_tcfg = (cli.cfg['teardown']['source'][_source],
                                    cli.cfg['teardown']['target'][_source])
            #pp(lite_scfg)
            path = cli.get_parsed(ckey='sourceDir', cfg=lite_scfg)
            #pp(path)
            #e()
            Dir.get_files(path=path, out=data_files)
            #pp(data_files.file_names)
            #e()
        if 1:
            SQLite.begin_transaction(out=lite_conn)
            SQLite.bulk_insert(trans=lite_conn, file_names=data_files, qname='insertStmt',
                               cfg=(lite_scfg, lite_tcfg))
            SQLite.commit_transaction(trans=lite_conn)
            lite_tbl[_source] = cli.get_parsed(ckey='targetTable', cfg=lite_tcfg)
            #SQLite.show_data(lite_tbl[_source])
            #e()
    if 1:
        tear = cli.tear
        compare = tear['compare']
        source = tear['source']
        fmt = {}
        for db in source:
            fmt[db] = lite_tbl[db]
        for k, v in compare.items():
            compare[k] = v.format(**fmt)
        cli.exec_report(SQLite, compare)
    if 1:
        Email.send_email_att(**email_args)
def run():
    lite_tbl = {}
    stats = {}
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(_src_class, app_init=app_init)
        if 1:  # Get the file names
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            DirReader.get_files(path=path, out=data_files)
        if 1:  # Load to DB
            to_conn = InOut()
            for _trg_class, val in cli.cfg['target'][_source].items() or []:
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                toDB = create_writer(_trg_class, app_init=app_init)
                toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                toDB.bulk_load(trans=to_conn, file_names=data_files, qname='insertStmt',
                               cfg=(dir_scfg, tcfg), out=insert_stats)
                toDB.commit_transaction(trans=to_conn)
                if 0:
                    stats['Dir->%s' % (_dbname)] = st = OrderedDict()
                    st['source_cnt'] = cli.get_src_row_count(DB) if not cli.lame_duck else cli.lame_duck
                    st['total_extracted'] = insert_stats.inserted_cnt
                    st['total_inserted'] = insert_stats.inserted_cnt
    if 0:
        for k, v in stats.items():
            assert v['source_cnt'] == v['total_extracted']
            assert v['source_cnt'] == v['total_inserted']
    if 1:
        email_args.update(dict(cli_stats=None))
        Email.send_email(**email_args)
def run(): total_ins = 0 for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = val.keys()[0] cli.scfg = cli.get_scfg(_src_class) _dbname = cli.scfg["sourceDb"] fromDB = create_reader(_dbname, app_init=app_init) FileWriter = create_writer('Dir', app_init=app_init) fromDB.set_loader(FileWriter) fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn) if 1: #Extract to Dir for _dmp_class, val in cli.cfg['dump'][_source].items() or []: cli.dcfg = cli.get_dcfg(_dmp_class) file_ins_cnt = 0 FileWriter.open_file(out=dump_file) for iq_data in fromDB.fetch_many(chunk_size=file_size_rows, source=cli.scfg, qname='sourceStmt', out=InOut(), skip_header=0): if not file_ins_cnt: FileWriter.create_header(file=dump_file, header=fromDB.get_header(), cfg=cli.dcfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg) file_ins_cnt += len(iq_data.data) if not file_ins_cnt: #in case there's no data FileWriter.create_header(file=dump_file, header=fromDB.get_header(), cfg=cli.dcfg) FileWriter.close_file(file=dump_file) total_ins += file_ins_cnt fromDB.commit_transaction(trans=from_conn) log.info('Total records saved: %d' % total_ins) if 0: Email.send_email(**email_args)
def get_load_cols(self, toDB, cfg, data_files):
    scfg, dir_scfg = cfg
    acols = self.get_alt_cols(scfg)
    tcols = toDB.get_cols()
    fcols_alt = []
    for data_file in data_files.file_names:
        dataFile = create_reader(aname='File', app_init=app_init,
                                 file_name=data_file, scfg=dir_scfg)
        dataFile.describe()
        # Map each file header column through the alternate-column lookup,
        # then require an exact two-way match with the target table columns.
        fcols_alt = [acols.get(x, x) for x in dataFile.get_header(data_file, dir_scfg)]
        assert not set(fcols_alt) - set(tcols), 'File has columns missing in table.'
        assert not set(tcols) - set(fcols_alt), 'Table has columns missing in file.'
    assert fcols_alt
    return fcols_alt
def producer(cli, _source): val = cli.cfg['source'][_source] _dbname = val["sourceDb"] DB = create_reader(_dbname, app_init=app_init) cnt = cli.get_src_row_count(DB) if not cli.lame_duck: assert cli.dop > 0 cli.src_chunk_size = round(cnt / cli.dop) + 1 else: cli.src_chunk_size = cli.lame_duck FileWriter = create_writer('File', app_init=app_init) data_files.file_names = [] #uploaded_files.file_names=[] #ext_files=[] if 1: cli.set_source(_source) DB.set_loader(FileWriter) total_read = 0 scfg = cli.get_scfg() source_chunk_size = int( float(cli.get_parsed(ckey='sourceChunkSize', cfg=scfg))) cid = 0 skew_pct = int(float(cli.get_parsed(ckey='fileSkewPct', cfg=scfg))) log.debug('Skew percentile = %s' % skew_pct) if skew_pct and cli.dop >= 2: delta = source_chunk_size * (skew_pct / 100.0) num_of_files = cli.dop increment = int(delta / num_of_files) chunk_map = {} accum_skew = sum( [increment * (num_of_files - i) for i in range(num_of_files)]) for i in range(num_of_files): skew = ((cnt - accum_skew) / num_of_files) + increment * (num_of_files - i) chunk_map[i] = skew + 1 if not cli.lame_duck else cli.lame_duck pp(chunk_map) #e() if not cli.lame_duck: assert sum( chunk_map.values() ) >= cnt, 'Chunk map has to cover all source records [%s <> %s]' % ( sum(chunk_map.values()), cnt) #dfiles=[] for iq_data in DB.fetch_many_async(chunk_map=chunk_map, counter=counter, source=scfg, qname='sourceStmt', out=InOut(), skip_header=0): dump_file = InOut(source_cnt=cnt) FileWriter.open_file(id=cid, out=dump_file) if 1: #not total_ins: dump_cfg = cli.get_dcfg() FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=dump_cfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=dump_cfg) total_read += len(iq_data.data) FileWriter.close_file(file=dump_file) #ext_files.append(dump_file.fpath) #dfiles.append(dump_file) dump_file.extracted_cnt = total_read yield dump_file cid += 1 else: #lame duck print source_chunk_size #e() assert source_chunk_size for iq_data in DB.fetch_many(chunk_size=source_chunk_size, source=scfg, qname='sourceStmt', out=InOut(), skip_header=0): dump_file = InOut(source_cnt=cnt) FileWriter.open_file(id=cid, out=dump_file) if 1: #not total_ins: dump_cfg = cli.get_dcfg() FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=dump_cfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=dump_cfg) total_read += len(iq_data.data) FileWriter.close_file(file=dump_file) #ext_files.append(dump_file.fpath) log.debug('File %d created:file: %d, %d records' % (cid, len(iq_data.data), source_chunk_size)) cid += 1 dump_file.extracted_cnt = total_read yield dump_file log.debug('Done extracting.....')
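# A minimal, self-contained sketch of the skew arithmetic used to build chunk_map in
# producer() above, with illustrative numbers (cnt=1000 rows, dop=4, sourceChunkSize=250,
# fileSkewPct=20). These values are assumptions for the example, not taken from any config.
def example_chunk_map(cnt=1000, dop=4, source_chunk_size=250, skew_pct=20):
    delta = source_chunk_size * (skew_pct / 100.0)
    increment = int(delta / dop)
    accum_skew = sum(increment * (dop - i) for i in range(dop))
    # Largest chunk first; the trailing +1 makes the chunks cover every source record.
    return {i: int((cnt - accum_skew) / dop) + increment * (dop - i) + 1
            for i in range(dop)}

# example_chunk_map() -> {0: 269, 1: 257, 2: 245, 3: 233}; the sum is 1004 >= 1000,
# which is exactly what the assert in producer() checks before extraction starts.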
def run(): skip = 2 #deleted = {} #loaded = {} #not_loaded = {} masterTbl = 'gtxMasterPKData' do_not_delete = ['TxFinancingRateHist', masterTbl] do_not_load = ['TxFinancingRateHist'] #'TxFinancingRate', for _source, val in cli.cfg['dump'].items(): cli.set_source(_source) _src_class = val.keys()[0] DirReader = create_reader(aname=_src_class, app_init=app_init) if 1: cli.set_source(_source) dir_scfg = cli.get_dcfg(_src_class) path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg) ok_files = InOut(file_names=[]) DirReader.glob_dir(path=path, out=ok_files, ext='*.ok') if 1: for _trg_class, val in cli.cfg['target'][_source].items(): cli.tcfg = tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer(aname=_trg_class, app_init=app_init) to_conn = InOut() toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn) to_conn.cur.execute('set search_path to CIGRpt') if ok_files.file_names: # Master first try: tbl = masterTbl stmt = 'drop table %s' % tbl to_conn.cur.execute(stmt) except Exception, err: if not 'Table "%s" does not exist' % masterTbl in str( err): raise stmt = 'create local temporary table %s ( TxMasterID bigint not null, MartModifiedDate timestamp)\n ON COMMIT PRESERVE ROWS' % tbl pfmt([[stmt]], ['Create master temp PK']) to_conn.cur.execute(stmt) stats = {} deleted = {} processed = [] not_processed = [] for okfn in ok_files.file_names: okFile = create_reader(aname="File", app_init=app_init, file_name=okfn, scfg=dir_scfg) okdir, _ = os.path.splitext(okfn) okbn = os.path.basename(okdir) #e() assert os.path.isdir(okdir) snap_df = cli.get_dest_folder(okdir) if os.path.isdir(snap_df): log.warning('[%s]Destination folder exists: [%s]' % (okdir, snap_df)) not_processed.append(okfn) continue OkReader = create_reader(aname="Dir", app_init=app_init) out_files = InOut(file_names=[]) DirReader.glob_dir(path=okdir, out=out_files, ext='*.out') apx = dict(MartModifiedDate=okFile.get_value( coords=(0, 0), skip=skip)) tlist = [] for out_fn in out_files.file_names: tlist.append( [os.path.basename(out_fn).split('.')[1]]) pfmt(tlist, ['Files->Tables']) if 0: g = raw_input("Continue?") assert ['PK'] in tlist, 'PK file is missing' if 1: stmt = 'TRUNCATE TABLE %s' % (masterTbl) toDB.exec_dml(stmt, trans=to_conn, commit=False) deleted[masterTbl] = -1 #e() loaded = {} not_loaded = {} for out_fn in [ x for x in out_files.file_names if not os.path.basename(x).split('.')[1] in ['PK'] ]: outFile = create_reader(aname="File", app_init=app_init, file_name=out_fn, scfg=dir_scfg) outCols = [ col[0] for col in outFile.get_header_cols() ] tbl = os.path.basename(out_fn).split('.')[1] assert tbl if tbl not in ['PK'] + do_not_load: if tbl not in do_not_delete: stmt = 'DELETE FROM %s WHERE TxMasterID in (SELECT t.TxMasterID FROM %s t)' % ( tbl, masterTbl) deleted[tbl] = toDB.exec_dml(stmt, trans=to_conn, commit=False) pfmt([[deleted[tbl]]], ['Deleted from %s' % tbl]) else: deleted[tbl] = -1 tblCols = toDB.get_columns(tbl).values() pfmt([[x] for x in list( set(tblCols) - set(outCols) - set(['MartModifiedDate']))], ['Columns in Source, but not Target']) missing_cols = list( set(outCols) - set(tblCols)) pfmt([(tbl, x) for x in missing_cols], ['Table', 'Missing columns']) if missing_cols: to_conn.conn.rollback() raise Exception( 'File column %s missing in table "%s".' 
% (missing_cols, tbl)) fmt_cols = {} if 1: fmt_cols['Tx'] = [ 'TxMasterGUID', 'SwapEventGUID' ] toDB.load_file(trans=to_conn, file_obj=outFile, table_name=tbl, qname='insertStmt', cfg=(dir_scfg, tcfg), fmt_cols=fmt_cols.get( tbl, []), skip=skip, apx=apx, stats=stats) loaded[out_fn] = tbl else: not_loaded[out_fn] = tbl else: toDB.commit_transaction(trans=to_conn) pfmt( [[k] + [deleted[k]] + list(v)[1:] for k, v in stats.items() if deleted[k] >= 0], [ 'Table', 'Deleted', 'Accepted', 'Rejected', 'Line count', 'Skip', 'Diff' ], 'Load completed (deleted)'.upper()) pfmt([(k, v) for k, v in loaded.items()], ['Loaded Files', 'Loaded Tables']) pfmt([(k, v) for k, v in not_loaded.items()], ['Not loaded Files', 'Not loaded Tables']) assert os.path.isdir(okdir) cli.MoveSnapFolder(okdir) processed.append(okfn) #break; if not ok_files.file_names: counter = itertools.count(1) pfmt([['No OK files at working dir: [ %s ]' % cli.pa[0]]], ['No files']) if processed: counter = itertools.count(1) pfmt([[next(counter), x] for x in processed], ['##', 'Processed']) if not_processed: counter = itertools.count(1) pfmt([[next(counter), x] for x in not_processed], ['##', 'Not processed'])
def run(): skip = 2 serviceName = 'gfin' #deleted = {} #loaded = {} #not_loaded = {} #masterTbl = 'gtxMasterPKData' #do_not_delete = ['TxFinancingRateHist', masterTbl] do_not_load = ['TxFinancingRate', 'TxFinancingRateHist'] #'TxFinancingRate', for _source, val in cli.cfg['dump'].items(): cli.set_source(_source) _src_class = list(val.keys())[0] DirReader = create_reader(aname=_src_class, app_init=app_init) if 1: cli.set_source(_source) dir_scfg = cli.get_dcfg(_src_class) path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg) ok_files = InOut(file_names=[]) DirReader.glob_dir(path=path, out=ok_files, ext='*.ok') if 1: for _trg_class, val in cli.cfg['target'][_source].items(): cli.tcfg = tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer(aname=_trg_class, app_init=app_init) masterTabTag = tcfg['masterTableTag'] masterTbl = tcfg['targetTables'][masterTabTag][ 'table_name'] masterTblCol = tcfg['targetTables'][masterTabTag][ 'column_name'] do_not_delete = tcfg['doNotDeleteTables'] + [masterTbl] do_not_load = tcfg['doNotLoadTables'] to_conn = InOut() toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn) to_conn.cur.execute('set search_path to CIGRpt') if ok_files.file_names: # Master first try: stmt = 'drop table %s' % masterTbl to_conn.cur.execute(stmt) except Exception as ex: #raise if not 'Table "%s" does not exist' % masterTbl in str( ex): raise stmt = 'create local temporary table %s ( %s bigint not null, MartModifiedDate timestamp)\n ON COMMIT PRESERVE ROWS' % ( masterTbl, masterTblCol) pfmt([[stmt]], ['Create master temp PK']) to_conn.cur.execute(stmt) #e() stats = {} deleted = {} processed = [] not_processed = [] for okfn in ok_files.file_names: okFile = create_reader(aname='File', app_init=app_init, file_name=okfn, scfg=dir_scfg) okdir, _ = os.path.splitext(okfn) okbn = os.path.basename(okdir) #e() assert os.path.isdir(okdir) snap_df = cli.get_dest_folder(okdir) if os.path.isdir(snap_df): log.warning('[%s]Destination folder exists: [%s]' % (okdir, snap_df)) not_processed.append(okfn) continue OkReader = create_reader(aname="Dir", app_init=app_init) out_files = InOut(file_names=[]) DirReader.glob_dir(path=okdir, out=out_files, ext='*.out') apx = dict(MartModifiedDate=okFile.get_value( coords=(0, 0), skip=skip)) ftlist = [] for out_fn in out_files.file_names: print(out_fn) ftlist.append( os.path.basename(out_fn).split('.')[1]) pfmt([[x] for x in ftlist], ['Files->Tables']) #e() if 1: ctables = cli.tcfg['targetTables'].keys() extra_file_tables = list( set(ftlist) - set(ctables)) pfmt([[x] for x in extra_file_tables], ['Tables not in config.']) extra_config_tables = list( set(ctables) - set(ftlist)) pfmt([[x] for x in extra_config_tables], ['Tables in config but not in file names.']) assert not extra_file_tables, 'Tables %s are not listed in config["targetTables"].' 
% extra_file_tables if 0: g = raw_input("Continue?") if 1: #//create PK file fromFile = create_reader( aname='File', app_init=app_init, file_name=os.path.join(okdir, 'gfin.Instrument.out'), scfg=dir_scfg) toFile = create_reader(aname='File', app_init=app_init, file_name=os.path.join( okdir, '%s.PK.out' % serviceName), scfg=dir_scfg, parse=False) rowcnt = cli.createPrimaryKeyFile( ffObj=fromFile, pkfn=os.path.join(okdir, '%s.PK.out' % serviceName)) assert masterTabTag in ftlist, '"%s" file is missing' % masterTabTag if 1: stmt = 'TRUNCATE TABLE %s' % (masterTbl) toDB.exec_dml(stmt, trans=to_conn, commit=False) deleted[masterTbl] = -1 #e() #e() loaded = {} not_loaded = {} if 1: pkfn = [ x for x in out_files.file_names if os.path.basename(x).split('.')[1] in [masterTabTag] ][0] schema = tcfg['targetSchema'] outFile = create_reader(aname="File", app_init=app_init, file_name=pkfn, scfg=dir_scfg) fmt_cols = tcfg['targetTables'][masterTabTag].get( 'formatColumns', []) outFile.set_alt_cols() toDB.load_gfin_file(trans=to_conn, file_obj=outFile, schema=schema, table_name=masterTbl, qname='insertStmt', fmt_cols=fmt_cols, cfg=(dir_scfg, tcfg), skip=skip, apx=apx, stats=stats) loaded[out_fn] = masterTbl #e() if 1: stmt = 'SELECT count(*) FROM %s t' % masterTbl pkcnt = toDB.exec_query(stmt).fetchall()[0][0] assert pkcnt == (rowcnt - skip) for out_fn in [ x for x in out_files.file_names if not os.path.basename(x).split('.')[1] in [masterTabTag] ]: outFile = create_reader(aname="File", app_init=app_init, file_name=out_fn, scfg=dir_scfg) outCols = [ col[0] for col in outFile.get_header_cols() ] tbl = os.path.basename(out_fn).split('.')[1] assert tbl if tbl not in [masterTabTag] + do_not_load: if tbl not in do_not_delete: stmt = 'DELETE FROM %s WHERE %s in (SELECT t.%s FROM %s t)' % ( tbl, masterTblCol, masterTblCol, masterTbl) deleted[tbl] = toDB.exec_dml(stmt, trans=to_conn, commit=False) pfmt([[deleted[tbl]]], ['Deleted from %s' % tbl]) else: deleted[tbl] = -1 tblCols = toDB.get_columns(tbl).values() pfmt([[x] for x in list( set(tblCols) - set(outCols) - set(['MartModifiedDate']))], ['Columns in Source, but not Target']) missing_cols = list( set(outCols) - set(tblCols)) pfmt([(tbl, x) for x in missing_cols], ['Table', 'Missing columns']) if missing_cols: to_conn.conn.rollback() schema = tcfg["targetSchema"] toDB.desc_table(schema, tbl) raise Exception( 'File column %s missing in table "%s".' 
% (missing_cols, tbl)) if 1: schema = tcfg['targetSchema'] fmt_cols = tcfg['targetTables'][tbl].get( 'formatColumns', []) outFile.set_alt_cols() toDB.load_gfin_file(trans=to_conn, file_obj=outFile, schema=schema, table_name=tbl, qname='insertStmt', fmt_cols=fmt_cols, cfg=(dir_scfg, tcfg), skip=skip, apx=apx, stats=stats) loaded[out_fn] = tbl else: not_loaded[out_fn] = tbl else: toDB.commit_transaction(trans=to_conn) #pfmt([[k]+[deleted [k]]+list(v)[1:] for k,v in stats.items() if deleted [k]>=0], ['Table','Deleted', 'Accepted', 'Rejected','Line count','Skip', 'Diff'],'Load completed (deleted)'.upper()) #pfmt([(k,v) for k, v in loaded.items()], ['Loaded Files','Loaded Tables']) #pfmt([(k,v) for k, v in not_loaded.items()], ['Not loaded Files','Not loaded Tables']) pfmt( [[k] + [deleted[k]] + list(v.values())[1:] for k, v in stats.items() if deleted[k] >= 0], [ 'Table', 'Deleted', 'Accepted', 'Rejected', 'Line count', 'Skip', 'Diff' ], 'Load completed/deleted'.upper()) pfmt([(k, v) for k, v in loaded.items()], ['Loaded Files', 'Loaded Tables']) pfmt([(k, v) for k, v in not_loaded.items()], ['Not loaded Files', 'Not loaded Tables']) assert os.path.isdir(okdir) if 0: cli.MoveSnapFolder(okdir) processed.append(okfn) #break; if not ok_files.file_names: counter = itertools.count(1) pfmt([['No OK files at working dir: [ %s ]' % cli.pa[0]]], ['No files']) if processed: counter = itertools.count(1) pfmt([[next(counter), x] for x in processed], ['##', 'Processed']) if not_processed: counter = itertools.count(1) pfmt([[next(counter), x] for x in not_processed], ['##', 'Not processed (backup exists)']) if 0: email_args.update(dict(cli_stats=None)) Email.send_email(**email_args) cli.done()
def run(): total_ins = 0 term_line = True for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = list(val.keys())[0] cli.scfg= scfg=cli.get_scfg(_src_class) _dbname=cli.scfg["sourceDb"] fromDB = create_reader(aname = _src_class, app_init=app_init ) fromDB.begin_transaction ( env =cli.scfg['sourceDb'] , out = from_conn ) if 1: #//Extract to Dir for _dmp_class, val in cli.cfg['dump'][_source].items() or []: FileWriter = create_writer(aname =_dmp_class, app_init=app_init ) fromDB.set_loader(FileWriter) cli.dcfg= cli.get_dcfg(_dmp_class) for _trg_class, val in cli.cfg['target'][_source].items() or []: cli.tcfg= tcfg = cli.get_tcfg(_trg_class) file_ins_cnt= 0 FileWriter.open_file( out = dump_file ) for iq_data in fromDB.fetch_many ( chunk_size=file_size_rows, source = cli.scfg, qname = 'sourceStmt', out=InOut(), skip_header=0, terminate_line= term_line): if not file_ins_cnt: FileWriter.create_header(file = dump_file, header = fromDB.get_header(), cfg=cli.dcfg, terminate_line= term_line) FileWriter.append_data ( file = dump_file, data = iq_data, cfg=cli.dcfg) file_ins_cnt+=len(iq_data.data) if not file_ins_cnt: #in case there's no data FileWriter.create_header(file = dump_file, header = fromDB.get_header(), cfg=cli.dcfg, terminate_line= term_line) FileWriter.close_file(file = dump_file) total_ins +=file_ins_cnt fromDB.desc_cur(cur = from_conn.cur, colord=False) fromDB.commit_transaction ( trans = from_conn) log.info('Total records saved: %d' % total_ins) #// Load to IQ for _source, val in cli.cfg['dump'].items(): cli.set_source(_source) _src_class = val.keys()[0] DirReader = create_reader(aname = _src_class, app_init=app_init ) if 1: #//Get the file names cli.set_source(_source) dir_scfg = cli.get_dcfg(_src_class) path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg) DirReader.glob_dir(path=path, out = data_files, ext='*.*') if 1: #//Load to DB to_conn = InOut() for _trg_class, val in cli.cfg['target'][_source].items() or []: cli.tcfg= tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer (aname =_trg_class, app_init=app_init ) toDB.begin_transaction ( env =tcfg['targetDb'] , out = to_conn ) table='%s.%s' % (tcfg['targetSchema'], tcfg['targetTable']) toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], colord=False) #// validate cols acols= cli.get_alt_cols(scfg) tcols=toDB.get_cols() fcols_alt=[] for data_file in data_files.file_names: dataFile = create_reader(aname = 'File', app_init=app_init, file_name=data_file, scfg=dir_scfg) dataFile.describe() file_stats[data_file] = dataFile.line_count() - cli.header_size(dir_scfg) fcols_alt=[acols.get(x,x) for x in dataFile.get_header(data_file, dir_scfg)] assert not set(fcols_alt) -set(tcols), 'File has columns missing in table.' assert not set(tcols) -set(fcols_alt), 'Table has columns missing in file.' #toDB.truncate_table ( table = table ) toDB.bulk_load ( trans = to_conn, file_names = data_files, qname = 'insertStmt', cfg = (dir_scfg, tcfg), out=insert_stats, header=fcols_alt) for k in file_stats.keys(): assert insert_stats[k] == file_stats[k], 'Insert vs file count diff: %s<>%s for file \n%s' % (insert_stats[k] , file_stats[k], k) toDB.commit_transaction ( trans = to_conn) if 0: Email.send_email( **email_args )
import sys
from collections import OrderedDict
from pprint import pprint as pp

from include.utils import create_reader, create_writer, create_actor, InOut

e = sys.exit

cli, conn_pool = app_init

Email = create_actor(aname='Email', app_init=app_init)
Dir = create_reader(aname='Dir', app_init=app_init)

dump_file = InOut()
data_files = InOut()
data_files.file_names = []
insert_stats = InOut(inserted_cnt=-1)

file_size_rows = 250000
email_args = {'email_subject': 'IQ->file->SQL'}
from_conn = InOut()
term_line = False


def run():
    lite_tbl = {}
    stats = {}
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
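# A minimal sketch of how include.utils.InOut appears to be used throughout these
# scripts: a mutable attribute bag passed as an "out" parameter so callees can attach
# results (file_names, inserted_cnt, fpath, cur, conn, ...). This is an assumption
# about the helper's behaviour, not its actual implementation.
class InOutSketch(object):
    def __init__(self, **kwargs):
        # Copy keyword arguments onto the instance; callers read them back as attributes.
        for k, v in kwargs.items():
            setattr(self, k, v)

# out = InOutSketch(file_names=[]); some_reader_would_append_to(out.file_names)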
def run(): skip = 1 total_ins = 0 term_line = True #//validate cols for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = list(val.keys())[0] cli.scfg = scfg = cli.get_scfg(_src_class) for _trg_class, val in cli.cfg['target'][_source].items() or []: cli.tcfg = tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer(aname=_trg_class, app_init=app_init) toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn) table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable']) toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False) #// validate cols cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']] tcols = toDB.get_cols() t_vs_c = set(tcols) - set(cfg_cols) c_vs_t = set(cfg_cols) - set(tcols) if t_vs_c: pfmtd([dict(c_vs_t=c_vs_t)], 'Config has columns missing in target table.') raise Exception( 'Target table has columns missing in config: %s' % t_vs_c) if c_vs_t: pfmtd([dict(t_vs_c=t_vs_c)], 'Target table has columns missing in config.') raise Exception( 'Config has columns missing in target table: %s' % c_vs_t) toDB.commit_transaction(trans=to_conn) #// transfer for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = list(val.keys())[0] cli.scfg = scfg = cli.get_scfg(_src_class) _dbname = cli.scfg["sourceDb"] fromDB = create_reader(aname=_src_class, app_init=app_init) fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn) if 1: #//Extract to Dir for _dmp_class, val in cli.cfg['dump'][_source].items() or []: FileWriter = create_writer(aname=_dmp_class, app_init=app_init) fromDB.set_loader(FileWriter) cli.dcfg = cli.get_dcfg(_dmp_class) for _trg_class, val in cli.cfg['target'][_source].items( ) or []: cli.tcfg = tcfg = cli.get_tcfg(_trg_class) file_ins_cnt = 0 FileWriter.open_file(out=dump_file) for iq_data in fromDB.fetch_many(chunk_size=file_size_rows, source=cli.scfg, qname='sourceStmt', out=InOut(), skip_header=0, terminate_line=term_line): if not file_ins_cnt: FileWriter.create_header( file=dump_file, header=fromDB.get_header(), cfg=cli.dcfg, terminate_line=term_line) FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg) file_ins_cnt += len(iq_data.data) if not file_ins_cnt: #in case there's no data FileWriter.create_header(file=dump_file, header=fromDB.get_header(), cfg=cli.dcfg, terminate_line=term_line) FileWriter.close_file(file=dump_file) total_ins += file_ins_cnt fromDB.desc_cur(cur=from_conn.cur, colord=False) fromDB.commit_transaction(trans=from_conn) log.info('Total records saved: %d' % total_ins) #// Load to IQ for _source, val in cli.cfg['dump'].items(): cli.set_source(_source) _src_class = list(val.keys())[0] DirReader = create_reader(aname=_src_class, app_init=app_init) if 1: #//Get the file names cli.set_source(_source) dir_scfg = cli.get_dcfg(_src_class) path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg) DirReader.glob_dir(path=path, out=data_files, ext='*.*') if 1: #//Load to DB for _trg_class, val in cli.cfg['target'][_source].items() or []: cli.tcfg = tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer(aname=_trg_class, app_init=app_init) toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn) table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable']) toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=None) #// validate cols cfg_cols = [ x[u'columnName'] for x in cli.scfg[u'columnMappings'] ] acols = cli.get_alt_cols(scfg) tcols = toDB.get_cols() fcols_alt = [] for 
data_file in data_files.file_names: dataFile = create_reader(aname='File', app_init=app_init, file_name=data_file, scfg=dir_scfg) dataFile.describe() file_stats[data_file] = dataFile.line_count( ) - cli.header_size(dir_scfg) fcols_alt = [ acols.get(x.decode(), x.decode()) for x in dataFile.get_header(data_file, dir_scfg) ] f_vs_c = set(fcols_alt) - set(cfg_cols) c_vs_f = set(cfg_cols) - set(fcols_alt) f_vs_t = set(fcols_alt) - set(tcols) t_vs_f = set(tcols) - set(fcols_alt) if f_vs_c: pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.') pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.') pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.') raise Exception( 'Target table has columns missing in config: %s' % f_vs_c) if c_vs_f: pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.') pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.') pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.') raise Exception( 'Config has columns missing in target table: %s' % c_vs_f) if f_vs_t: pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.') pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.') pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.') raise Exception( 'Dump file has columns missing in target table: %s' % f_vs_t) if t_vs_f: pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.') pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.') pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.') raise Exception( 'Target table has columns missing in dump file: %s' % t_vs_f) if 1: for data_fn in [x for x in data_files.file_names]: dataFile = create_reader(aname="File", app_init=app_init, file_name=data_fn, scfg=dir_scfg) dataFile.describe() fileCols = [ col.decode() for col in dataFile.get_header_cols() ] tbl = tcfg[ "targetTable"] #tcfg. os.path.basename(data_fn).split('.')[-2] assert tbl if 1: if 0 and tbl not in do_not_delete: stmt = 'DELETE FROM %s WHERE %s in (SELECT t.%s FROM %s t)' % ( tbl, masterTblCol, masterTblCol, masterTbl) deleted[tbl] = toDB.exec_dml(stmt, trans=to_conn, commit=False) pfmt([[deleted[tbl]]], ['Deleted from %s' % tbl]) else: deleted[tbl] = -1 if 0: acols = cli.get_alt_cols(scfg) dataFile.cols_alt = [ acols.get(x.decode(), x.decode()) for x in dataFile.cols ] else: dataFile.set_alt_cols() missing_cols = list( set(dataFile.cols_alt) - set(tcols)) pfmt([(tbl, x) for x in missing_cols], ['Table', 'Missing columns']) schema = tcfg["targetSchema"] if missing_cols: pfmt([[x] for x in missing_cols], ['Columns in Source, but not Target']) to_conn.conn.rollback() toDB.desc_table(schema, tbl) raise Exception( 'File column %s missing in table "%s".' 
% (missing_cols, tbl)) if 1: apx = {} fmt_cols = [] toDB.load_file(trans=to_conn, file_obj=dataFile, schema=schema, table_name=tbl, qname='insertStmt', fmt_cols=fmt_cols, cfg=(dir_scfg, tcfg), skip=skip, apx=apx, stats=stats) loaded[data_fn] = tbl else: not_loaded[data_fn] = tbl else: if 1: toDB.commit_transaction(trans=to_conn) pfmt( [[k] + [deleted[k]] + list(v)[1:] for k, v in stats.items() if deleted[k] >= 0], [ 'Table', 'Deleted', 'Accepted', 'Rejected', 'Line count', 'Skip', 'Diff' ], 'Load completed (deleted)'.upper()) pfmt([(k, v) for k, v in loaded.items()], ['Loaded Files', 'Loaded Tables']) pfmt([(k, v) for k, v in not_loaded.items()], ['Not loaded Files', 'Not loaded Tables']) e() if 0: #toDB.truncate_table ( table = table ) toDB.bulk_load(trans=to_conn, file_names=data_files, qname='insertStmt', cfg=(dir_scfg, tcfg), out=insert_stats) for k in file_stats.keys(): assert insert_stats[k] == file_stats[ k], 'Insert vs file count diff: %s<>%s for file \n%s' % ( insert_stats[k], file_stats[k], k) toDB.commit_transaction(trans=to_conn) if 0: Email.send_email(**email_args)
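# A minimal sketch of the three-way column validation performed above: the dump-file
# header (after alternate-name mapping), the config columnMappings and the target table
# must all agree, and every one-sided difference is reported before raising. The column
# names below are made-up examples, not real metadata.
def check_columns(file_cols, cfg_cols, table_cols):
    diffs = {
        'file_vs_config': set(file_cols) - set(cfg_cols),
        'config_vs_file': set(cfg_cols) - set(file_cols),
        'file_vs_table': set(file_cols) - set(table_cols),
        'table_vs_file': set(table_cols) - set(file_cols),
    }
    bad = {k: v for k, v in diffs.items() if v}
    if bad:
        raise Exception('Column mismatch: %s' % bad)

# check_columns(['TxMasterID', 'Amount'], ['TxMasterID', 'Amount'], ['TxMasterID', 'Amount'])  # passes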
def run(): skip = 2 do_not_load = [] for _source, val in cli.cfg['dump'].items(): cli.set_source(_source) _src_class = list(val.keys())[0] DirReader = create_reader(aname=_src_class, app_init=app_init) cli.set_source(_source) dir_scfg = cli.get_dcfg(_src_class) path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg) ok_files = InOut(file_names=[]) DirReader.glob_dir(path=path, out=ok_files, ext='*.ok') loaded = {} for _trg_class, val in cli.cfg['target'][_source].items(): cli.tcfg = tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer(aname=_trg_class, app_init=app_init) do_not_delete = tcfg['doNotDeleteTables'] do_not_load = tcfg['doNotLoadTables'] to_conn = InOut() toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn) toSchema = tcfg['targetSchema'] stmt = 'set search_path to %s' % toSchema psql(stmt) to_conn.cur.execute(stmt) pkstats = {} for okfn in ok_files.file_names: okFile = create_reader(aname='File', app_init=app_init, file_name=okfn, scfg=dir_scfg) okdir, okname = os.path.splitext(okfn) okbn = os.path.basename(okdir) out_files = InOut(file_names=[]) DirReader.glob_dir(path=okdir, out=out_files, ext='*.out') #e() if 1: # Check if some there are files missing in config ftlist = [] for out_fn in out_files.file_names: print(out_fn) ftlist.append(os.path.basename(out_fn).split('.')[1]) pfmt([[x] for x in ftlist], ['Files->Tables']) ctables = cli.tcfg['targetTables'].keys() extra_file_tables = list(set(ftlist) - set(ctables)) pfmt([[x] for x in extra_file_tables], ['Tables not in config.']) extra_config_tables = list(set(ctables) - set(ftlist)) pfmt([[x] for x in extra_config_tables], ['Tables in config but not in file names.']) assert not extra_file_tables, 'Tables %s are not listed in config["targetTables"].' % extra_file_tables for outfn in out_files.file_names: # Master first outFile = create_reader(aname='File', app_init=app_init, file_name=outfn, scfg=dir_scfg) outbn = os.path.basename(outfn) tbl = outbn.split('.')[1] outTbl = 'tmp_PK_%s' % tbl outCols = outFile.get_header_cols() apxCols = [('MartModifiedDate', 'timestamp'), ('AsOfFrom', 'timestamp'), ('AsOfTo', 'timestamp'), ('MD5', 'char(22)')] outTblCols = toDB.get_create_col_list(outCols, apx=apxCols) toCols = toDB.get_col_types(toSchema, tbl) pp(toCols) toDB.desc_tmp_table(outTbl, outCols + apxCols) do_not_delete.append(outTbl) try: stmt = 'drop table %s' % outTbl to_conn.cur.execute(stmt) except Exception as ex: #raise if not 'Table "%s" does not exist' % outTbl in str(ex): raise psql(outfn) stmt = 'CREATE LOCAL TEMPORARY TABLE %s ( %s )\nON COMMIT PRESERVE ROWS' % ( outTbl, ', \n'.join( ['%s %s' % tuple(col) for col in toCols])) pfmt([[stmt]], ['Create master temp PK' + outTbl]) toDB.exec_ddl(stmt) if 1: #//Load data into PK table fmt_cols = {} mmDt = okFile.get_value(coords=(0, 0), skip=skip) md5val = (base64.b64encode( hashlib.md5(b'test').digest())) apx = OrderedDict() apx['MartModifiedDate'] = mmDt apx['AsOfFrom'] = mmDt apx['AsOfTo'] = "12/31/9999" apx['MD5'] = '' #//defined on row level pk_outfn = '%s.pk' % outfn colsep = dir_scfg['columnDelimiter'] with open(pk_outfn, 'wb') as pkfh: with open(outfn, 'rb') as outfh: line = outfh.readline().strip() pkfh.write(line + colsep.join(apx.keys()).encode() + os.linesep.encode()) line = outfh.readline().strip() apxTypes = colsep.join( [col[1] for col in apxCols]) pkfh.write(line + apxTypes.encode() + os.linesep.encode()) line = outfh.readline().strip() while line: md5 = (base64.b64encode( hashlib.md5(line.replace( b'|', b'')).digest())) apx['MD5'] = 
md5.decode('ascii', 'ignore').strip( '=') #// REDO pkfh.write( line + colsep.join(apx.values()).encode() + os.linesep.encode()) line = outfh.readline().strip() outPkFile = create_reader(aname='File', app_init=app_init, file_name=pk_outfn, scfg=dir_scfg) outPkFile.set_alt_cols() schema = tcfg['targetSchema'] toDB.load_grds_file(trans=to_conn, file_obj=outPkFile, schema=schema, table_name=outTbl, qname='insertStmt', fmt_cols=fmt_cols, cfg=(dir_scfg, tcfg), skip=skip, stats=pkstats) loaded[outbn] = outTbl #outPkFile.delete() #pfmtd([pkstats]) #e() stats = {} deleted = {} processed = [] not_processed = [] for okfn in ok_files.file_names: okFile = create_reader(aname='File', app_init=app_init, file_name=okfn, scfg=dir_scfg) okdir, _ = os.path.splitext(okfn) okbn = os.path.basename(okdir) #e() assert os.path.isdir(okdir) snap_df = cli.get_dest_folder(okdir) if os.path.isdir(snap_df): log.warning('[%s]Destination folder exists: [%s]' % (okdir, snap_df)) not_processed.append(okfn) continue out_files = InOut(file_names=[]) DirReader.glob_dir(path=okdir, out=out_files, ext='*.out') apx = dict( MartModifiedDate=okFile.get_value(coords=(0, 0), skip=skip)) #e() if 0: g = raw_input("Continue?") not_loaded = {} for table_name in ftlist: tmpTbl = 'tmp_PK_%s' % table_name toCols = toDB.get_tab_cols(tmpTbl) #pp(toCols) toDB.desc_table(None, tmpTbl) toDB.desc_table(toSchema, table_name) #e() if table_name in ['TxnLookupMap']: tmpCols = ',\n '.join( ['tmid.%s' % col[0].decode() for col in toCols]) ins = """ insert into {0} ( {1} ) select distinct {2} from {3} tmid LEFT JOIN {0} ta ON ta.{4} = tmid.{4} AND ta.{5} = tmid.{5} AND ta.{6} = tmid.{6} AND ta.ValidFrom = tmid.ValidFrom and ta.AsOfTo = tmid.AsOfTo where ta.MD5 <> tmid.MD5 OR ta.{4} is NULL """.format(table_name, ',\n '.join([col[0].decode() for col in toCols]), tmpCols, tmpTbl, toCols[0][0].decode(), toCols[1][0].decode(), toCols[2][0].decode()) psql(ins) inserted = toDB.exec_dml(ins, trans=to_conn, commit=False) pfmtd([dict(Inserted=inserted)]) elif table_name in [ 'G3Lookup', 'GCLookup', 'GISLookup', 'GPSLookup', 'GPXLookup', 'GPosLookup', 'GTxLookup', 'FundToBusinessUnitMap', 'TxEditReason' ]: tmpCols = ',\n '.join( ['tmid.%s' % col[0].decode() for col in toCols]) ins = """ insert into {0} ( {1} ) select distinct {2} from {3} tmid LEFT JOIN {0} ta ON ta.{4} = tmid.{4} AND ta.{5} = tmid.{5} AND ta.AsOfTo = tmid.AsOfTo where ta.MD5 <> tmid.MD5 OR ta.{4} is NULL """.format(table_name, ',\n '.join([col[0].decode() for col in toCols]), tmpCols, tmpTbl, toCols[0][0].decode(), toCols[1][0].decode()) psql(ins) inserted = toDB.exec_dml(ins, trans=to_conn, commit=False) pfmtd([dict(Inserted=inserted)]) else: tmpCols = ',\n '.join( ['tmid.%s' % col[0].decode() for col in toCols]) ins = """ insert into {0} ( {1} ) select distinct {2} from {3} tmid LEFT JOIN {0} ta ON ta.{4} = tmid.{4} AND ta.AsOfTo = tmid.AsOfTo where ta.MD5 <> tmid.MD5 OR ta.{4} is NULL ; """.format(table_name, ',\n '.join([col[0].decode() for col in toCols]), tmpCols, tmpTbl, toCols[0][0].decode()) psql(ins) inserted = toDB.exec_dml(ins, trans=to_conn, commit=False) pfmtd([dict(Inserted=inserted)]) if 1: toDB.commit_transaction(trans=to_conn) pfmt([[k] + list(v.values())[1:] for k, v in pkstats.items()], [ 'Table', 'Accepted', 'Rejected', 'Line count', 'Skip', 'Diff' ], 'Load completed'.upper()) pfmt([(k, v) for k, v in loaded.items()], ['Loaded Files', 'Loaded Tables']) pfmt([(k, v) for k, v in not_loaded.items()], ['Not loaded Files', 'Not loaded Tables']) assert os.path.isdir(okdir) if 0: 
cli.MoveSnapFolder(okdir) processed.append(dict(ProcessedFile=okfn)) #break; if not ok_files.file_names: pfmtd([ dict(NoFiles='No OK files at working dir: [ %s ]' % cli.pa[0]) ]) pfmtd(processed) pfmtd(not_processed) if 0: email_args.update(dict(cli_stats=None)) Email.send_email(**email_args) cli.done()
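# A minimal sketch of the per-row MD5 tag built in the PK-file loop above: hash the raw
# line with the '|' column separators removed, base64-encode the digest and strip the '='
# padding, giving the 22-character value stored in the MD5 char(22) column. The sample
# line is an assumption, only there to make the snippet runnable.
import base64
import hashlib

def row_md5(line):
    digest = hashlib.md5(line.replace(b'|', b'')).digest()
    return base64.b64encode(digest).decode('ascii', 'ignore').strip('=')

# row_md5(b'123|ABC|2021-01-01') -> a 22-character tag used for change detection.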
def run(): lite_tbl={} stats={} if 1: #REST for _rest, val in cli.cfg['rest'].items(): cli.set_source(_rest) cli.set_rest(_rest) _rest_class = list(val.keys())[0] cli.rcfg= rcfg=cli.get_rcfg(_rest_class) pp(cli.rcfg) REST = create_reader(aname =_rest_class, app_init=app_init ) #REST.read_stream ( pipe = from_conn, skip_header = 0, out=trans_ids) #e() if 1: #//Load data for _trg_class, val in cli.cfg['target'][_rest].items() or []: cli.tcfg= tcfg = cli.get_tcfg(_trg_class) _todbname=tcfg["targetDb"] toDB = create_writer (aname =_trg_class, app_init=app_init ) toDB.begin_transaction ( env =tcfg['targetDb'] , out = to_conn ) if 1: #//reset date in case of AccountingDate=="auto" cli.set_default_acct_date(toDB, tcfg) acct_date=cli.get_parsed(ckey='accountingDate', cfg=tcfg) if 1: #//set acct_year, acct_mon for new target table naming fmt = cli.get_parsed(ckey='accountingDateFmt', cfg=tcfg) cli.set_target_table(tcfg=tcfg, acct_date=acct_date, fmt = fmt) #//count existing recs if 1: if 1: stmt = cli.get_parsed(ckey='preCountTablePartitionStmt', cfg=tcfg) cur = toDB.exec_query(stmt) pre_part_cnt= cur.fetchall()[0][0] if 1: stmt = cli.get_parsed(ckey='preCountTableStmt', cfg=tcfg) cur = toDB.exec_query(stmt) pre_tab_cnt= cur.fetchall()[0][0] assert pre_part_cnt == pre_tab_cnt, 'Extra records in table other than" %s" for AccountingDate "%s".' % (_todbname, acct_date) table='%s.%s' % (tcfg['targetSchema'], tcfg['targetTable']) toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False) if 1: cli.rest_acct_date= acct_date.replace('/','') REST.read_json_data ( cfg= cli.rcfg, skip_header = 0, out = rest_pipe, read_stats=read_stats) xref=cli.tcfg["columnMap"] cols = toDB.get_table_cols(schema = cli.get_parsed(ckey='targetSchema', cfg=tcfg), tab=cli.get_parsed(ckey='targetTable', cfg=tcfg)) #ppe(set(cols) - set([x[0] for x in xref.values()])) pp(set([x[0] for x in xref.values()])) pp(set(cols)) assert not (set([x[0] for x in xref.values()]) - set(cols)), 'There are columns in config, but not in target table: %s' % (set([x[0] for x in xref.values()]) - set(cols)) assert not (set(cols) - set([x[0] for x in xref.values()])), 'There are columns in target table, but not in config: %s' % (set([x[0] for x in xref.values()]) - set(cols)) toDB.insert_RC_data ( trans = to_conn, target = cli.tcfg, source = rest_pipe, stmt = 'insertStmt' , insert_stats=insert_stats) if 1: stmt = cli.get_parsed(ckey='afterCountStmt', cfg=tcfg) cur = toDB.exec_query(stmt) after_cnt= cur.fetchall()[0][0] stats['%s->%s' % (_rest, _todbname)] =st= OrderedDict() st['source_cnt'] = len(toDB.rows) st['total_extracted'] = read_stats.total_read st['total_inserted'] = insert_stats.inserted_cnt st['after_count'] = after_cnt st['rollback'] = cli.get_parsed(ckey='rollbackStmt', cfg=tcfg) st['purge'] = cli.get_parsed(ckey='purgeStmt', cfg=tcfg) try: assert v['source_cnt'] == v['total_extracted'], "source_cnt %s <> total_extracted %s" % ( v['source_cnt'], v['total_extracted']) assert v['source_cnt'] == v['total_inserted'], "source_cnt %s <> total_inserted %s" % ( v['source_cnt'], v['total_inserted']) assert v['source_cnt'] == v['after_count'] , "source_cnt %s <> after_count %s" % ( v['source_cnt'], v['after_count']) except Exception as ex: del_cnt = toDB.exec_dml( dml=st['rollback'], trans=to_conn, commit=True) log.info('Rolled back recs: %d' % del_cnt) raise if 1: #//purge purge_cnt = toDB.exec_dml( dml=st['purge'], trans=to_conn, commit=True) log.info('Purged old recs: %d' % purge_cnt) toDB.commit_transaction( trans = 
to_conn ) if 0: email_args.update(dict(cli_stats=stats)) Email.send_email( **email_args)
def run(): ext_files = [] for _source, val in cli.cfg['source'].items(): _dbname = val["sourceDb"] DB = create_reader(_dbname, app_init=app_init) FileWriter = create_writer('File', app_init=app_init) data_files.file_names = [] uploaded_files.file_names = [] if 1: cli.set_source(_source) DB.set_loader(FileWriter) total_ins = 0 scfg = cli.get_scfg() source_chunk_size = scfg['sourceChunkSize'] #maxRowsPerFile for cid, iq_data in enumerate( DB.fetch_many(chunk_size=source_chunk_size, source=scfg, qname='sourceStmt', out=InOut(), skip_header=0)): dump_file = InOut() FileWriter.open_file(id=cid, out=dump_file) if 1: #not total_ins: dump_cfg = cli.get_dcfg() FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=dump_cfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=dump_cfg) total_ins += len(iq_data.data) FileWriter.close_file(file=dump_file) ext_files.append(dump_file.fpath) #if not total_ins: #in case there's no data # FileWriter.create_header(file = dump_file, header = DB.get_header(), cfg = dump_cfg) pp(ext_files) if 1: #Load to DB cli.set_source(_source) file_scfg = cli.cfg['dump'][_source] path = cli.get_parsed(ckey='dumpDir', cfg=file_scfg) Dir.get_files(path=path, out=data_files) pp(data_files.file_names) if 1: to_conn = InOut() for _target, val in cli.cfg['target'][_source].items() or []: tcfg = cli.cfg['target'][_source][_target] _todbname = val["targetDb"] toDB = create_writer(_target, app_init=app_init) #print toDB #e() #toDB.begin_transaction ( out = to_conn ) rec_delim = '\n' skip_header = 0 #S3.upload_files ( file_names = data_files, out = uploaded_files, skip_header=skip_header, rec_delim=rec_delim) toDB.insert_files(file_names=data_files, out=uploaded_files, skip_header=skip_header, rec_delim=rec_delim, cfg=(file_scfg, tcfg)) #trans = to_conn, file_names = data_files, qname = 'insertStmt', cfg = (file_scfg, tcfg) ) #toDB.commit_transaction ( trans = to_conn) if 0: Email.send_email(**email_args)
def run(): total_ins = 0 for _source, val in cli.cfg['source'].items(): cli.set_source(_source) _src_class = val.keys()[0] cli.scfg = cli.get_scfg(_src_class) _dbname = cli.scfg["sourceDb"] fromDB = create_reader(_src_class, app_init=app_init) fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn) print from_conn.conn if 1: #Extract to Dir for _dmp_class, val in cli.cfg['dump'][_source].items() or []: FileWriter = create_writer(_dmp_class, app_init=app_init) fromDB.set_loader(FileWriter) cli.dcfg = cli.get_dcfg(_dmp_class) for _trg_class, val in cli.cfg['target'][_source].items( ) or []: cli.tcfg = tcfg = cli.get_tcfg(_trg_class) file_ins_cnt = 0 FileWriter.open_file(out=dump_file) for iq_data in fromDB.fetch_many(chunk_size=file_size_rows, source=cli.scfg, qname='sourceStmt', out=InOut(), skip_header=0): if not file_ins_cnt: FileWriter.create_header( file=dump_file, header=fromDB.get_header(), cfg=cli.dcfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg) file_ins_cnt += len(iq_data.data) if not file_ins_cnt: #in case there's no data FileWriter.create_header(file=dump_file, header=fromDB.get_header(), cfg=cli.dcfg) FileWriter.close_file(file=dump_file) total_ins += file_ins_cnt fromDB.commit_transaction(trans=from_conn) log.info('Total records saved: %d' % total_ins) #// Load to IQ for _source, val in cli.cfg['dump'].items(): cli.set_source(_source) _src_class = val.keys()[0] DirReader = create_reader(_src_class, app_init=app_init) if 1: #Get the file names cli.set_source(_source) dir_scfg = cli.get_dcfg(_src_class) path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg) DirReader.get_files(path=path, out=data_files, chunk_size=file_size_rows) if 1: #Load to DB to_conn = InOut() for _trg_class, val in cli.cfg['target'][_source].items() or []: cli.tcfg = tcfg = cli.get_tcfg(_trg_class) _dbname = tcfg["targetDb"] toDB = create_writer(_trg_class, app_init=app_init) toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn) table = tcfg['targetTable'] #toDB.truncate_table ( table = table ) toDB.bulk_insert(trans=to_conn, file_names=data_files, qname='insertStmt', cfg=(dir_scfg, tcfg)) toDB.commit_transaction(trans=to_conn) if 0: Email.send_email(**email_args)
def run(): lite_tbl = {} for _source, val in cli.cfg['source'].items(): _dbname = val["sourceDb"] DB = create_reader(_dbname, app_init=app_init) FileWriter = create_writer('File', app_init=app_init) data_files.file_names = [] if 1: cli.set_source(_source) DB.set_loader(FileWriter) total_ins = 0 FileWriter.open_file(out=dump_file) for iq_data in DB.fetch_many(chunk_size=file_size_rows, source=cli.get_scfg(), qname='sourceStmt', out=InOut(), skip_header=0): if not total_ins: FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=cli.get_tcfg()) FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.get_tcfg()) total_ins += len(iq_data.data) if not total_ins: #in case there's no data FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=cli.get_tcfg()) FileWriter.close_file(file=dump_file) if 1: cli.set_source(_source) lite_scfg, lite_tcfg = cli.cfg['teardown']['source'][ _source], cli.cfg['teardown']['target'][_source] #pp(lite_scfg) path = cli.get_parsed(ckey='sourceDir', cfg=lite_scfg) #pp(path) #e() Dir.get_files(path=path, out=data_files) if 1: SQLite.begin_transaction(out=lite_conn) SQLite.bulk_insert(trans=lite_conn, file_names=data_files, qname='insertStmt', cfg=(lite_scfg, lite_tcfg)) SQLite.commit_transaction(trans=lite_conn) lite_tbl[_source] = cli.get_parsed(ckey='targetTable', cfg=lite_tcfg) SQLite.show_data(lite_tbl[_source]) if 0: _source = "SQLServer" data_files.file_names = [] if 1: cli.set_source(_source) SQLServer.set_loader(SQL_FileWriter) total_ins = 0 SQL_FileWriter.open_file(out=dump_file) for iq_data in SQLServer.fetch_many(chunk_size=file_size_rows, source=cli.get_scfg(), qname='sourceStmt', out=InOut(), skip_header=0): if not total_ins: SQL_FileWriter.create_header(file=dump_file, header=SQLServer.get_header(), cfg=cli.get_tcfg()) SQL_FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.get_tcfg()) total_ins += len(iq_data.data) if not total_ins: #in case there's no data SQL_FileWriter.create_header(file=dump_file, header=SQLServer.get_header(), cfg=cli.get_tcfg()) SQL_FileWriter.close_file(file=dump_file) if 1: cli.set_source(_source) lite_scfg, lite_tcfg = cli.cfg['teardown']['source'][ _source], cli.cfg['teardown']['target'][_source] pp(lite_scfg) path = cli.get_parsed(ckey='sourceDir', cfg=lite_scfg) pp(path) #e() Dir.get_files(path=path, out=data_files) if 1: SQLite.begin_transaction(out=lite_conn) SQLite.bulk_insert(trans=lite_conn, file_names=data_files, qname='insertStmt', cfg=(lite_scfg, lite_tcfg)) SQLite.commit_transaction(trans=lite_conn) lite_tbl_2 = cli.get_parsed(ckey='targetTable', cfg=lite_tcfg) #pp(lite_tbl_2) SQLite.show_data(lite_tbl_2) if 1: tear = cli.tear compare = tear['compare'] source = tear['source'] fmt = {} for db in source: fmt[db] = lite_tbl[db] for k, v in compare.items(): compare[k] = v.format(**fmt) cli.exec_report(SQLite, compare) if 1: Email.send_email(**email_args)
def run(): lite_tbl = {} stats = {} for _source, val in cli.cfg['source'].items(): _dbname = val["sourceDb"] DB = create_reader(_dbname, app_init=app_init) FileWriter = create_writer('File', app_init=app_init) data_files.file_names = [] if 1: cli.set_source(_source) DB.set_loader(FileWriter) total_ins = 0 FileWriter.open_file(out=dump_file) dump_cfg = cli.get_dcfg() for iq_data in DB.fetch_many(chunk_size=file_size_rows, source=cli.get_scfg(), qname='sourceStmt', out=InOut(), skip_header=0): if not total_ins: FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=dump_cfg) FileWriter.append_data(file=dump_file, data=iq_data, cfg=dump_cfg) total_ins += len(iq_data.data) if not total_ins: #in case there's no data FileWriter.create_header(file=dump_file, header=DB.get_header(), cfg=dump_cfg) FileWriter.close_file(file=dump_file) if 1: #Load to DB cli.set_source(_source) file_scfg = cli.cfg['dump'][_source] path = cli.get_parsed(ckey='dumpDir', cfg=file_scfg) Dir.get_files(path=path, out=data_files) if 1: to_conn = InOut() for _target, val in cli.cfg['target'][_source].items() or []: tcfg = cli.cfg['target'][_source][_target] _todbname = val["targetDb"] toDB = create_writer(_todbname, app_init=app_init) toDB.begin_transaction(out=to_conn) toDB.bulk_insert(trans=to_conn, file_names=data_files, qname='insertStmt', cfg=(file_scfg, tcfg), out=insert_stats) toDB.commit_transaction(trans=to_conn) FileWriter.delete_dump(data_files) stats['%s->%s' % (_dbname, _todbname)] = st = OrderedDict() st['source_cnt'] = cli.get_src_row_count( DB) if not cli.lame_duck else cli.lame_duck st['total_extracted'] = total_ins st['total_inserted'] = insert_stats.inserted_cnt for k, v in stats.items(): assert v['source_cnt'] == v['total_extracted'] assert v['source_cnt'] == v['total_inserted'] if 1: email_args.update(dict(cli_stats=stats)) Email.send_email(**email_args)
def run():
    stats = {}
    total_ins = 0
    term_line = True

    #// validate cols
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        for _trg_class, val in cli.cfg['target'][_source].items() or []:
            cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
            if tcfg.get('accountingDate', None):
                #// set acct_year, acct_mon for new target table naming
                fmt = cli.get_parsed(ckey='accountingDateFmt', cfg=tcfg)
                cli.set_target_table(tcfg=tcfg, acct_date=cli.get_parsed(ckey='accountingDate', cfg=tcfg), fmt=fmt)
            _dbname = tcfg["targetDb"]
            toDB = create_writer(aname=_trg_class, app_init=app_init)
            toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
            table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
            toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
            #// validate cols: config columnMappings vs target table columns
            cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']]
            tcols = toDB.get_cols()
            t_vs_c = set(tcols) - set(cfg_cols)
            c_vs_t = set(cfg_cols) - set(tcols)
            if t_vs_c:
                pfmtd([dict(c_vs_t=c_vs_t)], 'Config has columns missing in target table.')
                raise Exception('Target table has columns missing in config: %s' % t_vs_c)
            if c_vs_t:
                pfmtd([dict(t_vs_c=t_vs_c)], 'Target table has columns missing in config.')
                raise Exception('Config has columns missing in target table: %s' % c_vs_t)
            toDB.commit_transaction(trans=to_conn)

    #// transfer
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        _dbname = cli.scfg["sourceDb"]
        #// in include/extractor
        fromDB = create_reader(aname=_src_class, app_init=app_init)
        fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
        if 1:  #// Extract to File
            for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
                FileWriter = create_writer(aname=_dmp_class, app_init=app_init)
                fromDB.set_loader(FileWriter)
                cli.dcfg = cli.get_dcfg(_dmp_class)
                for _trg_class, val in cli.cfg['target'][_source].items() or []:
                    cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                    file_ins_cnt = 0
                    FileWriter.open_file(out=dump_file)
                    start_time = time.time()
                    #// if fetch_many is not in IQ - it's in include/extractor/common/Extractor.py
                    for iq_data in fromDB.fetch_many(chunk_size=file_size_rows, source=cli.scfg,
                                                     qname='sourceStmt', out=InOut(), skip_header=0,
                                                     terminate_line=term_line):
                        if 1:
                            if not file_ins_cnt:
                                FileWriter.create_header(file=dump_file, header=fromDB.get_header(),
                                                         cfg=cli.dcfg, terminate_line=term_line)
                            FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg)
                            file_ins_cnt += len(iq_data.data)
                            FileWriter.terminate(file=dump_file)
                        print(len(iq_data.data))
                        print('Elapsed read/write: %s' % (time.time() - start_time))
                        start_time = time.time()
                    if not file_ins_cnt:  # in case there's no data
                        FileWriter.create_header(file=dump_file, header=fromDB.get_header(),
                                                 cfg=cli.dcfg, terminate_line=term_line)
                    #else:
                    #    FileWriter.terminate(file=dump_file)
                    FileWriter.close_file(file=dump_file)
                    total_ins += file_ins_cnt
        fromDB.desc_cur(cur=from_conn.cur, colord=False)
        fromDB.commit_transaction(trans=from_conn)
    log.info('Total records saved: %d' % total_ins)

    #// Load to IQ
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)
        if 1:  #// Get the file names
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            DirReader.glob_dir(path=path, out=data_files, ext='*.*')
        if 1:  #// Load to DB
            for _trg_class, val in cli.cfg['target'][_source].items() or []:
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                toDB = create_writer(aname=_trg_class, app_init=app_init)
                toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
                toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
                #// validate cols: config vs dump-file header vs target table
                cfg_cols = [x[u'columnName'] for x in cli.scfg[u'columnMappings']]
                acols = cli.get_alt_cols(scfg)
                tcols = toDB.get_cols()
                fcols_alt = []
                for data_file in data_files.file_names:
                    dataFile = create_reader(aname='File', app_init=app_init, file_name=data_file, scfg=dir_scfg)
                    dataFile.describe()
                    file_stats[data_file] = dataFile.line_count() - cli.header_size(dir_scfg)
                    fcols_alt = [acols.get(x.decode(), x.decode()) for x in dataFile.get_header(data_file, dir_scfg)]
                    f_vs_c = set(fcols_alt) - set(cfg_cols)
                    c_vs_f = set(cfg_cols) - set(fcols_alt)
                    f_vs_t = set(fcols_alt) - set(tcols)
                    t_vs_f = set(tcols) - set(fcols_alt)
                    if f_vs_c:
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Dump file has columns missing in config: %s' % f_vs_c)
                    if c_vs_f:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Config has columns missing in dump file: %s' % c_vs_f)
                    if f_vs_t:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(t_vs_f=t_vs_f)], 'Target table has columns missing in dump file.')
                        raise Exception('Dump file has columns missing in target table: %s' % f_vs_t)
                    if t_vs_f:
                        pfmtd([dict(f_vs_c=f_vs_c)], 'Dump file has columns missing in config.')
                        pfmtd([dict(c_vs_f=c_vs_f)], 'Config has columns missing in dump file.')
                        pfmtd([dict(f_vs_t=f_vs_t)], 'Dump file has columns missing in target table.')
                        raise Exception('Target table has columns missing in dump file: %s' % t_vs_f)
                #toDB.truncate_table(table=table)
                toDB.bulk_load(trans=to_conn, file_names=data_files, qname='insertStmt',
                               cfg=(dir_scfg, tcfg), out=insert_stats, header=fcols_alt)
                toDB.commit_transaction(trans=to_conn)
                for k in file_stats.keys():
                    assert file_stats[k], 'Dump file is empty'
                    assert insert_stats[k] not in [-1], 'Insert failed'
                    assert insert_stats[k] == file_stats[k], \
                        'Insert vs file count diff: %s<>%s for file \n%s' % (insert_stats[k], file_stats[k], k)
                if 1:  #// post-load counts and statements
                    stmt = cli.get_parsed(ckey='afterCountStmt', cfg=tcfg)
                    cur = toDB.exec_query(stmt)
                    after_cnt = cur.fetchall()[0][0]
                    print(after_cnt)
                    stats['%s->%s' % (_source, _trg_class)] = st = OrderedDict()
                    st['source_cnt'] = total_ins
                    st['total_inserted'] = sum(insert_stats.values())
                    st['after_count'] = after_cnt
                    st['rollback'] = cli.get_parsed(ckey='rollbackStmt', cfg=tcfg)
                    st['purge'] = cli.get_parsed(ckey='purgeStmt', cfg=tcfg)
                if 1:  #// validate, roll back on mismatch
                    try:
                        assert st['source_cnt'] == st['total_inserted'], \
                            "source_cnt %s <> total_inserted %s" % (st['source_cnt'], st['total_inserted'])
                        assert st['source_cnt'] == st['after_count'], \
                            "source_cnt %s <> after_count %s" % (st['source_cnt'], st['after_count'])
                    except Exception as ex:
                        del_cnt = toDB.exec_dml(dml=st['rollback'], trans=to_conn, commit=True)
                        log.info('Rolled back recs: %d' % del_cnt)
                        raise
                if 1:  #// purge
                    purge_cnt = toDB.exec_dml(dml=st['purge'], trans=to_conn, commit=True)
                    log.info('Purged old recs: %d' % purge_cnt)
                    toDB.commit_transaction(trans=to_conn)

    if 0:
        Email.send_email(**email_args)
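# Hedged sketch of the column validation performed above, reduced to plain set arithmetic.
# The helper name and sample lists are hypothetical; in the pipeline the three column
# lists come from columnMappings in the config, the dump-file header, and desc_table/get_cols.
def validate_columns(cfg_cols, file_cols, table_cols):
    """Raise if any of the three column sets disagrees with another."""
    pairs = [
        ('Dump file', file_cols, 'config', cfg_cols),
        ('Config', cfg_cols, 'dump file', file_cols),
        ('Dump file', file_cols, 'target table', table_cols),
        ('Target table', table_cols, 'dump file', file_cols),
    ]
    for left_name, left, right_name, right in pairs:
        extra = set(left) - set(right)
        if extra:
            raise Exception('%s has columns missing in %s: %s' % (left_name, right_name, sorted(extra)))

validate_columns(['id', 'ts'], ['id', 'ts'], ['id', 'ts'])  # passes; any mismatch raises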
from include.utils import create_reader, create_writer, create_actor, InOut

cli, conn_pool = app_init

IQ = create_reader('IQ', app_init=app_init)
IQ_Writer = create_writer('IQ', app_init=app_init)
Email = create_actor('Email', app_init=app_init)

IQ_cursor = InOut()
s3_file_names = InOut()
snow_conn = InOut()
##
##
email_args = {'email_subject': 'IQ.procedure->IQ'}
##
##


def run():
    IQ.set_loader(IQ_Writer)
    IQ.open_stream(dbcfg=cli.scfg, qname='sourceStmt', out=IQ_cursor)
    IQ_Writer.begin_transaction(out=snow_conn)
    IQ_Writer.purge_data(trans=snow_conn, stmt='purgeStmt')
    IQ_Writer.bulk_copy(trans=snow_conn, file_names=s3_file_names, target=cli.tcfg, qname='copyStmt')
    IQ_Writer.commit_transaction(trans=snow_conn)
    IQ_Writer.delete_files(file_names=s3_file_names)
    if 0:
        Email.send_email(**email_args)
""" time python cli2.py -nopp 18 -dcf config/db_config.json -pcf config/proc/iq_s3_snow/DY_Position_SD.json --proc_params \ 223906 05/30/2019 'EOD' 'DESK' "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" -ld 100\ 2>&1| tee DY_Position_SD.log """ from include.utils import create_reader, create_writer, create_actor, InOut cli, conn_pool = app_init SQLServer = create_reader('SQLServer', app_init=app_init) Snowflake = create_writer('Snowflake', app_init=app_init) S3StreamLoader = create_writer('S3StreamLoader', app_init=app_init) Email = create_actor('Email', app_init=app_init) SQL_cursor = InOut() s3_file_names = InOut() snow_conn = InOut() ## ## email_args = {'email_subject': 'SQL->Snowflake'} ## ## def run(): SQLServer.set_loader(Snowflake) SQLServer.open_stream(dbcfg=cli.scfg, qname='sourceStmt', out=SQL_cursor) S3StreamLoader.load_stream(source=SQL_cursor, skip_header=0,
def run():
    lite_tbl = {}
    stats = {}
    term_line = True  # line-termination flag; assumed default, not set elsewhere in this snippet
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        _dbname = cli.scfg["sourceDb"]
        fromDB = create_reader(aname=_src_class, app_init=app_init)
        #FileWriter = create_writer(aname='File', app_init=app_init)
        data_files.file_names = []
        if 1:
            cli.set_source(_source)
            fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
            for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
                FileWriter = create_writer(aname=_dmp_class, app_init=app_init)
                fromDB.set_loader(FileWriter)
                cli.dcfg = dcfg = cli.get_dcfg(_dmp_class)
                for _trg_class, val in cli.cfg['target'][_source].items() or []:
                    cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                    file_ins_cnt = 0
                    total_ins = 0
                    FileWriter.open_file(out=dump_file)
                    print(dump_file.fpath)
                    if 1:  # Extract to dump file
                        #for iq_data in DB.fetch_many(chunk_size=file_size_rows, source=cli.get_scfg(), qname='sourceStmt', out=InOut(), skip_header=0):
                        for iq_data in fromDB.fetch_many(chunk_size=file_size_rows, source=cli.scfg,
                                                         qname='sourceStmt', out=InOut(), skip_header=0,
                                                         terminate_line=term_line):
                            if not file_ins_cnt:
                                FileWriter.create_header(file=dump_file, header=fromDB.get_header(),
                                                         cfg=cli.dcfg, terminate_line=term_line)
                            FileWriter.append_data(file=dump_file, data=iq_data, cfg=cli.dcfg)
                            file_ins_cnt += len(iq_data.data)
                        if not file_ins_cnt:  # in case there's no data
                            FileWriter.create_header(file=dump_file, header=fromDB.get_header(),
                                                     cfg=cli.dcfg, terminate_line=term_line)
                        FileWriter.close_file(file=dump_file)
                        total_ins += file_ins_cnt
                    if 1:  #// check if there's data in the file
                        dataFile = create_reader(aname='File', app_init=app_init,
                                                 file_name=dump_file.fpath, scfg=dcfg)
                        dataFile.describe()
                        lcnt = dataFile.line_count() - cli.header_size(dcfg)
                        assert lcnt, 'Dump file is empty\n%s' % dump_file.fpath
                        #e()
                    if 1:  # Load to DB
                        cli.set_source(_source)
                        dir_scfg = cli.get_dcfg(_dmp_class)
                        path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
                        DirReader = create_reader(aname=_dmp_class, app_init=app_init)
                        DirReader.glob_dir(path=path, out=data_files, ext='*.*')
                        if 1:
                            to_conn = InOut()
                            _todbname = tcfg["targetDb"]
                            toDB = create_writer(aname=_todbname, app_init=app_init)
                            toDB.begin_transaction(env=cli.scfg['sourceDb'], out=to_conn)
                            #toDB.begin_transaction(out=to_conn)
                            toDB.bulk_insert(trans=to_conn, file_names=data_files, qname='insertStmt',
                                             cfg=(dir_scfg, tcfg), out=insert_stats)
                            toDB.commit_transaction(trans=to_conn)
                        if 1:
                            FileWriter.delete_dump(data_files)
                            stats['%s->%s' % (_dbname, _todbname)] = st = OrderedDict()
                            st['source_cnt'] = cli.get_src_row_count(fromDB) if not cli.lame_duck else cli.lame_duck
                            st['total_extracted'] = total_ins
                            st['total_inserted'] = insert_stats.inserted_cnt

    if 1:
        for k, v in stats.items():
            assert v['source_cnt'] == v['total_extracted'], " %s <> %s" % (v['source_cnt'], v['total_extracted'])
            assert v['source_cnt'] == v['total_inserted']

    if 1:
        email_args.update(dict(cli_stats=stats))
        Email.send_email(**email_args)
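# Hedged sketch of the chunked extract loop used above, written against a plain DB-API
# cursor rather than fromDB.fetch_many. fetch_chunks, dump_to_file, the delimiter and
# chunk_size are stand-ins for illustration, not the project's actual API.
def fetch_chunks(cursor, chunk_size):
    """Yield lists of rows until the cursor is exhausted."""
    while True:
        rows = cursor.fetchmany(chunk_size)
        if not rows:
            break
        yield rows

def dump_to_file(cursor, path, chunk_size=10000):
    """Write pipe-delimited rows to path, header first; return the row count."""
    written = 0
    with open(path, 'w') as fh:
        header = '|'.join(col[0] for col in cursor.description) + '\n'
        for chunk in fetch_chunks(cursor, chunk_size):
            if not written:  # write the header once, before the first data chunk
                fh.write(header)
            fh.writelines('|'.join(str(v) for v in row) + '\n' for row in chunk)
            written += len(chunk)
        if not written:  # no data: still emit the header, as run() does
            fh.write(header)
    return written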
def run():
    lite_tbl = {}
    stats = {}
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)
        _dbname = cli.scfg["sourceDb"]
        fromDB = create_reader(aname=_src_class, app_init=app_init)
        fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
        print(fromDB.conn)
        fromDB.open_query_cur(dbcfg=cli.scfg, qname='RefCode_sourceStmt', out=from_conn)
        if 1:  # REST: read transaction ids from the source cursor
            for _rest, val in cli.cfg['rest'].items():
                cli.set_rest(_rest)
                _rest_class = list(val.keys())[0]
                cli.rcfg = rcfg = cli.get_rcfg(_rest_class)
                pp(cli.rcfg)
                REST = create_reader(aname=_rest_class, app_init=app_init)
                REST.read_stream(pipe=from_conn, skip_header=0, out=trans_ids)
        if 1:  #// Load data
            for _trg_class, val in cli.cfg['target'][_source].items() or []:
                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                _dbname = tcfg["targetDb"]
                toDB = create_writer(aname=_trg_class, app_init=app_init)
                toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                table = '%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
                toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
                REST.open_stream(source=cli.rcfg, skip_header=0, trans_ids=trans_ids, out=rest_pipe)
                toDB.insert_trans_data(trans=to_conn, target=cli.tcfg, source=rest_pipe, stmt='insertStmt')
                toDB.commit_transaction(trans=to_conn)
        if 0:  # Disabled leftover of the file-dump path; _todbname, DB and insert_stats are stale here
            cli.set_source(_source)
            fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
            for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
                FileWriter = create_writer(aname=_dmp_class, app_init=app_init)
                fromDB.set_loader(FileWriter)
                cli.dcfg = dcfg = cli.get_dcfg(_dmp_class)
                for _trg_class, val in cli.cfg['target'][_source].items() or []:
                    cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                    file_ins_cnt = 0
                    total_ins = 0
                    stats['%s->%s' % (_dbname, _todbname)] = st = OrderedDict()
                    st['source_cnt'] = cli.get_src_row_count(DB) if not cli.lame_duck else cli.lame_duck
                    st['total_extracted'] = total_ins
                    st['total_inserted'] = insert_stats.inserted_cnt

    if 0:
        for k, v in stats.items():
            assert v['source_cnt'] == v['total_extracted'], " %s <> %s" % (v['source_cnt'], v['total_extracted'])
            assert v['source_cnt'] == v['total_inserted']

    if 0:
        email_args.update(dict(cli_stats=stats))
        Email.send_email(**email_args)
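# Hedged sketch of the source -> REST -> target hand-off above, using generic pieces:
# ids come from a DB-API cursor, go to a hypothetical REST endpoint in batches, and the
# JSON records are yielded for a loader to insert. The URL, batch size and parameter
# names are made up for illustration; only the id-driven streaming pattern is from run().
import requests

def stream_rest_records(cursor, url, batch=100):
    """Yield JSON records for each batch of ids read from the cursor."""
    ids = [row[0] for row in cursor.fetchall()]
    for i in range(0, len(ids), batch):
        resp = requests.get(url, params={'ids': ','.join(map(str, ids[i:i + batch]))})
        resp.raise_for_status()
        for rec in resp.json():
            yield rec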
""" """ import sys import threading import subprocess from pprint import pprint as pp from include.utils import create_reader, create_writer, create_actor, InOut e = sys.exit cli, conn_pool = app_init Dir = create_reader('Dir', app_init=app_init) SQLite = create_writer('SQLite', app_init=app_init) Email = create_actor('Email', app_init=app_init) data_files = InOut() lite_conn = InOut() ## ## email_args = {'email_subject': 'File->SQLite'} ## ## data_files.file_names = [] def run(): Dir.get_files(out=data_files)
""" time python cli.py -nopp 3 -dcf config/db_config.DEV.json \ -pcf config/proc/sync/iq_mem_sql/delta_load.json --proc_params \ CIGActgH.HydraPNLEntries "WHERE LastModifiedTime>'2016-06-19'" Accounting.CIGActgH.HydraPNLEntries \ 2>&1| tee delta_load.log """ import sys from pprint import pprint as pp from include.utils import create_reader, create_writer, create_actor, InOut e=sys.exit cli, conn_pool=app_init IQ = create_reader('IQ', app_init=app_init ) SQLServer = create_reader('SQLServer', app_init=app_init ) IQ_FileWriter = create_writer('File', app_init=app_init ) SQL_FileWriter = create_writer('File', app_init=app_init ) Email = create_actor ('Email',app_init=app_init ) Dir = create_reader('Dir', app_init=app_init ) SQLite = create_writer('SQLite', app_init=app_init ) sql_conn = InOut() dump_file = InOut() data_files = InOut() lite_conn = InOut()