def chooseItems(mode='random', nItems=10):
    """Query the db for produceable items, apply the requested selection mode,
    and return a dict mapping typeID to manufSize."""
    command = (f'SELECT "typeID", "manufSize" '
               f'FROM "BlueprintPriority" '
               f'WHERE "priority" = "manufacturing" '
               f'AND "lowPriority" = 0')
    typeIDs = utils.dbQuery(utils.currentDb, command, fetchAll=True)
    typeIDs = dict(typeIDs)
    if mode == 'random':
        # random.sample needs a sequence, not a dict view
        chosen = random.sample(list(typeIDs), nItems)
        return {typeID: typeIDs[typeID] for typeID in chosen}
    elif mode == 'market':
        profits = market.itemProfits(typeIDs)
        sortedProfits = sorted(profits.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        totalProjectedProfits = utils.millify(
            sum(x[1] for x in sortedProfits[0:nItems]))
        print(f"expected total profit: {totalProjectedProfits}")
        return {typeID[0]: typeIDs[typeID[0]]
                for typeID in sortedProfits[0:nItems]}
def getManufacturingIndex(systemName='Ashab'):
    """Return the manufacturing cost index stored for the given solar system."""
    command = (f'SELECT "manufacturing" '
               f'FROM sysIndices '
               f'WHERE systemName == "{systemName}"')
    index = utils.dbQuery(utils.currentDb, command)
    return index
def getAdjustedPrice(typeID):
    """get adjusted price from the database"""
    command = (f'SELECT "adjPrice" '
               f'FROM adjPrices '
               f'WHERE typeID == {typeID}')
    adjPrice = utils.dbQuery(utils.currentDb, command)
    return adjPrice
def _getCachedAvgPrice(typeID):
    """query the current database for avg prices"""
    command = (f'SELECT "avgPrice", "date" '
               f'FROM "avgPrices" '
               f'WHERE "typeID" = {typeID}')
    cachedPrice = utils.dbQuery(utils.currentDb, command)
    if cachedPrice:
        return cachedPrice
    else:
        return [None, None]
def checkMissing(config_list):
    print_hdr = "[datasync_quality_missing: checkMissing] - "
    print(logdt.now().strftime('[%Y-%m-%d %H:%M:%S] ') + print_hdr + "Entered")
    conn_metadata = None
    try:
        conn_metadata, cur_metadata = dbConnect(config_list['meta_db_dbName'],
                                                config_list['meta_db_dbUser'],
                                                config_list['meta_db_dbUrl'],
                                                base64.b64decode(config_list['meta_db_dbPwd']))
        check_sql = "select c.id, c.data_path, c.load_type, c.source_schemaname||'.'||c.source_tablename as source_table, " \
                    "c.target_schemaname||'.'||c.target_tablename as target_table, " \
                    "c.system_name, c.hvr_source_schemaname, to_char(l.last_success_run_time,'YYYY-MM-DD HH24:MI:SS') as last_success_run_time, " \
                    "to_char(q.last_count_run_time,'YYYY-MM-DD HH24:MI:SS') as last_count_run_time, to_char(c.last_run_time,'YYYY-MM-DD HH24:MI:SS') as last_control_run_time " \
                    "from sync.control_table c " \
                    "left outer join (select data_path, load_type, target_tablename, max(run_time) as last_count_run_time " \
                    "from sbdt.datasync_quality group by data_path, load_type, target_tablename) q " \
                    "on q.data_path = c.data_path " \
                    "AND q.load_type = c.load_type " \
                    "AND q.target_tablename = c.target_schemaname||'.'||c.target_tablename " \
                    "left outer join (select data_path, table_name, max(log_time) as last_success_run_time from sbdt.edl_log " \
                    "where plant_name = 'DATASYNC' and data_path in ('SRC2Hive','Talend2Hive','KFK2Hive','SQOOP2Hive') " \
                    "and status = 'Job Finished' group by data_path, table_name) l " \
                    "on l.data_path = c.data_path " \
                    "AND l.table_name = c.target_schemaname||'.'||c.target_tablename " \
                    "where 1 = 1 " \
                    "AND c.data_path in ('SRC2Hive','Talend2Hive','KFK2Hive','SQOOP2Hive') " \
                    "AND c.source_schemaname not in ('ftp') " \
                    "AND (c.system_name is null or c.system_name not in ('externaldata')) " \
                    "AND c.status_flag = 'Y' " \
                    "AND (c.custom_sql is NULL OR trim(c.custom_sql) = '') " \
                    "AND ((q.last_count_run_time is null) or (l.last_success_run_time is not null " \
                    "and q.last_count_run_time < l.last_success_run_time - interval '1 day')) " \
                    "order by last_success_run_time desc nulls last"
        print(check_sql)
        check_results = dbQuery(cur_metadata, check_sql)
        if len(check_results) > 0:
            mail_subject = "ATTENTION: Datasync Quality: Missing Count Validation"
            sendMailHTML(config_list['email_dataQualityReceivers'], mail_subject,
                         formatMissingMail(check_results))
    except Exception as e:
        mail_subject = "ERROR: Datasync Quality Missing"
        output_msg = "ERROR: Encountered error while running job" + "\n" + traceback.format_exc()
        print(logdt.now().strftime('[%Y-%m-%d %H:%M:%S] ') + print_hdr + output_msg)
        sendMailHTML(config_list['email_dataQualityReceivers'], mail_subject, output_msg)
        sys.exit(0)
    finally:
        if conn_metadata is not None and not conn_metadata.closed:
            conn_metadata.close()
def get_waiting_job(self, lock_dlist):
    print_hdr = "[" + self.class_name + ": get_waiting_job] - "
    conn_metadata = None
    try:
        table_list = []
        results = []
        for lock_dict in lock_dlist:
            if lock_dict['table_name'].find('=') == -1:
                table_list.append(lock_dict['table_name'])
        table_distinct_list = list(set(table_list))
        if table_distinct_list:
            table_filter_clause = ",".join("'" + l + "'" for l in table_distinct_list)
            conn_metadata, cur_metadata = dbConnect(self.config_list['meta_db_dbName'],
                                                    self.config_list['meta_db_dbUser'],
                                                    self.config_list['meta_db_dbUrl'],
                                                    base64.b64decode(self.config_list['meta_db_dbPwd']))
            log_sql = "select job_key, table_name, to_char(max_start_time,'YYYY-MM-DD HH24:MI:SS') as start_time \
                from ( \
                select plant_name ||' : '|| data_path||' : '||job_name||' : '||load_id||' : '||run_id as job_key, table_name, status, log_time, \
                max(log_time) over (partition by table_name) as max_start_time \
                from sbdt.edl_log \
                where 1 = 1 \
                and log_time > (current_timestamp - INTERVAL '1 day') \
                and plant_name not in ('TRANSPORTATION') \
                and (upper(data_path) not like '%2GP' or upper(data_path) not like '%2RDS' or upper(data_path) not like '%2PREDIX') \
                and table_name in (" + table_filter_clause + ") \
                and table_name is not null and length(trim(table_name)) > 0 and table_name <> 'NA') T1 \
                where 1 = 1 \
                and log_time = max_start_time \
                and upper(status) like '%START%'"
            print(logdt.now().strftime('[%Y-%m-%d %H:%M:%S] ') + print_hdr + "log_sql: " + log_sql)
            results = dbQuery(cur_metadata, log_sql)
            print(logdt.now().strftime('[%Y-%m-%d %H:%M:%S] ') + print_hdr + "results: ", results)
        for lock_dict in lock_dlist:
            if len(results) > 0:
                for result in results:
                    if (result['table_name'] == lock_dict['table_name']) and \
                            (datetime.strptime(result['start_time'], '%Y-%m-%d %H:%M:%S') >=
                             (datetime.strptime(lock_dict['lock_datetime'], '%Y-%m-%d %H:%M:%S') - timedelta(minutes=30))):
                        lock_dict['waiting_job'] = result['job_key']
                        lock_dict['waiting_job_start_time'] = result['start_time']
    except Exception as e:
        print(logdt.now().strftime('[%Y-%m-%d %H:%M:%S] ') + print_hdr + "ERROR details: " + traceback.format_exc())
    finally:
        if conn_metadata is not None and not conn_metadata.closed:
            conn_metadata.close()
    return lock_dlist
def baseMaterials(typeID):
    """Return the base material quantities needed to manufacture one run of an item,
    as a dict of materialTypeID -> quantity (or None if the item is not buildable)."""
    typeID = int(typeID)
    returnDict = {}
    command = (f'SELECT "materialTypeID", "quantity" '
               f'FROM "industryActivityMaterials" '
               f'WHERE "TypeID" = {typeID} '
               f'AND "activityID" = 1')
    materials = utils.dbQuery(utils.staticDb, command, fetchAll=True)
    if len(materials) > 0:
        for materialTuple in materials:
            returnDict[materialTuple[0]] = materialTuple[1]
    else:
        return None
    return returnDict
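# Illustrative sketch (not part of the original module): one way the helpers above
# could be combined to estimate a job installation cost, following the usual EVE
# formula of estimated item value (adjusted price * base quantity, summed) scaled
# by the system cost index. The name estimateJobInstallCost and the assumption that
# utils.dbQuery returns a single row as an indexable sequence are assumptions here.
def estimateJobInstallCost(typeID, systemName='Ashab'):
    materials = baseMaterials(typeID)
    if materials is None:
        return None
    estimatedValue = 0
    for materialTypeID, quantity in materials.items():
        adjPrice = getAdjustedPrice(materialTypeID)  # assumed to yield one row like (price,)
        estimatedValue += adjPrice[0] * quantity
    index = getManufacturingIndex(systemName)        # assumed to yield one row like (index,)
    return estimatedValue * index[0]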
def fs2hdfs(self):
    self.technology = 'Python'
    self.system_name = 'HDFS'
    self.job_name = 'FS-->HDFS'
    t = datetime.fromtimestamp(time.time())
    v_timestamp = str(t.strftime('%Y-%m-%d %H:%M:%S'))
    tablename = self.target_schemaname + "." + self.target_tablename
    try:
        conn_metadata, cur_metadata = dbConnect(self.metastore_dbName, self.dbmeta_User,
                                                self.dbmeta_Url, self.dbmeta_Pwd)
    except psycopg2.Error as e:
        error = 1
        err_msg = "Error connecting to control table database".format(error)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(output_msg)
        return error, err_msg, output_msg

    try:
        run_id_sql = "select nextval('sbdt.edl_run_id_seq')"
        run_id_lists = dbQuery(cur_metadata, run_id_sql)
        run_id_list = run_id_lists[0]
        run_id = run_id_list['nextval']
        print("Run ID for the table", tablename, " is : ", run_id)
    except Exception as e:
        print(e)
        error = 2
        err_msg = "Error while getting Run ID"
        status = "Job Error"
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, self.load_id, 0, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)

    status = 'Job Started'
    error = 0
    err_msg = ''
    output_msg = ''
    audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                  self.job_name, tablename, status, self.data_path, self.technology,
                  0, 0, 0, error, err_msg, 0, 0, output_msg)

    if len(self.source_schemaname) > 0 and len(self.source_tablename) > 0:
        local_file_name = self.source_schemaname + self.source_tablename
    elif len(self.source_schemaname) > 0 and len(self.source_tablename) == 0:
        local_file_name = self.source_schemaname
    elif len(self.source_schemaname) == 0 and len(self.source_tablename) > 0:
        local_file_name = self.source_tablename
    else:
        error = 2
        err_msg = "No source to run this program"
        output_msg = "No source to run this program"
        status = 'Job Error'
        print(err_msg)
        audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)
        return error, err_msg, tablename
    print(local_file_name)

    try:
        files = glob.glob(local_file_name)
        if len(files) == 0:
            error = 3
            err_msg = "No data found"
            output_msg = "No data found"
            print(err_msg)
            return error, err_msg, tablename
        else:
            self.target_path = (self.hive_warehouse_path + "/" + self.target_schemaname +
                                ".db/" + self.target_tablename + "/")
            (ret, out, err) = run_cmd(['hadoop', 'fs', '-rm', '-r', (self.target_path + "*")])
            if ret:
                if err.find("No such file or directory") != -1:
                    (ret, out, err) = run_cmd(['hadoop', 'fs', '-mkdir', self.target_path])
                    if ret:
                        pass
                else:
                    error = 4
                    err_msg = "Error in cleaning in target path"
                    output_msg = traceback.format_exc()
                    return error, err_msg, tablename
    except Exception as e:
        error = 5
        err_msg = "Error while checking the local file path or cleaning the target location in HDFS"
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)
        return error, err_msg, tablename

    try:
        files = glob.glob(local_file_name)
        for file in files:
            (ret, out, err) = run_cmd(['hadoop', 'fs', '-copyFromLocal', file, self.target_path])
            if ret > 0:
                error = 5
                err_msg = "Error in ingesting into HDFS"
                output_msg = traceback.format_exc()
                return error, err_msg, tablename
    except Exception as e:
        error = 6
        err_msg = "Error while loading data into HDFS"
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)
        return error, err_msg, tablename

    try:
        update_control_info_sql = ("UPDATE sync.control_table set last_run_time = '" + v_timestamp +
                                   "' where id = " + str(self.id) +
                                   " AND target_schemaname = '" + self.target_schemaname +
                                   "' AND target_tablename = '" + self.target_tablename +
                                   "' AND data_path = '" + self.data_path + "'")
        print(update_control_info_sql)
        cur_metadata.execute(update_control_info_sql)
    except psycopg2.Error as e:
        print(e)
        error = 7
        err_msg = "Error while updating the control table"
        print(err_msg)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)
        return error, err_msg, tablename

    # Final log entry
    try:
        error = 0
        err_msg = 'No Errors'
        status = 'Job Finished'
        output_msg = 'Job Finished successfully'
        print(output_msg)
        audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)
    except psycopg2.Error as e:
        error = 15
        err_msg = "Error while dropping external table in target"
        print(err_msg)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(output_msg)
        audit_logging(cur_metadata, self.load_id, run_id, self.plant_name, self.system_name,
                      self.job_name, tablename, status, self.data_path, self.technology,
                      0, 0, 0, error, err_msg, 0, 0, output_msg)
        return error, err_msg, tablename
    finally:
        conn_metadata.close()
    return error, err_msg, tablename
def fs2hdfs_hive_log(self):
    hosts = []
    # Get information about the table to load
    try:
        metadata_sql = "SELECT * FROM sync.control_table \
                        WHERE target_tablename = 'hive_log_ext' \
                        AND target_schemaname = 'default'" + " \
                        AND data_path = " + "'FS2HDFS'"
        print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + "metadata_sql: " + metadata_sql)
        conn_metadata, cur_metadata = dbConnect(self.metastore_dbName, self.dbmeta_User,
                                                self.dbmeta_Url, self.dbmeta_Pwd)
        print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + "before connecting to metastore controls")
        controls = dbQuery(cur_metadata, metadata_sql)
        # print (datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + "metastore controls:", controls)
    except psycopg2.Error as e:
        error = 2
        err_msg = "Error connecting to control table database".format(error)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(output_msg)
        return output_msg
        sys.exit(error)
    finally:
        conn_metadata.close()

    if not controls:
        error = 3
        err_msg = "No Entry found in control table".format(error)
        status = 'Job Error'
        output_msg = "No Entry found in control table"
        return output_msg
        sys.exit(error)

    self.id = str(controls[0]['id'])
    self.source_schema = str(controls[0]['source_schemaname'])
    self.source_tablename = str(controls[0]['source_tablename'])
    self.target_schema = str(controls[0]['target_schemaname'])
    self.target_tablename = str(controls[0]['target_tablename'])
    partitioned = controls[0]['is_partitioned']
    self.load_type = str(controls[0]['load_type'])
    self.s3_backed = controls[0]['s3_backed']
    first_partitioned_column = str(controls[0]['first_partitioned_column'])
    second_partitioned_column = str(controls[0]['second_partitioned_column'])
    partitioned_column_transformation = str(controls[0]['partition_column_transformation'])
    custom_sql = str(controls[0]['custom_sql'])
    self.join_columns = str(controls[0]['join_columns'])
    self.archived_enabled = controls[0]['archived_enabled']
    distribution_columns = str(controls[0]['distribution_columns'])
    dist_col_transformation = str(controls[0]['dist_col_transformation'])
    self.log_mode = str(controls[0]['log_mode'])
    self.last_run_time = str(controls[0]['last_run_time'])

    incoming_path = self.paths + "/hiveserver2.log"
    local_inprogress_path = self.local_staging_path + "/in_progress/"
    inprogress_path = self.staging_path + self.target_schema + "/" + self.target_tablename + "/in_progress/"
    hosts = self.hive_hosts.split(',')
    print(hosts)

    # Creating the Local in_progress and/or clearing that location for new incoming files
    for host in hosts:
        print("Inside Host path check")
        path_to_check = self.local_staging_path + host
        print(path_to_check)
        path_check = glob.glob(path_to_check)
        print(path_check)
        if len(path_check) > 0:
            print("Path exists... Clearing the directory")
            (ret, out, err) = run_cmd(['rm', '-rf', (path_to_check)])
            print(ret, out, err)
            if ret:
                error = 1
                err_msg = "Error while cleaning in_progress location in Local FS".format(error)
                print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
                status = 'Job Error'
                output_msg = traceback.format_exc()
                print(output_msg)
                sys.exit(error)
                return output_msg
        (ret, out, err) = run_cmd(['mkdir', '-p', path_to_check])
        if ret:
            error = 1
            err_msg = "Error while creating in_progress location in Local FS".format(error)
            print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
            status = 'Job Error'
            output_msg = traceback.format_exc()
            print(output_msg)
            sys.exit(error)
            return output_msg

    path_check = glob.glob(local_inprogress_path)
    if len(path_check) > 0:
        print("Path exists... Clearing the directory")
        (ret, out, err) = run_cmd(['rm', '-rf', (local_inprogress_path)])
        print(ret, out, err)
        if ret:
            error = 1
            err_msg = "Error while cleaning in_progress location in Local FS".format(error)
            print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
            status = 'Job Error'
            output_msg = traceback.format_exc()
            print(output_msg)
            sys.exit(error)
            return output_msg
    (ret, out, err) = run_cmd(['mkdir', '-p', local_inprogress_path])
    if ret:
        error = 1
        err_msg = "Error while creating in_progress location in Local FS".format(error)
        print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(output_msg)
        sys.exit(error)
        return output_msg

    # Creating the HDFS in_progress location and/or clearing that location for new incoming files
    (ret, out, err) = run_cmd(["hadoop", "fs", "-test", "-e", inprogress_path])
    if ret:
        print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + "Directory does not exist ... Creating...")
        (ret, out, err) = run_cmd(["hadoop", "fs", "-mkdir", "-p", inprogress_path])
        if ret:
            error = 1
            err_msg = "Error while creating in_progress location in HDFS".format(error)
            print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
            status = 'Job Error'
            output_msg = traceback.format_exc()
            print(output_msg)
            sys.exit(error)
            return output_msg
    # else:
    #     (ret, out, err) = run_cmd(["hadoop", "fs", "-rm", "-r", inprogress_path + "*"])
    #     if ret:
    #         if err.find("No such file or directory") != -1:
    #             pass
    #         else:
    #             error = 1
    #             err_msg = "Error while cleaning in_progress location in HDFS".format(error)
    #             print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
    #             status = 'Job Error'
    #             output_msg = traceback.format_exc()
    #             print(output_msg)
    #             return output_msg

    # Checking the last run time of the table.
    # Bringing the files from each host since the last run time
    from datetime import date, timedelta
    if self.last_run_time == 'None':
        self.last_run_time = str(datetime.now())
    print("Last Run Time : ", self.last_run_time)
    lr_dt, lr_ts = self.last_run_time.split()
    lr_dt = datetime.strptime(lr_dt, "%Y-%m-%d").date()
    today = datetime.now().date()
    delta = today - lr_dt
    # hosts = self.hive_hosts.split(',')
    print(hosts)

    for host in hosts:
        (ret, out, err) = run_cmd(['scp', ('hdp@' + host + ':' + incoming_path),
                                   (self.local_staging_path + host + "/")])
        print(ret, out, err)
        if ret > 0:
            error = 1
            err_msg = "Error while moving Current Log File to Local in_progress location".format(error)
            print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
            status = 'Job Error'
            output_msg = traceback.format_exc()
            print(err_msg, output_msg)
            sys.exit(error)
            return output_msg
        for i in range(delta.days):
            dt = (lr_dt + timedelta(days=i))
            dtstr = dt.isoformat()
            print(dtstr)
            (ret, out, err) = run_cmd(['scp', ('hdp@' + host + ':' + incoming_path + '.' + dtstr + '*'),
                                       (self.local_staging_path + host + "/")])
            print(ret, out, err)
            if ret > 0:
                if err.find('No such file or directory') != -1:
                    pass
                else:
                    error = 1
                    err_msg = "Error while moving data to in_progress location".format(error)
                    print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
                    status = 'Job Error'
                    output_msg = traceback.format_exc()
                    print(output_msg)
                    sys.exit(error)
                    return output_msg

    # Unzipping the files if there are any zipped files
    for host in hosts:
        files = glob.glob((self.local_staging_path + host + "/*"))
        for file in files:
            if file.find(".gz") != -1:
                try:
                    with gzip.open(file, 'rb') as f_in:
                        with open((file.replace('.gz', '_') + host), 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                except Exception as e:
                    error = 4
                    err_msg = "Error while unzipping file in Local FS"
                    output_msg = traceback.format_exc()
                    print(err_msg, output_msg)
                    sys.exit(error)
                    return output_msg
                # (ret, out, err) = run_cmd(['gunzip', '-c', file, ' > ', 'test'])
                # (ret, out, err) = run_cmd(['gunzip', file])
                # (ret, out, err) = run_cmd(['zcat', file, '>', (file.replace('.gz', '_') + host)])
                # if ret > 0:
                #     error = 1
                #     err_msg = "Error while unzipping file in Local FS".format(error)
                #     print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
                #     status = 'Job Error'
                #     output_msg = traceback.format_exc()
                #     print(err_msg, output_msg)
                #     return output_msg
                (ret, out, err) = run_cmd(['rm', '-f', file])
                if ret > 0:
                    error = 1
                    err_msg = "Error while removing zipped file in Local FS".format(error)
                    print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
                    status = 'Job Error'
                    output_msg = traceback.format_exc()
                    print(err_msg, output_msg)
                    sys.exit(error)
                    return output_msg
            else:
                (ret, out, err) = run_cmd(['mv', file, (file + '_' + host)])
                if ret > 0:
                    error = 1
                    err_msg = "Error while renaming file in Local FS".format(error)
                    print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
                    status = 'Job Error'
                    output_msg = traceback.format_exc()
                    print(err_msg, output_msg)
                    sys.exit(error)
                    return output_msg

        # Moving the final set of files to the in_progress location to send it to HDFS
        move_files((self.local_staging_path + host + "/*"), local_inprogress_path)
        if ret > 0:
            error = 1
            err_msg = "Error while moving files to in_progress location in Local FS".format(error)
            print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
            status = 'Job Error'
            output_msg = traceback.format_exc()
            print(err_msg, output_msg)
            sys.exit(error)
            return output_msg

    # Ingesting to HDFS
    (ret, out, err) = run_cmd(['hadoop', 'distcp', '-overwrite',
                               'file:///' + (local_inprogress_path + "/*"),
                               'hdfs:///' + inprogress_path])
    if ret > 0:
        error = 1
        err_msg = "Error while moving files to HDFS from Local in_progress path".format(error)
        print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + err)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(err_msg, output_msg)
        sys.exit(error)
        return output_msg

    try:
        metadata_sql = "UPDATE sync.control_table SET last_run_time = now() \
                        WHERE target_tablename = 'hive_log' \
                        AND target_schemaname = 'default'" + " \
                        AND data_path = " + "'FS2HDFS'"
        print(datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + "metadata_sql: " + metadata_sql)
        conn_metadata, cur_metadata = dbConnect(self.metastore_dbName, self.dbmeta_User,
                                                self.dbmeta_Url, self.dbmeta_Pwd)
        cur_metadata.execute(metadata_sql)
        # print (datetime.now().strftime('[%Y-%m-%d %H:%M:%S] ') + "metastore controls:", controls)
    except psycopg2.Error as e:
        error = 2
        err_msg = "Error connecting to control table database".format(error)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(output_msg)
        sys.exit(error)
        return output_msg
    finally:
        conn_metadata.close()
elif data_path.find("GP2HDFS") != -1:
    metadata_sql = metadata_sql + " AND source_schemaname = '" + input_schema + "'" \
                   + " AND (hvr_last_processed_value > last_run_time OR last_run_time IS NULL)"
else:
    metadata_sql = metadata_sql + " AND source_schemaname = '" + input_schema + "'"

if input_tablename_list is not None:
    if data_path.find("MIR2") != -1:
        metadata_sql = metadata_sql + " AND target_tablename in (" + tablename_filter + ")"
    else:
        metadata_sql = metadata_sql + " AND source_tablename in (" + tablename_filter + ")"

metadata_sql = metadata_sql + " AND load_type = '" + load_type + "'"
print(print_hdr + "metadata_sql: " + metadata_sql)

load_id_sql = "select nextval('sbdt.edl_load_id_seq')"
controls = dbQuery(cur_metadata, metadata_sql)
print(print_hdr + "controls: ", controls)
load_id_lists = dbQuery(cur_metadata, load_id_sql)
load_id_list = load_id_lists[0]
load_id = load_id_list['nextval']
print(print_hdr + "load_id: " + str(load_id))

input = []
if len(controls) > 0:
    for control in controls:
        # Special logic to mirror table from one schema in GP to a different schema in HIVE
        if data_path.find("GP2HDFS") != -1 and \
                control['source_schemaname'] != control['target_schemaname'] and not is_special_logic:
            error = 2
            err_msg = "datasync_driver: main[{0}]: ERROR: Mirror loading between different schemas is not allowed: " \
def count(schemaname, loadtype):
    config_list = load_config()
    metastore_dbName = config_list['meta_db_dbName']
    dbmeta_Url = config_list['meta_db_dbUrl']
    dbmeta_User = config_list['meta_db_dbUser']
    dbmeta_Pwd = base64.b64decode(config_list['meta_db_dbPwd'])
    dbtgt_host = config_list['src_db_hive_dbUrl']
    dbtgt_host2 = config_list['src_db_hive_dbUrl2']
    dbtgt_Port = config_list['src_db_hive_dataPort']
    dbtgt_Auth = config_list['src_db_hive_authMech']
    src_dbName = config_list['src_db_gp_dbName']
    dbsrc_Url = config_list['src_db_gp_dbUrl']
    dbsrc_User = config_list['src_db_gp_dbUser']
    dbsrc_Pwd = base64.b64decode(config_list['src_db_gp_dbPwd'])
    emailSender = config_list['email_sender']
    emailReceiver = config_list['email_receivers']
    t = datetime.fromtimestamp(time.time())
    v_timestamp = str(t.strftime('%Y-%m-%d %H:%M:%S'))
    input_source_schema = schemaname
    load_type = loadtype
    print(input_source_schema)

    # try:
    #     count = 0
    #     for pid in psutil.pids():
    #         p = psutil.Process(pid)
    #         if p.name() == "python2.7" and p.cmdline()[2] == input_source_schema:
    #             print p.name(), p.cmdline()[1], p.cmdline()[2]
    #             count = count + 1
    # except Exception as e:
    #     print e
    #     return
    # print count
    # if count > 0:
    #     err_msg = "Exiting Count program as Loads are running . . ."
    #     print err_msg
    #     load_id = "None"
    #     error_table_list = input_source_schema
    #     sendMail(emailSender, emailReceiver, err_msg, error_table_list, load_id)
    #     return
    # else:

    try:
        conn_metadata, cur_metadata = txn_dbConnect(metastore_dbName, dbmeta_User,
                                                    dbmeta_Url, dbmeta_Pwd)
    except Exception as e:
        err_msg = "Error connecting to database while fetching metadata"
        # Send Email
        print(e)
        return

    plant_name = "DATASYNC"
    system_name = "GPDB-Hive"
    job_name = "COUNT " + input_source_schema
    tablename = input_source_schema
    data_path = "GP2HDFS"
    technology = "Python"
    rows_inserted = 0
    rows_deleted = 0
    rows_updated = 0
    num_errors = 0
    count_sql_gpdb = ""
    count_sql_hive = ""

    load_id_sql = "select nextval('sbdt.edl_load_id_seq')"
    load_id_lists = dbQuery(cur_metadata, load_id_sql)
    load_id_list = load_id_lists[0]
    load_id = load_id_list['nextval']
    print("Load ID for this run is : ", load_id)

    run_id_sql = "select nextval('sync.datasync_seq')"
    run_id_lists = dbQuery(cur_metadata, run_id_sql)
    run_id_list = run_id_lists[0]
    run_id = run_id_list['nextval']
    print("Run ID for this run is : ", run_id)

    metadata_sql = "SELECT source_schemaname||'.'||source_tablename||'-'||incremental_column as table_name " \
                   "FROM sync.control_table where data_path = 'GP2HDFS' " \
                   " and source_schemaname = '" + input_source_schema + "' AND load_type = '" + load_type + "'"
    print(metadata_sql)
    control = dbQuery(cur_metadata, metadata_sql)
    control_df = pd.DataFrame(control)
    control_df.columns = ['table_name']
    new_control = control_df['table_name'].tolist()

    status = 'Job Started'
    output_msg = ''
    err_msg = ''
    audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                  data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                  err_msg, 0, 0, output_msg)

    q = 0
    for j in new_control:
        table_name, incremental_col = j.split('-')
        if q < len(new_control) - 1:
            count_sql_gpdb += "SELECT " + str(run_id) + " as run_id, COUNT(*) as COUNT,'" + table_name + \
                "' as table_name, 'GPDB' as db_name , '" + v_timestamp + "' as end_date, max(" + incremental_col + \
                "::timestamp without time zone) as max_incr_col FROM " + table_name + " WHERE " + incremental_col + \
                " > '1900-01-01' AND " + incremental_col + " <= '" + v_timestamp + "' UNION ALL "
            count_sql_hive += "SELECT " + str(run_id) + " as run_id, COUNT(*) as COUNT,'" + table_name + \
                "' as table_name, 'Hive' as db_name , cast('" + v_timestamp + "' as timestamp) as end_date," + \
                "max(hive_updated_date) as max_incr_col FROM " + table_name + \
                " WHERE hive_updated_date > '1900-01-01' AND hive_updated_date <= '" + v_timestamp + "' UNION ALL "
            q = q + 1
        else:
            count_sql_gpdb += "SELECT " + str(run_id) + " as run_id, COUNT(*) as COUNT,'" + table_name + \
                "' as table_name , 'GPDB' as db_name , '" + v_timestamp + "' as end_date, max(" + incremental_col + \
                "::timestamp without time zone) as max_incr_col FROM " + table_name + " WHERE " + incremental_col + \
                " > '1900-01-01' AND " + incremental_col + " <= '" + v_timestamp + "'"
            count_sql_hive += "SELECT " + str(run_id) + " as run_id, COUNT(*) as COUNT,'" + table_name + \
                "' as table_name , 'Hive' as db_name, cast('" + v_timestamp + "' as timestamp) as end_date, " + \
                "max(hive_updated_date) as max_incr_col FROM " + table_name + \
                " WHERE hive_updated_date > '1900-01-01' AND hive_updated_date <= '" + v_timestamp + "'"

    print("Running GPDB Count . . . . .")
    # print count_sql_gpdb
    try:
        conn_source, cur_source = dbConnect(src_dbName, dbsrc_User, dbsrc_Url, dbsrc_Pwd)
    except psycopg2.Error as e:
        err_msg = "Error connecting to source database"
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_metadata.close()
        # continue
        return

    try:
        temp_table_sql = "CREATE TEMP TABLE count_" + input_source_schema + " AS " + count_sql_gpdb
        # print temp_table_sql
        cur_source.execute(temp_table_sql)
    except psycopg2.Error as e:
        print(e)
        err_msg = "Error while creating temp table in source"
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_metadata.close()
        # continue
        return

    try:
        file = "/apps/staging/g00003/counts_" + input_source_schema + ".txt"
        gpdb_count_op_sql = "COPY count_" + input_source_schema + " TO STDOUT DELIMITER '|' NULL ''"
        pg_count_ip_sql = "COPY counts FROM STDIN DELIMITER '|' NULL ''"
        fo = open(file, 'w')
        cur_source.copy_expert(gpdb_count_op_sql, fo)
        fo.close()
        fi = open(file, 'r')
        cur_metadata.copy_expert(pg_count_ip_sql, fi)
        fi.close()
    except psycopg2.Error as e:
        err_msg = "Error while copying"
        print(err_msg)
        print(e)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        conn_metadata.close()
        conn_source.close()
        # continue
        return

    conn_source.close()

    print("Running Hive Count. . . . . ")
    try:
        conn_target, cur_target = dbConnectHive(dbtgt_host, dbtgt_Port, dbtgt_Auth)
    except Exception as e:
        try:
            conn_target, cur_target = dbConnectHive(dbtgt_host2, dbtgt_Port, dbtgt_Auth)
        except Exception as e:
            err_msg = "Error while connecting to target database"
            status = 'Job Error'
            print(e)
            output_msg = e
            audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                          data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                          err_msg, 0, 0, output_msg)
            conn_metadata.rollback()
            conn_metadata.close()
            conn_source.close()
            return

    count_view_sql = "CREATE OR REPLACE VIEW counts_" + input_source_schema + " AS " + count_sql_hive
    # print count_view_sql
    try:
        cur_target.execute(count_view_sql)
    except Exception as e:
        print(e)
        err_msg = "Error while creating view"
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_metadata.rollback()
        conn_metadata.close()
        conn_source.close()
        conn_target.close()
        return

    count_query = "SELECT * FROM counts_" + input_source_schema
    try:
        cur_target.execute(count_query)
    except Exception as e:
        print(e)
        err_msg = "Error while executing count query"
        print(err_msg)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_metadata.rollback()
        conn_metadata.close()
        conn_source.close()
        conn_target.close()
        return

    # results = {}
    # column = 0
    # for d in cur_target.description:
    #     results[d[0]] = column
    #     column = column + 1
    columnNames = [a['columnName'] for a in cur_target.getSchema()]
    # print columnNames
    try:
        count_df = pd.DataFrame(cur_target.fetchall(), columns=columnNames)
        file = "/apps/staging/g00003/counts_" + input_source_schema + ".txt"
        f1 = open(file, 'w')
        count_df.to_csv(path_or_buf=f1, sep='\t', header=False, index=False)
        f1.close()
    except Exception as e:
        print(e)
        err_msg = "Error while writing Data Frame into file"
        print(err_msg)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_metadata.rollback()
        conn_metadata.close()
        conn_source.close()
        conn_target.close()
        return

    try:
        copy_sql = "COPY public.counts FROM STDIN WITH DELIMITER '\t'"
        fo = open(file)
        cur_metadata.copy_expert(copy_sql, fo)
        run_cmd(['rm', '-f', '/apps/staging/g00003/counts_' + input_source_schema + '.txt'])
        err_msg = "Count completed successfully . . ."
        print(err_msg)
        error_table_list = input_source_schema
        conn_target.close()
    except Exception as e:
        print(e)
        err_msg = "Error while inserting data into final table"
        print(err_msg)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_metadata.rollback()
        conn_metadata.close()
        conn_source.close()
        conn_target.close()
        return

    # Final log entry
    try:
        error = 0
        err_msg = 'No Errors'
        status = 'Job Finished'
        output_msg = 'Job Finished successfully'
        print(output_msg)
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
    except psycopg2.Error as e:
        error = 15
        err_msg = "Error while dropping external table in target"
        print(err_msg)
        status = 'Job Error'
        output_msg = traceback.format_exc()
        print(output_msg)
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status,
                      data_path, technology, rows_inserted, rows_updated, rows_deleted, num_errors,
                      err_msg, 0, 0, output_msg)
        conn_target.rollback()
        conn_target.close()
        conn_metadata.close()
        return error, err_msg, tablename

    conn_metadata.commit()
    conn_metadata.close()
def fn_call(fn_name, load_id=None, run_id=None):
    config = read_config(['/apps/common/environ.properties'])
    env = config.get('branch', 'env')
    metastore_dbName = config.get(env + '.meta_db', 'dbName')
    dbmeta_Url = config.get(env + '.meta_db', 'dbUrl')
    dbmeta_User = config.get(env + '.meta_db', 'dbUser')
    dbmeta_Pwd = base64.b64decode(config.get(env + '.meta_db', 'dbPwd'))
    dbtgt_Url = config.get(env + '.tgt_db_i360', 'dbUrl')
    dbtgt_User = config.get(env + '.tgt_db_i360', 'dbUser')
    dbtgt_dbName = config.get(env + '.tgt_db_i360', 'dbName')
    dbtgt_Pwd = base64.b64decode(config.get(env + '.tgt_db_i360', 'dbPwd'))

    # Making the Job Started entry
    try:
        conn_metadata, cur_metadata = dbConnect(metastore_dbName, dbmeta_User, dbmeta_Url, dbmeta_Pwd)
        # status = 'Job Started'
        plant_name = 'GE Transportation'
        system_name = 'RDS'
        job_name = 'RDS - Trigger DB Function'
        tablename = fn_name
        # audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '', 'Python', 0, 0, 0, 0, '', 0, 0, '')
    except Exception as e:
        output_msg = traceback.format_exc()
        error = 1
        err_msg = "Error: Unable to generate LOAD ID"
        print(err_msg, output_msg)
        # sendMail(emailSender, emailReceiver, err_msg, tablename, load_id, env, "ERROR", "DataIKU Backup", '')
        return error, err_msg

    # Generating load id if it was not supplied
    try:
        if load_id is None:
            load_id_sql = "select nextval('sbdt.edl_load_id_seq')"
            load_id_lists = dbQuery(cur_metadata, load_id_sql)
            load_id_list = load_id_lists[0]
            load_id = load_id_list['nextval']
    except Exception as e:
        output_msg = traceback.format_exc()
        error = 1
        status = 'Job Error'
        err_msg = "Error: connecting to logging database while making first audit entry"
        print(err_msg, output_msg)
        audit_logging(cur_metadata, 0, 0, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        return error, err_msg

    try:
        if run_id is None:
            run_id_sql = "select nextval('sbdt.edl_run_id_seq')"
            run_id_lists = dbQuery(cur_metadata, run_id_sql)
            run_id_list = run_id_lists[0]
            run_id = run_id_list['nextval']
    except Exception as e:
        error = 1
        err_msg = "Error: connecting to logging database while making second audit entry"
        print(err_msg)
        output_msg = traceback.format_exc()
        status = 'Job Error'
        audit_logging(cur_metadata, 0, 0, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        return error, err_msg

    try:
        conn_target, cur_target = dbConnect(dbtgt_dbName, dbtgt_User, dbtgt_Url, dbtgt_Pwd)
    except Exception as e:
        error = 2
        status = 'Job Error'
        output_msg = traceback.format_exc()
        err_msg = "Error while connecting to the Target Database"
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        return error, err_msg

    try:
        fn_name_list = fn_name.split(',')
        for fn_name in fn_name_list:
            status = 'Job Started'
            tablename = fn_name.split('(')[0]
            audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                          'Python', 0, 0, 0, 0, '', 0, 0, '')
            if fn_name.find("(") != -1 and fn_name.find(")") != -1:
                fn_result = dbQuery(cur_target, "SELECT * FROM " + fn_name)
                print("Running SQL : SELECT * FROM " + fn_name)
            else:
                fn_result = dbQuery(cur_target, "SELECT * FROM " + fn_name + "()")
                print("Running SQL : SELECT * FROM " + fn_name + "()")
            print(fn_result)
            print(fn_result[0][fn_name.split('(')[0].split('.')[1]])
            for notice in conn_target.notices:
                print(notice)
            if str(fn_result[0][fn_name.split('(')[0].split('.')[1]]) == 'False' or \
                    str(fn_result[0][fn_name.split('(')[0].split('.')[1]]) == '1':
                print("Function returned False in the Target Database. Please check the function for more details")
                error = 4
                status = 'Job Error'
                output_msg = traceback.format_exc()
                err_msg = "Function returned False in the Target Database. Please check the function for more details"
                audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                              'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
                conn_metadata.close()
                conn_target.close()
                return error, err_msg
            else:
                status = 'Job Finished'
                error = 0
                err_msg = 'No Error'
                audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                              'Python', 0, 0, 0, 0, '', 0, 0, '')
    except Exception as e:
        error = 3
        status = 'Job Error'
        output_msg = traceback.format_exc()
        err_msg = "Error while running the RDS Function"
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        conn_metadata.close()
        conn_target.close()
        return error, err_msg

    # if str(fn_result[0][fn_name.split('(')[0].split('.')[1]]) == 'False' or str(fn_result[0][fn_name.split('(')[0].split('.')[1]]) == '1':
    #     print "Function returned False in the Target Database. Please check the function for more details"
    #     error = 4
    #     status = 'Job Error'
    #     output_msg = traceback.format_exc()
    #     err_msg = "Function returned False in the Target Database. Please check the function for more details"
    #     audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '', 'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
    #     conn_metadata.close()
    #     conn_target.close()
    #     return error, err_msg
    # status = 'Job Finished'
    # error = 0
    # err_msg = 'No Error'
    # audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '', 'Python', 0, 0, 0, 0, '', 0, 0, '')

    conn_metadata.close()
    conn_target.close()
    return error, err_msg
def fn_call(load_id, fn_name):
    config = read_config(['/apps/common/environ.properties'])
    env = config.get('branch', 'env')
    metastore_dbName = config.get(env + '.meta_db', 'dbName')
    dbmeta_Url = config.get(env + '.meta_db', 'dbUrl')
    dbmeta_User = config.get(env + '.meta_db', 'dbUser')
    dbmeta_Pwd = base64.b64decode(config.get(env + '.meta_db', 'dbPwd'))
    dbtgt_Url_predix_wto = config.get(env + '.tgt_db_predix_wto', 'dbUrl')
    dbtgt_User_predix_wto = config.get(env + '.tgt_db_predix_wto', 'dbUser')
    dbtgt_dbName_predix_wto = config.get(env + '.tgt_db_predix_wto', 'dbName')
    dbtgt_Pwd_predix_wto = base64.b64decode(config.get(env + '.tgt_db_predix_wto', 'dbPwd'))
    dbtgt_dbName_port_wto = config.get(env + '.tgt_db_predix_wto', 'dbPort')

    try:
        conn_metadata, cur_metadata = dbConnect(metastore_dbName, dbmeta_User, dbmeta_Url, dbmeta_Pwd)
        run_id_sql = "select nextval('sbdt.edl_run_id_seq')"
        run_id_lists = dbQuery(cur_metadata, run_id_sql)
        run_id_list = run_id_lists[0]
        run_id = run_id_list['nextval']
        status = 'Job Started'
        plant_name = 'GE Transportation'
        system_name = 'WTO Predix'
        job_name = 'WTO Predix - Trigger DB Function'
        tablename = 'WTO Predix'
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, '', 0, 0, '')
    except Exception as e:
        error = 1
        err_msg = "Error: connecting to logging database while making first audit entry"
        print(err_msg)
        # sendMail(emailSender, emailReceiver, err_msg, tablename, load_id, env, "ERROR", "DataIKU Backup", '')
        return error, err_msg

    try:
        conn_target, cur_target = txn_dbConnect(dbtgt_dbName_predix_wto, dbtgt_User_predix_wto,
                                                dbtgt_Url_predix_wto, dbtgt_Pwd_predix_wto,
                                                dbtgt_dbName_port_wto)
    except Exception as e:
        error = 2
        status = 'Job Error'
        output_msg = traceback.format_exc()
        err_msg = "Error while connecting to the Target Predix Database"
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        return error, err_msg

    try:
        if fn_name.find("(") != -1 and fn_name.find(")") != -1:
            fn_result = dbQuery(cur_target, "SELECT * FROM " + fn_name)
        else:
            fn_result = dbQuery(cur_target, "SELECT * FROM " + fn_name + "()")
        # print fn_result
        # print fn_result[0]['proc_wto_wheel_data']
    except Exception as e:
        error = 3
        status = 'Job Error'
        output_msg = traceback.format_exc()
        err_msg = "Error while running the Predix Function"
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        conn_metadata.close()
        conn_target.close()
        return error, err_msg

    if str(fn_result[0]['proc_wto_wheel_data']) == 'False' or str(fn_result[0]['proc_wto_wheel_data']) == '1':
        print("Function returned False in the Predix Database. Please check the function for more details")
        error = 4
        status = 'Job Error'
        output_msg = traceback.format_exc()
        err_msg = "Function returned False in the Predix Database. Please check the function for more details"
        audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                      'Python', 0, 0, 0, 0, err_msg, 0, 0, output_msg)
        conn_metadata.close()
        conn_target.close()
        return error, err_msg

    status = 'Job Finished'
    err_msg = ''
    audit_logging(cur_metadata, load_id, run_id, plant_name, system_name, job_name, tablename, status, '',
                  'Python', 0, 0, 0, 0, '', 0, 0, '')
    conn_metadata.close()
    conn_target.close()