def splitOnUCLC(token):
    # Try two candidate splits and keep the one with more dictionary hits.
    refined1 = splitOnChange(token)
    hits1 = 0
    for t in refined1:
        if Oracle.isWord(t.lower()):
            hits1 += 1
    refined2 = splitPenultimate(token)
    hits2 = 0
    for t in refined2:
        if Oracle.isWord(t.lower()):
            hits2 += 1
    if hits2 > hits1:
        return refined2
    return refined1
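# splitOnUCLC relies on two helpers that are not shown in this snippet. The sketches
# below are hypothetical reconstructions inferred only from the call sites: assume
# splitOnChange breaks a token at case changes ("getUserName" -> ["get", "User", "Name"])
# and splitPenultimate splits once, before the last uppercase run ("HTMLParser" -> ["HTML", "Parser"]).
import re

def splitOnChange(token):
    # Hypothetical: split at lowercase/uppercase boundaries and digit runs.
    parts = re.findall(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+|\d+', token)
    return parts if parts else [token]

def splitPenultimate(token):
    # Hypothetical: split once, just before the last uppercase letter.
    for i in range(len(token) - 1, 0, -1):
        if token[i].isupper():
            return [token[:i], token[i:]]
    return [token]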
def __init__(self, configFile, mktName, processingDate, debugFlag):
    """
    Purpose: Constructor
    :param self: class object itself
    :param configFile: Configuration file to use
    :param mktName: Market name
    :param processingDate: Processing date
    :param debugFlag: Enable debug output
    """
    # Initialize the m_logger object from class Logger and add a header to the log
    self.m_logger = Logger(logging.INFO, configFile, processingDate)
    self.m_logger.addFileHandler(logging.DEBUG)
    self.m_logger.addGenericInfo(__file__)
    self.processingDate = processingDate
    self.debugFlag = debugFlag
    self.configFile = configFile
    self.mktName = mktName
    try:
        # Get configuration into a dictionary
        self.m_configDict = configuration(self.configFile, True).m_dictionary
        # Initialize Oracle instance along with connection
        self.m_oracle_db = Oracle(self.m_configDict, self.m_logger)
    except Exception as exp:
        # An exception occurred
        self.m_logger.error("Unable to initialize the configuration " + str(exp))
        print("ERROR: Unable to initialize the configuration for logger " + str(exp))
        sys.exit(1)
def prepareData(mcd, mcfFile, featModel, moves, filename, wordsLimit):
    # Note: dicos, verbose, inputSize and outputSize are module-level globals here.
    c = Config(mcfFile, mcd, dicos)
    numSent = 0
    numWords = 0
    listMvt = []
    listFeatVect = []
    while c.getBuffer().readNextSentence() and numWords < wordsLimit:
        numWords += c.getBuffer().getLength()
        numSent += 1
        prepareWordBufferForTrain(c.getBuffer())
        while True:
            mvt = Oracle.oracle(c)
            listMvt.append(mvt)
            featVect = c.extractFeatVec(featModel)
            listFeatVect.append(featVect)
            if verbose:
                print("------------------------------------------")
                c.affiche()
                print('oracle says', mvt[0], mvt[1])
                print(mvt, featVect)
            res = c.applyMvt(mvt)
            if res is False:
                print("cannot apply movement")
            if c.isFinal():
                break
    try:
        dataFile = open(filename, 'w', encoding='utf-8')
    except IOError:
        print('cannot open', filename)
        sys.exit(1)
    dataFile.write(str(inputSize))
    dataFile.write("\n")
    dataFile.write(str(outputSize))
    dataFile.write("\n")
    for featVect, mvt in zip(listFeatVect, listMvt):
        inputVector = featModel.buildInputVector(featVect, dicos)
        # Rebuild the output vector for each movement; the original reused the
        # stale value left over from the training loop above.
        outputVector = moves.buildOutputVector(mvt)
        np.savetxt(dataFile, inputVector, fmt="%s", delimiter=' ', newline=' ')
        dataFile.write('\n')
        np.savetxt(dataFile, outputVector, fmt="%s", delimiter=' ', newline=' ')
        dataFile.write('\n')
    dataFile.close()
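# For reference, a minimal sketch of reading back the file that prepareData writes.
# It assumes only the layout visible above (two header lines with the input and output
# vector sizes, then alternating input and output vectors, one per line) and that the
# vector entries are numeric; read_prepared_data is an illustrative name, not from the source.
import numpy as np

def read_prepared_data(filename):
    with open(filename, encoding='utf-8') as f:
        input_size = int(f.readline())
        output_size = int(f.readline())
        inputs, outputs = [], []
        for i, line in enumerate(f):
            vec = np.array(line.split(), dtype=float)
            (inputs if i % 2 == 0 else outputs).append(vec)
    return input_size, output_size, inputs, outputs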
from numpy import dot
from numpy.linalg import norm, inv
from time import process_time

def Newton_F(Oracle, x0):
    ##### Variable initialization
    iter_max = 100
    gradient_step = 1
    threshold = 0.000001
    gradient_norm_list = []
    gradient_step_list = []
    critere_list = []
    time_start = process_time()
    x = x0
    ##### Iteration loop
    for k in range(iter_max):
        # Criterion, gradient and Hessian values
        critere, gradient, hessien = Oracle(x, 7)
        # Convergence test
        gradient_norm = norm(gradient)
        if gradient_norm <= threshold:
            break
        # Descent direction (Newton step)
        D = -dot(inv(hessien), gradient)
        print(gradient_norm)
        # Variable update
        x = x + (gradient_step * D)
        # Track the gradient norm, the step and the criterion
        gradient_norm_list.append(gradient_norm)
        gradient_step_list.append(gradient_step)
        critere_list.append(critere)
    ##### Optimization results
    critere_opt = critere
    gradient_opt = gradient
    x_opt = x
    time_cpu = process_time() - time_start
    print()
    print('Iterations        :', k)
    print('CPU time          :', time_cpu)
    print('Optimal criterion :', critere_opt)
    print('Gradient norm     :', norm(gradient_opt))
    # Convergence visualization (Visualg is an external plotting helper)
    Visualg(gradient_norm_list, gradient_step_list, critere_list)
    return critere_opt, gradient_opt, x_opt
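# Newton_F can be smoke-tested with a quadratic oracle, for which a single Newton step
# is exact. This sketch assumes the oracle signature Oracle(x, mode) -> (criterion,
# gradient, Hessian) used above; the mode argument (7) is simply ignored here, and
# running Newton_F to completion still requires the project's Visualg helper.
import numpy as np

def quadratic_oracle(x, mode):
    # f(x) = 1/2 x^T A x - b^T x, minimized at x = A^{-1} b
    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    critere = 0.5 * x.dot(A).dot(x) - b.dot(x)
    gradient = A.dot(x) - b
    hessien = A
    return critere, gradient, hessien

# critere_opt, gradient_opt, x_opt = Newton_F(quadratic_oracle, np.zeros(2))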
def greedy(start, t):
    # Peel off the longest dictionary-word prefix, then recurse on the rest.
    end = len(t)
    if start == end:
        return []
    while not Oracle.isWord(t[start:end].lower()):
        end -= 1
        if start == end:
            return [t[start:]]
    return [t[start:end]] + greedy(end, t)
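# A toy illustration of greedy's longest-prefix behavior, with a stub standing in
# for the project's Oracle; the stub class and its word list are purely illustrative.
class _StubOracle:
    WORDS = {"not", "a", "word"}

    @staticmethod
    def isWord(s):
        return s in _StubOracle.WORDS

Oracle = _StubOracle  # stand-in binding so the demo below runs
print(greedy(0, "notaword"))  # ['not', 'a', 'word']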
def splitOnUCLC(token, debug=False):
    refined1 = splitOnChange(token)
    hits1 = 0
    for t in refined1:
        if Oracle.oracle(t.lower()):
            hits1 += 1
    refined2 = splitPenultimate(token)
    hits2 = 0
    for t in refined2:
        if Oracle.oracle(t.lower()):
            hits2 += 1
    if debug:
        print(refined1, hits1)
        print(refined2, hits2)
    if hits2 > hits1:
        return refined2
    return refined1
def __init__(self, connectString):
    self.connectString = connectString
    realConn, self.useOracleMod, self.verbose = self._parseConnectString(connectString)
    self.realConn = realConn
    if self.useOracleMod:
        self.conn = Oracle.getConnection(realConn)
    else:
        self.conn = DCOracle.Connect(realConn)
    self.bvcount = 0
    # oraConns is a module-level registry of open connections
    oraConns.append(self.conn)
    self.bindVariables = 1
    if self.verbose:
        print('CONNECTED')
def get_xy(file_conllu, file_features, file_embedding=None):
    mcd = get_mcd()
    print("Loading the trees")
    obj_generateAlltree = ConstructAllTree(file_conllu, mcd, True)
    all_tree = obj_generateAlltree.get_allTreeProjectiviser()
    print("Trees loaded:", len(all_tree))
    print("Building the dataset")
    features = Features(file_features)
    for tree in all_tree:
        # Run the oracle on each tree to populate the feature set
        A = Oracle(tree, features)
        A.run()
    print("Converting the dataset")
    print("file_embedding:", file_embedding)
    X, Y = features.get_Data_Set(file_embedding)
    labels_encoderX = features.get_label_encoderX()
    labels_encoderY = features.get_label_encoderY()
    print("X_train_shape", X.shape)
    print("Y_train_shape", Y.shape)
    return X, Y, labels_encoderX, labels_encoderY, all_tree
class Loader():
    # class variables
    m_logger = ""
    # database objects
    m_oracle_db = ""
    m_netezza_db = ""

    def __init__(self, configFile, tradeDate):
        """
        Purpose: Constructor
        :param self: class object itself
        :param configFile: Configuration file to use
        :param tradeDate: Trade date being processed
        """
        # Initialize the m_logger object from class Logger and add a header to the log
        self.m_logger = Logger(logging.INFO, configFile, tradeDate)
        self.m_logger.addFileHandler(logging.DEBUG)
        self.m_logger.addGenericInfo(__file__)
        self.tradeDate = tradeDate
        try:
            # Get configuration into a dictionary
            self.m_configDict = configuration(configFile, True).m_dictionary
            # Initialize Oracle instance along with connection
            self.m_oracle_db = Oracle(self.m_configDict, self.m_logger)
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def readMktConfigFile(self, mktConfigFile):
        """
        Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference
        :param mktConfigFile: Market-specific configuration file
        :return:
        """
        try:
            self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration for logger " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def chkActiveLoads(self):
        """
        Purpose - To check the count of active loads happening at a given point
        :return: 0 on success, 1 on failure
        """
        try:
            if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y':
                localActiveLoadMax = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"])
                localActiveLoadWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"])
                localActiveLoadMaxWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"])
                mySql = self.m_configDict["SQL"]["get_active_loads"]
                myParams = ""
                activeFlag = 1
                totalActiveWaitTime = 0
                while activeFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
                    if returnStr[0] != 0:
                        self.m_logger.error("Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    # If the number of active loads is within the configured maximum, return success
                    if int(returnStr[1]) <= localActiveLoadMax:
                        activeFlag = 0
                        return 0
                    # Sleep for the configured "active_load_wait_time"
                    time.sleep(localActiveLoadWaitTime)
                    totalActiveWaitTime += localActiveLoadWaitTime
                    # If the total wait time exceeds the configured maximum, log an error and fail
                    if totalActiveWaitTime > localActiveLoadMaxWaitTime:
                        self.m_logger.error("In Fn chkActiveLoads. Total actual wait time exceeds the configured value "
                                            "active_load_max_wait_time. Either clean up orphaned loads or increase "
                                            "active_load_max or active_load_max_wait_time. totalActiveWaitTime = "
                                            + str(totalActiveWaitTime) + " localActiveLoadMaxWaitTime=" + str(localActiveLoadMaxWaitTime))
                        return 1
            else:
                return 0
            # Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkActiveLoads process with the error " + str(exp))
            sys.exit(1)

    def chkRaceStatus(self):
        """
        Purpose - To check if a load is already running for the given dataset
        :return: 0 on success, 1 on failure
        """
        try:
            if self.m_mktConfigDict["RACE"]["race_status_check_flag"] == 'Y':
                localRaceStatusWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_wait_time"])
                localRaceStatusMaxWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_max_wait_time"])
                myParams = {"datasetName": self.datasetName}
                mySql = self.m_configDict["SQL"]["get_race_status"]
                raceFlag = 1
                totalRaceStatusWaitTime = 0
                while raceFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
                    if returnStr[0] != 0:
                        self.m_logger.error("Unable to get race status using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    # If no other load for this dataset is running, return success
                    if int(returnStr[1].strip()) <= 1:
                        raceFlag = 0
                        return 0
                    # Sleep for the configured "race_status_wait_time"
                    time.sleep(localRaceStatusWaitTime)
                    totalRaceStatusWaitTime += localRaceStatusWaitTime
                    # If the total wait time exceeds the configured maximum, log an error and fail
                    if totalRaceStatusWaitTime > localRaceStatusMaxWaitTime:
                        self.m_logger.error("In Fn chkRaceStatus. Total actual wait time exceeds the configured value "
                                            "race_status_max_wait_time. Either check if the dataset is getting loaded or "
                                            "increase race_status_wait_time or race_status_max_wait_time. "
                                            "totalRaceStatusWaitTime = " + str(totalRaceStatusWaitTime)
                                            + " localRaceStatusMaxWaitTime=" + str(localRaceStatusMaxWaitTime))
                        return 1
            else:
                return 0
            # Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkRaceStatus process with the error " + str(exp))
            sys.exit(1)

    def loadData(self, localDataFile, localFileID, localFileIDQueue, localDBFlag):
        """
        Purpose - To load the given datafile to the S3 bucket specified in the global mktConfigFile
        :param localDataFile: Source datafile to be uploaded to S3
        :param localFileID: Internal File ID assigned to the source datafile
        :param localFileIDQueue: Queue in which results of the operation are stored
        :param localDBFlag: Flag indicating whether the database should be used
        :return:
        """
        try:
            if localDBFlag:
                raceStatusReturnValue = self.chkRaceStatus()
                if raceStatusReturnValue:
                    self.m_logger.error("Failure value returned by chkRaceStatus fn. Return value = " + str(raceStatusReturnValue))
                    localFileIDQueue.put((localFileID, raceStatusReturnValue))
                    return 1
                activeLoadsReturnValue = self.chkActiveLoads()
                if activeLoadsReturnValue:
                    self.m_logger.error("Failure value returned by chkActiveLoads fn. Return value = " + str(activeLoadsReturnValue))
                    localFileIDQueue.put((localFileID, activeLoadsReturnValue))
                    return 1
                processID = os.getpid()
                hostName = socket.gethostname()
                # Insert process status into the Oracle db
                # DB_CALL - sp_ddy_insert_process_status(RUNID, FILE_ID, etc)
                mySql = self.m_configDict["SQL"]["put_process_status"]
                pStatus = 'P'
                pComment = 'Load started'
                myParams = {"datasetName": self.datasetName, "runID": self.runID, "fileID": localFileID,
                            "fileName": localDataFile, "tDate": self.tradeDate, "processID": processID,
                            "hostName": hostName, "status": pStatus, "lcomment": pComment,
                            "tidalRunID": self.tidalRunID}
                returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
                if returnStr[0] != 0:
                    self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1])
                    sys.exit(1)
            # Build source and target paths; the S3 token is fetched once, outside these worker processes
            sourceFileWthPath = localDataFile
            targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(sourceFileWthPath))
            targetBucket = self.s3object.m_configFile["S3"]["bucket"]
            encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"]
            # Call s3.loadDataSinglePart to load the data (single part load)
            loadReturnValue = self.s3object.loadDataSinglePart(sourceFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag)
            if loadReturnValue == 0:
                pStatus = 'S'
                pComment = 'Load completed'
            else:
                pStatus = 'F'
                pComment = 'Load failed'
            if localDBFlag:
                # Record 'S' or 'F' in tb_ddy_process_status
                mySql = self.m_configDict["SQL"]["put_process_status"]
                myParams = {"datasetName": self.datasetName, "runID": self.runID, "fileID": localFileID,
                            "fileName": localDataFile, "tDate": self.tradeDate, "processID": processID,
                            "hostName": hostName, "status": pStatus, "lcomment": pComment,
                            "tidalRunID": self.tidalRunID}
                returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
                if returnStr[0] != 0:
                    self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1])
                    sys.exit(1)
                localFileIDQueue.put((localFileID, loadReturnValue))
            else:
                return loadReturnValue
        except Exception as exp:
            self.m_logger.error("Failure in loadData process for file with the error " + str(exp))
            if localDBFlag:
                localFileIDQueue.put((localFileID, 1))
            else:
                return 1

    def createFinraManifestFile(self, manifestFile):
        """
        Purpose - To build the FINRA done file from the manifest and the defaults file
        :param manifestFile: Manifest file describing the loaded datafiles
        :return: 0 on success, 1 on failure
        """
        try:
            # Read the manifest file to get total rows, total size and per-file details
            # needed to populate the done file for FINRA
            with open(manifestFile, "r") as fh:
                self.totalRows = 0
                self.totalSize = 0
                self.fileCount = 0
                self.fileDict = {}
                for data in fh:
                    data = data.rstrip("\n")
                    self.fileCount += 1
                    mylist = data.split("|")
                    self.fileDict[self.fileCount] = [mylist[0], os.path.basename(mylist[1]), int(mylist[2]), int(mylist[3])]
                    self.totalRows += int(mylist[3])
                    self.totalSize += int(mylist[2])
        except Exception as exp:
            self.m_logger.error("Failed while processing readManifest with Error = " + str(exp))
            return 1
        try:
            # Read the defaults file to get the default file structure info.
            # self.defaultsFile is populated from the db earlier; only defaults_dir comes from the config file.
            self.defaultsFileWthPath = self.m_mktConfigDict["DATASET"]["defaults_dir"] + "/" + self.defaultsFile
            with open(self.defaultsFileWthPath, "r") as fh:
                self.defaultsDict = {}
                self.defaultsCount = 0
                for data in fh:
                    data = data.rstrip('\n')
                    self.defaultsCount += 1
                    self.defaultsDict[self.defaultsCount] = data
        except Exception as exp:
            self.m_logger.error("Failed while processing defaults file " + self.defaultsFileWthPath + " with Error = " + str(exp))
            return 1
        try:
            self.finraManifestFile = self.m_mktConfigDict["ENV"]["donefile_dir"] + "/" + os.path.basename(manifestFile) + ".done"
            with open(self.finraManifestFile, "w") as finraMnFH:
                finraMnFH.write("# AWS RunID : {}\n".format(str(self.runID)))
                finraMnFH.write("# Dataset : {0} , TradeDate : {1}\n".format(str(self.datasetName), str(self.tradeDate)))
                finraMnFH.write("total_compressed={}\n".format(self.totalSize))
                finraMnFH.write("total_rows={}\n".format(self.totalRows))
                finraMnFH.write("no of files={}\n".format(self.fileCount))
                for key, val in self.fileDict.items():
                    finraMnFH.write("file_{0}={1}\n".format(str(key), val[1]))
                    finraMnFH.write("file_{0}_rows={1}\n".format(str(key), val[3]))
                finraMnFH.write("# Data Attributes\n")
                for key, val in self.defaultsDict.items():
                    finraMnFH.write("{0}\n".format(str(val)))
            return 0
        except Exception as exp:
            self.m_logger.error("Failed while creating AWS Done file " + self.finraManifestFile + " with Error = " + str(exp))
            return 1

    def processLoader(self, configFile, manifestFile, datasetName, tidalRunID):
        """
        Purpose - Function responsible for reading the manifest file, getting the market name,
                  running the multiprocess load and the related db calls
        :param configFile: Configuration File
        :param manifestFile: Manifest File
        :param datasetName: Dataset name
        :param tidalRunID: Tidal run ID
        :return:
        """
        try:
            # Assumption - Manifest file format - manifest.<datasetName>.<tradeDate>.<tidalRunID>
            # The program will break otherwise
            self.datasetName = datasetName
            self.tidalRunID = tidalRunID
            mktName = ''
            self.manifestFile = self.m_configDict["ENV"]["manifestfile_dir"] + "/" + manifestFile
            # Validate the manifest file
            if not os.path.isfile(self.manifestFile):
                self.m_logger.error("Invalid manifest file " + self.manifestFile)
                sys.exit(1)
            # DB_CALL - get the market name and defaults filename for the dataset
            mySql = self.m_configDict["SQL"]["get_mkt_defaults_filename"]
            myParams = {"datasetName": self.datasetName}
            returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
            if returnStr[0] == 0:
                if returnStr[2].strip():
                    mktName = returnStr[2].strip()
                else:
                    self.m_logger.error("Invalid Market Name " + returnStr[2].strip())
                    sys.exit(1)
                if returnStr[3].strip():
                    self.defaultsFile = returnStr[3].strip()
                else:
                    self.m_logger.error("Invalid Defaults File " + returnStr[3].strip())
                    sys.exit(1)
            else:
                self.m_logger.error("Unable to get market info from the database using sql " + mySql + ". Error = " + returnStr[1])
                sys.exit(1)
            # Build the mktConfigFile string based on mktName and configFile info
            self.mktConfigFile = (os.path.dirname(configFile) + '/'
                                  + os.path.basename(configFile).split('.', 1)[0].strip()
                                  + '_' + mktName.lower() + '.'
                                  + os.path.basename(configFile).split('.', 1)[1].strip())
            # Validate that the market config file is a valid file
            if not os.path.isfile(self.mktConfigFile):
                self.m_logger.error("Invalid market manifest file " + self.mktConfigFile)
                sys.exit(1)
            # Read the market-specific config file and store it in its own dictionary
            self.readMktConfigFile(self.mktConfigFile)
            # Read the contents of the manifest (datafile names) into a list; this also validates the datafiles
            localManifest = Manifest()
            manifestFileList = localManifest.readManifest(self.manifestFile, self.m_logger)
            # Get RunID
            self.runID = generate_runId()
            # DB_CALL - sp_ddy_insert_dataset_trans: record in TB_DDY_DATASET_TRANS that the process started ('P')
            mySql = self.m_configDict["SQL"]["put_dataset"]
            pStatus = 'P'
            myParams = {"datasetName": self.datasetName, "runID": self.runID, "tDate": self.tradeDate,
                        "status": pStatus, "tidalRunID": self.tidalRunID}
            returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
            if returnStr[0] != 0:
                self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1])
                sys.exit(1)
            # Insert the manifest data in the db; for each datafile, generate a fileID and
            # call loadData via multiprocessing to load the data into AWS
            for dataRecord in manifestFileList:
                mySql = self.m_configDict["SQL"]["put_manifest"]
                myParams = {"datasetName": self.datasetName, "runID": self.runID, "tDate": self.tradeDate,
                            "dataFileName": dataRecord[1], "manifestFileName": manifestFile,
                            "noOfRecords": dataRecord[2], "fileSize": dataRecord[3],
                            "tidalRunID": self.tidalRunID}
                returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
                if returnStr[0] != 0:
                    self.m_logger.error("Unable to put manifest info into the database using sql " + mySql + ". Error = " + returnStr[1])
                    sys.exit(1)
            self.s3object = S3(self.mktConfigFile, self.m_logger)
            self.s3object.getToken()
            fileID = 1
            fileIDQueue = Queue()
            dbFlag = 1
            procs = []
            for dataRecord in manifestFileList:
                processHandle = Process(target=Loader.loadData, args=(self, dataRecord[1], fileID, fileIDQueue, dbFlag))
                processHandle.start()
                procs.append(processHandle)
                fileID += 1
            for p in procs:
                p.join()
            failureFlag = 0
            while not fileIDQueue.empty():
                qFileID, qResult = fileIDQueue.get()
                if qResult:
                    failureFlag = 1
            if failureFlag:
                pStatus = 'F'
            else:
                pStatus = 'S'
            # Generate the FINRA manifest (done) file and push it to AWS
            returnValue = self.createFinraManifestFile(self.manifestFile)
            if returnValue:
                self.m_logger.error("Unable to generate done file. Please fix the issue and re-run the load")
            dbFlag = 0
            fileID = 0
            # Load the done file using the loader function
            finraManifestLoadStatus = self.loadData(self.finraManifestFile, fileID, fileIDQueue, dbFlag)
            if finraManifestLoadStatus:
                pStatus = 'F'
                self.m_logger.error("Unable to load finra manifest file")
            # DB_CALL - sp_ddy_insert_dataset_trans: record the final 'S' or 'F' status
            mySql = self.m_configDict["SQL"]["put_dataset"]
            myParams = {"datasetName": self.datasetName, "runID": self.runID, "tDate": self.tradeDate,
                        "status": pStatus, "tidalRunID": self.tidalRunID}
            returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql, myParams)
            if returnStr[0] != 0:
                self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1])
                sys.exit(1)
        except Exception as e:
            self.m_logger.error("ProcessLoader failed with error " + str(e))
            sys.exit(1)
class DEAExtractor():
    # class variables
    m_logger = ""
    # database objects
    m_oracle_db = ""

    def __init__(self, configFile, mktName, processingDate, debugFlag):
        """
        Purpose: Constructor
        :param self: class object itself
        :param configFile: Configuration file to use
        :param mktName: Market name
        :param processingDate: Processing date
        :param debugFlag: Enable debug output
        """
        # Initialize the m_logger object from class Logger and add a header to the log
        self.m_logger = Logger(logging.INFO, configFile, processingDate)
        self.m_logger.addFileHandler(logging.DEBUG)
        self.m_logger.addGenericInfo(__file__)
        self.processingDate = processingDate
        self.debugFlag = debugFlag
        self.configFile = configFile
        self.mktName = mktName
        try:
            # Get configuration into a dictionary
            self.m_configDict = configuration(self.configFile, True).m_dictionary
            # Initialize Oracle instance along with connection
            self.m_oracle_db = Oracle(self.m_configDict, self.m_logger)
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def readMktConfigFile(self, mktConfigFile):
        """
        Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference
        :param mktConfigFile: Market-specific configuration file
        :return:
        """
        try:
            self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration for logger " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def chkActiveLoads(self):
        """
        Purpose - To check the count of active loads happening at a given point
        :return: 0 on success, 1 on failure
        """
        try:
            if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y':
                localActiveLoadMax = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"])
                localActiveLoadWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"])
                localActiveLoadMaxWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"])
                mySql = self.m_configDict["SQL"]["get_active_loads"]
                activeFlag = 1
                totalActiveWaitTime = 0
                while activeFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                    if self.debugFlag:
                        print("mySql = ", mySql)
                        print("returnStr = ", returnStr)
                        print("chkActiveLoads - Active Loads value = ", int(returnStr[1].strip()))
                    if returnStr[0] != '0':
                        self.m_logger.error("Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    # If the number of active loads is within the configured maximum, return success
                    if int(returnStr[1].strip()) <= localActiveLoadMax:
                        activeFlag = 0
                        return 0
                    # Sleep for the configured "active_load_wait_time"
                    time.sleep(localActiveLoadWaitTime)
                    totalActiveWaitTime += localActiveLoadWaitTime
                    # If the total wait time exceeds the configured maximum, log an error and fail
                    if totalActiveWaitTime > localActiveLoadMaxWaitTime:
                        self.m_logger.error("In Fn chkActiveLoads. Total actual wait time exceeds the configured value "
                                            "active_load_max_wait_time. Either clean up orphaned loads or increase "
                                            "active_load_max or active_load_max_wait_time. totalActiveWaitTime = "
                                            + str(totalActiveWaitTime) + " localActiveLoadMaxWaitTime=" + str(localActiveLoadMaxWaitTime))
                        return 1
            else:
                return 0
            # Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkActiveLoads process with the error " + str(exp))
            sys.exit(1)

    def chkRaceStatus(self):
        """
        Purpose - To check if a load is already running for the given dataset
        :return: 0 on success, 1 on failure
        """
        try:
            if self.m_mktConfigDict["RACE"]["race_status_check_flag"] == 'Y':
                localRaceStatusWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_wait_time"])
                localRaceStatusMaxWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_max_wait_time"])
                # Substitute the placeholder names in the SQL template in a single re.sub pass
                tempSql = self.m_configDict["SQL"]["get_race_status"]
                myParamsDict = {'datasetName': self.datasetName}
                tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
                mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
                raceFlag = 1
                totalRaceStatusWaitTime = 0
                while raceFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                    if self.debugFlag:
                        print("tempSql = ", tempSql)
                        print("myParamsDict = ", myParamsDict)
                        print("mySql = ", mySql)
                        print("returnStr = ", returnStr)
                        print("chkRaceStatus - ReturnCode = ", int(returnStr[1].strip()))
                    if returnStr[0] != '0':
                        self.m_logger.error("Unable to get race status using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    # If no other load for this dataset is running, return success
                    if int(returnStr[1].strip()) <= 1:
                        raceFlag = 0
                        return 0
                    # Sleep for the configured "race_status_wait_time"
                    time.sleep(localRaceStatusWaitTime)
                    totalRaceStatusWaitTime += localRaceStatusWaitTime
                    if self.debugFlag:
                        print("totalRaceStatusWaitTime = ", totalRaceStatusWaitTime, "localRaceStatusWaitTime =", localRaceStatusWaitTime)
                    # If the total wait time exceeds the configured maximum, log an error and fail
                    if totalRaceStatusWaitTime > localRaceStatusMaxWaitTime:
                        self.m_logger.error("In Fn chkRaceStatus. Total actual wait time exceeds the configured value "
                                            "race_status_max_wait_time. Either check if the dataset is getting loaded or "
                                            "increase race_status_wait_time or race_status_max_wait_time. "
                                            "totalRaceStatusWaitTime = " + str(totalRaceStatusWaitTime)
                                            + " localRaceStatusMaxWaitTime=" + str(localRaceStatusMaxWaitTime))
                        return 1
            else:
                return 0
            # Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkRaceStatus process with the error " + str(exp))
            sys.exit(1)

    def extractData(self, localDataFile, localFileID, localFileIDQueue, localDBFlag):
        """
        Purpose - To extract the given datafile from the S3 bucket specified in the global mktConfigFile
        :param localDataFile: Data filename
        :param localFileID: Internal File ID assigned to the local datafile
        :param localFileIDQueue: Queue in which results of the operation are stored
        :param localDBFlag: Flag indicating whether the database should be used
        :return:
        """
        try:
            if self.debugFlag:
                print("Inside extractData function")
                print("localDataFile = ", localDataFile)
            if localDBFlag:
                # Race status and active load checks are disabled for Extract for now; it is
                # unclear whether they are needed and how they would integrate with tb_dxt_process_status.
                processID = os.getpid()
                hostName = socket.gethostname()
                # Insert process status into the Oracle db
                # DB_CALL - sp_dxt_insert_process_status(RUNID, FILE_ID, etc)
                tempSql = self.m_configDict["SQL"]["put_process_status"]
                pStatus = 'P'
                pComment = 'Load started'
                # Keep these at 0 for now
                localDataFileSize = 0
                localDataFileRecordCount = 0
                myParamsDict = {"datasetName": self.datasetName, "runID": str(self.runID), "fileName": localDataFile,
                                "tDate": str(self.processingDate), "processID": str(processID), "hostName": hostName,
                                "fileSize": str(localDataFileSize), "recordCount": str(localDataFileRecordCount),
                                "status": pStatus, "lcomment": pComment}
                tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
                mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
                returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                if self.debugFlag:
                    print("tempSql = ", tempSql)
                    print("myParamsDict = ", myParamsDict)
                    print("mySql = ", mySql)
                    print("returnStr = ", returnStr)
                if returnStr[0] != '0':
                    self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1])
                    return 1
            # localFileWthPath is the local stage dir with the file name
            localFileWthPath = self.m_configDict["ENV"]["stage_dir"] + "/" + localDataFile
            targetFolder = self.s3object.m_configFile["S3"]["path"]
            targetFileDir = targetFolder + self.processingDate + "/"
            # targetFileWthPath is the AWS dir with the file name
            targetFileWthPath = os.path.join(targetFileDir, os.path.basename(localDataFile))
            targetBucket = self.s3object.m_configFile["S3"]["bucket"]
            encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"]
            localAWSRetries = int(self.m_mktConfigDict["ENV"]["aws_retries"])
            if self.debugFlag:
                print("localFileWthPath =", localFileWthPath)
                print("targetFileWthPath =", targetFileWthPath)
                print("targetBucket =", targetBucket)
                print("encryptKeyFlag =", encryptKeyFlag)
                print("localAWSRetries =", localAWSRetries)
            initCount = 0
            while initCount < localAWSRetries:
                # Call s3.getDataSinglePart to download the file (single part load)
                extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket)
                if self.debugFlag:
                    print("extractReturnValue = ", extractReturnValue)
                if int(extractReturnValue) == 0:
                    pStatus = 'S'
                    pComment = 'Extract completed'
                    break
                else:
                    pStatus = 'F'
                    pComment = 'Extract failed'
                    initCount += 1
            # Get the size of the downloaded file. A cross-check of this size against a
            # manifest entry is disabled, as there is no manifest file to check against.
            localFileSize = os.stat(localFileWthPath).st_size
            localRecordCount = 0
            if localDBFlag:
                # Record 'S' or 'F' in tb_dxt_process_status
                tempSql = self.m_configDict["SQL"]["put_process_status"]
                myParamsDict = {"datasetName": self.datasetName, "runID": str(self.runID), "fileName": localDataFile,
                                "tDate": str(self.processingDate), "processID": str(processID), "hostName": hostName,
                                "fileSize": str(localFileSize), "recordCount": str(localRecordCount),
                                "status": pStatus, "lcomment": pComment}
                tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
                mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
                returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                if self.debugFlag:
                    print("tempSql = ", tempSql)
                    print("myParamsDict = ", myParamsDict)
                    print("mySql = ", mySql)
                    print("returnStr = ", returnStr)
                if returnStr[0] != '0':
                    self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1])
                    return 1
                localFileIDQueue.put((localFileID, extractReturnValue))
            return extractReturnValue
        except Exception as exp:
            self.m_logger.error("Failure in extractData process for file with the error " + str(exp))
            if localDBFlag:
                localFileIDQueue.put((localFileID, 1))
            else:
                return 1

    def getRecords(self, fileDict, startDateTime, endDateTime):
        """
        Purpose - Generator yielding filenames whose last-modified dates fall within the given window
        :param fileDict: Dictionary mapping file name to last-modified date
        :param startDateTime: Start DateTime in the format '2016-02-01 00:00:00'
        :param endDateTime: End DateTime in the format '2016-02-10 00:00:00'
        """
        try:
            if self.debugFlag:
                print("fileDict = ", fileDict)
                print("startDateTime = ", startDateTime)
                print("endDateTime = ", endDateTime)
            sorted_values = sorted(fileDict.values())
            start = bisect.bisect_left(sorted_values, startDateTime)
            end = bisect.bisect_right(sorted_values, endDateTime)
            if self.debugFlag:
                print("sorted_values = ", sorted_values)
                print("start = ", start)
                print("end = ", end)
            # No pattern filtering is needed for DEA; yield every file in the window
            for fileItem in sorted(fileDict.items())[start:end]:
                yield fileItem[0]
                if self.debugFlag:
                    print("fileItem[0] = ", fileItem[0])
        except Exception as exp:
            self.m_logger.error("Failed while executing getRecords to sort the dictionary content with Error = " + str(exp))
            sys.exit(1)

    def readManifestFile(self, manifestFileName):
        """
        Purpose - To read the content of FINRA's manifest file (key-value pairs) into a nested dictionary
        :param manifestFileName: FINRA's manifest file containing data filenames, file sizes and row counts
        """
        try:
            manifestRecordStartPattern = self.m_configDict["dxt"]["MANIFEST_RECORD_START_PATTERN"]
            if self.debugFlag:
                print("manifestRecordStartPattern =", manifestRecordStartPattern)
            with open(manifestFileName) as infile:
                manifestFileDict = {}
                file = None
                line_count = 0
                for line in infile:
                    line = line.strip()
                    if line.startswith(manifestRecordStartPattern):
                        # Start a new record
                        line_count += 1
                        file = line_count
                        manifestFileDict[file] = {}
                    var, val = line.split(':', 1)
                    manifestFileDict[file][var.strip()] = val.strip()
            if self.debugFlag:
                print("manifestFileDict = ", manifestFileDict)
            return manifestFileDict
        except Exception as exp:
            self.m_logger.error("Failed while executing readManifestFile to get FINRA manifest file into nested dictionary, Error = " + str(exp))
            sys.exit(1)

    def getFileList(self, startDateTime, endDateTime, s3Bucket, s3Path, folderPosition):
        """
        Purpose - To list the bucket by last-modified date and return the filenames within the window
        :param startDateTime: Start DateTime in the format '2016-02-01 00:00:00'
        :param endDateTime: End DateTime in the format '2016-02-10 00:00:00'
        :param s3Bucket: Bucket to list
        :param s3Path: Path within the bucket
        :param folderPosition: Position of the folder component in the listed keys
        """
        try:
            if self.debugFlag:
                print("s3Bucket = ", s3Bucket)
                print("s3Path = ", s3Path)
                print("startDateTime = ", startDateTime)
                print("endDateTime = ", endDateTime)
                print("folderPosition = ", folderPosition)
            fileListDict = self.s3object.listBucketWPathByLastModified(s3Bucket, s3Path, folderPosition)
            fileList = list(self.getRecords(fileListDict, startDateTime, endDateTime))
            if self.debugFlag:
                print("fileListDict = ", fileListDict)
                print("fileList = ", fileList)
            return fileList
        except Exception as exp:
            self.m_logger.error("Failed while creating AWS manifest file list with Error = " + str(exp))
            return 1

    def processDEAExtractor(self):
        """
        Purpose - Function responsible for getting the AWS token, reading the last modified date
                  in the db and fetching the list of files from AWS to be processed
        :return:
        """
        try:
            # DB_CALL - sp_dxt_validate_mktName(mktName) validation is currently disabled
            if self.debugFlag:
                print("MktName from DB = ", self.mktName)
            # Build the mktConfigFile string based on mktName and configFile info
            self.mktConfigFile = (os.path.dirname(self.configFile) + '/'
                                  + os.path.basename(self.configFile).split('.', 1)[0].strip()
                                  + '_' + self.mktName.lower() + '.'
                                  + os.path.basename(self.configFile).split('.', 1)[1].strip())
            if self.debugFlag:
                print("mktConfigFile = ", self.mktConfigFile)
            # Validate that the market config file is a valid file
            if not os.path.isfile(self.mktConfigFile):
                self.m_logger.error("Invalid market manifest file " + self.mktConfigFile)
                sys.exit(1)
            # Read the market-specific config file and store it in its own dictionary
            self.readMktConfigFile(self.mktConfigFile)
            if self.debugFlag:
                print("m_mktConfigDict=", self.m_mktConfigDict)
            # Get RunID
            self.runID = generate_runId()
            if self.debugFlag:
                print("RunID = ", self.runID)
            # Initialize the S3 object, get the FINRA cloud service token and establish the s3 session
            self.s3object = S3(self.mktConfigFile, self.m_logger, self.debugFlag)
            tokenRetryTimes = int(self.m_configDict["TOKEN"]["token_retry_times"])
            tokenRetryWaitTime = int(self.m_configDict["TOKEN"]["token_retry_wait_time"])
            deaFileWaitTime = int(self.m_configDict["dea"]["DEA_FILE_WAIT_TIME"])
            deaFileSleepTime = int(self.m_configDict["dea"]["DEA_FILE_SLEEP_TIME"])
            s3TimeoutTime = int(self.m_configDict["dea"]["S3_TIMEOUT_TIME"])
            # Placeholder for the future, when FINRA sends manifests for zero byte files every day
            handleNoDatafileFlag = self.m_configDict["dea"]["HANDLE_NO_DATAFILE_FLAG"]
            deaActualTime = 0
            # Settings for downloading files from AWS into a specific folder
            localFileDir = self.s3object.m_configFile["ENV"]["stage_dir"]
            targetFolder = self.s3object.m_configFile["S3"]["path"]
            targetBucket = self.s3object.m_configFile["S3"]["bucket"]
            encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"]
            folderPosition = int(self.s3object.m_configFile["S3"]["folder_position"])
            targetFileDir = targetFolder + self.processingDate + "/"
            if self.debugFlag:
                print("localFileDir = ", localFileDir)
                print("targetFolder = ", targetFolder)
                print("targetBucket = ", targetBucket)
                print("encryptKeyFlag = ", encryptKeyFlag)
                print("self.processingDate = ", self.processingDate)
                print("targetFileDir = ", targetFileDir)
            startDate = ((datetime.now() - relativedelta(years=1)) + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
            endDate = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
            getTokenFlag = 0
            fileExistFlag = 0
            while deaActualTime < deaFileWaitTime:
                # Get the token only the first time, or when the elapsed time exceeds s3TimeoutTime
                if deaActualTime > s3TimeoutTime or not getTokenFlag:
                    getTokenFlag = 1
                    initCount = 0
                    while initCount < tokenRetryTimes:
                        tokenReturnCode = self.s3object.getToken()
                        if not tokenReturnCode:
                            break
                        initCount += 1
                        if initCount == tokenRetryTimes:
                            self.m_logger.error("Error: Exceeded the max retries " + str(tokenRetryTimes)
                                                + " to get AWS Token from FINRA. Please re-try after some time or escalate.")
                            sys.exit(1)
                        time.sleep(tokenRetryWaitTime)
                    self.currentEpochTime = int(time.time())
                # Get the list of files from the AWS folder for the given processing date
                fileList = self.getFileList(startDate, endDate, targetBucket, targetFileDir, folderPosition)
                if len(fileList):
                    if self.debugFlag:
                        print("fileList = ", fileList)
                    fileExistFlag = 1
                    break
                time.sleep(deaFileSleepTime)
                deaActualTime += deaFileSleepTime
                if self.debugFlag:
                    print("deaActualTime =", deaActualTime)
                    print("deaFileSleepTime =", deaFileSleepTime)
                    print("deaFileWaitTime =", deaFileWaitTime)
                self.m_logger.info("INFO : Waiting for file in FINRA's cloud, " + str(deaFileWaitTime - deaActualTime) + " secs remaining...")
            # End of while
            tblName = self.m_mktConfigDict["dea"]["TARGET_TBL_NAME"] + "_" + self.mktName.upper()
            # If no files exist for the given day, create a zero byte data file and a manifest file for it
            if not fileExistFlag:
                deaDummyDataFile = self.m_configDict["dea"]["DEA_DUMMY_DATA_FILE"].replace("PDATE", str(self.processingDate))
                deaDummyDataFileWthPath = self.m_configDict["ENV"]["stage_dir"] + "/" + deaDummyDataFile
                open(deaDummyDataFileWthPath, 'a').close()
                fatlManifestFile = self.m_configDict["ENV"]["stage_dir"] + "/" + tblName + "." + self.processingDate + ".manifest"
                if self.debugFlag:
                    print("fileExistFlag = ", fileExistFlag)
                with open(fatlManifestFile, "w") as fh:
                    fileSize = 0
                    if self.debugFlag:
                        print("deaDummyDataFileWthPath = ", deaDummyDataFileWthPath)
                        print("fileSize = ", fileSize)
                        print("tblName = ", tblName, "file = ", deaDummyDataFile, "fileSize = ", fileSize, "mktName = ", self.mktName)
                    fh.write(tblName + "|" + deaDummyDataFile + "|" + str(fileSize) + "|" + "0" + "\n")
                self.m_logger.info("INFO : No File found for processing date " + self.processingDate
                                   + ". Creating zero byte data file " + deaDummyDataFileWthPath
                                   + " and manifest file " + fatlManifestFile)
                sys.exit(0)
            localAWSRetries = int(self.m_mktConfigDict["ENV"]["aws_retries"])
            # Insert a record into tb_dxt_dataset_trans with status 'P' for the given dataset,
            # recording that the process for this run has started. tblName is used as the dataset
            # name for DEA, as there is no dataset or manifest-file concept here.
            pStatus = 'P'
            self.datasetName = tblName
            tempSql = self.m_configDict["SQL"]["put_dataset"]
            myParamsDict = {'datasetName': self.datasetName, 'runID': str(self.runID),
                            'tDate': str(self.processingDate), 'status': pStatus}
            tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
            mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
            returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
            if self.debugFlag:
                print("tempSql = ", tempSql)
                print("myParamsDict = ", myParamsDict)
                print("mySql = ", mySql)
                print("returnStr = ", returnStr)
            if returnStr[0] != '0':
                self.m_logger.error("Unable to insert into tb_dxt_dataset_trans table " + mySql + ". Error = "
                                    + self.datasetName + " for processing date = " + self.processingDate)
                sys.exit(1)
            fileID = 1
            dbFlag = 1
            fileIDQueue = Queue()
            procs = []
            doneCounter = 0
            sendCounter = 0
            failureFlag = 0
            processFlag = 0
            process_count = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"])
            while doneCounter < len(fileList):
                # Fan out up to process_count extractData workers at a time
                while sendCounter < len(fileList) and sendCounter - doneCounter < process_count:
                    processHandle = Process(target=DEAExtractor.extractData, args=(self, fileList[sendCounter], fileID, fileIDQueue, dbFlag))
                    processFlag = 1
                    # Refresh the token when the elapsed time exceeds s3TimeoutTime
                    if (int(time.time()) - self.currentEpochTime) > s3TimeoutTime:
                        self.currentEpochTime = int(time.time())
                        self.m_logger.info("Refreshing S3 token")
                        if self.debugFlag:
                            print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                            print("self.currentEpochTime = ", self.currentEpochTime)
                            print("Current Time in Epoch = ", int(time.time()))
                        initCount = 0
                        while initCount < tokenRetryTimes:
                            tokenReturnCode = self.s3object.getToken()
                            if not tokenReturnCode:
                                break
                            initCount += 1
                            if initCount == tokenRetryTimes:
                                self.m_logger.error("Error: Exceeded the max retries " + str(tokenRetryTimes)
                                                    + " to get AWS Token from FINRA. Please re-try after some time or escalate.")
                                sys.exit(1)
                            time.sleep(tokenRetryWaitTime)
                    threadDelayTime = int(self.m_configDict["dea"]["THREAD_DELAY_TIME"])
                    time.sleep(threadDelayTime)
                    processHandle.start()
                    procs.append(processHandle)
                    sendCounter += 1
                    fileID += 1
                if processFlag:
                    for p in procs:
                        p.join()
                    procs = []
                    processFlag = 0
                # Process completed results as they arrive
                while not fileIDQueue.empty():
                    qFileID, qResult = fileIDQueue.get()
                    if self.debugFlag:
                        print("qFileID = ", qFileID, "qResult = ", qResult)
                    doneCounter += 1
                    if qResult:
                        failureFlag = 1
                if self.debugFlag:
                    print("ProcessFlag = ", processFlag, "sendCounter = ", sendCounter, "doneCounter = ", doneCounter)
                if failureFlag:
                    break
            if self.debugFlag:
                print("Failure Flag = ", failureFlag)
            if failureFlag:
                pStatus = 'F'
            else:
                pStatus = 'S'
            fatlManifestFile = self.m_configDict["ENV"]["stage_dir"] + "/" + tblName + "." + self.processingDate + ".manifest"
            if self.debugFlag:
                print("File List = ", fileList)
                print("fatlManifestFile =", fatlManifestFile)
            with open(fatlManifestFile, "w") as fh:
                counter = 0
                for file in fileList:
                    sourceFileWthPath = self.m_configDict["ENV"]["stage_dir"] + "/" + file
                    fileSize = os.stat(sourceFileWthPath).st_size
                    if self.debugFlag:
                        print("sourceFileWthPath = ", sourceFileWthPath)
                        print("fileSize = ", fileSize)
                        print("tblName = ", tblName, "file = ", file, "fileSize = ", fileSize, "mktName = ", self.mktName)
                    fh.write(tblName + "|" + file + "|" + str(fileSize) + "|" + "0" + "\n")
                    counter += 1
            # DB_CALL - sp_dxt_insert_dataset_trans: record the final 'S' or 'F' status in tb_dxt_dataset_trans
            tempSql = self.m_configDict["SQL"]["put_dataset"]
            myParamsDict = {"datasetName": self.datasetName, "runID": str(self.runID),
                            "tDate": str(self.processingDate), "status": pStatus}
            tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
            mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
            returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
            if self.debugFlag:
                print("tempSql = ", tempSql)
                print("myParamsDict = ", myParamsDict)
                print("mySql = ", mySql)
                print("returnStr = ", returnStr)
            if returnStr[0] != '0':
                self.m_logger.error("Unable to insert into tb_dxt_dataset_trans table " + mySql + ". Error = "
                                    + self.datasetName + " for processing date = " + self.processingDate)
                sys.exit(1)
        except Exception as e:
            self.m_logger.error("processDEAExtractor failed with error " + str(e))
            sys.exit(1)
class Extractor():
    # class variables
    m_logger = ""
    # database objects
    m_oracle_db = ""

    def __init__(self, configFile, mktName, tradeDate, debugFlag):
        """
        Purpose: Constructor
        :param self: class object itself
        :param configFile: Configuration file to use
        :param mktName: Market name
        :param tradeDate: Trade date being processed
        :param debugFlag: Enable debug output
        """
        # Initialize the m_logger object from class Logger and add a header to the log
        self.m_logger = Logger(logging.INFO, configFile, tradeDate)
        self.m_logger.addFileHandler(logging.DEBUG)
        self.m_logger.addGenericInfo(__file__)
        self.tradeDate = tradeDate
        self.debugFlag = debugFlag
        self.configFile = configFile
        self.mktName = mktName
        try:
            # Get configuration into a dictionary
            self.m_configDict = configuration(self.configFile, True).m_dictionary
            # Initialize Oracle instance along with connection
            self.m_oracle_db = Oracle(self.m_configDict, self.m_logger)
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def readMktConfigFile(self, mktConfigFile):
        """
        Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference
        :param mktConfigFile: Market-specific configuration file
        :return:
        """
        try:
            self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration for logger " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def chkActiveLoads(self):
        """
        Purpose - To check the count of active loads happening at a given point
        :return: 0 on success, 1 on failure
        """
        try:
            if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y':
                localActiveLoadMax = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"])
                localActiveLoadWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"])
                localActiveLoadMaxWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"])
                mySql = self.m_configDict["SQL"]["get_active_loads"]
                activeFlag = 1
                totalActiveWaitTime = 0
                while activeFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                    if self.debugFlag:
                        print("mySql = ", mySql)
                        print("returnStr = ", returnStr)
                        print("chkActiveLoads - Active Loads value = ", int(returnStr[1].strip()))
                    if returnStr[0] != '0':
                        self.m_logger.error("Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    # If the number of active loads is within the configured maximum, return success
                    if int(returnStr[1].strip()) <= localActiveLoadMax:
                        activeFlag = 0
                        return 0
                    # Sleep for the configured "active_load_wait_time"
                    time.sleep(localActiveLoadWaitTime)
                    totalActiveWaitTime += localActiveLoadWaitTime
                    # If the total wait time exceeds the configured maximum, log an error and fail
                    if totalActiveWaitTime > localActiveLoadMaxWaitTime:
                        self.m_logger.error("In Fn chkActiveLoads. Total actual wait time exceeds the configured value "
                                            "active_load_max_wait_time. Either clean up orphaned loads or increase "
                                            "active_load_max or active_load_max_wait_time. totalActiveWaitTime = "
                                            + str(totalActiveWaitTime) + " localActiveLoadMaxWaitTime=" + str(localActiveLoadMaxWaitTime))
                        return 1
            else:
                return 0
            # Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkActiveLoads process with the error " + str(exp))
            sys.exit(1)

    def chkRaceStatus(self):
        """
        Purpose - To check if a load is already running for the given dataset
        :return: 0 on success, 1 on failure
        """
        try:
            if self.m_mktConfigDict["RACE"]["race_status_check_flag"] == 'Y':
                localRaceStatusWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_wait_time"])
                localRaceStatusMaxWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_max_wait_time"])
                # Substitute the placeholder names in the SQL template in a single re.sub pass
                tempSql = self.m_configDict["SQL"]["get_race_status"]
                myParamsDict = {'datasetName': self.datasetName}
                tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
                mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
                raceFlag = 1
                totalRaceStatusWaitTime = 0
                while raceFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                    if self.debugFlag:
                        print("tempSql = ", tempSql)
                        print("myParamsDict = ", myParamsDict)
                        print("mySql = ", mySql)
                        print("returnStr = ", returnStr)
                        print("chkRaceStatus - ReturnCode = ", int(returnStr[1].strip()))
                    if returnStr[0] != '0':
                        self.m_logger.error("Unable to get race status using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    # If no other load for this dataset is running, return success
                    if int(returnStr[1].strip()) <= 1:
                        raceFlag = 0
                        return 0
                    # Sleep for the configured "race_status_wait_time"
                    time.sleep(localRaceStatusWaitTime)
                    totalRaceStatusWaitTime += localRaceStatusWaitTime
                    if self.debugFlag:
                        print("totalRaceStatusWaitTime = ", totalRaceStatusWaitTime, "localRaceStatusWaitTime =", localRaceStatusWaitTime)
                    # If the total wait time exceeds the configured maximum, log an error and fail
                    if totalRaceStatusWaitTime > localRaceStatusMaxWaitTime:
                        self.m_logger.error("In Fn chkRaceStatus. Total actual wait time exceeds the configured value "
                                            "race_status_max_wait_time. Either check if the dataset is getting loaded or "
                                            "increase race_status_wait_time or race_status_max_wait_time. "
                                            "totalRaceStatusWaitTime = " + str(totalRaceStatusWaitTime)
                                            + " localRaceStatusMaxWaitTime=" + str(localRaceStatusMaxWaitTime))
                        return 1
            else:
                return 0
            # Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkRaceStatus process with the error " + str(exp))
            sys.exit(1)

    def extractData(self, localDataRecordList, localFileID, localFileIDQueue, localDBFlag):
        """
        Purpose - To extract the given datafile from the S3 bucket specified in the global mktConfigFile
        :param localDataRecordList: Datafile info from FINRA's manifest file: filename, filesize, recordcount
        :param localFileID: Internal File ID assigned to the local datafile
        :param localFileIDQueue: Queue in which results of the operation are stored
        :param localDBFlag: Flag indicating whether the database should be used
        :return:
        """
        try:
            if self.debugFlag:
                print("Inside extractData function")
            if localDBFlag:
                # Race status and active load checks are disabled for Extract for now; it is
                # unclear whether they are needed and how they would integrate with tb_dxt_process_status.
                processID = os.getpid()
                hostName = socket.gethostname()
                # Need to check the order of the manifest record fields
                localDataFile = localDataRecordList[1]
                dataFileSize = int(localDataRecordList[2])
                dataFileRecordCount = int(localDataRecordList[3])
                # Insert process status into the Oracle db
                # DB_CALL - sp_dxt_insert_process_status(RUNID, FILE_ID, etc)
                tempSql = self.m_configDict["SQL"]["put_process_status"]
                pStatus = 'P'
                pComment = 'Load started'
                myParamsDict = {"datasetName": self.datasetName, "runID": str(self.runID), "fileName": localDataFile,
                                "tDate": str(self.tradeDate), "processID": str(processID), "hostName": hostName,
                                "fileSize": str(dataFileSize), "recordCount": str(dataFileRecordCount),
                                "status": pStatus, "lcomment": pComment}
                tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
                mySql = re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)
                returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                if self.debugFlag:
                    print("tempSql = ", tempSql)
                    print("myParamsDict = ", myParamsDict)
                    print("mySql = ", mySql)
                    print("returnStr = ", returnStr)
                if returnStr[0] != '0':
                    self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1])
                    return 1
Error = " + returnStr[1]) sys.exit(1) # Get the dataFileName file to be extracted from AWS dataFileName = localDataRecordList[1] #Here localFileWthPath is the local stage dir with file name localFileWthPath = self.m_configDict["ENV"]["stage_dir"] + "/" + dataFileName #Here targetFileWthPath is the AWS dir with file name targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(dataFileName)) targetBucket = self.s3object.m_configFile["S3"]["bucket"] encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"] localAWSRetries = int(self.m_mktConfigDict["ENV"]["aws_retries"]) if self.debugFlag: print("localFileWthPath =", localFileWthPath) print("targetFileWthPath =", targetFileWthPath) print("targetBucket =", targetBucket) print("encryptKeyFlag =", encryptKeyFlag) print("localAWSRetries =", localAWSRetries) initCount = 0 while (initCount < localAWSRetries): extractReturnValue = 0 #Call s3.data download to extract the manifest file (single part load) #extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket) if self.debugFlag: print "extractReturnValue = ", extractReturnValue if int(extractReturnValue) == 0: pStatus = 'S' pComment = 'Load completed' break else: pStatus = 'F' pComment = 'Load failed' initCount += 1 # Get the size of the file downloaded localFileSize = os.stat(localFileWthPath).st_size # Check if the downloaded file size is matching with what is mentioned in manifest file. If not mark it as failed if localFileSize != dataFileSize: pStatus = 'F' pComment = 'Actual file size != Manifest file size' localRecordCount = 0 if localDBFlag: #Call DB to insert 'S' or 'F' in tb_dxt_process_status #localFileIDQueue.put((localFileID, extractReturnValue)) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] #myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "fileSize": localFileSize, "recordCount" : localRecordCount, "status":pStatus , "lcomment":pComment} myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "fileSize": str(localFileSize), "recordCount" : str(localRecordCount), "status":pStatus , "lcomment":pComment} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) sys.exit(1) localFileIDQueue.put((localFileID,extractReturnValue)) else: return extractReturnValue except Exception as exp: self.m_logger.error("Failure in extractData process for file with the error " + str(exp)) if localDBFlag: localFileIDQueue.put((localFileID, 1)) else: return 1 def getRecords(self, fileDict, startDateTime, endDateTime): """ Purpose - Function to sort the dictionary based on the key and return a sorted list :param fileDict : Dictionary containing Last_modified Date and file name :param startDateTime : Start DateTime in the format '2016-02-01 00:00:00' :param endDateTime : End DateTime in the format '2016-02-10 00:00:00' """ try: patternToSearch = self.m_configDict["ENV"]["pattern_to_search"] if self.debugFlag: print "fileDict = ", fileDict print "patternToSearch = ", patternToSearch print "startDateTime = ", startDateTime print "endDateTime = ", endDateTime sorted_keys = sorted(fileDict.iterkeys()) start = bisect.bisect_left(sorted_keys, startDateTime) end = bisect.bisect_right(sorted_keys, endDateTime) if self.debugFlag: print "start = ", start print "end = ", end for fileItem in sorted(fileDict.iteritems())[start:end]: print "For fileItem = ", fileItem if patternToSearch in fileItem[1]: yield fileItem[1] except Exception as exp: self.m_logger.error("Failed while executing getRecords to sort the dictionary content of dictionary with Error = " + str(exp)) sys.exit(1) def getManifestFileList(self, startDateTime, endDateTime): """ Purpose - Function to sort the dictionary based on the key and return a sorted list :param startDateTime : Start DateTime in the format '2016-02-01 00:00:00' :param endDateTime : End DateTime in the format '2016-02-10 00:00:00' """ try: if self.debugFlag: print "S3 Bucket = ", self.m_configDict["S3"]["bucket"] print "S3 Path = ", self.m_configDict["S3"]["path"] print "startDateTime = ", startDateTime print "endDateTime = ", endDateTime fileListDict = self.s3object.listBucketWPathByLastModified(self.m_configDict["S3"]["bucket"], self.m_configDict["S3"]["path"]) #endDateTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S") manifestFileList = list(self.getRecords(fileListDict, startDateTime, endDateTime)) if self.debugFlag: print "fileListDict = ", fileListDict print "manifestFileList = ", manifestFileList return manifestFileList except Exception as exp: self.m_logger.error("Failed while creating AWS manifest file list with Error = " + str(exp)) return 1 def processExtractor(self): """ Purpose - Function responsible for getting the AWS token and reading the last modified date in DB and fetch the list of files from AWS to be processed :param : None :return: """ try: # DB_CALL # Make database call sp_dxt_validate_mktName(mktName) to validate mktName tempSql = self.m_configDict["SQL"]["validate_market_name"] myParamsDict = { 'mktName' : self.mktName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Invalid market name provided " + mySql + ". 
Error = " + self.mktName) sys.exit(1) if self.debugFlag: print "MktName from DB = ", self.mktName #Build the string for mktConfigFile based on mktName and configFile info self.mktConfigFile = os.path.dirname(self.configFile) + '/' + os.path.basename(self.configFile).split('.',1)[0].strip() + '_' + self.mktName.lower() + '.' + os.path.basename(self.configFile).split('.',1)[1].strip() if self.debugFlag: print("mktConfigFile = ", self.mktConfigFile) #Validate Market Config file is a valid file if not os.path.isfile(self.mktConfigFile): self.m_logger.error("Invalid market manifest file " + self.mktConfigFile) sys.exit(1) # Read Market specific config file and store it in a specific dictionary self.readMktConfigFile(self.mktConfigFile) if self.debugFlag: print("m_mktConfigDict=",self.m_mktConfigDict) # Read the table for the given market and fetch the last modified timestamp for the given manifest file #tempSql = self.m_configDict["SQL"]["get_last_modified"] #myParamsDict = { 'mktName' : self.mktName } #tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) #mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) #returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) #if self.debugFlag: #print "tempSql = ", tempSql #print "myParamsDict = ", myParamsDict #print "mySql = ", mySql #print "returnStr = ", returnStr #if returnStr[0] == '0': #if returnStr[1]: #lastModifiedDate=returnStr[1] #else: #lastModifiedDate="2015-01-01 00:00:00" #else: #self.m_logger.error("Unable to get last_modified date using the sql " + mySql + ". Error = " + self.mktName) #sys.exit(1) #Temp call. Need to enable the previous lines to use DB call lastModifiedDate="2015-01-01 00:00:00" # Get RunID self.runID = generate_runId() if self.debugFlag: print("RunID = ", self.runID) # Initialize S3 object and get FINRA cloud service token and establish s3 session self.s3object = S3(self.mktConfigFile, self.m_logger, self.debugFlag) self.s3object.getToken() # Get list of Manifest files to be processed #currentDate = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #currentDate = datetime.now().strftime("%Y-%m-%d %H:%M:%S") currentDate = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S") finraManifestFileList = self.getManifestFileList(lastModifiedDate, currentDate) if self.debugFlag: print("finraManifestFileList = ", finraManifestFileList) # Download manifest files in the manifest file list to a specific folder from AWS localFileDir = self.s3object.m_configFile["ENV"]["stage_dir"] targetFolder = self.s3object.m_configFile["S3"]["path"] #targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(localFileWthPath)) targetBucket = self.s3object.m_configFile["S3"]["bucket"] encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"] #targetFileWthPath="50006/slmm_mnem.007.txt.gz" #localFileWthPath="/tmp/slmm_mnem.007.txt.gz" # Get an instance of the Manifest class localManifest = Manifest() fileIDQueue = Queue() localAWSRetries = int(self.m_mktConfigDict["ENV"]["aws_retries"]) for finraManifestFile in finraManifestFileList: targetFileWthPath = targetFolder + finraManifestFile localFileWthPath = localFileDir + "/" + finraManifestFile if self.debugFlag: print "targetFileWthPath = ", targetFileWthPath print "localFileWthPath = ", localFileWthPath print "finraManifestFile = ", finraManifestFile initCount = 0 while (initCount < localAWSRetries): extractReturnValue = 0 #Call s3.data download to extract the manifest file (single part load) #extractReturnValue = 
self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket) if self.debugFlag: print "extractReturnValue = ", extractReturnValue if extractReturnValue: # Try it again initCount += 1 else: # Come out of the loop break # End of while loop for AWS Retries if extractReturnValue: self.m_logger.error("Unable to fetch manifestFile = " + finraManifestFile + "from the path = " + targetFileWthPath + " to the local filesystem = " + localFileWthPath ) sys.exit(1) """ Not needed if extractReturnValue == 0: pStatus = 'P' pComment = 'Load completed' break else: pStatus = 'F' pComment = 'Load failed' """ initCount += 1 # get datasetname from the manifest file. Need check based on FINRA naming self.datasetName = os.path.basename(finraManifestFile).split('.',3)[1].strip().upper() if self.debugFlag: print "datasetName = ", self.datasetName # Need to check DB call, once it is ready # Validate the manifest file name to make sure that we are expecting it tempSql = self.m_configDict["SQL"]["validate_dataset_name"] myParamsDict = { 'datasetName' : self.datasetName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr #Check if dataset is there in the tb_dxt_dataset_master, if not, skip it and move to the next file. For other errors, exit out of the program if int(returnStr[0]) < 0: self.m_logger.error("Unable to validate datasetName " + mySql + ". Error = " + self.datasetName) sys.exit(1) elif int(returnStr[0]) > 0: self.m_logger.info("Give Dataset is not in the list to process. Skipping it" + mySql + ". Dataset Name = " + self.datasetName) # Continue to the next file entry in the manifest list continue # Insert a record into tb_dxt_dataset_trans with status 'P' for the given datasetName, saying that we start the process for this manifest file pStatus = 'P' tempSql = self.m_configDict["SQL"]["put_dataset"] myParamsDict = {'datasetName':self.datasetName, 'runID': str(self.runID), 'tDate':str(self.tradeDate), 'status': pStatus } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to insert into tb_dxt_dataset_trans table " + mySql + ". Error = " + self.datasetName) sys.exit(1) # Read the contents of manifestfile i.e. 
dataFileNames into a list - Will validate the datafiles as well manifestDelim = self.m_configDict["ENV"]["manifest_delim"] if self.debugFlag: print "localFileWthPath = ", localFileWthPath manifestFileList = localManifest.readManifest(localFileWthPath, self.m_logger, manifestDelim, self.debugFlag) if self.debugFlag: print "manifestDelim = ", manifestDelim print "manifestFileList = ", manifestFileList process_count = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"]) # Now go into multiprocessing and call extractData function and extract files ones by one fileID=1 dbFlag=1 fileIDQueue = Queue() procs = [] doneCounter = 0 sendCounter = 0 failureFlag = 0 while doneCounter < len(manifestFileList): while sendCounter < len(manifestFileList) and sendCounter - doneCounter < process_count: if self.debugFlag: print "manifestFileList[sendCounter][1]", manifestFileList[sendCounter][1], "fileID = ", fileID # Call fn extractData to fetch files from AWS. Pass manifestFileList[sendCounter] as it contains the whole record including the filename, filesize & row count processHandle = Process(target=Extractor.extractData, args=(self, manifestFileList[sendCounter],fileID, fileIDQueue, dbFlag)) processFlag=1 processHandle.start() procs.append(processHandle) sendCounter += 1 fileID += 1 if processFlag: for p in procs: p.join() procs=[] processFlag=0 while not fileIDQueue.empty(): # process completed results as they arrive #time.sleep(3) qFileID, qResult = fileIDQueue.get() if self.debugFlag: print("qFileID = ", qFileID, "qResult = ", qResult) doneCounter += 1 if qResult: failureFlag = 1 if self.debugFlag: print "ProcessFlag = ", processFlag, "sendCounter = ", sendCounter, "doneCounter = ", doneCounter if failureFlag: break if self.debugFlag: print "Failure Flag = ", failureFlag if failureFlag: pStatus = 'F' else: pStatus = 'S' # Move all the data files to inbox from the stg location. No need for this step, as Joejo mentioned there will be another Tidal job doing this step # Move the manifest file to inbox from the stg location # insert a record into tb_dxt_dataset_trans table with 'S' or 'F' record #Call Oracle fn to insert status 'S' into TB_DDY_DATASET_TRANS with RUNID etc #DB_CALL # Make database call sp_dxt_insert_dataset_trans and insert data based on Failure or Success mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_dataset"] myParamsDict = {"datasetName":self.datasetName, "runID": str(self.runID), "tDate":str(self.tradeDate), "status": pStatus } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) if failureFlag: self.m_logger.error("Extract failed for data files for manifest file " + self.manifestFile) sys.exit(1) # End of for loop for finraManifestFiles except Exception as e: self.m_logger.error("ProcessExtractor failed with error " + str(e)) sys.exit(1)
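# processExtractor throttles its fan-out by hand: it never lets sendCounter
# run more than process_count ahead of doneCounter, joins the batch, then
# drains (fileID, result) pairs off a shared Queue and latches failureFlag.
# A stripped-down sketch of that loop, with a stub worker standing in for
# Extractor.extractData (worker and run_batched are illustrative names):
from multiprocessing import Process, Queue

def worker(item, fileID, fileIDQueue):
    # Stand-in for extractData: report (fileID, returnCode); 0 means success.
    fileIDQueue.put((fileID, 0))

def run_batched(items, process_count):
    fileIDQueue = Queue()
    procs = []
    sendCounter = doneCounter = failureFlag = 0
    fileID = 1
    while doneCounter < len(items):
        # Fan out, but stay at most process_count ahead of completions.
        while sendCounter < len(items) and sendCounter - doneCounter < process_count:
            p = Process(target=worker, args=(items[sendCounter], fileID, fileIDQueue))
            p.start()
            procs.append(p)
            sendCounter += 1
            fileID += 1
        for p in procs:
            p.join()
        procs = []
        # Drain completed results as they arrive and latch any failure.
        while not fileIDQueue.empty():
            qFileID, qResult = fileIDQueue.get()
            doneCounter += 1
            if qResult:
                failureFlag = 1
        if failureFlag:
            break
    return failureFlag

if __name__ == '__main__':
    print(run_batched(['f1.gz', 'f2.gz', 'f3.gz', 'f4.gz'], process_count=2))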
def getProcedure(self, procName):
    if self.useOracleMod:
        return Oracle.getProcedure(self.realConn, procName)
    else:
        return getattr(self.getCursor().procedures, procName)
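# The else-branch above resolves a stored procedure by plain attribute lookup
# on the cursor's procedure registry. A minimal illustration of that
# getattr-based dispatch -- ProcRegistry and sp_get_status are illustrative
# stand-ins, not the real API:
class ProcRegistry(object):
    def __init__(self, **procs):
        # Expose each procedure as an attribute so getattr() can resolve it.
        self.__dict__.update(procs)

registry = ProcRegistry(sp_get_status=lambda: 'S')
proc = getattr(registry, 'sp_get_status')
print(proc())  # S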
def BFGS(Oracle, x0):

    ##### Variable initialization
    iter_max = 10000
    # gradient_step_ini = 1.     # Primal problem.
    gradient_step_ini = 1000.    # Dual problem.
    threshold = 0.000001
    error_count = 0  # Counter for non-convergence of the Fletcher-Lemarechal algorithm.

    gradient_norm_list = []
    gradient_step_list = []
    critere_list = []

    time_start = process_time()

    x = x0

    ##### Iteration loop
    for k in range(iter_max):

        # New values of the criterion and the gradient
        critere, gradient = Oracle(x, 4)

        # Convergence test
        gradient_norm = norm(gradient)
        if gradient_norm <= threshold:
            break

        # Descent direction
        if k == 0:
            W = np.eye(len(gradient))
        else:
            delta_x = x - x_p
            delta_g = gradient - gradient_p
            delta_mat_1 = np.outer(delta_x, delta_g) / np.vdot(delta_g, delta_x)
            delta_mat_2 = np.outer(delta_x, delta_x) / np.vdot(delta_g, delta_x)
            I = np.eye(len(gradient))  # Identity matrix
            W = np.dot(np.dot(I - delta_mat_1, W_p),
                       I - np.transpose(delta_mat_1)) + delta_mat_2
        direction = np.dot(-W, gradient)

        # Descent step
        gradient_step, error_code = Wolfe(gradient_step_ini, x, direction, Oracle)
        if error_code != 1:
            error_count += 1

        # Variable updates
        x_p = x                  # Previous value of the position
        gradient_p = gradient    # Previous value of the gradient
        direction_p = direction  # Previous value of the direction
        W_p = W
        x = x + (gradient_step * direction)

        # Evolution of the gradient, the step, and the criterion
        gradient_norm_list.append(gradient_norm)
        gradient_step_list.append(gradient_step)
        critere_list.append(critere)

    if error_count > 0:
        print()
        print("Fletcher-Lemarechal algorithm failed to converge: {}".format(error_count))

    ##### Optimization results
    critere_opt = critere
    gradient_opt = gradient
    x_opt = x
    time_cpu = process_time() - time_start

    print()
    print('Iterations        :', k)
    print('CPU time          :', time_cpu)
    print('Optimal criterion :', critere_opt)
    print('Gradient norm     :', norm(gradient_opt))

    # Convergence visualization
    Visualg(gradient_norm_list, gradient_step_list, critere_list)

    return critere_opt, gradient_opt, x_opt
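# The W update above is the standard BFGS inverse-Hessian recursion
#   W+ = (I - s y^T / y^T s) W (I - y s^T / y^T s) + s s^T / y^T s
# with s = delta_x and y = delta_g; its defining property is the secant
# condition W+ y = s. A quick numerical check of that identity on random
# data (self-contained sketch, independent of the Oracle/Wolfe helpers):
import numpy as np

n = 5
rng = np.random.default_rng(0)
s = rng.standard_normal(n)   # delta_x: the step taken
y = rng.standard_normal(n)   # delta_g: the change in gradient
W = np.eye(n)                # previous inverse-Hessian estimate

rho = 1.0 / np.vdot(y, s)
M = np.eye(n) - rho * np.outer(s, y)           # delta_mat_1 in the code above
W_next = M @ W @ M.T + rho * np.outer(s, s)    # the BFGS update

# The updated matrix maps the gradient change back onto the step.
print(np.allclose(W_next @ y, s))  # True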
                    data_filename=data_dir,
                    batch_size=batch_size,
                    sequence_length=sequence_length,
                    validation_split=validation_split,
                    fake_batch_size=discriminator_pre_training_fake_batch_size,
                    seed=seed,
                    data_type="val",
                    use_word_vectors=use_word_vectors)

# Initialize Models
oracle = Oracle(train_data_loader=train_dl,
                validation_data_loader=val_dl,
                units=oracle_hidden_units,
                leaky_relu_alpha=oracle_leaky_relu_alpha,
                num_layers=oracle_layers,
                opt=oracle_optimizer,
                dropout_keep_prob=oracle_dropout_keep_prob,
                l2_reg_lambda=oracle_l2_regularization_lambda,
                sequence_length=sequence_length,
                loss=oracle_loss,
                metrics=oracle_metrics)

gen = Generator(train_data_loader=train_dl,
                validation_data_loader=val_dl,
                units=generator_hidden_units,
                leaky_relu_alpha=generator_leaky_relu_alpha,
                num_layers=generator_layers,
                opt=generator_optimizer,
                dropout_keep_prob=generator_dropout_keep_prob,
                l2_reg_lambda=generator_l2_regularization_lambda,
                sequence_length=sequence_length,
                loss=generator_loss,
class Loader(): #class variables m_logger = "" #database objects m_oracle_db = "" m_netezza_db = "" def __init__(self, configFile, tradeDate, debugFlag): """ Purpose: Constructor :param self: class object itself :param configFile: Configuration file to use """ # Initialize m_logger object from class Logger and add Header to the log, using addGenericInfo function self.m_logger = Logger(logging.INFO, configFile, tradeDate) self.m_logger.addFileHandler(logging.DEBUG) self.m_logger.addGenericInfo(__file__) self.tradeDate = tradeDate self.debugFlag = debugFlag self.configFile = configFile try: # Get configuration to a dictionary self.m_configDict = configuration(self.configFile, True).m_dictionary #Initialize Oracle instance along with connection self.m_oracle_db = Oracle(self.m_configDict, self.m_logger) except Exception as exp: # An exception occurred self.m_logger.error("Unable to initialize the configuration " + str(exp)) print("ERROR: Unable to initialize the configuration for logger " + str(exp)) sys.exit(1) def readMktConfigFile(self, mktConfigFile): """ Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference :param mktConfigFile: :return: """ try: self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary except Exception as exp: # An exception occurred self.m_logger.error("Unable to initialize the configuration for logger " + str(exp)) print("ERROR: Unable to initialize the configuration for logger " + str(exp)) sys.exit(1) def chkActiveLoads(self): """ Purpose - To check the count of active Active loads happening at a given point :param None: None at this point :return: """ try: if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y': localActiveLoadMax = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"] localActiveLoadWaitTime = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"] localActiveLoadMaxWaitTime = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"] mySql = "" myParams = "" mySql = self.m_configDict["SQL"]["get_active_loads"] activeFlag=1 totalActiveWaitTime=0 while activeFlag: returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "mySql = ", mySql print "returnStr = ", returnStr print "chkActiveLoads - Active Loads value = ", int(returnStr[1].strip()) if returnStr[0] != '0': self.m_logger.error("Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) #Check if actual active loads is <= configured active loads. If so, return out of the fn if int(returnStr[1].strip()) <= localActiveLoadMax: activeFlag=0 return 0 #Sleep for time defined by configured value for "active_load_wait_time" time.sleep(localActiveLoadWaitTime) totalActiveWaitTime += localActiveLoadWaitTime #Check if actual Total wait time is > configured total wait time. If so, throw an error and exit if totalActiveWaitTime > localActiveMaxWaitTime: self.m_logger.error("In Fn chkActiveLoads. Total Actual Wait Time exceeds the configured value active_load_max_wait_time. Either cleanup orphaned loads or increase the either active_load_max or active_load_max_wait_time. 
totalActiveWaitTime = " + str(totalActiveWaitTime) + " localActiveMaxWaitTime=" + str(localActiveMaxWaitTime)) return 1 else: return 0 #Return failure return 1 except Exception as exp: self.m_logger.error("Failure in chkActiveLoads process for file with the error " + str(exp)) sys.exit(1) def chkRaceStatus(self): """ Purpose - To check if a load is already running for the given dataset :param None: None at this point :return: """ try: if self.m_mktConfigDict["RACE"]["race_status_check_flag"] == 'Y': localRaceStatusWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_wait_time"]) localRaceStatusMaxWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_max_wait_time"]) mySql = "" #myParams = {"datasetName":self.datasetName} tempSql = self.m_configDict["SQL"]["get_race_status"] myParamsDict = { 'datasetName' : self.datasetName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) raceFlag=1 totalRaceStatusWaitTime=0 while raceFlag: returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr print "chkRaceStatus - ReturnCode = ", int(returnStr[1].strip()) if returnStr[0] != '0': self.m_logger.error("Unable to get race status using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) #Check if the load for this dataset is already running. If not, exit out of the function with normal return value if int(returnStr[1].strip()) <= 1: raceFlag=0 return 0 #Check if actual Total wait time is > configured total wait time. If so, throw an error and exit #Sleep for time defined by configured value for "active_load_wait_time" time.sleep(localRaceStatusWaitTime) #time.sleep(90) totalRaceStatusWaitTime += localRaceStatusWaitTime if self.debugFlag: print "totalRaceStatusWaitTime = ", totalRaceStatusWaitTime, "localRaceStatusWaitTime =", localRaceStatusWaitTime if totalRaceStatusWaitTime > localRaceStatusMaxWaitTime: self.m_logger.error("In Fn chkRaceStatusLoads. Total Actual Wait Time exceeds the configured value race_status_max_wait_time. Either check if the Dataset is getting loaded or increase the either active_load_max or active_load_max_wait_time. totalRaceStatusWaitTime = " + str(totalRaceStatusWaitTime) + " localRaceStatusMaxWaitTime=" + str(localRaceStatusMaxWaitTime)) return 1 else: return 0 #Return failure return 1 except Exception as exp: self.m_logger.error("Failure in chkRaceStatus process for file with the error " + str(exp)) sys.exit(1) #def loadData(self,localDataFile, localFileID, localFileIDQueue, localDBFlag): def loadData(self,localDataFile, localFileID, localFileIDQueue): """ Purpose - To load the given datafile to the S3 bucket specified in the global mktConfigFile :param localDataFile: Source datafile to be uploaded to S3 :param localFileID: Internal File ID assigned to the source datafile :param localFileIDQueue: Queue in which, results of the operation is stored :return: """ try: localDBFlag=1 if self.debugFlag: print "Inside loadData function" if localDBFlag: raceStatusReturnValue=self.chkRaceStatus() if self.debugFlag: print "raceStatusReturnValue=", raceStatusReturnValue if raceStatusReturnValue: self.m_logger.error("Failure value returned by chkRaceStatus fn. 
Return value = " + str(raceStatusReturnValue)) localFileIDQueue.put((localFileID, raceStatusReturnValue)) return 1 activeLoadsReturnValue=self.chkActiveLoads() if activeLoadsReturnValue: self.m_logger.error("Failure value returned by chkRaceStatus fn. Return value = " + str(activeLoadsReturnValue)) localFileIDQueue.put((localFileID, raceStatusReturnValue)) return 1 processID = os.getpid() hostName = socket.gethostname() #Insert Process status into Oracle db #DB_CALL - sp_ddy_insert_process_status(RUNID, FILE_ID, etc) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] pStatus = 'P' pComment = 'Load started' myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "status":pStatus , "lcomment":pComment, "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) #Call s3.gettoken to get the token and establish connection sourceFileWthPath = localDataFile #Commented the following lines to move getToken outside parallel thread # Keep it until we test all 93 loads and remove it #s3object = S3(self.mktConfigFile, self.m_logger) #s3object.getToken() ##sourceFileWthPath = s3object.m_configfile["S3"]["source_file"] targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(sourceFileWthPath)) targetBucket = self.s3object.m_configFile["S3"]["bucket"] encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"] local_aws_retries = int(self.m_mktConfigDict["ENV"]["aws_retries"]) if self.debugFlag: print("sourceFileWthPath =", sourceFileWthPath) print("targetFileWthPath =", targetFileWthPath) print("targetBucket =", targetBucket) print("encryptKeyFlag =", encryptKeyFlag) print("local_aws_retries =", local_aws_retries) init_count = 0 while (init_count < local_aws_retries): loadReturnValue = 0 #Call s3.dataUpload to load the data (single part load) loadReturnValue = self.s3object.loadDataSinglePart(sourceFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) if self.debugFlag: print "loadReturnValue = ", loadReturnValue if loadReturnValue == 0: pStatus = 'S' pComment = 'Load completed' break else: pStatus = 'F' pComment = 'Load failed' init_count += 1 if localDBFlag: #Call DB to insert 'S' or 'F' in tb_ddy_process_status #localFileIDQueue.put((localFileID, loadReturnValue)) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "status":pStatus , "lcomment":pComment, "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", 
mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) localFileIDQueue.put((localFileID,loadReturnValue)) else: return loadReturnValue except Exception as exp: self.m_logger.error("Failure in loadData process for file with the error " + str(exp)) if localDBFlag: localFileIDQueue.put(localFileID, 1) else: return 1 def createFinraManifestFile(self, manifestFile): try: # Read Manifest file to get info like total rows, total size & other details to populate the done file for FINRA if self.debugFlag: print "Inside createFinraManifestFile fuction" with open(manifestFile,"r") as fh: self.totalRows = 0 self.totalSize = 0 self.fileCount = 0 self.fileDict = {} for data in fh: data.rstrip("\n") mylist = [] self.fileCount +=1 mylist = data.split("|") self.fileDict[self.fileCount] = [mylist[0],os.path.basename(mylist[1]),int(mylist[2]),int(mylist[3])] self.totalRows += int(mylist[3]) self.totalSize += int(mylist[2]) if self.debugFlag: print "self.fileDict = ", self.fileDict except Exception as exp: self.m_logger.error("Failed while processing readManifest with Error = " + str(exp)) return 1 try: #Use self.defautltsFile which is populated from the db later. No need to get it from config file self.defaultsFileWthPath = self.m_mktConfigDict["DATASET"]["defaults_dir"] + "/" + self.defaultsFile with open(self.defaultsFileWthPath,"r") as fh: self.defaultsDict = {} self.defaultsCount = 0 for data in fh: data.rstrip('\n') self.defaultsCount +=1 self.defaultsDict[self.defaultsCount]=data if self.debugFlag: print "After Defaults, self.fileDict = ", self.fileDict except Exception as exp: self.m_logger.error("Failed while processing defaults file " + self.defaultsFileWthPath + " with Error = " + str(exp)) return 1 try: self.finraManifestFile = self.m_mktConfigDict["ENV"]["donefile_dir"] + "/" + os.path.basename(manifestFile) + ".done" with open(self.finraManifestFile,"w") as finraMnFH: finraMnFH.write("# AWS RunID : {}\n".format(str(self.runID))) finraMnFH.write("# Dataset : {0} , TradeDate : {1}\n".format(str(self.datasetName),str(self.tradeDate))) finraMnFH.write("total_compressed={}\n".format(self.totalSize)) finraMnFH.write("total_rows={}\n".format(self.totalRows)) finraMnFH.write("no of files={}\n".format(self.fileCount)) for key,val in self.fileDict.items(): finraMnFH.write("file_{0}={1}\n".format(str(key),val[1])) finraMnFH.write("file_{0}_rows={1}\n".format(str(key),val[3])) finraMnFH.write("# Data Attributes\n") for key,val in self.defaultsDict.items(): finraMnFH.write("{0}".format(str(val))) return 0 except Exception as exp: self.m_logger.error("Failed while creating AWS Done file " + self.finraManifestFile + " with Error = " + str(exp)) return 1 def processLoader(self, manifestFile, datasetName, tidalRunID): """ Purpose - Function responsible for reading the manifest file, get market name, call multiprocess load and other db calls :param manifestFile: Manifest File :param tradeDate: Trade Date :param tidalRunID: Tidal Run ID :return: """ try: # Read the manifest filename and get the suffix i.e. 
datasetname # Assumption - Manifest file format - manifest.<datasetName>.<tradeDate>.<tidalRunID> # Program will break otherwise self.datasetName = datasetName self.tidalRunID = tidalRunID # DB_CALL # Make database call sp_ddy_get_market_info(datasetname) and get market info mktName = '' self.manifestFile = self.m_configDict["ENV"]["manifestfile_dir"] + "/" + manifestFile ##Validate Manifest file if not os.path.isfile(self.manifestFile): self.m_logger.error("Invalid manifest file " + self.manifestFile) sys.exit(1) if self.debugFlag: print "Inside processLoader" print "DatasetName = ", self.datasetName print "ManifestFile = ", manifestFile print "Self ManifestFile = ", self.manifestFile print "TidalRunID = ", self.tidalRunID print "DebugFlag = ", self.debugFlag print "confDict = ", self.m_configDict # Enable this one the proc to get mkt name and default file are ready and test it tempSql = self.m_configDict["SQL"]["get_mkt_defaults_filename"] myParamsDict = { 'datasetName' : self.datasetName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] == '0': mktName = returnStr[2].strip() self.defaultsFile = returnStr[3].strip() else: self.m_logger.error("Unable to get market info from the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) if self.debugFlag: print "MktName from DB = ", mktName print "Defaults = ", self.defaultsFile #Build the string for mktConfigFile based on mktName and configFile info self.mktConfigFile = os.path.dirname(self.configFile) + '/' + os.path.basename(self.configFile).split('.',1)[0].strip() + '_' + mktName.lower() + '.' + os.path.basename(self.configFile).split('.',1)[1].strip() if self.debugFlag: print("mktConfigFile = ", self.mktConfigFile) #Validate Manifest file is a valid file if not os.path.isfile(self.mktConfigFile): self.m_logger.error("Invalid market manifest file " + self.mktConfigFile) sys.exit(1) #May not need the following section, as we send mktConfigFile to other function not the dictionary self.m_mktConfigDict. 
Need to remove it after finishing the loadData part fully # Read Market specific config file and store it in a specific dictionary self.readMktConfigFile(self.mktConfigFile) if self.debugFlag: print("m_mktConfigDict=",self.m_mktConfigDict) # Read the contents of manifest - dataFileNames into a list - Will validate the datafiles as well localManifest = Manifest() manifestDelim = self.m_configDict["ENV"]["manifest_delim"] manifestFileList = localManifest.readManifest(self.manifestFile, self.m_logger, manifestDelim, self.debugFlag) # Get RunID self.runID = generate_runId() if self.debugFlag: print("RunID = ", self.runID) #print("manifestFileList = ", manifestFileList) #Call Oracle fn to insert status 'P' into TB_DDY_DATASET_TRANS with RUNID etc #DB_CALL # Make database call sp_ddy_insert_dataset_trans and insert data that process started mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_dataset"] pStatus = 'P' myParamsDict = {'datasetName':self.datasetName, 'runID': str(self.runID), 'tDate':str(self.tradeDate), 'status': pStatus, 'tidalRunID':str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) # Insert Manifest data in db and call multiprocessing s3 loader process. Shd we add RUN_ID to manifest table #For each datafile, generate fileID and call loadData fn using multiprocess to load data into AWS for dataRecord in manifestFileList: mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_manifest"] myParamsDict = {"datasetName":self.datasetName, "runID": str(self.runID), "tDate":str(self.tradeDate), "dataFileName":dataRecord[1], "manifestFileName":manifestFile , "noOfRecords": str(dataRecord[2]), "fileSize":str(dataRecord[3]), "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) #if self.debugFlag: #print "tempSql = ", tempSql #print "myParamsDict = ", myParamsDict #print "mySql = ", mySql #print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put manifest info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) sys.exit(1) # Initialize S3 object and get FINRA cloud service token and establish s3 session self.s3object = S3(self.mktConfigFile, self.m_logger, self.debugFlag) self.s3object.getToken() # Get Active load values from config file localActiveLoadCheckFlag = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] process_count = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"]) #localActiveLoadMax = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"]) #localActiveLoadWaitTime = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"] #localActiveLoadMaxWaitTime = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"] if self.debugFlag: print("localActiveLoadMax = ", process_count) print("len(manifestFileList) = ", len(manifestFileList)) pool = multiprocessing.Pool(processes=process_count) m = multiprocessing.Manager() fileIDQueue = m.Queue() #dbFlag=1 sendCounter = 0 doneCounter = 0 fileID=1 failureFlag=0 print manifestFileList while doneCounter < len(manifestFileList): print "Inside while doneCounter = ", doneCounter print "doneCounter = ", doneCounter, "sendCounter = ", sendCounter while sendCounter < len(manifestFileList) and sendCounter - doneCounter < process_count: tmpDataFileName = manifestFileList[sendCounter][1] print "Inside sendCounter, manifestFileList[sendCounter] = ", manifestFileList[sendCounter], "manifestFileList[sendCounter][1] = ", manifestFileList[sendCounter][1] #finraManifestLoadStatus=self.loadData(self.finraManifestFile ,fileID, fileIDQueue, dbFlag) # #processHandle = Process(target=Loader.loadData, args=(self, dataRecord[1],fileID, fileIDQueue, dbFlag)) #def loadData(self,localDataFile, localFileID, localFileIDQueue, localDBFlag): #results = mpPool.apply_async(Loader.loadData, (self, manifestFileList[sendCounter][1], fileID, fileIDQueue, dbFlag) ) #results = mpPool.apply_async(self.loadData, (tmpDataFileName, fileID, fileIDQueue, dbFlag)) results = pool.apply_async(self.loadData, args=(tmpDataFileName, fileID, fileIDQueue)) #results = pool.apply_async(Loader.loadData, (tmpDataFileName, fileID, fileIDQueue)) print "After pool apply_async" time.sleep(2) sendCounter += 1 fileID += 1 while not fileIDQueue.empty(): # process completed results as they arrive print "Inside Queue" time.sleep(3) qFileID, qResult = fileIDQueue.get() if qResult: failureFlag=1 if self.debugFlag: print("qFileID = ", qFileID, "qResult = ", qResult) doneCounter += 1 if failureFlag: break time.sleep(2) # #for dataRecord in manifestFileList: # #if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y': # #processHandle = Process(target=Loader.loadData, args=(self, dataRecord[1],fileID, fileIDQueue, dbFlag)) # #processHandle.start() # #procs.append(processHandle) # #fileID += 1 # # #for p in procs: # #p.join() # # #Without sleep the queue is unreliable and do not return the expected values. Fixed with procs.append function. 
No need for sleep anymore # #time.sleep(2) # ## failureFlag=0 # while not fileIDQueue.empty(): # qFileID, qResult = fileIDQueue.get() # if qResult: # failureFlag=1 # if self.debugFlag: # print("Inside fileIDQueue while") # print("qFileID = ", qFileID, "qResult = ", qResult) # # if self.debugFlag: # print "Failure Flag = ", failureFlag if failureFlag: pStatus = 'F' else: pStatus = 'S' """ #Generate FINRA Manifest file and Push it to AWS """ # Call Divakar's generate done file function returnValue = self.createFinraManifestFile(self.manifestFile) if self.debugFlag: print "Post createFinraManifestFile fn - return value= ", returnValue if returnValue: self.m_logger.error("Unable to generate done file. Please fix the issue the re-run the load") #sys.exit(1) failureFlag=1 pStatus = 'F' else: dbFlag=0 fileID=0 # Call the loader function with the manifest file finraManifestLoadStatus=0 finraManifestLoadStatus=self.loadData(self.finraManifestFile ,fileID, fileIDQueue, dbFlag) if finraManifestLoadStatus: pStatus = 'F' self.m_logger.error("Unable to load finra manifest file ") #Call Oracle fn to insert status 'S' into TB_DDY_DATASET_TRANS with RUNID etc #DB_CALL # Make database call sp_ddy_insert_dataset_trans and insert data based on Failure or Success mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_dataset"] myParamsDict = {"datasetName":self.datasetName, "runID": str(self.runID), "tDate":str(self.tradeDate), "status": pStatus, "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) if failureFlag: self.m_logger.error("Load failed") sys.exit(1) except Exception as e: self.m_logger.error("ProcessLoader failed with error " + str(e)) sys.exit(1)
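# This Loader drives uploads through a multiprocessing.Pool plus a Manager
# queue: a plain multiprocessing.Queue cannot be handed to Pool workers, so
# the results channel comes from multiprocessing.Manager(). A minimal sketch
# of that combination, with load_one standing in for Loader.loadData:
import multiprocessing

def load_one(dataFile, fileID, fileIDQueue):
    # Stand-in for loadData: push (fileID, returnCode); 0 means success.
    fileIDQueue.put((fileID, 0))

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=2)
    m = multiprocessing.Manager()
    fileIDQueue = m.Queue()  # Manager proxy: safe to share with Pool workers

    files = ['f1.gz', 'f2.gz', 'f3.gz']
    for fileID, dataFile in enumerate(files, start=1):
        pool.apply_async(load_one, args=(dataFile, fileID, fileIDQueue))
    pool.close()
    pool.join()

    while not fileIDQueue.empty():
        print(fileIDQueue.get())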
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
#
from SkunkWeb import Configuration, LogObj, ServiceRegistry
from requestHandler.requestHandler import CleanupRequest
import Oracle

ServiceRegistry.registerService('oracle')

Configuration.mergeDefaults(
    OracleConnectStrings = {},
    OracleProcedurePackageLists = {}
)

for u, connectString in Configuration.OracleConnectStrings.items():
    LogObj.DEBUG(ServiceRegistry.ORACLE, 'initializing user %s' % u)
    Oracle.initUser(u, connectString)

# OracleProcedurePackageLists is a dictionary, so iterate over items();
# iterating it directly would yield bare keys and fail to unpack.
for u, pkglist in Configuration.OracleProcedurePackageLists.items():
    Oracle.loadSignatures(u, pkglist, LogObj.LOG,
                          lambda x: LogObj.DEBUG(ServiceRegistry.ORACLE, x))

def rollbackConnection(*args):
    for v in Oracle._connections.values():
        v.rollback()

CleanupRequest.addFunction(rollbackConnection)
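# Given the mergeDefaults above, a deployment would populate both settings in
# its SkunkWeb configuration before this service initializes. A hypothetical
# entry -- user names, connect strings, and package names are placeholders:
OracleConnectStrings = {'report_user': 'report_user/secret@PRODDB'}
OracleProcedurePackageLists = {'report_user': ['pkg_reports', 'pkg_exports']}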
def refineUnknown(token):
    if not Oracle.isWord(token):
        return greedy(0, token)
    return [token]
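# refineUnknown defers to a greedy(0, token) helper that is not shown here.
# A plausible reading, given the Oracle.isWord dictionary test, is a
# left-to-right segmentation that peels off the longest known prefix. A
# hypothetical sketch under that assumption -- greedy's real definition and
# the word list are illustrative, not the original implementation:
WORDS = {'data', 'base', 'load', 'er'}

def is_word(s):
    return s.lower() in WORDS

def greedy(start, token):
    # Peel the longest dictionary prefix; if nothing matches, keep the
    # remainder whole, mirroring refineUnknown's all-or-nothing fallback.
    if start >= len(token):
        return []
    for end in range(len(token), start, -1):
        if is_word(token[start:end]):
            return [token[start:end]] + greedy(end, token)
    return [token[start:]]

print(greedy(0, 'dataloader'))  # ['data', 'load', 'er']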
class Loader(): #class variables m_logger = "" #database objects m_oracle_db = "" m_netezza_db = "" def __init__(self, configFile, tradeDate, debugFlag, datasetName): """ Purpose: Constructor :param self: class object itself :param configFile: Configuration file to use """ # Initialize m_logger object from class Logger and add Header to the log, using addGenericInfo function self.m_logger = Logger(logging.INFO, configFile, tradeDate, datasetName.lower()) self.m_logger.addFileHandler(logging.DEBUG) self.m_logger.addGenericInfo(__file__) self.tradeDate = tradeDate self.debugFlag = debugFlag self.configFile = configFile try: # Get configuration to a dictionary self.m_configDict = configuration(self.configFile, True).m_dictionary #Initialize Oracle instance along with connection self.m_oracle_db = Oracle(self.m_configDict, self.m_logger) except Exception as exp: # An exception occurred self.m_logger.error("Unable to initialize the configuration " + str(exp)) print("ERROR: Unable to initialize the configuration for logger " + str(exp)) sys.exit(1) def readMktConfigFile(self, mktConfigFile): """ Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference :param mktConfigFile: :return: """ try: self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary except Exception as exp: # An exception occurred self.m_logger.error("Unable to initialize the configuration for logger " + str(exp)) print("ERROR: Unable to initialize the configuration for logger " + str(exp)) sys.exit(1) def chkActiveLoads(self): """ Purpose - To check the count of active Active loads happening at a given point :param None: None at this point :return: """ try: if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y': localActiveLoadMax = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"] localActiveLoadWaitTime = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"] localActiveLoadMaxWaitTime = self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"] mySql = "" myParams = "" mySql = self.m_configDict["SQL"]["get_active_loads"] activeFlag=1 totalActiveWaitTime=0 while activeFlag: returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "mySql = ", mySql print "returnStr = ", returnStr print "chkActiveLoads - Active Loads value = ", int(returnStr[1].strip()) if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1]) return 1 #Check if actual active loads is <= configured active loads. If so, return out of the fn if int(returnStr[1].strip()) <= localActiveLoadMax: activeFlag=0 return 0 #Sleep for time defined by configured value for "active_load_wait_time" time.sleep(localActiveLoadWaitTime) totalActiveWaitTime += localActiveLoadWaitTime #Check if actual Total wait time is > configured total wait time. If so, throw an error and exit if totalActiveWaitTime > localActiveMaxWaitTime: self.m_logger.error("In Fn chkActiveLoads. Total Actual Wait Time exceeds the configured value active_load_max_wait_time. Either cleanup orphaned loads or increase the either active_load_max or active_load_max_wait_time. 
totalActiveWaitTime = " + str(totalActiveWaitTime) + " localActiveMaxWaitTime=" + str(localActiveMaxWaitTime)) return 1 else: return 0 #Return failure return 1 except Exception as exp: self.m_logger.error("Failure in chkActiveLoads process for file with the error " + str(exp)) return 1 def chkRaceStatus(self): """ Purpose - To check if a load is already running for the given dataset :param None: None at this point :return: """ try: if self.m_mktConfigDict["RACE"]["race_status_check_flag"] == 'Y': localRaceStatusWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_wait_time"]) localRaceStatusMaxWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_max_wait_time"]) mySql = "" #myParams = {"datasetName":self.datasetName} tempSql = self.m_configDict["SQL"]["get_race_status"] myParamsDict = { 'datasetName' : self.datasetName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) raceFlag=1 totalRaceStatusWaitTime=0 while raceFlag: returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr print "chkRaceStatus - ReturnCode = ", int(returnStr[1].strip()) if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to get race status using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to get race status using sql " + mySql + ". Error = " + returnStr[1]) return 1 #Check if the load for this dataset is already running. If not, exit out of the function with normal return value if int(returnStr[1].strip()) <= 1: raceFlag=0 return 0 #Check if actual Total wait time is > configured total wait time. If so, throw an error and exit #Sleep for time defined by configured value for "active_load_wait_time" time.sleep(localRaceStatusWaitTime) #time.sleep(90) totalRaceStatusWaitTime += localRaceStatusWaitTime if self.debugFlag: print "totalRaceStatusWaitTime = ", totalRaceStatusWaitTime, "localRaceStatusWaitTime =", localRaceStatusWaitTime if totalRaceStatusWaitTime > localRaceStatusMaxWaitTime: self.m_logger.error("In Fn chkRaceStatusLoads. Total Actual Wait Time exceeds the configured value race_status_max_wait_time. Either check if the Dataset is getting loaded or increase the either active_load_max or active_load_max_wait_time. 
totalRaceStatusWaitTime = " + str(totalRaceStatusWaitTime) + " localRaceStatusMaxWaitTime=" + str(localRaceStatusMaxWaitTime)) return 1 else: return 0 #Return failure return 1 except Exception as exp: self.m_logger.error("Failure in chkRaceStatus process for file with the error " + str(exp)) return 1 def loadData(self,localDataFile, localFileID, localFileIDQueue, localDBFlag, dataFileFlag, localRecordCount): """ Purpose - To load the given datafile to the S3 bucket specified in the global mktConfigFile :param localDataFile: Source datafile to be uploaded to S3 :param localFileID: Internal File ID assigned to the source datafile :param localFileIDQueue: Queue in which, results of the operation is stored :return: """ try: if self.debugFlag: print "Inside loadData function" if localDBFlag: raceStatusReturnValue=self.chkRaceStatus() if self.debugFlag: print "raceStatusReturnValue=", raceStatusReturnValue if raceStatusReturnValue: self.m_logger.error("Failure value returned by chkRaceStatus fn. Return value = " + str(raceStatusReturnValue)) localFileIDQueue.put((localFileID, raceStatusReturnValue)) return 1 activeLoadsReturnValue=self.chkActiveLoads() if activeLoadsReturnValue: self.m_logger.error("Failure value returned by chkActiveLoads fn. Return value = " + str(activeLoadsReturnValue)) localFileIDQueue.put((localFileID, activeLoadsReturnValue)) return 1 processID = os.getpid() hostName = socket.gethostname() #Insert Process status into Oracle db #DB_CALL - sp_ddy_insert_process_status(RUNID, FILE_ID, etc) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] pStatus = 'P' pComment = 'Load started' myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "status":pStatus , "lcomment":pComment, "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) localFileIDQueue.put((localFileID, 1)) return 1 #Call s3.gettoken to get the token and establish connection sourceFileWthPath = localDataFile #Commented the following lines to move getToken outside parallel thread # Keep it until we test all 93 loads and remove it #s3object = S3(self.mktConfigFile, self.m_logger) #s3object.getToken() ##sourceFileWthPath = s3object.m_configfile["S3"]["source_file"] targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(sourceFileWthPath)) targetBucket = self.s3object.m_configFile["S3"]["bucket"] encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"] local_aws_retries = int(self.m_mktConfigDict["ENV"]["aws_retries"]) if self.debugFlag: print("sourceFileWthPath =", sourceFileWthPath) print("targetFileWthPath =", targetFileWthPath) print("targetBucket =", targetBucket) print("encryptKeyFlag =", encryptKeyFlag) print("local_aws_retries =", local_aws_retries) #Get size of the file sourceSize = os.stat(sourceFileWthPath).st_size multiPartFlag=False GBFACTOR = float(1<<30) #Check if the given file is greater than 4.5 GB. Limit on AWS > 5 GB on single part upload if float(sourceSize/GBFACTOR) > 4.5: multiPartFlag=True init_count = 0 self.m_logger.info("Started Xfer of Source File " + sourceFileWthPath + " with size " + str(sourceSize) + " to target " + targetFileWthPath) while (init_count < local_aws_retries): loadReturnValue = 0 #Call s3.dataUpload to load the data (single part load) if multiPartFlag: if self.debugFlag: print "Inside Multipart load. File size = ", sourceSize loadReturnValue = self.s3object.loadDataMultiPart(sourceFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag, self.bytes_per_chunk) else: if self.debugFlag: print "Inside Singlepart load. File size = ", sourceSize loadReturnValue = self.s3object.loadDataSinglePart(sourceFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) if self.debugFlag: print "loadReturnValue = ", loadReturnValue #Check if we are sending data file. 
If so, we need to generate a complete file and send it along if (dataFileFlag) and (loadReturnValue == 0): completeFile = localDataFile.split(".")[0] + self.compFilePattern sourceFileWthPath = self.m_mktConfigDict["ENV"]["donefile_dir"] + "/" + os.path.basename(completeFile) if self.debugFlag: print("completeFile =", completeFile) print("sourceFileWthPath =", sourceFileWthPath) with open(sourceFileWthPath,"w") as finraMnFH: finraMnFH.write("{0},{1}\n".format(str(self.tradeDate),str(localRecordCount))) targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(sourceFileWthPath)) sourceSize = os.stat(sourceFileWthPath).st_size self.m_logger.info("Started Xfer of complete file " + sourceFileWthPath + " with size " + str(sourceSize) + " to target " + targetFileWthPath) loadReturnValueCompleteFile = self.s3object.loadDataSinglePart(sourceFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) if loadReturnValueCompleteFile: loadReturnValue = 1 #End of loadReturnValueCompleteFile If if loadReturnValue == 0: pStatus = 'S' pComment = 'Load completed' break else: pStatus = 'F' pComment = 'Load failed' init_count += 1 if localDBFlag: #Call DB to insert 'S' or 'F' in tb_ddy_process_status #localFileIDQueue.put((localFileID, loadReturnValue)) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "status":pStatus , "lcomment":pComment, "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "localFileID = ", localFileID print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to put process status info into the database using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) localFileIDQueue.put((localFileID, 1)) return 1 if self.debugFlag: print "localFileID = ", localFileID print "loadReturnValue = ", loadReturnValue print 'Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss localFileIDQueue.put((localFileID,loadReturnValue)) else: return loadReturnValue except Exception as exp: self.m_logger.error("Failure in loadData process for file with the error " + str(exp)) if localDBFlag: localFileIDQueue.put(localFileID, 1) else: return 1 def createFinraManifestFile(self, manifestFile): try: # Read Manifest file to get info like total rows, total size & other details to populate the done file for FINRA if self.debugFlag: print "Inside createFinraManifestFile fuction" with open(manifestFile,"r") as fh: self.totalRows = 0 self.totalSize = 0 self.fileCount = 0 self.fileDict = {} for data in fh: data.rstrip("\n") # Exclude any entry with the pattern "start-of-day" if self.sodFilePatternSearch in data: continue mylist = [] self.fileCount +=1 mylist = data.split("|") self.fileDict[self.fileCount] = [mylist[0],os.path.basename(mylist[1]),int(mylist[2]),int(mylist[3])] self.totalRows += int(mylist[3]) self.totalSize += int(mylist[2]) if self.debugFlag: print "self.fileDict = ", self.fileDict except Exception as exp: self.m_logger.error("Failed while processing readManifest with Error = " + str(exp)) return 1 try: #Use self.defautltsFile which is populated from the db later. No need to get it from config file self.defaultsFileWthPath = self.m_mktConfigDict["DATASET"]["defaults_dir"] + "/" + self.defaultsFile with open(self.defaultsFileWthPath,"r") as fh: self.defaultsDict = {} self.defaultsCount = 0 for data in fh: data.rstrip('\n') self.defaultsCount +=1 self.defaultsDict[self.defaultsCount]=data if self.debugFlag: print "After Defaults, self.fileDict = ", self.fileDict except Exception as exp: self.m_logger.error("Failed while processing defaults file " + self.defaultsFileWthPath + " with Error = " + str(exp)) return 1 try: # Not needed as the naming convention is changed #self.finraManifestFile = self.m_mktConfigDict["ENV"]["donefile_dir"] + "/" + os.path.basename(manifestFile) + ".done" #Changing the EOD naming convention per Finra's requirement if self.debugFlag: print "self.eodFilePattern = ", self.eodFilePattern myParamsDict = {'datasetName':self.datasetName.lower(), 'tradeDate':str(self.tradeDate)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) eodFileName = re.sub( tempGrp, lambda m:myParamsDict[m.group()], self.eodFilePattern) if self.debugFlag: print "eodFileName = ", eodFileName self.finraManifestFile = self.m_mktConfigDict["ENV"]["donefile_dir"] + "/" + eodFileName if self.debugFlag: print "self.finraManifestFile = ", self.finraManifestFile with open(self.finraManifestFile,"w") as finraMnFH: finraMnFH.write("# AWS RunID : {}\n".format(str(self.runID))) finraMnFH.write("# Dataset : {0} , TradeDate : {1}\n".format(str(self.datasetName),str(self.tradeDate))) finraMnFH.write("total_compressed={}\n".format(self.totalSize)) finraMnFH.write("total_rows={}\n".format(self.totalRows)) finraMnFH.write("no of files={}\n".format(self.fileCount)) for key,val in self.fileDict.items(): finraMnFH.write("file_{0}={1}\n".format(str(key),val[1])) finraMnFH.write("file_{0}_rows={1}\n".format(str(key),val[3])) finraMnFH.write("# Data Attributes\n") for key,val in self.defaultsDict.items(): finraMnFH.write("{0}".format(str(val))) return 0 except Exception as exp: self.m_logger.error("Failed while creating AWS Done 
file " + self.finraManifestFile + " with Error = " + str(exp)) return 1 def processLoader(self, manifestFile, datasetName, tidalRunID): """ Purpose - Function responsible for reading the manifest file, get market name, call multiprocess load and other db calls :param manifestFile: Manifest File :param tradeDate: Trade Date :param tidalRunID: Tidal Run ID :return: """ try: # Read the manifest filename and get the suffix i.e. datasetname # Assumption - Manifest file format - manifest.<datasetName>.<tradeDate>.<tidalRunID> # Program will break otherwise self.datasetName = datasetName self.tidalRunID = tidalRunID # DB_CALL # Make database call sp_ddy_get_market_info(datasetname) and get market info mktName = '' self.manifestFile = self.m_configDict["ENV"]["manifestfile_dir"] + "/" + manifestFile ##Validate Manifest file if not os.path.isfile(self.manifestFile): self.m_logger.error("Invalid manifest file " + self.manifestFile) sys.exit(1) if self.debugFlag: print "Inside processLoader" print "DatasetName = ", self.datasetName print "ManifestFile = ", manifestFile print "Self ManifestFile = ", self.manifestFile print "TidalRunID = ", self.tidalRunID print "DebugFlag = ", self.debugFlag print "confDict = ", self.m_configDict # Enable this one the proc to get mkt name and default file are ready and test it tempSql = self.m_configDict["SQL"]["get_mkt_defaults_filename"] myParamsDict = { 'datasetName' : self.datasetName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] == '0': mktName = returnStr[2].strip() self.defaultsFile = returnStr[3].strip() else: self.m_logger.info("Retry after delay., Unable to get market info from the database using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] == '0': mktName = returnStr[2].strip() self.defaultsFile = returnStr[3].strip() else: self.m_logger.error("Unable to get market info from the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) if self.debugFlag: print "MktName from DB = ", mktName print "Defaults = ", self.defaultsFile #Build the string for mktConfigFile based on mktName and configFile info self.mktConfigFile = os.path.dirname(self.configFile) + '/' + os.path.basename(self.configFile).split('.',1)[0].strip() + '_' + mktName.lower() + '.' 
+ os.path.basename(self.configFile).split('.',1)[1].strip() if self.debugFlag: print("mktConfigFile = ", self.mktConfigFile) #Validate Manifest file is a valid file if not os.path.isfile(self.mktConfigFile): self.m_logger.error("Invalid market manifest file " + self.mktConfigFile) sys.exit(1) # Read Market specific config file and store it in a specific dictionary self.readMktConfigFile(self.mktConfigFile) if self.debugFlag: print("m_mktConfigDict=",self.m_mktConfigDict) # Read the contents of manifest - dataFileNames into a list - Will validate the datafiles as well localManifest = Manifest() manifestDelim = self.m_configDict["ENV"]["manifest_delim"] manifestFileList = localManifest.readManifest(self.manifestFile, self.m_logger, manifestDelim, self.debugFlag) # Get RunID self.runID = generate_runId() if self.debugFlag: print("RunID = ", self.runID) #print("manifestFileList = ", manifestFileList) #Call Oracle fn to insert status 'P' into TB_DDY_DATASET_TRANS with RUNID etc #DB_CALL # Make database call sp_ddy_insert_dataset_trans and insert data that process started mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_dataset"] pStatus = 'P' myParamsDict = {'datasetName':self.datasetName, 'runID': str(self.runID), 'tDate':str(self.tradeDate), 'status': pStatus, 'tidalRunID':str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) # Insert Manifest data in db and call multiprocessing s3 loader process. Shd we add RUN_ID to manifest table #For each datafile, generate fileID and call loadData fn using multiprocess to load data into AWS for dataRecord in manifestFileList: mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_manifest"] myParamsDict = {"datasetName":self.datasetName, "runID": str(self.runID), "tDate":str(self.tradeDate), "dataFileName":dataRecord[1], "manifestFileName":manifestFile , "noOfRecords": str(dataRecord[3]), "fileSize":str(dataRecord[2]), "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) #if self.debugFlag: #print "tempSql = ", tempSql #print "myParamsDict = ", myParamsDict #print "mySql = ", mySql #print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to put manifest info into the database using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to put manifest info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) sys.exit(1) # Initialize S3 object and get FINRA cloud service token and establish s3 session self.currentEpochTime = int(time.time()) self.s3object = S3(self.mktConfigFile, self.m_logger, self.debugFlag) self.s3object.getToken() if self.debugFlag: print "self.currentEpochTime = ", self.currentEpochTime process_count = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"]) fileID=1 fileIDQueue = Queue() dbFlag=1 procs = [] doneCounter = 0 sendCounter = 0 processFlag=0 failureFlag=0 #Get chunk size from config file for multipart uploads self.bytes_per_chunk = int(self.m_configDict["DATASET"]["bytes_per_chunk"]) self.sodFilePatternSearch = self.m_configDict["ddy"]["SOD_FILE_PATTERN_SEARCH"] # Following variables are used across the class. Hence, assigned to self variables self.eodFilePattern = self.m_configDict["ddy"]["EOD_FILE_PATTERN"] self.compFilePattern = self.m_configDict["ddy"]["COMP_FILE_PATTERN"] self.sodFileCheck = self.m_configDict["ddy"]["SOD_FILE_CHECK"].strip().upper() manifestListItems = len(manifestFileList) if self.debugFlag: print "bytes_per_chunk = ", self.bytes_per_chunk print "self.sodFilePatternSearch = ", self.sodFilePatternSearch print "self.eodFilePattern = ", self.eodFilePattern print "self.compFilePattern = ", self.compFilePattern print "self.sodFileCheck = ", self.sodFileCheck print "manifestListItems = ", manifestListItems dataFileFlag=False sodFileProcessedFlag=0 max_batches= int(math.ceil(float(len(manifestFileList))/process_count)) batch_count=0 while doneCounter < manifestListItems and failureFlag == 0 : while sendCounter < manifestListItems and sendCounter - doneCounter < process_count and failureFlag == 0: if self.sodFilePatternSearch in manifestFileList[sendCounter][1] and not sodFileProcessedFlag: dataFileFlag=False sodFileLoadStatus=self.loadData(manifestFileList[sendCounter][1] ,fileID, fileIDQueue, dbFlag, dataFileFlag, 0) if sodFileLoadStatus: self.m_logger.error("Unable to push Start of Day file to FINRA. Exiting.. ") sys.exit(1) sodFileProcessedFlag=1 sendCounter += 1 qFileID = 0 qRestult = 0 qFileID, qResult = fileIDQueue.get() doneCounter += 1 fileID += 1 else: if self.sodFileCheck == 'Y': if not sodFileProcessedFlag: self.m_logger.error("No Start of day file. Please add SOD file to the generate manifest. Exiting.. ") sys.exit(1) dataFileFlag=True if self.debugFlag: print "manifestFileList[sendCounter][1]", manifestFileList[sendCounter][1], "fileID = ", fileID processHandle = Process(target=Loader.loadData, args=(self, manifestFileList[sendCounter][1],fileID, fileIDQueue, dbFlag, dataFileFlag, manifestFileList[sendCounter][3])) processFlag=1 s3TimeoutTime = int(self.m_configDict["ddy"]["S3_TIMEOUT_TIME"]) if ((int(time.time()) - self.currentEpochTime) > s3TimeoutTime): self.currentEpochTime = int(time.time()) self.m_logger.info("Getting New Token for Batch : {0}, Max batches : {1}".format(batch_count,max_batches)) if self.debugFlag: print "Inside get new token - self.currentEpochTime = ", self.currentEpochTime self.s3object.getToken() threadDelayTime = int(self.m_configDict["ddy"]["THREAD_DELAY_TIME"]) time.sleep(threadDelayTime) processHandle.start() procs.append(processHandle) sendCounter += 1 fileID += 1 if processFlag and ( sendCounter - doneCounter == process_count or sendCounter == manifestListItems ) : batch_count += 1 self.m_logger.info("Waiting for Batch : {0} to complete. No of active workers : {2}. 
Max batches : {1}".format(batch_count,max_batches,sendCounter-doneCounter)) for p in procs: p.join() processFlag=0 if self.debugFlag: print "Before fileIDQueue - ProcessFlag = ", processFlag, "sendCounter = ", sendCounter, "doneCounter = ", doneCounter, "manifestListItems = ", manifestListItems while not fileIDQueue.empty(): # process completed results as they arrive qFileID = 0 qRestult = 0 qFileID, qResult = fileIDQueue.get() if self.debugFlag: print("qFileID = ", qFileID, "qResult = ", qResult) doneCounter += 1 if qResult: failureFlag = 1 if self.debugFlag: print "After fileIDQueue - ProcessFlag = ", processFlag, "sendCounter = ", sendCounter, "doneCounter = ", doneCounter, "manifestListItems = ", manifestListItems, "failureFlag = ", failureFlag if failureFlag: break #Check to see if specified time has passed. If so get another token to avoid expiration. Required for large datasets if self.debugFlag: print 'Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss print "self.currentEpochTime = ", self.currentEpochTime print "Current Time in Epoch = ", int(time.time()) # End of else #End of Inner While #End of Outer While if failureFlag: pStatus = 'F' else: pStatus = 'S' """ #Generate FINRA Manifest file and Push it to AWS """ # Call Divakar's finra manifest generate function returnValue = self.createFinraManifestFile(self.manifestFile) if self.debugFlag: print "Post createFinraManifestFile fn - return value= ", returnValue if returnValue: self.m_logger.error("Unable to generate done file. Please fix the issue the re-run the load") #sys.exit(1) failureFlag=1 pStatus = 'F' else: dbFlag=0 fileID=0 # Call the loader function with the manifest file finraManifestLoadStatus=0 dataFileFlag=False finraManifestLoadStatus=self.loadData(self.finraManifestFile ,fileID, fileIDQueue, dbFlag, dataFileFlag, 0) if finraManifestLoadStatus: pStatus = 'F' self.m_logger.error("Unable to load finra manifest file ") #Call Oracle fn to insert status 'S' into TB_DDY_DATASET_TRANS with RUNID etc #DB_CALL # Make database call sp_ddy_insert_dataset_trans and insert data based on Failure or Success mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_dataset"] myParamsDict = {"datasetName":self.datasetName, "runID": str(self.runID), "tDate":str(self.tradeDate), "status": pStatus, "tidalRunID":str(self.tidalRunID)} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.info("Retry after delay., Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) local_delay_time = int(self.m_configDict["SQL"]["delay_time"]) time.sleep(local_delay_time) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if returnStr[0] != '0': self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) if failureFlag: self.m_logger.error("Load failed") sys.exit(1) except Exception as e: self.m_logger.error("ProcessLoader failed with error " + str(e)) sys.exit(1)
class Recon():

    #class variables
    m_logger = ""

    #database objects
    m_oracle_db = ""
    m_netezza_db = ""

    def __init__(self, configFile, tradeDate, debugFlag):
        """
        Purpose: Constructor
        :param self: class object itself
        :param configFile: Configuration file to use
        :param tradeDate: Trade Date
        :param debugFlag: Flag to enable debug output
        """
        # Initialize m_logger object from class Logger and add Header to the log, using addGenericInfo function
        self.m_logger = Logger(logging.INFO, configFile, tradeDate)
        self.m_logger.addFileHandler(logging.DEBUG)
        self.m_logger.addGenericInfo(__file__)
        self.tradeDate = tradeDate
        self.debugFlag = debugFlag
        self.configFile = configFile
        try:
            # Get configuration to a dictionary
            self.m_configDict = configuration(self.configFile, True).m_dictionary
            #Initialize Oracle instance along with connection
            self.m_oracle_db = Oracle(self.m_configDict, self.m_logger)
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def readMktConfigFile(self, mktConfigFile):
        """
        Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference
        :param mktConfigFile: Market-specific configuration file
        :return:
        """
        try:
            self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration for logger " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def processRecon(self, tidalRunID):
        """
        Purpose - Function responsible for reading the datasets, getting the market names, fetching the AWS ack files and making the related db calls
        :param tidalRunID: Tidal Run ID
        :return:
        """
        try:
            self.tidalRunID = tidalRunID

            # DB_CALL
            # Make database call sp_ddy_get_market_info(datasetname) and get market info
            mktName = ''
            # select * from TB_DDY_PROCESS_STATUS where CREATE_TIME > SYSDATE - INTERVAL '1' DAY
            # select * from TB_DDY_MANIFEST_TRANS where CREATE_TIME > SYSDATE - INTERVAL '1' DAY;
            # select * from TB_DDY_DATASET_MASTER where dataset_id = 49
            tempSql = "select DM.DATASET_NAME , PS.FILE_ID, TO_CHAR(PS.TRADE_DATE,'YYYYMMDD'), PS.RUN_ID, PS.FILE_NAME from TB_DDY_PROCESS_STATUS PS" \
                      " INNER JOIN TB_DDY_MANIFEST_TRANS MT ON MT.RUN_ID= PS.RUN_ID" \
                      " INNER JOIN TB_DDY_DATASET_MASTER DM ON DM.DATASET_ID= PS.DATASET_ID" \
                      " WHERE PS.CREATE_TIME > SYSDATE - INTERVAL '1' DAY" \
                      " AND NOT EXISTS " \
                      " ( SELECT 1 FROM TB_DDY_PROCESS_STATUS PS1 WHERE PS1.RUN_ID = PS.RUN_ID and PS1.FILE_ID = PS.FILE_ID and PS.STATUS = 'R')"
            # Optional filters, kept for reference; note that a trailing line
            # continuation followed by a comment is a syntax error, so these
            # must stay after the statement:
            #   " AND PS.STATUS = 'S'"
            #   " AND rownum < 10000"
            print(tempSql)
            #myParamsDict = {}
            #tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) )
            #mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql)
            returnList = self.m_oracle_db.runSqlWthParamsGetMultipleRows(tempSql)
            # Remove last element
            #print(returnList)
            #returnList.pop()
            returnDataDict = {d[0]: ','.join(d[2:]) if d[2] else 0 for d in returnList}
            returnDict = {d[0]+"_"+d[1]+"_"+d[2]: ','.join(d[0:]) if d[1:] else 0 for d in returnList}
            #print(returnDataDict)
            #print(returnDict)
            self._sqlerror_ = 0
            returnMktList = []
            for datasetName in returnDataDict:
                tempSql = self.m_configDict["SQL"]["get_mkt_defaults_filename"]
                myParamsDict = { 'datasetName' : datasetName }
                tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) )
                mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql)
                returnList = []
                returnList = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                print(returnList[0])
                if int(returnList[0]) == 0:
                    if returnList[2] not in returnMktList:
                        returnMktList.append(returnList[2])
                    #returnMktList.append(returnList[2])
                    #print(datasetName, returnList[2] )
                elif int(returnList[0]) < 0:
                    self.m_logger.error("Error in Get Market Defaults Filename for Dataset : {1}, SQL : {0}".format(mySql, datasetName))
                    self._sqlerror_ += 1
                else:
                    self.m_logger.error("Warning in Get Market Defaults Filename for Dataset : {1}, SQL : {0}".format(mySql, datasetName))
            print(returnMktList)
            #returnMktList = ['NYSE']
            for mktName in returnMktList:
                #Build the string for mktConfigFile based on mktName and configFile info
                mktConfigFile = os.path.dirname(self.configFile) + '/' + os.path.basename(self.configFile).split('.',1)[0].strip() + '_' + mktName.lower() + '.' + os.path.basename(self.configFile).split('.',1)[1].strip()
                print("mktConfigFile = ", mktConfigFile)
                #Validate that the market config file is a valid file
                if not os.path.isfile(mktConfigFile):
                    #print "Inside invalid mktConfigFile" + self.mktConfigFile
                    self.m_logger.error("Invalid market config file " + mktConfigFile)
                    sys.exit(1)
                self.readMktConfigFile(mktConfigFile)
                self.s3object = S3(mktConfigFile, self.m_logger, self.debugFlag)
                self.s3object.getToken()
                #print(self.s3object.m_configFile["TOKEN"])
                print(self.s3object.m_configFile["S3"])
                bucket = self.s3object.m_configFile["S3"]["bucket"]
                path = self.s3object.m_configFile["S3"]["path"]
                ackpath = self.s3object.m_configFile["S3"]["ack_path"]
                print(bucket, path)
                print(ackpath)
                #ackPath = bucket + "/" + str(self.s3object.m_configFile["S3"]["path"]) + "/" + "acknowledge"
                #ackPath = str(bucket) + "/" + str(path)
                encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"]
                #myBucket = self.s3object.m_connection.get_bucket(bucket, validate = False)
                #for testkey in myBucket.list(prefix='50006/'):
                #    print("File = ", testkey.name)
                #b = self.s3object.m_connection.get_bucket(bucket)
                #rs = b.list() # get the result set from bucket
                #print(b.list())
                rs = self.s3object.listBucketWPath(bucket, ackpath)
                print(rs)
                #ackList = s3object.getBucketList(ackPath)
                #print(ackList)
        except Exception:
            # Catch Exception rather than using a bare except, so the sys.exit(1)
            # calls above are not swallowed by this handler.
            self.m_logger.error("Error while creating S3 recon file Exception : {0}".format(sys.exc_info()[0]))
            # Not exiting with an error at this point
            #sys.exit(1)
        sys.exit(0)
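# --- Editor's note (sketch) ---
# Loader, Recon, and Extractor all derive the market-specific config file name
# the same way: insert "_<mktname>" between the base name and the extension of
# the main config file. A minimal sketch of that derivation; buildMktConfigFile
# is a hypothetical helper name, and it assumes the config file name contains
# at least one dot.
import os

def buildMktConfigFile(configFile, mktName):
    baseName, ext = os.path.basename(configFile).split('.', 1)
    # e.g. /conf/loader.cfg + "nyse" -> /conf/loader_nyse.cfg
    return os.path.dirname(configFile) + '/' + baseName.strip() + '_' + mktName.lower() + '.' + ext.strip()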
def Newton_V(Oracle, x0):

    ##### Variable initialization
    iter_max = 100
    # gradient_step_ini = 1.   # Primal problem.
    gradient_step_ini = 1000   # Dual problem.
    threshold = 0.000001
    error_count = 0  # Counter of Fletcher-Lemarechal non-convergences.

    gradient_norm_list = []
    gradient_step_list = []
    critere_list = []

    time_start = process_time()

    x = x0

    ##### Iteration loop
    for k in range(iter_max):

        # Criterion and gradient values
        critere, gradient, hessien = Oracle(x, 7)

        # Convergence test
        gradient_norm = norm(gradient)
        if gradient_norm <= threshold:
            break

        # Descent direction
        direction = -dot(inv(hessien), gradient)

        # Descent step
        gradient_step, error_code = Wolfe(gradient_step_ini, x, direction, Oracle)
        if error_code != 1:
            error_count += 1

        # Variable update
        x = x + (gradient_step * direction)

        # Evolution of the gradient norm, the step, and the criterion
        gradient_norm_list.append(gradient_norm)
        gradient_step_list.append(gradient_step)
        critere_list.append(critere)

    if error_count > 0:
        print()
        print("Fletcher-Lemarechal algorithm failed to converge {} time(s)".format(error_count))

    ##### Optimization results
    critere_opt = critere
    gradient_opt = gradient
    x_opt = x
    time_cpu = process_time() - time_start

    print()
    print('Iteration :', k)
    print('CPU time :', time_cpu)
    print('Optimal criterion :', critere_opt)
    print('Gradient norm :', norm(gradient_opt))

    # Convergence visualization
    Visualg(gradient_norm_list, gradient_step_list, critere_list)

    return critere_opt, gradient_opt, x_opt
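# --- Editor's note (sketch) ---
# Numerical aside on the Newton direction above: forming inv(hessien) and then
# multiplying is costlier and less stable than solving the linear system
# H d = -g directly. An equivalent, generally preferred form (sketch only;
# newton_direction is an illustrative name):
from numpy.linalg import solve

def newton_direction(gradient, hessien):
    # Solves hessien * d = -gradient without forming the explicit inverse.
    return solve(hessien, -gradient)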
class Extractor():

    #class variables
    m_logger = ""

    #database objects
    m_oracle_db = ""

    def __init__(self, configFile, mktName, tradeDate, debugFlag):
        """
        Purpose: Constructor
        :param self: class object itself
        :param configFile: Configuration file to use
        """
        # Initialize m_logger object from class Logger and add Header to the log, using addGenericInfo function
        self.m_logger = Logger(logging.INFO, configFile, tradeDate)
        self.m_logger.addFileHandler(logging.DEBUG)
        self.m_logger.addGenericInfo(__file__)
        self.tradeDate = tradeDate
        self.debugFlag = debugFlag
        self.configFile = configFile
        self.mktName = mktName
        try:
            # Get configuration to a dictionary
            self.m_configDict = configuration(self.configFile, True).m_dictionary
            #Initialize Oracle instance along with connection
            self.m_oracle_db = Oracle(self.m_configDict, self.m_logger)
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def readMktConfigFile(self, mktConfigFile):
        """
        Purpose - To read the content of mktConfigFile into the global dictionary m_mktConfigDict for reference
        :param mktConfigFile:
        :return:
        """
        try:
            self.m_mktConfigDict = configuration(mktConfigFile, True).m_dictionary
        except Exception as exp:
            # An exception occurred
            self.m_logger.error("Unable to initialize the configuration for logger " + str(exp))
            print("ERROR: Unable to initialize the configuration for logger " + str(exp))
            sys.exit(1)

    def chkActiveLoads(self):
        """
        Purpose - To check the count of active loads happening at a given point
        :param None: None at this point
        :return:
        """
        try:
            if self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_check_flag"] == 'Y':
                # Cast the configured values once; they are read from the config file as strings
                localActiveLoadMax = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"])
                localActiveLoadWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_wait_time"])
                localActiveLoadMaxWaitTime = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max_wait_time"])
                mySql = ""
                myParams = ""
                mySql = self.m_configDict["SQL"]["get_active_loads"]
                activeFlag = 1
                totalActiveWaitTime = 0
                while activeFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                    if self.debugFlag:
                        print "mySql = ", mySql
                        print "returnStr = ", returnStr
                        print "chkActiveLoads - Active Loads value = ", int(returnStr[1].strip())
                    if returnStr[0] != '0':
                        self.m_logger.error("Unable to get active loads using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    #Check if the actual active loads count is <= the configured maximum. If so, return out of the fn
                    if int(returnStr[1].strip()) <= localActiveLoadMax:
                        activeFlag = 0
                        return 0
                    #Sleep for the time defined by the configured value for "active_load_wait_time"
                    time.sleep(localActiveLoadWaitTime)
                    totalActiveWaitTime += localActiveLoadWaitTime
                    #Check if the actual total wait time is > the configured total wait time. If so, throw an error and exit
                    if totalActiveWaitTime > localActiveLoadMaxWaitTime:
                        self.m_logger.error("In Fn chkActiveLoads. Total actual wait time exceeds the configured value active_load_max_wait_time. Either clean up orphaned loads or increase active_load_max or active_load_max_wait_time. totalActiveWaitTime = " + str(totalActiveWaitTime) + " localActiveLoadMaxWaitTime=" + str(localActiveLoadMaxWaitTime))
                        return 1
            else:
                return 0
            #Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkActiveLoads process for file with the error " + str(exp))
            sys.exit(1)

    def chkRaceStatus(self):
        """
        Purpose - To check if a load is already running for the given dataset
        :param None: None at this point
        :return:
        """
        try:
            if self.m_mktConfigDict["RACE"]["race_status_check_flag"] == 'Y':
                localRaceStatusWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_wait_time"])
                localRaceStatusMaxWaitTime = int(self.m_mktConfigDict["RACE"]["race_status_max_wait_time"])
                mySql = ""
                #myParams = {"datasetName":self.datasetName}
                tempSql = self.m_configDict["SQL"]["get_race_status"]
                myParamsDict = { 'datasetName' : self.datasetName }
                tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) )
                mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql)
                raceFlag = 1
                totalRaceStatusWaitTime = 0
                while raceFlag:
                    returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql)
                    if self.debugFlag:
                        print "tempSql = ", tempSql
                        print "myParamsDict = ", myParamsDict
                        print "mySql = ", mySql
                        print "returnStr = ", returnStr
                        print "chkRaceStatus - ReturnCode = ", int(returnStr[1].strip())
                    if returnStr[0] != '0':
                        self.m_logger.error("Unable to get race status using sql " + mySql + ". Error = " + returnStr[1])
                        sys.exit(1)
                    #Check if the load for this dataset is already running. If not, exit out of the function with a normal return value
                    if int(returnStr[1].strip()) <= 1:
                        raceFlag = 0
                        return 0
                    #Sleep for the time defined by the configured value for "race_status_wait_time"
                    time.sleep(localRaceStatusWaitTime)
                    #time.sleep(90)
                    totalRaceStatusWaitTime += localRaceStatusWaitTime
                    if self.debugFlag:
                        print "totalRaceStatusWaitTime = ", totalRaceStatusWaitTime, "localRaceStatusWaitTime =", localRaceStatusWaitTime
                    #Check if the actual total wait time is > the configured total wait time. If so, throw an error and exit
                    if totalRaceStatusWaitTime > localRaceStatusMaxWaitTime:
                        self.m_logger.error("In Fn chkRaceStatus. Total actual wait time exceeds the configured value race_status_max_wait_time. Either check if the dataset is getting loaded or increase race_status_wait_time or race_status_max_wait_time. totalRaceStatusWaitTime = " + str(totalRaceStatusWaitTime) + " localRaceStatusMaxWaitTime=" + str(localRaceStatusMaxWaitTime))
                        return 1
            else:
                return 0
            #Return failure
            return 1
        except Exception as exp:
            self.m_logger.error("Failure in chkRaceStatus process for file with the error " + str(exp))
            sys.exit(1)

    def extractData(self, localDataRecordDict, localFileID, localFileIDQueue, localDBFlag):
        """
        Purpose - To extract the given datafile from the S3 bucket specified in the global mktConfigFile
        :param localDataRecordDict: Datafile related info fetched from FINRA's manifest file including filename, filesize, recordcount
        :param localFileID: Internal File ID assigned to the local datafile
        :param localFileIDQueue: Queue in which the results of the operation are stored
        :param localDBFlag: Flag indicating if database should be used or not
        :return:
        """
        try:
            if self.debugFlag:
                print "Inside extractData function"
                print "localDataRecordDict = ", localDataRecordDict
            if localDBFlag:
                """ Not sure if we need Race Status check for Extract
                raceStatusReturnValue=self.chkRaceStatus()
                if self.debugFlag:
                    print "raceStatusReturnValue=", raceStatusReturnValue
                if raceStatusReturnValue:
                    self.m_logger.error("Failure value returned by chkRaceStatus fn. 
Return value = " + str(raceStatusReturnValue)) localFileIDQueue.put((localFileID, raceStatusReturnValue)) return 1 """ """ Need to integrate Active loads with tb_dxt_process_status and tb_dxt_process_status ? activeLoadsReturnValue=self.chkActiveLoads() if activeLoadsReturnValue: self.m_logger.error("Failure value returned by chkRaceStatus fn. Return value = " + str(activeLoadsReturnValue)) localFileIDQueue.put((localFileID, raceStatusReturnValue)) return 1 """ processID = os.getpid() hostName = socket.gethostname() # Need to check the order test_var = str(self.m_configDict["dxt"]["DATA_FILE_NAME_STR"]) localDataFile = localDataRecordDict[self.m_configDict["dxt"]["DATA_FILE_NAME_STR"]] localDataFileSize = int(localDataRecordDict[self.m_configDict["dxt"]["DATA_FILE_SIZE_STR"]]) localDataFileRecordCount = int(localDataRecordDict[self.m_configDict["dxt"]["NO_OF_ROWS_STR"]]) if self.debugFlag: print "localDataFile = ", localDataFile print "localDataFileSize = ", localDataFileSize print "localDataFileRecordCount = ", localDataFileRecordCount #Insert Process status into Oracle db #DB_CALL - sp_dxt_insert_process_status(RUNID, FILE_ID, etc) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] pStatus = 'P' pComment = 'Load started' #myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "fileSize": localDataFileSize, "recordCount" : localDataFileRecordCount, "status":pStatus , "lcomment":pComment} myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "fileSize": str(localDataFileSize), "recordCount" : str(localDataFileRecordCount), "status":pStatus , "lcomment":pComment} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) sys.exit(1) # Get the dataFileName file to be extracted from AWS dataFileName = localDataRecordDict[self.m_configDict["dxt"]["DATA_FILE_NAME_STR"]] #Here localFileWthPath is the local stage dir with file name localFileWthPath = self.m_configDict["ENV"]["stage_dir"] + "/" + dataFileName #Here targetFileWthPath is the AWS dir with file name targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(dataFileName)) targetBucket = self.s3object.m_configFile["S3"]["bucket"] encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"] localAWSRetries = int(self.m_mktConfigDict["ENV"]["aws_retries"]) if self.debugFlag: print("localFileWthPath =", localFileWthPath) print("targetFileWthPath =", targetFileWthPath) print("targetBucket =", targetBucket) print("encryptKeyFlag =", encryptKeyFlag) print("localAWSRetries =", localAWSRetries) initCount = 0 while (initCount < localAWSRetries): extractReturnValue = 0 #Call s3.data download to extract the manifest file (single part load) #extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket) if self.debugFlag: print "extractReturnValue = ", extractReturnValue if int(extractReturnValue) == 0: pStatus = 'S' pComment = 'Load completed' break else: pStatus = 'F' pComment = 'Load failed' initCount += 1 # Get the size of the file downloaded localFileSize = os.stat(localFileWthPath).st_size # Check if the downloaded file size is matching with what is mentioned in manifest file. If not mark it as failed if localFileSize != localDataFileSize: pStatus = 'F' pComment = 'Actual file size != Manifest file size' localRecordCount = 0 if localDBFlag: #Call DB to insert 'S' or 'F' in tb_dxt_process_status #localFileIDQueue.put((localFileID, extractReturnValue)) mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_process_status"] #myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileID":str(localFileID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "fileSize": localFileSize, "recordCount" : localRecordCount, "status":pStatus , "lcomment":pComment} myParamsDict = {"datasetName":self.datasetName, "runID":str(self.runID), "fileName":localDataFile, "tDate":str(self.tradeDate), "processID":str(processID), "hostName":hostName, "fileSize": str(localFileSize), "recordCount" : str(localRecordCount), "status":pStatus , "lcomment":pComment} tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put process status info into the database using sql " + mySql + ". 
Error = " + returnStr[1]) sys.exit(1) localFileIDQueue.put((localFileID,extractReturnValue)) else: return extractReturnValue except Exception as exp: self.m_logger.error("Failure in extractData process for file with the error " + str(exp)) if localDBFlag: localFileIDQueue.put((localFileID, 1)) else: return 1 def getRecords(self, fileDict, startDateTime, endDateTime): """ Purpose - Function to sort the dictionary based on the key and return a sorted list :param fileDict : Dictionary containing Last_modified Date and file name :param startDateTime : Start DateTime in the format '2016-02-01 00:00:00' :param endDateTime : End DateTime in the format '2016-02-10 00:00:00' """ try: patternToSearch = self.m_configDict["ENV"]["pattern_to_search"] if self.debugFlag: print "fileDict = ", fileDict print "patternToSearch = ", patternToSearch print "startDateTime = ", startDateTime print "endDateTime = ", endDateTime sorted_values = sorted(fileDict.values()) start = bisect.bisect_left(sorted_values, startDateTime) end = bisect.bisect_right(sorted_values, endDateTime) if self.debugFlag: print "start = ", start print "end = ", end for fileItem in sorted(fileDict.iteritems())[start:end]: if patternToSearch in fileItem[0]: if self.debugFlag: print "fileItem[0] = ", fileItem[0] yield fileItem[0] except Exception as exp: self.m_logger.error("Failed while executing getRecords to sort the dictionary content of dictionary with Error = " + str(exp)) sys.exit(1) def readManifestFile(self, manifestFileName): """ Purpose - To read the content of Finra's manifest file stored in key-value pair into Nested dictionary :param manifestFileName : Finra's manifestFileName containing data filenames, file size & no of rows """ try: manifestRecordStartPattern = self.m_configDict["dxt"]["MANIFEST_RECORD_START_PATTERN"] if self.debugFlag: print "manifestRecordStartPattern =", manifestRecordStartPattern with open(manifestFileName) as infile: manifestFileDict = {} file = 0 line_count = 0 for line in infile: line = line.strip() if line.startswith(manifestRecordStartPattern): file = line_count line_count += 1 manifestFileDict[file] = {} var, val = line.split('=',1) if self.debugFlag: print "var = ", var, "val = ", val manifestFileDict[file][var.strip()] = val.strip() if self.debugFlag: print "=====================================" print "manifestFileDict = ", manifestFileDict print "=====================================" return manifestFileDict #for key, values in manifest.items(): #if key == 1: #for k,v in values.items(): #print k, v except Exception as exp: self.m_logger.error("Failed while executing readManifestFile to get FINRA manifest file into nested dictionary, Error = " + str(exp)) sys.exit(1) def getManifestFileList(self, startDateTime, endDateTime, s3Bucket, s3Path, folderPosition): """ Purpose - Function to sort the dictionary based on the key and return a sorted list :param startDateTime : Start DateTime in the format '2016-02-01 00:00:00' :param endDateTime : End DateTime in the format '2016-02-10 00:00:00' """ try: if self.debugFlag: print "s3Bucket = ", s3Bucket print "s3Path = ", s3Path print "startDateTime = ", startDateTime print "endDateTime = ", endDateTime print "folderPosition = ", folderPosition fileListDict = self.s3object.listBucketWPathByLastModified(s3Bucket, s3Path, folderPosition) if self.debugFlag: print "fileListDict = ", fileListDict #endDateTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S") manifestFileList = list(self.getRecords(fileListDict, startDateTime, endDateTime)) if self.debugFlag: print 
"fileListDict = ", fileListDict print "manifestFileList = ", manifestFileList return manifestFileList except Exception as exp: self.m_logger.error("Failed while creating AWS manifest file list with Error = " + str(exp)) return 1 def processExtractor(self): """ Purpose - Function responsible for getting the AWS token and reading the last modified date in DB and fetch the list of files from AWS to be processed :param : None :return: """ try: # DB_CALL # Make database call sp_dxt_validate_mktName(mktName) to validate mktName tempSql = self.m_configDict["SQL"]["validate_market_name"] myParamsDict = { 'mktName' : self.mktName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Invalid market name provided " + mySql + ". Error = " + self.mktName) sys.exit(1) if self.debugFlag: print "MktName from DB = ", self.mktName #Build the string for mktConfigFile based on mktName and configFile info self.mktConfigFile = os.path.dirname(self.configFile) + '/' + os.path.basename(self.configFile).split('.',1)[0].strip() + '_' + self.mktName.lower() + '.' + os.path.basename(self.configFile).split('.',1)[1].strip() if self.debugFlag: print("mktConfigFile = ", self.mktConfigFile) #Validate Market Config file is a valid file if not os.path.isfile(self.mktConfigFile): self.m_logger.error("Invalid market manifest file " + self.mktConfigFile) sys.exit(1) # Read Market specific config file and store it in a specific dictionary self.readMktConfigFile(self.mktConfigFile) if self.debugFlag: print("m_mktConfigDict=",self.m_mktConfigDict) # Read the table for the given market and fetch the last modified timestamp for the given manifest file tempSql = self.m_configDict["SQL"]["get_last_modified"] myParamsDict = { 'mktName' : self.mktName.upper() } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] == '0': if returnStr[1]: lastModifiedDate=returnStr[1] else: lastModifiedDate="2015-01-01 00:00:00" else: self.m_logger.error("Unable to get last_modified date using the sql " + mySql + ". Error = " + self.mktName) sys.exit(1) if self.debugFlag: print("lastModifiedDate=",lastModifiedDate) #Temp call. Need to enable the previous lines to use DB call. 
Comment them bfr production if self.mktName == 'nyse_mkt': lastModifiedDate="2016-06-03 15:00:00" else: lastModifiedDate="2016-06-01 00:00:00" #print "Ram - Last Modified Date = ", lastModifiedDate, "mktName = ", self.mktName # Get RunID self.runID = generate_runId() if self.debugFlag: print("RunID = ", self.runID) # Initialize S3 object and get FINRA cloud service token and establish s3 session self.s3object = S3(self.mktConfigFile, self.m_logger, self.debugFlag) tokenRetryTimes = int(self.m_configDict["TOKEN"]["token_retry_times"]) tokenRetryWaitTime = int(self.m_configDict["TOKEN"]["token_retry_wait_time"]) initCount = 0 while (initCount < tokenRetryTimes): tokenReturnCode = self.s3object.getToken() if tokenReturnCode: if initCount == tokenRetryTimes: self.m_logger.error("Error: Exceeded the max retries " + tokenRetryTimes + " to get AWS Token from FINRA. Please re-try after some time or escalate.. ") sys.exit(1) initCount += 1 time.sleep(tokenRetryWaitTime) else: break self.currentEpochTime = int(time.time()) # Get list of Manifest files to be processed #currentDate = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #currentDate = datetime.now().strftime("%Y-%m-%d %H:%M:%S") currentDate = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S") folderPosition = int(self.s3object.m_configFile["S3"]["folder_position"]) targetBucket = self.s3object.m_configFile["S3"]["bucket"] targetFolder = self.s3object.m_configFile["S3"]["path"] targetFilePath = targetFolder finraManifestFileList = self.getManifestFileList(lastModifiedDate, currentDate, targetBucket, targetFilePath, folderPosition) if self.debugFlag: print("finraManifestFileList = ", finraManifestFileList) # Download manifest files in the manifest file list to a specific folder from AWS localFileDir = self.s3object.m_configFile["ENV"]["stage_dir"] #targetFileWthPath = os.path.join(self.s3object.m_configFile["S3"]["path"], os.path.basename(localFileWthPath)) encryptKeyFlag = self.s3object.m_configFile["S3"]["encrypt_key"] # Get an instance of the Manifest class fileIDQueue = Queue() localAWSRetries = int(self.m_mktConfigDict["ENV"]["aws_retries"]) for finraManifestFile in finraManifestFileList: # Following 2 lines temporarily written to avoid bad manifest files. 
Please remove them before go-live if finraManifestFile == 'manifest.TSP_A_20160425.txt': continue if finraManifestFile == 'manifest.TSP_P_20160425.txt': continue targetFileWthPath = targetFolder + finraManifestFile localFileWthPath = localFileDir + "/" + finraManifestFile if self.debugFlag: print "targetFileWthPath = ", targetFileWthPath print "localFileWthPath = ", localFileWthPath print "finraManifestFile = ", finraManifestFile initCount = 0 while (initCount < localAWSRetries): extractReturnValue = 0 #Call s3.data download to extract the manifest file (single part load) #extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket, encryptKeyFlag ) extractReturnValue = self.s3object.getDataSinglePart(localFileWthPath, targetFileWthPath, targetBucket) if self.debugFlag: print "extractReturnValue = ", extractReturnValue if extractReturnValue: # Try it again initCount += 1 else: # Come out of the loop break # End of while loop for AWS Retries if extractReturnValue: self.m_logger.error("Unable to fetch manifestFile = " + finraManifestFile + "from the path = " + targetFileWthPath + " to the local filesystem = " + localFileWthPath ) sys.exit(1) """ Not needed if extractReturnValue == 0: pStatus = 'P' pComment = 'Load completed' break else: pStatus = 'F' pComment = 'Load failed' """ initCount += 1 # get datasetname from the manifest file. Need check based on FINRA naming # Original requirement #self.datasetName = os.path.basename(finraManifestFile).split('.',3)[1].strip().upper() # Customized for FINRA's latest file self.datasetName = os.path.basename(finraManifestFile).split('.')[1].split('_')[1].strip().upper() if self.debugFlag: print "datasetName = ", self.datasetName # Need to check DB call, once it is ready # Validate the manifest file name to make sure that we are expecting it tempSql = self.m_configDict["SQL"]["validate_dataset_name"] myParamsDict = { 'datasetName' : self.datasetName } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr #Check if dataset is there in the tb_dxt_dataset_master, if not, skip it and move to the next file. For other errors, exit out of the program if int(returnStr[0]) < 0: self.m_logger.error("Unable to validate datasetName " + mySql + ". Error = " + self.datasetName) sys.exit(1) elif int(returnStr[0]) > 0: self.m_logger.info("Give Dataset is not in the list to process. Skipping it" + mySql + ". 
Dataset Name = " + self.datasetName) # Continue to the next file entry in the manifest list continue # Insert a record into tb_dxt_dataset_trans with status 'P' for the given datasetName, saying that we start the process for this manifest file pStatus = 'P' tempSql = self.m_configDict["SQL"]["put_dataset"] myParamsDict = {'datasetName':self.datasetName, 'runID': str(self.runID), 'tDate':str(self.tradeDate), 'status': pStatus } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to insert into tb_dxt_dataset_trans table " + mySql + ". Error = " + self.datasetName) sys.exit(1) # Read the contents of manifestfile i.e. dataFileNames into a list - Will validate the datafiles as well manifestDelim = self.m_configDict["ENV"]["manifest_delim"] if self.debugFlag: print "localFileWthPath = ", localFileWthPath #Need to change the following line to read a nested dictionary from a keyValuePair manifestFileDict = self.readManifestFile(localFileWthPath) if self.debugFlag: print "manifestDelim = ", manifestDelim print "manifestFileDict = ", manifestFileDict process_count = int(self.m_mktConfigDict["ACTIVE_LOAD"]["active_load_max"]) # Now go into multiprocessing and call extractData function and extract files ones by one fileID=1 dbFlag=1 fileIDQueue = Queue() procs = [] doneCounter = 0 sendCounter = 0 failureFlag = 0 finraManifestFileCounter=0 while doneCounter < len(manifestFileDict): while sendCounter < len(manifestFileDict) and sendCounter - doneCounter < process_count: if self.debugFlag: print "manifestFileDict[self.m_configDict[dxt][DATA_FILE_NAME_STR]] = ", manifestFileDict[sendCounter]['Datafilename'] # Call fn extractData to fetch files from AWS. Pass manifestFileDict[sendCounter] as it contains the whole record including the filename, filesize & row count processHandle = Process(target=Extractor.extractData, args=(self, manifestFileDict[sendCounter],fileID, fileIDQueue, dbFlag)) processFlag=1 s3TimeoutTime = int(self.m_configDict["dxt"]["S3_TIMEOUT_TIME"]) if ((int(time.time()) - self.currentEpochTime) > s3TimeoutTime): self.currentEpochTime = int(time.time()) self.m_logger.info("Getting New Token for Batch : {0}, Max batches : {1}".format(batch_count,max_batches)) if self.debugFlag: print 'Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss print "self.currentEpochTime = ", self.currentEpochTime print "Current Time in Epoch = ", int(time.time()) if self.debugFlag: print "Inside get new token - self.currentEpochTime = ", self.currentEpochTime initCount = 0 while (initCount < tokenRetryTimes): tokenReturnCode = 0 tokenReturnCode = self.s3object.getToken() if tokenReturnCode: if initCount == tokenRetryTimes: self.m_logger.error("Error: Exceed the max retries " + tokenRetryTimes + " to get AWS Token from FINRA. Please re-try after some time or escalate.. 
") sys.exit(1) initCount += 1 time.sleep(tokenRetryWaitTime) else: break threadDelayTime = int(self.m_configDict["dxt"]["THREAD_DELAY_TIME"]) time.sleep(threadDelayTime) processHandle.start() procs.append(processHandle) sendCounter += 1 fileID += 1 if processFlag: for p in procs: p.join() procs=[] processFlag=0 while not fileIDQueue.empty(): # process completed results as they arrive #time.sleep(3) qFileID, qResult = fileIDQueue.get() if self.debugFlag: print("qFileID = ", qFileID, "qResult = ", qResult) doneCounter += 1 if qResult: failureFlag = 1 if self.debugFlag: print "ProcessFlag = ", processFlag, "sendCounter = ", sendCounter, "doneCounter = ", doneCounter if failureFlag: break if self.debugFlag: print "Failure Flag = ", failureFlag if failureFlag: pStatus = 'F' else: pStatus = 'S' tblName = self.m_mktConfigDict["dxt"]["TARGET_TBL_NAME"] + "_" + self.mktName.upper() manifestDate = os.path.basename(finraManifestFile).split('.',3)[1][6:12] fatlManifestFile = self.m_configDict["ENV"]["stage_dir"] + "/" + tblName + "." + manifestDate + ".manifest" with open(fatlManifestFile,"w") as fh: counter = 0 for dictRecord in manifestFileDict: dataFile = manifestFileDict[dictRecord][self.m_configDict["dxt"]["DATA_FILE_NAME_STR"]] sourceFileWthPath = self.m_configDict["ENV"]["stage_dir"] + "/" + dataFile dataFileSize = int(manifestFileDict[dictRecord][self.m_configDict["dxt"]["DATA_FILE_SIZE_STR"]]) dataFileRecordCount = int(manifestFileDict[dictRecord][self.m_configDict["dxt"]["NO_OF_ROWS_STR"]]) #fileSize = os.stat(sourceFileWthPath).st_size if self.debugFlag: print "dataFile = ", dataFile print "dataFileSize = ", dataFileSize print "dataFileRecordCount = ", dataFileRecordCount print "sourceFileWthPath = ", sourceFileWthPath print "tblName = ", tblName, "dataFile = ", dataFile, "dataFileSize = ", dataFileSize, "mktName = ", self.mktName fh.write(tblName + "|" + str(dataFile) + "|" + str(dataFileSize) + "|" + str(dataFileRecordCount) + "|" + "0" + "\n") counter += 1 # Move all the data files to inbox from the stg location. No need for this step, as Joejo mentioned there will be another Tidal job doing this step # Move the manifest file to inbox from the stg location # insert a record into tb_dxt_dataset_trans table with 'S' or 'F' record #Call Oracle fn to insert status 'S' into TB_DDY_DATASET_TRANS with RUNID etc #DB_CALL # Make database call sp_dxt_insert_dataset_trans and insert data based on Failure or Success mySql = "" myParams = "" tempSql = self.m_configDict["SQL"]["put_dataset"] myParamsDict = {"datasetName":self.datasetName, "runID": str(self.runID), "tDate":str(self.tradeDate), "status": pStatus } tempGrp = "(%s)" % "|".join( map(re.escape, myParamsDict.keys()) ) mySql = re.sub( tempGrp, lambda m:myParamsDict[m.group()], tempSql) returnStr = self.m_oracle_db.runSqlWthParamsGetOneRow(mySql) if self.debugFlag: print "tempSql = ", tempSql print "myParamsDict = ", myParamsDict print "mySql = ", mySql print "returnStr = ", returnStr if returnStr[0] != '0': self.m_logger.error("Unable to put dataset info into the database using sql " + mySql + ". Error = " + returnStr[1]) sys.exit(1) if failureFlag: self.m_logger.error("Extract failed for data files for manifest file " + finraManifestFile) sys.exit(1) finraManifestFileCounter += 1 # End of for loop for finraManifestFiles except Exception as e: self.m_logger.error("ProcessExtractor failed with error " + str(e)) sys.exit(1)
def get_phones_in():
    phones_in = Oracle.consulta_fones()
    return phones_in
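# --- Editor's note (sketch) ---
# The SQL templates pulled from the config dictionary throughout this module
# contain literal placeholder tokens (datasetName, runID, ...), and each call
# site substitutes them with the same re.sub idiom. A minimal sketch of that
# idiom as a helper; renderSqlTemplate and the example template below are
# illustrative, not the real config entries.
import re

def renderSqlTemplate(tempSql, myParamsDict):
    # Build one alternation group out of all placeholder names, escaping any
    # regex metacharacters, and replace every occurrence in a single pass.
    tempGrp = "(%s)" % "|".join(map(re.escape, myParamsDict.keys()))
    return re.sub(tempGrp, lambda m: myParamsDict[m.group()], tempSql)

# Example (hypothetical template):
#   renderSqlTemplate("call sp_ddy_put_dataset('datasetName', 'runID')",
#                     {"datasetName": "TSP_A", "runID": "12345"})
#   -> "call sp_ddy_put_dataset('TSP_A', '12345')"
# Note that the substitution is purely textual: a placeholder that happens to
# appear as a substring elsewhere in the SQL will also be replaced.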