def processSites (self,recordList,sitesName): try: for record in recordList: sites_exist = None try: where = and_(testTPWDmodel.Sites.Latitude==record['start_latitude_num'], testTPWDmodel.Sites.Longitude==record['start_longitude_num']) sites_exist = self.session.query(testTPWDmodel.Sites).filter(where).one() except NoResultFound, e: newSite = testTPWDmodel.Sites() newSite.SiteID = self.getMaxId('Sites') # mkey = md5.new() # mkey.update(str(record['start_latitude_num'])+str(record['start_longitude_num'])) # mStr = mkey.hexdigest() SiteCode_I = ''.join([major_area_str,record['major_area_code'], minor_bay_str,record['minor_bay_code'], station_str,record['station_code']]) newSite.SiteCode = unicode('_'.join([SiteCode_I,str(newSite.SiteID)])) #here, changing to name-looking function later #sites name is based on the csv file name processed newSite.SiteName = TPWDSitesDict[sitesName.split('_')[0]] ################################### newSite.Latitude,newSite.Longitude = float(record['start_latitude_num']),float(record['start_longitude_num']) newSite.LatLongDatumID= 2 newSite.VerticalDatum = u'Unknown' newSite.State = u'Texas' self.session.add(newSite) self.session.flush() record['SiteID'] = newSite.SiteID else: record['SiteID'] = sites_exist.SiteID self.session.commit() reticLog.logInfo(self.logList, "( " + self.name + " ) sites info processed on sink : " + self.name) return 0
def __init__(self, args, logList):
    """Initialize the file source from the args parameter dictionary.

    Required keys: name, fileFilter, newExtension, filePath, pollPeriod.
    Optional key: exitOnError ('y'/'n', default 'y').
    A missing required key is logged and terminates the process.
    """
    try:
        self.logList = logList
        # BUG FIX: give self.name a default before reading args -- the
        # KeyError handler below references self.name, which raised an
        # AttributeError whenever 'name' itself was the missing key.
        self.name = 'fileSource'
        self.exitOnError = 'y'
        self.name = args['name']
        reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Intitializing fileSource")
        self.fileFilter = args['fileFilter']
        self.newExtension = args['newExtension']
        self.msgList = []
        self.metadata = {}
        self.nbMsg = 0
        self.msg = ''
        self.msgName = ''
        self.filePath = args['filePath']
        # this is used for wait(interval) function, so use float
        self.interval = float(args['pollPeriod'])
        # os.sep is the platform path separator ('\\' for nt, '/' for linux)
        import os
        self.filePath = self.filePath + os.sep
        if args.has_key('exitOnError'):
            self.exitOnError = args['exitOnError']
    except KeyError:
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Error on fileSource initialization")
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Parameter " + str(sys.exc_info()[1]) + " is missing on source definition")
        sys.exit(1)
    except:
        errorMessage = traceback.format_exception_only(sys.exc_info()[0], sys.exc_info()[1])[0]
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Unknown error on initialization on source")
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + errorMessage)
        sys.exit(1)
def __init__(self, args, logger):
    """Initialize the HTTP source from the args parameter dictionary.

    Required keys: name, URL.
    Optional keys: pollPeriod, params, exitOnError (default 'n').
    A missing required key is logged and terminates the process; any
    other initialization error is logged and re-raised.
    """
    try:
        self.logList = logger
        # BUG FIX: default self.name first so the KeyError handler below can
        # always reference it, even when 'name' itself is the missing key.
        self.name = 'HTTPSource'
        self.name = args['name']
        reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Intitializing HTTPSource : " + self.name)
        self.URL = args['URL']
        self.exitOnError = 'n'
        self.msgList = []
        self.nbMsg = 0
        self.msg = []
        self.msgName = ''
        self.params = []
        self.metadata = {}
        # this is used for wait(interval) function, so use float
        if args.has_key('pollPeriod'):
            self.interval = float(args['pollPeriod'])
        if args.has_key('params'):
            self.params = args['params']
        if args.has_key('exitOnError'):
            self.exitOnError = args['exitOnError']
    except KeyError:
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Error on HTTPSource initialization")
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Parameter " + str(sys.exc_info()[1]) + " is missing on source definition")
        sys.exit(1)
    except:
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Unknown error on HTTPSource initialization")
        # BUG FIX: format_exception_only returns a *list* of strings; take
        # the first entry so the string concatenation cannot raise TypeError.
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + traceback.format_exception_only(sys.exc_info()[0], sys.exc_info()[1])[0])
        # BUG FIX: removed the unreachable sys.exit(1) that followed raise
        raise
def process(self):
    'Creates the XML file in the output buffer'
    try:
        self.tmpMsg = StringIO.StringIO()
        self.updateAttributesFromMetadata()
        # consume the header row first when the input declares one
        if self.hasHeader == 'y':
            self.fieldNames = self.getHeader()
        # XML prologue and opening root tag
        self.tmpMsg.write('<?xml version="1.0" encoding="' + self.encoding + '"?>')
        self.tmpMsg.write('\n<' + self.rootTag + '>\n')
        # one XML record per input line; '#' lines are treated as comments
        line = self.InMsg.readline()
        while len(line) > 0:
            if line[0] != '#':
                # strip the single trailing newline, if any, before emitting
                self.writeRecordAsXML(line.rstrip('\n'))
            line = self.InMsg.readline()
        self.tmpMsg.write('</' + self.rootTag + '>')
        # rewind and publish the finished document
        self.tmpMsg.seek(0)
        self.msg = self.tmpMsg.read()
        self.msgList.append(self.msg)
        reticLog.logInfo(self.logList, '(' + self.name + ') ' + "Message process is finished in pipe")
        return 0
    except:
        errorMessage = traceback.format_exception_only(sys.exc_info()[0], sys.exc_info()[1])[0]
        reticLog.logError(self.logList, '(' + self.name + ') ' + "Error during message processing in pipe")
        reticLog.logError(self.logList, '(' + self.name + ') ' + errorMessage)
        return 1
def preprocess(filename, loglist):
    """Preprocess one TPWD file (in CSV format).

    main functionality:
      1. extract the columns needed
      2. combine repetitive (adjacent) rows
      3. sort all rows in chronological order
      4. save a copy of the csv file with a timestamp appended to its name

    TPWD-specific extras: a) transfer time into the correct date format;
    b) transfer longitude and latitude into the correct decimal format.
    """
    reticLog.logInfo(loglist, "start preprocessing %s......" % filename.split(os.sep)[-1])
    # BUG FIX: close the input file explicitly instead of leaking the handle
    inFile = open(filename)
    try:
        TPWDReader = csv.reader(inFile)
        # extract only the needed columns from every row
        extractedList = list(list(row[i] for i in NEEDED_COL) for row in TPWDReader)
    finally:
        inFile.close()
    # headerRow: the column names (left untouched); recordRows: the data part
    headerRow, recordRows = extractedList[0], extractedList[1:]
    # collapse runs of identical rows, then sort on completion time
    # (ties broken by latitude/longitude via date_compare)
    # NOTE(review): groupby only merges *adjacent* duplicates and the sort
    # happens afterwards, so non-adjacent duplicates survive -- confirm the
    # input file is pre-sorted if full deduplication is expected.
    groupByList = list(row for row, group in itertools.groupby(recordRows))
    groupByList.sort(date_compare)
    # convert time to datetime format and coordinates to decimal format
    for row in groupByList:
        row[NEEDED_COL.index(DATE)] = str(datetime.datetime.strptime(row[NEEDED_COL.index(DATE)][:-4], '%d%b%Y:%H:%M:%S'))
        row[NEEDED_COL.index(LONGITUDE)] = GotoDecimal(row[NEEDED_COL.index(LONGITUDE)], True)
        row[NEEDED_COL.index(LATITUDE)] = GotoDecimal(row[NEEDED_COL.index(LATITUDE)])
    # now add the header row back on top
    groupByList.insert(0, headerRow)
    # output file name: add a minute-precision timestamp, joined with '_'
    fileSplit = filename.split('.')
    fileSplit[0] = string.join([fileSplit[0].split('_')[-2], datetime.datetime.now().strftime("%Y%m%d%H%M")], '_')
    outFileName = ''.join([localPath, string.join(fileSplit, '.').split(os.sep)[-1]])
    # write in binary mode to avoid the extra newline character;
    # BUG FIX: close the output file so buffered rows are flushed to disk
    outFile = open(outFileName, "wb")
    try:
        csv.writer(outFile).writerows(groupByList)
    finally:
        outFile.close()
    reticLog.logInfo(loglist, "preprocessing %s is done." % filename.split(os.sep)[-1])
def getMsg(self, messages):
    """
    for TCEQ, every time, it passes in 2 message StringIO objects:
    messages[0] holds the pipe-delimited *event* rows,
    messages[1] holds the pipe-delimited *result* rows.

    Builds self.basinSegmentInfo, a hash table keyed by RFA tag id:
        key:   RFA tag id (column RFATAG_COLUMN)
        value: {EVENT_IN_HASHTable: event row,
                RESULT_IN_HASHTable: [result rows...]}
    Returns 0 on success, 1 on failure.
    """
    try:
        reticLog.logInfo(self.logList, "( " + self.name + " ) Retrieving message for sink : " + self.name)
        # Re-initialize to get a fresh message
        self.basinSegmentInfo = {}
        eventCSVList = csv.reader(messages[0], delimiter="|")
        resultCSVLIst = csv.reader(messages[1], delimiter="|")
        for row in eventCSVList:
            # this is for basinID 6, year 2010: those event rows carry three
            # leading empty columns that must be shifted off before indexing
            if (row[0] == "" and row[1] == "" and row[2] == ""):
                row = row[3:]
                print row
            # one hash-table entry per event row, keyed by its RFA tag
            self.basinSegmentInfo[row[RFATAG_COLUMN]] = {}
            self.basinSegmentInfo[row[RFATAG_COLUMN]][EVENT_IN_HASHTable] = row
        for row in resultCSVLIst:
            # append each result row to the list under its event's RFA tag
            # NOTE(review): assumes every result row's RFA tag already exists
            # from the event pass -- a missing tag raises KeyError here.
            if self.basinSegmentInfo[row[RFATAG_COLUMN]].has_key(RESULT_IN_HASHTable):
                self.basinSegmentInfo[row[RFATAG_COLUMN]][RESULT_IN_HASHTable].append(row)
            else:
                # if this is the first result row for this RFATAG
                self.basinSegmentInfo[row[RFATAG_COLUMN]][RESULT_IN_HASHTable] = [row]
        reticLog.logInfo(self.logList, "( " + self.name + " ) Message retrieved in sink : " + self.name)
        return 0
    except Exception, e:
        import traceback
        traceback.print_exc(file=sys.stdout)
        errorMessage = traceback.format_exception_only(sys.exc_info()[0], sys.exc_info()[1])[0]
        reticLog.logError(self.logList, "( " + self.name + " ) Error during message retrieval in sink : " + self.name)
        reticLog.logError(self.logList, "( " + self.name + " ) " + errorMessage)
        return 1
def start(self):
    'Start the source of the adaptor (begin work...)'
    try:
        reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Starting the source adaptor")
        self.getMsg()
        return 0
    except:
        # any failure here is fatal for the adaptor process
        excType, excValue = sys.exc_info()[0], sys.exc_info()[1]
        errorMessage = traceback.format_exception_only(excType, excValue)[0]
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Unknown error on start of source")
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + errorMessage)
        sys.exit(1)
def prepareUpdateObject(self, recordList,processInfo): reticLog.logInfo(self.logList, "( " + self.name + " ) prepareUpdateObject Start") #object list, to be add to session/real database objects = [] for record in recordList: #here add sites name here for time stamp comparison #not processed data, so we process here result = processInfo(record) for ob in result: objects.append(ob) reticLog.logInfo(self.logList, "( " + self.name + " ) prepareUpdateObject End") print "%d data value objects generated" % len(objects) return objects
def getMsg(self, message):
    """Store the incoming message on the sink. Returns 0 on success, 1 on error."""
    try:
        reticLog.logInfo(self.logList, "( " + self.name + " ) Retrieving message for sink : " + self.name)
        # Re-initialize msg before taking the new message
        self.msg = ''
        self.msg = message
        reticLog.logInfo(self.logList, "( " + self.name + " ) Message retrieved in sink : " + self.name)
        return 0
    except:
        excInfo = sys.exc_info()
        errorMessage = traceback.format_exception_only(excInfo[0], excInfo[1])[0]
        reticLog.logError(self.logList, "( " + self.name + " ) Error during message retrieval in sink : " + self.name)
        reticLog.logError(self.logList, "( " + self.name + " ) " + errorMessage)
        return 1
def getMsg(self, message):
    'Initializes input buffer with message content'
    try:
        reticLog.logInfo(self.logList, '(' + self.name + ') ' + "Getting message into pipe")
        # fresh input buffer, loaded with the message and rewound for reading
        buf = StringIO.StringIO()
        buf.write(message)
        buf.seek(0)
        self.InMsg = buf
        self.msg = ''
        return 0
    except:
        excInfo = sys.exc_info()
        errorMessage = traceback.format_exception_only(excInfo[0], excInfo[1])[0]
        reticLog.logError(self.logList, '(' + self.name + ') ' + "Error during message retrieval in pipe")
        reticLog.logError(self.logList, '(' + self.name + ') ' + errorMessage)
        return 1
def getRecordList(self):
    """Extraction of the fields and values to map to the SQL statement.

    Detects whether self.msg is XML (via the 'msgFormat' metadata hint, or
    by attempting to parse it) and returns a list of dictionaries, one per
    record element, mapping sub-element tags to their text.
    Raises Exception for flat (non-XML) input, which is unsupported.
    """
    reticLog.logInfo(self.logList, "( " + self.name + " ) getRecordList Start")
    msg = StringIO.StringIO()
    msg.write(self.msg)
    msg.seek(0)
    recordList = []
    msgFormat = ''
    xmlRoot = None
    # BUG FIX: the original tested has_key(msgFormat) with the empty
    # *variable*, so the metadata hint was never honored; test the literal
    # 'msgFormat' key instead.
    if self.metadata.has_key('msgFormat'):
        if self.metadata['msgFormat'] == 'xml':
            # BUG FIX: ElementTree(msg) treats msg as a root *element*;
            # parse the buffer with the file= keyword instead.
            xmlRoot = ElementTree(file=msg)
            msgFormat = 'xml'
        else:
            msgFormat = 'flat'
    if msgFormat == '':
        # no hint available: probe by attempting an XML parse
        try:
            xmlRoot = ElementTree(file=msg)
            msgFormat = 'xml'
        except:
            msgFormat = 'flat'
    reticLog.logDebug(self.logList, "Input format detected : " + msgFormat)
    if msgFormat == 'xml':
        # traverse the xml tree: grandchildren of the root become records,
        # their children become the record's field/value pairs
        treeIter = xmlRoot.getiterator()
        for element in treeIter:
            if element.getchildren():
                for child in element.getchildren():
                    prepRecord = {}
                    if child.getchildren():
                        for subChild in child.getchildren():
                            prepRecord[subChild.tag] = subChild.text
                        recordList.append(prepRecord)
        reticLog.logDebug(self.logList, "All records processed.")
    elif msgFormat == 'flat':
        raise Exception('Do not support flat file at this time')
    reticLog.logInfo(self.logList, "( " + self.name + " ) getRecordList End")
    return recordList
def connect(self, args):
    """Establish a connection with the database.

    args keys used: dbType, user, password, dsn, driverName.
    Builds a '<dbType>://<user>:<password>@<dsn>' engine string, imports the
    named DB-API driver module, and stores the engine on self.engine.
    Raises Exception on any failure after logging it.
    """
    try:
        reticLog.logInfo(self.logList, "Intitializing Database Connection : " + args['dsn'])
        # construct the connection string according to the parameters
        engineStr = string.join([args['dbType'],
                                 ''.join(['//', args['user']]),
                                 ''.join([args['password'], "@", args['dsn']])],
                                ':')
        dbDriverMod = __import__(args['driverName'])
        self.engine = create_engine(engineStr, module=dbDriverMod)
        reticLog.logInfo(self.logList, "DataBase Connection established")
    except:
        errorMessage = traceback.format_exception_only(sys.exc_info()[0], sys.exc_info()[1])[0]
        reticLog.logError(self.logList, "Database error : " + errorMessage)
        # BUG FIX: raising a plain string is illegal in Python >= 2.6;
        # raise a real exception object instead.
        raise Exception("Database Error")
def next(self):
    'Get the next message to be processed or return that sources are dry'
    # Returns 0 when the queue is empty, 1 after appending the fetched
    # response body/bodies to self.msg.
    if self.nbMsg == 0:
        return 0
    else:
        reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Messages Left on queue of adaptor : " + str(self.nbMsg))
        # No params are provided, processing raw URL. (without GET/POST request)
        if len(self.params) == 0:
            reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Retrieving message from : " + self.URL)
            try:
                request = urllib2.Request(self.URL)
                connection = urllib2.urlopen(request)
                self.msg.append(connection.read())
                reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Message retrieved on adaptor: " + self.name)
            except:
                errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0]
                reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Error on message retrieval on source : " + self.name)
                reticLog.logError(self.logList, '( ' + self.name + ' ) ' + errorMessage)
                if self.exitOnError.lower() == 'y':
                    sys.exit(1)
        else:
            self.msg = []
            for param in self.params:
                # Params are provided, processing URL passing them through GET method
                # There are as many calls as there are param lists
                paramLine = '?'
                for key in param.keys():
                    self.metadata[key] = param[key]
                # NOTE(review): self.metadata accumulates keys across param
                # lists, so later requests re-send all earlier params too --
                # confirm this accumulation is intended.
                paramLine = urlencode(self.metadata)
                reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Retrieving message from : " + self.URL+"?"
                                 + paramLine)
                successful = False
                while not successful:
                    try:
                        request = urllib2.Request("?".join([self.URL,paramLine]))
                        connection = urllib2.urlopen(request)
                        self.msg.append(connection.read())
                        reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Message retrieved on adaptor: " + self.name)
                    except:
                        errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0]
                        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Error on message retrieval on source : " + self.name)
                        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + errorMessage)
                        # NOTE(review): when exitOnError != 'y' this loop
                        # retries the same request forever (successful is
                        # only set in the else branch) -- verify that an
                        # endless retry is the intended behavior.
                        if self.exitOnError.lower() == 'y':
                            sys.exit(1)
                    else:
                        # request succeeded: leave the retry loop
                        successful = True
        return 1
def updateDB(self, objects): reticLog.logInfo(self.logList, "( " + self.name + " ) Starting update objects from sink : " + self.name) count = 0 for ob in objects : retries = self.retries execOk = 0 ob.ValueID = self.getMaxId('DV') try: where = and_(testTPWDmodel.DataValues.DataValue==ob.DataValue, testTPWDmodel.DataValues.LocalDateTime==ob.LocalDateTime, testTPWDmodel.DataValues.SiteID==ob.SiteID, testTPWDmodel.DataValues.VariableID==ob.VariableID) valueExist = self.session.query(testTPWDmodel.DataValues).filter(where).one() #this record does not exist,insert it except NoResultFound, e: while retries >= 0 and execOk == 0: try: ############ self.session.add(ob) self.session.flush() execOk = 1 #this is the handler for some violation of unique constriant on keys except exc.OperationalError: print "DB constraint violation happen" self.session.rollback() continue #execOk = 0 # retries = retries - 1 #this is the handler or invalid request error except exc.InvalidRequestError: print "DB constraint violation happen" self.session.rollback() continue #raise if execOk == 0 and retries < 0: raise "Database Exception : all retries failed" elif execOk == 1: print "recordNo == >", ob.ValueID, "generated" count += 1 else: errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] reticLog.logWarning(self.logList, "Database Update failed : " + errorMessage) #this record exists, skip it else: print "record skipped" continue
def commit(self):
    'Commit the current message treatment'
    # nothing queued: nothing to commit
    if self.nbMsg <= 0:
        return 1
    reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Commiting msg " + self.msgList[0] + " on source : " + self.name)
    msgName = self.msgList[0]
    try:
        # committing a file-source message means deleting the consumed file
        # and popping it off the queue
        shutil.os.remove(self.filePath + self.msgName)
        self.nbMsg = self.nbMsg - 1
        self.msgList = self.msgList[1:]
        self.msg = ''
        return 0
    except:
        excInfo = sys.exc_info()
        errorMessage = traceback.format_exception_only(excInfo[0], excInfo[1])[0]
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Error on commit phase on source - File : " + msgName)
        reticLog.logError(self.logList, '( ' + self.name + ' ) ' + errorMessage)
        return 1
def preprocess_batch(directoryName, loglist):
    """Find the folder under directoryName with the largest timestamp and
    preprocess each csv file inside it.

    Folder names are expected to look like '.../request_YYYYMMDD...'.
    """
    reticLog.logInfo(loglist, "Starting TPWD preprocessing procedure")
    import glob
    import datetime
    # get the folder with the largest timestamp
    max_datetime = None
    folderToProcess = None
    for name in glob.glob('%s/request_*' % directoryName):
        # parse the YYYYMMDD stamp once instead of re-splitting three times
        stamp = name.split('request')[1][1:]
        year, month, day = int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8])
        tempDateTime = datetime.datetime(year, month, day)
        if (not max_datetime) or (tempDateTime > max_datetime):
            max_datetime = tempDateTime
            folderToProcess = name
    # BUG FIX: guard the empty case -- the original logged 'None' and
    # globbed over the literal path 'None/*.csv' when no folder matched
    if folderToProcess is None:
        reticLog.logInfo(loglist, 'no request_* folder found under %s, nothing to preprocess' % directoryName)
        return
    # at this point, folderToProcess holds the folder with the latest timestamp
    reticLog.logInfo(loglist, 'preprocess folder: %s, with requested time %s' % (folderToProcess, str(max_datetime)))
    # so preprocess each file inside folderToProcess
    for name in glob.glob('%s/*.csv' % folderToProcess):
        preprocess(name, loglist)
def next(self):
    'Get the next message to be processed or return that sources are dry'
    # Returns 0 when the queue is empty (or on error with exitOnError='y'),
    # 1 after loading the next file's content into self.msg.
    if self.nbMsg == 0:
        reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Sources dry on source ")
        return 0
    else:
        try:
            reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Messages Left on queue of adaptor : " + str(self.nbMsg))
            reticLog.logInfo(self.logList, '( ' + self.name + ' ) ' + "Retrieving file : " + self.msgList[0])
            self.msgName = self.msgList[0]
            # split the file name into base name and extension metadata
            dotIndex = string.find(self.msgName, '.')
            if dotIndex > 0:
                if (not self.metadata.has_key('filename')):
                    self.metadata['filename'] = []
                self.metadata['filename'].append(string.split(self.msgName, '.')[0])
                # NOTE(review): for names with several dots this takes only
                # the first segment after the base name, not the last
                # extension -- confirm that is intended.
                self.metadata['extension'] = string.split(self.msgName, '.')[1]
            else:
                # no extension present
                # NOTE(review): this branch *replaces* metadata['filename']
                # with a plain string while the branch above appends to a
                # list -- callers see inconsistent types; verify.
                if (not self.metadata.has_key('filename')):
                    self.metadata['filename'] = []
                self.metadata['filename'] = self.msgName
                self.metadata['extension'] = ''
            # file reading happens here: probe the file once to decide
            # between text ('r') and binary ('rb') mode, then re-open
            fp = open(os.path.join(self.filePath, self.msgList[0]))
            if reticUtils.istext(fp):
                fp.close()
                fp = open(os.path.join(self.filePath, self.msgList[0]), 'r')
            else:
                fp.close()
                fp = open(os.path.join(self.filePath, self.msgList[0]), 'rb')
            self.msg = fp.read()
            fp.close()
            return 1
        except:
            errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0]
            reticLog.logError(self.logList, '( ' + self.name + ' ) ' + "Error on message retrieval on source : " + self.name)
            reticLog.logError(self.logList, '( ' + self.name + ' ) ' + errorMessage)
            if self.exitOnError.lower() == 'y':
                return 0
            else:
                return 1
def start(self):
    """Start the source of the adaptor (begin work...)"""
    # log the startup, then immediately pull the first message batch
    prefix = '( ' + self.name + ' ) '
    reticLog.logInfo(self.logList, prefix + "Starting the http source adaptor")
    self.getMsg()
    return 0
def updateDB(self,methodLookUpfile): reticLog.logInfo(self.logList, "( " + self.name + " ) Starting update objects from sink : " + self.name) count = 0 for key in self.basinSegmentInfo.keys(): #need this because for some years (eg: 1973), there is no data collected in result file if self.basinSegmentInfo[key].has_key(RESULT_IN_HASHTable) and len(self.basinSegmentInfo[key][RESULT_IN_HASHTable]) >= VALUE : for resultRow in self.basinSegmentInfo[key][RESULT_IN_HASHTable]: retries = self.retries execOk = 0 try: import datetime ValueID = self.getMaxId('DV') DataValue = float(resultRow[VALUE]) LocalDateTime = datetime.datetime.strptime(" ".join([self.basinSegmentInfo[key][EVENT_IN_HASHTable][DATE], self.basinSegmentInfo[key][EVENT_IN_HASHTable][TIME]]), "%m/%d/%Y %H:%M") SiteID = self.lookUpSite(self.basinSegmentInfo[key][EVENT_IN_HASHTable][SITECODE]) VariableID = self.lookUpVariableID(resultRow[VARIABLECODE]) if not self.basinSegmentInfo[key][EVENT_IN_HASHTable][OFFSETDEPTH] == "": OffsetValue = float(self.basinSegmentInfo[key][EVENT_IN_HASHTable][OFFSETDEPTH]) else: OffsetValue = float(-9999) CensorCode = u'nc' #find method id import anydbm methodDBMfile = anydbm.open(methodLookUpfile, 'r') MethodDescription = methodDBMfile[resultRow[VARIABLECODE]] MethodID = self.lookUpMethodID(MethodDescription) #for production databse: #SourceID = 1 where = and_(DataValues.DataValue == DataValue, DataValues.LocalDateTime== LocalDateTime, DataValues.SiteID== SiteID, DataValues.VariableID == VariableID, DataValues.OffsetValue == OffsetValue, DataValues.MethodID == MethodID) valueExist = self.session.query(DataValues).filter(where).one() #this DataValue record does not exist,insert it except NoResultFound, e: while retries >= 0 and execOk == 0: try: ############ newDataValueRecord = DataValues(ValueID,DataValue,LocalDateTime,SiteID,VariableID,OffsetValue,MethodID) self.session.add(newDataValueRecord) self.session.flush() execOk = 1 #this is the handler for some violation of unique 
constriant on keys except exc.OperationalError: print "DB constraint violation happen" self.session.rollback() #execOk = 0 # retries = retries - 1 #this is the handler or invalid request error except exc.InvalidRequestError: print "DB constraint violation happen" self.session.rollback() #raise if execOk == 0 and retries < 0: raise "Database Exception : all retries failed" elif execOk == 1: print "recordNo == >", newDataValueRecord.ValueID, "generated" count += 1 else: errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] reticLog.logWarning(self.logList, "Database Update failed : " + errorMessage) except TCEQRecordNotFoundError, e: errorMessage = str(e) reticLog.logWarning(self.logList, "Database Update failed : " + errorMessage) #this record exists, skip it except Exception, e: traceback.print_exc(file=sys.stdout) print resultRow raise else: print "record found, need to skip this record (may be wrong behavior....)"
count += 1 else: errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] reticLog.logWarning(self.logList, "Database Update failed : " + errorMessage) except TCEQRecordNotFoundError, e: errorMessage = str(e) reticLog.logWarning(self.logList, "Database Update failed : " + errorMessage) #this record exists, skip it except Exception, e: traceback.print_exc(file=sys.stdout) print resultRow raise else: print "record found, need to skip this record (may be wrong behavior....)" #end big for loop reticLog.logInfo(self.logList, "( " + self.name + " ) Number DB record (%d) added : " % count + self.name) reticLog.logInfo(self.logList, "( " + self.name + " ) Update of Databases ended in sink : " + self.name) #unit of work pattern, only commit one time try: self.session.commit() #self.session.close() reticLog.logInfo(self.logList, "( " + self.name + " ) Update commited") except: self.session.rollback() #self.session.close() reticLog.logWarning(self.logList, "Commit Failed in SQLSink") #auto generate new record for new DataValues table record def getMaxId(self,tabFlag): maxid = 0
def main(): session = initDB() #logger setup. Here, simply set a consloe logger logAttDic = {'name': 'TCEQ sites and parameters importing for the first time', 'level': 'DEBUG', 'format':'Simple', 'handler':'ConsoleAppender'} logList = [] reticLog.addLogger(logList,logAttDic) #get sites list (a text file) from an HTTPSource # and insert all the sites into the "Sites" table of ODM database siteSrc_args = {} siteSrc_args['name'] = "TCEQ sites httpsource" siteSrc_args['URL'] = "ftp://ftp.tceq.state.tx.us/pub/WaterResourceManagement/WaterQuality/DataCollection/CleanRivers/public/stations.txt" sitesHTTPSource = HTTPSource.source(siteSrc_args,logList) sitesHTTPSource.start() while(sitesHTTPSource.next()==1): print "Content of this URL: %s" % sitesHTTPSource.URL sitesFile = StringIO(sitesHTTPSource.msg[0]) sitesListReader = csv.reader(sitesFile, delimiter='|') for index,row in enumerate(sitesListReader): if index == 0: continue try: newRecordSiteName = row[SITENAME] if len(row[SITENAME]) <= 255 else row[SITENAME][0:255] where = and_(Sites.SiteCode == unicode(row[SITECODE]) ,Sites.Latitude == float(row[LATITUDE]) ,Sites.Longitude == float(row[LONGITUDE])) valueExist = session.query(Sites).filter(where).one() print "find record with SiteCode %s in database, skip it..." 
% row[SITECODE] continue #this site record does not exist, so insert it except NoResultFound, e: # This is for system robust # retries is max number of insertion times, and execOk is to show whether update is successful retries,execOk = 5,0 newSiteRecord = Sites(row[SITECODE],newRecordSiteName, float(row[LATITUDE]),float(row[LONGITUDE]),row[COUNTY], \ ";".join(["HUC 8 = ",row[HUC],"EPA_Type1 = ",row[TYPE1],"EPA_Type2 = ",row[TYPE2]])) while retries >= 0 and execOk == 0: try: ############ newSiteRecord.SiteID = getMaxId(session,"Sites") session.add(newSiteRecord) session.flush() execOk = 1 #this is the handler for some violation of unique constriant on keys except exc.OperationalError: print "DB constraint violation happen" session.rollback() retries = retries - 1 continue #this is the handler or invalid request error except exc.InvalidRequestError: print "DB constraint violation happen" session.rollback() retries = retries - 1 continue if execOk == 0 and retries < 0: raise "Database Exception : all retries failed" elif execOk == 1: print "inert new Site record with SiteCode ==> %s" % row[SITECODE] else: errorMessage = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] reticLog.logWarning(logList, "Sites Table in Database Update failed : " + errorMessage) #unit of work pattern, only commit one time try: session.commit() reticLog.logInfo(logList, "( " + "TCEQ Sites" + " ) Update commited") except: session.rollback() reticLog.logWarning(logList, "Commit Failed in SQLSink") sitesHTTPSource.commit()