def log_job_error(msg):
    # function to print any errors that are encountered during parallel execution
    tqdm.write(' -- There were unhandled errors during this batch. '
               'Please check errors_pyArchiveService.log for details')

    file_append('errors_pyArchiveService.log',
                'ON ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                ' an unhandled error occurred:\n' +
                msg + '\n' +
                'END OF ERROR =================== \n\n')
def gamit_callback(job):

    result = job.result

    if result is not None:
        msg = []
        if 'error' not in result.keys():
            if result['nrms'] > 1:
                msg.append('    > NRMS > 1.0 (%.3f)' % result['nrms'])

            if result['wl'] < 60:
                msg.append('    > WL fixed < 60 (%.1f)' % result['wl'])

            if result['missing']:
                msg.append('    > Missing sites in solution: ' + ', '.join(result['missing']))

            # DDG: only show sessions with problems to facilitate debugging.
            if result['success']:
                if len(msg) > 0:
                    tqdm.write(' -- %s Done processing: %s -> WARNINGS:\n%s'
                               % (print_datetime(), result['session'], '\n'.join(msg)))

                # insert information in gamit_stats
                try:
                    cnn = dbConnection.Cnn('gnss_data.cfg')  # type: dbConnection.Cnn
                    cnn.insert('gamit_stats', result)
                    cnn.close()
                except dbConnection.dbErrInsert as e:
                    tqdm.write(' -- %s Error while inserting GAMIT stat for %s: %s'
                               % (print_datetime(), result['session'], str(e)))
            else:
                tqdm.write(' -- %s Done processing: %s -> FATAL:\n'
                           '    > Failed to complete. Check monitor.log:\n%s'
                           % (print_datetime(), result['session'],
                              indent('\n'.join(result['fatals']), 4)))

                # write FATAL to file
                file_append('FATAL.log',
                            'ON %s session %s -> FATAL: Failed to complete. Check monitor.log\n%s\n'
                            % (print_datetime(), result['session'],
                               indent('\n'.join(result['fatals']), 4)))
        else:
            tqdm.write(' -- %s Error in session %s message from node follows -> \n%s'
                       % (print_datetime(), result['session'], result['error']))
    else:
        tqdm.write(' -- %s Fatal error on node %s message from node follows -> \n%s'
                   % (print_datetime(), job.ip_addr, job.exception))
def write_error(folder, filename, msg):
    # @todo why retries are used?
    # do append just in case...
    count = 0
    while True:
        try:
            file_append(os.path.join(folder, filename), msg)
            return
        except IOError as e:
            if count < 3:
                count += 1
            else:
                raise IOError(str(e) + ' after 3 retries')
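# Illustrative sketch, not part of the original module: the retry loop in write_error() could be
# generalized into a helper that pauses briefly between attempts. The name append_with_retries and
# the delay value are hypothetical; only the standard library (time) and the existing file_append()
# are used.
import time

def append_with_retries(path, msg, retries=3, delay=0.5):
    for attempt in range(retries + 1):
        try:
            file_append(path, msg)
            return
        except IOError as e:
            if attempt == retries:
                raise IOError(str(e) + ' after %i retries' % retries)
            time.sleep(delay)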
def output_handle(callback):
    # function to print any errors that are encountered during parallel execution
    messages = [outmsg.errors for outmsg in callback]

    if len([out_msg for out_msg in messages if out_msg]) > 0:
        tqdm.write(' >> There were unhandled errors during this batch. '
                   'Please check errors_amend.log for details')

    for msg in messages:
        if msg:
            file_append('errors_amend.log',
                        'ON ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                        ' an unhandled error occurred:\n' +
                        msg + '\n' +
                        'END OF ERROR =================== \n\n')

    return []
def finish(self):
    try:
        # delete everything inside the processing dir
        shutil.rmtree(self.pwd_brdc)
        shutil.rmtree(self.pwd_igs)

        # remove files in tables
        for ftype in ('*.grid', '*.dat', '*.apr'):
            for ff in glob.glob(os.path.join(self.pwd_tables, ftype)):
                os.remove(ff)

        # remove processing files
        for ftype in ('b*', 'cfmrg*', 'DPH.*', 'eq_rename.*', 'g*', 'k*', 'p*', 'rcvant.*', 'y*'):
            for ff in glob.glob(os.path.join(os.path.join(self.pwd, self.date.ddd()), ftype)):
                os.remove(ff)

        try:
            if not os.path.exists(os.path.dirname(self.solution_pwd)):
                os.makedirs(os.path.dirname(self.solution_pwd))
        except OSError:
            # race condition: several processes may try to create the same folder
            # if OSError occurs, ignore and continue
            pass

        # the solution folder exists because it was created by GamitSession to start the processing.
        # erase it to upload the result
        if os.path.exists(self.solution_pwd):
            shutil.rmtree(self.solution_pwd)

        # execute final step: copy to self.solution_pwd
        shutil.copytree(self.pwd, self.solution_pwd, symlinks=True)
        # remove the remote pwd
        shutil.rmtree(self.pwd)

    except:
        msg = traceback.format_exc() + '\nProcessing %s date %s on node %s' \
              % (self.params['NetName'], self.date.yyyyddd(), platform.node())

        file_append(os.path.join(self.pwd, 'monitor.log'),
                    now_str() + ' -> ERROR in pyGamitTask.finish()\n%s' % msg)
def output_handle(job):

    if job.result is not None:
        apr      = job.result[0]
        stninfo  = job.result[1]
        log      = job.result[2]
        metafile = job.result[3]

        # write the APR and sigmas
        # written in ENU, not NEU, as specified by Abel
        meta_path = './' + metafile

        if apr is not None:
            file_append(meta_path + '.apr', apr + '\n')

        if stninfo is not None:
            file_append(meta_path + '.info', stninfo + '\n')

        # write a log line for debugging
        if log is not None:
            file_append(meta_path + '.log', log + '\n')

    elif job.exception:
        tqdm.write(' -- There were unhandled errors during this batch: ' + job.exception)
def pull_rinex(cnn, date, Config, JobServer):

    # before starting the sync, determine if there were any station code changes that will require
    # file deletions in AWS.
    # Join aws_sync with stations. If an entry in aws_sync has no record in stations, the station was
    # renamed and needs to be deleted. It will be resent in this run.
    rs = cnn.query('SELECT a."NetworkCode", a."StationCode", a."StationAlias" FROM aws_sync as a '
                   'LEFT JOIN stations as s on '
                   'a."NetworkCode" = s."NetworkCode" and '
                   'a."StationCode" = s."StationCode" '
                   'WHERE "Year" = %i AND "DOY" = %i AND s."StationCode" IS NULL'
                   % (date.year, date.doy))

    deletes = rs.dictresult()

    for stn in deletes:
        # produce a single file with the deletions that need to occur in the AWS
        file_append('file_ops.log',
                    'rm %s/%s* # %s.%s not found in stations table with net.stn code declared in aws_sync\n'
                    % (date.yyyyddd().replace(' ', '/'), stn['StationAlias'],
                       stn['NetworkCode'], stn['StationCode']))

        # delete the records from aws_sync
        cnn.query('DELETE FROM aws_sync WHERE "Year" = %i AND "DOY" = %i AND "NetworkCode" = \'%s\' AND '
                  '"StationCode" = \'%s\''
                  % (date.year, date.doy, stn['NetworkCode'], stn['StationCode']))

    # Join aws_sync with stationalias (stationalias is FK-ed to stations).
    # If an entry in aws_sync that has StationCode <> StationAlias has no record in stationalias OR
    # the stationalias declared is different than the station alias in aws_sync, delete from AWS.
    # It will be resent in this batch.
    rs = cnn.query('SELECT a."NetworkCode", a."StationCode", a."StationAlias" FROM aws_sync as a '
                   'LEFT JOIN stationalias as sa on '
                   'a."NetworkCode" = sa."NetworkCode" and '
                   'a."StationCode" = sa."StationCode" '
                   'WHERE "Year" = %i AND "DOY" = %i AND '
                   'a."StationAlias" <> sa."StationAlias" OR '
                   '(sa."StationAlias" IS NULL AND a."StationCode" <> a."StationAlias")'
                   % (date.year, date.doy))

    deletes = rs.dictresult()

    for stn in deletes:
        # produce a single file with the deletions that need to occur in the AWS
        file_append('file_ops.log',
                    'rm %s/%s* # alias declared in aws_sync for %s.%s does not match alias in stationalias table\n'
                    % (date.yyyyddd().replace(' ', '/'), stn['StationAlias'],
                       stn['NetworkCode'], stn['StationCode']))

        # delete the records from aws_sync
        cnn.query('DELETE FROM aws_sync WHERE "Year" = %i AND "DOY" = %i AND "NetworkCode" = \'%s\' AND '
                  '"StationCode" = \'%s\''
                  % (date.year, date.doy, stn['NetworkCode'], stn['StationCode']))

    # check the individual files for this day. All files reported as uploaded should have a match in the
    # rinex_proc table, otherwise this could be a station split or deletion. If that's the case, order
    # their deletion from the AWS.
    rs = cnn.query('SELECT a."NetworkCode", a."StationCode", a."StationAlias" FROM aws_sync as a '
                   'LEFT JOIN rinex_proc as rx on '
                   'a."NetworkCode" = rx."NetworkCode" and '
                   'a."StationCode" = rx."StationCode" and '
                   'a."Year" = rx."ObservationYear" and '
                   'a."DOY" = rx."ObservationDOY" '
                   'WHERE "Year" = %i AND "DOY" = %i AND '
                   'rx."StationCode" IS NULL '
                   % (date.year, date.doy))

    deletes = rs.dictresult()

    for stn in deletes:
        # produce a single file with the deletions that need to occur in the AWS
        file_append('file_ops.log',
                    'rm %s/%s* # rinex file for %s.%s could not be found in the rinex_proc table\n'
                    % (date.yyyyddd().replace(' ', '/'), stn['StationAlias'],
                       stn['NetworkCode'], stn['StationCode']))

        # delete the records from aws_sync
        cnn.query('DELETE FROM aws_sync WHERE "Year" = %i AND "DOY" = %i AND "NetworkCode" = \'%s\' AND '
                  '"StationCode" = \'%s\''
                  % (date.year, date.doy, stn['NetworkCode'], stn['StationCode']))

    ####################################################################################################################
    # continue with sync of files
    ####################################################################################################################

    # behavior requested by Abel: ALWAYS output the metadata but don't output a RINEX if already synced.
    rs = cnn.query('SELECT rinex_proc.* FROM rinex_proc '
                   'WHERE "ObservationYear" = %i AND "ObservationDOY" = %i AND "Completion" >= 0.3'
                   % (date.year, date.doy))

    rinex = rs.dictresult()

    pbar = tqdm(total=len(rinex), ncols=80)

    metafile = date.yyyy() + '/' + date.ddd() + '/' + date.yyyyddd().replace(' ', '-')

    date_subpath = date.yyyy() + '/' + date.ddd()
    date_path = './' + date_subpath

    # following Abel's request, make a subdir for the files
    lele_path = '/media/leleiona/aws-files/' + date_subpath

    for p in (date_path, lele_path):
        if not os.path.isdir(p):
            os.makedirs(p)

    # write the header to the .info file
    file_write('./' + metafile + '.info',
               '*SITE Station Name Session Start Session Stop Ant Ht HtCod Ant N Ant E '
               'Receiver Type Vers SwVer Receiver SN Antenna Type Dome '
               'Antenna SN \n')

    modules = ('dbConnection', 'pyETM', 'pyDate', 'pyRinex', 'pyStationInfo', 'pyOptions',
               'pyArchiveStruct', 'os', 'numpy', 'traceback', 'platform', 'Utils', 'shutil')

    depfuncs = (window_rinex, sigmas_neu2xyz)

    JobServer.create_cluster(rinex_task, depfuncs, output_handle, pbar, modules=modules)

    for rnx in rinex:
        JobServer.submit(rnx['NetworkCode'], rnx['StationCode'], date, rnx['ObservationFYear'], metafile)

    JobServer.wait()

    pbar.close()

    JobServer.close_cluster()

    print('Done, chau!')
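# Illustrative sketch, not part of the original module: the three "log an rm command and drop the
# aws_sync record" blocks inside pull_rinex() repeat the same pattern and could be factored into a
# helper like the hypothetical one below, built only from calls already used above (file_append and
# cnn.query).
def _order_aws_delete(cnn, date, stn, reason):
    # produce a single file with the deletions that need to occur in the AWS
    file_append('file_ops.log',
                'rm %s/%s* # %s\n'
                % (date.yyyyddd().replace(' ', '/'), stn['StationAlias'], reason))

    # delete the record from aws_sync so the file is resent in this run
    cnn.query('DELETE FROM aws_sync WHERE "Year" = %i AND "DOY" = %i AND "NetworkCode" = \'%s\' AND '
              '"StationCode" = \'%s\''
              % (date.year, date.doy, stn['NetworkCode'], stn['StationCode']))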
def start(self, dirname, year, doy, dry_run=False):

    monitor_open = False

    try:
        # copy the folder created by GamitSession in the solution_pwd to the remote_pwd (pwd)
        try:
            if not os.path.exists(os.path.dirname(self.pwd)):
                os.makedirs(os.path.dirname(self.pwd))
        except OSError:
            # race condition: several processes may try to create the same folder
            # if OSError occurs, ignore and continue
            pass

        # if the local folder exists (due to previous incomplete processing), erase it
        if os.path.exists(self.pwd):
            shutil.rmtree(self.pwd)

        # ready to copy the shared solution_dir to pwd
        shutil.copytree(self.solution_pwd, self.pwd, symlinks=True)

        with file_open(os.path.join(self.pwd, 'monitor.log'), 'a') as monitor:

            monitor_open = True

            def log(s):
                monitor.write(now_str() + ' -> ' + s + '\n')

            log('%s %i %i executing on %s' % (dirname, year, doy, platform.node()))

            log('fetching orbits')

            try:
                Sp3 = pySp3.GetSp3Orbits(self.orbits['sp3_path'], self.date, self.orbits['sp3types'],
                                         self.pwd_igs, True)  # type: pySp3.GetSp3Orbits

            except pySp3.pySp3Exception:
                log('could not find principal orbits, fetching alternative')

                # try alternative orbits
                if self.options['sp3altrn']:
                    Sp3 = pySp3.GetSp3Orbits(self.orbits['sp3_path'], self.date, self.orbits['sp3altrn'],
                                             self.pwd_igs, True)  # type: pySp3.GetSp3Orbits
                else:
                    raise

            if Sp3.type != 'igs':
                # rename file
                shutil.copyfile(Sp3.file_path, Sp3.file_path.replace(Sp3.type, 'igs'))

            log('fetching broadcast orbits')

            pyBrdc.GetBrdcOrbits(self.orbits['brdc_path'], self.date, self.pwd_brdc,
                                 no_cleanup=True)  # type: pyBrdc.GetBrdcOrbits

            for rinex in self.params['rinex']:

                log('fetching rinex for %s %s %s %s'
                    % (stationID(rinex), rinex['StationAlias'],
                       '{:10.6f} {:11.6f}'.format(rinex['lat'], rinex['lon']),
                       'tie' if rinex['is_tie'] else ''))

                try:
                    with pyRinex.ReadRinex(rinex['NetworkCode'], rinex['StationCode'],
                                           rinex['source'], False) as Rinex:  # type: pyRinex.ReadRinex

                        # WARNING! some multiday RINEX were generating conflicts because the RINEX has a name,
                        # say, tuc12302.10o and the program wants to rename it as tuc12030.10o but because it's
                        # a multiday file, during __init__ it's already split and renamed as tuc12300.10o and
                        # additional folders are generated with the information for each file. Therefore, find
                        # the rinex that corresponds to the date being processed and use that one instead of the
                        # original file. These files are not allowed by pyArchiveService, but the "start point"
                        # of the database (i.e. the files already in the folders read by pyScanArchive) has such
                        # problems.

                        # figure out if this station has been affected by an earthquake
                        # if so, window the data
                        if rinex['jump'] is not None:
                            monitor.write('    -> RINEX file has been windowed: ETM detected jump on ' +
                                          rinex['jump'].datetime().strftime('%Y-%m-%d %H:%M:%S') + '\n')

                        if Rinex.multiday:
                            # find the rinex that corresponds to the session being processed
                            for Rnx in Rinex.multiday_rnx_list:
                                if Rnx.date == self.date:
                                    Rnx.rename(rinex['destiny'])

                                    if rinex['jump'] is not None:
                                        self.window_rinex(Rnx, rinex['jump'])
                                    # before creating local copy, decimate file
                                    Rnx.decimate(30)
                                    Rnx.purge_comments()
                                    Rnx.compress_local_copyto(self.pwd_rinex)
                                    break
                        else:
                            Rinex.rename(rinex['destiny'])

                            if rinex['jump'] is not None:
                                self.window_rinex(Rinex, rinex['jump'])
                            # before creating local copy, decimate file
                            Rinex.decimate(30)
                            Rinex.purge_comments()
                            Rinex.compress_local_copyto(self.pwd_rinex)

                except (OSError, IOError):
                    log('An error occurred while trying to copy ' + rinex['source'] +
                        ' to ' + rinex['destiny'] + ': File skipped.')

                except (pyRinex.pyRinexException, Exception) as e:
                    log('An error occurred while trying to copy ' + rinex['source'] + ': ' + str(e))

            log('executing GAMIT')

            # create the run script
            self.create_replace_links()
            self.create_run_script()
            self.create_finish_script()

        # run the script to replace the links of the tables directory
        self.p = subprocess.Popen('find ./tables ! -name "otl.grid" -type l -exec ./replace_links.sh {} +',
                                  shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.pwd)
        _, _ = self.p.communicate()

        # now execute the run script
        if not dry_run:
            self.p = subprocess.Popen('./run.sh', shell=False, stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE, cwd=self.pwd)
            self.stdout, self.stderr = self.p.communicate()

            self.p = subprocess.Popen('./finish.sh', shell=False, stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE, cwd=self.pwd)
            self.stdout, self.stderr = self.p.communicate()

            # check for any fatals
            self.p = subprocess.Popen('grep -q \'FATAL\' monitor.log', shell=True,
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.pwd)
            _, _ = self.p.communicate()

            self.success = (self.p.returncode != 0)

        # output statistics to the parent to display
        result = self.parse_monitor(self.success)

        file_append(os.path.join(self.pwd, 'monitor.log'),
                    now_str() + ' -> return to Parallel.GAMIT\n')

        # no matter the result of the processing, move folder to final destination
        if not dry_run:
            self.finish()

        return result

    except:
        msg = traceback.format_exc() + '\nProcessing %s date %s on node %s' \
              % (self.params['NetName'], self.date.yyyyddd(), platform.node())

        # DDG: do not attempt to write to monitor.log or do any file operations (maybe permission problem)
        # problem might occur during copytree or rmtree or some other operation before opening monitor.log
        if monitor_open:
            file_append(os.path.join(self.pwd, 'monitor.log'),
                        now_str() + ' -> ERROR in pyGamitTask.start()\n%s' % msg)

            # the solution folder exists because it was created by GamitSession to start the processing.
            # erase it to upload the result
            if os.path.exists(self.solution_pwd):
                shutil.rmtree(self.solution_pwd)

            # execute final error step: copy to self.solution_pwd
            shutil.copytree(self.pwd, self.solution_pwd, symlinks=True)
            # remove the remote pwd
            shutil.rmtree(self.pwd)

            # output statistics to the parent to display
            result = self.parse_monitor(False)
        else:
            result = {'session'             : '%s %s' % (self.date.yyyyddd(), self.params['DirName']),
                      'Project'             : self.params['NetName'],
                      'subnet'              : self.params['subnet'],
                      'Year'                : self.date.year,
                      'DOY'                 : self.date.doy,
                      'FYear'               : self.date.fyear,
                      'wl'                  : 0,
                      'nl'                  : 0,
                      'nrms'                : 0,
                      'relaxed_constrains'  : '',
                      'max_overconstrained' : '',
                      'node'                : platform.node(),
                      'execution_time'      : 0,
                      'execution_date'      : 0,
                      'missing'             : '',
                      'success'             : False,
                      'fatals'              : []
                      }

        result['error'] = msg

        # return useful information to the main node
        return result
def debug(s):
    if DEBUG:
        file_append('/tmp/db.log', "DB: %s\n" % s)
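# Illustrative sketch, not part of the original module: the same conditional trace output could be
# routed through the standard logging module instead of appending to /tmp/db.log directly. The logger
# name 'parallel_gamit.db' and the function name debug_logging are hypothetical.
import logging

_db_logger = logging.getLogger('parallel_gamit.db')

def debug_logging(s):
    # mirrors debug() above; handlers and levels configured elsewhere decide where the message goes
    if DEBUG:
        _db_logger.debug("DB: %s", s)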