def main():
    delay = 0.5  # First iteration will set delay=1 or multiply by 2
    while True:
        try:
            Downloader.status()
            if Downloader.run():
                # Files were successfully downloaded; reset the backoff
                delay = 1
            else:
                # No files successfully downloaded this iteration;
                # double the sleep multiplier, capped at 32x
                delay = min((delay*2, 32))
        except Exception, e:
            if config.email.send_on_crash:
                msg = '*** Downloader has crashed! ***\n\n'
                msg += 'Fatal error occurred while running downloader: %s\n\n' % str(e)
                msg += ''.join(traceback.format_exception(*sys.exc_info()))
                notification = mailer.ErrorMailer(msg, subject="Downloader crash!")
                notification.send()
            sys.stderr.write("Fatal error occurred!\n")
            raise
        print "Will sleep for %d seconds" % (config.background.sleep*delay)
        time.sleep(config.background.sleep*delay)
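# A quick standalone check of the backoff schedule above (a sketch, not part
# of the pipeline): each empty run doubles the delay multiplier, capped at 32,
# so the sleep time grows 1x, 2x, 4x, ... 32x of config.background.sleep.
delay = 1
for _ in range(7):
    print delay,        # prints: 1 2 4 8 16 32 32
    delay = min((delay*2, 32))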
def get_output_dir(fns):
    """Given a list of data files, 'fns', generate path to output results.

        path is: {base_results_directory}/{mjd}/{obs_name}/{beam_num}/{proc_date}/
        Note: 'base_results_directory' is defined in the processing config file.
              'mjd', 'obs_name', and 'beam_num' are from parsing the job's datafiles.
              'proc_date' is the current date in yymmddThhmmss format.
    """
    # Get info from datafile headers
    data = datafile.autogen_dataobj([fns[0]])
    if not isinstance(data, datafile.PsrfitsData):
        errormsg = "Data must be of PSRFITS format.\n"
        errormsg += "\tData type: %s\n" % type(data)
        raise pipeline_utils.PipelineError(errormsg)

    # Generate output directory
    mjd = int(data.timestamp_mjd)
    beam_num = data.beam_id
    obs_name = data.obs_name
    proc_date = datetime.datetime.now().strftime('%y%m%dT%H%M%S')
    baseoutdir = os.path.join(config.processing.base_results_directory, \
                              str(mjd), str(obs_name), \
                              str(beam_num), proc_date)
    outdir = baseoutdir

    # Make sure our output directory doesn't already exist
    counter = 0
    while os.path.exists(outdir):
        counter += 1
        outdir = "%s_%d" % (baseoutdir, counter)

    # Make the directory immediately so the pipeline knows it's taken
    os.makedirs(outdir)

    # Send an email if our first choice for outdir wasn't available
    if counter:
        errormsg = "The first-choice output directory '%s' " \
                   "already existed. Had to settle for '%s' " \
                   "after %d tries.\n\n" \
                   "Data files:\n" \
                   "\t%s" % (baseoutdir, outdir, counter, "\n\t".join(fns))
        notification = mailer.ErrorMailer(errormsg, \
                            subject="Job outdir existence warning")
        notification.send()
    return outdir
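# For illustration, a hypothetical call (the filename, beam number, and
# base_results_directory below are made-up values; the real header parsing
# is done by datafile.autogen_dataobj):
#
#   outdir = get_output_dir(['p2030_55370_B0531+21_b3_s0_0001.fits'])
#   # With base_results_directory = '/results', this might return:
#   #   /results/55370/B0531+21/3/110215T093012
#   # or, if that path already existed when the job was submitted:
#   #   /results/55370/B0531+21/3/110215T093012_1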
def main():
    while True:
        try:
            JobUploader.run()
        except Exception, e:
            if config.email.send_on_crash:
                msg = '*** Uploader has crashed! ***\n\n'
                msg += 'Fatal error occurred while running job uploader: %s\n\n' % str(e)
                msg += ''.join(traceback.format_exception(*sys.exc_info()))
                notification = mailer.ErrorMailer(msg, subject="Uploader crash!")
                notification.send()
            sys.stderr.write("Fatal error occurred!\n")
            raise
        time.sleep(config.background.sleep)
def main():
    while True:
        # The rotate function changes/updates job states and submits
        # jobs that were created
        try:
            job.status()
            job.rotate()
        except Exception, e:
            if config.email.send_on_crash:
                msg = '*** Job pooler has crashed! ***\n\n'
                msg += 'Fatal error occurred while running job pool: %s\n\n' % str(e)
                msg += ''.join(traceback.format_exception(*sys.exc_info()))
                notification = mailer.ErrorMailer(msg, subject="Job Pooler crash!")
                notification.send()
            sys.stderr.write("Fatal error occurred!\n")
            raise
        time.sleep(config.background.sleep)
def download(self, ftp_file_path):
    try_counter = 0
    login = False
    while not login:
        try_counter += 1
        try:
            myFtp = self.login(self.connect())
            login = True
        except (CornellFTPConnectionError, CornellFTPLoginError), e:
            print str(e)
            if try_counter > 7:
                try:
                    notification = mailer.ErrorMailer(
                            'CornellFTP login failure, retried %u times: %s' %
                            (try_counter, str(e)))
                    notification.send()
                except Exception:
                    # Don't let a mailer failure break the retry loop
                    pass
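# A possible variant (a sketch, not the pipeline's code): sleeping between
# attempts and giving up after a fixed number of tries avoids hammering the
# FTP server and sending an email on every failed attempt past the seventh.
# It assumes the same self.connect()/self.login() interface and exception
# classes used above; the name login_with_retries is hypothetical.
def login_with_retries(self, max_tries=7, base_delay=2):
    for attempt in range(1, max_tries + 1):
        try:
            return self.login(self.connect())
        except (CornellFTPConnectionError, CornellFTPLoginError), e:
            print str(e)
            # Exponential backoff between attempts, capped at 5 minutes
            time.sleep(min(base_delay * 2**attempt, 300))
    raise CornellFTPLoginError("Giving up after %d login attempts" % max_tries)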
def main():
    while True:
        # The rotate function changes/updates job states and submits
        # jobs that were created
        try:
            # Refresh job states from the queue before and after moving
            # the results of finished jobs
            job.update_jobs_status_from_queue()
            pipeline_utils.move_results()
            job.update_jobs_status_from_queue()
            job.status()
            job.rotate()
            #pipeline_utils.archive_logs()
        except Exception, e:
            if config.email.send_on_crash:
                msg = '*** Job pooler has crashed! ***\n\n'
                msg += 'Fatal error occurred while running job pool: %s\n\n' % str(e)
                msg += ''.join(traceback.format_exception(*sys.exc_info()))
                notification = mailer.ErrorMailer(msg, subject="Job Pooler crash!")
                notification.send()
            sys.stderr.write("Fatal error occurred!\n")
            raise
        time.sleep(config.background.sleep)
import time
import sys
import traceback

import mailer
import JobUploader
import config.background
import config.email

while True:
    try:
        JobUploader.run()
    except Exception, e:
        if config.email.send_on_crash:
            msg = '*** Uploader has crashed! ***\n\n'
            msg += 'Fatal error occurred while running job uploader: %s\n\n' % str(e)
            msg += ''.join(traceback.format_exception(*sys.exc_info()))
            notification = mailer.ErrorMailer(msg, subject="Uploader crash!")
            notification.send()
        sys.stderr.write("Fatal error occurred!\n")
        raise
    time.sleep(config.background.sleep)
def recover_failed_jobs():
    """Gather jobs with status 'failed' from the job-tracker DB.
        For each of these jobs see if it can be re-submitted.
        If it can, set the status to 'retrying'. If the job cannot
        be re-submitted, set the status to 'terminal_failure', and
        delete the raw data (if config is set for deletion).

        Depending on configurations, emails may be sent.
    """
    failed_jobs = jobtracker.query("SELECT * FROM jobs " \
                                   "WHERE status='failed'")

    for job in failed_jobs:
        # Count the number of times this job has been submitted already
        submits = jobtracker.query("SELECT * FROM job_submits " \
                                   "WHERE job_id=%d " \
                                   "ORDER BY id DESC" % job['id'])
        if len(submits) < config.jobpooler.max_attempts:
            # We can re-submit this job.
            if config.email.send_on_failures:
                # Send error email
                msg = "Error! Job submit status: %s\n" % \
                        submits[0]['status']
                msg += "Job ID: %d, Job submit ID: %d\n\n" % \
                        (job['id'], submits[0]['id'])
                msg += str(submits[0]['details'])
                msg += "\n*** Job will be re-submitted to the queue ***\n"
                mailer.ErrorMailer(msg).send()

            # Set status to 'retrying'.
            jobtracker.query("UPDATE jobs " \
                             "SET status='retrying', " \
                                 "updated_at='%s', " \
                                 "details='Job will be retried' " \
                             "WHERE id=%d" % \
                             (jobtracker.nowstr(), job['id']))
            jobpool_cout.outs("Job #%d will be retried." % job['id'])
        else:
            # We've run out of attempts for this job
            if config.email.send_on_terminal_failures or \
                    config.email.send_on_failures:
                # Send error email
                msg = "Error! Job submit status: %s\n" % \
                        str(submits[0]['status'])
                msg += "Job ID: %d, Job submit ID: %d\n\n" % \
                        (job['id'], submits[0]['id'])
                msg += str(submits[0]['details'])
                msg += "\n*** No more attempts for this job. ***\n"
                msg += "*** Job will NOT be re-submitted! ***\n"
                if config.basic.delete_rawdata:
                    jobpool_cout.outs("Job #%d will NOT be retried. " \
                                      "Data files will be deleted." % job['id'])
                    msg += "*** Raw data files will be deleted. ***\n"
                else:
                    jobpool_cout.outs("Job #%d will NOT be retried." % job['id'])
                mailer.ErrorMailer(msg).send()

            if config.basic.delete_rawdata:
                pipeline_utils.clean_up(job['id'])

            # Set status to 'terminal_failure'.
            jobtracker.query("UPDATE jobs " \
                             "SET status='terminal_failure', " \
                                 "updated_at='%s', " \
                                 "details='Job has failed permanently' " \
                             "WHERE id=%d" % \
                             (jobtracker.nowstr(), job['id']))
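# Note: the UPDATE statements above interpolate values directly into the SQL
# string. A parameterized form, following the jobtracker.execute(queries,
# arglist) pattern that submit() below already uses, would sidestep quoting
# problems; a minimal sketch of the 'retrying' update only:
queries = ["UPDATE jobs " \
           "SET status='retrying', " \
               "updated_at=?, " \
               "details='Job will be retried' " \
           "WHERE id=?"]
arglist = [(jobtracker.nowstr(), job['id'])]
jobtracker.execute(queries, arglist)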
def submit(job_row):
    """Submits a job to QueueManager, if successful will store
        returned queue id.

        Input:
            job_row: A row from the jobs table. The datafiles associated
                with this job will be submitted to be processed.
        Outputs:
            None
    """
    fns = pipeline_utils.get_fns_for_jobid(job_row['id'])

    try:
        presubmission_check(fns)
        outdir = get_output_dir(fns)
        # Attempt to submit the job
        queue_id = config.jobpooler.queue_manager.submit(fns, outdir, \
                                                         job_row['id'])
    except FailedPreCheckError:
        # Error caught during presubmission check.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Job ID: %d " % job_row['id']
        errormsg += "failed presubmission check!\n\n"
        errormsg += "".join(exceptionmsgs)

        jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" % \
                          (job_row['id'], exceptionmsgs[-1]))

        if config.email.send_on_terminal_failures:
            # Send error email
            msg = "Presubmission check failed!\n"
            msg += "Job ID: %d\n\n" % job_row['id']
            msg += errormsg
            msg += "\n*** Job has been terminally failed. ***\n"
            msg += "*** Job will NOT be re-submitted! ***\n"
            if config.basic.delete_rawdata:
                jobpool_cout.outs("Job #%d will NOT be retried. " \
                                  "Data files will be deleted." % job_row['id'])
                msg += "*** Raw data files will be deleted. ***\n"
            else:
                jobpool_cout.outs("Job #%d will NOT be retried." % job_row['id'])
            notification = mailer.ErrorMailer(msg, \
                            subject="Job failed presubmission check - Terminal")
            notification.send()

        if config.basic.delete_rawdata:
            pipeline_utils.clean_up(job_row['id'])

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits (" \
                            "job_id, " \
                            "status, " \
                            "created_at, " \
                            "updated_at, " \
                            "details) " \
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'precheck_failed', \
                        jobtracker.nowstr(), jobtracker.nowstr(), \
                        errormsg))
        queries.append("UPDATE jobs " \
                       "SET status='terminal_failure', " \
                           "details='Failed presubmission check', " \
                           "updated_at=? " \
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except (queue_managers.QueueManagerJobFatalError, \
            datafile.DataFileError):
        # Error caught during job submission.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while submitting job!\n"
        errormsg += "\tJob ID: %d\n\n" % job_row['id']
        errormsg += "".join(exceptionmsgs)

        jobpool_cout.outs("Error while submitting job!\n" \
                          "\tJob ID: %d\n\t%s\n" % \
                          (job_row['id'], exceptionmsgs[-1]))

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits (" \
                            "job_id, " \
                            "status, " \
                            "created_at, " \
                            "updated_at, " \
                            "details) " \
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'submission_failed', \
                        jobtracker.nowstr(), jobtracker.nowstr(), \
                        errormsg))
        queries.append("UPDATE jobs " \
                       "SET status='failed', " \
                           "details='Error while submitting job', " \
                           "updated_at=? " \
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except queue_managers.QueueManagerNonFatalError:
        # Do nothing. Don't submit the job. Don't mark the job as 'submitted'.
        # Don't mark the job as 'failed'. The job submission will be retried.
        pass
    except queue_managers.QueueManagerFatalError:
        # A fatal error occurred. Re-raise!
        raise
    else:
        # No error occurred
        msg = "Submitted job to process:\n"
        msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id)
        msg += "\tData file(s):\n"
        for fn in fns:
            msg += "\t%s\n" % fn
        jobpool_cout.outs(msg)

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits (" \
                            "job_id, " \
                            "queue_id, " \
                            "output_dir, " \
                            "status, " \
                            "created_at, " \
                            "updated_at, " \
                            "details) " \
                       "VALUES (?, ?, ?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], queue_id, outdir, 'running', \
                        jobtracker.nowstr(), jobtracker.nowstr(), \
                        'Job submitted to queue'))
        queries.append("UPDATE jobs " \
                       "SET status='submitted', " \
                           "details='Job submitted to queue', " \
                           "updated_at=? " \
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)