def update_jobs_status_from_queue():
    """
    Update database entries for job processing according to the
        jobs' status in the queue.
    """
    # Collect all job submissions that are still marked as running
    submits = jobtracker.query("SELECT * FROM job_submits "
                               "WHERE status='running'")
    for submit in submits:
        # Check if job is still running (according to queue manager)
        is_running = config.jobpooler.queue_manager.is_running(submit['queue_id'])
        if is_running:
            # Do nothing.
            pass
        else:
            # Check if processing had errors
            if config.jobpooler.queue_manager.had_errors(submit['queue_id']):
                # Errors during processing...
                errormsg = config.jobpooler.queue_manager.get_errors(submit['queue_id'])

                jobpool_cout.outs("Processing of Job #%d (Submit ID: %d; Queue ID: %s) "
                                  "had errors." %
                                  (submit['job_id'], submit['id'], submit['queue_id']))

                # Mark job entry with status 'failed'
                # Mark job_submit entry with status 'processing_failed'
                queries = []
                arglists = []
                queries.append("UPDATE jobs "
                               "SET status='failed', "
                               "updated_at=?, "
                               "details='Errors during processing' "
                               "WHERE id=?")
                arglists.append((jobtracker.nowstr(), submit['job_id']))
                queries.append("UPDATE job_submits "
                               "SET status='processing_failed', "
                               "details=?, "
                               "updated_at=? "
                               "WHERE id=?")
                arglists.append((errormsg, jobtracker.nowstr(), submit['id']))
                jobtracker.execute(queries, arglists)
            else:
                # No errors. Woohoo!
                # Mark job and job_submit entries with status 'processed'
                queries = []
                queries.append("UPDATE jobs "
                               "SET status='processed', "
                               "updated_at='%s', "
                               "details='Processed without errors' "
                               "WHERE id=%d" %
                               (jobtracker.nowstr(), submit['job_id']))
                queries.append("UPDATE job_submits "
                               "SET status='processed', "
                               "updated_at='%s', "
                               "details='Processed without errors' "
                               "WHERE id=%d" %
                               (jobtracker.nowstr(), submit['id']))
                jobtracker.query(queries)
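# Hedged sketch, not in the original: the success branch above interpolates
# values into the SQL with %, while the failure branch uses '?' placeholders
# via jobtracker.execute(queries, arglists). The same updates in the
# parameterized style (matching the rest of this file) would look like this;
# the helper name is hypothetical.
def _mark_processed(submit):
    queries = []
    arglists = []
    queries.append("UPDATE jobs "
                   "SET status='processed', "
                   "updated_at=?, "
                   "details='Processed without errors' "
                   "WHERE id=?")
    arglists.append((jobtracker.nowstr(), submit['job_id']))
    queries.append("UPDATE job_submits "
                   "SET status='processed', "
                   "updated_at=?, "
                   "details='Processed without errors' "
                   "WHERE id=?")
    arglists.append((jobtracker.nowstr(), submit['id']))
    jobtracker.execute(queries, arglists)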
def upload_results(job_submit):
    """
    Uploads results for a given submit.

        Input:
            job_submit: A row from the job_submits table.
                Results from this job submission will be uploaded.

        Output:
            None
    """
    print "Attempting to upload results"
    print "\tJob ID: %d, Job submission ID: %d" % \
        (job_submit['job_id'], job_submit['id'])
    if debug.UPLOAD:
        upload.upload_timing_summary = {}
        starttime = time.time()
    try:
        # Connect to the DB
        db = database.Database('default', autocommit=False)

        # Prepare for upload
        dir = job_submit['output_dir']
        if not os.path.exists(dir) or not os.listdir(dir):
            errormsg = 'ERROR: Results directory, %s, does not exist ' \
                       'or is empty for job_id=%d' % \
                       (dir, job_submit['job_id'])
            raise upload.UploadNonFatalError(errormsg)

        fitsfiles = get_fitsfiles(job_submit)
        data = datafile.autogen_dataobj(fitsfiles)
        version_number = get_version_number(dir)

        if debug.UPLOAD:
            parsetime = time.time()

        # Upload results
        hdr = header.get_header(fitsfiles)
        print "\tHeader parsed."

        cands = candidates.get_candidates(version_number, dir)
        print "\tPeriodicity candidates parsed."
        sp_cands = sp_candidates.get_spcandidates(version_number, dir)
        print "\tSingle pulse candidates parsed."

        for c in (cands + sp_cands):
            hdr.add_dependent(c)

        diags = diagnostics.get_diagnostics(data.obs_name,
                                            data.beam_id,
                                            data.obstype,
                                            version_number,
                                            dir)
        print "\tDiagnostics parsed."

        if debug.UPLOAD:
            upload.upload_timing_summary['Parsing'] = \
                upload.upload_timing_summary.setdefault('Parsing', 0) + \
                (time.time() - parsetime)

        # Perform the upload
        header_id = hdr.upload(db)
        for d in diags:
            d.upload(db)
        print "\tEverything uploaded and checked successfully. header_id=%d" % \
            header_id
    except upload.UploadNonFatalError:
        # Parsing error caught. Job attempt has failed!
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while checking results!\n"
        errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \
            (job_submit['job_id'], job_submit['id'])
        errormsg += "".join(exceptionmsgs)

        sys.stderr.write("Error while checking results!\n")
        sys.stderr.write("Database transaction will not be committed.\n")
        sys.stderr.write("\t%s" % exceptionmsgs[-1])

        queries = []
        arglists = []
        queries.append("UPDATE job_submits "
                       "SET status='upload_failed', "
                       "details=?, "
                       "updated_at=? "
                       "WHERE id=?")
        arglists.append((errormsg, jobtracker.nowstr(), job_submit['id']))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while uploading results', "
                       "updated_at=? "
                       "WHERE id=?")
        arglists.append((jobtracker.nowstr(), job_submit['job_id']))
        jobtracker.execute(queries, arglists)

        # Rolling back changes.
        db.rollback()
    except (database.DatabaseConnectionError, CornellFTP.CornellFTPTimeout,
            upload.UploadDeadlockError, database.DatabaseDeadlockError), e:
        # Connection error while uploading. We will try again later.
        sys.stderr.write(str(e))
        sys.stderr.write("\tRolling back DB transaction and will re-try later.\n")
        # Rolling back changes.
        db.rollback()
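# Hedged sketch, not part of the original excerpt: upload_results() opens
# its connection with autocommit=False and rolls back in both error
# handlers, so a successful pass presumably ends with an explicit commit
# plus jobtracker bookkeeping. The helper below illustrates that step,
# assuming database.Database exposes commit() (the counterpart to the
# rollback() used above) and reusing this file's queries/arglists pattern.
# The helper name and the exact status strings are guesses.
def _mark_upload_successful(db, job_submit, header_id):
    db.commit()
    queries = []
    arglists = []
    queries.append("UPDATE job_submits "
                   "SET status='uploaded', "
                   "details=?, "
                   "updated_at=? "
                   "WHERE id=?")
    arglists.append(("Upload successful (header_id=%d)" % header_id,
                     jobtracker.nowstr(), job_submit['id']))
    queries.append("UPDATE jobs "
                   "SET status='uploaded', "
                   "details='Upload successful', "
                   "updated_at=? "
                   "WHERE id=?")
    arglists.append((jobtracker.nowstr(), job_submit['job_id']))
    jobtracker.execute(queries, arglists)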
def upload_results(job_submit):
    """
    Uploads results for a given submit.

        Input:
            job_submit: A row from the job_submits table.
                Results from this job submission will be uploaded.

        Output:
            None
    """
    print "Attempting to upload results"
    print "\tJob ID: %d, Job submission ID: %d" % \
        (job_submit['job_id'], job_submit['id'])
    try:
        db = database.Database('common-copy', autocommit=False)

        # Prepare for upload
        dir = job_submit['output_dir']
        fitsfiles = get_fitsfiles(job_submit)

        # Upload results
        # (The original called header.upload_header() twice here, which
        # would upload the header a second time; checking the returned
        # header_id preserves the intent without the duplicate upload.)
        header_id = header.upload_header(fitsfiles, dbname=db)
        if not header_id:
            raise AssertionError("Header values in common DB "
                                 "do not match values uploaded.")
        else:
            print "\tHeader uploaded and checked. Header ID: %d" % header_id

        version_number = get_version_number(dir)
        candidate_uploader.upload_candidates(header_id,
                                             version_number,
                                             dir, dbname=db)
        if not candidate_uploader.check_candidates(header_id,
                                                   version_number,
                                                   dir, dbname=db):
            raise AssertionError("Candidate values in common DB "
                                 "do not match values uploaded.")
        else:
            print "\tCandidates uploaded and checked."

        data = datafile.autogen_dataobj(fitsfiles)
        diagnostic_uploader.upload_diagnostics(data.obs_name,
                                               data.beam_id,
                                               data.obstype,
                                               version_number,
                                               dir, dbname=db)
        if not diagnostic_uploader.check_diagnostics(data.obs_name,
                                                     data.beam_id,
                                                     data.obstype,
                                                     version_number,
                                                     dir, dbname=db):
            raise AssertionError("Diagnostic values in common DB "
                                 "do not match values uploaded.")
        else:
            print "\tDiagnostics uploaded and checked."
    except (header.HeaderError,
            candidate_uploader.PeriodicityCandidateError,
            diagnostic_uploader.DiagnosticError):
        # Parsing error caught. Job attempt has failed!
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while checking results!\n"
        errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \
            (job_submit['job_id'], job_submit['id'])
        errormsg += "".join(exceptionmsgs)

        sys.stderr.write("Error while checking results!\n")
        sys.stderr.write("Database transaction will not be committed.\n")
        sys.stderr.write("\t%s" % exceptionmsgs[-1])

        queries = []
        arglists = []
        queries.append("UPDATE job_submits "
                       "SET status='upload_failed', "
                       "details=?, "
                       "updated_at=? "
                       "WHERE id=?")
        arglists.append((errormsg, jobtracker.nowstr(), job_submit['id']))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while uploading results', "
                       "updated_at=? "
                       "WHERE id=?")
        arglists.append((jobtracker.nowstr(), job_submit['job_id']))
        jobtracker.execute(queries, arglists)

        # Rolling back changes.
        db.rollback()
    except database.DatabaseConnectionError, e:
        # Connection error while uploading. We will try again later.
        sys.stderr.write(str(e))
        sys.stderr.write("\tRolling back DB transaction and will re-try later.\n")
        # Rolling back changes.
        db.rollback()
def submit(job_row):
    """
    Submits a job to the QueueManager. If successful, stores the returned
        queue ID.

        Input:
            job_row: A row from the jobs table. The datafiles associated
                with this job will be submitted to be processed.

        Outputs:
            None
    """
    fns = pipeline_utils.get_fns_for_jobid(job_row['id'])

    # Observing days on which beams 5 and 6 were affected by a 60 Hz
    # interference signal. (The original listed '20170423' twice.)
    bad_days = ['20170414', '20170419', '20170420', '20170423',
                '20170427', '20170429', '20170503', '20170510',
                '20170516']
    bad_beams = ['b5', 'b6']
    for bad_day in bad_days:
        if bad_day in fns[0]:
            if (bad_beams[0] in fns[0]) or (bad_beams[1] in fns[0]):
                print "Files affected by the bad beams 5, 6 60Hz signal: ", fns
                print "Will delete the raw data files."
                queries = []
                arglist = []
                queries.append("INSERT INTO job_submits ("
                               "job_id, "
                               "status, "
                               "created_at, "
                               "updated_at, "
                               "details) "
                               "VALUES (?, ?, ?, ?, ?)")
                arglist.append((job_row['id'], 'Beams 5 and 6',
                                jobtracker.nowstr(), jobtracker.nowstr(),
                                "Affected by 60Hz signal"))
                queries.append("UPDATE jobs "
                               "SET status='terminal_failure', "
                               "details='Beams 5 and 6 affected by 60Hz signal', "
                               "updated_at=? "
                               "WHERE id=?")
                arglist.append((jobtracker.nowstr(), job_row['id']))
                jobtracker.execute(queries, arglist)
                return

    try:
        presubmission_check(fns)
        print "\tFiles have been checked."
        outdir, created = get_output_dir(fns)
        if not created:
            #current_usr = getpass.getuser()
            #errormsg = "Permission denied to %s when attempting to create output directory %s" % (current_usr, outdir)
            #f = open('/scratch/eparent/eparent/PALFA4/results/missing_permission.out','a')
            #f.write(errormsg+'\n')
            #f.close()
            print "Permission issues while creating output directory, skipping"

        # Attempt to submit the job
        if job_row['status'] == 'retrying':
            ## ppn = 2
            ppn = 1
            tfact = 1.25
        else:
            ppn = 1
            tfact = 1.

        if config.jobpooler.alternative_submit_script:
            print "Submitting:", config.jobpooler.alternative_submit_script
            queue_id = config.jobpooler.queue_manager.submit(
                fns, outdir, job_row['id'],
                script=config.jobpooler.alternative_submit_script,
                ppn=ppn)
        else:
            queue_id = config.jobpooler.queue_manager.submit(
                fns, outdir, job_row['id'], ppn=ppn, tfact=tfact)

        msg = "Submitted job to process:\n"
        msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id)
        msg += "\tData file(s):\n"
        for fn in fns:
            msg += "\t%s\n" % fn
        jobpool_cout.outs(msg)

        queries = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "queue_id, "
                       "output_dir, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (%d,'%s','%s','%s','%s','%s','%s')" %
                       (job_row['id'], queue_id, outdir, 'running',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        'Job submitted to queue'))
        queries.append("UPDATE jobs "
                       "SET status='submitted', "
                       "details='Job submitted to queue', "
                       "updated_at='%s' "
                       "WHERE id=%d" %
                       (jobtracker.nowstr(), job_row['id']))
        jobtracker.query(queries)
    except FailedPreCheckError:
        # Error caught during presubmission check.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Job ID: %d " % job_row['id']
        errormsg += "failed presubmission check!\n\n"
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" %
                          (job_row['id'], exceptionmsgs[-1]))
        #if config.basic.delete_rawdata:
        #    pipeline_utils.clean_up(job_row['id'])

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'precheck_failed',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='terminal_failure', "
                       "details='Failed presubmission check', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except (queue_managers.QueueManagerJobFatalError,
            datafile.DataFileError):
        # Error caught during job submission.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while submitting job!\n"
        errormsg += "\tJob ID: %d\n\n" % job_row['id']
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Error while submitting job!\n"
                          "\tJob ID: %d\n\t%s\n" %
                          (job_row['id'], exceptionmsgs[-1]))

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'submission_failed',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while submitting job', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except queue_managers.QueueManagerNonFatalError:
        # Do nothing. Don't submit the job. Don't mark the job as 'submitted'.
        # Don't mark the job as 'failed'. The job submission will be retried.
        pass
    except queue_managers.QueueManagerFatalError:
        # A fatal error occurred. Re-raise!
        raise
    except MissingFilesError:
        # Raw data files are missing from the scratch area.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Job ID: %d " % job_row['id']
        errormsg += "raw data files missing from /scratch/ area!\n\n"
        errormsg += "".join(exceptionmsgs)

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'submission_failed',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while submitting job', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
        print errormsg
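# Hedged sketch, not in the original: the bad-day/bad-beam screen in
# submit() above only inspects fns[0]. If the date and beam tags appear in
# every raw filename (an assumption), the same test can be written over all
# files; bad_days would mirror the list in submit().
def is_bad_beam_job(fns, bad_days, bad_beams=('b5', 'b6')):
    return any(day in fn and any(beam in fn for beam in bad_beams)
               for fn in fns
               for day in bad_days)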
def submit(job_row):
    """
    Submits a job to the QueueManager. If successful, stores the returned
        queue ID.

        Input:
            job_row: A row from the jobs table. The datafiles associated
                with this job will be submitted to be processed.

        Outputs:
            None
    """
    fns = pipeline_utils.get_fns_for_jobid(job_row['id'])

    try:
        presubmission_check(fns)
        outdir = get_output_dir(fns)
        # Attempt to submit the job
        queue_id = config.jobpooler.queue_manager.submit(fns, outdir,
                                                         job_row['id'])
    except FailedPreCheckError:
        # Error caught during presubmission check.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Job ID: %d " % job_row['id']
        errormsg += "failed presubmission check!\n\n"
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" %
                          (job_row['id'], exceptionmsgs[-1]))

        if config.email.send_on_terminal_failures:
            # Send error email
            msg = "Presubmission check failed!\n"
            msg += "Job ID: %d\n\n" % job_row['id']
            msg += errormsg
            msg += "\n*** Job has been terminally failed. ***\n"
            msg += "*** Job will NOT be re-submitted! ***\n"
            if config.basic.delete_rawdata:
                jobpool_cout.outs("Job #%d will NOT be retried. "
                                  "Data files will be deleted." % job_row['id'])
                msg += "*** Raw data files will be deleted. ***\n"
            else:
                jobpool_cout.outs("Job #%d will NOT be retried. " % job_row['id'])
            notification = mailer.ErrorMailer(msg,
                                              subject="Job failed presubmission check - Terminal")
            notification.send()

        if config.basic.delete_rawdata:
            pipeline_utils.clean_up(job_row['id'])

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'precheck_failed',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='terminal_failure', "
                       "details='Failed presubmission check', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except (queue_managers.QueueManagerJobFatalError,
            datafile.DataFileError):
        # Error caught during job submission.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while submitting job!\n"
        errormsg += "\tJob ID: %d\n\n" % job_row['id']
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Error while submitting job!\n"
                          "\tJob ID: %d\n\t%s\n" %
                          (job_row['id'], exceptionmsgs[-1]))

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'submission_failed',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while submitting job', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except queue_managers.QueueManagerNonFatalError:
        # Do nothing. Don't submit the job. Don't mark the job as 'submitted'.
        # Don't mark the job as 'failed'. The job submission will be retried.
        pass
    except queue_managers.QueueManagerFatalError:
        # A fatal error occurred. Re-raise!
        raise
    else:
        # No error occurred
        msg = "Submitted job to process:\n"
        msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id)
        msg += "\tData file(s):\n"
        for fn in fns:
            msg += "\t%s\n" % fn
        jobpool_cout.outs(msg)

        queries = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "queue_id, "
                       "output_dir, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (%d,'%s','%s','%s','%s','%s','%s')" %
                       (job_row['id'], queue_id, outdir, 'running',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        'Job submitted to queue'))
        queries.append("UPDATE jobs "
                       "SET status='submitted', "
                       "details='Job submitted to queue', "
                       "updated_at='%s' "
                       "WHERE id=%d" %
                       (jobtracker.nowstr(), job_row['id']))
        jobtracker.query(queries)
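# Hedged sketch, not in the original: what presubmission_check() might
# verify, inferred only from the exceptions handled by the submit()
# variants above (FailedPreCheckError for inputs that fail validation,
# MissingFilesError when raw files are absent on disk). The real checks
# are survey-specific.
import os

def presubmission_check(fns):
    if not fns:
        raise FailedPreCheckError("No data files associated with this job.")
    missing = [fn for fn in fns if not os.path.exists(fn)]
    if missing:
        raise MissingFilesError("Raw data files missing from disk:\n\t%s" %
                                "\n\t".join(missing))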
def submit(job_row):
    """
    Submits a job to the QueueManager. If successful, stores the returned
        queue ID.

        Input:
            job_row: A row from the jobs table. The datafiles associated
                with this job will be submitted to be processed.

        Outputs:
            None
    """
    fns = pipeline_utils.get_fns_for_jobid(job_row['id'])

    try:
        outdir = get_output_dir(fns)
        # Attempt to submit the job
        queue_id = config.jobpooler.queue_manager.submit(fns, outdir,
                                                         job_row['id'])
    except (queue_managers.QueueManagerJobFatalError,
            datafile.DataFileError):
        # Error caught during job submission.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while submitting job!\n"
        errormsg += "\tJob ID: %d\n\n" % job_row['id']
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Error while submitting job!\n"
                          "\tJob ID: %d\n\t%s\n" %
                          (job_row['id'], exceptionmsgs[-1]))

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row['id'], 'submission_failed',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while submitting job', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row['id']))
        jobtracker.execute(queries, arglist)
    except queue_managers.QueueManagerNonFatalError:
        # Do nothing. Don't submit the job. Don't mark the job as 'submitted'.
        # Don't mark the job as 'failed'. The job submission will be retried.
        pass
    except queue_managers.QueueManagerFatalError:
        # A fatal error occurred. Re-raise!
        raise
    else:
        # No error occurred
        msg = "Submitted job to process:\n"
        msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id)
        msg += "\tData file(s):\n"
        for fn in fns:
            msg += "\t%s\n" % fn
        jobpool_cout.outs(msg)

        queries = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "queue_id, "
                       "output_dir, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (%d,'%s','%s','%s','%s','%s','%s')" %
                       (job_row['id'], queue_id, outdir, 'running',
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        'Job submitted to queue'))
        queries.append("UPDATE jobs "
                       "SET status='submitted', "
                       "details='Job submitted to queue', "
                       "updated_at='%s' "
                       "WHERE id=%d" %
                       (jobtracker.nowstr(), job_row['id']))
        jobtracker.query(queries)
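# Hedged sketch, not in the original: how a pooler loop might drive
# submit(). The 'retrying' status appears in a submit() variant above;
# the 'new' status and the max_running cap are assumptions about the
# rest of the jobtracker schema.
def submit_pending_jobs(max_running=50):
    running = jobtracker.query("SELECT * FROM job_submits "
                               "WHERE status='running'")
    slots = max_running - len(running)
    pending = jobtracker.query("SELECT * FROM jobs "
                               "WHERE status IN ('new', 'retrying')")
    for job_row in pending[:max(slots, 0)]:
        submit(job_row)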
def upload_results(job_submit):
    """
    Uploads results for a given submit.

        Input:
            job_submit: A row from the job_submits table.
                Results from this job submission will be uploaded.

        Output:
            None
    """
    print "Attempting to upload results"
    print "\tJob ID: %d, Job submission ID: %d\n\tOutput Dir: %s" % \
        (job_submit['job_id'], job_submit['id'], job_submit['output_dir'])
    if debug.UPLOAD:
        upload.upload_timing_summary = {}
        starttime = time.time()
    try:
        # Connect to the DB
        db = database.Database('default', autocommit=False)

        # Prepare for upload
        dir = job_submit['output_dir']

        # NEW Beluga - Untar the tarball of results
        import tarfile
        to_keep = os.listdir(job_submit['output_dir'])
        tarball = glob.glob(job_submit['output_dir'] + '/*00.tgz')[0]
        tar = tarfile.open(tarball, 'r:gz')
        tar.extractall(path=job_submit['output_dir'])
        tar.close()
        all_files = os.listdir(job_submit['output_dir'])
        # Extracted files to remove when the upload is done (this excerpt
        # never deletes them; see the cleanup sketch below).
        to_del = set(all_files) - set(to_keep)

        if config.upload.upload_zerodm_periodicity or config.upload.upload_zerodm_singlepulse:
            to_keep_zerodm = os.listdir(job_submit['output_dir'] + '/zerodm')
            tarball = glob.glob(job_submit['output_dir'] + '/zerodm/*zerodm.tgz')[0]
            tar = tarfile.open(tarball, 'r:gz')
            tar.extractall(path=job_submit['output_dir'] + '/zerodm')
            tar.close()
            all_files_zerodm = os.listdir(job_submit['output_dir'] + '/zerodm')
            to_del_zerodm = set(all_files_zerodm) - set(to_keep_zerodm)

        pdm_dir = os.path.join(dir, "zerodm") \
            if config.upload.upload_zerodm_periodicity else dir
        sp_dir = os.path.join(dir, "zerodm") \
            if config.upload.upload_zerodm_singlepulse else dir

        # The results directory must exist and contain more than an empty
        # 'zerodm' subdirectory. (The original expressed this as two
        # branches raising the same error.)
        contents = os.listdir(dir) if os.path.exists(dir) else []
        if not contents or (len(contents) == 1 and contents[0] == 'zerodm'
                            and not os.listdir(os.path.join(dir, 'zerodm'))):
            errormsg = 'ERROR: Results directory, %s, does not exist ' \
                       'or is empty for job_id=%d' % \
                       (dir, job_submit['job_id'])
            raise upload.UploadNonFatalError(errormsg)

        fitsfiles = get_fitsfiles(job_submit)
        try:
            data = datafile.autogen_dataobj(fitsfiles)
        except ValueError:
            raise upload.UploadNonFatalError("Could not generate a data "
                                             "object from the FITS files.")
        version_number = get_version_number(dir)

        if debug.UPLOAD:
            parsetime = time.time()

        # Upload results
        hdr = header.get_header(fitsfiles)
        print "\tHeader parsed."

        rat_inst_id_cache = ratings2.utils.RatingInstanceIDCache(dbname='common3')
        cands, tempdir = candidates.get_candidates(version_number, pdm_dir,
                                                   timestamp_mjd=data.timestamp_mjd,
                                                   inst_cache=rat_inst_id_cache)
        print "\tPeriodicity candidates parsed. (%d cands)" % len(cands)
        sp_cands, tempdir_sp = sp_candidates.get_spcandidates(version_number, sp_dir,
                                                              timestamp_mjd=data.timestamp_mjd,
                                                              inst_cache=rat_inst_id_cache)
        print "\tSingle pulse candidates parsed. (%d cands)" % len(sp_cands)

        diags = diagnostics.get_diagnostics(data.obs_name,
                                            data.beam_id,
                                            data.obstype,
                                            version_number,
                                            pdm_dir, sp_dir)
        print "\tDiagnostics parsed."

        for c in (cands + sp_cands):
            hdr.add_dependent(c)

        if debug.UPLOAD:
            upload.upload_timing_summary['Parsing'] = \
                upload.upload_timing_summary.setdefault('Parsing', 0) + \
                (time.time() - parsetime)

        # Perform the upload
        header_id = hdr.upload(db)
        print "Header ID: ", header_id
        for d in diags:
            d.upload(db)
        print "\tDB upload completed and checked successfully. header_id=%d" % \
            header_id
    except upload.UploadNonFatalError:
        # Parsing error caught. Job attempt has failed!
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while checking results!\n"
        errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \
            (job_submit['job_id'], job_submit['id'])
        errormsg += "".join(exceptionmsgs)

        sys.stderr.write("Error while checking results!\n")
        sys.stderr.write("Database transaction will not be committed.\n")
        sys.stderr.write("\t%s" % exceptionmsgs[-1])

        queries = []
        arglists = []
        queries.append("UPDATE job_submits "
                       "SET status='upload_failed', "
                       "details=?, "
                       "updated_at=? "
                       "WHERE id=?")
        arglists.append((errormsg, jobtracker.nowstr(), job_submit['id']))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while uploading results', "
                       "updated_at=? "
                       "WHERE id=?")
        arglists.append((jobtracker.nowstr(), job_submit['job_id']))
        jobtracker.execute(queries, arglists)

        # Rolling back changes.
        db.rollback()
    except (database.DatabaseConnectionError, ratings2.database.DatabaseConnectionError,
            CornellFTP.CornellFTPTimeout, upload.UploadDeadlockError,
            database.DatabaseDeadlockError), e:
        # Connection error while uploading. We will try again later.
        sys.stderr.write(str(e))
        sys.stderr.write("\tRolling back DB transaction and will re-try later.\n")
        # Rolling back changes.
        db.rollback()
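# Hedged sketch, not in the original: the Beluga variant of
# upload_results() above computes to_del and to_del_zerodm but the excerpt
# never removes them. Presumably the extracted files are deleted after a
# successful upload; something like the helper below, whose name and
# placement are assumptions.
import os
import shutil

def remove_extracted_files(outdir, to_del):
    for name in to_del:
        path = os.path.join(outdir, name)
        if os.path.isdir(path):
            shutil.rmtree(path)
        else:
            os.remove(path)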
def submit(job_row):
    """
    Submits a job to the QueueManager. If successful, stores the returned
        queue ID.

        Input:
            job_row: A row from the jobs table. The datafiles associated
                with this job will be submitted to be processed.

        Outputs:
            None
    """
    fns = pipeline_utils.get_fns_for_jobid(job_row["id"])

    script = os.path.join(config.basic.pipelinedir, "bin",
                          "%s_search.py" % config.basic.survey)

    # Specify requested resources for job submission:
    # [walltime (s), memory (MB), cores]
    if job_row["task"] == "rfifind":
        res = [4 * 60 * 60, 1024, 25]
    elif "search" in job_row["task"]:
        res = [165240, 1024, 28]  # 45.9 hrs
    elif job_row["task"] == "sifting":
        # Sifting should be quick
        res = [30 * 60, 256, 5]
    elif "folding" in job_row["task"]:
        res = [96 * 60 * 60, 3000, 28]
    #elif job_row["task"] == "tidyup":
    #    res = [30 * 60, 256, 5]
    options = job_row["task"]

    try:
        SPAN512_job.presubmission_check(fns)
        outdir = SPAN512_job.get_output_dir(fns)
        # Attempt to submit the job
        queue_id = config.jobpooler.queue_manager.submit(
            fns, outdir, job_row["id"], resources=res,
            script=script, opts=options)
    except FailedPreCheckError:
        # Error caught during presubmission check.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Job ID: %d " % job_row["id"]
        errormsg += "failed presubmission check!\n\n"
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" %
                          (job_row["id"], exceptionmsgs[-1]))

        if config.email.send_on_terminal_failures:
            # Send error email
            msg = "Presubmission check failed!\n"
            msg += "Job ID: %d\n\n" % job_row["id"]
            msg += errormsg
            msg += "\n*** Job has been terminally failed. ***\n"
            msg += "*** Job will NOT be re-submitted! ***\n"
            if config.basic.delete_rawdata:
                jobpool_cout.outs("Job #%d will NOT be retried. "
                                  "Data files will be deleted." % job_row["id"])
                msg += "*** Raw data files will be deleted. ***\n"
            else:
                jobpool_cout.outs("Job #%d will NOT be retried. " % job_row["id"])
            notification = mailer.ErrorMailer(msg,
                                              subject="Job failed presubmission check - Terminal")
            notification.send()

        if config.basic.delete_rawdata:
            pipeline_utils.clean_up(job_row["id"])

        # (The original interpolated these values into the SQL with
        # unquoted %s placeholders and left arglist unused; use the
        # parameterized pattern from the rest of this file.)
        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row["id"], "precheck_failed",
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='terminal_failure', "
                       "details='Failed presubmission check', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row["id"]))
        jobtracker.execute(queries, arglist)
    except (queue_managers.QueueManagerJobFatalError, datafile.DataFileError):
        # Error caught during job submission.
        exceptionmsgs = traceback.format_exception(*sys.exc_info())
        errormsg = "Error while submitting job!\n"
        errormsg += "\tJob ID: %d\n\n" % job_row["id"]
        errormsg += "".join(exceptionmsgs)
        jobpool_cout.outs("Error while submitting job!\n"
                          "\tJob ID: %d\n\t%s\n" %
                          (job_row["id"], exceptionmsgs[-1]))

        queries = []
        arglist = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (?, ?, ?, ?, ?)")
        arglist.append((job_row["id"], "submission_failed",
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        errormsg))
        queries.append("UPDATE jobs "
                       "SET status='failed', "
                       "details='Error while submitting job', "
                       "updated_at=? "
                       "WHERE id=?")
        arglist.append((jobtracker.nowstr(), job_row["id"]))
        jobtracker.execute(queries, arglist)
    except queue_managers.QueueManagerNonFatalError:
        # Do nothing. Don't submit the job. Don't mark the job as 'submitted'.
        # Don't mark the job as 'failed'. The job submission will be retried.
        pass
    except queue_managers.QueueManagerFatalError:
        # A fatal error occurred. Re-raise!
        raise
    else:
        # No error occurred
        msg = "Submitted job to process:\n"
        msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row["id"], queue_id)
        msg += "\tData file(s):\n"
        for fn in fns:
            msg += "\t%s\n" % fn
        jobpool_cout.outs(msg)

        queries = []
        queries.append("INSERT INTO job_submits ("
                       "job_id, "
                       "queue_id, "
                       "output_dir, "
                       "status, "
                       "created_at, "
                       "updated_at, "
                       "details) "
                       "VALUES (%d,'%s','%s','%s','%s','%s','%s')" %
                       (job_row["id"], queue_id, outdir, "running",
                        jobtracker.nowstr(), jobtracker.nowstr(),
                        "Job submitted to queue"))
        queries.append("UPDATE jobs "
                       "SET status='submitted', "
                       "details='Job submitted to queue', "
                       "updated_at='%s' "
                       "WHERE id=%d" %
                       (jobtracker.nowstr(), job_row["id"]))
        jobtracker.query(queries)
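# Hedged sketch, not in the original: the per-task resource requests from
# the branches in submit() above, gathered into one mapping of
# (walltime seconds, memory MB, cores). Values are copied from those
# branches; 'tidyup' stays commented out there, so it is omitted here too.
TASK_RESOURCES = {
    "rfifind": (4 * 60 * 60, 1024, 25),
    "search":  (165240, 1024, 28),   # 45.9 hrs
    "sifting": (30 * 60, 256, 5),    # Sifting should be quick
    "folding": (96 * 60 * 60, 3000, 28),
}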