def make_request(): """Make a request for data to be restored by connecting to the web services at Cornell. """ dlm_cout.outs("Requesting data") num_beams = 1 web_service = CornellWebservice.Client() guid = web_service.Restore(username=config.download.api_username, \ pw=config.download.api_password, \ number=num_beams, \ bits=config.download.request_numbits, \ fileType=config.download.request_datatype) if guid == "fail": raise pipeline_utils.PipelineError("Request for restore returned 'fail'.") requests = jobtracker.query("SELECT * FROM requests " \ "WHERE guid='%s'" % guid) if requests: # Entries in the requests table exist with this GUID!? raise pipeline_utils.PipelineError("There are %d requests in the " \ "job-tracker DB with this GUID %s" % \ (len(requests), guid)) jobtracker.query("INSERT INTO requests ( " \ "guid, " \ "created_at, " \ "updated_at, " \ "status, " \ "details) " \ "VALUES ('%s', '%s', '%s', '%s', '%s')" % \ (guid, jobtracker.nowstr(), jobtracker.nowstr(), 'waiting', \ 'Newly created request'))
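# The INSERT above splices values into the SQL with the % operator. Other
# functions in this pipeline (e.g. submit() and upload_results()) route the
# same kind of write through jobtracker.execute() with '?' placeholders,
# which sidesteps quoting problems if a value ever contains a quote.
# A minimal sketch of the request INSERT in that style; insert_request() is
# an illustrative helper name, and only the jobtracker.execute(queries,
# arglists) interface already used elsewhere in this file is assumed.
def insert_request(guid):
    queries = []
    arglists = []
    queries.append("INSERT INTO requests (" \
                   "guid, " \
                   "created_at, " \
                   "updated_at, " \
                   "status, " \
                   "details) " \
                   "VALUES (?, ?, ?, ?, ?)")
    arglists.append((guid, jobtracker.nowstr(), jobtracker.nowstr(), \
                     'waiting', 'Newly created request'))
    jobtracker.execute(queries, arglists)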
def check_download_attempts(): """For each download attempt with status 'downloading' check to see that its thread is still active. If not, mark it as 'unknown', and mark the file as 'unverified'. """ attempts = jobtracker.query("SELECT * FROM download_attempts " \ "WHERE status='downloading'") active_ids = [int(t.getName()) for t in threading.enumerate() \ if isinstance(t, DownloadThread)] for attempt in attempts: if attempt['id'] not in active_ids: dlm_cout.outs("Download attempt (ID: %d) is no longer running." % \ attempt['id']) queries = [] queries.append("UPDATE files " \ "SET status='unverified', " \ "updated_at='%s', " \ "details='Download thread is no longer running' " "WHERE id=%d" % (jobtracker.nowstr(), attempt['download_id'])) queries.append("UPDATE download_attempts " \ "SET status='unknown', " \ "updated_at='%s', " \ "details='Download thread is no longer running' " "WHERE id=%d" % (jobtracker.nowstr(), attempt['id'])) jobtracker.query(queries)
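# check_download_attempts() identifies live downloads by matching
# int(t.getName()) against download_attempts IDs, and start_downloads()
# later calls DownloadThread(attempt).start(). A minimal sketch of a thread
# class consistent with both call sites; everything beyond the naming and
# run() conventions is an assumption, not the pipeline's actual class.
import threading

class DownloadThread(threading.Thread):
    def __init__(self, attempt):
        # Name the thread after the download_attempts row ID so that
        # check_download_attempts() can match it via int(t.getName()).
        threading.Thread.__init__(self, name=str(attempt['id']))
        self.attempt = attempt

    def run(self):
        # Delegate to the module-level download() function for this attempt.
        download(self.attempt)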
def check_active_requests(): """Check for any requests with status='waiting'. If there are some, check if the files are ready for download. """ active_requests = jobtracker.query("SELECT * FROM requests " \ "WHERE status='waiting'") web_service = CornellWebservice.Client() for request in active_requests: location = web_service.Location(guid=request['guid'], \ username=config.download.api_username, \ pw=config.download.api_password) if location == "done": dlm_cout.outs("Restore (%s) is done. Will create file entries." % \ request['guid']) create_file_entries(request) else: query = "SELECT (julianday('%s')-julianday(created_at))*24 " \ "AS deltaT_hours " \ "FROM requests " \ "WHERE guid='%s'" % \ (jobtracker.nowstr(), request['guid']) row = jobtracker.query(query, fetchone=True) if row['deltaT_hours'] > config.download.request_timeout: dlm_cout.outs("Restore (%s) is over %d hr old " \ "and still not ready. Marking " \ "it as failed." % \ (request['guid'], config.download.request_timeout)) jobtracker.query("UPDATE requests " \ "SET status='failed', " \ "details='Request took too long (> %d hr)', " \ "updated_at='%s' " \ "WHERE guid='%s'" % \ (config.download.request_timeout, jobtracker.nowstr(), \ request['guid']))
def download(attempt): """Given a row from the job-tracker's download_attempts table, actually attempt the download. """ file = jobtracker.query("SELECT * FROM files " \ "WHERE id=%d" % attempt['download_id'], \ fetchone=True) request = jobtracker.query("SELECT * FROM requests " \ "WHERE id=%d" % file['request_id'], \ fetchone=True) queries = [] try: cftp = CornellFTP.CornellFTP() cftp.download(os.path.join(request['guid'], file['remote_filename'])) except Exception, e: raise queries.append("UPDATE files " \ "SET status='failed', " \ "updated_at='%s', " \ "details='Download failed - %s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), str(e), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='download_failed', " \ "details='Download failed - %s', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (str(e), jobtracker.nowstr(), attempt['id']))
def update_jobs_status_from_queue(): """ Updates Database entries for job processing according to the Jobs' Queue Status. """ # Collect all non processed jobs from db linking to downloaded files submits = jobtracker.query("SELECT * FROM job_submits " "WHERE status='running'") for submit in submits: # Check if job is still running (according to queue manager) is_running = config.jobpooler.queue_manager.is_running(submit["queue_id"]) if is_running: # Do nothing. pass else: # Check if processing had errors if config.jobpooler.queue_manager.had_errors(submit["queue_id"]): # Errors during processing... errormsg = config.jobpooler.queue_manager.get_errors(submit["queue_id"]) jobpool_cout.outs( "Processing of Job #%d (Submit ID: %d; Queue ID: %s) " "had errors." % (submit["job_id"], submit["id"], submit["queue_id"]) ) # Mark job entry with status 'failed' # Mark job_submit entry with status 'processing_failed' queries = [] arglists = [] queries.append( "UPDATE jobs " "SET status='failed', " "updated_at=?, " "details='Errors during processing' " "WHERE id=?" ) arglists.append((jobtracker.nowstr(), submit["job_id"])) queries.append( "UPDATE job_submits " "SET status='processing_failed', " "details=?, " "updated_at=? " "WHERE id=?" ) arglists.append((errormsg, jobtracker.nowstr(), submit["id"])) jobtracker.execute(queries, arglists) else: # No errors. Woohoo! # Mark job and job_submit entries with status 'processed' queries = [] queries.append( "UPDATE jobs " "SET status='processed', " "updated_at='%s', " "details='Processed without errors' " "WHERE id=%d" % (jobtracker.nowstr(), submit["job_id"]) ) queries.append( "UPDATE job_submits " "SET status='processed', " "updated_at='%s', " "details='Processed without error' " "WHERE id=%d" % (jobtracker.nowstr(), submit["id"]) ) jobtracker.query(queries)
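# update_jobs_status_from_queue() (and submit()/the kill script below) only
# ever call a handful of methods on config.jobpooler.queue_manager. A minimal
# stub of that implied interface, handy for testing the tracker logic offline;
# the method bodies are placeholders, not the behaviour of any real queue
# manager.
class DummyQueueManager(object):
    def submit(self, datafiles, outdir, job_id, **kwargs):
        # Return a queue ID string for the submitted job.
        return "dummy.%d" % job_id

    def is_running(self, queue_id):
        # True while the queue still reports the job as running.
        return False

    def had_errors(self, queue_id):
        # True if the finished job produced errors.
        return False

    def get_errors(self, queue_id):
        # Text describing the errors, stored in job_submits 'details'.
        return ""

    def delete(self, queue_id):
        # Remove the job from the queue (used when killing jobs manually).
        pass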
def recover_failed_downloads(): """For each entry in the job-tracker DB's files table check if the download can be retried or not. Update status and clean up, as necessary. """ failed_files = jobtracker.query("SELECT * FROM files " \ "WHERE status='failed'") for file in failed_files: attempts = jobtracker.query("SELECT * FROM download_attempts " \ "WHERE download_id=%d" % file['id']) if len(attempts) < config.download.numretries: # download can be retried jobtracker.query("UPDATE files " \ "SET status='retrying', " \ "updated_at='%s', " \ "details='Download will be attempted again' " \ "WHERE id=%s" % \ (jobtracker.nowstr(), file['id'])) else: # Abandon this file if os.path.exists(file['filename']): os.remove(file['filename']) jobtracker.query("UPDATE files " \ "SET status='terminal_failure', " \ "updated_at='%s', " \ "details='This file has been abandoned' " \ "WHERE id=%s" % \ (jobtracker.nowstr(), file['id']))
def create_parallel_folding_jobs(): """Check job-tracker DB for processed jobs. Submit successive jobs and create entries in the jobs table. """ Jobs = check_parallel_jobs() file_ids = Jobs.keys() file_ids.sort() queries = [] for file_id in file_ids: # retrieve file_ids # rows = jobtracker.query("SELECT job_id from job_files " \ # "WHERE file_id=%d AND task LIKE 'search%'"%file_id) # # files_ids = [str(row['file_id']) for row in rows] # Submit all parallel jobs () # for istep in range(config.searching.ddplans['nuppi']): if 1: # task_name = "folding %d"%istep # TODO task_name = "folding" queries.append( "INSERT INTO jobs (" "created_at, " "details, " "status, " "task, " "updated_at) " "VALUES ('%s', '%s', '%s', '%s', '%s')" % (jobtracker.nowstr(), "Newly created job", "new", task_name, jobtracker.nowstr()) ) # for file_id in files_ids: queries.append( "INSERT INTO job_files (" "file_id, " "created_at, " "job_id, " "updated_at) " "SELECT id, '%s', (SELECT LAST_INSERT_ID()), '%s' " "FROM files " "WHERE id=%d" % (jobtracker.nowstr(), jobtracker.nowstr(), int(file_id)) ) # Mark the previous task as 'done' for job_id in Jobs[file_id]: queries.append( "UPDATE jobs " "SET status='done', " "updated_at='%s', " "details='Processed without errors' " "WHERE id=%d" % (jobtracker.nowstr(), job_id) ) jobtracker.query(queries)
def make_request(dbname='default'): """Make a request for data to be restored by connecting to the data server. """ num_beams = get_num_to_request() if not num_beams: # Request size is 0 return dlm_cout.outs("Requesting data\nIssuing a request of size %d" % num_beams) # Ask to restore num_beams db = database.Database(dbname) QUERY = "SELECT f.obs_id FROM full_processing as f LEFT JOIN processing AS p ON f.obs_id = p.obs_id WHERE f.status='available' AND p.details is NULL LIMIT %d"%num_beams db.cursor.execute(QUERY) obs_ids = [row[0] for row in db.cursor.fetchall()] # Ask for an uuid QUERY = "SELECT UUID();" db.cursor.execute(QUERY) guid = db.cursor.fetchone()[0] if not obs_ids: print "There are no files to be restored." return # Mark the beams for restorations for obs_id in obs_ids: QUERY = "UPDATE full_processing SET status='requested', guid='%s', updated_at=NOW() WHERE obs_id=%s"%(guid, obs_id) db.cursor.execute(QUERY) db.conn.close() #if guid == "fail": # raise pipeline_utils.PipelineError("Request for restore returned 'fail'.") requests = jobtracker.query("SELECT * FROM requests WHERE guid='%s'" % guid) if requests: # Entries in the requests table exist with this GUID!? raise pipeline_utils.PipelineError("There are %d requests in the " \ "job-tracker DB with this GUID %s" % \ (len(requests), guid)) jobtracker.query("INSERT INTO requests ( " \ "numbits, " \ "numrequested, " \ "file_type, " \ "guid, " \ "created_at, " \ "updated_at, " \ "status, " \ "details) " \ "VALUES (%d, %d, '%s', '%s', '%s', '%s', '%s', '%s')" % \ (config.download.request_numbits, num_beams, \ config.download.request_datatype, guid, \ jobtracker.nowstr(), jobtracker.nowstr(), 'waiting', \ 'Newly created request'))
def verify_files(): """For all downloaded files with status 'unverify' verify the files. """ toverify = jobtracker.query("SELECT * FROM files " \ "WHERE status='unverified'") for file in toverify: if os.path.exists(file['filename']): actualsize = os.path.getsize(file['filename']) else: actualsize = -1 expectedsize = file['size'] last_attempt_id = jobtracker.query("SELECT id " \ "FROM download_attempts " \ "WHERE download_id=%s " \ "ORDER BY id DESC " % file['id'], \ fetchone=True)[0] queries = [] if actualsize == expectedsize: dlm_cout.outs("Download of %s is complete and verified." % \ os.path.split(file['filename'])[-1]) # Everything checks out! queries.append("UPDATE files " \ "SET status='downloaded', " \ "details='Download is complete and verified', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='downloaded', " \ "details='Download is complete and verified', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), last_attempt_id)) else: dlm_cout.outs("Verification of %s failed. \n" \ "\tActual size (%d bytes) != Expected size (%d bytes)" % \ (os.path.split(file['filename'])[-1], actualsize, expectedsize)) # Boo... verification failed. queries.append("UPDATE files " \ "SET status='failed', " \ "details='Downloaded file failed verification', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='verification_failed', " \ "details='Downloaded file failed verification', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), last_attempt_id)) jobtracker.query(queries)
def create_parallel_search_jobs():
    """Check job-tracker DB for processed jobs. Submit successive jobs
        and create entries in the jobs table.
    """
    # Look for jobs whose rfifind task is done
    rows = jobtracker.query("SELECT * from jobs "
                            "WHERE status='processed' "
                            "AND task='rfifind'")
    queries = []
    for row in rows:
        # Retrieve the file IDs linked to this job
        rows2 = jobtracker.query("SELECT * from job_files "
                                 "WHERE job_id=%d" % row["id"])
        files_ids = [str(row2["file_id"]) for row2 in rows2]

        # Submit all parallel jobs (1 job per DDplan)
        for istep in range(len(config.searching.ddplans["nuppi"])):
            task_name = "search %d" % istep  # TODO
            queries.append(
                "INSERT INTO jobs ("
                "created_at, "
                "details, "
                "status, "
                "task, "
                "updated_at) "
                "VALUES ('%s', '%s', '%s', '%s', '%s')"
                % (jobtracker.nowstr(), "Newly created job", "new",
                   task_name, jobtracker.nowstr())
            )
            for file_id in files_ids:
                queries.append(
                    "INSERT INTO job_files ("
                    "file_id, "
                    "created_at, "
                    "job_id, "
                    "updated_at) "
                    "SELECT id, '%s', (SELECT LAST_INSERT_ID()), '%s' "
                    "FROM files "
                    "WHERE id=%d"
                    % (jobtracker.nowstr(), jobtracker.nowstr(), int(file_id))
                )
        # Mark the previous task as 'done'
        queries.append(
            "UPDATE jobs "
            "SET status='done', "
            "updated_at='%s', "
            "details='Processed without errors' "
            "WHERE id=%d" % (jobtracker.nowstr(), row["id"])
        )
    jobtracker.query(queries)
def create_jobs_for_new_files(): """Check job-tracker DB for newly downloaded files. Group jobs that belong to the same observation and create entries in the jobs table. """ # Get files that were not associated with a job yet rows = jobtracker.query( "SELECT filename FROM files " "LEFT JOIN job_files " "ON job_files.file_id=files.id " "WHERE files.status IN ('downloaded', 'added') " "AND job_files.id IS NULL" ) newfns = [str(row["filename"]) for row in rows] # Group together files that belong together groups = datafile.simple_group_files(newfns) # Keep only groups that are not missing any files complete_groups = [grp for grp in groups if SPAN512_job.is_complete(grp)] if complete_groups: jobpool_cout.outs("Inserting %d new entries into jobs table" % len(complete_groups)) # Label the first task task_name = "rfifind" for complete in complete_groups: # Insert new job and link it to data files queries = [] queries.append( "INSERT INTO jobs (" "created_at, " "details, " "status, " "task, " "updated_at) " "VALUES ('%s', '%s', '%s', '%s', '%s')" % (jobtracker.nowstr(), "Newly created job", "new", task_name, jobtracker.nowstr()) ) queries.append( "INSERT INTO job_files (" "file_id, " "created_at, " "job_id, " "updated_at) " "SELECT id, '%s', (SELECT LAST_INSERT_ID()), '%s' " "FROM files " "WHERE filename IN ('%s')" % (jobtracker.nowstr(), jobtracker.nowstr(), "', '".join(complete)) ) jobtracker.query(queries)
def create_download(file_path): filename = os.path.basename(file_path) filesize = os.path.getsize(file_path) query = "INSERT INTO files (" \ "remote_filename, " \ "filename, " \ "status, " \ "created_at, " \ "updated_at, " \ "size, " \ "details) " \ "VALUES ('%s','%s','%s','%s','%s',%u,'%s')" % \ (filename, file_path, 'downloaded', jobtracker.nowstr(), \ jobtracker.nowstr(), filesize, \ "Manually added via add_files.py") return jobtracker.query(query)
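# create_download() registers an already-downloaded file directly in the
# files table ("Manually added via add_files.py"). A hypothetical driver for
# that script could walk a directory and register every data file it finds;
# the directory argument and filename pattern below are illustrative only.
import glob
import os.path

def add_files_from_dir(datadir):
    for file_path in sorted(glob.glob(os.path.join(datadir, "*.fits"))):
        file_id = create_download(file_path)
        print "Registered %s (files.id=%s)" % (os.path.basename(file_path), file_id)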
def main():
    jobids = set([int(id) for id in args])
    jobids.update(options.jobids)

    for fn in options.files:
        rows = jobtracker.query("SELECT job_files.job_id FROM job_files " \
                                "LEFT JOIN files " \
                                "ON files.id = job_files.file_id " \
                                "WHERE files.filename LIKE '%%%s' " % fn)
        for row in rows:
            jobids.add(row['job_id'])

    print "Number of jobs to kill: %d" % len(jobids)
    for jobid in jobids:
        print "Attempting to kill job with id %d" % jobid
        row = jobtracker.query("SELECT status FROM jobs " \
                               "WHERE id=%d" % jobid, \
                               fetchone=True)
        if row['status'] in ['new', 'retrying']:
            jobtracker.query("UPDATE jobs " \
                             "SET status='terminal_failure', " \
                             "updated_at='%s', " \
                             "details='Job was killed manually' " \
                             "WHERE id=%d" % \
                             (jobtracker.nowstr(), jobid))
            print "Job's status has been set to 'terminal_failure'"
            pipeline_utils.clean_up(jobid)
        else:
            print "Only jobs whose status is 'new' or 'retrying' " \
                  "can be killed. (Current status of job %d: %s)" % \
                  (jobid, row['status'])
def download(attempt): """Given a row from the job-tracker's download_attempts table, actually attempt the download. """ file = jobtracker.query("SELECT * FROM files " \ "WHERE id=%d" % attempt['file_id'], \ fetchone=True) request = jobtracker.query("SELECT * FROM requests " \ "WHERE id=%d" % file['request_id'], \ fetchone=True) queries = [] # Download using bbftp res = DownloaderSPAN512.exec_download(request, file) # bbftp should report 'get filename OK' if the transfer is successfull if res == 'OK': queries.append("UPDATE files " \ "SET status='unverified', " \ "updated_at='%s', " \ "details='Download is complete - File is unverified' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='complete', " \ "details='Download is complete', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), attempt['id'])) else: queries.append("UPDATE files " \ "SET status='failed', " \ "updated_at='%s', " \ "details='Download failed - %s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), str(res), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='download_failed', " \ "details='Download failed - %s', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (str(res), jobtracker.nowstr(), attempt['id'])) jobtracker.query(queries)
def create_jobs_for_new_files(): """Check job-tracker DB for newly downloaded files. Group jobs that belong to the same observation and create entries in the jobs table. """ # Get files that aren't already associated with a job rows = jobtracker.query("SELECT filename FROM files " \ "LEFT JOIN job_files " \ "ON job_files.file_id=files.id " \ "WHERE files.status='downloaded' " \ "AND job_files.id IS NULL") newfns = [str(row['filename']) for row in rows] # Group together files that belong together groups = datafile.group_files(newfns) # Keep only groups that are not missing any files complete_groups = [grp for grp in groups if datafile.is_complete(grp)] if complete_groups: jobpool_cout.outs("Inserting %d new entries into jobs table" % \ len(complete_groups)) for complete in complete_groups: # Insert new job and link it to data files queries = [] queries.append("INSERT INTO jobs (" \ "created_at, " \ "details, " \ "status, " \ "updated_at) " \ "VALUES ('%s', '%s', '%s', '%s')" % \ (jobtracker.nowstr(), 'Newly created job', \ 'new', jobtracker.nowstr())) queries.append("INSERT INTO job_files (" \ "file_id, " \ "created_at, " \ "job_id, " \ "updated_at) " \ "SELECT id, '%s', (SELECT LAST_INSERT_ROWID()), '%s' " \ "FROM files " \ "WHERE filename IN ('%s')" % \ (jobtracker.nowstr(), jobtracker.nowstr(), \ "', '".join(complete))) jobtracker.query(queries)
def main(): for queue_id in args: job_submits = jobtracker.query("SELECT id, job_id, queue_id " \ "FROM job_submits " \ "WHERE queue_id LIKE '%s'" % queue_id) if len(job_submits) != 1: sys.stderr.write("Bad number (%d) of job submissions for queue " \ "ID provided: %s\n" % (len(job_submits), queue_id)) elif config.jobpooler.queue_manager.is_running(job_submits[0]['queue_id']): print "Stopping job: %s" % job_submits[0]['queue_id'] queries = [] if options.fail: queries.append("UPDATE job_submits " \ "SET status='stopped', " \ "updated_at='%s', " \ "details='Job was manually failed' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_submits[0]['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "updated_at='%s', " \ "details='Job was manually failed' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_submits[0]['job_id'])) else: queries.append("DELETE FROM job_submits " \ "WHERE id=%d" % job_submits[0]['id']) queries.append("UPDATE jobs " \ "SET status='retrying', " \ "updated_at='%s', " \ "details='Job was manually removed, politely' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_submits[0]['job_id'])) jobtracker.query(queries) try: config.jobpooler.queue_manager.delete(job_submits[0]['queue_id']) except pipeline_utils.PipelineError, e: print "PipelineError: %s" % str(e) else: sys.stderr.write("There is no job currently in the queue with " \ "the ID provided: %s\n" % job_submits[0]['queue_id'])
def start_downloads(): """Check for entries in the files table with status 'retrying' or 'new' and start the downloads. """ todownload = jobtracker.query("SELECT * FROM files " \ "WHERE status='retrying' " \ "ORDER BY created_at ASC") todownload += jobtracker.query("SELECT * FROM files " \ "WHERE status='new' " \ "ORDER BY created_at ASC") for file in todownload: if can_download(): dlm_cout.outs("Initiating download of %s" % \ os.path.split(file['filename'])[-1]) # Update file status and insert entry into download_attempts queries = [] queries.append("UPDATE files " \ "SET status='downloading', " \ "details='Initiated download', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) jobtracker.query(queries) queries = [] queries.append("INSERT INTO download_attempts (" \ "status, " \ "details, " \ "updated_at, " \ "created_at, " \ "file_id) " \ "VALUES ('%s', '%s', '%s', '%s', %d)" % \ ('downloading', 'Initiated download', jobtracker.nowstr(), \ jobtracker.nowstr(), file['id'])) insert_id = jobtracker.query(queries, fetchone=True) attempt = jobtracker.query("SELECT * FROM download_attempts " \ "WHERE id=%d" % insert_id, fetchone=True) # download(attempt) DownloadThread(attempt).start() else: break
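# start_downloads() gates each new transfer on can_download(), which is not
# shown in this file. A minimal sketch, assuming the policy is simply a cap
# on simultaneous DownloadThreads; the config attribute name
# (config.download.numdownloads) is an assumption, not a confirmed setting.
def can_download():
    active = [t for t in threading.enumerate() if isinstance(t, DownloadThread)]
    return len(active) < getattr(config.download, 'numdownloads', 2)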
def check_active_requests():
    """Check for any requests with status='waiting'. If there are
        some, check if the files are ready for download.
    """
    active_requests = jobtracker.query("SELECT * FROM requests " \
                                       "WHERE status='waiting'")
    for request in active_requests:
        # Check requested status
        if DownloaderSPAN512.check_request_done(request):
            dlm_cout.outs("Restore (GUID: %s) has succeeded. Will create file entries.\n" % \
                          request['guid'])
            create_file_entries(request)
        else:
            #dlm_cout.outs("Request (GUID: %s) has failed.\n" \
            #              "\tDatabase failed to report the data as restored." % request['guid'])
            #jobtracker.query("UPDATE requests SET status='failed', " \
            #                 "details='Request failed. Why ?', " \
            #                 "updated_at='%s' " \
            #                 "WHERE guid='%s'" % (jobtracker.nowstr(), request['guid']))
            query = "SELECT (TO_SECONDS('%s')-TO_SECONDS(created_at)) " \
                    "AS deltaT_seconds " \
                    "FROM requests " \
                    "WHERE guid='%s'" % \
                    (jobtracker.nowstr(), request['guid'])
            row = jobtracker.query(query, fetchone=True)
            if row['deltaT_seconds']/3600. > config.download.request_timeout:
                dlm_cout.outs("Restore (GUID: %s) is over %d hr old " \
                              "and still not ready. Marking " \
                              "it as failed." % \
                              (request['guid'], config.download.request_timeout))
                jobtracker.query("UPDATE requests " \
                                 "SET status='failed', " \
                                 "details='Request took too long (> %d hr)', " \
                                 "updated_at='%s' " \
                                 "WHERE guid='%s'" % \
                                 (config.download.request_timeout, jobtracker.nowstr(), \
                                  request['guid']))
def mark_finished_beams():
    """Mark beams whose folding task has been processed as 'finished'
        in the job-tracker DB.
    """
    finished_jobs = jobtracker.query("SELECT * FROM jobs WHERE "
                                     "task like 'folding%' AND status='processed'")
    queries = []
    for finished_job in finished_jobs:
        queries.append(
            "UPDATE jobs "
            "SET status='finished', "
            "updated_at='%s', "
            "details='Finished without error' "
            "WHERE id=%d" % (jobtracker.nowstr(), finished_job["id"])
        )
    jobtracker.query(queries)
def remove_file(fn): """Delete a file (if it exists) and mark it as deleted in the job-tracker DB. Input: fn: The name of the file to remove. Outputs: None """ import jobtracker if os.path.exists(fn): os.remove(fn) print "Deleted: %s" % fn jobtracker.query("UPDATE files " \ "SET status='deleted', " \ "updated_at='%s', " \ "details='File was deleted' " \ "WHERE filename='%s'" % \ (jobtracker.nowstr(), fn))
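# pipeline_utils.clean_up(job_id) is called from the kill script and from
# submit(), but only remove_file() is shown here. A plausible sketch of how
# the two fit together, assuming the get_fns_for_jobid() helper that submit()
# already uses; this is an illustration, not the pipeline's actual clean_up().
def clean_up(job_id):
    """Delete the raw data files associated with a job and mark them
        as deleted in the job-tracker DB.
    """
    for fn in pipeline_utils.get_fns_for_jobid(job_id):
        remove_file(fn)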
def acknowledge_downloaded_files(): """Acknowledge the reception of the files """ requests_to_delete = jobtracker.query("SELECT * FROM requests " \ "WHERE status='finished'") if len(requests_to_delete) > 0: queries = [] for request_to_delete in requests_to_delete: DownloaderSPAN512.delete_stagged_file(request_to_delete) dlm_cout.outs("Report download (%s) succeeded." % request_to_delete['guid']) queries.append("UPDATE requests " \ "SET status='cleaned_up', " \ "details='download complete', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), request_to_delete['id'])) jobtracker.query(queries) else: pass
def check_downloading_requests(): requests = jobtracker.query("SELECT * FROM requests "\ "WHERE status='downloading'") if len(requests) > 0: queries = [] for request in requests: files_in_request = jobtracker.query("SELECT * FROM files "\ "WHERE request_id=%d" % \ request['id']) downloaded_files = 0 for f in files_in_request: if f['status'] == 'downloaded': downloaded_files += 1 if downloaded_files == len(files_in_request): queries.append("UPDATE requests " \ "SET status='finished', " \ "details='All files downloaded', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), request['id'])) jobtracker.query(queries) else: pass
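# check_downloading_requests() counts downloaded files per request in Python.
# The same test can be pushed into a single aggregate query over the files
# table, which avoids fetching every file row; a sketch using only the tables
# and the jobtracker.query() interface already used above (finished_request_ids
# is an illustrative helper name).
def finished_request_ids():
    rows = jobtracker.query("SELECT requests.id AS id FROM requests " \
                            "LEFT JOIN files " \
                            "ON files.request_id=requests.id " \
                            "WHERE requests.status='downloading' " \
                            "GROUP BY requests.id " \
                            "HAVING SUM(files.status!='downloaded')=0")
    return [row['id'] for row in rows]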
def verify_files(): """For all downloaded files with status 'unverify' verify the files. Inputs: None Output: numverified: The number of files successfully verified. """ toverify = jobtracker.query("SELECT * FROM files " \ "WHERE status='unverified'") k = 1 numverified = 0 for file in toverify: if k % 10 == 0: print k, '/', len(toverify) k += 1 if os.path.exists(file['filename']): actualsize = os.path.getsize(file['filename']) else: # Check if download.datadir has changed since file entry was created # and if that is why file missing. alt_path = os.path.join(config.download.datadir, os.path.split(file['filename'])[-1]) if os.path.exists(alt_path): actualsize = os.path.getsize(alt_path) jobtracker.query("UPDATE files SET filename='%s' WHERE id=%d" % (alt_path, file['id'])) else: actualsize = -1 expectedsize = file['size'] last_attempt_id = jobtracker.query("SELECT id " \ "FROM download_attempts " \ "WHERE file_id=%s " \ "ORDER BY id DESC " %file['id'],fetchone=True)[0] queries = [] if actualsize == expectedsize: dlm_cout.outs("Download of %s is complete and verified." % \ os.path.split(file['filename'])[-1]) # Everything checks out! queries.append("UPDATE files " \ "SET status='downloaded', " \ "details='Download is complete and verified', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='downloaded', " \ "details='Download is complete and verified', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), last_attempt_id)) numverified += 1 else: dlm_cout.outs("Verification of %s failed. \n" \ "\tActual size (%d bytes) != Expected size (%d bytes)" % \ (os.path.split(file['filename'])[-1], actualsize, expectedsize)) # Boo... verification failed. queries.append("UPDATE files " \ "SET status='failed', " \ "details='Downloaded file failed verification', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='verification_failed', " \ "details='Downloaded file failed verification', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), last_attempt_id)) jobtracker.query(queries) return numverified
"details='Download failed - %s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), str(e), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='download_failed', " \ "details='Download failed - %s', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (str(e), jobtracker.nowstr(), attempt['id'])) else: queries.append("UPDATE files " \ "SET status='unverified', " \ "updated_at='%s', " \ "details='Download is complete - File is unverified' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='complete', " \ "details='Download is complete', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), attempt['id'])) jobtracker.query(queries) def verify_files(): """For all downloaded files with status 'unverify' verify the files. """ toverify = jobtracker.query("SELECT * FROM files " \ "WHERE status='unverified'")
def submit(job_row): """ Submits a job to QueueManager, if successful will store returned queue id. Input: job_row: A row from the jobs table. The datafiles associated with this job will be submitted to be processed. Outputs: None """ fns = pipeline_utils.get_fns_for_jobid(job_row['id']) try: presubmission_check(fns) outdir = get_output_dir(fns) # Attempt to submit the job queue_id = config.jobpooler.queue_manager.submit\ (fns, outdir, job_row['id']) except (FailedPreCheckError): # Error caught during presubmission check. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Job ID: %d " % job_row['id'] errormsg += "failed presubmission check!\n\n" errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) if config.email.send_on_terminal_failures: # Send error email msg = "Presubmission check failed!\n" msg += "Job ID: %d\n\n" % \ (job_row['id']) msg += errormsg msg += "\n*** Job has been terminally failed. ***\n" msg += "*** Job will NOT be re-submitted! ***\n" if config.basic.delete_rawdata: jobpool_cout.outs("Job #%d will NOT be retried. " \ "Data files will be deleted." % job_row['id']) msg += "*** Raw data files will be deleted. ***\n" else: jobpool_cout.outs("Job #%d will NOT be retried. " % job_row['id']) notification = mailer.ErrorMailer(msg, \ subject="Job failed presubmission check - Terminal") notification.send() if config.basic.delete_rawdata: pipeline_utils.clean_up(job_row['id']) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'precheck_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='terminal_failure', " \ "details='Failed presubmission check', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append( (jobtracker.nowstr(), job_row['id']) ) jobtracker.execute(queries, arglist) except (queue_managers.QueueManagerJobFatalError,\ datafile.DataFileError): # Error caught during job submission. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while submitting job!\n" errormsg += "\tJob ID: %d\n\n" % job_row['id'] errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Error while submitting job!\n" \ "\tJob ID: %d\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'submission_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while submitting job', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append( (jobtracker.nowstr(), job_row['id']) ) jobtracker.execute(queries, arglist) except queue_managers.QueueManagerNonFatalError: # Do nothing. Don't submit the job. Don't mark the job as 'submitted'. # Don't mark the job as 'failed'. The job submission will be retried. pass except queue_managers.QueueManagerFatalError: # A fatal error occurred. Re-raise! 
raise else: # No error occurred msg = "Submitted job to process:\n" msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id) msg += "\tData file(s):\n" for fn in fns: msg += "\t%s\n" % fn jobpool_cout.outs(msg) queries = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "queue_id, " \ "output_dir, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (%d,'%s','%s','%s','%s','%s','%s')" % \ (job_row['id'], queue_id, outdir, 'running', \ jobtracker.nowstr(), jobtracker.nowstr(), \ 'Job submitted to queue')) queries.append("UPDATE jobs " \ "SET status='submitted', " \ "details='Job submitted to queue', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_row['id'])) jobtracker.query(queries)
requests = jobtracker.query("SELECT * FROM requests " \ "WHERE guid='%s'" % guid) if requests: # Entries in the requests table exist with this GUID!? raise pipeline_utils.PipelineError("There are %d requests in the " \ "job-tracker DB with this GUID %s" % \ (len(requests), guid)) jobtracker.query("INSERT INTO requests ( " \ "guid, " \ "created_at, " \ "updated_at, " \ "status, " \ "details) " \ "VALUES ('%s', '%s', '%s', '%s', '%s')" % \ (guid, jobtracker.nowstr(), jobtracker.nowstr(), 'waiting', \ 'Newly created request')) def check_active_requests(): """Check for any requests with status='waiting'. If there are some, check if the files are ready for download. """ active_requests = jobtracker.query("SELECT * FROM requests " \ "WHERE status='waiting'") web_service = suds.client.Client(config.download.api_service_url).service for request in active_requests: location = web_service.Location(guid=request['guid'], \ username=config.download.api_username, \ pw=config.download.api_password)
# Rolling back changes. db.rollback() raise else: # No errors encountered. Commit changes to the DB. db.commit() # Update database statuses queries = [] queries.append("UPDATE job_submits " \ "SET status='uploaded', " \ "details='Upload successful (header_id=%d)', " \ "updated_at='%s' " \ "WHERE id=%d" % (header_id, jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='uploaded', " \ "details='Upload successful (header_id=%d)', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (header_id, jobtracker.nowstr(), job_submit['job_id'])) jobtracker.query(queries) print "Results successfully uploaded" if config.basic.delete_rawdata: pipeline_utils.clean_up(job_submit['job_id']) print "" # Just a blank line
def verify_files(): """For all downloaded files with status 'unverify' verify the files. Inputs: None Output: numverified: The number of files successfully verified. """ toverify = jobtracker.query("SELECT * FROM files " \ "WHERE status='unverified'") numverified = 0 for file in toverify: if os.path.exists(file['filename']): actualsize = os.path.getsize(file['filename']) else: actualsize = -1 expectedsize = file['size'] last_attempt_id = jobtracker.query("SELECT id " \ "FROM download_attempts " \ "WHERE file_id=%s " \ "ORDER BY id DESC " % file['id'], \ fetchone=True)[0] queries = [] if actualsize == expectedsize: dlm_cout.outs("Download of %s is complete and verified." % \ os.path.split(file['filename'])[-1]) # Everything checks out! queries.append("UPDATE files " \ "SET status='downloaded', " \ "details='Download is complete and verified', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='downloaded', " \ "details='Download is complete and verified', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), last_attempt_id)) numverified += 1 else: dlm_cout.outs("Verification of %s failed. \n" \ "\tActual size (%d bytes) != Expected size (%d bytes)" % \ (os.path.split(file['filename'])[-1], actualsize, expectedsize)) # Boo... verification failed. queries.append("UPDATE files " \ "SET status='failed', " \ "details='Downloaded file failed verification', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), file['id'])) queries.append("UPDATE download_attempts " \ "SET status='verification_failed', " \ "details='Downloaded file failed verification', " \ "updated_at='%s'" \ "WHERE id=%d" % \ (jobtracker.nowstr(), last_attempt_id)) jobtracker.query(queries) return numverified
def upload_results(job_submit): """ Uploads Results for a given submit. Input: job_submit: A row from the job_submits table. Results from this job submission will be uploaded. Output: None """ print "Attempting to upload results" print "\tJob ID: %d, Job submission ID: %d" % \ (job_submit['job_id'], job_submit['id']) try: db = database.Database('common-copy', autocommit=False) # Prepare for upload dir = job_submit['output_dir'] fitsfiles = get_fitsfiles(job_submit) # Upload results header_id = header.upload_header(fitsfiles, dbname=db) if not header.upload_header(fitsfiles, dbname=db): raise AssertionError("Header values in common DB " \ "do not match values uploaded.") else: print "\tHeader uploaded and checked. Header ID: %d" % header_id version_number = get_version_number(dir) candidate_uploader.upload_candidates(header_id, \ version_number, \ dir, dbname=db) if not candidate_uploader.check_candidates(header_id, \ version_number, \ dir, dbname=db): raise AssertionError("Candidate values in common DB " \ "do not match values uploaded.") else: print "\tCandidates uploaded and checked." data = datafile.autogen_dataobj(fitsfiles) diagnostic_uploader.upload_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ dir, dbname=db) if not diagnostic_uploader.check_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ dir, dbname=db): raise AssertionError("Diagnostic values in common DB " \ "do not match values uploaded.") else: print "\tDiagnostics uploaded and checked." except (header.HeaderError, \ candidate_uploader.PeriodicityCandidateError, \ diagnostic_uploader.DiagnosticError): # Parsing error caught. Job attempt has failed! exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while checking results!\n" errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \ (job_submit['job_id'], job_submit['id']) errormsg += "".join(exceptionmsgs) sys.stderr.write("Error while checking results!\n") sys.stderr.write("Database transaction will not be committed.\n") sys.stderr.write("\t%s" % exceptionmsgs[-1]) queries = [] arglists = [] queries.append("UPDATE job_submits " \ "SET status='upload_failed', " \ "details=?, " \ "updated_at=? " \ "WHERE id=?") arglists.append((errormsg, jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while uploading results', " \ "updated_at=? " \ "WHERE id=?") arglists.append((jobtracker.nowstr(), job_submit['job_id'])) jobtracker.execute(queries, arglists) # Rolling back changes. db.rollback() except database.DatabaseConnectionError, e: # Connection error while uploading. We will try again later. sys.stderr.write(str(e)) sys.stderr.write("\tRolling back DB transaction and will re-try later.\n") # Rolling back changes. db.rollback()
def main(): jobsubmit_ids = options.submit_ids queue_ids = args + options.queue_ids for queue_id in queue_ids: qids = jobtracker.query("SELECT id " \ "FROM job_submits " \ "WHERE queue_id LIKE '%s'" % queue_id) if len(qids) != 1: sys.stderr.write("Bad number (%d) of job submissions for queue " \ "ID provided: %s\nSkipping...\n" % (len(qids), queue_id)) else: jobsubmit_ids.append(qids[0]['id']) for jobsubmit_id in jobsubmit_ids: job_submits = jobtracker.query("SELECT id, job_id, status, queue_id " \ "FROM job_submits " \ "WHERE id LIKE '%s'" % jobsubmit_id) if len(job_submits) != 1: sys.stderr.write("Bad number (%d) of job submissions for job submit " \ "ID provided: %s\nSkipping...\n" % (len(job_submits), jobsubmit_id)) continue elif config.jobpooler.queue_manager.is_running( job_submits[0]['queue_id']): isrunning = True elif job_submits[0]['status'] == 'processed' and options.also_processed: isrunning = False else: sys.stderr.write("The job submit ID/queue ID provided is invalid. " \ "This code only allows jobs currently running " \ "(i.e. in the queue), or job submits with " \ "status='processed' (if the --also-processed flag " \ "is given), to be stopped/failed. Sorry. Please " \ "try again!\nSkipping...\n") continue queries = [] if options.fail: queries.append("UPDATE job_submits " \ "SET status='stopped', " \ "updated_at='%s', " \ "details='Job was manually failed' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_submits[0]['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "updated_at='%s', " \ "details='Job was manually failed' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_submits[0]['job_id'])) else: queries.append("DELETE FROM job_submits " \ "WHERE id=%d" % job_submits[0]['id']) queries.append("UPDATE jobs " \ "SET status='retrying', " \ "updated_at='%s', " \ "details='Job was manually removed, politely' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_submits[0]['job_id'])) jobtracker.query(queries) if isrunning: print "Stopping job: %s" % job_submits[0]['queue_id'] try: config.jobpooler.queue_manager.delete( job_submits[0]['queue_id']) except pipeline_utils.PipelineError, e: print "PipelineError: %s" % str(e)
def submit(job_row): """ Submits a job to QueueManager, if successful will store returned queue id. Input: job_row: A row from the jobs table. The datafiles associated with this job will be submitted to be processed. Outputs: None """ fns = pipeline_utils.get_fns_for_jobid(job_row['id']) bad_days = ['20170414', '20170419', '20170420', '20170423', '20170423', '20170427', '20170429', '20170503', '20170510', '20170516'] bad_beams = ['b5', 'b6'] for bad_day in bad_days: if bad_day in fns[0]: if (bad_beams[0] in fns[0]) or (bad_beams[1] in fns[0]): print "Files affected by the bad beams 5, 6 60Hz signal: ", fns print "Will delete the raw data files." queries=[] arglist=[] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'Beams 5 and 6', \ jobtracker.nowstr(), jobtracker.nowstr(), \ "Affected by 60Hz signal") ) queries.append("UPDATE jobs " \ "SET status='terminal_failure', " \ "details='Beams 5 and 6 affected by 60Hz signal', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append( (jobtracker.nowstr(), job_row['id']) ) jobtracker.execute(queries, arglist) return try: presubmission_check(fns) outdir = get_output_dir(fns) if outdir=="": pass # Attempt to submit the job if job_row['status'] == 'retrying': ## ppn=2 ppn=1 else: ppn=1 if config.jobpooler.alternative_submit_script: print "Submitting:", config.jobpooler.alternative_submit_script queue_id = config.jobpooler.queue_manager.submit\ (fns, outdir, job_row['id'],\ script=config.jobpooler.alternative_submit_script,\ ppn=ppn) else: queue_id = config.jobpooler.queue_manager.submit\ (fns, outdir, job_row['id'], ppn=ppn) except (FailedPreCheckError): # Error caught during presubmission check. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Job ID: %d " % job_row['id'] errormsg += "failed presubmission check!\n\n" errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) if config.basic.delete_rawdata: pipeline_utils.clean_up(job_row['id']) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'precheck_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='terminal_failure', " \ "details='Failed presubmission check', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append( (jobtracker.nowstr(), job_row['id']) ) jobtracker.execute(queries, arglist) except (queue_managers.QueueManagerJobFatalError,\ datafile.DataFileError): # Error caught during job submission. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while submitting job!\n" errormsg += "\tJob ID: %d\n\n" % job_row['id'] errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Error while submitting job!\n" \ "\tJob ID: %d\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'submission_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while submitting job', " \ "updated_at=? " \ "WHERE id=?" 
) arglist.append( (jobtracker.nowstr(), job_row['id']) ) jobtracker.execute(queries, arglist) except queue_managers.QueueManagerNonFatalError: # Do nothing. Don't submit the job. Don't mark the job as 'submitted'. # Don't mark the job as 'failed'. The job submission will be retried. pass except queue_managers.QueueManagerFatalError: # A fatal error occurred. Re-raise! raise except (MissingFilesError): # Unexpected error exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Job ID: %d " % job_row['id'] errormsg += "Raw data files missing from /scratch/ area.!\n\n" errormsg += "".join(exceptionmsgs) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'submission_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while submitting job', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append( (jobtracker.nowstr(), job_row['id']) ) jobtracker.execute(queries, arglist) print errormsg else: # No error occurred msg = "Submitted job to process:\n" msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id) msg += "\tData file(s):\n" for fn in fns: msg += "\t%s\n" % fn jobpool_cout.outs(msg) queries = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "queue_id, " \ "output_dir, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (%d,'%s','%s','%s','%s','%s','%s')" % \ (job_row['id'], queue_id, outdir, 'running', \ jobtracker.nowstr(), jobtracker.nowstr(), \ 'Job submitted to queue')) queries.append("UPDATE jobs " \ "SET status='submitted', " \ "details='Job submitted to queue', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_row['id'])) jobtracker.query(queries)
sys.stderr.write(errmsg) sys.stderr.write("\tRolling back DB transaction and raising error.\n") # Rolling back changes (just last uncommited FTP). db.rollback() raise pipeline_utils.PipelineError(errmsg) else: # Update database statuses queries = [] queries.append("UPDATE job_submits " \ "SET status='uploaded', " \ "details='Upload successful (header_id=%d)', " \ "updated_at='%s' " \ "WHERE id=%d" % (header_id, jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='uploaded', " \ "details='Upload successful (header_id=%d)', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (header_id, jobtracker.nowstr(), job_submit['job_id'])) jobtracker.query(queries) print "Results successfully uploaded" # Tar and Copy final results to HPSS print "Copy results to HPSS" pipeline_utils.copy_results_to_HPSS(dir) if config.basic.delete_rawdata:
def upload_results(job_submit): """ Uploads Results for a given submit. Input: job_submit: A row from the job_submits table. Results from this job submission will be uploaded. Output: None """ print "Attempting to upload results" print "\tJob ID: %d, Job submission ID: %d" % \ (job_submit['job_id'], job_submit['id']) try: db = database.Database('common-copy', autocommit=False) # Prepare for upload dir = job_submit['output_dir'] fitsfiles = get_fitsfiles(job_submit) # Upload results header_id = header.upload_header(fitsfiles, dbname=db) if not header.upload_header(fitsfiles, dbname=db): raise AssertionError("Header values in common DB " \ "do not match values uploaded.") else: print "\tHeader uploaded and checked. Header ID: %d" % header_id version_number = get_version_number(dir) candidate_uploader.upload_candidates(header_id, \ version_number, \ dir, dbname=db) if not candidate_uploader.check_candidates(header_id, \ version_number, \ dir, dbname=db): raise AssertionError("Candidate values in common DB " \ "do not match values uploaded.") else: print "\tCandidates uploaded and checked." data = datafile.autogen_dataobj(fitsfiles) diagnostic_uploader.upload_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ dir, dbname=db) if not diagnostic_uploader.check_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ dir, dbname=db): raise AssertionError("Diagnostic values in common DB " \ "do not match values uploaded.") else: print "\tDiagnostics uploaded and checked." except (header.HeaderError, \ candidate_uploader.PeriodicityCandidateError, \ diagnostic_uploader.DiagnosticError): # Parsing error caught. Job attempt has failed! exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while checking results!\n" errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \ (job_submit['job_id'], job_submit['id']) errormsg += "".join(exceptionmsgs) sys.stderr.write("Error while checking results!\n") sys.stderr.write("Database transaction will not be committed.\n") sys.stderr.write("\t%s" % exceptionmsgs[-1]) queries = [] arglists = [] queries.append("UPDATE job_submits " \ "SET status='upload_failed', " \ "details=?, " \ "updated_at=? " \ "WHERE id=?") arglists.append((errormsg, jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while uploading results', " \ "updated_at=? " \ "WHERE id=?") arglists.append((jobtracker.nowstr(), job_submit['job_id'])) jobtracker.execute(queries, arglists) # Rolling back changes. db.rollback() except database.DatabaseConnectionError, e: # Connection error while uploading. We will try again later. sys.stderr.write(str(e)) sys.stderr.write( "\tRolling back DB transaction and will re-try later.\n") # Rolling back changes. db.rollback()
def create_file_entries(request): """Given a row from the requests table in the job-tracker DB check the FTP server for its files and create entries in the files table. Input: request: A row from the requests table. Outputs: None """ cftp = CornellFTP.CornellFTP() try: files = cftp.get_files(request['guid']) except CornellFTP.M2Crypto.ftpslib.error_perm: exctype, excvalue, exctb = sys.exc_info() dlm_cout.outs("FTP error getting file information.\n" \ "\tGUID: %s\n\tError: %s" % \ (request['guid'], \ "".join(traceback.format_exception_only(exctype, excvalue)).strip())) files = [] total_size = 0 num_files = 0 queries = [] for fn, size in files: # Check if file is from the phantom beam (beam 7) datafile_type = datafile.get_datafile_type([fn]) parsedfn = datafile_type.fnmatch(fn) if parsedfn.groupdict().setdefault('beam', '-1') == '7': print "Ignoring beam 7 data: %s" % fn continue # Insert entry into DB's files table queries.append("INSERT INTO files ( " \ "request_id, " \ "remote_filename, " \ "filename, " \ "status, " \ "created_at, " \ "updated_at, " \ "size) " \ "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', %d)" % \ (request['id'], fn, os.path.join(config.download.datadir, fn), \ 'new', jobtracker.nowstr(), jobtracker.nowstr(), size)) total_size += size num_files += 1 if num_files: dlm_cout.outs("Request (GUID: %s) has succeeded.\n" \ "\tNumber of files to be downloaded: %d" % \ (request['guid'], num_files)) queries.append("UPDATE requests " \ "SET size=%d, " \ "updated_at='%s', " \ "status='finished', " \ "details='Request has been filled' " \ "WHERE id=%d" % \ (total_size, jobtracker.nowstr(), request['id'])) else: dlm_cout.outs("Request (GUID: %s) has failed.\n" \ "\tThere are no files to be downloaded." % \ request['guid']) queries.append("UPDATE requests " \ "SET updated_at='%s', " \ "status='failed', " \ "details='No files to download' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), request['id'])) jobtracker.query(queries)
def submit(job_row): """ Submits a job to QueueManager, if successful will store returned queue id. Input: job_row: A row from the jobs table. The datafiles associated with this job will be submitted to be processed. Outputs: None """ fns = pipeline_utils.get_fns_for_jobid(job_row['id']) try: outdir = get_output_dir(fns) # Submit job queue_id = config.jobpooler.queue_manager.submit(fns, outdir) msg = "Submitted job to process:\n" msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id) msg += "\tData file(s):\n" for fn in fns: msg += "\t%s\n" % fn jobpool_cout.outs(msg) queries = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "queue_id, " \ "output_dir, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (%d,'%s','%s','%s','%s','%s','%s')" % \ (job_row['id'], queue_id, outdir, 'running', \ jobtracker.nowstr(), jobtracker.nowstr(), \ 'Job submitted to queue')) queries.append("UPDATE jobs " \ "SET status='submitted', " \ "details='Job submitted to queue', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_row['id'])) jobtracker.query(queries) except pipeline_utils.PipelineError: # Error caught during job submission. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while submitting job!\n" errormsg += "\tJob ID: %d\n\n" % job_row['id'] errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Error while submitting job!\n" \ "\tJob ID: %d\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) queries = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (%d,'%s','%s','%s','%s')" % \ (job_row['id'], 'submission_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg)) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while submitting job', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_row['id'])) jobtracker.query(queries)
def create_file_entries2(request, files): """Given a row from the requests table in the job-tracker DB check the FTP server for its files and create entries in the files table. Input: request: A row from the requests table. Outputs: None """ cftp = CornellFTP.CornellFTP() try: files = cftp.get_files(request['guid']) except CornellFTP.M2Crypto.ftpslib.error_perm: exctype, excvalue, exctb = sys.exc_info() dlm_cout.outs("FTP error getting file information.\n" \ "\tGUID: %s\n\tError: %s" % \ (request['guid'], \ "".join(traceback.format_exception_only(exctype, excvalue)).strip())) files = [] print "Create_file_entries : %s new files " % str(len(files)) total_size = 0 num_files = 0 queries = [] k = 1 for fn, size in files: if k % 10 == 0: print k, '/', len(files) k += 1 if not pipeline_utils.can_add_file(fn, verbose=True): dlm_cout.outs("Skipping %s" % fn) continue # Insert entry into DB's files table queries.append("INSERT INTO files ( " \ "request_id, " \ "remote_filename, " \ "filename, " \ "status, " \ "created_at, " \ "updated_at, " \ "size) " \ "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', %d)" % \ (request['id'], fn, os.path.join(config.download.datadir, fn), \ 'new', jobtracker.nowstr(), jobtracker.nowstr(), size)) total_size += size num_files += 1 if num_files: dlm_cout.outs("Request (GUID: %s) has succeeded.\n" \ "\tNumber of files to be downloaded: %d" % \ (request['guid'], num_files)) queries.append("UPDATE requests " \ "SET size=%d, " \ "updated_at='%s', " \ "status='downloading', " \ "details='Request has been filled' " \ "WHERE id=%d" % \ (total_size, jobtracker.nowstr(), request['id'])) else: dlm_cout.outs("Request (GUID: %s) has failed.\n" \ "\tThere are no files to be downloaded." % \ request['guid']) # delete restore since there may be skipped files """ web_service = CornellWebservice.Client() delete_status = web_service.Deleter(guid=request['guid'], \ username=config.download.api_username, \ pw=config.download.api_password) if delete_status == "deletion successful": dlm_cout.outs("Deletion (%s) succeeded." % request['guid']) elif delete_status == "invalid user": dlm_cout.outs("Deletion (%s) failed due to invalid user." % \ request['guid']) elif delete_status == "deletion failed": dlm_cout.outs("Deletion (%s) failed for unknown reasons." % \ request['guid']) """ # redefine 'queries' because there are no files to update queries = ["UPDATE requests " \ "SET updated_at='%s', " \ "status='failed', " \ "details='No files to download.' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), request['id'])] jobtracker.query(queries)
def submit(job_row): """ Submits a job to QueueManager, if successful will store returned queue id. Input: job_row: A row from the jobs table. The datafiles associated with this job will be submitted to be processed. Outputs: None """ fns = pipeline_utils.get_fns_for_jobid(job_row['id']) try: outdir = get_output_dir(fns) # Attempt to submit the job queue_id = config.jobpooler.queue_manager.submit\ (fns, outdir, job_row['id']) except (queue_managers.QueueManagerJobFatalError,\ datafile.DataFileError): # Error caught during job submission. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while submitting job!\n" errormsg += "\tJob ID: %d\n\n" % job_row['id'] errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Error while submitting job!\n" \ "\tJob ID: %d\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'submission_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while submitting job', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append((jobtracker.nowstr(), job_row['id'])) jobtracker.execute(queries, arglist) except queue_managers.QueueManagerNonFatalError: # Do nothing. Don't submit the job. Don't mark the job as 'submitted'. # Don't mark the job as 'failed'. The job submission will be retried. pass except queue_managers.QueueManagerFatalError: # A fatal error occurred. Re-raise! raise else: # No error occurred msg = "Submitted job to process:\n" msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id) msg += "\tData file(s):\n" for fn in fns: msg += "\t%s\n" % fn jobpool_cout.outs(msg) queries = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "queue_id, " \ "output_dir, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (%d,'%s','%s','%s','%s','%s','%s')" % \ (job_row['id'], queue_id, outdir, 'running', \ jobtracker.nowstr(), jobtracker.nowstr(), \ 'Job submitted to queue')) queries.append("UPDATE jobs " \ "SET status='submitted', " \ "details='Job submitted to queue', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_row['id'])) jobtracker.query(queries)
def create_file_entries(request): """Given a row from the requests table in the job-tracker DB check the FTP server for its files and create entries in the files table. Input: request: A row from the requests table. Outputs: None """ cftp = CornellFTP.CornellFTP() try: files = cftp.get_files(request['guid']) except CornellFTP.M2Crypto.ftpslib.error_perm: exctype, excvalue, exctb = sys.exc_info() dlm_cout.outs("FTP error getting file information.\n" \ "\tGUID: %s\n\tError: %s" % \ (request['guid'], \ "".join(traceback.format_exception_only(exctype, excvalue)).strip())) files = [] total_size = 0 num_files = 0 queries = [] for fn, size in files: if not pipeline_utils.can_add_file(fn): dlm_cout.outs("Skipping %s" % fn) continue # Insert entry into DB's files table queries.append("INSERT INTO files ( " \ "request_id, " \ "remote_filename, " \ "filename, " \ "status, " \ "created_at, " \ "updated_at, " \ "size) " \ "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', %d)" % \ (request['id'], fn, os.path.join(config.download.datadir, fn), \ 'new', jobtracker.nowstr(), jobtracker.nowstr(), size)) total_size += size num_files += 1 if num_files: dlm_cout.outs("Request (GUID: %s) has succeeded.\n" \ "\tNumber of files to be downloaded: %d" % \ (request['guid'], num_files)) queries.append("UPDATE requests " \ "SET size=%d, " \ "updated_at='%s', " \ "status='finished', " \ "details='Request has been filled' " \ "WHERE id=%d" % \ (total_size, jobtracker.nowstr(), request['id'])) else: dlm_cout.outs("Request (GUID: %s) has failed.\n" \ "\tThere are no files to be downloaded." % \ request['guid']) # redefine 'queries' because there are no files to update queries = ["UPDATE requests " \ "SET updated_at='%s', " \ "status='failed', " \ "details='No files to download' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), request['id'])] jobtracker.query(queries)
def upload_results(job_submit): """ Uploads Results for a given submit. Input: job_submit: A row from the job_submits table. Results from this job submission will be uploaded. Output: None """ print "Attempting to upload results" print "\tJob ID: %d, Job submission ID: %d\n\tOutput Dir: %s" % \ (job_submit['job_id'], job_submit['id'], job_submit['output_dir']) if debug.UPLOAD: upload.upload_timing_summary = {} starttime = time.time() try: # Connect to the DB db = database.Database('default', autocommit=False) # Prepare for upload dir = job_submit['output_dir'] # NEW Beluga - Untar the tarball import tarfile to_keep = os.listdir(job_submit['output_dir']) tarball = glob.glob(job_submit['output_dir'] + '/*00.tgz')[0] tar = tarfile.open(tarball, 'r:gz') tar.extractall(path=job_submit['output_dir']) tar.close() all_files = os.listdir(job_submit['output_dir']) to_del = set(all_files) - set(to_keep) if config.upload.upload_zerodm_periodicity or config.upload.upload_zerodm_singlepulse: to_keep_zerodm = os.listdir(job_submit['output_dir'] + '/zerodm') tarball = glob.glob(job_submit['output_dir'] + '/zerodm/*zerodm.tgz')[0] tar = tarfile.open(tarball, 'r:gz') tar.extractall(path=job_submit['output_dir'] + '/zerodm') tar.close() all_files_zerodm = os.listdir(job_submit['output_dir'] + '/zerodm') to_del_zerodm = set(all_files_zerodm) - set(to_keep_zerodm) pdm_dir = os.path.join( dir, "zerodm") if config.upload.upload_zerodm_periodicity else dir sp_dir = os.path.join( dir, "zerodm") if config.upload.upload_zerodm_singlepulse else dir if not os.path.exists(dir) or not os.listdir(dir): errormsg = 'ERROR: Results directory, %s, does not exist or is empty for job_id=%d' %\ (dir, job_submit['job_id']) raise upload.UploadNonFatalError(errormsg) elif len(os.listdir(dir)) == 1 and os.listdir(dir)[0] == 'zerodm' \ and not os.listdir(os.path.join(dir,os.listdir(dir)[0])): errormsg = 'ERROR: Results directory, %s, does not exist or is empty for job_id=%d' %\ (dir, job_submit['job_id']) raise upload.UploadNonFatalError(errormsg) fitsfiles = get_fitsfiles(job_submit) try: data = datafile.autogen_dataobj(fitsfiles) except ValueError: raise upload.UploadNonFatalError version_number = get_version_number(dir) if debug.UPLOAD: parsetime = time.time() # Upload results hdr = header.get_header(fitsfiles) print "\tHeader parsed." rat_inst_id_cache = ratings2.utils.RatingInstanceIDCache( dbname='common3') cands, tempdir = candidates.get_candidates(version_number, pdm_dir, \ timestamp_mjd=data.timestamp_mjd, \ inst_cache=rat_inst_id_cache) print "\tPeriodicity candidates parsed. (%d cands)" % len(cands) sp_cands, tempdir_sp = sp_candidates.get_spcandidates(version_number, sp_dir, \ timestamp_mjd=data.timestamp_mjd, \ inst_cache=rat_inst_id_cache) print "\tSingle pulse candidates parsed. (%d cands)" % len(sp_cands) diags = diagnostics.get_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ pdm_dir, sp_dir) print "\tDiagnostics parsed." for c in (cands + sp_cands): hdr.add_dependent(c) if debug.UPLOAD: upload.upload_timing_summary['Parsing'] = \ upload.upload_timing_summary.setdefault('Parsing', 0) + \ (time.time()-parsetime) # Perform the upload header_id = hdr.upload(db) print "Header ID: ", header_id for d in diags: d.upload(db) print "\tDB upload completed and checked successfully. header_id=%d" % \ header_id except (upload.UploadNonFatalError): # Parsing error caught. Job attempt has failed! 
exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while checking results!\n" errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \ (job_submit['job_id'], job_submit['id']) errormsg += "".join(exceptionmsgs) sys.stderr.write("Error while checking results!\n") sys.stderr.write("Database transaction will not be committed.\n") sys.stderr.write("\t%s" % exceptionmsgs[-1]) queries = [] arglists = [] queries.append("UPDATE job_submits " \ "SET status='upload_failed', " \ "details=?, " \ "updated_at=? " \ "WHERE id=?") arglists.append((errormsg, jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while uploading results', " \ "updated_at=? " \ "WHERE id=?") arglists.append((jobtracker.nowstr(), job_submit['job_id'])) jobtracker.execute(queries, arglists) # Rolling back changes. db.rollback() except (database.DatabaseConnectionError, ratings2.database.DatabaseConnectionError,\ CornellFTP.CornellFTPTimeout, upload.UploadDeadlockError,\ database.DatabaseDeadlockError), e: # Connection error while uploading. We will try again later. sys.stderr.write(str(e)) sys.stderr.write( "\tRolling back DB transaction and will re-try later.\n") # Rolling back changes. db.rollback()
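# --- Hedged sketch (not part of the pipeline) ---------------------------
# The Beluga upload_results() above untars the results tarball(s) and
# records the newly extracted names in 'to_del' / 'to_del_zerodm', but the
# clean-up of those files is not visible in this excerpt.  The helper below
# is one plausible way to remove them once the upload has finished; it is
# an assumption, not the pipeline's actual clean-up code.
import os
import shutil

def remove_extracted_sketch(outdir, extracted_names):
    """Delete files/directories created by extracting a results tarball."""
    for name in extracted_names:
        path = os.path.join(outdir, name)
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.remove(path)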
def submit(job_row): """ Submits a job to QueueManager, if successful will store returned queue id. Input: job_row: A row from the jobs table. The datafiles associated with this job will be submitted to be processed. Outputs: None """ fns = pipeline_utils.get_fns_for_jobid(job_row['id']) try: presubmission_check(fns) outdir = get_output_dir(fns) # Attempt to submit the job queue_id = config.jobpooler.queue_manager.submit\ (fns, outdir, job_row['id']) except (FailedPreCheckError): # Error caught during presubmission check. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Job ID: %d " % job_row['id'] errormsg += "failed presubmission check!\n\n" errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Job ID: %d failed presubmission check!\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) if config.email.send_on_terminal_failures: # Send error email msg = "Presubmission check failed!\n" msg += "Job ID: %d\n\n" % \ (job_row['id']) msg += errormsg msg += "\n*** Job has been terminally failed. ***\n" msg += "*** Job will NOT be re-submitted! ***\n" if config.basic.delete_rawdata: jobpool_cout.outs("Job #%d will NOT be retried. " \ "Data files will be deleted." % job_row['id']) msg += "*** Raw data files will be deleted. ***\n" else: jobpool_cout.outs("Job #%d will NOT be retried. " % job_row['id']) notification = mailer.ErrorMailer(msg, \ subject="Job failed presubmission check - Terminal") notification.send() if config.basic.delete_rawdata: pipeline_utils.clean_up(job_row['id']) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'precheck_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='terminal_failure', " \ "details='Failed presubmission check', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append((jobtracker.nowstr(), job_row['id'])) jobtracker.execute(queries, arglist) except (queue_managers.QueueManagerJobFatalError,\ datafile.DataFileError): # Error caught during job submission. exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while submitting job!\n" errormsg += "\tJob ID: %d\n\n" % job_row['id'] errormsg += "".join(exceptionmsgs) jobpool_cout.outs("Error while submitting job!\n" \ "\tJob ID: %d\n\t%s\n" % \ (job_row['id'], exceptionmsgs[-1])) queries = [] arglist = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (?, ?, ?, ?, ?)" ) arglist.append( ( job_row['id'], 'submission_failed', \ jobtracker.nowstr(), jobtracker.nowstr(), \ errormsg) ) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while submitting job', " \ "updated_at=? " \ "WHERE id=?" ) arglist.append((jobtracker.nowstr(), job_row['id'])) jobtracker.execute(queries, arglist) except queue_managers.QueueManagerNonFatalError: # Do nothing. Don't submit the job. Don't mark the job as 'submitted'. # Don't mark the job as 'failed'. The job submission will be retried. pass except queue_managers.QueueManagerFatalError: # A fatal error occurred. Re-raise! 
raise else: # No error occurred msg = "Submitted job to process:\n" msg += "\tJob ID: %d, Queue ID: %s\n" % (job_row['id'], queue_id) msg += "\tData file(s):\n" for fn in fns: msg += "\t%s\n" % fn jobpool_cout.outs(msg) queries = [] queries.append("INSERT INTO job_submits (" \ "job_id, " \ "queue_id, " \ "output_dir, " \ "status, " \ "created_at, " \ "updated_at, " \ "details) " \ "VALUES (%d,'%s','%s','%s','%s','%s','%s')" % \ (job_row['id'], queue_id, outdir, 'running', \ jobtracker.nowstr(), jobtracker.nowstr(), \ 'Job submitted to queue')) queries.append("UPDATE jobs " \ "SET status='submitted', " \ "details='Job submitted to queue', " \ "updated_at='%s' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job_row['id'])) jobtracker.query(queries)
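# --- Hedged sketch (not part of the pipeline) ---------------------------
# This submit() variant calls presubmission_check(fns) and terminally fails
# the job on FailedPreCheckError, but neither is defined in this section.
# The hypothetical sketch below shows the sort of sanity check such a gate
# could perform (files exist and are non-empty); the real check may be
# completely different.
import os

class FailedPreCheckErrorSketch(Exception):
    """Stand-in for the pipeline's FailedPreCheckError."""
    pass

def presubmission_check_sketch(fns):
    for fn in fns:
        if not os.path.exists(fn):
            raise FailedPreCheckErrorSketch("Missing data file: %s" % fn)
        if os.path.getsize(fn) == 0:
            raise FailedPreCheckErrorSketch("Empty data file: %s" % fn)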
def create_file_entries(request): """Given a row from the requests table in the job-tracker DB check the FTP server for its files and create entries in the files table. Input: request: A row from the requests table. Outputs: None """ cftp = CornellFTP.CornellFTP() try: files = cftp.get_files(request['guid']) except CornellFTP.M2Crypto.ftpslib.error_perm: exctype, excvalue, exctb = sys.exc_info() dlm_cout.outs("FTP error getting file information.\n" \ "\tGUID: %s\n\tError: %s" % \ (request['guid'], \ "".join(traceback.format_exception_only(exctype, excvalue)).strip())) files = [] total_size = 0 num_files = 0 queries = [] for fn, size in files: # Check if file is from the phantom beam (beam 7) datafile_type = datafile.get_datafile_type([fn]) parsedfn = datafile_type.fnmatch(fn) if parsedfn.groupdict().setdefault('beam', '-1') == '7': print "Ignoring beam 7 data: %s" % fn continue # Insert entry into DB's files table queries.append("INSERT INTO files ( " \ "request_id, " \ "remote_filename, " \ "filename, " \ "status, " \ "created_at, " \ "updated_at, " \ "size) " \ "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', %d)" % \ (request['id'], fn, os.path.join(config.download.temp, fn), \ 'new', jobtracker.nowstr(), jobtracker.nowstr(), size)) total_size += size num_files += 1 if num_files: dlm_cout.outs("Request (GUID: %s) has succeeded.\n" \ "\tNumber of files to be downloaded: %d" % \ (request['guid'], num_files)) queries.append("UPDATE requests " \ "SET size=%d, " \ "updated_at='%s', " \ "status='finished', " \ "details='Request has been filled' " \ "WHERE id=%d" % \ (total_size, jobtracker.nowstr(), request['id'])) else: dlm_cout.outs("Request (GUID: %s) has failed.\n" \ "\tThere are no files to be downloaded." % \ request['guid']) queries.append("UPDATE requests " \ "SET updated_at='%s', " \ "status='failed', " \ "details='No files to download' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), request['id'])) jobtracker.query(queries)
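# --- Hedged sketch (not part of the pipeline) ---------------------------
# The beam-7 filter above relies on datafile.get_datafile_type([fn])
# returning a class whose fnmatch(fn) match object exposes a named 'beam'
# group.  The toy example below shows the groupdict().setdefault('beam', '-1')
# idiom on an assumed filename layout; the real filename regexes are defined
# in the datafile module and may differ.
import re

FN_RE_SKETCH = re.compile(r'^p2030.*\.b(?P<beam>\d)s\d.*\.fits$')   # assumed pattern

def beam_of_sketch(fn):
    m = FN_RE_SKETCH.match(fn)
    if m is None:
        return '-1'
    return m.groupdict().setdefault('beam', '-1')

# e.g. beam_of_sketch('p2030_example.b7s0g0.00100.fits') returns '7'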
def recover_failed_jobs(): """Gather jobs with status 'failed' from the job-tracker DB. For each of these jobs see if it can be re-submitted. If it can, set the status to 'retrying'. If the job cannot be re-submitted, set the status to 'terminal_failure', and delete the raw data (if config is set for deletion). Depending on configurations emails may be sent. """ failed_jobs = jobtracker.query("SELECT * FROM jobs " \ "WHERE status='failed'") for job in failed_jobs: # Count the number of times this job has been submitted already submits = jobtracker.query("SELECT * FROM job_submits " \ "WHERE job_id=%d " \ "ORDER BY id DESC" % job['id']) if len(submits) < config.jobpooler.max_attempts: # We can re-submit this job. if config.email.send_on_failures: # Send error email msg = "Error! Job submit status: %s\n" % \ submits[0]['status'] msg += "Job ID: %d, Job submit ID: %d\n\n" % \ (job['id'], submits[0]['id']) msg += str(submits[0]['details']) msg += "\n*** Job will be re-submitted to the queue ***\n" mailer.ErrorMailer(msg).send() # Set status to 'retrying'. jobtracker.query("UPDATE jobs " \ "SET status='retrying', " \ "updated_at='%s', " \ "details='Job will be retried' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job['id'])) jobpool_cout.outs("Job #%d will be retried." % job['id']) else: # We've run out of attempts for this job if config.email.send_on_terminal_failures or \ config.email.send_on_failure: # Send error email msg = "Error! Job submit status: %s\n" % \ str(submits[0]['status']) msg += "Job ID: %d, Job submit ID: %d\n\n" % \ (job['id'], submits[0]['id']) msg += str(submits[0]['details']) msg += "\n*** No more attempts for this job. ***\n" msg += "*** Job will NOT be re-submitted! ***\n" if config.basic.delete_rawdata: jobpool_cout.outs("Job #%d will NOT be retried. " \ "Data files will be deleted." % job['id']) msg += "*** Raw data files will be deleted. ***\n" else: jobpool_cout.outs("Job #%d will NOT be retried. " % job['id']) mailer.ErrorMailer(msg).send() if config.basic.delete_rawdata: pipeline_utils.clean_up(job['id']) # Set status to 'terminal_failure'. jobtracker.query("UPDATE jobs " \ "SET status='terminal_failure', " \ "updated_at='%s', " \ "details='Job has failed permanently' " \ "WHERE id=%d" % \ (jobtracker.nowstr(), job['id']))
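# --- Hedged sketch (not part of the pipeline) ---------------------------
# recover_failed_jobs() fetches every prior row from job_submits just to
# compare the count against config.jobpooler.max_attempts.  The retry
# decision in miniature could also be asked with COUNT(*); this assumes the
# same job-tracker schema and that 'jobtracker' is importable here.
def should_retry_sketch(job_id, max_attempts):
    row = jobtracker.query("SELECT COUNT(*) AS n FROM job_submits " \
                           "WHERE job_id=%d" % job_id, \
                           fetchone=True)
    return row['n'] < max_attempts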
def upload_results(job_submit): """ Uploads Results for a given submit. Input: job_submit: A row from the job_submits table. Results from this job submission will be uploaded. Output: None """ print "Attempting to upload results" print "\tJob ID: %d, Job submission ID: %d" % \ (job_submit['job_id'], job_submit['id']) if debug.UPLOAD: upload.upload_timing_summary = {} starttime = time.time() try: # Connect to the DB db = database.Database('default', autocommit=False) # Prepare for upload dir = job_submit['output_dir'] if not os.path.exists(dir): errormsg = 'ERROR: Results directory, %s, does not exist for job_id=%d' %\ (dir, job_submit['job_id']) raise upload.UploadNonFatalError(errormsg) fitsfiles = get_fitsfiles(job_submit) data = datafile.autogen_dataobj(fitsfiles) version_number = get_version_number(dir) if debug.UPLOAD: parsetime = time.time() # Upload results hdr = header.get_header(fitsfiles) print "\tHeader parsed." cands = candidates.get_candidates(version_number, dir) print "\tPeriodicity candidates parsed." sp_cands = sp_candidates.get_spcandidates(version_number, dir) print "\tSingle pulse candidates parsed." for c in (cands + sp_cands): hdr.add_dependent(c) diags = diagnostics.get_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ dir) print "\tDiagnostics parsed." if debug.UPLOAD: upload.upload_timing_summary['Parsing'] = \ upload.upload_timing_summary.setdefault('Parsing', 0) + \ (time.time()-parsetime) # Perform the upload header_id = hdr.upload(db) for d in diags: d.upload(db) print "\tEverything uploaded and checked successfully. header_id=%d" % \ header_id except (upload.UploadNonFatalError): # Parsing error caught. Job attempt has failed! exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while checking results!\n" errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \ (job_submit['job_id'], job_submit['id']) errormsg += "".join(exceptionmsgs) sys.stderr.write("Error while checking results!\n") sys.stderr.write("Database transaction will not be committed.\n") sys.stderr.write("\t%s" % exceptionmsgs[-1]) queries = [] arglists = [] queries.append("UPDATE job_submits " \ "SET status='upload_failed', " \ "details=?, " \ "updated_at=? " \ "WHERE id=?") arglists.append((errormsg, jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while uploading results', " \ "updated_at=? " \ "WHERE id=?") arglists.append((jobtracker.nowstr(), job_submit['job_id'])) jobtracker.execute(queries, arglists) # Rolling back changes. db.rollback() except (database.DatabaseConnectionError, CornellFTP.CornellFTPTimeout,\ upload.UploadDeadlockError, database.DatabaseDeadlockError), e: # Connection error while uploading. We will try again later. sys.stderr.write(str(e)) sys.stderr.write("\tRolling back DB transaction and will re-try later.\n") # Rolling back changes. db.rollback()
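# --- Hedged sketch (not part of the pipeline) ---------------------------
# When debug.UPLOAD is set, upload_results() accumulates wall-clock time per
# stage in upload.upload_timing_summary via dict.setdefault().  The tiny
# self-contained example below shows that accumulation pattern on its own;
# 'timed_stage_sketch' is an illustration, not a pipeline function.
import time

timing_summary_sketch = {}

def timed_stage_sketch(name, func, *args, **kwargs):
    """Run func(*args, **kwargs) and add its wall-clock time to the summary."""
    start = time.time()
    result = func(*args, **kwargs)
    timing_summary_sketch[name] = \
        timing_summary_sketch.setdefault(name, 0) + (time.time() - start)
    return result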
def upload_results(job_submit): """ Uploads Results for a given submit. Input: job_submit: A row from the job_submits table. Results from this job submission will be uploaded. Output: None """ print "Attempting to upload results" print "\tJob ID: %d, Job submission ID: %d" % \ (job_submit['job_id'], job_submit['id']) if debug.UPLOAD: upload.upload_timing_summary = {} starttime = time.time() try: # Connect to the DB db = database.Database('SPAN512', autocommit=False) # Prepare for upload dir = job_submit['output_dir'] if not os.path.exists(dir) or not os.listdir(dir): errormsg = 'ERROR: Results directory, %s, does not exist or is empty for job_id=%d' %\ (dir, job_submit['job_id']) raise upload.UploadNonFatalError(errormsg) fitsfiles = get_fitsfiles(job_submit) data = datafile.autogen_dataobj(fitsfiles) version_number = get_version_number(dir) if debug.UPLOAD: parsetime = time.time() # Upload results hdr = header.get_header(fitsfiles) print "\tHeader parsed." rat_inst_id_cache = ratings2.utils.RatingInstanceIDCache(dbname='nancay') cands, tempdir = candidates.get_candidates(version_number, dir, \ timestamp_mjd=data.timestamp_mjd, \ inst_cache=rat_inst_id_cache) print "\tPeriodicity candidates parsed." sp_cands = sp_candidates.get_spcandidates(version_number, dir, \ timestamp_mjd=data.timestamp_mjd) print "\tSingle pulse candidates parsed." for c in (cands + sp_cands): hdr.add_dependent(c) diags = diagnostics.get_diagnostics(data.obs_name, data.beam_id, \ data.obstype, \ version_number, \ dir) print "\tDiagnostics parsed." if debug.UPLOAD: upload.upload_timing_summary['Parsing'] = \ upload.upload_timing_summary.setdefault('Parsing', 0) + \ (time.time()-parsetime) # Perform the upload header_id = hdr.upload(db) for d in diags: d.upload(db) print "\tDB upload completed and checked successfully. header_id=%d" % \ header_id except (upload.UploadNonFatalError): # Parsing error caught. Job attempt has failed! exceptionmsgs = traceback.format_exception(*sys.exc_info()) errormsg = "Error while checking results!\n" errormsg += "\tJob ID: %d, Job submit ID: %d\n\n" % \ (job_submit['job_id'], job_submit['id']) errormsg += "".join(exceptionmsgs) sys.stderr.write("Error while checking results!\n") sys.stderr.write("Database transaction will not be committed.\n") sys.stderr.write("\t%s" % exceptionmsgs[-1]) queries = [] arglists = [] queries.append("UPDATE job_submits " \ "SET status='upload_failed', " \ "details=\"%s\", " \ "updated_at='%s' " \ "WHERE id=%d"%(errormsg.replace("\"","\'"), jobtracker.nowstr(), job_submit['id'])) queries.append("UPDATE jobs " \ "SET status='failed', " \ "details='Error while uploading results', " \ "updated_at='%s' " \ "WHERE id=%d"%(jobtracker.nowstr(), job_submit['job_id'])) jobtracker.query(queries) # Rolling back changes. db.rollback() except (database.DatabaseConnectionError, \ upload.UploadDeadlockError, database.DatabaseDeadlockError), e: # Connection error while uploading. We will try again later. sys.stderr.write(str(e)) sys.stderr.write("\tRolling back DB transaction and will re-try later.\n") # Rolling back changes. db.rollback()
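# --- Hedged sketch (not part of the pipeline) ---------------------------
# This SPAN512 variant splices 'errormsg' straight into the SQL text and
# patches double quotes by hand.  The other upload_results() variants in
# this file use '?' placeholders with jobtracker.execute(), which avoids
# the quoting problem entirely.  The helper below shows the same two status
# updates in that parameterized style (assuming jobtracker.execute() accepts
# paired query/argument lists, as it does elsewhere in this module).
def upload_failed_updates_sketch(job_submit, errormsg):
    """Parameterized version of the two 'upload_failed' status updates."""
    queries = []
    arglists = []
    queries.append("UPDATE job_submits " \
                   "SET status='upload_failed', " \
                   "details=?, " \
                   "updated_at=? " \
                   "WHERE id=?")
    arglists.append((errormsg, jobtracker.nowstr(), job_submit['id']))
    queries.append("UPDATE jobs " \
                   "SET status='failed', " \
                   "details='Error while uploading results', " \
                   "updated_at=? " \
                   "WHERE id=?")
    arglists.append((jobtracker.nowstr(), job_submit['job_id']))
    jobtracker.execute(queries, arglists)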
def update_jobs_status_from_queue():
    """
    Updates Database entries for job processing according to
    the Jobs' Queue Status.
    """
    # Collect all non-processed jobs from the DB linking to downloaded files
    submits = jobtracker.query("SELECT * FROM job_submits " \
                               "WHERE status='running'")
    for submit in submits:
        # Check if job is still running (according to queue manager)
        is_running = config.jobpooler.queue_manager.is_running(submit['queue_id'])
        if is_running:
            # Do nothing.
            pass
        else:
            # Check if processing had errors
            fatal_error, details = eval_errorlog(submit['queue_id'])
            if fatal_error:
                # Errors during processing...
                errormsg = config.jobpooler.queue_manager.get_errors(submit['queue_id'])
                # If the error log is very long, keep only the first 50 lines.
                if errormsg.count("\n") > 100:
                    errormsg = "\n".join(errormsg.split("\n")[:50])

                jobpool_cout.outs("Processing of Job #%d (Submit ID: %d; Queue ID: %s) " \
                                  "had errors." % \
                                  (submit['job_id'], submit['id'], submit['queue_id']))

                # Mark job entry with status 'failed'
                # Mark job_submit entry with status 'processing_failed'
                queries = []
                arglists = []
                queries.append("UPDATE jobs " \
                               "SET status='failed', " \
                               "updated_at=?, " \
                               "details='Errors during processing' " \
                               "WHERE id=?")
                arglists.append((jobtracker.nowstr(), submit['job_id']))
                queries.append("UPDATE job_submits " \
                               "SET status='processing_failed', " \
                               "details=?, " \
                               "updated_at=? " \
                               "WHERE id=?")
                arglists.append((errormsg, jobtracker.nowstr(), submit['id']))
                jobtracker.execute(queries, arglists)
            else:
                # No errors. Woohoo!
                # Mark job and job_submit entries with status 'processed'
                queries = []
                queries.append("UPDATE jobs " \
                               "SET status='processed', " \
                               "updated_at='%s', " \
                               "details='%s' " \
                               "WHERE id=%d" % \
                               (jobtracker.nowstr(), details, submit['job_id']))
                queries.append("UPDATE job_submits " \
                               "SET status='processed', " \
                               "updated_at='%s', " \
                               "details='%s' " \
                               "WHERE id=%d" % \
                               (jobtracker.nowstr(), details, submit['id']))
                jobtracker.query(queries)
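# --- Hedged sketch (not part of the pipeline) ---------------------------
# update_jobs_status_from_queue() calls eval_errorlog(queue_id) to obtain a
# (fatal_error, details) pair, but that helper is not shown in this section.
# The sketch below is one hypothetical way such a check could work, scanning
# the queue manager's error output for anything that is not a known-benign
# warning; the marker list and the details strings are assumptions.
def eval_errorlog_sketch(queue_id):
    """Return (fatal_error, details) for a finished queue job."""
    errors = config.jobpooler.queue_manager.get_errors(queue_id)
    benign_prefixes = ("WARNING",)   # assumed list of ignorable markers
    fatal_lines = [ln for ln in errors.splitlines() \
                   if ln.strip() and not ln.startswith(benign_prefixes)]
    if fatal_lines:
        return True, "Errors during processing"
    return False, "Processed without errors"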