import os  # used for the path checks below; may already be imported at module top


def fix_interrupts(name):
    """Move jobs that actually produced output from the 'fail' queue back
    into the matching 'success' queue.

    `name` must be an existing directory whose path ends with 'fail'; the
    success queue is the sibling directory with 'fail' replaced by 'success'.
    QueueDir and _has_output are assumed to be defined elsewhere in this module.
    """
    assert os.path.exists(name) and os.path.isdir(name)
    assert name.endswith('fail')
    queue_fail = QueueDir(name)
    queue_success = QueueDir(name.replace('fail', 'success'))
    restore_count = 0
    queue_fail_size = queue_fail.qsize()
    fail_files = queue_fail.list_files()

    # Index the success queue by job_id so duplicates can be detected.
    success_cache = {}
    for i in range(queue_success.qsize()):
        jd = queue_success.peek(i)
        success_cache[jd['job_id']] = {'jd': jd, 'id': i}

    # Walk the fail queue backwards so removals do not shift the indices
    # of entries we have not inspected yet.
    for i in range(queue_fail.qsize() - 1, -1, -1):
        jd = queue_fail.peek(i)
        if not _has_output(name, jd):
            continue
        if jd['job_id'] in success_cache:
            print "WARN: already in success (%s)" % fail_files[i]
            continue
        print "seemsOK: %d" % jd['job_id']
        restore_count += 1
        queue_fail.remove(i)
        jd['ex_status'] = jd['status']  # preserve the original status
        jd['status'] = 'SUCCESS'
        queue_success.put(jd)
    print "restored %d JDs of %d" % (restore_count, queue_fail_size)
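# Usage sketch (the path below is hypothetical; any existing '<base>fail'
# QueueDir directory with a '<base>success' sibling should work):
#
#     fix_interrupts('queues/render.fail')
#     # -> jobs that left output move to 'queues/render.success' with
#     #    status='SUCCESS' and the old status kept in 'ex_status'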
def _queue_jds(name):
    """Return {index: job descriptor} for every entry in the queue at `name`."""
    assert os.path.exists(name) and os.path.isdir(name)
    queue = QueueDir(name)
    jds = {}
    for i in range(queue.qsize()):
        jds[i] = queue.peek(i)
    return jds
def _queue_ids(name):
    """Return the list of job_ids for every entry in the queue at `name`."""
    assert os.path.exists(name) and os.path.isdir(name)
    queue = QueueDir(name)
    ids = []
    for i in range(queue.qsize()):
        ids.append(queue.peek(i)['job_id'])
    return ids
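# A minimal sketch of how the two helpers above compose; the function name
# and the fail/success naming convention are assumptions, not part of the
# original module: report ids stuck in a fail queue that never reached the
# matching success queue.
def _ids_missing_from_success(fail_name):
    """Hypothetical helper: job_ids in '<base>fail' absent from '<base>success'."""
    success_ids = set(_queue_ids(fail_name.replace('fail', 'success')))
    return [job_id for job_id in _queue_ids(fail_name)
            if job_id not in success_ids]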
def check_dupes(name, do_remove=False):
    """Report queue entries that share a job_id in the queue at `name`;
    with do_remove=True, delete all but the last copy of each duplicate."""
    assert os.path.exists(name) and os.path.isdir(name)
    queue = QueueDir(name)
    queue_files = queue.list_files()

    # Group queue entries by job_id.
    jds = {}
    for i in range(queue.qsize()):
        jd = queue.peek(i)
        key = jd['job_id']
        jd_rec = {'file': queue_files[i], 'jd': jd, 'id': i}
        if key in jds:
            jds[key].append(jd_rec)
        else:
            jds[key] = [jd_rec]

    for key, dupes in jds.iteritems():
        if len(dupes) > 1:
            print "Dupes: %s" % dupes
            if do_remove:
                # Keep the last record, drop the rest.
                for jd_rec in dupes[0:-1]:
                    print "remove: %s" % jd_rec['file']
                    os.remove(jd_rec['file'])  # hack: bypasses QueueDir's own API
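# Minimal command-line driver, a sketch rather than part of the original
# module: the subcommand names and argparse wiring are assumptions, but
# they show how the maintenance helpers above might be invoked.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='queue maintenance helpers')
    parser.add_argument('command', choices=['fix-interrupts', 'check-dupes'])
    parser.add_argument('queue_dir', help='path to a QueueDir directory')
    parser.add_argument('--remove', action='store_true',
                        help='with check-dupes, actually delete duplicates')
    args = parser.parse_args()

    if args.command == 'fix-interrupts':
        fix_interrupts(args.queue_dir)
    else:
        check_dupes(args.queue_dir, do_remove=args.remove)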