# Standard-library imports used by the maintenance commands below. QueueDir,
# logger, POOL_SIZE and the _has_output/_lock_ids/_find_no_output helpers are
# assumed to be defined or imported elsewhere in this module.
import os
import shutil
import json
import multiprocessing
import cPickle
from copy import copy


def fix_interrupts(name):
    """Scan a *fail queue and move JDs that actually produced output back to the matching *success queue."""
    assert os.path.exists(name) and os.path.isdir(name)
    assert name.endswith('fail')
    queue_fail = QueueDir(name)
    queue_success = QueueDir(name.replace('fail', 'success'))
    restore_count = 0
    queue_fail_size = queue_fail.qsize()
    fail_files = queue_fail.list_files()
    # Index the success queue by job_id so duplicates can be detected quickly.
    success_cache = {}
    for i in range(queue_success.qsize()):
        jd = queue_success.peek(i)
        key = jd['job_id']
        jd_rec = {'jd': jd, 'id': i}
        success_cache[key] = jd_rec
    # Walk the fail queue from the tail so removals do not shift the indices
    # of entries that have not been inspected yet.
    for i in range(queue_fail.qsize() - 1, -1, -1):
        jd = queue_fail.peek(i)
        if _has_output(name, jd):
            if jd['job_id'] in success_cache:
                print "WARN: already in success (%s)" % fail_files[i]
                continue
            print "seemsOK: %d" % jd['job_id']
            restore_count += 1
            queue_fail.remove(i)
            jd['ex_status'] = jd['status']
            jd['status'] = 'SUCCESS'
            queue_success.put(jd)
    print "restored %d JDs of %d" % (restore_count, queue_fail_size)

def fill(dst, template, count):
    """Populate queue `dst` with `count` copies of a JSON JD template, assigning sequential job_ids when the template defines one."""
    queue = QueueDir(dst)
    with open(template) as fh:
        jd_template = json.load(fh)
    jd_min = None
    if 'job_id' in jd_template:
        jd_min = jd_template['job_id']
    assert count > 0
    for i in range(count):
        jd = copy(jd_template)
        if jd_min is not None:
            jd['job_id'] = jd_min + i
        queue.put(jd)

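# Usage sketch for fill(): the queue name, template path and count below are
# illustrative assumptions, not values taken from this module. It clones 100
# jobs from a template whose "job_id" field seeds the sequential ids.
#
#   fill('mc01', 'jd_template.json', 100)
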
def reset_fail(name):
    """Requeue every JD from a .fail queue into its original queue, deleting any partial output directory first."""
    assert os.path.exists(name) and os.path.isdir(name)
    name = name.rstrip('/')
    assert name.endswith('.fail')
    origname = name.replace('.fail', '')
    groupname = os.path.basename(origname)
    qfail = QueueDir(name)
    qorig = QueueDir(origname)
    for jd in qfail:
        # Drop whatever partial output the failed run left behind.
        outdir = 'output-%s/%d' % (groupname, jd['job_id'])
        if os.path.exists(outdir):
            shutil.rmtree(outdir)
        qorig.put(jd)

def unlock(name):
    """Release a stale lock file: JDs with output go to the .success queue, the rest go back to the original queue."""
    lockfile = "%s.locker" % name
    assert os.path.exists(name) and os.path.isdir(name)
    jds = _lock_ids(lockfile)
    assert len(jds) > 0
    queue_orig = QueueDir(name)
    queue_succ = QueueDir(name + ".success")
    for job_id, jd in jds.iteritems():
        if _has_output(name, jd):
            queue_succ.put(jd)
            logger.info("%d -> success" % job_id)
        else:
            queue_orig.put(jd)
            logger.info("%d -> orig" % job_id)
    os.remove(lockfile)

def check_success(start_id, stop_id):
    """Scan the mc01..mc20 success queues in parallel and demote JDs without output to the matching fail queues."""
    # NOTE: start_id and stop_id are not used in this implementation.
    pool = multiprocessing.Pool(POOL_SIZE)
    group_names = ["mc%02d" % i for i in range(1, 21)]
    print group_names
    unsuccessful = pool.map(_find_no_output, group_names)
    unsuccessful = [x for x in unsuccessful if len(x) > 0]
    print unsuccessful
    # Keep a pickle of the scan result for later inspection.
    with open("no_output.dump", 'w') as fh:
        cPickle.dump(unsuccessful, fh)
    for unx in unsuccessful:
        # Keys look like "<success queue name>:<index>".
        queue_succ_name = unx.keys()[0].split(':')[0]
        queue_succ = QueueDir(queue_succ_name)
        queue_fail = QueueDir(queue_succ_name.replace('success', 'fail'))
        # Remove from the tail first so lower indices remain valid.
        for key in sorted(unx.keys(), key=lambda x: int(x.split(':')[1]), reverse=True):
            idx = int(key.split(':')[1])
            jd = unx[key]
            print "%s -> fail (%d)" % (key, jd['job_id'])
            queue_fail.put(jd)
            queue_succ.remove(idx)

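# Example maintenance session. The queue/group names are illustrative
# assumptions, not taken from this module; substitute the real ones.
#
#   fix_interrupts('mc07.fail')   # promote interrupted-but-finished jobs
#   reset_fail('mc07.fail')       # requeue the remaining hard failures
#   unlock('mc07')                # release a stale lock file
#   check_success(0, 0)           # demote success entries that lack output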