Example #1
import os

# QueueDir and _has_output are helpers defined elsewhere in this module;
# a hypothetical sketch of QueueDir follows this example.

def fix_interrupts(name):
    """Move failed jobs that actually produced output into the success queue."""
    assert os.path.exists(name) and os.path.isdir(name)
    assert name.endswith('fail')
    queue_fail = QueueDir(name)
    queue_success = QueueDir(name.replace('fail', 'success'))
    restore_count = 0
    queue_fail_size = queue_fail.qsize()
    fail_files = queue_fail.list_files()

    # Index the success queue by job_id so duplicates can be skipped.
    success_cache = {}
    for i in range(queue_success.qsize()):
        jd = queue_success.peek(i)
        success_cache[jd['job_id']] = {'jd': jd, 'id': i}

    # Walk the fail queue backwards so queue_fail.remove(i) cannot shift
    # the indices of entries that have not been visited yet.
    for i in range(queue_fail.qsize() - 1, -1, -1):
        jd = queue_fail.peek(i)
        if _has_output(name, jd):
            if jd['job_id'] in success_cache:
                print "WARN: already in success (%s)" % fail_files[i]
                continue
            print "seemsOK: %d" % jd['job_id']
            restore_count += 1
            queue_fail.remove(i)
            jd['ex_status'] = jd['status']
            jd['status'] = 'SUCCESS'
            queue_success.put(jd)
    print "restored %d JDs of %d" % (restore_count, queue_fail_size)
Example #2
import json
from copy import copy

def fill(dst, template, count):
    """Populate queue 'dst' with 'count' copies of a JSON job-description template."""
    assert count > 0
    queue = QueueDir(dst)
    with open(template) as fh:
        jd_template = json.load(fh)
    # If the template carries a job_id, treat it as the first id in the range.
    jd_min = jd_template.get('job_id')
    for i in range(count):
        jd = copy(jd_template)  # shallow copy; only the top-level job_id is rewritten
        if jd_min is not None:
            jd['job_id'] = jd_min + i
        queue.put(jd)
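A hypothetical invocation (queue and file names are made up for illustration): given a template of {"job_id": 100, "status": "NEW"}, the call enqueues three copies carrying job_id 100, 101 and 102.

# Illustrative only; 'mc01' and 'jd_template.json' are assumed names.
fill('mc01', 'jd_template.json', 3)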
Example #3
import os
import shutil

def reset_fail(name):
    """Requeue every failed job and delete any partial output it left behind."""
    assert os.path.exists(name) and os.path.isdir(name)
    name = name.rstrip('/')
    assert name.endswith('.fail')
    # Strip the suffix by slicing; str.replace would also mangle a '.fail'
    # appearing earlier in the path.
    origname = name[:-len('.fail')]
    groupname = os.path.basename(origname)

    qfail = QueueDir(name)
    qorig = QueueDir(origname)
    for jd in qfail:
        # Drop partial output so the rerun starts from a clean slate.
        outdir = 'output-%s/%d' % (groupname, jd['job_id'])
        if os.path.exists(outdir):
            shutil.rmtree(outdir)
        qorig.put(jd)
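The _has_output helper used in Examples #1 and #4 is also not shown. The sketch below is an assumption built on the 'output-<group>/<job_id>' layout visible in reset_fail above; the real check may be stricter.

import os

def _has_output(name, jd):
    # Hypothetical reconstruction: derive the group from the queue name and
    # treat a non-empty output directory as evidence the job finished.
    groupname = os.path.basename(name.rstrip('/'))
    if groupname.endswith('.fail'):
        groupname = groupname[:-len('.fail')]
    outdir = 'output-%s/%d' % (groupname, jd['job_id'])
    return os.path.isdir(outdir) and len(os.listdir(outdir)) > 0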
Example #4
import logging
import os

logger = logging.getLogger(__name__)

def unlock(name):
    """Release jobs held in a queue's .locker file, routing each by its outcome."""
    lockfile = "%s.locker" % name
    assert os.path.exists(name) and os.path.isdir(name)
    jds = _lock_ids(lockfile)  # {job_id: jd}; a sketch follows this example
    assert len(jds) > 0
    queue_orig = QueueDir(name)
    queue_succ = QueueDir(name + ".success")
    for job_id, jd in jds.iteritems():
        if _has_output(name, jd):
            # Job finished despite the interruption: count it as a success.
            queue_succ.put(jd)
            logger.info("%d -> success" % job_id)
        else:
            # No output: return the job to the original queue for a rerun.
            queue_orig.put(jd)
            logger.info("%d -> orig" % job_id)
    os.remove(lockfile)
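The .locker file format behind _lock_ids is never shown. A minimal sketch, under the assumption of one JSON-encoded job dict per line, keyed by its job_id:

import json

def _lock_ids(lockfile):
    # Hypothetical parser; the real locker format may differ.
    jds = {}
    with open(lockfile) as fh:
        for line in fh:
            line = line.strip()
            if line:
                jd = json.loads(line)
                jds[jd['job_id']] = jd
    return jds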
Example #5
import multiprocessing
import cPickle

# POOL_SIZE, QueueDir and _find_no_output are defined elsewhere in this
# module; a hypothetical sketch of _find_no_output follows this example.

def check_success(start_id, stop_id):
    """Demote success-queue entries that have no output into the fail queue."""
    # NOTE: start_id and stop_id are accepted but never used.
    pool = multiprocessing.Pool(POOL_SIZE)
    group_names = ["mc%02d" % i for i in range(1, 21)]
    print group_names
    # Each worker returns a dict of '<success-queue-name>:<index>' -> jd
    # for success entries whose output directory is missing.
    unsuccessful = pool.map(_find_no_output, group_names)
    unsuccessful = [x for x in unsuccessful if len(x) > 0]
    print unsuccessful
    with open("no_output.dump", 'w') as fh:
        cPickle.dump(unsuccessful, fh)
    for unx in unsuccessful:
        queue_succ_name = unx.keys()[0].split(':')[0]
        queue_succ = QueueDir(queue_succ_name)
        queue_fail = QueueDir(queue_succ_name.replace('success', 'fail'))
        # Remove in descending index order so earlier removals do not shift
        # the indices of entries still waiting to be removed.
        for key in sorted(unx.keys(), key=lambda x: int(x.split(':')[1]), reverse=True):
            idx = int(key.split(':')[1])
            jd = unx[key]
            print "%s -> fail (%d)" % (key, jd['job_id'])
            queue_fail.put(jd)
            queue_succ.remove(idx)
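The last unshown helper is _find_no_output. Its return shape can be inferred from how check_success consumes it: a dict mapping '<success-queue-name>:<index>' to the job dict for each success entry lacking output. A sketch under those assumptions (the '<group>.success' naming is a guess, consistent with Example #4):

def _find_no_output(groupname):
    # Hypothetical reconstruction based only on how check_success uses the result.
    queue_succ_name = '%s.success' % groupname
    queue_succ = QueueDir(queue_succ_name)
    missing = {}
    for i in range(queue_succ.qsize()):
        jd = queue_succ.peek(i)
        if not _has_output(groupname, jd):
            missing['%s:%d' % (queue_succ_name, i)] = jd
    return missing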