def run_iter(json_file, init_model):
    """Drive the iterative workflow, resuming from ``record.rit`` if present.

    Every iteration runs eight tasks in a fixed order: make_temp, run_temp,
    post_temp, make_res, run_res, post_res, make_train and run_train.  After
    each task the ``(iteration, task)`` pair is handed to ``record_iter`` so
    that a later call can skip everything up to and including the last
    recorded task.

    Args:
        json_file: Path to the JSON settings file; it must contain the key
            ``"numb_iter"``.  The path is also forwarded to every task.
        init_model: Model passed to ``make_temp`` in iteration 0.  From
            iteration 1 on, the ``*pb`` files globbed from the previous
            iteration's training directory are used instead.

    Returns:
        None.  The function returns early when ``post_temp`` yields a falsy
        value.

    Raises:
        RuntimeError: If a task index outside 0-7 is encountered.
    """
    # Close the settings file deterministically instead of leaking the handle.
    with open(json_file, "r") as fp:
        jdata = json.load(fp)
    numb_iter = jdata["numb_iter"]
    numb_task = 8
    record = "record.rit"

    # [iteration, task] of the last completed task; -1 means "nothing done".
    iter_rec = [0, -1]
    if os.path.isfile(record):
        with open(record) as frec:
            for line in frec:
                # Ignore blank lines so a trailing newline cannot wipe the
                # record and crash the log call below.
                if line.strip():
                    iter_rec = [int(x) for x in line.split()]
        logging.info("continue from iter %03d task %02d",
                     iter_rec[0], iter_rec[1])

    prev_model = init_model
    for ii in range(numb_iter):
        if ii > 0:
            prev_model = glob.glob(
                make_iter_name(ii - 1) + "/" + train_name + "/*pb")
        for jj in range(numb_task):
            # Skip every task at or before the recorded resume point.
            if ii * max_tasks + jj <= iter_rec[0] * max_tasks + iter_rec[1]:
                continue
            if jj == 0:
                log_iter("make_temp", ii, jj)
                make_temp(ii, json_file, prev_model)
            elif jj == 1:
                log_iter("run_temp", ii, jj)
                run_temp(ii, json_file)
            elif jj == 2:
                log_iter("post_temp", ii, jj)
                if not post_temp(ii, json_file):
                    log_iter("no more conf needed", ii, jj)
                    return
            elif jj == 3:
                log_iter("make_res", ii, jj)
                make_res(ii, json_file)
            elif jj == 4:
                log_iter("run_res", ii, jj)
                run_res(ii, json_file, exec_machine)
            elif jj == 5:
                log_iter("post_res", ii, jj)
                post_res(ii, json_file)
            elif jj == 6:
                log_iter("make_train", ii, jj)
                make_train(ii, json_file)
            elif jj == 7:
                log_iter("run_train", ii, jj)
                run_train(ii, json_file, exec_machine)
            else:
                raise RuntimeError("unknow task %d, something wrong" % jj)
            record_iter(record, ii, jj)
def run_iter(json_file, init_model):
    """Drive the enhanced-sampling workflow with periodic SITS stages.

    For every iteration ``ii`` from the recorded resume iteration up to
    ``numb_iter`` the function

    1. derives the SITS iteration ``kk = int(ii / niter_per_sits)`` and the
       data set name ``"data%03d" % (kk + 1)``;
    2. when ``ii`` is a multiple of ``niter_per_sits``, runs the six SITS
       tasks for ``kk`` (see ``_run_sits_stage``);
    3. runs the eight per-iteration tasks: make_enhc, run_enhc, post_enhc,
       make_res, run_res, post_res, make_train and run_train.

    Progress is handed to ``record_iter`` after each task (``record.rid`` for
    the per-iteration tasks, ``record.sits`` for the SITS tasks) and tasks at
    or before the recorded position are skipped.

    Args:
        json_file: Path to the JSON settings file.  ``"numb_iter"`` and
            ``"cleanup"`` are required; ``"sits_settings"`` is optional and
            may carry ``"niter_per_sits"`` (default 100000000).
        init_model: Model used until an earlier iteration's ``*pb`` files
            can be globbed from its training directory.

    Returns:
        None.  The function returns early when ``make_res`` yields a falsy
        value.

    Raises:
        RuntimeError: If a per-iteration task index outside 0-7 is
            encountered.
    """
    base_dir = os.getcwd()
    # Close the settings file deterministically instead of leaking the handle.
    with open(json_file, "r") as fp:
        jdata = json.load(fp)
    # "sits_settings" may be missing (or null); fall back to an empty mapping
    # so the default for "niter_per_sits" is actually reachable.
    sits_param = jdata.get("sits_settings") or {}
    numb_iter = jdata["numb_iter"]
    niter_per_sits = sits_param.get("niter_per_sits", 100000000)
    numb_task = 8
    record = "record.rid"
    record_sits = "record.sits"
    cleanup = jdata["cleanup"]

    iter_rec = _read_last_record(record, [0, -1])
    sits_iter_rec = _read_last_record(record_sits, [0, -1])

    # No SITS progress recorded yet: set up the top-level SITS directory.
    if sits_iter_rec == [0, -1]:
        create_path("sits")

    prev_model = init_model
    for ii in range(iter_rec[0], numb_iter):
        kk = int(ii / niter_per_sits)
        data_name = "data%03d" % (kk + 1)
        if ii > 0:
            prev_model = glob.glob(
                make_iter_name(ii - 1) + "/" + train_name + "/*pb")
        if ii % niter_per_sits == 0:
            _run_sits_stage(kk, ii, json_file, prev_model,
                            sits_iter_rec, record_sits, base_dir)

        for jj in range(numb_task):
            # Skip every task at or before the recorded resume point.
            if ii * max_tasks + jj <= iter_rec[0] * max_tasks + iter_rec[1]:
                continue
            # Each task starts from the directory run_iter was called in.
            os.chdir(base_dir)
            if jj == 0:
                log_iter("make_enhc", ii, jj)
                make_enhc(ii, json_file, prev_model)
            elif jj == 1:
                log_iter("run_enhc", ii, jj)
                run_enhc(ii, json_file)
            elif jj == 2:
                log_iter("post_enhc", ii, jj)
                post_enhc(ii, json_file)
            elif jj == 3:
                log_iter("make_res", ii, jj)
                if not make_res(ii, json_file):
                    log_iter("no more conf needed", ii, jj)
                    return
            elif jj == 4:
                log_iter("run_res", ii, jj)
                run_res(ii, json_file, exec_machine)
            elif jj == 5:
                log_iter("post_res", ii, jj)
                post_res(ii, json_file, data_name=data_name)
            elif jj == 6:
                log_iter("make_train", ii, jj)
                make_train(ii, json_file, data_name=data_name)
            elif jj == 7:
                log_iter("run_train", ii, jj)
                run_train(ii, json_file, exec_machine, data_name=data_name)
                if cleanup:
                    clean_train(ii)
                    clean_enhc(ii)
                    clean_enhc_confs(ii)
                    clean_res(ii)
            else:
                raise RuntimeError("unknow task %d, something wrong" % jj)
            record_iter(record, ii, jj)


def _read_last_record(record_path, default):
    # Return the last non-blank "<iter> <task>" entry of record_path as a list
    # of ints, or `default` when the file is missing or holds no entries.
    if not os.path.isfile(record_path):
        return default
    last = default
    with open(record_path) as frec:
        for line in frec:
            # Blank lines would otherwise replace the record with [].
            if line.strip():
                last = [int(x) for x in line.split()]
    logging.info("continue from iter %03d task %02d", last[0], last[1])
    return last


def _run_sits_stage(kk, ii, json_file, prev_model, sits_iter_rec,
                    record_sits, base_dir):
    # Run the not-yet-recorded SITS tasks of SITS iteration kk, which begins
    # at workflow iteration ii.
    log_iter("run_sits_iter", kk, 0)
    sits_dir = os.path.join("sits", make_iter_name(kk))
    if not os.path.exists(sits_dir):
        create_path(sits_dir)
    # Mark where the previous SITS iteration ended and this one begins.
    if kk > 0:
        end_file = os.path.join(
            "sits", make_iter_name(kk - 1), "rid_iter_end.dat")
        with open(end_file, "w") as fend:
            fend.write("%d" % ii)
    with open(os.path.join(sits_dir, "rid_iter_begin.dat"), "w") as fbegin:
        fbegin.write("%d" % ii)

    # Always walk all six tasks: the index guard below already skips finished
    # ones.  Starting the range at the recorded task offset would wrongly drop
    # the first tasks of every later SITS iteration after a mid-stage resume.
    for jj in range(6):
        if kk * max_tasks + jj <= (
                sits_iter_rec[0] * max_tasks + sits_iter_rec[1]):
            continue
        os.chdir(base_dir)
        if jj == 0:
            make_sits_iter(kk, json_file, prev_model)
        elif jj == 1:
            run_sits_iter(kk, json_file)
        elif jj == 2:
            post_sits_iter(kk, json_file)
        elif jj == 3:
            # The *_train_eff tasks are only run from the second SITS
            # iteration onwards.
            if kk > 0:
                make_train_eff(kk, json_file)
        elif jj == 4:
            if kk > 0:
                run_train_eff(kk, json_file, exec_machine)
        elif jj == 5:
            if kk > 0:
                post_train_eff(kk, json_file)
        record_iter(record_sits, kk, jj)
def run_train_eff(sits_iter_index, json_file, exec_machine=MachineLocal):
    """Invoke ``run_train`` for one SITS iteration.

    The call is forwarded with ``sits_iter=True`` and with the data set name
    ``"data%03d" % (sits_iter_index + 1)``.

    Args:
        sits_iter_index: Index of the SITS iteration; passed to ``run_train``
            as its first argument and used to build the data set name.
        json_file: Settings file path, forwarded unchanged.
        exec_machine: Execution backend, forwarded unchanged.
    """
    sits_data_name = "data%03d" % (sits_iter_index + 1)
    run_train(
        sits_iter_index,
        json_file,
        exec_machine=exec_machine,
        data_name=sits_data_name,
        sits_iter=True,
    )