def submit_hillshade_newest_headwall_line_grid_job(ids_list, idx, grid_base_name, max_job_count):
    wait_if_reach_max_jobs(max_job_count, 'dLi')

    # draw lines on hillshade
    job_name = 'dLi%d' % idx
    check_length_jobname(job_name)
    work_dir = working_dir_string(idx, 'hillshade_newest_headwall_line_', root=root_dir)
    if os.path.isdir(work_dir) is False:
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        ids_list = [str(item) for item in ids_list]
        io_function.save_list_to_txt(grid_base_name + '.txt', ids_list)

        # prepare the job
        sh_list = ['hillshade_headwall_line_grid.sh', 'job_hillshade_headwall_line_grid.sh']
        copy_curc_job_files(jobsh_dir, work_dir, sh_list)
        slurm_utility.modify_slurm_job_sh('job_hillshade_headwall_line_grid.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)
        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print('The folder: %s already exists and the job has been submitted, skip submitting a new job' % work_dir)
            return
        # the job is completed
        if os.path.isfile('done.txt'):
            print('The job in the folder: %s is Done' % work_dir)
            return

    # submit the job
    # sometimes a submission ends with "singularity: command not found" and exits (weird);
    # if that happens, try submitting the job from a scompile node
    submit_job_curc_or_run_script_local('job_hillshade_headwall_line_grid.sh', 'hillshade_headwall_line_grid.sh')

    os.chdir(curr_dir_before_start)
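
# For reference, a minimal sketch of the throttling pattern behind wait_if_reach_max_jobs
# (the real helper is defined elsewhere in this repo). The polling interval and how the
# job-name prefix is used are assumptions for illustration.
def _wait_if_reach_max_jobs_sketch(max_job_count, job_name_prefix, check_interval=60):
    # block until the number of submitted jobs drops below max_job_count
    while True:
        job_count = slurm_utility.get_submit_job_count(curc_username)
        if job_count >= max_job_count:
            print(machine_name, datetime.now(),
                  'You have submitted %d or more jobs (%s), wait' % (max_job_count, job_name_prefix))
            time.sleep(check_interval)
            continue
        break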
def submit_extract_headwall_job(slope_tifs, idx, max_job_count):
    wait_if_reach_max_jobs(max_job_count, 'HW')

    job_name = 'HW%d' % idx
    check_length_jobname(job_name)
    work_dir = working_dir_string(idx, 'extract_headwall_', root=root_dir)
    if os.path.isdir(work_dir) is False:
        io_function.mkdir(work_dir)
        os.chdir(work_dir)
        io_function.save_list_to_txt('slope_tif_list.txt', slope_tifs)

        # run segmentation
        sh_list = ['job_healwall.sh', 'extract_headwall_from_slope.sh']
        copy_curc_job_files(jobsh_dir, work_dir, sh_list)
        slurm_utility.modify_slurm_job_sh('job_healwall.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)
        # the job is completed
        if os.path.isfile('done.txt'):
            print('The job in the folder: %s is Done' % work_dir)
            return
        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print('The folder: %s already exists and the job has been submitted, skip submitting a new job' % work_dir)
            return

    # submit the job
    # sometimes a submission ends with "singularity: command not found" and exits (weird);
    # if that happens, try submitting the job from a scompile node
    submit_job_curc_or_run_script_local('job_healwall.sh', 'extract_headwall_from_slope.sh')

    os.chdir(curr_dir_before_start)
    return
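
# Hypothetical driver sketch: split the slope tifs into fixed-size batches and submit
# one extract-headwall job per batch. The batch size, glob pattern, and function name
# are assumptions for illustration, not part of this repo.
def _submit_all_headwall_jobs_sketch(slope_tif_dir, batch_size=20, max_job_count=10):
    import glob
    all_slope_tifs = sorted(glob.glob(os.path.join(slope_tif_dir, '*_slope.tif')))
    for job_idx, start in enumerate(range(0, len(all_slope_tifs), batch_size)):
        batch = all_slope_tifs[start:start + batch_size]
        submit_extract_headwall_job(batch, job_idx, max_job_count)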
def submit_training_job(idx, lr, iter_num, batch_size, backbone, buffer_size, training_data_per,
                        data_augmentation, data_aug_ignore_classes):
    # wait if too many jobs have been submitted
    while True:
        job_count = slurm_utility.get_submit_job_count(curc_username)
        if job_count >= 5:
            print(machine_name, datetime.now(), 'You have submitted 5 or more jobs, wait')
            time.sleep(60)
            continue
        break

    para_file = 'main_para_exp9.ini'
    job_name = 'tune%d' % idx
    work_dir = working_dir_string(idx, root=root_dir)
    if os.path.isdir(work_dir) is False:
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        # create a training folder
        copy_ini_files(ini_dir, work_dir, para_file, area_ini_list, backbone)

        # change para_file
        modify_parameter(os.path.join(work_dir, para_file), 'network_setting_ini', backbone)
        modify_parameter(os.path.join(work_dir, backbone), 'base_learning_rate', lr)
        modify_parameter(os.path.join(work_dir, backbone), 'batch_size', batch_size)
        modify_parameter(os.path.join(work_dir, backbone), 'iteration_num', iter_num)
        modify_parameter(os.path.join(work_dir, para_file), 'buffer_size', buffer_size)
        modify_parameter(os.path.join(work_dir, para_file), 'training_data_per', training_data_per)
        modify_parameter(os.path.join(work_dir, para_file), 'data_augmentation', data_augmentation)
        modify_parameter(os.path.join(work_dir, para_file), 'data_aug_ignore_classes', data_aug_ignore_classes)

        # run training
        # whole_procedure.run_whole_procedure(para_file, b_train_only=True)
        # copy job.sh, exe.sh and others, then submit the job
        copy_curc_job_files(jobsh_dir, work_dir)
        slurm_utility.modify_slurm_job_sh('job_tf_GPU.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)
        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print('The folder: %s already exists and the job has been submitted, skip submitting a new job' % work_dir)
            return work_dir, os.path.join(work_dir, para_file)

        # if results exist, the model is either well trained or stopped early
        early_stop, model_trained_iter = check_early_stopping_trained_iteration(work_dir, para_file)
        if early_stop is True:
            print('The folder: %s stopped early with a model trained for %d iterations, skip submitting a new job'
                  % (work_dir, model_trained_iter))
            return work_dir, os.path.join(work_dir, para_file)
        if model_trained_iter >= iter_num:
            print('The folder: %s has been trained for %d iterations (>= required), skip submitting a new job'
                  % (work_dir, model_trained_iter))
            return work_dir, os.path.join(work_dir, para_file)

    # submit the job
    # sometimes a submission ends with "singularity: command not found" and exits (weird);
    # if that happens, try submitting the job from a scompile node
    res = os.system('sbatch job_tf_GPU.sh')
    if res != 0:
        sys.exit(1)

    os.chdir(curr_dir_before_start)
    return work_dir, os.path.join(work_dir, para_file)
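
# Hypothetical usage sketch: submit one tuning job per hyper-parameter combination.
# The learning rates, batch sizes, backbone ini name, and other values below are
# illustrative assumptions, not the settings used in the actual experiments.
def _submit_tuning_grid_sketch():
    trained_folders = []
    idx = 0
    for lr in [0.007, 0.014]:
        for batch_size in [8, 16]:
            work_dir, ini_path = submit_training_job(idx, lr, 30000, batch_size,
                                                     'deeplabv3plus_xception65.ini',  # backbone ini (assumed name)
                                                     300, '0.9',
                                                     'blur,crop,bright,contrast', '')
            trained_folders.append((work_dir, ini_path))
            idx += 1
    return trained_folders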