def __run_p4a_process(self):
    self.files_to_compile += [file_dict['file_full_path'] for file_dict in self.get_file_list()]
    command = 'PATH=/bin:$PATH p4a -vv ' + ' '.join(self.files_to_compile)
    if self.extra_files:
        command += f' {" ".join(self.extra_files)}'
    command += ' ' + ' '.join(map(str, super().get_compilation_flags()))
    if self.include_dirs_list:
        command += ' -I ' + ' -I '.join(
            map(lambda x: os.path.join(self.get_input_file_directory(), str(x)), self.include_dirs_list))
    try:
        logger.info(f'{Par4all.__name__}: start parallelizing')
        stdout, stderr, ret_code = run_subprocess([command, ], self.get_input_file_directory())
        log_file_path = os.path.join(self.get_input_file_directory(), Par4allConfig.LOG_FILE_NAME)
        logger.log_to_file(f'{stdout}\n{stderr}', log_file_path)
        logger.debug(f'{Par4all.__name__}: {stdout}')
        logger.debug_error(f'{Par4all.__name__}: {stderr}')
        logger.info(f'{Par4all.__name__}: finished parallelizing')
    except subprocess.CalledProcessError as e:
        log_file_path = os.path.join(self.get_input_file_directory(), Par4allConfig.LOG_FILE_NAME)
        logger.log_to_file(f'{e.output}\n{e.stderr}', log_file_path)
        raise CombinationFailure(
            f'par4all returned with code {e.returncode}: {str(e)} : {e.output} : {e.stderr}')
    except Exception as e:
        raise CompilationError(
            f'{e}\nfiles in directory {self.get_input_file_directory()} failed to be parallelized!')
def compile(self):
    super().compile()
    try:
        for file in self.get_file_list():
            Cetus.replace_line_in_code(file["file_full_path"], GlobalsConfig.OMP_HEADER, '')
            cwd_path = os.path.dirname(file["file_full_path"])
            self.copy_headers(cwd_path)
            logger.info(f'{Cetus.__name__}: start parallelizing {file["file_name"]}')
            command = [f'cetus {" ".join(self.get_compilation_flags())} {file["file_name"]}']
            stdout, stderr, ret_code = run_subprocess(command, cwd_path)
            log_file_path = f'{os.path.splitext(file["file_full_path"])[0]}{CetusConfig.LOG_FILE_SUFFIX}'
            logger.log_to_file(f'{stdout}\n{stderr}', log_file_path)
            logger.debug(f'{Cetus.__name__}: {stdout}')
            logger.debug_error(f'{Cetus.__name__}: {stderr}')
            logger.info(f'{Cetus.__name__}: finished parallelizing {file["file_name"]}')
            # Move the file from the Cetus output folder back over the original file
            if os.path.isdir(os.path.join(cwd_path, CetusConfig.OUTPUT_DIR_NAME)):
                src_file = os.path.join(cwd_path, CetusConfig.OUTPUT_DIR_NAME, file["file_name"])
                dst_file = file["file_full_path"]
                shutil.copy(src_file, dst_file)
                shutil.rmtree(os.path.join(cwd_path, CetusConfig.OUTPUT_DIR_NAME))
            Cetus.inject_line_in_code(file["file_full_path"], GlobalsConfig.OMP_HEADER)
        return True
    except subprocess.CalledProcessError as ex:
        log_file_path = f'{os.path.splitext(file["file_full_path"])[0]}{CetusConfig.LOG_FILE_SUFFIX}'
        logger.log_to_file(f'{ex.output}\n{ex.stderr}', log_file_path)
        raise CombinationFailure(
            f'cetus returned with code {ex.returncode}: {str(ex)} : {ex.output} : {ex.stderr}')
    except Exception as ex:
        raise CompilationError(
            f'{ex} files in directory {self.get_input_file_directory()} failed to be parallelized!')
def run_parallel_combinations(self):
    logger.info('Starting to work on parallel combinations')
    self.parallel_jobs_pool_executor.create_jobs_pool()
    # If equal to one, we neither append the repetition index to the combination id nor calculate an average
    is_multiple_combinations = self.multiple_combinations > 1
    for combination_json in self.db.combinations_iterator():
        original_combination_obj = Combination.json_to_obj(combination_json)
        logger.info(LogPhrases.NEW_COMBINATION.format(original_combination_obj.combination_id))
        for i in range(self.multiple_combinations):
            if is_multiple_combinations:
                combination_obj = copy.deepcopy(original_combination_obj)
                combination_obj.combination_id = f'{combination_obj.combination_id}_{i}'
                logger.info(f'#{i} repetition of {original_combination_obj.combination_id} combination')
            else:
                combination_obj = original_combination_obj
            combination_folder_path = self.create_combination_folder(str(combination_obj.get_combination_id()))
            try:
                self.parallel_compilation_of_one_combination(combination_obj, combination_folder_path)
                self.compile_combination_to_binary(combination_folder_path)
            except Exception as ex:
                logger.info_error(f'Exception at {Compar.__name__}: {ex}')
                logger.debug_error(f'{traceback.format_exc()}')
                self.save_combination_as_failure(combination_obj.get_combination_id(), str(ex),
                                                 combination_folder_path)
                continue
            job = Job(combination_folder_path, combination_obj, self.main_file_parameters)
            self.parallel_jobs_pool_executor.run_job_in_thread(self.run_and_save_job, job)
    self.parallel_jobs_pool_executor.wait_and_finish_pool()
    if is_multiple_combinations:
        self.calculate_multiple_combinations_average()
    logger.info('Finished working on all the parallel combinations')
def delete_combination(self, combination_id: str):
    try:
        self.dynamic_db[self.collection_name].delete_one({"_id": combination_id})
        return True
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: Could not delete combination: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        return False
def run_makefile(self):
    logger.info(f'{Makefile.__name__}: started running makefile')
    command = ' && '.join(self.commands)
    stdout, stderr, ret_code = run_subprocess(command, self.working_directory)
    logger.debug(f'{Makefile.__name__}: {stdout}')
    logger.debug_error(f'{Makefile.__name__}: {stderr}')
    logger.info(f'{Makefile.__name__}: finished running makefile')
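# Note: run_subprocess is used throughout this listing but is not defined in it. From its call
# sites it is assumed to be a small wrapper around subprocess.run that returns
# (stdout, stderr, return_code) and raises subprocess.CalledProcessError on a non-zero exit code.
# A minimal sketch under those assumptions (the project's actual signature may differ):
import subprocess


def run_subprocess(command, cwd=None):
    """Run a shell command (string or list of strings); return (stdout, stderr, return_code)."""
    if isinstance(command, list):
        command = ' '.join(command)
    # shell=True so composite commands such as 'cmd1 && cmd2' work; check=True raises
    # subprocess.CalledProcessError when the command exits with a non-zero status.
    completed = subprocess.run(command, shell=True, cwd=cwd, capture_output=True, text=True, check=True)
    return completed.stdout, completed.stderr, completed.returncode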
def insert_new_combination_results(self, combination_result: dict):
    try:
        self.dynamic_db[self.collection_name].insert_one(combination_result)
        return True
    except Exception as e:
        logger.info_error(f'{Database.__name__}: cannot update dynamic DB: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        return False
def get_combination_results(self, combination_id: str):
    combination = None
    try:
        combination = self.dynamic_db[self.collection_name].find_one({"_id": combination_id})
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: Could not find results for combination: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
    finally:
        return combination
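# The 'logger' object used above is likewise project-specific. From its call sites it is assumed
# to expose info/debug plus helpers named info_error, debug_error and log_to_file. A rough sketch
# built on the standard logging module, purely for illustration (all names here are assumptions):
import logging

_log = logging.getLogger('compar')


class logger:
    @staticmethod
    def info(msg):
        _log.info(msg)

    @staticmethod
    def debug(msg):
        _log.debug(msg)

    @staticmethod
    def info_error(msg):
        # errors surfaced at normal verbosity
        _log.error(msg)

    @staticmethod
    def debug_error(msg):
        # verbose error details (tracebacks, tool stderr)
        _log.debug(msg)

    @staticmethod
    def log_to_file(content, file_path):
        # append raw tool output to a per-combination log file
        with open(file_path, 'a') as f:
            f.write(content + '\n')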
def run_and_save_job(self, job_obj: Job):
    try:
        job_obj = self.execute_job(job_obj, self.serial_run_time)
    except Exception as ex:
        logger.info_error(f'Exception at {Compar.__name__}: {ex}')
        logger.debug_error(f'{traceback.format_exc()}')
    finally:
        if not self.save_combinations_folders:
            self.__delete_combination_folder(job_obj.get_directory_path())
def run_compiler(self):
    # Appending a separator before dirname() normalizes any trailing slash, so basename() yields the directory name
    input_file_path_only = os.path.dirname(self.get_input_file_directory() + os.path.sep)
    dir_name = os.path.basename(input_file_path_only)
    logger.info(f'{BinaryCompiler.__name__}: started compiling {self.get_main_c_file()}')
    command = [self.get_compiler_name(), "-fopenmp"] + self.get_compilation_flags()
    command += [self.get_main_c_file(), "-o", dir_name + ".x"]
    stdout, stderr, ret_code = run_subprocess(command, self.get_input_file_directory())
    logger.debug(f'{BinaryCompiler.__name__}: {stdout}')
    logger.debug_error(f'{BinaryCompiler.__name__}: {stderr}')
    logger.info(f'{BinaryCompiler.__name__}: finished compiling {self.get_main_c_file()}')
def run_compiler(self):
    input_file_path_only = os.path.dirname(self.get_input_file_directory() + os.path.sep)
    dir_name = os.path.basename(input_file_path_only)
    logger.info(f'{Icc.__name__}: started compiling {self.get_main_c_file()}')
    command = [self.get_compiler_name(), "-fopenmp"] + self.get_compilation_flags()
    command += [self.get_main_c_file(), "-o", dir_name + ".x"]
    stdout, stderr, ret_code = run_subprocess(command, self.get_input_file_directory())
    logger.debug(stdout)
    logger.debug_error(stderr)
    logger.info(f'{Icc.__name__}: finished compiling {self.get_main_c_file()}')
def __remove_bswap_function(file_path: str):
    bswap_regex = re.compile(r'static __uint64_t __bswap_64[^\}]*\}', flags=re.DOTALL)
    try:
        with open(file_path, 'r+') as f:
            content = f.read()
            # search rather than match: the __bswap_64 definition is rarely at the very start of the file
            if bswap_regex.search(content):
                content = bswap_regex.sub('', content)
                f.seek(0)
                f.write(content)
                f.truncate()
    except Exception as e:
        logger.info_error(f'Exception at {Par4all.__name__}: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
def run_autopar(self, file_name: str, file_full_path: str, options: list):
    logger.info(f'{Autopar.__name__}: started parallelizing {file_name}')
    command = 'autoPar'
    if self.include_dirs_list:
        command += ' -I' + ' -I'.join(map(lambda x: os.path.join(self.get_input_file_directory(), str(x)),
                                          self.include_dirs_list))
    command += f' {" ".join(options)} -c {file_name}'
    stdout, stderr, ret_code = run_subprocess([command], os.path.dirname(file_full_path))
    log_file_path = f'{os.path.splitext(file_full_path)[0]}{AutoParConfig.LOG_FILE_SUFFIX}'
    logger.log_to_file(f'{stdout}\n{stderr}', log_file_path)
    dir_path, file_name = os.path.split(file_full_path)
    parallel_file_full_path = os.path.join(dir_path, f'{AutoParConfig.OUTPUT_FILE_NAME_PREFIX}{file_name}')
    if os.path.exists(parallel_file_full_path):
        os.remove(file_full_path)
        os.rename(parallel_file_full_path, file_full_path)
    logger.debug(f'{Autopar.__name__}: {stdout}')
    logger.debug_error(f'{Autopar.__name__}: {stderr}')
    logger.info(f'{Autopar.__name__}: finished parallelizing {file_name}')
def get_combination_from_static_db(self, combination_id: str):
    combination = None
    if combination_id == self.SERIAL_COMBINATION_ID:
        return {
            "_id": Database.SERIAL_COMBINATION_ID,
            "compiler_name": Database.SERIAL_COMBINATION_ID,
            "parameters": {
                "omp_rtl_params": [],
                "omp_directives_params": [],
                "compilation_params": []
            }
        }
    try:
        combination = self.static_db[self.collection_name].find_one({"_id": combination_id})
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: Could not find combination: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
    finally:
        return combination
def __run_user_script(self, script_name: str):
    json_script_file_path = os.path.join(GlobalsConfig.ASSETS_DIR_PATH, script_name)
    if os.path.exists(json_script_file_path):
        with open(json_script_file_path, 'r') as f:
            json_content = json.load(f)
            if self.NAME in json_content:
                user_script_path = json_content[self.NAME]
                if os.path.exists(user_script_path):
                    try:
                        script_command = f'{user_script_path} {self.get_input_file_directory()}'
                        std_out, std_err, ret_code = run_subprocess(script_command)
                        logger.debug(std_out)
                        logger.debug_error(std_err)
                    except subprocess.CalledProcessError as e:
                        logger.info_error(
                            f'{self.NAME}: user {script_name} script returned with {e.returncode}: {e}')
                        logger.info(e.output)
                        logger.info_error(e.stderr)
def initialize_static_db(self):
    try:
        combinations = generate_combinations()
        num_of_parallel_combinations = len(combinations)
        for combination in combinations:
            curr_combination_id = Database.generate_combination_id(combination)
            self.static_db[self.collection_name].update_one(
                filter={'_id': curr_combination_id},
                update={'$setOnInsert': combination},
                upsert=True
            )
        return num_of_parallel_combinations
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: cannot initialize static DB: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        raise DatabaseError()
    finally:
        del combinations
        slurm_parameters=args.slurm_parameters,
        extra_files=args.extra_files,
        main_file_rel_path=args.main_file_rel_path,
        time_limit=args.time_limit,
        slurm_partition=args.slurm_partition,
        test_file_path=args.test_file_path,
        mode=args.mode,
        code_with_markers=args.code_with_markers,
        clear_db=args.clear_db,
        multiple_combinations=args.multiple_combinations,
        log_level=args.log_level)
    try:
        compar_obj.fragment_and_add_timers()
        compar_obj.run_serial()
        compar_obj.run_parallel_combinations()
        compar_obj.generate_optimal_code()
        logger.info('Finished Compar execution')
    except Exception:
        if args.clear_db:
            compar_obj.clear_related_collections()
        raise


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.info_error(f'Exception at Compar Program: {e}')
        logger.debug_error(traceback.format_exc())
        exit(1)
def __run_with_sbatch(self, user_slurm_parameters: list):
    logger.info(f'Start running {self.get_job().get_combination().get_combination_id()} combination')
    slurm_parameters = user_slurm_parameters
    dir_path = self.get_job().get_directory_path()
    dir_name = os.path.basename(dir_path)
    x_file = dir_name + MakefileConfig.EXE_FILE_EXTENSION
    sbatch_script_file = self.__make_sbatch_script_file(x_file)
    log_file = dir_name + GlobalsConfig.LOG_EXTENSION
    x_file_path = os.path.join(dir_path, x_file)
    log_file_path = os.path.join(dir_path, log_file)
    slurm_parameters = " ".join(slurm_parameters)
    cmd = f'sbatch {slurm_parameters} -o {log_file_path} {sbatch_script_file} {x_file_path}'
    if self.get_job().get_exec_file_args():
        cmd += f' {" ".join([str(arg) for arg in self.get_job().get_exec_file_args()])} '
    stdout = ""
    batch_job_sent = False
    while not batch_job_sent:
        try:
            stdout, stderr, ret_code = run_subprocess(cmd)
            batch_job_sent = True
        except subprocess.CalledProcessError as ex:
            logger.info_error(f'Exception at {ExecuteJob.__name__}: {ex}\n{ex.output}\n{ex.stderr}')
            logger.debug_error(f'{traceback.format_exc()}')
            logger.info_error('sbatch command not responding (slurm is down?)')
            time.sleep(ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
    # Extract the numeric job id from the sbatch output
    result = stdout
    result = re.findall('[0-9]', str(result))
    result = ''.join(result)
    self.get_job().set_job_id(result)
    logger.info(LogPhrases.JOB_SENT_TO_SLURM.format(self.get_job().get_job_id()))
    cmd = f"squeue -j {self.get_job().get_job_id()} --format %t"
    last_status = ''
    is_first_time = True
    is_finish = False
    while not is_finish:
        try:
            stdout, stderr = '', ''
            try:
                stdout, stderr, ret_code = run_subprocess(cmd)
            except subprocess.CalledProcessError:
                # check if squeue is not working or if the job finished
                _, _, ret_code = run_subprocess('squeue')
                if ret_code != 0:
                    raise
                else:
                    is_finish = True
            current_status = ''
            try:
                current_status = stdout.split('\n')[1]
            except IndexError:
                if not is_finish:
                    logger.info_error(f'Warning: check the squeue command output: {stdout} {stderr}')
                    time.sleep(ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
                    continue
            if current_status != last_status and current_status != '':
                logger.info(f'Job {self.get_job().get_job_id()} status is {current_status}')
                last_status = current_status
            if not is_finish and not is_first_time:
                # not is_first_time - sometimes the job goes to COMPLETE immediately (fast running)
                time.sleep(ExecuteJobConfig.CHECK_SQUEUE_SECOND_TIME)
            if is_first_time:
                is_first_time = False
        except subprocess.CalledProcessError as ex:
            # squeue command not responding (slurm is down?)
            logger.info_error(f'Exception at {ExecuteJob.__name__}: {ex}\n{ex.stdout}\n{ex.stderr}')
            logger.debug_error(f'{traceback.format_exc()}')
            logger.info_error('squeue command not responding (slurm is down?)')
            time.sleep(ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
    logger.info(LogPhrases.JOB_IS_COMPLETE.format(self.get_job().get_job_id()))