def run_parallel_combinations(self):
    """Compile every parallel combination from the DB and execute each one through the jobs pool."""
    logger.info('Start to work on parallel combinations')
    self.parallel_jobs_pool_executor.create_jobs_pool()
    # With a single repetition there is no need to suffix the combination id,
    # nor to average results at the end.
    should_repeat = self.multiple_combinations > 1
    for combination_json in self.db.combinations_iterator():
        base_combination = Combination.json_to_obj(combination_json)
        logger.info(LogPhrases.NEW_COMBINATION.format(base_combination.combination_id))
        for repetition in range(self.multiple_combinations):
            if should_repeat:
                # Work on a deep copy so each repetition gets its own suffixed id.
                current_combination = copy.deepcopy(base_combination)
                current_combination.combination_id = f'{current_combination.combination_id}_{repetition}'
                logger.info(f'#{repetition} repetition of {base_combination.combination_id} combination')
            else:
                current_combination = base_combination
            combination_folder_path = self.create_combination_folder(str(current_combination.get_combination_id()))
            try:
                self.parallel_compilation_of_one_combination(current_combination, combination_folder_path)
                self.compile_combination_to_binary(combination_folder_path)
            except Exception as ex:
                # A failed compilation marks the combination as failed and moves on.
                logger.info_error(f'Exception at {Compar.__name__}: {ex}')
                logger.debug_error(f'{traceback.format_exc()}')
                self.save_combination_as_failure(current_combination.get_combination_id(), str(ex),
                                                 combination_folder_path)
                continue
            job = Job(combination_folder_path, current_combination, self.main_file_parameters)
            self.parallel_jobs_pool_executor.run_job_in_thread(self.run_and_save_job, job)
    self.parallel_jobs_pool_executor.wait_and_finish_pool()
    if should_repeat:
        self.calculate_multiple_combinations_average()
    logger.info('Finish to work on all the parallel combinations')
def delete_combination(self, combination_id: str):
    """Delete the combination document with the given id from the dynamic DB.

    Returns True on success, False when the delete raised.
    """
    try:
        self.dynamic_db[self.collection_name].delete_one({"_id": combination_id})
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: Could not delete combination: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        return False
    return True
def insert_new_combination_results(self, combination_result: dict):
    """Insert one combination-result document into the dynamic DB.

    Returns True on success, False when the insert raised.
    """
    try:
        self.dynamic_db[self.collection_name].insert_one(combination_result)
    except Exception as e:
        logger.info_error(f'{Database.__name__}: cannot update dynamic DB: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        return False
    return True
def get_combination_results(self, combination_id: str):
    """Fetch the results document for *combination_id* from the dynamic DB.

    Returns the document, or None when it is missing or the lookup raised.

    Fix: the original returned from inside ``finally``, which silently swallows
    any exception still propagating (e.g. KeyboardInterrupt) — the return now
    lives on the normal and handled paths only.
    """
    try:
        # find_one itself returns None when no document matches.
        return self.dynamic_db[self.collection_name].find_one({"_id": combination_id})
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: Could not find results for combination: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        return None
def combinations_iterator(self):
    """Yield every combination from the static DB that does not have results yet."""
    try:
        for combination in self.static_db[self.collection_name].find():
            # Skip combinations already executed in a previous run.
            if not self.combination_has_results(combination['_id']):
                yield combination
    except Exception:
        logger.info_error(f"Exception at {Database.__name__}: get_next_combination")
        raise
def run_and_save_job(self, job_obj: Job):
    """Execute one job; always clean up its working folder unless folders are kept."""
    try:
        job_obj = self.execute_job(job_obj, self.serial_run_time)
    except Exception as ex:
        logger.info_error(f'Exception at {Compar.__name__}: {ex}')
        logger.debug_error(f'{traceback.format_exc()}')
    finally:
        # The folder is removed whether the job succeeded or not,
        # unless the user explicitly asked to keep combination folders.
        keep_folders = self.save_combinations_folders
        if not keep_folders:
            self.__delete_combination_folder(job_obj.get_directory_path())
def __get_collection_name(project_name):
    """Build the per-user collection name, truncating when the Mongo namespace would exceed the limit."""
    collection_name = f"{getpass.getuser()}_{project_name}"
    namespaces = (
        f'{DatabaseConfig.STATIC_DB_NAME}.{collection_name}',
        f'{DatabaseConfig.DYNAMIC_DB_NAME}.{collection_name}',
    )
    # Only the longer of the two namespaces can violate the limit.
    longest_namespace = max(namespaces, key=len)
    if len(longest_namespace) > DatabaseConfig.NAMESPACE_LENGTH_LIMIT:
        # Truncate the full namespace, then keep only the collection part (text after the DB name).
        new_name = longest_namespace[:DatabaseConfig.NAMESPACE_LENGTH_LIMIT].split('.')[1]
        logger.info_error(f'DB namespace is too long! (max is {DatabaseConfig.NAMESPACE_LENGTH_LIMIT} characters)')
        logger.info_error(f'The name was changed from {collection_name} to {new_name}')
        collection_name = new_name
    return collection_name
def __remove_bswap_function(file_path: str):
    """Strip the injected ``__bswap_64`` helper definition from *file_path*, in place.

    Fix: the original used ``re.match``, which anchors at the start of the file,
    so the helper was removed only when it happened to be the very first text in
    the file; ``re.search`` finds it anywhere.
    """
    bswap_regex = re.compile(r'static __uint64_t __bswap_64[^\}]*\}', flags=re.DOTALL)
    try:
        with open(file_path, 'r+') as f:
            content = f.read()
            if bswap_regex.search(content):
                content = bswap_regex.sub('', content)
                # Rewrite the file from the top and truncate the leftover tail.
                f.seek(0)
                f.write(content)
                f.truncate()
    except Exception as e:
        logger.info_error(f'Exception at {Par4all.__name__}: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
def get_combination_from_static_db(self, combination_id: str):
    """Fetch a combination document by id from the static DB.

    The special serial id resolves to a synthetic combination with empty
    parameter lists (the serial run has no real DB entry). Returns None when
    the combination is missing or the lookup raised.

    Fix: the original returned from inside ``finally``, which silently swallows
    any exception still propagating — the return now lives on the normal and
    handled paths only.
    """
    if combination_id == self.SERIAL_COMBINATION_ID:
        return {
            "_id": Database.SERIAL_COMBINATION_ID,
            "compiler_name": Database.SERIAL_COMBINATION_ID,
            "parameters": {
                "omp_rtl_params": [],
                "omp_directives_params": [],
                "compilation_params": []
            }
        }
    try:
        return self.static_db[self.collection_name].find_one({"_id": combination_id})
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: Could not find combination: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        return None
def __analyze_job_exit_code(self):
    """Query sacct for the job's exit code and raise when it reports a non-zero code."""
    job_id = self.get_job().get_job_id()
    command = f"sacct -j {job_id} --format=exitcode"
    try:
        stdout, stderr, ret_code = run_subprocess(command)
        lines = stdout.replace("\r", "").split("\n")
        # sacct prints two header lines; the third line carries "left:right" codes.
        if len(lines) < 3:
            logger.info_error(
                f'Warning: sacct command - no results for job id: {job_id}.'
            )
            return
        codes = lines[2].replace(" ", "").split(":")
        left_code, right_code = int(codes[0]), int(codes[1])
        if left_code != 0 or right_code != 0:
            raise Exception(
                f"Job id: {job_id} ended with return code: {left_code}:{right_code}."
            )
    except subprocess.CalledProcessError as ex:
        logger.info_error(
            f'Warning: sacct command not responding (slurm is down?)\n{ex.output}\n{ex.stderr}'
        )
def __run_user_script(self, script_name: str):
    """Run the user hook script configured for this compiler in the assets JSON, if any."""
    json_script_file_path = os.path.join(GlobalsConfig.ASSETS_DIR_PATH, script_name)
    # Guard clauses: silently do nothing when the config file, the entry for
    # this compiler, or the referenced script is missing.
    if not os.path.exists(json_script_file_path):
        return
    with open(json_script_file_path, 'r') as f:
        json_content = json.load(f)
    if self.NAME not in json_content:
        return
    user_script_path = json_content[self.NAME]
    if not os.path.exists(user_script_path):
        return
    try:
        script_command = f'{user_script_path} {self.get_input_file_directory()}'
        std_out, std_err, ret_code = run_subprocess(script_command)
        logger.debug(std_out)
        logger.debug_error(std_err)
    except subprocess.CalledProcessError as e:
        logger.info_error(
            f'{self.NAME}: user {script_name} script return with {e.returncode}: {e}'
        )
        logger.info(e.output)
        logger.info_error(e.stderr)
def initialize_static_db(self):
    """Populate the static DB with all generated combinations (idempotent upserts).

    Returns the number of parallel combinations.
    Raises DatabaseError when the DB cannot be initialized.

    Fix: the original ``finally: del combinations`` raised NameError — masking
    the real error — whenever generate_combinations() itself failed, because the
    name was never bound. It is now pre-bound so the cleanup always succeeds.
    """
    combinations = None
    try:
        combinations = generate_combinations()
        num_of_parallel_combinations = len(combinations)
        for combination in combinations:
            curr_combination_id = Database.generate_combination_id(combination)
            # Upsert so re-running initialization never duplicates documents.
            self.static_db[self.collection_name].update_one(
                filter={
                    '_id': curr_combination_id
                },
                update={
                    '$setOnInsert': combination
                },
                upsert=True
            )
        return num_of_parallel_combinations
    except Exception as e:
        logger.info_error(f'Exception at {Database.__name__}: cannot initialize static DB: {e}')
        logger.debug_error(f'{traceback.format_exc()}')
        raise DatabaseError()
    finally:
        # Release the (potentially large) combinations list promptly.
        del combinations
def trigger_test_output_test(test_file_path: str, working_dir: str = "",
                             output_file_name: str = "", check_for_existence: bool = False):
    """Run the combination-validator unit test through pytest and return its exit code."""
    command_parts = ["pytest", f"{test_file_path}::{CombinationValidator.UNIT_TEST_NAME}"]
    if working_dir:
        command_parts += ["--working_dir", working_dir]
    if output_file_name:
        command_parts += ["--output_file_name", output_file_name]
    command = " ".join(command_parts)
    try:
        stdout, stderr, exit_code = run_subprocess(command)
    except CalledProcessError as e:
        # pytest signals test failures through its return code; anything outside
        # the known ExitCode values means pytest itself could not run.
        known_codes = [code for code in ExitCode]
        if e.returncode is None or e.returncode not in known_codes:
            logger.info_error(
                f"{CombinationValidator.__name__}: "
                f"pytest operation failed. could not run the test.\n{e}")
            return ExitCode.INTERNAL_ERROR
        stdout, stderr, exit_code = e.stdout, e.stderr, e.returncode
    except Exception as ex:
        logger.info_error(
            f"{CombinationValidator.__name__}: exception thrown during pytest operation."
            f" could not run the test.\n{ex}")
        return ExitCode.INTERNAL_ERROR
    if not check_for_existence:
        if exit_code == ExitCode.OK:
            logger.verbose(
                f"{CombinationValidator.__name__}: test '{CombinationValidator.UNIT_TEST_NAME}' passed."
            )
        else:
            logger.info_error(
                f"{CombinationValidator.__name__}: "
                f"test '{CombinationValidator.UNIT_TEST_NAME}' failed.")
            logger.debug(
                f"{CombinationValidator.__name__}: {stdout}\n{stderr}.")
    return exit_code
def __run_with_sbatch(self, user_slurm_parameters: list):
    """Submit the job's executable through sbatch and poll squeue until it finishes.

    Retries indefinitely (with sleeps) when sbatch/squeue do not respond,
    assuming slurm may come back up.

    Fix: the job id used to be extracted with ``re.findall('[0-9]', ...)``,
    which concatenates EVERY digit in sbatch's output — any other number in
    the message corrupts the id. The first full number is now taken instead
    (sbatch prints "Submitted batch job <id>").
    """
    logger.info(
        f'Start running {self.get_job().get_combination().get_combination_id()} combination'
    )
    slurm_parameters = user_slurm_parameters
    dir_path = self.get_job().get_directory_path()
    dir_name = os.path.basename(dir_path)
    x_file = dir_name + MakefileConfig.EXE_FILE_EXTENSION
    sbatch_script_file = self.__make_sbatch_script_file(x_file)
    log_file = dir_name + GlobalsConfig.LOG_EXTENSION
    x_file_path = os.path.join(dir_path, x_file)
    log_file_path = os.path.join(dir_path, log_file)
    slurm_parameters = " ".join(slurm_parameters)
    cmd = f'sbatch {slurm_parameters} -o {log_file_path} {sbatch_script_file} {x_file_path}'
    if self.get_job().get_exec_file_args():
        cmd += f' {" ".join([str(arg) for arg in self.get_job().get_exec_file_args()])} '
    stdout = ""
    batch_job_sent = False
    # Keep retrying the submission until slurm accepts it.
    while not batch_job_sent:
        try:
            stdout, stderr, ret_code = run_subprocess(cmd)
            batch_job_sent = True
        except subprocess.CalledProcessError as ex:
            logger.info_error(
                f'Exception at {ExecuteJob.__name__}: {ex}\n{ex.output}\n{ex.stderr}'
            )
            logger.debug_error(f'{traceback.format_exc()}')
            logger.info_error(
                'sbatch command not responding (slurm is down?)')
            time.sleep(
                ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
    # Extract the job id: sbatch answers "Submitted batch job <id>",
    # so take the first run of digits in the output.
    id_match = re.search(r'[0-9]+', str(stdout))
    self.get_job().set_job_id(id_match.group() if id_match else '')
    logger.info(
        LogPhrases.JOB_SENT_TO_SLURM.format(self.get_job().get_job_id()))
    cmd = f"squeue -j {self.get_job().get_job_id()} --format %t"
    last_status = ''
    is_first_time = True
    is_finish = False
    while not is_finish:
        try:
            stdout, stderr = '', ''
            try:
                stdout, stderr, ret_code = run_subprocess(cmd)
            except subprocess.CalledProcessError:
                # Distinguish "squeue is down" from "job left the queue":
                # if a bare squeue works, the job simply finished.
                _, _, ret_code = run_subprocess('squeue')
                if ret_code != 0:
                    raise
                else:
                    is_finish = True
            current_status = ''
            try:
                # First output line is the header; the second holds the status.
                current_status = stdout.split('\n')[1]
            except IndexError:
                if not is_finish:
                    logger.info_error(
                        f'Warning: check the squeue command output: {stdout} {stderr}'
                    )
                    time.sleep(ExecuteJobConfig.
                               TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
                    continue
            if current_status != last_status and current_status != '':
                logger.info(
                    f'Job {self.get_job().get_job_id()} status is {current_status}'
                )
                last_status = current_status
            if not is_finish and not is_first_time:
                # not is_first_time - sometimes the job goes to COMPLETE
                # immediately (fast running), so skip the first sleep.
                time.sleep(ExecuteJobConfig.CHECK_SQUEUE_SECOND_TIME)
            if is_first_time:
                is_first_time = False
        except subprocess.CalledProcessError as ex:
            # squeue command not responding (slurm is down?)
            logger.info_error(
                f'Exception at {ExecuteJob.__name__}: {ex}\n{ex.stdout}\n{ex.stderr}'
            )
            logger.debug_error(f'{traceback.format_exc()}')
            logger.info_error(
                'squeue command not responding (slurm is down?)')
            time.sleep(
                ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
    logger.info(
        LogPhrases.JOB_IS_COMPLETE.format(self.get_job().get_job_id()))
slurm_parameters=args.slurm_parameters, extra_files=args.extra_files, main_file_rel_path=args.main_file_rel_path, time_limit=args.time_limit, slurm_partition=args.slurm_partition, test_file_path=args.test_file_path, mode=args.mode, code_with_markers=args.code_with_markers, clear_db=args.clear_db, multiple_combinations=args.multiple_combinations, log_level=args.log_level) try: compar_obj.fragment_and_add_timers() compar_obj.run_serial() compar_obj.run_parallel_combinations() compar_obj.generate_optimal_code() logger.info('Finish Compar execution') except Exception: if args.clear_db: compar_obj.clear_related_collections() raise if __name__ == "__main__": try: main() except Exception as e: logger.info_error(f'Exception at Compar Program: {e}') logger.debug_error(traceback.format_exc()) exit(1)