def exec(
    self,
    input_path: str,
    output_path: str,
    aux_dir: str,
    additional_params: t.Optional[t.Dict[str, str]] = None,
    parallelize: bool = False,
    cluster_data_dir: t.Optional[str] = None,
    priority: int = 0,
    queue: str = "itaym",
    wait_until_complete: bool = False,
    get_completion_validator: bool = True,
) -> t.Union[float, str]:
    """Execute the program either locally or as a submitted cluster job.

    :param input_path: path to alignment file
    :param output_path: path in which the program should write its output
    :param aux_dir: directory in which auxiliary files should be generated by the
        job submission process
    :param additional_params: additional parameters unique to the program
    :param parallelize: whether execution of the program should be parallelized in
        the cluster or not
    :param cluster_data_dir: cluster directory that is mounted to the container
        data directory. Must be provided when parallelize is True
    :param priority: priority of the jobs
    :param queue: queue to submit the jobs to
    :param wait_until_complete: whether the main program should wait until
        completion of all jobs (recommended: True)
    :param get_completion_validator: whether a validator file should be generated
        upon job completion (recommended: True)
    :return: either the duration of the command in minutes, if no parallelization
        was selected, or the path to the touch file that is used for validation of
        job completion in case of parallelization. Implicitly returns None when
        the output already exists and nothing is executed.
    :raises RuntimeError: if a locally executed command exits with a non-zero code
    """
    additional_args = dict()
    # local import to avoid a circular import between this module and paml/busted
    from .paml import Paml
    from .busted import Busted
    # Paml and Busted need a tree file alongside the alignment; Paml additionally
    # needs a control file. Both paths are derived from the alignment file name.
    # BUGFIX: the regex patterns are now raw strings -- "\." inside a non-raw
    # literal is an invalid escape sequence (DeprecationWarning, and a syntax
    # error in future Python versions).
    if type(self) in [Paml, Busted]:
        additional_args["input_tree_path"] = re.sub(
            r"\.fas[^.]*", "_tree.nwk", input_path
        )
        if type(self) is Paml:
            additional_args["control_file_path"] = re.sub(
                r"\.fas[^.]*", "_paml.ctl", input_path
            )
    command = self.set_command(
        input_path=input_path,
        output_path=output_path,
        additional_params=additional_params,
        parallelize=parallelize,
        cluster_data_dir=cluster_data_dir,
        **additional_args,
    )
    os.makedirs(aux_dir, exist_ok=True)
    if os.path.exists(output_path):
        logger.info(
            f"{self.name} output already exists at {output_path} and will not be generated again"
        )
        return
    if not parallelize:
        start_time = time()
        if type(self) is not Paml:
            # move to aux dir as rate4site generates extra files in the current
            # running directory
            os.chdir(aux_dir)
        for cmd in command:
            if "cd " in cmd:
                os.chdir(cmd.replace("cd ", ""))
            else:
                # for some reason, rate4site prints some logs into stderr,
                # making the typical test (raise error if stderr > 0) invalid in
                # this case -- so rely on the exit code and silence all output
                res = os.system(f"{cmd} > /dev/null 2>&1")
                if res != 0:
                    raise RuntimeError(f"command {cmd} failed to execute.")
        end_time = time()
        return (end_time - start_time) / 60
    else:
        # translate the container-side aux dir to its cluster-side mount point,
        # and bracket the command with timestamps for runtime tracking in the log
        commands = (
            [
                f"cd {aux_dir.replace(os.environ['container_data_dir'], cluster_data_dir)}",
                """timestamp() {
  date +"%T" # current time
}
timestamp""",
            ]
            + command
            + ["timestamp"]
        )
        job = Job(
            name=self.name,
            sh_dir=aux_dir,
            output_dir=aux_dir,
            commands=commands,
            priority=priority,
            queue=queue,
        )
        completion_validator = job.submit(
            wait_until_complete=wait_until_complete,
            get_completion_validator=get_completion_validator,
        )
        return completion_validator
def exec_pipeline_on_simulations(input_path: click.Path):
    """Program to simulate multiple datasets and then submit pipeline jobs for each one.

    For example of the json format parameters, see data/test/simulation.json

    :param input_path: path to a json file holding the simulation parameters
    """
    # process input json file
    with open(input_path, "r") as input_file:
        simulation_params = json.load(input_file)
    os.makedirs(
        simulation_params["simulations_output_dir"],
        exist_ok=True,
    )

    # initialize the logger
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s module: %(module)s function: %(funcName)s line: %(lineno)d %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(f"{os.path.dirname(input_path)}/simulations.log"),
        ],
    )
    logger = logging.getLogger(__name__)
    logger.info("Json input has been successfully processed")

    logger.info(f"Processing simulation input from {input_path}")
    simulation_input = SimulationInput(**simulation_params)
    logger.info("Json input has been successfully parsed as simulation input")

    logger.info(f"Simulating data in {simulation_input.simulations_output_dir}")
    simulations_exist = False
    simulations_exec_complete = False
    repetitions_num = simulation_input.nrep
    # BUGFIX: the original compared os.listdir(...) (a list) to repetitions_num
    # (an int), which is always False, so existing simulations were never
    # detected; compare the number of entries instead.
    if os.path.exists(simulation_input.simulations_output_dir) and len(
        os.listdir(simulation_input.simulations_output_dir)
    ) == repetitions_num:
        simulations_exist = True
        all_exist = True
        for path in os.listdir(simulation_input.simulations_output_dir):
            # BUGFIX: simulation_params is a plain dict, so the original
            # attribute access (simulation_params.simulations_output_dir)
            # raised AttributeError; use the parsed simulation_input instead.
            completion_validator = f"{simulation_input.simulations_output_dir}/{path}/job_aux/pipeline_on_simulated_data.touch"
            if not os.path.exists(completion_validator):
                all_exist = False
                break
        if all_exist:
            simulations_exec_complete = True

    if not simulations_exist:
        pipeline_input_json_paths = SimulationTools.simulate(
            simulation_input=simulation_input
        )
        simulations_dirs = [
            f"{os.path.dirname(json_path)}/"
            for json_path in pipeline_input_json_paths
        ]
        logger.info("Simulation is complete.")
    else:
        simulations_dirs = [
            f"{simulation_input.simulations_output_dir}/{path}/"
            for path in os.listdir(simulation_input.simulations_output_dir)
        ]

    if not simulations_exec_complete:
        logger.info("submitting pipeline jobs for the simulated data")
        completion_validators = []
        for simulations_dir in simulations_dirs:
            aux_dir = f"{simulations_dir}/job_aux/"
            json_path = f"{simulations_dir}/input.json"
            # skip replicates whose pipeline job already finished
            if not os.path.exists(f"{aux_dir}/pipeline_on_simulated_data.touch"):
                job = Job(
                    name="pipeline_on_simulated_data",
                    sh_dir=aux_dir,
                    output_dir=aux_dir,
                    commands=[
                        f"python /groups/itay_mayrose/halabikeren/down_sampling_analysis/src/main.py --input_path={json_path}"
                    ],
                    priority=simulation_params["priority"],
                    queue=simulation_params["queue"],
                )
                completion_validators.append(
                    job.submit(
                        wait_until_complete=False,
                        get_completion_validator=True,
                    )
                )
        logger.info("Job submission is complete")

        # wait for jobs to complete (poll for the per-job touch files)
        for validator in completion_validators:
            while not os.path.exists(validator):
                sleep(60)

    # analyze large scale results: aggregate the per-replicate sample-overlap
    # tables into one dataframe and plot it
    paths = [
        path
        for path in os.listdir(simulation_input.simulations_output_dir)
        if "rep" in path
    ]
    overlap_dfs = []
    for path in paths:
        overlap_df_path = f"{simulation_input.simulations_output_dir}/{path}/pipeline_dir/samples/samples_overlap.csv"
        overlap_df = pd.read_csv(overlap_df_path)
        overlap_df["replicate"] = path
        overlap_df["compared_methods"] = overlap_df["method_1"].str.cat(
            overlap_df[["method_2"]], sep=","
        )
        overlap_dfs.append(overlap_df)
    full_overlap_df = pd.concat(overlap_dfs)
    plot_large_scale_samples_overlap(
        df=full_overlap_df,
        output_path=f"{simulation_input.simulations_output_dir}/samples_overlap.svg",
    )

    # per program: aggregate the per-replicate summary tables, derive bias
    # columns, and plot error/bias at large scale
    # (the replicate directory list is identical to `paths` computed above,
    # so it is reused rather than recomputed per program)
    for program in simulation_input.programs:
        data = []
        for path in paths:
            df_path = f"{simulation_input.simulations_output_dir}/{path}/pipeline_dir/tables/{program}_summary.csv"
            try:
                rep_data = pd.read_csv(df_path)
                rep_data["replicate"] = path
                data.append(rep_data)
            except Exception as e:
                # best-effort: a missing/corrupt replicate table is logged and
                # skipped rather than aborting the whole aggregation
                logger.error(
                    f"Failed to load dataframe from {df_path} due to error {e}"
                )
        full_df = pd.concat(data)
        full_df["full_bias"] = full_df["result"] - full_df["full_result"]
        full_df["simulated_bias"] = full_df["result"] - full_df["simulated"]
        full_df_grouped = (
            full_df.groupby(["replicate", "sampling_fraction", "sampling_method"])
            .mean()
            .reset_index()
        )
        full_df_grouped.to_csv(
            f"{simulation_params['simulations_output_dir']}/{program}_aggregated_data.csv"
        )

        # plot large scale data
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=f"{simulation_input.simulations_output_dir}/{program}_absolute_error.svg",
            use_relative_error=False,
        )
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=f"{simulation_input.simulations_output_dir}/{program}_relative_error.svg",
            use_relative_error=True,
        )
        plot_large_scale_bias(
            df=full_df_grouped,
            output_path=f"{simulation_input.simulations_output_dir}/{program}_bias.svg",
        )