# Example #1
    def exec(
        self,
        input_path: str,
        output_path: str,
        aux_dir: str,
        additional_params: t.Optional[t.Dict[str, str]] = None,
        parallelize: bool = False,
        cluster_data_dir: t.Optional[str] = None,
        priority: int = 0,
        queue: str = "itaym",
        wait_until_complete: bool = False,
        get_completion_validator: bool = True,
    ) -> t.Optional[t.Union[float, str]]:
        """Execute the program on an alignment, locally or via cluster jobs.

        :param input_path: path to alignment file
        :param output_path: path in which the program should write its output
        :param aux_dir: directory in which auxiliary files should be generated by the job submission process
        :param additional_params: additional parameters unique to the program
        :param parallelize: whether execution of the program should be parallelized in the cluster or not
        :param cluster_data_dir: cluster directory that is mounted to the container data directory. Must be provided when parallelize is True
        :param priority: priority of the jobs
        :param queue: queue to submit the jobs to
        :param wait_until_complete: whether the main program should wait until completion of all jobs (recommended: True)
        :param get_completion_validator: whether a validator file should be generated upon job completion (recommended: True)
        :return: the duration of the command in minutes if no parallelization was selected,
                 the path to the touch file used for validation of job completion in case of
                 parallelization, or None if the output already exists and execution is skipped
        """
        additional_args = dict()
        # local imports avoid a circular dependency between this module and the
        # program-specific subclasses
        from .paml import Paml
        from .busted import Busted

        # derive auxiliary file paths from the alignment file name; raw strings
        # avoid invalid-escape-sequence warnings in the regex patterns
        if type(self) in [Paml, Busted]:
            additional_args["input_tree_path"] = re.sub(
                r"\.fas[^.]*", "_tree.nwk", input_path)
        if type(self) is Paml:
            additional_args["control_file_path"] = re.sub(
                r"\.fas[^.]*", "_paml.ctl", input_path)
        command = self.set_command(
            input_path=input_path,
            output_path=output_path,
            additional_params=additional_params,
            parallelize=parallelize,
            cluster_data_dir=cluster_data_dir,
            **additional_args,
        )
        os.makedirs(aux_dir, exist_ok=True)

        # skip execution entirely when the output was already produced
        if os.path.exists(output_path):
            logger.info(
                f"{self.name} output already exists at {output_path} and will not be generated again"
            )
            return None

        if not parallelize:
            start_time = time()
            if type(self) is not Paml:
                os.chdir(
                    aux_dir
                )  # move to aux dir as rate4site generates extra files in current running directory
            for cmd in command:
                if "cd " in cmd:
                    os.chdir(cmd.replace("cd ", ""))
                else:
                    res = os.system(
                        f"{cmd} > /dev/null 2>&1"
                    )  # for some reason, rate4 prints some logs into the stderr,
                    # making the typical test (raise error if stderr > 0) invalid in this case
                    if res != 0:
                        raise RuntimeError(f"command {cmd} failed to execute.")
            end_time = time()
            return (end_time - start_time) / 60
        else:
            # prepend a cd into the cluster-side aux dir and bracket the command
            # list with timestamps so the job log records wall-clock duration
            commands = ([
                f"cd {aux_dir.replace(os.environ['container_data_dir'], cluster_data_dir)}",
                """timestamp() {
                      date +"%T" # current time
                    }
                    timestamp""",
            ] + command + ["timestamp"])

            job = Job(
                name=self.name,
                sh_dir=aux_dir,
                output_dir=aux_dir,
                commands=commands,
                priority=priority,
                queue=queue,
            )
            completion_validator = job.submit(
                wait_until_complete=wait_until_complete,
                get_completion_validator=get_completion_validator,
            )
            return completion_validator
def exec_pipeline_on_simulations(input_path: click.Path):
    """Program to simulate multiple datasets and then submit pipeline jobs for each one.

    For an example of the json format parameters, see data/test/simulation.json
    """

    # process input json file
    with open(input_path, "r") as input_file:
        simulation_params = json.load(input_file)
    os.makedirs(
        simulation_params["simulations_output_dir"],
        exist_ok=True,
    )

    # initialize the logger
    logging.basicConfig(
        level=logging.INFO,
        format=
        "%(asctime)s module: %(module)s function: %(funcName)s line: %(lineno)d %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(
                f"{os.path.dirname(input_path)}/simulations.log"),
        ],
    )
    logger = logging.getLogger(__name__)
    logger.info("Json input has been successfully processed")

    logger.info(f"Processing simulation input from {input_path}")
    simulation_input = SimulationInput(**simulation_params)
    logger.info("Json input has been successfully parsed as simulation input")

    logger.info(
        f"Simulating data in {simulation_input.simulations_output_dir}")
    simulations_exist = False
    simulations_exec_complete = False
    repetitions_num = simulation_input.nrep
    # BUGFIX: compare the *count* of directory entries to the expected number of
    # repetitions; the original compared the listing (a list) to an int, which
    # is always False, so existing simulations were never detected
    if (os.path.exists(simulation_input.simulations_output_dir)
            and len(os.listdir(simulation_input.simulations_output_dir))
            == repetitions_num):
        simulations_exist = True
        all_exist = True
        # BUGFIX: simulation_params is a plain dict, so attribute access raised
        # AttributeError; use the parsed simulation_input object instead (same
        # value, consistent with the rest of this function)
        for path in os.listdir(simulation_input.simulations_output_dir):
            completion_validator = f"{simulation_input.simulations_output_dir}/{path}/job_aux/pipeline_on_simulated_data.touch"
            if not os.path.exists(completion_validator):
                all_exist = False
                break
        if all_exist:
            simulations_exec_complete = True

    if not simulations_exist:
        pipeline_input_json_paths = SimulationTools.simulate(
            simulation_input=simulation_input)
        simulations_dirs = [
            f"{os.path.dirname(json_path)}/"
            for json_path in pipeline_input_json_paths
        ]
        logger.info(f"Simulation is complete.")

    else:
        simulations_dirs = [
            f"{simulation_input.simulations_output_dir}/{path}/"
            for path in os.listdir(simulation_input.simulations_output_dir)
        ]

    if not simulations_exec_complete:
        logger.info(f"submitting pipeline jobs for the simulated data")
        completion_validators = []
        for simulations_dir in simulations_dirs:
            aux_dir = f"{simulations_dir}/job_aux/"
            json_path = f"{simulations_dir}/input.json"
            # only submit a job if its completion touch file is absent
            if not os.path.exists(
                    f"{aux_dir}/pipeline_on_simulated_data.touch"):
                job = Job(
                    name="pipeline_on_simulated_data",
                    sh_dir=aux_dir,
                    output_dir=aux_dir,
                    commands=[
                        f"python /groups/itay_mayrose/halabikeren/down_sampling_analysis/src/main.py --input_path={json_path}"
                    ],
                    priority=simulation_params["priority"],
                    queue=simulation_params["queue"],
                )
                completion_validators.append(
                    job.submit(
                        wait_until_complete=False,
                        get_completion_validator=True,
                    ))
        logger.info(f"Job submission is complete")

        # wait for jobs to complete by polling for their validator files
        for validator in completion_validators:
            while not os.path.exists(validator):
                sleep(60)

    # analyze large scale results; the same replicate directories are reused
    # below for the per-program summaries, so compute the listing once
    paths = [
        path for path in os.listdir(simulation_input.simulations_output_dir)
        if "rep" in path
    ]
    overlap_dfs = []
    for path in paths:
        overlap_df_path = f"{simulation_input.simulations_output_dir}/{path}/pipeline_dir/samples/samples_overlap.csv"
        overlap_df = pd.read_csv(overlap_df_path)
        overlap_df["replicate"] = path
        overlap_df["compared_methods"] = overlap_df["method_1"].str.cat(
            overlap_df[["method_2"]], sep=",")
        overlap_dfs.append(overlap_df)
    full_overlap_df = pd.concat(overlap_dfs)
    plot_large_scale_samples_overlap(
        df=full_overlap_df,
        output_path=
        f"{simulation_input.simulations_output_dir}/samples_overlap.svg",
    )

    for program in simulation_input.programs:
        data = []
        for path in paths:
            df_path = f"{simulation_input.simulations_output_dir}/{path}/pipeline_dir/tables/{program}_summary.csv"
            # best-effort load: a missing/corrupt per-replicate table is logged
            # and skipped rather than aborting the whole analysis
            try:
                rep_data = pd.read_csv(df_path)
                rep_data["replicate"] = path
                data.append(rep_data)
            except Exception as e:
                logger.error(
                    f"Failed to load dataframe from {df_path} due to error {e}"
                )
        full_df = pd.concat(data)
        full_df["full_bias"] = full_df["result"] - full_df["full_result"]
        full_df["simulated_bias"] = full_df["result"] - full_df["simulated"]
        full_df_grouped = (full_df.groupby(
            ["replicate", "sampling_fraction",
             "sampling_method"]).mean().reset_index())
        full_df_grouped.to_csv(
            f"{simulation_params['simulations_output_dir']}/{program}_aggregated_data.csv"
        )

        # plot large scale data
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=
            f"{simulation_input.simulations_output_dir}/{program}_absolute_error.svg",
            use_relative_error=False,
        )
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=
            f"{simulation_input.simulations_output_dir}/{program}_relative_error.svg",
            use_relative_error=True,
        )
        plot_large_scale_bias(
            df=full_df_grouped,
            output_path=
            f"{simulation_input.simulations_output_dir}/{program}_bias.svg",
        )