def create_pbs_file(env, cmd_run, pf_pbs, **kwargs):
    """Write a PBS submission script that executes `cmd_run` to `pf_pbs`.

    :param env: environment mapping; only "pd-work" (working directory) is read
    :param cmd_run: shell command placed at the end of the script
    :param pf_pbs: path of the PBS file to write
    :param kwargs: job_name, num_nodes, ppn, node_property, pbs-walltime
    """
    job_name = get_value(kwargs, "job_name", "JOB")
    num_nodes = get_value(kwargs, "num_nodes", 1)
    ppn = get_value(kwargs, "ppn", 1)
    node_property = get_value(kwargs, "node_property", "")
    walltime = get_value(kwargs, "pbs-walltime", "07:00:00")

    pd_work = env["pd-work"]

    # Assemble the script as a list of lines (each carrying its newline) and
    # join once, instead of repeated string concatenation.
    lines = [
        "#PBS -N {}\n".format(job_name),
        # stdout is joined into this file via "-j oe" below
        "#PBS -o {}/{}\n".format(pd_work, "error"),
        "#PBS -j oe\n",
        "#PBS -l nodes={}:ppn={}{}\n".format(num_nodes, ppn, node_property),
        "#PBS -l walltime={}\n".format(walltime),
        "#PBS -W umask=002\n",
        # NOTE(review): hard-coded per-user conda path — consider making configurable
        "export PATH=\"/home/karl/anaconda/envs/sbsp/bin:$PATH\"\n",
        "PBS_O_WORKDIR=" + pd_work + "\n",
        "cd $PBS_O_WORKDIR \n",
        "echo The working directory is `echo $PBS_O_WORKDIR`\n",
        "echo This job runs on the following nodes:\n",
        "echo `cat $PBS_NODEFILE`\n",
        "\n{}\n".format(cmd_run),
    ]

    from sbsp_io.general import write_string_to_file
    write_string_to_file("".join(lines), pf_pbs)
Example #2
0
    def _create_pbs_file(self, jobname, num_jobs, pf_pbs,
                         pf_input_package_template,
                         pf_output_package_template):
        """Create a PBS array-job file that runs all input jobs.

        :param jobname: Name of job
        :param num_jobs: number of array entries
        :param pf_pbs: path of the PBS file to write
        :param pf_input_package_template: template path for input packages
        :param pf_output_package_template: template path for output packages
        :return: None
        """
        # Unique compute directory creation is currently disabled; a None
        # pd_compute is passed through to the PBS helpers.
        # (previously: run_shell_cmd("mktemp --tmpdir={}".format(
        #  self._prl_options["pbs-pd-root-compute"])))
        pd_compute = None

        header = PBS._generate_pbs_header_array(
            num_jobs, jobname, self._prl_options, pd_compute=pd_compute)

        call = PBS._generate_call_command(
            self._env,
            pf_input_package_template,
            pf_output_package_template,
            self._prl_options,
            pd_compute=pd_compute)

        # write to file
        from sbsp_io.general import write_string_to_file
        write_string_to_file(header + "\n{}\n".format(call), pf_pbs)
Example #3
0
def write_sequence_list_to_fasta_file(sequences, pf_sequences):
    # type: (List[Seq], str) -> None
    """Write sequences to a FASTA file, using each sequence's list index
    as its FASTA header.

    :param sequences: sequences to write (anything with a sensible str())
    :param pf_sequences: path of the output FASTA file
    """
    # Build the whole text with a single join instead of repeated string
    # concatenation (which is quadratic), and use enumerate over range(len()).
    data = "".join(
        ">{}\n{}\n".format(i, seq) for i, seq in enumerate(sequences)
    )

    write_string_to_file(data, pf_sequences)
def write_fasta_hash_to_file(fasta, pf_output):
    # type: (Dict[str, Any], str) -> None
    """Write a header->sequence mapping to a FASTA file.

    Entries are written in the dict's iteration order.

    :param fasta: mapping of FASTA header to sequence
    :param pf_output: path of the output FASTA file
    """
    # Iterate items() directly (avoids one lookup per key) and build the
    # output with a single join instead of quadratic string concatenation.
    output = "".join(
        ">{}\n{}\n".format(header, seq) for header, seq in fasta.items()
    )

    write_string_to_file(output, pf_output)
Example #5
0
def write_to_temporary_alignment_file(pf_tmp, list_sequences):
    # type: (str, List[str]) -> str
    """Write aligned sequences to a PHYLIP-style alignment file.

    The first line holds the number of sequences and the alignment length;
    each following line is "sequence_<i>  <seq>" (1-based index).

    :param pf_tmp: path of the alignment file to write
    :param list_sequences: equal-length aligned sequences
    :return: pf_tmp
    :raises ValueError: if the list is empty or lengths differ
    """
    if not list_sequences:
        raise ValueError("No sequences to write to file")

    alignment_length = len(list_sequences[0])
    if any(len(s) != alignment_length for s in list_sequences):
        raise ValueError("Sequences should have the same length")

    # Header count was previously hard-coded to 2; use the actual number of
    # sequences so the file is valid for any input size (identical output
    # for the existing two-sequence callers).
    lines = ["{} {}".format(len(list_sequences), alignment_length)]
    for i, seq in enumerate(list_sequences, start=1):
        lines.append("sequence_{}  {}".format(i, seq))

    write_string_to_file("\n".join(lines) + "\n", pf_tmp)

    return pf_tmp
Example #6
0
def filter_orthologs(env, pf_data, pf_output, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> str
    """Filter ortholog rows read from `pf_data` and write the survivors
    to `pf_output` (plus a "<pf_output>_map" file of unique q-3prime values).

    :param env: environment object
    :param pf_data: path to input CSV of orthologs
    :param pf_output: path of filtered output CSV
    :param kwargs: msa_options, pf_filter_stats, filter_non_group_only
    :return: pf_output
    """
    msa_options = get_value(kwargs, "msa_options",
                            SBSPOptions(env))  # type: SBSPOptions
    pf_filter_stats = get_value(kwargs, "pf_filter_stats", None)
    filter_non_group_only = get_value(kwargs, "filter_non_group_only", True)

    from sbsp_alg.msa import filter_df, print_filter_stats_to_file, print_filter_stats

    # Distance column defaults to k2p unless overridden in the MSA options.
    column_distance = msa_options.safe_get("column-distance") or "k2p-distance"

    df = pd.read_csv(pf_data, header=0)

    filter_stats = {}  # type: Dict[str, Any]
    df = filter_df(
        df,
        msa_options,
        filter_stats=filter_stats,
        filter_non_group_only=filter_non_group_only,
        column_distance=column_distance,
    )

    # Report filtering statistics to a file when requested, else to stdout.
    if pf_filter_stats:
        print_filter_stats_to_file(filter_stats, pf_filter_stats)
    else:
        print_filter_stats(filter_stats)

    df.to_csv(pf_output, index=False)

    # Companion map file: unique query 3'-end identifiers, one per line.
    write_string_to_file("\n".join(set(df["q-3prime"].unique())),
                         pf_output + "_map")
    return pf_output
Example #7
0
def _run_codeml(seq_a, seq_b, **kwargs):
    # type: (str, str, Dict[str, Any]) -> Dict[str, Any]
    """Run CodeML on a pair of aligned sequences and return its results.

    A fresh scratch directory is created under pd_work, populated with the
    control file, alignment, and a trivial tree, then removed afterwards.

    :param seq_a: first aligned sequence
    :param seq_b: second aligned sequence
    :param kwargs: pd_work (working dir, default "."), pf_ctl (CodeML CTL file)
    :return: CodeML result dict; empty dict if the run fails
    :raises ValueError: if pf_ctl is missing or does not exist
    """
    pd_work = get_value(kwargs, "pd_work", ".", default_if_none=True)
    pf_ctl = get_value(kwargs, "pf_ctl", None)

    if pf_ctl is None:
        raise ValueError("Cannot compute distance without CTL file for CodeML")

    if not os.path.isfile(pf_ctl):
        raise ValueError("File doesn't exist: {}".format(pf_ctl))

    # Unique per-call scratch directory so concurrent runs don't collide.
    pd_codeml_run = os.path.join(
        pd_work, generate_random_non_existing_filename(pd_work))
    mkdir_p(pd_codeml_run)

    shutil.copyfile(pf_ctl, os.path.join(pd_codeml_run, "codeml.ctl"))

    pf_sequences = os.path.join(pd_codeml_run, "in.phy")
    write_to_temporary_alignment_file(pf_sequences, [seq_a, seq_b])

    pf_tree = os.path.join(pd_codeml_run, "in.tre")
    write_string_to_file("(1)\n", pf_tree)

    # run code ml
    scorer = codeml.Codeml(tree=pf_tree,
                           alignment=pf_sequences,
                           out_file=os.path.join(pd_codeml_run, "out.txt"),
                           working_dir=pd_codeml_run)

    try:
        results = scorer.run(ctl_file="codeml.ctl", verbose=False)
    except Exception:
        # Best-effort: a failed CodeML run yields an empty result rather
        # than propagating the error.
        results = {}

    shutil.rmtree(pd_codeml_run)

    return results
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Build (or load) a taxonomy tree annotated with per-taxid target
    counts and write its string rendering to args.pf_output."""

    if args.pf_load_state is not None:
        # Restore a previously saved, already-annotated tree.
        tree = load_obj(args.pf_load_state)
    else:
        gcfid_to_num_targets = count_targets_per_gcfid(args.pf_sbsp_output)

        df_summary = read_assembly_summary_into_dataframe(
            args.pf_assembly_summary)
        gcfid_to_info = get_assembly_info_per_gcfid(df_summary)

        # Map each taxid to its target count, skipping gcfids without
        # assembly info.
        taxid_to_num_targets = {
            int(gcfid_to_info[gcfid]["taxid"]): gcfid_to_num_targets[gcfid]
            for gcfid in gcfid_to_num_targets
            if gcfid in gcfid_to_info
        }

        tree = TaxonomyTree.load(args.pf_taxonomy_tree)

        # Aggregate counts up the tree so internal nodes carry totals.
        tree.update_tree_attributes(
            set_number_of_targets_per_taxid,
            {"taxid_to_number_of_targets": taxid_to_num_targets},
            direction="bottom-up")

        if args.pf_save_state is not None:
            save_obj(tree, args.pf_save_state)

    tree_string = tree.to_string(check_if_should_print=should_print,
                                 attribute_name="number_of_targets",
                                 attribute_format="{:,}",
                                 tag_name=args.tag,
                                 max_depth=args.max_depth)
    write_string_to_file(tree_string, args.pf_output)
Example #9
0
def print_taxonomy_tree(env, pf_taxonomy_tree, pf_assembly_summary, pf_output,
                        **kwargs):
    """Render the taxonomy tree with per-node RefSeq counts to `pf_output`.

    :param env: environment object (unused here; kept for interface parity)
    :param pf_taxonomy_tree: path of saved taxonomy tree
    :param pf_assembly_summary: path of assembly summary file
    :param pf_output: path of the rendered tree output
    :param kwargs: pf_names_of_interest plus pass-through rendering options
    """
    pf_names_of_interest = get_value(kwargs, "pf_names_of_interest", None)
    tax_tree = TaxonomyTree.load(pf_taxonomy_tree)
    taxid_to_info_list = get_rows_by_key(pf_assembly_summary,
                                         key="taxid",
                                         **kwargs)

    limit_path_to = None
    if pf_names_of_interest:
        limit_path_to = set(read_rows_to_list(pf_names_of_interest))

    # Each taxid counts once. (A previous version first built a dict of
    # len(taxid_to_info_list[taxid]) per taxid, but immediately overwrote
    # it with this one; that dead computation has been removed.)
    refseq_count_per_taxid = {taxid: 1 for taxid in taxid_to_info_list}

    def check_if_should_print(attributes):
        # type: (Dict[str, Any]) -> bool
        # An explicit "leads_to_node_of_interest" flag takes precedence;
        # otherwise print nodes with a non-zero RefSeq count.
        if "leads_to_node_of_interest" in attributes:
            return attributes["leads_to_node_of_interest"]
        return "num_refseq" in attributes and attributes["num_refseq"] != 0

    out = tax_tree.to_string_tree_with_stats(
        "num_refseq",
        count_refseq_under_node, {
            "refseq_count_per_taxid": refseq_count_per_taxid,
            "limit_path_to": limit_path_to,
        },
        check_if_should_print=check_if_should_print,
        attribute_format="{:,}",
        **kwargs)

    write_string_to_file(out, pf_output)
Example #10
0
 def _create_dummy_pbs_file(pf_dummy, jobname_dummy, pd_work):
     # type: (str, str, str) -> None
     """Write a minimal single-node PBS file whose job completes in one
     second (1 node, 1 ppn, walltime 00:00:01)."""
     header = PBS.generate_pbs_header(jobname_dummy, pd_work, 1, 1,
                                      "00:00:01")
     write_string_to_file(header, pf_dummy)
Example #11
0
    def to_csv(self, pf_csv, **kwargs):
        # type: (str, Dict[str, Any]) -> None
        """Serialize this object via to_string(**kwargs) and write the
        result to `pf_csv`."""
        write_string_to_file(self.to_string(**kwargs), pf_csv)