def create_pbs_file(env, cmd_run, pf_pbs, **kwargs):
    """Write a PBS job script that runs *cmd_run* to the file *pf_pbs*.

    Optional kwargs: job_name, num_nodes, ppn, node_property, pbs-walltime.
    The working directory is taken from env["pd-work"].
    """
    job_name = get_value(kwargs, "job_name", "JOB")
    num_nodes = get_value(kwargs, "num_nodes", 1)
    ppn = get_value(kwargs, "ppn", 1)
    node_property = get_value(kwargs, "node_property", "")
    walltime = get_value(kwargs, "pbs-walltime", "07:00:00")

    pd_work = env["pd-work"]

    # Assemble the script line by line; joined with newlines at the end.
    script_lines = [
        "#PBS -N " + str(job_name),
        "#PBS -o " + "{}/{}".format(pd_work, "error"),
        "#PBS -j oe",
        "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(ppn) + "{}".format(node_property),
        "#PBS -l walltime=" + str(walltime),
        "#PBS -W umask=002",
        "export PATH=\"/home/karl/anaconda/envs/sbsp/bin:$PATH\"",
        "PBS_O_WORKDIR=" + pd_work,
        "cd $PBS_O_WORKDIR ",
        "echo The working directory is `echo $PBS_O_WORKDIR`",
        "echo This job runs on the following nodes:",
        "echo `cat $PBS_NODEFILE`",
        "",
        "{}".format(cmd_run),
    ]
    pbs_text = "\n".join(script_lines) + "\n"

    from sbsp_io.general import write_string_to_file
    write_string_to_file(pbs_text, pf_pbs)
def _create_pbs_file(self, jobname, num_jobs, pf_pbs, pf_input_package_template, pf_output_package_template):
    """Create a PBS array-job file that runs all input jobs.

    :param jobname: Name of job
    :param num_jobs: Number of array entries
    :param pf_pbs: Path of the PBS file to write
    :param pf_input_package_template: Template path for input packages
    :param pf_output_package_template: Template path for output packages
    :return: None
    """
    # Unique compute directory is currently disabled; header and call
    # command both receive None and fall back to their defaults.
    pd_compute = None

    header = PBS._generate_pbs_header_array(num_jobs, jobname, self._prl_options,
                                            pd_compute=pd_compute)
    call = PBS._generate_call_command(self._env,
                                     pf_input_package_template,
                                     pf_output_package_template,
                                     self._prl_options,
                                     pd_compute=pd_compute)

    # write to file
    from sbsp_io.general import write_string_to_file
    write_string_to_file(header + "\n{}\n".format(call), pf_pbs)
def write_sequence_list_to_fasta_file(sequences, pf_sequences):
    # type: (List[Seq], str) -> None
    """Write sequences to *pf_sequences* in FASTA format.

    Each sequence's list index is used as its FASTA header.
    """
    # enumerate + join instead of range(len(...)) with quadratic "+=".
    data = "".join(">{}\n{}\n".format(i, seq) for i, seq in enumerate(sequences))
    write_string_to_file(data, pf_sequences)
def write_fasta_hash_to_file(fasta, pf_output):
    # type: (Dict[str, Any], str) -> None
    """Write a header->sequence mapping to *pf_output* in FASTA format."""
    # Iterate items() directly instead of keys() + re-lookup per key.
    output = "".join(">{}\n{}\n".format(header, seq) for header, seq in fasta.items())
    write_string_to_file(output, pf_output)
def write_to_temporary_alignment_file(pf_tmp, list_sequences):
    # type: (str, List[str]) -> str
    """Write aligned sequences to *pf_tmp* in a PHYLIP-like format.

    The first line is "<num_sequences> <alignment_length>"; each sequence
    then gets one line of the form "sequence_<i> <sequence>".

    :param pf_tmp: path of the file to write
    :param list_sequences: equal-length aligned sequences
    :raises ValueError: if the list is empty or lengths differ
    :return: pf_tmp
    """
    if not list_sequences:
        raise ValueError("No sequences to write to file")

    alignment_length = len(list_sequences[0])
    for s in list_sequences:
        if len(s) != alignment_length:
            raise ValueError("Sequences should have the same length")

    # BUGFIX: the header previously hard-coded "2" as the sequence count while
    # the body wrote every sequence; use the real count. Backward-compatible:
    # the existing caller (_run_codeml) always passes exactly two sequences.
    text = "{} {}".format(len(list_sequences), alignment_length)
    for i, s in enumerate(list_sequences, start=1):
        text += "\nsequence_{} {}".format(i, s)
    text += "\n"

    write_string_to_file(text, pf_tmp)
    return pf_tmp
def filter_orthologs(env, pf_data, pf_output, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> str
    """Filter ortholog pairs read from *pf_data* and write survivors to *pf_output*.

    Filtering is delegated to sbsp_alg.msa.filter_df using the given MSA
    options. Filter statistics are written to pf_filter_stats when provided,
    otherwise printed. The set of query 3'-end identifiers is also written
    to "<pf_output>_map".

    kwargs: msa_options, pf_filter_stats, filter_non_group_only.
    :return: pf_output
    """
    msa_options = get_value(kwargs, "msa_options", SBSPOptions(env))  # type: SBSPOptions
    pf_filter_stats = get_value(kwargs, "pf_filter_stats", None)
    filter_non_group_only = get_value(kwargs, "filter_non_group_only", True)

    from sbsp_alg.msa import filter_df, print_filter_stats_to_file, print_filter_stats

    # Distance column: option value when set (truthy), else the default.
    column_distance = msa_options.safe_get("column-distance") or "k2p-distance"

    df = pd.read_csv(pf_data, header=0)

    filter_stats = dict()
    df = filter_df(
        df, msa_options,
        filter_stats=filter_stats,
        filter_non_group_only=filter_non_group_only,
        column_distance=column_distance,
    )

    if pf_filter_stats:
        print_filter_stats_to_file(filter_stats, pf_filter_stats)
    else:
        print_filter_stats(filter_stats)

    df.to_csv(pf_output, index=False)
    write_string_to_file("\n".join(set(df["q-3prime"].unique())), pf_output + "_map")

    return pf_output
def _run_codeml(seq_a, seq_b, **kwargs):
    # type: (str, str, Dict[str, Any]) -> Dict[str, Any]
    """Run CodeML on a pair of aligned sequences and return its result dict.

    A throw-away run directory is created under pd_work, populated with the
    control file, alignment, and a trivial tree, then removed afterwards.

    kwargs: pd_work (working directory, default "."),
            pf_ctl (path to CodeML control file, required).
    :raises ValueError: if pf_ctl is missing or does not exist
    :return: CodeML results; empty dict if the CodeML run itself failed
    """
    pd_work = get_value(kwargs, "pd_work", ".", default_if_none=True)
    pf_ctl = get_value(kwargs, "pf_ctl", None)
    if pf_ctl is None:
        raise ValueError("Cannot compute distance without CTL file for CodeML")
    if not os.path.isfile(pf_ctl):
        raise ValueError("File doesn't exist: {}".format(pf_ctl))

    # Isolated per-run directory so concurrent runs cannot clash.
    random_name = generate_random_non_existing_filename(pd_work)
    pd_codeml_run = os.path.join(pd_work, random_name)
    mkdir_p(pd_codeml_run)

    try:
        shutil.copyfile(pf_ctl, os.path.join(pd_codeml_run, "codeml.ctl"))
        pf_sequences = os.path.join(pd_codeml_run, "in.phy")
        write_to_temporary_alignment_file(pf_sequences, [seq_a, seq_b])
        write_string_to_file("(1)\n", os.path.join(pd_codeml_run, "in.tre"))

        # run code ml
        scorer = codeml.Codeml(tree=os.path.join(pd_codeml_run, "in.tre"),
                               alignment=pf_sequences,
                               out_file=os.path.join(pd_codeml_run, "out.txt"),
                               working_dir=pd_codeml_run)
        try:
            results = scorer.run(ctl_file="codeml.ctl", verbose=False)
        except Exception:
            # Deliberate best-effort: a failed CodeML run yields an empty result.
            results = {}
    finally:
        # BUGFIX: clean up even when setup (copyfile/alignment/tree write or
        # Codeml construction) raises — previously the directory leaked.
        shutil.rmtree(pd_codeml_run)

    return results
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Build (or load) a taxonomy tree annotated with per-taxid target counts
    and write its string rendering to args.pf_output.

    When args.pf_load_state is given, the annotated tree is loaded from disk;
    otherwise it is built from the SBSP output and assembly summary, and
    optionally saved via args.pf_save_state.
    """
    if args.pf_load_state is not None:
        tree = load_obj(args.pf_load_state)
    else:
        gcfid_to_number_of_targets = count_targets_per_gcfid(args.pf_sbsp_output)
        df_assembly_summary = read_assembly_summary_into_dataframe(args.pf_assembly_summary)
        gcfid_to_assembly_info = get_assembly_info_per_gcfid(df_assembly_summary)

        # taxid -> target count, skipping gcfids with no assembly info.
        taxid_to_number_of_targets = {
            int(gcfid_to_assembly_info[g]["taxid"]): gcfid_to_number_of_targets[g]
            for g in gcfid_to_number_of_targets
            if g in gcfid_to_assembly_info
        }

        tree = TaxonomyTree.load(args.pf_taxonomy_tree)
        tree.update_tree_attributes(
            set_number_of_targets_per_taxid,
            {"taxid_to_number_of_targets": taxid_to_number_of_targets},
            direction="bottom-up")

        if args.pf_save_state is not None:
            save_obj(tree, args.pf_save_state)

    tree_string = tree.to_string(
        check_if_should_print=should_print,
        attribute_name="number_of_targets",
        attribute_format="{:,}",
        tag_name=args.tag,
        max_depth=args.max_depth)

    write_string_to_file(tree_string, args.pf_output)
def print_taxonomy_tree(env, pf_taxonomy_tree, pf_assembly_summary, pf_output, **kwargs):
    """Render the taxonomy tree with per-node RefSeq counts and write it to *pf_output*.

    kwargs: pf_names_of_interest — optional file listing node names; when
    given, printing is limited to paths leading to those nodes. Remaining
    kwargs are forwarded to get_rows_by_key and to_string_tree_with_stats.
    """
    pf_names_of_interest = get_value(kwargs, "pf_names_of_interest", None)

    tax_tree = TaxonomyTree.load(pf_taxonomy_tree)
    taxid_to_info_list = get_rows_by_key(pf_assembly_summary, key="taxid", **kwargs)

    limit_path_to = None
    if pf_names_of_interest:
        limit_path_to = set(read_rows_to_list(pf_names_of_interest))

    # BUGFIX(dead code): the original first built
    #     {taxid: len(taxid_to_info_list[taxid]) ...}
    # and then immediately overwrote it with all-ones, so only the overwrite
    # ever took effect. Keep the effective behavior (each taxid counts once)
    # and drop the dead computation.
    refseq_count_per_taxid = {taxid: 1 for taxid in taxid_to_info_list}

    def check_if_should_print(attributes):
        # type: (Dict[str, Any]) -> bool
        # Print nodes explicitly flagged as leading to a node of interest,
        # otherwise any node with a non-zero RefSeq count.
        if "leads_to_node_of_interest" in attributes:
            return attributes["leads_to_node_of_interest"]
        return "num_refseq" in attributes and attributes["num_refseq"] != 0

    out = tax_tree.to_string_tree_with_stats(
        "num_refseq", count_refseq_under_node,
        {
            "refseq_count_per_taxid": refseq_count_per_taxid,
            "limit_path_to": limit_path_to,
        },
        check_if_should_print=check_if_should_print,
        attribute_format="{:,}",
        **kwargs)

    write_string_to_file(out, pf_output)
def _create_dummy_pbs_file(pf_dummy, jobname_dummy, pd_work):
    # type: (str, str, str) -> None
    """Write a minimal placeholder PBS file (1 node, 1 ppn, 1-second walltime)."""
    header_text = PBS.generate_pbs_header(jobname_dummy, pd_work, 1, 1, "00:00:01")
    write_string_to_file(header_text, pf_dummy)
def to_csv(self, pf_csv, **kwargs):
    # type: (str, Dict[str, Any]) -> None
    """Serialize this object via to_string(**kwargs) and write it to *pf_csv*."""
    write_string_to_file(self.to_string(**kwargs), pf_csv)