def merge_csv_files(list_pf_csv, pf_output):
    # type: (List[str], str) -> None
    """Merge several CSV files into one output CSV.

    ``pf_output`` is handed to ``remove_p`` first so the merge starts from a
    clean output; every input is then read with pandas and appended to
    ``pf_output`` in the order given.

    :param list_pf_csv: paths of the CSV files to merge
    :param pf_output: path of the merged CSV file
    """
    remove_p(pf_output)

    # Lazy generator: each file is read right before it is appended, so only
    # one data frame is held at a time.
    frames = (pd.read_csv(pf_csv) for pf_csv in list_pf_csv)
    for frame in frames:
        append_data_frame_to_csv(frame, pf_output)
def train_and_create_models(env, pf_labels, pf_sequences, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> GMS2Mod
    """Train a GMS2 model from a sequence file and its labels.

    The inputs are first passed through ``convert_multi_fasta_to_single``,
    which returns the paths of a new sequence file and a new labels file.
    The model is trained on those two files, and both are removed afterwards.

    :param env: environment, forwarded to the helpers
    :param pf_labels: path to the labels file
    :param pf_sequences: path to the sequences file
    :param kwargs: forwarded unchanged to ``train_gms2_model``
    :return: the trained model
    """
    pf_new_seq, pf_new_labels = convert_multi_fasta_to_single(
        env, pf_sequences, pf_labels
    )

    try:
        return train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs)
    finally:
        # Remove the intermediate files even when training raises, so a
        # failed run does not leave them behind.
        remove_p(pf_new_labels)
        remove_p(pf_new_seq)
def gather_mgm_test_set(env, gil, pf_output, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> str
    """Collect the per-genome MGM test sets into a single CSV file.

    For every genome in ``gil`` the data frame returned by
    ``gather_mgm_test_set_for_genome`` is appended to ``pf_output``. Progress
    is shown with ``tqdm``.

    :param env: environment, forwarded to the per-genome helper
    :param gil: genomes to process
    :param pf_output: path of the output CSV file
    :param kwargs: forwarded unchanged to ``gather_mgm_test_set_for_genome``
    :return: ``pf_output``
    """
    remove_p(pf_output)  # start clean
    print(pf_output)

    num_genomes = len(gil)
    for genome_info in tqdm(gil, total=num_genomes):
        append_data_frame_to_csv(
            gather_mgm_test_set_for_genome(env, genome_info, **kwargs),
            pf_output,
        )

    return pf_output
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> GMS2Mod
    """Run the external ``biogem gms2-training`` command and load its model.

    The command is executed from ``env["pd-work"]`` and writes the model to
    ``pf_mod``, which is then parsed with ``GMS2Mod.init_from_file``.

    Options read from ``kwargs``:
        * ``group``: value passed to ``--genome-group`` (default ``"A"``)
        * ``clean``: when true, the model file is deleted after it has been
          loaded (default ``True``)
        * ``pf_mod``: path of the model file written by the trainer
          (default ``<pd-work>/a.mod``)

    :param env: environment; ``env["pd-work"]`` is the working directory
    :param pf_new_seq: path to the sequence file used for training
    :param pf_new_labels: path to the labels file used for training
    :return: the model loaded from ``pf_mod``
    """
    group = get_value(kwargs, "group", "A", default_if_none=True)
    clean = get_value(kwargs, "clean", True)
    pf_mod = get_value(
        kwargs, "pf_mod", os_join(env["pd-work"], "a.mod"), default_if_none=True
    )

    # NOTE(review): the trainer binary is referenced by an absolute,
    # machine-specific path, and the file paths are interpolated into the
    # shell command unquoted.
    cmd = (
        f"cd {env['pd-work']}; "
        "/storage4/karl/sbsp/biogem/sbsp/bin_external/gms2/biogem gms2-training "
        f"-s {pf_new_seq} -l {pf_new_labels} -m {pf_mod} "
        "--order-coding 5 --order-noncoding 2 --only-train-on-native 1 "
        "--genetic-code 11 --order-start-context 2 --fgio-dist-thr 25 "
        f"--genome-group {group} "
        "--ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    )
    run_shell_cmd(cmd)

    mod = GMS2Mod.init_from_file(pf_mod)

    # The previous condition was inverted (`if not clean`): it deleted the
    # model file only when the caller asked NOT to clean up, and left it on
    # disk by default.
    if clean:
        remove_p(pf_mod)

    return mod
def analysis_per_query(env, gil, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> None
    """Run the per-query analysis for every genome and append it to one CSV.

    For each genome in ``gil`` the data frame from
    ``analysis_per_query_for_genome`` is extended with the columns ``GCFID``,
    ``Name``, ``Genome GC`` and ``Ancestor`` and appended to
    ``pf_output_summary``. Genomes that yield an empty data frame are skipped
    with a warning.

    Options read from ``kwargs``:
        * ``dn_run``: name of the run directory under
          ``env["pd-runs"]/<genome name>`` (default ``"sbsp"``)

    :param env: environment; ``"pd-data"`` and ``"pd-runs"`` are read from it
    :param gil: genomes to process
    :param pf_output_summary: path of the output CSV; an existing file is
        removed first
    :param kwargs: also forwarded to ``analysis_per_query_for_genome``
    """
    dn_run = get_value(kwargs, "dn_run", "sbsp", default_if_none=True)

    if os.path.isfile(pf_output_summary):
        remove_p(pf_output_summary)

    header = None  # sorted column names of the first non-empty data frame

    # enumerate keeps the logged progress index advancing for skipped genomes
    # too; a counter bumped at the end of the body stalled on `continue`.
    for counter, gi in enumerate(gil):
        logger.info("{} / {}: {}".format(counter, len(gil), gi.name))

        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_sequence = os.path.join(pd_genome, "sequence.fasta")
        gc = compute_gc_from_file(pf_sequence)

        pd_run = os.path.join(env["pd-runs"], gi.name, dn_run)
        df = analysis_per_query_for_genome(env, gi, pd_run, **kwargs)
        if len(df) == 0:
            logger.warning(f"No data found for {gi.name}")
            continue

        df["GCFID"] = gi.name
        df["Name"] = gi.attributes["name"] if "name" in gi.attributes else gi.name
        df["Genome GC"] = gc
        df["Ancestor"] = (
            gi.attributes["ancestor"] if "ancestor" in gi.attributes else ""
        )

        # Frames are appended to the same CSV, so differing column sets are
        # worth reporting.
        columns = sorted(df.columns.values)
        if header is None:
            header = columns
        elif header != columns:
            logger.debug(f"Header conflict.\nA: {header}\nB: {columns}")

        append_data_frame_to_csv(df, pf_output_summary)
def analysis_per_query(env, gil, gcfid_to_pd_sbsp, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None
    """Run the per-query analysis for every genome and append it to one CSV.

    For each genome in ``gil`` the data frame from
    ``analysis_per_query_for_genome`` is extended with the columns ``GCFID``,
    ``Name``, ``Genome GC`` and ``Ancestor`` and appended to
    ``pf_output_summary``.

    :param env: environment; ``env["pd-data"]`` holds one directory per genome
    :param gil: genomes to process
    :param gcfid_to_pd_sbsp: maps a genome name to the directory handed to
        ``analysis_per_query_for_genome``
    :param pf_output_summary: path of the output CSV; an existing file is
        removed first
    :param kwargs: not used by this function
    """
    if os.path.isfile(pf_output_summary):
        remove_p(pf_output_summary)

    for index, genome in enumerate(gil):
        logger.info("{} / {}: {}".format(index, len(gil), genome.name))

        # GC content is computed from the genome's sequence file.
        pf_sequence = os.path.join(env["pd-data"], genome.name, "sequence.fasta")
        genome_gc = compute_gc_from_file(pf_sequence)

        summary = analysis_per_query_for_genome(
            env, genome, gcfid_to_pd_sbsp[genome.name]
        )

        # Fall back to the genome name / empty string when the optional
        # attributes are missing.
        if "name" in genome.attributes:
            display_name = genome.attributes["name"]
        else:
            display_name = genome.name

        if "ancestor" in genome.attributes:
            ancestor = genome.attributes["ancestor"]
        else:
            ancestor = ""

        summary["GCFID"] = genome.name
        summary["Name"] = display_name
        summary["Genome GC"] = genome_gc
        summary["Ancestor"] = ancestor

        append_data_frame_to_csv(summary, pf_output_summary)
def run_msa_on_sequences(env, sequences, sbsp_options, **kwargs):
    # type: (Environment, List[Seq], SBSPOptions, Dict[str, Any]) -> MSAType
    """Compute a multiple sequence alignment for a list of sequences.

    The sequences are written to a temporary FASTA file in ``env["pd-work"]``,
    ``run_msa_on_sequence_file`` writes the alignment to a temporary file
    next to it, and the alignment is loaded with ``MSAType.init_from_file``.
    Both temporary files are removed before returning.

    Options read from ``kwargs``:
        * ``fn_tmp_prefix``: prefix for the two temporary file names
          (default ``""``)

    :param env: environment; ``env["pd-work"]`` is the working directory
    :param sequences: sequences to align
    :param sbsp_options: options forwarded to ``run_msa_on_sequence_file``
    :param kwargs: also forwarded to ``run_msa_on_sequence_file``
    :return: the alignment loaded from the temporary MSA file
    """
    pd_work = env["pd-work"]
    fn_tmp_prefix = get_value(kwargs, "fn_tmp_prefix", "", default_if_none=True)

    pf_fasta = os.path.join(pd_work, "{}tmp_sequences.fasta".format(fn_tmp_prefix))
    pf_msa = os.path.join(pd_work, "{}tmp_msa.txt".format(fn_tmp_prefix))

    # Drop any FASTA file left over from a previous run before writing.
    remove_p(pf_fasta)

    try:
        # write sequences to file
        write_sequence_list_to_fasta_file(sequences, pf_fasta)

        # run msa
        run_msa_on_sequence_file(pf_fasta, sbsp_options, pf_msa, **kwargs)
        return MSAType.init_from_file(pf_msa)
    finally:
        # Remove the temporary files even when a step above raises.
        remove_p(pf_msa, pf_fasta)