def download_data_from_assembly_summary(df_assembly_summary, pd_output, **kwargs):
    # type: (pd.DataFrame, str, Dict[str, Any]) -> GenomeInfoList
    """Attempt to download all genomes from assembly summary.

    Entries that fail to download are logged and skipped (best-effort).

    :param df_assembly_summary: Data frame containing assembly summary entries
    :param pd_output: Path to download directory
    :param kwargs:
        - pf_output_list: path to output file which will contain list of
          downloaded genomes
        - attributes: extra attributes merged into each downloaded GenomeInfo
    :return: Genome information list of successfully downloaded entries
    """
    pf_output_list = get_value(kwargs, "pf_output_list", None)
    attributes = get_value(kwargs, "attributes", dict(), default_if_none=True)

    df_assembly_summary = filter_entries_with_equal_taxid(df_assembly_summary, **kwargs)

    pd_output = os.path.abspath(pd_output)

    success_downloads = list()
    for _, gcfid_info in tqdm(df_assembly_summary.iterrows(), "Downloading",
                              total=len(df_assembly_summary)):
        logger.debug("Trying {}".format(gcfid_info["assembly_accession"]))
        try:
            gcfid_info = download_assembly_summary_entry(gcfid_info, pd_output, **kwargs)
            success_downloads.append(gcfid_info)
        except (IOError, OSError, ValueError) as e:
            # Best-effort download: log the failure and move on to the next entry.
            logger.debug("Could not download {}: {}".format(
                gcfid_info["assembly_accession"], e))

    gil = GenomeInfoList([
        GenomeInfo("{}_{}".format(d["assembly_accession"], d["asm_name"]),
                   d["genetic_code"],
                   attributes={
                       "name": d["name"],
                       "parent_id": d["parent_id"],
                       **get_genome_specific_attributes(pd_output, d),
                       **attributes
                   }) for d in success_downloads
    ])

    if pf_output_list is not None:
        gil.to_file(pf_output_list)

    return gil
def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db, **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    """Prepare a per-genome working directory and run SBSP on that genome.

    Raises ValueError when the genome's clade has no entry in the database index.
    """
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # Resolve the target database for this genome's clade; unknown clades abort.
    ancestor = gi.attributes["ancestor"]
    if ancestor not in clade_to_pf_db:
        raise ValueError("Unknown clade {}".format(ancestor))
    pf_t_db = clade_to_pf_db[ancestor]

    logger.info("Scheduling: {}".format(gi.name))

    # Per-genome working environment and output location
    pd_work = os_join(env["pd-work"], gi.name, dn_run)
    curr_env = env.duplicate({"pd-work": pd_work})
    pf_output = os_join(pd_work, "output.csv")
    mkdir_p(pd_work)

    # Single-genome query list consumed by the pipeline
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    # Assemble pipeline options for this genome and launch
    po = PipelineSBSPOptions(
        curr_env, pf_list, pf_t_db=pf_t_db, pf_output=pf_output,
        sbsp_options=sbsp_options, prl_options=prl_options, **kwargs
    )
    sbsp_on_gi(gi, po)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Collect start information for a genome list, optionally via PBS, and save it."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options is None:
        # No parallelization configured: run sequentially in-process.
        df = collect_start_info_from_gil(env, gil)
    else:
        scheduler = PBS(env, prl_options,
                        splitter=split_genome_info_list,
                        merger=merge_identity)
        partial_results = scheduler.run(
            data={"gil": gil},
            func=collect_start_info_from_gil,
            func_kwargs={"env": env},
        )
        df = pd.concat(partial_results, sort=False)

    save_obj(df, args.pf_output)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compute 5' end statistics for multiple tools across multiple genome lists.

    Raises ValueError when the parallel argument lists have mismatched lengths.
    """
    list_gil = [
        GenomeInfoList.init_from_file(pf) for pf in args.pf_genome_lists
    ]

    list_names = args.list_names
    # `raise Warning` was semantically wrong (Warning is for warnings.warn);
    # these are hard input-validation errors, so raise ValueError as elsewhere.
    if len(list_names) != len(list_gil):
        raise ValueError(
            f"Names and genome lists must have the same length {len(list_names)} != {len(list_gil)}"
        )

    list_dn_tools = args.dn_tools
    list_tool_names = args.tool_names
    if len(list_dn_tools) != len(list_tool_names):
        raise ValueError(
            f"Tools and dirs must have the same length {len(list_dn_tools)} != {len(list_tool_names)}"
        )

    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    stats_tools_5prime(env, list_gil, list_names, list_dn_tools,
                       list_tool_names, args.pf_output,
                       prl_options=prl_options)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Download the genomes in a list by matching them against an assembly summary."""
    logger.info("Reading assembly file")
    df_assembly_summary = AssemblySummary.init_from_file(args.pf_assembly_summary)

    logger.info("Reading genome file")
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    # Map genome name -> GenomeInfo so we can keep only requested assemblies.
    wanted_gcfids = {gi.name: gi for gi in gil}

    df_assembly_summary["name"] = df_assembly_summary.apply(
        lambda r: f"{r['assembly_accession']}_{r['asm_name'].replace(' ', '_')}",
        axis=1
    )
    mask = df_assembly_summary["name"].isin(wanted_gcfids.keys())
    df_assembly_summary = df_assembly_summary[mask].copy()

    # Carry each genome's genetic code over to its assembly entry.
    for idx in df_assembly_summary.index:
        gi = wanted_gcfids[df_assembly_summary.at[idx, "name"]]  # type: GenomeInfo
        df_assembly_summary.loc[idx, "genetic_code"] = gi.attributes.get("genetic_code")

    logger.info(f"Request {len(gil)}. Found {len(df_assembly_summary)}")

    logger.info("Downloading genomes")
    download_data_from_assembly_summary(df_assembly_summary, env["pd-data"])
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Gather the MGM test set for a genome list, optionally parallelized via PBS."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options is None:
        # Sequential path: write directly to the final output file.
        gather_mgm_test_set(env, gil, args.pf_output)
        return

    scheduler = PBS(env, prl_options,
                    splitter=split_genome_info_list,
                    merger=merge_identity)
    per_split_files = scheduler.run(
        data={
            "gil": gil,
            "pf_output_template": args.pf_output + "_{}"
        },
        func=gather_mgm_test_set,
        func_kwargs={"env": env},
    )
    # Each job wrote its own CSV; merge them into the requested output.
    merge_csv_files(per_split_files, args.pf_output)
def run_blast(env, pf_q_list, pf_t_list, pf_blast_output, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    """Extract query/target proteins, build a database, and run blastp (via diamond).

    :param env: Environment providing the working directory ("pd-work")
    :param pf_q_list: Path to query genome list file
    :param pf_t_list: Path to target genome list file
    :param pf_blast_output: Path to BLAST output file
    :param kwargs:
        - fn_q_labels / fn_t_labels: label filenames per genome (default "ncbi.gff")
        - fn_q_proteins / fn_t_proteins: extracted protein filenames
        - fn_blast_db: database filename stem (default "db")
        - clean: if True, remove intermediate files afterwards
        - max_evalue: e-value cutoff passed to the alignment run
    """
    fn_q_labels = sbsp_general.general.get_value(kwargs, "fn_q_labels", "ncbi.gff")
    fn_t_labels = sbsp_general.general.get_value(kwargs, "fn_t_labels", "ncbi.gff")
    fn_q_proteins = sbsp_general.general.get_value(kwargs, "fn_q_proteins", "q_proteins.faa")
    fn_t_proteins = sbsp_general.general.get_value(kwargs, "fn_t_proteins", "t_proteins.faa")
    # NOTE: removed unused fn_q_nucl/fn_t_nucl locals ("q.fnt"/"t.fnt") — never read.
    fn_blast_db = sbsp_general.general.get_value(kwargs, "fn_blast_db", "db")
    clean = sbsp_general.general.get_value(kwargs, "clean", False)
    max_evalue = sbsp_general.general.get_value(kwargs, "max_evalue", None)

    pd_work = env["pd-work"]

    # get paths to files
    pf_q_proteins = os.path.join(pd_work, fn_q_proteins)
    pf_t_proteins = os.path.join(pd_work, fn_t_proteins)
    pf_blast_db = os.path.join(pd_work, fn_blast_db)

    # extract all proteins
    extract_genes_for_multiple_genomes(env, GenomeInfoList.init_from_file(pf_q_list),
                                       fn_q_labels, pf_q_proteins)
    extract_genes_for_multiple_genomes(env, GenomeInfoList.init_from_file(pf_t_list),
                                       fn_t_labels, pf_t_proteins)

    # create blast data base from target proteins
    create_blast_database(pf_t_proteins, pf_blast_db, seq_type="prot", use_diamond=True)

    # run blastp
    run_blast_alignment(pf_q_proteins, pf_blast_db, pf_blast_output,
                        use_diamond=True, max_evalue=max_evalue)

    if clean:
        # Best-effort removal of intermediates; missing files are not an error.
        for pf in (pf_blast_db + ".dmnd", pf_q_proteins, pf_t_proteins):
            try:
                os.remove(pf)
            except OSError:
                pass
def split_query_genomes_target_genomes_one_vs_group(data, num_splits, pd_work, **kwargs):
    # type: (Dict[str, str], int, str, Dict[str, Any]) -> List[Dict[str, str]]
    """Pair every query genome with each chunk of the target list.

    Writes one query-list and one target-list file per (query, target-chunk)
    combination and returns the corresponding job descriptors.
    """
    fn_q_split_formatted = get_value(kwargs, "fn_q_split_formatted", "q_split_{}.list")
    fn_t_split_formatted = get_value(kwargs, "fn_t_split_formatted", "t_split_{}.list")

    pf_q_list = data["pf_q_list"]
    pf_t_list = data["pf_t_list"]
    pf_output_template = data["pf_output_template"]

    q_list = GenomeInfoList.init_from_file(pf_q_list)
    t_list = GenomeInfoList.init_from_file(pf_t_list)

    jobs = list()
    split_number = 1

    for q_genome in q_list:
        for t_chunk in generate_splits(t_list, num_splits):
            # One file pair per combination, numbered globally across queries.
            pf_q_split = os.path.join(pd_work, fn_q_split_formatted.format(split_number))
            pf_t_split = os.path.join(pd_work, fn_t_split_formatted.format(split_number))

            GenomeInfoList([q_genome]).to_file(pf_q_split)
            GenomeInfoList(t_chunk).to_file(pf_t_split)

            jobs.append({
                "pf_q_list": pf_q_split,
                "pf_t_list": pf_t_split,
                "pf_output": pf_output_template.format(split_number)
            })
            split_number += 1

    return jobs
def get_orthologs_from_files(env, pf_q_list, pf_t_list, pf_output, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> str
    """Find orthologs between query and target genome lists via BLAST and write a CSV.

    Returns the path to the output CSV file.
    """
    sbsp_options = get_value(kwargs, "sbsp_options", SBSPOptions(env))  # type: SBSPOptions
    fn_q_labels = get_value(kwargs, "fn_q_labels", "ncbi.gff")
    fn_t_labels = get_value(kwargs, "fn_t_labels", "ncbi.gff")

    q_gil = GenomeInfoList.init_from_file(pf_q_list)
    t_gil = GenomeInfoList.init_from_file(pf_t_list)

    pd_work = env["pd-work"]

    # Paths for extracted sequences (amino acid and nucleotide)
    pf_q_aa = os.path.join(pd_work, "q.faa")
    pf_q_nt = os.path.join(pd_work, "q.fnt")
    pf_t_aa = os.path.join(pd_work, "t.faa")
    pf_t_nt = os.path.join(pd_work, "t.fnt")

    extraction_options = {
        "reverse_complement": True,
        "ignore_frameshifted": True,
        "ignore_partial": True
    }
    extract_labeled_sequences_for_genomes(env, q_gil, pf_q_aa, fn_labels=fn_q_labels,
                                          **extraction_options, **kwargs)
    extract_labeled_sequences_for_genomes(env, t_gil, pf_t_aa, fn_labels=fn_t_labels,
                                          **extraction_options, **kwargs)

    # Database over target proteins, queried with the extracted query proteins
    pf_blast_db = os.path.join(pd_work, "blast.db")
    create_blast_database(pf_t_aa, pf_blast_db, seq_type="prot", use_diamond=True)

    # FIXME: cleanup
    pf_blast_results = os.path.join(pd_work, "blast.xml")
    run_blast_on_sequence_file(env, pf_q_aa, pf_blast_db, pf_blast_results, **kwargs)

    # Parse hits, apply distance filters, and convert to CSV
    parse_filter_and_convert_to_csv(
        pf_blast_results, pf_output,
        pf_q_original_nt=pf_q_nt, pf_t_original_nt=pf_t_nt,
        pf_q_original_aa=pf_q_aa, pf_t_original_aa=pf_t_aa,
        distance_min=sbsp_options.safe_get("filter-min-distance"),
        distance_max=sbsp_options.safe_get("filter-max-distance"),
        **kwargs
    )

    return pf_output
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Extract labeled sequences for a genome list, optionally fanned out over PBS.

    In PBS mode each job writes its own FASTA file; the results are concatenated
    into args.pf_output and the temporary files removed.
    """
    logger.debug("Running: sbsp_step_get_orthologs")

    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))
    prl_options = duplicate_parallelization_options_with_updated_paths(env, prl_options)
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    if prl_options["use-pbs"]:
        # Compute nodes may see data under a different root
        if prl_options.safe_get("pd-data-compute"):
            env = env.duplicate({"pd-data": prl_options["pd-data-compute"]})

        pbs = PBS(env, prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)

        output = pbs.run(
            data={
                "gil": gil,
                "pf_output_template": os.path.join(prl_options["pbs-pd-head"],
                                                   "sequences_{}.faa")
            },
            func=extract_labeled_sequences_for_genomes,
            func_kwargs={
                "env": env,
                "fn_labels": "ncbi.gff",
                "reverse_complement": True,
                "ignore_frameshifted": True,
                "ignore_partial": True
            })

        # Concatenate per-job FASTA files into the final output.
        # (Removed a redundant f_output.close() — the with-block already closes it.)
        with open(args.pf_output, "w") as f_output:
            for pf_tmp in output:
                if pf_tmp is None or not os.path.isfile(pf_tmp):
                    continue
                sequences = read_fasta_into_hash(pf_tmp, stop_at_first_space=False)
                for k, v in sequences.items():
                    f_output.write(">{}\n{}\n".format(k, v))
                remove(pf_tmp)
    else:
        extract_labeled_sequences_for_genomes(env, gil,
                                              pf_output=args.pf_output,
                                              fn_labels="ncbi.gff",
                                              reverse_complement=True,
                                              ignore_frameshifted=True,
                                              ignore_partial=True)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Analyze predictions on verified genes for each genome in the list."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    # Build gcfid -> SBSP run-directory mapping from the provided CSV.
    df = pd.read_csv(args.pf_gcfid_to_pd_sbsp)
    gcfid_to_pd_sbsp = {
        row["gcfid"]: row["pd-sbsp"] for _, row in df.iterrows()
    }

    analyze_predictions_on_verified_genes_for_genome_list(
        env, gil, gcfid_to_pd_sbsp, fn_prefix=args.fn_prefix)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the selected tool on every genome in the list."""
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)
    run_tool_on_gil(
        env, genome_list, args.tool,
        dn_run=args.dn_run,
        genome_type=args.type,
        use_pbs=args.use_pbs,
    )
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the per-query analysis over the genome list and write a summary."""
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)
    analysis_per_query(
        env, genome_list, args.pf_output_summary,
        prodigal=args.prodigal,
        verified=args.verified,
        dn_run=args.dn_run,
    )
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run per-query analysis using per-genome SBSP directories from a CSV mapping."""
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)

    # gcfid -> SBSP directory, read from the mapping file
    mapping_df = pd.read_csv(args.pf_gcfid_to_pd_sbsp)
    gcfid_to_pd_sbsp = {
        row["gcfid"]: row["pd-sbsp"] for _, row in mapping_df.iterrows()
    }

    analysis_per_query(env, genome_list, gcfid_to_pd_sbsp,
                       args.pf_output_summary, prodigal=args.prodigal)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run SBSP over a query genome list using clade-specific target databases."""
    # Inputs: genome list, tool options, parallelization options
    gil = GenomeInfoList.init_from_file(args.pf_q_list)
    sbsp_options = SBSPOptions.init_from_dict(env, vars(args))
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    # Database index: clade name -> path to the target database for that clade
    clade_to_pf_db = get_clade_to_pf_db(args.pf_db_index)

    run_sbsp_on_genome_list(
        env, gil, sbsp_options, prl_options, clade_to_pf_db,
        simultaneous_genomes=args.simultaneous_genomes,
        dn_run=args.dn_run,
        steps=args.steps,
        fn_q_labels=args.fn_q_labels,
        fn_t_labels=args.fn_t_labels,
        fn_q_labels_compare=args.fn_q_labels_compare,
    )
def split_genome_info_list(data, num_splits, pd_work, **kwargs):
    # type: (Dict[str, Any], int, str, Dict[str, Any]) -> List[Dict[str, Any]]
    """Split a genome list round-robin into at most num_splits sublists.

    Each returned entry carries a GenomeInfoList plus the output path produced
    by formatting the template with the split index.
    """
    genome_info_list = get_value(data, "gil", required=True)
    pf_output_template = get_value(data, "pf_output_template", "")

    # Never create more splits than there are genomes
    effective_splits = min(num_splits, len(genome_info_list))

    buckets = [list() for _ in range(effective_splits)]
    # Round-robin assignment: genome i goes to bucket i mod effective_splits
    for position, gi in enumerate(genome_info_list):
        buckets[position % effective_splits].append(gi)

    return [
        {
            "gil": GenomeInfoList(bucket),
            "pf_output": pf_output_template.format(i)
        }
        for i, bucket in enumerate(buckets)
    ]
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """For each genome, compare GMS2 and StartLink+ (toolp) RBS motif models.

    Produces side-by-side information-content logos per genome, then aggregates
    either accuracy-vs-GC stats (default) or per-genome verified-set error
    stats (--verified), writing CSVs and plots into the working directory.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # Genome group letter, e.g. "A" from a GENOME_TYPE like "group-a"
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        # RBS motif models and the GMS2 noncoding background model
        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        # Side-by-side sequence logos (information content) for the two models
        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # relative
        rel_mat = lm.transform_matrix(df_gms2,
                                      from_type="probability",
                                      to_type="information")
        lm.Logo(rel_mat, color_scheme="classic", ax=axes[0])
        axes[0].set_ylim(*[0, 2])
        axes[0].set_title("GeneMarkS-2")

        # shannon
        sha_mat = lm.transform_matrix(df_toolp,
                                      from_type="probability",
                                      to_type="information")
        lm.Logo(sha_mat, color_scheme="classic", ax=axes[1])
        axes[1].set_ylim(*[0, 2])
        axes[1].set_title("StartLink+")
        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        # Relative entropy of each motif model against the noncoding background
        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name,
                                                "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified: comp = (GMS2 accuracy, GMS2-with-SL accuracy) — TODO confirm
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(
                env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })
            print(list_run_info[-2:])

    # NOTE(review): deferred import; assumed to live at function level after the
    # loop — confirm against original module layout.
    import sbsp_viz.sns as sns

    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))
        sns.lineplot(df, "Genome", "Error", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Error"))
        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))
    else:
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))
        df.to_csv(next_name(env["pd-work"], ext="csv"))
        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])
                        ))
        print("Average Error: {}".format(df["Accuracy"].mean()))

        # Second pass: same plots restricted to genomes with Accuracy < 2
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))
        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])
                        ))
        print("Average Error: {}".format(df["Accuracy"].mean()))
        df.to_csv(next_name(env["pd-work"], ext="csv"))
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Analyze GMS2 components on the verified gene set for the genome list."""
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)
    analyze_gms2_components_on_verified_set(env, genome_list)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Collect alignments for every genome in the list, with a progress bar."""
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)
    for genome_info in tqdm(genome_list, total=len(genome_list)):
        collect_alignments_for_genome(env, genome_info)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run relative-entropy analysis (locally or via PBS) and plot the summary."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options["use-pbs"]:
        # Fan out over PBS jobs and concatenate their per-split results.
        pbs = PBS(env, prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={
                              "env": env,
                              "prl_options": prl_options
                          })
        df = pd.concat(list_df, ignore_index=True, sort=False)
    else:
        df = relative_entropy_analysis(env, gil, prl_options)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    # Scatter/line views of error against the various relative-entropy measures
    sns.scatterplot(df, "Percent", "Error",
                    figure_options=FigureOptions(ylim=[0, 20],
                                                 save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE Motif", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE Spacer", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.scatterplot(df, "RE Motif", "RE Spacer", hue="Genome", identity=True,
                    figure_options=FigureOptions(save_fig=next_name(pd_figures)))

    # Regression variants of the same relationships
    sns.lmplot(df, "Percent", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE Motif", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE Spacer", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "Percent", "RE", hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Create the MGM test data set for the genome list and save it as CSV."""
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)
    test_data = create_mgm_test_data(env, genome_list)
    test_data.to_csv(args.pf_output)