def compare_gms2_and_toolp_motifs_for_gi(env, gi):
    # type: (Environment, GenomeInfo) -> [GMS2Mod, GMS2Mod]
    """Train two GMS2 models for one genome and return them as a pair.

    The first model is trained on the genome's GMS2 labels; the second on the
    "toolp" labels, i.e. the predictions on which GMS2 and SBSP agree.
    """
    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_sbsp = os_join(env["pd-runs"], gi.name, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")

    # labels shared by GMS2 and SBSP are written to pf_toolp
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # one model per label source, same underlying sequence
    mod_gms2 = train_and_create_models(env, pf_labels=pf_gms2, pf_sequences=pf_sequence)
    mod_toolp = train_and_create_models(env, pf_labels=pf_toolp, pf_sequences=pf_sequence)

    return mod_gms2, mod_toolp
def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]
    """Run a GMS2 prediction with the given start components disabled and
    measure the 5' error rate against the genome's verified genes.

    components_off: names of start components to turn off in the model file.
    kwargs: native_coding_off (bool, default True) — also disable the native
        coding model.
    Returns {"Error": percent of 3'-matching genes whose 5' end differs}.
    Raises CalledProcessError if prediction keeps failing after all retries.
    """
    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)

    # write a modified model file with the requested components disabled
    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off,
                        native_coding_off=native_coding_off)

    # BUG FIX: the original looped forever on CalledProcessError, hanging the
    # run if the prediction failed deterministically. Retry a bounded number
    # of times, then surface the error.
    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            break
        except CalledProcessError:
            if attempt == max_attempts - 1:
                raise

    # compare with verified
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference),
                                   read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
def relative_entropy_analysis_for_gi(env, gi, prl_options):
    # type: (Environment, GenomeInfo, ParallelizationOptions) -> pd.DataFrame
    """Sweep training-label fractions (10%..100%, step 5, 10 trials each) and
    collect per-trial relative-entropy / error stats for one genome."""
    # set up labels (as lst) and sequence for genome
    setup_info = set_up_labels_and_sequence_for_genome(env, gi)

    # figures go under the PBS head directory when running on PBS
    pd_base = prl_options["pbs-pd-head"] if prl_options["use-pbs"] else env["pd-work"]
    pd_figures = os_join(pd_base, gi.name)
    mkdir_p(pd_figures)

    rows = list()
    for pct in range(10, 101, 5):
        for trial_idx in range(10):
            info = relative_entropy_analysis_for_gi_for_percent(
                env,
                pf_sequence=setup_info["pf_sequence"],
                pf_labels=setup_info["pf_labels"],
                group=setup_info["group"],
                pf_mod=setup_info["pf_mod"],
                pf_verified=setup_info["pf_verified"],
                percent=pct,
                pd_figures=pd_figures)
            rows.append({
                "Genome": gi.name,
                "Percent": pct,
                "Trial": trial_idx,
                **info
            })

    return pd.DataFrame(rows)
def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db, **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    """Prepare a per-genome working directory and launch SBSP on that genome.

    Raises ValueError when the genome's clade has no entry in clade_to_pf_db.
    """
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # resolve the target database for the genome's clade
    try:
        pf_t_db = clade_to_pf_db[gi.attributes["ancestor"]]
    except KeyError:
        raise ValueError("Unknown clade {}".format(gi.attributes["ancestor"]))

    logger.info("Scheduling: {}".format(gi.name))

    pd_work = os_join(env["pd-work"], gi.name, dn_run)  # genome working directory
    curr_env = env.duplicate({"pd-work": pd_work})      # environment scoped to it
    pf_output = os_join(pd_work, "output.csv")          # pipeline output file
    mkdir_p(pd_work)

    # single-genome query list consumed by the pipeline
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    # assemble pipeline options and run
    po = PipelineSBSPOptions(curr_env, pf_list, pf_t_db=pf_t_db, pf_output=pf_output,
                             sbsp_options=sbsp_options, prl_options=prl_options, **kwargs)
    sbsp_on_gi(gi, po)
def create_mgm_test_data_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Collect per-gene motif data (RBS/promoter models) from a genome's GMS2
    run, intended to build an MGM test set.

    NOTE(review): this function looks unfinished — the "extract upstream
    sequence" step is missing and nothing is ever appended to ``list_entries``,
    so it currently always returns an empty dataframe.
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_sequence = os_join(pd_genome, "sequence.fasta")

    pd_genome_run = os_join(env["pd-runs"], gi.name)
    pf_gms2 = os_join(pd_genome_run, "gms2", "gms2.lst")
    pf_mod = os_join(pd_genome_run, "gms2", "GMS2.mod")

    labels = read_labels_from_lst_file(pf_gms2)  # type: Labels
    sequences = read_fasta_into_hash(pf_sequence)
    mod = GMS2Mod.init_from_file(pf_mod)

    # extract upstream regions
    list_entries = list()  # type: List[Dict[str, Any]]

    for l in labels:
        # NOTE(review): compared against ints 1/2 below — assumes the
        # "motif-type" attribute is numeric, not a string; verify against the
        # lst reader (other code in this project stores label attrs as text).
        motif_type = l.get_attribute_value("motif-type")

        if motif_type == 1:  # RBS
            motif = mod.items["RBS_MAT"]
            motif_pos = mod.items["RBS_POS_DIST"]
        elif motif_type == 2:  # PROMOTER
            motif = mod.items["PROMOTER_MAT"]
            motif_pos = mod.items["PROMOTER_POS_DIST"]
        else:
            motif = None
            motif_pos = None

        # extract upstream sequence

    return pd.DataFrame(list_entries)
def analyze_gms2_components_on_verified_set_for_gi(env, gi):
    # type: (Environment, GenomeInfo) -> pd.DataFrame
    """Measure GMS2 start-prediction error on the verified set with one start
    component enabled at a time, plus the MGM2*/MGM/GMS2 baselines.

    Returns one row per evaluated configuration with the stats produced by
    run_gms2_with_component_toggles_and_get_accuracy.
    """
    list_entries = list()

    # all toggleable start components
    start_components = {
        "Start Codons",
        "Start Context",
        "RBS",
        "Promoter",
    }

    pd_gi = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_gi)

    # for each component to keep on
    for component_on in sorted(start_components) + ["MGM2*", "MGM", "GMS2"]:
        components_off = start_components.difference({component_on})

        if component_on == "MGM2*" or component_on == "GMS2":
            # baselines: keep every component enabled
            components_off = set()
        elif component_on == "MGM":
            pass
        elif not component_in_model_file(env, gi, component_on) and component_on not in {"MGM2*", "MGM", "GMS2"}:
            # component absent from this genome's model file: nothing to test
            continue

        # native coding is left on only for the full GMS2 baseline
        native_coding_off = False if component_on == "GMS2" else True

        # per-component working directory (spaces stripped from the name)
        pd_gi_component = os_join(pd_gi, component_on).replace(" ", "")
        mkdir_p(pd_gi_component)

        env_dup = env.duplicate({"pd-work": pd_gi_component})

        if component_on == "Start Context":
            # start context subsumes the motif components: re-enable RBS and
            # Promoter alongside it
            component_on = {component_on}  # "rbs", "promoter"}
            components_off.remove("RBS")
            components_off.remove("Promoter")
        else:
            # NOTE: component_on is rebound from str to a one-element set here
            component_on = {component_on}

        results = run_gms2_with_component_toggles_and_get_accuracy(
            env_dup, gi, components_off, native_coding_off=native_coding_off)

        list_entries.append({
            "Genome": gi.name,
            "Component": next(iter(component_on)).replace("_", "-"),
            # **{con: True for con in component_on},  # current component is on
            # **{coff: False for coff in components_off},  # all others are off
            **results
        })

    return pd.DataFrame(list_entries)
def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo) -> [float, float]
    """Compare start-prediction accuracy (vs verified genes) of stock GMS2
    against GMS2 re-run with an RBS model trained on toolp labels.

    Returns [gms2_accuracy, gms2_with_toolp_accuracy] as percentages.
    """
    group = get_value(kwargs, "group", None)

    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_sbsp = os_join(env["pd-runs"], gi.name, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # toolp labels = predictions shared by GMS2 and SBSP
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # new model file: GMS2 model with a toolp-trained RBS component
    pf_new_mod = os_join(env["pd-work"], "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod, group=group)

    # re-predict with the augmented model
    pf_new_pred = os_join(env["pd-work"], "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred)

    # accuracy of each prediction set against the verified genes
    accuracies = []
    for pf_pred in (pf_gms2, pf_new_pred):
        comp = LabelsComparisonDetailed(read_labels_from_file(pf_pred),
                                        read_labels_from_file(pf_verified))
        accuracies.append(100 * len(comp.match_3p_5p('a')) / len(comp.match_3p('a')))
    return accuracies
def run(self):
    # type: () -> None
    """Execute the pipeline: prepare the working directory, snapshot the run
    configuration, run the compute steps, then run the comparison."""
    pd_work = self.env["pd-work"]
    mkdir_p(pd_work)  # make sure working directory is up and running

    # snapshot inputs: query list and SBSP options
    copyfile(self.pipeline_options["pf-q-list"], os_join(pd_work, "run.list"))
    self.pipeline_options["sbsp-options"].to_file(os_join(pd_work, "sbsp-options.conf"))

    state = self._run_helper()  # run compute steps
    self._compare(state)        # run comparison
def next_name(pd_work, **kwargs):
    # type: (str, Dict[str, Any]) -> str
    """Return the next auto-numbered figure path in pd_work (0.pdf, 1.pdf, ...).

    The counter persists across calls as an attribute on the function itself.
    kwargs: ext (str, default "pdf") — file extension to use.
    """
    ext = get_value(kwargs, "ext", "pdf")

    if not hasattr(next_name, "counter"):
        next_name.counter = -1  # first call yields 0
    next_name.counter += 1

    return os_join(pd_work, f"{next_name.counter}.{ext}")
def set_up_labels_and_sequence_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> Dict[str, Any]
    """Gather the standard per-genome file paths plus the genome group letter
    parsed from the GMS2 model's GENOME_TYPE entry."""
    pd_gms2_run = os_join(env["pd-runs"], gi.name, "gms2")
    pf_mod = os_join(pd_gms2_run, "GMS2.mod")

    # GENOME_TYPE is of the form "group-x"; keep the letter, upper-cased
    mod = GMS2Mod.init_from_file(pf_mod)
    group = mod.items["GENOME_TYPE"].split("-")[1].upper()

    return {
        "pf_labels": os_join(pd_gms2_run, "gms2.gff"),
        "pf_sequence": os_join(env["pd-data"], gi.name, "sequence.fasta"),
        "pf_verified": os_join(env["pd-data"], gi.name, "verified.gff"),
        "pf_mod": pf_mod,
        "group": group,
    }
def stats_for_gi(env, gi, list_dn_tools, list_tool_names):
    # type: (Environment, GenomeInfo, List[str], List[str]) -> Dict[str, Any]
    """Aggregate per-genome statistics: GC content plus single, pairwise and
    all-together label-agreement analyses across the given tools."""
    result = {"Genome": gi.name}

    # genome GC content
    result["GC"] = compute_gc_from_file(
        os_join(env["pd-data"], gi.name, "sequence.fasta"))

    # labels per tool, then indexed by 3'-end key
    labels = read_labels_for_multiple_tools(env, gi, list_dn_tools, list_tool_names)
    indexed_labels = {
        tool: map_key_3p_to_label(tool_labels)
        for tool, tool_labels in labels.items()
    }

    # single, pairwise, and all-together analyses
    for analysis in (_single_analysis, _pairwise_analysis, _all_together_analysis):
        result.update(analysis(indexed_labels))

    return result
def relative_entropy_analysis_for_gi_for_percent(env, pf_sequence, pf_labels, pf_mod, pf_verified, group, percent,
                                                 pd_figures):
    # type: (Environment, str, str, str, str, str, float, str) -> Dict[str, Any]
    """Train on a random `percent` of the labels, re-predict, and report the
    relative entropy of the trained RBS model plus the 5' error vs verified.

    Side effects: writes labels/model/prediction files into env["pd-work"]
    and an RBS logo figure into pd_figures.
    """
    # 1) randomly select percent of labels
    pf_labels_percent = os_join(env["pd-work"], "labels_percent.lst")
    pf_mod_percent = os_join(env["pd-work"], "model_percent.mod")
    pf_labels_predict = os_join(env["pd-work"], "labels_predict.lst")

    randomly_select_labels(pf_labels, pf_labels_percent, percent)

    # train new model on the sampled labels
    mod_percent = train_and_create_models(
        env,
        pf_sequences=pf_sequence,
        pf_labels=pf_labels_percent,
        group=group,
        clean=False,
        pf_mod=pf_mod_percent
    )

    # add toolp-trained RBS to GMS2 model (overwrites pf_mod_percent in place)
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_labels_percent, pf_mod, pf_mod_percent)

    logo_rbs_from_gms2_mod_file(pd_figures, pf_mod_percent, str(percent))

    # run prediction with new model
    run_gms2_prediction_with_model(pf_sequence, pf_mod_percent, pf_labels_predict)

    # compare predictions against the verified set
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_labels_predict),
                                   read_labels_from_file(pf_verified))

    # NOTE(review): uses key "RBS_POS_DISTR" while create_mgm_test_data_for_genome
    # reads "RBS_POS_DIST" — confirm which spelling the model file actually uses.
    mm = MotifModel(mod_percent.items["RBS_MAT"], mod_percent.items["RBS_POS_DISTR"])
    non = GMS2Noncoding(mod_percent.items["NON_MAT"])

    return {
        "RE": relative_entropy(mm, non),
        "RE Motif": relative_entropy(mm, non, component="motif"),
        "RE Spacer": relative_entropy(mm, non, component="spacer"),
        # percent of 3'-matching genes whose 5' end differs
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    """Train a GMS2 model via the biogem command-line trainer and return it.

    kwargs: group (str, default "A") — genome group passed to the trainer.
    Side effect: writes "a.mod" into env["pd-work"].
    """
    group = get_value(kwargs, "group", "A", default_if_none=True)
    pf_mod = os_join(env["pd-work"], "a.mod")

    # run the external trainer from the working directory
    cmd = (
        f"cd {env['pd-work']}; "
        f"{env['pd-bin-external']}/gms2/biogem gms2-training -s {pf_new_seq} -l {pf_new_labels} -m {pf_mod} --order-coding 5 --order-noncoding 2 --only-train-on-native 1 --genetic-code 11 --order-start-context 2 --fgio-dist-thr 25 --genome-group {group} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    )
    run_shell_cmd(cmd)

    return GMS2Mod.init_from_file(pf_mod)
def component_in_model_file(env, gi, component):
    # type: (Environment, GenomeInfo, str) -> bool
    """Return True if any GMS2 tag for `component` occurs in the genome's
    model file (as "$TAG" followed by whitespace)."""
    pf_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    with open(pf_mod, "r") as f:
        mod_string = f.read()

    return any(
        re.findall(r"\$" + tag + r"[\s\n]", mod_string)
        for tag in key_to_gms2_tags(component)
    )
def convert_multi_fasta_to_single(env, pf_sequences, pf_labels):
    """Join a multi-FASTA genome and its GFF labels into a single-sequence
    FASTA plus an lst label file, written to env["pd-work"].

    Returns (pf_new_seq, pf_new_labels): paths of the written files.
    """
    org_seq = read_fasta_into_hash(pf_sequences)
    org_labels = read_gff(pf_labels, shift=0)

    new_seq, new_labels = convert_multi_fasta_into_single_fasta(
        org_seq, org_labels, "anydef")

    pd_work = env["pd-work"]
    pf_new_seq = os_join(pd_work, "sequence_joined")
    pf_new_labels = os_join(pd_work, "labels_joined_lst")

    # FIX: removed a dead `import os` and the commented-out write_gff call it
    # supported; only the lst + fasta outputs are written.
    write_lst(new_labels, pf_new_labels, shift=0)
    write_fasta_hash_to_file(new_seq, pf_new_seq)

    return pf_new_seq, pf_new_labels
def run_prodigal(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run Prodigal on a genome, either locally or as a PBS job.

    kwargs: use_pbs (bool, default False) plus PBS-file options.
    """
    pd_data = env["pd-data"]
    pd_work = env["pd-work"]
    pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal")
    pf_sequence = os_join(pd_data, gi.name, "sequence.fasta")
    use_pbs = get_value(kwargs, "use_pbs", False)

    # FIXME: put in genetic code
    cmd_run = "{} -i {} -g 11 -o prodigal.gff -f gff -t prodigal.parameters -q \n".format(
        pe_tool, pf_sequence)

    if not use_pbs:
        # run locally from the working directory
        run_shell_cmd(f"cd {pd_work}; {cmd_run}")
    else:
        # submit as a PBS job
        pf_pbs = os_join(pd_work, "run.pbs")
        create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
        run_shell_cmd("qsub {} &".format(pf_pbs))
def gather_mgm_test_set_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Score every upstream fragment of a genome's genes with its GMS2 RBS and
    promoter motif models; record per-model and best scores/positions."""
    # get upstream sequences
    df = gather_upstream_sequences_for_genome(env, gi)

    pf_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    mod = GMS2Mod.init_from_file(pf_mod)

    # a model may be None when absent from the mod file
    m_rbs = create_motif_model_from_gms2_model(mod, "RBS")
    m_promoter = create_motif_model_from_gms2_model(mod, "PROMOTER")

    names = ["RBS", "PROMOTER"]
    models = [m_rbs, m_promoter]

    # add score columns to dataframe (one per model/component/value combination)
    score_column_names = [
        x + "_" + y + "_" + z for x in ["RBS", "PROMOTER"]
        for y in ["motif", "spacer", "both"] for z in ["score", "position"]
    ]
    df = df.reindex(columns=[*(df.columns.tolist() + score_column_names)],
                    fill_value=None)

    # genome group letter from GENOME_TYPE (e.g. "group-a" -> "A"); D2 folds into D
    grp = mod.items["GENOME_TYPE"].split("-")[1].upper()
    if grp == "D2":
        grp = "D"
    df["Group"] = grp

    for idx in df.index:
        frag = df.at[idx, "upstream_nt"]

        for name, model in zip(names, models):
            if model is not None:
                for c in ["motif", "spacer", "both"]:
                    result = model.find_best_position_and_score(frag, component=c)
                    pos = result[0]
                    score = result[1]
                    df.at[idx, f"{name}_{c}_score"] = score
                    # position is re-expressed relative to the fragment's 3' end
                    df.at[idx, f"{name}_{c}_position"] = len(
                        frag) - pos - model.motif_width()

        # get best score across models
        # NOTE(review): if a model is None its *_both_score cells stay unset,
        # yet both names are still compared here — verify behavior for genomes
        # lacking a promoter model.
        best = max([(name, df.at[idx, f"{name}_both_score"],
                     df.at[idx, f"{name}_both_position"])
                    for name in names],
                   key=lambda x: x[1])
        df.at[idx, "best_position"] = best[2]
        df.at[idx, "best_score"] = best[1]
        df.at[idx, "best_name"] = best[0]

    return df
def run_gms2(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run GeneMarkS-2 on a genome, either locally or as a PBS job.

    kwargs: genome_type (str, default "auto"), use_pbs (bool, default False)
    plus PBS-file options.
    """
    genome_type = get_value(kwargs, "genome_type", "auto")

    pd_data = env["pd-data"]
    pd_work = env["pd-work"]
    pe_tool = os_join(env["pd-bin-external"], "gms2", "gms2.pl")
    pf_sequence = os_join(pd_data, gi.name, "sequence.fasta")
    use_pbs = get_value(kwargs, "use_pbs", False)

    # FIXME: put in genetic code
    cmd_run = "{} --gcode 11 --format gff --out gms2.gff --seq {} --v --genome-type {} --fgio-dist-thresh 25".format(
        pe_tool, pf_sequence, genome_type)

    if not use_pbs:
        # run locally from the working directory
        run_shell_cmd(f"cd {pd_work}; {cmd_run}")
    else:
        # submit as a PBS job
        pf_pbs = os_join(pd_work, "run.pbs")
        create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
        run_shell_cmd("qsub {} &".format(pf_pbs))
def gather_upstream_sequences_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Extract the upstream fragment of every GMS2-predicted gene in a genome
    and return one row per gene with coordinates and GC statistics."""
    pf_sequences = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_labels = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")

    sequences = read_fasta_into_hash(pf_sequences)
    labels = read_labels_from_file(pf_labels)

    genome_gc = 100 * compute_gc_from_sequences(sequences)

    rows = list()  # type: List[Dict[str, Any]]
    for info in extract_upstream_sequences(labels, sequences):
        label = info[0]  # type: Label
        frag = info[1]  # type: Seq

        # GC of the gene body itself (left/right are 0-based inclusive)
        gene_seq = sequences[label.seqname()][label.left():label.right() + 1]
        gene_gc = 100 * compute_gc_from_sequences({"any": gene_seq})

        rows.append({
            "GCFID": gi.name,
            "Accession": label.seqname(),
            "Genome GC": genome_gc,
            "Gene GC": gene_gc,
            "left": label.left() + 1,
            "right": label.right() + 1,
            "strand": label.strand(),
            "upstream_nt": str(frag),
        })

    return pd.DataFrame(rows)
def collect_start_info_from_gi(env, gi):
    # type: (Environment, GenomeInfo) -> Dict[str, Any]
    """Collect GC content and start-related GMS2 model items for one genome.

    Returns a dict with "Genome", "GC" (percentage) and every key of interest
    that is actually present in the genome's GMS2 model file.
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_sequence = os_join(pd_genome, "sequence.fasta")
    gc = compute_gc_from_file(pf_sequence)

    pd_genome_run = os_join(env["pd-runs"], gi.name)
    pd_gms2 = os_join(pd_genome_run, "gms2")
    pf_mod = os_join(pd_gms2, "GMS2.mod")
    mod = GMS2Mod.init_from_file(pf_mod)

    # model entries to surface; only those present in the file are returned.
    # FIX: the original set literal listed "RBS_MAT" twice.
    keys_of_interest = {
        "GENOME_TYPE", "RBS_MAT", "PROMOTER_MAT", "PROMOTER_WIDTH", "RBS_WIDTH",
        "RBS_POS_DISTR", "PROMOTER_POS_DISTR",
        "ATG", "GTG", "TTG", "TAA", "TGA", "TAG", "NON_MAT"
    }

    return {
        "Genome": gi.name,
        "GC": 100 * gc,
        **{k: mod.items[k] for k in keys_of_interest if k in mod.items}
    }
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Download NCBI's new_taxdump archive into args.pd_output and unzip it."""
    # link to taxonomy dump
    lp_taxonomy = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip"

    pd_output = args.pd_output
    mkdir_p(pd_output)
    pf_output = os_join(pd_output, "taxdump.zip")

    logger.info(f"Downloading file: {lp_taxonomy}")
    urllib.request.urlretrieve(lp_taxonomy, pf_output)

    logger.info("Download complete. Unzipping")
    run_shell_cmd(f"cd {pd_output}; unzip {pf_output}")
def run_tool_on_gil(env, gil, tool, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> None
    """Run a prediction tool ("gms2" or "prodigal") on every genome in the
    list, each in its own working subdirectory.

    kwargs: dn_run (str, default = tool name) — run-directory name; remaining
    kwargs are forwarded to the tool runner.
    """
    logger.info("Running tool {} on {} genomes".format(tool, len(gil)))
    dn_run = get_value(kwargs, "dn_run", tool, default_if_none=True)

    # dispatch table: tool name -> runner (KeyError for unknown tools)
    runners = {
        "gms2": run_gms2,
        "prodigal": run_prodigal,
    }
    runner = runners[tool]

    for gi in gil:
        pd_work = os_join(env["pd-work"], gi.name, dn_run)
        mkdir_p(pd_work)
        runner(env.duplicate({"pd-work": pd_work}), gi, **kwargs)
def read_labels_for_multiple_tools(env, gi, list_dn_tools, list_tool_names):
    # type: (Environment, GenomeInfo, List[str], List[str]) -> Dict[str, Labels]
    """Read each tool's labels for a genome, keyed by the tool's display name.

    list_dn_tools: run-directory names (also the gff basenames).
    list_tool_names: display names, parallel to list_dn_tools.
    """
    # drop frameshifted/partial/hypothetical genes uniformly across tools
    common_options = {
        "shift": 0,
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "ignore_hypothetical": True
    }

    labels = dict()
    for name, dn_tool in zip(list_tool_names, list_dn_tools):
        pf_labels = os_join(env["pd-runs"], gi.name, dn_tool, f"{dn_tool}.gff")
        # BUG FIX: was name="SBSP" for every tool (copy-paste); tag each
        # Labels object with its own tool's name.
        labels[name] = read_labels_from_file(pf_labels, name=name, **common_options)

    return labels
def analysis_per_query_for_genome(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> pd.DataFrame
    """Build a dataframe with one row per gene (unique 3' end) in the union of
    the SBSP, GMS2, NCBI and Prodigal predictions for a genome.

    Each row is produced by analyze_query from the per-tool labels plus the
    SBSP per-query details read from pd_sbsp/output.csv.
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_gms2 = os_join(pd_genome, "runs", "gms2", "gms2.gff")
    pf_prodigal = os_join(pd_genome, "runs", "prodigal", "prodigal.gff")
    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_ncbi = os_join(pd_genome, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    # Read all input and sbsp prediction details
    common_options = {"shift": 0}
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **common_options)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **common_options)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **common_options)
    labels_prodigal = read_labels_from_file(pf_prodigal, name="Prodigal", **common_options)

    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")

    # index labels (and SBSP details) by 3'-end key
    key_to_label_sbsp = map_key_3p_to_label(labels_sbsp)
    key_to_label_gms2 = map_key_3p_to_label(labels_gms2)
    key_to_label_ncbi = map_key_3p_to_label(labels_ncbi)
    key_to_label_prodigal = map_key_3p_to_label(labels_prodigal)
    key_to_df_sbsp_details = map_key_3p_to_df_group(df_sbsp_details)

    # Union of 3' keys across all four tools.
    # FIX: removed the unused `df_result = pd.DataFrame()` local; iterating a
    # dict yields its keys, so no explicit .keys() calls are needed.
    all_key_3p = set(key_to_label_sbsp).union(
        key_to_label_gms2, key_to_label_ncbi, key_to_label_prodigal)

    list_analysis = [
        analyze_query(key, key_to_label_sbsp, key_to_label_gms2,
                      key_to_label_ncbi, key_to_label_prodigal,
                      key_to_df_sbsp_details)
        for key in all_key_3p
    ]

    if len(list_analysis) == 0:
        return pd.DataFrame()
    return pd.DataFrame(list_analysis)
def analyze_predictions_on_verified_genes(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> Dict[str, Any]
    """Compute accuracy statistics of SBSP/GMS2/NCBI predictions against the
    genome's verified gene set; returns a flat stats dictionary filled in by
    the various get_stats_* helpers."""
    pd_gcfid = os_join(env["pd-data"], gi.name)
    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_gms2 = os_join(pd_gcfid, "runs", "gms2", "gms2.gff")
    pf_verified = os_join(pd_gcfid, "verified.gff")
    pf_ncbi = os_join(pd_gcfid, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    # common label-reading options: drop frameshifted/partial genes
    kwargs_labels = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **kwargs_labels)
    labels_verified = read_labels_from_file(pf_verified, name="Verified", **kwargs_labels)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **kwargs_labels)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **kwargs_labels)

    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")
    add_support_to_labels(labels_sbsp, df_sbsp_details)
    #labels_sbsp = Labels([l for l in labels_sbsp if l.get_attribute_value('predicted-at-step') != "C"], name="SBSP")

    # labels on which GMS2 and SBSP agree on both 3' and 5' ends
    labels_sbsp_eq_gms2 = LabelsComparisonDetailed(
        labels_sbsp, labels_gms2).match_3p_5p("a")
    labels_sbsp_eq_gms2.name = "GMS2=SBSP"

    stats = dict()

    # Stats: 3prime match
    get_stats_a_from_b_3p(labels_verified, labels_ncbi, stats)
    get_stats_a_from_b_3p(labels_verified, labels_gms2, stats)
    get_stats_a_from_b_3p(labels_verified, labels_sbsp, stats)
    get_stats_a_from_b_3p_by_upstream(labels_verified, labels_ncbi, stats)

    # SN SP
    get_stats_sn_sp(labels_verified, labels_sbsp, stats)
    get_stats_sn_sp(labels_verified, labels_ncbi, stats)
    get_stats_sn_sp(labels_verified, labels_gms2, stats)

    # Stats: GMS2=SBSP Accuracy on verified
    get_stats_sn_sp(labels_verified, labels_sbsp_eq_gms2, stats)

    # stats by support
    get_stats_sn_sp_by_support(labels_verified, labels_sbsp, stats, "SBSP")

    # stats by support
    get_stats_sn_sp_by_support(labels_verified, labels_sbsp_eq_gms2, stats, "GMS2=SBSP")

    # stats by steps combinations
    get_stats_sn_sp_by_step_group(labels_verified, labels_sbsp, stats, "SBSP")

    # stats by steps combinations
    get_stats_sn_sp_by_step_group(labels_verified, labels_sbsp_eq_gms2, stats, "GMS2=SBSP")

    return stats
def train_with_fraction_of_genes(env, gi, percent):
    # type: (Environment, GenomeInfo, float) -> [str, str]
    """Train on a fraction (`percent`) of the genome's GMS2-labeled genes.

    NOTE(review): the declared return type is [str, str] but the body only
    builds two input paths and returns None — the function appears unfinished
    (neither `percent` nor the paths are used).
    """
    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis over a genome list (locally or via
    PBS), write summary.csv, and emit a series of summary figures."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    # run the analysis: directly, or scattered over PBS jobs and merged
    if not prl_options["use-pbs"]:
        df = relative_entropy_analysis(env, gil, prl_options)
    else:
        pbs = PBS(env, prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={
                              "env": env,
                              "prl_options": prl_options
                          })
        df = pd.concat(list_df, ignore_index=True, sort=False)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    # error vs percent / relative entropy (scatter and per-genome lines)
    sns.scatterplot(df, "Percent", "Error",
                    figure_options=FigureOptions(ylim=[0, 20],
                                                 save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE Motif", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE Spacer", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.scatterplot(df, "RE Motif", "RE Spacer", hue="Genome", identity=True,
                    figure_options=FigureOptions(save_fig=next_name(pd_figures)))

    # regression views of the same relationships
    sns.lmplot(df, "Percent", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE Motif", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE Spacer", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df, "Percent", "RE", hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 vs StartLink+ (toolp) RBS motif models for each genome:
    plot side-by-side sequence logos and collect relative-entropy / 5' error
    statistics, then write figures and CSV summaries to the work directory."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # genome group letter from GENOME_TYPE (e.g. "group-a" -> "A")
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        # side-by-side information-content logos: GMS2 (left) vs toolp (right)
        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # relative
        rel_mat = lm.transform_matrix(df_gms2, from_type="probability",
                                      to_type="information")
        lm.Logo(rel_mat, color_scheme="classic", ax=axes[0])
        axes[0].set_ylim(*[0, 2])
        axes[0].set_title("GeneMarkS-2")

        # shannon
        sha_mat = lm.transform_matrix(df_toolp, from_type="probability",
                                      to_type="information")
        lm.Logo(sha_mat, color_scheme="classic", ax=axes[1])
        axes[1].set_ylim(*[0, 2])
        axes[1].set_title("StartLink+")
        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(
            os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified: one row per tool (GMS2 and GMS2 with StartLink motifs)
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })

            print(list_run_info[-2:])

    import sbsp_viz.sns as sns
    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))
    else:
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])
                        ))
        print("Average Error: {}".format(df["Accuracy"].mean()))

        # second pass restricted to low-error genomes (< 2%)
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])
                        ))
        print("Average Error: {}".format(df["Accuracy"].mean()))
        df.to_csv(next_name(env["pd-work"], ext="csv"))
def analyze_upstream_distances(env, df): # type: (Environment, pd.DataFrame) -> None pd_work = os_join(env["pd-work"], "upstream_distances") mkdir_p(pd_work) # remove empty lists df = df[df["Upstream-distance"] != "[]"].copy() df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval) df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent) # compute consistencies with different flexibilities for flexibility in {0, 3}: df["PC(x,{})".format(flexibility)] = df[[ "Most frequent upstream", "Upstream-distance" ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[ "Most frequent upstream"], flexibility), axis=1) df = df[df["Support"] > 10].copy() # for mf in range(-20, 50): # df_mf = df[df["Most frequent upstream"] == mf] # if len(df_mf) < 50: # continue # # sns.distplot(df_mf, "PC(x,0)", figure_options=FigureOptions( # title="PC({},{})".format(mf, 0), # save_fig=next_name(pd_work), # xlim=(0,1) # )) # sns.distplot(df_mf, "PC(x,3)", figure_options=FigureOptions( # title="PC({},{})".format(mf, 3), # save_fig=next_name(pd_work), # xlim=(0, 1) # )) # plot distribution of Average PC import seaborn import matplotlib.pyplot as plt df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) & (df["Most frequent upstream"] > -50)] # NCBI consistency as a func df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) & (df["Most frequent upstream"] < 100) & (df["Most frequent upstream"] > -50)] df_tmp = stack_columns_as_rows( df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)", "Ancestor"]], ["PC(x,0)", "PC(x,3)"], "PC(x,f)", None, label_col="Flexibility") # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp, # scatter=False, hue="Flexibility", lowess=True) # plt.show() # # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp, # hue="Flexibility", lowess=True) # plt.show() # # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp, # scatter=False, hue="Flexibility") # plt.show() sns.lmplot(df_tmp, "Most 
frequent upstream", "PC(x,f)", hue="Flexibility", sns_kwargs={ "scatter": False, "lowess": True }, figure_options=FigureOptions(save_fig=next_name(pd_work), xlim=[-7, None], ylim=[0, 1])) sns.distplot(df, "Most frequent upstream", figure_options=FigureOptions(save_fig=next_name(pd_work)), sns_kwargs={"kde": True}) import seaborn # seaborn.countplot("Most frequent upstream", data=df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)], hue="Ancestor") (df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)].groupby("Ancestor") ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename( 'Percentage (by clade)').reset_index().pipe( (seaborn.catplot, 'data'), x="Most frequent upstream", y='Percentage (by clade)', hue="Ancestor", kind='point', scale=0.5, legend=False, palette=CM.get_map("ancestor"), aspect=1.5)) plt.legend(loc="best", title="Clade") figure_options = FigureOptions( save_fig=next_name(pd_work), xlabel="Most frequent distance to upstream gene", ylabel="Percent of components (by clade)") plt.xlabel(figure_options.xlabel) plt.ylabel(figure_options.ylabel) save_figure(figure_options) plt.show() (df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)].groupby("Ancestor") ["Most frequent upstream"].value_counts().rename( 'number').reset_index().pipe((seaborn.catplot, 'data'), x="Most frequent upstream", y='number', hue="Ancestor", kind='point', scale=0.5, legend=False, palette=CM.get_map("ancestor"), aspect=1.5)) plt.legend(loc="best", title="Clade") figure_options = FigureOptions( save_fig=next_name(pd_work), xlabel="Most frequent distance to upstream gene", ylabel="Number of components") plt.xlabel(figure_options.xlabel) plt.ylabel(figure_options.ylabel) save_figure(figure_options) plt.show() f, ax1 = plt.subplots() ax2 = ax1.twinx() for ancestor, df_group in df.groupby("Ancestor"): seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1) # ax2.set_ylim(0, 3) 
ax2.yaxis.set_ticks([]) seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2) ax1.set_xlabel('x var') ax1.set_ylabel('Counts') # g = seaborn.FacetGrid(df, hue="Ancestor") # g = g.map(seaborn.distplot, "Most frequent upstream", hist=True) plt.show() print(df["Most frequent upstream"].value_counts(normalize=True)) sns.lmplot( df, "Most frequent upstream", "PC(x,0)", hue="Ancestor", sns_kwargs={ "scatter": False, "lowess": True, "palette": CM.get_map("ancestor") }, figure_options=FigureOptions(save_fig=next_name(pd_work), xlim=[-7, None], ylim=[0, 1]), ) sns.lmplot(df, "Most frequent upstream", "PC(x,3)", hue="Ancestor", sns_kwargs={ "scatter": False, "lowess": True, "palette": CM.get_map("ancestor") }, figure_options=FigureOptions(save_fig=next_name(pd_work), xlim=[-7, None], ylim=[0, 1])) # NCBI sensitivity # collect: # average 5' per ancestor, r, ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)] list_collect = list() for r in ranges: r_filter = (df["Most frequent upstream"] >= r[0]) & (df["Most frequent upstream"] < r[1]) df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter]) # viz_summary_per_gcfid(env, df_summary_per_gcfid, title=str(r)) df_summary_per_gcfid = df_summary_per_gcfid.groupby( "Ancestor", as_index=False).mean() df_summary_per_gcfid["Range"] = str(r) list_collect.append(df_summary_per_gcfid) df_tmp = pd.concat(list_collect, sort=False) sns.catplot(df_tmp, "Range", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", hue="Ancestor", kind="point", sns_kwargs={"palette": CM.get_map("ancestor")}) sns.catplot(df_tmp, "Range", "GMS2=SBSP", hue="Ancestor", kind="point", sns_kwargs={"palette": CM.get_map("ancestor")}) # do not average per gcfid - average per ancestor list_collect = list() range_avgs = list() range_label = list() for r in ranges: r_filter = (df["Most frequent upstream"] >= r[0]) & (df["Most frequent upstream"] < r[1]) df_r = df[r_filter] for ancestor, df_group in df_r.groupby( "Ancestor", as_index=False): # type: str, pd.DataFrame 
f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & ( df_group["NCBI"]) f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & ( df_group["(GMS2=SBSP)!=NCBI"]) sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float( f_gms2_eq_sbsp_with_ncbi_pred.sum()) list_collect.append({ "Ancestor": ancestor, "Range": str(r), "range_avg": (r[1] + r[0]) / 2.0, "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity, "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum() }) range_label.append(r) range_avgs.append((r[1] + r[0]) / 2.0) df_tmp = pd.DataFrame(list_collect) sns.catplot(df_tmp, "Range", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", hue="Ancestor", kind="point", sns_kwargs={"palette": CM.get_map("ancestor")}) sns.catplot(df_tmp, "Range", "GMS2=SBSP", hue="Ancestor", kind="point", sns_kwargs={"palette": CM.get_map("ancestor")}) ancestors = list(set(df_tmp["Ancestor"])) fig, axes = plt.subplots( len(ancestors), 1, sharex="all", ) for ancestor, ax in zip(ancestors, axes.ravel()): # type: str, plt.Axes ax2 = ax.twinx() curr_df = df_tmp[df_tmp["Ancestor"] == ancestor] seaborn.lineplot("range_avg", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", data=curr_df, ax=ax) seaborn.lineplot("range_avg", "GMS2=SBSP", data=curr_df, color='r', legend=False, ax=ax2) ax.set_ylabel(None) ax2.set_ylabel(None) ax.set_xlabel("Range Average") plt.xticks(range_avgs, range_label) plt.show() fig, ax = plt.subplots() ax2 = ax.twinx() seaborn.lineplot("range_avg", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", data=df_tmp, ax=ax, color="b", ci=None, hue="Ancestor") seaborn.lineplot("range_avg", "GMS2=SBSP", data=df_tmp, ci=None, color='r', legend=False, ax=ax2, hue="Ancestor") # plt.xticks(range_avgs, range_label) ax.set_ylim([0, None]) ax2.set_ylim([0, None]) ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP") ax2.set_ylabel("Number of GMS2=SBSP genes") ax.set_xlabel("Range Average") ax.yaxis.label.set_color('b') ax2.yaxis.label.set_color('r') ax.set_xlabel("Distance to upstream gene (nt)") plt.show() # sbsp_geom_density(df, "Most frequent 
upstream", "GMS2=SBSP=NCBI", pd_work) # # for ancestor, df_group in df.groupby("Ancestor", as_index=False): # sbsp_geom_density(df_group, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work, ancestor) # sbsp_geom_density(df_group, "Support", "GMS2=SBSP=NCBI", pd_work, ancestor) a = 0
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    """Copy MSA output files for genes where GMS2 and SBSP agree on the
    gene start but NCBI disagrees, into a per-genome work directory.

    Reads the genome's GMS2, SBSP, and NCBI predictions plus the SBSP
    per-gene details CSV from the run directory. Returns silently if any
    of those input files is missing (best-effort over incomplete runs).
    """
    pd_genome = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_genome)

    pd_run = os_join(env["pd-runs"], gi.name)

    # Prediction files and the SBSP details table for this genome's run.
    pf_sbsp = os_join(pd_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_run, "ncbi", "ncbi.gff")
    pf_sbsp_details = os_join(pd_run, "sbsp", "output.csv")

    read_opts = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    try:
        labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_opts)
        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_opts)
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_opts)
        df_details = pd.read_csv(pf_sbsp_details)
        add_q_key_3p_to_df(df_details, "q-3prime")
    except FileNotFoundError:
        # Deliberate best-effort: skip genomes with missing inputs.
        return

    # Labels on which GMS2 and SBSP agree (matching 3' and 5' ends).
    lcd_gms2_sbsp = LabelsComparisonDetailed(labels_gms2, labels_sbsp,
                                             name_a="gms2", name_b="sbsp")
    labels_gms2_eq_sbsp = lcd_gms2_sbsp.match_3p_5p("a")

    # Of those, keep the labels whose 5' end disagrees with NCBI.
    lcd_vs_ncbi = LabelsComparisonDetailed(labels_gms2_eq_sbsp, labels_ncbi,
                                           name_a="gms2_eq_sbsp",
                                           name_b="ncbi")
    labels_disagree_ncbi = lcd_vs_ncbi.match_3p_not_5p("a")

    # Build the set of 3'-end keys for the selected labels and use it to
    # filter the details table down to just those genes.
    keys_3p = {
        create_q_key_3p(lab.seqname(), lab.left(), lab.right(), lab.strand())
        for lab in labels_disagree_ncbi
    }
    df_selected = df_details[df_details["q-3prime"].isin(keys_3p)]

    # Copy each unique MSA output file into the per-genome directory.
    for pf_msa in set(df_selected["pf-msa-output"]):
        shutil.copy(pf_msa, pd_genome)