def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    """Print the sizes of the GMS2/SBSP/NCBI label sets and draw a Venn diagram.

    Two chained ``LabelsComparisonDetailed.intersection("a")`` calls are used:
    first GMS2 against SBSP, then that result against NCBI. A two-line CSV
    (header + counts) is printed to stdout and ``venn_diagram_5prime`` is
    invoked on the three full label sets.

    :param env: environment; ``env["pd-work"]`` is the default figure directory
    :param pf_gms2: path to the GMS2 labels file
    :param pf_sbsp: path to the SBSP labels file
    :param pf_ncbi: path to the NCBI labels file
    :param kwargs: ``venn_title`` (default ``None``) and ``pf_venn``
        (default ``<pd-work>/venn.pdf``)
    """
    title = get_value(kwargs, "venn_title", None)
    pf_figure = get_value(kwargs, "pf_venn", os.path.join(env["pd-work"], "venn.pdf"))

    gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    # GMS2 vs SBSP first, then the agreeing labels vs NCBI
    gms2_sbsp = LabelsComparisonDetailed(
        gms2, sbsp, name_a="gms2", name_b="sbsp"
    ).intersection("a")
    gms2_sbsp_ncbi = LabelsComparisonDetailed(
        gms2_sbsp, ncbi, name_a="gms2_sbsp", name_b="ncbi"
    ).intersection("a")

    header = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    counts = ",".join(
        str(len(group)) for group in (gms2, sbsp, ncbi, gms2_sbsp, gms2_sbsp_ncbi)
    )
    print(header + "\n" + counts)

    venn_diagram_5prime(gms2, sbsp, ncbi, FigureOptions(title=title, save_fig=pf_figure))
def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]
    """Run GMS2 with selected model components turned off and report the error.

    ``turn_off_components`` writes a modified copy of the genome's ``GMS2.mod``
    to ``<pd-work>/model.mod``; a prediction is then run with that model and
    compared against the genome's ``verified.gff``.

    :param env: environment; ``pd-runs``, ``pd-data`` and ``pd-work`` are read
    :param gi: genome; ``gi.name`` selects the run and data directories
    :param components_off: components passed on to ``turn_off_components``
    :param kwargs:
        ``native_coding_off`` (default ``True``), forwarded to
        ``turn_off_components``;
        ``max_attempts`` (default ``None`` = retry without limit, the historical
        behaviour): maximum number of prediction attempts before the last
        ``CalledProcessError`` is re-raised
    :return: ``{"Error": 100 - 100 * |match_3p_5p('a')| / |match_3p('a')|}``
    :raises CalledProcessError: only when ``max_attempts`` is set and exhausted
    :raises ZeroDivisionError: when ``match_3p('a')`` is empty
    """
    import logging

    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)
    max_attempts = get_value(kwargs, "max_attempts", None)

    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off,
                        native_coding_off=native_coding_off)

    # The prediction is retried on failure; every failure is now logged instead
    # of being swallowed silently, and the number of attempts can be bounded.
    attempt = 0
    while True:
        attempt += 1
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            break
        except CalledProcessError as err:
            logging.getLogger(__name__).warning(
                "GMS2 prediction failed for %s (attempt %d): %s", gi.name, attempt, err
            )
            if max_attempts is not None and attempt >= max_attempts:
                raise

    # compare with verified
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference),
                                   read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> List[float]
    """Compare GMS2 before and after adding a motif model built from "toolp" labels.

    Steps, in order:
      1. ``get_identital_labels`` writes ``<pd-work>/toolp.gff`` from the GMS2
         and SBSP prediction files.
      2. ``add_toolp_rbs_to_gms2_model`` writes ``<pd-work>/toolp.mod`` from the
         original ``GMS2.mod`` and those labels.
      3. GMS2 is re-run with the new model into ``<pd-work>/new_pred.gff``.
      4. Both the original and the new predictions are compared to
         ``verified.gff``.

    :param env: environment; ``pd-runs``, ``pd-data`` and ``pd-work`` are read
    :param gi: genome; ``gi.name`` selects the directories
    :param kwargs: ``group`` (default ``None``), forwarded to
        ``add_toolp_rbs_to_gms2_model``
    :return: two values ``100 * |match_3p_5p('a')| / |match_3p('a')|``, first
        for the original GMS2 predictions, second for the new predictions
    """
    group = get_value(kwargs, "group", None)

    # inputs
    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_sbsp = os_join(env["pd-runs"], gi.name, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # toolp predictions
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # new model file carrying the toolp-derived motif model
    pf_new_mod = os_join(env["pd-work"], "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod,
                                group=group)

    # prediction with the new model
    pf_new_pred = os_join(env["pd-work"], "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred)

    # both comparisons are built before any ratio is computed
    comparisons = [
        LabelsComparisonDetailed(read_labels_from_file(pf_pred),
                                 read_labels_from_file(pf_verified))
        for pf_pred in (pf_gms2, pf_new_pred)
    ]

    percentages = []
    for comparison in comparisons:
        matched_both = len(comparison.match_3p_5p('a'))
        matched_3p = len(comparison.match_3p('a'))
        percentages.append(100 * matched_both / matched_3p)
    return percentages
def compare_gms2_sbsp_ncbi_for_genome_list(env, gil, gcfid_to_pd_sbsp, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None
    """Sort SBSP MSA output files by whether the SBSP=GMS2 start also equals NCBI.

    For every genome, the first row per ``q-key`` of the SBSP ``output.csv`` is
    examined. Queries whose 3' key exists in both the GMS2 and NCBI labels and
    whose SBSP label matches GMS2 (``labels_match_5prime_3prime``) are split
    into two groups: SBSP also matches NCBI, or it does not. The ``pf-msa-output``
    files of each group are copied with ``copy_files_with_new_indexing`` into
    ``<pd-work>/sbsp_gms2_ncbi`` and ``<pd-work>/sbsp_gms2_not_ncbi``.

    :param env: environment; ``pd-data`` and ``pd-work`` are read
    :param gil: iterable of genomes (``gi.name`` is used)
    :param gcfid_to_pd_sbsp: genome name -> directory containing ``output.csv``
    :param pf_output_summary: currently unused by this function
    :param kwargs: currently unused by this function
    """
    list_pf_gms2_sbsp_not_ncbi = list()
    list_pf_gms2_sbsp_ncbi = list()

    for gi in gil:
        logger.info("%s", gi.name)

        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_gms2 = os.path.join(pd_genome, "runs", "gms2", "gms2.gff")
        pf_ncbi = os.path.join(pd_genome, "ncbi.gff")
        pf_sbsp_details = os.path.join(gcfid_to_pd_sbsp[gi.name], "output.csv")

        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

        key_3prime_to_label_gms2 = map_key_to_labels(labels_gms2)
        key_3prime_to_label_ncbi = map_key_to_labels(labels_ncbi)

        df_sbsp = pd.read_csv(pf_sbsp_details, header=0)
        df_first_per_query = df_sbsp.groupby("q-key", as_index=False).agg("first")

        for _, row in df_first_per_query.iterrows():
            q_key_3prime = create_3prime_key_from_fields(
                accession=row["q-accession"], left=row["q-left-sbsp"],
                right=row["q-right-sbsp"], strand=row["q-strand-sbsp"]
            )

            # the gene must be present in both GMS2 and NCBI
            if (q_key_3prime not in key_3prime_to_label_gms2
                    or q_key_3prime not in key_3prime_to_label_ncbi):
                continue

            # coordinates from the CSV are shifted by -1 before building the label
            label_sbsp = Label(
                Coordinates(row["q-left-sbsp"] - 1, row["q-right-sbsp"] - 1,
                            row["q-strand-sbsp"]),
                seqname=row["q-accession"]
            )

            # only queries where SBSP agrees with GMS2 are of interest
            if not labels_match_5prime_3prime(label_sbsp,
                                              key_3prime_to_label_gms2[q_key_3prime]):
                continue

            if labels_match_5prime_3prime(label_sbsp,
                                          key_3prime_to_label_ncbi[q_key_3prime]):
                list_pf_gms2_sbsp_ncbi.append(row["pf-msa-output"])
            else:
                list_pf_gms2_sbsp_not_ncbi.append(row["pf-msa-output"])

    # copy the collected files once all genomes have been processed
    pd_gms2_sbsp_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_ncbi")
    pd_gms2_sbsp_not_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_not_ncbi")

    mkdir_p(pd_gms2_sbsp_ncbi)
    mkdir_p(pd_gms2_sbsp_not_ncbi)

    copy_files_with_new_indexing(list_pf_gms2_sbsp_ncbi, pd_gms2_sbsp_ncbi)
    copy_files_with_new_indexing(list_pf_gms2_sbsp_not_ncbi, pd_gms2_sbsp_not_ncbi)
def analysis_per_query_for_genome(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> pd.DataFrame
    """Build a per-gene analysis table across SBSP, GMS2, NCBI and Prodigal.

    Labels of the four sources are read with ``shift=0`` and keyed with
    ``map_key_3p_to_label``; the SBSP ``output.csv`` is grouped with
    ``map_key_3p_to_df_group``. ``analyze_query`` is called once for every key
    in the union of the four label key sets.

    :param env: environment; ``env["pd-data"]`` is read
    :param gi: genome; ``gi.name`` selects the data directory and SBSP gff
    :param pd_sbsp: SBSP run directory (``accuracy/<name>.gff``, ``output.csv``)
    :param kwargs: not used by this function
    :return: data frame built from the list of ``analyze_query`` results
        (an empty ``pd.DataFrame()`` when there are no keys)
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_gms2 = os_join(pd_genome, "runs", "gms2", "gms2.gff")
    pf_prodigal = os_join(pd_genome, "runs", "prodigal", "prodigal.gff")
    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_ncbi = os_join(pd_genome, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    # labels of every source, then the SBSP prediction details
    read_options = {"shift": 0}
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_options)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_options)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_options)
    labels_prodigal = read_labels_from_file(pf_prodigal, name="Prodigal", **read_options)

    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")

    # 3' key -> label (or -> detail rows for the SBSP csv)
    by_key_sbsp = map_key_3p_to_label(labels_sbsp)
    by_key_gms2 = map_key_3p_to_label(labels_gms2)
    by_key_ncbi = map_key_3p_to_label(labels_ncbi)
    by_key_prodigal = map_key_3p_to_label(labels_prodigal)
    by_key_details = map_key_3p_to_df_group(df_sbsp_details)

    # one output row per key in the union of the four key sets
    every_key = set(by_key_sbsp).union(
        set(by_key_gms2), set(by_key_ncbi), set(by_key_prodigal)
    )

    rows = [
        analyze_query(key, by_key_sbsp, by_key_gms2, by_key_ncbi,
                      by_key_prodigal, by_key_details)
        for key in every_key
    ]

    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare two label files and run the detailed-comparison visualisation.

    :param env: environment; ``env["pd-work"]`` is passed to the visualiser
    :param args: parsed arguments providing ``pf_a``, ``pf_b``, ``name_a``,
        ``name_b``, ``tag`` and ``split_on_attributes``
    """
    comparison = LabelsComparisonDetailed(
        read_labels_from_file(args.pf_a),
        read_labels_from_file(args.pf_b),
        name_a=args.name_a,
        name_b=args.name_b,
        tag=args.tag,
        split_on_attributes=args.split_on_attributes,
    )

    visualiser = LabelsComparisonDetailedViz(comparison)
    visualiser.run(env["pd-work"])
def get_sequences_for_single_genome(env, gcfid):
    # type: (Environment, str) -> List[Dict[str, Any]]
    """Collect sequence entries for the usable NCBI labels of one genome.

    Labels that are partial, frameshifted, or whose ``product`` attribute
    contains ``"hypothetical"`` are skipped. For the rest,
    ``get_entry_for_label`` is called and non-``None`` results are kept.

    :param env: environment; ``env["pd-data"]`` is read
    :param gcfid: genome directory name under ``pd-data``
    :return: list of entries returned by ``get_entry_for_label``
    """
    pd_gcfid = os.path.join(env["pd-data"], gcfid)
    pf_sequences = os.path.join(pd_gcfid, "sequence.fasta")
    pf_ncbi = os.path.join(pd_gcfid, "ncbi.gff")

    sequences = read_fasta_into_hash(pf_sequences)
    labels = read_labels_from_file(pf_ncbi)

    entries = []
    # NOTE(review): the tag is assumed to advance only when an entry is kept;
    # the original formatting was lost, so confirm against version control.
    next_tag = 0

    for lab in labels:
        if lab.is_partial() or lab.is_frameshifted():
            continue

        # skip hypothetical products
        product = lab.get_attribute_value("product")
        if product is not None and "hypothetical" in product:
            continue

        entry = get_entry_for_label(sequences, lab, tag=next_tag)
        if entry is None:
            continue

        entries.append(entry)
        next_tag += 1

    return entries
def pipeline_step_compute_accuracy(env, df, pipeline_options):
    # type: (Environment, pd.DataFrame, PipelineSBSPOptions) -> pd.DataFrame
    """Annotate ``df`` with truth columns and write per-genome accuracies to file.

    For each genome in ``df["q-genome"]`` the reference labels are read
    (``shift=0``) and ``df_add_is_true_start`` /
    ``df_add_distance_between_predicted_and_true`` are applied to ``df``.
    Predicted labels are then written out by ``df_print_labels``, compared
    against the reference with ``LabelsComparison``, and the stringified
    accuracies are written to ``<pd-work>/<fn-compare>``.

    :param env: environment; ``pd-data`` and ``pd-work`` are read
    :param df: data frame with a ``q-genome`` column; modified by the helpers
    :param pipeline_options: provides ``fn-q-labels-compare`` and ``fn-compare``
    :return: the same ``df`` object that was passed in
    """
    from sbsp_io.labels import read_labels_from_file

    def pf_reference_for(genome_name):
        # path of the reference labels file for one genome
        return os.path.join(env["pd-data"], genome_name,
                            pipeline_options["fn-q-labels-compare"])

    for genome in set(df["q-genome"]):
        reference = read_labels_from_file(pf_reference_for(genome), shift=0)

        df_add_is_true_start(df, reference, "q-", "is-true", coordinates_suffix="-sbsp")
        df_add_distance_between_predicted_and_true(
            df, reference, "q-", "distance-to-true", coordinates_suffix="-sbsp")

    # write predicted labels per genome
    genome_to_pf_labels = df_print_labels(env, df, "q", suffix_coordinates="sbsp",
                                          suffix_fname="")

    # accuracy of the predictions per genome
    from sbsp_general.labels_comparison import LabelsComparison
    genome_to_comparison = {}

    for genome in genome_to_pf_labels:
        pf_q_labels_true = pf_reference_for(genome)
        pf_q_labels_pred = genome_to_pf_labels[genome]

        genome_to_comparison[genome] = LabelsComparison(env, pf_q_labels_true,
                                                        pf_q_labels_pred)

        # NOTE(review): this detailed comparison is built but not used, because
        # the visualisation call below is disabled.
        labels_a = read_labels_from_file(pf_q_labels_true)
        labels_b = read_labels_from_file(pf_q_labels_pred)
        lcd = LabelsComparisonDetailed(labels_a, labels_b,
                                       name_a="Reference", name_b="SBSP", tag=genome,
                                       split_on_attributes=["predicted-at-step"])

        # LabelsComparisonDetailedViz(lcd).run(env["pd-work"])

    accuracy = LabelsComparison.stringify_genome_accuracies(genome_to_comparison, ",")

    import sbsp_io.general
    pf_accuracy = os.path.join(env["pd-work"], pipeline_options["fn-compare"])
    sbsp_io.general.write_string_to_file(accuracy, pf_accuracy)

    return df
def relative_entropy_analysis_for_gi_for_percent(env, pf_sequence, pf_labels, pf_mod,
                                                 pf_verified, group, percent, pd_figures):
    # type: (Environment, str, str, str, str, str, float, str) -> Dict[str, Any]
    """Train on a random subset of labels and report relative entropy and error.

    Statement order matters here: ``train_and_create_models`` is given
    ``model_percent.mod`` as ``pf_mod``, and ``add_toolp_rbs_to_gms2_model``
    subsequently writes to the same path.

    :param env: environment; ``env["pd-work"]`` holds all intermediate files
    :param pf_sequence: genome sequence file
    :param pf_labels: labels file to sample from
    :param pf_mod: GMS2 model file passed to ``add_toolp_rbs_to_gms2_model``
    :param pf_verified: verified labels used for the final comparison
    :param group: group passed to ``train_and_create_models``
    :param percent: percentage handed to ``randomly_select_labels``
    :param pd_figures: directory passed to ``logo_rbs_from_gms2_mod_file``
    :return: dict with keys ``RE``, ``RE Motif``, ``RE Spacer`` and ``Error``
    """
    pd_work = env["pd-work"]
    pf_labels_percent = os_join(pd_work, "labels_percent.lst")
    pf_mod_percent = os_join(pd_work, "model_percent.mod")
    pf_labels_predict = os_join(pd_work, "labels_predict.lst")

    # 1) sample the requested percentage of labels
    randomly_select_labels(pf_labels, pf_labels_percent, percent)

    # 2) train a model on the sample
    mod_percent = train_and_create_models(
        env,
        pf_sequences=pf_sequence,
        pf_labels=pf_labels_percent,
        group=group,
        clean=False,
        pf_mod=pf_mod_percent,
    )

    # 3) add the RBS model derived from the sample to the GMS2 model
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_labels_percent, pf_mod, pf_mod_percent)
    logo_rbs_from_gms2_mod_file(pd_figures, pf_mod_percent, str(percent))

    # 4) predict with the new model and compare to the verified set
    run_gms2_prediction_with_model(pf_sequence, pf_mod_percent, pf_labels_predict)
    comparison = LabelsComparisonDetailed(read_labels_from_file(pf_labels_predict),
                                          read_labels_from_file(pf_verified))

    motif_model = MotifModel(mod_percent.items["RBS_MAT"],
                             mod_percent.items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(mod_percent.items["NON_MAT"])

    re_total = relative_entropy(motif_model, noncoding)
    re_motif = relative_entropy(motif_model, noncoding, component="motif")
    re_spacer = relative_entropy(motif_model, noncoding, component="spacer")
    error = 100 - 100 * len(comparison.match_3p_5p('a')) / len(comparison.match_3p('a'))

    return {
        "RE": re_total,
        "RE Motif": re_motif,
        "RE Spacer": re_spacer,
        "Error": error,
    }
def extract_labeled_sequences_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> Dict[str, Seq]
    """Read a genome's sequences and labels and extract the labeled sequences.

    :param env: environment, forwarded to the path helpers
    :param gi: genome; ``gi.name`` is used in the warning message
    :param kwargs: forwarded to ``get_pf_labels_for_genome``,
        ``read_labels_from_file`` and ``extract_labeled_sequences``
    :return: result of ``extract_labeled_sequences``
    :raises IOError: re-raised (after a warning is logged) when either input
        file cannot be read
    """
    pf_sequences = get_pf_sequences_for_genome(env, gi)
    pf_labels = get_pf_labels_for_genome(env, gi, **kwargs)

    try:
        genome_sequences = read_fasta_into_hash(pf_sequences)
        genome_labels = read_labels_from_file(pf_labels, **kwargs)
    except IOError:
        # log which genome failed, then let the caller deal with the error
        log.warning("Could not read sequence/labels files for genome: {}".format(gi.name))
        raise

    return extract_labeled_sequences(genome_sequences, genome_labels, **kwargs)
def add_gene_labels_from_file(env, df, **kwargs):
    # type: (Dict[str, Any], pd.DataFrame, Dict[str, Any]) -> None
    """Add reference-label coordinate columns to ``df`` from per-genome label files.

    For every genome in ``df["<source>-genome"]`` the labels file
    ``<pd-data>/<genome>/<fn_q_labels>`` is read and indexed by its 3' key.
    Three columns ``<source>-left-<suffix>``, ``<source>-right-<suffix>`` and
    ``<source>-strand-<suffix>`` are initialised to ``-1``, ``-1`` and ``""``.
    For each row whose own label shares a 3' key with a label from the file,
    the coordinates of that file label are written via ``df_coordinates_to_row``.

    Fixes relative to the previous version:
      * the label written to the row is the one read from the file (it used to
        be the row's own label, which only copied existing coordinates);
      * the option ``suffix_coordinates`` is honoured; the historical misspelling
        ``suffix_corrdinates`` is still accepted as a fallback.

    :param env: environment; ``env["pd-data"]`` is read
    :param df: data frame to modify in place
    :param kwargs: ``fn_q_labels`` (default ``"verified.gff"``), ``source``
        (default ``"q"``), ``suffix_coordinates`` (default ``"ref"``)
    """
    fn_q_labels = get_value(kwargs, "fn_q_labels", "verified.gff")
    source = get_value(kwargs, "source", "q")
    suffix_coordinates = get_value(
        kwargs, "suffix_coordinates",
        get_value(kwargs, "suffix_corrdinates", "ref")  # legacy spelling
    )

    from sbsp_io.labels import read_labels_from_file
    import sbsp_general.dataframe

    column_genome = "{}-genome".format(source)

    # genome -> (3' key -> label read from the file)
    genome_to_genekey_to_label = dict()
    for genome_name in set(df[column_genome]):
        pf_q_labels = os.path.join(env["pd-data"], genome_name, fn_q_labels)
        genome_to_genekey_to_label[genome_name] = {
            create_key_3prime_from_label(l): l
            for l in read_labels_from_file(pf_q_labels)
        }

    # now add to data frame
    column_left = "{}-left-{}".format(source, suffix_coordinates)
    column_right = "{}-right-{}".format(source, suffix_coordinates)
    column_strand = "{}-strand-{}".format(source, suffix_coordinates)

    df[column_left] = -1
    df[column_right] = -1
    df[column_strand] = ""

    for index, row in df.iterrows():
        curr_label = sbsp_general.dataframe.df_get_label_from_row(df, index, source)
        curr_key = create_key_3prime_from_label(curr_label)

        label_from_file = genome_to_genekey_to_label[row[column_genome]].get(curr_key)
        if label_from_file is None:
            continue

        sbsp_general.dataframe.df_coordinates_to_row(
            df, index, label_from_file, source, suffix_coordinates=suffix_coordinates)
def randomly_select_labels(pf_labels, pf_labels_percent, percent):
    # type: (str, str, float) -> None
    """Write a random ``percent`` % subset of the labels in ``pf_labels`` to a file.

    ``int(len(labels) * percent / 100)`` labels are drawn without replacement,
    sorted by ``left()`` and written with ``write_labels_to_file``.

    Indices are sampled rather than the label objects themselves, so numpy
    never has to coerce ``Label`` instances into an array. An empty input file
    yields an empty output file.

    :param pf_labels: labels file to sample from
    :param pf_labels_percent: output labels file
    :param percent: percentage (0-100) of labels to keep
    :raises ValueError: from ``np.random.choice`` when the requested sample is
        larger than the number of labels or negative
    """
    labels = list(read_labels_from_file(pf_labels))
    num_selected = int(len(labels) * percent / float(100))

    if labels:
        chosen = np.random.choice(len(labels), size=num_selected, replace=False)
    else:
        chosen = []

    new_labels = Labels(
        sorted((labels[i] for i in chosen), key=lambda l: l.left()),
    )

    write_labels_to_file(new_labels, pf_labels_percent)
def read_labels_for_multiple_tools(env, gi, list_dn_tools, list_tool_names):
    # type: (Environment, GenomeInfo, List[str], List[str]) -> Dict[str, Labels]
    """Read the prediction labels of several tools for one genome.

    Each tool's file is ``<pd-runs>/<genome>/<dn_tool>/<dn_tool>.gff``. All
    files are read with ``shift=0`` and with frameshifted, partial and
    hypothetical labels ignored. Each label set is named after its tool
    (previously every set was named ``"SBSP"`` regardless of the tool).

    :param env: environment; ``env["pd-runs"]`` is read
    :param gi: genome; ``gi.name`` selects the run directory
    :param list_dn_tools: per-tool directory names (also the gff base names)
    :param list_tool_names: display names, paired positionally with
        ``list_dn_tools``; surplus items in the longer list are ignored
    :return: tool name -> labels
    """
    common_options = {
        "shift": 0,
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "ignore_hypothetical": True,
    }

    labels = dict()
    for name, dn_tool in zip(list_tool_names, list_dn_tools):
        pf_labels = os_join(env["pd-runs"], gi.name, dn_tool, f"{dn_tool}.gff")
        labels[name] = read_labels_from_file(pf_labels, name=name, **common_options)

    return labels
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Print the first three nucleotides (5' end) of every labeled gene.

    Labels are read with ``shift=-1``. For ``+`` strand labels the three
    nucleotides starting at ``left`` are printed; otherwise the three
    nucleotides ending at ``right`` are reverse-complemented and printed.

    The codon is printed through ``str(...)`` instead of the private
    ``Seq._data`` attribute, so the output is plain text whatever the
    internal storage type is. The integer-cast coordinates are used for
    slicing.

    :param env: environment (not used here)
    :param args: parsed arguments providing ``pf_sequence`` (FASTA) and
        ``pf_labels``
    :raises KeyError: when a label's sequence name is absent from the FASTA
    """
    sequences = SeqIO.to_dict(SeqIO.parse(args.pf_sequence, "fasta"))
    labels = read_labels_from_file(args.pf_labels, shift=-1)

    for lab in labels:
        record = sequences[lab.seqname()]  # type: SeqIO.SeqRecord
        left = int(lab.left())
        right = int(lab.right())

        if lab.strand() == "+":
            codon = record[left:left + 3]
        else:
            # on the other strand the gene start is at the right coordinate
            codon = record[right - 2:right + 1].reverse_complement()

        print(str(codon.seq))
def gather_upstream_sequences_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Tabulate upstream sequences and GC content for a genome's GMS2 genes.

    ``extract_upstream_sequences`` is run on the GMS2 labels; for every item it
    returns (label at index 0, upstream fragment at index 1) one row is built.

    :param env: environment; ``pd-data`` and ``pd-runs`` are read
    :param gi: genome; ``gi.name`` selects the directories and fills ``GCFID``
    :param kwargs: not used by this function
    :return: data frame with columns ``GCFID``, ``Accession``, ``Genome GC``,
        ``Gene GC``, ``left``, ``right``, ``strand``, ``upstream_nt``
        (``left``/``right`` are the label coordinates plus one)
    """
    pf_sequences = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_labels = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")

    sequences = read_fasta_into_hash(pf_sequences)
    labels = read_labels_from_file(pf_labels)

    genome_gc = 100 * compute_gc_from_sequences(sequences)

    def make_entry(label, fragment):
        # type: (Label, Seq) -> Dict[str, Any]
        # GC of the gene itself, computed on the labeled slice only
        gene_seq = sequences[label.seqname()][label.left():label.right() + 1]
        gene_gc = 100 * compute_gc_from_sequences({"any": gene_seq})

        return {
            "GCFID": gi.name,
            "Accession": label.seqname(),
            "Genome GC": genome_gc,
            "Gene GC": gene_gc,
            "left": label.left() + 1,
            "right": label.right() + 1,
            "strand": label.strand(),
            "upstream_nt": str(fragment)
        }

    list_entries = [
        make_entry(info[0], info[1])
        for info in extract_upstream_sequences(labels, sequences)
    ]  # type: List[Dict[str, Any]]

    return pd.DataFrame(list_entries)
def analyze_predictions_on_verified_genes(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> Dict[str, Any]
    """Collect statistics of SBSP, GMS2 and NCBI predictions on verified genes.

    All label files are read with ``shift=0`` and with frameshifted and partial
    labels ignored. SBSP labels get support information from ``output.csv``
    (``add_support_to_labels``). A fourth set, named ``"GMS2=SBSP"``, is the
    result of ``LabelsComparisonDetailed(sbsp, gms2).match_3p_5p("a")``. The
    ``get_stats_*`` helpers then fill a shared ``stats`` dict.

    :param env: environment; ``env["pd-data"]`` is read
    :param gi: genome; ``gi.name`` selects the data directory and SBSP gff
    :param pd_sbsp: SBSP run directory (``accuracy/<name>.gff``, ``output.csv``)
    :param kwargs: not used by this function
    :return: the ``stats`` dict populated by the helpers
    """
    pd_gcfid = os_join(env["pd-data"], gi.name)

    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_gms2 = os_join(pd_gcfid, "runs", "gms2", "gms2.gff")
    pf_verified = os_join(pd_gcfid, "verified.gff")
    pf_ncbi = os_join(pd_gcfid, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    read_options = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_options)
    labels_verified = read_labels_from_file(pf_verified, name="Verified", **read_options)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_options)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_options)

    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")
    add_support_to_labels(labels_sbsp, df_sbsp_details)

    # labels where SBSP and GMS2 agree
    labels_sbsp_eq_gms2 = LabelsComparisonDetailed(labels_sbsp, labels_gms2).match_3p_5p("a")
    labels_sbsp_eq_gms2.name = "GMS2=SBSP"

    stats = dict()

    # 3' match statistics (the helpers write into ``stats``; order is kept)
    for labels_other in (labels_ncbi, labels_gms2, labels_sbsp):
        get_stats_a_from_b_3p(labels_verified, labels_other, stats)
    get_stats_a_from_b_3p_by_upstream(labels_verified, labels_ncbi, stats)

    # SN/SP for each tool, then for the GMS2=SBSP set
    for labels_other in (labels_sbsp, labels_ncbi, labels_gms2, labels_sbsp_eq_gms2):
        get_stats_sn_sp(labels_verified, labels_other, stats)

    # SN/SP broken down by support, then by step-group combinations
    tagged_sets = ((labels_sbsp, "SBSP"), (labels_sbsp_eq_gms2, "GMS2=SBSP"))
    for breakdown in (get_stats_sn_sp_by_support, get_stats_sn_sp_by_step_group):
        for labels_pred, tag in tagged_sets:
            breakdown(labels_verified, labels_pred, stats, tag)

    return stats
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> Dict[str, Any]
    """Count agreements between GMS2, SBSP, NCBI and (optionally) Prodigal labels.

    :param env: environment; ``pd-work`` is read, and ``pd-data`` when
        ``start_candidate_analysis`` is set
    :param pf_gms2: path to the GMS2 labels file
    :param pf_sbsp: path to the SBSP labels file
    :param pf_ncbi: path to the NCBI labels file
    :param kwargs:
        ``venn_title`` / ``pf_venn``: read but unused while the Venn plot is
        disabled;
        ``pf_prodigal`` (default ``None``): Prodigal labels file, enables the
        two Prodigal counts;
        ``start_candidate_analysis`` (default ``False``): when true,
        ``add_number_of_start_candidates_to_labels`` is applied to the GMS2
        labels using ``<pd-data>/<gcfid>/sequence.fasta``;
        ``gcfid`` (default ``None``): genome id for the option above;
        ``predicted_at_step`` (default ``None``): keep only SBSP labels whose
        ``predicted-at-step`` attribute equals this value
    :return: dict with the keys ``GMS2``, ``SBSP``, ``NCBI``, ``GMS2=SBSP``,
        ``GMS2=SBSP=NCBI`` and, when ``pf_prodigal`` is given,
        ``(GMS2=SBSP)!=Prodigal`` and ``(GMS2=SBSP)!=(NCBI=Prodigal)``
    """
    venn_title = get_value(kwargs, "venn_title", None)
    pf_venn = get_value(kwargs, "pf_venn", os.path.join(env["pd-work"], "venn.pdf"))
    pf_prodigal = get_value(kwargs, "pf_prodigal", None)

    start_candidate_analysis = get_value(kwargs, "start_candidate_analysis", False)
    gcfid = get_value(kwargs, "gcfid", None)

    predicted_at_step = get_value(kwargs, "predicted_at_step", None)

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    if start_candidate_analysis:
        # annotate GMS2 labels with their number of start candidates
        pf_sequences = os.path.join(env["pd-data"], gcfid, "sequence.fasta")
        add_number_of_start_candidates_to_labels(read_fasta_into_hash(pf_sequences),
                                                 labels_gms2)

    if predicted_at_step is not None:
        kept = [
            l for l in labels_sbsp
            if l.get_attribute_value("predicted-at-step") == predicted_at_step
        ]
        labels_sbsp = Labels(kept, name="SBSP")

    labels_gms2_sbsp = LabelsComparisonDetailed(
        labels_gms2, labels_sbsp, name_a="gms2", name_b="sbsp"
    ).intersection("a")
    labels_gms2_sbsp_ncbi = LabelsComparisonDetailed(
        labels_gms2_sbsp, labels_ncbi, name_a="gms2_sbsp", name_b="ncbi"
    ).intersection("a")

    # venn_diagram_5prime(labels_gms2, labels_sbsp, labels_ncbi, FigureOptions(
    #     title=venn_title,
    #     save_fig=pf_venn
    # ))

    summary = {
        "GMS2": len(labels_gms2),
        "SBSP": len(labels_sbsp),
        "NCBI": len(labels_ncbi),
        "GMS2=SBSP": len(labels_gms2_sbsp),
        "GMS2=SBSP=NCBI": len(labels_gms2_sbsp_ncbi),
    }

    if pf_prodigal is None:
        return summary

    labels_prodigal = read_labels_from_file(pf_prodigal, name="Prodigal")
    labels_gms2_sbsp_prodigal = LabelsComparisonDetailed(
        labels_gms2_sbsp, labels_prodigal, name_a="gms2_sbsp", name_b="prodigal"
    ).intersection("a")

    # Goal: check (GMS2=SBSP) != (Prodigal=NCBI)
    # step 1: Prodigal=NCBI
    labels_ncbi_prodigal = LabelsComparisonDetailed(
        labels_ncbi, labels_prodigal, name_a="ncbi", name_b="prodigal"
    ).match_3p_5p("a")

    # step 2: same genes in (GMS2=SBSP) and (Prodigal=NCBI)
    lcd_full = LabelsComparisonDetailed(
        labels_gms2_sbsp, labels_ncbi_prodigal,
        name_a="gms2_sbsp", name_b="ncbi_prodigal"
    )
    num_match_3p = len(lcd_full.match_3p("a"))
    num_match_3p_5p = len(lcd_full.match_3p_5p("a"))

    summary["(GMS2=SBSP)!=Prodigal"] = len(labels_gms2_sbsp) - len(labels_gms2_sbsp_prodigal)
    summary["(GMS2=SBSP)!=(NCBI=Prodigal)"] = num_match_3p - num_match_3p_5p

    return summary
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    """Copy the MSA outputs of genes where GMS2=SBSP disagrees with NCBI.

    The output directory ``<pd-work>/<genome>`` is created first. If any input
    file is missing (``FileNotFoundError``) the function returns without
    copying anything. Otherwise the labels from
    ``match_3p_5p("a")`` of GMS2 vs SBSP are compared to NCBI, the
    ``match_3p_not_5p("a")`` subset is keyed with ``create_q_key_3p``, and the
    distinct ``pf-msa-output`` files of the matching detail rows are copied.

    :param env: environment; ``pd-work`` and ``pd-runs`` are read
    :param gi: genome; ``gi.name`` selects the directories and SBSP gff
    """
    pd_genome = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_genome)

    pd_run = os_join(env["pd-runs"], gi.name)

    # label and detail files
    pf_sbsp = os_join(pd_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_run, "ncbi", "ncbi.gff")
    pf_sbsp_details = os_join(pd_run, "sbsp", "output.csv")

    read_options = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    try:
        labels_sbsp, labels_gms2, labels_ncbi = [
            read_labels_from_file(pf, name=tool, **read_options)
            for pf, tool in ((pf_sbsp, "SBSP"), (pf_gms2, "GMS2"), (pf_ncbi, "NCBI"))
        ]

        df_details = pd.read_csv(pf_sbsp_details)
        add_q_key_3p_to_df(df_details, "q-3prime")
    except FileNotFoundError:
        # nothing to collect for genomes with missing inputs
        return

    # genes where GMS2=SBSP
    labels_gms2_eq_sbsp = LabelsComparisonDetailed(
        labels_gms2, labels_sbsp, name_a="gms2", name_b="sbsp"
    ).match_3p_5p("a")

    # of those, the ones whose 5' end does not match NCBI
    labels_not_ncbi = LabelsComparisonDetailed(
        labels_gms2_eq_sbsp, labels_ncbi, name_a="gms2_eq_sbsp", name_b="ncbi"
    ).match_3p_not_5p("a")

    # msa files belonging to these labels
    wanted_keys = {
        create_q_key_3p(l.seqname(), l.left(), l.right(), l.strand())
        for l in labels_not_ncbi
    }
    is_wanted = df_details["q-3prime"].isin(wanted_keys)
    msa_files = set(df_details[is_wanted]["pf-msa-output"])

    for pf_msa_out in msa_files:
        shutil.copy(pf_msa_out, pd_genome)