Esempio n. 1
0
def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]
    """Run GMS2 with selected model components turned off and report the error.

    A modified copy of the genome's GMS2 model is written to the working
    directory by ``turn_off_components``, GMS2 is run with that model, and
    the resulting predictions are compared with the genome's verified labels.

    :param env: environment; the keys "pd-runs", "pd-data" and "pd-work" are read
    :param gi: genome info; ``gi.name`` selects the per-genome directories
    :param components_off: components forwarded to ``turn_off_components``
    :param kwargs: optional settings
        native_coding_off (default True): forwarded to ``turn_off_components``
        max_attempts (default None): maximum number of GMS2 runs to attempt;
            None keeps retrying until a run succeeds
    :return: ``{"Error": value}`` where value is 100 minus the percentage of
        ``match_3p('a')`` labels that are also in ``match_3p_5p('a')``
    :raises CalledProcessError: when ``max_attempts`` is set and every attempt failed
    :raises ZeroDivisionError: when ``match_3p('a')`` is empty
    """
    import logging

    logger = logging.getLogger(__name__)

    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)
    max_attempts = get_value(kwargs, "max_attempts", None)

    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off, native_coding_off=native_coding_off)

    # Retry failed GMS2 runs. Failures are logged instead of being silently
    # swallowed, and callers can bound the number of attempts so that a
    # deterministic failure does not spin forever.
    attempt = 0
    while True:
        attempt += 1
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            break
        except CalledProcessError as err:
            logger.warning("GMS2 prediction failed for %s (attempt %d): %s", gi.name, attempt, err)
            if max_attempts is not None and attempt >= max_attempts:
                raise

    # compare with verified
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference), read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
Esempio n. 2
0
def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> List[float]
    """Compare GMS2 accuracy on verified genes before and after swapping its motif.

    The labels produced by ``get_identital_labels`` from the GMS2 and SBSP
    predictions are used to build a new motif model, GMS2 is re-run with it,
    and both the original and the new predictions are compared with the
    verified labels.

    :param env: environment; the keys "pd-runs", "pd-data" and "pd-work" are read
    :param gi: genome info; ``gi.name`` selects the per-genome directories
    :param kwargs: optional settings
        group (default None): forwarded to ``add_toolp_rbs_to_gms2_model``
    :return: two percentages, [original GMS2, GMS2 with the new motif], each
        being 100 * len(match_3p_5p('a')) / len(match_3p('a'))
    """
    motif_group = get_value(kwargs, "group", None)

    genome = gi.name
    pd_runs = env["pd-runs"]
    pd_data = env["pd-data"]
    pd_work = env["pd-work"]

    # inputs
    pf_gms2 = os_join(pd_runs, genome, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(pd_runs, genome, "gms2", "GMS2.mod")
    pf_sbsp = os_join(pd_runs, genome, "sbsp_submission/accuracy", f"{genome}.gff")
    pf_sequence = os_join(pd_data, genome, "sequence.fasta")
    pf_verified = os_join(pd_data, genome, "verified.gff")

    # step 1: write the labels shared by GMS2 and SBSP
    pf_toolp = os_join(pd_work, "toolp.gff")
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # step 2: build a model file that carries the motif derived from those labels
    pf_toolp_mod = os_join(pd_work, "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_toolp_mod, group=motif_group)

    # step 3: predict again with the new model
    pf_repredicted = os_join(pd_work, "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_toolp_mod, pf_repredicted)

    # step 4: score both prediction sets against the verified labels
    comparison_original = LabelsComparisonDetailed(
        read_labels_from_file(pf_gms2), read_labels_from_file(pf_verified)
    )
    comparison_new = LabelsComparisonDetailed(
        read_labels_from_file(pf_repredicted), read_labels_from_file(pf_verified)
    )

    accuracies = []
    for comparison in (comparison_original, comparison_new):
        num_3p_5p = len(comparison.match_3p_5p('a'))
        num_3p = len(comparison.match_3p('a'))
        accuracies.append(100 * num_3p_5p / num_3p)

    return accuracies
def get_stats_sn_sp(labels_a, labels_b, output):
    # type: (Labels, Labels, Dict[str, Any]) -> None
    """Add label counts and 3p / 5p-3p match statistics for a vs. b to ``output``.

    Written keys: the two label-set names (falling back to "a" / "b" when a
    set has no name) mapped to their sizes, plus the number and percentage
    of 3p matches and of 5p-3p matches.

    :param labels_a: first label set
    :param labels_b: second label set
    :param output: dictionary updated in place
    """
    comparison = LabelsComparisonDetailed(labels_a, labels_b)

    name_a = "a" if labels_a.name is None else labels_a.name
    name_b = "b" if labels_b.name is None else labels_b.name

    total_a = len(labels_a)
    total_b = len(labels_b)

    num_3p = len(comparison.match_3p("a"))
    num_3p_5p = len(comparison.match_3p_5p("a"))

    # percentage of a with a 3p match; 0 when a is empty
    if total_a == 0:
        percent_3p = 0
    else:
        percent_3p = round(100 * num_3p / float(total_a), 2)

    # percentage of 3p matches that also match at 5p; 0 when there are none
    if num_3p == 0:
        percent_3p_5p = 0
    else:
        percent_3p_5p = round(100 * num_3p_5p / float(num_3p), 2)

    stats = dict()
    stats[name_a] = total_a
    stats[name_b] = total_b
    stats["Number 3p match: {} from {}".format(name_a, name_b)] = num_3p
    stats["Percentage 3p match: {} from {}".format(name_a, name_b)] = percent_3p
    stats["Number 5p-3p match: {} from {}".format(name_a, name_b)] = num_3p_5p
    stats["Percentage 5p-3p match: {} from {}".format(name_a, name_b)] = percent_3p_5p

    output.update(stats)
def get_stats_a_from_b_3p(labels_a, labels_b, output):
    # type: (Labels, Labels, Dict[str, Any]) -> None
    """Add label counts and 3p match statistics for a vs. b to ``output``.

    Written keys: the two label-set names (falling back to "a" / "b" when a
    set has no name) mapped to their sizes, plus the number and percentage
    of 3p matches.

    :param labels_a: first label set
    :param labels_b: second label set
    :param output: dictionary updated in place
    """
    lcd = LabelsComparisonDetailed(labels_a, labels_b)

    a_name = labels_a.name if labels_a.name is not None else "a"
    b_name = labels_b.name if labels_b.name is not None else "b"

    a_total = len(labels_a)
    b_total = len(labels_b)

    # computed once and reused for both the count and the percentage
    match_3p = len(lcd.match_3p("a"))

    # An empty labels_b used to raise ZeroDivisionError; report 0 instead,
    # matching the zero guards in get_stats_sn_sp.
    # NOTE(review): the denominator here is len(labels_b) whereas
    # get_stats_sn_sp writes the same key using len(labels_a) - confirm
    # which one is intended.
    percent_3p = 0 if b_total == 0 else round(100 * match_3p / float(b_total), 2)

    output.update({
        a_name:
        a_total,
        b_name:
        b_total,
        "Number 3p match: {} from {}".format(a_name, b_name):
        match_3p,
        "Percentage 3p match: {} from {}".format(a_name, b_name):
        percent_3p,
    })
def relative_entropy_analysis_for_gi_for_percent(env, pf_sequence, pf_labels,
                                                 pf_mod, pf_verified, group,
                                                 percent, pd_figures):
    # type: (Environment, str, str, str, str, str, float, str) -> Dict[str, Any]
    """Train on a random subset of labels and report relative entropy and error.

    Steps: sample ``percent`` of the labels, train a model on the sample,
    add the sample-derived motif to the GMS2 model, draw the motif logo,
    predict with the resulting model, and compare with the verified labels.

    :param env: environment; the key "pd-work" is read
    :param pf_sequence: path to the sequence file
    :param pf_labels: path to the full label file that is sampled
    :param pf_mod: path to the GMS2 model file
    :param pf_verified: path to the verified labels
    :param group: forwarded to ``train_and_create_models``
    :param percent: forwarded to ``randomly_select_labels``; also used
        (as a string) when drawing the logo
    :param pd_figures: forwarded to ``logo_rbs_from_gms2_mod_file``
    :return: dictionary with the keys "RE", "RE Motif", "RE Spacer" and "Error"
    """
    work_dir = env["pd-work"]
    pf_sampled_labels = os_join(work_dir, "labels_percent.lst")
    pf_sampled_mod = os_join(work_dir, "model_percent.mod")
    pf_predictions = os_join(work_dir, "labels_predict.lst")

    # 1) randomly select percent of labels
    randomly_select_labels(pf_labels, pf_sampled_labels, percent)

    # 2) train a new model on the selected labels
    trained_model = train_and_create_models(
        env,
        pf_sequences=pf_sequence,
        pf_labels=pf_sampled_labels,
        group=group,
        clean=False,
        pf_mod=pf_sampled_mod,
    )

    # 3) add the motif from the selected labels to the GMS2 model
    add_toolp_rbs_to_gms2_model(
        env, pf_sequence, pf_sampled_labels, pf_mod, pf_sampled_mod
    )

    logo_rbs_from_gms2_mod_file(pd_figures, pf_sampled_mod, str(percent))

    # 4) run prediction with the new model
    run_gms2_prediction_with_model(pf_sequence, pf_sampled_mod, pf_predictions)

    # 5) compare predictions with the verified labels
    predicted = read_labels_from_file(pf_predictions)
    verified = read_labels_from_file(pf_verified)
    comparison = LabelsComparisonDetailed(predicted, verified)

    model_items = trained_model.items
    motif = MotifModel(model_items["RBS_MAT"], model_items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(model_items["NON_MAT"])

    re_total = relative_entropy(motif, noncoding)
    re_motif = relative_entropy(motif, noncoding, component="motif")
    re_spacer = relative_entropy(motif, noncoding, component="spacer")

    num_3p_5p = len(comparison.match_3p_5p('a'))
    num_3p = len(comparison.match_3p('a'))

    return {
        "RE": re_total,
        "RE Motif": re_motif,
        "RE Spacer": re_spacer,
        "Error": 100 - 100 * num_3p_5p / num_3p,
    }
Esempio n. 6
0
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    """Print label counts for GMS2, SBSP, NCBI and their intersections; draw a Venn diagram.

    :param env: environment; the key "pd-work" is read for the default figure path
    :param pf_gms2: path to the GMS2 labels
    :param pf_sbsp: path to the SBSP labels
    :param pf_ncbi: path to the NCBI labels
    :param kwargs: optional settings
        venn_title (default None): title of the Venn diagram
        pf_venn (default <pd-work>/venn.pdf): where the diagram is saved
    """
    title = get_value(kwargs, "venn_title", None)
    pf_figure = get_value(kwargs, "pf_venn",
                          os.path.join(env["pd-work"], "venn.pdf"))

    gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    # labels shared by GMS2 and SBSP
    gms2_vs_sbsp = LabelsComparisonDetailed(
        gms2, sbsp, name_a="gms2", name_b="sbsp"
    )
    shared_gms2_sbsp = gms2_vs_sbsp.intersection("a")

    # of those, the labels also shared with NCBI
    shared_vs_ncbi = LabelsComparisonDetailed(
        shared_gms2_sbsp, ncbi, name_a="gms2_sbsp", name_b="ncbi"
    )
    shared_all = shared_vs_ncbi.intersection("a")

    counts = (
        len(gms2),
        len(sbsp),
        len(ncbi),
        len(shared_gms2_sbsp),
        len(shared_all),
    )

    report = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    report += "\n{},{},{},{},{}".format(*counts)
    print(report)

    figure_options = FigureOptions(title=title, save_fig=pf_figure)
    venn_diagram_5prime(gms2, sbsp, ncbi, figure_options)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare two label files and run the detailed-comparison visualisation.

    :param env: environment; the key "pd-work" is passed to the visualisation
    :param args: parsed arguments; reads pf_a, pf_b, name_a, name_b, tag and
        split_on_attributes
    """
    first = read_labels_from_file(args.pf_a)
    second = read_labels_from_file(args.pf_b)

    comparison = LabelsComparisonDetailed(first,
                                          second,
                                          name_a=args.name_a,
                                          name_b=args.name_b,
                                          tag=args.tag,
                                          split_on_attributes=args.split_on_attributes)

    visualizer = LabelsComparisonDetailedViz(comparison)
    visualizer.run(env["pd-work"])
Esempio n. 8
0
def pipeline_step_compute_accuracy(env, df, pipeline_options):
    # type: (Environment, pd.DataFrame, PipelineSBSPOptions) -> pd.DataFrame
    """Annotate predictions with reference information and write an accuracy file.

    For every genome in the "q-genome" column the reference labels are read
    and handed to ``df_add_is_true_start`` / 
    ``df_add_distance_between_predicted_and_true`` together with ``df``.
    The predicted labels are then written per genome via ``df_print_labels``,
    compared with the reference labels, and the stringified accuracies are
    written to ``<pd-work>/<fn-compare>``.

    :param env: environment; the keys "pd-data" and "pd-work" are read
    :param df: data frame with a "q-genome" column; passed to the ``df_add_*``
        helpers (presumably modified in place - confirm against the helpers)
    :param pipeline_options: options; the keys "fn-q-labels-compare" and
        "fn-compare" are read
    :return: the same ``df`` object that was passed in
    """

    from sbsp_io.labels import read_labels_from_file

    # NOTE(review): each iteration passes the whole df, not only the rows of
    # `genome` - presumably the helpers match rows against the labels; confirm.
    for genome in set(df["q-genome"]):
        pf_q_labels_true = os.path.join(env["pd-data"], genome, pipeline_options["fn-q-labels-compare"])

        labels = read_labels_from_file(pf_q_labels_true, shift=0)

        df_add_is_true_start(df, labels, "q-", "is-true",
                                                    coordinates_suffix="-sbsp")
        df_add_distance_between_predicted_and_true(
            df, labels, "q-", "distance-to-true",
            coordinates_suffix="-sbsp")


    # get labels: mapping from genome to the label file produced for it
    genome_to_pf_labels = df_print_labels(env, df, "q", suffix_coordinates="sbsp",
                                           suffix_fname="")

    # compute accuracies per genome
    from sbsp_general.labels_comparison import LabelsComparison
    genome_to_comparison = dict()

    for genome in genome_to_pf_labels:
        pf_q_labels_true = os.path.join(env["pd-data"], genome, pipeline_options["fn-q-labels-compare"])

        genome_to_comparison[genome] = LabelsComparison(env, pf_q_labels_true, genome_to_pf_labels[genome])

        # note: these reads use the default shift, unlike shift=0 in the loop above
        labels_a = read_labels_from_file(pf_q_labels_true)
        labels_b = read_labels_from_file(genome_to_pf_labels[genome])

        # NOTE(review): lcd is only consumed by the disabled visualisation below
        lcd = LabelsComparisonDetailed(labels_a, labels_b,
                                       name_a="Reference",
                                       name_b="SBSP",
                                       tag=genome,
                                       split_on_attributes=["predicted-at-step"])

        # LabelsComparisonDetailedViz(lcd).run(env["pd-work"])

    # write the comma-separated accuracies to the working directory
    accuracy = LabelsComparison.stringify_genome_accuracies(genome_to_comparison, ",")
    import sbsp_io.general
    pf_accuracy = os.path.join(env["pd-work"], pipeline_options["fn-compare"])
    sbsp_io.general.write_string_to_file(accuracy, pf_accuracy)

    return df
    def get_comparisons_per_tag(self, attribute_info=None):
        # type: (Union[Tuple[str, Any], None]) -> Dict[str, Dict[str, Any]]
        """Collect one comparison per tag, optionally restricted to an attribute value.

        :param attribute_info: None to take each tag's "all" comparison, or a
            (name, value) pair to take the comparison stored under
            comparison["attribute"][name][value]
        :return: dictionary from tag to the selected comparison.
            NOTE(review): when the requested name/value is absent for a tag,
            the entry is a LabelsComparisonDetailed built from two empty
            Labels rather than one of its ``comparison`` entries - confirm
            that callers expect this.
        """
        per_tag = dict()

        for tag, lcd in self.tag_to_lcd.items():
            # no attribute requested: use the overall comparison
            if attribute_info is None:
                per_tag[tag] = lcd.comparison["all"]
                continue

            attr_name, attr_value = attribute_info
            by_attribute = lcd.comparison["attribute"]

            if attr_name not in by_attribute or attr_value not in by_attribute[attr_name]:
                # nothing recorded for this attribute value under this tag
                per_tag[tag] = LabelsComparisonDetailed(Labels(), Labels())
            else:
                per_tag[tag] = by_attribute[attr_name][attr_value]

        return per_tag
def analyze_predictions_on_verified_genes(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> Dict[str, Any]
    """Collect statistics of SBSP, GMS2 and NCBI labels against verified genes.

    :param env: environment; the key "pd-data" is read
    :param gi: genome info; ``gi.name`` selects the genome directory and the
        SBSP accuracy file
    :param pd_sbsp: SBSP run directory holding "accuracy/<name>.gff" and "output.csv"
    :param kwargs: accepted but not used here
    :return: dictionary of statistics filled by the ``get_stats_*`` helpers
    """
    pd_genome = os_join(env["pd-data"], gi.name)

    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_gms2 = os_join(pd_genome, "runs", "gms2", "gms2.gff")
    pf_verified = os_join(pd_genome, "verified.gff")
    pf_ncbi = os_join(pd_genome, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    def load_labels(pf_labels, name):
        # every label file is read with the same filtering options
        return read_labels_from_file(pf_labels,
                                     name=name,
                                     ignore_frameshifted=True,
                                     ignore_partial=True,
                                     shift=0)

    labels_sbsp = load_labels(pf_sbsp, "SBSP")
    labels_verified = load_labels(pf_verified, "Verified")
    labels_gms2 = load_labels(pf_gms2, "GMS2")
    labels_ncbi = load_labels(pf_ncbi, "NCBI")

    # attach support information from the SBSP details table to the SBSP labels
    details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(details, "q-key-3p")
    add_support_to_labels(labels_sbsp, details)

    # SBSP labels that GMS2 reproduces at both ends
    sbsp_vs_gms2 = LabelsComparisonDetailed(labels_sbsp, labels_gms2)
    labels_agree = sbsp_vs_gms2.match_3p_5p("a")
    labels_agree.name = "GMS2=SBSP"

    stats = dict()

    # The call order below is kept as is: later helpers may overwrite keys
    # written by earlier ones.

    # 3prime match statistics
    for other in (labels_ncbi, labels_gms2, labels_sbsp):
        get_stats_a_from_b_3p(labels_verified, other, stats)
    get_stats_a_from_b_3p_by_upstream(labels_verified, labels_ncbi, stats)

    # SN SP, ending with the GMS2=SBSP set
    for other in (labels_sbsp, labels_ncbi, labels_gms2, labels_agree):
        get_stats_sn_sp(labels_verified, other, stats)

    # breakdowns: first by support, then by step group
    tagged_sets = ((labels_sbsp, "SBSP"), (labels_agree, "GMS2=SBSP"))
    for breakdown in (get_stats_sn_sp_by_support, get_stats_sn_sp_by_step_group):
        for other, tag in tagged_sets:
            breakdown(labels_verified, other, stats, tag)

    return stats
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    """Copy MSA outputs of genes where GMS2 and SBSP agree but NCBI's 5' end differs.

    The files are copied into ``<pd-work>/<genome name>``. Nothing is copied
    when any of the input files is missing.

    :param env: environment; the keys "pd-work" and "pd-runs" are read
    :param gi: genome info; ``gi.name`` selects the directories
    """
    pd_output = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_output)

    pd_genome_run = os_join(env["pd-runs"], gi.name)

    # input files
    pf_sbsp = os_join(pd_genome_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_genome_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_genome_run, "ncbi", "ncbi.gff")
    pf_details = os_join(pd_genome_run, "sbsp", "output.csv")

    read_options = dict(ignore_frameshifted=True, ignore_partial=True, shift=0)

    try:
        sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_options)
        gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_options)
        ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_options)
        details = pd.read_csv(pf_details)
        add_q_key_3p_to_df(details, "q-3prime")
    except FileNotFoundError:
        # an input is missing for this genome: nothing to collect
        return

    # genes where GMS2=SBSP
    gms2_vs_sbsp = LabelsComparisonDetailed(gms2,
                                            sbsp,
                                            name_a="gms2",
                                            name_b="sbsp")
    agreed = gms2_vs_sbsp.match_3p_5p("a")

    # of those, genes matching NCBI at 3' but not at 5'
    agreed_vs_ncbi = LabelsComparisonDetailed(agreed,
                                              ncbi,
                                              name_a="gms2_eq_sbsp",
                                              name_b="ncbi")
    disagree_with_ncbi = agreed_vs_ncbi.match_3p_not_5p("a")

    # keys identifying these genes in the details table
    keys_3p = set()
    for label in disagree_with_ncbi:
        keys_3p.add(
            create_q_key_3p(label.seqname(), label.left(), label.right(),
                            label.strand()))

    selected_rows = details[details["q-3prime"].isin(keys_3p)]

    # copy each distinct MSA output file once
    for pf_msa in set(selected_rows["pf-msa-output"]):
        shutil.copy(pf_msa, pd_output)
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> Dict[str, Any]
    """Count GMS2, SBSP and NCBI labels and their intersections.

    :param env: environment; "pd-work" is read, and "pd-data" when
        start_candidate_analysis is enabled
    :param pf_gms2: path to the GMS2 labels
    :param pf_sbsp: path to the SBSP labels
    :param pf_ncbi: path to the NCBI labels
    :param kwargs: optional settings
        venn_title, pf_venn: read but unused while the Venn diagram is disabled
        pf_prodigal (default None): path to Prodigal labels; enables the
            Prodigal entries in the result
        start_candidate_analysis (default False): add the number of start
            candidates to the GMS2 labels (reads the sequence of ``gcfid``)
        gcfid (default None): genome directory name under "pd-data"
        predicted_at_step (default None): keep only the SBSP labels whose
            "predicted-at-step" attribute equals this value
    :return: dictionary with the keys "GMS2", "SBSP", "NCBI", "GMS2=SBSP",
        "GMS2=SBSP=NCBI", plus "(GMS2=SBSP)!=Prodigal" and
        "(GMS2=SBSP)!=(NCBI=Prodigal)" when pf_prodigal is given
    """
    # kept although the Venn diagram call is currently disabled
    venn_title = get_value(kwargs, "venn_title", None)
    pf_venn = get_value(kwargs, "pf_venn",
                        os.path.join(env["pd-work"], "venn.pdf"))

    pf_prodigal = get_value(kwargs, "pf_prodigal", None)
    with_start_candidates = get_value(kwargs, "start_candidate_analysis",
                                      False)
    gcfid = get_value(kwargs, "gcfid", None)
    step = get_value(kwargs, "predicted_at_step", None)

    gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    if with_start_candidates:
        # add number of start candidates per gene
        pf_sequence = os.path.join(env["pd-data"], gcfid, "sequence.fasta")
        sequences = read_fasta_into_hash(pf_sequence)
        add_number_of_start_candidates_to_labels(sequences, gms2)

    if step is not None:
        kept = [
            label for label in sbsp
            if label.get_attribute_value("predicted-at-step") == step
        ]
        sbsp = Labels(kept, name="SBSP")

    # labels shared by GMS2 and SBSP
    gms2_vs_sbsp = LabelsComparisonDetailed(gms2,
                                            sbsp,
                                            name_a="gms2",
                                            name_b="sbsp")
    shared_gms2_sbsp = gms2_vs_sbsp.intersection("a")

    # of those, the labels also shared with NCBI
    shared_vs_ncbi = LabelsComparisonDetailed(shared_gms2_sbsp,
                                              ncbi,
                                              name_a="gms2_sbsp",
                                              name_b="ncbi")
    shared_all = shared_vs_ncbi.intersection("a")

    prodigal_info = dict()
    if pf_prodigal is not None:
        prodigal = read_labels_from_file(pf_prodigal, name="Prodigal")

        shared_vs_prodigal = LabelsComparisonDetailed(shared_gms2_sbsp,
                                                      prodigal,
                                                      name_a="gms2_sbsp",
                                                      name_b="prodigal")
        shared_with_prodigal = shared_vs_prodigal.intersection("a")

        # Goal: check (GMS2=SBSP) != (Prodigal=NCBI)

        # step 1: Prodigal=NCBI
        ncbi_vs_prodigal = LabelsComparisonDetailed(ncbi,
                                                    prodigal,
                                                    name_a="ncbi",
                                                    name_b="prodigal")
        ncbi_eq_prodigal = ncbi_vs_prodigal.match_3p_5p("a")

        # step 2: same genes in (GMS2=SBSP) and (Prodigal=NCBI)
        pair_vs_pair = LabelsComparisonDetailed(shared_gms2_sbsp,
                                                ncbi_eq_prodigal,
                                                name_a="gms2_sbsp",
                                                name_b="ncbi_prodigal")
        num_3p = len(pair_vs_pair.match_3p("a"))
        num_3p_5p = len(pair_vs_pair.match_3p_5p("a"))

        prodigal_info["(GMS2=SBSP)!=Prodigal"] = (
            len(shared_gms2_sbsp) - len(shared_with_prodigal))
        prodigal_info["(GMS2=SBSP)!=(NCBI=Prodigal)"] = num_3p - num_3p_5p

    summary = {
        "GMS2": len(gms2),
        "SBSP": len(sbsp),
        "NCBI": len(ncbi),
        "GMS2=SBSP": len(shared_gms2_sbsp),
        "GMS2=SBSP=NCBI": len(shared_all),
    }
    summary.update(prodigal_info)

    return summary