Example #1
0
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    """Print label counts for GMS2, SBSP and NCBI and draw their Venn diagram.

    The three label files are read, two ``LabelsComparisonDetailed``
    comparisons are chained (GMS2 vs SBSP, then that result vs NCBI), a
    two-line CSV summary (header + counts) is printed to stdout, and
    ``venn_diagram_5prime`` is called on the three label sets.

    Args:
        env: Environment; ``env["pd-work"]`` supplies the default figure
            directory.
        pf_gms2: Path to the GMS2 labels file.
        pf_sbsp: Path to the SBSP labels file.
        pf_ncbi: Path to the NCBI labels file.
        **kwargs: Optional settings:
            venn_title: Title given to the figure options (default None).
            pf_venn: Path where the figure is saved
                (default ``<pd-work>/venn.pdf``).
    """
    title = get_value(kwargs, "venn_title", None)
    default_pf_venn = os.path.join(env["pd-work"], "venn.pdf")
    pf_figure = get_value(kwargs, "pf_venn", default_pf_venn)

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    # "a"-side intersection of GMS2 with SBSP.
    # NOTE(review): presumably labels agreeing on both gene ends -- confirm
    # against LabelsComparisonDetailed.intersection.
    in_gms2_and_sbsp = LabelsComparisonDetailed(
        labels_gms2, labels_sbsp, name_a="gms2", name_b="sbsp"
    ).intersection("a")

    # Same operation again, now against NCBI.
    in_all_three = LabelsComparisonDetailed(
        in_gms2_and_sbsp, labels_ncbi, name_a="gms2_sbsp", name_b="ncbi"
    ).intersection("a")

    counts = [
        len(labels_gms2),
        len(labels_sbsp),
        len(labels_ncbi),
        len(in_gms2_and_sbsp),
        len(in_all_three),
    ]
    header = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    print(header + "\n" + ",".join(str(count) for count in counts))

    figure_options = FigureOptions(title=title, save_fig=pf_figure)
    venn_diagram_5prime(labels_gms2, labels_sbsp, labels_ncbi, figure_options)
Example #2
0
def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]
    """Run GMS2 with some model components switched off and report its error.

    A modified copy of the genome's ``GMS2.mod`` is written via
    ``turn_off_components``, a prediction is run with it, and the prediction
    is compared against the genome's ``verified.gff``.

    Args:
        env: Environment providing the "pd-runs", "pd-data" and "pd-work"
            directories.
        gi: Genome whose ``name`` selects the input files.
        components_off: Components handed to ``turn_off_components``.
        **kwargs: Optional settings:
            native_coding_off: Forwarded to ``turn_off_components``
                (default True).
            max_attempts: Maximum number of prediction attempts. ``None``
                (default) retries without limit, which is the historical
                behaviour.

    Returns:
        ``{"Error": e}`` with
        ``e = 100 - 100 * len(match_3p_5p('a')) / len(match_3p('a'))``.

    Raises:
        CalledProcessError: Only when ``max_attempts`` is set and the last
            allowed attempt failed.
        ZeroDivisionError: When ``match_3p('a')`` is empty.
    """
    import logging

    retry_logger = logging.getLogger(__name__)

    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)
    max_attempts = get_value(kwargs, "max_attempts", None)

    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off, native_coding_off=native_coding_off)

    attempt = 0
    while True:
        attempt += 1
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            break
        except CalledProcessError as error:
            # Failures used to be swallowed silently, which made a
            # deterministic failure look like a hang. Log every failure and
            # let the caller bound the number of retries.
            retry_logger.warning(
                "GMS2 prediction failed for %s (attempt %d): %s",
                gi.name, attempt, error)
            if max_attempts is not None and attempt >= max_attempts:
                raise

    # compare with verified
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference), read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
Example #3
0
def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> List[float]
    """Compare GMS2 accuracy before and after adding a toolp-derived RBS model.

    Steps: write the "toolp" labels from the GMS2 and SBSP predictions
    (``get_identital_labels``), add an RBS model built from them to the
    GMS2 model file, re-run the prediction, and compare both the original
    and the new predictions against ``verified.gff``.

    Args:
        env: Environment providing "pd-runs", "pd-data" and "pd-work".
        gi: Genome whose ``name`` selects the input files.
        **kwargs: ``group`` (default None) is forwarded to
            ``add_toolp_rbs_to_gms2_model``.

    Returns:
        Two values, ``[original GMS2, GMS2 with new model]``, each computed
        as ``100 * len(match_3p_5p('a')) / len(match_3p('a'))``.
    """
    group = get_value(kwargs, "group", None)

    # inputs
    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_sbsp = os_join(env["pd-runs"], gi.name, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # toolp predictions are written to pf_toolp
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # new model file = GMS2 model + RBS model derived from toolp labels
    pf_new_mod = os_join(env["pd-work"], "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod, group=group)

    # prediction with the new model
    pf_new_pred = os_join(env["pd-work"], "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred)

    # build both comparisons first, then score them
    comparisons = [
        LabelsComparisonDetailed(read_labels_from_file(pf_pred), read_labels_from_file(pf_verified))
        for pf_pred in (pf_gms2, pf_new_pred)
    ]

    accuracies = []
    for comparison in comparisons:
        num_3p_5p = len(comparison.match_3p_5p('a'))
        num_3p = len(comparison.match_3p('a'))
        accuracies.append(100 * num_3p_5p / num_3p)

    return accuracies
Example #4
0
def compare_gms2_sbsp_ncbi_for_genome_list(env, gil, gcfid_to_pd_sbsp, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None
    """Collect SBSP MSA outputs where SBSP equals GMS2, split by NCBI agreement.

    For every genome, the first row of each "q-key" group of the SBSP
    ``output.csv`` is examined. Queries whose 3' key is present in both GMS2
    and NCBI and whose SBSP label matches GMS2 (per
    ``labels_match_5prime_3prime``) have their "pf-msa-output" path put in
    one of two lists, depending on whether the SBSP label also matches NCBI.
    The listed files are then copied into ``<pd-work>/sbsp_gms2_ncbi`` and
    ``<pd-work>/sbsp_gms2_not_ncbi``.

    Args:
        env: Environment providing "pd-data" and "pd-work".
        gil: Iterable of genomes; each item's ``name`` is used.
        gcfid_to_pd_sbsp: Maps a genome name to the directory holding its
            SBSP ``output.csv``.
        pf_output_summary: Not used by this function.
        **kwargs: Not used by this function.
    """
    list_pf_gms2_sbsp_not_ncbi = list()
    list_pf_gms2_sbsp_ncbi = list()

    for gi in gil:
        logger.info("{}".format(gi.name))
        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_gms2 = os.path.join(pd_genome, "runs", "gms2", "gms2.gff")
        pf_ncbi = os.path.join(pd_genome, "ncbi.gff")
        pf_sbsp_details = os.path.join(gcfid_to_pd_sbsp[gi.name], "output.csv")

        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

        key_3prime_to_label_gms2 = map_key_to_labels(labels_gms2)
        key_3prime_to_label_ncbi = map_key_to_labels(labels_ncbi)

        df_sbsp = pd.read_csv(pf_sbsp_details, header=0)

        # one representative row per query
        first_row_per_query = df_sbsp.groupby("q-key", as_index=False).agg("first")

        for _, row in first_row_per_query.iterrows():

            q_key_3prime = create_3prime_key_from_fields(
                accession=row["q-accession"], left=row["q-left-sbsp"], right=row["q-right-sbsp"],
                strand=row["q-strand-sbsp"]
            )

            # the gene must be present in both GMS2 and NCBI
            if (q_key_3prime not in key_3prime_to_label_gms2
                    or q_key_3prime not in key_3prime_to_label_ncbi):
                continue

            # CSV coordinates are shifted by -1 when building the label
            # (presumably 1-based -> 0-based; confirm against the CSV writer)
            label_sbsp = Label(
                Coordinates(row["q-left-sbsp"]-1, row["q-right-sbsp"]-1, row["q-strand-sbsp"]),
                seqname=row["q-accession"]
            )

            # SBSP must match GMS2
            if not labels_match_5prime_3prime(label_sbsp, key_3prime_to_label_gms2[q_key_3prime]):
                continue

            if labels_match_5prime_3prime(label_sbsp, key_3prime_to_label_ncbi[q_key_3prime]):
                list_pf_gms2_sbsp_ncbi.append(row["pf-msa-output"])
            else:
                list_pf_gms2_sbsp_not_ncbi.append(row["pf-msa-output"])

    pd_gms2_sbsp_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_ncbi")
    pd_gms2_sbsp_not_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_not_ncbi")

    mkdir_p(pd_gms2_sbsp_ncbi)
    mkdir_p(pd_gms2_sbsp_not_ncbi)

    # copy files
    copy_files_with_new_indexing(list_pf_gms2_sbsp_ncbi, pd_gms2_sbsp_ncbi)
    copy_files_with_new_indexing(list_pf_gms2_sbsp_not_ncbi, pd_gms2_sbsp_not_ncbi)
def analysis_per_query_for_genome(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> pd.DataFrame
    """Build a per-gene analysis table across SBSP, GMS2, NCBI and Prodigal.

    Labels of all four sources are indexed by their 3' key. For every key in
    the union of the four indices, ``analyze_query`` produces one entry; the
    entries become the rows of the returned data frame.

    Args:
        env: Environment providing "pd-data".
        gi: Genome whose ``name`` selects the input files.
        pd_sbsp: SBSP run directory containing ``accuracy/<name>.gff`` and
            ``output.csv``.
        **kwargs: Not used by this function.

    Returns:
        One row per 3' key; an empty data frame when there are no keys.
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_gms2 = os_join(pd_genome, "runs", "gms2", "gms2.gff")
    pf_prodigal = os_join(pd_genome, "runs", "prodigal", "prodigal.gff")
    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_ncbi = os_join(pd_genome, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    # Read all input and sbsp prediction details
    common_options = {"shift": 0}
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **common_options)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **common_options)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **common_options)
    labels_prodigal = read_labels_from_file(pf_prodigal,
                                            name="Prodigal",
                                            **common_options)
    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")

    # get keys per label
    key_to_label_sbsp = map_key_3p_to_label(labels_sbsp)
    key_to_label_gms2 = map_key_3p_to_label(labels_gms2)
    key_to_label_ncbi = map_key_3p_to_label(labels_ncbi)
    key_to_label_prodigal = map_key_3p_to_label(labels_prodigal)
    key_to_df_sbsp_details = map_key_3p_to_df_group(df_sbsp_details)

    # One row per gene (3prime end), for all genes in the union set of
    # SBSP, GMS2, NCBI, and Prodigal
    all_key_3p = set(key_to_label_sbsp).union(
        key_to_label_gms2, key_to_label_ncbi, key_to_label_prodigal)

    list_analysis = [
        analyze_query(key, key_to_label_sbsp,
                      key_to_label_gms2, key_to_label_ncbi,
                      key_to_label_prodigal,
                      key_to_df_sbsp_details)
        for key in all_key_3p
    ]

    if not list_analysis:
        return pd.DataFrame()

    return pd.DataFrame(list_analysis)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare two label files and run the detailed-comparison visualization.

    Args:
        env: Environment; ``env["pd-work"]`` is passed to the visualizer's
            ``run``.
        args: Parsed arguments providing ``pf_a``, ``pf_b``, ``name_a``,
            ``name_b``, ``tag`` and ``split_on_attributes``.
    """
    comparison = LabelsComparisonDetailed(
        read_labels_from_file(args.pf_a),
        read_labels_from_file(args.pf_b),
        name_a=args.name_a,
        name_b=args.name_b,
        tag=args.tag,
        split_on_attributes=args.split_on_attributes,
    )

    visualizer = LabelsComparisonDetailedViz(comparison)
    visualizer.run(env["pd-work"])
def get_sequences_for_single_genome(env, gcfid):
    # type: (Environment, str) -> List[Dict[str, Any]]
    """Collect one entry per usable NCBI label of a genome.

    Labels that are partial, frameshifted, or whose "product" attribute
    contains "hypothetical" are skipped. For the remaining labels
    ``get_entry_for_label`` is called; entries that are not ``None`` are
    kept. The ``tag`` given to each call is the number of entries kept so
    far.

    Args:
        env: Environment providing "pd-data".
        gcfid: Genome directory name under ``env["pd-data"]``.

    Returns:
        The kept entries, in label order.
    """
    pd_gcfid = os.path.join(env["pd-data"], gcfid)

    sequences = read_fasta_into_hash(os.path.join(pd_gcfid, "sequence.fasta"))
    labels = read_labels_from_file(os.path.join(pd_gcfid, "ncbi.gff"))

    entries = []
    for lab in labels:
        if lab.is_partial() or lab.is_frameshifted():
            continue

        # not hypothetical
        product = lab.get_attribute_value("product")
        if product is not None and "hypothetical" in product:
            continue

        # the tag only advances when an entry is actually kept
        entry = get_entry_for_label(sequences, lab, tag=len(entries))
        if entry is not None:
            entries.append(entry)

    return entries
Example #8
0
def pipeline_step_compute_accuracy(env, df, pipeline_options):
    # type: (Environment, pd.DataFrame, PipelineSBSPOptions) -> pd.DataFrame
    """Annotate predictions with true-start information and write accuracies.

    For every genome in ``df["q-genome"]`` the reference labels are read and
    passed to ``df_add_is_true_start`` / ``df_add_distance_between_predicted_and_true``
    (columns "is-true" and "distance-to-true"). The predicted labels are then
    written out with ``df_print_labels``, compared per genome with
    ``LabelsComparison``, and the stringified accuracies are written to
    ``<pd-work>/<fn-compare>``.

    Args:
        env: Environment providing "pd-data" and "pd-work".
        df: Data frame of predictions; must contain "q-genome".
        pipeline_options: Provides "fn-q-labels-compare" (reference labels
            file name) and "fn-compare" (accuracy output file name).

    Returns:
        The same ``df`` object that was passed in.
    """
    from sbsp_io.labels import read_labels_from_file
    from sbsp_general.labels_comparison import LabelsComparison
    import sbsp_io.general

    def _pf_true_labels(genome):
        # Path of the reference labels file of a genome.
        return os.path.join(env["pd-data"], genome, pipeline_options["fn-q-labels-compare"])

    # NOTE(review): each iteration passes the whole df together with a single
    # genome's labels; whether the helpers restrict themselves to that
    # genome's rows is not visible here -- confirm.
    for genome in set(df["q-genome"]):
        labels = read_labels_from_file(_pf_true_labels(genome), shift=0)

        df_add_is_true_start(df, labels, "q-", "is-true",
                             coordinates_suffix="-sbsp")
        df_add_distance_between_predicted_and_true(
            df, labels, "q-", "distance-to-true",
            coordinates_suffix="-sbsp")

    # get labels
    genome_to_pf_labels = df_print_labels(env, df, "q", suffix_coordinates="sbsp",
                                          suffix_fname="")

    # The former per-genome LabelsComparisonDetailed (two extra file reads
    # plus a comparison) was dropped: its result was never used.
    genome_to_comparison = dict()
    for genome in genome_to_pf_labels:
        genome_to_comparison[genome] = LabelsComparison(
            env, _pf_true_labels(genome), genome_to_pf_labels[genome])

    # print accuracies
    accuracy = LabelsComparison.stringify_genome_accuracies(genome_to_comparison, ",")
    pf_accuracy = os.path.join(env["pd-work"], pipeline_options["fn-compare"])
    sbsp_io.general.write_string_to_file(accuracy, pf_accuracy)

    return df
def relative_entropy_analysis_for_gi_for_percent(env, pf_sequence, pf_labels,
                                                 pf_mod, pf_verified, group,
                                                 percent, pd_figures):
    # type: (Environment, str, str, str, str, str, float, str) -> Dict[str, Any]
    """Train on a random subset of labels and report relative entropy and error.

    Steps: sample ``percent`` percent of the labels, train models on the
    sample, add the resulting RBS model to the GMS2 model, draw the RBS logo,
    predict with the new model, and compare the prediction with the verified
    labels.

    Args:
        env: Environment; all intermediate files go to ``env["pd-work"]``.
        pf_sequence: Path to the sequence file.
        pf_labels: Path to the labels that are sampled from.
        pf_mod: Path to the GMS2 model passed to
            ``add_toolp_rbs_to_gms2_model``.
        pf_verified: Path to the verified labels.
        group: Forwarded to ``train_and_create_models``.
        percent: Percentage of labels to sample.
        pd_figures: Directory passed to ``logo_rbs_from_gms2_mod_file``.

    Returns:
        Dict with keys "RE", "RE Motif", "RE Spacer" (from
        ``relative_entropy``) and "Error"
        (``100 - 100 * len(match_3p_5p('a')) / len(match_3p('a'))``).
    """
    pd_work = env["pd-work"]
    pf_labels_percent = os_join(pd_work, "labels_percent.lst")
    pf_mod_percent = os_join(pd_work, "model_percent.mod")
    pf_labels_predict = os_join(pd_work, "labels_predict.lst")

    # 1) randomly select percent of labels
    randomly_select_labels(pf_labels, pf_labels_percent, percent)

    # 2) train new model on the sampled labels
    mod_percent = train_and_create_models(
        env,
        pf_sequences=pf_sequence,
        pf_labels=pf_labels_percent,
        group=group,
        clean=False,
        pf_mod=pf_mod_percent,
    )

    # 3) add RBS to GMS2 model, written to pf_mod_percent
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_labels_percent, pf_mod, pf_mod_percent)

    logo_rbs_from_gms2_mod_file(pd_figures, pf_mod_percent, str(percent))

    # 4) run prediction with new model
    run_gms2_prediction_with_model(pf_sequence, pf_mod_percent, pf_labels_predict)

    # 5) compare predictions with verified labels
    comparison = LabelsComparisonDetailed(
        read_labels_from_file(pf_labels_predict),
        read_labels_from_file(pf_verified),
    )

    model_items = mod_percent.items
    motif_model = MotifModel(model_items["RBS_MAT"], model_items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(model_items["NON_MAT"])

    re_total = relative_entropy(motif_model, noncoding)
    re_motif = relative_entropy(motif_model, noncoding, component="motif")
    re_spacer = relative_entropy(motif_model, noncoding, component="spacer")
    error = 100 - 100 * len(comparison.match_3p_5p('a')) / len(comparison.match_3p('a'))

    return {
        "RE": re_total,
        "RE Motif": re_motif,
        "RE Spacer": re_spacer,
        "Error": error,
    }
def extract_labeled_sequences_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> Dict[str, Seq]
    """Extract the labeled sequences of one genome.

    Args:
        env: Environment used to locate the genome's files.
        gi: Genome to process.
        **kwargs: Forwarded unchanged to ``get_pf_labels_for_genome``,
            ``read_labels_from_file`` and ``extract_labeled_sequences``.

    Returns:
        Whatever ``extract_labeled_sequences`` returns for the genome's
        sequences and labels.

    Raises:
        IOError: Re-raised, after logging a warning, when the sequence or
            labels file cannot be read.
    """
    pf_sequences = get_pf_sequences_for_genome(env, gi)
    pf_labels = get_pf_labels_for_genome(env, gi, **kwargs)

    try:
        sequences = read_fasta_into_hash(pf_sequences)
        labels = read_labels_from_file(pf_labels, **kwargs)
    except IOError:
        log.warning("Could not read sequence/labels files for genome: {}".format(gi.name))
        raise
    else:
        return extract_labeled_sequences(sequences, labels, **kwargs)
Example #11
0
def add_gene_labels_from_file(env, df, **kwargs):
    # type: (Dict[str, Any], pd.DataFrame, Dict[str, Any]) -> None
    """Add reference-label coordinates from label files to a data frame.

    For each genome in ``df["<source>-genome"]`` the labels file
    ``<pd-data>/<genome>/<fn_q_labels>`` is read and indexed by 3' key. The
    columns ``<source>-left-<suffix>``, ``<source>-right-<suffix>`` and
    ``<source>-strand-<suffix>`` are initialised to -1 / -1 / "". For every
    row whose own label has a 3' key present in its genome's index, the
    matching label from the file is written via
    ``sbsp_general.dataframe.df_coordinates_to_row``.

    Args:
        env: Environment providing "pd-data".
        df: Data frame to modify in place.
        **kwargs: Optional settings:
            fn_q_labels: Labels file name (default "verified.gff").
            source: Column prefix (default "q").
            suffix_coordinates: Column suffix (default "ref"). The historical
                misspelling ``suffix_corrdinates`` is still accepted.
    """
    from sbsp_io.labels import read_labels_from_file
    import sbsp_general.dataframe

    fn_q_labels = get_value(kwargs, "fn_q_labels", "verified.gff")
    source = get_value(kwargs, "source", "q")
    # The option used to be readable only under a misspelled key, so callers
    # using the correct spelling were silently ignored. Accept both; the
    # correct spelling wins.
    suffix_coordinates = get_value(
        kwargs, "suffix_coordinates",
        get_value(kwargs, "suffix_corrdinates", "ref"))

    column_genome = "{}-genome".format(source)

    # genome -> (3' key -> label from file)
    genome_to_genekey_to_label = dict()
    for genome_name in set(df[column_genome]):
        pf_q_labels = os.path.join(env["pd-data"], genome_name, fn_q_labels)
        genome_to_genekey_to_label[genome_name] = {
            create_key_3prime_from_label(l): l
            for l in read_labels_from_file(pf_q_labels)
        }

    # now add to data frame
    column_left = "{}-left-{}".format(source, suffix_coordinates)
    column_right = "{}-right-{}".format(source, suffix_coordinates)
    column_strand = "{}-strand-{}".format(source, suffix_coordinates)

    df[column_left] = -1
    df[column_right] = -1
    df[column_strand] = ""

    for index, row in df.iterrows():

        genekey_to_label = genome_to_genekey_to_label[row[column_genome]]
        curr_label = sbsp_general.dataframe.df_get_label_from_row(
            df, index, source)
        curr_key = create_key_3prime_from_label(curr_label)

        if curr_key not in genekey_to_label:
            continue

        # Write the label loaded from the file. Previously the row's own
        # label was written back, so the new columns never carried the
        # file's coordinates.
        sbsp_general.dataframe.df_coordinates_to_row(
            df,
            index,
            genekey_to_label[curr_key],
            source,
            suffix_coordinates=suffix_coordinates)
def randomly_select_labels(pf_labels, pf_labels_percent, percent):
    # type: (str, str, float) -> None
    """Write a random ``percent`` percent sample of the labels to a new file.

    The sample is drawn without replacement using ``np.random.choice``, then
    sorted by each label's ``left()`` before being written.

    Args:
        pf_labels: Path to the input labels file.
        pf_labels_percent: Path of the output labels file.
        percent: Percentage of labels to keep; the sample size is
            ``int(len(labels) * percent / 100)``.
    """
    all_labels = list(read_labels_from_file(pf_labels))

    num_selected = int(len(all_labels) * percent / float(100))
    sampled = np.random.choice(all_labels, size=num_selected, replace=False)
    sampled_by_left = sorted(sampled, key=lambda lab: lab.left())

    write_labels_to_file(Labels(sampled_by_left), pf_labels_percent)
def read_labels_for_multiple_tools(env, gi, list_dn_tools, list_tool_names):
    # type: (Environment, GenomeInfo, List[str], List[str]) -> Dict[str, Labels]
    """Read the label files of several tools for one genome.

    Each tool's labels are read from
    ``<pd-runs>/<genome>/<dn_tool>/<dn_tool>.gff`` with ``shift=0`` and with
    frameshifted, partial and hypothetical labels ignored.

    Args:
        env: Environment providing "pd-runs".
        gi: Genome whose ``name`` selects the run directory.
        list_dn_tools: Directory name of each tool.
        list_tool_names: Display name of each tool, paired with
            ``list_dn_tools`` by position (``zip`` stops at the shorter
            list).

    Returns:
        Mapping from tool name to the labels read for that tool.
    """
    common_options = {
        "shift": 0,
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "ignore_hypothetical": True
    }

    labels = dict()
    for name, dn_tool in zip(list_tool_names, list_dn_tools):
        pf_labels = os_join(env["pd-runs"], gi.name, dn_tool, f"{dn_tool}.gff")
        # Name each label set after its own tool; every set used to be
        # named "SBSP" regardless of the tool it came from.
        labels[name] = read_labels_from_file(pf_labels,
                                             name=name,
                                             **common_options)

    return labels
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Print the three nucleotides at the 5' end of every label.

    For "+" labels these are the three bases starting at ``left``; for all
    other labels they are the three bases ending at ``right``, reverse
    complemented. One codon is printed per line.

    Args:
        env: Environment (not used by this function).
        args: Parsed arguments providing ``pf_sequence`` (FASTA file) and
            ``pf_labels`` (labels file, read with ``shift=-1``).
    """
    sequences = SeqIO.to_dict(SeqIO.parse(args.pf_sequence, "fasta"))
    labels = read_labels_from_file(args.pf_labels, shift=-1)

    for lab in labels:
        seq = sequences[lab.seqname()]

        # Slice with the int-converted coordinates; they used to be computed
        # and then ignored in favour of the raw values.
        left = int(lab.left())
        right = int(lab.right())

        if lab.strand() == "+":
            codon = seq[left:left + 3]
        else:
            codon = seq[right - 2:right + 1].reverse_complement()

        # str() is the public way to get the sequence text; the private
        # Seq._data attribute is bytes in recent Biopython releases and would
        # print as b'...'.
        print(str(codon.seq))
def gather_upstream_sequences_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Tabulate the upstream fragment and GC values of each GMS2 gene.

    Args:
        env: Environment providing "pd-data" and "pd-runs".
        gi: Genome whose ``name`` selects the input files.
        **kwargs: Not used by this function.

    Returns:
        Data frame with one row per item of ``extract_upstream_sequences``
        and columns "GCFID", "Accession", "Genome GC", "Gene GC", "left",
        "right", "strand", "upstream_nt". "left"/"right" are the label's
        coordinates plus one; both GC values are
        ``compute_gc_from_sequences`` results multiplied by 100.
    """
    # read sequences and labels
    sequences = read_fasta_into_hash(
        os_join(env["pd-data"], gi.name, "sequence.fasta"))
    labels = read_labels_from_file(
        os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff"))

    genome_gc = 100 * compute_gc_from_sequences(sequences)

    def _entry(label, frag):
        # type: (Label, Seq) -> Dict[str, Any]
        # GC of the gene itself: the inclusive slice [left, right]
        gene_seq = sequences[label.seqname()][label.left():label.right() + 1]
        gene_gc = 100 * compute_gc_from_sequences({"any": gene_seq})

        return {
            "GCFID": gi.name,
            "Accession": label.seqname(),
            "Genome GC": genome_gc,
            "Gene GC": gene_gc,
            "left": label.left() + 1,
            "right": label.right() + 1,
            "strand": label.strand(),
            "upstream_nt": str(frag)
        }

    upstream_info = extract_upstream_sequences(labels, sequences)

    # each item holds the label at index 0 and its upstream fragment at index 1
    list_entries = [_entry(info[0], info[1]) for info in upstream_info]

    return pd.DataFrame(list_entries)
def analyze_predictions_on_verified_genes(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> Dict[str, Any]
    """Collect statistics of SBSP, GMS2 and NCBI labels against verified genes.

    All label sets are read with ``shift=0`` and with frameshifted and
    partial labels ignored. The set "GMS2=SBSP" is
    ``LabelsComparisonDetailed(sbsp, gms2).match_3p_5p("a")``. The
    ``get_stats_*`` helpers fill a single ``stats`` dict, which is returned.

    Args:
        env: Environment providing "pd-data".
        gi: Genome whose ``name`` selects the input files.
        pd_sbsp: SBSP run directory containing ``accuracy/<name>.gff`` and
            ``output.csv``.
        **kwargs: Not used by this function.

    Returns:
        The ``stats`` dict populated by the helper functions.
    """
    pd_gcfid = os_join(env["pd-data"], gi.name)

    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_gms2 = os_join(pd_gcfid, "runs", "gms2", "gms2.gff")
    pf_verified = os_join(pd_gcfid, "verified.gff")
    pf_ncbi = os_join(pd_gcfid, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    read_options = dict(ignore_frameshifted=True, ignore_partial=True, shift=0)

    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_options)
    labels_verified = read_labels_from_file(pf_verified, name="Verified", **read_options)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_options)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_options)

    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")
    add_support_to_labels(labels_sbsp, df_sbsp_details)

    # labels on which SBSP and GMS2 agree
    sbsp_vs_gms2 = LabelsComparisonDetailed(labels_sbsp, labels_gms2)
    labels_sbsp_eq_gms2 = sbsp_vs_gms2.match_3p_5p("a")
    labels_sbsp_eq_gms2.name = "GMS2=SBSP"

    stats = dict()

    # Stats: 3prime match
    for labels_other in (labels_ncbi, labels_gms2, labels_sbsp):
        get_stats_a_from_b_3p(labels_verified, labels_other, stats)
    get_stats_a_from_b_3p_by_upstream(labels_verified, labels_ncbi, stats)

    # SN SP for each tool, then for GMS2=SBSP
    for labels_other in (labels_sbsp, labels_ncbi, labels_gms2,
                         labels_sbsp_eq_gms2):
        get_stats_sn_sp(labels_verified, labels_other, stats)

    # (labels, tag) pairs shared by the two grouped statistics below
    tagged_label_sets = (
        (labels_sbsp, "SBSP"),
        (labels_sbsp_eq_gms2, "GMS2=SBSP"),
    )

    # stats by support
    for labels_other, tag in tagged_label_sets:
        get_stats_sn_sp_by_support(labels_verified, labels_other, stats, tag)

    # stats by steps combinations
    for labels_other, tag in tagged_label_sets:
        get_stats_sn_sp_by_step_group(labels_verified, labels_other, stats, tag)

    return stats
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> Dict[str, Any]
    """Count how GMS2, SBSP, NCBI (and optionally Prodigal) labels agree.

    Args:
        env: Environment providing "pd-work" and, when
            ``start_candidate_analysis`` is set, "pd-data".
        pf_gms2: Path to the GMS2 labels file.
        pf_sbsp: Path to the SBSP labels file.
        pf_ncbi: Path to the NCBI labels file.
        **kwargs: Optional settings:
            venn_title, pf_venn: Read but currently unused.
            pf_prodigal: Path to Prodigal labels; enables the Prodigal
                counts (default None).
            start_candidate_analysis: When truthy, the genome's sequences
                are read and ``add_number_of_start_candidates_to_labels``
                is applied to the GMS2 labels (default False).
            gcfid: Genome directory name under "pd-data"; used only with
                ``start_candidate_analysis``.
            predicted_at_step: When not None, only SBSP labels whose
                "predicted-at-step" attribute equals it are kept.

    Returns:
        Dict with the counts "GMS2", "SBSP", "NCBI", "GMS2=SBSP" and
        "GMS2=SBSP=NCBI"; with ``pf_prodigal`` also
        "(GMS2=SBSP)!=Prodigal" and "(GMS2=SBSP)!=(NCBI=Prodigal)".
    """
    # Read but not used below: no figure is drawn by this function.
    venn_title = get_value(kwargs, "venn_title", None)
    pf_venn = get_value(kwargs, "pf_venn",
                        os.path.join(env["pd-work"], "venn.pdf"))

    pf_prodigal = get_value(kwargs, "pf_prodigal", None)
    start_candidate_analysis = get_value(kwargs, "start_candidate_analysis",
                                         False)
    gcfid = get_value(kwargs, "gcfid", None)
    predicted_at_step = get_value(kwargs, "predicted_at_step", None)

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    if start_candidate_analysis:
        # add number of start candidates per gene
        pf_sequence = os.path.join(env["pd-data"], gcfid, "sequence.fasta")
        add_number_of_start_candidates_to_labels(
            read_fasta_into_hash(pf_sequence), labels_gms2)

    if predicted_at_step is not None:
        kept = [
            lab for lab in labels_sbsp
            if lab.get_attribute_value("predicted-at-step") == predicted_at_step
        ]
        labels_sbsp = Labels(kept, name="SBSP")

    gms2_eq_sbsp = LabelsComparisonDetailed(
        labels_gms2, labels_sbsp, name_a="gms2", name_b="sbsp"
    ).intersection("a")

    gms2_eq_sbsp_eq_ncbi = LabelsComparisonDetailed(
        gms2_eq_sbsp, labels_ncbi, name_a="gms2_sbsp", name_b="ncbi"
    ).intersection("a")

    def _prodigal_counts():
        # type: () -> Dict[str, int]
        # Counts describing where (GMS2=SBSP) departs from Prodigal.
        labels_prodigal = read_labels_from_file(pf_prodigal, name="Prodigal")

        gms2_eq_sbsp_eq_prodigal = LabelsComparisonDetailed(
            gms2_eq_sbsp, labels_prodigal,
            name_a="gms2_sbsp", name_b="prodigal"
        ).intersection("a")

        # Goal: check (GMS2=SBSP) != (Prodigal=NCBI)
        # step1: Prodigal=NCBI
        ncbi_eq_prodigal = LabelsComparisonDetailed(
            labels_ncbi, labels_prodigal,
            name_a="ncbi", name_b="prodigal"
        ).match_3p_5p("a")

        # Get same genes in (GMS2=SBSP) and (Prodigal=NCBI)
        both = LabelsComparisonDetailed(
            gms2_eq_sbsp, ncbi_eq_prodigal,
            name_a="gms2_sbsp", name_b="ncbi_prodigal")

        same_3p = both.match_3p("a")
        same_3p_5p = both.match_3p_5p("a")

        return {
            "(GMS2=SBSP)!=Prodigal":
            len(gms2_eq_sbsp) - len(gms2_eq_sbsp_eq_prodigal),
            "(GMS2=SBSP)!=(NCBI=Prodigal)":
            len(same_3p) - len(same_3p_5p),
        }

    if pf_prodigal is not None:
        prodigal_info = _prodigal_counts()
    else:
        prodigal_info = dict()

    summary = {
        "GMS2": len(labels_gms2),
        "SBSP": len(labels_sbsp),
        "NCBI": len(labels_ncbi),
        "GMS2=SBSP": len(gms2_eq_sbsp),
        "GMS2=SBSP=NCBI": len(gms2_eq_sbsp_eq_ncbi),
    }
    summary.update(prodigal_info)
    return summary
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    """Copy the MSA files of genes where GMS2 equals SBSP but not NCBI.

    The output directory ``<pd-work>/<genome name>`` is created first. If any
    input file is missing (``FileNotFoundError``) the function returns
    without copying anything. Otherwise the labels on which GMS2 and SBSP
    agree (``match_3p_5p``) are compared with NCBI, those selected by
    ``match_3p_not_5p`` are looked up by 3' key in the SBSP details, and
    every distinct "pf-msa-output" file of the matching rows is copied into
    the output directory.

    Args:
        env: Environment providing "pd-work" and "pd-runs".
        gi: Genome whose ``name`` selects the run directory.
    """
    pd_genome = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_genome)

    pd_run = os_join(env["pd-runs"], gi.name)

    # label and data files
    pf_sbsp = os_join(pd_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_run, "ncbi", "ncbi.gff")
    pf_sbsp_details = os_join(pd_run, "sbsp", "output.csv")

    read_options = dict(ignore_frameshifted=True, ignore_partial=True, shift=0)

    try:
        labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_options)
        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_options)
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_options)
        df_details = pd.read_csv(pf_sbsp_details)
        add_q_key_3p_to_df(df_details, "q-3prime")
    except FileNotFoundError:
        # nothing to collect when an input is missing
        return

    # genes where GMS2=SBSP
    labels_gms2_eq_sbsp = LabelsComparisonDetailed(
        labels_gms2, labels_sbsp, name_a="gms2", name_b="sbsp"
    ).match_3p_5p("a")

    # ... that do not match NCBI
    labels_gms2_eq_sbsp_not_ncbi = LabelsComparisonDetailed(
        labels_gms2_eq_sbsp, labels_ncbi, name_a="gms2_eq_sbsp", name_b="ncbi"
    ).match_3p_not_5p("a")

    # 3' keys of those labels, used to select rows of the SBSP details
    wanted_keys = set()
    for lab in labels_gms2_eq_sbsp_not_ncbi:
        wanted_keys.add(
            create_q_key_3p(lab.seqname(), lab.left(), lab.right(), lab.strand()))

    is_wanted = df_details["q-3prime"].isin(wanted_keys)
    df_wanted = df_details[is_wanted]

    # copy each distinct msa file once
    for pf_msa_out in set(df_wanted["pf-msa-output"]):
        shutil.copy(pf_msa_out, pd_genome)