def num_pass_excel_gene_level(fail_pass_excel_dict, cur, ref_genome, out_path,
                              biotype, gencode, transcript_gene_map):
    """Plot, per genome, the percentage of genes rated Excellent/Pass/Fail/NoAln.

    A gene is counted once, in the best class reached by any of its
    transcripts: Excellent beats Pass, which beats Fail.  Genes that appear
    in none of the three id collections are reported as "NoAln".

    Args:
        fail_pass_excel_dict: mapping of genome name to a
            ``(fail_ids, pass_specific_ids, excel_ids)`` triple of iterables
            of alignment ids.
        cur: unused here; kept for interface compatibility.
        ref_genome: unused here; kept for interface compatibility.
        out_path: output location, forwarded to ``plot_lib.stacked_barplot``.
        biotype: biotype name; underscores are shown as spaces in the title.
        gencode: annotation set name, used in the file name and the title.
        transcript_gene_map: mapping indexed by the result of
            ``psl_lib.strip_alignment_numbers(aln_id)``; its distinct values
            are the genes being counted.
    """
    file_name = "{}_num_pass_excel_gene_level".format(gencode)

    def genes_for(aln_ids):
        # NOTE(review): strip_alignment_numbers presumably turns an alignment
        # id back into its transcript id -- confirm against psl_lib.
        return {
            transcript_gene_map[psl_lib.strip_alignment_numbers(aln_id)]
            for aln_id in aln_ids
        }

    # Loop-invariant, and also needed for the title below: computing it up
    # front keeps an empty fail_pass_excel_dict from raising a NameError.
    num_genes = len(set(transcript_gene_map.values()))
    results = []
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    for genome, (fail_ids, pass_specific_ids,
                 excel_ids) in fail_pass_excel_dict.items():
        excel_genes = genes_for(excel_ids)
        pass_specific_genes = genes_for(pass_specific_ids)
        fail_genes = genes_for(fail_ids)
        num_excel_genes = len(excel_genes)
        # Only count a gene in a lower class if no transcript reached a
        # better one.
        num_pass_genes = len(pass_specific_genes - excel_genes)
        num_fail_genes = len(fail_genes - (pass_specific_genes | excel_genes))
        num_no_aln = num_genes - (num_excel_genes + num_pass_genes +
                                  num_fail_genes)
        raw = np.array(
            [num_excel_genes, num_pass_genes, num_fail_genes, num_no_aln])
        assert (raw >= 0).all(), "negative gene count for {}".format(genome)
        # Dividing by 1% of the total turns the counts into percentages.
        norm = raw / (0.01 * num_genes)
        results.append([genome, norm])
    title_string = "Proportion of {:,} {} genes in {}\nwith at least one transcript categorized as Excellent/Pass/Fail"
    title_string = title_string.format(num_genes, biotype.replace("_", " "),
                                       gencode)
    legend_labels = ["Excellent", "Pass", "Fail", "NoAln"]
    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name,
                             title_string)
# Example no. 2
# 0
def transcript_gene_plot(evals, out_path, gencode, mode, biotype):
    """Draw the stacked bar plot of consensus categories for one biotype.

    ``evals`` is handed to ``convert_dicts_to_dataframe`` (normalised) for
    the plotted data and to ``find_total`` for the count shown in the title.
    The default palette is used for gene-level plots and for every biotype
    other than ``protein_coding``; transcript-level protein-coding plots use
    the triple palette.

    Args:
        evals: evaluation data understood by the two helpers above.
        out_path: output location, forwarded to ``plot_lib.stacked_barplot``.
        gencode: annotation set name, used in the title and the file name.
        mode: plotting level; ``"genes"`` selects the default palette.
        biotype: biotype name, used in the title and the file name.
    """
    results, categories = convert_dicts_to_dataframe(evals, norm=True)
    total = find_total(evals)

    title = (
        "Breakdown of {:,} {} {} categorized by consensus finding\n"
        "from annotation set {}"
    ).format(total, biotype, mode, gencode)
    out_name = "{}_{}_{}_consensus".format(gencode, biotype, mode)

    if mode == "genes" or biotype != "protein_coding":
        palette = etc.config.palette
    else:
        palette = etc.config.triple_palette

    plot_lib.stacked_barplot(
        results, categories, out_path, out_name, title,
        color_palette=palette)
def paralogy_plot(cur, genomes, out_path, biotype, biotype_ids, gencode):
    """Plot, per genome, the histogram of paralogy values over biotype_ids.

    For each genome the mapping returned by ``paralogy(cur, genome)`` is
    looked up for every id in ``biotype_ids`` (missing ids count as 0) and
    binned with ``make_hist`` using the module-level ``paralogy_bins``.

    Args:
        cur: handle forwarded to ``paralogy``.
        genomes: iterable of genome names; one bar is produced per genome.
        out_path: output location, forwarded to ``plot_lib.stacked_barplot``.
        biotype: biotype name, used in the title.
        biotype_ids: sized iterable of ids to include.
        gencode: annotation set name, used in the title and the file name.
    """
    file_name = "{}_{}".format(gencode, "paralogy")
    results = []
    for genome in genomes:
        counts_by_id = paralogy(cur, genome)
        counts = [counts_by_id.get(tx_id, 0) for tx_id in biotype_ids]
        # we roll the list backwards one to put 0 on top
        norm, _ = make_hist(counts, paralogy_bins, reverse=False, roll=-1)
        results.append([genome, norm])

    title_template = "Proportion of {:,} {} transcripts in {}\nthat have multiple alignments"
    title_string = title_template.format(len(biotype_ids), biotype, gencode)

    # Label order mirrors the rolled histogram: the exact-value bins first,
    # then the open-ended top bin, and the first bin last.
    legend_labels = ["= {}".format(edge) for edge in paralogy_bins[1:-2]]
    legend_labels.append(u"\u2265 {}".format(paralogy_bins[-2]))
    legend_labels.append("= {}".format(paralogy_bins[0]))

    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name,
                             title_string)
def num_pass_excel(fail_pass_excel_dict, cur, ref_genome, out_path, biotype,
                   gencode, biotype_ids):
    """Plot, per genome, the percentage of transcripts rated Excellent/Pass/Fail/NoAln.

    The three id collections of each genome are counted as given; whatever
    remains of ``len(biotype_ids)`` after subtracting them is reported as
    "NoAln".

    Args:
        fail_pass_excel_dict: mapping of genome name to a
            ``(fail_ids, pass_specific_ids, excel_ids)`` triple of sized
            collections.
        cur: unused here; kept for interface compatibility.
        ref_genome: unused here; kept for interface compatibility.
        out_path: output location, forwarded to ``plot_lib.stacked_barplot``.
        biotype: biotype name; underscores are shown as spaces in the title.
        gencode: annotation set name, used in the file name and the title.
        biotype_ids: sized collection of all transcript ids of this biotype.

    Raises:
        AssertionError: if a genome's three collections together hold more
            ids than ``biotype_ids``.
    """
    file_name = "{}_num_pass_excel".format(gencode)
    num_transcripts = len(biotype_ids)
    results = []
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    for genome, (fail_ids, pass_specific_ids,
                 excel_ids) in fail_pass_excel_dict.items():
        num_excel = len(excel_ids)
        num_pass = len(pass_specific_ids)
        num_fail = len(fail_ids)
        num_no_aln = num_transcripts - (num_excel + num_pass + num_fail)
        raw = np.array([num_excel, num_pass, num_fail, num_no_aln])
        assert (raw >= 0).all(), \
            "more categorized transcripts than biotype_ids for {}".format(genome)
        # Dividing by 1% of the total turns the counts into percentages.
        norm = raw / (0.01 * num_transcripts)
        results.append([genome, norm])
    title_string = "Proportion of {:,} {} transcripts in {}\ncategorized as Excellent/Pass/Fail"
    title_string = title_string.format(num_transcripts,
                                       biotype.replace("_", " "), gencode)
    legend_labels = ["Excellent", "Pass", "Fail", "NoAln"]
    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name,
                             title_string)
def metrics_plot(highest_cov_dict, bins, genomes, out_path, file_name, biotype,
                 gencode, biotype_ids, analysis):
    """Plot, per genome, a histogram of one alignment metric over biotype_ids.

    For every transcript of ``highest_cov_dict[genome]`` that is in
    ``biotype_ids`` the expression ``analysis`` is evaluated with
    ``tx_id``, ``aln_id``, ``coverage`` and ``identity`` in scope.
    Transcripts of the biotype with no entry contribute a value of 0.

    Args:
        highest_cov_dict: mapping of genome name to a mapping of
            ``tx_id -> (aln_id, coverage, identity)``.
        bins: histogram bin edges, forwarded to ``make_hist`` and used for
            the legend labels.
        genomes: iterable of genome names; one bar is produced per genome.
        out_path: output location, forwarded to ``plot_lib.stacked_barplot``.
        file_name: output file name, forwarded likewise.
        biotype: biotype name, used in the title.
        gencode: annotation set name, used in the title.
        biotype_ids: sized collection of hashable transcript ids.
        analysis: Python expression string, also shown verbatim in the title
            (presumably ``"coverage"`` or ``"identity"`` -- confirm with
            callers).
    """
    # Build the lookup set once: membership on a list would be O(n) per
    # transcript.  len(biotype_ids) is still taken from the original object.
    biotype_id_set = set(biotype_ids)
    num_transcripts = len(biotype_ids)
    results = []
    for g in genomes:
        covs = highest_cov_dict[g]
        # SECURITY: eval() runs arbitrary code.  `analysis` must only ever
        # come from trusted, in-program callers, never from user input.
        # .items() (rather than .iteritems()) works on both Python 2 and 3.
        vals = [
            eval(analysis)
            for tx_id, (aln_id, coverage, identity) in covs.items()
            if tx_id in biotype_id_set
        ]
        # add all of the unmapped transcripts
        vals.extend([0] * (num_transcripts - len(vals)))
        norm, _ = make_hist(vals, bins, reverse=True, roll=0)
        results.append([g, norm])
    title_string = "transMap alignment {} breakdown for\n{:,} {} transcripts in {}"
    title_string = title_string.format(analysis, num_transcripts, biotype,
                                       gencode)
    # Labels run from the top bin edge down to the bottom one.
    legend_labels = ["= {0:.1f}%".format(bins[-1])]
    legend_labels.extend(["< {0:.1f}%".format(x) for x in bins[2:-1][::-1]])
    legend_labels.append("= {0:.1f}%".format(bins[0]))
    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name,
                             title_string)