def num_pass_excel_gene_level(fail_pass_excel_dict, cur, ref_genome, out_path, biotype, gencode,
                              transcript_gene_map):
    """
    Draw a stacked barplot with, per genome, the percentage of genes whose best transcript
    category is Excellent, Pass or Fail, plus the percentage in none of them (NoAln).

    Every ID in the three collections is passed through psl_lib.strip_alignment_numbers and
    looked up in transcript_gene_map. A gene is counted once, in the best category present:
    Excellent first, then Pass (genes not already Excellent), then Fail (genes in neither).
    The remainder of the gene total is reported as NoAln.

    :param fail_pass_excel_dict: dict mapping genome -> (fail_ids, pass_specific_ids, excel_ids).
    :param cur: not used by this function; kept so existing callers keep working.
    :param ref_genome: not used by this function; kept so existing callers keep working.
    :param out_path: forwarded to plot_lib.stacked_barplot.
    :param biotype: biotype name; underscores are replaced by spaces in the plot title.
    :param gencode: annotation set name, used in the output file name and the plot title.
    :param transcript_gene_map: dict mapping transcript ID -> gene ID. The number of distinct
        values is the gene total used for normalization.
    :raises KeyError: if a stripped ID is missing from transcript_gene_map.
    """
    file_name = "{}_num_pass_excel_gene_level".format(gencode)

    def genes_for(aln_ids):
        # collapse alignment IDs to the set of genes they belong to
        return {transcript_gene_map[psl_lib.strip_alignment_numbers(x)] for x in aln_ids}

    # The gene total does not depend on the genome, and the title below needs it even when
    # fail_pass_excel_dict is empty, so it is computed once before the loop.
    num_genes = len(set(transcript_gene_map.values()))
    results = []
    for genome, (fail_ids, pass_specific_ids, excel_ids) in fail_pass_excel_dict.iteritems():
        excel_genes = genes_for(excel_ids)
        pass_specific_genes = genes_for(pass_specific_ids)
        fail_genes = genes_for(fail_ids)
        num_excel_genes = len(excel_genes)
        # a gene with an Excellent transcript is not counted again as Pass
        num_pass_genes = len(pass_specific_genes - excel_genes)
        # a gene only counts as Fail when none of its transcripts did better
        num_fail_genes = len(fail_genes - (pass_specific_genes | excel_genes))
        num_no_aln = num_genes - (num_excel_genes + num_pass_genes + num_fail_genes)
        raw = np.array([num_excel_genes, num_pass_genes, num_fail_genes, num_no_aln])
        # internal sanity check: the three categories are disjoint subsets of the gene set
        assert all([x >= 0 for x in raw])
        # dividing by 1% of the total turns the counts into percentages
        norm = raw / (0.01 * num_genes)
        results.append([genome, norm])
    title_string = "Proportion of {:,} {} genes in {}\nwith at least one transcript categorized as Excellent/Pass/Fail"
    title_string = title_string.format(num_genes, biotype.replace("_", " "), gencode)
    legend_labels = ["Excellent", "Pass", "Fail", "NoAln"]
    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name, title_string)
def transcript_gene_plot(evals, out_path, gencode, mode, biotype):
    """
    Draw the stacked barplot of consensus categories for one biotype and mode.

    The data and category labels come from convert_dicts_to_dataframe(evals, norm=True) and
    the count shown in the title from find_total(evals).

    :param evals: passed unchanged to convert_dicts_to_dataframe and find_total.
    :param out_path: forwarded to plot_lib.stacked_barplot.
    :param gencode: annotation set name, used in the title and output name.
    :param mode: used in the title and output name; the value "genes" selects the default palette.
    :param biotype: used in the title and output name; only "protein_coding" (with a mode other
        than "genes") selects the triple palette.
    """
    results, categories = convert_dicts_to_dataframe(evals, norm=True)
    total = find_total(evals)

    # the triple palette is only used for protein coding plots that are not gene-level
    if mode != "genes" and biotype == "protein_coding":
        palette = etc.config.triple_palette
    else:
        palette = etc.config.palette

    out_name = "{}_{}_{}_consensus".format(gencode, biotype, mode)
    title = "Breakdown of {:,} {} {} categorized by consensus finding\nfrom annotation set {}".format(
        total, biotype, mode, gencode)
    plot_lib.stacked_barplot(results, categories, out_path, out_name, title, color_palette=palette)
def paralogy_plot(cur, genomes, out_path, biotype, biotype_ids, gencode):
    """
    Draw a stacked barplot of paralogy histograms, one bar per genome.

    For each genome the result of paralogy(cur, genome) is queried for every ID in
    biotype_ids (IDs it does not contain count as 0) and the values are binned by make_hist
    using the module-level paralogy_bins.

    :param cur: passed to paralogy(); presumably a database cursor -- confirm against caller.
    :param genomes: iterable of genome names, one bar each.
    :param out_path: forwarded to plot_lib.stacked_barplot.
    :param biotype: biotype name shown in the title.
    :param biotype_ids: sized collection of IDs to include; its length is shown in the title.
    :param gencode: annotation set name, used in the file name and the title.
    """
    per_genome = []
    for genome in genomes:
        counts_by_id = paralogy(cur, genome)
        counts = [counts_by_id.get(tx_id, 0) for tx_id in biotype_ids]
        # roll=-1 rolls the histogram backwards by one so that the 0 bin ends up on top
        norm, _ = make_hist(counts, paralogy_bins, reverse=False, roll=-1)
        per_genome.append([genome, norm])

    # Legend order follows the rolled histogram: exact bins first, then the open-ended bin,
    # and the first bin (the one that was rolled to the end) last.
    legend_labels = ["= {}".format(edge) for edge in paralogy_bins[1:-2]]
    legend_labels.append(u"\u2265 {}".format(paralogy_bins[-2]))
    legend_labels.append("= {}".format(paralogy_bins[0]))

    title_string = "Proportion of {:,} {} transcripts in {}\nthat have multiple alignments".format(
        len(biotype_ids), biotype, gencode)
    file_name = "{}_paralogy".format(gencode)
    plot_lib.stacked_barplot(per_genome, legend_labels, out_path, file_name, title_string)
def num_pass_excel(fail_pass_excel_dict, cur, ref_genome, out_path, biotype, gencode, biotype_ids):
    """
    Draw a stacked barplot with, per genome, the percentage of transcripts categorized as
    Excellent, Pass or Fail, plus the percentage in none of the three (NoAln).

    Percentages are relative to len(biotype_ids). NoAln is that total minus the sizes of the
    three ID collections.

    :param fail_pass_excel_dict: dict mapping genome -> (fail_ids, pass_specific_ids, excel_ids).
    :param cur: not used by this function.
    :param ref_genome: not used by this function.
    :param out_path: forwarded to plot_lib.stacked_barplot.
    :param biotype: biotype name; underscores are replaced by spaces in the plot title.
    :param gencode: annotation set name, used in the output file name and the plot title.
    :param biotype_ids: sized collection of transcript IDs; only its length is used.
    :raises AssertionError: if the three collections together outnumber biotype_ids.
    """
    total = len(biotype_ids)
    one_percent = 0.01 * total
    rows = []
    for genome, (fail_ids, pass_specific_ids, excel_ids) in fail_pass_excel_dict.iteritems():
        n_fail = len(fail_ids)
        n_pass = len(pass_specific_ids)
        n_excel = len(excel_ids)
        n_unaligned = total - (n_fail + n_pass + n_excel)
        # bar segments are ordered to match the legend below
        raw = np.array([n_excel, n_pass, n_fail, n_unaligned])
        assert all(count >= 0 for count in raw)
        rows.append([genome, raw / one_percent])

    legend_labels = ["Excellent", "Pass", "Fail", "NoAln"]
    title_string = "Proportion of {:,} {} transcripts in {}\ncategorized as Excellent/Pass/Fail".format(
        total, biotype.replace("_", " "), gencode)
    file_name = "{}_num_pass_excel".format(gencode)
    plot_lib.stacked_barplot(rows, legend_labels, out_path, file_name, title_string)
def metrics_plot(highest_cov_dict, bins, genomes, out_path, file_name, biotype, gencode, biotype_ids, analysis):
    """
    Draw a stacked barplot of a per-transcript alignment metric, one bar per genome.

    For each genome, every entry of highest_cov_dict[genome] whose transcript ID is in
    biotype_ids contributes the value of the expression given by analysis. Transcripts of the
    biotype without an entry contribute 0. The values are binned by make_hist.

    :param highest_cov_dict: dict mapping genome -> {tx_id: (aln_id, coverage, identity)}.
    :param bins: bin edges handed to make_hist and used to build the legend labels.
    :param genomes: iterable of genome names, one bar each.
    :param out_path: forwarded to plot_lib.stacked_barplot.
    :param file_name: forwarded to plot_lib.stacked_barplot.
    :param biotype: biotype name shown in the title.
    :param biotype_ids: sized collection of transcript IDs to include.
    :param analysis: string that is evaluated per transcript and also shown in the title;
        presumably "coverage" or "identity" -- confirm against callers.
    :param gencode: annotation set name shown in the title.
    """
    num_ids = len(biotype_ids)
    results = []
    for g in genomes:
        covs = highest_cov_dict[g]
        # NOTE(review): eval() runs whatever expression `analysis` contains; it must never be
        # fed untrusted input. The loop variable names below are what that expression can
        # refer to, so they must not be renamed.
        vals = [
            eval(analysis)
            for tx_id, (aln_id, coverage, identity) in covs.iteritems()
            if tx_id in biotype_ids
        ]
        # every transcript of this biotype that is missing from covs is unmapped: count it as 0
        vals += [0] * (num_ids - len(vals))
        norm, _ = make_hist(vals, bins, reverse=True, roll=0)
        results.append([g, norm])

    # Legend: last bin edge first, then the interior edges in descending order, first edge last.
    top_label = "= {0:.1f}%".format(bins[-1])
    interior_labels = ["< {0:.1f}%".format(edge) for edge in bins[2:-1][::-1]]
    bottom_label = "= {0:.1f}%".format(bins[0])
    legend_labels = [top_label] + interior_labels + [bottom_label]

    title_string = "transMap alignment {} breakdown for\n{:,} {} transcripts in {}".format(
        analysis, num_ids, biotype, gencode)
    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name, title_string)