def build_pass_track(target, args):
    """
    Write a colour-coded BED track for the current mode and convert it to bigBed.

    Every record of the mode's genePred is written. Records whose alignment ID
    is in the selected ID set are coloured by biotype (protein_coding vs.
    everything else); all remaining records get the "not_pass" colour.

    :param target: not used by this function.
    :param args: argparse-style namespace. Reads outDir, mode, refGenome and
                 sizes, plus, depending on mode, genome and one of augustusGp,
                 annotationGp or targetGp.
    :raises RuntimeError: if args.mode is not "augustus", "reference" or
                          "transMap".
    """
    colors = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        "not_pass": "******",
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)

    # Each mode selects the highlighted ID set, the genome the track is named
    # after and the genePred file that supplies the records.
    if args.mode == "augustus":
        query = etc.config.augustusEval(args.genome, args.refGenome)
        pass_ids = sql_lib.get_query_ids(cur, query)
        track_mode, track_genome, gp_path = "augustus", args.genome, args.augustusGp
    elif args.mode == "reference":
        # For the reference the highlighted set is inverted: it holds every
        # transcript that the refEval query did NOT return.
        query = etc.config.refEval(args.refGenome)
        pass_ids = biotype_map.viewkeys() - sql_lib.get_query_ids(cur, query)
        track_mode, track_genome, gp_path = "reference", args.refGenome, args.annotationGp
    elif args.mode == "transMap":
        pass_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        track_mode, track_genome, gp_path = "transMap", args.genome, args.targetGp
    else:
        raise RuntimeError(
            "Somehow your argparse object does not contain a valid mode.")

    out_pass_bed_path, out_pass_big_bed_path = get_bed_paths(
        args.outDir, track_mode, track_genome)
    gp_dict = seq_lib.get_transcript_dict(gp_path)

    with open(out_pass_bed_path, "w") as outf:
        for aln_id, rec in gp_dict.iteritems():
            # biotype_map is keyed by the ID with alignment numbers stripped
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            if aln_id not in pass_ids:
                rgb = colors["not_pass"]
            elif biotype_map[tx_id] == "protein_coding":
                rgb = colors["coding"]
            else:
                rgb = colors["noncoding"]
            bed = rec.get_bed(rgb=rgb)
            outf.write("\t".join(map(str, bed)) + "\n")

    make_big_bed(out_pass_bed_path, args.sizes, out_pass_big_bed_path)
def get_all_tm_pass(cur, ref_genome, genome):
    """
    Collect the transMap evaluation IDs for every biotype into one set.

    The transMap evaluation query is built per biotype, so it is run once for
    each biotype reported for genome and the resulting ID sets are unioned.

    :param cur: database cursor handed to the sql_lib helpers.
    :param ref_genome: reference genome passed to the query builder.
    :param genome: target genome whose biotypes are enumerated.
    :return: set of all IDs returned by the per-biotype queries.
    """
    # NOTE(review): the query is built with passing=False although this
    # function is named "..._pass" -- confirm against etc.config.transMapEval.
    combined_ids = set()
    for tx_biotype in sql_lib.get_all_biotypes(cur, genome, gene_level=False):
        biotype_query = etc.config.transMapEval(ref_genome, genome,
                                                biotype=tx_biotype,
                                                passing=False)
        combined_ids |= sql_lib.get_query_ids(cur, biotype_query)
    return combined_ids
def categorized_plot(cur, highest_cov_dict, genomes, out_path, file_name, biotype, biotype_ids, gencode, query_fn):
    """
    Bar plot, per genome, of the percentage of biotype_ids transcripts that
    are categorized by query_fn.

    For each genome, an ID counts when it is returned by the query built by
    query_fn, is one of that genome's best IDs (first element of each value in
    highest_cov_dict[genome]) and, after removing its alignment number, is a
    member of biotype_ids.

    :param cur: database cursor handed to sql_lib.get_query_ids.
    :param highest_cov_dict: dict keyed by genome; each value is a dict whose
                             values are sequences with the alignment ID first.
    :param genomes: iterable of genomes to plot, in plotting order.
    :param out_path: passed through to plot_lib.barplot.
    :param file_name: passed through to plot_lib.barplot.
    :param biotype: biotype name, used in the query and in the plot title.
    :param biotype_ids: collection of transcript IDs belonging to biotype.
    :param gencode: annotation set name, used in the plot title.
    :param query_fn: callable(genome, biotype, details=False) returning a
                     query; its __name__ appears in the plot title.
    """
    # One percent of the biotype's transcripts. Zero when biotype_ids is
    # empty, which is guarded below instead of raising ZeroDivisionError.
    one_percent = 0.01 * len(biotype_ids)
    results = []
    for g in genomes:
        # Transpose the per-transcript value tuples; the first column holds
        # the IDs. An empty dict transposes to an empty list, so fall back to
        # an empty set rather than indexing into it (IndexError).
        columns = zip(*highest_cov_dict[g].itervalues())
        best_ids = set(columns[0]) if columns else set()
        query = query_fn(g, biotype, details=False)
        categorized_ids = sql_lib.get_query_ids(cur, query)
        num_categorized = len({
            x for x in categorized_ids
            if x in best_ids and psl_lib.remove_alignment_number(x) in biotype_ids
        })
        # With no transcripts of this biotype nothing can be categorized, so
        # the percentage is reported as 0.
        norm = num_categorized / one_percent if one_percent else 0.0
        results.append([g, norm, num_categorized])
    title_string = "Proportion of {:,} {} transcripts in {}\ncategorized as {}"
    title_string = title_string.format(len(biotype_ids), biotype, gencode,
                                       query_fn.__name__)
    plot_lib.barplot(results, out_path, file_name, title_string, adjust_y=False)
def consensus_by_biotype(cur, ref_genome, genome, biotype, gps, transcript_gene_map, gene_transcript_map, stats, mode,
                         ref_intervals, tgt_intervals):
    """
    Main consensus finding function for a single biotype.

    Fetches the fail / pass-specific / excellent ID sets, optionally adds the
    Augustus ID set, bins the best transcripts and builds the consensus.

    :return: tuple of (binned_transcripts, consensus).
    """
    fail_ids, pass_specific_ids, excel_ids = sql_lib.get_fail_passing_excel_ids(
        cur, ref_genome, genome, biotype, best_cov_only=False)

    id_names = ["fail_ids", "pass_specific_ids", "excel_ids"]
    id_list = [fail_ids, pass_specific_ids, excel_ids]

    # The Augustus IDs are only added for protein-coding transcripts in
    # augustus mode; in every other case only the three base sets are used.
    if mode == "augustus" and biotype == "protein_coding":
        aug_query = etc.config.augustusEval(genome, ref_genome)
        id_names.append("aug_ids")
        id_list.append(sql_lib.get_query_ids(cur, aug_query))

    data_dict = build_data_dict(id_names, id_list, transcript_gene_map,
                                gene_transcript_map)
    binned_transcripts = find_best_transcripts(data_dict, stats, mode, biotype)
    consensus = find_consensus(binned_transcripts, stats, gps, ref_intervals,
                               tgt_intervals, mode)
    return binned_transcripts, consensus
if aug_cov > tm_cov and aug_ident > tm_ident: r["higher_both"].append(aug_id) elif aug_cov > tm_cov: r["higher_cov"].append(aug_id) elif aug_ident > tm_ident: r["higher_ident"].append(aug_id) else: r["worse"].append(aug_id) transcript_gene_map = sql_lib.get_transcript_gene_map(cur, ref_genome, biotype=None, filter_chroms=filter_chroms) gene_transcript_map = sql_lib.get_gene_transcript_map(cur, ref_genome, biotype=biotype, filter_chroms=filter_chroms) stats = merge_stats(cur, 'gorilla') fail_ids, good_specific_ids, pass_ids = sql_lib.get_fail_good_pass_ids(cur, ref_genome, genome, biotype) aug_query = etc.config.augustusEval(genome, ref_genome) aug_ids = sql_lib.get_query_ids(cur, aug_query) id_names = ["fail_ids", "good_specific_ids", "pass_ids", "aug_ids"] id_list = [fail_ids, good_specific_ids, pass_ids, aug_ids] data_dict = build_data_dict(id_names, id_list, transcript_gene_map, gene_transcript_map) binned_transcripts = find_best_transcripts(data_dict, stats) consensus = find_consensus(binned_transcripts, stats) aug_cons = {x for x in consensus if 'aug' in x} transcript_evaluation = OrderedDict((x, []) for x in ["PassTM", "PassAug", "PassTie", "GoodTM", "GoodAug", "GoodTie", "FailTM", "FailAug", "FailTie", "NoTransMap"]) gene_evaluation = OrderedDict((x, []) for x in ["Pass", "Good", "Fail", "NoTransMap"]) gene_fail_evaluation = OrderedDict((x, []) for x in ["Fail", "NoTransMap"]) for gene_id in binned_transcripts: categories = set() for ens_id in binned_transcripts[gene_id]: