コード例 #1
0
def build_pass_track(target, args):
    """
    Writes a BED track of every transcript for args.mode, colored by pass status and biotype, then
    converts it to bigBed via make_big_bed.

    target: not used by this function.
    args: argparse-style namespace. Reads outDir, mode, refGenome and sizes, plus genome and the
        genePred path attribute (augustusGp, annotationGp or targetGp) that matches the mode.

    Alignment IDs found in pass_ids get the "coding" color when their transcript biotype is
    "protein_coding" and the "noncoding" color otherwise; every other ID gets the "not_pass" color.
    In "reference" mode pass_ids is the complement of the refEval query result within the keys of
    the biotype map.

    Raises RuntimeError if args.mode is not one of "augustus", "reference" or "transMap".
    """
    rgb_by_class = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        "not_pass": "******"
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)

    # Each mode decides three things: the ID set, the label/genome naming the output, and the genePred.
    if args.mode == "augustus":
        pass_ids = sql_lib.get_query_ids(
            cur, etc.config.augustusEval(args.genome, args.refGenome))
        track_label, track_genome = "augustus", args.genome
        gp_path = args.augustusGp
    elif args.mode == "reference":
        # for reference the set holds what is NOT returned by the evaluation query
        evaluated_ids = sql_lib.get_query_ids(
            cur, etc.config.refEval(args.refGenome))
        pass_ids = biotype_map.viewkeys() - evaluated_ids
        track_label, track_genome = "reference", args.refGenome
        gp_path = args.annotationGp
    elif args.mode == "transMap":
        pass_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        track_label, track_genome = "transMap", args.genome
        gp_path = args.targetGp
    else:
        raise RuntimeError(
            "Somehow your argparse object does not contain a valid mode.")

    out_pass_bed_path, out_pass_big_bed_path = get_bed_paths(
        args.outDir, track_label, track_genome)
    gp_dict = seq_lib.get_transcript_dict(gp_path)

    with open(out_pass_bed_path, "w") as bed_handle:
        for aln_id, transcript in gp_dict.iteritems():
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            # the biotype is only looked up for IDs that are in pass_ids
            if aln_id not in pass_ids:
                color_key = "not_pass"
            elif biotype_map[tx_id] == "protein_coding":
                color_key = "coding"
            else:
                color_key = "noncoding"
            bed_fields = transcript.get_bed(rgb=rgb_by_class[color_key])
            bed_handle.write("\t".join(map(str, bed_fields)) + "\n")
    make_big_bed(out_pass_bed_path, args.sizes, out_pass_big_bed_path)
コード例 #2
0
def get_all_tm_pass(cur, ref_genome, genome):
    """
    Collects the IDs returned by the transMap evaluation query for every biotype into one set.

    The evaluation query is built separately per biotype (etc.config.transMapEval is called with
    passing=False for each biotype reported by sql_lib.get_all_biotypes with gene_level=False),
    and the per-biotype results are unioned.

    Returns the combined set of IDs; an empty set if there are no biotypes.
    """
    combined_ids = set()
    for tx_biotype in sql_lib.get_all_biotypes(cur, genome, gene_level=False):
        biotype_query = etc.config.transMapEval(
            ref_genome, genome, biotype=tx_biotype, passing=False)
        combined_ids |= sql_lib.get_query_ids(cur, biotype_query)
    return combined_ids
コード例 #3
0
def categorized_plot(cur, highest_cov_dict, genomes, out_path, file_name,
                     biotype, biotype_ids, gencode, query_fn):
    """
    Draws one bar per genome showing what percentage of biotype_ids is returned by the query
    that query_fn builds.

    For each genome, an ID returned by the query is counted only if it is also the first field of
    some value in highest_cov_dict[genome] and, after psl_lib.remove_alignment_number, is a member
    of biotype_ids.

    cur: database cursor handed to sql_lib.get_query_ids.
    highest_cov_dict: genome -> dict whose values carry an alignment ID as their first field
        (presumably the highest-coverage alignment per transcript - confirm against the caller).
    genomes: iterable of genome names; one bar is produced for each, in this order.
    out_path, file_name: passed straight through to plot_lib.barplot.
    biotype: biotype name, passed to query_fn and shown in the title.
    biotype_ids: container of transcript IDs used both as the membership filter and the denominator.
    gencode: label shown in the title.
    query_fn: callable(genome, biotype, details=False) returning a query; its __name__ appears in
        the title.

    A genome with no entries in highest_cov_dict, or an empty biotype_ids, yields a value of 0
    instead of raising IndexError / ZeroDivisionError.
    """
    # One percent of the transcript count: dividing a raw count by it gives a percentage.
    # It does not change between genomes, so compute it once.
    percent_unit = 0.01 * len(biotype_ids)
    results = []
    for g in genomes:
        # Python 2 zip returns a list, which is empty when the genome has no entries;
        # indexing it unguarded would raise IndexError.
        columns = zip(*highest_cov_dict[g].itervalues())
        best_ids = set(columns[0]) if columns else set()
        query = query_fn(g, biotype, details=False)
        categorized_ids = sql_lib.get_query_ids(cur, query)
        # a set, so an ID returned more than once is only counted once
        num_categorized = len({
            x
            for x in categorized_ids if x in best_ids
            and psl_lib.remove_alignment_number(x) in biotype_ids
        })
        # with no transcripts of this biotype there is nothing to normalize against
        norm = num_categorized / percent_unit if percent_unit else 0.0
        results.append([g, norm, num_categorized])
    title_string = "Proportion of {:,} {} transcripts in {}\ncategorized as {}"
    title_string = title_string.format(len(biotype_ids), biotype, gencode,
                                       query_fn.__name__)
    plot_lib.barplot(results,
                     out_path,
                     file_name,
                     title_string,
                     adjust_y=False)
コード例 #4
0
def consensus_by_biotype(cur, ref_genome, genome, biotype, gps,
                         transcript_gene_map, gene_transcript_map, stats, mode,
                         ref_intervals, tgt_intervals):
    """
    Runs consensus finding for one biotype of one genome.

    Fetches the fail / pass-specific / excel ID sets for the biotype, bins the transcripts with
    find_best_transcripts and resolves a consensus with find_consensus. The Augustus ID set is
    added as a fourth category only when mode is "augustus" and biotype is "protein_coding";
    otherwise just the three base categories are passed on.

    Returns a tuple (binned_transcripts, consensus).
    """
    fail_ids, pass_specific_ids, excel_ids = sql_lib.get_fail_passing_excel_ids(
        cur, ref_genome, genome, biotype, best_cov_only=False)
    id_names = ["fail_ids", "pass_specific_ids", "excel_ids"]
    id_list = [fail_ids, pass_specific_ids, excel_ids]

    # Augustus transcripts only take part for protein coding genes in augustus mode
    use_augustus = mode == "augustus" and biotype == "protein_coding"
    if use_augustus:
        aug_query = etc.config.augustusEval(genome, ref_genome)
        id_names.append("aug_ids")
        id_list.append(sql_lib.get_query_ids(cur, aug_query))

    data_dict = build_data_dict(id_names, id_list, transcript_gene_map,
                                gene_transcript_map)
    binned_transcripts = find_best_transcripts(data_dict, stats, mode, biotype)
    consensus = find_consensus(binned_transcripts, stats, gps, ref_intervals,
                               tgt_intervals, mode)
    return binned_transcripts, consensus
コード例 #5
0
    if aug_cov > tm_cov and aug_ident > tm_ident:
        r["higher_both"].append(aug_id)
    elif aug_cov > tm_cov:
        r["higher_cov"].append(aug_id)
    elif aug_ident > tm_ident:
        r["higher_ident"].append(aug_id)
    else:
        r["worse"].append(aug_id)


# Driver statements: gather the maps, stats and ID sets needed for consensus finding, run it,
# and keep the consensus entries that contain 'aug'.
# NOTE(review): transcript_gene_map is requested with biotype=None while gene_transcript_map is
# filtered by `biotype` - confirm this asymmetry is intended.
transcript_gene_map = sql_lib.get_transcript_gene_map(cur, ref_genome, biotype=None, filter_chroms=filter_chroms)
gene_transcript_map = sql_lib.get_gene_transcript_map(cur, ref_genome, biotype=biotype, filter_chroms=filter_chroms)
# NOTE(review): the genome is hard-coded to 'gorilla' here, while the queries below use the
# `genome` variable - confirm the two agree.
stats = merge_stats(cur, 'gorilla')
fail_ids, good_specific_ids, pass_ids = sql_lib.get_fail_good_pass_ids(cur, ref_genome, genome, biotype)
aug_query = etc.config.augustusEval(genome, ref_genome)
aug_ids = sql_lib.get_query_ids(cur, aug_query)
# id_names and id_list are parallel: the i-th name labels the i-th ID collection
id_names = ["fail_ids", "good_specific_ids", "pass_ids", "aug_ids"]
id_list = [fail_ids, good_specific_ids, pass_ids, aug_ids]
data_dict = build_data_dict(id_names, id_list, transcript_gene_map, gene_transcript_map)
binned_transcripts = find_best_transcripts(data_dict, stats)
consensus = find_consensus(binned_transcripts, stats)
# subset of consensus entries containing 'aug' (presumably the Augustus-derived IDs - TODO confirm)
aug_cons = {x for x in consensus if 'aug' in x}


# Accumulators keyed by category label, each starting as an empty list. OrderedDict keeps the
# categories in the order listed here.
transcript_evaluation = OrderedDict((x, []) for x in ["PassTM", "PassAug", "PassTie", "GoodTM", "GoodAug", "GoodTie",
                                                     "FailTM", "FailAug", "FailTie", "NoTransMap"])
# gene-level categories
gene_evaluation = OrderedDict((x, []) for x in ["Pass", "Good", "Fail", "NoTransMap"])
# only the "Fail" and "NoTransMap" categories are tracked in this one
gene_fail_evaluation = OrderedDict((x, []) for x in ["Fail", "NoTransMap"])
for gene_id in binned_transcripts:
    categories = set()
    for ens_id in binned_transcripts[gene_id]: