Esempio n. 1
0
def find_isoform(gff3, genome_bam, isoform_gff3, tss_tes_stat, genomefa,
                 transcript_fa, downsample_ratio, config_dict,
                 raw_splice_isoform):
    """Identify isoforms from a genome alignment and write their sequences.

    The reference annotation ``gff3`` is parsed, ``group_bam2isoform`` is run
    on ``genome_bam`` to produce ``isoform_gff3`` (and ``tss_tes_stat``), the
    resulting isoform annotation is parsed back, and ``get_transcript_seq``
    writes the transcript sequences to ``transcript_fa``.

    Args:
        gff3: reference annotation, passed to ``parse_gff_tree``.
        genome_bam: genome alignment, passed to ``group_bam2isoform``.
        isoform_gff3: isoform annotation path; handed to
            ``group_bam2isoform`` and then parsed with ``parse_gff_tree``.
        tss_tes_stat: passed through to ``group_bam2isoform``.
        genomefa: genome FASTA, passed to ``group_bam2isoform`` and
            ``get_transcript_seq``.
        transcript_fa: output path handed to ``get_transcript_seq``.
        downsample_ratio: passed through to ``group_bam2isoform``.
        config_dict: configuration mapping; reads ``["isoform_parameters"]``
            and ``["realign_parameters"]["use_annotation"]``.
        raw_splice_isoform: currently unused -- ``raw_gff3=None`` is always
            passed to ``group_bam2isoform``. Kept so existing callers do not
            break.

    Returns:
        dict with ``"transcript_dict"`` (from the reference annotation) and
        ``"transcript_dict_i"`` (from the isoform annotation).
    """
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print("#### Read genne annotations")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff3)

    # Junctions are derived before remove_similar_tr() is called on the
    # annotation dictionaries.
    transcript_to_junctions = {
        tr: blocks_to_junctions(transcript_to_exon[tr])
        for tr in transcript_to_exon
    }
    remove_similar_tr(gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)

    # Isoform identification always runs here (no configuration switch).
    print("#### find isoforms")
    group_bam2isoform(genome_bam,
                      isoform_gff3,
                      tss_tes_stat,
                      "",
                      chr_to_blocks,
                      gene_dict,
                      transcript_to_junctions,
                      transcript_dict,
                      genomefa,
                      config=config_dict["isoform_parameters"],
                      downsample_ratio=downsample_ratio,
                      raw_gff3=None)  # raw splice isoform output is disabled

    # Re-read the isoform annotation that was just written and export the
    # transcript sequences.
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(
        isoform_gff3)
    if config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = {
            "chr_to_gene": chr_to_gene,
            "transcript_dict": transcript_dict,
            "gene_to_transcript": gene_to_transcript,
            "transcript_to_exon": transcript_to_exon
        }
    else:
        ref_dict = None
    get_transcript_seq(genomefa,
                       transcript_fa,
                       chr_to_gene_i,
                       transcript_dict_i,
                       gene_to_transcript_i,
                       transcript_to_exon_i,
                       ref_dict=ref_dict)

    return {
        "transcript_dict": transcript_dict,
        "transcript_dict_i": transcript_dict_i
    }
Esempio n. 2
0
def _write_cb_count_table(out_path, cnt_dict, cb_list):
    """Write a gzip CSV with one row per position and one column per barcode.

    ``cnt_dict`` maps ``(chr, position)`` to a list of barcodes (one entry per
    supporting read); each row holds how often every barcode in ``cb_list``
    occurs in that list.
    """
    with gzip.open(out_path, "wt") as out_f:
        out_f.write("chr,position," + ",".join(cb_list) + "\n")
        for pos_key in cnt_dict:
            per_cb = Counter(cnt_dict[pos_key])
            out_f.write("{},{},".format(pos_key[0], pos_key[1]) +
                        ",".join(str(per_cb[cb]) for cb in cb_list) + "\n")


def _short_read_ref_freq(bam_s, ch, v_pos, ref_idx, min_cov):
    """Return the reference-allele frequency at ``v_pos`` in the short-read BAM.

    Returns -1 when no short-read BAM is open, when the short-read depth is
    not above ``min_cov/2``, or when the coverage query fails.
    """
    if bam_s is None:
        return -1
    try:
        cov_s = bam_s.count_coverage(ch, v_pos, v_pos+1, quality_threshold=20)
        s_tot = cov_s[0][0]+cov_s[1][0]+cov_s[2][0]+cov_s[3][0]
        if s_tot > (min_cov/2):
            return cov_s[ref_idx][0]/float(s_tot)
        return -1
    except Exception:
        # Best effort: a failing short-read lookup means "no short-read
        # evidence". Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return -1


def _pileup_barcodes_by_base(bamfile, ch, v_pos, ref_base, alt_base,
                             cb_seq_set, REF_cnt_dict, ALT_cnt_dict):
    """Collect per-base barcode counts and the deletion frequency at one position.

    Barcodes of reads carrying ``ref_base`` / ``alt_base`` are appended to
    ``REF_cnt_dict`` / ``ALT_cnt_dict`` under the key ``(ch, v_pos)``.

    Returns:
        ``(base_to_cb, indel_freq)`` where ``base_to_cb`` maps an observed
        base to a ``Counter`` of barcodes, and ``indel_freq`` is the fraction
        of non-refskip reads with a deletion at the position (-1 when no such
        read was seen).
    """
    base_to_cb = {}
    # Initialised before the loop so that an empty pileup can neither raise
    # nor reuse counts from a previously processed position. With
    # truncate=True and a one-base region at most one column is produced.
    c_keep = 0
    c_del = 0
    for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos+1, truncate=True,
                                       min_base_quality=0,
                                       ignore_overlaps=False, max_depth=20000):
        for pileupread in pileupcolumn.pileups:
            if pileupread.is_refskip:
                continue
            if pileupread.is_del:
                c_del += 1
                continue
            c_keep += 1
            # Read names are split as "<barcode>_<umi>#...": the text before
            # the first "#" must consist of exactly two "_"-separated fields.
            cb_seq, _ = pileupread.alignment.query_name.split("#")[
                0].split("_")
            if cb_seq not in cb_seq_set:
                continue
            read_base = pileupread.alignment.query_sequence[pileupread.query_position]
            base_to_cb.setdefault(read_base, Counter())[cb_seq] += 1
            if read_base == ref_base:
                REF_cnt_dict.setdefault((ch, v_pos), []).append(cb_seq)
            if read_base == alt_base:
                ALT_cnt_dict.setdefault((ch, v_pos), []).append(cb_seq)
    n_reads = c_keep+c_del
    indel_freq = c_del/float(n_reads) if n_reads > 0 else -1
    return base_to_cb, indel_freq


def get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon, fa_f, out_dir, cb_seq_dict, bam_short, known_position_dict, min_cov=100, report_pct=(0.15, 0.85)):
    """Scan exons for candidate SNVs and write per-barcode allele tables.

    For every exon of every block in ``chr_to_blocks`` with at least
    ``min_cov`` reads, each position (excluding the first and last five bases
    of the exon) with depth above ``min_cov`` is examined. A position is
    reported when it is listed in ``known_position_dict``, or when its most
    frequent non-reference allele has a frequency strictly inside
    ``report_pct`` and it also passes the homopolymer, sequence-entropy and
    short-read filters.

    Files written under ``<out_dir>/mutation``:
        ref_cnt.csv.gz, alt_cnt.csv.gz: per-position counts of reads from each
            barcode supporting the REF / ALT allele.
        allele_stat.csv.gz: one summary row per reported position.
        freq_summary.csv: 500-bin histogram of reference-allele frequencies
            over all positions that passed the depth filter.

    Args:
        bam_in: long-read BAM path.
        chr_to_blocks: mapping from chromosome to a list of blocks exposing
            ``transcript_list``.
        transcript_to_exon: passed to ``get_gene_flat`` to obtain the exons
            of each block.
        fa_f: genome FASTA path read through ``get_fa``.
        out_dir: output directory; the ``mutation`` sub-folder is created.
        cb_seq_dict: mapping whose keys are the cell barcodes to keep; the
            keys define the column order of the count tables.
        bam_short: optional short-read BAM path, or ``None``.
        known_position_dict: container of ``(chr, position)`` keys that are
            always reported.
        min_cov: minimum depth threshold.
        report_pct: ``(low, high)`` bounds on the ALT allele frequency.
    """
    # Row order of count_coverage() output: A, C, G, T.
    c2i = {"A": 0, "C": 1, "G": 2, "T": 3}
    cb_list = list(cb_seq_dict.keys())
    cb_seq_set = set(cb_list)
    fa_dict = {}
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    acc_pct = []
    REF_cnt_dict = {}
    ALT_cnt_dict = {}
    reporting_summary = []

    bamfile = pysam.AlignmentFile(bam_in, "rb")
    bam_s = None
    try:
        if bam_short is not None:
            bam_s = pysam.AlignmentFile(bam_short, "rb")
        for ch in chr_to_blocks:
            print(ch)
            chr_seq = fa_dict[ch]
            homo_dict = find_homo_regions(chr_seq, chr_to_blocks[ch])
            for bl in chr_to_blocks[ch]:
                tmp_bl_flat = get_gene_flat(
                    {"NNN": bl.transcript_list}, transcript_to_exon)
                for ex in tmp_bl_flat["NNN"]:
                    if bamfile.count(ch, ex[0], ex[1]) < min_cov:
                        continue
                    # four array.arrays of the same length in order A C G T
                    cov = bamfile.count_coverage(ch, ex[0], ex[1],
                                                 quality_threshold=0)
                    if len(cov[0]) < 20:
                        continue  # ignore tiny exons
                    # ignore the bases at the beginning and the end
                    # (close to splicing site)
                    for i in range(5, len(cov[0])-5):
                        tot = float(cov[0][i]+cov[1][i]+cov[2][i]+cov[3][i])
                        if tot <= min_cov:
                            continue
                        v_pos = ex[0]+i
                        ref_base = chr_seq[v_pos]
                        if ref_base not in c2i:
                            # "N" or any other non-ACGT symbol has no
                            # coverage row to compare against.
                            continue
                        freq = cov[c2i[ref_base]][i]/tot
                        acc_pct.append(freq)
                        base_freq = [("A", cov[0][i]), ("C", cov[1][i]),
                                     ("G", cov[2][i]), ("T", cov[3][i])]
                        base_freq.sort(key=lambda x: x[1], reverse=True)
                        # the most enriched ALT allele
                        ALT = next(b for b, _ in base_freq if b != ref_base)
                        alt_freq = cov[c2i[ALT]][i]/tot
                        is_known = (ch, v_pos) in known_position_dict
                        if not (is_known or (report_pct[0] < alt_freq < report_pct[1])):
                            continue
                        s_freq = _short_read_ref_freq(
                            bam_s, ch, v_pos, c2i[ref_base], min_cov)
                        # max(0, ...) keeps the window from wrapping around
                        # to the end of the sequence near the contig start.
                        seq_ent = seq_entropy(
                            chr_seq[max(0, v_pos-10):(v_pos+10)])
                        passes_filters = ((v_pos not in homo_dict) and (seq_ent > 1) and (
                            s_freq == -1 or (0.05 < s_freq < 0.95)))
                        if not (is_known or passes_filters):
                            continue
                        base_to_cb, indel_freq = _pileup_barcodes_by_base(
                            bamfile, ch, v_pos, ref_base, ALT, cb_seq_set,
                            REF_cnt_dict, ALT_cnt_dict)
                        # Per base, keep only the barcodes observed at most
                        # twice with that base.
                        cb_by_base = {
                            b: set(cb for cb in base_to_cb[b]
                                   if base_to_cb[b][cb] <= 2)
                            for b in base_to_cb
                        }
                        top_base = base_freq[0][0]
                        second_base = base_freq[1][0]
                        if (top_base in cb_by_base) and (second_base in cb_by_base):
                            top_cbs = cb_by_base[top_base]
                            second_cbs = cb_by_base[second_base]
                            # Probability of the observed barcode overlap
                            # between the two most frequent alleles.
                            rv = hypergeom(len(top_cbs | second_cbs),
                                           len(top_cbs), len(second_cbs))
                            hpg_prob = rv.pmf(len(top_cbs & second_cbs))
                        else:
                            hpg_prob = 1
                        reporting_summary.append(
                            (ch, v_pos, ref_base, ALT, freq, s_freq, hpg_prob, seq_ent, indel_freq))
    finally:
        # Release the BAM handles even if the scan fails part-way.
        bamfile.close()
        if bam_s is not None:
            bam_s.close()

    print("number:", len(reporting_summary))
    subfolder_name = "mutation"
    mut_dir = os.path.join(out_dir, subfolder_name)
    if not os.path.exists(mut_dir):
        os.makedirs(mut_dir)
    _write_cb_count_table(
        os.path.join(mut_dir, "ref_cnt.csv.gz"), REF_cnt_dict, cb_list)
    _write_cb_count_table(
        os.path.join(mut_dir, "alt_cnt.csv.gz"), ALT_cnt_dict, cb_list)
    with gzip.open(os.path.join(mut_dir, "allele_stat.csv.gz"), "wt") as al_stat:
        # write header
        al_stat.write(
            "chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency\n")
        for rec in reporting_summary:
            al_stat.write(",".join(str(it) for it in rec)+"\n")
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(mut_dir, "freq_summary.csv"), "w") as cov_bin_out:
        # One row per bin: left bin edge, number of positions in the bin.
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix], pct_bin[ix]))
Esempio n. 3
0
            "chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency,mean_base_quality\n")
        for rec in reporting_summary:
            al_stat.write(",".join(str(it) for it in rec)+"\n")
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(out_dir, "mutation", "MT_freq_summary.csv"), "w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix], pct_bin[ix]))


# Ad-hoc driver: runs get_all_SNV_table on one hard-coded dataset.
if __name__ == "__main__":
    # Positions that are always reported, keyed by (chromosome, position).
    known_position_dict = {("chr18", 63318364): 0}
    fa_f = "/stornext/General/data/user_managed/grpu_mritchie_1/LuyiTian/Index/GRCh38.primary_assembly.genome.fa"
    gff_f = "/stornext/General/data/user_managed/grpu_mritchie_1/LuyiTian/Index/gencode.v33.annotation.gff3"
    # NOTE(review): everything derived from this first annotation parse
    # (chr_to_gene ... chr_to_blocks) is reassigned further down from
    # isoform_annotated.gff3 before it is used -- this parse looks redundant.
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff_f)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    # CLL141 capture
    # Barcode -> barcode mapping; the part of each line before the first "-"
    # is used as the barcode.
    # NOTE(review): the file object opened here is never explicitly closed.
    cb_seq_dict = dict((it.strip().split("-")[0], it.strip().split("-")[0]) for it in open(
        "/stornext/General/data/user_managed/grpu_mritchie_1/RachelThijssen/sclr_data/Illumina_data/Thijssen_count80/outs/filtered_feature_bc_matrix/barcodes.tsv"))
    bam_short = "/stornext/General/data/user_managed/grpu_mritchie_1/hongkePeng/Rachel/all_fastq/CLL141-CLL-cells_S8_Rsubread.sorted.bam"
    iso_dir = "/stornext/Genomics/data/CLL_venetoclax/single_cell_data/capture_test/isoform_out"
    bam_in = os.path.join(iso_dir, "align2genome.bam")
    # Switch to the isoform annotation found in iso_dir; this replaces the
    # annotation structures built above.
    gff_f = os.path.join(iso_dir, "isoform_annotated.gff3")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff_f)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    # iso_dir is also passed as the output directory.
    get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon,
                      fa_f, iso_dir, cb_seq_dict, bam_short, known_position_dict)
    """
Esempio n. 4
0
def sc_long_pipeline(args):
    """Run the single-cell long-read pipeline driven by parsed CLI arguments.

    Stages, each (except the FASTA export) switchable through
    ``config_dict["pipeline_parameters"]``:
      1. align reads to the genome with minimap2 (skipped when ``args.inbam``
         is given),
      2. identify isoforms from the genome alignment,
      3. export transcript sequences (always runs),
      4. realign reads to the transcripts,
      5. build the transcript count matrices and the filtered annotation.

    Args:
        args: object with the attributes ``config_file``, ``downsample_ratio``,
            ``infq``, ``gff3``, ``genomefa``, ``outdir``, ``inbam`` and
            ``minimap2_dir``.

    Raises:
        SystemExit: with status 1 when the config file cannot be found,
            ``downsample_ratio`` is outside (0, 1], or a required input file
            is missing.
    """
    def _log_step(message):
        # Progress line followed by the current wall-clock time. Formatting a
        # single string prints identically on Python 2 and Python 3.
        print("{} {}".format(
            message, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    # parse configuration file: look in the given path first, then next to
    # the script (sys.path[0]).
    if os.path.isfile(args.config_file):
        config_path = args.config_file
    elif os.path.isfile(os.path.join(sys.path[0], args.config_file)):
        config_path = os.path.join(sys.path[0], args.config_file)
    else:
        print("Cannot find config file in current directory or script depository: {}".format(args.config_file))
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)
    print("Use config file: {}".format(config_path))
    config_dict = parse_json_config(config_path)
    print_config(config_dict)

    # check if files exist
    if args.downsample_ratio > 1 or args.downsample_ratio <= 0:
        print("downsample_ratio shoulw between 0 and 1: {}".format(args.downsample_ratio))
        sys.exit(1)
    if not (os.path.isfile(args.infq) and os.path.isfile(args.gff3) and os.path.isfile(args.genomefa)):
        print("make sure all file exists:")
        print(args.infq)
        print(args.gff3)
        print(args.genomefa)
        sys.exit(1)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
        print("output directory not exist, create one:")
        print(args.outdir)
    if args.inbam != "" and (not os.path.isfile(args.inbam)):
        print("make sure input inbam file exists:")
        print(args.inbam)
        sys.exit(1)

    # output files:
    isoform_gff3 = os.path.join(args.outdir, "isoform_annotated.gff3")
    isoform_gff3_f = os.path.join(args.outdir, "isoform_annotated.filtered.gff3")
    FSM_anno_out = os.path.join(args.outdir, "isoform_FSM_annotation.csv")
    raw_splice_isoform = os.path.join(args.outdir, "splice_raw.gff3")
    tss_tes_stat = os.path.join(args.outdir, "tss_tes.bedgraph")
    transcript_fa = os.path.join(args.outdir, "transcript_assembly.fa")
    transcript_fa_idx = os.path.join(args.outdir, "transcript_assembly.fa.fai")
    tmp_bam = os.path.join(args.outdir, "tmp.align.bam")
    tmp_bed = os.path.join(args.outdir, "tmp.splice_anno.bed12")
    genome_bam = os.path.join(args.outdir, "align2genome.bam")
    realign_bam = os.path.join(args.outdir, "realign2transcript.bam")
    tr_cnt_csv = os.path.join(args.outdir, "transcript_count.csv.gz")
    tr_badcov_cnt_csv = os.path.join(args.outdir, "transcript_count.bad_coverage.csv.gz")

    print("Input parameters:")
    print("\tgene annotation: {}".format(args.gff3))
    print("\tgenome fasta: {}".format(args.genomefa))
    if args.inbam != "":
        print("\tinput bam: {}".format(args.inbam))
        # A user-supplied alignment replaces the one this pipeline would make.
        genome_bam = args.inbam
    else:
        print("\tinput fastq: {}".format(args.infq))
    print("\toutput directory: {}".format(args.outdir))
    print("\tdirectory contains minimap2: {}".format(args.minimap2_dir))

    # align reads to genome
    if args.inbam == "" and config_dict["pipeline_parameters"]["do_genome_alignment"]:
        _log_step("### align reads to genome using minimap2")
        use_junctions = config_dict["alignment_parameters"]["use_junctions"]
        if use_junctions:
            gff3_to_bed12(args.minimap2_dir, args.gff3, tmp_bed)
        minimap2_align(args.minimap2_dir, args.genomefa, args.infq, tmp_bam,
                       no_flank=config_dict["alignment_parameters"]["no_flank"],
                       bed12_junc=tmp_bed if use_junctions else None)
        samtools_sort_index(tmp_bam, genome_bam)
        # The unsorted BAM and the junction BED are intermediates only.
        os.remove(tmp_bam)
        if use_junctions:
            os.remove(tmp_bed)
    else:
        _log_step("### skip aligning reads to genome")

    # find isoform
    _log_step("### read gene annotation")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(args.gff3)
    # Junctions are derived before remove_similar_tr() is called on the
    # annotation dictionaries.
    transcript_to_junctions = {tr: blocks_to_junctions(transcript_to_exon[tr]) for tr in transcript_to_exon}
    remove_similar_tr(transcript_dict, gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    if config_dict["pipeline_parameters"]["do_isoform_identification"]:
        _log_step("### find isoforms")
        if config_dict["global_parameters"]["generate_raw_isoform"]:
            raw_gff3 = raw_splice_isoform
        else:
            raw_gff3 = None
        group_bam2isoform(genome_bam, isoform_gff3, tss_tes_stat, "", chr_to_blocks, gene_dict, transcript_to_junctions, transcript_dict, args.genomefa,
                          config=config_dict["isoform_parameters"],
                          downsample_ratio=args.downsample_ratio,
                          raw_gff3=raw_gff3)
    else:
        _log_step("### skip finding isoforms")

    # get fasta (always runs; isoform_gff3 must exist at this point even when
    # isoform identification was skipped)
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(isoform_gff3)
    if config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = {"chr_to_gene": chr_to_gene, "transcript_dict": transcript_dict, "gene_to_transcript": gene_to_transcript, "transcript_to_exon": transcript_to_exon}
    else:
        ref_dict = None
    get_transcript_seq(args.genomefa, transcript_fa, chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i, ref_dict=ref_dict)

    # realign to transcript using minimap2
    if config_dict["pipeline_parameters"]["do_read_realignment"]:
        _log_step("### realign to transcript using minimap2")
        minimap2_tr_align(args.minimap2_dir, transcript_fa, args.infq, tmp_bam)
        samtools_sort_index(tmp_bam, realign_bam)
        os.remove(tmp_bam)
    else:
        _log_step("### skip read realignment")

    # quantification
    if config_dict["pipeline_parameters"]["do_transcript_quantification"]:
        _log_step("### generate transcript count matrix")
        has_umi = config_dict["global_parameters"]["has_UMI"]
        min_sup_cnt = config_dict["isoform_parameters"]["Min_sup_cnt"]
        bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept = parse_realigned_bam(
            realign_bam, transcript_fa_idx, min_sup_cnt,
            config_dict["transcript_counting"]["min_tr_coverage"],
            config_dict["transcript_counting"]["min_read_coverage"])
        tr_cnt = wrt_tr_to_csv(bc_tr_count_dict, transcript_dict_i, tr_cnt_csv, transcript_dict, has_umi)
        wrt_tr_to_csv(bc_tr_badcov_count_dict, transcript_dict_i, tr_badcov_cnt_csv, transcript_dict, has_umi)
        annotate_filter_gff(isoform_gff3, args.gff3, isoform_gff3_f, FSM_anno_out, tr_cnt, min_sup_cnt)
    else:
        _log_step("### skip transcript quantification")