def find_isoform(gff3, genome_bam, isoform_gff3, tss_tes_stat, genomefa,
                 transcript_fa, downsample_ratio, config_dict,
                 raw_splice_isoform):
    """Identify isoforms from a genome-aligned BAM and build the transcript fasta.

    Parses the reference annotation, collapses near-identical transcripts,
    runs isoform discovery per gene block (``group_bam2isoform``) and then
    extracts the sequences of the newly annotated isoforms.

    Parameters:
        gff3: path to the reference annotation GFF3.
        genome_bam: BAM of reads aligned to the genome.
        isoform_gff3: output path for the isoform annotation GFF3.
        tss_tes_stat: output path for TSS/TES bedgraph statistics.
        genomefa: genome fasta path.
        transcript_fa: output path for the transcript-assembly fasta.
        downsample_ratio: fraction of reads to use for isoform finding.
        config_dict: parsed pipeline configuration dict.
        raw_splice_isoform: output path for the raw splice GFF3; currently
            unused (the ``raw_gff3`` argument is hard-wired to ``None``,
            see the commented-out line below).

    Returns:
        dict with the reference ("transcript_dict") and newly found
        ("transcript_dict_i") transcript dictionaries.
    """
    # find isoform
    # Fixed: Python-2 print statement (syntax error in py3) and "genne" typo.
    print("#### Read gene annotations")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff3)
    transcript_to_junctions = {
        tr: blocks_to_junctions(transcript_to_exon[tr]) for tr in transcript_to_exon}
    # NOTE(review): sc_long_pipeline() calls remove_similar_tr with
    # transcript_dict as an additional first argument — confirm which
    # signature is current; this 2-arg call may be stale.
    remove_similar_tr(gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    # finding isoforms are required
    print("#### find isoforms")
    group_bam2isoform(genome_bam, isoform_gff3, tss_tes_stat, "", chr_to_blocks,
                      gene_dict, transcript_to_junctions, transcript_dict, genomefa,
                      config=config_dict["isoform_parameters"],
                      downsample_ratio=downsample_ratio, raw_gff3=None)
    # raw_gff3=raw_splice_isoform if config_dict["global_parameters"]["generate_raw_isoform"] else None)
    # get fasta
    # print "### generate transcript fasta file", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(
        isoform_gff3)
    ref_dict = {"chr_to_gene": chr_to_gene,
                "transcript_dict": transcript_dict,
                "gene_to_transcript": gene_to_transcript,
                "transcript_to_exon": transcript_to_exon}
    if not config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = None
    get_transcript_seq(genomefa, transcript_fa, chr_to_gene_i,
                       transcript_dict_i, gene_to_transcript_i,
                       transcript_to_exon_i, ref_dict=ref_dict)
    return {"transcript_dict": transcript_dict,
            "transcript_dict_i": transcript_dict_i}
def get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon, fa_f, out_dir,
                      cb_seq_dict, bam_short, known_position_dict,
                      min_cov=100, report_pct=(0.15, 0.85)):
    """Scan exonic regions for candidate SNVs and write per-barcode allele counts.

    Walks every exon of every gene block, uses pysam ``count_coverage`` to
    find positions whose alternative-allele frequency falls inside
    ``report_pct`` (or that are listed in ``known_position_dict``), filters
    out homopolymer / low-entropy contexts, then pileups the long-read BAM
    to count REF/ALT support per cell barcode.

    Outputs written under ``<out_dir>/mutation/``:
      - ref_cnt.csv.gz      REF allele count per barcode per position
      - alt_cnt.csv.gz      ALT allele count per barcode per position
      - allele_stat.csv.gz  per-position summary statistics
      - freq_summary.csv    histogram of REF-allele frequencies

    Parameters:
        bam_in: long-read BAM aligned to the genome.
        chr_to_blocks: dict chromosome -> list of gene blocks (each block
            exposes a ``transcript_list`` attribute).
        transcript_to_exon: dict transcript id -> exon list.
        fa_f: genome fasta path, read via ``get_fa``.
        out_dir: output directory; a ``mutation`` subfolder is created.
        cb_seq_dict: dict whose keys are the allowed cell barcodes; key
            order defines the column order of the output CSVs.
        bam_short: optional short-read BAM used to cross-check allele
            frequency; pass None to skip the check.
        known_position_dict: (chr, pos) keys always reported, bypassing the
            frequency/entropy filters.
        min_cov: minimum read coverage for an exon/position to be considered.
        report_pct: (low, high) ALT-frequency window for reporting.
    """
    # four array.arrays of the same length in order A C G T
    c2i = {"A": 0, "C": 1, "G": 2, "T": 3}  # base -> index into count_coverage arrays
    fa_dict = {}
    acc_pct = []            # REF-allele frequency of every covered position (histogram input)
    REF_cnt_dict = {}       # (chr, pos) -> list of barcodes supporting REF
    ALT_cnt_dict = {}       # (chr, pos) -> list of barcodes supporting ALT
    cb_seq_set = set(cb_seq_dict.keys())
    reporting_summary = []  # rows of allele_stat.csv.gz
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bamfile = pysam.AlignmentFile(bam_in, "rb")
    if bam_short is not None:
        bam_s = pysam.AlignmentFile(bam_short, "rb")
    cb_corr_cnt = Counter()  # NOTE(review): never used below — dead variable?
    for ch in chr_to_blocks:
        print(ch)
        homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])
        for ith, bl in enumerate(chr_to_blocks[ch]):
            # flatten all transcripts of this block into one merged exon list
            tmp_bl_flat = get_gene_flat(
                {"NNN": bl.transcript_list}, transcript_to_exon)
            for ex in tmp_bl_flat["NNN"]:
                cnt = bamfile.count(ch, ex[0], ex[1])
                if cnt < min_cov:
                    continue
                cov = bamfile.count_coverage(ch, ex[0], ex[1],
                                             quality_threshold=0)
                # four array.arrays of the same length in order A C G T
                if len(cov[0]) < 20:
                    continue  # ignore tiny exons
                # ignore the bases at the beginning and the end (close to splicing site)
                for i in range(5, len(cov[0])-5):
                    tot = float(cov[0][i]+cov[1][i]+cov[2][i]+cov[3][i])
                    v_pos = ex[0]+i  # genomic coordinate of this base
                    if tot > min_cov and (fa_dict[ch][v_pos] != "N"):
                        # REF-allele frequency at this position
                        freq = cov[c2i[fa_dict[ch][v_pos]]][i]/tot
                        acc_pct.append(freq)
                        base_freq = [("A", cov[0][i]), ("C", cov[1][i]),
                                     ("G", cov[2][i]), ("T", cov[3][i])]
                        base_freq.sort(key=lambda x: x[1], reverse=True)
                        if v_pos == 63318364:  # NOTE(review): hard-coded debug position left in
                            print(base_freq)
                        # the most enriched ALT allele
                        ALT = [it[0] for it in base_freq if it[0] != fa_dict[ch][v_pos]][0]
                        alt_freq = cov[c2i[ALT]][i]/tot
                        if (report_pct[0] < alt_freq < report_pct[1]) or ((ch, v_pos) in known_position_dict):
                            tmp_atcg_set = {}  # base -> Counter of barcodes carrying that base
                            if bam_short is not None:
                                # cross-check REF frequency in the short-read BAM
                                try:
                                    cov_s = bam_s.count_coverage(
                                        ch, v_pos, v_pos+1, quality_threshold=20)
                                    s_tot = cov_s[0][0]+cov_s[1][0] + \
                                        cov_s[2][0]+cov_s[3][0]
                                    if s_tot > (min_cov/2):
                                        s_freq = cov_s[c2i[fa_dict[ch][v_pos]]][0]/float(s_tot)
                                    else:
                                        s_freq = -1  # -1 is the "no usable short-read data" sentinel
                                except:  # NOTE(review): bare except silently swallows all errors
                                    s_freq = -1
                            else:
                                s_freq = -1
                            seq_ent = seq_entropy(
                                fa_dict[ch][(v_pos-10):(v_pos+10)])
                            indel_freq = -1
                            # keep known positions unconditionally; otherwise require a
                            # non-homopolymer context, sufficient sequence entropy, and a
                            # short-read frequency that is absent or heterozygous-like
                            if ((ch, v_pos) in known_position_dict) or ((ex[0]+i not in homo_dict) and (seq_ent > 1) and (s_freq == -1 or (0.05 < s_freq < 0.95))):
                                # truncate=True limits the pileup to the single column v_pos
                                for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos+1, truncate=True, min_base_quality=0, ignore_overlaps=False, max_depth=20000):
                                    c_keep = 0  # reads with an aligned base at this column
                                    c_del = 0   # reads with a deletion at this column
                                    for pileupread in pileupcolumn.pileups:
                                        if not pileupread.is_del:
                                            if not pileupread.is_refskip:
                                                c_keep += 1
                                                # assumes read name layout "<cb>_<umi>#..." — TODO confirm
                                                cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                                                if cb_seq in cb_seq_set:
                                                    tmp_atcg_set.setdefault(
                                                        pileupread.alignment.query_sequence[pileupread.query_position], Counter())[cb_seq] += 1
                                                    #tmp_set[cb_seq] += 1
                                                    if pileupread.alignment.query_sequence[pileupread.query_position] == fa_dict[ch][v_pos]:
                                                        REF_cnt_dict.setdefault(
                                                            (ch, v_pos), []).append(cb_seq)
                                                    if pileupread.alignment.query_sequence[pileupread.query_position] == ALT:
                                                        ALT_cnt_dict.setdefault(
                                                            (ch, v_pos), []).append(cb_seq)
                                        else:
                                            if not pileupread.is_refskip:
                                                c_del += 1
                                    indel_freq = c_del/float(c_keep+c_del)
                                tmp_set = set()
                                # Convert each base's barcode Counter into a set, keeping
                                # barcodes seen at most twice.
                                # NOTE(review): "<= 2" keeps LOW-support barcodes — confirm intended.
                                for b in tmp_atcg_set:
                                    tmp_atcg_set[b] = set(
                                        it for it in tmp_atcg_set[b] if tmp_atcg_set[b][it] <= 2)
                                if (base_freq[0][0] in tmp_atcg_set) and (base_freq[1][0] in tmp_atcg_set):
                                    # hypergeometric test on the barcode overlap between the
                                    # two most frequent alleles
                                    tmp_set.update(
                                        tmp_atcg_set[base_freq[0][0]])
                                    tmp_set.update(
                                        tmp_atcg_set[base_freq[1][0]])
                                    rv = hypergeom(len(tmp_set), len(tmp_atcg_set[base_freq[0][0]]), len(
                                        tmp_atcg_set[base_freq[1][0]]))
                                    hpg_prob = rv.pmf(
                                        len(tmp_atcg_set[base_freq[0][0]].intersection(tmp_atcg_set[base_freq[1][0]])))
                                else:
                                    hpg_prob = 1
                                reporting_summary.append(
                                    (ch, v_pos, fa_dict[ch][v_pos], ALT, freq, s_freq, hpg_prob, seq_ent, indel_freq))
    print("number:", len(reporting_summary))
    subfolder_name = "mutation"
    if not os.path.exists(os.path.join(out_dir, subfolder_name)):
        os.makedirs(os.path.join(out_dir, subfolder_name))
    with gzip.open(os.path.join(out_dir, subfolder_name, "ref_cnt.csv.gz"),
                   "wt") as ref_cnt_f:
        # write header
        ref_cnt_f.write("chr,position," + ",".join(list(cb_seq_dict.keys()))+"\n")
        for p in REF_cnt_dict:
            tmp_c = Counter(REF_cnt_dict[p])
            ref_cnt_f.write("{},{},".format(
                p[0], p[1])+",".join(str(tmp_c[it]) for it in list(cb_seq_dict.keys()))+"\n")
    with gzip.open(os.path.join(out_dir, subfolder_name, "alt_cnt.csv.gz"), "wt") as alt_cnt_f:
        # write header
        alt_cnt_f.write("chr,position," + ",".join(list(cb_seq_dict.keys()))+"\n")
        for p in ALT_cnt_dict:
            tmp_c = Counter(ALT_cnt_dict[p])
            alt_cnt_f.write("{},{},".format(
                p[0], p[1])+",".join(str(tmp_c[it]) for it in list(cb_seq_dict.keys()))+"\n")
    with gzip.open(os.path.join(out_dir, subfolder_name, "allele_stat.csv.gz"), "wt") as al_stat:
        # write header
        al_stat.write(
            "chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency\n")
        for rec in reporting_summary:
            al_stat.write(",".join(str(it) for it in rec)+"\n")
    # histogram of REF-allele frequencies over all covered positions
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(out_dir, subfolder_name, "freq_summary.csv"), "w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix], pct_bin[ix]))
"chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency,mean_base_quality\n") for rec in reporting_summary: al_stat.write(",".join(str(it) for it in rec)+"\n") pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1)) with open(os.path.join(out_dir, "mutation", "MT_freq_summary.csv"), "w") as cov_bin_out: for ix in range(500): cov_bin_out.write("{},{}\n".format(pt[ix], pct_bin[ix])) if __name__ == "__main__": known_position_dict = {("chr18", 63318364): 0} fa_f = "/stornext/General/data/user_managed/grpu_mritchie_1/LuyiTian/Index/GRCh38.primary_assembly.genome.fa" gff_f = "/stornext/General/data/user_managed/grpu_mritchie_1/LuyiTian/Index/gencode.v33.annotation.gff3" chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree( gff_f) gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon) chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript) # CLL141 capture cb_seq_dict = dict((it.strip().split("-")[0], it.strip().split("-")[0]) for it in open( "/stornext/General/data/user_managed/grpu_mritchie_1/RachelThijssen/sclr_data/Illumina_data/Thijssen_count80/outs/filtered_feature_bc_matrix/barcodes.tsv")) bam_short = "/stornext/General/data/user_managed/grpu_mritchie_1/hongkePeng/Rachel/all_fastq/CLL141-CLL-cells_S8_Rsubread.sorted.bam" iso_dir = "/stornext/Genomics/data/CLL_venetoclax/single_cell_data/capture_test/isoform_out" bam_in = os.path.join(iso_dir, "align2genome.bam") gff_f = os.path.join(iso_dir, "isoform_annotated.gff3") chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree( gff_f) gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon) chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript) get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon, fa_f, iso_dir, cb_seq_dict, bam_short, known_position_dict) """
def sc_long_pipeline(args):
    """Run the single-cell long-read pipeline end to end.

    Stages (each toggled by ``pipeline_parameters`` in the config):
      1. align reads to the genome with minimap2,
      2. isoform identification (``group_bam2isoform``),
      3. transcript fasta generation and realignment to transcripts,
      4. transcript quantification and GFF filtering.

    Parameters:
        args: parsed command-line namespace providing ``config_file``,
            ``infq``, ``inbam``, ``gff3``, ``genomefa``, ``outdir``,
            ``minimap2_dir`` and ``downsample_ratio``.

    Side effects: writes all output files into ``args.outdir``; calls
    ``exit()`` on invalid input.

    Fixed: converted Python-2 print statements (syntax errors in py3, and
    inconsistent with the ``print()`` calls already used in this file) to
    the print() function; corrected the "shoulw" typo in the
    downsample_ratio error message.
    """
    # parse configuration file
    if os.path.isfile(args.config_file):
        print("Use config file: {}".format(args.config_file))
        config_dict = parse_json_config(args.config_file)
    elif os.path.isfile(os.path.join(sys.path[0], args.config_file)):
        print("Use config file: {}".format(os.path.join(sys.path[0], args.config_file)))
        config_dict = parse_json_config(os.path.join(sys.path[0], args.config_file))
    else:
        print("Cannot find config file in current directory or script depository: {}".format(args.config_file))
        exit()
    print_config(config_dict)
    # check if files exist
    if args.downsample_ratio > 1 or args.downsample_ratio <= 0:
        print("downsample_ratio should be between 0 and 1: {}".format(args.downsample_ratio))
        exit()
    if not (os.path.isfile(args.infq) and os.path.isfile(args.gff3) and os.path.isfile(args.genomefa)):
        print("make sure all file exists:")
        print(args.infq)
        print(args.gff3)
        print(args.genomefa)
        exit()
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
        print("output directory not exist, create one:")
        print(args.outdir)
    if args.inbam != "" and (not os.path.isfile(args.inbam)):
        print("make sure input inbam file exists:")
        print(args.inbam)
        exit()
    # output files:
    isoform_gff3 = os.path.join(args.outdir, "isoform_annotated.gff3")
    isoform_gff3_f = os.path.join(args.outdir, "isoform_annotated.filtered.gff3")
    FSM_anno_out = os.path.join(args.outdir, "isoform_FSM_annotation.csv")
    raw_splice_isoform = os.path.join(args.outdir, "splice_raw.gff3")
    tss_tes_stat = os.path.join(args.outdir, "tss_tes.bedgraph")
    transcript_fa = os.path.join(args.outdir, "transcript_assembly.fa")
    transcript_fa_idx = os.path.join(args.outdir, "transcript_assembly.fa.fai")
    tmp_bam = os.path.join(args.outdir, "tmp.align.bam")
    tmp_bed = os.path.join(args.outdir, "tmp.splice_anno.bed12")
    genome_bam = os.path.join(args.outdir, "align2genome.bam")
    realign_bam = os.path.join(args.outdir, "realign2transcript.bam")
    tr_cnt_csv = os.path.join(args.outdir, "transcript_count.csv.gz")
    tr_badcov_cnt_csv = os.path.join(args.outdir, "transcript_count.bad_coverage.csv.gz")
    print("Input parameters:")
    print("\tgene annotation:", args.gff3)
    print("\tgenome fasta:", args.genomefa)
    if args.inbam != "":
        print("\tinput bam:", args.inbam)
        genome_bam = args.inbam  # use the provided alignment instead of aligning here
    else:
        print("\tinput fastq:", args.infq)
    print("\toutput directory:", args.outdir)
    print("\tdirectory contains minimap2:", args.minimap2_dir)
    # align reads to genome
    if args.inbam == "" and config_dict["pipeline_parameters"]["do_genome_alignment"]:
        print("### align reads to genome using minimap2", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        if config_dict["alignment_parameters"]["use_junctions"]:
            gff3_to_bed12(args.minimap2_dir, args.gff3, tmp_bed)
        minimap2_align(args.minimap2_dir, args.genomefa, args.infq, tmp_bam,
                       no_flank=config_dict["alignment_parameters"]["no_flank"],
                       bed12_junc=tmp_bed if config_dict["alignment_parameters"]["use_junctions"] else None)
        samtools_sort_index(tmp_bam, genome_bam)
        os.remove(tmp_bam)
        if config_dict["alignment_parameters"]["use_junctions"]:
            os.remove(tmp_bed)
    else:
        print("### skip aligning reads to genome", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # find isoform
    print("### read gene annotation", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(args.gff3)
    transcript_to_junctions = {tr: blocks_to_junctions(transcript_to_exon[tr]) for tr in transcript_to_exon}
    remove_similar_tr(transcript_dict, gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    if config_dict["pipeline_parameters"]["do_isoform_identification"]:
        print("### find isoforms", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        group_bam2isoform(genome_bam, isoform_gff3, tss_tes_stat, "", chr_to_blocks,
                          gene_dict, transcript_to_junctions, transcript_dict, args.genomefa,
                          config=config_dict["isoform_parameters"],
                          downsample_ratio=args.downsample_ratio,
                          raw_gff3=raw_splice_isoform if config_dict["global_parameters"]["generate_raw_isoform"] else None)
    else:
        print("### skip finding isoforms", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # get fasta
    # print "### generate transcript fasta file", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(isoform_gff3)
    ref_dict = {"chr_to_gene": chr_to_gene,
                "transcript_dict": transcript_dict,
                "gene_to_transcript": gene_to_transcript,
                "transcript_to_exon": transcript_to_exon}
    if not config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = None
    get_transcript_seq(args.genomefa, transcript_fa, chr_to_gene_i,
                       transcript_dict_i, gene_to_transcript_i,
                       transcript_to_exon_i, ref_dict=ref_dict)
    # realign to transcript using minimap2
    if config_dict["pipeline_parameters"]["do_read_realignment"]:
        print("### realign to transcript using minimap2", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        minimap2_tr_align(args.minimap2_dir, transcript_fa, args.infq, tmp_bam)
        samtools_sort_index(tmp_bam, realign_bam)
        os.remove(tmp_bam)
    else:
        print("### skip read realignment", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # quantification
    if config_dict["pipeline_parameters"]["do_transcript_quantification"]:
        print("### generate transcript count matrix", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept = parse_realigned_bam(
            realign_bam, transcript_fa_idx,
            config_dict["isoform_parameters"]["Min_sup_cnt"],
            config_dict["transcript_counting"]["min_tr_coverage"],
            config_dict["transcript_counting"]["min_read_coverage"])
        # realigned_bam_coverage(realign_bam, transcript_fa_idx, args.outdir)
        tr_cnt = wrt_tr_to_csv(bc_tr_count_dict, transcript_dict_i, tr_cnt_csv,
                               transcript_dict, config_dict["global_parameters"]["has_UMI"])
        wrt_tr_to_csv(bc_tr_badcov_count_dict, transcript_dict_i, tr_badcov_cnt_csv,
                      transcript_dict, config_dict["global_parameters"]["has_UMI"])
        annotate_filter_gff(isoform_gff3, args.gff3, isoform_gff3_f, FSM_anno_out,
                            tr_cnt, config_dict["isoform_parameters"]["Min_sup_cnt"])
    else:
        print("### skip transcript quantification", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))