def Get_Skipend_dict(region_fetch,bamfile,strand):
    """Tally splice-junction boundary coordinates for reads in a region.

    For every "N" CIGAR operation (size > 0) of an accepted alignment one
    reference coordinate is recorded: the end of the skipped interval when
    ``strand`` is "+", otherwise its start.

    Single-end records are ignored when unaligned or when their NH tag is
    greater than 1.  Read pairs are ignored when either present mate has an
    aQual below ``minaqual`` (a name taken from the enclosing scope) or an NH
    tag greater than 1.

    Args:
        region_fetch: region string handed to ``BAM_Reader.fetch(region=...)``.
        bamfile: path of the BAM file.
        strand: "+" selects interval ends; anything else selects starts.

    Returns:
        dict mapping each recorded coordinate to the number of times it was seen.
    """
    def _junction_coords(alignment):
        # One coordinate per non-empty "N" operation of this alignment.
        if strand == "+":
            return [int(op.ref_iv.end) for op in alignment.cigar
                    if op.type == "N" and op.size > 0]
        return [int(op.ref_iv.start) for op in alignment.cigar
                if op.type == "N" and op.size > 0]

    reader = HTSeq.BAM_Reader(bamfile)
    alignments = reader.fetch(region=region_fetch)
    # The first record of the whole file decides single-end vs paired-end handling.
    first_record = next(iter(reader.fetch()))
    coords = []
    paired = first_record.paired_end
    if paired:
        alignments = HTSeq.pair_SAM_alignments_with_buffer(alignments)

    for a in alignments:
        if not paired:
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            coords.extend(_junction_coords(a))
            continue

        mate1 = a[0]
        mate2 = a[1]
        if ((mate1 and mate1.aQual<minaqual) or (mate2 and mate2.aQual<minaqual)):
            continue
        if ((mate1 and mate1.optional_field('NH') > 1) or (mate2 and mate2.optional_field('NH')>1)):
            continue
        for mate in (mate1, mate2):
            if mate is not None and mate.aligned:
                coords.extend(_junction_coords(mate))

    return dict(collections.Counter(coords))
def compute_quality(
    readfilename,
    file_type,
    nosplit,
    readlen,
    max_qual,
    gamma,
    primary_only=False,
    max_records=-1,
):
    """Accumulate per-position base and quality statistics for a read file.

    Args:
        readfilename: path of the file to read.
        file_type: one of "sam", "bam", "solexa-export", "fastq",
            "solexa-fastq".
        nosplit: if true, aligned and unaligned reads are pooled.
        readlen: number of positions to track; when None it is obtained from
            ``get_read_length``.
        max_qual: highest quality score tracked (arrays have max_qual+1 columns).
        gamma: accepted but not used by this function.
        primary_only: skip aligned records flagged as non-primary.
        max_records: stop after this many counted records (-1 never matches,
            so the whole file is read).

    Returns:
        dict with keys 'isAlnmntFile', 'readlen', 'twoColumns',
        'base_arr_U_n', 'qual_arr_U_n', 'nreads_U', plus the matching
        '*_A*' entries when aligned reads are tracked separately.

    Raises:
        ValueError: if ``file_type`` is not one of the supported names.
    """
    if file_type in ("sam", "bam"):
        readfile, isAlnmntFile = HTSeq.BAM_Reader(readfilename), True
    elif file_type == "solexa-export":
        readfile, isAlnmntFile = HTSeq.SolexaExportReader(readfilename), True
    elif file_type == "fastq":
        readfile, isAlnmntFile = HTSeq.FastqReader(readfilename), False
    elif file_type == "solexa-fastq":
        readfile, isAlnmntFile = HTSeq.FastqReader(readfilename, "solexa"), False
    else:
        raise ValueError('File format not recognized: {:}'.format(file_type))

    twoColumns = isAlnmntFile and (not nosplit)
    if readlen is None:
        readlen = get_read_length(readfile, isAlnmntFile)

    def _counter(n_columns):
        # One row per read position, one column per base / quality value.
        return np.zeros((readlen, n_columns), np.int64)

    base_arr_U = _counter(5)
    qual_arr_U = _counter(max_qual+1)
    if twoColumns:
        base_arr_A = _counter(5)
        qual_arr_A = _counter(max_qual+1)

    # The progress text depends only on the inputs, so pick it once.
    if (not isAlnmntFile) or primary_only:
        progress_text = "reads processed"
    else:
        progress_text = "alignments processed"

    n_counted = 0
    try:
        for a in readfile:
            r = a.read if isAlnmntFile else a

            # Exclude non-primary alignments if requested
            if isAlnmntFile and primary_only and a.aligned and a.not_primary_alignment:
                continue

            if twoColumns and isAlnmntFile and a.aligned:
                base_target, qual_target = base_arr_A, qual_arr_A
            else:
                base_target, qual_target = base_arr_U, qual_arr_U
            r.add_bases_to_count_array(base_target)
            r.add_qual_to_count_array(qual_target)

            n_counted += 1
            if n_counted == max_records:
                break
            if (n_counted % 200000) == 0:
                print(n_counted, progress_text)
    except:
        # Report where the reader was, then let the original error propagate.
        sys.stderr.write("Error occured in: %s\n" %
                         readfile.get_line_number_string())
        raise
    print(n_counted, progress_text)

    def norm_by_pos(arr):
        # Divide every row by its own total; cells that were 0 stay 0.
        as_float = np.array(arr, np.float64)
        normed = as_float / as_float.sum(1, keepdims=True)
        normed[as_float == 0] = 0
        return normed

    def norm_by_start(arr):
        # Divide everything by the total of the first row; cells that were 0 stay 0.
        as_float = np.array(arr, np.float64)
        normed = as_float / as_float.sum(1)[0]
        normed[as_float == 0] = 0
        return normed

    result = {
        'isAlnmntFile': isAlnmntFile,
        'readlen': readlen,
        'twoColumns': twoColumns,
        'base_arr_U_n': norm_by_pos(base_arr_U),
        'qual_arr_U_n': norm_by_start(qual_arr_U),
        'nreads_U': base_arr_U[0, :].sum(),
    }
    if twoColumns:
        result['base_arr_A_n'] = norm_by_pos(base_arr_A)
        result['qual_arr_A_n'] = norm_by_start(qual_arr_A)
        result['nreads_A'] = base_arr_A[0, :].sum()
    return result
def count_reads(start_codon_sites, stop_codon_sites, ORF_features, counts, map_file, stranded, min_quality, count_mode, first_exclude_codons, last_exclude_codons, min_read, max_read, exclude_min_ORF):
    """Assign alignments from a SAM/BAM file to ORFs and update ``counts``.

    A record is skipped (and tallied in a summary bucket) when it is
    unaligned, has an NH tag > 1, has aQual below ``min_quality``, or has a
    read length outside [min_read, max_read].  The "M" CIGAR blocks of the
    remaining records are looked up in ``ORF_features`` (strand-inverted when
    ``stranded`` is "reverse"); the resulting feature sets are combined by
    intersection ("intersection-strict") or union ("union").  Only records
    resolving to exactly one ORF are counted.  Reads shorter than
    ``exclude_min_ORF`` are counted unconditionally; otherwise reads starting
    within ``first_exclude_codons`` codons of the start site or ending within
    ``last_exclude_codons`` codons of the stop site are not counted.

    Returns:
        The same ``counts`` mapping, updated in place, including the summary
        keys "__too_low_quality", "__not_aligned", the two length buckets and
        "__alignment_not_unique".
    """
    n_low_quality = 0
    n_unaligned = 0
    n_multimapped = 0
    n_too_short = 0
    n_too_long = 0
    short_key = "__too_short(<%i)" % min_read
    # NOTE(review): the label says "<" although this bucket collects reads
    # longer than max_read; the text is a dictionary key and is kept verbatim.
    long_key = "__too_long(<%i)" % max_read
    head_margin_nt = first_exclude_codons * 3
    tail_margin_nt = last_exclude_codons * 3

    # pysam is only used to find out whether the file is BAM or SAM.
    probe = pysam.AlignmentFile(map_file)
    is_bam = probe.is_bam
    probe.close()
    tracks = HTSeq.BAM_Reader(map_file) if is_bam else HTSeq.SAM_Reader(map_file)

    for r in tracks:
        if not r.aligned:
            n_unaligned += 1
            continue

        try:
            multimapped = r.optional_field("NH") > 1
        except KeyError:
            # No NH tag: the record is treated as uniquely mapped.
            multimapped = False
        if multimapped:
            n_multimapped += 1
            continue

        if r.aQual < min_quality:
            n_low_quality += 1
            continue

        read_len = len(r.read.seq)
        if read_len < min_read:
            n_too_short += 1
            continue
        if read_len > max_read:
            n_too_long += 1
            continue

        blocks = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)
        if not (stranded != "reverse"):
            blocks = (invert_strand(block) for block in blocks)

        try:
            if count_mode == "intersection-strict":
                fs = None
                for block in blocks:
                    for _, step_features in ORF_features[block].steps():
                        fs = step_features.copy() if fs is None else fs.intersection(step_features)
            elif count_mode == "union":
                fs = set()
                for block in blocks:
                    for _, step_features in ORF_features[block].steps():
                        fs = fs.union(step_features)

            # Only an unambiguous single-ORF assignment is counted.
            if fs is None or len(fs) != 1:
                continue
            orf_id = list(fs)[0]

            if read_len < exclude_min_ORF:
                counts[orf_id] += 1
                continue

            try:
                if (abs(start_codon_sites[orf_id] - r.iv.start_d) < head_margin_nt
                        or abs(r.iv.end_d - stop_codon_sites[orf_id]) < tail_margin_nt):
                    continue
                counts[orf_id] += 1
            except:
                # Any failure while checking the margins falls back to counting.
                counts[orf_id] += 1
        except:
            sys.stderr.write(
                "Error occurred when processing mapping file in line:%s\n" %
                r.get_sam_line())

    counts["__too_low_quality"] += n_low_quality
    counts["__not_aligned"] += n_unaligned
    counts[short_key] += n_too_short
    counts[long_key] += n_too_long
    counts["__alignment_not_unique"] += n_multimapped
    return counts
def Get_IPAevent(input_tuple):
    """Scan the introns of one annotated label for coverage drops and collect the hits.

    Args:
        input_tuple: ``(label, all_bamfiles)`` - a key of the ``annot`` mapping
            (taken from the enclosing scope) and a list of BAM file paths.

    Returns:
        list of the values returned by ``Get_IPAsite_IPUI``, one per intron
        that passed every filter below.
    """
    label,all_bamfiles = input_tuple
    curr_label_all_gas = []
    curr_label_all_ga = []
    curr_label_all_gene_count = []
    IPA_result = []
    # An intron is analysed only if at least one BAM has more than this many counts for it.
    min_count = 30
    # Per-BAM structures for this label, stored in the same order as all_bamfiles.
    for bamfile in all_bamfiles:
        bam_reader = HTSeq.BAM_Reader(bamfile)
        gas,ga,gene_count = Get_label_information(label,annot,bam_reader)
        curr_label_all_gas.append(gas)
        curr_label_all_ga.append(ga)
        curr_label_all_gene_count.append(gene_count)
    for feature,rank,chrom,start,end,strand,length,exon_rank_left,exon_rank_right in annot[label]:
        # Only introns longer than 250 are considered.
        if feature == "intron" and int(length)>250:
            intron_start = start
            intron_end = end
            # Default reference value for the skip-start comparison further down;
            # replaced by the junction count when a "Skipped" junction is found.
            end_value = 15
            # Indices of the BAM files whose count for this intron exceeds min_count.
            index_list = [index for index,gene_count in enumerate(curr_label_all_gene_count) if gene_count[('intron',rank)]> min_count]
            if index_list != []:
                iv = HTSeq.GenomicInterval(chrom,intron_start,intron_end,strand)
                IPAtype = "Composite"
                # Per-position coverage of the intron for each selected BAM;
                # reversed on the "-" strand so index 0 is always the strand-wise start.
                curr_label_all_cov = []
                for index in index_list:
                    if strand == "-":
                        curr_label_all_cov.append(list(curr_label_all_ga[index][iv])[::-1])
                    else:
                        curr_label_all_cov.append(list(curr_label_all_ga[index][iv]))
                intron_region = chrom+":"+str(intron_start)+"-"+str(intron_end)
                skipend_dict_list = [Get_Skipend_dict(intron_region,bamfile,strand) for bamfile in all_bamfiles]
                # Look for the first junction boundary lying more than 50 inside the
                # intron with more than 10 occurrences.  The for/else + break pair
                # leaves both loops as soon as one is found.
                for index,skipend_dict in enumerate(skipend_dict_list):
                    for key,value in skipend_dict.items():
                        if int(start)+50 < int(key) < int(end)-50 and int(value) > 10:
                            if strand == "+":
                                skip_position = int(key)-int(start)
                            else:
                                skip_position = int(end)-int(key)
                            # Drop the coverage before the junction boundary.
                            curr_label_all_cov = [cvg_region[skip_position:] for cvg_region in curr_label_all_cov]
                            IPAtype = "Skipped"
                            # The loop variables are rebound: later coordinates are
                            # computed relative to the junction boundary.
                            start = int(key)
                            end = int(key)
                            end_value = int(value)
                            break
                    else:
                        continue
                    break
                min_mseratio_list,min_mse_point_list = Get_min_mseratio_list(curr_label_all_cov)
                min_mseratio = min(min_mseratio_list)
                min_mseratio_index = min_mseratio_list.index(min_mseratio)
                if min_mseratio < 0.5:
                    # Refine around the change point of the sample with the smallest ratio.
                    min_mseratio_list_refine,min_mse_point_list_refine = Get_min_mseratio_list_refine(curr_label_all_cov,min_mse_point_list[min_mseratio_index])
                    min_mseratio_refine = min(min_mseratio_list_refine)
                    min_mseratio_index_refine = min_mseratio_list_refine.index(min_mseratio_refine)
                    IPA_point = int(min_mse_point_list_refine[min_mseratio_index_refine])
                    # Largest (mean upstream - mean downstream) coverage difference over samples.
                    up_down_diff = max([np.mean(coverage[:IPA_point])-np.mean(coverage[IPA_point:]) for coverage in curr_label_all_cov])
                    # Fraction of positions with coverage > 5: best sample upstream, mean over samples downstream.
                    upstream_cov = max([len(list(filter(lambda x:x>5,coverage[:IPA_point])))/IPA_point for coverage in curr_label_all_cov])
                    downstream_cov = np.mean([len(list(filter(lambda x:x>5,coverage[IPA_point:])))/(len(coverage)-IPA_point) for coverage in curr_label_all_cov])
                    if min_mseratio_refine < 0.5 and up_down_diff > 1 and upstream_cov > 0.8 and downstream_cov < 0.5:
                        if strand == "+":
                            IPA_location = int(start)+IPA_point
                            IPA_inf = chrom+":"+str(start)+"-"+str(IPA_location)
                        else:
                            IPA_location = int(end)-IPA_point
                            IPA_inf = chrom+":"+str(IPA_location)+"-"+str(end)
                        skipstart_dict = Get_Skipstart_dict(intron_region,all_bamfiles,strand)
                        # Reject the candidate if a skip start within 20 of the location
                        # has more than 80% of end_value occurrences; the else branch
                        # runs only when no such entry triggered the break.
                        for key,value in skipstart_dict.items():
                            if IPA_location-20<int(key)<IPA_location+20 and int(value) > end_value*0.8:
                                break
                        else:
                            intronPA_inf = label + ";"+feature + "_" + str(rank) + ";" + IPA_inf + ";" + IPAtype
                            # NOTE(review): `gas` is whatever the last BAM in the first loop
                            # produced - confirm that this is intended.
                            IPA_information = Get_IPAsite_IPUI((intronPA_inf,curr_label_all_ga,gas))
                            IPA_result.append(IPA_information)
    return IPA_result
if args.stranded == 'yes': feature_array = hts.GenomicArrayOfSets("auto", stranded=True) elif args.stranded == 'no': feature_array = hts.GenomicArrayOfSets("auto", stranded=False) for feature in gtf: if feature.type == args.type: feature_array[feature.iv] += feature.name print "done.\n\n" # create Reader class for samfile: if args.format == 'sam': alnmt_file = hts.SAM_Reader(args.alignment_file[0]) else: alnmt_file = hts.BAM_Reader(args.alignment_file[0]) # count reads: print "Counting reads..." if args.read_type == 'single_end': counts = ungapped_se_counter(alnmt_file, feature_array) print "\nSample output for ungapped SE counts:" countlist = sorted(counts.items()) for g, c in countlist[-10:]: print "%-10s %d" % (g, c) else: counts = ungapped_pe_counter(alnmt_file, feature_array) print "\nSample output for ungapped PE counts:"
def test_bam_inconsistent_mate():
    """Smoke test: iterate over every record of the inconsistent_mate.bam example.

    The test passes if iterating the file does not raise.
    """
    print('Test inconsistent BAM file')
    reader = HTSeq.BAM_Reader("example_data/inconsistent_mate.bam")
    for _record in reader:
        continue
    print("Test passed")
attribute_label) #TE_gtf="/home/daniel/local_data/hg19/annotation/hg19_rmsk_TE.gtf" #TE_gtf="/home/daniel/local_data/hg19/annotation/hg19_rmsk_TE_exon_filtered.gtf" #TE_gtf="/home/daniel/local_data/hg19/annotation/hg19_rmsk_TE_fort2014_CAGEseq.gtf" attribute_label = 'transcript_id' feature_type = 'exon' TE_features = extract_GTF_features(args.TE_gtf, feature_type, attribute_label) TE_first_counts = collections.Counter() TE_second_counts = collections.Counter() TE_only_counts = collections.Counter() #bam_file="/home/daniel/local_data/hipsci/star/test_bam_chr19_sorted.bam" almnt_file = HTSeq.BAM_Reader(args.bam_file) nUnmapped = 0 nMultipleAlignments = 0 nAttributeErrors = 0 for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle=True): if len(bundle) != 1: nMultipleAlignments += 1 continue # Skip multiple alignments first_almnt, second_almnt = bundle[0] # extract pair if not (first_almnt and second_almnt): nUnmapped += 1 continue if (first_almnt.iv is None) or (second_almnt.iv is None):
def run(BED, BAMS1, BAMS2, mil_reads):
    """Compare read density over BED regions between two conditions.

    Each BED region is widened by 1000 on both sides (clamped at 0), the
    alignments overlapping it are counted in each of the four BAM files, and
    the counts are scaled by window length (per 1000) and by the matching
    entry of ``mil_reads``.  The two replicates of each condition are averaged.

    Args:
        BED: path of a tab-separated file whose first three columns are
            chromosome, start and stop.
        BAMS1: two BAM paths (replicates of condition 1).
        BAMS2: two BAM paths (replicates of condition 2).
        mil_reads: ``mil_reads[c][r]`` is the scaling divisor for condition
            ``c`` replicate ``r``.

    Returns:
        ``[counts1avglog, counts2avglog, KStest, foldchangelog]`` where the
        first two are log10 averages restricted to regions that are non-zero
        in both conditions, ``KStest`` is ``stats.ks_2samp`` on those two
        lists, and ``foldchangelog`` is log10(condition 2 / condition 1) for
        the same regions.
    """
    readers = (
        (HTSeq.BAM_Reader(BAMS1[0]), mil_reads[0][0]),
        (HTSeq.BAM_Reader(BAMS1[1]), mil_reads[0][1]),
        (HTSeq.BAM_Reader(BAMS2[0]), mil_reads[1][0]),
        (HTSeq.BAM_Reader(BAMS2[1]), mil_reads[1][1]),
    )

    regions = []
    with open(BED) as F:
        for line in F:
            chrom, start, stop = line.strip('\n').split('\t')[:3]
            start = int(start)
            stop = int(stop)
            # Only chromosome names of at most 5 characters are kept.
            if len(chrom) <= 5:
                # A window is added around each region; regions closer than
                # 1000 to the chromosome start get a window clamped at 0.
                regions.append(
                    HTSeq.GenomicInterval(chrom, max(start - 1000, 0),
                                          stop + 1000, '.'))

    def _scaled_counts(reader, million_reads):
        """Count alignments per window, scaled by window length and library size."""
        values = []
        for region in regions:
            hits = 0.0
            for _almnt in reader[region]:
                hits += 1.0
            # NOTE(review): region.length already includes the window, so this
            # adds the 2000 a second time.  Kept as in the original - confirm.
            length = region.length + 2000
            values.append(hits / (length / 1000.0) / million_reads)
        return values

    counts1rep1, counts1rep2, counts2rep1, counts2rep2 = [
        _scaled_counts(reader, million_reads)
        for reader, million_reads in readers
    ]

    counts1avg = [(x + y) / 2.0 for x, y in zip(counts1rep1, counts1rep2)]
    counts2avg = [(x + y) / 2.0 for x, y in zip(counts2rep1, counts2rep2)]

    # log10 is undefined at 0, so a region is dropped from BOTH conditions as
    # soon as either average is zero.  Filtering pairs (instead of zeroing
    # entries through aliased lists) leaves the averages untouched.
    nonzero_pairs = [(c1, c2) for c1, c2 in zip(counts1avg, counts2avg)
                     if c1 != 0.0 and c2 != 0.0]
    counts1avglog = [math.log10(c1) for c1, _ in nonzero_pairs]
    counts2avglog = [math.log10(c2) for _, c2 in nonzero_pairs]

    # 2-sample KS test
    KStest = stats.ks_2samp(counts1avglog, counts2avglog)

    # log10 fold change for the regions that are non-zero in both conditions
    foldchangelog = []
    for c1, c2 in nonzero_pairs:
        try:
            foldchangelog.append(math.log10(c2 / c1))
        except ValueError:
            # Non-positive ratio: no logarithm, skip this region.
            continue

    return [counts1avglog, counts2avglog, KStest, foldchangelog]
# Count, per exon feature name, the alignments that fall entirely inside
# exons carrying exactly one feature name.
# Usage: <script> annotation.gtf file1.bam [file2.bam ...]
import sys

gtffile = sys.argv[1]
listOfBams = sys.argv[2:]
#print(listOfBams)

# Strand-aware map from genomic position to the set of exon feature names covering it.
exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
gtf = HTSeq.GFF_Reader(gtffile, end_included=True)
for feature in gtf:
    if feature.type == "exon":
        exons[feature.iv] += feature.name

# NOTE(review): `counts` (and the HTSeq import) are not defined in this part of
# the script - presumably set up elsewhere; confirm before running standalone.
for bamfile in listOfBams:
    bamObj = HTSeq.BAM_Reader(bamfile)
    for alignment in bamObj:
        if alignment.aligned:
            # Intersect the feature sets of every step the alignment overlaps,
            # so only names covering the whole alignment survive.
            iset = None
            for interval2, step_set in exons[alignment.iv].steps():
                if iset is None:
                    iset = step_set.copy()
                else:
                    iset.intersection_update(step_set)
            # Count only unambiguous assignments (exactly one name left).
            if len(iset) == 1:
                counts[list(iset)[0]] += 1

# Report the totals sorted by feature name.
for name in sorted(counts.keys()):
    print(name, counts[name])
def mapping_reads2shared_exons_introns(refGene_txt, bam_filename, minaqual, stranded, order, max_buffer_size):
    """Count reads per exon/intron of every gene and report coverage.

    For each gene in the annotation the BAM file is queried for the gene span
    (padded by 500 on both sides).  Every accepted read (or read pair) is
    assigned to the features its aligned CIGAR blocks overlap: if any intron
    is hit only introns are counted, otherwise all hit exons are counted.
    One tab-separated line per feature is written to stdout (after a header
    line), and the global skip counters are written to stderr at the end.

    Args:
        refGene_txt: tab-separated annotation with the columns gene label,
            feature, rank, position ("chrom:start-end:strand") and length.
        bam_filename: path of the BAM file.
        minaqual: alignments with aQual below this are skipped.
        stranded: "yes", "reverse", or anything else for unstranded.
        order: "name" or "pos"; selects how mates are paired in paired-end mode.
        max_buffer_size: passed to ``pair_SAM_alignments_with_buffer``.

    Raises:
        ValueError: in paired-end mode when ``order`` is neither "name" nor
            "pos", or when a strand other than "+"/"-" has to be inverted.
    """
    # initialise counters
    counts = {}
    counts['_empty'] = 0
    counts['_ambiguous'] = 0
    counts['_lowaqual'] = 0
    counts['_notaligned'] = 0
    counts['_ambiguous_readpair_position'] = 0

    # Read BAM file
    bam_reader = HTSeq.BAM_Reader(bam_filename)

    # CIGAR match characters (alignment match, sequence match, sequence mismatch)
    cigar_char = ('M', '=', 'X')

    # Strand handling (modelled on htseq-count)
    stranded_boolean = stranded == 'yes' or stranded == 'reverse'
    reverse_boolean = stranded == 'reverse'

    def invert_strand(iv):
        """Return a copy of ``iv`` on the opposite strand."""
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    def aligned_blocks(alignment, flip):
        """Yield the reference intervals of the aligned CIGAR blocks.

        Intervals are strand-inverted when ``flip`` is true.
        """
        for cigop in alignment.cigar:
            if cigop.type in cigar_char and cigop.size > 0:
                yield invert_strand(cigop.ref_iv) if flip else cigop.ref_iv

    sys.stdout.write(
        "Gene\tfeature\trank\tposition\tlength\tread_counts\tread_counts_norm\tcoverage(%)\n"
    )

    annot = collections.OrderedDict()
    # The context manager closes the annotation file (it used to be leaked).
    with open(refGene_txt) as annot_file:
        for line in annot_file:
            gene_label, feature, rank, position, length = line.strip().split('\t')
            chrom, iv_str, strand = position.strip().split(':')
            start, end = map(int, iv_str.strip().split('-'))
            annot.setdefault(gene_label, []).append(
                (feature, int(rank), chrom, start, end, strand, int(length)))

    # SE/PE is a property of the file, so it is probed once, on first use,
    # instead of re-opening the BAM for every gene.
    pe_mode = None

    for gene_name in annot:
        gene_count = {}
        gas = HTSeq.GenomicArrayOfSets("auto", stranded=stranded_boolean)
        ga = HTSeq.GenomicArray("auto", stranded=stranded_boolean, typecode="i")

        # Annotation
        for feature, rank, chrom, start, end, strand, length in annot[gene_name]:
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            gas[iv] += (feature, rank)
            gene_count[(feature, rank)] = 0

        # Iterating bam_reader directly is problematic (reportedly because of
        # a pysam bug), so the gene region is fetched explicitly instead.
        boundary_left = min(item[3] for item in annot[gene_name])
        boundary_right = max(item[4] for item in annot[gene_name])
        region_fetch = annot[gene_name][0][2] + ':' + str(
            int(boundary_left) - 500) + '-' + str(int(boundary_right) + 500)
        read_seq = bam_reader.fetch(region=region_fetch)

        # distinguish SE and PE mode from the first record of the file
        if pe_mode is None:
            pe_mode = next(iter(bam_reader.fetch())).paired_end
        if pe_mode:
            if order == 'name':
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == 'pos':
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq, max_buffer_size=max_buffer_size)
            else:
                raise ValueError("Illegal order name.")

        # Mapping
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    counts['_notaligned'] += 1
                    continue
                if a.optional_field('NH') > 1:
                    continue
                if a.aQual < minaqual:
                    counts['_lowaqual'] += 1
                    continue
                # The non-reverse branch used to test only for "M"; it now
                # uses cigar_char like every other branch.
                iv_seq = aligned_blocks(a, reverse_boolean)
            # pe mode
            else:
                if ((a[0] and a[0].aQual < minaqual)
                        or (a[1] and a[1].aQual < minaqual)):
                    counts['_lowaqual'] += 1
                    continue
                if ((a[0] and a[0].optional_field('NH') > 1)
                        or (a[1] and a[1].optional_field('NH') > 1)):
                    continue
                if a[0] is not None and a[0].aligned:
                    iv_seq = aligned_blocks(a[0], reverse_boolean)
                else:
                    iv_seq = tuple()
                if a[1] is not None and a[1].aligned:
                    # The second mate is handled with the opposite strand
                    # orientation of the first.
                    iv_seq = itertools.chain(
                        iv_seq, aligned_blocks(a[1], not reverse_boolean))

            feature_aligned = set()
            for iv in iv_seq:
                for iv2, val2 in gas[iv].steps():
                    feature_aligned |= val2
                    ga[iv] += 1  # for calculating coverage

            if len(feature_aligned) == 0:
                counts['_empty'] += 1
                continue
            # when mapping to intron, discard exons
            for f in [item for item in feature_aligned if item[0] == 'intron']:
                gene_count[f] += 1
            # when no mapping to intron, count all exons
            if 'intron' not in [x for x, y in feature_aligned]:
                for f in feature_aligned:
                    gene_count[f] += 1

        res = []
        for feature, rank, chrom, start, end, strand, length in annot[gene_name]:
            feature_count = gene_count[(feature, rank)]
            # float() keeps this a true division on Python 2 as well.
            feature_count_norm = feature_count / float(length) * 1000
            # Coverage calculation: percentage of positions with depth > 0.
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            cvg_region = list(ga[iv])
            # filter() has no len() on Python 3, so count the covered
            # positions directly.
            covered = sum(1 for depth in cvg_region if depth > 0)
            cvg = covered / float(len(cvg_region)) * 100
            res.append([
                feature, rank, chrom, start, end, strand, length,
                feature_count, feature_count_norm, cvg
            ])

        # Output
        for feature, rank, chrom, start, end, strand, length, feature_count, feature_count_norm, cvg in res:
            pos = "%s:%d-%d:%s" % (chrom, start, end, strand)
            sys.stdout.write('\t'.join(
                map(str, [
                    gene_name, feature, rank, pos, length, feature_count,
                    feature_count_norm, cvg
                ])) + '\n')

    for fn in counts.keys():
        sys.stderr.write('%s\t%d\n' % (fn, counts[fn]))
# read in gtf and create a Genomic Array of Sets for all exons we find viral_gtf_path = HTSeq.GFF_Reader(args.viral_gtf_path) exons = HTSeq.GenomicArrayOfSets('auto', stranded=True) # get all contigs from the gtf file viral_gtf_contigs = {f.iv.chrom for f in viral_gtf_path} for feature in viral_gtf_path: if feature.type == 'exon': exons[feature.iv] += feature.attr['gene_id'] # add gene id to this feature's coordinates in the exons array # get alignments by umi by cell barcode alignments_by_umi_by_cell_barcode = dict() umi_to_ignore_by_cell_barcode = dict() # cell-umi barcodes that map to non-viral contigs for almnt in HTSeq.BAM_Reader(args.bam_path): assert isinstance(almnt, HTSeq.SAM_Alignment) # ignore secondary alignments and unmapped reads if not almnt.aligned or almnt.not_primary_alignment: continue # ignore alignments with invalid cell barcode or umi tags_present = {kv[0] for kv in almnt.optional_fields} if 'CB' not in tags_present or 'UB' not in tags_present: continue cell_barcode = almnt.optional_field('CB').split('-')[0] # get cell barcode # ignore cells not mapped to a predicted real cell
def readChrwithBam():
    """Count unspliced reads spanning both ends of every splice junction.

    Reads the junction list ``opt.totalsj`` (tab-separated: chromosome, start,
    end, strand, ...) and the BAM file ``opt.bam``.  For each junction on a
    "chr*" chromosome other than chrM, distinct read names are counted for
    alignments without an "N" CIGAR operation that pass the strand filter and
    span a window of ``opt.span`` at the junction start ("left") and end
    ("right").  One line per junction is written to ``w``: the input line, the
    ``sjnum`` entry for the junction ("0" if missing), then the two counts.
    On the "+" strand the order is left, right; otherwise right, left.

    ``opt``, ``sjnum`` and ``w`` are taken from the enclosing scope.
    """
    totalsjfile = opt.totalsj
    bam = HTSeq.BAM_Reader(opt.bam)

    def _wrong_strand(r, iv):
        """Strand filter: True when the alignment must be skipped."""
        return ((r.iv.strand != iv.strand and (not r.paired_end))
                or (r.paired_end and r.iv.strand != iv.strand
                    and r.pe_which == "first")
                or (r.paired_end and r.iv.strand == iv.strand
                    and r.pe_which == "second"))

    def _count_spanning(iv, spans):
        """Count distinct names of unspliced reads for which ``spans(read_iv, iv)``."""
        seen_names = set()
        for r in bam[iv]:
            # Spliced alignments (any "N" operation) are ignored.
            if any(co.type == "N" for co in r.cigar):
                continue
            if _wrong_strand(r, iv):
                continue
            if spans(r.iv, iv):
                seen_names.add(r.read.name)
        return len(seen_names)

    def _spans_left(read_iv, iv):
        return read_iv.start < iv.start and read_iv.end >= iv.end

    def _spans_right(read_iv, iv):
        return read_iv.start <= iv.start and read_iv.end > iv.end

    # The context manager closes the junction file (it used to be leaked).
    with open(totalsjfile) as sj_file:
        for eachLine in sj_file:
            line = eachLine.strip().split("\t")
            # e.g. chr7 34247275 34664347 +
            if line[0] == "chrM":
                continue
            if not line[0].startswith("chr"):
                continue

            s = int(line[1])
            e = int(line[2])
            iv1 = HTSeq.GenomicInterval(line[0], s, s + opt.span, line[3])
            iv2 = HTSeq.GenomicInterval(line[0], e - opt.span, e, line[3])
            name = line[0] + "\t" + line[1] + "\t" + line[2]

            reads_left = _count_spanning(iv1, _spans_left)
            reads_right = _count_spanning(iv2, _spans_right)

            if name not in sjnum:
                sjnum[name] = "0"
            w.writelines(eachLine.strip() + "\t" + sjnum[name] + "\t")
            if line[3] == "+":
                w.writelines(str(reads_left) + "\t" + str(reads_right) + "\n")
            else:
                w.writelines(str(reads_right) + "\t" + str(reads_left) + "\n")
def main():
    """Command-line entry point: plot per-position base and quality statistics.

    Parses the command line, reads the given read file (SAM, BAM, Solexa
    export, FASTQ or Solexa FASTQ), counts called bases and quality scores by
    position within the reads (separately for aligned and unaligned reads
    unless --nosplit is given) and saves the plots as a PDF file.

    Exits with status 1 when matplotlib is missing, when no argument is given,
    or when the number of positional arguments is not exactly one.
    """
    try:
        import matplotlib
    except ImportError:
        sys.stderr.write("This script needs the 'matplotlib' library, which ")
        sys.stderr.write("was not found. Please install it.\n")
        # Nothing below can work without matplotlib, so stop here instead of
        # failing later with a NameError.
        sys.exit(1)
    matplotlib.use('PDF')
    from matplotlib import pyplot

    # Matplotlib <1.5 uses normalize, so this block will be deprecated
    try:
        from matplotlib.pyplot import Normalize
    except ImportError:
        from matplotlib.pyplot import normalize as Normalize

    # **** Parse command line ****

    optParser = optparse.OptionParser(
        usage = "%prog [options] read_file",
        description=
            "This script take a file with high-throughput sequencing reads " +
            "(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
            "_sequence.txt) and performs a simply quality assessment by " +
            "producing plots showing the distribution of called bases and " +
            "base-call quality scores by position within the reads. The " +
            "plots are output as a PDF file.",
        epilog =
            "Written by Simon Anders ([email protected]), European Molecular Biology " +
            " Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
            " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__ )
    optParser.add_option( "-t", "--type", type="choice", dest="type",
        choices = ("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
        default = "sam", help="type of read_file (one of: sam [default], bam, " +
        "solexa-export, fastq, solexa-fastq)" )
    optParser.add_option( "-o", "--outfile", type="string", dest="outfile",
        help="output filename (default is <read_file>.pdf)" )
    optParser.add_option( "-r", "--readlength", type="int", dest="readlen",
        help="the maximum read length (when not specified, the script guesses from the file)" )
    optParser.add_option( "-g", "--gamma", type="float", dest="gamma",
        default = 0.3,
        help="the gamma factor for the contrast adjustment of the quality score plot" )
    optParser.add_option( "-n", "--nosplit", action="store_true", dest="nosplit",
        help="do not split reads in unaligned and aligned ones" )
    optParser.add_option( "-m", "--maxqual", type="int", dest="maxqual", default=41,
        help="the maximum quality score that appears in the data (default: 41)" )

    if len( sys.argv ) == 1:
        optParser.print_help()
        sys.exit(1)

    (opts, args) = optParser.parse_args()

    if len( args ) != 1:
        sys.stderr.write( sys.argv[0] + ": Error: Please provide one argument (the read_file).\n" )
        sys.stderr.write( "  Call with '-h' to get usage information.\n" )
        sys.exit( 1 )

    readfilename = args[0]

    if opts.type == "sam":
        readfile = HTSeq.SAM_Reader( readfilename )
        isAlnmntFile = True
    elif opts.type == "bam":
        readfile = HTSeq.BAM_Reader( readfilename )
        isAlnmntFile = True
    elif opts.type == "solexa-export":
        readfile = HTSeq.SolexaExportReader( readfilename )
        isAlnmntFile = True
    elif opts.type == "fastq":
        readfile = HTSeq.FastqReader( readfilename )
        isAlnmntFile = False
    elif opts.type == "solexa-fastq":
        readfile = HTSeq.FastqReader( readfilename, "solexa" )
        isAlnmntFile = False
    else:
        # Not reachable through optparse's `choices`; sys.exit replaces the
        # non-existent sys.error.
        sys.exit( "Oops." )

    twoColumns = isAlnmntFile and not opts.nosplit

    if opts.outfile is None:
        outfilename = os.path.basename( readfilename ) + ".pdf"
    else:
        outfilename = opts.outfile

    # **** Get read length ****

    if opts.readlen is not None:
        readlen = opts.readlen
    else:
        # Guess: the longest read among the first 10000 records.
        readlen = 0
        if isAlnmntFile:
            reads = ( a.read for a in readfile )
        else:
            reads = readfile
        for r in islice( reads, 10000 ):
            if len( r ) > readlen:
                readlen = len( r )

    max_qual = opts.maxqual
    gamma = opts.gamma

    # **** Initialize count arrays ****

    # The builtin int/float are what the removed numpy.int/numpy.float
    # aliases referred to, so the dtypes are unchanged.
    base_arr_U = numpy.zeros( ( readlen, 5 ), int )
    qual_arr_U = numpy.zeros( ( readlen, max_qual+1 ), int )
    if twoColumns:
        base_arr_A = numpy.zeros( ( readlen, 5 ), int )
        qual_arr_A = numpy.zeros( ( readlen, max_qual+1 ), int )

    # **** Main counting loop ****

    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
            else:
                r = a
            if twoColumns and (isAlnmntFile and a.aligned):
                r.add_bases_to_count_array( base_arr_A )
                r.add_qual_to_count_array( qual_arr_A )
            else:
                r.add_bases_to_count_array( base_arr_U )
                r.add_qual_to_count_array( qual_arr_U )
            i += 1
            if (i % 200000) == 0:
                print(i, "reads processed")
    except:
        # Report where the reader was, then let the original error propagate.
        sys.stderr.write( "Error occured in: %s\n" %
            readfile.get_line_number_string() )
        raise
    print(i, "reads processed")

    # **** Normalize result ****

    def norm_by_pos( arr ):
        # Divide every row by its own total; cells that were 0 stay 0.
        arr = numpy.array( arr, float )
        arr_n = ( arr.T / arr.sum( 1 ) ).T
        arr_n[ arr == 0 ] = 0
        return arr_n

    def norm_by_start( arr ):
        # Divide everything by the total of the first row; cells that were 0 stay 0.
        arr = numpy.array( arr, float )
        arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T
        arr_n[ arr == 0 ] = 0
        return arr_n

    base_arr_U_n = norm_by_pos( base_arr_U )
    qual_arr_U_n = norm_by_start( qual_arr_U )
    nreads_U = base_arr_U[0,:].sum()
    if twoColumns:
        base_arr_A_n = norm_by_pos( base_arr_A )
        qual_arr_A_n = norm_by_start( qual_arr_A )
        nreads_A = base_arr_A[0,:].sum()

    # **** Make plot ****

    def plot_bases( arr ):
        # One line per column of `arr`, labelled A, C, G, T, N.
        xg = numpy.arange( readlen )
        pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red')
        pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen')
        pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen')
        pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange')
        pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey')
        pyplot.axis( (0, readlen-1, 0, 1 ) )
        pyplot.text( readlen*.70, .9, "A", color="red" )
        pyplot.text( readlen*.75, .9, "C", color="darkgreen" )
        pyplot.text( readlen*.80, .9, "G", color="lightgreen" )
        pyplot.text( readlen*.85, .9, "T", color="orange" )
        pyplot.text( readlen*.90, .9, "N", color="grey" )

    pyplot.figure()
    pyplot.subplots_adjust( top=.85 )
    pyplot.suptitle( os.path.basename(readfilename), fontweight='bold' )

    if twoColumns:

        pyplot.subplot( 221 )
        plot_bases( base_arr_U_n )
        pyplot.ylabel( "proportion of base" )
        pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" %
            ( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) )

        pyplot.subplot( 222 )
        plot_bases( base_arr_A_n )
        pyplot.title( "aligned reads\n%.0f%% (%.3f million)" %
            ( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) )

        pyplot.subplot( 223 )
        pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
            norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )
        pyplot.ylabel( "base-call quality score" )

        pyplot.subplot( 224 )
        pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
            norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )

    else:

        pyplot.subplot( 211 )
        plot_bases( base_arr_U_n )
        pyplot.ylabel( "proportion of base" )
        pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) )

        pyplot.subplot( 212 )
        pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
            norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )
        pyplot.ylabel( "base-call quality score" )

    pyplot.savefig( outfilename )
0x4 : "segment unmapped", 0x8 : "next segment in the template unmapped", 0x10 : "SEQ being reverse complemented", 0x20 : "SEQ of the next segment in the template being reversed", 0x40 : "the first segment in the template", 0x80 : "the last segment in the template", 0x100 : "secondary alignment", 0x200 : "not passing quality controls", 0x400 : "PCR or optical duplicate", 0x800 : "supplementary alignment"} mate_mapped_same_chr, mate_mapped_dif_chr, mate_mapped_dif_chr_a5 = 0,0,0 unmapped, paired, read1, read2, properly, duplicate, total = 0, 0, 0, 0, 0, 0, 0 bamfile = HTSeq.BAM_Reader(bam) for almnt in bamfile: if almnt.aligned: if almnt.flag & 0x900 == 0: total += 1 if almnt.flag & 0x400 != 0: duplicate += 1 if almnt.mate_aligned: paired +=1 if almnt.proper_pair: properly += 1 if almnt.iv.chrom == almnt.mate_start.chrom: mate_mapped_same_chr += 1 else: mate_mapped_dif_chr += 1 if almnt.aQual >= 5:
#!/usr/bin/python
"""Print the absolute inferred insert size of every read pair in a BAM file.

Usage: <script> alignments.bam

One number is printed per record that is aligned, has an aligned mate and is
the first read of its pair.
"""
import os
import sys

import HTSeq

bam = HTSeq.BAM_Reader(sys.argv[1])
for each in bam:
    if each.aligned and each.mate_aligned:
        # Report each pair once, from its first read.
        if each.pe_which == 'first':
            # print() with a single argument behaves the same on Python 2 and
            # 3; the former print statement was a SyntaxError on Python 3.
            print(abs(each.inferred_insert_size))
def _count_unspliced_reads(bam, iv, covers):
    """Count distinct unspliced reads fetched from ``iv`` that satisfy ``covers``.

    :param bam: indexable BAM reader; ``bam[iv]`` must yield the alignments
        overlapping ``iv``.
    :param iv: window around a splice site.
    :param covers: callable ``covers(read_iv, iv) -> bool`` deciding whether a
        read spans the window.
    :returns: number of distinct read names that passed all filters.
    """
    seen = set()
    for r in bam[iv]:
        # Alignments spanning more than 150 reference bases are ignored.
        if r.iv.length > 150:
            continue
        # Any skipped region ("N" CIGAR operation) marks a spliced read;
        # only unspliced reads are counted here.
        if any(co.type == "N" for co in r.cigar):
            continue
        if not covers(r.iv, iv):
            continue
        # A set counts each read name once, however many records it has.
        seen.add(r.read.name)
    return len(seen)


def readChr_unstrand(chr, reads):
    """Count unspliced reads crossing both ends of each junction on ``chr``.

    The junction table ``opt.totalsj`` is read as tab-separated lines of
    ``chrom, start, end, strand`` (e.g. ``chr7 34247275 34664347 +``).  For
    every junction on ``chr`` a window of ``opt.span`` bases is placed at the
    left and at the right end, and the unspliced reads of ``opt.bam`` covering
    that window are counted without regard to strand.  Each distinct site
    (``chrom:pos:strand``) is counted only once.

    :param chr: chromosome name to process.
    :param reads: mapping that receives the result as
        ``reads[chr] = {"left": {site: count}, "right": {site: count}}``.
    :returns: None; the result is stored in ``reads``.
    """
    print(chr)
    reads_dict = {"left": {}, "right": {}}
    bam = HTSeq.BAM_Reader(opt.bam)

    def covers_left(read_iv, win):
        return read_iv.start < win.start and read_iv.end >= win.end

    def covers_right(read_iv, win):
        return read_iv.start <= win.start and read_iv.end > win.end

    j = 0
    # The context manager closes the junction file even if a line is malformed.
    with open(opt.totalsj) as sj_handle:
        for eachLine in sj_handle:
            line = eachLine.strip().split("\t")
            # chr7 34247275 34664347 +
            if line[0] != chr:
                continue
            j += 1
            if j % 1000 == 0:
                sys.stderr.write("%s : %d sj processed.\n" % (chr, j))

            lss = line[0] + ":" + line[1] + ":" + line[3]
            rss = line[0] + ":" + line[2] + ":" + line[3]
            s = int(line[1])
            e = int(line[2])

            # Junctions sharing a splice site reuse the first count.
            if lss not in reads_dict["left"]:
                iv1 = HTSeq.GenomicInterval(line[0], s - 1, s + opt.span, ".")
                reads_dict["left"][lss] = _count_unspliced_reads(
                    bam, iv1, covers_left)
            if rss not in reads_dict["right"]:
                iv2 = HTSeq.GenomicInterval(line[0], e - 1 - opt.span, e, ".")
                reads_dict["right"][rss] = _count_unspliced_reads(
                    bam, iv2, covers_right)

    reads[chr] = reads_dict
    logging.info("done %s" % chr)
bamfile = outdir + '/Aligned.toTranscriptome.out.bam' ############################################################ # process bam if not os.path.exists(bamfile): print "cannot find bamfile", bamfile sys.exit(2) statfile = os.path.dirname(os.path.abspath(bamfile)) + '/ReadsPerGene.out.tab' if not os.path.exists(statfile): print "Warning: Can't analyze the mapping stat because ", statfile, " not exist" bam_reader = HTSeq.BAM_Reader(bamfile) total = 0 print 'readname\ttranscript' for align in bam_reader: total += 1 myread = align.read.name mytrpt = align.iv.chrom print '{}\t{}'.format(myread, mytrpt) ############################################################ # mapping stat if not os.path.exists(statfile): print "cannot find mapping stat file", statfile
def count_biotype_overlaps(aligned_bam, selected_features, biotype_count_dict, number_lines=10000000):
    """
    Go through an aligned BAM, counting overlaps with biotype features.

    For every aligned record the feature sets along its interval are
    intersected.  The read is filed under the single biotype label it
    overlaps, under ``'no_overlap'`` or under ``'multiple_features'``.
    A tab-separated table of the non-zero counts is written to
    ``<bam basename>_biotypeCounts.txt`` in the current directory.

    :param aligned_bam: path to the BAM file.
    :param selected_features: genomic array of sets, indexed by the
        alignment interval; its values are biotype labels.
    :param biotype_count_dict: mapping with the keys ``'biotype_counts'``
        (label -> count) and ``'biotype_lengths'`` (label -> length -> count);
        both are updated in place.
    :param number_lines: maximum number of BAM records to read.
    :returns: the updated ``biotype_count_dict``.
    :raises IOError: if the counts file cannot be written.
    """
    # Set up filenames & objects
    aligned_bam = os.path.realpath(aligned_bam)
    bamfile = HTSeq.BAM_Reader(aligned_bam)
    max_lines = int(number_lines)

    # Go through alignments, counting transcript biotypes
    logging.info("\nReading BAM file (will stop at {}): ".format(number_lines))
    aligned_reads = 0
    # An explicit counter stays defined for an empty BAM and is a true record
    # count, not the last zero-based index.
    total_reads = 0
    for alnmt in bamfile:
        if total_reads >= max_lines:
            logging.info(
                "Reached {} lines in the aligned file, exiting..".format(
                    number_lines))
            break
        total_reads += 1
        if total_reads % 1000000 == 0:
            logging.debug("{} lines processed..".format(total_reads))
        if not alnmt.aligned:
            continue

        aligned_reads += 1
        iset = None
        for iv2, step_set in selected_features[alnmt.iv].steps():
            if iset is None:
                iset = step_set.copy()
            else:
                iset.intersection_update(step_set)
        if iset is None:
            # No steps at all: treat like an empty overlap, do not crash.
            iset = set()

        # Feature values were set as biotype label. Overlap with multiple
        # features with the same biotype will give length == 1
        if len(iset) == 1:
            key = next(iter(iset))
        elif len(iset) == 0:
            key = 'no_overlap'
        else:
            key = 'multiple_features'
        biotype_count_dict['biotype_counts'][key] += 1
        biotype_count_dict['biotype_lengths'][key][alnmt.iv.length] += 1

    biotype_counts = biotype_count_dict['biotype_counts']
    logging.info("\n{} overlaps found from {} aligned reads ({} reads total)"
                 .format(aligned_reads - biotype_counts['no_overlap'],
                         aligned_reads, total_reads))
    logging.info("{} reads had multiple feature overlaps\n"
                 .format(biotype_counts['multiple_features']))

    # Make a string table out of the counts, largest first.  Rows end in
    # "\n": the file is opened in text mode, which already translates line
    # endings, so os.linesep would double the "\r" on Windows.
    rows = ['Type\tRead Count\n']
    for biotype in sorted(biotype_counts, key=biotype_counts.get, reverse=True):
        if biotype_counts[biotype] == 0:
            continue
        rows.append("{}\t{}\n".format(biotype, biotype_counts[biotype]))
    counts_string = ''.join(rows)

    # Save to file; an IOError propagates unchanged (errno and filename kept).
    file_basename = os.path.splitext(os.path.basename(aligned_bam))[0]
    counts_file = "{}_biotypeCounts.txt".format(file_basename)
    with open(counts_file, 'w') as fh:
        print(counts_string, file=fh)

    # Return the counts
    return biotype_count_dict
#We need this little helper below: def reverse_strand(s): if s == "+": return "-" elif s == "-": return "+" else: raise SystemError, "illegal strand" # Now go through the aligned reads if not is_BAM: tmp_obj = HTSeq.SAM_Reader(sam_file) else: tmp_obj = HTSeq.BAM_Reader(sam_file) if not is_PE: num_reads = 0 # for a in HTSeq.SAM_Reader( sam_file ): for a in tmp_obj: if not a.aligned: counts['_notaligned'] += 1 continue if a.aQual < minaqual: counts['_lowaqual'] += 1 continue rs = set() for cigop in a.cigar: if cigop.type != "M":
import datetime if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--input', '-i', type=argparse.FileType('rb'), required=True) parser.add_argument('--output', '-o', type=argparse.FileType('w'), required=True) args = parser.parse_args() almnt_file = HTSeq.BAM_Reader(args.input) counts = HTSeq.GenomicArray("auto", stranded=False, typecode='i') fcounts = HTSeq.GenomicArray("auto", stranded=False, typecode='i') curChrom = None for almnt in almnt_file: if not almnt.aligned or almnt.not_primary_alignment or almnt.supplementary: continue if curChrom != almnt.iv.chrom: dt = datetime.datetime.now() print(dt.isoformat(), args.input.name, "Switching Chromosome", curChrom, almnt.iv.chrom)
def modifHTSeq(bam_filename, gff_filename, out_file, overlap_mode, feature_type, id_attribute, minaqual, exclude_start_distance, exclude_stop_distance, min_len, max_len):
    """Count uniquely assigned reads per feature, skipping reads near codons.

    Features of type ``feature_type`` are read from the GFF/GTF file and
    labelled with ``id_attribute`` (falling back to ``gene_id``).  Each BAM
    record that is aligned, uniquely mapped (NH <= 1), of mapping quality
    >= ``minaqual`` and of read length within ``[min_len, max_len]`` is
    assigned to features using ``overlap_mode``.  A read assigned to exactly
    one feature is counted unless its 5' end lies closer than
    ``exclude_start_distance`` to that feature's start codon or its 3' end
    lies closer than ``exclude_stop_distance`` to its stop codon.

    :param bam_filename: path of the BAM file.
    :param gff_filename: path of the GFF/GTF annotation.
    :param out_file: path of the tab-separated count table to write.
    :param overlap_mode: ``"union"``, ``"intersection-strict"`` or
        ``"intersection-nonempty"``; anything else terminates the program.
    :param feature_type: GFF feature type (3rd column) to count on.
    :param id_attribute: GFF attribute used as the feature identifier.
    :param minaqual: minimum alignment quality.
    :param exclude_start_distance: exclusion radius around the start codon.
    :param exclude_stop_distance: exclusion radius around the stop codon.
    :param min_len: minimum read length (inclusive).
    :param max_len: maximum read length (inclusive).
    :returns: None; results are written to ``out_file``.
    """
    # feature GenomicArrayOfSets
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    counts = {}
    start_codon_sites = {}
    stop_codon_sites = {}

    # GTF
    gff = HTSeq.GFF_Reader(gff_filename, end_included=True)
    i = 0
    for f in gff:
        if f.type == feature_type:
            # Some CDS/exon records lack the requested attribute but every
            # record has a gene_id, so fall back to that.
            if id_attribute in f.attr:
                feature_id = f.attr[id_attribute]
            else:
                feature_id = f.attr['gene_id']
            features[f.iv] += feature_id
            counts[feature_id] = 0

        # With several annotated codons per gene keep the most 5' start
        # codon and the most 3' stop codon.
        if f.type == "start_codon" and id_attribute in f.attr:
            gname = f.attr[id_attribute]
            if gname not in start_codon_sites:
                start_codon_sites[gname] = f.iv.start_d
            elif f.iv.strand == "+":
                start_codon_sites[gname] = min(f.iv.start_d, start_codon_sites[gname])
            else:
                start_codon_sites[gname] = max(f.iv.start_d, start_codon_sites[gname])
        if f.type == "stop_codon" and id_attribute in f.attr:
            gname = f.attr[id_attribute]
            if gname not in stop_codon_sites:
                stop_codon_sites[gname] = f.iv.end_d
            elif f.iv.strand == "+":
                stop_codon_sites[gname] = max(f.iv.end_d, stop_codon_sites[gname])
            else:
                stop_codon_sites[gname] = min(f.iv.end_d, stop_codon_sites[gname])

        i += 1
        if i % 100000 == 0:
            sys.stderr.write("%d GFF lines processed.\n" % i)

    # bam
    read_seq = HTSeq.BAM_Reader(bam_filename)

    # counts
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0:
            sys.stderr.write("%d SAM alignment record processed.\n" % i)
        i += 1

        if not r.aligned:
            notaligned += 1
            continue
        # A record without an NH tag is treated as uniquely mapped instead
        # of aborting the whole run with a KeyError.
        try:
            multi_mapped = r.optional_field("NH") > 1
        except KeyError:
            multi_mapped = False
        if multi_mapped:
            nonunique += 1
            continue
        if r.aQual < minaqual:
            lowqual += 1
            continue
        # Reads outside the length window are dropped without being tallied.
        if len(r.read.seq) < min_len or len(r.read.seq) > max_len:
            continue

        iv_seq = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
        elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
            fs = None
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
        else:
            sys.exit("Illegal overlap mode.")

        if fs is None or len(fs) == 0:
            empty += 1
        elif len(fs) > 1:
            ambiguous += 1
        else:
            gene = next(iter(fs))
            # Some genes have no annotated start or stop codon.  The two
            # lookups are independent, so a missing start codon no longer
            # switches off the stop-codon exclusion (and vice versa).
            start_site = start_codon_sites.get(gene)
            stop_site = stop_codon_sites.get(gene)
            if start_site is not None and abs(start_site - r.iv.start_d) < exclude_start_distance:
                continue
            if stop_site is not None and abs(r.iv.end_d - stop_site) < exclude_stop_distance:
                continue
            counts[gene] += 1

    # output
    with open(out_file, "w") as fout:
        fout.write("%s\t%s\n" % (id_attribute.strip(), "count"))
        for fn in sorted(counts.keys()):
            fout.write("%s\t%s\n" % (fn, counts[fn]))
        fout.write("__no_feature\t%d\n" % empty)
        fout.write("__ambiguous\t%d\n" % ambiguous)
        fout.write("__too_low_aQual\t%d\n" % lowqual)
        fout.write("__not_aligned\t%d\n" % notaligned)
        fout.write("__alignment_not_unique\t%d\n" % nonunique)
def readChrwithBam(chr, reads):
    """Count reads covering the splice sites of ``chr`` in one BAM pass.

    The junction table ``opt.totalsj`` is read as tab-separated lines of
    ``chrom, start, end, strand`` (e.g. ``chr7 34247275 34664347 +``).  Every
    left and right splice site on ``chr`` is registered as a one-base
    stranded interval.  The BAM file ``opt.bam`` is then fetched once over
    the span of all junctions, and each read whose matched ("M") blocks
    cover a site with ``opt.span`` bases of margin increments that site's
    counter, at most once per read name.

    :param chr: chromosome name to process.
    :param reads: mapping that receives the result as
        ``reads[chr] = {"left": {site: count}, "right": {site: count}}``.
    :returns: None; the result is stored in ``reads``.
    """
    print(chr)
    reads_dict = {"left": {}, "right": {}}
    # Read names already counted, keyed by splice-site interval.
    usedreads = {}
    totalsjfile = opt.totalsj
    ga = HTSeq.GenomicArrayOfSets([chr], stranded=True)
    ga2 = HTSeq.GenomicArrayOfSets([chr], stranded=True)
    minpos = 10000000000
    maxpos = 0
    # The context manager closes the junction file even if a line is malformed.
    with open(totalsjfile) as sj_handle:
        for eachLine in sj_handle:
            line = eachLine.strip().split("\t")
            # chr7 34247275 34664347 +
            if line[0] != chr:
                continue
            lss = line[0] + ":" + line[1] + ":" + line[3]
            rss = line[0] + ":" + line[2] + ":" + line[3]
            reads_dict["left"][lss] = 0
            reads_dict["right"][rss] = 0
            s = int(line[1])
            e = int(line[2])
            if s < minpos:
                minpos = s
            if e > maxpos:
                maxpos = e
            iv1 = HTSeq.GenomicInterval(line[0], s - 1, s, line[3])
            iv2 = HTSeq.GenomicInterval(line[0], e - 1, e, line[3])
            usedreads[iv1] = {}
            usedreads[iv2] = {}
            ga[iv1] += lss
            ga2[iv2] += rss

    if not usedreads:
        # No junction on this chromosome: minpos > maxpos would give an
        # inverted fetch interval, so report empty counts instead.
        # NOTE(review): HTSeq is expected to reject start > end - confirm.
        reads[chr] = reads_dict
        logging.info("done %s" % chr)
        return

    bamfile = opt.bam
    bam = HTSeq.BAM_Reader(bamfile)
    giv = HTSeq.GenomicInterval(chr, minpos, maxpos, ".")
    j = 0
    print("start reading bam of " + chr)
    for r in bam[giv]:
        j += 1
        if j > 0 and j % 100000 == 0:
            sys.stderr.write("%s : %d sj processed.\n" % (chr, j))
        if j > 0 and j % 1000000 == 0:
            # Periodically drop the name caches of sites more than 5000
            # bases upstream of the current read.
            for tiv in usedreads:
                if tiv.start < r.iv.start - 5000:
                    usedreads[tiv].clear()

        r_name = r.read.name
        iv_seq = []
        if opt.unstrand:
            # Unstranded library: test each matched block on both strands.
            iv_seq1 = [
                co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0
            ]
            iv_seq2 = [
                invert_strand(co.ref_iv) for co in r.cigar
                if co.type == "M" and co.size > 0
            ]
            iv_seq = iv_seq1 + iv_seq2
        else:
            if ((not r.paired_end) or (r.paired_end and r.pe_which == "first")):
                # Keep only the CIGAR parts of type M and store them in iv_seq.
                iv_seq = [
                    co.ref_iv for co in r.cigar
                    if co.type == "M" and co.size > 0
                ]
            if (r.paired_end and r.pe_which == "second"):
                # The second mate's blocks are strand-inverted before lookup.
                iv_seq = [
                    invert_strand(co.ref_iv) for co in r.cigar
                    if co.type == "M" and co.size > 0
                ]

        for iv2 in iv_seq:
            for iv, fs in ga[iv2].steps():
                if len(fs) == 1:
                    if iv.start - 1 >= iv2.start and iv.start + opt.span <= iv2.end:
                        if r_name in usedreads[iv]:
                            continue
                        usedreads[iv][r_name] = True
                        ss = list(fs)[0]
                        reads_dict["left"][ss] += 1
            for iv, fs in ga2[iv2].steps():
                if len(fs) == 1:
                    if iv.start - opt.span >= iv2.start and iv.start + 1 <= iv2.end:
                        if r_name in usedreads[iv]:
                            continue
                        usedreads[iv][r_name] = True
                        ss = list(fs)[0]
                        reads_dict["right"][ss] += 1

    reads[chr] = reads_dict.copy()
    del reads_dict
    del usedreads
    logging.info("done %s" % chr)
def pool(infile, targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss):
    """Tally splicing-state and junction counts for one sample.

    Read pairs are taken from ``<infile>/<infile>.bam``.  For each first mate
    that is aligned with quality > 5 and does not start inside a target, the
    targets overlapped by its matched blocks (``introns``) and by its
    skipped regions (``junctions``) are collected, and each usable splice
    junction is classified against the annotated splice sites.  Five
    tab-separated count tables are written into the ``<infile>`` directory.

    :param infile: sample name; also the directory holding the BAM file and
        receiving the output tables.
    :param targets: genomic array of sets whose values are strings of the
        form ``"<gene>;<number>"`` (only the ``;`` split is relied on).
    :param intron_set: iterable of intron identifiers to report.
    :param fiveSS: mapping ``(chrom, pos, strand) -> (gene, coordinate)``
        of annotated 5' splice sites.
    :param threeSS: same for annotated 3' splice sites.
    :param Branches: genomic array of sets for the lariat-intermediate
        windows, indexed by read-2 start position.
    :param Branchto3ss: same for the branch-to-3'SS windows.
    :returns: None; results are written to files.
    """
    # NOTE(review): block nesting was reconstructed from the logic - confirm
    # against the original layout.
    SI_counts = defaultdict(int)
    junction_counts = defaultdict(int)
    bam_path = '%s/%s.bam' % (infile, infile)
    for f, s in HTSeq.pair_SAM_alignments_with_buffer(HTSeq.BAM_Reader(bam_path)):
        if f is None or not f.aligned or not f.aQual > 5:
            continue

        chrome = f.iv.chrom
        start = f.iv.start
        end = f.iv.end
        strand = f.iv.strand
        if strand == '+':
            geneint = HTSeq.GenomicPosition(chrome, start, strand)
        else:
            geneint = HTSeq.GenomicPosition(chrome, end, strand)
        # Reads whose start position already lies inside a target are skipped.
        if len(targets[geneint]) != 0:
            continue

        introns = set()
        junctions = set()
        for i, cigop in enumerate(f.cigar):
            if cigop.type == 'M':
                for iv, val in targets[cigop.ref_iv].steps():
                    introns |= val
            elif cigop.type == 'N':
                # Require more than 3 matched bases on both sides of the gap.
                if not (f.cigar[i - 1].type == 'M' and f.cigar[i - 1].size > 3
                        and f.cigar[i + 1].type == 'M'
                        and f.cigar[i + 1].size > 3):
                    continue
                for iv, val in targets[cigop.ref_iv].steps():
                    junctions |= val
                chrom = cigop.ref_iv.chrom
                if cigop.ref_iv.strand == '+':
                    first = cigop.ref_iv.end
                    second = cigop.ref_iv.start + 1
                    sj_strand = "+"
                else:
                    first = cigop.ref_iv.start + 1
                    second = cigop.ref_iv.end
                    sj_strand = '-'
                five_key = (chrom, first, sj_strand)
                three_key = (chrom, second, sj_strand)
                if five_key in fiveSS and three_key in threeSS:
                    up = fiveSS[five_key]
                    down = threeSS[three_key]
                    if up[0] == down[0]:
                        if up[1] == down[1]:
                            junction_counts[(infile, up[0], int(up[1]), int(down[1]) + 1, "Constituitive")] += 1
                        else:
                            junction_counts[(infile, up[0], int(up[1]), int(down[1]) + 1, "Exon Skipping")] += 1
                elif five_key in fiveSS:
                    # Only the 5' site is annotated.  Look it up here rather
                    # than reuse `up`/`down` from an earlier junction, which
                    # were unbound or stale.
                    # NOTE(review): the unannotated end is reported with the
                    # observed coordinate - confirm the convention.
                    up = fiveSS[five_key]
                    junction_counts[(infile, up[0], int(up[1]), second, "Alternative 3'")] += 1
                elif three_key in threeSS:
                    # Only the 3' site is annotated (same caveat as above).
                    down = threeSS[three_key]
                    junction_counts[(infile, down[0], first, int(down[1]) + 1, "Alternative 5'")] += 1

        # Pick one representative intron / junction: the one whose second
        # ';'-separated field is largest.
        # NOTE(review): the fields are compared as strings - confirm that is
        # intended.
        intron = ''
        junction = ''
        if len(introns) > 0:
            intron = max(introns, key=lambda name: name.split(';')[1])
        if len(junctions) > 0:
            junction = max(junctions, key=lambda name: name.split(';')[1])
        if junction == intron:
            intron = ''
            junction = ''
        if junction and intron:
            if junction.split(';')[1] > intron.split(';')[1]:
                intron = ''
            else:
                junction = ''

        candidate_genes = set()
        for name in introns:
            candidate_genes.add(name.split(';')[0])
        for name in junctions:
            candidate_genes.add(name.split(';')[0])
        if len(candidate_genes) != 1:
            continue

        if junction:
            SI_counts[('mature', junction)] += 1
        if intron:
            SI_counts[('premature', intron)] += 1

        # The pairing function may give None for a missing mate, so test for
        # it before touching any attribute of `s`.
        mate_ok = (s is not None and s.aligned and s.proper_pair
                   and s.aQual > 5)
        if f.proper_pair and mate_ok:
            if junction:
                SI_counts[('concordant_mature', junction)] += 1
            if intron:
                SI_counts[('concordant_premature', intron)] += 1

        # Counts starting position of read 2's that fall within specified
        # lariat intermediate and branch to 3'SS windows
        # `intron` is a string, so test its truthiness (`intron > 0` is a
        # TypeError on Python 3).
        if intron and mate_ok:
            chrome = s.iv.chrom
            start = s.iv.start
            end = s.iv.end
            strand = s.iv.strand
            if strand == '+':
                geneint = HTSeq.GenomicPosition(chrome, start, strand)
            else:
                geneint = HTSeq.GenomicPosition(chrome, end, strand)
            if intron in Branches[geneint] and len(Branches[geneint]) == 1:
                SI_counts[('lariat_int', intron)] += 1
            if intron in Branchto3ss[geneint] and len(Branchto3ss[geneint]) == 1:
                SI_counts[('branch_to3ss', intron)] += 1

    with open('%s/%s_splicing_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (intron, SI_counts[('mature', intron)], SI_counts[('premature', intron)]))
    with open('%s/%s_concordant_splicing_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (intron, SI_counts[('concordant_mature', intron)], SI_counts[('concordant_premature', intron)]))
    with open('%s/%s_lariat_int_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron, SI_counts[('lariat_int', intron)]))
    with open('%s/%s_branch_to3ss_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron, SI_counts[('branch_to3ss', intron)]))
    with open('%s/%s_junction_counts.txt' % (infile, infile), 'w') as out:
        out.write('Gene\tUpstream\tDownstream\tType\tCount\n')
        for junc in sorted(junction_counts):
            out.write('%s\t%d\t%d\t%s\t%d\n' % (junc[1], junc[2], junc[3], junc[4], junction_counts[junc]))
def count_circrna(args):
    """Count reads or fragments spanning the junction of each reference.

    Every reference sequence in the SAM/BAM header is taken to carry its
    junction at the midpoint (``LN // 2``).  A single-end read, or a
    paired-end fragment, is counted for its reference when it covers that
    position.  The per-reference counts are written as a headerless
    tab-separated table.

    :param args: namespace providing ``input_file`` (``.sam`` or ``.bam``),
        ``output_file``, ``min_mapping_quality``, ``strandness``
        (``'forward'``, ``'reverse'`` or anything else for unstranded) and
        ``paired_end``.
    :raises ValueError: if the input file has neither extension.
    """
    import HTSeq
    import numpy as np
    import pandas as pd
    from collections import OrderedDict, defaultdict
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    if args.input_file.endswith('.sam'):
        sam = HTSeq.SAM_Reader(args.input_file)
    elif args.input_file.endswith('.bam'):
        sam = HTSeq.BAM_Reader(args.input_file)
    else:
        raise ValueError('unsupported file extension')

    # extract junction positions from SAM header
    logger.info('extract junction positions')
    junction_positions = OrderedDict()
    for sq in sam.get_header_dict()['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2

    # initialize counts
    gene_ids = list(junction_positions.keys())
    counts = pd.Series(np.zeros(len(gene_ids), dtype='int'), index=gene_ids)

    # count reads
    min_mapping_quality = args.min_mapping_quality
    strandness = args.strandness
    if args.paired_end:
        logger.info('count paired-end fragments')
        stats = defaultdict(int)
        for bundle in HTSeq.pair_SAM_alignments(sam, bundle=True):
            stats['total_pairs'] += 1
            # ignore multi-mapped pairs
            if len(bundle) != 1:
                stats['multi_mapping'] += 1
                continue
            read1, read2 = bundle[0]
            # ignore singletons
            if (read1 is None) or (read2 is None):
                stats['singleton'] += 1
                continue
            # ignore unmapped reads
            if not (read1.aligned and read2.aligned):
                stats['unmapped'] += 1
                continue
            # ignore pairs with mapping quality below threshold
            if (read1.aQual < min_mapping_quality) or (read2.aQual < min_mapping_quality):
                stats['low_mapping_quality'] += 1
                continue
            if (strandness == 'forward') and (not ((read1.iv.strand == '+') and (read2.iv.strand == '-'))):
                stats['improper_strand'] += 1
                continue
            if (strandness == 'reverse') and (not ((read1.iv.strand == '-') and (read2.iv.strand == '+'))):
                stats['improper_strand'] += 1
                continue
            # ignore pairs on different chromosomes
            if read1.iv.chrom != read2.iv.chrom:
                stats['diff_chrom'] += 1
                continue
            pos = junction_positions[read1.iv.chrom]
            # Use the outer bounds of the pair: read1 is the downstream mate
            # in reverse-stranded libraries, so read1.start..read2.end would
            # miss nearly all such fragments.
            fragment_start = min(read1.iv.start, read2.iv.start)
            fragment_end = max(read1.iv.end, read2.iv.end)
            if fragment_start < pos <= fragment_end:
                counts[read1.iv.chrom] += 1
        for key, val in stats.items():
            logger.info('{}: {}'.format(key, val))
    else:
        logger.info('count single-end reads')
        for read in sam:
            # ignore unmapped read
            if not read.aligned:
                continue
            # ignore reads with mapping quality below threshold
            if read.aQual < min_mapping_quality:
                continue
            if (strandness == 'forward') and (read.iv.strand == '-'):
                continue
            # Reverse-stranded reads map to '-', as read1 does in the
            # paired-end branch, so '+' reads are the ones to drop.
            if (strandness == 'reverse') and (read.iv.strand == '+'):
                continue
            pos = junction_positions[read.iv.chrom]
            if read.iv.start < pos <= read.iv.end:
                counts[read.iv.chrom] += 1

    # output counts
    logger.info('count fragments: {}'.format(counts.sum()))
    logger.info('write counts to file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        counts.to_csv(fout, sep='\t', header=None, index=True, na_rep='NA')
def centipede_footprint(bed_file, bam_file, sites, sample_name, plots_dir,
                        fragmentsize=1, orientation=True, duplicates=True,
                        strand_specific=True):
    """
    Gets read coverage in motif intervals, passes it to
    centipede_call_footprints and returns the posterior probabilities.

    :param bed_file: Bed file with the motif intervals; the window width is
        taken from the first interval.
    :type bed_file: str
    :param bam_file: Path to a BAM file. Alignments are fetched by region, so
        the file must be sorted and indexed with a .bai file.
    :type bam_file: str
    :param sites: Not used by this function.
    :param sample_name: Sample name, used in the output PDF file name.
    :type sample_name: str
    :param plots_dir: Directory for the PDF written by
        centipede_call_footprints.
    :type plots_dir: str
    :param fragmentsize: Length every alignment is resized to before counting.
    :type fragmentsize: int
    :param orientation: If True, positions in intervals on the "-" strand
        are mirrored so that profiles are motif-oriented.
    :type orientation: bool
    :param duplicates: If False, alignments flagged as PCR or optical
        duplicates are skipped.
    :type duplicates: bool
    :param strand_specific: If True, "+" and "-" strand signal are stored
        side by side (matrix width is twice the window).
    :type strand_specific: bool
    :returns: Posterior probabilities, or zeros (one per interval) if the
        call failed or returned a result of the wrong length.
    """
    import pybedtools
    import os
    import HTSeq
    import numpy as np

    # read in bedfile
    motifs = pybedtools.BedTool(bed_file)
    # get motif name: take the basename first so that dots in the directory
    # part (e.g. "./motifs/x.bed") cannot truncate the name
    motif_name = os.path.basename(bed_file).split(".")[0]
    # get motif length (length of first interval)
    motif_length = motifs[0].length

    # convert intervals to HTSeq.GenomicInterval; materialised as a list
    # because len() and indexing are used below (map() is lazy on Python 3)
    intervals = list(map(bedtools_interval_to_genomic_interval, motifs))

    # Handle bam file
    bam = HTSeq.BAM_Reader(bam_file)

    # exclude bad chroms
    chroms_exclude = ['chrM', 'chrX', 'chrY']

    # get dimensions of matrix to store profiles of Tn5 transposition
    n = len(intervals)
    m = intervals[0].length

    # create empty matrix
    if not strand_specific:
        coverage = np.zeros((n, m), dtype=np.float64)
    else:
        # if "strand_specific", get signal for both strands independently,
        # but concatenated
        coverage = np.zeros((n, m * 2), dtype=np.float64)

    # Loop through intervals, get coverage, increment matrix count
    for i, feature in enumerate(intervals):
        # counter just to track
        if i % 1000 == 0:
            print(n - i)

        # Check if feature is not in bad chromosomes
        if feature.chrom in chroms_exclude:
            continue

        # Fetch alignments in interval
        for aln in bam[feature]:
            # check it's aligned
            if not aln.aligned:
                continue
            # check if duplicate
            if not duplicates and aln.pcr_or_optical_duplicate:
                continue

            aln.iv.length = fragmentsize  # adjust reads to specified size

            # get position relative to window if required (motif-oriented)
            if orientation and feature.strand not in ("+", "."):
                start_in_window = feature.length - abs(feature.start - aln.iv.end) - 1
                end_in_window = feature.length - abs(feature.start - aln.iv.start) - 1
            else:
                start_in_window = aln.iv.start - feature.start - 1
                end_in_window = aln.iv.end - feature.start - 1

            # check fragment is within window; this is because of
            # fragmentsize adjustment
            if start_in_window < 0 or end_in_window > feature.length:
                continue

            # add +1 to all positions overlapped by read within window
            if not strand_specific or aln.iv.strand == "+":
                coverage[i, start_in_window:end_in_window] += 1
            else:
                coverage[i, m + start_in_window:m + end_in_window] += 1

    # Call footprints, get posterior probabilities
    try:
        probs = centipede_call_footprints(
            coverage, np.ones([len(coverage), 1]), motif_length,
            os.path.join(plots_dir, sample_name + "." + motif_name + ".pdf"))
        if len(probs) != len(coverage):
            probs = np.zeros(len(coverage))
    except Exception:
        # Best effort: any failure of the call yields zeros, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        probs = np.zeros(len(coverage))
    return probs
#example 3 # sample_name = 'yuww165' # gene_sym = 'WASH7P' # sj_pos = 'chr1:17055-17605' #example 4 # sample_name = 'yuhimo' # # sample_name = 'yukadi' # gene_sym = 'LOC100132287' # sj_pos = 'chr1:27623647-27624428' #example 5 sample_name = 'gapi' # sample_name = 'yukadi' gene_sym = 'PSMC6' sj_pos = 'chr14:53185756-53190682' #retrieve the mapped reads to see which reads uniquely mapped path_bam = DIR_RNASEQ + "/tophat_sample_" + sample_name + "/accepted_hits.bam" bam_reader = HTSeq.BAM_Reader(path_bam) pysam_file = pysam.AlignmentFile(path_bam, 'rb') # hash_htseq = sj_read_support( bam_reader, sj_pos ) # print "see read counts: ", hash_htseq # sj_read_support_TEST( bam_reader, sj_pos ) sj_read_support_variety_reads(bam_reader, sj_pos) print "------------ TDD Completed: 170601_SJ_Metrics_V2.py ------------"
import sys import HTSeq import numpy import matplotlib as mpl mpl.use('pdf') from matplotlib import pyplot #bamfile = HTSeq.BAM_Reader( "Chr1.unique.bam" ) #sortedbamfile = HTSeq.BAM_Reader( "../input/Nucleosome.Chr1.unique.bam" ) #sortedbamfile = HTSeq.BAM_Reader( "../input/DHS.unique.bam" ) #gtffile = HTSeq.GFF_Reader( "../input/MSU7.gene.exon_number.gtf" ) sortedbamfile = HTSeq.BAM_Reader(sys.argv[1]) gtffile = HTSeq.GFF_Reader(sys.argv[2]) halfwinwidth = 2000 fragmentsize = 73 readlen = 36 #total = 60745783.00/1000000 ## nucleosome #total = 7480914/1000000 ## nucleosome chr1 #total = 23299296/1000000 #DHS unique #gsize = 372000000 #coverage = HTSeq.GenomicArray( "auto", stranded=False, typecode="i" ) #for almnt in bamfile: # if almnt.aligned: # #almnt.iv.length = fragmentsize # print almnt.iv # if not almnt.iv.start < 500: # coverage[ almnt.iv ] += 1 tsspos = set()
import sys import HTSeq import numpy import matplotlib as mpl mpl.use('pdf') from matplotlib import pyplot #bamfile = HTSeq.BAM_Reader( "Chr1.unique.bam" ) #bamfile = HTSeq.BAM_Reader( "../input/Nucleosome.unique.bam" ) bamfile = HTSeq.BAM_Reader("../input/DHS.Chr1.unique.bam") #gtffile = HTSeq.GFF_Reader( "../input/MSU7.gene.exon_number.HighExp.gtf" ) gtffile = HTSeq.GFF_Reader(sys.argv[1]) halfwinwidth = 2000 fragmentsize = 150 total = 60745783.00 / 1000000 ## nucleosome gsize = 372000000 coverage = HTSeq.GenomicArray("auto", stranded=False, typecode="i") for almnt in bamfile: if almnt.aligned: #almnt.iv.length = fragmentsize #print almnt.iv if not almnt.iv.start < 500: coverage[almnt.iv] += 1 tsspos = set() for feature in gtffile: if feature.type == "exon" and feature.attr["exon_number"] == "-1": #print feature.iv.start_d_as_pos.pos if feature.iv.start_d_as_pos.pos > 5000:
def count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        cb_tag,
        ub_tag,
        ):
    """Count reads per feature, grouped by cell barcode and collapsed by UMI.

    Alignments (or alignment pairs) are read from ``sam_filename`` (``"-"``
    selects standard input), assigned to the feature sets stored in
    ``features`` according to ``overlap_mode`` and tallied per
    (cell barcode, UMI, feature).  Afterwards each UMI contributes one count
    to its most frequent category; UMIs whose two top categories tie are
    dropped.  Special categories (``__not_aligned``, ``__no_feature``, ...)
    are tallied like features.

    Whether the input is treated as paired-end is decided from the first
    record.  ``order`` (``"name"`` or ``"pos"``) and ``max_buffer_size``
    select how mates are paired.  If ``samout_filename`` is given, every
    processed record is also written there with an ``XF`` tag naming its
    assignment, as SAM text or as BAM depending on ``samout_format``.

    ``feature_attr``, ``feature_type``, ``id_attribute`` and
    ``additional_attributes`` are accepted but not used in this function.

    Returns a dict with ``'cell_barcodes'`` (sorted list of barcodes seen)
    and ``'counts'`` (barcode -> Counter of category -> number of UMIs).
    """

    def write_to_samout(r, assignment, samoutfile, template=None):
        """Tag the read(s) in ``r`` with XF=assignment and write them out."""
        if samoutfile is None:
            return
        # pe_mode comes from the enclosing function; single reads are wrapped
        # so that both modes share the loop below.
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    def identify_barcodes(r):
        '''Identify barcode from the read or pair (both must have the same)'''
        if not pe_mode:
            r = (r,)
        # cell, UMI
        barcodes = [None, None]
        nbar = 0
        for read in r:
            if read is not None:
                for tag, val in read.optional_fields:
                    if tag == cb_tag:
                        barcodes[0] = val
                        nbar += 1
                        # Stop scanning once two barcode tags have been seen.
                        if nbar == 2:
                            return barcodes
                    elif tag == ub_tag:
                        barcodes[1] = val
                        nbar += 1
                        if nbar == 2:
                            return barcodes
        # A barcode that was not found stays None.
        return barcodes

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                samout_filename, 'wb',
                template=template,
                )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs
        except:
            first_read = None
            pe_mode = False
        # The first record was consumed to detect the mode; chain it back in
        # front of the remaining records.
        if first_read is not None:
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    try:
        if pe_mode:
            # Only when both secondary and supplementary alignments are
            # ignored is the pairing restricted to primary alignments.
            if ((supplementary_alignment_mode == 'ignore') and
                    (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                    read_seq,
                    primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        # The nesting is cell barcode, UMI, feature
        counts = defaultdict(lambda: defaultdict(Counter))
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment record%s processed.\n" %
                    (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1

            cb, ub = identify_barcodes(r)

            if not pe_mode:
                if not r.aligned:
                    counts[cb][ub]['__not_aligned'] += 1
                    write_to_samout(
                        r, "__not_aligned", samoutfile,
                        template)
                    continue
                if ((secondary_alignment_mode == 'ignore') and
                        r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore') and
                        r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        write_to_samout(
                            r,
                            "__alignment_not_unique",
                            samoutfile,
                            template)
                        # In other multimapped modes the read is tallied as
                        # non-unique and still assigned to features below.
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: the check is skipped.
                    pass
                if r.aQual < minaqual:
                    counts[cb][ub]['__too_low_aQual'] += 1
                    write_to_samout(
                        r, "__too_low_aQual", samoutfile,
                        template)
                    continue
                # For stranded == "reverse" the matched blocks are
                # strand-inverted before the feature lookup.
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type in com
                              and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv)
                              for co in r.cigar if (co.type in com and
                                                    co.size > 0))
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate gets the opposite strand treatment of
                    # the first mate.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar
                             if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar
                             if co.type in com and co.size > 0))
                else:
                    # The pair counts as not aligned only if neither mate
                    # is aligned.
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(
                            r, "__not_aligned", samoutfile,
                            template)
                        counts[cb][ub]['__not_aligned'] += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                            (r[1] is not None and r[1].optional_field("NH") > 1)):
                        write_to_samout(
                            r, "__alignment_not_unique", samoutfile,
                            template)
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                        (r[1] and r[1].aQual < minaqual)):
                    write_to_samout(
                        r, "__too_low_aQual", samoutfile,
                        template)
                    counts[cb][ub]['__too_low_aQual'] += 1
                    continue

            try:
                if overlap_mode == "union":
                    # Union of all feature sets touched by any matched block.
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    # Intersection over the steps; "nonempty" leaves out
                    # steps that carry no feature.
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(
                        r, "__no_feature", samoutfile,
                        template)
                    counts[cb][ub]['__no_feature'] += 1
                elif len(fs) > 1:
                    write_to_samout(
                        r, "__ambiguous[" + '+'.join(fs) + "]",
                        samoutfile,
                        template)
                    counts[cb][ub]['__ambiguous'] += 1
                else:
                    write_to_samout(
                        r, list(fs)[0], samoutfile,
                        template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        # Only unambiguous assignments add to a feature.
                        if len(fs) == 1:
                            counts[cb][ub][list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        # Every overlapped feature receives the read.
                        for fsi in list(fs):
                            counts[cb][ub][fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                # Reference sequence absent from the feature array.
                write_to_samout(
                    r, "__no_feature", samoutfile,
                    template)
                counts[cb][ub]['__no_feature'] += 1

    except:
        sys.stderr.write(
            "Error occured when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    if samoutfile is not None:
        samoutfile.close()

    # Get rid of UMI by majority rule
    cbs = sorted(counts.keys())
    counts_noumi = {}
    for cb in cbs:
        counts_cell = Counter()
        # pop() removes each cell's per-UMI table from `counts` as it is
        # collapsed.
        for ub, udic in counts.pop(cb).items():
            # In case of a tie, do not increment either feature
            top = udic.most_common(2)
            if (len(top) == 2) and (top[0][1] == top[1][1]):
                continue
            counts_cell[top[0][0]] += 1
        counts_noumi[cb] = counts_cell

    return {
        'cell_barcodes': cbs,
        'counts': counts_noumi,
        }
def main():
    """Filter the alignments of a BAM/SAM input and write per-dataset reports.

    Command-line arguments are read from ``sys.argv``:

    * ``input_file``: path of a BAM file, or ``-`` to read SAM from stdin
      (``--dataset`` is then mandatory, because no file name is available
      to derive the dataset name from).
    * ``-id/--min_id``, ``-len/--min_len``, ``-clip/--max_clip``: thresholds
      forwarded unchanged to ``bam_parser_2``.
    * ``--out_dir``: output folder, created when it does not exist.
    * ``--mode``: ``paired`` or ``single``, forwarded to ``bam_parser_2``.
    * ``--dataset``: dataset name; only consulted when reading from stdin,
      otherwise the name is the input file's base name up to the first dot.

    Files written into the output folder, all prefixed with the dataset name:

    * ``.fasta`` and ``.unique_counts.tsv``: the reads kept by
      ``dupe_remover`` as uniquely assigned.
    * ``.counts``: one ``REF<TAB>count`` line per reference, followed by the
      ambiguous-read summary when there is one.
    * ``.amb.fasta`` and ``.amb.tsv``: only when ``dupe_remover`` reports
      ambiguous reads.

    Exits through ``sys.exit`` when the arguments are invalid or when there
    are no alignments to process.
    """
    # Local import: the original code imported ``os`` inside the branches
    # that needed it, so a module-level import cannot be relied upon.
    import os

    arg_parser = argparse.ArgumentParser(description='Processes a BAM file into TSV.')
    arg_parser.add_argument("input_file", type=str,
                            help='<input file>, can be a stream indicating "-"')
    arg_parser.add_argument("-id", "--min_id", type=float, default=95.0,
                            help='Minimal %% of identity to reference sequence to gather the read. (Default = 95.0)')
    arg_parser.add_argument("-len", "--min_len", type=int, default=60,
                            help='Minimal lenght of the read to be proccessed. (Default = 60)')
    arg_parser.add_argument("-clip", "--max_clip", type=float, default=0.3,
                            help='Max clipping allowed on the alignment. (Default = 0.30)')
    arg_parser.add_argument("--out_dir", type=str, default='./',
                            help='Folder where to store the output files.')
    arg_parser.add_argument("--mode", type=str, default='paired',
                            help='Alignment type of the input files. (paired or single)')
    arg_parser.add_argument("--dataset", type=str, help='Custom dataset name.')
    args = arg_parser.parse_args()

    # Validate everything that is cheap to validate before opening the input
    # or touching the file system.
    if not args.input_file:
        print("No input file given. exiting...")
        sys.exit(1)
    if args.mode not in ('paired', 'single'):
        sys.exit("No valid aligment type.")
    mode = args.mode

    if args.input_file == '-':
        if not args.dataset:
            sys.exit("If using a stream you need to provide a name for the dataset.")
        dataset_id = args.dataset
        bam_file = HTSeq.SAM_Reader(sys.stdin)
    else:
        bam_file = HTSeq.BAM_Reader(args.input_file)
        dataset_id = os.path.basename(args.input_file).split('.')[0]

    # argparse has already applied the declared types and defaults. Taking
    # the values as they are keeps a legitimate 0 (e.g. ``--max_clip 0``)
    # instead of leaving the variable unbound.
    min_id = args.min_id
    min_len = args.min_len
    max_clip = args.max_clip

    # An empty --out_dir falls back to the current directory.
    out_dir = args.out_dir or './'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # DataFrame containing the raw alignments.
    df = bam_parser_2(bam_file, min_len=min_len, max_clip=max_clip,
                      min_id=min_id, mode=mode)

    try:
        # A base name starting with a dot yields an empty dataset name; fall
        # back to the prefix of the first query name in that case.
        if len(df) > 0 and dataset_id == '':
            dataset_id = df['QUERY'].iloc[0].split('.')[0]
    except Exception:
        # Deliberately broad, as in the original: any failure to inspect the
        # parser result is reported as "no alignments" for file input and
        # re-raised untouched for stream input.
        if args.input_file != '-':
            sys.exit('Error: No alignments in input file: ' + args.input_file)
        raise

    # Keep the best-scoring row of every alignment, then group the rows by
    # the read they come from.
    df2 = df.sort_values(by=['ALN', 'SCORE'],
                         ascending=[True, False]).drop_duplicates('ALN')
    df2['MASTER_QUERY'] = df2['QUERY'].apply(get_read_name)
    aligned_aln_list, amb_list = dupe_remover(df2.groupby('MASTER_QUERY'))

    if len(aligned_aln_list) == 0:
        sys.exit("Error: No relevant alignments to process in " + args.input_file)
    unique_df = pd.concat(aligned_aln_list)

    # If there are ambiguous reads, write their FASTA and TSV files and
    # prepare the summary appended to the counts file.
    amb_summary = None
    if len(amb_list) > 0:
        amb_df = pd.concat(amb_list).groupby('MASTER_QUERY').apply(amb_cluster)
        amb_df = amb_df.reset_index(level=0, drop=True)
        amb_df.columns = ['ALN', 'QUERY', 'REF', 'SEQ', 'LEN', 'ID', 'SCORE',
                          'CLIP_PCT', 'MASTER_QUERY', 'AMB_STR']

        # Number of distinct ambiguous reads, then number of rows per
        # reference (groupby returns the references in sorted order).
        amb_read_count = len(amb_df.drop_duplicates('MASTER_QUERY'))
        summary_lines = ['ambiguous\t' + str(amb_read_count) + '\n']
        for ref, ref_count in amb_df.groupby('REF').size().items():
            summary_lines.append(ref + '-amb\t' + str(ref_count) + '\n')
        amb_summary = ''.join(summary_lines)

        with open(os.path.join(out_dir, dataset_id + '.amb.fasta'), 'w') as fh_amb:
            fh_amb.writelines(amb_df.apply(df_2_fasta, axis=1))

        # rename() on the selection returns a new frame, which avoids
        # modifying a slice of amb_df in place.
        amb_out = amb_df[['MASTER_QUERY', 'REF', 'SCORE', 'ID', 'AMB_STR']].rename(
            columns={'MASTER_QUERY': 'QUERY'})
        amb_out.to_csv(os.path.join(out_dir, dataset_id + '.amb.tsv'),
                       sep='\t', header=False, index=False)

    # FASTA file of the uniquely assigned reads.
    with open(os.path.join(out_dir, dataset_id + '.fasta'), 'w') as fh_aligned:
        fh_aligned.writelines(unique_df.apply(df_2_fasta, axis=1))

    # TSV file of the uniquely assigned reads.
    unique_df = unique_df[['QUERY', 'REF', 'SCORE', 'ID']]
    unique_df.to_csv(os.path.join(out_dir, dataset_id + '.unique_counts.tsv'),
                     sep='\t', header=False, index=False)

    # Counts file: rows per reference in sorted reference order, then the
    # ambiguous summary if any.
    with open(os.path.join(out_dir, dataset_id + '.counts'), 'w') as fh_aligned_counts:
        for ref, ref_count in unique_df.groupby('REF').size().items():
            fh_aligned_counts.write(ref + '\t' + str(ref_count) + '\n')
        if amb_summary:
            fh_aligned_counts.write(amb_summary)