# Imports needed by the main() functions below. os, subprocess, tempfile,
# and optparse are standard library; binom comes from scipy.stats. bedtools
# and stats are assumed to be local helper modules in this codebase, as are
# count_te_fragments, estimate_read_length, count_bed, count_genome,
# count_hg19, te_target_size, and te_target_size_bed.
import os
import subprocess
import tempfile
from optparse import OptionParser

from scipy.stats import binom

import bedtools  # local helper module (assumed)
import stats     # local helper module (assumed)


def main():
    usage = 'usage: %prog [options] <bam_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    #parser.add_option('-f', dest='family', help='Limit to this family')
    #parser.add_option('-r', dest='repeat', help='Limit to this repeat')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file.')
    else:
        bam_file = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.te_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.te_gff = te_gff_file

        # filter BAM
        bam_gff_fd, bam_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        bedtools.abam_f1(bam_file, filter_merged_bed_file, bam_gff_file)
        bam_file = bam_gff_file

        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)

    ############################################
    # count TE fragments
    ############################################
    fragments, te_fragments = count_te_fragments(bam_file, options.te_gff, options.strand_split)

    ############################################
    # print table
    ############################################
    for line in open(options.te_gff):
        a = line.split('\t')
        te_chrom = a[0]
        te_start = int(a[3])

        if options.strand_split:
            te_count = te_fragments.get((te_chrom, te_start, '+'), 0)
            te_pct = te_count / float(fragments)
            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s %9d + %6d %9.2e' % cols

            te_count = te_fragments.get((te_chrom, te_start, '-'), 0)
            te_pct = te_count / float(fragments)
            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s %9d - %6d %9.2e' % cols
        else:
            te_count = te_fragments.get((te_chrom, te_start), 0)
            te_pct = te_count / float(fragments)
            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s %9d %6d %9.2e' % cols

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(bam_gff_fd)
        os.remove(bam_gff_file)
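
# A minimal illustration (not part of the original script) of the GFF keying
# convention the table loop above depends on: each TE is identified by its
# chromosome (GFF column 1) and 1-based start coordinate (GFF column 4).
# The sample line and function name here are hypothetical.

def _demo_te_key(gff_line):
    a = gff_line.split('\t')
    return a[0], int(a[3])

# _demo_te_key('chr1\tRepeatMasker\tAluY\t10000\t10300\t.\t+\t.\t.')
# returns ('chr1', 10000), matching the (te_chrom, te_start) keys used above.
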
def main():
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM files to parameterize the null distribution [Default: %default]')
    parser.add_option('-f', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-g', dest='genome', default='HG19', help='Genome directory to obtain lengths from [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM files.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_genome(options.genome)

    # hash counted repeat genomic bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    te_fragment_rates = {}
    for (rep, fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep, fam) in te_fragment_rates:
        # compute TE length
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])
        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])
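
# A minimal sketch (not from the original source) of the replicate p-value
# combination used above. scipy.stats.binom.sf(k-1, n, p) gives P(X >= k),
# the enrichment tail, and binom.cdf(k, n, p) gives P(X <= k), the depletion
# tail; the per-replicate tail probabilities are multiplied, as in the loop
# above. The function name and example numbers are illustrative only.

def _demo_combined_pval(te_counts, totals, null_rate, enriched):
    p_val = 1.0
    for k, n in zip(te_counts, totals):
        if enriched:
            # P(X >= k) under Binomial(n, null_rate)
            p_val *= binom.sf(int(k)-1, int(n), null_rate)
        else:
            # P(X <= k) under Binomial(n, null_rate)
            p_val *= binom.cdf(int(k), int(n), null_rate)
    return p_val

# e.g. _demo_combined_pval([520, 480], [1000000, 900000], 5e-4, True)
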
def main():
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM files to parameterize the null distribution [Default: %default]')
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM files.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    te_fragment_rates = {}
    for (rep, fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep, fam) in te_fragment_rates:
        # compute TE length
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])
        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])
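
# A minimal sketch (not from the original source) of the length-based null
# model above: without control libraries, a TE family's expected per-fragment
# rate is its mappable length over the (possibly filtered) genome length,
# halved per strand when splitting by strand. The function name and example
# numbers are illustrative only.

def _demo_length_null_rate(te_len, genome_length, strand_split=False):
    denom = 2*genome_length if strand_split else genome_length
    return float(te_len) / denom

# e.g. an 800 kb family in a 3.1 Gb genome:
# _demo_length_null_rate(800000, 3100000000) -> ~2.6e-4
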