def main():
    usage = 'usage: %prog [options] <bam_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g',
        dest='filter_gff',
        help=
        'Filter the TEs by overlap with genes in the given gff file [Default: %default]'
    )
    parser.add_option('-s',
                      dest='strand_split',
                      default=False,
                      action='store_true',
                      help='Split statistics by strand [Default: %default]')
    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])

    #parser.add_option('-f', dest='family', help='Limit to this family')
    #parser.add_option('-r', dest='repeat', help='Limit to this repeat')

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file.')
    else:
        bam_file = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' %
                        (options.filter_gff, filter_merged_bed_file),
                        shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(
            dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' %
                        (options.te_gff, filter_merged_bed_file, te_gff_file),
                        shell=True)
        options.te_gff = te_gff_file

        # filter BAM
        bam_gff_fd, bam_gff_file = tempfile.mkstemp(
            dir='%s/research/scratch/temp' % os.environ['HOME'])
        bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_file)
        bam_file = bam_gff_file

        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)

    ############################################
    # count TE fragments
    ############################################
    fragments, te_fragments = count_te_fragments(bam_file, options.te_gff,
                                                 options.strand_split)

    ############################################
    # print table
    ############################################
    for line in open(options.te_gff):
        a = line.split('\t')

        te_chrom = a[0]
        te_start = int(a[3])

        if options.strand_split:
            te_count = te_fragments.get((te_chrom, te_start, '+'), 0)
            te_pct = te_count / float(fragments)

            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s  %9d  +  %6d  %9.2e' % cols

            te_count = te_fragments.get((te_chrom, te_start, '-'), 0)
            te_pct = te_count / float(fragments)

            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s  %9d  -  %6d  %9.2e' % cols

        else:
            te_count = te_fragments.get((te_chrom, te_start), 0)
            te_pct = te_count / float(fragments)

            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s  %9d  %6d  %9.2e' % cols

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        os.close(bam_gff_fd)
        os.remove(bam_gff_file)
# Beispiel #2
# 0
def main():
    """Test TE families for fragment enrichment/depletion across replicates.

    For each (repeat, family) key in the TE annotation, combines the
    per-replicate fragment rates (geometric mean), parameterizes a binomial
    null model from either control BAMs (-c) or the TE's genomic footprint,
    and prints one row per key: repeat, family, TE length, observed count,
    null count, fold change, p-value.
    """
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-f', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-g', dest='genome', default='HG19', help='Genome directory to obtain lengths from [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    # replicate BAMs arrive as a single comma-separated argument
    if len(args) != 1:
        parser.error('Must provide BAM files.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        # merge the filter GFF into non-overlapping BED intervals
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        # parallel lists: fds/files are indexed like bam_files
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_genome(options.genome)

    # hash counted repeat genomic bp
    # te_lengths: dict keyed by (repeat, family) -> mappable TE bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    # per-replicate totals and per-TE counts, indexed like bam_files
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    # with strand_split, the strand char is appended to the repeat name so
    # keys stay (rep, fam) shaped; .get(..., 1) is a pseudocount so the
    # geometric mean never sees a zero
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        # (strip the trailing strand char to index te_lengths)
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        # either empirical control rate, or TE footprint / search space
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        # product of per-replicate binomial tail probabilities:
        # upper tail (sf) for enrichment, lower tail (cdf) for depletion
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    # remove every temp file created by the -f filtering step
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])

        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])
# Beispiel #3
# 0
def main():
    usage = 'usage: %prog [options] <bam_file>'
    parser = OptionParser(usage)    
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])

    #parser.add_option('-f', dest='family', help='Limit to this family')
    #parser.add_option('-r', dest='repeat', help='Limit to this repeat')

    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file.')
    else:
        bam_file = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.te_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.te_gff = te_gff_file

        # filter BAM
        bam_gff_fd, bam_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_file)
        bam_file = bam_gff_file

        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)

    ############################################
    # count TE fragments
    ############################################
    fragments, te_fragments = count_te_fragments(bam_file, options.te_gff, options.strand_split)

    ############################################
    # print table
    ############################################
    for line in open(options.te_gff):
        a = line.split('\t')

        te_chrom = a[0]
        te_start = int(a[3])

        if options.strand_split:
            te_count = te_fragments.get((te_chrom, te_start, '+'), 0)
            te_pct = te_count / float(fragments)

            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s  %9d  +  %6d  %9.2e' % cols

            te_count = te_fragments.get((te_chrom, te_start, '-'), 0)
            te_pct = te_count / float(fragments)

            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s  %9d  -  %6d  %9.2e' % cols

        else:
            te_count = te_fragments.get((te_chrom, te_start), 0)
            te_pct = te_count / float(fragments)

            cols = (te_chrom, te_start, te_count, te_pct)
            print '%-5s  %9d  %6d  %9.2e' % cols

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        os.close(bam_gff_fd)
        os.remove(bam_gff_file)
# Beispiel #4
# 0
def main():
    """Test TE families for fragment enrichment/depletion across replicates.

    For each (repeat, family) key in the TE annotation, combines the
    per-replicate fragment rates (geometric mean), parameterizes a binomial
    null model from either control BAMs (-c) or the TE's genomic footprint
    in hg19, and prints one row per key: repeat, family, TE length,
    observed count, null count, fold change, p-value.
    """
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    # replicate BAMs arrive as a single comma-separated argument
    if len(args) != 1:
        parser.error('Must provide a BAM file.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        # merge the filter GFF into non-overlapping BED intervals
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        # parallel lists: fds/files are indexed like bam_files
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    # te_lengths: dict keyed by (repeat, family) -> mappable TE bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    # per-replicate totals and per-TE counts, indexed like bam_files
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:        
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    # with strand_split, the strand char is appended to the repeat name so
    # keys stay (rep, fam) shaped; .get(..., 1) is a pseudocount so the
    # geometric mean never sees a zero
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        # (strip the trailing strand char to index te_lengths)
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        # either empirical control rate, or TE footprint / search space
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        # product of per-replicate binomial tail probabilities:
        # upper tail (sf) for enrichment, lower tail (cdf) for depletion
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:            
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    # remove every temp file created by the -g filtering step
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])

        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])