Ejemplo n.º 1
0
def merge_bam(bam1, bam2, bam):
    """Merge two BAM files into ``bam`` with ``samtools merge``.

    The merge command is only issued when ``bam`` is not already a file.

    :param bam1: path of the first input BAM.
    :param bam2: path of the second input BAM.
    :param bam: path of the merged output BAM.
    :return: ``bam``, unchanged.
    """
    if not os.path.isfile(bam):
        command = f'samtools merge {bam} {bam1} {bam2}'
        cmder.run(command, msg=f'Merging {bam1} and {bam2} ...')
        return bam

    # Output is already on disk: report it and leave the file untouched.
    logger.info(f'BAM file {bam} already exist.')
    return bam
Ejemplo n.º 2
0
def split_bam(bam, bam1, bam2):
    """Shuffle the mapped records of ``bam`` and split them into two BAMs.

    Nothing is done when both ``bam1`` and ``bam2`` already exist. Otherwise
    the records are shuffled with ``shuf`` and cut into two chunks with
    ``split`` (written as ``{bam}00`` and ``{bam}01``). Each chunk gets the
    header of ``bam`` prepended, is converted back to BAM and sorted with
    ``samtools sort`` (``options.cpus`` is passed to its ``-@`` option).
    The chunk files and the unsorted temporary BAMs are removed at the end.

    :param bam: path of the BAM file to split.
    :param bam1: path of the first output BAM.
    :param bam2: path of the second output BAM.
    :return: the tuple ``(bam1, bam2)``.
    """
    def count_mapped_reads(bam):
        # -c counts records, -F 0x4 excludes records flagged as unmapped.
        count = int(
            cmder.run(f'samtools view -c -F 0x4 {bam}', msg='').stdout.read())
        logger.info(f'Found {count:,} mapped reads in {bam}.')
        return count

    if os.path.isfile(bam1) and os.path.isfile(bam2):
        logger.info(f'BAMs {bam1} and {bam2} already exist.')
        return bam1, bam2

    half_lines = count_mapped_reads(bam) // 2 + 1
    # The chunk size is derived from the number of mapped records, so only
    # mapped records (-F 0x4) are streamed into split. Streaming every record
    # can overflow into a third chunk ({bam}02) when the BAM holds unmapped
    # reads, and that chunk would be neither used nor removed.
    cmd = f'samtools view -F 0x4 {bam} | shuf | split -d -l {half_lines} - {bam}'
    cmder.run(cmd, msg=f'Shuffling and splitting {bam} ...')

    tmp_bam1 = bam1.replace('.bam', '.tmp.bam')
    tmp_bam2 = bam2.replace('.bam', '.tmp.bam')
    chunks = ((f'{bam}00', tmp_bam1, bam1), (f'{bam}01', tmp_bam2, bam2))
    for chunk, tmp_bam, out_bam in chunks:
        # The chunks are header-less SAM text: put the original header back
        # in front of them before converting to BAM.
        cmd = f'samtools view -H {bam} | cat - {chunk} | samtools view -bS - > {tmp_bam}'
        cmder.run(cmd, msg=f'Creating headers for {out_bam} ...')
        cmder.run(f'samtools sort -@ {options.cpus} -o {out_bam} {tmp_bam}')
    cmder.run(f'rm {bam}00 {bam}01 {tmp_bam1} {tmp_bam2}')
    return bam1, bam2
Ejemplo n.º 3
0
def merge_bam(bams, bam):
    """Merge several BAM files into ``bam`` with ``samtools merge``.

    The merge command is only issued when ``bam`` is not already a file.

    :param bams: paths of the input BAM files (joined with spaces on the
        command line).
    :param bam: path of the merged output BAM.
    :return: ``bam``, unchanged.
    """
    if os.path.isfile(bam):
        logger.info(f'BAM file {bam} already exist.')
        return bam

    inputs = " ".join(bams)
    cmder.run(f'samtools merge {bam} {inputs}',
              msg=f'Merging {inputs} to {bam} ...')
    return bam
Ejemplo n.º 4
0
def make_bigwig_files(bam, bigwig):
    """Create plus- and minus-strand BigWig coverage files for ``bam``.

    Coverage is scaled by ``1000000.0 / mapped reads``; the minus strand uses
    the negated scale. The minus-strand path is derived from ``bigwig`` by
    replacing ``.plus.bw`` with ``.minus.bw``. When the BAM reports zero
    mapped reads, only an empty file is written to ``bigwig``.

    :param bam: path of the input BAM file.
    :param bigwig: path of the plus-strand BigWig output.
    :return: ``bigwig``, unchanged.
    """
    def bam_to_bigwig(bam, scale, strand, bw):
        # bedGraph intermediate sits next to the BigWig and is removed after
        # the conversion.
        bedgraph = bw.replace('.bw', '.bg')
        coverage = (
            f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split '
            f'| sort -k1,1 -k2,2n > {bedgraph}'
        )
        cmder.run(
            coverage,
            msg=f'Calculating genome coverage for {bam} ({strand} strand) ...')
        convert = f'bedGraphToBigWig {bedgraph} {options.genome}/chrNameLength.txt {bw}'
        cmder.run(convert, msg=f'Converting {bedgraph} to {bw} ...')
        cmder.run(f'rm {bedgraph}')

    logger.info(f'Make BigWig files for {bam} ...')
    with pysam.AlignmentFile(bam, 'rb') as alignments:
        mapped = alignments.mapped

    if mapped == 0:
        # No mapped reads means no usable scale factor: leave an empty
        # placeholder at the plus-strand path and stop.
        logger.warning(
            f'No reads was found in BAM {bam}, empty BigWig file was created.')
        with open(bigwig, 'w') as placeholder:
            placeholder.write('')
        return bigwig

    scale = 1000000.0 / mapped
    minus_bw = bigwig.replace('.plus.bw', '.minus.bw')
    bam_to_bigwig(bam, scale, '+', bigwig)
    bam_to_bigwig(bam, -1 * scale, '-', minus_bw)
    logger.info(f'Make BigWig files for {bam} complete.')
    return bigwig
Ejemplo n.º 5
0
def collapse_barcode(bam, out):
    """Deduplicate paired reads in ``bam`` by collapsing identical barcodes.

    The BAM is read as consecutive record pairs (records 0/1, 2/3, ...), and
    both records of a pair must carry the same query name. Pairs with an
    unmapped mate or with mates on different references are skipped. Pairs
    are keyed on ``(reference, start, stop, strand, randomer)``, where the
    randomer is the part of the query name before the first ``:``; only the
    first pair seen for each key is written to ``out``.

    pysam verbosity is set to 0 while the function runs and the previous
    value is restored afterwards, even when an exception is raised.

    :param bam: path of the input BAM file.
    :param out: path of the deduplicated output BAM file.
    :raises ValueError: if two consecutive records have different names.
    """
    logger.info(f'Deduplicating {bam} {size(bam)} by collapsing barcodes ...')
    verbosity = pysam.set_verbosity(0)
    try:
        # Two handles on the same file: one yields the even-indexed records,
        # the other the odd-indexed ones, so zip() produces mate pairs.
        with pysam.AlignmentFile(bam, 'rb') as b1, \
                pysam.AlignmentFile(bam, 'rb') as b2:
            results = {}
            for read1, read2 in zip(itertools.islice(b1, 0, None, 2),
                                    itertools.islice(b2, 1, None, 2)):
                if read1.query_name != read2.query_name:
                    raise ValueError(
                        f'Read names do not match: {read1.query_name} != {read2.query_name}.'
                    )
                if (read1.is_unmapped or read2.is_unmapped
                        or read1.reference_name != read2.reference_name):
                    continue
                # Make sure read1 really is the first mate before building
                # the key.
                if not read1.is_read1:
                    read1, read2 = read2, read1
                randomer = read1.query_name.split(':')[0]
                start = read1.positions[-1] if read1.is_reverse else read1.pos
                stop = read2.positions[-1] if read2.is_reverse else read2.pos
                strand = '-' if read1.is_reverse else '+'
                location = (read1.reference_name, start, stop, strand,
                            randomer)
                # Keep only the first pair observed for each key.
                results.setdefault(location, (read1, read2))
            with pysam.AlignmentFile(out, 'wb', template=b1) as o:
                for read1, read2 in results.values():
                    o.write(read1)
                    o.write(read2)
            logger.info(
                f'Deduplicating {bam} {size(bam)} by collapsing barcodes complete.'
            )
    finally:
        # Restore the caller's verbosity on every exit path; previously an
        # exception left it at 0.
        pysam.set_verbosity(verbosity)
Ejemplo n.º 6
0
def clipper_peaks(bam, bed=''):
    """Call peaks on ``bam`` with ``clipper`` unless the BED file exists.

    :param bam: path of the input BAM file.
    :param bed: path of the output BED file; when empty it is derived from
        ``bam`` by replacing ``.ip.bam`` with ``.peak.clusters.bed``.
    :return: path of the BED file.
    """
    bed = bed or bam.replace('.ip.bam', '.peak.clusters.bed')
    if not os.path.isfile(bed):
        command = (f'clipper --species {options.species} '
                   f'--processors {options.cpus} --bam {bam} --outfile {bed}')
        cmder.run(command,
                  msg=f'Calling peaks from {bam} using clipper ...',
                  pmt=True)
    else:
        logger.info(f'Clipper bed {bed} already exists.')
    return bed
Ejemplo n.º 7
0
def motif_analysis(bed, output):
    """Run the ``motif`` command on ``bed`` and compile its HTML report.

    The basename handed to ``motif`` and to ``compile_motif_html`` is the
    part of ``output`` that precedes the first ``.motifs.``.

    :param bed: path of the BED file to search for motifs.
    :param output: path of the compiled motif report.
    """
    basename = output.partition('.motifs.')[0]

    # Positional arguments of the motif command, in the order it is invoked.
    arguments = ['motif', bed, options.species, options.outdir, basename]
    arguments += [options.l10p, options.l2fc, options.cpus]
    cmder.run(arguments, msg=f'Finding motifs in {bed} ...')

    progress = f'Parsing and compiling motifs for {basename}'
    logger.info(f'{progress} ...')
    compile_motif_html(basename, output)
    logger.info(f'{progress} complete.')
0
def split_bam(bam, basename, n):
    """Shuffle the mapped records of ``bam`` and split them into ``n`` BAMs.

    The outputs are named ``{basename}{i}.bam`` for ``i`` in ``range(n)``.
    Nothing is done when all of them already exist. Otherwise the records
    are shuffled with ``shuf`` and cut into chunks with ``split``; each chunk
    gets the header of ``bam`` prepended, is converted back to BAM and
    sorted in place with ``samtools sort`` (``options.cpus`` is passed to its
    ``-@`` option).

    ``split`` is invoked with ``-a 1 -d`` (one-digit numeric suffixes), so
    the chunk names only line up with the expected outputs for ``n <= 10``.

    :param bam: path of the BAM file to split.
    :param basename: prefix of the output BAM paths.
    :param n: number of output BAM files.
    :return: list of the output BAM paths.
    """
    def count_mapped_reads(bam):
        # -c counts records, -F 0x4 excludes records flagged as unmapped.
        count = int(
            cmder.run(f'samtools view -c -F 0x4 {bam}', msg='').stdout.read())
        logger.info(f'Found {count:,} mapped reads in {bam}.')
        return count

    bams = [f'{basename}{i}.bam' for i in range(n)]
    if all(os.path.isfile(b) for b in bams):
        logger.info('Split bams already exist.')
        return bams

    lines = count_mapped_reads(bam) // n + 1
    # The chunk size is derived from the number of mapped records, so only
    # mapped records (-F 0x4) are streamed into split. Streaming every record
    # can produce more than n chunks when the BAM holds unmapped reads; the
    # surplus chunk would be ignored, or split would run out of suffixes.
    cmd = f'samtools view -F 0x4 {bam} | shuf | split - -a 1 --additional-suffix=.bam -d -l {lines} {basename}'
    cmder.run(cmd, msg=f'Shuffling and splitting {bam} ...')
    for b in bams:
        # At this point b is header-less SAM text written by split; rebuild a
        # real BAM in a temporary file, then sort it back onto b.
        tmp = b.replace(".bam", ".tmp.bam")
        cmder.run(
            f'samtools view -H {bam} | cat - {b} | samtools view -bS - > {tmp}'
        )
        cmder.run(f'samtools sort -@ {options.cpus} -o {b} {tmp}')
        cmder.run(f'rm {tmp}')
    return bams
Ejemplo n.º 9
0
def make_bigwig_files(bam, bigwig):
    """Create plus- and minus-strand BigWig coverage files for ``bam``.

    The scale factor is ``1000000.0`` divided by the mapped-read count,
    where the count is halved when the module-level ``TYPE`` is ``'paired'``.
    When ``TYPE`` is ``'single'`` the ``+`` strand goes to ``bigwig`` and the
    ``-`` strand (negated scale) to the ``.minus.bw`` sibling; for any other
    ``TYPE`` the ``-`` strand (negated scale) goes to ``bigwig`` and the
    ``+`` strand to the ``.minus.bw`` sibling. When the count is zero, only
    an empty file is written to ``bigwig``.

    :param bam: path of the input BAM file.
    :param bigwig: path of the plus-strand BigWig output.
    :return: ``bigwig``, unchanged.
    """
    def bam_to_bigwig(bam, scale, strand, bw):
        raw_bg = bw.replace('.bw', '.bg')
        sorted_bg = bw.replace('.bw', '.sort.bg')
        # Coverage -> sorted bedGraph -> BigWig, then drop the intermediates.
        steps = (
            f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split > {raw_bg}',
            f'bedSort {raw_bg} {sorted_bg}',
            f'bedGraphToBigWig {sorted_bg} {options.genome}/chrNameLength.txt {bw}',
            f'rm {raw_bg} {sorted_bg}',
        )
        for step in steps:
            cmder.run(step)

    message = f'Make BigWig files for {bam} ...'
    start_time = time.perf_counter()
    logger.info(message)

    plus_bw = bigwig
    minus_bw = bigwig.replace('.plus.bw', '.minus.bw')
    with pysam.AlignmentFile(bam, 'rb') as alignments:
        total_reads = alignments.mapped
    if TYPE == 'paired':
        total_reads = total_reads / 2

    try:
        scale = 1000000.0 / total_reads
    except ZeroDivisionError:
        logger.error(
            f'No reads was found in BAM {bam}, empty BigWig file was created.')
        with open(bigwig, 'w') as placeholder:
            placeholder.write('')
        return bigwig

    if TYPE == 'single':
        jobs = ((scale, '+', plus_bw), (-1 * scale, '-', minus_bw))
    else:
        jobs = ((-1 * scale, '-', plus_bw), (scale, '+', minus_bw))
    for job_scale, strand, target in jobs:
        bam_to_bigwig(bam, job_scale, strand, target)

    elapsed = datetime.timedelta(
        seconds=int(time.perf_counter() - start_time))
    logger.info(
        message.replace(' ...', f' completed in [{str(elapsed)}].'))
    return bigwig
Ejemplo n.º 10
0
 def count_mapped_reads(bam):
     """Return the record count ``samtools view -c -F 0x4`` reports for ``bam``.

     The count is also written to the log.
     """
     reported = cmder.run(f'samtools view -c -F 0x4 {bam}', msg='').stdout.read()
     count = int(reported)
     logger.info(f'Found {count:,} mapped reads in {bam}.')
     return count
Ejemplo n.º 11
0
def count_lines(file):
    """Return the line count that ``wc -l`` reports for ``file``.

    The count is also written to the log.
    """
    # wc prints "<count> <path>"; the count is the first field.
    fields = cmder.run(f'wc -l {file}').stdout.read().split()
    lines = int(fields[0])
    logger.info(f'Found {lines:,} lines in {file}.')
    return lines
Ejemplo n.º 12
0
def make_hub_file(inputs, output):
    """Write a track hub description (hub/genome/track stanzas) to ``output``.

    A header built from ``options.track``, ``options.email`` and
    ``options.track_genome`` is written first, followed by one multiWig
    container stanza (with a plus and a minus sub-track) for every
    ``*.plus.bw`` file found in the current working directory.

    :param inputs: not used by this function.
    :param output: path of the hub file to write.
    """
    logger.info('Make hub track file ...')
    track = options.track.replace(' ', '_')
    email = options.email or '*****@*****.**'
    header = f"""hub {track}
shortLabel {options.track}
longLabel {options.track}
useOneFile on
email {email}

genome {options.track_genome}

track {track}
shortLabel {options.track}
longLabel {options.track}
type bigWig
superTrack on
"""
    block = """
track {basename}
shortLabel {basename}
longLabel {basename}
type bigWig
visibility full
alwaysZero on
autoScale on
aggregate transparentOverlay
showSubtrackColorOnUi on
parent {track}
container multiWig

    track {name1}
    bigDataUrl {plus}
    shortLabel {basename} Plus strand
    longLabel {basename} Plus strand
    type bigWig
    color 0,100,0
    parent {basename}

    track {name2}
    bigDataUrl {minus}
    shortLabel {basename} Minus strand
    longLabel {basename} Minus strand
    type bigWig
    color 100,0,0
    parent {basename}
    """

    with open(output, 'w') as hub:
        hub.write(header)
        for plus_bw in glob.iglob('*.plus.bw'):
            key = plus_bw.replace('.plus.bw', '')
            # NOTE(review): files are discovered as *.plus.bw but the URLs
            # point at {key}.pos.bw / {key}.neg.bw - confirm those files are
            # produced elsewhere.
            fields = {
                'track': track,
                'basename': key,
                'name1': f'{key} plus',
                'name2': f'{key} minus',
                'plus': f'{key}.pos.bw',
                'minus': f'{key}.neg.bw',
            }
            hub.write(block.format(**fields))
    logger.info('Make hub track file complete.')
Ejemplo n.º 13
0
def make_hub_files(inputs, output):
    """Write a track hub description (hub/genome/track stanzas) to ``output``.

    A header built from ``options.track``, ``options.track_label``,
    ``options.email`` and ``options.track_genome`` is written first, followed
    by one multiWig container stanza (with a plus and a minus sub-track) for
    every key obtained by iterating the module-level ``READS``. The elapsed
    time is logged when the file has been written.

    :param inputs: not used by this function.
    :param output: path of the hub file to write.
    """
    message = 'Make hub track file ...'
    start_time = time.perf_counter()
    logger.info(message)

    track = options.track.replace(' ', '_')
    label = options.track_label
    email = options.email or '*****@*****.**'
    header = f"""hub {track}
shortLabel {label}
longLabel {label}
useOneFile on
email {email}

genome {options.track_genome}

track {track}
shortLabel {label}
longLabel {label}
type bigWig
superTrack on
"""
    block = """
track {basename}
shortLabel {basename}
longLabel {basename}
type bigWig
visibility full
alwaysZero on
autoScale on
aggregate transparentOverlay
showSubtrackColorOnUi on
parent {track}
container multiWig

    track {name1}
    bigDataUrl {plus}
    shortLabel {basename} Plus strand
    longLabel {basename} Plus strand
    type bigWig
    color 0,100,0
    parent {basename}

    track {name2}
    bigDataUrl {minus}
    shortLabel {basename} Minus strand
    longLabel {basename} Minus strand
    type bigWig
    color 100,0,0
    parent {basename}
    """

    with open(output, 'w') as hub:
        hub.write(header)
        for key in READS:
            plus_bw = f'{key}.plus.bw'
            minus_bw = f'{key}.minus.bw'
            # Sub-track names are the plus file name without '.bw' and with
            # dots turned into underscores; the minus name mirrors it.
            plus_name = plus_bw.replace('.bw', '').replace('.', '_')
            minus_name = plus_name.replace('plus', 'minus')
            fields = {
                'track': track,
                'basename': key,
                'name1': plus_name,
                'name2': minus_name,
                'plus': plus_bw,
                'minus': minus_bw,
            }
            hub.write(block.format(**fields))

    elapsed = datetime.timedelta(
        seconds=int(time.perf_counter() - start_time))
    logger.info(
        message.replace(' ...', f' completed in [{str(elapsed)}].'))
Ejemplo n.º 14
0
def demux(fastq1, fastq2, basename, barcodes):
    """Demultiplex paired-end reads.

    The start of every read 1 is compared against a fixed table of known
    barcodes. The best match within ``options.allow_mismatch`` mismatches
    decides the barcode and is trimmed from read 1; reads without a match
    are labelled ``NIL`` and left untrimmed. The first
    ``options.randomer_length`` bases of read 2 are trimmed from read 2 and
    prepended (followed by ``:``) to both read names. Only pairs whose
    barcode is listed in ``barcodes`` are written, to
    ``{basename}.{barcode}.r1.fastq.gz`` and
    ``{basename}.{barcode}.r2.fastq.gz``. Both output paths are appended to
    the module-level ``NEED_TO_REMOVE`` list.

    :param fastq1: path of the gzipped read 1 FASTQ file.
    :param fastq2: path of the gzipped read 2 FASTQ file.
    :param basename: prefix of the output FASTQ paths.
    :param barcodes: barcode names to write output for.
    :raises ValueError: if a read pair has different names (compared up to
        the first whitespace).
    """
    def hamming(key, barcode, seq, allow_mismatch):
        # 'N' on either side counts as a match; barcode positions beyond the
        # end of seq count as mismatches because zip() stops early.
        mismatch = len(barcode) - sum(x == y or x == 'N' or y == 'N'
                                      for x, y in zip(barcode, seq))
        return (key, len(barcode),
                mismatch) if mismatch <= allow_mismatch else None

    logger.info(
        f'Demultiplexing {fastq1} and {fastq2} with barcodes {" and ".join(barcodes)} ...'
    )
    barcodes_dict = {
        'A01': 'AAGCAAT',
        'A03': 'ATGACCNNNNT',
        'A04': 'CAGCTTNNNNT',
        'B06': 'GGCTTGT',
        'C01': 'ACAAGTT',
        'D8f': 'TGGTCCT',
        'F05': 'GGATACNNNNT',
        'G07': 'TCCTGTNNNNT',
        'X1A': 'NNNNNCCTATAT',
        'X1B': 'NNNNNTGCTATT',
        'X2A': 'NNNNNTATACTT',
        'X2B': 'NNNNNATCTTCT'
    }
    allow_mismatch, randomer_length = options.allow_mismatch, options.randomer_length
    max_barcode_length = max(
        len(barcode) for barcode in barcodes_dict.values())

    writers, handles = {}, []
    try:
        for barcode in set(barcodes):
            file1, file2 = f'{basename}.{barcode}.r1.fastq.gz', f'{basename}.{barcode}.r2.fastq.gz'
            # Track every handle as soon as it is open so that the finally
            # clause can close it whatever happens afterwards.
            writer1 = gzip.open(file1, 'wt')
            handles.append(writer1)
            writer2 = gzip.open(file2, 'wt')
            handles.append(writer2)
            writers[barcode] = (writer1, writer2)
            NEED_TO_REMOVE.extend([file1, file2])

        with gzip.open(fastq1, 'rt') as f1, gzip.open(fastq2, 'rt') as f2:
            for read1, read2 in zip(FastqGeneralIterator(f1),
                                    FastqGeneralIterator(f2)):
                (name1, seq1, quality1), (name2, seq2, quality2) = read1, read2
                n1, n2 = name1.split()[0], name2.split()[0]
                # An explicit raise: an assert would be skipped under -O and
                # would surface as AssertionError instead of ValueError.
                if n1 != n2:
                    raise ValueError(
                        f'Paired-End reads have mismatch names: {name1} != {name2}')

                candidates = (hamming(key, barcode,
                                      seq1[:max_barcode_length],
                                      allow_mismatch)
                              for key, barcode in barcodes_dict.items())
                matches = [match for match in candidates if match]
                randomer = seq2[:randomer_length]
                if matches:
                    # min() returns the first candidate with the fewest
                    # mismatches, i.e. the same pick as a stable sort.
                    barcode, barcode_length, _ = min(matches,
                                                     key=lambda x: x[2])
                    r1 = f'@{randomer}:{name1}\n{seq1[barcode_length:]}\n+\n{quality1[barcode_length:]}\n'
                else:
                    barcode = 'NIL'
                    r1 = f'@{randomer}:{name1}\n{seq1}\n+\n{quality1}\n'
                r2 = f'@{randomer}:{name2}\n{seq2[randomer_length:]}\n+\n{quality2[randomer_length:]}\n'

                if barcode in writers:
                    writer1, writer2 = writers[barcode]
                    writer1.write(r1)
                    writer2.write(r2)
    finally:
        for handle in handles:
            handle.close()
    logger.info(
        f'Demultiplexing {fastq1} and {fastq2} with barcodes {" and ".join(barcodes)} complete.'
    )