def get_adapters():
    """
    Build the adapter arguments and overlap values for two trimming rounds.

    ``adapters``, ``randomer_length``, ``barcodes_fasta``, ``barcode1``,
    ``barcode2`` and ``outdir`` are not parameters; they are read from the
    enclosing scope.

    When ``adapters`` is truthy it is parsed twice with the ``-a`` flag and
    the overlaps are fixed to '1' and '5'. Otherwise ``parse_barcodes.sh`` is
    run inside a temporary folder created under ``outdir`` and the adapter
    FASTA files and overlap files it leaves there are parsed.

    :return: tuple, (adapters1, adapters2, overlap1, overlap2).
    """
    if adapters:
        adapters1 = parse_adapters('-a', adapters)
        adapters2 = parse_adapters('-a', adapters)
        overlap1, overlap2 = '1', '5'
    else:
        cmd = [
            'parse_barcodes.sh', randomer_length, barcodes_fasta, barcode1,
            barcode2
        ]
        folder = tempfile.mkdtemp(dir=outdir)
        try:
            cmder.run(cmd,
                      msg='Parsing barcodes and finding adapters ...',
                      cwd=folder)
            adapters1 = parse_adapters(
                '-g', os.path.join(folder, 'g_adapters.fasta'))
            adapters1 += parse_adapters(
                '-A', os.path.join(folder, 'A_adapters.fasta'))
            adapters1 += parse_adapters(
                '-a', os.path.join(folder, 'a_adapters.fasta'))
            adapters2 = parse_adapters(
                '-A', os.path.join(folder, 'A_adapters.fasta'))
            overlap1 = parse_overlap(
                os.path.join(folder, 'trim_first_overlap_length.txt'))
            overlap2 = parse_overlap(
                os.path.join(folder, 'trim_again_overlap_length.txt'))
        finally:
            # Remove the scratch folder even when the script or the parsing
            # fails, so failed runs do not leave temporary folders behind.
            shutil.rmtree(folder)
    return adapters1, adapters2, overlap1, overlap2
Exemple #2
0
def merge_bam(bams, bam):
    """
    Merge several BAM files into one with ``samtools merge``.

    Nothing is run when ``bam`` already exists as a file; an info message is
    logged instead.

    :param bams: paths of the BAM files to merge (joined with single spaces).
    :param bam: str, path to the merged BAM file.
    :return: str, ``bam`` as passed in.
    """
    if os.path.isfile(bam):
        logger.info(f'BAM file {bam} already exist.')
        return bam

    inputs = " ".join(bams)
    cmder.run(f'samtools merge {bam} {inputs}',
              msg=f'Merging {inputs} to {bam} ...')
    return bam
Exemple #3
0
 def trim_adapters(adapters, overlap, ios, message):
     """
     Run cutadapt with a fixed set of options plus the given arguments.

     :param adapters: list, adapter arguments appended after the fixed options.
     :param overlap: value passed to cutadapt's ``-O`` option.
     :param ios: list, arguments appended last (presumably the input/output
                 files - confirm against callers).
     :param message: str, message handed to ``cmder.run``.
     """
     fixed_options = [
         'cutadapt',
         '-O', overlap,
         '-j', options.cores,
         '--match-read-wildcards',
         '--times', 1,
         '-e', 0.1,
         '--quality-cutoff', 6,
         '-m', 18,
     ]
     cmder.run(fixed_options + adapters + ios, msg=message, pmt=True)
Exemple #4
0
def cut_adapt(fastq, output):
    """
    Run ``eclip_cut_adapt`` on a FASTQ file.

    The adapters are looked up in ``fastq_to_adapters`` using the part of
    ``fastq`` that precedes its last '.umi.fastq.gz'.

    :param fastq: str, path to the input FASTQ file.
    :param output: str, value passed to the ``-o`` option.
    """
    key, *_ = fastq.rsplit(".umi.fastq.gz", maxsplit=1)
    adapter = fastq_to_adapters[key]
    cmd = ['eclip_cut_adapt', fastq, '-o', output, '-a', adapter]
    cmd += ['-c', args.cpus]
    cmder.run(cmd, stdout=sys.stdout, stderr=sys.stderr, log_cmd=False)
Exemple #5
0
def map_to_repeat_elements(fastq, mate1):
    """
    Align reads with STAR against the index in ``options.repeat``.

    The read 2 path is derived from ``fastq`` by replacing '.r1.' with '.r2.'
    and is only added to the command when that file exists. STAR writes into
    the parent folder of ``mate1``, which is created when missing.

    :param fastq: str, path to the read 1 FASTQ file.
    :param mate1: str, path whose parent folder is used as the STAR output prefix.
    :return: str, ``mate1`` as passed in.
    """
    fastq1 = fastq
    fastq2 = fastq.replace('.r1.', '.r2.')

    prefix = os.path.dirname(mate1)
    if not os.path.isdir(prefix):
        os.mkdir(prefix)

    cmd = [
        'STAR',
        '--runMode', 'alignReads',
        '--runThreadN', options.cores,
        '--alignEndsType', 'EndToEnd',
        '--genomeDir', options.repeat,
        '--genomeLoad', 'NoSharedMemory',
        '--outBAMcompression', 10,
        '--outFileNamePrefix', f"{prefix}/",
        '--outFilterMultimapNmax', 30,
        '--outFilterMultimapScoreRange', 1,
        '--outFilterScoreMin', 10,
        '--outFilterType', 'BySJout',
        '--outReadsUnmapped', 'Fastx',
        '--outSAMattrRGline', 'ID:foo',
        '--outSAMattributes', 'All',
        '--outSAMmode', 'Full',
        '--outSAMtype', 'BAM', 'Unsorted',
        '--outSAMunmapped', 'Within',
        '--outStd', 'Log',
        '--readFilesIn', fastq1,
    ]

    if os.path.exists(fastq2):
        cmd.append(fastq2)
        # The second line of the message is padded with 28 spaces.
        pad = 28 * " "
        message = (f'Map paired reads {fastq1} {size(fastq1)} and\n{pad}'
                   f'{fastq2} {size(fastq2)} to repeat elements ...')
    else:
        message = f'Map single read {fastq1} {size(fastq1)} to repeat elements ...'

    cmder.run(cmd, msg=message, pmt=True)
    return mate1
Exemple #6
0
def map_to_repbase(fastq, mate):
    """
    Run ``star_repbase_map`` on a FASTQ file using the index in ``args.repeat``.

    The output BAM path is ``mate`` with its last '.mate1.gz' removed and
    '.bam' appended.

    :param fastq: str, path to the input FASTQ file.
    :param mate: str, path used to derive the output BAM path.
    """
    stem = mate.rsplit(".mate1.gz", maxsplit=1)[0]
    bam = f'{stem}.bam'
    cmd = ['star_repbase_map', fastq]
    cmd += ['-x', args.repeat, '-c', args.cpus, '-o', bam]
    cmder.run(cmd, stdout=sys.stdout, stderr=sys.stderr, log_cmd=False)
Exemple #7
0
 def get_adapters(read):
     """
     Build the adapter arguments and overlap values for two trimming rounds.

     When ``read.adapters`` is truthy it is parsed with the ``-a`` flag, the
     same parsed object is returned for both rounds and the overlaps are
     fixed to '1' and '5'. Otherwise ``parse_barcodes.sh`` is run with
     ``read.barcodes`` inside a temporary folder created under ``ECLIP`` and
     the adapter FASTA files and overlap files it leaves there are parsed.

     :param read: object providing ``adapters`` and ``barcodes`` attributes.
     :return: tuple, (adapters1, adapters2, overlap1, overlap2).
     """
     adapter = read.adapters
     if adapter:
         adapters1 = parse_adapters('-a', adapter)
         adapters2 = adapters1
         overlap1, overlap2 = '1', '5'
     else:
         cmd = [
             'parse_barcodes.sh', options.randomer_length,
             options.barcodes_fasta
         ] + read.barcodes
         folder = tempfile.mkdtemp(dir=ECLIP)
         try:
             cmder.run(cmd,
                       msg='Parsing barcodes and finding adapters ...',
                       cwd=folder)
             adapters1 = parse_adapters(
                 '-g', os.path.join(folder, 'g_adapters.fasta'))
             adapters1 += parse_adapters(
                 '-A', os.path.join(folder, 'A_adapters.fasta'))
             adapters1 += parse_adapters(
                 '-a', os.path.join(folder, 'a_adapters.fasta'))
             adapters2 = parse_adapters(
                 '-A', os.path.join(folder, 'A_adapters.fasta'))
             overlap1 = parse_overlap(
                 os.path.join(folder, 'trim_first_overlap_length.txt'))
             overlap2 = parse_overlap(
                 os.path.join(folder, 'trim_again_overlap_length.txt'))
         finally:
             # Remove the scratch folder even when the script or the parsing
             # fails, so failed runs do not leave temporary folders behind.
             shutil.rmtree(folder)
     return adapters1, adapters2, overlap1, overlap2
Exemple #8
0
def extract_umi(fastq, umi):
    """
    Extract UMIs from a FASTQ file with ``umi_tools extract``.

    A 10-base 'NNNNNNNNNN' barcode pattern and a fixed random seed of 1 are used.

    :param fastq: str, path passed to ``--stdin``.
    :param umi: str, path passed to ``--stdout``.
    """
    cmd = [
        'umi_tools', 'extract',
        '--random-seed', 1,
        '--stdin', fastq,
        '--bc-pattern', 'NNNNNNNNNN',
        '--stdout', umi,
        # NOTE(review): a '>' redirection inside a list command only works if
        # cmder.run runs it through a shell - confirm.
        '>', '/dev/null',
    ]
    cmder.run(cmd, msg=f'Extract UMIs for {fastq} ...', pmt=True)
Exemple #9
0
def merge_bam(bam1, bam2, bam):
    """
    Merge two BAM files into one with ``samtools merge``.

    Nothing is run when ``bam`` already exists as a file; an info message is
    logged instead.

    :param bam1: str, path to the first BAM file.
    :param bam2: str, path to the second BAM file.
    :param bam: str, path to the merged BAM file.
    :return: str, ``bam`` as passed in.
    """
    if os.path.isfile(bam):
        logger.info(f'BAM file {bam} already exist.')
        return bam

    cmder.run(f'samtools merge {bam} {bam1} {bam2}',
              msg=f'Merging {bam1} and {bam2} ...')
    return bam
Exemple #10
0
def map_to_reference_genome(mate1, bam):
    """
    Align reads with STAR against the index in ``options.genome``.

    The mate 2 path is derived from ``mate1`` by replacing '.mate1' with
    '.mate2' and is only added to the command when that file exists. STAR
    writes into the parent folder of ``bam``, which is created when missing.

    :param mate1: str, path to the mate 1 reads file.
    :param bam: str, path whose parent folder is used as the STAR output prefix.
    :return: str, ``bam`` as passed in.
    """
    prefix = os.path.dirname(bam)
    if not os.path.isdir(prefix):
        os.mkdir(prefix)

    mate2 = mate1.replace('.mate1', '.mate2')
    cmd = [
        'STAR',
        '--runMode', 'alignReads',
        '--runThreadN', options.cores,
        '--alignEndsType', 'EndToEnd',
        '--genomeDir', options.genome,
        '--genomeLoad', 'NoSharedMemory',
        '--outBAMcompression', 10,
        '--outFileNamePrefix', f"{prefix}/",
        '--outFilterMultimapNmax', 1,
        '--outFilterMultimapScoreRange', 1,
        '--outFilterScoreMin', 10,
        '--outFilterType', 'BySJout',
        '--outReadsUnmapped', 'Fastx',
        '--outSAMattrRGline', 'ID:foo',
        '--outSAMattributes', 'All',
        '--outSAMmode', 'Full',
        '--outSAMtype', 'BAM', 'Unsorted',
        # Must stay 'Within': otherwise barcode_collapse.py (duplication
        # removal) throws out a name-not-match error.
        '--outSAMunmapped', 'Within',
        '--outStd', 'Log',
        '--readFilesIn', mate1,
    ]

    if os.path.exists(mate2):
        cmd.append(mate2)
        message = (f'Map paired mates {mate1} {size(mate1)} and\n{28 * " "}'
                   f'{mate2} {size(mate2)} to reference genome ...')
    else:
        message = f'Map single mate {mate1} {size(mate1)} to reference genome ...'

    cmder.run(cmd, msg=message, pmt=True)
    return bam
Exemple #11
0
def dedup_bam(bam, out):
    """
    Deduplicate SE BAM using umi_tools dedup, then index the result.

    :param bam: str, path to the BAM file to deduplicate.
    :param out: str, path to the deduplicated BAM file, which is then indexed
                with ``samtools index``.
    """
    cmd = [
        'umi_tools', 'dedup', '--random-seed', 1, '--stdin', bam, '--method',
        'unique', '--stdout', out
    ]
    cmder.run(cmd, msg=f'Deduplicating {bam} by umi_tools dedup ...')
    # The file being indexed is the deduplicated output, so the progress
    # message names ``out`` (it used to name the input BAM).
    cmder.run(f'samtools index {out}', msg=f'Indexing {out} ...')
Exemple #12
0
def pigz(fastq, gz):
    """
    Compress a FASTQ file with pigz using ``PROCESSES`` processes.

    The two branches pass the same values; only the spelling of the option
    differs (``-p`` when ``options.debug`` is set, ``--processes`` otherwise).

    :param fastq: str, path to the file handed to pigz.
    :param gz: str, path returned unchanged; its existence is only printed.
    :return: str, ``gz`` as passed in.
    """
    print(f'Compressing {fastq}, which {os.path.exists(gz)}')
    flag = '-p' if options.debug else '--processes'
    cmder.run(f'pigz {flag} {PROCESSES} {fastq}',
              msg=f'Compressing {fastq} ...',
              pmt=True)
    return gz
Exemple #13
0
def index_bam(bam, out):
    """
    Create the final BAM file and index it.

    For paired data (``TYPE == 'paired'``) only the reads selected by
    ``samtools view -f 128`` are written to ``out``; otherwise ``bam`` is
    copied to ``out``.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the BAM file to create and index.
    """
    if TYPE == 'paired':
        cmder.run(f'samtools view -f 128 -@ {options.cores} -b -o {out} {bam}',
                  msg=f'Creating bam {bam} {size(bam)} with r2 reads only ...')
    else:
        cmder.run(f'cp {bam} {out}')
    # ``out`` has just been (re)written, so it always needs a fresh index.
    # The previous guard looked for an index of the *input* (``{bam}.bai``),
    # which says nothing about ``out`` and could leave it without an index.
    index_sorted_bam(out)
Exemple #14
0
def clipper_peaks(bam, bed=''):
    """
    Call peaks from a BAM file with clipper.

    Clipper is not run when the output BED file already exists; an info
    message is logged instead.

    :param bam: str, path to the BAM file.
    :param bed: str, path to the output BED file. When empty, it is derived
                from ``bam`` by replacing '.ip.bam' with '.peak.clusters.bed'.
    :return: str, path to the BED file.
    """
    if not bed:
        bed = bam.replace('.ip.bam', '.peak.clusters.bed')

    if os.path.isfile(bed):
        logger.info(f'Clipper bed {bed} already exists.')
        return bed

    cmd = f'clipper --species {options.species} --processors {options.cpus} --bam {bam} --outfile {bed}'
    cmder.run(cmd, msg=f'Calling peaks from {bam} using clipper ...', pmt=True)
    return bed
Exemple #15
0
def prepare_bam(bam, out):
    """
    Sort a BAM file in the way required by the current ``TYPE``.

    For any ``TYPE`` other than 'single' the BAM is only name-sorted into
    ``out``. For 'single' it is name-sorted into an intermediate file,
    position-sorted into ``out``, indexed, and the intermediate is removed.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the output BAM file.
    """
    if TYPE != 'single':
        name_sort_bam(bam, out)
        return

    intermediate = out.replace('.sort.bam', '.name.sort.bam')
    name_sort_bam(bam, intermediate)
    position_sort_bam(intermediate, out)
    index_sorted_bam(out)
    cmder.run(f'rm {intermediate}')
Exemple #16
0
def motif_analysis(bed, output):
    """
    Run the ``motif`` command on a BED file and compile its results.

    The basename handed to ``motif`` and to ``compile_motif_html`` is the
    part of ``output`` that precedes the first '.motifs.'.

    :param bed: str, path to the BED file.
    :param output: str, path passed to ``compile_motif_html``.
    """
    basename, _, _ = output.partition('.motifs.')
    cmd = ['motif', bed, options.species, options.outdir, basename]
    cmd += [options.l10p, options.l2fc, options.cpus]
    cmder.run(cmd, msg=f'Finding motifs in {bed} ...')

    logger.info(f'Parsing and compiling motifs for {basename} ...')
    compile_motif_html(basename, output)
    logger.info(f'Parsing and compiling motifs for {basename} complete.')
Exemple #17
0
def falco(fastq, txt):
    """
    Run falco on a FASTQ file and keep only its ``fastqc_data.txt`` report.

    falco writes into a temporary folder created under ``QC``; the folder is
    removed afterwards whether or not the commands succeed.

    :param fastq: str, path to the FASTQ file.
    :param txt: str, destination path for ``fastqc_data.txt``.
    """
    scratch = tempfile.mkdtemp(suffix='_qc', prefix='falco_', dir=QC)
    command = f'falco --outdir {scratch} --skip-html {fastq}'
    try:
        message = f'Checking reads in {fastq} {size(fastq)} using falco ...'
        cmder.run(command, msg=message)
        cmder.run(f'mv {scratch}/fastqc_data.txt {txt}')
    finally:
        shutil.rmtree(scratch)
Exemple #18
0
def dedup_bam(bam, out):
    """
    Collapse barcodes of paired-end bam or umi_tools dedup single-end bam.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the deduplicated BAM file.
    """
    if TYPE != 'single':
        collapse_barcode(bam, out)
        return

    cmd = [
        'umi_tools', 'dedup',
        '--random-seed', 1,
        '--stdin', bam,
        '--method', 'unique',
        '--stdout', out,
    ]
    cmder.run(cmd,
              msg=f'Deduplicating {bam} {size(bam)} by umi_tools dedup ...',
              pmt=True)
Exemple #19
0
def peak(ip_bams, input_bams, peak_beds, reproducible_bed, outdir, cwd=''):
    """
    Run the ``peak`` command on IP/input BAM files and peak BED files.

    The read type is 'PE' when ``TYPE`` is 'paired' and 'SE' otherwise. The
    species 'hg19chr19' is passed on as 'hg19'.

    :param ip_bams: paths joined with spaces for ``--ip_bams``.
    :param input_bams: paths joined with spaces for ``--input_bam``.
    :param peak_beds: paths joined with spaces for ``--peak_beds``.
    :param reproducible_bed: str, path returned unchanged; its parent folder
                             is the working directory when ``cwd`` is empty.
    :param outdir: str, value passed to ``--outdir``.
    :param cwd: str, working directory for the command.
    :return: str, ``reproducible_bed`` as passed in.
    """
    cmd = ['peak']
    cmd += ['--ip_bams', ' '.join(ip_bams)]
    cmd += ['--input_bam', ' '.join(input_bams)]
    cmd += ['--peak_beds', ' '.join(peak_beds)]
    cmd += ['--read_type', 'PE' if TYPE == 'paired' else 'SE']

    species = options.species
    if species in ('hg19', 'hg19chr19'):
        species = 'hg19'
    cmd += ['--species', species]
    cmd += ['--outdir', outdir, '--cores', options.cores]

    workdir = cwd or os.path.dirname(reproducible_bed)
    cmder.run(cmd, cwd=workdir, stdout=sys.stdout, stderr=sys.stderr)
    return reproducible_bed
Exemple #20
0
def peak(ip_bams, input_bams, peak_beds, reproducible_bed, outdir):
    """
    Run the ``peak`` command for single-end data inside ``options.outdir``.

    The species 'hg19chr19' is passed on as 'hg19'. The l2fc and l10p values
    come from ``options``.

    :param ip_bams: paths joined with spaces for ``--ip_bams``.
    :param input_bams: paths joined with spaces for ``--input_bam``.
    :param peak_beds: paths joined with spaces for ``--peak_beds``.
    :param reproducible_bed: str, path returned unchanged.
    :param outdir: str, value passed to ``--outdir``.
    :return: str, ``reproducible_bed`` as passed in.
    """
    cmd = ['peak']
    cmd += ['--ip_bams', ' '.join(ip_bams)]
    cmd += ['--input_bam', ' '.join(input_bams)]
    cmd += ['--peak_beds', ' '.join(peak_beds)]
    cmd += ['--read_type', 'SE']

    species = options.species
    if species in ('hg19', 'hg19chr19'):
        species = 'hg19'
    cmd += ['--species', species]
    cmd += ['--outdir', outdir, '--cores', options.cpus]
    cmd += ['--l2fc', options.l2fc, '--l10p', options.l10p]

    cmder.run(cmd, cwd=options.outdir, stdout=sys.stdout, stderr=sys.stderr)
    return reproducible_bed
 def trim_adapters(adapters, overlap, ios, message):
     """
     Run cutadapt with a fixed set of options plus the given arguments.

     ``cpus`` and ``debug`` are read from the enclosing scope. When ``debug``
     is truthy the command's stdout and stderr are additionally wired to the
     current process' streams.

     :param adapters: list, adapter arguments appended after the fixed options.
     :param overlap: value passed to cutadapt's ``-O`` option.
     :param ios: list, arguments appended last (presumably the input/output
                 files - confirm against callers).
     :param message: str, message handed to ``cmder.run``.
     """
     fixed_options = [
         'cutadapt',
         '-O', overlap,
         '-j', cpus,
         '--match-read-wildcards',
         '--times', 1,
         '-e', 0.1,
         '--quality-cutoff', 6,
         '-m', 18,
     ]
     cmd = fixed_options + adapters + ios

     run_options = {'msg': message, 'pmt': True}
     if debug:
         run_options.update(stdout=sys.stdout, stderr=sys.stderr)
     cmder.run(cmd, **run_options)
Exemple #22
0
def peak(ip_bams, input_bams, peak_beds, reproducible_bed, outdir, cwd):
    """
    Run the ``peak`` command for single-end data.

    The species 'hg19chr19' is passed on as 'hg19'. ``ids`` is not a
    parameter; it is read from the enclosing scope and, when truthy, passed
    on via ``--ids``.

    :param ip_bams: paths joined with spaces for ``--ip_bams``.
    :param input_bams: paths joined with spaces for ``--input_bam``.
    :param peak_beds: paths joined with spaces for ``--peak_beds``.
    :param reproducible_bed: str, path returned unchanged; its parent folder
                             is the working directory when ``cwd`` is falsy.
    :param outdir: str, value passed to ``--outdir``.
    :param cwd: str, working directory for the command.
    :return: str, ``reproducible_bed`` as passed in.
    """
    cmd = ['peak']
    cmd += ['--ip_bams', ' '.join(ip_bams)]
    cmd += ['--input_bam', ' '.join(input_bams)]
    cmd += ['--peak_beds', ' '.join(peak_beds)]
    cmd += ['--read_type', 'SE']

    species = options.species
    if species in ('hg19', 'hg19chr19'):
        species = 'hg19'
    cmd += ['--species', species]
    cmd += ['--outdir', outdir, '--cores', options.cpus]
    cmd += ['--l2fc', options.l2fc, '--l10p', options.l10p]
    if ids:
        cmd += ['--ids', ' '.join(ids)]

    workdir = cwd or os.path.dirname(reproducible_bed)
    cmder.run(cmd, cwd=workdir, stdout=sys.stdout, stderr=sys.stderr)
    return reproducible_bed
Exemple #23
0
def pureclip(bam, bed):
    """
    Call crosslink sites from an IP BAM file with pureCLIP.

    The matching input BAM is taken from the first entry of ``SAMPLES`` whose
    ``ip_read.bam`` equals ``bam``; an IndexError is raised when none matches.

    :param bam: str, path to the IP BAM file.
    :param bed: str, path passed to ``-o``. The ``-or`` and log paths are
                derived from it by replacing '.crosslink.sites.bed'.
    """
    matches = [[sample.ip_read.bam, sample.input_read.bam]
               for sample in SAMPLES if sample.ip_read.bam == bam]
    ip_bam, input_bam = matches[0]

    regions = bed.replace('.crosslink.sites.bed', '.binding.regions.bed')
    log = bed.replace('.crosslink.sites.bed', '.pureclip.log')

    # '-iv', "'chr1;chr2;chr3'", Genomic chromosomes to learn HMM parameters
    cmd = [
        'pureclip',
        '-i', ip_bam,
        '-bai', f'{ip_bam}.bai',
        '-g', f'{options.genome}/genome.fa',
        '-nt', options.cores,
        '-ibam', input_bam,
        '-ibai', f'{input_bam}.bai',
        '-o', bed,
        '-or', regions,
        # NOTE(review): a '>' redirection inside a list command only works if
        # cmder.run runs it through a shell - confirm.
        '>', log,
    ]
    message = f'Calling peaks from {bam} {size(bam)} using pureCLIP ...'
    cmder.run(cmd, msg=message, pmt=True)
Exemple #24
0
def map_to_reference_genome(mate, bam):
    """
    Align single-end reads with STAR against the index in ``options.genome``.

    STAR runs in a working folder derived from ``mate`` ('.repeat.unmap.fastq.gz'
    replaced by '.genome.map'). Afterwards the final log, the compressed
    unmapped reads and the aligned BAM are moved next to ``bam`` and the
    working folder is removed, whether or not the steps succeed.

    :param mate: str, path to the gzipped reads file (read through ``zcat``).
    :param bam: str, path to the output BAM file; the log and unmapped-reads
                paths are derived from it by replacing '.genome.map.bam'.
    :return: str, ``bam`` as passed in.
    """
    workdir = mate.replace('.repeat.unmap.fastq.gz', '.genome.map')
    try:
        if not os.path.isdir(workdir):
            os.mkdir(workdir)

        cmd = [
            'STAR',
            '--runMode', 'alignReads',
            '--runThreadN', options.cpus,
            '--alignEndsType', 'EndToEnd',
            '--genomeDir', options.genome,
            '--genomeLoad', 'NoSharedMemory',
            '--outBAMcompression', 10,
            '--outFileNamePrefix', f"{workdir}/",
            '--outFilterMultimapNmax', 1,
            '--outFilterMultimapScoreRange', 1,
            '--outFilterScoreMin', 10,
            '--outFilterType', 'BySJout',
            '--outReadsUnmapped', 'Fastx',
            '--outSAMattrRGline', 'ID:foo',
            '--outSAMattributes', 'All',
            '--outSAMmode', 'Full',
            '--outSAMtype', 'BAM', 'Unsorted',
            '--outSAMunmapped', 'None',
            '--outStd', 'Log',
            '--readFilesCommand', 'zcat',
            '--readFilesIn', mate,
        ]
        cmder.run(
            cmd,
            msg=f'Map SE repeat elements unmapped reads in {mate} to reference genome ...')

        log = bam.replace(".genome.map.bam", ".genome.map.log")
        cmder.run(f'mv {workdir}/Log.final.out {log}')

        unmapped = bam.replace('.genome.map.bam', '.genome.unmap.fastq.gz')
        cmder.run(
            f'pigz -c -p {options.cpus} {workdir}/Unmapped.out.mate1 > {unmapped}')

        cmder.run(f'mv {workdir}/Aligned.out.bam {bam}')
    finally:
        shutil.rmtree(workdir)
    return bam
Exemple #25
0
def repetitive_elements_map(bam, tsv):
    """
    Run ``repeat-maps`` for the dataset a BAM file belongs to.

    The dataset name is ``bam`` with '.genome.map.sort.bam' removed; the
    FASTQ path is ``bam`` with that suffix replaced by '.trim.fastq.gz'.

    :param bam: str, path to the BAM file.
    :param tsv: not used inside this function.
    """
    suffix = '.genome.map.sort.bam'
    fastq = bam.replace(suffix, '.trim.fastq.gz')
    dataset = bam.replace(suffix, '')

    cmd = [
        'repeat-maps',
        '--fastq', fastq,
        '--bam', bam,
        '--dataset', dataset,
        '--scheduler', 'local',
        '--cpus', options.cpus,
        '--species', options.species,
        '--outdir', options.outdir,
    ]
    cmder.run(cmd, msg=f'Mapping {dataset} repetitive elements ...')
Exemple #26
0
def merge_paired_bam(bam, out):
    """
    Merge the two barcode-specific BAM files of a paired-end read.

    Nothing happens when ``out`` already exists. The read is looked up in
    ``READS`` by the base name of ``out`` without '.merge.bam'. When its first
    barcode is 'NIL', ``bam`` is simply copied to ``out``; otherwise the
    sibling BAM is found by swapping one barcode for the other in ``bam`` and
    both files are merged with ``samtools merge``.

    :param bam: str, path to one of the barcode-specific BAM files.
    :param out: str, path to the merged BAM file.
    """
    if os.path.exists(out):
        return

    key = out.replace(".merge.bam", "")
    barcodes = READS[os.path.basename(key)].barcodes
    if barcodes[0] == 'NIL':
        cmder.run(f'cp {bam} {out}')
        return

    first, second = barcodes
    if first in bam:
        bam1, bam2 = bam, bam.replace(first, second)
    else:
        bam1, bam2 = bam.replace(second, first), bam

    cmder.run(f'samtools merge -@ {options.cores} {out} {bam1} {bam2}',
              msg=f'Merging {bam1} {size(bam1)} and {bam2} {size(bam2)} ...',
              pmt=True)
def starmap(fastq1, fastq2, index, mnm, bam, cpus=1, debug=False):
    """
    Align reads with STAR inside a temporary folder and collect its outputs.

    The temporary folder is created next to ``fastq1`` (or in the current
    working directory when ``fastq1`` has no folder part). The aligned BAM,
    the unmapped mates and the final log are then moved out of it (copied in
    debug mode) and the folder is removed unless ``debug`` is set.

    NOTE(review): ``label`` and ``basename`` are used below but are not
    defined in this function, and ``bam`` is never read - confirm where those
    two names are expected to come from.

    :param fastq1: str, path to a FASTQ file (read 1); read through ``zcat``
                   when it ends with '.gz'.
    :param fastq2: str, path to a FASTQ file (read 2). For a single-end
                   dataset, pass a non-existing file path or the special
                   string 'none' or 'None' to satisfy the required argument.
    :param index: str, path to a STAR genome or repeat elements index directory.
    :param mnm: int, maximum number of loci the read is allowed to map to.
    :param bam: str, path to the output BAM file, must end with .bam extension.
    :param cpus: int, the number of CPUs that can be used.
    :param debug: bool, set to True to invoke debug mode (only for development purpose).
    """

    parent = os.path.dirname(fastq1) or os.getcwd()
    workdir = tempfile.mkdtemp(prefix=os.path.basename(fastq1),
                               suffix='.star.map',
                               dir=parent)
    reader = 'zcat' if fastq1.endswith('.gz') else '-'
    cmd = [
        'STAR',
        '--runMode', 'alignReads',
        '--runThreadN', cpus,
        '--alignEndsType', 'EndToEnd',
        '--genomeDir', index,
        '--outBAMcompression', 10,
        '--outFileNamePrefix', f'{workdir}/',
        '--outFilterMultimapNmax', mnm,
        '--outFilterScoreMin', 10,
        '--outReadsUnmapped', 'Fastx',
        '--outSAMattrRGline', 'ID:foo',
        '--outSAMattributes', 'All',
        '--outSAMtype', 'BAM', 'Unsorted',
        '--readFilesCommand', reader,
        '--readFilesIn', fastq1,
    ]
    if os.path.isfile(fastq2):
        cmd.append(fastq2)
        message = f'Map paired reads {fastq1} and\n{28 * " "}{fastq2} to {label} using STAR ...'
    else:
        message = f'Map single read {fastq1} to {label} using STAR ...'
    cmder.run(cmd, msg=message, pmt=True, debug=debug)

    # Debug mode keeps the STAR folder intact, so files are copied, not moved.
    transfer = shutil.copy if debug else shutil.move
    results = (
        ('Aligned.out.bam', '{name}.bam'),
        ('Unmapped.out.mate1', '{name}.unmap.mate1'),
        ('Log.final.out', '{name}.log.final.out'),
    )
    for produced, template in results:
        transfer(os.path.join(workdir, produced),
                 template.format(name=basename))

    # The second unmapped mate is only collected when STAR left one behind.
    unmapped_mate2 = os.path.join(workdir, 'Unmapped.out.mate2')
    if os.path.isfile(unmapped_mate2):
        transfer(unmapped_mate2, '{name}.unmap.mate2'.format(name=basename))

    if not debug:
        shutil.rmtree(workdir)
 def bam_to_bigwig(bam, scale, strand, bw, genome_length):
     """
     Convert a BAM file into a bigWig file through an intermediate bedGraph.

     Runs, in order: genomeCoverageBed, bedSort, bedGraphToBigWig, and finally
     removes both intermediate bedGraph files.

     :param bam: str, path to the BAM file.
     :param scale: value passed to ``-scale``.
     :param strand: value passed to ``-strand``.
     :param bw: str, path to the bigWig file; the intermediate paths are
                derived from it by replacing '.bw' with '.bg' and '.sort.bg'.
     :param genome_length: value passed as second argument to bedGraphToBigWig.
     """
     bedgraph = bw.replace('.bw', '.bg')
     sorted_bedgraph = bw.replace('.bw', '.sort.bg')
     steps = (
         f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split > {bedgraph}',
         f'bedSort {bedgraph} {sorted_bedgraph}',
         f'bedGraphToBigWig {sorted_bedgraph} {genome_length} {bw}',
         f'rm {bedgraph} {sorted_bedgraph}',
     )
     for step in steps:
         cmder.run(step)
Exemple #29
0
def umi_dedup(bam, output, debug=False):
    """
    Remove duplicates from a single-end BAM file with ``umi_tools dedup``.

    The 'unique' method and a fixed random seed of 1 are used.

    :param bam: str, path to BAM file.
    :param output: str, path to the output file.
    :param debug: bool, set to True for invoking debug mode.
    """

    cmd = [
        'umi_tools', 'dedup',
        '--random-seed', 1,
        '--stdin', bam,
        '--method', 'unique',
        '--stdout', output,
    ]
    message = f'Deduplicating {bam} by umi_tools dedup ...'
    cmder.run(cmd, msg=message, pmt=True, debug=debug)
Exemple #30
0
def prepare_reads(link, output):
    """
    Extract UMIs for single-end reads or demultiplex paired-end reads.

    The read is looked up in ``READS`` by the base name of ``link`` without
    '.r1.fastq.gz'. A read with a truthy ``link2`` is demultiplexed;
    otherwise UMIs are extracted with ``umi_tools extract`` and the resulting
    FASTQ path is appended to ``NEED_TO_REMOVE``.

    :param link: str, path to the read 1 FASTQ link.
    :param output: not used inside this function.
    """
    name = os.path.basename(link.replace('.r1.fastq.gz', ''))
    read = READS[name]
    fastq1, fastq2 = read.link1, read.link2

    if fastq2:
        demux(fastq1, fastq2, fastq1.replace('.r1.fastq.gz', ''),
              read.barcodes)
        return

    message = f'Extract UMIs for single-end read {fastq1} {size(fastq1)} ...'
    metrics = fastq1.replace('.fastq.gz', '.extract.metrics')
    umi_fastq = fastq1.replace('.r1.fastq.gz', '.umi.r1.fastq.gz')
    cmd = [
        'umi_tools', 'extract',
        '--random-seed', 1,
        '--stdin', fastq1,
        '--bc-pattern', 'NNNNNNNNNN',
        '--log', metrics,
        '--stdout', umi_fastq,
    ]
    cmder.run(cmd, msg=message, pmt=True)
    NEED_TO_REMOVE.append(umi_fastq)