Example #1
0
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs):
    '''Run deplete_blastn on both files of a paired fastq set, keeping them mated.

    Each side is depleted on its own, and read_utils.purge_unmated is run
    after each depletion so that a read pair lost from one file is also
    dropped from the other.

    infq1, infq2:   the two input fastq files of the read pairs.
    outfq1, outfq2: the two output files written by the final purge_unmated.
    refDbs:         handed unchanged to deplete_blastn (presumably the blast
                    databases to deplete against -- confirm in deplete_blastn).

    The four intermediate fastq files are removed before returning, also
    when one of the steps raises.
    '''
    import os  # function-scope import: only needed for the cleanup below

    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    try:
        # deplete fq1
        deplete_blastn(infq1, tmpfq1_a, refDbs)
        # purge fq2 of read pairs lost in fq1
        # (this should significantly speed up the second run of deplete_blastn)
        read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
        # deplete fq2
        deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs)
        # purge fq1 of read pairs lost in fq2
        read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
    finally:
        # Best-effort cleanup: a temp file may be missing if a step failed
        # early, and a cleanup problem must not mask the real exception.
        for tmpfq in (tmpfq1_a, tmpfq1_b, tmpfq2_b, tmpfq2_c):
            try:
                os.unlink(tmpfq)
            except OSError:
                pass
Example #2
0
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs, threads):
    '''Use blastn to remove reads that match at least one of the databases.

    Works on a paired fastq set: each side is depleted on its own with
    deplete_blastn, and read_utils.purge_unmated is run after each depletion
    so that a read pair lost from one file is also dropped from the other.

    infq1, infq2:   the two input fastq files of the read pairs.
    outfq1, outfq2: the two output files written by the final purge_unmated.
    refDbs:         the databases, handed unchanged to deplete_blastn.
    threads:        forwarded to both deplete_blastn calls (presumably the
                    number of blastn threads -- confirm in deplete_blastn).

    The four intermediate fastq files are removed before returning, also
    when one of the steps raises.
    '''
    import os  # function-scope import: only needed for the cleanup below

    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    try:
        # deplete fq1 (threads is passed here too, so that both passes
        # honour the caller's setting rather than only the second one)
        deplete_blastn(infq1, tmpfq1_a, refDbs, threads)
        # purge fq2 of read pairs lost in fq1
        # (this should significantly speed up the second run of deplete_blastn)
        read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
        # deplete fq2
        deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs, threads)
        # purge fq1 of read pairs lost in fq2
        read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
    finally:
        # Best-effort cleanup: a temp file may be missing if a step failed
        # early, and a cleanup problem must not mask the real exception.
        for tmpfq in (tmpfq1_a, tmpfq1_b, tmpfq2_b, tmpfq2_c):
            try:
                os.unlink(tmpfq)
            except OSError:
                pass
Example #3
0
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs, threads):
    '''Use blastn to remove reads that match at least one of the databases.

    Works on a paired fastq set: each side is depleted on its own with
    deplete_blastn, and read_utils.purge_unmated is run after each depletion
    so that a read pair lost from one file is also dropped from the other.

    infq1, infq2:   the two input fastq files of the read pairs.
    outfq1, outfq2: the two output files written by the final purge_unmated.
    refDbs:         the databases, handed unchanged to deplete_blastn.
    threads:        forwarded to both deplete_blastn calls (presumably the
                    number of blastn threads -- confirm in deplete_blastn).

    The four intermediate fastq files are removed before returning, also
    when one of the steps raises.
    '''
    import os  # function-scope import: only needed for the cleanup below

    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    try:
        # deplete fq1 (threads is passed here too, so that both passes
        # honour the caller's setting rather than only the second one)
        deplete_blastn(infq1, tmpfq1_a, refDbs, threads)
        # purge fq2 of read pairs lost in fq1
        # (this should significantly speed up the second run of deplete_blastn)
        read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
        # deplete fq2
        deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs, threads)
        # purge fq1 of read pairs lost in fq2
        read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
    finally:
        # Best-effort cleanup: a temp file may be missing if a step failed
        # early, and a cleanup problem must not mask the real exception.
        for tmpfq in (tmpfq1_a, tmpfq1_b, tmpfq2_b, tmpfq2_c):
            try:
                os.unlink(tmpfq)
            except OSError:
                pass
Example #4
0
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    ''' Take reads through Trimmomatic, Prinseq, and subsampling.
        This should probably move over to read_utils or taxon_filter.

        inBam:   input BAM; converted to a pair of fastq files, and also the
                 source of the header that is re-applied to outBam.
        clipDb:  handed unchanged to taxon_filter.trimmomatic.
        outBam:  output BAM holding the subsampled reads.
        n_reads: value given to subsampler.py through its -n option.

        Each intermediate file is deleted as soon as the following step has
        consumed it. Whatever is still on disk when the function exits is
        removed as well, including when a step raises.
    '''
    leftovers = []  # temp files created so far and not yet deleted

    def make_tmp(suffixes):
        # Create one temp file name per suffix and register it for cleanup.
        paths = []
        for suffix in suffixes:
            path = util.file.mkstempfname(suffix)
            leftovers.append(path)
            paths.append(path)
        return paths

    def drop(paths):
        # Delete files that are no longer needed. Strict on purpose: a
        # failing unlink on the normal path is reported to the caller.
        for path in paths:
            os.unlink(path)
            leftovers.remove(path)

    try:
        # BAM -> fastq
        infq = make_tmp(['.in.1.fastq', '.in.2.fastq'])
        tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])

        # Trimmomatic
        trimfq = make_tmp(['.trim.1.fastq', '.trim.2.fastq'])
        taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb)
        drop(infq)

        # Prinseq
        rmdupfq = make_tmp(['.rmdup.1.fastq', '.rmdup.2.fastq'])
        read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0])
        read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1])
        drop(trimfq)

        # Purge unmated
        purgefq = make_tmp(['.fix.1.fastq', '.fix.2.fastq'])
        read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
        drop(rmdupfq)

        # Log count (a fastq record is four lines)
        with open(purgefq[0], 'rt') as inf:
            n = sum(1 for _ in inf) // 4
        log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n)

        # Subsample
        subsampfq = make_tmp(['.subsamp.1.fastq', '.subsamp.2.fastq'])
        cmd = [os.path.join(util.file.get_scripts_path(), 'subsampler.py'),
            '-n', str(n_reads),
            '-mode', 'p',
            '-in', purgefq[0], purgefq[1],
            '-out', subsampfq[0], subsampfq[1],
            ]
        subprocess.check_call(cmd)
        drop(purgefq)

        # Fastq -> BAM
        # Note: this destroys RG IDs! We should instead run the BAM->fastq step
        # in a way that breaks out the read groups and perform the above steps
        # in a way that preserves the RG IDs.
        tmp_bam, tmp_header = make_tmp(['.subsamp.bam', '.header.sam'])
        tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
        if n == 0:
            # FastqToSam cannot deal with empty input
            # but Picard SamFormatConverter can deal with empty files
            opts = ['INPUT='+tmp_header, 'OUTPUT='+outBam, 'VERBOSITY=ERROR']
            tools.picard.PicardTools().execute('SamFormatConverter', opts, JVMmemory='50m')
        else:
            tools.picard.FastqToSamTool().execute(
                subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
            tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
        drop([tmp_bam, tmp_header] + subsampfq)
    finally:
        # Best-effort sweep of anything a failed step left behind; a cleanup
        # problem here must not mask the exception that got us here.
        for path in leftovers:
            try:
                os.unlink(path)
            except OSError:
                pass
Example #5
0
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    ''' Take reads through Trimmomatic, Prinseq, and subsampling.
        This should probably move over to read_utils or taxon_filter.

        inBam:   input BAM; converted to a pair of fastq files, and also the
                 source of the header that is re-applied to outBam.
        clipDb:  handed unchanged to taxon_filter.trimmomatic.
        outBam:  output BAM holding the subsampled reads.
        n_reads: value given to subsampler.py through its -n option.

        Each intermediate file is deleted as soon as the following step has
        consumed it. Whatever is still on disk when the function exits is
        removed as well, including when a step raises.
    '''
    leftovers = []  # temp files created so far and not yet deleted

    def make_tmp(suffixes):
        # Create one temp file name per suffix and register it for cleanup.
        paths = []
        for suffix in suffixes:
            path = util.file.mkstempfname(suffix)
            leftovers.append(path)
            paths.append(path)
        return paths

    def drop(paths):
        # Delete files that are no longer needed. Strict on purpose: a
        # failing unlink on the normal path is reported to the caller.
        for path in paths:
            os.unlink(path)
            leftovers.remove(path)

    try:
        # BAM -> fastq
        infq = make_tmp(['.in.1.fastq', '.in.2.fastq'])
        tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])

        # Trimmomatic
        trimfq = make_tmp(['.trim.1.fastq', '.trim.2.fastq'])
        taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1],
                                 clipDb)
        drop(infq)

        # Prinseq
        rmdupfq = make_tmp(['.rmdup.1.fastq', '.rmdup.2.fastq'])
        read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0])
        read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1])
        drop(trimfq)

        # Purge unmated
        purgefq = make_tmp(['.fix.1.fastq', '.fix.2.fastq'])
        read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0],
                                 purgefq[1])
        drop(rmdupfq)

        # Log count (a fastq record is four lines)
        with open(purgefq[0], 'rt') as inf:
            n = sum(1 for _ in inf) // 4
        log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n)

        # Subsample
        subsampfq = make_tmp(['.subsamp.1.fastq', '.subsamp.2.fastq'])
        cmd = [
            os.path.join(util.file.get_scripts_path(), 'subsampler.py'),
            '-n',
            str(n_reads),
            '-mode',
            'p',
            '-in',
            purgefq[0],
            purgefq[1],
            '-out',
            subsampfq[0],
            subsampfq[1],
        ]
        subprocess.check_call(cmd)
        drop(purgefq)

        # Fastq -> BAM
        # Note: this destroys RG IDs! We should instead run the BAM->fastq step
        # in a way that breaks out the read groups and perform the above steps
        # in a way that preserves the RG IDs.
        tmp_bam, tmp_header = make_tmp(['.subsamp.bam', '.header.sam'])
        tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
        if n == 0:
            # FastqToSam cannot deal with empty input
            # but Picard SamFormatConverter can deal with empty files
            opts = ['INPUT=' + tmp_header, 'OUTPUT=' + outBam,
                    'VERBOSITY=ERROR']
            tools.picard.PicardTools().execute('SamFormatConverter',
                                               opts,
                                               JVMmemory='50m')
        else:
            tools.picard.FastqToSamTool().execute(subsampfq[0], subsampfq[1],
                                                  'Dummy', tmp_bam)
            tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header,
                                                   outBam)
        drop([tmp_bam, tmp_header] + subsampfq)
    finally:
        # Best-effort sweep of anything a failed step left behind; a cleanup
        # problem here must not mask the exception that got us here.
        for path in leftovers:
            try:
                os.unlink(path)
            except OSError:
                pass