Ejemplo n.º 1
0
def grep_count(file_path, to_match, additional_flags=None, fixed_mode=True, starts_with=False):
    '''
        Count the lines of a file that match a pattern, using grep for speed.

        file_path:        path of the file to search. A missing, non-regular
                          or empty file yields 0 without invoking grep.
        to_match:         the string (or, when fixed_mode is False or
                          starts_with is True, the grep regular expression)
                          to look for.
        additional_flags: optional list of extra grep flags, inserted right
                          after '-c'.
        fixed_mode:       if True (and starts_with is False), pass '-F' so
                          to_match is treated as a literal string.
        starts_with:      if True, anchor the pattern with '^' so only lines
                          beginning with to_match are counted. to_match is
                          then interpreted as a regular expression and
                          fixed_mode is ignored.

        Returns the count printed by 'grep -c' as an int (grep counts
        matching lines, not individual occurrences).
    '''
    # Local import: only needed to recognise grep's "no match" exit status.
    import subprocess

    if not os.path.isfile(file_path) or os.path.getsize(file_path) == 0:
        return 0

    env = os.environ.copy()
    env['LC_ALL'] = 'C' #use C locale rather than UTF8 for faster grep

    # '-c' returns the match count
    cmd = ["grep", "-c"]
    if additional_flags:
        cmd.extend(additional_flags)

    # fixed mode cannot be used with starts_with, since it does not match regular expressions
    # only add the fixed_mode flag if we're not using starts_with
    if starts_with:
        pattern = "^" + to_match
    else:
        if fixed_mode:
            cmd.append("-F")
        pattern = to_match

    # Pass the pattern via '-e' so that a pattern beginning with '-' is not
    # mistaken for a command-line option.
    cmd.extend(["-e", pattern, file_path])

    try:
        number_of_seqs = util.misc.run_and_print(cmd, silent=False, check=True, env=env)
    except subprocess.CalledProcessError as err:
        # grep exits with status 1 when no line matched, which is a valid
        # count of zero rather than a failure; status 2 and above are real
        # errors. NOTE(review): assumes run_and_print(check=True) raises
        # CalledProcessError like subprocess.run does -- confirm.
        if err.returncode == 1:
            return 0
        raise
    return int(number_of_seqs.stdout.decode("utf-8").rstrip(os.linesep))
Ejemplo n.º 2
0
def grep_count(file_path, to_match, additional_flags=None, fixed_mode=True, starts_with=False):
    '''
        Use grep to quickly count the lines in a file that match a pattern.

        file_path:        file to search; if it is not a regular file or is
                          empty, 0 is returned and grep is not run.
        to_match:         text to search for. It is a literal string only
                          when fixed_mode is True and starts_with is False;
                          otherwise grep interprets it as a regular expression.
        additional_flags: optional list of extra flags placed after '-c'.
        fixed_mode:       add '-F' (literal matching) unless starts_with is set.
        starts_with:      prefix the pattern with '^' so only lines that start
                          with to_match are counted (disables fixed_mode).

        Returns the integer printed by 'grep -c', i.e. the number of
        matching lines.
    '''
    # Local import: only needed to recognise grep's "no match" exit status.
    import subprocess

    if not os.path.isfile(file_path) or os.path.getsize(file_path) == 0:
        return 0

    env = os.environ.copy()
    env['LC_ALL'] = 'C' #use C locale rather than UTF8 for faster grep

    # '-c' returns the match count
    cmd = ["grep", "-c"]
    if additional_flags:
        cmd.extend(additional_flags)

    # fixed mode cannot be used with starts_with, since it does not match regular expressions
    # only add the fixed_mode flag if we're not using starts_with
    if starts_with:
        pattern = "^" + to_match
    else:
        if fixed_mode:
            cmd.append("-F")
        pattern = to_match

    # '-e' marks the next argument as the pattern, so a pattern that starts
    # with '-' cannot be parsed as an option.
    cmd.extend(["-e", pattern, file_path])

    try:
        number_of_seqs = util.misc.run_and_print(cmd, silent=False, check=True, env=env)
    except subprocess.CalledProcessError as err:
        # Exit status 1 from grep means "no lines selected": that is a count
        # of zero, not an error. Anything else is re-raised unchanged.
        # NOTE(review): assumes run_and_print(check=True) raises
        # CalledProcessError like subprocess.run does -- confirm.
        if err.returncode == 1:
            return 0
        raise
    return int(number_of_seqs.stdout.decode("utf-8").rstrip(os.linesep))
Ejemplo n.º 3
0
def lastal_chunked_fastq(
    inFastq,
    db,
    outFastq,
    max_gapless_alignments_per_position=1,
    min_length_for_initial_matches=5,
    max_length_for_initial_matches=50,
    max_initial_matches_per_position=100,
    chunk_size=100000
):
    '''
        Run lastal on a FASTQ file in chunks and write the filtered reads
        to outFastq.

        The input is split into batches of chunk_size records. Each batch is
        written to a temp FASTQ, aligned with lastal against db, piped
        through maf-sort and maf-convert ('tab' format), and finally passed
        with the chunk FASTQ to the noBlastLikeHits.py script in '-m hit'
        mode (presumably keeping only reads that have a hit -- confirm
        against that script). The per-chunk outputs are concatenated into
        outFastq.

        inFastq:  path of the input FASTQ file.
        db:       lastal database argument, passed to lastal verbatim.
        outFastq: path of the concatenated output FASTQ.
        max_gapless_alignments_per_position: passed to lastal as '-n'.
        min_length_for_initial_matches:      passed to lastal as '-l'.
        max_length_for_initial_matches:      passed to lastal as '-L'.
        max_initial_matches_per_position:    passed to lastal as '-m'.
        chunk_size: number of FASTQ records per batch.

        All temp files created here (including the per-chunk input FASTQ)
        are removed before returning, whether or not a step fails.
    '''

    def remove_if_present(path):
        # Intermediates are unlinked eagerly on the success path, so the
        # safety-net cleanup must tolerate files that are already gone.
        if os.path.exists(path):
            os.unlink(path)

    lastal_path = tools.last.Lastal().install_and_get_path()
    mafsort_path = tools.last.MafSort().install_and_get_path()
    mafconvert_path = tools.last.MafConvert().install_and_get_path()
    no_blast_like_hits_path = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    # The lastal options do not depend on the chunk, so build them once.
    lastal_cmd_prefix = [
        str(x) for x in [
            lastal_path, '-Q1', '-P0',
            '-n', max_gapless_alignments_per_position,
            '-l', min_length_for_initial_matches,
            '-L', max_length_for_initial_matches,
            '-m', max_initial_matches_per_position,
        ]
    ]

    filtered_fastq_files = []
    try:
        with open(inFastq, "rt") as fastqFile:
            record_iter = SeqIO.parse(fastqFile, "fastq")
            for batch in util.misc.batch_iterator(record_iter, chunk_size):
                # Intermediate files of this chunk; none outlive the iteration.
                chunk_temps = []
                try:
                    chunk_fastq = mkstempfname('.fastq')
                    chunk_temps.append(chunk_fastq)
                    with open(chunk_fastq, "wt") as handle:
                        SeqIO.write(batch, handle, "fastq")
                    # drop the reference so the batch can be garbage-collected
                    batch = None

                    lastal_out = mkstempfname('.lastal')
                    chunk_temps.append(lastal_out)
                    with open(lastal_out, 'wt') as outf:
                        cmd = lastal_cmd_prefix + [db, chunk_fastq]
                        log.debug(' '.join(cmd) + ' > ' + lastal_out)
                        util.misc.run_and_save(cmd, outf=outf)
                    # everything below this point in this method should be replaced with
                    # our own code that just reads lastal output and makes a list of read names

                    mafsort_out = mkstempfname('.mafsort')
                    chunk_temps.append(mafsort_out)
                    with open(mafsort_out, 'wt') as outf, open(lastal_out, 'rt') as inf:
                        cmd = [mafsort_path, '-n2']
                        log.debug('cat ' + lastal_out + ' | ' + ' '.join(cmd) + ' > ' + mafsort_out)
                        subprocess.check_call(cmd, stdin=inf, stdout=outf)
                    os.unlink(lastal_out)

                    mafconvert_out = mkstempfname('.mafconvert')
                    chunk_temps.append(mafconvert_out)
                    with open(mafconvert_out, 'wt') as outf:
                        cmd = ["python", mafconvert_path, 'tab', mafsort_out]
                        log.debug(' '.join(cmd) + ' > ' + mafconvert_out)
                        subprocess.check_call(cmd, stdout=outf)
                    os.unlink(mafsort_out)

                    filtered_fastq_chunk = mkstempfname('.filtered.fastq')
                    # Register before running so a failure still cleans it up.
                    filtered_fastq_files.append(filtered_fastq_chunk)
                    with open(filtered_fastq_chunk, 'wt') as outf:
                        cmd = [no_blast_like_hits_path, '-b', mafconvert_out, '-r', chunk_fastq, '-m', 'hit']
                        log.debug(' '.join(cmd) + ' > ' + filtered_fastq_chunk)
                        subprocess.check_call(cmd, stdout=outf)
                    os.unlink(mafconvert_out)
                finally:
                    # Removes the chunk FASTQ (previously leaked) and, after a
                    # failure, whatever intermediates are still on disk.
                    for path in chunk_temps:
                        remove_if_present(path)

        # concatenate filtered fastq files to outFastq
        util.file.concat(filtered_fastq_files, outFastq)
    finally:
        # remove temp fastq files
        for tempfastq in filtered_fastq_files:
            remove_if_present(tempfastq)
Ejemplo n.º 4
0
def lastal_chunked_fastq(inFastq,
                         db,
                         outFastq,
                         max_gapless_alignments_per_position=1,
                         min_length_for_initial_matches=5,
                         max_length_for_initial_matches=50,
                         max_initial_matches_per_position=100,
                         chunk_size=100000):
    '''
        Filter a FASTQ file with lastal, processing it chunk by chunk.

        Reads inFastq in batches of chunk_size records. For every batch the
        pipeline is: write the batch to a temp FASTQ; run lastal against db;
        sort the result with maf-sort ('-n2'); convert it with maf-convert
        to 'tab' format; run noBlastLikeHits.py with '-m hit' on the
        converted hits and the chunk FASTQ (presumably selecting the reads
        that hit the database -- confirm against that script). The outputs
        of all chunks are then concatenated into outFastq.

        inFastq:  path of the input FASTQ file.
        db:       lastal database argument, handed to lastal unchanged.
        outFastq: path the concatenated result is written to.
        max_gapless_alignments_per_position: value for lastal's '-n'.
        min_length_for_initial_matches:      value for lastal's '-l'.
        max_length_for_initial_matches:      value for lastal's '-L'.
        max_initial_matches_per_position:    value for lastal's '-m'.
        chunk_size: number of FASTQ records per batch.

        Every temp file created by this function, including each chunk's
        input FASTQ, is deleted before it returns or raises.
    '''

    def remove_if_present(path):
        # Files unlinked eagerly on the success path are already gone by the
        # time the cleanup handlers run, so a missing file is not an error.
        if os.path.exists(path):
            os.unlink(path)

    lastal_path = tools.last.Lastal().install_and_get_path()
    mafsort_path = tools.last.MafSort().install_and_get_path()
    mafconvert_path = tools.last.MafConvert().install_and_get_path()
    no_blast_like_hits_path = os.path.join(util.file.get_scripts_path(),
                                           'noBlastLikeHits.py')

    # Loop-invariant part of the lastal command line, stringified once.
    lastal_cmd_prefix = [
        str(x) for x in [
            lastal_path, '-Q1', '-P0',
            '-n', max_gapless_alignments_per_position,
            '-l', min_length_for_initial_matches,
            '-L', max_length_for_initial_matches,
            '-m', max_initial_matches_per_position,
        ]
    ]

    filtered_fastq_files = []
    try:
        with open(inFastq, "rt") as fastqFile:
            record_iter = SeqIO.parse(fastqFile, "fastq")
            for batch in util.misc.batch_iterator(record_iter, chunk_size):
                # Temp files that belong to this chunk only.
                chunk_temps = []
                try:
                    chunk_fastq = mkstempfname('.fastq')
                    chunk_temps.append(chunk_fastq)
                    with open(chunk_fastq, "wt") as handle:
                        SeqIO.write(batch, handle, "fastq")
                    # drop the reference so the batch can be garbage-collected
                    batch = None

                    lastal_out = mkstempfname('.lastal')
                    chunk_temps.append(lastal_out)
                    with open(lastal_out, 'wt') as outf:
                        cmd = lastal_cmd_prefix + [db, chunk_fastq]
                        log.debug(' '.join(cmd) + ' > ' + lastal_out)
                        util.misc.run_and_save(cmd, outf=outf)
                    # everything below this point in this method should be replaced with
                    # our own code that just reads lastal output and makes a list of read names

                    mafsort_out = mkstempfname('.mafsort')
                    chunk_temps.append(mafsort_out)
                    with open(mafsort_out, 'wt') as outf, \
                            open(lastal_out, 'rt') as inf:
                        cmd = [mafsort_path, '-n2']
                        log.debug('cat ' + lastal_out + ' | ' + ' '.join(cmd) +
                                  ' > ' + mafsort_out)
                        subprocess.check_call(cmd, stdin=inf, stdout=outf)
                    os.unlink(lastal_out)

                    mafconvert_out = mkstempfname('.mafconvert')
                    chunk_temps.append(mafconvert_out)
                    with open(mafconvert_out, 'wt') as outf:
                        cmd = ["python", mafconvert_path, 'tab', mafsort_out]
                        log.debug(' '.join(cmd) + ' > ' + mafconvert_out)
                        subprocess.check_call(cmd, stdout=outf)
                    os.unlink(mafsort_out)

                    filtered_fastq_chunk = mkstempfname('.filtered.fastq')
                    # Track it immediately so it is removed even on failure.
                    filtered_fastq_files.append(filtered_fastq_chunk)
                    with open(filtered_fastq_chunk, 'wt') as outf:
                        cmd = [
                            no_blast_like_hits_path, '-b', mafconvert_out, '-r',
                            chunk_fastq, '-m', 'hit'
                        ]
                        log.debug(' '.join(cmd) + ' > ' + filtered_fastq_chunk)
                        subprocess.check_call(cmd, stdout=outf)
                    os.unlink(mafconvert_out)
                finally:
                    # Deletes the chunk FASTQ (it used to be left behind) and
                    # any intermediate that survived an aborted step.
                    for path in chunk_temps:
                        remove_if_present(path)

        # concatenate filtered fastq files to outFastq
        util.file.concat(filtered_fastq_files, outFastq)
    finally:
        # remove temp fastq files
        for tempfastq in filtered_fastq_files:
            remove_if_present(tempfastq)