Example #1
def dep_check_blast(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                    force):
    if os_id == 'mac':
        url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.10.1/'
               'ncbi-blast-2.10.1+-x64-macosx.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')

    dnld_path = opj(dir_dep, 'ncbi-blast.tar.gz')

    makeblastdb = None
    blastn = None
    tblastn = None

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        makeblastdb = which('makeblastdb')
        blastn = which('blastn')
        tblastn = which('tblastn')
        run([makeblastdb, '-help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')
            run([makeblastdb, '-help'])
        except Exception:
            Log.wrn('BLAST+ was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')

            if not ope(makeblastdb) or \
                    not ope(blastn) or \
                    not ope(tblastn):
                Log.err('Could not download BLAST+.')
                return None, None, None

    regexp = r'\sblast\s([\d\.]*)'
    v = get_dep_version([makeblastdb, '-version'], regexp)
    Log.msg('makeblastdb is available:', v + ' ' + makeblastdb)
    v = get_dep_version([blastn, '-version'], regexp)
    Log.msg('blastn is available:', v + ' ' + blastn)
    v = get_dep_version([tblastn, '-version'], regexp)
    Log.msg('tblastn is available:', v + ' ' + tblastn)

    return makeblastdb, blastn, tblastn
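
The probe-and-parse pattern above (run the tool with a version flag, scrape the version out with a regex) is reusable on its own. A minimal standalone sketch; probe_version is a hypothetical stand-in for kakapo's get_dep_version, exercised on python3 only because it is likely to be on PATH:

import re
import subprocess


def probe_version(cmd, regexp):
    """Run cmd, scrape a version string; return '?' if nothing matches."""
    p = subprocess.run(cmd, capture_output=True, text=True)
    m = re.search(regexp, p.stdout + p.stderr)
    return m.group(1) if m else '?'


print(probe_version(['python3', '--version'], r'Python\s([\d\.]+)'))
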
Example #2
def dep_check_bowtie2(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-macos-x86_64.zip/download')
    elif os_id == 'linux':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-linux-x86_64.zip/download')

    dnld_path = opj(dir_dep, 'bowtie2.zip')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        bowtie2 = which('bowtie2')
        bowtie2_build = which('bowtie2-build')
        run([bowtie2, '-h'])
        run([bowtie2_build, '-h'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')
            run([bowtie2, '-h'])
            run([bowtie2_build, '-h'])
        except Exception:
            Log.wrn('Bowtie 2 was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')

            bowtie2_execs = ('', '-align-l', '-align-l-debug', '-align-s',
                             '-align-s-debug', '-build', '-build-l',
                             '-build-l-debug', '-build-s', '-build-s-debug',
                             '-inspect', '-inspect-l', '-inspect-l-debug',
                             '-inspect-s', '-inspect-s-debug')

            for bt2exe in bowtie2_execs:
                chmod(
                    bowtie2 + bt2exe, stat.S_IRWXU | stat.S_IRGRP
                    | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

            if not ope(bowtie2):
                Log.err('Could not download Bowtie 2.')
                return None, None

    regexp = r'^.*?version\s([\d\.]*)'
    v = get_dep_version([bowtie2, '--version'], regexp)
    Log.msg('bowtie2 is available:', v + ' ' + bowtie2)
    v = get_dep_version([bowtie2_build, '--version'], regexp)
    Log.msg('bowtie2-build is available:', v + ' ' + bowtie2_build)

    return bowtie2, bowtie2_build
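
The chmod loop above assembles its permission mask from stat flags. That mask (user rwx, group and others rx) is simply 0o755; a self-contained check using a throwaway temp file:

import os
import stat
import tempfile

mode = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
        | stat.S_IROTH | stat.S_IXOTH)
assert mode == 0o755

with tempfile.NamedTemporaryFile(delete=False) as f:
    path = f.name
os.chmod(path, mode)
print(oct(stat.S_IMODE(os.stat(path).st_mode)))  # 0o755
os.remove(path)
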
Example #3
def _use_memory_mapping(db_path):
    db_size = stat(opj(db_path, 'hash.k2d')).st_size / (1024**3)
    mem_max = RAM / 3
    if mem_max < db_size:
        db_name = splitext(basename(db_path))[0]
        Log.wrn('Not enough memory for Kraken2 database {}. '
                'Switching to a slower memory-mapping mode.'.format(db_name))
        return '--memory-mapping'
    else:
        return None
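
The same size-versus-RAM rule, isolated into a hypothetical helper (RAM in the original is a module-level constant in GiB; here it is passed in explicitly):

import os


def needs_memory_mapping(db_file, ram_gib):
    # Fall back to --memory-mapping when the database hash exceeds a
    # third of the available RAM.
    db_gib = os.stat(db_file).st_size / 1024 ** 3
    if ram_gib / 3 < db_gib:
        return '--memory-mapping'
    return None
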
Example #4
def dep_check_sra_toolkit(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                          force):
    if os_id == 'mac':
        url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
               'sratoolkit.2.10.8-mac64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-ubuntu64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-centos_linux64.tar.gz')

    dnld_path = opj(dir_dep, 'sra-toolkit.tar.gz')

    fasterq_dump = None
    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        fasterq_dump = which('fasterq-dump')
        # str.strip('bin') would strip the characters b, i, n, not the
        # 'bin' suffix; take the parent directory instead.
        dir_bin = dirname(dirname(fasterq_dump))
        _ensure_vdb_cfg(dir_bin)
        run(fasterq_dump)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            _ensure_vdb_cfg(dir_bin)
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            run(fasterq_dump)
        except Exception:
            Log.wrn('SRA Toolkit was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')

            _ensure_vdb_cfg(dir_bin)

            if not ope(fasterq_dump):
                Log.err('Could not download SRA Toolkit.')
                return None

    v = get_dep_version([fasterq_dump, '--version'], r':\s([\d\.]*)')
    if v == '?':
        v = get_dep_version([fasterq_dump, '--version'], r'version\s([\d\.]*)')
    Log.msg('fasterq-dump is available:', v + ' ' + fasterq_dump)

    return fasterq_dump
Example #5
def dep_check_kakapolib(force=False, quiet=False):
    kkpl = KAKAPOLIB
    if not ope(kkpl):
        if quiet is False:
            Log.wrn('Compiling kakapolib.')
        run(['make', 'install'], cwd=DIR_C_SRC)
    if ope(kkpl):
        if quiet is False:
            Log.msg('kakapolib is available:', kkpl)
    else:
        Log.err('Compilation of kakapolib failed.')
        return None
    return ctypes.CDLL(kkpl)
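
ctypes.CDLL at the end is the standard way to load a compiled shared library. A minimal sketch of the same call that works on most Unix-like systems, with libc as a stand-in for kakapolib:

import ctypes
import ctypes.util

libc = ctypes.CDLL(ctypes.util.find_library('c'))
print(libc.abs(-42))  # calls the C library's abs(): 42
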
Example #6
def dep_check_seqtk(dir_dep, force):
    url = 'https://github.com/lh3/seqtk/archive/master.zip'
    dnld_path = opj(dir_dep, 'seqtk.zip')
    dir_bin = opj(dir_dep, 'seqtk-master')

    fp = NamedTemporaryFile()
    fp.write(str.encode('>seq' + lns + 'ATGC'))
    fp.seek(0)
    cmd = ['', 'seq', '-r', fp.name]

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        seqtk = which('seqtk')
        cmd[0] = seqtk
        run(cmd, do_not_raise=True)
    except Exception:
        try:
            seqtk = opj(dir_bin, 'seqtk')
            cmd[0] = seqtk
            run(cmd, do_not_raise=True)
        except Exception:
            Log.wrn('Seqtk was not found on this system, trying to download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()
            try:
                Log.wrn('Compiling Seqtk.')
                run('make', cwd=dir_bin)
                run(cmd, do_not_raise=True)
            except Exception:
                replace_line_in_file(opj(dir_bin, 'Makefile'), 'CC=gcc',
                                     'CC=cc')
                try:
                    run('make', cwd=dir_bin)
                    run(cmd, do_not_raise=True)
                except Exception:
                    Log.err(
                        'Something went wrong while trying to compile Seqtk.')
                    Log.msg('Try downloading and installing it manually from: '
                            'https://github.com/lh3/seqtk')
                    fp.close()
                    return None

    fp.close()

    v = get_dep_version([seqtk], r'Version\:\s([\d\w\.\-]*)')
    Log.msg('Seqtk is available:', v + ' ' + seqtk)

    return seqtk
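
The self-test above feeds Seqtk a one-record FASTA written to a temporary file. The same write/seek pattern, standalone (lns in the original is kakapo's line-separator constant; '\n' is assumed here):

from tempfile import NamedTemporaryFile

with NamedTemporaryFile(mode='w+', suffix='.fasta') as fp:
    fp.write('>seq\nATGC')
    fp.seek(0)        # rewind so readers see the content from the start
    print(fp.read())  # a subprocess would be given fp.name instead
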
Example #7
def dep_check_vsearch(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                      force):
    if os_id == 'mac':
        url = ('https://github.com/torognes/vsearch/releases/download/v2.15.0/'
               'vsearch-2.15.0-macos-x86_64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')

    dnld_path = opj(dir_dep, 'vsearch.tar.gz')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        vsearch = which('vsearch')
        run(vsearch)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
            vsearch = opj(dir_bin, 'bin', 'vsearch')
            run(vsearch)
        except Exception:
            Log.wrn(
                'Vsearch was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
                vsearch = opj(dir_bin, 'bin', 'vsearch')
                if not ope(vsearch):
                    Log.err('Could not download Vsearch.')
                    return None
                else:
                    run(vsearch)
            except Exception:
                Log.err('Vsearch was downloaded, but does not execute.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/torognes/vsearch')
                return None

    v = get_dep_version([vsearch, '-version'], r'vsearch\sv([\d\.]*)')
    Log.msg('Vsearch is available:', v + ' ' + vsearch)

    return vsearch
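
The download/extract/verify fallback above can be exercised offline by building the archive locally. A sketch under that assumption; the payload and archive layout are made up:

import os
import tarfile
import tempfile

with tempfile.TemporaryDirectory() as d:
    payload = os.path.join(d, 'vsearch')
    with open(payload, 'w') as f:
        f.write('#!/bin/sh\necho vsearch\n')
    tgz = os.path.join(d, 'vsearch.tar.gz')
    with tarfile.open(tgz, 'w:gz') as tar:
        tar.add(payload, arcname='vsearch-x/bin/vsearch')
    with tarfile.open(tgz, 'r:gz') as tar:  # 'with' closes on error, too
        tar.extractall(d)
    binary = os.path.join(d, 'vsearch-x', 'bin', 'vsearch')
    print(os.path.exists(binary))  # the ope() existence check above
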
Example #8
def makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Building BLAST databases for reads.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, se)
        fa_path = se_fastq_files[se]['filter_path_fa']
        out_f = opj(dir_blast_fa_trim_sample, se)
        se_fastq_files[se]['blast_db_path'] = out_f

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', se)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            Log.msg(basename(fa_path))
            make_blast_db(exec_file=makeblastdb,
                          in_file=fa_path,
                          out_file=out_f,
                          title=se,
                          dbtype='nucl')

    for pe in pe_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, pe)
        fa_paths = pe_fastq_files[pe]['filter_path_fa']
        out_fs = [x.replace('@D@', dir_blast_fa_trim_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['blast_db_path'] = out_fs

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', pe)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            pe_trim_files = zip(fa_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                make_blast_db(exec_file=makeblastdb,
                              in_file=x[0],
                              out_file=x[1],
                              title=basename(x[1]),
                              dbtype='nucl')
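
The @D@ and @N@ tokens in fpatt are directory and sample-name placeholders filled in per sample. The substitution in isolation, with hypothetical pattern values:

fpatt = ['@D@/@N@_paired_1', '@D@/@N@_paired_2',
         '@D@/@N@_unpaired_1', '@D@/@N@_unpaired_2']


def render(patterns, directory, name):
    return [p.replace('@D@', directory).replace('@N@', name)
            for p in patterns]


print(render(fpatt, '/tmp/blast', 'SRR123'))
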
Example #9
def dep_check_trimmomatic(dir_dep):
    url = ('http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/'
           'Trimmomatic-0.39.zip')
    dnld_path = opj(dir_dep, 'Trimmomatic-0.39.zip')
    dir_bin = opj(dir_dep, 'Trimmomatic-0.39')
    trimmomatic = opj(dir_bin, 'trimmomatic-0.39.jar')

    if not ope(trimmomatic):
        download_file(url, dnld_path)
        zip_ref = zipfile.ZipFile(dnld_path, 'r')
        zip_ref.extractall(dir_dep)
        zip_ref.close()

    if not ope(trimmomatic):
        Log.err('Could not download Trimmomatic.')
        return None, None

    v = get_dep_version(['java', '-jar', trimmomatic, '-version'], r'\d+\.\d+')
    Log.msg('Trimmomatic is available:', v + ' ' + trimmomatic)

    path_adapters = _write_trimmomatic_adapters_file(dir_dep)

    return trimmomatic, path_adapters
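
ZipFile is opened and closed manually above; it also supports the context-manager form, which guarantees the handle is closed even if extraction fails. A minimal equivalent sketch:

import zipfile


def extract_zip(archive, dest):
    with zipfile.ZipFile(archive, 'r') as zf:
        zf.extractall(dest)
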
Example #10
def makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb):
    if len(assemblies) > 0:
        print()
        Log.inf('Building BLAST databases for assemblies.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for a in assemblies:
        assmbl_name = a['name']

        assmbl_blast_db_dir = opj(dir_prj_blast_assmbl, assmbl_name)
        assmbl_blast_db_file = opj(assmbl_blast_db_dir, assmbl_name)

        a['blast_db_path'] = assmbl_blast_db_file

        if ope(assmbl_blast_db_dir):
            Log.msg('BLAST database already exists:', assmbl_name)
        else:
            Log.msg(assmbl_name)
            make_dirs(assmbl_blast_db_dir)
            make_blast_db(exec_file=makeblastdb,
                          in_file=a['path'],
                          out_file=assmbl_blast_db_file,
                          title=assmbl_name)
Example #11
def filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Converting FASTQ to FASTA using Seqtk.')
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, se)
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_fa_trim_data_sample, se + '.fasta')
        se_fastq_files[se]['filter_path_fa'] = out_f

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', se)
        else:
            make_dirs(dir_fa_trim_data_sample)
            Log.msg(basename(fq_path))
            seqtk_fq_to_fa(seqtk, fq_path, out_f)

    for pe in pe_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, pe)
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_fa_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['filter_path_fa'] = out_fs

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', pe)
        else:
            make_dirs(dir_fa_trim_data_sample)
            pe_trim_files = zip(fq_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                seqtk_fq_to_fa(seqtk, x[0], x[1])
Example #12
def dep_check_spades(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Darwin.tar.gz')
    elif os_id == 'linux':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Linux.tar.gz')

    dnld_path = opj(dir_dep, 'SPAdes.tar.gz')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        spades = which('spades.py')
        run([PY3, spades])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
            spades = opj(dir_bin, 'bin', 'spades.py')
            run([PY3, spades])
        except Exception:
            Log.wrn('SPAdes was not found on this system, trying to download.')
            try:
                download_file(url, dnld_path)
                tar_ref = tarfile.open(dnld_path, 'r:gz')
                tar_ref.extractall(dir_dep)
                tar_ref.close()
            except Exception:
                Log.err('Could not download SPAdes.')
                return None
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
                spades = opj(dir_bin, 'bin', 'spades.py')
                # replace_line_in_file(spades,
                #                      '#!/usr/bin/env python',
                #                      '#!/usr/bin/env python3')
                if ope(spades):
                    run([PY3, spades])
                else:
                    Log.err('Could not download SPAdes.')
                    return None
            except Exception:
                Log.err('SPAdes was downloaded, but does not execute.')
                return None

    v = get_dep_version([PY3, spades, '--version'], r'^.*SPAdes.*v([\d\.]*)')
    Log.msg('SPAdes is available:', v + ' ' + spades)

    return spades
Example #13
def _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2, bowtie2_build):

    dbs = OrderedDict()

    for x in bt2_order:
        db_path_ok = False

        if x == MT:
            if taxonomy.is_eukaryote(taxid) is True:
                if bt2_order[MT] == '':
                    dbs[MT] = MT
                    db_path_ok = True

        elif x == PT:
            if taxonomy.is_eukaryote(taxid) is True:
                if taxonomy.contains_plastid(taxid) is True:
                    if bt2_order[PT] == '':
                        dbs[PT] = PT
                        db_path_ok = True

        if db_path_ok is False:
            db_path = bt2_order[x]
            if ope(db_path) and isfile(db_path):
                dbs[x] = db_path
            else:
                Log.err('File not found:', db_path)
                exit(1)

    if len(dbs) > 0:

        if bowtie2 is None:
            Log.err('bowtie2 is not available. Cannot continue. Exiting.')
            exit(0)

        if bowtie2_build is None:
            Log.err('bowtie2-build is not available. '
                    'Cannot continue. Exiting.')
            exit(0)

    return dbs
Example #14
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:

                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                temp_fq = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                f_temp = fqopen(temp_fq, w_mode)
                with fileinput.FileInput(
                        files=[out_fs[2], out_f],
                        openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)
                f_temp.close()

                remove(out_fs[2])
                remove(out_f)
                copyfile(temp_fq, out_fs[2])
                remove(temp_fq)
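
The fileinput block above concatenates the two unpaired-read outputs into one file, with hook_compressed transparently handling gzipped inputs. A standalone sketch of that concatenation; note that hook_compressed yields bytes for .gz inputs and str for plain ones, so the sink below normalizes to bytes:

import fileinput


def concat_fastq(parts, out_path):
    with open(out_path, 'wb') as out:
        with fileinput.FileInput(files=parts,
                                 openhook=fileinput.hook_compressed) as f:
            for line in f:
                out.write(line if isinstance(line, bytes)
                          else line.encode())
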
Example #15
def dnld_sra_info(sras, dir_cache_prj):

    sra_runs_info = {}
    sras_acceptable = []

    if len(sras) > 0:
        print()
        Log.inf('Downloading SRA run information.')
    else:
        return sra_runs_info, sras_acceptable

    cache_path = opj(dir_cache_prj, 'sra_runs_info_cache')

    if ope(cache_path):
        with open(cache_path, 'rb') as f:
            sra_runs_info = pickle.load(f)

    sras_local = list(sra_runs_info.keys())
    sras_to_dnld = set(sras).difference(set(sras_local))
    if len(sras_to_dnld) > 0:
        temp = sra_run_info(list(sras_to_dnld))
        new_sra_runs_info = {i['Run']: i for i in temp}
        sra_runs_info.update(new_sra_runs_info)

    for sra in sras:

        if sra in sra_runs_info:

            info = sra_runs_info[sra]

            sra_lib_layout = info['LibraryLayout'].lower()
            sra_lib_source = info['LibrarySource'].lower()
            sra_lib_strategy = info['LibraryStrategy']
            sra_seq_platform = info['Platform'].lower().capitalize()
            sra_seq_platform_model = info['Model']
            sra_species = info['ScientificName']
            sra_taxid = info['TaxID']
            sra_spots = int(info['spots'])
            sra_spots_with_mates = int(info['spots_with_mates'])

            sample_base_name = (sra_species.replace(' ', '_') + '_' +
                                sra_taxid + '_' + sra)

            sra_runs_info[sra]['KakapoSampleBaseName'] = sample_base_name

            src_check = sra_lib_source.lower()
            strategy_check = sra_lib_strategy.lower()

            if not ('transcript' in src_check or 'rna' in src_check
                    or 'rna' in strategy_check):

                sra_info_str = ('{sra}: the SRA library source type "{ltype}" '
                                'or library strategy "{strategy}" '
                                'is not supported.').format(
                                    sra=sra,
                                    ltype=sra_lib_source,
                                    strategy=sra_lib_strategy)

                Log.err(sra_info_str, 'Skipping.')

            elif sra_seq_platform != 'Illumina':
                sra_info_str = ('{sra}: the SRA library sequencing platform '
                                '"{plat}" is not supported.').format(
                                    sra=sra, plat=sra_seq_platform)

                Log.err(sra_info_str, 'Skipping.')

            else:
                # sra_info_str = ('SRA run {sra} {strategy} ({source}) '
                #                 '{layout}-end library.\n'
                #                 'Sourced from {species} '
                #                 '(TaxID: {txid}).\n'
                #                 'Sequenced using {platform} platform on '
                #                 '{model}.').format(
                #                     sra=sra,
                #                     source=sra_lib_source.title(),
                #                     strategy=sra_lib_strategy,
                #                     layout=sra_lib_layout,
                #                     platform=sra_seq_platform,
                #                     model=sra_seq_platform_model,
                #                     species=sra_species,
                #                     txid=sra_taxid)

                Log.msg(
                    '{sra}:'.format(sra=sra),
                    '{strategy} {layout}-end library ({source}).'.format(
                        strategy=sra_lib_strategy,
                        layout=sra_lib_layout,
                        source=sra_lib_source.title()))
                Log.msg(
                    '    Source:',
                    '{species} (TaxID: {txid}).'.format(species=sra_species,
                                                        txid=sra_taxid), False)
                Log.msg(
                    'Technology:', '{platform} platform on {model}.'.format(
                        platform=sra_seq_platform,
                        model=sra_seq_platform_model), False)

                sra_runs_info[sra]['KakapoLibraryLayout'] = \
                    sra_runs_info[sra]['LibraryLayout']

                if sra_lib_layout == 'paired' and sra_spots_with_mates == 0:
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'SINGLE'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but only a single set of reads '
                    #     'is available. Treating as single-ended.')

                elif (sra_lib_layout == 'paired'
                      and sra_spots != sra_spots_with_mates):
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'PAIRED_UNP'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but not all reads are paired.')

                sras_acceptable.append(sra)

                # Log.msg(sra_info_str)

    with open(cache_path, 'wb') as f:
        pickle.dump(sra_runs_info, f, protocol=PICKLE_PROTOCOL)

    return sra_runs_info, sras_acceptable
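
The function wraps its SRA queries in a load-if-present / update / dump-back pickle cache. The same pattern reduced to two hypothetical helpers (kakapo pins PICKLE_PROTOCOL; HIGHEST_PROTOCOL is assumed here):

import os
import pickle


def load_cache(path):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return {}


def save_cache(path, data):
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
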
Example #16
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')

        if rcorrector is None:
            Log.err('Rcorrector is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['path'][0]
        fq_path_2 = pe_fastq_files[pe]['path'][1]
        fq_path_3 = None
        out_f_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)

        pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2]

        if len(pe_fastq_files[pe]['path']) == 3:
            fq_path_3 = pe_fastq_files[pe]['path'][2]
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)
            pe_fastq_files[pe]['cor_path_fq'].append(out_f_3)

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3)
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_f_1,
                          out_file_2=out_f_2,
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            if fq_path_3 is not None:

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq'
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')

                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_f_3,
                              log_file=log_f_3)

                remove(fq_cor_path_3)
Example #17
def min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen, vsearch):
    # lowest allowable
    low = 35

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Calculating minimum acceptable read length.')
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        return None

    cache_path = opj(dir_cache_fq_minlen, 'minlen')

    pickled = {}

    if ope(cache_path):
        with open(cache_path, 'rb') as f:
            pickled = pickle.load(f)

    queue = []

    for se in se_fastq_files:
        src = se_fastq_files[se]['src']
        avg_len = se_fastq_files[se]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            se_fastq_files[se]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', se)
            continue

        fq_path = se_fastq_files[se]['path']
        stats_file = opj(dir_temp, se + '_stats.txt')
        queue.append([se, fq_path, stats_file, 'se'])

    for pe in pe_fastq_files:
        src = pe_fastq_files[pe]['src']
        avg_len = pe_fastq_files[pe]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            pe_fastq_files[pe]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', pe)
            continue

        fq_path = pe_fastq_files[pe]['path'][0]
        stats_file = opj(dir_temp, pe + '_stats.txt')
        queue.append([pe, fq_path, stats_file, 'pe'])

    for x in queue:

        if x[0] in pickled:
            ml = pickled[x[0]]

        else:
            # ----------------------------------------------------------------
            # Use 'vsearch --fastq_stats'. About 2x slower than the
            #   approx_avg_read_len_fq function.
            #
            # cmd = [vsearch, '--fastq_stats', x[1], '--log', x[2]]
            # run(cmd, do_not_raise=True)
            # with open(x[2]) as f:
            #     stats = f.read()
            # remove(x[2])
            # ml = re.findall(r'>=\s+(\d+)', stats)
            # if len(ml) != 0:
            #     ml = max(int(ml[0]) // 3, low)
            # else:
            #     ml = None
            # ----------------------------------------------------------------
            # 22:59:12 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 22:59:46 50 nt: Schlumbergera_truncata_15H-02_pol_S47    34s
            # 23:00:30 50 nt: Schlumbergera_truncata_15H-02_sty_S49    44s
            # ----------------------------------------------------------------

            # ----------------------------------------------------------------
            ml = approx_avg_read_len_fq(x[1])
            ml = max(int(ml) // 3, low)
            # ----------------------------------------------------------------
            # 23:12:06 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 23:12:20 50 nt: Schlumbergera_truncata_15H-02_pol_S47    14s
            # 23:12:39 50 nt: Schlumbergera_truncata_15H-02_sty_S49    19s
            # ----------------------------------------------------------------

            pickled[x[0]] = ml

        if ml is not None:
            Log.msg(str(ml) + ' nt:', x[0])
        else:
            Log.msg(' ?' + ' nt:', x[0])
            ml = low

        if x[3] == 'se':
            se_fastq_files[x[0]]['min_acc_len'] = ml

        elif x[3] == 'pe':
            pe_fastq_files[x[0]]['min_acc_len'] = ml

        # Checkpoint the cache after each sample.
        with open(cache_path, 'wb') as f:
            pickle.dump(pickled, f, protocol=PICKLE_PROTOCOL)
Example #18
def dnld_sra_fastq_files(sras, sra_runs_info, dir_fq_data, fasterq_dump,
                         threads, dir_temp):

    if len(sras) > 0:
        if fasterq_dump is None:
            Log.err('fasterq-dump from SRA Toolkit is not available. '
                    'Cannot continue. Exiting.')
            exit(0)

        print()
        Log.inf('Downloading SRA read data.')

    se_fastq_files = {}
    pe_fastq_files = {}

    for sra in sras:
        sra_run_info = sra_runs_info[sra]
        sra_lib_layout = sra_run_info['LibraryLayout'].lower()
        sra_lib_layout_k = sra_run_info['KakapoLibraryLayout'].lower()
        sample_base_name = sra_run_info['KakapoSampleBaseName']
        sra_taxid = int(sra_run_info['TaxID'])
        avg_len = int(sra_run_info['avgLength'])

        sra_dnld_needed = False

        if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
            se_file = opj(dir_fq_data, sra + '.fastq')
            se_fastq_files[sample_base_name] = {'path': se_file}
            se_fastq_files[sample_base_name]['src'] = 'sra'
            se_fastq_files[sample_base_name]['avg_len'] = avg_len
            se_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if not ope(se_file):
                sra_dnld_needed = True

        elif sra_lib_layout == 'paired':
            pe_file_1 = opj(dir_fq_data, sra + '_1.fastq')
            pe_file_2 = opj(dir_fq_data, sra + '_2.fastq')
            pe_file_1_renamed = opj(dir_fq_data, sra + '_R1.fastq')
            pe_file_2_renamed = opj(dir_fq_data, sra + '_R2.fastq')
            pe_fastq_files[sample_base_name] = {
                'path': [pe_file_1_renamed, pe_file_2_renamed]
            }
            pe_fastq_files[sample_base_name]['src'] = 'sra'
            pe_fastq_files[sample_base_name]['avg_len'] = avg_len // 2
            pe_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if sra_lib_layout_k == 'paired_unp':
                pe_file_3 = opj(dir_fq_data, sra + '.fastq')
                pe_file_3_renamed = opj(dir_fq_data, sra + '_R3.fastq')
                pe_fastq_files[sample_base_name]['path'].append(
                    pe_file_3_renamed)
            if not ope(pe_file_1_renamed) or not ope(pe_file_2_renamed):
                sra_dnld_needed = True

        if not sra_dnld_needed:
            Log.msg('FASTQ reads are available locally:', sample_base_name)

        retry_count = 0
        while sra_dnld_needed:

            if retry_count > 50:
                Log.err('Download failed. Exiting.')
                rmtree(dir_temp)
                exit(1)

            elif retry_count > 0:
                Log.wrn('Download failed. Retrying.')
                sleep(2)

            retry_count += 1

            Log.msg('Downloading FASTQ reads for:', sample_base_name)

            cmd = [
                fasterq_dump, '--threads',
                str(threads * 2), '--split-3', '--bufsize', '819200',
                '--outdir', dir_fq_data, '--temp', dir_temp, sra
            ]

            run(cmd, do_not_raise=True)

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if not ope(se_file):
                    continue

            elif sra_lib_layout == 'paired':

                if not ope(pe_file_1) or not ope(pe_file_2):
                    continue
                else:
                    move(pe_file_1, pe_file_1_renamed)
                    move(pe_file_2, pe_file_2_renamed)

                if sra_lib_layout_k == 'paired_unp':
                    if not ope(pe_file_3):
                        continue
                    else:
                        move(pe_file_3, pe_file_3_renamed)

            sra_dnld_needed = False

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if ope(se_file):
                    Log.msg('Renaming FASTQ reads in:', se_file)
                    rename_fq_seqs(se_file, sra, '1:N:0')

            elif sra_lib_layout == 'paired':
                if ope(pe_file_1_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_1_renamed)
                    rename_fq_seqs(pe_file_1_renamed, sra, '1:N:0')
                if ope(pe_file_2_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_2_renamed)
                    rename_fq_seqs(pe_file_2_renamed, sra, '2:N:0')
                if sra_lib_layout_k == 'paired_unp':
                    if ope(pe_file_3_renamed):
                        Log.msg('Renaming FASTQ reads in:', pe_file_3_renamed)
                        rename_fq_seqs(pe_file_3_renamed, sra + '_unpaired',
                                       '1:N:0')

    return se_fastq_files, pe_fastq_files, sra_runs_info
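
The while loop above is a bounded retry with a short delay between attempts. Its skeleton, with attempt_download standing in (hypothetically) for the fasterq-dump call plus the file-existence checks:

import time


def download_with_retries(attempt_download, max_retries=50, delay=2):
    retry_count = 0
    while True:
        if retry_count > max_retries:
            raise RuntimeError('Download failed.')
        if retry_count > 0:
            time.sleep(delay)
        retry_count += 1
        if attempt_download():
            return


download_with_retries(lambda: True)  # succeeds on the first attempt
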
Example #19
def run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                         dir_vsearch_results_fa_trim, fpatt, ss, seqtk):

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_vsearch_results_fa_trim, se)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        blast_results_fa_path = se_fastq_files[se]['blast_results_path' +
                                                   '__' + ss]
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        se_fastq_files[se]['vsearch_results_path' + '__' + ss] = out_f_fastq

        if ope(out_f_fastq):
            Log.msg('Vsearch results already exist:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running vsearch on: ' + basename(fq_path), ss)
            run_vsearch(vsearch,
                        ident=ident,
                        q_file=blast_results_fa_path,
                        db_file=fq_path,
                        out_file=out_f,
                        minlen=min_acc_len)

            Log.msg('Extracting unique vsearch hits using Seqtk:', ss)
            keep_unique_lines_in_file(out_f)
            seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
            osremove(out_f)

    for pe in pe_fastq_files:
        dir_results = opj(dir_vsearch_results_fa_trim, pe)
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        blast_results_fa_path = pe_fastq_files[pe]['blast_results_path' +
                                                   '__' + ss]
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        pe_fastq_files[pe]['vsearch_results_path' + '__' + ss] = out_fs_fastq

        if ope(out_fs_fastq[0]) and ope(out_fs_fastq[1]) and \
           ope(out_fs_fastq[2]) and ope(out_fs_fastq[3]):
            Log.msg('Vsearch results already exist:', pe)
        else:
            make_dirs(dir_results)
            pe_trim_files = zip(fq_paths, out_fs, out_fs_fastq)
            for x in pe_trim_files:
                Log.msg('Running vsearch on: ' + basename(x[0]), ss)
                run_vsearch(vsearch,
                            ident=ident,
                            q_file=blast_results_fa_path,
                            db_file=x[0],
                            out_file=x[1],
                            minlen=min_acc_len)

            Log.msg(
                'Extracting unique vsearch hits from paired files '
                'using Seqtk:', ss)

            p1txt = out_fs[0]
            p2txt = out_fs[1]

            p1fq = fq_paths[0]
            p2fq = fq_paths[1]

            p1fq_out = out_fs_fastq[0]
            p2fq_out = out_fs_fastq[1]

            p12txt_temp = opj(dir_results, pe + '__' + ss + '_paired.txt')

            combine_text_files([p1txt, p2txt], p12txt_temp)
            keep_unique_lines_in_file(p12txt_temp)

            seqtk_extract_reads(seqtk, p1fq, p1fq_out, p12txt_temp)
            seqtk_extract_reads(seqtk, p2fq, p2fq_out, p12txt_temp)

            osremove(p1txt)
            osremove(p2txt)
            osremove(p12txt_temp)

            Log.msg(
                'Extracting unique vsearch hits from unpaired files '
                'using Seqtk:', ss)

            u1txt = out_fs[2]
            u2txt = out_fs[3]

            u1fq = fq_paths[2]
            u2fq = fq_paths[3]

            u1fq_out = out_fs_fastq[2]
            u2fq_out = out_fs_fastq[3]

            keep_unique_lines_in_file(u1txt)
            keep_unique_lines_in_file(u2txt)

            seqtk_extract_reads(seqtk, u1fq, u1fq_out, u1txt)
            seqtk_extract_reads(seqtk, u2fq, u2fq_out, u2txt)

            osremove(u1txt)
            osremove(u2txt)
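
keep_unique_lines_in_file is kakapo's own helper; one plausible standalone reading is an in-place, order-preserving line dedup, which dict.fromkeys makes concise:

def dedup_lines(path):
    with open(path) as f:
        lines = f.readlines()
    with open(path, 'w') as f:
        f.writelines(dict.fromkeys(lines))
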
Example #20
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, fpatt, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')

            if rcorrector is None:
                Log.err(
                    'Rcorrector is not available. Cannot continue. Exiting.')
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['trim_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)

        fq_path_1 = pe_fastq_files[pe]['trim_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['trim_path_fq'][1]
        fq_path_3 = pe_fastq_files[pe]['trim_path_fq'][2]
        fq_path_4 = pe_fastq_files[pe]['trim_path_fq'][3]

        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '_paired.txt')

        out_fs = [x.replace('@D@', dir_fq_cor_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]

        pe_fastq_files[pe]['cor_path_fq'] = out_fs

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [
                fq_path_1, fq_path_2, fq_path_3, fq_path_4
            ]
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_fs[0],
                          out_file_2=out_fs[1],
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            # unpaired 1
            if stat(fq_path_3).st_size != 0:
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(
                    fq_base_path_3)[0] + '.cor.fq' + ext
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired_1.txt')

                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_fs[2],
                              log_file=log_f_3)

                remove(fq_cor_path_3)
            else:
                with open(out_fs[2], 'w') as f:
                    f.write('')

            # unpaired 2
            if stat(fq_path_4).st_size != 0:
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_4,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_4 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_4))
                fq_cor_path_4 = splitext_gz(
                    fq_base_path_4)[0] + '.cor.fq' + ext
                log_f_4 = opj(dir_fq_cor_data_sample, pe + '_unpaired_2.txt')

                filter_unc_se(in_file=fq_cor_path_4,
                              out_file=out_fs[3],
                              log_file=log_f_4)
                remove(fq_cor_path_4)

            else:
                with open(out_fs[3], 'w') as f:
                    f.write('')
Example #21
def run_kraken2(order, dbs, se_fastq_files, pe_fastq_files, dir_fq_filter_data,
                confidence, kraken2, threads, dir_temp, fpatt):

    if (len(se_fastq_files) > 0 or len(pe_fastq_files) > 0) and len(order) > 0:
        print()
        Log.inf('Running Kraken2.', 'Confidence: ' + str(confidence))
        if kraken2 is None:
            Log.err('kraken2 is not available. Cannot continue. Exiting.')
            exit(0)

    nuclear = None
    for nuc in order:
        if nuc[1] == 'nuclear':
            nuclear = nuc[0]
            break

    for se in se_fastq_files:

        if len(order) == 0:
            continue

        if se_fastq_files[se]['path'] is None:
            continue

        fq_path = se_fastq_files[se]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, se)

        if nuclear is None:
            out_f = opj(dir_fq_filter_data_sample, se + '.fastq')
        else:
            out_f = opj(dir_fq_filter_data_sample, nuclear, se + '.fastq')

        se_fastq_files[se]['filter_path_fq'] = out_f

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', se)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('SE mode:', se)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=se,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)

    for pe in pe_fastq_files:

        if len(order) == 0:
            continue

        if pe_fastq_files[pe]['path'] is None:
            continue

        fq_path = pe_fastq_files[pe]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, pe)

        if nuclear is None:
            dir_name_nuclear = dir_fq_filter_data_sample
        else:
            dir_name_nuclear = dir_fq_filter_data_sample + ops + nuclear

        out_fs = [x.replace('@D@', dir_name_nuclear) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]

        pe_fastq_files[pe]['filter_path_fq'] = out_fs

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('PE mode:', pe)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=pe,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)
Example #22
def dep_check_kraken2(dir_dep, os_id, release_name, force):
    url = 'https://github.com/karolisr/kraken2/archive/master.tar.gz'

    dnld_path = opj(dir_dep, 'kraken2.tar.gz')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        kraken2 = which('kraken2')
        kraken2_build = which('kraken2-build')

        dir_bin = dirname(kraken2)
        classify_bin = opj(dir_bin, 'classify')
        _ = run([classify_bin], do_not_raise=True)
        if not _.stderr.startswith('classify: mandatory filename'):
            raise

        run([kraken2, '--help'])
        run([kraken2_build, '--help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'kraken2'))
            kraken2 = opj(dir_bin, 'bin', 'kraken2')
            kraken2_build = opj(dir_bin, 'bin', 'kraken2-build')

            classify_bin = opj(dir_bin, 'bin', 'classify')
            _ = run([classify_bin], do_not_raise=True)
            if not _.stderr.startswith('classify: mandatory filename'):
                raise

            run([kraken2, '--help'])
            run([kraken2_build, '--help'])
        except Exception:
            Log.wrn('Kraken2 was not found on this system, trying to '
                    'download.')

            if ope(dnld_path):
                remove(dnld_path)

            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'kraken2'))
            classify_bin = opj(dir_bin, 'bin', 'classify')
            kraken2 = opj(dir_bin, 'bin', 'kraken2')
            kraken2_build = opj(dir_bin, 'bin', 'kraken2-build')

            makefile = opj(dir_bin, 'src', 'Makefile')
            replace_line_in_file(makefile, 'cp $(PROGS) $(KRAKEN2_DIR)/',
                                 'cp $(PROGS) "$(KRAKEN2_DIR)"/')
            try:
                Log.wrn('Compiling Kraken2 Attempt 1')
                run(['./install_kraken2.sh', 'bin'], cwd=dir_bin)

                _ = run([classify_bin], do_not_raise=True)
                if not _.stderr.startswith('classify: mandatory filename'):
                    raise

                run([kraken2, '--help'])
                run([kraken2_build, '--help'])

            except Exception:
                try:
                    Log.wrn('Compiling Kraken2 Attempt 2')

                    dir_libomp = opj(dir_dep, 'libomp')

                    if ope(dir_libomp):
                        rmtree(dir_libomp)

                    libomp_fp, v = brew_get('libomp', os_id, release_name,
                                            dir_dep)

                    tar_ref = tarfile.open(libomp_fp, 'r:gz')
                    tar_ref.extractall(dir_dep)
                    tar_ref.close()

                    dir_libomp_l = opj(dir_libomp, v, 'lib')
                    dir_libomp_i = opj(dir_libomp, v, 'include')

                    if os_id == 'mac':
                        # Changes the shared library identification name of a
                        # dynamic shared library.
                        dylib_f = opj(dir_libomp_l, 'libomp.dylib')

                        chmod(
                            dylib_f, stat.S_IRWXU | stat.S_IRUSR | stat.S_IWUSR
                            | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH
                            | stat.S_IWOTH)

                        cmd = ['install_name_tool', '-id', dylib_f, dylib_f]
                        run(cmd)

                        cxx_flags = ('CXXFLAGS = -L{} -I{} -Xpreprocessor '
                                     '-fopenmp -lomp -Wall -std=c++11 -O3')

                    elif os_id == 'linux':
                        cxx_flags = ('CXXFLAGS = -L{} -I{} -fopenmp -lomp '
                                     '-static -Wall -std=c++11 -O3')

                    cxx_flags = cxx_flags.format(dir_libomp_l, dir_libomp_i)

                    makefile = opj(dir_bin, 'src', 'Makefile')

                    replace_line_in_file(
                        makefile, 'CXXFLAGS = -fopenmp -Wall -std=c++11'
                        ' -O3', cxx_flags)

                    run(['./install_kraken2.sh', 'bin'], cwd=dir_bin)

                    _ = run([classify_bin], do_not_raise=True)
                    if not _.stderr.startswith('classify: mandatory filename'):
                        raise

                    run([kraken2, '--help'])
                    run([kraken2_build, '--help'])

                except Exception:
                    try:
                        Log.wrn('Compiling Kraken2 Attempt 3')
                        makefile = opj(dir_bin, 'src', 'Makefile')
                        replace_line_in_file(
                            makefile, cxx_flags,
                            'CXXFLAGS = -Wall -std=c++11 -O3')
                        run(['./install_kraken2.sh', 'bin'], cwd=dir_bin)

                        _ = run([classify_bin], do_not_raise=True)
                        if not _.stderr.startswith(
                                'classify: mandatory filename'):
                            raise

                        run([kraken2, '--help'])
                        run([kraken2_build, '--help'])
                    except Exception:
                        pass

            if not ope(kraken2):
                Log.err('Something went wrong while trying to compile '
                        'Kraken2.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/karolisr/kraken2')
                return None, None

    regexp = r'^.*?version\s([\d\.\-A-Za-z]*)'
    v = get_dep_version([kraken2, '--version'], regexp)
    Log.msg('kraken2 is available:', v + ' ' + kraken2)
    v = get_dep_version([kraken2_build, '--version'], regexp)
    Log.msg('kraken2-build is available:', v + ' ' + kraken2_build)

    return kraken2, kraken2_build
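
# Every dep_check_* function in this file follows the same fallback ladder:
# try the executable on PATH, then a previously downloaded copy under
# dir_dep, and only then download (and, for Kraken2, compile). A generic
# sketch of that ladder; find_or_install and installer are hypothetical
# names, not kakapo's API:

import os
import shutil
import subprocess

def find_or_install(name, local_path, installer):
    """Return a working executable for `name`, installing as a last resort."""
    for candidate in (shutil.which(name), local_path):
        if candidate and os.access(candidate, os.X_OK):
            try:
                # Smoke-test the candidate, as the functions above do.
                subprocess.run([candidate, '--help'], check=True,
                               capture_output=True)
                return candidate
            except (OSError, subprocess.CalledProcessError):
                pass
    installer()  # expected to place the binary at local_path
    return local_path if os.access(local_path, os.X_OK) else None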
Example #23
0
def find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk, dir_temp,
                        prepend_assmbl, min_target_orf_len, max_target_orf_len,
                        allow_non_aug, allow_no_strt_cod, allow_no_stop_cod,
                        tax, tax_group, tax_ids_user, min_overlap, organelle):

    if len(assemblies) > 0:
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    for a in assemblies:

        if ('blast_hits_aa__' + ss) not in a:
            continue

        assmbl_name = a['name']
        tax_id = a['tax_id']

        parsed_hits = a['blast_hits_aa__' + ss]

        a_path = a['path']

        gc_tt = a['gc_tt']
        if tax.is_eukaryote(tax_id) is True:
            if organelle == 'mitochondrion':
                gc_tt = a['gc_tt_mito']
            if tax.contains_plastid(tax_id) is True:
                if organelle == 'plastid':
                    gc_tt = a['gc_tt_plastid']

        transcripts_nt_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt__' + ss + '.fasta')

        transcripts_nt_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt_orf__' + ss + '.fasta')

        transcripts_aa_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_aa_orf__' + ss + '.fasta')

        transcripts_nt = {}
        transcripts_nt_orf = {}
        transcripts_aa_orf = {}

        transcripts_with_acceptable_orfs = set()

        ann_key = 'annotations__'

        a[ann_key + ss] = {}

        collated = collate_blast_results(parsed_hits)

        ######################################################################
        # Use seqtk to sample the assembly FASTA file for sequences with
        # BLAST hits. This increases the speed substantially when the assembly
        # file is large.
        temp_a_file = opj(dir_temp, 'temp__' + ss + '.fasta')
        temp_s_file = opj(dir_temp, 'temp__' + ss + '.txt')
        sseqids_subsample = []
        for hit in collated:
            target_name = hit['sseqid']
            sseqids_subsample.append(target_name)
        sseqids_subsample_text = '\n'.join(sseqids_subsample)
        with open(temp_s_file, 'w') as f:
            f.write(sseqids_subsample_text)
        seqtk_extract_reads(seqtk,
                            in_file=a_path,
                            out_file=temp_a_file,
                            ids_file=temp_s_file)

        with open(temp_a_file, 'r') as f:
            _ = f.read()

        if _.strip() == '':
            continue

        print()
        Log.inf('Analyzing BLAST hits', '=' * 113 + '\n')
        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss + '\n\n' + '-' * 134 + '\n', False)

        parsed_fasta = trim_desc_to_first_space_in_fasta_text(_, SEQ_TYPE_DNA)
        parsed_fasta = seq_records_to_dict(parsed_fasta)
        ######################################################################

        all_kakapo_results = {}
        json_dump_file_path = opj(dir_prj_transcripts,
                                  assmbl_name + '_ann_kakapo__' + ss + '.json')

        for hit in collated:

            target_name = hit['sseqid']
            target_seq = parsed_fasta[target_name]
            query_name = hit['qseqid']
            hit_evalue = hit['evalue']

            # Prepend assembly name to the sequence name:
            if prepend_assmbl is True:
                target_name = assmbl_name + '__' + target_name
                # Also prepend taxonomic info to the sequence name:
                if tax_id is not None:
                    fm = tax.higher_rank_for_taxid(tax_id, rank='family')
                    if fm is not None:
                        target_name = fm + '__' + target_name

            hit_start = hit['start']
            hit_end = hit['end']
            hit_frame = hit['frame']

            if allow_non_aug is True:
                start_codons = gc_tt.start_codons_ambiguous
            else:
                start_codons = ['ATG']

            stop_codons = gc_tt.stop_codons_ambiguous

            ##################################################################
            if tax_id is not None:
                tax_ids_for_orf = (tax_id, )
            else:
                tax_ids_for_orf = tax_ids_user

            # atg_contexts (defined outside this function, at module level)
            # maps keys of the form '<taxid>_L' / '<taxid>_R' to start codon
            # (ATG) context data.
            cntx_txids_avail = tuple(
                sorted(
                    set(
                        map(lambda x: int(x.split('_')[0]),
                            atg_contexts.keys()))))

            cntx_taxid = set()
            for txid in tax_ids_for_orf:
                tax_path = partial(tax.path_between_taxids, txid)
                path_len = tuple(
                    map(len, tuple(map(tax_path, cntx_txids_avail))))
                cntx_taxid.add(cntx_txids_avail[path_len.index(min(path_len))])
            cntx_taxid = tuple(cntx_taxid)[0]

            cntx_l_key = str(cntx_taxid) + '_L'
            cntx_r_key = str(cntx_taxid) + '_R'

            cntx_l = atg_contexts[cntx_l_key]
            cntx_r = atg_contexts[cntx_r_key]
            ##################################################################

            orf_log_str = ('grade'.rjust(5) + 'ovrlp'.rjust(7) +
                           'cntx'.rjust(6) + 'length'.center(9) +
                           'cntx_l'.rjust(7) + 'cntx_r'.rjust(15) + '\n')

            orf = find_orf_for_blast_hit(seq=target_seq,
                                         frame=hit_frame,
                                         hit_start=hit_start,
                                         hit_end=hit_end,
                                         stop_codons=stop_codons,
                                         start_codons=start_codons,
                                         context_l=cntx_l,
                                         context_r=cntx_r,
                                         min_overlap=min_overlap,
                                         min_len=min_target_orf_len,
                                         max_len=max_target_orf_len,
                                         allow_no_strt_cod=allow_no_strt_cod,
                                         allow_no_stop_cod=allow_no_stop_cod)

            orf_log_str += orf[2]

            rev_comp_def_str = ''
            if hit_frame > 0:
                ann_hit_b = hit_start
                ann_hit_e = hit_end
            else:
                target_seq = reverse_complement(target_seq)
                ann_hit_b = len(target_seq) - hit_start
                ann_hit_e = len(target_seq) - hit_end
                rev_comp_def_str = '; RevComp'

            target_def = target_name + ' ' + query_name + rev_comp_def_str

            a[ann_key + ss][target_name] = {}

            good_orfs = orf[0]
            bad_orfs = orf[1]

            if len(good_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_good'] = dict()
                orfs_good_dict = a[ann_key + ss][target_name]['orfs_good']
                orf_log_str += '\n' + 'VALID ' + '-' * 128 + '\n'

                for i, good_orf in enumerate(good_orfs):

                    good_orf_frame = good_orf[2]

                    if good_orf_frame > 0:
                        ann_orf_b = good_orf[0]
                        ann_orf_e = good_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - good_orf[1]
                        ann_orf_e = len(target_seq) - good_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_good_dict = dict()
                    orf_good_dict['orf_begin'] = ann_orf_b
                    orf_good_dict['orf_end'] = ann_orf_e
                    orf_good_dict['orf_frame'] = abs(good_orf_frame)
                    orf_good_dict['orf_grade'] = good_orf[3]
                    orf_good_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_good_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_good_dict['ORF{:03d}'.format(i + 1)] = orf_good_dict

                    target_def_orf = (target_name +
                                      '__ORF{:03d}'.format(i + 1) + ' ' +
                                      query_name + rev_comp_def_str)

                    transcripts_nt_orf[target_def_orf] = orf_seq

                    transcripts_with_acceptable_orfs.add(target_name)

                    transl_seq = translate(orf_seq, gc_tt.table_ambiguous,
                                           start_codons)

                    transcripts_aa_orf[target_def_orf] = transl_seq[:-1]

            else:
                orf_log_str += '\n' + 'NOT VALID ' + '-' * 124 + '\n'

            Log.msg('Transcript:', target_name, False)
            Log.msg('     Query:', query_name + '\n\n' + orf_log_str, False)

            if len(bad_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_bad'] = dict()
                orfs_bad_dict = a[ann_key + ss][target_name]['orfs_bad']

                for i, bad_orf in enumerate(bad_orfs):

                    bad_orf_frame = bad_orf[2]

                    if bad_orf_frame > 0:
                        ann_orf_b = bad_orf[0]
                        ann_orf_e = bad_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - bad_orf[1]
                        ann_orf_e = len(target_seq) - bad_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_bad_dict = dict()
                    orf_bad_dict['orf_begin'] = ann_orf_b
                    orf_bad_dict['orf_end'] = ann_orf_e
                    orf_bad_dict['orf_frame'] = abs(bad_orf_frame)
                    orf_bad_dict['orf_grade'] = bad_orf[3]
                    orf_bad_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_bad_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_bad_dict['ORF{:03d}'.format(i + 1)] = orf_bad_dict

            transcripts_nt[target_def] = target_seq

            a[ann_key + ss][target_name]['blast_hit'] = dict()
            blast_hit_dict = a[ann_key + ss][target_name]['blast_hit']
            blast_hit_dict['query_name'] = query_name
            blast_hit_dict['query_id'] = ss
            blast_hit_dict['evalue'] = hit_evalue
            blast_hit_dict['frame'] = abs(hit_frame)
            blast_hit_dict['blast_hit_begin'] = ann_hit_b
            blast_hit_dict['blast_hit_end'] = ann_hit_e

            # Collect ORF and BLAST hit annotations for downstream use. ######
            kakapo_json = [{}]
            kakapo_json[0]['kakapo_annotations__' + ss] = (a[ann_key +
                                                             ss][target_name])
            all_kakapo_results[target_name] = kakapo_json
            ##################################################################

        # --------------------------------------------------------------------

        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss, False)
        Log.msg('Transcripts:', str(len(transcripts_nt)), False)
        Log.msg('Transcripts with acceptable ORFs:',
                str(len(transcripts_with_acceptable_orfs)) + '\n' + '=' * 134,
                False)

        if len(transcripts_nt) > 0:
            write_fasta(transcripts_nt, transcripts_nt_fasta_file)
            a['transcripts_nt_fasta_file__' + ss] = transcripts_nt_fasta_file
        else:
            a['transcripts_nt_fasta_file__' + ss] = None

        if len(transcripts_nt_orf) > 0:
            write_fasta(transcripts_nt_orf, transcripts_nt_orf_fasta_file)
            a['transcripts_nt_orf_fasta_file__' +
              ss] = transcripts_nt_orf_fasta_file
        else:
            a['transcripts_nt_orf_fasta_file__' + ss] = None

        if len(transcripts_aa_orf) > 0:
            write_fasta(transcripts_aa_orf, transcripts_aa_orf_fasta_file)
            a['transcripts_aa_orf_fasta_file__' +
              ss] = transcripts_aa_orf_fasta_file
        else:
            a['transcripts_aa_orf_fasta_file__' + ss] = None

        # Save ORF and BLAST hit annotations for downstream use.--------------
        with open(json_dump_file_path, 'w') as f:
            json.dump(all_kakapo_results, f, sort_keys=True, indent=4)
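
# The negative-frame branches above apply the standard interval flip for
# mapping coordinates onto a reverse-complemented sequence (plus 3 to keep
# the stop codon). The flip itself, as a small self-contained sketch:

def map_interval_to_revcomp(seq_len, b, e):
    """Map a half-open interval [b, e) onto the reverse-complement strand."""
    return seq_len - e, seq_len - b

# On a 10 nt sequence, bases [2, 5) sit at [5, 8) on the reverse complement:
# map_interval_to_revcomp(10, 2, 5) -> (5, 8)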
Example #24
0
def main():
    """Run the script."""
    # Prepare initial logger (before we know the log file path) --------------
    prj_log_file_suffix = time_stamp() + '.log'
    log_stream = StringIO()

    Log.set_colors(COLORS)
    Log.set_file(log_stream)
    Log.set_write(True)

    # Prepare configuration directory ----------------------------------------
    if ope(DIR_CFG):
        Log.inf('Found configuration directory:', DIR_CFG)
    else:
        Log.wrn('Creating configuration directory:', DIR_CFG)
        make_dirs(DIR_CFG)

    print()

    # Check for dependencies -------------------------------------------------
    Log.inf('Checking for dependencies.')
    make_dirs(DIR_DEP)
    make_dirs(DIR_KRK)
    seqtk = deps.dep_check_seqtk(DIR_DEP, FORCE_DEPS)
    trimmomatic, adapters = deps.dep_check_trimmomatic(DIR_DEP)
    fasterq_dump = deps.dep_check_sra_toolkit(DIR_DEP, OS_ID, DIST_ID,
                                              DEBIAN_DISTS, REDHAT_DISTS,
                                              FORCE_DEPS)
    makeblastdb, _, tblastn = deps.dep_check_blast(DIR_DEP, OS_ID, DIST_ID,
                                                   DEBIAN_DISTS, REDHAT_DISTS,
                                                   FORCE_DEPS)
    vsearch = deps.dep_check_vsearch(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS,
                                     REDHAT_DISTS, FORCE_DEPS)
    spades = deps.dep_check_spades(DIR_DEP, OS_ID, FORCE_DEPS)
    bowtie2, bowtie2_build = deps.dep_check_bowtie2(DIR_DEP, OS_ID, FORCE_DEPS)
    rcorrector = deps.dep_check_rcorrector(DIR_DEP, FORCE_DEPS)
    kraken2, kraken2_build = deps.dep_check_kraken2(DIR_DEP, OS_ID,
                                                    RELEASE_NAME, FORCE_DEPS)

    kakapolib = deps.dep_check_kakapolib(FORCE_DEPS)
    if kakapolib is None:
        Log.err('Could not compile "kakapolib". Cannot continue.')
        exit(0)

    print()

    kraken2_dbs = deps.dnld_kraken2_dbs(DIR_KRK)

    if INSTALL_DEPS is True or DNLD_KRAKEN_DBS is True:
        exit(0)

    print()

    # Initialize NCBI taxonomy database --------------------------------------
    tax = Taxonomy()
    if tax.is_initialized() is False:
        tax.init(data_dir_path=DIR_TAX, logger=Log)
        print()

    # Parse configuration file -----------------------------------------------
    Log.inf('Reading configuration file:', CONFIG_FILE_PATH)
    _ = config_file_parse(CONFIG_FILE_PATH, tax)

    allow_no_stop_cod = _['allow_no_stop_cod']
    allow_no_strt_cod = _['allow_no_strt_cod']
    allow_non_aug = _['allow_non_aug']

    blast_1_evalue = _['blast_1_evalue']
    blast_1_max_hsps = _['blast_1_max_hsps']
    blast_1_qcov_hsp_perc = _['blast_1_qcov_hsp_perc']
    blast_1_best_hit_overhang = _['blast_1_best_hit_overhang']
    blast_1_best_hit_score_edge = _['blast_1_best_hit_score_edge']
    blast_1_max_target_seqs = _['blast_1_max_target_seqs']

    blast_2_evalue = _['blast_2_evalue']
    blast_2_max_hsps = _['blast_2_max_hsps']
    blast_2_qcov_hsp_perc = _['blast_2_qcov_hsp_perc']
    blast_2_best_hit_overhang = _['blast_2_best_hit_overhang']
    blast_2_best_hit_score_edge = _['blast_2_best_hit_score_edge']
    blast_2_max_target_seqs = _['blast_2_max_target_seqs']

    dir_out = _['output_directory']
    email = _['email']
    requery_after = _['requery_after']
    fq_pe = _['fq_pe']
    fq_se = _['fq_se']
    should_run_rcorrector = _['should_run_rcorrector']
    should_run_ipr = _['should_run_ipr']
    bt2_order = _['bt2_order']
    kraken_confidence = _['kraken_confidence']
    krkn_order = _['krkn_order']
    prepend_assmbl = _['prepend_assmbl']
    prj_name = _['project_name']
    sras = _['sras']
    tax_group = _['tax_group']
    # tax_group_name = _['tax_group_name']
    tax_ids_user = _['tax_ids']
    user_assemblies = _['assmbl']

    print()

    # Parse search strategies file -------------------------------------------
    if SS_FILE_PATH is not None:
        Log.inf('Reading search strategies file:', SS_FILE_PATH)
        sss = ss_file_parse(SS_FILE_PATH)
    else:
        Log.wrn('Search strategies file was not provided.\n' +
                'Will process reads and assemblies, then stop.')
        sss = dict()

    print()

    # Create output directory ------------------------------------------------
    if dir_out is not None:
        if ope(dir_out):
            Log.inf('Found output directory:', dir_out)
        else:
            Log.wrn('Creating output directory:', dir_out)
            make_dirs(dir_out)

    print()

    # Write Kakapo version information to the output directory ---------------
    version_file = opj(dir_out, 'kakapo_version.txt')
    if ope(version_file):
        with open(version_file, 'r') as f:
            version_prev = f.read().strip()
            if __version__ != version_prev:
                Log.wrn('The output directory contains data produced by a ' +
                        'different version of Kakapo: ' + version_prev +
                        '.\nThe currently running version is: ' + __version__ +
                        '.\n' +
                        'Delete "kakapo_version.txt" file located in the ' +
                        'output directory if you would like to continue.')
                exit(0)

    with open(version_file, 'w') as f:
        f.write(__version__)

    # Create subdirectories in the output directory --------------------------
    _ = prepare_output_directories(dir_out, prj_name)

    dir_temp = _['dir_temp']
    dir_cache_pfam_acc = _['dir_cache_pfam_acc']
    dir_cache_fq_minlen = _['dir_cache_fq_minlen']
    dir_cache_prj = _['dir_cache_prj']
    dir_cache_refseqs = _['dir_cache_refseqs']
    dir_prj_logs = _['dir_prj_logs']
    dir_prj_queries = _['dir_prj_queries']
    dir_fq_data = _['dir_fq_data']
    dir_fq_cor_data = _['dir_fq_cor_data']
    dir_fq_trim_data = _['dir_fq_trim_data']
    dir_fq_filter_bt2_data = _['dir_fq_filter_bt2_data']
    dir_fq_filter_krkn2_data = _['dir_fq_filter_krkn2_data']
    dir_fa_trim_data = _['dir_fa_trim_data']
    dir_blast_fa_trim = _['dir_blast_fa_trim']
    dir_prj_blast_results_fa_trim = _['dir_prj_blast_results_fa_trim']
    dir_prj_vsearch_results_fa_trim = _['dir_prj_vsearch_results_fa_trim']
    dir_prj_spades_assemblies = _['dir_prj_spades_assemblies']
    dir_prj_blast_assmbl = _['dir_prj_blast_assmbl']
    dir_prj_assmbl_blast_results = _['dir_prj_assmbl_blast_results']
    dir_prj_transcripts = _['dir_prj_transcripts']
    dir_prj_ips = _['dir_prj_ips']
    dir_prj_transcripts_combined = _['dir_prj_transcripts_combined']

    # Prepare logger ---------------------------------------------------------
    prj_log_file = opj(dir_prj_logs, prj_name + '_' + prj_log_file_suffix)
    with open(prj_log_file, 'w') as f:
        f.write(SCRIPT_INFO.strip() + '\n\n' + log_stream.getvalue())

    Log.set_colors(COLORS)
    Log.set_file(prj_log_file)
    Log.set_write(True)

    log_stream.close()

    # Resolve descending taxonomy nodes --------------------------------------
    tax_ids = tax.all_descending_taxids_for_taxids([tax_group])

    # Pfam uniprot accessions ------------------------------------------------
    pfam_uniprot_acc = OrderedDict()
    for ss in sss:
        pfam_acc = sss[ss]['pfam_families']
        pfam_uniprot_acc[ss] = pfam_uniprot_accessions(ss, pfam_acc, tax_ids,
                                                       dir_cache_pfam_acc)

    # Download Pfam uniprot sequences if needed ------------------------------
    aa_uniprot_files = OrderedDict()
    for ss in sss:
        aa_uniprot_files[ss] = opj(dir_prj_queries, 'aa_uniprot__' + ss +
                                   '.fasta')
        # ToDo: add support for the requery_after parameter.
        dnld_pfam_uniprot_seqs(ss, pfam_uniprot_acc[ss], aa_uniprot_files[ss],
                               dir_cache_prj)

    # User provided entrez query ---------------------------------------------
    prot_acc_user_from_query = OrderedDict()
    for ss in sss:
        entrez_queries = sss[ss]['entrez_search_queries']
        prot_acc_user_from_query[ss] = user_entrez_search(ss, entrez_queries,
                                                          dir_cache_prj,
                                                          requery_after)

    # User provided protein accessions ---------------------------------------
    prot_acc_user = OrderedDict()
    for ss in sss:
        print()
        prot_acc_all = sorted(set(sss[ss]['ncbi_accessions_aa'] +
                                  prot_acc_user_from_query[ss]))
        prot_acc_user[ss] = user_protein_accessions(ss, prot_acc_all,
                                                    dir_cache_prj, tax)

    # Download from NCBI if needed -------------------------------------------
    aa_prot_ncbi_files = OrderedDict()
    for ss in sss:
        aa_prot_ncbi_files[ss] = opj(dir_prj_queries, 'aa_prot_ncbi__' + ss +
                                     '.fasta')
        prot_acc_user[ss] = dnld_prot_seqs(ss, prot_acc_user[ss],
                                           aa_prot_ncbi_files[ss],
                                           dir_cache_prj)

    # User provided protein sequences ----------------------------------------
    aa_prot_user_files = OrderedDict()
    for ss in sss:
        user_queries = sss[ss]['fasta_files_aa']
        aa_prot_user_files[ss] = opj(dir_prj_queries, 'aa_prot_user__' + ss +
                                     '.fasta')
        user_aa_fasta(ss, user_queries, aa_prot_user_files[ss])

    # Combine all AA queries -------------------------------------------------
    print()
    aa_queries_files = OrderedDict()
    for ss in sss:
        aa_queries_files[ss] = opj(dir_prj_queries, 'aa_all__' + ss + '.fasta')
        combine_aa_fasta(ss, [aa_uniprot_files[ss], aa_prot_ncbi_files[ss],
                              aa_prot_user_files[ss]], aa_queries_files[ss])

    # Filter AA queries ------------------------------------------------------
    prot_acc_user_filtered = OrderedDict()
    for ss in sss:
        min_query_length = sss[ss]['min_query_length']
        max_query_length = sss[ss]['max_query_length']
        max_query_identity = sss[ss]['max_query_identity']

        # Dereplicate all queries
        filter_queries(ss, aa_queries_files[ss], min_query_length,
                       max_query_length, max_query_identity,
                       vsearch, prot_acc_user[ss], overwrite=True)

        # Dereplicate only NCBI queries. CDS for these will be downloaded
        # later for reference.
        if ope(aa_prot_ncbi_files[ss]):
            prot_acc_user_filtered[ss] = filter_queries(
                ss, aa_prot_ncbi_files[ss], min_query_length, max_query_length,
                max_query_identity, vsearch, prot_acc_user[ss],
                overwrite=False, logging=False)

    # Download SRA run metadata if needed ------------------------------------
    sra_runs_info, sras_acceptable = dnld_sra_info(sras, dir_cache_prj)

    # Download SRA run FASTQ files if needed ---------------------------------
    x, y, z = dnld_sra_fastq_files(sras_acceptable, sra_runs_info, dir_fq_data,
                                   fasterq_dump, THREADS, dir_temp)

    se_fastq_files_sra = x
    pe_fastq_files_sra = y
    sra_runs_info = z

    # User provided FASTQ files ----------------------------------------------
    se_fastq_files_usr, pe_fastq_files_usr = user_fastq_files(fq_se, fq_pe)

    # Collate FASTQ file info ------------------------------------------------
    se_fastq_files = se_fastq_files_sra.copy()
    se_fastq_files.update(se_fastq_files_usr)
    pe_fastq_files = pe_fastq_files_sra.copy()
    pe_fastq_files.update(pe_fastq_files_usr)

    def gc_tt(k, d, tax):
        taxid = d[k]['tax_id']

        gc = tax.genetic_code_for_taxid(taxid)

        d[k]['gc_id'] = gc
        d[k]['gc_tt'] = TranslationTable(gc)

        gc_mito = None
        tt_mito = None

        gc_plastid = None
        tt_plastid = None

        if tax.is_eukaryote(taxid) is True:
            gc_mito = tax.mito_genetic_code_for_taxid(taxid)
            if gc_mito != '0':
                tt_mito = TranslationTable(gc_mito)

            if tax.contains_plastid(taxid) is True:
                gc_plastid = tax.plastid_genetic_code_for_taxid(taxid)
                if gc_plastid != '0':
                    tt_plastid = TranslationTable(gc_plastid)

        d[k]['gc_id_mito'] = gc_mito
        d[k]['gc_tt_mito'] = tt_mito

        d[k]['gc_id_plastid'] = gc_plastid
        d[k]['gc_tt_plastid'] = tt_plastid

    for se in se_fastq_files:
        gc_tt(se, se_fastq_files, tax)

    for pe in pe_fastq_files:
        gc_tt(pe, pe_fastq_files, tax)

    # Minimum acceptable read length -----------------------------------------
    min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen)

    # File name patterns -----------------------------------------------------
    a, b, c, d, e = file_name_patterns()

    pe_trim_fq_file_patterns = a
    pe_trim_fa_file_patterns = b
    pe_blast_db_file_patterns = c
    pe_blast_results_file_patterns = d
    pe_vsearch_results_file_patterns = e

    # Run Trimmomatic --------------------------------------------------------
    run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, pe_trim_fq_file_patterns, THREADS)

    # Run Rcorrector ---------------------------------------------------------
    run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   THREADS, dir_temp, pe_trim_fq_file_patterns,
                   should_run_rcorrector)

    # Run Bowtie 2 -----------------------------------------------------------
    run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_bt2_data,
               bowtie2, bowtie2_build, THREADS, dir_temp, bt2_order,
               pe_trim_fq_file_patterns, tax, dir_cache_refseqs)

    # Run Kraken2 ------------------------------------------------------------
    run_kraken2(krkn_order, kraken2_dbs, se_fastq_files, pe_fastq_files,
                dir_fq_filter_krkn2_data, kraken_confidence, kraken2, THREADS,
                dir_temp, pe_trim_fq_file_patterns)

    se_fastq_files = OrderedDict(se_fastq_files)
    pe_fastq_files = OrderedDict(pe_fastq_files)

    se_fastq_files = OrderedDict(sorted(se_fastq_files.items(),
                                        key=lambda x: x[1]['filter_path_fq']))

    pe_fastq_files = OrderedDict(sorted(pe_fastq_files.items(),
                                        key=lambda x: x[1]['filter_path_fq']))

    # Stop After Filter ------------------------------------------------------
    if STOP_AFTER_FILTER is True:
        Log.wrn('Stopping after Kraken2/Bowtie2 filtering step as requested.')
        exit(0)

    # Convert filtered FASTQ files to FASTA ----------------------------------
    filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      pe_trim_fa_file_patterns)

    # Run makeblastdb on reads -----------------------------------------------
    makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, pe_blast_db_file_patterns)

    # Check if there are any query sequences.
    any_queries = False
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size > 0:
            any_queries = True
            break

    # Run tblastn on reads ---------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        changed_blast_1 = run_tblastn_on_reads(
            se_fastq_files, pe_fastq_files, aa_queries_files[ss], tblastn,
            blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc,
            blast_1_best_hit_overhang, blast_1_best_hit_score_edge,
            blast_1_max_target_seqs, dir_prj_blast_results_fa_trim,
            pe_blast_results_file_patterns, ss, THREADS, seqtk, vsearch,
            dir_cache_prj)

        if changed_blast_1 is True:
            if ope(dir_prj_vsearch_results_fa_trim):
                rmtree(dir_prj_vsearch_results_fa_trim)
            if ope(dir_prj_spades_assemblies):
                rmtree(dir_prj_spades_assemblies)
            if ope(dir_prj_blast_assmbl):
                rmtree(dir_prj_blast_assmbl)
            if ope(dir_prj_assmbl_blast_results):
                rmtree(dir_prj_assmbl_blast_results)
            if ope(dir_prj_transcripts):
                rmtree(dir_prj_transcripts)
            if ope(dir_prj_transcripts_combined):
                rmtree(dir_prj_transcripts_combined)

    prepare_output_directories(dir_out, prj_name)

    # Run vsearch on reads ---------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        print()
        Log.inf('Checking if Vsearch should be run:', ss)
        run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                             dir_prj_vsearch_results_fa_trim,
                             pe_vsearch_results_file_patterns, ss, seqtk)

    # Run SPAdes -------------------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            for se in se_fastq_files:
                se_fastq_files[se]['spades_assembly' + '__' + ss] = None
            for pe in pe_fastq_files:
                pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None
            continue
        print()
        Log.inf('Checking if SPAdes should be run:', ss)
        run_spades(se_fastq_files, pe_fastq_files, dir_prj_spades_assemblies,
                   spades, dir_temp, ss, THREADS, RAM)

    # Combine SPAdes and user provided assemblies ----------------------------
    assemblies = combine_assemblies(se_fastq_files, pe_fastq_files,
                                    user_assemblies, tax, sss)

    # Run makeblastdb on assemblies  -----------------------------------------
    makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb)

    if any_queries is False:
        Log.wrn('No query sequences were provided.')

    # Run tblastn on assemblies ----------------------------------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        should_run_tblastn = False
        for a in assemblies:
            assmbl_src = a['src']
            assmbl_name = a['name']
            if assmbl_src != 'user_fasta':
                if assmbl_name.endswith('__' + ss):
                    should_run_tblastn = True
                    break
            else:
                should_run_tblastn = True
                break

        if should_run_tblastn is False:
            print()
            Log.inf('Will not run BLAST. No transcripts exist:', ss)
            continue

        blast_2_evalue_ss = sss[ss]['blast_2_evalue']
        blast_2_max_hsps_ss = sss[ss]['blast_2_max_hsps']
        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']
        blast_2_best_hit_overhang_ss = sss[ss]['blast_2_best_hit_overhang']
        blast_2_best_hit_score_edge_ss = sss[ss]['blast_2_best_hit_score_edge']
        blast_2_max_target_seqs_ss = sss[ss]['blast_2_max_target_seqs']

        if blast_2_evalue_ss is None:
            blast_2_evalue_ss = blast_2_evalue
        if blast_2_max_hsps_ss is None:
            blast_2_max_hsps_ss = blast_2_max_hsps
        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc
        if blast_2_best_hit_overhang_ss is None:
            blast_2_best_hit_overhang_ss = blast_2_best_hit_overhang
        if blast_2_best_hit_score_edge_ss is None:
            blast_2_best_hit_score_edge_ss = blast_2_best_hit_score_edge
        if blast_2_max_target_seqs_ss is None:
            blast_2_max_target_seqs_ss = blast_2_max_target_seqs

        run_tblastn_on_assemblies(ss, assemblies, aa_queries_files[ss],
                                  tblastn, dir_prj_assmbl_blast_results,
                                  blast_2_evalue_ss, blast_2_max_hsps_ss,
                                  blast_2_qcov_hsp_perc_ss,
                                  blast_2_best_hit_overhang_ss,
                                  blast_2_best_hit_score_edge_ss,
                                  blast_2_max_target_seqs_ss, THREADS,
                                  dir_cache_prj, dir_prj_ips)

    # Prepare BLAST hits for analysis: find ORFs, translate ------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        min_target_orf_len_ss = sss[ss]['min_target_orf_length']
        max_target_orf_len_ss = sss[ss]['max_target_orf_length']
        organelle = sss[ss]['organelle']

        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']

        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc

        find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk,
                            dir_temp, prepend_assmbl, min_target_orf_len_ss,
                            max_target_orf_len_ss, allow_non_aug,
                            allow_no_strt_cod,
                            allow_no_stop_cod, tax, tax_group, tax_ids_user,
                            blast_2_qcov_hsp_perc_ss, organelle)

    # GFF3 files from kakapo results JSON files ------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        gff_from_json(ss, assemblies, dir_prj_ips,
                      dir_prj_transcripts_combined, prj_name)

    # Run InterProScan 5 -----------------------------------------------------
    if should_run_ipr is True:
        print()
        ss_names = tuple(sss.keys())

        # Determine the length of printed strings, for better spacing --------
        max_title_a_len = 0
        max_run_id_len = 0
        for a in assemblies:
            for ss in ss_names:
                if 'transcripts_aa_orf_fasta_file__' + ss not in a:
                    continue

                aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

                if aa_file is None:
                    continue

                assmbl_name = a['name']
                run_id = ss + '_' + assmbl_name
                max_run_id_len = max(len(run_id), max_run_id_len)

                seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

                # Only the first ORF (ORF001) of each sequence is considered
                # when measuring title lengths.
                for seq_def in tuple(seqs.keys()):
                    seq_def_prefix = seq_def.split(' ')[0]
                    if seq_def_prefix.endswith('ORF001'):
                        max_title_a_len = max(len(seq_def_prefix),
                                              max_title_a_len)

        max_title_a_len += 2
        max_run_id_len += 2
        # --------------------------------------------------------------------

        parallel_run_count = min(THREADS, len(ss_names))

        def run_inter_pro_scan_parallel(ss):
            if stat(aa_queries_files[ss]).st_size == 0:
                return

            run_inter_pro_scan(ss, assemblies, email, dir_prj_ips,
                               dir_cache_prj, parallel_run_count,
                               max_title_a_len, max_run_id_len)

            # GFF3 files from kakapo and InterProScan 5 results JSON files
            gff_from_json(ss, assemblies, dir_prj_ips,
                          dir_prj_transcripts_combined, prj_name)

        Parallel(n_jobs=parallel_run_count, verbose=0, require='sharedmem')(
            delayed(run_inter_pro_scan_parallel)(ss) for ss in ss_names)

    # Download CDS for NCBI protein queries ----------------------------------
    print()
    prot_cds_ncbi_files = OrderedDict()

    def dnld_cds_for_ncbi_prot_acc_parallel(ss):
        if stat(aa_queries_files[ss]).st_size == 0:
            return

        if ss not in prot_acc_user_filtered:
            return

        prot_cds_ncbi_files[ss] = opj(dir_prj_transcripts_combined, prj_name +
                                      '_ncbi_query_cds__' + ss + '.fasta')

        if len(prot_acc_user_filtered[ss]) > 0:
            dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user_filtered[ss],
                                       prot_cds_ncbi_files[ss], tax,
                                       dir_cache_prj)

    ss_names = tuple(sss.keys())
    Parallel(n_jobs=2, verbose=0, require='sharedmem')(
        delayed(dnld_cds_for_ncbi_prot_acc_parallel)(ss) for ss in ss_names)

    # ------------------------------------------------------------------------

    rmtree(dir_temp)

    # ------------------------------------------------------------------------

    rerun = input('\nRepeat ([y]/n)? ').lower().strip()
    if rerun.startswith('y') or rerun == '':
        print()
        return False
    else:
        print('\nExiting...')
        return True
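
# main() returns False when the user chooses to repeat and True when they
# choose to exit, which implies a driver loop along these lines (a sketch;
# the actual entry point may differ):

if __name__ == '__main__':
    done = False
    while not done:
        done = main()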
Example #25
0
def run_spades(se_fastq_files, pe_fastq_files, dir_spades_assemblies,
               spades, dir_temp, ss, threads, ram):

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if spades is None:
            Log.err('SPAdes is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_results = opj(dir_spades_assemblies, se + '__' + ss)
        fq_path = se_fastq_files[se]['vsearch_results_path' + '__' + ss]
        se_fastq_files[se]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', se)
            run_spades_se(spades,
                          out_dir=dir_results,
                          input_file=fq_path,
                          threads=threads,
                          memory=ram,
                          rna=True)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            se_fastq_files[se]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)

    for pe in pe_fastq_files:
        dir_results = opj(dir_spades_assemblies, pe + '__' + ss)
        fq_paths = pe_fastq_files[pe]['vsearch_results_path' + '__' + ss]
        pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', pe)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', pe)

            if osstat(fq_paths[0]).st_size > 0 and \
               osstat(fq_paths[1]).st_size > 0:

                run_spades_pe(spades,
                              out_dir=dir_results,
                              input_files=fq_paths,
                              threads=threads,
                              memory=ram,
                              rna=True)

            else:
                _ = opj(dir_temp, 'temp.fasta')
                combine_text_files(fq_paths, _)
                run_spades_se(spades,
                              out_dir=dir_results,
                              input_file=_,
                              threads=threads,
                              memory=ram,
                              rna=True)
                osremove(_)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            pe_fastq_files[pe]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)
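
# The PE branch above degrades to single-end assembly whenever either mate
# file is empty. A compact sketch of just that decision; run_pe, run_se and
# combine are stand-in callables, not kakapo functions:

import os

def assemble_pe_or_fallback(fq_paths, run_pe, run_se, combine, temp_file):
    """Run PE assembly only when both mate files contain reads."""
    if all(os.stat(p).st_size > 0 for p in fq_paths):
        run_pe(fq_paths)
    else:
        combine(fq_paths, temp_file)  # concatenate whatever reads exist
        run_se(temp_file)
        os.remove(temp_file)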
Example #26
0
def run_tblastn_on_reads(se_fastq_files, pe_fastq_files, aa_queries_file,
                         tblastn, blast_1_evalue, blast_1_max_hsps,
                         blast_1_qcov_hsp_perc, blast_1_best_hit_overhang,
                         blast_1_best_hit_score_edge, blast_1_max_target_seqs,
                         dir_blast_results_fa_trim, fpatt, ss, threads, seqtk,
                         vsearch, dir_cache_prj):

    changed_blast_1 = False

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running BLAST on reads:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)

        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)

        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    cache_file = opj(dir_cache_prj, 'blast_1_settings_cache__' + ss)

    pickled = dict()
    settings = {
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'queries': seq_records_to_dict(read_fasta(aa_queries_file,
                                                  SEQ_TYPE_AA))
    }

    Log.msg('evalue:', str(blast_1_evalue))
    Log.msg('max_hsps:', str(blast_1_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_1_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_1_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_1_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_1_max_target_seqs))
    print()

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, se)
        blast_db_path = se_fastq_files[se]['blast_db_path']
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        out_f_fasta = out_f.replace('.txt', '.fasta')
        se_fastq_files[se]['blast_results_path' + '__' + ss] = out_f_fasta
        genetic_code = se_fastq_files[se]['gc_id']

        if ope(out_f_fasta) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_f_fasta) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', se)

        else:
            changed_blast_1 = True
            make_dirs(dir_results)
            Log.msg('Running tblastn on: ' + basename(blast_db_path), ss)
            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=out_f,
                      evalue=blast_1_evalue,
                      max_hsps=blast_1_max_hsps,
                      qcov_hsp_perc=blast_1_qcov_hsp_perc,
                      best_hit_overhang=blast_1_best_hit_overhang,
                      best_hit_score_edge=blast_1_best_hit_score_edge,
                      max_target_seqs=blast_1_max_target_seqs,
                      db_genetic_code=genetic_code,
                      out_cols=BLST_RES_COLS_1)

            Log.inf('Extracting unique BLAST hits using Seqtk:', ss)

            keep_unique_lines_in_file(out_f)

            seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
            seqtk_fq_to_fa(seqtk, out_f_fastq, out_f_fasta)

            osremove(out_f)
            osremove(out_f_fastq)

            out_f_fasta_temp = out_f_fasta + '_temp'
            copyfile(out_f_fasta, out_f_fasta_temp)
            run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
            osremove(out_f_fasta_temp)

    for pe in pe_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, pe)
        blast_db_paths = pe_fastq_files[pe]['blast_db_path']
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        out_fs_fasta = [x.replace('.txt', '.fasta') for x in out_fs]
        out_f_fasta = opj(dir_results, pe + '__' + ss + '.fasta')
        pe_fastq_files[pe]['blast_results_path' + '__' + ss] = out_f_fasta
        genetic_code = pe_fastq_files[pe]['gc_id']

        if ope(out_f_fasta) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_f_fasta) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', pe)

        else:
            changed_blast_1 = True
            make_dirs(dir_results)
            pe_trim_files = zip(blast_db_paths, out_fs, fq_paths, out_fs_fastq,
                                out_fs_fasta)
            for x in pe_trim_files:
                Log.msg('Running tblastn on: ' + basename(x[0]), ss)
                run_blast(exec_file=tblastn,
                          task='tblastn',
                          threads=threads,
                          db_path=x[0],
                          queries_file=aa_queries_file,
                          out_file=x[1],
                          evalue=blast_1_evalue,
                          max_hsps=blast_1_max_hsps,
                          qcov_hsp_perc=blast_1_qcov_hsp_perc,
                          best_hit_overhang=blast_1_best_hit_overhang,
                          best_hit_score_edge=blast_1_best_hit_score_edge,
                          max_target_seqs=blast_1_max_target_seqs,
                          db_genetic_code=genetic_code,
                          out_cols=BLST_RES_COLS_1)

                Log.msg('Extracting unique BLAST hits using Seqtk:', ss)

                keep_unique_lines_in_file(x[1])

                seqtk_extract_reads(seqtk, x[2], x[3], x[1])
                seqtk_fq_to_fa(seqtk, x[3], x[4])

                osremove(x[1])
                osremove(x[3])

            combine_text_files(out_fs_fasta, out_f_fasta)

            out_f_fasta_temp = out_f_fasta + '_temp'
            copyfile(out_f_fasta, out_f_fasta_temp)
            run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
            osremove(out_f_fasta_temp)

            for x in out_fs_fasta:
                osremove(x)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

    return changed_blast_1
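
# The pickle-based cache above lets the function skip BLAST whenever both
# the outputs and the settings of the previous run are unchanged. The check,
# distilled into a sketch:

import os
import pickle

def results_are_current(out_file, cache_file, settings):
    """True when outputs exist and the cached settings match by value."""
    if not (os.path.exists(out_file) and os.path.exists(cache_file)):
        return False
    with open(cache_file, 'rb') as f:
        return pickle.load(f) == settings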
Example #27
0
def ss_file_parse(file_path):

    cfg = ConfigParser(delimiters=('=',),
                       allow_no_value=True,
                       empty_lines_in_values=True)
    cfg.optionxform = str
    cfg.SECTCRE = re.compile(r'\[\s*(?P<header>[^]]+?)\s*\]')

    try:
        cfg.read(file_path)
    except MissingSectionHeaderError:
        Log.err(
            'Missing section header(s) in the provided "Search Strategies" file:',
            file_path)
        exit(1)

    required_options = set(('organelle', 'min_query_length',
                            'max_query_length', 'max_query_identity',
                            'min_target_orf_length', 'max_target_orf_length'))

    ret_dict = OrderedDict()

    sections = cfg.sections()

    for s in sections:

        o = cfg.options(s)

        if not required_options <= set(o):
            missing = required_options - (required_options & set(o))
            Log.err(
                'Missing required option(s): ' + ', '.join(missing) +
                ' for search strategy', s)
            exit(1)

        organelle = cfg[s]['organelle']

        if organelle not in ('nucleus', 'plastid', 'mitochondrion'):
            Log.err('Organelle "' + organelle + '" should be one of:',
                    'nucleus, plastid, or mitochondrion.')
            exit(1)

        min_query_length = int(cfg[s]['min_query_length'])
        max_query_length = int(cfg[s]['max_query_length'])
        max_query_identity = float(cfg[s]['max_query_identity'])
        min_target_orf_length = int(cfg[s]['min_target_orf_length'])
        max_target_orf_length = int(cfg[s]['max_target_orf_length'])

        evalue = None
        max_hsps = None
        qcov_hsp_perc = None
        best_hit_overhang = None
        best_hit_score_edge = None
        max_target_seqs = None

        pfam_families = None
        ncbi_accessions_aa = None
        entrez_search_queries = None
        fasta_files_aa = None

        if cfg.has_option(s, 'evalue'):
            evalue = float(cfg[s]['evalue'])

        if cfg.has_option(s, 'max_hsps'):
            max_hsps = int(cfg[s]['max_hsps'])

        if cfg.has_option(s, 'qcov_hsp_perc'):
            qcov_hsp_perc = float(cfg[s]['qcov_hsp_perc'])

        if cfg.has_option(s, 'best_hit_overhang'):
            best_hit_overhang = float(cfg[s]['best_hit_overhang'])

        if cfg.has_option(s, 'best_hit_score_edge'):
            best_hit_score_edge = float(cfg[s]['best_hit_score_edge'])

        if cfg.has_option(s, 'max_target_seqs'):
            max_target_seqs = int(cfg[s]['max_target_seqs'])

        if cfg.has_option(s, 'pfam_families'):
            pfam_families = str(cfg[s]['pfam_families'])
            pfam_families = set(pfam_families.split('\n')) - \
                set(('', 'None'))
            pfam_families = _parse_pfam(pfam_entries=pfam_families,
                                        config_file_path=file_path)
            pfam_families = sorted(pfam_families)

        if cfg.has_option(s, 'ncbi_accessions_aa'):
            ncbi_accessions_aa = str(cfg[s]['ncbi_accessions_aa'])
            ncbi_accessions_aa = sorted(
                set(ncbi_accessions_aa.split('\n')) - set(('', 'None')))

        if cfg.has_option(s, 'entrez_search_queries'):
            entrez_search_queries = str(cfg[s]['entrez_search_queries'])
            entrez_search_queries = sorted(
                set(entrez_search_queries.split('\n')) - set(('', 'None')))

        if cfg.has_option(s, 'fasta_files_aa'):
            fasta_files_aa = str(cfg[s]['fasta_files_aa'])
            fasta_files_aa = set(fasta_files_aa.split('\n')) - \
                set(('', 'None'))
            fasta_files_aa = [abspath(expanduser(x)) for x in fasta_files_aa]
            fasta_files_aa = sorted(fasta_files_aa)

        section_dict = OrderedDict({
            'organelle': organelle,
            'min_query_length': min_query_length,
            'max_query_length': max_query_length,
            'max_query_identity': max_query_identity,
            'min_target_orf_length': min_target_orf_length,
            'max_target_orf_length': max_target_orf_length,
            'blast_2_evalue': evalue,
            'blast_2_max_hsps': max_hsps,
            'blast_2_qcov_hsp_perc': qcov_hsp_perc,
            'blast_2_best_hit_overhang': best_hit_overhang,
            'blast_2_best_hit_score_edge': best_hit_score_edge,
            'blast_2_max_target_seqs': max_target_seqs,
            'pfam_families': pfam_families,
            'ncbi_accessions_aa': ncbi_accessions_aa,
            'entrez_search_queries': entrez_search_queries,
            'fasta_files_aa': fasta_files_aa
        })

        ret_dict[s] = section_dict

    return ret_dict
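A note on the pattern above: ConfigParser's has_option makes each tblastn
setting optional, leaving it as None unless the configuration file supplies a
value. A minimal, self-contained sketch of that idea (the section and option
names are hypothetical, mirroring the ones parsed above):

from configparser import ConfigParser

cfg = ConfigParser(delimiters=('=',), allow_no_value=True)
cfg.read_string("""
[query_01]
organelle = nucleus
evalue = 1e-10
""")

s = 'query_01'
evalue = None
max_hsps = None

# Options absent from the file simply keep their None defaults.
if cfg.has_option(s, 'evalue'):
    evalue = float(cfg[s]['evalue'])
if cfg.has_option(s, 'max_hsps'):
    max_hsps = int(cfg[s]['max_hsps'])

print(evalue, max_hsps)  # -> 1e-10 None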
Example #28
def config_file_parse(file_path, taxonomy):

    cfg = ConfigParser(delimiters=('=',), allow_no_value=True)
    # Preserve the case of option names; several sections below use the
    # option keys themselves as data (file names, SRA accessions, etc.).
    cfg.optionxform = str

    try:
        cfg.read(file_path)
    except MissingSectionHeaderError:
        Log.err(
            'Missing section header(s) in the provided ' +
            'configuration file:', file_path)
        exit(1)

    try:
        # General
        project_name = cfg.get('General', 'project_name')
        email = cfg.get('General', 'email')
        output_directory = abspath(
            expanduser(cfg.get('General', 'output_directory')))
        should_run_ipr = cfg.getboolean('General', 'run_inter_pro_scan')
        should_run_rcorrector = cfg.getboolean('General', 'run_rcorrector')
        prepend_assmbl = cfg.getboolean(
            'General', 'prepend_assembly_name_to_sequence_name')
        kraken_confidence = cfg.getfloat('General', 'kraken_2_confidence')
        requery_after = cfg.getfloat('General', 'requery_after')
        requery_after = datetime.timedelta(days=requery_after)

        # Target filters
        allow_non_aug = cfg.getboolean('Target filters',
                                       'allow_non_aug_start_codon')
        allow_no_strt_cod = cfg.getboolean('Target filters',
                                           'allow_missing_start_codon')
        allow_no_stop_cod = cfg.getboolean('Target filters',
                                           'allow_missing_stop_codon')

        # Query taxonomic group
        tax_group_raw = cfg.items('Query taxonomic group')

        if len(tax_group_raw) != 1:
            raise Exception('Exactly one taxonomic group should be listed.')

        tax_group = tax_group_raw[0][0].lower()
        tax_group_name = tax_group.title()

        group_tax_ids = {
            'animals': 33208,
            'archaea': 2157,
            'bacteria': 2,
            'fungi': 4751,
            'plants': 33090,
            'viruses': 10239
        }

        tax_group = group_tax_ids[tax_group]

        # Target SRA accessions
        sras = cfg.items('Target SRA accessions')
        sras = [x[0] for x in sras]

        all_tax_ids = set()

        # Target FASTQ files
        fastq_temp = cfg.items('Target FASTQ files')

        fq_pe = []
        fq_se = []

        for entry in fastq_temp:

            key = entry[0]
            val = entry[1]

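            # Entries may take the form 'taxon:path'; when the taxon part is
            # missing, infer genus (and species) from the file name.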
            val = val.split(':')
            if len(val) == 1:
                tmp_genus_species = basename(val[0]).split('_')
                if len(tmp_genus_species) == 1:
                    genus = tmp_genus_species[0]
                    val = [genus, val[0]]
                elif len(tmp_genus_species) >= 2:
                    genus_species = tmp_genus_species[
                        0] + ' ' + tmp_genus_species[1]
                    val = [genus_species, val[0]]
                else:
                    val = ['', val[0]]

            taxa_temp = [val[0]]

            # FixMe: It is possible for tax_id to be None!
            #        What happens then?
            tax_id = _parse_taxa(taxa=taxa_temp,
                                 tax_group=tax_group,
                                 taxonomy=taxonomy,
                                 config_file_path=file_path)[0]

            # See FixMe above.
            if tax_id is None:
                tax_id = tax_group

            if key.startswith('pe_'):
                f_name = basename(val[1])
                d_path = abspath(expanduser(dirname(val[1])))
                # Translate the shell-style '*' wildcard into its regex form.
                pattern = re.escape(f_name).replace('\\*', '.*')
                try:
                    files, err = list_of_files_at_path(d_path)
                except Exception:
                    Log.err('Could not list files at the path:', d_path)
                    exit(1)
                pe = [
                    f for f in files
                    if re.match(pattern, basename(f)) is not None
                ]
                pe.sort()
                pe = [join(d_path, f) for f in pe]
                fq_pe.append([tax_id, pe])

            elif key.startswith('se_'):
                se = abspath(expanduser(val[1]))
                fq_se.append([tax_id, se])

            # See FixMe above.
            if tax_id != tax_group:
                all_tax_ids.add(tax_id)

        # Target assemblies: FASTA files (DNA)
        assmbl_temp = cfg.items('Target assemblies: FASTA files (DNA)')
        assmbl_temp = [x[0].split(':') for x in assmbl_temp]

        for i, val in enumerate(copy(assmbl_temp)):
            if len(val) == 1:
                tmp_genus_species = basename(val[0]).split('_')
                if len(tmp_genus_species) == 1:
                    genus = tmp_genus_species[0]
                    assmbl_temp[i] = [genus, val[0]]
                elif len(tmp_genus_species) >= 2:
                    genus_species = tmp_genus_species[
                        0] + ' ' + tmp_genus_species[1]
                    assmbl_temp[i] = [genus_species, val[0]]
                else:
                    assmbl_temp[i] = ['', val[0]]

        taxa_temp = [x[0] for x in assmbl_temp]
        taxa_temp = [x.split('.')[0] for x in taxa_temp]

        # FixMe: It is possible for one of the tax_ids to be None!
        #        What happens then?
        tax_ids = _parse_taxa(taxa=taxa_temp,
                              tax_group=tax_group,
                              taxonomy=taxonomy,
                              config_file_path=file_path)

        assmbl_temp = [abspath(expanduser(x[1])) for x in assmbl_temp]
        assmbl_temp = list(zip(tax_ids, assmbl_temp))

        assmbl = list()
        tax_ids = list()
        all_assemblies_found = True
        for i, a in enumerate(copy(assmbl_temp)):
            # See FixMe above.
            a = list(a)
            if a[0] is None:
                a[0] = tax_group
            tax_ids.append(a[0])
            a_path = a[1]
            if not ope(a_path):
                Log.err('Cannot find the assembly file:', a_path)
                all_assemblies_found = False
            assmbl.append(tuple(a))

        if all_assemblies_found is False:
            Log.err('Stopping.')
            exit(1)

        for tax_id in tax_ids:
            # See FixMe above.
            if tax_id != tax_group:
                all_tax_ids.add(tax_id)
        all_tax_ids = tuple(sorted(all_tax_ids))

        # Bowtie2 filter order
        bt2_sctn = 'Bowtie2 filter order'
        bt2_order = OrderedDict()
        if cfg.has_section(bt2_sctn):
            bt2_order = OrderedDict(cfg.items(bt2_sctn))

        # Kraken2 filter order
        krkn_sctn = 'Kraken2 filter order'
        krkn_order = []
        if cfg.has_section(krkn_sctn):
            krkn_order = cfg.items(krkn_sctn)

        # BLAST SRA/FASTQ
        blast_1_evalue = cfg.getfloat('BLAST SRA/FASTQ', 'evalue')
        blast_1_max_hsps = cfg.getint('BLAST SRA/FASTQ', 'max_hsps')
        blast_1_qcov_hsp_perc = cfg.getfloat('BLAST SRA/FASTQ',
                                             'qcov_hsp_perc')
        blast_1_best_hit_overhang = cfg.getfloat('BLAST SRA/FASTQ',
                                                 'best_hit_overhang')
        blast_1_best_hit_score_edge = cfg.getfloat('BLAST SRA/FASTQ',
                                                   'best_hit_score_edge')
        blast_1_max_target_seqs = cfg.getint('BLAST SRA/FASTQ',
                                             'max_target_seqs')

        # BLAST assemblies
        blast_2_evalue = cfg.getfloat('BLAST assemblies', 'evalue')
        blast_2_max_hsps = cfg.getint('BLAST assemblies', 'max_hsps')
        blast_2_qcov_hsp_perc = cfg.getfloat('BLAST assemblies',
                                             'qcov_hsp_perc')
        blast_2_best_hit_overhang = cfg.getfloat('BLAST assemblies',
                                                 'best_hit_overhang')
        blast_2_best_hit_score_edge = cfg.getfloat('BLAST assemblies',
                                                   'best_hit_score_edge')
        blast_2_max_target_seqs = cfg.getint('BLAST assemblies',
                                             'max_target_seqs')

    except NoSectionError as err:
        Log.err(
            'Missing required section "' + err.section +
            '" in configuration file:', file_path)
        exit(1)

    except NoOptionError as err:
        Log.err(
            'Missing required option "' + err.option + '" under section "' +
            err.section + '" in configuration file:', file_path)
        exit(1)

    # ------------------------------------------------------------------------

    ret_dict = {
        'allow_no_stop_cod': allow_no_stop_cod,
        'allow_no_strt_cod': allow_no_strt_cod,
        'allow_non_aug': allow_non_aug,
        'assmbl': assmbl,
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'blast_2_evalue': blast_2_evalue,
        'blast_2_max_hsps': blast_2_max_hsps,
        'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
        'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
        'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
        'blast_2_max_target_seqs': blast_2_max_target_seqs,
        'email': email,
        'requery_after': requery_after,
        'fq_pe': fq_pe,
        'fq_se': fq_se,
        'should_run_rcorrector': should_run_rcorrector,
        'should_run_ipr': should_run_ipr,
        'bt2_order': bt2_order,
        'kraken_confidence': kraken_confidence,
        'krkn_order': krkn_order,
        'output_directory': output_directory,
        'prepend_assmbl': prepend_assmbl,
        'project_name': project_name,
        'sras': sras,
        'tax_group': tax_group,
        'tax_group_name': tax_group_name,
        'tax_ids': all_tax_ids
    }

    return ret_dict
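A note on the sections parsed above: allow_no_value=True lets a bare accession
stand alone as an option key, and setting optionxform to str keeps those keys
case-sensitive, so the keys themselves can carry data. A small runnable sketch
of that pattern (section names mirror the ones above; the values are made up):

from configparser import ConfigParser

cfg = ConfigParser(delimiters=('=',), allow_no_value=True)
cfg.optionxform = str  # do not lower-case option names

cfg.read_string("""
[Target SRA accessions]
SRR1234567
SRR7654321

[Target FASTQ files]
pe_sample_a = Genus_species:reads/sample_a_R*.fastq
""")

# Bare keys parse with a value of None; items() yields (key, value) tuples.
print([k for k, v in cfg.items('Target SRA accessions')])
# -> ['SRR1234567', 'SRR7654321']

for key, val in cfg.items('Target FASTQ files'):
    taxon, path = val.split(':')
    print(key, taxon, path)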
Example #29
def run_tblastn_on_assemblies(ss, assemblies, aa_queries_file, tblastn,
                              dir_prj_assmbl_blast_results, blast_2_evalue,
                              blast_2_max_hsps, blast_2_qcov_hsp_perc,
                              blast_2_best_hit_overhang,
                              blast_2_best_hit_score_edge,
                              blast_2_max_target_seqs, threads, dir_cache_prj,
                              dir_prj_ips):

    if len(assemblies) > 0:
        print()
        Log.inf('Running BLAST on assemblies:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(1)
    else:
        Log.wrn('There are no assemblies. Nothing to do, stopping.')
        exit(0)

    cache_file = opj(dir_cache_prj, 'blast_2_settings_cache__' + ss)

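    # Settings pickled by the previous run, if any; used below to decide
    # whether BLAST needs to be re-run.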
    pickled = dict()
    settings = {'blast_2_evalue': blast_2_evalue,
                'blast_2_max_hsps': blast_2_max_hsps,
                'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
                'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
                'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
                'blast_2_max_target_seqs': blast_2_max_target_seqs,
                'queries': seq_records_to_dict(
                    read_fasta(aa_queries_file, SEQ_TYPE_AA))}

    Log.msg('evalue:', str(blast_2_evalue))
    Log.msg('max_hsps:', str(blast_2_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_2_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_2_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_2_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_2_max_target_seqs))
    print()

    for a in assemblies:

        assmbl_src = a['src']
        assmbl_name = a['name']

        if assmbl_src != 'user_fasta':
            if assmbl_name.endswith('__' + ss):
                assmbl_name = assmbl_name.replace('__' + ss, '')
            else:
                continue

        assmbl_blast_db_path = a['blast_db_path']
        assmbl_genetic_code = a['gc_id']

        ips_json_dump_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' + ss +
                                 '.json')

        out_blast_path = opj(dir_prj_assmbl_blast_results,
                             assmbl_name + '__' + ss + '.tsv')

        if ope(out_blast_path) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_blast_path) and pickled == settings:
            # The BLAST settings and query sequences have not changed since
            # the previous run; reuse the existing results.
            Log.msg('BLAST results already exist:', assmbl_name)

        else:
            Log.msg('Running tblastn on: ' + assmbl_name, ss)

            if ope(ips_json_dump_path):
                osremove(ips_json_dump_path)

            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=assmbl_blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=out_blast_path,
                      evalue=blast_2_evalue,
                      max_hsps=blast_2_max_hsps,
                      qcov_hsp_perc=blast_2_qcov_hsp_perc,
                      best_hit_overhang=blast_2_best_hit_overhang,
                      best_hit_score_edge=blast_2_best_hit_score_edge,
                      max_target_seqs=blast_2_max_target_seqs,
                      db_genetic_code=assmbl_genetic_code,
                      out_cols=BLST_RES_COLS_2)

        a['blast_hits_aa__' + ss] = parse_blast_results_file(out_blast_path,
                                                             BLST_RES_COLS_2)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)
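The caching above boils down to: pickle the settings used for a run, and on
the next run skip the expensive step if the unpickled settings still match. A
stripped-down, self-contained sketch (the cache file name and expensive_step
are hypothetical):

import os
import pickle

CACHE = 'settings_cache.pickle'
settings = {'evalue': 1e-10, 'max_hsps': 1}

cached = None
if os.path.exists(CACHE):
    with open(CACHE, 'rb') as f:
        cached = pickle.load(f)

if cached == settings:
    print('Settings unchanged; reusing previous results.')
else:
    print('Settings changed; re-running the expensive step.')
    # expensive_step(settings)  # hypothetical stand-in for run_blast

with open(CACHE, 'wb') as f:
    pickle.dump(settings, f, protocol=pickle.HIGHEST_PROTOCOL)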
Example #30
def dep_check_rcorrector(dir_dep, force):
    url = 'https://github.com/karolisr/Rcorrector/archive/master.tar.gz'
    dnld_path = opj(dir_dep, 'rcorrector.tar.gz')

    try:
        try:
            jellyfish = which('jellyfish')
            run([jellyfish, '--help'])
        except Exception:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            jellyfish = opj(dir_bin, 'jellyfish', 'bin', 'jellyfish')
            raise
        if force is True:
            # A bare raise with no active exception triggers a RuntimeError,
            # which the outer handler catches; this forces a fresh download.
            raise
        rcorrector = which('run_rcorrector.pl')
        run([rcorrector, '-version'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            try:
                rcorrector = opj(dir_bin, 'run_rcorrector.pl')
                run([rcorrector, '-version'])
            except Exception:
                Log.wrn('Rcorrector was not found on this system, trying to '
                        'download.')
                raise
            try:
                run([jellyfish, '--version'])
            except Exception:
                Log.wrn(
                    'jellyfish is required by Rcorrector, but was not found. '
                    'Trying to download and recompile Rcorrector and '
                    'jellyfish.')
                raise
        except Exception:
            if ope(dnld_path):
                remove(dnld_path)
            if dir_bin != opj(dir_dep, '') and ope(dir_bin):
                rmtree(dir_bin)
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            try:
                Log.wrn('Compiling Rcorrector.')
                run('make', cwd=dir_bin)
                rcorrector = opj(dir_bin, 'run_rcorrector.pl')
                jellyfish = opj(dir_bin, 'jellyfish', 'bin', 'jellyfish')
                chmod(
                    rcorrector, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
                    | stat.S_IROTH | stat.S_IXOTH)
                run([rcorrector, '-version'])
                if not ope(jellyfish):
                    jellyfish = which('jellyfish')
                run([jellyfish, '--version'])
            except Exception:
                Log.err('Something went wrong while trying to compile '
                        'Rcorrector.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/karolisr/Rcorrector')
                return None

    v = get_dep_version([rcorrector, '-version'], r'^Rcorrector\sv([\d\.]*)')
    Log.msg('Rcorrector is available:', v + ' ' + rcorrector)

    return rcorrector
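All of the dep_check_* examples share the same shape: look for the tool on
PATH, verify it runs, and only then fall back to a locally downloaded copy. A
minimal standard-library sketch of that core idea (find_executable and the
'bin' directory layout are hypothetical simplifications):

import os
import subprocess
from shutil import which

def find_executable(name, dir_dep):
    # Prefer a copy already on PATH, but verify that it actually runs.
    exe = which(name)
    if exe is not None:
        try:
            subprocess.run([exe, '--version'], check=True,
                           capture_output=True)
            return exe
        except Exception:
            pass
    # Fall back to a locally downloaded copy, if one exists.
    local = os.path.join(dir_dep, 'bin', name)
    return local if os.path.exists(local) else None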