Example #1
def dep_check_blast(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                    force):
    if os_id == 'mac':
        url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.10.1/'
               'ncbi-blast-2.10.1+-x64-macosx.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')

    dnld_path = opj(dir_dep, 'ncbi-blast.tar.gz')

    makeblastdb = None
    blastn = None
    tblastn = None

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        makeblastdb = which('makeblastdb')
        blastn = which('blastn')
        tblastn = which('tblastn')
        run([makeblastdb, '-help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')
            run([makeblastdb, '-help'])
        except Exception:
            Log.wrn('BLAST+ was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')

            if not ope(makeblastdb) or \
                    not ope(blastn) or \
                    not ope(tblastn):
                Log.err('Could not download BLAST+.')
                return None, None, None

    regexp = r'\sblast\s([\d\.]*)'
    v = get_dep_version([makeblastdb, '-version'], regexp)
    Log.msg('makeblastdb is available:', v + ' ' + makeblastdb)
    v = get_dep_version([blastn, '-version'], regexp)
    Log.msg('blastn is available:', v + ' ' + blastn)
    v = get_dep_version([tblastn, '-version'], regexp)
    Log.msg('tblastn is available:', v + ' ' + tblastn)

    return makeblastdb, blastn, tblastn
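
The probe-and-parse pattern above (run the tool with a version flag, scrape the version out with a regex) is reusable on its own. A minimal standalone sketch; probe_version is a hypothetical stand-in for kakapo's get_dep_version, exercised on python3 only because it is likely to be on PATH:

import re
import subprocess


def probe_version(cmd, regexp):
    """Run cmd, scrape a version string; return '?' if nothing matches."""
    p = subprocess.run(cmd, capture_output=True, text=True)
    m = re.search(regexp, p.stdout + p.stderr)
    return m.group(1) if m else '?'


print(probe_version(['python3', '--version'], r'Python\s([\d\.]+)'))
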
Example #2
def dep_check_bowtie2(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-macos-x86_64.zip/download')
    elif os_id == 'linux':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-linux-x86_64.zip/download')

    dnld_path = opj(dir_dep, 'bowtie2.zip')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        bowtie2 = which('bowtie2')
        bowtie2_build = which('bowtie2-build')
        run([bowtie2, '-h'])
        run([bowtie2_build, '-h'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')
            run([bowtie2, '-h'])
            run([bowtie2_build, '-h'])
        except Exception:
            Log.wrn('Bowtie 2 was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')

            bowtie2_execs = ('', '-align-l', '-align-l-debug', '-align-s',
                             '-align-s-debug', '-build', '-build-l',
                             '-build-l-debug', '-build-s', '-build-s-debug',
                             '-inspect', '-inspect-l', '-inspect-l-debug',
                             '-inspect-s', '-inspect-s-debug')

            for bt2exe in bowtie2_execs:
                chmod(
                    bowtie2 + bt2exe, stat.S_IRWXU | stat.S_IRGRP
                    | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

            if not ope(bowtie2):
                Log.err('Could not download Bowtie 2.')
                return None, None

    regexp = r'^.*?version\s([\d\.]*)'
    v = get_dep_version([bowtie2, '--version'], regexp)
    Log.msg('bowtie2 is available:', v + ' ' + bowtie2)
    v = get_dep_version([bowtie2_build, '--version'], regexp)
    Log.msg('bowtie2-build is available:', v + ' ' + bowtie2_build)

    return bowtie2, bowtie2_build
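
The chmod loop above assembles its permission mask from stat flags. That mask (user rwx, group and others rx) is simply 0o755; a self-contained check using a throwaway temp file:

import os
import stat
import tempfile

mode = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
        | stat.S_IROTH | stat.S_IXOTH)
assert mode == 0o755

with tempfile.NamedTemporaryFile(delete=False) as f:
    path = f.name
os.chmod(path, mode)
print(oct(stat.S_IMODE(os.stat(path).st_mode)))  # 0o755
os.remove(path)
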
Example #3
def _use_memory_mapping(db_path):
    db_size = stat(opj(db_path, 'hash.k2d')).st_size / (1024**3)
    mem_max = RAM / 3
    if mem_max < db_size:
        db_name = splitext(basename(db_path))[0]
        Log.wrn('Not enough memory for Kraken2 database {}. '
                'Switching to a slower memory-mapping mode.'.format(db_name))
        return '--memory-mapping'
    else:
        return None
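
The same size-versus-RAM rule, isolated into a hypothetical helper (RAM in the original is a module-level constant in GiB; here it is passed in explicitly):

import os


def needs_memory_mapping(db_file, ram_gib):
    # Fall back to --memory-mapping when the database hash exceeds a
    # third of the available RAM.
    db_gib = os.stat(db_file).st_size / 1024 ** 3
    if ram_gib / 3 < db_gib:
        return '--memory-mapping'
    return None
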
Example #4
def dep_check_sra_toolkit(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                          force):
    if os_id == 'mac':
        url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
               'sratoolkit.2.10.8-mac64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-ubuntu64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-centos_linux64.tar.gz')

    dnld_path = opj(dir_dep, 'sra-toolkit.tar.gz')

    fasterq_dump = None
    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        fasterq_dump = which('fasterq-dump')
        # str.strip('bin') would strip the characters b, i, n, not the
        # 'bin' suffix; take the parent directory instead.
        dir_bin = dirname(dirname(fasterq_dump))
        _ensure_vdb_cfg(dir_bin)
        run(fasterq_dump)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            _ensure_vdb_cfg(dir_bin)
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            run(fasterq_dump)
        except Exception:
            Log.wrn('SRA Toolkit was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')

            _ensure_vdb_cfg(dir_bin)

            if not ope(fasterq_dump):
                Log.err('Could not download SRA Toolkit.')
                return None

    v = get_dep_version([fasterq_dump, '--version'], r':\s([\d\.]*)')
    if v == '?':
        v = get_dep_version([fasterq_dump, '--version'], r'version\s([\d\.]*)')
    Log.msg('fasterq-dump is available:', v + ' ' + fasterq_dump)

    return fasterq_dump
Example #5
def dep_check_kakapolib(force=False, quiet=False):
    kkpl = KAKAPOLIB
    if not ope(kkpl):
        if quiet is False:
            Log.wrn('Compiling kakapolib.')
        run(['make', 'install'], cwd=DIR_C_SRC)
    if ope(kkpl):
        if quiet is False:
            Log.msg('kakapolib is available:', kkpl)
    else:
        Log.err('Compilation of kakapolib failed.')
        return None
    return ctypes.CDLL(kkpl)
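
ctypes.CDLL at the end is the standard way to load a compiled shared library. A minimal sketch of the same call that works on most Unix-like systems, with libc as a stand-in for kakapolib:

import ctypes
import ctypes.util

libc = ctypes.CDLL(ctypes.util.find_library('c'))
print(libc.abs(-42))  # calls the C library's abs(): 42
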
Example #6
def dep_check_seqtk(dir_dep, force):
    url = 'https://github.com/lh3/seqtk/archive/master.zip'
    dnld_path = opj(dir_dep, 'seqtk.zip')
    dir_bin = opj(dir_dep, 'seqtk-master')

    fp = NamedTemporaryFile()
    fp.write(str.encode('>seq' + lns + 'ATGC'))
    fp.seek(0)
    cmd = ['', 'seq', '-r', fp.name]

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        seqtk = which('seqtk')
        cmd[0] = seqtk
        run(cmd, do_not_raise=True)
    except Exception:
        try:
            seqtk = opj(dir_bin, 'seqtk')
            cmd[0] = seqtk
            run(cmd, do_not_raise=True)
        except Exception:
            Log.wrn('Seqtk was not found on this system, trying to download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()
            try:
                Log.wrn('Compiling Seqtk.')
                run('make', cwd=dir_bin)
                run(cmd, do_not_raise=True)
            except Exception:
                replace_line_in_file(opj(dir_bin, 'Makefile'), 'CC=gcc',
                                     'CC=cc')
                try:
                    run('make', cwd=dir_bin)
                    run(cmd, do_not_raise=True)
                except Exception:
                    Log.err(
                        'Something went wrong while trying to compile Seqtk.')
                    Log.msg('Try downloading and installing it manually from: '
                            'https://github.com/lh3/seqtk')
                    fp.close()
                    return None

    fp.close()

    v = get_dep_version([seqtk], r'Version\:\s([\d\w\.\-]*)')
    Log.msg('Seqtk is available:', v + ' ' + seqtk)

    return seqtk
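
The self-test above feeds Seqtk a one-record FASTA written to a temporary file. The same write/seek pattern, standalone (lns in the original is kakapo's line-separator constant; '\n' is assumed here):

from tempfile import NamedTemporaryFile

with NamedTemporaryFile(mode='w+', suffix='.fasta') as fp:
    fp.write('>seq\nATGC')
    fp.seek(0)        # rewind so readers see the content from the start
    print(fp.read())  # a subprocess would be given fp.name instead
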
Example #7
def dep_check_vsearch(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                      force):
    if os_id == 'mac':
        url = ('https://github.com/torognes/vsearch/releases/download/v2.15.0/'
               'vsearch-2.15.0-macos-x86_64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')

    dnld_path = opj(dir_dep, 'vsearch.tar.gz')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        vsearch = which('vsearch')
        run(vsearch)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
            vsearch = opj(dir_bin, 'bin', 'vsearch')
            run(vsearch)
        except Exception:
            Log.wrn(
                'Vsearch was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
                vsearch = opj(dir_bin, 'bin', 'vsearch')
                if not ope(vsearch):
                    Log.err('Could not download Vsearch.')
                    return None
                else:
                    run(vsearch)
            except Exception:
                Log.err('Vsearch was downloaded, but does not execute.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/torognes/vsearch')
                return None

    v = get_dep_version([vsearch, '-version'], r'vsearch\sv([\d\.]*)')
    Log.msg('Vsearch is available:', v + ' ' + vsearch)

    return vsearch
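
The download/extract/verify fallback above can be exercised offline by building the archive locally. A sketch under that assumption; the payload and archive layout are made up:

import os
import tarfile
import tempfile

with tempfile.TemporaryDirectory() as d:
    payload = os.path.join(d, 'vsearch')
    with open(payload, 'w') as f:
        f.write('#!/bin/sh\necho vsearch\n')
    tgz = os.path.join(d, 'vsearch.tar.gz')
    with tarfile.open(tgz, 'w:gz') as tar:
        tar.add(payload, arcname='vsearch-x/bin/vsearch')
    with tarfile.open(tgz, 'r:gz') as tar:  # 'with' closes on error, too
        tar.extractall(d)
    binary = os.path.join(d, 'vsearch-x', 'bin', 'vsearch')
    print(os.path.exists(binary))  # the ope() existence check above
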
Example #8
def makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Building BLAST databases for reads.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, se)
        fa_path = se_fastq_files[se]['filter_path_fa']
        out_f = opj(dir_blast_fa_trim_sample, se)
        se_fastq_files[se]['blast_db_path'] = out_f

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', se)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            Log.msg(basename(fa_path))
            make_blast_db(exec_file=makeblastdb,
                          in_file=fa_path,
                          out_file=out_f,
                          title=se,
                          dbtype='nucl')

    for pe in pe_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, pe)
        fa_paths = pe_fastq_files[pe]['filter_path_fa']
        out_fs = [x.replace('@D@', dir_blast_fa_trim_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['blast_db_path'] = out_fs

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', pe)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            pe_trim_files = zip(fa_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                make_blast_db(exec_file=makeblastdb,
                              in_file=x[0],
                              out_file=x[1],
                              title=basename(x[1]),
                              dbtype='nucl')
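
The @D@ and @N@ tokens in fpatt are directory and sample-name placeholders filled in per sample. The substitution in isolation, with hypothetical pattern values:

fpatt = ['@D@/@N@_paired_1', '@D@/@N@_paired_2',
         '@D@/@N@_unpaired_1', '@D@/@N@_unpaired_2']


def render(patterns, directory, name):
    return [p.replace('@D@', directory).replace('@N@', name)
            for p in patterns]


print(render(fpatt, '/tmp/blast', 'SRR123'))
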
Example #9
def dep_check_trimmomatic(dir_dep):
    url = ('http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/'
           'Trimmomatic-0.39.zip')
    dnld_path = opj(dir_dep, 'Trimmomatic-0.39.zip')
    dir_bin = opj(dir_dep, 'Trimmomatic-0.39')
    trimmomatic = opj(dir_bin, 'trimmomatic-0.39.jar')

    if not ope(trimmomatic):
        download_file(url, dnld_path)
        zip_ref = zipfile.ZipFile(dnld_path, 'r')
        zip_ref.extractall(dir_dep)
        zip_ref.close()

    if not ope(trimmomatic):
        Log.err('Could not download Trimmomatic.')
        return None, None

    v = get_dep_version(['java', '-jar', trimmomatic, '-version'], r'\d+\.\d+')
    Log.msg('Trimmomatic is available:', v + ' ' + trimmomatic)

    path_adapters = _write_trimmomatic_adapters_file(dir_dep)

    return trimmomatic, path_adapters
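
ZipFile is opened and closed manually above; it also supports the context-manager form, which guarantees the handle is closed even if extraction fails. A minimal equivalent sketch:

import zipfile


def extract_zip(archive, dest):
    with zipfile.ZipFile(archive, 'r') as zf:
        zf.extractall(dest)
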
Example #10
def makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb):
    if len(assemblies) > 0:
        print()
        Log.inf('Building BLAST databases for assemblies.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for a in assemblies:
        assmbl_name = a['name']

        assmbl_blast_db_dir = opj(dir_prj_blast_assmbl, assmbl_name)
        assmbl_blast_db_file = opj(assmbl_blast_db_dir, assmbl_name)

        a['blast_db_path'] = assmbl_blast_db_file

        if ope(assmbl_blast_db_dir):
            Log.msg('BLAST database already exists:', assmbl_name)
        else:
            Log.msg(assmbl_name)
            make_dirs(assmbl_blast_db_dir)
            make_blast_db(exec_file=makeblastdb,
                          in_file=a['path'],
                          out_file=assmbl_blast_db_file,
                          title=assmbl_name)
Example #11
def filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Converting FASTQ to FASTA using Seqtk.')
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, se)
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_fa_trim_data_sample, se + '.fasta')
        se_fastq_files[se]['filter_path_fa'] = out_f

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', se)
        else:
            make_dirs(dir_fa_trim_data_sample)
            Log.msg(basename(fq_path))
            seqtk_fq_to_fa(seqtk, fq_path, out_f)

    for pe in pe_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, pe)
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_fa_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['filter_path_fa'] = out_fs

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', pe)
        else:
            make_dirs(dir_fa_trim_data_sample)
            pe_trim_files = zip(fq_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                seqtk_fq_to_fa(seqtk, x[0], x[1])
Example #12
def dep_check_spades(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Darwin.tar.gz')
    elif os_id == 'linux':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Linux.tar.gz')

    dnld_path = opj(dir_dep, 'SPAdes.tar.gz')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        spades = which('spades.py')
        run([PY3, spades])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
            spades = opj(dir_bin, 'bin', 'spades.py')
            run([PY3, spades])
        except Exception:
            Log.wrn('SPAdes was not found on this system, trying to download.')
            try:
                download_file(url, dnld_path)
                tar_ref = tarfile.open(dnld_path, 'r:gz')
                tar_ref.extractall(dir_dep)
                tar_ref.close()
            except Exception:
                Log.err('Could not download SPAdes.')
                return None
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
                spades = opj(dir_bin, 'bin', 'spades.py')
                # replace_line_in_file(spades,
                #                      '#!/usr/bin/env python',
                #                      '#!/usr/bin/env python3')
                if ope(spades):
                    run([PY3, spades])
                else:
                    Log.err('Could not download SPAdes.')
                    return None
            except Exception:
                Log.err('SPAdes was downloaded, but does not execute.')
                return None

    v = get_dep_version([PY3, spades, '--version'], r'^.*SPAdes.*v([\d\.]*)')
    Log.msg('SPAdes is available:', v + ' ' + spades)

    return spades
Example #13
def _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2, bowtie2_build):

    dbs = OrderedDict()

    for x in bt2_order:
        db_path_ok = False

        if x == MT:
            if taxonomy.is_eukaryote(taxid) is True:
                if bt2_order[MT] == '':
                    dbs[MT] = MT
                    db_path_ok = True

        elif x == PT:
            if taxonomy.is_eukaryote(taxid) is True:
                if taxonomy.contains_plastid(taxid) is True:
                    if bt2_order[PT] == '':
                        dbs[PT] = PT
                        db_path_ok = True

        if db_path_ok is False:
            db_path = bt2_order[x]
            if ope(db_path) and isfile(db_path):
                dbs[x] = db_path
            else:
                Log.err('File not found:', db_path)
                exit(1)

    if len(dbs) > 0:

        if bowtie2 is None:
            Log.err('bowtie2 is not available. Cannot continue. Exiting.')
            exit(0)

        if bowtie2_build is None:
            Log.err('bowtie2-build is not available. '
                    'Cannot continue. Exiting.')
            exit(0)

    return dbs
Example #14
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:

                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                temp_fq = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                f_temp = fqopen(temp_fq, w_mode)
                with fileinput.FileInput(
                        files=[out_fs[2], out_f],
                        openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)
                f_temp.close()

                remove(out_fs[2])
                remove(out_f)
                copyfile(temp_fq, out_fs[2])
                remove(temp_fq)
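
The fileinput block above concatenates the two unpaired-read outputs into one file, with hook_compressed transparently handling gzipped inputs. A standalone sketch of that concatenation; note that hook_compressed yields bytes for .gz inputs and str for plain ones, so the sink below normalizes to bytes:

import fileinput


def concat_fastq(parts, out_path):
    with open(out_path, 'wb') as out:
        with fileinput.FileInput(files=parts,
                                 openhook=fileinput.hook_compressed) as f:
            for line in f:
                out.write(line if isinstance(line, bytes)
                          else line.encode())
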
Example #15
def dnld_sra_info(sras, dir_cache_prj):

    sra_runs_info = {}
    sras_acceptable = []

    if len(sras) > 0:
        print()
        Log.inf('Downloading SRA run information.')
    else:
        return sra_runs_info, sras_acceptable

    cache_path = opj(dir_cache_prj, 'sra_runs_info_cache')

    if ope(cache_path):
        with open(cache_path, 'rb') as f:
            sra_runs_info = pickle.load(f)

    sras_local = list(sra_runs_info.keys())
    sras_to_dnld = set(sras).difference(set(sras_local))
    if len(sras_to_dnld) > 0:
        temp = sra_run_info(list(sras_to_dnld))
        new_sra_runs_info = {i['Run']: i for i in temp}
        sra_runs_info.update(new_sra_runs_info)

    for sra in sras:

        if sra in sra_runs_info:

            info = sra_runs_info[sra]

            sra_lib_layout = info['LibraryLayout'].lower()
            sra_lib_source = info['LibrarySource'].lower()
            sra_lib_strategy = info['LibraryStrategy']
            sra_seq_platform = info['Platform'].lower().capitalize()
            sra_seq_platform_model = info['Model']
            sra_species = info['ScientificName']
            sra_taxid = info['TaxID']
            sra_spots = int(info['spots'])
            sra_spots_with_mates = int(info['spots_with_mates'])

            sample_base_name = (sra_species.replace(' ', '_') + '_' +
                                sra_taxid + '_' + sra)

            sra_runs_info[sra]['KakapoSampleBaseName'] = sample_base_name

            src_check = sra_lib_source.lower()
            strategy_check = sra_lib_strategy.lower()

            if not ('transcript' in src_check or 'rna' in src_check
                    or 'rna' in strategy_check):

                sra_info_str = ('{sra}: the SRA library source type "{ltype}" '
                                'or library strategy "{strategy}" '
                                'is not supported.').format(
                                    sra=sra,
                                    ltype=sra_lib_source,
                                    strategy=sra_lib_strategy)

                Log.err(sra_info_str, 'Skipping.')

            elif sra_seq_platform != 'Illumina':
                sra_info_str = ('{sra}: the SRA library sequencing platform '
                                '"{plat}" is not supported.').format(
                                    sra=sra, plat=sra_seq_platform)

                Log.err(sra_info_str, 'Skipping.')

            else:
                # sra_info_str = ('SRA run {sra} {strategy} ({source}) '
                #                 '{layout}-end library.\n'
                #                 'Sourced from {species} '
                #                 '(TaxID: {txid}).\n'
                #                 'Sequenced using {platform} platform on '
                #                 '{model}.').format(
                #                     sra=sra,
                #                     source=sra_lib_source.title(),
                #                     strategy=sra_lib_strategy,
                #                     layout=sra_lib_layout,
                #                     platform=sra_seq_platform,
                #                     model=sra_seq_platform_model,
                #                     species=sra_species,
                #                     txid=sra_taxid)

                Log.msg(
                    '{sra}:'.format(sra=sra),
                    '{strategy} {layout}-end library ({source}).'.format(
                        strategy=sra_lib_strategy,
                        layout=sra_lib_layout,
                        source=sra_lib_source.title()))
                Log.msg(
                    '    Source:',
                    '{species} (TaxID: {txid}).'.format(species=sra_species,
                                                        txid=sra_taxid), False)
                Log.msg(
                    'Technology:', '{platform} platform on {model}.'.format(
                        platform=sra_seq_platform,
                        model=sra_seq_platform_model), False)

                sra_runs_info[sra]['KakapoLibraryLayout'] = \
                    sra_runs_info[sra]['LibraryLayout']

                if sra_lib_layout == 'paired' and sra_spots_with_mates == 0:
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'SINGLE'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but only a single set of reads '
                    #     'is available. Treating as single-ended.')

                elif (sra_lib_layout == 'paired'
                      and sra_spots != sra_spots_with_mates):
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'PAIRED_UNP'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but not all reads are paired.')

                sras_acceptable.append(sra)

                # Log.msg(sra_info_str)

    with open(cache_path, 'wb') as f:
        pickle.dump(sra_runs_info, f, protocol=PICKLE_PROTOCOL)

    return sra_runs_info, sras_acceptable
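
The function wraps its SRA queries in a load-if-present / update / dump-back pickle cache. The same pattern reduced to two hypothetical helpers (kakapo pins PICKLE_PROTOCOL; HIGHEST_PROTOCOL is assumed here):

import os
import pickle


def load_cache(path):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return {}


def save_cache(path, data):
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
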
Example #16
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')

        if rcorrector is None:
            Log.err('Rcorrector is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['path'][0]
        fq_path_2 = pe_fastq_files[pe]['path'][1]
        fq_path_3 = None
        out_f_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)

        pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2]

        if len(pe_fastq_files[pe]['path']) == 3:
            fq_path_3 = pe_fastq_files[pe]['path'][2]
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)
            pe_fastq_files[pe]['cor_path_fq'].append(out_f_3)

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3)
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_f_1,
                          out_file_2=out_f_2,
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            if fq_path_3 is not None:

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq'
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')

                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_f_3,
                              log_file=log_f_3)

                remove(fq_cor_path_3)
Example #17
def min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen, vsearch):
    # lowest allowable
    low = 35

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Calculating minimum acceptable read length.')
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        return None

    cache_path = opj(dir_cache_fq_minlen, 'minlen')

    pickled = {}

    if ope(cache_path):
        with open(cache_path, 'rb') as f:
            pickled = pickle.load(f)

    queue = []

    for se in se_fastq_files:
        src = se_fastq_files[se]['src']
        avg_len = se_fastq_files[se]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            se_fastq_files[se]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', se)
            continue

        fq_path = se_fastq_files[se]['path']
        stats_file = opj(dir_temp, se + '_stats.txt')
        queue.append([se, fq_path, stats_file, 'se'])

    for pe in pe_fastq_files:
        src = pe_fastq_files[pe]['src']
        avg_len = pe_fastq_files[pe]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            pe_fastq_files[pe]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', pe)
            continue

        fq_path = pe_fastq_files[pe]['path'][0]
        stats_file = opj(dir_temp, pe + '_stats.txt')
        queue.append([pe, fq_path, stats_file, 'pe'])

    for x in queue:

        if x[0] in pickled:
            ml = pickled[x[0]]

        else:
            # ----------------------------------------------------------------
            # Use 'vsearch --fastq_stats'. About 2x slower than the
            #   approx_avg_read_len_fq function.
            #
            # cmd = [vsearch, '--fastq_stats', x[1], '--log', x[2]]
            # run(cmd, do_not_raise=True)
            # with open(x[2]) as f:
            #     stats = f.read()
            # remove(x[2])
            # ml = re.findall(r'>=\s+(\d+)', stats)
            # if len(ml) != 0:
            #     ml = max(int(ml[0]) // 3, low)
            # else:
            #     ml = None
            # ----------------------------------------------------------------
            # 22:59:12 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 22:59:46 50 nt: Schlumbergera_truncata_15H-02_pol_S47    34s
            # 23:00:30 50 nt: Schlumbergera_truncata_15H-02_sty_S49    44s
            # ----------------------------------------------------------------

            # ----------------------------------------------------------------
            ml = approx_avg_read_len_fq(x[1])
            ml = max(int(ml) // 3, low)
            # ----------------------------------------------------------------
            # 23:12:06 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 23:12:20 50 nt: Schlumbergera_truncata_15H-02_pol_S47    14s
            # 23:12:39 50 nt: Schlumbergera_truncata_15H-02_sty_S49    19s
            # ----------------------------------------------------------------

            pickled[x[0]] = ml

        if ml is not None:
            Log.msg(str(ml) + ' nt:', x[0])
        else:
            Log.msg(' ?' + ' nt:', x[0])
            ml = low

        if x[3] == 'se':
            se_fastq_files[x[0]]['min_acc_len'] = ml

        elif x[3] == 'pe':
            pe_fastq_files[x[0]]['min_acc_len'] = ml

        # Checkpoint the cache after each sample.
        with open(cache_path, 'wb') as f:
            pickle.dump(pickled, f, protocol=PICKLE_PROTOCOL)
Example #18
def dnld_sra_fastq_files(sras, sra_runs_info, dir_fq_data, fasterq_dump,
                         threads, dir_temp):

    if len(sras) > 0:
        if fasterq_dump is None:
            Log.err('fasterq-dump from SRA Toolkit is not available. '
                    'Cannot continue. Exiting.')
            exit(0)

        print()
        Log.inf('Downloading SRA read data.')

    se_fastq_files = {}
    pe_fastq_files = {}

    for sra in sras:
        sra_run_info = sra_runs_info[sra]
        sra_lib_layout = sra_run_info['LibraryLayout'].lower()
        sra_lib_layout_k = sra_run_info['KakapoLibraryLayout'].lower()
        sample_base_name = sra_run_info['KakapoSampleBaseName']
        sra_taxid = int(sra_run_info['TaxID'])
        avg_len = int(sra_run_info['avgLength'])

        sra_dnld_needed = False

        if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
            se_file = opj(dir_fq_data, sra + '.fastq')
            se_fastq_files[sample_base_name] = {'path': se_file}
            se_fastq_files[sample_base_name]['src'] = 'sra'
            se_fastq_files[sample_base_name]['avg_len'] = avg_len
            se_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if not ope(se_file):
                sra_dnld_needed = True

        elif sra_lib_layout == 'paired':
            pe_file_1 = opj(dir_fq_data, sra + '_1.fastq')
            pe_file_2 = opj(dir_fq_data, sra + '_2.fastq')
            pe_file_1_renamed = opj(dir_fq_data, sra + '_R1.fastq')
            pe_file_2_renamed = opj(dir_fq_data, sra + '_R2.fastq')
            pe_fastq_files[sample_base_name] = {
                'path': [pe_file_1_renamed, pe_file_2_renamed]
            }
            pe_fastq_files[sample_base_name]['src'] = 'sra'
            pe_fastq_files[sample_base_name]['avg_len'] = avg_len // 2
            pe_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if sra_lib_layout_k == 'paired_unp':
                pe_file_3 = opj(dir_fq_data, sra + '.fastq')
                pe_file_3_renamed = opj(dir_fq_data, sra + '_R3.fastq')
                pe_fastq_files[sample_base_name]['path'].append(
                    pe_file_3_renamed)
            if not ope(pe_file_1_renamed) or not ope(pe_file_2_renamed):
                sra_dnld_needed = True

        if not sra_dnld_needed:
            Log.msg('FASTQ reads are available locally:', sample_base_name)

        retry_count = 0
        while sra_dnld_needed:

            if retry_count > 50:
                Log.err('Download failed. Exiting.')
                rmtree(dir_temp)
                exit(1)

            elif retry_count > 0:
                Log.wrn('Download failed. Retrying.')
                sleep(2)

            retry_count += 1

            Log.msg('Downloading FASTQ reads for:', sample_base_name)

            cmd = [
                fasterq_dump, '--threads',
                str(threads * 2), '--split-3', '--bufsize', '819200',
                '--outdir', dir_fq_data, '--temp', dir_temp, sra
            ]

            run(cmd, do_not_raise=True)

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if not ope(se_file):
                    continue

            elif sra_lib_layout == 'paired':

                if not ope(pe_file_1) or not ope(pe_file_2):
                    continue
                else:
                    move(pe_file_1, pe_file_1_renamed)
                    move(pe_file_2, pe_file_2_renamed)

                if sra_lib_layout_k == 'paired_unp':
                    if not ope(pe_file_3):
                        continue
                    else:
                        move(pe_file_3, pe_file_3_renamed)

            sra_dnld_needed = False

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if ope(se_file):
                    Log.msg('Renaming FASTQ reads in:', se_file)
                    rename_fq_seqs(se_file, sra, '1:N:0')

            elif sra_lib_layout == 'paired':
                if ope(pe_file_1_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_1_renamed)
                    rename_fq_seqs(pe_file_1_renamed, sra, '1:N:0')
                if ope(pe_file_2_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_2_renamed)
                    rename_fq_seqs(pe_file_2_renamed, sra, '2:N:0')
                if sra_lib_layout_k == 'paired_unp':
                    if ope(pe_file_3_renamed):
                        Log.msg('Renaming FASTQ reads in:', pe_file_3_renamed)
                        rename_fq_seqs(pe_file_3_renamed, sra + '_unpaired',
                                       '1:N:0')

    return se_fastq_files, pe_fastq_files, sra_runs_info
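
The while loop above is a bounded retry with a short delay between attempts. Its skeleton, with attempt_download standing in (hypothetically) for the fasterq-dump call plus the file-existence checks:

import time


def download_with_retries(attempt_download, max_retries=50, delay=2):
    retry_count = 0
    while True:
        if retry_count > max_retries:
            raise RuntimeError('Download failed.')
        if retry_count > 0:
            time.sleep(delay)
        retry_count += 1
        if attempt_download():
            return


download_with_retries(lambda: True)  # succeeds on the first attempt
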
Example #19
def run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                         dir_vsearch_results_fa_trim, fpatt, ss, seqtk):

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_vsearch_results_fa_trim, se)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        blast_results_fa_path = se_fastq_files[se]['blast_results_path' +
                                                   '__' + ss]
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        se_fastq_files[se]['vsearch_results_path' + '__' + ss] = out_f_fastq

        if ope(out_f_fastq):
            Log.msg('Vsearch results already exist:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running vsearch on: ' + basename(fq_path), ss)
            run_vsearch(vsearch,
                        ident=ident,
                        q_file=blast_results_fa_path,
                        db_file=fq_path,
                        out_file=out_f,
                        minlen=min_acc_len)

            Log.msg('Extracting unique vsearch hits using Seqtk:', ss)
            keep_unique_lines_in_file(out_f)
            seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
            osremove(out_f)

    for pe in pe_fastq_files:
        dir_results = opj(dir_vsearch_results_fa_trim, pe)
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        blast_results_fa_path = pe_fastq_files[pe]['blast_results_path' +
                                                   '__' + ss]
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        pe_fastq_files[pe]['vsearch_results_path' + '__' + ss] = out_fs_fastq

        if ope(out_fs_fastq[0]) and ope(out_fs_fastq[1]) and \
           ope(out_fs_fastq[2]) and ope(out_fs_fastq[3]):
            Log.msg('Vsearch results already exist:', pe)
        else:
            make_dirs(dir_results)
            pe_trim_files = zip(fq_paths, out_fs, out_fs_fastq)
            for x in pe_trim_files:
                Log.msg('Running vsearch on: ' + basename(x[0]), ss)
                run_vsearch(vsearch,
                            ident=ident,
                            q_file=blast_results_fa_path,
                            db_file=x[0],
                            out_file=x[1],
                            minlen=min_acc_len)

            Log.msg(
                'Extracting unique vsearch hits from paired files '
                'using Seqtk:', ss)

            p1txt = out_fs[0]
            p2txt = out_fs[1]

            p1fq = fq_paths[0]
            p2fq = fq_paths[1]

            p1fq_out = out_fs_fastq[0]
            p2fq_out = out_fs_fastq[1]

            p12txt_temp = opj(dir_results, pe + '__' + ss + '_paired.txt')

            combine_text_files([p1txt, p2txt], p12txt_temp)
            keep_unique_lines_in_file(p12txt_temp)

            seqtk_extract_reads(seqtk, p1fq, p1fq_out, p12txt_temp)
            seqtk_extract_reads(seqtk, p2fq, p2fq_out, p12txt_temp)

            osremove(p1txt)
            osremove(p2txt)
            osremove(p12txt_temp)

            Log.msg(
                'Extracting unique vsearch hits from unpaired files '
                'using Seqtk:', ss)

            u1txt = out_fs[2]
            u2txt = out_fs[3]

            u1fq = fq_paths[2]
            u2fq = fq_paths[3]

            u1fq_out = out_fs_fastq[2]
            u2fq_out = out_fs_fastq[3]

            keep_unique_lines_in_file(u1txt)
            keep_unique_lines_in_file(u2txt)

            seqtk_extract_reads(seqtk, u1fq, u1fq_out, u1txt)
            seqtk_extract_reads(seqtk, u2fq, u2fq_out, u2txt)

            osremove(u1txt)
            osremove(u2txt)
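
keep_unique_lines_in_file is kakapo's own helper; one plausible standalone reading is an in-place, order-preserving line dedup, which dict.fromkeys makes concise:

def dedup_lines(path):
    with open(path) as f:
        lines = f.readlines()
    with open(path, 'w') as f:
        f.writelines(dict.fromkeys(lines))
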
Example #20
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, fpatt, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')

            if rcorrector is None:
                Log.err(
                    'Rcorrector is not available. Cannot continue. Exiting.')
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['trim_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)

        fq_path_1 = pe_fastq_files[pe]['trim_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['trim_path_fq'][1]
        fq_path_3 = pe_fastq_files[pe]['trim_path_fq'][2]
        fq_path_4 = pe_fastq_files[pe]['trim_path_fq'][3]

        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '_paired.txt')

        out_fs = [x.replace('@D@', dir_fq_cor_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]

        pe_fastq_files[pe]['cor_path_fq'] = out_fs

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [
                fq_path_1, fq_path_2, fq_path_3, fq_path_4
            ]
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_fs[0],
                          out_file_2=out_fs[1],
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            # unpaired 1
            if stat(fq_path_3).st_size != 0:
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(
                    fq_base_path_3)[0] + '.cor.fq' + ext
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired_1.txt')

                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_fs[2],
                              log_file=log_f_3)

                remove(fq_cor_path_3)
            else:
                with open(out_fs[2], 'w') as f:
                    f.write('')

            # unpaired 2
            if stat(fq_path_4).st_size != 0:
                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_4,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_4 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_4))
                fq_cor_path_4 = splitext_gz(
                    fq_base_path_4)[0] + '.cor.fq' + ext
                log_f_4 = opj(dir_fq_cor_data_sample, pe + '_unpaired_2.txt')

                filter_unc_se(in_file=fq_cor_path_4,
                              out_file=out_fs[3],
                              log_file=log_f_4)
                remove(fq_cor_path_4)

            else:
                with open(out_fs[3], 'w') as f:
                    f.write('')
Example #21
def run_kraken2(order, dbs, se_fastq_files, pe_fastq_files, dir_fq_filter_data,
                confidence, kraken2, threads, dir_temp, fpatt):

    if (len(se_fastq_files) > 0 or len(pe_fastq_files) > 0) and len(order) > 0:
        print()
        Log.inf('Running Kraken2.', 'Confidence: ' + str(confidence))
        if kraken2 is None:
            Log.err('kraken2 is not available. Cannot continue. Exiting.')
            exit(0)

    nuclear = None
    for nuc in order:
        if nuc[1] == 'nuclear':
            nuclear = nuc[0]
            break

    for se in se_fastq_files:

        if len(order) == 0:
            continue

        if se_fastq_files[se]['path'] is None:
            continue

        fq_path = se_fastq_files[se]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, se)

        if nuclear is None:
            out_f = opj(dir_fq_filter_data_sample, se + '.fastq')
        else:
            out_f = opj(dir_fq_filter_data_sample, nuclear, se + '.fastq')

        se_fastq_files[se]['filter_path_fq'] = out_f

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', se)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('SE mode:', se)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=se,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)

    for pe in pe_fastq_files:

        if len(order) == 0:
            continue

        if pe_fastq_files[pe]['path'] is None:
            continue

        fq_path = pe_fastq_files[pe]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, pe)

        if nuclear is None:
            dir_name_nuclear = dir_fq_filter_data_sample
        else:
            dir_name_nuclear = dir_fq_filter_data_sample + ops + nuclear

        out_fs = [x.replace('@D@', dir_name_nuclear) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]

        pe_fastq_files[pe]['filter_path_fq'] = out_fs

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('PE mode:', pe)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=pe,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)
Example #22
def dep_check_kraken2(dir_dep, os_id, release_name, force):
    url = 'https://github.com/karolisr/kraken2/archive/master.tar.gz'

    dnld_path = opj(dir_dep, 'kraken2.tar.gz')

    try:
        if force is True:
            # force=True: skip the PATH check and fall through to the
            # local-copy / download fallbacks below.
            raise Exception('Dependency check forced.')
        kraken2 = which('kraken2')
        kraken2_build = which('kraken2-build')

        dir_bin = dirname(kraken2)
        classify_bin = opj(dir_bin, 'classify')
        _ = run([classify_bin], do_not_raise=True)
        if not _.stderr.startswith('classify: mandatory filename'):
            raise

        run([kraken2, '--help'])
        run([kraken2_build, '--help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'kraken2'))
            kraken2 = opj(dir_bin, 'bin', 'kraken2')
            kraken2_build = opj(dir_bin, 'bin', 'kraken2-build')

            classify_bin = opj(dir_bin, 'bin', 'classify')
            _ = run([classify_bin], do_not_raise=True)
            if not _.stderr.startswith('classify: mandatory filename'):
                raise

            run([kraken2, '--help'])
            run([kraken2_build, '--help'])
        except Exception:
            Log.wrn('Kraken2 was not found on this system, trying to '
                    'download.')

            if ope(dnld_path):
                remove(dnld_path)

            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'kraken2'))
            classify_bin = opj(dir_bin, 'bin', 'classify')
            kraken2 = opj(dir_bin, 'bin', 'kraken2')
            kraken2_build = opj(dir_bin, 'bin', 'kraken2-build')

            makefile = opj(dir_bin, 'src', 'Makefile')
            replace_line_in_file(makefile, 'cp $(PROGS) $(KRAKEN2_DIR)/',
                                 'cp $(PROGS) "$(KRAKEN2_DIR)"/')
            try:
                Log.wrn('Compiling Kraken2 Attempt 1')
                run(['./install_kraken2.sh', 'bin'], cwd=dir_bin)

                _ = run([classify_bin], do_not_raise=True)
                if not _.stderr.startswith('classify: mandatory filename'):
                    raise

                run([kraken2, '--help'])
                run([kraken2_build, '--help'])

            except Exception:
                try:
                    Log.wrn('Compiling Kraken2 Attempt 2')

                    dir_libomp = opj(dir_dep, 'libomp')

                    if ope(dir_libomp):
                        rmtree(dir_libomp)

                    libomp_fp, v = brew_get('libomp', os_id, release_name,
                                            dir_dep)

                    tar_ref = tarfile.open(libomp_fp, 'r:gz')
                    tar_ref.extractall(dir_dep)
                    tar_ref.close()

                    dir_libomp_l = opj(dir_libomp, v, 'lib')
                    dir_libomp_i = opj(dir_libomp, v, 'include')

                    if os_id == 'mac':
                        # Changes the shared library identification name of a
                        # dynamic shared library.
                        dylib_f = opj(dir_libomp_l, 'libomp.dylib')

                        chmod(
                            dylib_f, stat.S_IRWXU | stat.S_IRUSR | stat.S_IWUSR
                            | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH
                            | stat.S_IWOTH)

                        cmd = ['install_name_tool', '-id', dylib_f, dylib_f]
                        run(cmd)

                        cxx_flags = ('CXXFLAGS = -L{} -I{} -Xpreprocessor '
                                     '-fopenmp -lomp -Wall -std=c++11 -O3')

                    elif os_id == 'linux':
                        cxx_flags = ('CXXFLAGS = -L{} -I{} -fopenmp -lomp '
                                     '-static -Wall -std=c++11 -O3')

                    cxx_flags = cxx_flags.format(dir_libomp_l, dir_libomp_i)

                    makefile = opj(dir_bin, 'src', 'Makefile')

                    replace_line_in_file(
                        makefile, 'CXXFLAGS = -fopenmp -Wall -std=c++11'
                        ' -O3', cxx_flags)

                    run(['./install_kraken2.sh', 'bin'], cwd=dir_bin)

                    _ = run([classify_bin], do_not_raise=True)
                    if not _.stderr.startswith('classify: mandatory filename'):
                        raise

                    run([kraken2, '--help'])
                    run([kraken2_build, '--help'])

                except Exception:
                    try:
                        Log.wrn('Compiling Kraken2 Attempt 3')
                        makefile = opj(dir_bin, 'src', 'Makefile')
                        replace_line_in_file(
                            makefile, cxx_flags,
                            'CXXFLAGS = -Wall -std=c++11 -O3')
                        run(['./install_kraken2.sh', 'bin'], cwd=dir_bin)

                        _ = run([classify_bin], do_not_raise=True)
                        if not _.stderr.startswith(
                                'classify: mandatory filename'):
                            raise

                        run([kraken2, '--help'])
                        run([kraken2_build, '--help'])
                    except Exception:
                        pass

            if not ope(kraken2):
                Log.err('Something went wrong while trying to compile '
                        'Kraken2.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/karolisr/kraken2')
                return None, None

    regexp = r'^.*?version\s([\d\.\-A-Za-z]*)'
    v = get_dep_version([kraken2, '--version'], regexp)
    Log.msg('kraken2 is available:', v + ' ' + kraken2)
    v = get_dep_version([kraken2_build, '--version'], regexp)
    Log.msg('kraken2-build is available:', v + ' ' + kraken2_build)

    return kraken2, kraken2_build
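
# Every dep_check_* function in this file follows the same fallback ladder:
# try the executable on PATH, then a previously downloaded copy under
# dir_dep, and only then download (and, for Kraken2, compile). A generic
# sketch of that ladder; find_or_install and installer are hypothetical
# names, not kakapo's API:

import os
import shutil
import subprocess

def find_or_install(name, local_path, installer):
    """Return a working executable for `name`, installing as a last resort."""
    for candidate in (shutil.which(name), local_path):
        if candidate and os.access(candidate, os.X_OK):
            try:
                # Smoke-test the candidate, as the functions above do.
                subprocess.run([candidate, '--help'], check=True,
                               capture_output=True)
                return candidate
            except (OSError, subprocess.CalledProcessError):
                pass
    installer()  # expected to place the binary at local_path
    return local_path if os.access(local_path, os.X_OK) else None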
Example #23
0
def find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk, dir_temp,
                        prepend_assmbl, min_target_orf_len, max_target_orf_len,
                        allow_non_aug, allow_no_strt_cod, allow_no_stop_cod,
                        tax, tax_group, tax_ids_user, min_overlap, organelle):

    if len(assemblies) > 0:
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    for a in assemblies:

        if ('blast_hits_aa__' + ss) not in a:
            continue

        assmbl_name = a['name']
        tax_id = a['tax_id']

        parsed_hits = a['blast_hits_aa__' + ss]

        a_path = a['path']

        gc_tt = a['gc_tt']
        if tax.is_eukaryote(tax_id) is True:
            if organelle == 'mitochondrion':
                gc_tt = a['gc_tt_mito']
            if tax.contains_plastid(tax_id) is True:
                if organelle == 'plastid':
                    gc_tt = a['gc_tt_plastid']

        transcripts_nt_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt__' + ss + '.fasta')

        transcripts_nt_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt_orf__' + ss + '.fasta')

        transcripts_aa_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_aa_orf__' + ss + '.fasta')

        transcripts_nt = {}
        transcripts_nt_orf = {}
        transcripts_aa_orf = {}

        transcripts_with_acceptable_orfs = set()

        ann_key = 'annotations__'

        a[ann_key + ss] = {}

        collated = collate_blast_results(parsed_hits)

        ######################################################################
        # Use seqtk to sample the assembly FASTA file for sequences with
        # BLAST hits. This increases the speed substantially when the assembly
        # file is large.
        temp_a_file = opj(dir_temp, 'temp__' + ss + '.fasta')
        temp_s_file = opj(dir_temp, 'temp__' + ss + '.txt')
        sseqids_subsample = []
        for hit in collated:
            target_name = hit['sseqid']
            sseqids_subsample.append(target_name)
        sseqids_subsample_text = '\n'.join(sseqids_subsample)
        with open(temp_s_file, 'w') as f:
            f.write(sseqids_subsample_text)
        seqtk_extract_reads(seqtk,
                            in_file=a_path,
                            out_file=temp_a_file,
                            ids_file=temp_s_file)

        with open(temp_a_file, 'r') as f:
            _ = f.read()

        if _.strip() == '':
            continue

        print()
        Log.inf('Analyzing BLAST hits', '=' * 113 + '\n')
        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss + '\n\n' + '-' * 134 + '\n', False)

        parsed_fasta = trim_desc_to_first_space_in_fasta_text(_, SEQ_TYPE_DNA)
        parsed_fasta = seq_records_to_dict(parsed_fasta)
        ######################################################################

        all_kakapo_results = {}
        json_dump_file_path = opj(dir_prj_transcripts,
                                  assmbl_name + '_ann_kakapo__' + ss + '.json')

        for hit in collated:

            target_name = hit['sseqid']
            target_seq = parsed_fasta[target_name]
            query_name = hit['qseqid']
            hit_evalue = hit['evalue']

            # Prepend assembly name to the sequence name:
            if prepend_assmbl is True:
                target_name = assmbl_name + '__' + target_name
                # Also prepend taxonomic info to the sequence name:
                if tax_id is not None:
                    fm = tax.higher_rank_for_taxid(tax_id, rank='family')
                    if fm is not None:
                        target_name = fm + '__' + target_name

            hit_start = hit['start']
            hit_end = hit['end']
            hit_frame = hit['frame']

            if allow_non_aug is True:
                start_codons = gc_tt.start_codons_ambiguous
            else:
                start_codons = ['ATG']

            stop_codons = gc_tt.stop_codons_ambiguous

            ##################################################################
            if tax_id is not None:
                tax_ids_for_orf = (tax_id, )
            else:
                tax_ids_for_orf = tax_ids_user

            # atg_contexts (defined outside this function, at module level)
            # maps keys of the form '<taxid>_L' / '<taxid>_R' to start codon
            # (ATG) context data.
            cntx_txids_avail = tuple(
                sorted(
                    set(
                        map(lambda x: int(x.split('_')[0]),
                            atg_contexts.keys()))))

            cntx_taxid = set()
            for txid in tax_ids_for_orf:
                tax_path = partial(tax.path_between_taxids, txid)
                path_len = tuple(
                    map(len, tuple(map(tax_path, cntx_txids_avail))))
                cntx_taxid.add(cntx_txids_avail[path_len.index(min(path_len))])
            cntx_taxid = tuple(cntx_taxid)[0]

            cntx_l_key = str(cntx_taxid) + '_L'
            cntx_r_key = str(cntx_taxid) + '_R'

            cntx_l = atg_contexts[cntx_l_key]
            cntx_r = atg_contexts[cntx_r_key]
            ##################################################################

            orf_log_str = ('grade'.rjust(5) + 'ovrlp'.rjust(7) +
                           'cntx'.rjust(6) + 'length'.center(9) +
                           'cntx_l'.rjust(7) + 'cntx_r'.rjust(15) + '\n')

            orf = find_orf_for_blast_hit(seq=target_seq,
                                         frame=hit_frame,
                                         hit_start=hit_start,
                                         hit_end=hit_end,
                                         stop_codons=stop_codons,
                                         start_codons=start_codons,
                                         context_l=cntx_l,
                                         context_r=cntx_r,
                                         min_overlap=min_overlap,
                                         min_len=min_target_orf_len,
                                         max_len=max_target_orf_len,
                                         allow_no_strt_cod=allow_no_strt_cod,
                                         allow_no_stop_cod=allow_no_stop_cod)

            orf_log_str += orf[2]

            rev_comp_def_str = ''
            if hit_frame > 0:
                ann_hit_b = hit_start
                ann_hit_e = hit_end
            else:
                target_seq = reverse_complement(target_seq)
                ann_hit_b = len(target_seq) - hit_start
                ann_hit_e = len(target_seq) - hit_end
                rev_comp_def_str = '; RevComp'

            target_def = target_name + ' ' + query_name + rev_comp_def_str

            a[ann_key + ss][target_name] = {}

            good_orfs = orf[0]
            bad_orfs = orf[1]

            if len(good_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_good'] = dict()
                orfs_good_dict = a[ann_key + ss][target_name]['orfs_good']
                orf_log_str += '\n' + 'VALID ' + '-' * 128 + '\n'

                for i, good_orf in enumerate(good_orfs):

                    good_orf_frame = good_orf[2]

                    if good_orf_frame > 0:
                        ann_orf_b = good_orf[0]
                        ann_orf_e = good_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - good_orf[1]
                        ann_orf_e = len(target_seq) - good_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_good_dict = dict()
                    orf_good_dict['orf_begin'] = ann_orf_b
                    orf_good_dict['orf_end'] = ann_orf_e
                    orf_good_dict['orf_frame'] = abs(good_orf_frame)
                    orf_good_dict['orf_grade'] = good_orf[3]
                    orf_good_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_good_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_good_dict['ORF{:03d}'.format(i + 1)] = orf_good_dict

                    target_def_orf = (target_name +
                                      '__ORF{:03d}'.format(i + 1) + ' ' +
                                      query_name + rev_comp_def_str)

                    transcripts_nt_orf[target_def_orf] = orf_seq

                    transcripts_with_acceptable_orfs.add(target_name)

                    transl_seq = translate(orf_seq, gc_tt.table_ambiguous,
                                           start_codons)

                    transcripts_aa_orf[target_def_orf] = transl_seq[:-1]

            else:
                orf_log_str += '\n' + 'NOT VALID ' + '-' * 124 + '\n'

            Log.msg('Transcript:', target_name, False)
            Log.msg('     Query:', query_name + '\n\n' + orf_log_str, False)

            if len(bad_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_bad'] = dict()
                orfs_bad_dict = a[ann_key + ss][target_name]['orfs_bad']

                for i, bad_orf in enumerate(bad_orfs):

                    bad_orf_frame = bad_orf[2]

                    if bad_orf_frame > 0:
                        ann_orf_b = bad_orf[0]
                        ann_orf_e = bad_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - bad_orf[1]
                        ann_orf_e = len(target_seq) - bad_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_bad_dict = dict()
                    orf_bad_dict['orf_begin'] = ann_orf_b
                    orf_bad_dict['orf_end'] = ann_orf_e
                    orf_bad_dict['orf_frame'] = abs(bad_orf_frame)
                    orf_bad_dict['orf_grade'] = bad_orf[3]
                    orf_bad_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_bad_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_bad_dict['ORF{:03d}'.format(i + 1)] = orf_bad_dict

            transcripts_nt[target_def] = target_seq

            a[ann_key + ss][target_name]['blast_hit'] = dict()
            blast_hit_dict = a[ann_key + ss][target_name]['blast_hit']
            blast_hit_dict['query_name'] = query_name
            blast_hit_dict['query_id'] = ss
            blast_hit_dict['evalue'] = hit_evalue
            blast_hit_dict['frame'] = abs(hit_frame)
            blast_hit_dict['blast_hit_begin'] = ann_hit_b
            blast_hit_dict['blast_hit_end'] = ann_hit_e

            # Collect ORF and BLAST hit annotations for downstream use. ######
            kakapo_json = [{}]
            kakapo_json[0]['kakapo_annotations__' + ss] = (a[ann_key +
                                                             ss][target_name])
            all_kakapo_results[target_name] = kakapo_json
            ##################################################################

        # --------------------------------------------------------------------

        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss, False)
        Log.msg('Transcripts:', str(len(transcripts_nt)), False)
        Log.msg('Transcripts with acceptable ORFs:',
                str(len(transcripts_with_acceptable_orfs)) + '\n' + '=' * 134,
                False)

        if len(transcripts_nt) > 0:
            write_fasta(transcripts_nt, transcripts_nt_fasta_file)
            a['transcripts_nt_fasta_file__' + ss] = transcripts_nt_fasta_file
        else:
            a['transcripts_nt_fasta_file__' + ss] = None

        if len(transcripts_nt_orf) > 0:
            write_fasta(transcripts_nt_orf, transcripts_nt_orf_fasta_file)
            a['transcripts_nt_orf_fasta_file__' +
              ss] = transcripts_nt_orf_fasta_file
        else:
            a['transcripts_nt_orf_fasta_file__' + ss] = None

        if len(transcripts_aa_orf) > 0:
            write_fasta(transcripts_aa_orf, transcripts_aa_orf_fasta_file)
            a['transcripts_aa_orf_fasta_file__' +
              ss] = transcripts_aa_orf_fasta_file
        else:
            a['transcripts_aa_orf_fasta_file__' + ss] = None

        # Save ORF and BLAST hit annotations for downstream use.--------------
        with open(json_dump_file_path, 'w') as f:
            json.dump(all_kakapo_results, f, sort_keys=True, indent=4)
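
# The negative-frame branches above apply the standard interval flip for
# mapping coordinates onto a reverse-complemented sequence (plus 3 to keep
# the stop codon). The flip itself, as a small self-contained sketch:

def map_interval_to_revcomp(seq_len, b, e):
    """Map a half-open interval [b, e) onto the reverse-complement strand."""
    return seq_len - e, seq_len - b

# On a 10 nt sequence, bases [2, 5) sit at [5, 8) on the reverse complement:
# map_interval_to_revcomp(10, 2, 5) -> (5, 8)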
Example #24
0
def main():
    """Run the script."""
    # Prepare initial logger (before we know the log file path) --------------
    prj_log_file_suffix = time_stamp() + '.log'
    log_stream = StringIO()

    Log.set_colors(COLORS)
    Log.set_file(log_stream)
    Log.set_write(True)

    # Prepare configuration directory ----------------------------------------
    if ope(DIR_CFG):
        Log.inf('Found configuration directory:', DIR_CFG)
    else:
        Log.wrn('Creating configuration directory:', DIR_CFG)
        make_dirs(DIR_CFG)

    print()

    # Check for dependencies -------------------------------------------------
    Log.inf('Checking for dependencies.')
    make_dirs(DIR_DEP)
    make_dirs(DIR_KRK)
    seqtk = deps.dep_check_seqtk(DIR_DEP, FORCE_DEPS)
    trimmomatic, adapters = deps.dep_check_trimmomatic(DIR_DEP)
    fasterq_dump = deps.dep_check_sra_toolkit(DIR_DEP, OS_ID, DIST_ID,
                                              DEBIAN_DISTS, REDHAT_DISTS,
                                              FORCE_DEPS)
    makeblastdb, _, tblastn = deps.dep_check_blast(DIR_DEP, OS_ID, DIST_ID,
                                                   DEBIAN_DISTS, REDHAT_DISTS,
                                                   FORCE_DEPS)
    vsearch = deps.dep_check_vsearch(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS,
                                     REDHAT_DISTS, FORCE_DEPS)
    spades = deps.dep_check_spades(DIR_DEP, OS_ID, FORCE_DEPS)
    bowtie2, bowtie2_build = deps.dep_check_bowtie2(DIR_DEP, OS_ID, FORCE_DEPS)
    rcorrector = deps.dep_check_rcorrector(DIR_DEP, FORCE_DEPS)
    kraken2, kraken2_build = deps.dep_check_kraken2(DIR_DEP, OS_ID,
                                                    RELEASE_NAME, FORCE_DEPS)

    kakapolib = deps.dep_check_kakapolib(FORCE_DEPS)
    if kakapolib is None:
        Log.err('Could not compile "kakapolib". Cannot continue.')
        exit(0)

    print()

    kraken2_dbs = deps.dnld_kraken2_dbs(DIR_KRK)

    if INSTALL_DEPS is True or DNLD_KRAKEN_DBS is True:
        exit(0)

    print()

    # Initialize NCBI taxonomy database --------------------------------------
    tax = Taxonomy()
    if tax.is_initialized() is False:
        tax.init(data_dir_path=DIR_TAX, logger=Log)
        print()

    # Parse configuration file -----------------------------------------------
    Log.inf('Reading configuration file:', CONFIG_FILE_PATH)
    _ = config_file_parse(CONFIG_FILE_PATH, tax)

    allow_no_stop_cod = _['allow_no_stop_cod']
    allow_no_strt_cod = _['allow_no_strt_cod']
    allow_non_aug = _['allow_non_aug']

    blast_1_evalue = _['blast_1_evalue']
    blast_1_max_hsps = _['blast_1_max_hsps']
    blast_1_qcov_hsp_perc = _['blast_1_qcov_hsp_perc']
    blast_1_best_hit_overhang = _['blast_1_best_hit_overhang']
    blast_1_best_hit_score_edge = _['blast_1_best_hit_score_edge']
    blast_1_max_target_seqs = _['blast_1_max_target_seqs']

    blast_2_evalue = _['blast_2_evalue']
    blast_2_max_hsps = _['blast_2_max_hsps']
    blast_2_qcov_hsp_perc = _['blast_2_qcov_hsp_perc']
    blast_2_best_hit_overhang = _['blast_2_best_hit_overhang']
    blast_2_best_hit_score_edge = _['blast_2_best_hit_score_edge']
    blast_2_max_target_seqs = _['blast_2_max_target_seqs']

    dir_out = _['output_directory']
    email = _['email']
    requery_after = _['requery_after']
    fq_pe = _['fq_pe']
    fq_se = _['fq_se']
    should_run_rcorrector = _['should_run_rcorrector']
    should_run_ipr = _['should_run_ipr']
    bt2_order = _['bt2_order']
    kraken_confidence = _['kraken_confidence']
    krkn_order = _['krkn_order']
    prepend_assmbl = _['prepend_assmbl']
    prj_name = _['project_name']
    sras = _['sras']
    tax_group = _['tax_group']
    # tax_group_name = _['tax_group_name']
    tax_ids_user = _['tax_ids']
    user_assemblies = _['assmbl']

    print()

    # Parse search strategies file -------------------------------------------
    if SS_FILE_PATH is not None:
        Log.inf('Reading search strategies file:', SS_FILE_PATH)
        sss = ss_file_parse(SS_FILE_PATH)
    else:
        Log.wrn('Search strategies file was not provided.\n' +
                'Will process reads and assemblies, then stop.')
        sss = dict()

    print()

    # Create output directory ------------------------------------------------
    if dir_out is not None:
        if ope(dir_out):
            Log.inf('Found output directory:', dir_out)
        else:
            Log.wrn('Creating output directory:', dir_out)
            make_dirs(dir_out)

    print()

    # Write Kakapo version information to the output directory ---------------
    version_file = opj(dir_out, 'kakapo_version.txt')
    if ope(version_file):
        with open(version_file, 'r') as f:
            version_prev = f.read().strip()
            if __version__ != version_prev:
                Log.wrn('The output directory contains data produced by a ' +
                        'different version of Kakapo: ' + version_prev +
                        '.\nThe currently running version is: ' + __version__ +
                        '.\n' +
                        'Delete "kakapo_version.txt" file located in the ' +
                        'output directory if you would like to continue.')
                exit(0)

    with open(version_file, 'w') as f:
        f.write(__version__)

    # Create subdirectories in the output directory --------------------------
    _ = prepare_output_directories(dir_out, prj_name)

    dir_temp = _['dir_temp']
    dir_cache_pfam_acc = _['dir_cache_pfam_acc']
    dir_cache_fq_minlen = _['dir_cache_fq_minlen']
    dir_cache_prj = _['dir_cache_prj']
    dir_cache_refseqs = _['dir_cache_refseqs']
    dir_prj_logs = _['dir_prj_logs']
    dir_prj_queries = _['dir_prj_queries']
    dir_fq_data = _['dir_fq_data']
    dir_fq_cor_data = _['dir_fq_cor_data']
    dir_fq_trim_data = _['dir_fq_trim_data']
    dir_fq_filter_bt2_data = _['dir_fq_filter_bt2_data']
    dir_fq_filter_krkn2_data = _['dir_fq_filter_krkn2_data']
    dir_fa_trim_data = _['dir_fa_trim_data']
    dir_blast_fa_trim = _['dir_blast_fa_trim']
    dir_prj_blast_results_fa_trim = _['dir_prj_blast_results_fa_trim']
    dir_prj_vsearch_results_fa_trim = _['dir_prj_vsearch_results_fa_trim']
    dir_prj_spades_assemblies = _['dir_prj_spades_assemblies']
    dir_prj_blast_assmbl = _['dir_prj_blast_assmbl']
    dir_prj_assmbl_blast_results = _['dir_prj_assmbl_blast_results']
    dir_prj_transcripts = _['dir_prj_transcripts']
    dir_prj_ips = _['dir_prj_ips']
    dir_prj_transcripts_combined = _['dir_prj_transcripts_combined']

    # Prepare logger ---------------------------------------------------------
    prj_log_file = opj(dir_prj_logs, prj_name + '_' + prj_log_file_suffix)
    with open(prj_log_file, 'w') as f:
        f.write(SCRIPT_INFO.strip() + '\n\n' + log_stream.getvalue())

    Log.set_colors(COLORS)
    Log.set_file(prj_log_file)
    Log.set_write(True)

    log_stream.close()

    # Resolve descending taxonomy nodes --------------------------------------
    tax_ids = tax.all_descending_taxids_for_taxids([tax_group])

    # Pfam uniprot accessions ------------------------------------------------
    pfam_uniprot_acc = OrderedDict()
    for ss in sss:
        pfam_acc = sss[ss]['pfam_families']
        pfam_uniprot_acc[ss] = pfam_uniprot_accessions(ss, pfam_acc, tax_ids,
                                                       dir_cache_pfam_acc)

    # Download Pfam uniprot sequences if needed ------------------------------
    aa_uniprot_files = OrderedDict()
    for ss in sss:
        aa_uniprot_files[ss] = opj(dir_prj_queries, 'aa_uniprot__' + ss +
                                   '.fasta')
        # ToDo: add support for the requery_after parameter.
        dnld_pfam_uniprot_seqs(ss, pfam_uniprot_acc[ss], aa_uniprot_files[ss],
                               dir_cache_prj)

    # User provided entrez query ---------------------------------------------
    prot_acc_user_from_query = OrderedDict()
    for ss in sss:
        entrez_queries = sss[ss]['entrez_search_queries']
        prot_acc_user_from_query[ss] = user_entrez_search(ss, entrez_queries,
                                                          dir_cache_prj,
                                                          requery_after)

    # User provided protein accessions ---------------------------------------
    prot_acc_user = OrderedDict()
    for ss in sss:
        print()
        prot_acc_all = sorted(set(sss[ss]['ncbi_accessions_aa'] +
                                  prot_acc_user_from_query[ss]))
        prot_acc_user[ss] = user_protein_accessions(ss, prot_acc_all,
                                                    dir_cache_prj, tax)

    # Download from NCBI if needed -------------------------------------------
    aa_prot_ncbi_files = OrderedDict()
    for ss in sss:
        aa_prot_ncbi_files[ss] = opj(dir_prj_queries, 'aa_prot_ncbi__' + ss +
                                     '.fasta')
        prot_acc_user[ss] = dnld_prot_seqs(ss, prot_acc_user[ss],
                                           aa_prot_ncbi_files[ss],
                                           dir_cache_prj)

    # User provided protein sequences ----------------------------------------
    aa_prot_user_files = OrderedDict()
    for ss in sss:
        user_queries = sss[ss]['fasta_files_aa']
        aa_prot_user_files[ss] = opj(dir_prj_queries, 'aa_prot_user__' + ss +
                                     '.fasta')
        user_aa_fasta(ss, user_queries, aa_prot_user_files[ss])

    # Combine all AA queries -------------------------------------------------
    print()
    aa_queries_files = OrderedDict()
    for ss in sss:
        aa_queries_files[ss] = opj(dir_prj_queries, 'aa_all__' + ss + '.fasta')
        combine_aa_fasta(ss, [aa_uniprot_files[ss], aa_prot_ncbi_files[ss],
                              aa_prot_user_files[ss]], aa_queries_files[ss])

    # Filter AA queries ------------------------------------------------------
    prot_acc_user_filtered = OrderedDict()
    for ss in sss:
        min_query_length = sss[ss]['min_query_length']
        max_query_length = sss[ss]['max_query_length']
        max_query_identity = sss[ss]['max_query_identity']

        # Dereplicate all queries
        filter_queries(ss, aa_queries_files[ss], min_query_length,
                       max_query_length, max_query_identity,
                       vsearch, prot_acc_user[ss], overwrite=True)

        # Dereplicate only NCBI queries. CDS for these will be downloaded
        # later for reference.
        if ope(aa_prot_ncbi_files[ss]):
            prot_acc_user_filtered[ss] = filter_queries(
                ss, aa_prot_ncbi_files[ss], min_query_length, max_query_length,
                max_query_identity, vsearch, prot_acc_user[ss],
                overwrite=False, logging=False)

    # Download SRA run metadata if needed ------------------------------------
    sra_runs_info, sras_acceptable = dnld_sra_info(sras, dir_cache_prj)

    # Download SRA run FASTQ files if needed ---------------------------------
    x, y, z = dnld_sra_fastq_files(sras_acceptable, sra_runs_info, dir_fq_data,
                                   fasterq_dump, THREADS, dir_temp)

    se_fastq_files_sra = x
    pe_fastq_files_sra = y
    sra_runs_info = z

    # User provided FASTQ files ----------------------------------------------
    se_fastq_files_usr, pe_fastq_files_usr = user_fastq_files(fq_se, fq_pe)

    # Collate FASTQ file info ------------------------------------------------
    se_fastq_files = se_fastq_files_sra.copy()
    se_fastq_files.update(se_fastq_files_usr)
    pe_fastq_files = pe_fastq_files_sra.copy()
    pe_fastq_files.update(pe_fastq_files_usr)

    def gc_tt(k, d, tax):
        taxid = d[k]['tax_id']

        gc = tax.genetic_code_for_taxid(taxid)

        d[k]['gc_id'] = gc
        d[k]['gc_tt'] = TranslationTable(gc)

        gc_mito = None
        tt_mito = None

        gc_plastid = None
        tt_plastid = None

        if tax.is_eukaryote(taxid) is True:
            gc_mito = tax.mito_genetic_code_for_taxid(taxid)
            if gc_mito != '0':
                tt_mito = TranslationTable(gc_mito)

            if tax.contains_plastid(taxid) is True:
                gc_plastid = tax.plastid_genetic_code_for_taxid(taxid)
                if gc_plastid != '0':
                    tt_plastid = TranslationTable(gc_plastid)

        d[k]['gc_id_mito'] = gc_mito
        d[k]['gc_tt_mito'] = tt_mito

        d[k]['gc_id_plastid'] = gc_plastid
        d[k]['gc_tt_plastid'] = tt_plastid

    for se in se_fastq_files:
        gc_tt(se, se_fastq_files, tax)

    for pe in pe_fastq_files:
        gc_tt(pe, pe_fastq_files, tax)

    # Minimum acceptable read length -----------------------------------------
    min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen)

    # File name patterns -----------------------------------------------------
    a, b, c, d, e = file_name_patterns()

    pe_trim_fq_file_patterns = a
    pe_trim_fa_file_patterns = b
    pe_blast_db_file_patterns = c
    pe_blast_results_file_patterns = d
    pe_vsearch_results_file_patterns = e

    # Run Trimmomatic --------------------------------------------------------
    run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, pe_trim_fq_file_patterns, THREADS)

    # Run Rcorrector ---------------------------------------------------------
    run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   THREADS, dir_temp, pe_trim_fq_file_patterns,
                   should_run_rcorrector)

    # Run Bowtie 2 -----------------------------------------------------------
    run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_bt2_data,
               bowtie2, bowtie2_build, THREADS, dir_temp, bt2_order,
               pe_trim_fq_file_patterns, tax, dir_cache_refseqs)

    # Run Kraken2 ------------------------------------------------------------
    run_kraken2(krkn_order, kraken2_dbs, se_fastq_files, pe_fastq_files,
                dir_fq_filter_krkn2_data, kraken_confidence, kraken2, THREADS,
                dir_temp, pe_trim_fq_file_patterns)

    se_fastq_files = OrderedDict(se_fastq_files)
    pe_fastq_files = OrderedDict(pe_fastq_files)

    se_fastq_files = OrderedDict(sorted(se_fastq_files.items(),
                                        key=lambda x: x[1]['filter_path_fq']))

    pe_fastq_files = OrderedDict(sorted(pe_fastq_files.items(),
                                        key=lambda x: x[1]['filter_path_fq']))

    # Stop After Filter ------------------------------------------------------
    if STOP_AFTER_FILTER is True:
        Log.wrn('Stopping after Kraken2/Bowtie2 filtering step as requested.')
        exit(0)

    # Convert filtered FASTQ files to FASTA ----------------------------------
    filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      pe_trim_fa_file_patterns)

    # Run makeblastdb on reads -----------------------------------------------
    makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, pe_blast_db_file_patterns)

    # Check if there are any query sequences.
    any_queries = False
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size > 0:
            any_queries = True
            break

    # Run tblastn on reads ---------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        changed_blast_1 = run_tblastn_on_reads(
            se_fastq_files, pe_fastq_files, aa_queries_files[ss], tblastn,
            blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc,
            blast_1_best_hit_overhang, blast_1_best_hit_score_edge,
            blast_1_max_target_seqs, dir_prj_blast_results_fa_trim,
            pe_blast_results_file_patterns, ss, THREADS, seqtk, vsearch,
            dir_cache_prj)

        if changed_blast_1 is True:
            if ope(dir_prj_vsearch_results_fa_trim):
                rmtree(dir_prj_vsearch_results_fa_trim)
            if ope(dir_prj_spades_assemblies):
                rmtree(dir_prj_spades_assemblies)
            if ope(dir_prj_blast_assmbl):
                rmtree(dir_prj_blast_assmbl)
            if ope(dir_prj_assmbl_blast_results):
                rmtree(dir_prj_assmbl_blast_results)
            if ope(dir_prj_transcripts):
                rmtree(dir_prj_transcripts)
            if ope(dir_prj_transcripts_combined):
                rmtree(dir_prj_transcripts_combined)

    prepare_output_directories(dir_out, prj_name)

    # Run vsearch on reads ---------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        print()
        Log.inf('Checking if Vsearch should be run:', ss)
        run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                             dir_prj_vsearch_results_fa_trim,
                             pe_vsearch_results_file_patterns, ss, seqtk)

    # Run SPAdes -------------------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            for se in se_fastq_files:
                se_fastq_files[se]['spades_assembly' + '__' + ss] = None
            for pe in pe_fastq_files:
                pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None
            continue
        print()
        Log.inf('Checking if SPAdes should be run:', ss)
        run_spades(se_fastq_files, pe_fastq_files, dir_prj_spades_assemblies,
                   spades, dir_temp, ss, THREADS, RAM)

    # Combine SPAdes and user provided assemblies ----------------------------
    assemblies = combine_assemblies(se_fastq_files, pe_fastq_files,
                                    user_assemblies, tax, sss)

    # Run makeblastdb on assemblies  -----------------------------------------
    makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb)

    if any_queries is False:
        Log.wrn('No query sequences were provided.')

    # Run tblastn on assemblies ----------------------------------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        should_run_tblastn = False
        for a in assemblies:
            assmbl_src = a['src']
            assmbl_name = a['name']
            if assmbl_src != 'user_fasta':
                if assmbl_name.endswith('__' + ss):
                    should_run_tblastn = True
                    break
            else:
                should_run_tblastn = True
                break

        if should_run_tblastn is False:
            print()
            Log.inf('Will not run BLAST. No transcripts exist:', ss)
            continue

        blast_2_evalue_ss = sss[ss]['blast_2_evalue']
        blast_2_max_hsps_ss = sss[ss]['blast_2_max_hsps']
        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']
        blast_2_best_hit_overhang_ss = sss[ss]['blast_2_best_hit_overhang']
        blast_2_best_hit_score_edge_ss = sss[ss]['blast_2_best_hit_score_edge']
        blast_2_max_target_seqs_ss = sss[ss]['blast_2_max_target_seqs']

        if blast_2_evalue_ss is None:
            blast_2_evalue_ss = blast_2_evalue
        if blast_2_max_hsps_ss is None:
            blast_2_max_hsps_ss = blast_2_max_hsps
        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc
        if blast_2_best_hit_overhang_ss is None:
            blast_2_best_hit_overhang_ss = blast_2_best_hit_overhang
        if blast_2_best_hit_score_edge_ss is None:
            blast_2_best_hit_score_edge_ss = blast_2_best_hit_score_edge
        if blast_2_max_target_seqs_ss is None:
            blast_2_max_target_seqs_ss = blast_2_max_target_seqs

        run_tblastn_on_assemblies(ss, assemblies, aa_queries_files[ss],
                                  tblastn, dir_prj_assmbl_blast_results,
                                  blast_2_evalue_ss, blast_2_max_hsps_ss,
                                  blast_2_qcov_hsp_perc_ss,
                                  blast_2_best_hit_overhang_ss,
                                  blast_2_best_hit_score_edge_ss,
                                  blast_2_max_target_seqs_ss, THREADS,
                                  dir_cache_prj, dir_prj_ips)

    # Prepare BLAST hits for analysis: find ORFs, translate ------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        min_target_orf_len_ss = sss[ss]['min_target_orf_length']
        max_target_orf_len_ss = sss[ss]['max_target_orf_length']
        organelle = sss[ss]['organelle']

        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']

        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc

        find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk,
                            dir_temp, prepend_assmbl, min_target_orf_len_ss,
                            max_target_orf_len_ss, allow_non_aug,
                            allow_no_strt_cod,
                            allow_no_stop_cod, tax, tax_group, tax_ids_user,
                            blast_2_qcov_hsp_perc_ss, organelle)

    # GFF3 files from kakapo results JSON files ------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        gff_from_json(ss, assemblies, dir_prj_ips,
                      dir_prj_transcripts_combined, prj_name)

    # Run InterProScan 5 -----------------------------------------------------
    if should_run_ipr is True:
        print()
        ss_names = tuple(sss.keys())

        # Determine the length of printed strings, for better spacing --------
        max_title_a_len = 0
        max_run_id_len = 0
        for a in assemblies:
            for ss in ss_names:
                if 'transcripts_aa_orf_fasta_file__' + ss not in a:
                    continue

                aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

                if aa_file is None:
                    continue

                assmbl_name = a['name']
                run_id = ss + '_' + assmbl_name
                max_run_id_len = max(len(run_id), max_run_id_len)

                seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

                # Only the first ORF (ORF001) of each sequence is considered
                # when measuring title lengths.
                for seq_def in tuple(seqs.keys()):
                    seq_def_prefix = seq_def.split(' ')[0]
                    if seq_def_prefix.endswith('ORF001'):
                        max_title_a_len = max(len(seq_def_prefix),
                                              max_title_a_len)

        max_title_a_len += 2
        max_run_id_len += 2
        # --------------------------------------------------------------------

        parallel_run_count = min(THREADS, len(ss_names))

        def run_inter_pro_scan_parallel(ss):
            if stat(aa_queries_files[ss]).st_size == 0:
                return

            run_inter_pro_scan(ss, assemblies, email, dir_prj_ips,
                               dir_cache_prj, parallel_run_count,
                               max_title_a_len, max_run_id_len)

            # GFF3 files from kakapo and InterProScan 5 results JSON files
            gff_from_json(ss, assemblies, dir_prj_ips,
                          dir_prj_transcripts_combined, prj_name)

        Parallel(n_jobs=parallel_run_count, verbose=0, require='sharedmem')(
            delayed(run_inter_pro_scan_parallel)(ss) for ss in ss_names)

    # Download CDS for NCBI protein queries ----------------------------------
    print()
    prot_cds_ncbi_files = OrderedDict()

    def dnld_cds_for_ncbi_prot_acc_parallel(ss):
        if stat(aa_queries_files[ss]).st_size == 0:
            return

        if ss not in prot_acc_user_filtered:
            return

        prot_cds_ncbi_files[ss] = opj(dir_prj_transcripts_combined, prj_name +
                                      '_ncbi_query_cds__' + ss + '.fasta')

        if len(prot_acc_user_filtered[ss]) > 0:
            dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user_filtered[ss],
                                       prot_cds_ncbi_files[ss], tax,
                                       dir_cache_prj)

    ss_names = tuple(sss.keys())
    Parallel(n_jobs=2, verbose=0, require='sharedmem')(
        delayed(dnld_cds_for_ncbi_prot_acc_parallel)(ss) for ss in ss_names)

    # ------------------------------------------------------------------------

    rmtree(dir_temp)

    # ------------------------------------------------------------------------

    rerun = input('\nRepeat ([y]/n)? ').lower().strip()
    if rerun.startswith('y') or rerun == '':
        print()
        return False
    else:
        print('\nExiting...')
        return True
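
# main() returns False when the user chooses to repeat and True when they
# choose to exit, which implies a driver loop along these lines (a sketch;
# the actual entry point may differ):

if __name__ == '__main__':
    done = False
    while not done:
        done = main()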
Example #25
0
def run_spades(se_fastq_files, pe_fastq_files, dir_spades_assemblies,
               spades, dir_temp, ss, threads, ram):

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if spades is None:
            Log.err('SPAdes is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_results = opj(dir_spades_assemblies, se + '__' + ss)
        fq_path = se_fastq_files[se]['vsearch_results_path' + '__' + ss]
        se_fastq_files[se]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', se)
            run_spades_se(spades,
                          out_dir=dir_results,
                          input_file=fq_path,
                          threads=threads,
                          memory=ram,
                          rna=True)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            se_fastq_files[se]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)

    for pe in pe_fastq_files:
        dir_results = opj(dir_spades_assemblies, pe + '__' + ss)
        fq_paths = pe_fastq_files[pe]['vsearch_results_path' + '__' + ss]
        pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', pe)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', pe)

            if osstat(fq_paths[0]).st_size > 0 and \
               osstat(fq_paths[1]).st_size > 0:

                run_spades_pe(spades,
                              out_dir=dir_results,
                              input_files=fq_paths,
                              threads=threads,
                              memory=ram,
                              rna=True)

            else:
                _ = opj(dir_temp, 'temp.fasta')
                combine_text_files(fq_paths, _)
                run_spades_se(spades,
                              out_dir=dir_results,
                              input_file=_,
                              threads=threads,
                              memory=ram,
                              rna=True)
                osremove(_)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            pe_fastq_files[pe]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)
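
# The PE branch above degrades to single-end assembly whenever either mate
# file is empty. A compact sketch of just that decision; run_pe, run_se and
# combine are stand-in callables, not kakapo functions:

import os

def assemble_pe_or_fallback(fq_paths, run_pe, run_se, combine, temp_file):
    """Run PE assembly only when both mate files contain reads."""
    if all(os.stat(p).st_size > 0 for p in fq_paths):
        run_pe(fq_paths)
    else:
        combine(fq_paths, temp_file)  # concatenate whatever reads exist
        run_se(temp_file)
        os.remove(temp_file)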
Example #26
0
def run_tblastn_on_reads(se_fastq_files, pe_fastq_files, aa_queries_file,
                         tblastn, blast_1_evalue, blast_1_max_hsps,
                         blast_1_qcov_hsp_perc, blast_1_best_hit_overhang,
                         blast_1_best_hit_score_edge, blast_1_max_target_seqs,
                         dir_blast_results_fa_trim, fpatt, ss, threads, seqtk,
                         vsearch, dir_cache_prj):

    changed_blast_1 = False

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running BLAST on reads:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)

        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)

        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    cache_file = opj(dir_cache_prj, 'blast_1_settings_cache__' + ss)

    pickled = dict()
    settings = {
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'queries': seq_records_to_dict(read_fasta(aa_queries_file,
                                                  SEQ_TYPE_AA))
    }

    Log.msg('evalue:', str(blast_1_evalue))
    Log.msg('max_hsps:', str(blast_1_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_1_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_1_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_1_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_1_max_target_seqs))
    print()

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, se)
        blast_db_path = se_fastq_files[se]['blast_db_path']
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        out_f_fasta = out_f.replace('.txt', '.fasta')
        se_fastq_files[se]['blast_results_path' + '__' + ss] = out_f_fasta
        genetic_code = se_fastq_files[se]['gc_id']

        if ope(out_f_fasta) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_f_fasta) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', se)

        else:
            changed_blast_1 = True
            make_dirs(dir_results)
            Log.msg('Running tblastn on: ' + basename(blast_db_path), ss)
            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=out_f,
                      evalue=blast_1_evalue,
                      max_hsps=blast_1_max_hsps,
                      qcov_hsp_perc=blast_1_qcov_hsp_perc,
                      best_hit_overhang=blast_1_best_hit_overhang,
                      best_hit_score_edge=blast_1_best_hit_score_edge,
                      max_target_seqs=blast_1_max_target_seqs,
                      db_genetic_code=genetic_code,
                      out_cols=BLST_RES_COLS_1)

            Log.inf('Extracting unique BLAST hits using Seqtk:', ss)

            keep_unique_lines_in_file(out_f)

            seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
            seqtk_fq_to_fa(seqtk, out_f_fastq, out_f_fasta)

            osremove(out_f)
            osremove(out_f_fastq)

            out_f_fasta_temp = out_f_fasta + '_temp'
            copyfile(out_f_fasta, out_f_fasta_temp)
            run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
            osremove(out_f_fasta_temp)

    for pe in pe_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, pe)
        blast_db_paths = pe_fastq_files[pe]['blast_db_path']
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        out_fs_fasta = [x.replace('.txt', '.fasta') for x in out_fs]
        out_f_fasta = opj(dir_results, pe + '__' + ss + '.fasta')
        pe_fastq_files[pe]['blast_results_path' + '__' + ss] = out_f_fasta
        genetic_code = pe_fastq_files[pe]['gc_id']

        if ope(out_f_fasta) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_f_fasta) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', pe)

        else:
            changed_blast_1 = True
            make_dirs(dir_results)
            pe_trim_files = zip(blast_db_paths, out_fs, fq_paths, out_fs_fastq,
                                out_fs_fasta)
            for x in pe_trim_files:
                Log.msg('Running tblastn on: ' + basename(x[0]), ss)
                run_blast(exec_file=tblastn,
                          task='tblastn',
                          threads=threads,
                          db_path=x[0],
                          queries_file=aa_queries_file,
                          out_file=x[1],
                          evalue=blast_1_evalue,
                          max_hsps=blast_1_max_hsps,
                          qcov_hsp_perc=blast_1_qcov_hsp_perc,
                          best_hit_overhang=blast_1_best_hit_overhang,
                          best_hit_score_edge=blast_1_best_hit_score_edge,
                          max_target_seqs=blast_1_max_target_seqs,
                          db_genetic_code=genetic_code,
                          out_cols=BLST_RES_COLS_1)

                Log.msg('Extracting unique BLAST hits using Seqtk:', ss)

                keep_unique_lines_in_file(x[1])

                seqtk_extract_reads(seqtk, x[2], x[3], x[1])
                seqtk_fq_to_fa(seqtk, x[3], x[4])

                osremove(x[1])
                osremove(x[3])

            combine_text_files(out_fs_fasta, out_f_fasta)

            out_f_fasta_temp = out_f_fasta + '_temp'
            copyfile(out_f_fasta, out_f_fasta_temp)
            run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
            osremove(out_f_fasta_temp)

            for x in out_fs_fasta:
                osremove(x)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

    return changed_blast_1
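
# The pickle-based cache above lets the function skip BLAST whenever both
# the outputs and the settings of the previous run are unchanged. The check,
# distilled into a sketch:

import os
import pickle

def results_are_current(out_file, cache_file, settings):
    """True when outputs exist and the cached settings match by value."""
    if not (os.path.exists(out_file) and os.path.exists(cache_file)):
        return False
    with open(cache_file, 'rb') as f:
        return pickle.load(f) == settings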
Example #27
0
def ss_file_parse(file_path):

    cfg = ConfigParser(delimiters=('=',),
                       allow_no_value=True,
                       empty_lines_in_values=True)
    cfg.optionxform = str
    cfg.SECTCRE = re.compile(r'\[\s*(?P<header>[^]]+?)\s*\]')

    try:
        cfg.read(file_path)
    except MissingSectionHeaderError:
        Log.err(
            'Missing section header(s) in the provided "Search Strategies" file:',
            file_path)
        exit(1)

    required_options = set(('organelle', 'min_query_length',
                            'max_query_length', 'max_query_identity',
                            'min_target_orf_length', 'max_target_orf_length'))

    ret_dict = OrderedDict()

    sections = cfg.sections()

    for s in sections:

        o = cfg.options(s)

        if not required_options <= set(o):
            missing = required_options - (required_options & set(o))
            Log.err(
                'Missing required option(s): ' + ', '.join(missing) +
                ' for search strategy', s)
            exit(1)

        organelle = cfg[s]['organelle']

        if organelle not in ('nucleus', 'plastid', 'mitochondrion'):
            Log.err('Organelle "' + organelle + '" should be one of:',
                    'nucleus, plastid, or mitochondrion.')
            exit(1)

        min_query_length = int(cfg[s]['min_query_length'])
        max_query_length = int(cfg[s]['max_query_length'])
        max_query_identity = float(cfg[s]['max_query_identity'])
        min_target_orf_length = int(cfg[s]['min_target_orf_length'])
        max_target_orf_length = int(cfg[s]['max_target_orf_length'])

        evalue = None
        max_hsps = None
        qcov_hsp_perc = None
        best_hit_overhang = None
        best_hit_score_edge = None
        max_target_seqs = None

        pfam_families = None
        ncbi_accessions_aa = None
        entrez_search_queries = None
        fasta_files_aa = None

        if cfg.has_option(s, 'evalue'):
            evalue = float(cfg[s]['evalue'])

        if cfg.has_option(s, 'max_hsps'):
            max_hsps = int(cfg[s]['max_hsps'])

        if cfg.has_option(s, 'qcov_hsp_perc'):
            qcov_hsp_perc = float(cfg[s]['qcov_hsp_perc'])

        if cfg.has_option(s, 'best_hit_overhang'):
            best_hit_overhang = float(cfg[s]['best_hit_overhang'])

        if cfg.has_option(s, 'best_hit_score_edge'):
            best_hit_score_edge = float(cfg[s]['best_hit_score_edge'])

        if cfg.has_option(s, 'max_target_seqs'):
            max_target_seqs = int(cfg[s]['max_target_seqs'])

        if cfg.has_option(s, 'pfam_families'):
            pfam_families = str(cfg[s]['pfam_families'])
            pfam_families = set(pfam_families.split('\n')) - \
                set(('', 'None'))
            pfam_families = _parse_pfam(pfam_entries=pfam_families,
                                        config_file_path=file_path)
            pfam_families = sorted(pfam_families)

        if cfg.has_option(s, 'ncbi_accessions_aa'):
            ncbi_accessions_aa = str(cfg[s]['ncbi_accessions_aa'])
            ncbi_accessions_aa = sorted(
                set(ncbi_accessions_aa.split('\n')) - set(('', 'None')))

        if cfg.has_option(s, 'entrez_search_queries'):
            entrez_search_queries = str(cfg[s]['entrez_search_queries'])
            entrez_search_queries = sorted(
                set(entrez_search_queries.split('\n')) - set(('', 'None')))

        if cfg.has_option(s, 'fasta_files_aa'):
            fasta_files_aa = str(cfg[s]['fasta_files_aa'])
            fasta_files_aa = set(fasta_files_aa.split('\n')) - \
                set(('', 'None'))
            fasta_files_aa = [abspath(expanduser(x)) for x in fasta_files_aa]
            fasta_files_aa = sorted(fasta_files_aa)

        section_dict = OrderedDict({
            'organelle': organelle,
            'min_query_length': min_query_length,
            'max_query_length': max_query_length,
            'max_query_identity': max_query_identity,
            'min_target_orf_length': min_target_orf_length,
            'max_target_orf_length': max_target_orf_length,
            'blast_2_evalue': evalue,
            'blast_2_max_hsps': max_hsps,
            'blast_2_qcov_hsp_perc': qcov_hsp_perc,
            'blast_2_best_hit_overhang': best_hit_overhang,
            'blast_2_best_hit_score_edge': best_hit_score_edge,
            'blast_2_max_target_seqs': max_target_seqs,
            'pfam_families': pfam_families,
            'ncbi_accessions_aa': ncbi_accessions_aa,
            'entrez_search_queries': entrez_search_queries,
            'fasta_files_aa': fasta_files_aa
        })

        ret_dict[s] = section_dict

    return ret_dict
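A note on the pattern above: ConfigParser's has_option makes each tblastn
setting optional, leaving it as None unless the configuration file supplies a
value. A minimal, self-contained sketch of that idea (the section and option
names are hypothetical, mirroring the ones parsed above):

from configparser import ConfigParser

cfg = ConfigParser(delimiters=('=',), allow_no_value=True)
cfg.read_string("""
[query_01]
organelle = nucleus
evalue = 1e-10
""")

s = 'query_01'
evalue = None
max_hsps = None

# Options absent from the file simply keep their None defaults.
if cfg.has_option(s, 'evalue'):
    evalue = float(cfg[s]['evalue'])
if cfg.has_option(s, 'max_hsps'):
    max_hsps = int(cfg[s]['max_hsps'])

print(evalue, max_hsps)  # -> 1e-10 None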
Example #28
def config_file_parse(file_path, taxonomy):

    cfg = ConfigParser(delimiters=('=',), allow_no_value=True)
    # Preserve the case of option names; several sections below use the
    # option keys themselves as data (file names, SRA accessions, etc.).
    cfg.optionxform = str

    try:
        cfg.read(file_path)
    except MissingSectionHeaderError:
        Log.err(
            'Missing section header(s) in the provided ' +
            'configuration file:', file_path)
        exit(1)

    try:
        # General
        project_name = cfg.get('General', 'project_name')
        email = cfg.get('General', 'email')
        output_directory = abspath(
            expanduser(cfg.get('General', 'output_directory')))
        should_run_ipr = cfg.getboolean('General', 'run_inter_pro_scan')
        should_run_rcorrector = cfg.getboolean('General', 'run_rcorrector')
        prepend_assmbl = cfg.getboolean(
            'General', 'prepend_assembly_name_to_sequence_name')
        kraken_confidence = cfg.getfloat('General', 'kraken_2_confidence')
        requery_after = cfg.getfloat('General', 'requery_after')
        requery_after = datetime.timedelta(days=requery_after)

        # Target filters
        allow_non_aug = cfg.getboolean('Target filters',
                                       'allow_non_aug_start_codon')
        allow_no_strt_cod = cfg.getboolean('Target filters',
                                           'allow_missing_start_codon')
        allow_no_stop_cod = cfg.getboolean('Target filters',
                                           'allow_missing_stop_codon')

        # Query taxonomic group
        tax_group_raw = cfg.items('Query taxonomic group')

        if len(tax_group_raw) != 1:
            raise Exception('Exactly one taxonomic group should be listed.')

        tax_group = tax_group_raw[0][0].lower()
        tax_group_name = tax_group.title()

        group_tax_ids = {
            'animals': 33208,
            'archaea': 2157,
            'bacteria': 2,
            'fungi': 4751,
            'plants': 33090,
            'viruses': 10239
        }

        tax_group = group_tax_ids[tax_group]

        # Target SRA accessions
        sras = cfg.items('Target SRA accessions')
        sras = [x[0] for x in sras]

        all_tax_ids = set()

        # Target FASTQ files
        fastq_temp = cfg.items('Target FASTQ files')

        fq_pe = []
        fq_se = []

        for entry in fastq_temp:

            key = entry[0]
            val = entry[1]

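            # Entries may take the form 'taxon:path'; when the taxon part is
            # missing, infer genus (and species) from the file name.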
            val = val.split(':')
            if len(val) == 1:
                tmp_genus_species = basename(val[0]).split('_')
                if len(tmp_genus_species) == 1:
                    genus = tmp_genus_species[0]
                    val = [genus, val[0]]
                elif len(tmp_genus_species) >= 2:
                    genus_species = tmp_genus_species[
                        0] + ' ' + tmp_genus_species[1]
                    val = [genus_species, val[0]]
                else:
                    val = ['', val[0]]

            taxa_temp = [val[0]]

            # FixMe: It is possible for tax_id to be None!
            #        What happens then?
            tax_id = _parse_taxa(taxa=taxa_temp,
                                 tax_group=tax_group,
                                 taxonomy=taxonomy,
                                 config_file_path=file_path)[0]

            # See FixMe above.
            if tax_id is None:
                tax_id = tax_group

            if key.startswith('pe_'):
                f_name = basename(val[1])
                d_path = abspath(expanduser(dirname(val[1])))
                # Translate the shell-style '*' wildcard into its regex form.
                pattern = re.escape(f_name).replace('\\*', '.*')
                try:
                    files, err = list_of_files_at_path(d_path)
                except Exception:
                    Log.err('Could not list files at the path:', d_path)
                    exit(1)
                pe = [
                    f for f in files
                    if re.match(pattern, basename(f)) is not None
                ]
                pe.sort()
                pe = [join(d_path, f) for f in pe]
                fq_pe.append([tax_id, pe])

            elif key.startswith('se_'):
                se = abspath(expanduser(val[1]))
                fq_se.append([tax_id, se])

            # See FixMe above.
            if tax_id != tax_group:
                all_tax_ids.add(tax_id)

        # Target assemblies: FASTA files (DNA)
        assmbl_temp = cfg.items('Target assemblies: FASTA files (DNA)')
        assmbl_temp = [x[0].split(':') for x in assmbl_temp]

        for i, val in enumerate(copy(assmbl_temp)):
            if len(val) == 1:
                tmp_genus_species = basename(val[0]).split('_')
                if len(tmp_genus_species) == 1:
                    genus = tmp_genus_species[0]
                    assmbl_temp[i] = [genus, val[0]]
                elif len(tmp_genus_species) >= 2:
                    genus_species = tmp_genus_species[
                        0] + ' ' + tmp_genus_species[1]
                    assmbl_temp[i] = [genus_species, val[0]]
                else:
                    assmbl_temp[i] = ['', val[0]]

        taxa_temp = [x[0] for x in assmbl_temp]
        taxa_temp = [x.split('.')[0] for x in taxa_temp]

        # FixMe: It is possible for one of the tax_ids to be None!
        #        What happens then?
        tax_ids = _parse_taxa(taxa=taxa_temp,
                              tax_group=tax_group,
                              taxonomy=taxonomy,
                              config_file_path=file_path)

        assmbl_temp = [abspath(expanduser(x[1])) for x in assmbl_temp]
        assmbl_temp = list(zip(tax_ids, assmbl_temp))

        assmbl = list()
        tax_ids = list()
        all_assemblies_found = True
        for i, a in enumerate(copy(assmbl_temp)):
            # See FixMe above.
            a = list(a)
            if a[0] is None:
                a[0] = tax_group
            tax_ids.append(a[0])
            a_path = a[1]
            if not ope(a_path):
                Log.err('Cannot find the assembly file:', a_path)
                all_assemblies_found = False
            assmbl.append(tuple(a))

        if all_assemblies_found is False:
            Log.err('Stopping.')
            exit(1)

        for tax_id in tax_ids:
            # See FixMe above.
            if tax_id != tax_group:
                all_tax_ids.add(tax_id)
        all_tax_ids = tuple(sorted(all_tax_ids))

        # Bowtie2 filter order
        bt2_sctn = 'Bowtie2 filter order'
        bt2_order = OrderedDict()
        if cfg.has_section(bt2_sctn):
            bt2_order = OrderedDict(cfg.items(bt2_sctn))

        # Kraken2 filter order
        krkn_sctn = 'Kraken2 filter order'
        krkn_order = []
        if cfg.has_section(krkn_sctn):
            krkn_order = cfg.items(krkn_sctn)

        # BLAST SRA/FASTQ
        blast_1_evalue = cfg.getfloat('BLAST SRA/FASTQ', 'evalue')
        blast_1_max_hsps = cfg.getint('BLAST SRA/FASTQ', 'max_hsps')
        blast_1_qcov_hsp_perc = cfg.getfloat('BLAST SRA/FASTQ',
                                             'qcov_hsp_perc')
        blast_1_best_hit_overhang = cfg.getfloat('BLAST SRA/FASTQ',
                                                 'best_hit_overhang')
        blast_1_best_hit_score_edge = cfg.getfloat('BLAST SRA/FASTQ',
                                                   'best_hit_score_edge')
        blast_1_max_target_seqs = cfg.getint('BLAST SRA/FASTQ',
                                             'max_target_seqs')

        # BLAST assemblies
        blast_2_evalue = cfg.getfloat('BLAST assemblies', 'evalue')
        blast_2_max_hsps = cfg.getint('BLAST assemblies', 'max_hsps')
        blast_2_qcov_hsp_perc = cfg.getfloat('BLAST assemblies',
                                             'qcov_hsp_perc')
        blast_2_best_hit_overhang = cfg.getfloat('BLAST assemblies',
                                                 'best_hit_overhang')
        blast_2_best_hit_score_edge = cfg.getfloat('BLAST assemblies',
                                                   'best_hit_score_edge')
        blast_2_max_target_seqs = cfg.getint('BLAST assemblies',
                                             'max_target_seqs')

    except NoSectionError as err:
        Log.err(
            'Missing required section "' + err.section +
            '" in configuration file:', file_path)
        exit(1)

    except NoOptionError as err:
        Log.err(
            'Missing required option "' + err.option + '" under section "' +
            err.section + '" in configuration file:', file_path)
        exit(1)

    # ------------------------------------------------------------------------

    ret_dict = {
        'allow_no_stop_cod': allow_no_stop_cod,
        'allow_no_strt_cod': allow_no_strt_cod,
        'allow_non_aug': allow_non_aug,
        'assmbl': assmbl,
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'blast_2_evalue': blast_2_evalue,
        'blast_2_max_hsps': blast_2_max_hsps,
        'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
        'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
        'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
        'blast_2_max_target_seqs': blast_2_max_target_seqs,
        'email': email,
        'requery_after': requery_after,
        'fq_pe': fq_pe,
        'fq_se': fq_se,
        'should_run_rcorrector': should_run_rcorrector,
        'should_run_ipr': should_run_ipr,
        'bt2_order': bt2_order,
        'kraken_confidence': kraken_confidence,
        'krkn_order': krkn_order,
        'output_directory': output_directory,
        'prepend_assmbl': prepend_assmbl,
        'project_name': project_name,
        'sras': sras,
        'tax_group': tax_group,
        'tax_group_name': tax_group_name,
        'tax_ids': all_tax_ids
    }

    return ret_dict
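A note on the sections parsed above: allow_no_value=True lets a bare accession
stand alone as an option key, and setting optionxform to str keeps those keys
case-sensitive, so the keys themselves can carry data. A small runnable sketch
of that pattern (section names mirror the ones above; the values are made up):

from configparser import ConfigParser

cfg = ConfigParser(delimiters=('=',), allow_no_value=True)
cfg.optionxform = str  # do not lower-case option names

cfg.read_string("""
[Target SRA accessions]
SRR1234567
SRR7654321

[Target FASTQ files]
pe_sample_a = Genus_species:reads/sample_a_R*.fastq
""")

# Bare keys parse with a value of None; items() yields (key, value) tuples.
print([k for k, v in cfg.items('Target SRA accessions')])
# -> ['SRR1234567', 'SRR7654321']

for key, val in cfg.items('Target FASTQ files'):
    taxon, path = val.split(':')
    print(key, taxon, path)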
Example #29
def run_tblastn_on_assemblies(ss, assemblies, aa_queries_file, tblastn,
                              dir_prj_assmbl_blast_results, blast_2_evalue,
                              blast_2_max_hsps, blast_2_qcov_hsp_perc,
                              blast_2_best_hit_overhang,
                              blast_2_best_hit_score_edge,
                              blast_2_max_target_seqs, threads, dir_cache_prj,
                              dir_prj_ips):

    if len(assemblies) > 0:
        print()
        Log.inf('Running BLAST on assemblies:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(1)
    else:
        Log.wrn('There are no assemblies. Nothing to do, stopping.')
        exit(0)

    cache_file = opj(dir_cache_prj, 'blast_2_settings_cache__' + ss)

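    # Settings pickled by the previous run, if any; used below to decide
    # whether BLAST needs to be re-run.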
    pickled = dict()
    settings = {'blast_2_evalue': blast_2_evalue,
                'blast_2_max_hsps': blast_2_max_hsps,
                'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
                'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
                'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
                'blast_2_max_target_seqs': blast_2_max_target_seqs,
                'queries': seq_records_to_dict(
                    read_fasta(aa_queries_file, SEQ_TYPE_AA))}

    Log.msg('evalue:', str(blast_2_evalue))
    Log.msg('max_hsps:', str(blast_2_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_2_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_2_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_2_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_2_max_target_seqs))
    print()

    for a in assemblies:

        assmbl_src = a['src']
        assmbl_name = a['name']

        if assmbl_src != 'user_fasta':
            if assmbl_name.endswith('__' + ss):
                assmbl_name = assmbl_name.replace('__' + ss, '')
            else:
                continue

        assmbl_blast_db_path = a['blast_db_path']
        assmbl_genetic_code = a['gc_id']

        ips_json_dump_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' + ss +
                                 '.json')

        out_blast_path = opj(dir_prj_assmbl_blast_results,
                             assmbl_name + '__' + ss + '.tsv')

        if ope(out_blast_path) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_blast_path) and pickled == settings:
            # The BLAST settings and query sequences have not changed since
            # the previous run; reuse the existing results.
            Log.msg('BLAST results already exist:', assmbl_name)

        else:
            Log.msg('Running tblastn on: ' + assmbl_name, ss)

            if ope(ips_json_dump_path):
                osremove(ips_json_dump_path)

            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=assmbl_blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=out_blast_path,
                      evalue=blast_2_evalue,
                      max_hsps=blast_2_max_hsps,
                      qcov_hsp_perc=blast_2_qcov_hsp_perc,
                      best_hit_overhang=blast_2_best_hit_overhang,
                      best_hit_score_edge=blast_2_best_hit_score_edge,
                      max_target_seqs=blast_2_max_target_seqs,
                      db_genetic_code=assmbl_genetic_code,
                      out_cols=BLST_RES_COLS_2)

        a['blast_hits_aa__' + ss] = parse_blast_results_file(out_blast_path,
                                                             BLST_RES_COLS_2)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)
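The caching above boils down to: pickle the settings used for a run, and on
the next run skip the expensive step if the unpickled settings still match. A
stripped-down, self-contained sketch (the cache file name and expensive_step
are hypothetical):

import os
import pickle

CACHE = 'settings_cache.pickle'
settings = {'evalue': 1e-10, 'max_hsps': 1}

cached = None
if os.path.exists(CACHE):
    with open(CACHE, 'rb') as f:
        cached = pickle.load(f)

if cached == settings:
    print('Settings unchanged; reusing previous results.')
else:
    print('Settings changed; re-running the expensive step.')
    # expensive_step(settings)  # hypothetical stand-in for run_blast

with open(CACHE, 'wb') as f:
    pickle.dump(settings, f, protocol=pickle.HIGHEST_PROTOCOL)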
Example #30
def dep_check_rcorrector(dir_dep, force):
    url = 'https://github.com/karolisr/Rcorrector/archive/master.tar.gz'
    dnld_path = opj(dir_dep, 'rcorrector.tar.gz')

    try:
        try:
            jellyfish = which('jellyfish')
            run([jellyfish, '--help'])
        except Exception:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            jellyfish = opj(dir_bin, 'jellyfish', 'bin', 'jellyfish')
            raise
        if force is True:
            # A bare raise with no active exception triggers a RuntimeError,
            # which the outer handler catches; this forces a fresh download.
            raise
        rcorrector = which('run_rcorrector.pl')
        run([rcorrector, '-version'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            try:
                rcorrector = opj(dir_bin, 'run_rcorrector.pl')
                run([rcorrector, '-version'])
            except Exception:
                Log.wrn('Rcorrector was not found on this system, trying to '
                        'download.')
                raise
            try:
                run([jellyfish, '--version'])
            except Exception:
                Log.wrn(
                    'jellyfish is required by Rcorrector, but was not found. '
                    'Trying to download and recompile Rcorrector and '
                    'jellyfish.')
                raise
        except Exception:
            if ope(dnld_path):
                remove(dnld_path)
            if dir_bin != opj(dir_dep, '') and ope(dir_bin):
                rmtree(dir_bin)
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            try:
                Log.wrn('Compiling Rcorrector.')
                run('make', cwd=dir_bin)
                rcorrector = opj(dir_bin, 'run_rcorrector.pl')
                jellyfish = opj(dir_bin, 'jellyfish', 'bin', 'jellyfish')
                chmod(
                    rcorrector, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
                    | stat.S_IROTH | stat.S_IXOTH)
                run([rcorrector, '-version'])
                if not ope(jellyfish):
                    jellyfish = which('jellyfish')
                run([jellyfish, '--version'])
            except Exception:
                Log.err('Something went wrong while trying to compile '
                        'Rcorrector.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/karolisr/Rcorrector')
                return None

    v = get_dep_version([rcorrector, '-version'], r'^Rcorrector\sv([\d\.]*)')
    Log.msg('Rcorrector is available:', v + ' ' + rcorrector)

    return rcorrector
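All of the dep_check_* examples share the same shape: look for the tool on
PATH, verify it runs, and only then fall back to a locally downloaded copy. A
minimal standard-library sketch of that core idea (find_executable and the
'bin' directory layout are hypothetical simplifications):

import os
import subprocess
from shutil import which

def find_executable(name, dir_dep):
    # Prefer a copy already on PATH, but verify that it actually runs.
    exe = which(name)
    if exe is not None:
        try:
            subprocess.run([exe, '--version'], check=True,
                           capture_output=True)
            return exe
        except Exception:
            pass
    # Fall back to a locally downloaded copy, if one exists.
    local = os.path.join(dir_dep, 'bin', name)
    return local if os.path.exists(local) else None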