Exemple #1
0
def import_contig_lengths(contigs_fasta, log_file, quiet):
    """Return a dict mapping each contig name to its sequence length.

    The contig name is the header text up to the first space, with the
    leading '>' removed. Every other line is right-stripped and its length is
    added to the contig of the most recent header.

    Args:
        contigs_fasta: path of the contigs fasta file to read.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        dict of contig name -> summed length of its sequence lines.

    Exits with status 1, after reporting an error, when a non-header line is
    encountered before the first header line.
    """
    message = 'Gathering contig lengths from {0}.'.format(contigs_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    contig2length = {}
    # None means that no header line has been seen yet.
    contig = None

    with open(contigs_fasta, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('>'):
                contig = line.split(' ')[0].lstrip('>')

                contig2length[contig] = 0
            elif contig is None:
                # A non-header line without a preceding header: the file is
                # not a valid contigs fasta. This is checked explicitly so
                # that unrelated exceptions (e.g. KeyboardInterrupt) are not
                # misreported as a format problem.
                message = '{0} is not a contigs fasta'.format(contigs_fasta)
                shared.give_user_feedback(message,
                                          log_file,
                                          quiet,
                                          error=True)

                sys.exit(1)
            else:
                contig2length[contig] += len(line)

    return contig2length
Exemple #2
0
def find_offspring(taxonomy_folder, fastaid2LCAtaxid_file, log_file, quiet):
    """Collect, for every taxid seen in a lineage, its set of direct offspring.

    Reads nodes.dmp from taxonomy_folder via tax.import_nodes, then looks up
    the lineage of the taxid in the second tab-separated column of every line
    of fastaid2LCAtaxid_file via tax.find_lineage.

    Args:
        taxonomy_folder: folder that contains nodes.dmp.
        fastaid2LCAtaxid_file: path of the tab-separated mapping file.
        log_file: passed through to the feedback and import helpers.
        quiet: passed through to the feedback and import helpers.

    Returns:
        dict of taxid -> set of the taxids that directly precede it in at
        least one lineage.
    """
    nodes_dmp = '{0}/nodes.dmp'.format(taxonomy_folder)
    # Only the parent mapping is needed here; the ranks are discarded.
    taxid2parent, _ = tax.import_nodes(nodes_dmp, log_file, quiet)

    shared.give_user_feedback(
        'Searching nr database for taxids with multiple offspring.',
        log_file, quiet)

    taxid2offspring = {}

    with open(fastaid2LCAtaxid_file, 'r') as f1:
        for record in f1:
            fields = record.rstrip().split('\t')
            lineage = tax.find_lineage(fields[1], taxid2parent)

            # Start at position 1: the first taxid in the lineage does not
            # have a daughter node.
            for position in range(1, len(lineage)):
                parent = lineage[position]
                daughter = lineage[position - 1]

                taxid2offspring.setdefault(parent, set()).add(daughter)

    return taxid2offspring
Exemple #3
0
def import_prot_accession2taxid(prot_accession2taxid_file,
                                prot_accessions_whitelist, log_file, quiet):
    """Load the taxids of whitelisted protein accessions from a gzipped table.

    The first line of the file is treated as a header that must contain the
    columns 'accession.version' and 'taxid'; their positions are used to read
    every following tab-separated line.

    Args:
        prot_accession2taxid_file: path of the gzip-compressed mapping file.
        prot_accessions_whitelist: container of accessions to keep.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        dict of protein accession -> taxid (as read from the file, a string).
    """
    shared.give_user_feedback(
        'Loading file {0}.'.format(prot_accession2taxid_file),
        log_file, quiet)

    prot_accession2taxid = {}

    with gzip.open(prot_accession2taxid_file, 'rb') as f1:
        for line_number, raw_line in enumerate(f1):
            fields = raw_line.decode('utf-8').rstrip().split('\t')

            if line_number == 0:
                # Header line: remember where the relevant columns live.
                accession_column = fields.index('accession.version')
                taxid_column = fields.index('taxid')

                continue

            accession = fields[accession_column]

            if accession in prot_accessions_whitelist:
                prot_accession2taxid[accession] = fields[taxid_column]

    return prot_accession2taxid
Exemple #4
0
def make_fastaid2LCAtaxid_file(nodes_dmp, fastaid2LCAtaxid_file, nr_file,
                               prot_accession2taxid_file, taxid2parent,
                               log_file, quiet):
    """Write a tab-separated file mapping each fasta id to its LCA taxid.

    Headers are imported with import_headers_nr and accessions are mapped to
    taxids with import_prot_accession2taxid. For every fasta id the lineages
    of all its protein accessions are gathered and tax.find_LCA is applied to
    them. Fasta ids without any usable lineage are counted but not written.

    Args:
        nodes_dmp: not used inside this function (kept for the callers).
        fastaid2LCAtaxid_file: path of the output file to create.
        nr_file: passed to import_headers_nr.
        prot_accession2taxid_file: passed to import_prot_accession2taxid.
        taxid2parent: parent mapping handed to tax.find_lineage.
        log_file: passed through to the feedback and import helpers.
        quiet: passed through to the feedback and import helpers.

    Returns:
        None.
    """
    (fastaid2prot_accessions,
     prot_accessions_whitelist) = import_headers_nr(nr_file, log_file, quiet)
    prot_accession2taxid = import_prot_accession2taxid(
        prot_accession2taxid_file, prot_accessions_whitelist, log_file, quiet)

    message = 'Finding LCA of all protein accession numbers in fasta headers.'
    shared.give_user_feedback(message, log_file, quiet)

    no_taxid = 0
    corrected = 0
    total = 0
    with open(fastaid2LCAtaxid_file, 'w') as outf1:
        for fastaid, prot_accessions in fastaid2prot_accessions.items():
            list_of_lineages = []
            for prot_accession in prot_accessions:
                try:
                    taxid = prot_accession2taxid[prot_accession]
                    lineage = tax.find_lineage(taxid, taxid2parent)
                except Exception:
                    # This accounts for missing accession numbers in
                    # prot.accession2taxid and missing nodes in nodes.dmp.
                    # Exception (instead of a bare except) keeps the
                    # best-effort skip but lets KeyboardInterrupt and
                    # SystemExit propagate.
                    continue

                list_of_lineages.append(lineage)

            total += 1

            if not list_of_lineages:
                # This accounts for entries that only contain accession numbers
                # that are missing in prot.accession2taxid or whose taxid is
                # missing in nodes.dmp. NOTE that these entries are thus not
                # present in the output file.
                no_taxid += 1

                continue

            LCAtaxid = tax.find_LCA(list_of_lineages)

            outf1.write('{0}\t{1}\n'.format(fastaid, LCAtaxid))

            if (fastaid not in prot_accession2taxid
                    or LCAtaxid != prot_accession2taxid[fastaid]):
                # If the fastaid cannot be found in prot.accession2taxid, but
                # a taxid is given to the fastaid based on secondary accession
                # numbers, or if the taxid of the header is different from the
                # LCA taxid, it is counted as corrected.
                corrected += 1

    # Guard the percentages: with no headers at all, total is 0 and the
    # divisions would raise ZeroDivisionError.
    if total:
        corrected_percentage = corrected / total * 100
        no_taxid_percentage = no_taxid / total * 100
    else:
        corrected_percentage = 0.0
        no_taxid_percentage = 0.0

    message = (
        'Done! File {0} is created. '
        '{1:,d} of {2:,d} headers ({3:.1f}%) corrected. '
        '{4:,d} headers ({5:.1f}%) do not have a taxid assigned.'.format(
            fastaid2LCAtaxid_file, corrected, total, corrected_percentage,
            no_taxid, no_taxid_percentage))
    shared.give_user_feedback(message, log_file, quiet)

    return
Exemple #5
0
def write_taxids_with_multiple_offspring_file(
        taxids_with_multiple_offspring_file, taxid2offspring, log_file, quiet):
    """Write every taxid that has at least two offspring, one per line.

    Args:
        taxids_with_multiple_offspring_file: path of the output file.
        taxid2offspring: mapping of taxid -> sized collection of offspring.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.
    """
    shared.give_user_feedback(
        'Writing {0}.'.format(taxids_with_multiple_offspring_file),
        log_file, quiet)

    with open(taxids_with_multiple_offspring_file, 'w') as outf1:
        for taxid, offspring in taxid2offspring.items():
            if len(offspring) < 2:
                # Zero or one offspring: not a branching taxid.
                continue

            outf1.write('{0}\n'.format(taxid))
Exemple #6
0
def check_in_and_output_file(input_file, output_file, log_file, quiet):
    """Report an error when the input and output file are the same.

    Args:
        input_file: input file name to compare.
        output_file: output file name to compare.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when both names are equal (an error was reported), else False.
    """
    if input_file == output_file:
        shared.give_user_feedback(
            'input file and output file can not be the same.',
            log_file, quiet, error=True)

        return True

    return False
Exemple #7
0
def check_input_file(input_file, log_file, quiet):
    """Report an error when the input file does not exist as a regular file.

    Args:
        input_file: path to check with os.path.isfile.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when the file is missing (an error was reported), else False.
    """
    if os.path.isfile(input_file):
        return False

    shared.give_user_feedback(
        'ERROR: input file {0} does not exist.'.format(input_file),
        log_file, quiet, error=True)

    return True
Exemple #8
0
def check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs,
                                            log_file, quiet):
    """Exit when a contig with predicted ORFs is not a known contig name.

    Args:
        contig_names: container of the known contig names.
        contig2ORFs: mapping of contig name -> sequence of its ORF names.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Exits with status 1, after reporting the first ORF of the offending
    contig, as soon as an unknown contig is found.
    """
    for contig, ORFs in contig2ORFs.items():
        if contig in contig_names:
            continue

        message = ('ERROR: found a protein in the predicted proteins '
                   'fasta file that can not be traced back to one of the '
                   'contigs in the contigs fasta file: {0}. Proteins '
                   'should be named contig_name_#.'
                   ''.format(ORFs[0]))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)
Exemple #9
0
def check_output_file(output_file, log_file, quiet):
    """Report an error when the output file already exists.

    Args:
        output_file: path to check with os.path.isfile.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when the file exists (an error was reported), else False.
    """
    if not os.path.isfile(output_file):
        return False

    message = ('ERROR: output file {0} already exists. You can choose to '
               'overwrite existing files with the [--force] argument.'
               ''.format(output_file))
    shared.give_user_feedback(message, log_file, quiet, error=True)

    return True
Exemple #10
0
def check_md5_gz(gz_file, md5_file, log_file, quiet):
    """Compare the MD5 checksum of gz_file with the one stored in md5_file.

    The expected checksum is the first whitespace-separated token of
    md5_file. When md5_file holds no token at all only a warning is given.

    Args:
        gz_file: path of the file whose checksum is computed.
        md5_file: path of the file holding the expected checksum.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        None.

    Exits with status 1, after reporting an error, when the checksums differ.
    """
    message = 'Checking file integrity via MD5 checksum.'
    shared.give_user_feedback(message, log_file, quiet)

    with open(md5_file, 'r') as f:
        # Split on any whitespace so that a file that only holds the checksum
        # followed by a newline, or that separates checksum and file name with
        # a tab, still yields a clean checksum token.
        fields = f.read().split()

    md5_exp = fields[0] if fields else ''

    if md5_exp == '':
        message = ('WARNING: no MD5 found in {0}. Integrity of {1} can not be '
                   'established.'.format(md5_file, gz_file))
        shared.give_user_feedback(message, log_file, quiet)

        return

    digest = hashlib.md5()

    # Read in blocks so that large files do not have to fit in memory.
    block_size = 4096
    with open(gz_file, 'rb') as f:
        for chunk in iter(lambda: f.read(block_size), b''):
            digest.update(chunk)

    if digest.hexdigest() != md5_exp:
        message = 'MD5 of {0} does not check out.'.format(gz_file)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'MD5 of {0} checks out.'.format(gz_file)
    shared.give_user_feedback(message, log_file, quiet)

    return
Exemple #11
0
def download_nr(nr_file, log_file, quiet):
    """Download nr.gz and its .md5 file from NCBI and verify the checksum.

    Args:
        nr_file: path to which nr.gz is downloaded; the checksum file is
            stored next to it as '<nr_file>.md5'.
        log_file: passed through to the feedback and check helpers.
        quiet: passed through to the feedback and check helpers.

    Returns:
        None.

    Exits with status 1, after reporting an error, when a download fails.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/'
    message = 'Downloading nr database from {0} to database folder.'.format(
        url)
    shared.give_user_feedback(message, log_file, quiet)

    nr_url = '{0}nr.gz'.format(url)
    md5_url = '{0}.md5'.format(nr_url)
    md5_file = '{0}.md5'.format(nr_file)

    # The database itself is fetched first, then its checksum file.
    for source, target in ((nr_url, nr_file), (md5_url, md5_file)):
        try:
            urllib.request.urlretrieve(source, target)
        except Exception:
            # Exception instead of a bare except: any download error is still
            # reported, but KeyboardInterrupt and SystemExit propagate.
            message = 'download of {0} failed.'.format(source)
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    message = 'Download complete.'
    shared.give_user_feedback(message, log_file, quiet)

    check.check_md5_gz(nr_file, md5_file, log_file, quiet)

    return
Exemple #12
0
def download_taxonomy_files(taxonomy_folder, date, log_file, quiet):
    """Download taxdump.tar.gz from NCBI and extract it in taxonomy_folder.

    The archive is stored as '<taxonomy_folder>/<date>.taxdump.tar.gz' before
    it is extracted into taxonomy_folder.

    Args:
        taxonomy_folder: folder that receives the archive and its contents.
        date: prefix used in the name of the downloaded archive.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Exits with status 1, after reporting an error, when the download or the
    extraction fails.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
    tmp_taxonomy_file = '{0}/{1}.taxdump.tar.gz'.format(taxonomy_folder, date)

    message = ('Downloading and extracting taxonomy files from {0} to {1}.'
               ''.format(url, taxonomy_folder))
    shared.give_user_feedback(message, log_file, quiet)

    try:
        urllib.request.urlretrieve(url, tmp_taxonomy_file)
    except Exception:
        # Exception instead of a bare except: download errors are still
        # reported, but KeyboardInterrupt and SystemExit propagate.
        message = 'ERROR: download of taxonomy files failed.'
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    try:
        with tarfile.open(tmp_taxonomy_file) as tar:
            if hasattr(tarfile, 'data_filter'):
                # The archive comes from the network: where the interpreter
                # supports extraction filters, refuse members that would
                # escape the destination folder.
                tar.extractall(taxonomy_folder, filter='data')
            else:
                tar.extractall(taxonomy_folder)
    except Exception:
        message = ('ERROR: something went wrong while extracting the taxonomy '
                   'files.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'Download complete!'
    shared.give_user_feedback(message, log_file, quiet)
Exemple #13
0
def download_prot_accession2taxid_file(prot_accession2taxid_file, date,
                                       log_file, quiet):
    """Download prot.accession2taxid.FULL.gz and its .md5 file from NCBI.

    After both downloads the checksum is verified with check.check_md5_gz.

    Args:
        prot_accession2taxid_file: path to which the mapping file is
            downloaded; the checksum file is stored next to it with an
            added '.md5' suffix.
        date: not used inside this function (kept for the callers).
        log_file: passed through to the feedback and check helpers.
        quiet: passed through to the feedback and check helpers.

    Returns:
        None.

    Exits with status 1, after reporting an error, when a download fails.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/'
    message = 'Downloading mapping file from {0} to taxonomy folder.'.format(
        url)
    shared.give_user_feedback(message, log_file, quiet)

    mapping_url = '{0}prot.accession2taxid.FULL.gz'.format(url)
    md5_url = '{0}.md5'.format(mapping_url)
    md5_file = '{0}.md5'.format(prot_accession2taxid_file)

    # The mapping file itself is fetched first, then its checksum file.
    downloads = ((mapping_url, prot_accession2taxid_file),
                 (md5_url, md5_file))
    for source, target in downloads:
        try:
            urllib.request.urlretrieve(source, target)
        except Exception:
            # Exception instead of a bare except: any download error is still
            # reported, but KeyboardInterrupt and SystemExit propagate.
            message = 'download of {0} failed.'.format(source)
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    message = 'Download complete.'
    shared.give_user_feedback(message, log_file, quiet)

    check.check_md5_gz(prot_accession2taxid_file, md5_file, log_file, quiet)

    return
Exemple #14
0
def import_taxids_with_multiple_offspring(taxids_with_multiple_offspring_file,
                                          log_file, quiet):
    """Read a file with one taxid per line into a set.

    Args:
        taxids_with_multiple_offspring_file: path of the file to read.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        set with every line of the file, right-stripped of whitespace.
    """
    shared.give_user_feedback(
        'Importing file {0}.'.format(taxids_with_multiple_offspring_file),
        log_file, quiet)

    with open(taxids_with_multiple_offspring_file, 'r') as f1:
        return {record.rstrip() for record in f1}
Exemple #15
0
def import_prot_accession2taxid(prot_accession2taxid_file, log_file, quiet):
    """Load a complete gzipped accession-to-taxid table into a dict.

    Every line is split on tabs; the second field becomes the key and the
    third field the value. No line is skipped, so a header line (if the file
    has one) ends up in the mapping as well.

    Args:
        prot_accession2taxid_file: path of the gzip-compressed mapping file.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        dict of second column -> third column, both as strings.
    """
    shared.give_user_feedback(
        'Loading {0} into memory. Please be patient...'
        ''.format(prot_accession2taxid_file),
        log_file, quiet)

    with gzip.open(prot_accession2taxid_file, 'rt') as f1:
        rows = (record.split('\t') for record in f1)

        return {row[1]: row[2] for row in rows}
Exemple #16
0
def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet):
    """Load the LCA taxids of the fasta ids that occur in all_hits.

    Args:
        fastaid2LCAtaxid_file: path of the tab-separated mapping file, opened
            with shared.open_maybe_gzip.
        all_hits: container of the fasta ids to keep.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        dict of fasta id (first column) -> LCA taxid (second column).
    """
    shared.give_user_feedback(
        'Importing file {0}.'.format(fastaid2LCAtaxid_file), log_file, quiet)

    fastaid2LCAtaxid = {}

    with shared.open_maybe_gzip(fastaid2LCAtaxid_file, 'rt') as f1:
        for record in f1:
            fields = record.rstrip().split('\t')
            fastaid = fields[0]

            if fastaid not in all_hits:
                # Only include fastaids that are found in hits.
                continue

            fastaid2LCAtaxid[fastaid] = fields[1]

    return fastaid2LCAtaxid
Exemple #17
0
def check_top(top, r, log_file, quiet):
    """Validate the [--top] setting against the [-r / --range] setting.

    A warning is given when top is below 50; an error is reported when top is
    not strictly higher than r.

    Args:
        top: value of the [--top] parameter.
        r: value of the [-r / --range] parameter.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when top <= r (an error was reported), else False.
    """
    if top < 50:
        shared.give_user_feedback(
            'WARNING: [--top] is set lower than 50. This might '
            'conflict with future runs with higher settings of the '
            '[-r / --range] parameter, see README.md.',
            log_file, quiet)

    if top <= r:
        shared.give_user_feedback(
            'ERROR: [--top] should be higher than [-r / --range].',
            log_file, quiet, error=True)

        return True

    return False
Exemple #18
0
def check_bin_fasta(bin_fasta, log_file, quiet):
    """Check that bin_fasta passes check_fasta and is not a directory.

    Args:
        bin_fasta: path of the bin fasta file to check.
        log_file: passed through to the check and feedback helpers.
        quiet: passed through to the check and feedback helpers.

    Returns:
        True when at least one of the two checks failed, else False.
    """
    # Both checks are always run so that every problem gets reported.
    error = bool(check_fasta(bin_fasta, log_file, quiet))

    if os.path.isdir(bin_fasta):
        message = (
            '{0} is a directory. If you want to classify more than 1 bin '
            'you can run \'CAT bins\' instead of \'CAT bin\'.'.format(
                bin_fasta))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    return error
Exemple #19
0
def make_concatenated_fasta(concatenated_fasta, bin2contigs, bin_folder,
                            log_file, quiet):
    """Concatenate all bin fasta files into one file with renamed contigs.

    The bins are processed in sorted order. Every header is rewritten to
    '>{bin}_{contig}', where contig is the header text up to the first space;
    all other lines are copied unchanged.

    Args:
        concatenated_fasta: path of the output fasta file.
        bin2contigs: iterable of bin file names (only its keys are used).
        bin_folder: folder in which the bin files are located.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.
    """
    shared.give_user_feedback(
        'Writing {0}.'.format(concatenated_fasta), log_file, quiet)

    with open(concatenated_fasta, 'w') as outf1:
        for bin_ in sorted(bin2contigs):
            bin_path = '{0}/{1}'.format(bin_folder, bin_)

            with open(bin_path, 'r') as f1:
                for record in f1:
                    if not record.startswith('>'):
                        # Sequence line: copy as is.
                        outf1.write(record)

                        continue

                    first_token = record.split(' ')[0]
                    contig = first_token.rstrip().lstrip('>')

                    # add bin name in front of the contig name.
                    outf1.write('>{0}_{1}\n'.format(bin_, contig))
Exemple #20
0
def import_names(names_dmp, log_file, quiet):
    """Load the scientific name of every taxid from a names.dmp file.

    Lines are split on tabs; only lines whose seventh field equals
    'scientific name' are used.

    Args:
        names_dmp: path of the names file, opened with
            shared.open_maybe_gzip.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        dict of taxid (first field) -> name (third field).
    """
    shared.give_user_feedback(
        'Importing file {0}.'.format(names_dmp), log_file, quiet)

    taxid2name = {}

    with shared.open_maybe_gzip(names_dmp, 'rt') as f1:
        for record in f1:
            fields = record.split('\t')

            if fields[6] != 'scientific name':
                continue

            taxid2name[fields[0]] = fields[2]

    return taxid2name
Exemple #21
0
def memory_bottleneck(args):
    """Exit when check.check_memory reports an error for args.min_mem.

    Args:
        args: object with the attributes min_mem, log_file and quiet.

    Returns:
        None when the memory check does not report an error.

    Exits with status 1, after reporting the required and the found amount of
    memory, otherwise.
    """
    total_memory, error = check.check_memory(args.min_mem)

    if not error:
        return

    message = ('at least {0}GB of memory is needed for the database '
               'construction. {1}GB is found on your system. You can try '
               'to find a machine with more memory, or download '
               'preconstructed database files from '
               'tbb.bio.uu.nl/bastiaan/CAT_prepare/.'.format(
                   args.min_mem, total_memory))
    shared.give_user_feedback(message, args.log_file, args.quiet, error=True)

    sys.exit(1)
Exemple #22
0
def check_bin_fasta(bin_fasta, log_file, quiet):
    """Check that bin_fasta is a fasta file and not a directory.

    Args:
        bin_fasta: path of the bin fasta file to check.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when at least one of the two checks failed, else False.
    """
    problems = []

    if not check_whether_file_is_fasta(bin_fasta):
        problems.append('ERROR: {0} is not a fasta file.'.format(bin_fasta))

    if os.path.isdir(bin_fasta):
        problems.append(
            'ERROR: {0} is a directory. If you want to classify more '
            'than 1 bin you can run \'CAT bins\' instead of '
            '\'CAT bin\'.'.format(bin_fasta))

    # Report every problem that was found, in the order of the checks.
    for message in problems:
        shared.give_user_feedback(message, log_file, quiet, error=True)

    return bool(problems)
Exemple #23
0
def check_prodigal_binaries(path_to_prodigal, log_file, quiet):
    """Check that Prodigal can be started and report what it prints for '-v'.

    Args:
        path_to_prodigal: path or name of the Prodigal executable.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when an OSError occurred (an error was reported), else False.
    """
    try:
        process = subprocess.Popen([path_to_prodigal, '-v'],
                                   stderr=subprocess.PIPE)
        # The version text is read from stderr.
        _, stderr_bytes = process.communicate()
        version = stderr_bytes.decode().strip()

        shared.give_user_feedback(
            'Prodigal found: {0}.'.format(version), log_file, quiet)
    except OSError:
        message = ('ERROR: can not find Prodigal. Please check whether it is '
                   'installed or path to the binaries is provided.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        return True

    return False
Exemple #24
0
def check_diamond_binaries(path_to_diamond, log_file, quiet):
    """Check that DIAMOND can be started and report its '--version' output.

    Args:
        path_to_diamond: path or name of the DIAMOND executable.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when an OSError occurred (an error was reported), else False.
    """
    try:
        process = subprocess.Popen([path_to_diamond, '--version'],
                                   stdout=subprocess.PIPE)
        # The version text is read from stdout.
        stdout_bytes, _ = process.communicate()
        version = stdout_bytes.decode().rstrip()

        shared.give_user_feedback(
            'DIAMOND found: {0}.'.format(version), log_file, quiet)
    except OSError:
        message = ('ERROR: can not find DIAMOND. Please check whether it is '
                   'installed or path to the binaries is provided.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        return True

    return False
Exemple #25
0
def import_nodes(nodes_dmp, log_file, quiet):
    """Load the parent and the rank of every taxid from a nodes.dmp file.

    Lines are split on tabs; the first field is used as taxid, the third as
    its parent and the fifth as its rank.

    Args:
        nodes_dmp: path of the nodes file to read.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        tuple (taxid2parent, taxid2rank) of two dicts keyed by taxid.
    """
    shared.give_user_feedback(
        'Importing file {0}.'.format(nodes_dmp), log_file, quiet)

    taxid2parent = {}
    taxid2rank = {}

    with open(nodes_dmp, 'r') as f1:
        for record in f1:
            fields = record.split('\t')
            taxid, parent, rank = fields[0], fields[2], fields[4]

            taxid2parent[taxid] = parent
            taxid2rank[taxid] = rank

    return (taxid2parent, taxid2rank)
Exemple #26
0
def check_fasta(file_, log_file, quiet):
    """Check that file_ exists and that its first line starts with '>'.

    A file without any line is not reported as an error.

    Args:
        file_: path of the file to check.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when the check failed (an error was reported), else False.
    """
    if os.path.isfile(file_):
        with open(file_, 'r') as f1:
            # Only the first line is inspected; it is '' for an empty file.
            first_line = f1.readline()

        error = first_line != '' and not first_line.startswith('>')
    else:
        error = True

    if error:
        shared.give_user_feedback(
            '{0} is not a fasta file.'.format(file_),
            log_file, quiet, error=True)

    return error
Exemple #27
0
def check_out_prefix(out_prefix, log_file, quiet):
    """Validate the prefix that is used for the output files.

    A prefix without '/' is always accepted. Otherwise an error is reported
    when the prefix ends with '/', and when the part before the last '/' is
    not an existing directory.

    Args:
        out_prefix: prefix (optionally including a path) for output files.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when at least one error was reported, else False.
    """
    if '/' not in out_prefix:
        return False

    error = False

    if out_prefix.endswith('/'):
        message = ('ERROR: prefix for output files ({0}) appears to be a '
                   'directory.'.format(out_prefix))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    # Everything before the last '/' is the directory part of the prefix.
    directory = out_prefix.rpartition('/')[0]

    if not os.path.isdir(directory):
        message = ('ERROR: can not find output directory {0} to which '
                   'output files should be written.'.format(directory))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    return error
Exemple #28
0
def check_out_prefix(out_prefix, log_file, quiet):
    """Validate the prefix that is used for the output files.

    An error is reported when the prefix itself is an existing directory, and
    when the directory part of the prefix does not exist. A prefix without a
    directory part refers to the current working directory.

    Args:
        out_prefix: prefix (optionally including a path) for output files.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Returns:
        True when at least one error was reported, else False.
    """
    error = False

    if os.path.isdir(out_prefix):
        message = 'prefix for output files ({0}) is a directory.'.format(
            out_prefix)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    # os.path.dirname returns '' for a bare prefix such as 'out.CAT'; such
    # files end up in the current directory. It also maps '/name' to '/'
    # instead of to an empty string.
    dir_ = os.path.dirname(out_prefix) or '.'

    if not os.path.isdir(dir_):
        message = ('can not find output directory {0} to which output files '
                   'should be written.'.format(dir_))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    return error
Exemple #29
0
def make_diamond_database(path_to_diamond,
                          nr_file,
                          diamond_database_prefix,
                          nproc,
                          log_file,
                          quiet):
    """Build a DIAMOND database from nr_file by running 'diamond makedb'.

    Args:
        path_to_diamond: path or name of the DIAMOND executable.
        nr_file: input file handed to DIAMOND with '--in'.
        diamond_database_prefix: database name handed to DIAMOND with '-d'.
        nproc: number of cores handed to DIAMOND with '-p'.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.

    Exits with status 1, after reporting an error, when DIAMOND can not be
    run or returns a non-zero exit status.
    """
    message = ('Constructing DIAMOND database {0}.dmnd from {1} '
               'using {2} cores. Please be patient...'
               ''.format(diamond_database_prefix, nr_file, nproc))
    shared.give_user_feedback(message, log_file, quiet)

    command = [path_to_diamond, 'makedb',
               '--in', nr_file,
               '-d', diamond_database_prefix,
               '-p', str(nproc),
               '--quiet']
    try:
        subprocess.check_call(command)
    except Exception:
        # Exception instead of a bare except: a failing or missing DIAMOND is
        # still reported, but KeyboardInterrupt and SystemExit propagate.
        message = 'ERROR: DIAMOND database could not be created.'
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'DIAMOND database constructed!'
    shared.give_user_feedback(message, log_file, quiet)
Exemple #30
0
def make_diamond_database(path_to_diamond, nr_file, diamond_database_prefix,
                          nproc, log_file, quiet, verbose):
    """Build a DIAMOND database from nr_file by running 'diamond makedb'.

    Args:
        path_to_diamond: path or name of the DIAMOND executable.
        nr_file: input file handed to DIAMOND with '--in'.
        diamond_database_prefix: database name handed to DIAMOND with '-d'.
        nproc: number of cores handed to DIAMOND with '-p'.
        log_file: passed through to shared.give_user_feedback.
        quiet: passed through to shared.give_user_feedback.
        verbose: when false, '--quiet' is added to the DIAMOND command.

    Returns:
        None.

    Exits with status 1, after reporting an error, when DIAMOND can not be
    run or returns a non-zero exit status.
    """
    message = ('Constructing DIAMOND database {0}.dmnd from {1} using {2} '
               'cores.'.format(diamond_database_prefix, nr_file, nproc))
    shared.give_user_feedback(message, log_file, quiet)

    command = [
        path_to_diamond, 'makedb', '--in', nr_file, '-d',
        diamond_database_prefix, '-p',
        str(nproc)
    ]

    if not verbose:
        command.append('--quiet')

    try:
        subprocess.check_call(command)
    except Exception:
        # Exception instead of a bare except: a failing or missing DIAMOND is
        # still reported, but KeyboardInterrupt and SystemExit propagate.
        message = 'DIAMOND database could not be created.'
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'DIAMOND database constructed.'
    shared.give_user_feedback(message, log_file, quiet)

    return