def import_contig_lengths(contigs_fasta, log_file, quiet):
    """Parse a contigs fasta file and return a dict of contig name -> length.

    Exits with an error if a sequence line is found before any header line,
    i.e. the file is not a valid fasta file. Replaces the original bare
    ``except:`` (which silently relied on a NameError) with an explicit check.
    """
    message = 'Gathering contig lengths from {0}.'.format(contigs_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    contig2length = {}

    contig = None
    with open(contigs_fasta, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('>'):
                # Contig name is the header up to the first space.
                contig = line.split(' ')[0].lstrip('>')
                contig2length[contig] = 0
            else:
                if contig is None:
                    # Sequence data before any header line: not a fasta file.
                    message = '{0} is not a contigs fasta'.format(
                        contigs_fasta)
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)

                    sys.exit(1)

                contig2length[contig] += len(line)

    return contig2length
def find_offspring(taxonomy_folder, fastaid2LCAtaxid_file, log_file, quiet):
    """Map each taxid to the set of direct child taxids seen in nr lineages."""
    nodes_dmp = '{0}/nodes.dmp'.format(taxonomy_folder)
    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)

    message = 'Searching nr database for taxids with multiple offspring.'
    shared.give_user_feedback(message, log_file, quiet)

    taxid2offspring = {}
    with open(fastaid2LCAtaxid_file, 'r') as f1:
        for line in f1:
            fields = line.rstrip().split('\t')
            lineage = tax.find_lineage(fields[1], taxid2parent)

            # Walk the lineage from tip to root; every ancestor records the
            # taxid directly below it as its offspring.
            for (position, ancestor) in enumerate(lineage):
                if position == 0:
                    # The first taxid in the lineage has no daughter node.
                    continue

                taxid2offspring.setdefault(ancestor, set()).add(
                    lineage[position - 1])

    return taxid2offspring
def import_prot_accession2taxid(prot_accession2taxid_file, prot_accessions_whitelist, log_file, quiet):
    """Load accession -> taxid mappings, restricted to a whitelist.

    The mapping file is the gzipped NCBI prot.accession2taxid file; its first
    line is a header naming the columns, which is used to locate the
    'accession.version' and 'taxid' fields.
    """
    message = 'Loading file {0}.'.format(prot_accession2taxid_file)
    shared.give_user_feedback(message, log_file, quiet)

    prot_accession2taxid = {}

    with gzip.open(prot_accession2taxid_file, 'rb') as f1:
        for n, raw in enumerate(f1):
            fields = raw.decode('utf-8').rstrip().split('\t')

            if n == 0:
                # Header line: find the relevant columns by name.
                accession_index = fields.index('accession.version')
                taxid_index = fields.index('taxid')

                continue

            accession = fields[accession_index]
            if accession in prot_accessions_whitelist:
                prot_accession2taxid[accession] = fields[taxid_index]

    return prot_accession2taxid
def make_fastaid2LCAtaxid_file(nodes_dmp, fastaid2LCAtaxid_file, nr_file, prot_accession2taxid_file, taxid2parent, log_file, quiet):
    """Write a file mapping each nr fasta header to the LCA taxid of its accessions.

    For every header in nr, the lineages of all its protein accession numbers
    are looked up and their lowest common ancestor is written to
    fastaid2LCAtaxid_file. Headers for which no taxid can be found are skipped
    and counted. Fixes: the bare ``except`` is narrowed to KeyError, and a
    ZeroDivisionError on an empty nr file is guarded against.
    """
    (fastaid2prot_accessions,
     prot_accessions_whitelist) = import_headers_nr(nr_file, log_file, quiet)
    prot_accession2taxid = import_prot_accession2taxid(
        prot_accession2taxid_file, prot_accessions_whitelist, log_file, quiet)

    message = 'Finding LCA of all protein accession numbers in fasta headers.'
    shared.give_user_feedback(message, log_file, quiet)

    no_taxid = 0
    corrected = 0
    total = 0
    with open(fastaid2LCAtaxid_file, 'w') as outf1:
        for fastaid, prot_accessions in fastaid2prot_accessions.items():
            list_of_lineages = []
            for prot_accession in prot_accessions:
                try:
                    taxid = prot_accession2taxid[prot_accession]
                    lineage = tax.find_lineage(taxid, taxid2parent)
                    list_of_lineages.append(lineage)
                except KeyError:
                    # This accounts for missing accession numbers in
                    # prot.accession2taxid and missing nodes in nodes.dmp.
                    continue

            total += 1

            if len(list_of_lineages) == 0:
                # This accounts for entries that only contain accession numbers
                # that are missing in prot.accession2taxid or whose taxid is
                # missing in nodes.dmp. NOTE that these entries are thus not
                # present in the output file.
                no_taxid += 1

                continue

            LCAtaxid = tax.find_LCA(list_of_lineages)
            outf1.write('{0}\t{1}\n'.format(fastaid, LCAtaxid))

            if (fastaid not in prot_accession2taxid or
                    LCAtaxid != prot_accession2taxid[fastaid]):
                # If the fastaid cannot be found in prot.accession2taxid, but
                # a taxid is given to the fastaid based on secondary accession
                # numbers, or if the taxid of the header is different from the
                # LCA taxid, it is counted as corrected.
                corrected += 1

    if total == 0:
        # Guard against division by zero when no headers were found at all.
        message = 'Done! File {0} is created. No headers were found.'.format(
            fastaid2LCAtaxid_file)
    else:
        message = (
            'Done! File {0} is created. '
            '{1:,d} of {2:,d} headers ({3:.1f}%) corrected. '
            '{4:,d} headers ({5:.1f}%) do not have a taxid assigned.'.format(
                fastaid2LCAtaxid_file, corrected, total,
                corrected / total * 100,
                no_taxid, no_taxid / total * 100))
    shared.give_user_feedback(message, log_file, quiet)

    return
def write_taxids_with_multiple_offspring_file(taxids_with_multiple_offspring_file, taxid2offspring, log_file, quiet):
    """Write every taxid that has two or more distinct offspring, one per line."""
    message = 'Writing {0}.'.format(taxids_with_multiple_offspring_file)
    shared.give_user_feedback(message, log_file, quiet)

    with open(taxids_with_multiple_offspring_file, 'w') as outf1:
        for (taxid, offspring) in taxid2offspring.items():
            if len(offspring) >= 2:
                outf1.write('{0}\n'.format(taxid))
def check_in_and_output_file(input_file, output_file, log_file, quiet):
    """Return True (error) if the input and output paths are identical."""
    if input_file != output_file:
        return False

    message = 'input file and output file can not be the same.'
    shared.give_user_feedback(message, log_file, quiet, error=True)

    return True
def check_input_file(input_file, log_file, quiet):
    """Return True (error) if the input file does not exist on disk."""
    if os.path.isfile(input_file):
        return False

    message = 'ERROR: input file {0} does not exist.'.format(input_file)
    shared.give_user_feedback(message, log_file, quiet, error=True)

    return True
def check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs, log_file, quiet):
    """Exit with an error if a predicted protein maps to an unknown contig.

    Every contig that has predicted ORFs must also appear among the contig
    names from the contigs fasta file; proteins are expected to be named
    contig_name_#.
    """
    for contig in contig2ORFs:
        if contig in contig_names:
            continue

        message = ('ERROR: found a protein in the predicted proteins '
                   'fasta file that can not be traced back to one of the '
                   'contigs in the contigs fasta file: {0}. Proteins '
                   'should be named contig_name_#.'
                   ''.format(contig2ORFs[contig][0]))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)
def check_output_file(output_file, log_file, quiet):
    """Return True (error) if the output file already exists."""
    if not os.path.isfile(output_file):
        return False

    message = ('ERROR: output file {0} already exists. You can choose to '
               'overwrite existing files with the [--force] argument.'
               ''.format(output_file))
    shared.give_user_feedback(message, log_file, quiet, error=True)

    return True
def check_md5_gz(gz_file, md5_file, log_file, quiet):
    """Verify the MD5 checksum of gz_file against the one stored in md5_file.

    Exits if the checksums do not match. If md5_file contains no checksum a
    warning is given but the run continues.
    """
    message = 'Checking file integrity via MD5 checksum.'
    shared.give_user_feedback(message, log_file, quiet)

    with open(md5_file, 'r') as f:
        expected = f.read().split(' ')[0]

    if expected == '':
        message = ('WARNING: no MD5 found in {0}. Integrity of {1} can not be '
                   'established.'.format(md5_file, gz_file))
        shared.give_user_feedback(message, log_file, quiet)

        return

    hasher = hashlib.md5()
    with open(gz_file, 'rb') as f:
        # Hash in 4 kB chunks to keep memory use constant for large files.
        for chunk in iter(lambda: f.read(4096), b''):
            hasher.update(chunk)

    observed = hasher.hexdigest()

    if observed != expected:
        message = 'MD5 of {0} does not check out.'.format(gz_file)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'MD5 of {0} checks out.'.format(gz_file)
    shared.give_user_feedback(message, log_file, quiet)

    return
def download_nr(nr_file, log_file, quiet):
    """Download the nr database and its MD5 file, then verify the checksum.

    Exits if either download fails or the checksum does not match. The bare
    ``except:`` clauses are narrowed to Exception so that KeyboardInterrupt
    and SystemExit still propagate.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/'
    message = 'Downloading nr database from {0} to database folder.'.format(
        url)
    shared.give_user_feedback(message, log_file, quiet)

    url = '{0}nr.gz'.format(url)
    try:
        urllib.request.urlretrieve(url, nr_file)
    except Exception:
        message = 'download of {0} failed.'.format(url)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    url = '{0}.md5'.format(url)
    md5_file = '{0}.md5'.format(nr_file)
    try:
        urllib.request.urlretrieve(url, md5_file)
    except Exception:
        message = 'download of {0} failed.'.format(url)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'Download complete.'
    shared.give_user_feedback(message, log_file, quiet)

    check.check_md5_gz(nr_file, md5_file, log_file, quiet)

    return
def download_taxonomy_files(taxonomy_folder, date, log_file, quiet):
    """Download the NCBI taxonomy dump and extract it into taxonomy_folder.

    Exits if the download or the extraction fails. Fixes the 'donwload' typo
    in the error message and narrows the bare ``except:`` clauses to
    Exception so KeyboardInterrupt and SystemExit still propagate.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
    tmp_taxonomy_file = '{0}/{1}.taxdump.tar.gz'.format(taxonomy_folder, date)

    message = ('Downloading and extracting taxonomy files from {0} to {1}.'
               ''.format(url, taxonomy_folder))
    shared.give_user_feedback(message, log_file, quiet)

    try:
        urllib.request.urlretrieve(url, tmp_taxonomy_file)
    except Exception:
        message = 'ERROR: download of taxonomy files failed.'
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    try:
        with tarfile.open(tmp_taxonomy_file) as tar:
            # NOTE(review): extractall trusts the archive; the NCBI taxdump
            # is assumed not to contain path-traversal member names.
            tar.extractall(taxonomy_folder)
    except Exception:
        message = ('ERROR: something went wrong while extracting the taxonomy '
                   'files.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'Download complete!'
    shared.give_user_feedback(message, log_file, quiet)
def download_prot_accession2taxid_file(prot_accession2taxid_file, date, log_file, quiet):
    """Download prot.accession2taxid.FULL.gz and its MD5 file, then verify it.

    Exits if either download fails or the checksum does not match. The bare
    ``except:`` clauses are narrowed to Exception so that KeyboardInterrupt
    and SystemExit still propagate.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/'
    message = 'Downloading mapping file from {0} to taxonomy folder.'.format(
        url)
    shared.give_user_feedback(message, log_file, quiet)

    url = '{0}prot.accession2taxid.FULL.gz'.format(url)
    try:
        urllib.request.urlretrieve(url, prot_accession2taxid_file)
    except Exception:
        message = 'download of {0} failed.'.format(url)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    url = '{0}.md5'.format(url)
    md5_file = '{0}.md5'.format(prot_accession2taxid_file)
    try:
        urllib.request.urlretrieve(url, md5_file)
    except Exception:
        message = 'download of {0} failed.'.format(url)
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'Download complete.'
    shared.give_user_feedback(message, log_file, quiet)

    check.check_md5_gz(prot_accession2taxid_file, md5_file, log_file, quiet)

    return
def import_taxids_with_multiple_offspring(taxids_with_multiple_offspring_file, log_file, quiet):
    """Read the taxids-with-multiple-offspring file into a set of taxid strings."""
    message = 'Importing file {0}.'.format(taxids_with_multiple_offspring_file)
    shared.give_user_feedback(message, log_file, quiet)

    with open(taxids_with_multiple_offspring_file, 'r') as f1:
        taxids_with_multiple_offspring = {line.rstrip() for line in f1}

    return taxids_with_multiple_offspring
def import_prot_accession2taxid(prot_accession2taxid_file, log_file, quiet):
    """Load the full accession.version -> taxid mapping from the gzipped file.

    NOTE(review): lines are not rstripped before splitting, which assumes the
    taxid column is not the last field on the line (otherwise the stored
    value would keep a trailing newline) — verify against the file format.
    The header line is also stored as a (harmless) entry.
    """
    message = ('Loading {0} into memory. Please be patient...'
               ''.format(prot_accession2taxid_file))
    shared.give_user_feedback(message, log_file, quiet)

    prot_accession2taxid = {}
    with gzip.open(prot_accession2taxid_file, 'rt') as f1:
        for line in f1:
            fields = line.split('\t')

            prot_accession2taxid[fields[1]] = fields[2]

    return prot_accession2taxid
def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet):
    """Load fastaid -> LCA taxid mappings, restricted to fastaids in all_hits."""
    message = 'Importing file {0}.'.format(fastaid2LCAtaxid_file)
    shared.give_user_feedback(message, log_file, quiet)

    fastaid2LCAtaxid = {}
    with shared.open_maybe_gzip(fastaid2LCAtaxid_file, 'rt') as f1:
        for line in f1:
            fields = line.rstrip().split('\t')

            # Only include fastaids that are found in hits.
            if fields[0] in all_hits:
                fastaid2LCAtaxid[fields[0]] = fields[1]

    return fastaid2LCAtaxid
def check_top(top, r, log_file, quiet):
    """Return True (error) if [--top] is not higher than [-r / --range].

    Also warns (without raising an error) when [--top] is set below 50.
    """
    if top < 50:
        # Not an error, but a low --top may clash with future higher -r runs.
        shared.give_user_feedback(
            'WARNING: [--top] is set lower than 50. This might '
            'conflict with future runs with higher settings of the '
            '[-r / --range] parameter, see README.md.',
            log_file, quiet)

    if top > r:
        return False

    shared.give_user_feedback(
        'ERROR: [--top] should be higher than [-r / --range].',
        log_file, quiet, error=True)

    return True
def check_bin_fasta(bin_fasta, log_file, quiet):
    """Return True (error) if bin_fasta is not a fasta file or is a directory."""
    error = check_fasta(bin_fasta, log_file, quiet)

    if os.path.isdir(bin_fasta):
        message = (
            '{0} is a directory. If you want to classify more than 1 bin '
            'you can run \'CAT bins\' instead of \'CAT bin\'.'.format(
                bin_fasta))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    return error
def make_concatenated_fasta(concatenated_fasta, bin2contigs, bin_folder, log_file, quiet):
    """Concatenate all bin fasta files into one file, prefixing contig names.

    Each header line is rewritten as >bin_contig so that contigs from
    different bins can be told apart in the concatenated file; sequence lines
    are copied unchanged.
    """
    message = 'Writing {0}.'.format(concatenated_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    with open(concatenated_fasta, 'w') as outf1:
        for bin_ in sorted(bin2contigs):
            with open('{0}/{1}'.format(bin_folder, bin_), 'r') as f1:
                for line in f1:
                    if not line.startswith('>'):
                        outf1.write(line)

                        continue

                    # add bin name in front of the contig name.
                    contig = line.split(' ')[0].rstrip().lstrip('>')
                    outf1.write('>{0}_{1}\n'.format(bin_, contig))
def import_names(names_dmp, log_file, quiet):
    """Parse names.dmp and return a dict of taxid -> scientific name."""
    message = 'Importing file {0}.'.format(names_dmp)
    shared.give_user_feedback(message, log_file, quiet)

    taxid2name = {}
    with shared.open_maybe_gzip(names_dmp, 'rt') as f1:
        for line in f1:
            fields = line.split('\t')

            # names.dmp lists several name classes per taxid; after splitting
            # on tabs, field 6 holds the name class — keep only the
            # scientific name.
            if fields[6] == 'scientific name':
                taxid2name[fields[0]] = fields[2]

    return taxid2name
def memory_bottleneck(args):
    """Exit if the system does not have enough memory for database construction."""
    (total_memory, error) = check.check_memory(args.min_mem)

    if not error:
        return

    message = ('at least {0}GB of memory is needed for the database '
               'construction. {1}GB is found on your system. You can try '
               'to find a machine with more memory, or download '
               'preconstructed database files from '
               'tbb.bio.uu.nl/bastiaan/CAT_prepare/.'.format(
                   args.min_mem, total_memory))
    shared.give_user_feedback(message, args.log_file, args.quiet, error=True)

    sys.exit(1)
def check_bin_fasta(bin_fasta, log_file, quiet):
    """Return True (error) if bin_fasta is not a fasta file or is a directory."""
    error = False

    if not check_whether_file_is_fasta(bin_fasta):
        shared.give_user_feedback(
            'ERROR: {0} is not a fasta file.'.format(bin_fasta),
            log_file, quiet, error=True)

        error = True

    if os.path.isdir(bin_fasta):
        shared.give_user_feedback(
            'ERROR: {0} is a directory. If you want to classify more '
            'than 1 bin you can run \'CAT bins\' instead of '
            '\'CAT bin\'.'.format(bin_fasta),
            log_file, quiet, error=True)

        error = True

    return error
def check_prodigal_binaries(path_to_prodigal, log_file, quiet):
    """Return True (error) if the Prodigal binary can not be run."""
    try:
        process = subprocess.Popen(
            [path_to_prodigal, '-v'], stderr=subprocess.PIPE)
        communicated = process.communicate()

        # Prodigal prints its version information on stderr.
        version = communicated[1].decode().rstrip().lstrip()

        message = 'Prodigal found: {0}.'.format(version)
        shared.give_user_feedback(message, log_file, quiet)
    except OSError:
        message = ('ERROR: can not find Prodigal. Please check whether it is '
                   'installed or path to the binaries is provided.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        return True

    return False
def check_diamond_binaries(path_to_diamond, log_file, quiet):
    """Return True (error) if the DIAMOND binary can not be run."""
    try:
        process = subprocess.Popen(
            [path_to_diamond, '--version'], stdout=subprocess.PIPE)
        communicated = process.communicate()

        # DIAMOND prints its version information on stdout.
        version = communicated[0].decode().rstrip()

        message = 'DIAMOND found: {0}.'.format(version)
        shared.give_user_feedback(message, log_file, quiet)
    except OSError:
        message = ('ERROR: can not find DIAMOND. Please check whether it is '
                   'installed or path to the binaries is provided.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        return True

    return False
def import_nodes(nodes_dmp, log_file, quiet):
    """Parse nodes.dmp into (taxid -> parent taxid, taxid -> rank) dicts."""
    message = 'Importing file {0}.'.format(nodes_dmp)
    shared.give_user_feedback(message, log_file, quiet)

    taxid2parent = {}
    taxid2rank = {}
    with open(nodes_dmp, 'r') as f1:
        for line in f1:
            fields = line.split('\t')

            # After splitting on tabs, the taxid, its parent taxid and its
            # rank sit at indices 0, 2 and 4.
            taxid2parent[fields[0]] = fields[2]
            taxid2rank[fields[0]] = fields[4]

    return (taxid2parent, taxid2rank)
def check_fasta(file_, log_file, quiet):
    """Return True (error) if file_ does not exist or does not look like fasta.

    Only the first line is inspected: a fasta file must start with '>'.
    (An empty file passes the check, as in the original implementation.)
    """
    error = False

    if not os.path.isfile(file_):
        error = True
    else:
        with open(file_, 'r') as f1:
            for line in f1:
                if not line.startswith('>'):
                    error = True

                # Only the first line needs checking; break unconditionally
                # instead of silently iterating over the whole file.
                break

    if error:
        message = '{0} is not a fasta file.'.format(file_)
        shared.give_user_feedback(message, log_file, quiet, error=True)

    return error
def check_out_prefix(out_prefix, log_file, quiet):
    """Return True (error) if the output prefix is unusable.

    The prefix may not end in '/' (that would be a directory), and any
    directory component it contains must already exist.
    """
    error = False

    if '/' not in out_prefix:
        return error

    if out_prefix.endswith('/'):
        message = ('ERROR: prefix for output files ({0}) appears to be a '
                   'directory.'.format(out_prefix))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    directory = out_prefix.rsplit('/', 1)[0]
    if not os.path.isdir(directory):
        message = ('ERROR: can not find output directory {0} to which '
                   'output files should be written.'.format(directory))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        error = True

    return error
def check_out_prefix(out_prefix, log_file, quiet):
    """Return True (error) if the prefix is a directory or its directory is missing."""
    error = False

    if os.path.isdir(out_prefix):
        shared.give_user_feedback(
            'prefix for output files ({0}) is a directory.'.format(
                out_prefix),
            log_file, quiet, error=True)

        error = True

    head = out_prefix.rsplit('/', 1)[0]
    if not os.path.isdir(head):
        shared.give_user_feedback(
            'can not find output directory {0} to which output files '
            'should be written.'.format(head),
            log_file, quiet, error=True)

        error = True

    return error
def make_diamond_database(path_to_diamond, nr_file, diamond_database_prefix, nproc, log_file, quiet):
    """Run 'diamond makedb' to build a DIAMOND database from nr_file.

    Exits if DIAMOND fails or can not be run. The bare ``except:`` is
    narrowed: only a failing call (CalledProcessError) or an unrunnable
    binary (OSError) triggers the error path, so KeyboardInterrupt and
    SystemExit still propagate.
    """
    message = ('Constructing DIAMOND database {0}.dmnd from {1} '
               'using {2} cores. Please be patient...'
               ''.format(diamond_database_prefix, nr_file, nproc))
    shared.give_user_feedback(message, log_file, quiet)

    command = [path_to_diamond, 'makedb',
               '--in', nr_file,
               '-d', diamond_database_prefix,
               '-p', str(nproc),
               '--quiet']
    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        message = 'ERROR: DIAMOND database could not be created.'
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'DIAMOND database constructed!'
    shared.give_user_feedback(message, log_file, quiet)
def make_diamond_database(path_to_diamond, nr_file, diamond_database_prefix, nproc, log_file, quiet, verbose):
    """Run 'diamond makedb' to build a DIAMOND database from nr_file.

    When verbose is False DIAMOND is silenced with --quiet. Exits if DIAMOND
    fails or can not be run. The bare ``except:`` is narrowed: only a failing
    call (CalledProcessError) or an unrunnable binary (OSError) triggers the
    error path, so KeyboardInterrupt and SystemExit still propagate.
    """
    message = ('Constructing DIAMOND database {0}.dmnd from {1} using {2} '
               'cores.'.format(diamond_database_prefix, nr_file, nproc))
    shared.give_user_feedback(message, log_file, quiet)

    command = [
        path_to_diamond, 'makedb',
        '--in', nr_file,
        '-d', diamond_database_prefix,
        '-p', str(nproc)
    ]
    if not verbose:
        command += ['--quiet']

    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        message = 'DIAMOND database could not be created.'
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    message = 'DIAMOND database constructed.'
    shared.give_user_feedback(message, log_file, quiet)

    return