def find_offspring(taxonomy_folder, fastaid2LCAtaxid_file, log_file, quiet):
    """Map each taxid to the set of direct daughter taxids seen in nr.

    Walks the lineage of every LCA taxid in fastaid2LCAtaxid_file and records,
    for each ancestor, which daughter node led to it.
    """
    nodes_dmp = '{0}/nodes.dmp'.format(taxonomy_folder)
    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)

    message = 'Searching nr database for taxids with multiple offspring.'
    shared.give_user_feedback(message, log_file, quiet)

    taxid2offspring = {}
    with shared.open_maybe_gzip(fastaid2LCAtaxid_file, 'rt') as f1:
        for raw_line in f1:
            fields = raw_line.rstrip().split('\t')
            lineage = tax.find_lineage(fields[1], taxid2parent)

            # The lineage runs child -> root, so each element's predecessor
            # is its direct daughter node; the first element has no daughter.
            for (daughter, ancestor) in zip(lineage, lineage[1:]):
                taxid2offspring.setdefault(ancestor, set()).add(daughter)

    return taxid2offspring
def import_contig_lengths(contigs_fasta, log_file, quiet):
    """Return {contig name: sequence length} for all contigs in contigs_fasta.

    The contig name is the first word of each fasta header. Exits with an
    error if a sequence line appears before any header, which indicates the
    file is not a contigs fasta.
    """
    message = 'Gathering contig lengths from {0}.'.format(contigs_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    contig2length = {}
    with shared.open_maybe_gzip(contigs_fasta, 'rt') as f1:
        for line in f1:
            line = line.rstrip()
            if line.startswith('>'):
                contig = line.split(' ')[0].lstrip('>')
                contig2length[contig] = 0
            else:
                try:
                    contig2length[contig] += len(line)
                except NameError:
                    # Narrowed from a bare except: 'contig' is unbound only
                    # when a sequence line precedes the first '>' header.
                    message = ('ERROR: {0} is not a contigs fasta'
                               ''.format(contigs_fasta))
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)
                    sys.exit(1)

    return contig2length
def make_concatenated_fasta(
        concatenated_fasta, bin2contigs, bin_folder, log_file, quiet):
    """Concatenate all bin fastas into a single fasta file.

    Every header is rewritten to '>{bin name}_{contig name}' so contig names
    stay unique across bins; sequence lines are copied through unchanged.
    """
    message = 'Writing {0}.'.format(concatenated_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(concatenated_fasta, 'wt') as outf1:
        for bin_ in sorted(bin2contigs):
            bin_path = '{0}/{1}'.format(bin_folder, bin_)
            with shared.open_maybe_gzip(bin_path, 'rt') as f1:
                for line in f1:
                    if not line.startswith('>'):
                        outf1.write(line)
                        continue

                    # add bin name in front of the contig name.
                    contig = line.split(' ')[0].rstrip().lstrip('>')
                    outf1.write('>{0}_{1}\n'.format(bin_, contig))
def write_taxids_with_multiple_offspring_file(
        taxids_with_multiple_offspring_file, taxid2offspring, log_file, quiet):
    """Write every taxid that has two or more offspring, one per line."""
    message = 'Writing {0}.'.format(taxids_with_multiple_offspring_file)
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(
            taxids_with_multiple_offspring_file, 'wt') as outf1:
        outf1.writelines(
            '{0}\n'.format(taxid)
            for (taxid, offspring) in taxid2offspring.items()
            if len(offspring) >= 2)
def import_bins(bin_folder, bin_suffix, log_file, quiet):
    """Scan bin_folder for bin fasta files and collect their contig names.

    Returns (bin2contigs, contig_names): bin2contigs maps each bin file name
    to its list of renamed contigs, and contig_names is the set of all
    renamed contigs. Contigs are renamed to '{bin name}_{contig name}' so
    names stay unique across bins. Exits with an error when a fasta header
    occurs twice within a bin.
    """
    message = 'Importing bins from {0}/.'.format(bin_folder)
    shared.give_user_feedback(message, log_file, quiet)

    bin2contigs = {}
    contig_names = set()

    for file_ in os.listdir(bin_folder):
        if file_.startswith('.'):
            # Skip hidden files.
            continue

        if not file_.endswith(bin_suffix):
            continue

        if '.concatenated.' in file_:
            # Skip concatenated contig fasta and predicted protein fasta files
            # from earlier runs.
            continue

        # Keep the suffix in the bin name.
        bin_ = file_

        bin2contigs[bin_] = []

        with shared.open_maybe_gzip(
                '{0}/{1}'.format(bin_folder, file_), 'rt') as f1:
            for line in f1:
                if line.startswith('>'):
                    # Contig name is the first word of the header.
                    contig = line.split(' ')[0].rstrip().lstrip('>')

                    # Add bin name in front of the contig name.
                    new_contig_name = '{0}_{1}'.format(bin_, contig)

                    if new_contig_name in contig_names:
                        message = ('ERROR: BAT has encountered {0} twice in '
                                   'bin {1}. Each fasta header should be '
                                   'unique in each bin.'
                                   ''.format(contig, bin_))
                        shared.give_user_feedback(
                            message, log_file, quiet, error=True)
                        sys.exit(1)

                    contig_names.add(new_contig_name)
                    bin2contigs[bin_].append(new_contig_name)

    message = '{0} bin(s) found!'.format(len(bin2contigs))
    shared.give_user_feedback(message, log_file, quiet)

    return (bin2contigs, contig_names)
def check_whether_file_is_fasta(file_):
    """Return True if file_ exists and contains at least one fasta header."""
    if not os.path.isfile(file_):
        return False

    with shared.open_maybe_gzip(file_, 'rt') as f1:
        return any(line.startswith('>') for line in f1)
def import_taxids_with_multiple_offspring(taxids_with_multiple_offspring_file,
                                          log_file, quiet):
    """Read the file into a set of taxids (one taxid per line)."""
    message = 'Importing file {0}.'.format(taxids_with_multiple_offspring_file)
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(
            taxids_with_multiple_offspring_file, 'rt') as f1:
        taxids_with_multiple_offspring = {line.rstrip() for line in f1}

    return taxids_with_multiple_offspring
def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet):
    """Load fastaid -> LCA taxid pairs, keeping only fastaids in all_hits."""
    message = 'Importing file {0}.'.format(fastaid2LCAtaxid_file)
    shared.give_user_feedback(message, log_file, quiet)

    fastaid2LCAtaxid = {}
    with shared.open_maybe_gzip(fastaid2LCAtaxid_file, 'rt') as f1:
        for raw_line in f1:
            fields = raw_line.rstrip().split('\t')

            # Only include fastaids that are found in hits.
            if fields[0] in all_hits:
                fastaid2LCAtaxid[fields[0]] = fields[1]

    return fastaid2LCAtaxid
def import_names(names_dmp, log_file, quiet):
    """Parse names.dmp into {taxid: scientific name}."""
    message = 'Importing file {0}.'.format(names_dmp)
    shared.give_user_feedback(message, log_file, quiet)

    taxid2name = {}
    with shared.open_maybe_gzip(names_dmp, 'rt') as f1:
        for line in f1:
            fields = line.split('\t')

            # After a plain tab split of a dmp row, the name class sits at
            # index 6; only scientific names are kept.
            if fields[6] == 'scientific name':
                taxid2name[fields[0]] = fields[2]

    return taxid2name
def import_nodes(nodes_dmp, log_file, quiet):
    """Parse nodes.dmp into ({taxid: parent taxid}, {taxid: rank})."""
    message = 'Importing file {0}.'.format(nodes_dmp)
    shared.give_user_feedback(message, log_file, quiet)

    taxid2parent = {}
    taxid2rank = {}
    with shared.open_maybe_gzip(nodes_dmp, 'rt') as f1:
        for line in f1:
            fields = line.split('\t')

            # After a plain tab split of a dmp row, taxid, parent taxid, and
            # rank sit at indices 0, 2, and 4 respectively.
            taxid2parent[fields[0]] = fields[2]
            taxid2rank[fields[0]] = fields[4]

    return (taxid2parent, taxid2rank)
def make_fastaid2LCAtaxid_file(taxonomy_folder, fastaid2LCAtaxid_file,
                               nr_file, prot_accession2taxid_file,
                               log_file, quiet):
    """Write a table mapping each nr fasta header to the LCA taxid of all
    accession numbers found in that header.

    Headers whose accession numbers are all missing from prot.accession2taxid
    (or whose taxids are missing from nodes.dmp) are skipped and do not
    appear in the output file.
    """
    prot_accession2taxid = import_prot_accession2taxid(
        prot_accession2taxid_file, log_file, quiet)

    nodes_dmp = '{0}/nodes.dmp'.format(taxonomy_folder)
    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)

    message = ('Finding LCA of all protein accession numbers in fasta headers '
               'of {0}. Please be patient...'.format(nr_file))
    shared.give_user_feedback(message, log_file, quiet)

    corrected = 0
    total = 0
    with gzip.open(nr_file, 'rt') as f1, shared.open_maybe_gzip(
            fastaid2LCAtaxid_file, 'wt') as outf1:
        for line in f1:
            if not line.startswith('>'):
                continue

            # nr headers pack several entries separated by '\x01'; the
            # accession number is the first word of each entry.
            line = line.lstrip('>').split('\x01')
            accession_numbers = [i.split(' ')[0] for i in line]
            fastaid = accession_numbers[0]

            list_of_lineages = []
            for accession_number in accession_numbers:
                try:
                    taxid = prot_accession2taxid[accession_number]
                    lineage = tax.find_lineage(taxid, taxid2parent)
                    list_of_lineages.append(lineage)
                except KeyError:
                    # Narrowed from a bare except. This accounts for missing
                    # accession numbers in prot.accession2taxid and missing
                    # nodes in nodes.dmp.
                    continue

            total += 1

            if len(list_of_lineages) == 0:
                # This accounts for entries that only contain accession
                # numbers that are missing in prot.accession2taxid or whose
                # taxid is missing in nodes.dmp. Note that these entries are
                # thus not present in the output file.
                continue

            LCAtaxid = tax.find_LCA(list_of_lineages)
            outf1.write('{0}\t{1}\n'.format(fastaid, LCAtaxid))

            try:
                if LCAtaxid != prot_accession2taxid[fastaid]:
                    corrected += 1
            except KeyError:
                # If the fastaid cannot be found in prot.accession2taxid, but
                # a taxid is given to the fastaid based on secondary accession
                # numbers, it is counted as a correction as well.
                corrected += 1

    # Guard against an empty nr file so the percentage calculation cannot
    # divide by zero.
    percentage_corrected = corrected / total * 100 if total else 0
    message = ('Done! File {0} is created. '
               '{1} of {2} headers ({3:.1f}%) corrected. Please wait '
               'patiently for Python to collect garbage.'
               ''.format(fastaid2LCAtaxid_file, corrected, total,
                         percentage_corrected))
    shared.give_user_feedback(message, log_file, quiet)
def contigs(args):
    """Run the CAT contig classification pipeline.

    Depending on which intermediate files are supplied via args, this runs
    Prodigal gene prediction and/or DIAMOND alignment first, then classifies
    every contig and writes the ORF2LCA and contig2classification output
    files.
    """
    step_list = []

    (contigs_fasta, database_folder, taxonomy_folder, r, one_minus_r, f,
     out_prefix, predicted_proteins_fasta, diamond_file, path_to_prodigal,
     path_to_diamond, no_stars, compress, force, quiet, no_log, nproc,
     sensitive, block_size, index_chunks, tmpdir, top
     ) = check.convert_arguments(args)

    if no_log:
        log_file = None
    else:
        # Check out_prefix already as the log file needs to be written to a
        # valid location.
        error = check.check_out_prefix(out_prefix, None, quiet)
        if error:
            sys.exit(1)

        log_file = '{0}.log'.format(out_prefix)
        # Create / truncate the log file.
        with open(log_file, 'w') as outf1:
            pass

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check at which state to start.
    if predicted_proteins_fasta is None and diamond_file is None:
        # Nothing supplied: run the full pipeline.
        message = ('\n'
                   'CAT is running. Protein prediction, alignment, and contig '
                   'classification are carried out.\n'
                   'Rarw!\n\n'
                   'Supplied command: {0}\n\n'
                   'Contigs fasta: {1}\n'
                   'Taxonomy folder: {2}/\n'
                   'Database folder: {3}/\n'
                   'Parameter r: {4}\n'
                   'Parameter f: {5}\n'
                   'Log file: {6}\n\n'
                   '-----------------\n'.format(' '.join(sys.argv),
                                                contigs_fasta,
                                                taxonomy_folder,
                                                database_folder,
                                                args.r,
                                                args.f,
                                                log_file))
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

        step_list.append('run_prodigal')
        step_list.append('run_diamond')
    elif (predicted_proteins_fasta is not None and diamond_file is None):
        # Proteins supplied: skip Prodigal, still run DIAMOND.
        message = ('\n'
                   'CAT is running. Since a predicted protein fasta is '
                   'supplied, only alignment and contig classification are '
                   'carried out.\n'
                   'Rarw!\n\n'
                   'Supplied command: {0}\n\n'
                   'Contigs fasta: {1}\n'
                   'Taxonomy folder: {2}/\n'
                   'Database folder: {3}/\n'
                   'Parameter r: {4}\n'
                   'Parameter f: {5}\n'
                   'Log file: {6}\n\n'
                   '-----------------\n'.format(' '.join(sys.argv),
                                                contigs_fasta,
                                                taxonomy_folder,
                                                database_folder,
                                                args.r,
                                                args.f,
                                                log_file))
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

        step_list.append('run_diamond')
    elif (predicted_proteins_fasta is not None and
            diamond_file is not None):
        # Proteins and alignment supplied: classification only.
        message = ('\n'
                   'CAT is running. Since a predicted protein fasta and '
                   'DIAMOND alignment file are supplied, only contig '
                   'classification is carried out.\n'
                   'Rarw!\n\n'
                   'Supplied command: {0}\n\n'
                   'Contigs fasta: {1}\n'
                   'Taxonomy folder: {2}/\n'
                   'Database folder: {3}/\n'
                   'Parameter r: {4}\n'
                   'Parameter f: {5}\n'
                   'Log file: {6}\n\n'
                   '-----------------\n'.format(' '.join(sys.argv),
                                                contigs_fasta,
                                                taxonomy_folder,
                                                database_folder,
                                                args.r,
                                                args.f,
                                                log_file))
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
    elif (predicted_proteins_fasta is None and
            diamond_file is not None):
        # An alignment without a protein fasta cannot be classified.
        message = ('ERROR: if you want CAT to directly do the classification, '
                   'you should not only supply a DIAMOND alignment table but '
                   'also a predicted protein fasta file with argument '
                   '[-p / --proteins].')
        shared.give_user_feedback(message, log_file, quiet, error=True)
        sys.exit(1)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set parameters.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []

    errors.append(check.check_out_prefix(out_prefix, log_file, quiet))

    if 'run_prodigal' in step_list:
        errors.append(
            check.check_prodigal_binaries(path_to_prodigal, log_file, quiet))

        predicted_proteins_fasta = ('{0}.predicted_proteins.faa'
                                    ''.format(out_prefix))
        predicted_proteins_gff = ('{0}.predicted_proteins.gff'
                                  ''.format(out_prefix))

        if not force:
            errors.append(
                check.check_output_file(predicted_proteins_fasta,
                                        log_file, quiet))
            errors.append(
                check.check_output_file(predicted_proteins_gff,
                                        log_file, quiet))

    compress_suffix = ".gz" if compress else ""

    if 'run_diamond' in step_list:
        errors.append(
            check.check_diamond_binaries(path_to_diamond, log_file, quiet))

        diamond_file = '{0}.alignment.diamond{1}'.format(
            out_prefix, compress_suffix)

        if not force:
            errors.append(
                check.check_output_file(diamond_file, log_file, quiet))
    else:
        # The user supplied the alignment file; keep it as-is.
        diamond_file = diamond_file

    errors.append(
        check.check_folders_for_run(taxonomy_folder, database_folder,
                                    step_list, log_file, quiet))

    contig2classification_output_file = ('{0}.contig2classification.txt{1}'
                                         ''.format(out_prefix,
                                                   compress_suffix))
    ORF2LCA_output_file = '{0}.ORF2LCA.txt{1}'.format(out_prefix,
                                                      compress_suffix)

    if not force:
        errors.append(
            check.check_output_file(contig2classification_output_file,
                                    log_file, quiet))
        errors.append(
            check.check_output_file(ORF2LCA_output_file, log_file, quiet))

    if 'run_prodigal' not in step_list:
        # The user supplied the protein fasta; make sure it really is one.
        if not check.check_whether_file_is_fasta(predicted_proteins_fasta):
            message = ('ERROR: {0} is not a fasta file.'
                       ''.format(predicted_proteins_fasta))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            errors.append(True)

    errors.append(check.check_top(top, r, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp,
     names_dmp,
     prot_accession2taxid_file) = check.inspect_taxonomy_folder(
         taxonomy_folder)
    (nr_file,
     diamond_database,
     fastaid2LCAtaxid_file,
     taxids_with_multiple_offspring_file) = check.inspect_database_folder(
         database_folder)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Start CAT.
    contig_names = shared.import_contig_names(contigs_fasta, log_file, quiet)

    if 'run_prodigal' in step_list:
        shared.run_prodigal(path_to_prodigal, contigs_fasta,
                            predicted_proteins_fasta,
                            predicted_proteins_gff, tmpdir, log_file, quiet)

    contig2ORFs = shared.import_ORFs(predicted_proteins_fasta, log_file,
                                     quiet)

    check.check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs,
                                                  log_file, quiet)

    if 'run_diamond' in step_list:
        shared.run_diamond(path_to_diamond, diamond_database,
                           predicted_proteins_fasta, diamond_file, nproc,
                           sensitive, block_size, index_chunks, tmpdir, top,
                           log_file, compress, quiet)

    (ORF2hits, all_hits) = shared.parse_diamond_file(diamond_file,
                                                     one_minus_r, log_file,
                                                     quiet)

    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(fastaid2LCAtaxid_file,
                                                   all_hits, log_file, quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
        taxids_with_multiple_offspring_file, log_file, quiet)

    message = ('CAT is spinning! Files {0} and {1} are created.'
               ''.format(contig2classification_output_file,
                         ORF2LCA_output_file))
    shared.give_user_feedback(message, log_file, quiet)

    number_of_classified_contigs = 0
    with shared.open_maybe_gzip(contig2classification_output_file,
                                'wt') as outf1, shared.open_maybe_gzip(
                                    ORF2LCA_output_file, 'wt') as outf2:
        outf1.write('# contig\tclassification\treason\tlineage\t'
                    'lineage scores\n')
        outf2.write('# ORF\tlineage\tbit-score\n')

        for contig in sorted(contig_names):
            if contig not in contig2ORFs:
                outf1.write('{0}\tunclassified\tno ORFs found\n'
                            ''.format(contig))

                continue

            # Collect the (taxid, top bit-score) of every ORF on the contig.
            LCAs_ORFs = []

            for ORF in contig2ORFs[contig]:
                if ORF not in ORF2hits:
                    outf2.write('{0}\tORF has no hit to database\n'
                                ''.format(ORF))

                    continue

                (taxid,
                 top_bitscore) = tax.find_LCA_for_ORF(ORF2hits[ORF],
                                                      fastaid2LCAtaxid,
                                                      taxid2parent)

                if taxid.startswith('no taxid found'):
                    outf2.write('{0}\t{1}\t{2}\n'.format(
                        ORF, taxid, top_bitscore))
                else:
                    lineage = tax.find_lineage(taxid, taxid2parent)

                    if not no_stars:
                        lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)

                    outf2.write('{0}\t{1}\t{2}\n'
                                ''.format(ORF,
                                          ';'.join(lineage[::-1]),
                                          top_bitscore))

                # Note: 'no taxid found' entries are appended too; they are
                # presumably handled downstream by tax.find_weighted_LCA.
                LCAs_ORFs.append((taxid, top_bitscore), )

            if len(LCAs_ORFs) == 0:
                outf1.write('{0}\tunclassified\tno hits to database\n'
                            ''.format(contig))

                continue

            (lineages,
             lineages_scores,
             based_on_number_of_ORFs) = tax.find_weighted_LCA(
                 LCAs_ORFs, taxid2parent, f)

            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tunclassified\t'
                            'hits not found in taxonomy files\n'
                            ''.format(contig))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write('{0}\tunclassified\t'
                            'no lineage reached minimum bit-score support\n'
                            ''.format(contig))

                continue

            # The contig has a valid classification.
            number_of_classified_contigs += 1

            for (i, lineage) in enumerate(lineages):
                if not no_stars:
                    lineage = tax.star_lineage(lineage,
                                               taxids_with_multiple_offspring)
                scores = [
                    '{0:.2f}'.format(score) for score in lineages_scores[i]
                ]

                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write('{0}\tclassified\t'
                                'based on {1}/{2} ORFs\t{3}\t{4}\n'
                                ''.format(contig,
                                          based_on_number_of_ORFs,
                                          len(contig2ORFs[contig]),
                                          ';'.join(lineage[::-1]),
                                          ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    outf1.write('{0}\tclassified ({1}/{2})\t'
                                'based on {3}/{4} ORFs\t{5}\t{6}\n'
                                ''.format(contig,
                                          i + 1,
                                          len(lineages),
                                          based_on_number_of_ORFs,
                                          len(contig2ORFs[contig]),
                                          ';'.join(lineage[::-1]),
                                          ';'.join(scores[::-1])))

    message = ('\n-----------------\n\n'
               '[{0}] CAT is done! {1}/{2} contigs classified.'
               ''.format(datetime.datetime.now(),
                         number_of_classified_contigs,
                         len(contig_names)))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if f < 0.5:
        message = ('\nWARNING: since f is set to smaller than 0.5, one '
                   'contig may have multiple classifications.')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
def summarise_bins(input_file, output_file, force, quiet):
    """Summarise a named BAT bin classification file per official rank.

    Reads a BAT classification file that has been named with official ranks
    ('CAT add_names --only_official') and writes, per rank, the number of
    bins assigned to each clade.
    """
    # Currently summarise does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []
    errors.append(check.check_input_file(input_file, log_file, quiet))
    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))
    if True in errors:
        sys.exit(1)

    message = 'Summarising...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: locate the 'superkingdom' column in the header.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if line.startswith('#'):
                line = line.split('\t')

                if line[0] != '# bin':
                    message = ('ERROR: {0} is not a BAT classification file.'
                               ''.format(input_file))
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)

                    if line[0] == '# contig':
                        message = ('ERROR: {0} appears to be a CAT '
                                   'classification file. If you want to '
                                   'summarise contig classifications, please '
                                   'supply a contigs fasta.'
                                   ''.format(input_file))
                        shared.give_user_feedback(
                            message, log_file, quiet, error=True)

                    sys.exit(1)

                try:
                    superkingdom_index = line.index('superkingdom')
                except ValueError:
                    # Narrowed from a bare except: line.index raises
                    # ValueError when the column is absent.
                    message = ('ERROR: official ranks not found in header of '
                               '{0}. Make sure that the BAT classification '
                               'file is named with official ranks with \'CAT '
                               'add_names --only_official\'.'
                               ''.format(input_file))
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)
                    sys.exit(1)

                break
        else:
            message = 'ERROR: input file does not have a recognisable header.'
            shared.give_user_feedback(message, log_file, quiet, error=True)
            sys.exit(1)

    number_of_bins = {}
    number_of_bins['unclassified'] = 0
    official_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family',
                      'genus', 'species']
    for rank in official_ranks:
        number_of_bins[rank] = {}

    n = 0
    bin_trace = set()
    doubles = set()
    # Second pass: tally classifications. Opened with open_maybe_gzip (the
    # first pass already does so); a plain open() here would break on
    # gzipped input.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip()
            if line.startswith('#'):
                continue

            n += 1
            line = line.split('\t')
            bin_ = line[0]
            if bin_ in bin_trace:
                doubles.add(bin_)
            bin_trace.add(bin_)

            if line[1] == 'unclassified':
                number_of_bins['unclassified'] += 1
                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Strip the score suffix and any star marks.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')
                rank = official_ranks[i]
                if classification not in number_of_bins[rank]:
                    number_of_bins[rank][classification] = 0
                number_of_bins[rank][classification] += 1

    if len(doubles) != 0:
        message = ('ERROR: some bins have multiple classifications. CAT '
                   'summarise currently does not allow for this. Bins with '
                   'multiple classifications: {0}.'
                   ''.format(', '.join(list(doubles))))
        shared.give_user_feedback(message, log_file, quiet, error=True)
        sys.exit(1)

    number_of_classified_bins = n - number_of_bins['unclassified']

    with shared.open_maybe_gzip(output_file, 'wt') as outf1:
        # Guard against an empty classification file (n == 0) so the
        # percentage calculation cannot divide by zero.
        percentage = number_of_classified_bins / n * 100 if n else 0
        outf1.write('# total number of bins is {0}, of which {1} ({2:.2f}%) '
                    'are classified.\n'
                    ''.format(n, number_of_classified_bins, percentage))
        outf1.write('#\n')
        outf1.write('# rank\tclade\tnumber of bins\n')

        for rank in official_ranks:
            for clade in sorted(number_of_bins[rank],
                                key=lambda x: number_of_bins[rank][x],
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\n'
                            ''.format(rank, clade,
                                      number_of_bins[rank][clade]))

    message = '{0} is created!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)
def summarise_contigs(input_file, output_file, contigs_fasta, force, quiet):
    """Summarise a named CAT contig classification file per official rank.

    For each official rank, reports per clade the number of contigs, the
    total number of ORFs on those contigs, and their summed length in
    positions. Requires the contigs fasta the classification was based on.
    """
    # Currently summarise does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []
    errors.append(check.check_input_file(input_file, log_file, quiet))
    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))
    if True in errors:
        sys.exit(1)

    contig2length = import_contig_lengths(contigs_fasta, log_file, quiet)

    message = 'Summarising...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: locate the 'superkingdom' column in the header.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if line.startswith('#'):
                line = line.split('\t')

                if line[0] != '# contig':
                    message = ('ERROR: {0} is not a CAT classification file.'
                               ''.format(input_file))
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)

                    if line[0] == '# bin':
                        message = ('ERROR: {0} appears to be a BAT '
                                   'classification file. If you want to '
                                   'summarise bin classifications, just '
                                   'don\'t supply a contigs fasta and '
                                   'everything should be fine!'
                                   ''.format(input_file))
                        shared.give_user_feedback(
                            message, log_file, quiet, error=True)

                    sys.exit(1)

                try:
                    superkingdom_index = line.index('superkingdom')
                except ValueError:
                    # Narrowed from a bare except: line.index raises
                    # ValueError when the column is absent.
                    message = ('ERROR: official ranks not found in header of '
                               '{0}. Make sure that the CAT classification '
                               'file is named with official ranks with \'CAT '
                               'add_names --only_official\'.'
                               ''.format(input_file))
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)
                    sys.exit(1)

                break
        else:
            message = 'ERROR: input file does not have a recognisable header.'
            shared.give_user_feedback(message, log_file, quiet, error=True)
            sys.exit(1)

    length = {}
    length['unclassified'] = []
    ORFs = {}
    official_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family',
                      'genus', 'species']
    for rank in official_ranks:
        length[rank] = {}
        ORFs[rank] = {}

    n = 0
    contig_trace = set()
    doubles = set()
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip()
            if line.startswith('#'):
                continue

            n += 1
            line = line.split('\t')
            contig = line[0]
            if contig in contig_trace:
                doubles.add(contig)
            contig_trace.add(contig)

            if contig not in contig2length:
                message = ('ERROR: contig {0} in CAT classification file is '
                           'not found in supplied contigs fasta file. Are you '
                           'sure the CAT classification file is based on the '
                           'contigs fasta?'.format(contig))
                shared.give_user_feedback(
                    message, log_file, quiet, error=True)
                sys.exit(1)

            if line[1] == 'unclassified':
                length['unclassified'].append(contig2length[contig])
                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Strip the score suffix and any star marks.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')
                rank = official_ranks[i]
                if classification not in length[rank]:
                    length[rank][classification] = []
                    ORFs[rank][classification] = []
                length[rank][classification].append(contig2length[contig])
                # Note that the total number of ORFs on a contig is reported,
                # not only the number of ORFs a classification is based on.
                ORFs_on_contig = int(line[2].split('/')[1].split(' ')[0])
                ORFs[rank][classification].append(ORFs_on_contig)

    if len(doubles) != 0:
        message = ('ERROR: some contigs have multiple classifications. CAT '
                   'summarise currently does not allow for this. Contigs with '
                   'multiple classifications: {0}.'
                   ''.format(', '.join(list(doubles))))
        shared.give_user_feedback(message, log_file, quiet, error=True)
        sys.exit(1)

    if n != len(contig2length):
        message = ('ERROR: the number of classified contigs is not the same '
                   'as the number of contigs in contigs fasta. Are you sure '
                   'the CAT classification file is based on the contigs '
                   'fasta?')
        shared.give_user_feedback(message, log_file, quiet, error=True)
        sys.exit(1)

    with shared.open_maybe_gzip(output_file, 'wt') as outf1:
        number_of_contigs = len(contig2length)
        total_length = sum(contig2length.values())
        number_of_classified_contigs = number_of_contigs - len(
            length['unclassified'])
        total_classified_length = total_length - sum(length['unclassified'])

        # Guard against an empty contigs fasta so the percentage calculations
        # cannot divide by zero.
        contig_percentage = (
            number_of_classified_contigs / number_of_contigs * 100
            if number_of_contigs else 0)
        length_percentage = (
            total_classified_length / total_length * 100
            if total_length else 0)

        outf1.write('# total number of contigs in {0} is {1} representing {2} '
                    'positions.\n'
                    ''.format(contigs_fasta, number_of_contigs, total_length))
        outf1.write('# {0} contigs are classified ({1:.2f}%) representing {2} '
                    'positions ({3:.2f}%) in {4}.\n'
                    ''.format(number_of_classified_contigs,
                              contig_percentage,
                              total_classified_length,
                              length_percentage,
                              input_file))
        outf1.write('#\n')
        outf1.write('# rank\t'
                    'clade\t'
                    'number of contigs\t'
                    'number of ORFs\t'
                    'number of positions\n')

        for rank in official_ranks:
            for clade in sorted(length[rank],
                                key=lambda x: sum(length[rank][x]),
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\t{3}\t{4}\n'
                            ''.format(rank, clade,
                                      len(length[rank][clade]),
                                      sum(ORFs[rank][clade]),
                                      sum(length[rank][clade])))

    message = '{0} is created!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)
def add_names(args):
    """Append taxon names to a CAT/BAT classification file.

    Looks up each taxid in the 'lineage' column of the input file and writes
    a copy of the file with the corresponding names (all ranks, or only the
    seven official ranks with --only_official) appended to every entry.
    """
    (input_file, output_file, taxonomy_folder, only_official, exclude_scores,
     force, quiet) = check.convert_arguments(args)
    # Currently add_names does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []
    errors.append(check.check_input_file(input_file, log_file, quiet))
    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))
    if True in errors:
        sys.exit(1)

    (nodes_dmp, names_dmp, prot_accession2taxid_file
     ) = check.inspect_taxonomy_folder(taxonomy_folder)
    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    taxid2name = tax.import_names(names_dmp, log_file, quiet)

    message = 'Appending names...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: locate the lineage (and optional scores) columns.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if line.startswith('#'):
                line = line.rstrip().split('\t')

                try:
                    lineage_index = line.index('lineage')
                except ValueError:
                    # Narrowed from a bare except: line.index raises
                    # ValueError when the column is absent.
                    message = ('ERROR: {0} is not a supported classification '
                               'file.'.format(input_file))
                    shared.give_user_feedback(
                        message, log_file, quiet, error=True)
                    sys.exit(1)

                try:
                    scores_index = line.index('lineage scores')
                except ValueError:
                    scores_index = None

                full_length = len(line)

                break
        else:
            message = ('ERROR: {0} is not a supported classification file.'
                       ''.format(input_file))
            shared.give_user_feedback(message, log_file, quiet, error=True)
            sys.exit(1)

    with shared.open_maybe_gzip(input_file,
                                'rt') as f1, shared.open_maybe_gzip(
                                    output_file, 'wt') as outf1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                if only_official:
                    outf1.write('{0}\tsuperkingdom\tphylum\tclass\torder\t'
                                'family\tgenus\tspecies\n'.format(line))
                else:
                    outf1.write('{0}\tfull lineage names\n'.format(line))

                continue

            line = line.split('\t')

            if len(line) != full_length:
                # Entry does not have a full annotation.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            if (line[1].startswith('no taxid found') or
                    line[2].startswith('no taxid found')):
                # ORF has database hits but the accession number is not found
                # in the taxonomy files.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            lineage = line[lineage_index].split(';')

            # Explicit None check: a truthiness test would wrongly skip the
            # scores if the column ever sat at index 0.
            if scores_index is not None and not exclude_scores:
                scores = line[scores_index].split(';')
            else:
                scores = None

            if only_official:
                names = tax.convert_to_official_names(lineage, taxid2rank,
                                                      taxid2name, scores)
            else:
                names = tax.convert_to_names(lineage, taxid2rank,
                                             taxid2name, scores)

            outf1.write('{0}\t{1}\n'.format('\t'.join(line),
                                            '\t'.join(names)))

    message = 'Names written to {0}!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)