def contigs(args):
    """Run the CAT contig classification workflow.

    The settings are unpacked from ``args`` with ``check.convert_arguments``.
    Depending on which intermediate files were supplied, protein prediction
    (Prodigal) and/or alignment (DIAMOND) are run first. Two tab-separated
    files are then written: ``<out_prefix>.ORF2LCA.txt`` with one line per
    ORF, and ``<out_prefix>.contig2classification.txt`` with one line per
    contig (or one line per lineage when a contig gets several).

    The process is terminated with ``sys.exit(1)`` when the output prefix is
    not usable for the log file, when an alignment file is supplied without
    a predicted protein fasta, or when any pre-flight check reports an error.
    """
    (contigs_fasta,
     database_folder,
     taxonomy_folder,
     r,
     one_minus_r,
     f,
     out_prefix,
     predicted_proteins_fasta,
     diamond_file,
     path_to_prodigal,
     path_to_diamond,
     no_stars,
     force,
     quiet,
     no_log,
     nproc,
     sensitive,
     block_size,
     index_chunks,
     tmpdir,
     top) = check.convert_arguments(args)

    log_file = None
    if not no_log:
        # The log file lives next to the output files, so the prefix has to
        # be validated before the first message can be logged.
        if check.check_out_prefix(out_prefix, None, quiet):
            sys.exit(1)

        log_file = '{0}.log'.format(out_prefix)
        # Create the log file, truncating one left over from an earlier run.
        with open(log_file, 'w'):
            pass

    def tell(text, **options):
        # All feedback of this run goes to the same log file / verbosity.
        shared.give_user_feedback(text, log_file, quiet, **options)

    def banner(intro):
        # Every start state reports the same settings after its own intro.
        return (intro +
                'Rarw!\n\n'
                'Supplied command: {0}\n\n'
                'Contigs fasta: {1}\n'
                'Taxonomy folder: {2}/\n'
                'Database folder: {3}/\n'
                'Parameter r: {4}\n'
                'Parameter f: {5}\n'
                'Log file: {6}\n\n'
                '-----------------\n').format(' '.join(sys.argv),
                                              contigs_fasta,
                                              taxonomy_folder,
                                              database_folder,
                                              args.r,
                                              args.f,
                                              log_file)

    tell('# CAT v{0}.'.format(about.__version__), show_time=False)

    # Decide at which state to start.
    proteins_supplied = predicted_proteins_fasta is not None
    alignment_supplied = diamond_file is not None

    if alignment_supplied and not proteins_supplied:
        tell('ERROR: if you want CAT to directly do the classification, '
             'you should not only supply a DIAMOND alignment table but '
             'also a predicted protein fasta file with argument '
             '[-p / --proteins].',
             error=True)
        sys.exit(1)

    if not proteins_supplied:
        tell(banner('\n'
                    'CAT is running. Protein prediction, alignment, and contig '
                    'classification are carried out.\n'),
             show_time=False)
        step_list = ['run_prodigal', 'run_diamond']
    elif not alignment_supplied:
        tell(banner('\n'
                    'CAT is running. Since a predicted protein fasta is '
                    'supplied, only alignment and contig classification are '
                    'carried out.\n'),
             show_time=False)
        step_list = ['run_diamond']
    else:
        tell(banner('\n'
                    'CAT is running. Since a predicted protein fasta and '
                    'DIAMOND alignment file are supplied, only contig '
                    'classification is carried out.\n'),
             show_time=False)
        step_list = []

    # Pre-flight: binaries, output files, taxonomy and database folders.
    # Every check is run so that all problems are reported in one go.
    tell('Doing some pre-flight checks first.', show_time=False)

    errors = [check.check_out_prefix(out_prefix, log_file, quiet)]

    if 'run_prodigal' in step_list:
        errors.append(
            check.check_prodigal_binaries(path_to_prodigal, log_file, quiet))

        predicted_proteins_fasta = ('{0}.predicted_proteins.faa'
                                    ''.format(out_prefix))
        predicted_proteins_gff = ('{0}.predicted_proteins.gff'
                                  ''.format(out_prefix))

        if not force:
            for planned_file in (predicted_proteins_fasta,
                                 predicted_proteins_gff):
                errors.append(
                    check.check_output_file(planned_file, log_file, quiet))

    if 'run_diamond' in step_list:
        errors.append(
            check.check_diamond_binaries(path_to_diamond, log_file, quiet))

        diamond_file = '{0}.alignment.diamond'.format(out_prefix)

        if not force:
            errors.append(
                check.check_output_file(diamond_file, log_file, quiet))

    errors.append(
        check.check_folders_for_run(taxonomy_folder,
                                    database_folder,
                                    step_list,
                                    log_file,
                                    quiet))

    contig2classification_output_file = ('{0}.contig2classification.txt'
                                         ''.format(out_prefix))
    ORF2LCA_output_file = '{0}.ORF2LCA.txt'.format(out_prefix)

    if not force:
        for planned_file in (contig2classification_output_file,
                             ORF2LCA_output_file):
            errors.append(
                check.check_output_file(planned_file, log_file, quiet))

    # A user-supplied protein file has to look like a fasta file.
    if ('run_prodigal' not in step_list
            and not check.check_whether_file_is_fasta(
                predicted_proteins_fasta)):
        tell('ERROR: {0} is not a fasta file.'
             ''.format(predicted_proteins_fasta),
             error=True)
        errors.append(True)

    errors.append(check.check_top(top, r, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp,
     names_dmp,
     prot_accession2taxid_file) = check.inspect_taxonomy_folder(
         taxonomy_folder)
    (nr_file,
     diamond_database,
     fastaid2LCAtaxid_file,
     taxids_with_multiple_offspring_file) = check.inspect_database_folder(
         database_folder)

    tell('Ready to fly!\n\n-----------------\n', show_time=False)

    # Start CAT.
    contig_names = shared.import_contig_names(contigs_fasta, log_file, quiet)

    if 'run_prodigal' in step_list:
        shared.run_prodigal(path_to_prodigal,
                            contigs_fasta,
                            predicted_proteins_fasta,
                            predicted_proteins_gff,
                            log_file,
                            quiet)

    contig2ORFs = shared.import_ORFs(predicted_proteins_fasta, log_file,
                                     quiet)

    check.check_whether_ORFs_are_based_on_contigs(contig_names,
                                                  contig2ORFs,
                                                  log_file,
                                                  quiet)

    if 'run_diamond' in step_list:
        shared.run_diamond(path_to_diamond,
                           diamond_database,
                           predicted_proteins_fasta,
                           diamond_file,
                           nproc,
                           sensitive,
                           block_size,
                           index_chunks,
                           tmpdir,
                           top,
                           log_file,
                           quiet)

    (ORF2hits, all_hits) = shared.parse_diamond_file(diamond_file,
                                                     one_minus_r,
                                                     log_file,
                                                     quiet)

    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(fastaid2LCAtaxid_file,
                                                   all_hits,
                                                   log_file,
                                                   quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
        taxids_with_multiple_offspring_file, log_file, quiet)

    tell('CAT is spinning! Files {0} and {1} are created.'
         ''.format(contig2classification_output_file, ORF2LCA_output_file))

    number_of_classified_contigs = 0

    with open(contig2classification_output_file, 'w') as contig_out, \
            open(ORF2LCA_output_file, 'w') as ORF_out:
        contig_out.write('# contig\tclassification\treason\tlineage\t'
                         'lineage scores\n')
        ORF_out.write('# ORF\tlineage\tbit-score\n')

        for contig in sorted(contig_names):
            if contig not in contig2ORFs:
                contig_out.write('{0}\tunclassified\tno ORFs found\n'
                                 ''.format(contig))
                continue

            ORFs_on_contig = contig2ORFs[contig]
            LCAs_ORFs = []

            for ORF in ORFs_on_contig:
                if ORF not in ORF2hits:
                    ORF_out.write('{0}\tORF has no hit to database\n'
                                  ''.format(ORF))
                    continue

                (taxid, top_bitscore) = tax.find_LCA_for_ORF(
                    ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)

                if taxid.startswith('no taxid found'):
                    # Report the explanatory text instead of a lineage.
                    reported = taxid
                else:
                    lineage = tax.find_lineage(taxid, taxid2parent)
                    if not no_stars:
                        lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)
                    reported = ';'.join(lineage[::-1])

                ORF_out.write('{0}\t{1}\t{2}\n'
                              ''.format(ORF, reported, top_bitscore))

                # ORFs without a taxid are kept as well; the weighted LCA
                # step reports when none of them has a usable taxid.
                LCAs_ORFs.append((taxid, top_bitscore))

            if not LCAs_ORFs:
                contig_out.write('{0}\tunclassified\tno hits to database\n'
                                 ''.format(contig))
                continue

            (lineages,
             lineages_scores,
             based_on_number_of_ORFs) = tax.find_weighted_LCA(
                 LCAs_ORFs, taxid2parent, f)

            if lineages == 'no ORFs with taxids found.':
                contig_out.write('{0}\tunclassified\t'
                                 'hits not found in taxonomy files\n'
                                 ''.format(contig))
                continue

            if lineages == 'no lineage whitelisted.':
                contig_out.write('{0}\tunclassified\t'
                                 'no lineage reached minimum bit-score '
                                 'support\n'
                                 ''.format(contig))
                continue

            # The contig has a valid classification.
            number_of_classified_contigs += 1
            single_classification = len(lineages) == 1

            for (index, lineage) in enumerate(lineages):
                if not no_stars:
                    lineage = tax.star_lineage(
                        lineage, taxids_with_multiple_offspring)
                scores = ['{0:.2f}'.format(score)
                          for score in lineages_scores[index]]
                lineage_column = ';'.join(lineage[::-1])
                scores_column = ';'.join(scores[::-1])

                if single_classification:
                    contig_out.write('{0}\tclassified\t'
                                     'based on {1}/{2} ORFs\t{3}\t{4}\n'
                                     ''.format(contig,
                                               based_on_number_of_ORFs,
                                               len(ORFs_on_contig),
                                               lineage_column,
                                               scores_column))
                else:
                    # Number the alternatives as (i/total).
                    contig_out.write('{0}\tclassified ({1}/{2})\t'
                                     'based on {3}/{4} ORFs\t{5}\t{6}\n'
                                     ''.format(contig,
                                               index + 1,
                                               len(lineages),
                                               based_on_number_of_ORFs,
                                               len(ORFs_on_contig),
                                               lineage_column,
                                               scores_column))

    tell('\n-----------------\n\n'
         '[{0}] CAT is done! {1}/{2} contigs classified.'
         ''.format(datetime.datetime.now(),
                   number_of_classified_contigs,
                   len(contig_names)),
         show_time=False)

    if f < 0.5:
        tell('\nWARNING: since f is set to smaller than 0.5, one '
             'contig may have multiple classifications.',
             show_time=False)
def run_existing(args, date):
    """Complete an existing CAT database, constructing only what is missing.

    The taxonomy and database folders are inspected with
    ``check.inspect_taxonomy_folder`` / ``check.inspect_database_folder``.
    For every file that is absent, the matching step is added to the step
    list and a date-stamped target path is chosen. Missing folders are
    created, and finally ``prepare`` is called with the collected steps.

    Only the presence of files is checked, not their integrity.

    Args:
        args: parsed command line arguments, unpacked by
            ``check.convert_arguments``.
        date: string used as prefix for the log file and for newly created
            database / taxonomy files.

    The process is terminated with ``sys.exit(1)`` when the DIAMOND binary
    check fails, when the folders contain an inconsistent set of files, or
    when files that need at least 100GB of memory have to be constructed and
    ``check.check_memory`` reports an error.
    """
    step_list = []

    (database_folder,
     taxonomy_folder,
     path_to_diamond,
     quiet,
     no_log,
     nproc) = check.convert_arguments(args)

    if no_log:
        log_file = None
    else:
        log_file = '{0}.CAT_prepare.existing.log'.format(date)
        # Create the log file, truncating one left over from an earlier run.
        with open(log_file, 'w'):
            pass

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # The supplied command is reported exactly once.
    message = ('\n'
               'CAT prepare is running, constructing only parts of the '
               'database that are missing. Rawr!\n\n'
               'WARNING: CAT prepare at this point does not check whether the '
               'existing files are OK or corrupted, only if they are there.\n'
               'WARNING: note that the database and taxonomy files should be '
               'downloaded preferably at the same date.\n'
               'WARNING: preparing the database files may take a couple of '
               'hours.\n\n'
               'Supplied command: {0}\n\n'
               'Taxonomy folder: {1}/\n'
               'Database folder: {2}/\n'
               'Log file: {3}\n\n'
               '-----------------\n'.format(' '.join(sys.argv),
                                            taxonomy_folder,
                                            database_folder,
                                            log_file))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check DIAMOND path.
    error = check.check_diamond_binaries(path_to_diamond, log_file, quiet)
    if error:
        sys.exit(1)

    # Check taxonomy folder. The inspection result is compared with [None]
    # to recognise a folder that does not exist.
    taxonomy_folder_inspect = check.inspect_taxonomy_folder(taxonomy_folder)
    if taxonomy_folder_inspect == [None]:
        message = ('Taxonomy folder not found. Directory will be created '
                   'fresh and taxonomy files downloaded to it.')
        shared.give_user_feedback(message, log_file, quiet)

        nodes_dmp = None
        names_dmp = None
        prot_accession2taxid_file = None
    else:
        (nodes_dmp,
         names_dmp,
         prot_accession2taxid_file) = taxonomy_folder_inspect

        message = 'Taxonomy folder found.'
        shared.give_user_feedback(message, log_file, quiet)

    # nodes.dmp and names.dmp belong together: exactly one of them being
    # present is an error.
    if (nodes_dmp is None) != (names_dmp is None):
        message = ('ERROR: CAT prepare did not find both nodes.dmp and '
                   'names.dmp in the taxonomy folder. They should be '
                   'downloaded together. Remove {0} and try again.'
                   ''.format([file_ for file_ in (nodes_dmp, names_dmp)
                              if file_ is not None][0]))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if nodes_dmp is None and names_dmp is None:
        message = ('Nodes.dmp and names.dmp will be downloaded to taxonomy '
                   'folder.')
        shared.give_user_feedback(message, log_file, quiet)

        step_list.append('download_taxonomy_files')
    else:
        message = 'Nodes.dmp found: {0}.'.format(nodes_dmp)
        shared.give_user_feedback(message, log_file, quiet)

        message = 'Names.dmp found: {0}.'.format(names_dmp)
        shared.give_user_feedback(message, log_file, quiet)

    if prot_accession2taxid_file is None:
        message = ('Prot.accession2taxid file will be downloaded to taxonomy '
                   'folder.')
        shared.give_user_feedback(message, log_file, quiet)

        prot_accession2taxid_file = ('{0}/{1}.prot.accession2taxid.gz'
                                     ''.format(taxonomy_folder, date))
        step_list.append('download_prot_accession2taxid_file')
    else:
        message = ('Prot.accession2taxid file found: {0}.'
                   ''.format(prot_accession2taxid_file))
        shared.give_user_feedback(message, log_file, quiet)

    # Check database folder.
    database_folder_inspect = check.inspect_database_folder(database_folder)
    if database_folder_inspect == [None]:
        message = ('Database folder not found. Directory will be created '
                   'fresh and necessary database files will be downloaded to '
                   'it / constructed in it.')
        shared.give_user_feedback(message, log_file, quiet)

        nr_file = None
        diamond_database = None
        fastaid2LCAtaxid_file = None
        taxids_with_multiple_offspring_file = None
    else:
        (nr_file,
         diamond_database,
         fastaid2LCAtaxid_file,
         taxids_with_multiple_offspring_file) = database_folder_inspect

        message = 'Database folder found.'
        shared.give_user_feedback(message, log_file, quiet)

    # Without an nr file, the downstream files must be either all present
    # or all absent.
    tmp = (diamond_database,
           fastaid2LCAtaxid_file,
           taxids_with_multiple_offspring_file)
    if (nr_file is None
            and None in tmp
            and not all([file_ is None for file_ in tmp])):
        message = ('ERROR: Database folder does not contain an nr file, while '
                   'some but not all of the downstream files that depend on '
                   'it are present. In order to prevent strange bugs from '
                   'arising, please remove all files from the database folder '
                   'and try again.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if (fastaid2LCAtaxid_file is None
            and taxids_with_multiple_offspring_file is not None):
        message = ('ERROR: file taxids_with_multiple_offspring exists but '
                   'fastaid2LCAtaxid is not found in the database folder '
                   'whilst taxids_with_multiple_offspring depends on it. In '
                   'order to prevent strange bugs from arising, please remove '
                   '{0} and try again.'
                   ''.format(taxids_with_multiple_offspring_file))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    # The nr file is only an input for constructing the other database
    # files; when those are all present it does not have to be downloaded.
    whether_to_download_nr = True
    if (nr_file is None
            and diamond_database is not None
            and fastaid2LCAtaxid_file is not None
            and taxids_with_multiple_offspring_file is not None):
        whether_to_download_nr = False

    if nr_file is None:
        if whether_to_download_nr:
            message = 'Nr file will be downloaded to database folder.'
            shared.give_user_feedback(message, log_file, quiet)

            nr_file = '{0}/{1}.nr.gz'.format(database_folder, date)
            step_list.append('download_nr')
    else:
        message = 'Nr file found: {0}.'.format(nr_file)
        shared.give_user_feedback(message, log_file, quiet)

    if diamond_database is None:
        message = 'DIAMOND database will be constructed from the nr file.'
        shared.give_user_feedback(message, log_file, quiet)

        diamond_database_prefix = '{0}/{1}.nr'.format(database_folder, date)
        step_list.append('make_diamond_database')
    else:
        message = 'DIAMOND database found: {0}.'.format(diamond_database)
        shared.give_user_feedback(message, log_file, quiet)

        # The prefix is the database path without its .dmnd extension.
        diamond_database_prefix = diamond_database.rsplit('.dmnd', 1)[0]

    if fastaid2LCAtaxid_file is None:
        message = 'File fastaid2LCAtaxid will be created.'
        shared.give_user_feedback(message, log_file, quiet)

        fastaid2LCAtaxid_file = ('{0}/{1}.nr.fastaid2LCAtaxid'
                                 ''.format(database_folder, date))
        step_list.append('make_fastaid2LCAtaxid_file')
    else:
        message = ('Fastaid2LCAtaxid found: {0}.'
                   ''.format(fastaid2LCAtaxid_file))
        shared.give_user_feedback(message, log_file, quiet)

    if taxids_with_multiple_offspring_file is None:
        message = 'File taxids_with_multiple_offspring will be created.'
        shared.give_user_feedback(message, log_file, quiet)

        taxids_with_multiple_offspring_file = ('{0}/{1}.nr.taxids_with_'
                                               'multiple_offspring'
                                               ''.format(database_folder,
                                                         date))
        step_list.append('make_taxids_with_multiple_offspring_file')
    else:
        message = ('Taxids_with_multiple_offspring found: {0}.'
                   ''.format(taxids_with_multiple_offspring_file))
        shared.give_user_feedback(message, log_file, quiet)

    if nr_file is None and whether_to_download_nr is False:
        # This is pushed here just for the logic of the user.
        message = ('NOTE: Database folder contains all the necessary files '
                   'except for nr.gz. Since nr.gz is not used by CAT or BAT, '
                   'this is fine.')
        shared.give_user_feedback(message, log_file, quiet)

    if taxonomy_folder_inspect == [None] and database_folder_inspect == [None]:
        message = ('\n-----------------\n\n'
                   'WARNING: no taxonomy or database folder was found. CAT '
                   'prepare will create them fresh. Are you sure you are '
                   'linking to existing folders?')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if ('make_fastaid2LCAtaxid_file' in step_list
            or 'make_taxids_with_multiple_offspring_file' in step_list):
        # Check memory, only needed when one of these files is constructed.
        min_mem = 100
        (total_memory, error) = check.check_memory(min_mem)
        if error:
            message = ('ERROR: at least {0}GB of memory is needed for the '
                       'database construction. {1}GB is found on your system. '
                       'You can either try to find a machine with more '
                       'memory, or download preconstructed database files '
                       'from tbb.bio.uu.nl/bastiaan/CAT_prepare/.'
                       ''.format(min_mem, total_memory))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    if len(step_list) == 0:
        message = ('All necessary files are found. Existing database does not '
                   'need any more work...')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
    else:
        message = 'Ready to fly!\n\n-----------------\n'
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if taxonomy_folder_inspect == [None]:
        os.mkdir(taxonomy_folder)
        message = 'Taxonomy folder {0} is created.'.format(taxonomy_folder)
        shared.give_user_feedback(message, log_file, quiet)

    if database_folder_inspect == [None]:
        os.mkdir(database_folder)
        message = 'Database folder {0} is created.'.format(database_folder)
        shared.give_user_feedback(message, log_file, quiet)

    prepare(step_list,
            taxonomy_folder,
            database_folder,
            date,
            prot_accession2taxid_file,
            nr_file,
            path_to_diamond,
            diamond_database_prefix,
            nproc,
            fastaid2LCAtaxid_file,
            taxids_with_multiple_offspring_file,
            log_file,
            quiet)
def run_fresh(args, date):
    """Construct a complete CAT database from scratch.

    Both the taxonomy folder and the database folder have to be absent or
    free of taxonomy / database files; otherwise an error is reported and
    the process exits. Missing folders are created, date-stamped target
    paths are chosen for all files, and ``prepare`` is called with the full
    list of download and construction steps.

    Args:
        args: parsed command line arguments, unpacked by
            ``check.convert_arguments``.
        date: string used as prefix for the log file and all created files.

    The process is terminated with ``sys.exit(1)`` when the DIAMOND binary
    check fails, when a folder already contains files, or when
    ``check.check_memory`` reports an error for the 100GB minimum.
    """
    (database_folder,
     taxonomy_folder,
     path_to_diamond,
     quiet,
     no_log,
     nproc) = check.convert_arguments(args)

    log_file = None
    if not no_log:
        log_file = '{0}.CAT_prepare.fresh.log'.format(date)
        # Create the log file, truncating one left over from an earlier run.
        with open(log_file, 'w'):
            pass

    def tell(text, **options):
        # All feedback of this run goes to the same log file / verbosity.
        shared.give_user_feedback(text, log_file, quiet, **options)

    tell('# CAT v{0}.'.format(about.__version__), show_time=False)

    tell('\n'
         'CAT prepare is running, constructing a fresh database.\n'
         'Rawr!\n\n'
         'WARNING: preparing the database files may take a couple of '
         'hours.\n\n'
         'Supplied command: {0}\n\n'
         'Taxonomy folder: {1}/\n'
         'Database folder: {2}/\n'
         'Log file: {3}\n\n'
         '-----------------\n'.format(' '.join(sys.argv),
                                      taxonomy_folder,
                                      database_folder,
                                      log_file),
         show_time=False)

    # The DIAMOND binary is needed to build the database.
    if check.check_diamond_binaries(path_to_diamond, log_file, quiet):
        sys.exit(1)

    # Taxonomy folder: an inspection result of [None] means it is absent.
    taxonomy_folder_inspect = check.inspect_taxonomy_folder(taxonomy_folder)
    taxonomy_folder_exists = taxonomy_folder_inspect != [None]
    if taxonomy_folder_exists:
        if any(entry is not None for entry in taxonomy_folder_inspect):
            tell('ERROR: taxonomy folder {0} exists already and '
                 'contains taxonomy files. Please supply a novel or '
                 'empty folder if you want to start fresh, or run '
                 'CAT prepare --existing.'
                 ''.format(taxonomy_folder),
                 error=True)
            sys.exit(1)

        tell('Taxonomy folder exists already. Taxonomy files will be '
             'downloaded to it.')

    # Database folder, same rules as for the taxonomy folder.
    database_folder_inspect = check.inspect_database_folder(database_folder)
    database_folder_exists = database_folder_inspect != [None]
    if database_folder_exists:
        if any(entry is not None for entry in database_folder_inspect):
            tell('ERROR: database folder {0} exists already and '
                 'contains database files. Please supply a novel or '
                 'empty folder if you want to start fresh.'
                 ''.format(database_folder),
                 error=True)
            sys.exit(1)

        tell('Database folder exists already. Database file will be '
             'downloaded to it / constructed in it.')

    # Memory requirement for the construction steps.
    min_mem = 100
    (total_memory, memory_error) = check.check_memory(min_mem)
    if memory_error:
        tell('ERROR: at least {0}GB of memory is needed for a fresh '
             'database construction. {1}GB is found on your system. You '
             'can either try to find a machine with more memory, or '
             'download preconstructed database files from '
             'tbb.bio.uu.nl/bastiaan/CAT_prepare/.'
             ''.format(min_mem, total_memory),
             error=True)
        sys.exit(1)

    # Folders are only created after all checks have passed.
    if not taxonomy_folder_exists:
        os.mkdir(taxonomy_folder)
        tell('{0} is created.'.format(taxonomy_folder))

    if not database_folder_exists:
        os.mkdir(database_folder)
        tell('{0} is created.'.format(database_folder))

    prot_accession2taxid_file = ('{0}/{1}.prot.accession2taxid.gz'
                                 ''.format(taxonomy_folder, date))
    nr_file = '{0}/{1}.nr.gz'.format(database_folder, date)
    diamond_database_prefix = '{0}/{1}.nr'.format(database_folder, date)
    fastaid2LCAtaxid_file = ('{0}/{1}.nr.fastaid2LCAtaxid'
                             ''.format(database_folder, date))
    taxids_with_multiple_offspring_file = ('{0}/{1}.nr.taxids_with_multiple_'
                                           'offspring'
                                           ''.format(database_folder, date))

    # A fresh run carries out every step.
    step_list = ['download_taxonomy_files',
                 'download_prot_accession2taxid_file',
                 'download_nr',
                 'make_diamond_database',
                 'make_fastaid2LCAtaxid_file',
                 'make_taxids_with_multiple_offspring_file']

    prepare(step_list,
            taxonomy_folder,
            database_folder,
            date,
            prot_accession2taxid_file,
            nr_file,
            path_to_diamond,
            diamond_database_prefix,
            nproc,
            fastaid2LCAtaxid_file,
            taxids_with_multiple_offspring_file,
            log_file,
            quiet)
def run():
    """Run the BAT bin classification workflow.

    Arguments are obtained from ``parse_arguments``. Depending on which
    intermediate files were supplied, the bins are concatenated and proteins
    predicted (Prodigal) and/or aligned (DIAMOND) first. Two tab-separated
    files are then written: ``<out_prefix>.ORF2LCA.txt`` with one line per
    ORF, and ``<out_prefix>.bin2classification.txt`` with one line per bin
    (or one line per lineage when a bin gets several).

    The process is terminated with ``sys.exit(1)`` when an alignment file is
    supplied without a predicted protein fasta, or when any pre-flight check
    reports an error.
    """
    args = parse_arguments()

    def tell(text, **options):
        # All feedback of this run goes to the same log file / verbosity.
        shared.give_user_feedback(text, args.log_file, args.quiet, **options)

    tell('# CAT v{0}.'.format(about.__version__), show_time=False)

    # Decide at which state to start.
    proteins_supplied = bool(args.proteins_fasta)
    alignment_supplied = bool(args.alignment_file)

    if alignment_supplied and not proteins_supplied:
        tell('if you want BAT to directly classify a set of bins, you '
             'should not only supply a DIAMOND alignment table but also a '
             'concatenated predicted protein fasta file with argument '
             '[-p / --proteins].',
             error=True)
        sys.exit(1)

    if not proteins_supplied:
        tell('\n'
             'BAT is running. Protein prediction, alignment, and bin '
             'classification are carried out.',
             show_time=False)
        step_list = ['predict_proteins', 'align']
    elif not alignment_supplied:
        tell('\n'
             'BAT is running. Since a predicted protein fasta is supplied, '
             'only alignment and bin classification are carried out.',
             show_time=False)
        step_list = ['align']
    else:
        tell('\n'
             'BAT is running. Since a predicted protein fasta and '
             'alignment file are supplied, only bin classification is '
             'carried out.',
             show_time=False)
        step_list = []

    step_list.append('classify')

    # Report the settings of this run.
    tell('Rarw!\n\n'
         'Supplied command: {0}\n\n'
         'Bin folder: {1}\n'
         'Taxonomy folder: {2}\n'
         'Database folder: {3}\n'
         'Parameter r: {4}\n'
         'Parameter f: {5}\n'
         'Log file: {6}\n\n'
         '-----------------\n'.format(' '.join(sys.argv),
                                      args.bin_folder,
                                      args.taxonomy_folder,
                                      args.database_folder,
                                      int(args.r),
                                      float(args.f),
                                      args.log_file),
         show_time=False)

    # Pre-flight: binaries, output files, taxonomy and database folders.
    # Every check is run so that all problems are reported in one go.
    tell('Doing some pre-flight checks first.', show_time=False)

    errors = [
        check.check_bin_folder(
            args.bin_folder, args.bin_suffix, args.log_file, args.quiet),
        check.check_out_prefix(args.out_prefix, args.log_file, args.quiet),
    ]

    if 'predict_proteins' in step_list:
        errors.append(
            check.check_prodigal_binaries(
                args.path_to_prodigal, args.log_file, args.quiet))

        # Paths of the files this run will create are stored on args.
        args.concatenated_fasta = '{0}.concatenated.fasta'.format(
            args.out_prefix)
        args.proteins_fasta = (
            '{0}.concatenated.predicted_proteins.faa'.format(
                args.out_prefix))
        args.proteins_gff = (
            '{0}.concatenated.predicted_proteins.gff'.format(
                args.out_prefix))

        if not args.force:
            for planned_file in (args.concatenated_fasta,
                                 args.proteins_fasta,
                                 args.proteins_gff):
                errors.append(
                    check.check_output_file(
                        planned_file, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
            check.check_diamond_binaries(
                args.path_to_diamond, args.log_file, args.quiet))

        args.alignment_file = '{0}.concatenated.alignment.diamond'.format(
            args.out_prefix)

        if not args.force:
            errors.append(
                check.check_output_file(
                    args.alignment_file, args.log_file, args.quiet))

    errors.append(
        check.check_folders_for_run(
            args.taxonomy_folder,
            args.nodes_dmp,
            args.names_dmp,
            args.database_folder,
            args.diamond_database,
            args.fastaid2LCAtaxid_file,
            args.taxids_with_multiple_offspring_file,
            step_list,
            args.log_file,
            args.quiet))

    args.bin2classification_output_file = (
        '{0}.bin2classification.txt'.format(args.out_prefix))
    args.ORF2LCA_output_file = '{0}.ORF2LCA.txt'.format(args.out_prefix)

    if not args.force:
        for planned_file in (args.bin2classification_output_file,
                             args.ORF2LCA_output_file):
            errors.append(
                check.check_output_file(
                    planned_file, args.log_file, args.quiet))

    if 'predict_proteins' not in step_list:
        # A user-supplied protein file has to pass the fasta check.
        errors.append(
            check.check_fasta(
                args.proteins_fasta, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
            check.check_top(args.top, args.r, args.log_file, args.quiet))

    # Print all variables.
    shared.print_variables(args, step_list)

    if True in errors:
        sys.exit(1)

    tell('Ready to fly!\n\n-----------------\n', show_time=False)

    # Start BAT.
    (bin2contigs, contig_names) = import_bins(
        args.bin_folder, args.bin_suffix, args.log_file, args.quiet)

    if 'predict_proteins' in step_list:
        make_concatenated_fasta(
            args.concatenated_fasta,
            bin2contigs,
            args.bin_folder,
            args.log_file,
            args.quiet)

        shared.run_prodigal(
            args.path_to_prodigal,
            args.concatenated_fasta,
            args.proteins_fasta,
            args.proteins_gff,
            args.log_file,
            args.quiet)

    contig2ORFs = shared.import_ORFs(
        args.proteins_fasta, args.log_file, args.quiet)

    check.check_whether_ORFs_are_based_on_contigs(
        contig_names, contig2ORFs, args.log_file, args.quiet)

    if 'align' in step_list:
        shared.run_diamond(args)

    (ORF2hits, all_hits) = shared.parse_tabular_alignment(
        args.alignment_file, args.one_minus_r, args.log_file, args.quiet)

    (taxid2parent, taxid2rank) = tax.import_nodes(
        args.nodes_dmp, args.log_file, args.quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(
        args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet)
    taxids_with_multiple_offspring = (
        tax.import_taxids_with_multiple_offspring(
            args.taxids_with_multiple_offspring_file,
            args.log_file,
            args.quiet))

    tell('BAT is flying! Files {0} and {1} are created.'.format(
        args.bin2classification_output_file, args.ORF2LCA_output_file))

    n_classified_bins = 0

    with open(args.bin2classification_output_file, 'w') as bin_out, \
            open(args.ORF2LCA_output_file, 'w') as ORF_out:
        bin_out.write(
            '# bin\tclassification\treason\tlineage\tlineage scores\n')
        ORF_out.write(
            '# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n')

        for bin_ in sorted(bin2contigs):
            # The LCAs of all ORFs on all contigs of the bin are pooled.
            LCAs_ORFs = []

            for contig in sorted(bin2contigs[bin_]):
                if contig not in contig2ORFs:
                    continue

                for ORF in contig2ORFs[contig]:
                    if ORF not in ORF2hits:
                        ORF_out.write('{0}\t{1}\tORF has no hit to database\n'
                                      ''.format(ORF, bin_))
                        continue

                    n_hits = len(ORF2hits[ORF])

                    (taxid, top_bitscore) = tax.find_LCA_for_ORF(
                        ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)

                    if taxid.startswith('no taxid found'):
                        # Report the explanatory text instead of a lineage.
                        reported = taxid
                    else:
                        lineage = tax.find_lineage(taxid, taxid2parent)
                        if not args.no_stars:
                            lineage = tax.star_lineage(
                                lineage, taxids_with_multiple_offspring)
                        reported = ';'.join(lineage[::-1])

                    ORF_out.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                        ORF, bin_, n_hits, reported, top_bitscore))

                    # ORFs without a taxid are kept as well; the weighted
                    # LCA step reports when none has a usable taxid.
                    LCAs_ORFs.append((taxid, top_bitscore))

            if not LCAs_ORFs:
                bin_out.write('{0}\tno taxid assigned\tno hits to database\n'
                              ''.format(bin_))
                continue

            (lineages,
             lineages_scores,
             based_on_n_ORFs) = tax.find_weighted_LCA(
                 LCAs_ORFs, taxid2parent, args.f)

            if lineages == 'no ORFs with taxids found.':
                bin_out.write('{0}\tno taxid assigned\t'
                              'hits not found in taxonomy files\n'
                              ''.format(bin_))
                continue

            if lineages == 'no lineage whitelisted.':
                bin_out.write('{0}\tno taxid assigned\t'
                              'no lineage reached minimum bit-score support\n'
                              ''.format(bin_))
                continue

            # The bin has a valid classification.
            n_classified_bins += 1

            total_n_ORFs = sum(
                len(contig2ORFs[contig])
                for contig in bin2contigs[bin_]
                if contig in contig2ORFs)
            single_classification = len(lineages) == 1

            for (index, lineage) in enumerate(lineages):
                if not args.no_stars:
                    lineage = tax.star_lineage(
                        lineage, taxids_with_multiple_offspring)
                scores = ['{0:.2f}'.format(score)
                          for score in lineages_scores[index]]
                lineage_column = ';'.join(lineage[::-1])
                scores_column = ';'.join(scores[::-1])

                if single_classification:
                    bin_out.write(
                        '{0}\t'
                        'taxid assigned\t'
                        'based on {1}/{2} ORFs\t'
                        '{3}\t'
                        '{4}\n'.format(
                            bin_,
                            based_on_n_ORFs,
                            total_n_ORFs,
                            lineage_column,
                            scores_column))
                else:
                    # Number the alternatives as (i/total).
                    bin_out.write(
                        '{0}\t'
                        'taxid assigned ({1}/{2})\t'
                        'based on {3}/{4} ORFs\t'
                        '{5}\t'
                        '{6}\n'.format(
                            bin_,
                            index + 1,
                            len(lineages),
                            based_on_n_ORFs,
                            total_n_ORFs,
                            lineage_column,
                            scores_column))

    tell('\n-----------------\n\n'
         '{0} BAT is done! {1:,d}/{2:,d} bins have taxonomy assigned.'
         ''.format(shared.timestamp(), n_classified_bins, len(bin2contigs)),
         show_time=False)

    if args.f < 0.5:
        tell('\nWARNING: since f is set to smaller than 0.5, one bin '
             'may have multiple classifications.',
             show_time=False)
def run_existing(args):
    """Complete an existing CAT database by building only what is missing.

    Inspects the file attributes on ``args`` (nodes_dmp, names_dmp,
    prot_accession2taxid_file, nr_file, diamond_database,
    fastaid2LCAtaxid_file, taxids_with_multiple_offspring_file) together
    with the taxonomy and database folders, reports every finding through
    ``shared.give_user_feedback`` and collects the names of the preparation
    steps that still have to run. The collected steps are handed to
    ``prepare``.

    Exits with status 1 when the DIAMOND check reports an error or when the
    files that are present form an inconsistent set, and with status 0 when
    no step is needed at all.
    """
    def tell(text, **options):
        # All feedback shares the same log file and verbosity setting.
        shared.give_user_feedback(text, args.log_file, args.quiet, **options)

    def fail(text):
        # Report an error and stop the program with a failure status.
        tell(text, error=True)
        sys.exit(1)

    # Names of the preparation steps that still have to be carried out.
    pending = []

    tell('# CAT v{0}.'.format(about.__version__), show_time=False)

    banner = (
        '\n'
        'CAT prepare is running, constructing only parts of the database '
        'that are missing. Rawr!\n\n'
        'WARNING: CAT prepare does not check whether the existing files '
        'are OK or corrupted, only if they are there.\n'
        'WARNING: note that the database and taxonomy files should be '
        'downloaded preferably at the same date.\n'
        'WARNING: preparing the database files may take a couple of hours.'
        '\n\n'
        'Supplied command: {0}\n\n'
        'Taxonomy folder: {1}\n'
        'Database folder: {2}\n'
        'Log file: {3}\n\n'
        '-----------------\n'
    )
    tell(
        banner.format(
            ' '.join(sys.argv),
            args.taxonomy_folder,
            args.database_folder,
            args.log_file,
        ),
        show_time=False,
    )

    tell('Doing some pre-flight checks first.', show_time=False)

    # The DIAMOND binary has to be usable before anything else is planned.
    diamond_problem = check.check_diamond_binaries(
        args.path_to_diamond, args.log_file, args.quiet)
    if diamond_problem:
        sys.exit(1)

    # ---- Taxonomy folder ----
    if os.path.isdir(args.taxonomy_folder):
        tell('Taxonomy folder found.')
    else:
        tell('Taxonomy folder not found. Directory will be created '
             'fresh and taxonomy files downloaded to it.')

    has_nodes = bool(args.nodes_dmp)
    has_names = bool(args.names_dmp)
    if has_nodes != has_names:
        # Exactly one of the two dump files is present; name that one.
        lonely_file = args.nodes_dmp if has_nodes else args.names_dmp
        fail(
            'CAT prepare did not find both nodes.dmp and names.dmp in the '
            'taxonomy folder. They should be downloaded together. Remove '
            '{0} and try again.'.format(lonely_file)
        )

    if not (has_nodes or has_names):
        tell('Nodes.dmp and names.dmp will be downloaded to taxonomy '
             'folder.')
        pending.append('download_taxonomy_files')
    else:
        tell('Nodes.dmp found: {0}.'.format(args.nodes_dmp))
        tell('Names.dmp found: {0}.'.format(args.names_dmp))

    if args.prot_accession2taxid_file:
        tell('Prot.accession2taxid file found: {0}.'.format(
            args.prot_accession2taxid_file))
    else:
        # NOTE that the file will only be downloaded if a new
        # fastaid2LCAtaxid_file needs to be constructed.
        tell('Prot.accession2taxid file not found in taxonomy folder.')

    # ---- Database folder ----
    if os.path.isdir(args.database_folder):
        tell('Database folder found.')
    else:
        tell('Database folder not found. Directory will be created fresh '
             'and necessary database files will be downloaded to '
             'it / constructed in it.')

    # Files that are derived from the nr file.
    downstream = (
        args.diamond_database,
        args.fastaid2LCAtaxid_file,
        args.taxids_with_multiple_offspring_file,
    )
    some_missing = None in downstream
    all_missing = all(item is None for item in downstream)
    if not args.nr_file and some_missing and not all_missing:
        fail(
            'database folder does not contain an nr file, while some but '
            'not all of the downstream files that depend on it are '
            'present. In order to prevent strange bugs from arising, '
            'remove all files from the database folder and try again.'
        )

    if (args.taxids_with_multiple_offspring_file
            and not args.fastaid2LCAtaxid_file):
        fail(
            'file taxids_with_multiple_offspring exists but '
            'fastaid2LCAtaxid is not found in the database folder whilst '
            'taxids_with_multiple_offspring depends on it. In order to '
            'prevent strange bugs from arising, remove {0} and try again.'
            ''.format(args.taxids_with_multiple_offspring_file)
        )

    # The nr file can be skipped when it is absent but everything that is
    # built from it is already in place.
    nr_not_required = bool(
        not args.nr_file
        and args.diamond_database
        and args.fastaid2LCAtaxid_file
        and args.taxids_with_multiple_offspring_file
    )

    if args.nr_file:
        tell('Nr file found: {0}.'.format(args.nr_file))
    elif not nr_not_required:
        tell('Nr file will be downloaded to database folder.')
        pending.append('download_nr')

    if args.diamond_database:
        tell('DIAMOND database found: {0}.'.format(args.diamond_database))
    else:
        tell('DIAMOND database will be constructed from the nr file.')
        pending.append('make_diamond_database')

    if args.fastaid2LCAtaxid_file:
        tell('Fastaid2LCAtaxid found: {0}.'.format(
            args.fastaid2LCAtaxid_file))
        if not args.prot_accession2taxid_file:
            tell('Prot.accession2taxid file will not be needed.')
    else:
        if not args.prot_accession2taxid_file:
            tell('Prot.accession2taxid file will be downloaded to '
                 'taxonomy folder.')
            pending.append('download_prot_accession2taxid_file')
        tell('File fastaid2LCAtaxid will be created.')
        pending.append('make_fastaid2LCAtaxid_file')

    if args.taxids_with_multiple_offspring_file:
        tell('Taxids_with_multiple_offspring found: {0}'.format(
            args.taxids_with_multiple_offspring_file))
    else:
        tell('File taxids_with_multiple_offspring will be created.')
        pending.append('make_taxids_with_multiple_offspring_file')

    if nr_not_required:
        # This is pushed here just for the logic of the user.
        tell('NOTE: Database folder contains all the necessary files '
             'except for nr.gz. Since nr.gz is not used by CAT or BAT, '
             'this is fine.')

    if not (os.path.isdir(args.taxonomy_folder)
            or os.path.isdir(args.database_folder)):
        tell(
            '\n-----------------\n\n'
            'WARNING: no taxonomy or database folder was found. CAT '
            'prepare will create them fresh. Are you sure you are linking '
            'to existing folders?',
            show_time=False,
        )

    if 'make_fastaid2LCAtaxid_file' in pending:
        # Check memory.
        memory_bottleneck(args)

    if not pending:
        tell('All necessary files are found. Existing database does not '
             'need any more work...',
             show_time=False)
        sys.exit(0)

    tell('Ready to fly!\n\n-----------------\n', show_time=False)

    prepare(pending, args)


def run_fresh(args):
    """Build a complete CAT database from scratch.

    Prints the start-up banner, verifies the DIAMOND binary, and refuses to
    continue (exit status 1) when an existing taxonomy or database folder
    already holds any of the files that ``args`` tracks. After the memory
    check, every preparation step is handed to ``prepare`` in order.
    """
    def tell(text, **options):
        # All feedback shares the same log file and verbosity setting.
        shared.give_user_feedback(text, args.log_file, args.quiet, **options)

    tell('# CAT v{0}.'.format(about.__version__), show_time=False)

    banner = (
        '\n'
        'CAT prepare is running, constructing a fresh database.\n'
        'Rawr!\n\n'
        'WARNING: preparing the database files may take a couple of hours.'
        '\n\n'
        'Supplied command: {0}\n\n'
        'Taxonomy folder: {1}\n'
        'Database folder: {2}\n'
        'Log file: {3}\n\n'
        '-----------------\n'
    )
    tell(
        banner.format(
            ' '.join(sys.argv),
            args.taxonomy_folder,
            args.database_folder,
            args.log_file,
        ),
        show_time=False,
    )

    # Check diamond path.
    diamond_problem = check.check_diamond_binaries(
        args.path_to_diamond, args.log_file, args.quiet)
    if diamond_problem:
        sys.exit(1)

    if os.path.isdir(args.taxonomy_folder):
        taxonomy_in_use = (
            args.nodes_dmp
            or args.names_dmp
            or args.prot_accession2taxid_file
        )
        if taxonomy_in_use:
            taxonomy_clash = (
                'taxonomy folder {0} exists already and contains taxonomy '
                'files. Supply a novel or empty folder if you want '
                'to start fresh, or run CAT prepare --existing.'
            )
            tell(taxonomy_clash.format(args.taxonomy_folder), error=True)
            sys.exit(1)

        tell('Taxonomy folder exists already. Taxonomy files will be '
             'downloaded to it.')

    if os.path.isdir(args.database_folder):
        database_in_use = (
            args.nr_file
            or args.diamond_database
            or args.fastaid2LCAtaxid_file
            or args.taxids_with_multiple_offspring_file
        )
        if database_in_use:
            database_clash = (
                'database folder {0} exists already and contains database '
                'files. Supply a novel or empty folder if you want to '
                'start fresh.'
            )
            tell(database_clash.format(args.database_folder), error=True)
            sys.exit(1)

        tell('Database folder exists already. Database file will be '
             'downloaded to it / constructed in it.')

    # Check memory.
    memory_bottleneck(args)

    # A fresh build runs every step, in this order.
    all_steps = [
        'download_taxonomy_files',
        'download_prot_accession2taxid_file',
        'download_nr',
        'make_diamond_database',
        'make_fastaid2LCAtaxid_file',
        'make_taxids_with_multiple_offspring_file',
    ]

    prepare(all_steps, args)