def main():
    """Parse the command line and validate the taxonomy-assignment options.

    Checks that the options required by the selected assignment method
    ('blast' or 'rdp') were supplied.  Every problem is reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            # "except X as e" is accepted by Python 2.6+ and Python 3,
            # unlike the Python 2-only "except X, e" spelling.
            option_parser.error(e)

        # The id-to-taxonomy map and the reference sequences must be passed
        # together (both are used to train the classifier); passing neither
        # is accepted.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')
def main():
    """Parse the command line and validate the taxonomy-assignment options.

    Verifies that the options needed by the chosen assignment method
    ('blast' or 'rdp') are present, reporting any problem through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            # The "as" form parses on Python 2.6+ and Python 3; the old
            # comma form is a syntax error on Python 3.
            option_parser.error(e)

        # Retraining needs the taxonomy map and the reference sequences as
        # a pair: reject the case where only one of them was given.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')
def main():
    """Assign taxonomy to the input sequences with the selected method.

    Parses the command line, checks that the options required by the chosen
    assignment method were supplied (problems are reported through
    ``option_parser.error``), builds the parameter dict for the matching
    taxon assigner and runs it.  Assignments are written to
    ``<output_dir>/<input basename>_tax_assignments.txt`` and the log to
    ``<output_dir>/<input basename>_tax_assignments.log``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method
    similarity = opts.similarity
    sortmerna_coverage = opts.sortmerna_coverage
    sortmerna_db = opts.sortmerna_db

    if assignment_method == 'sortmerna':
        # similarity must be between (0,1]
        if not 0 < similarity <= 1:
            option_parser.error('--similarity must be between (0,1].')
        # coverage must be between (0,1]
        if not 0 < sortmerna_coverage <= 1:
            option_parser.error('--sortmerna_coverage must be '
                                'between (0,1].')
        # check ID to taxonomy filepath
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with sortmerna.')
        # check reference sequences filepath
        if not opts.reference_seqs_fp:
            option_parser.error(
                'sortmerna always requires --reference_seqs_fp '
                '(with or without sortmerna_db)')
        # check indexed database, if provided (not mandatory)
        elif sortmerna_db:
            if not isfile(sortmerna_db + '.stats'):
                option_parser.error('%s does not exist, make sure you have '
                                    'indexed the database using indexdb_rna' %
                                    (sortmerna_db + '.stats'))

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection '
                                'of reference sequences (via -r) must be '
                                'passed to assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The taxonomy map and the reference sequences must be passed
        # together; passing neither is accepted.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('--reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.blast_e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['similarity'] = similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'sortmerna':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['sortmerna_db'] = sortmerna_db
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        # The options are fractions in (0,1]; the assigner parameters are
        # percentages.
        params['min_percent_id'] = float(similarity * 100.0)
        params['min_percent_cov'] = float(sortmerna_coverage * 100.0)
        params['best_N_alignments'] = opts.sortmerna_best_N_alignments
        params['e_value'] = opts.sortmerna_e_value
        params['threads'] = opts.sortmerna_threads

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] =\
            opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    taxon_assigner = taxon_assigner_constructor(params)
    if assignment_method == "sortmerna":
        # sortmerna writes the final result file directly, so no temporary
        # file is created for it.
        taxon_assigner(input_sequences_filepath,
                       result_path=result_path,
                       log_path=log_path)
    else:
        fd, temp_result_path = mkstemp(prefix='assign-tax')
        close(fd)
        taxon_assigner(input_sequences_filepath,
                       result_path=temp_result_path,
                       log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners (except for sortmerna, which already outputs
        # only the first field for all headers in the Blast tabular output).
        # The output taxonomy maps that are returned by the taxon assigners
        # contain the full sequence headers as the first field (so including
        # "comment" text in the fasta headers), but for consistency with the
        # input taxonomy maps, should only contain the sequence identifier.
        # This modifies those entries to contain only the sequence identifier,
        # discarding any comment information. The formatting of these result
        # files needs to be centralized, and at that stage this processing
        # should happen there rather than here.
        with open(result_path, 'w') as result_f:
            # NOTE: 'U' is the Python 2 universal-newlines mode; it was
            # removed from open() in Python 3.11.
            with open(temp_result_path, 'U') as temp_result_f:
                for line in temp_result_f:
                    stripped = line.strip()
                    if not stripped:
                        # A blank line has no sequence identifier to keep.
                        continue
                    fields = stripped.split('\t')
                    seq_id = fields[0].split()[0]
                    result_f.write('%s\t%s\n' %
                                   (seq_id, '\t'.join(fields[1:])))
        remove_files([temp_result_path])
def main():
    """Run taxonomy assignment for the method chosen on the command line.

    Parses the command line, validates the options each assignment method
    needs (problems are reported through ``option_parser.error``), collects
    the parameters for the matching taxon assigner and invokes it.  The
    assignments end up in
    ``<output_dir>/<input basename>_tax_assignments.txt`` and the log in
    ``<output_dir>/<input basename>_tax_assignments.log``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method
    similarity = opts.similarity
    sortmerna_coverage = opts.sortmerna_coverage
    sortmerna_db = opts.sortmerna_db

    if assignment_method == 'sortmerna':
        # similarity must be between (0,1]
        if not 0 < similarity <= 1:
            option_parser.error('--similarity must be between (0,1].')
        # coverage must be between (0,1]
        if not 0 < sortmerna_coverage <= 1:
            option_parser.error('--sortmerna_coverage must be '
                                'between (0,1].')
        # check ID to taxonomy filepath
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with sortmerna.')
        # check reference sequences filepath
        if not opts.reference_seqs_fp:
            option_parser.error(
                'sortmerna always requires --reference_seqs_fp '
                '(with or without sortmerna_db)')
        # check indexed database, if provided (not mandatory)
        elif sortmerna_db:
            if not isfile(sortmerna_db + '.stats'):
                option_parser.error('%s does not exist, make sure you have '
                                    'indexed the database using indexdb_rna' %
                                    (sortmerna_db + '.stats'))

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection '
                                'of reference sequences (via -r) must be '
                                'passed to assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # Retraining needs the taxonomy map and the reference sequences as
        # a pair: reject the case where only one of them was given.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('--reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.blast_e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['similarity'] = similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'sortmerna':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['sortmerna_db'] = sortmerna_db
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        # Convert the (0,1] fractions from the command line to percentages.
        params['min_percent_id'] = float(similarity * 100.0)
        params['min_percent_cov'] = float(sortmerna_coverage * 100.0)
        params['best_N_alignments'] = opts.sortmerna_best_N_alignments
        params['e_value'] = opts.sortmerna_e_value
        params['threads'] = opts.sortmerna_threads

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] =\
            opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    taxon_assigner = taxon_assigner_constructor(params)
    if assignment_method == "sortmerna":
        # The sortmerna assigner is pointed straight at the final result
        # file, so a temporary file would only be left behind.
        taxon_assigner(input_sequences_filepath,
                       result_path=result_path,
                       log_path=log_path)
    else:
        fd, temp_result_path = mkstemp(prefix='assign-tax')
        close(fd)
        taxon_assigner(input_sequences_filepath,
                       result_path=temp_result_path,
                       log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners (except for sortmerna, which already outputs
        # only the first field for all headers in the Blast tabular output).
        # The output taxonomy maps that are returned by the taxon assigners
        # contain the full sequence headers as the first field (so including
        # "comment" text in the fasta headers), but for consistency with the
        # input taxonomy maps, should only contain the sequence identifier.
        # This modifies those entries to contain only the sequence identifier,
        # discarding any comment information. The formatting of these result
        # files needs to be centralized, and at that stage this processing
        # should happen there rather than here.
        with open(result_path, 'w') as result_f:
            # NOTE: 'U' is the Python 2 universal-newlines mode; it was
            # removed from open() in Python 3.11.
            with open(temp_result_path, 'U') as temp_result_f:
                for line in temp_result_f:
                    stripped = line.strip()
                    if not stripped:
                        # Nothing to rewrite on an empty line.
                        continue
                    fields = stripped.split('\t')
                    seq_id = fields[0].split()[0]
                    result_f.write('%s\t%s\n' %
                                   (seq_id, '\t'.join(fields[1:])))
        remove_files([temp_result_path])
def main():
    """Assign taxonomy to the input sequences with the selected method.

    Parses the command line, checks that the options required by the chosen
    assignment method were supplied (problems are reported through
    ``option_parser.error``), builds the parameter dict for the matching
    taxon assigner and runs it.  Assignments are written to
    ``<output_dir>/<input basename>_tax_assignments.txt`` and the log to
    ``<output_dir>/<input basename>_tax_assignments.log``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The taxonomy map and the reference sequences must be passed
        # together; passing neither is accepted.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('Option --reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    if assignment_method == 'tax2tree':
        if opts.tree_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath to a prebuilt tree (via --tree_fp) containing '
                'both the representative and reference sequences. Check '
                'Tax2Tree documentation for help building a tree.')
        if opts.id_to_taxonomy_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath for an id_to_taxonomy file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.uclust_min_consensus_fraction
        params['similarity'] = opts.uclust_similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] =\
            opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    elif assignment_method == 'tax2tree':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['tree_fp'] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence
    # identifier. This modifies those entries to contain only the sequence
    # identifier, discarding any comment information. The formatting of these
    # result files needs to be centralized, and at that stage this processing
    # should happen there rather than here.
    with open(result_path, 'w') as result_f:
        # NOTE: 'U' is the Python 2 universal-newlines mode; it was removed
        # from open() in Python 3.11.
        with open(temp_result_path, 'U') as temp_result_f:
            for line in temp_result_f:
                stripped = line.strip()
                if not stripped:
                    # A blank line has no sequence identifier to keep.
                    continue
                fields = stripped.split('\t')
                seq_id = fields[0].split()[0]
                result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    remove_files([temp_result_path])
def main():
    """Assign taxonomy to the input sequences with the selected method.

    Parses the command line, checks that the options required by the chosen
    assignment method were supplied (problems are reported through
    ``option_parser.error``), builds the parameter dict for the matching
    taxon assigner and runs it.  Assignments are written to
    ``<output_dir>/<input basename>_tax_assignments.txt`` and the log to
    ``<output_dir>/<input basename>_tax_assignments.log``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == "blast":
        if not opts.id_to_taxonomy_fp:
            option_parser.error("Option --id_to_taxonomy_fp is required when "
                                "assigning with blast.")
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                "Either a blast db (via -b) or a collection of "
                "reference sequences (via -r) must be passed to "
                "assign taxonomy using blast."
            )

    if assignment_method == "rdp":
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The taxonomy map and the reference sequences must be passed
        # together; passing neither is accepted.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    "A filepath for reference sequences must be "
                    "specified (via -r) along with the id_to_taxonomy "
                    "file to train the Rdp Classifier."
                )
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                "A filepath for an id to taxonomy map must be "
                "specified (via -t) along with the reference "
                "sequences fp to train the Rdp Classifier."
            )

    if assignment_method == "uclust":
        if opts.id_to_taxonomy_fp is None:
            option_parser.error("Option --id_to_taxonomy_fp is required when "
                                "assigning with uclust.")
        if opts.reference_seqs_fp is None:
            option_parser.error("Option --reference_seqs_fp is required when "
                                "assigning with uclust.")

    if assignment_method == "rtax":
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                "RTAX classification requires both a filepath for "
                "reference sequences (via -r) and an id_to_taxonomy "
                "file (via -t)."
            )
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                "RTAX classification requires the FASTA files "
                "produced by split_illumina_fastq.py for both reads, "
                "in addition to the cluster representatives.  Pass "
                "these via --read_1_seqs_fp and --read_2_seqs_fp."
            )

    if assignment_method == "mothur":
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                "Mothur classification requires both a filepath for "
                "reference sequences (via -r) and an id_to_taxonomy "
                "file (via -t)."
            )

    if assignment_method == "tax2tree":
        if opts.tree_fp is None:
            option_parser.error(
                "Tax2Tree classification requires a "
                "filepath to a prebuilt tree (via --tree_fp) containing "
                "both the representative and reference sequences. Check "
                "Tax2Tree documentation for help building a tree."
            )
        if opts.id_to_taxonomy_fp is None:
            option_parser.error("Tax2Tree classification requires a "
                                "filepath for an id_to_taxonomy file "
                                "(via -t).")

    taxon_assigner_constructor = \
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp
    params = {"id_to_taxonomy_filepath": opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + "_assigned_taxonomy"
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + "/" + fname + "_tax_assignments.txt"
    log_path = output_dir + "/" + fname + "_tax_assignments.log"

    if assignment_method == "blast":
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params["blast_db"] = opts.blast_db
        else:
            params["reference_seqs_filepath"] = opts.reference_seqs_fp
        params["Max E value"] = opts.e_value

    elif assignment_method == "mothur":
        params["Confidence"] = opts.confidence
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp

    elif assignment_method == "uclust":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["min_consensus_fraction"] = opts.uclust_min_consensus_fraction
        params["similarity"] = opts.uclust_similarity
        params["max_accepts"] = opts.uclust_max_accepts

    elif assignment_method == "rdp":
        params["Confidence"] = opts.confidence
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["training_data_properties_fp"] = \
            opts.training_data_properties_fp
        params["max_memory"] = "%sM" % opts.rdp_max_memory

    elif assignment_method == "rtax":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["read_1_seqs_fp"] = opts.read_1_seqs_fp
        params["read_2_seqs_fp"] = opts.read_2_seqs_fp
        params["single_ok"] = opts.single_ok
        params["no_single_ok_generic"] = opts.no_single_ok_generic
        params["header_id_regex"] = opts.header_id_regex
        params["read_id_regex"] = opts.read_id_regex
        params["amplicon_id_regex"] = opts.amplicon_id_regex

    elif assignment_method == "tax2tree":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["tree_fp"] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    # mkstemp returns an open descriptor; only the path is needed here, so
    # the descriptor is closed right away.
    fd, temp_result_path = mkstemp(prefix="assign-tax")
    close(fd)
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence
    # identifier. This modifies those entries to contain only the sequence
    # identifier, discarding any comment information. The formatting of these
    # result files needs to be centralized, and at that stage this processing
    # should happen there rather than here.
    with open(result_path, "w") as result_f:
        # NOTE: "U" is the Python 2 universal-newlines mode; it was removed
        # from open() in Python 3.11.
        with open(temp_result_path, "U") as temp_result_f:
            for line in temp_result_f:
                stripped = line.strip()
                if not stripped:
                    # A blank line has no sequence identifier to keep.
                    continue
                fields = stripped.split("\t")
                seq_id = fields[0].split()[0]
                result_f.write("%s\t%s\n" % (seq_id, "\t".join(fields[1:])))
    remove_files([temp_result_path])
def main():
    """Run taxonomy assignment for the method chosen on the command line.

    Parses the command line, validates the options each assignment method
    needs (problems are reported through ``option_parser.error``), collects
    the parameters for the matching taxon assigner and invokes it.  The
    assignments end up in
    ``<output_dir>/<input basename>_tax_assignments.txt`` and the log in
    ``<output_dir>/<input basename>_tax_assignments.log``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # Retraining needs the taxonomy map and the reference sequences as
        # a pair: reject the case where only one of them was given.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('Option --reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    if assignment_method == 'tax2tree':
        if opts.tree_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath to a prebuilt tree (via --tree_fp) containing '
                'both the representative and reference sequences. Check '
                'Tax2Tree documentation for help building a tree.')
        if opts.id_to_taxonomy_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath for an id_to_taxonomy file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.uclust_min_consensus_fraction
        params['similarity'] = opts.uclust_similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] =\
            opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    elif assignment_method == 'tax2tree':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['tree_fp'] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence
    # identifier. This modifies those entries to contain only the sequence
    # identifier, discarding any comment information. The formatting of these
    # result files needs to be centralized, and at that stage this processing
    # should happen there rather than here.
    with open(result_path, 'w') as result_f:
        # NOTE: 'U' is the Python 2 universal-newlines mode; it was removed
        # from open() in Python 3.11.
        with open(temp_result_path, 'U') as temp_result_f:
            for line in temp_result_f:
                stripped = line.strip()
                if not stripped:
                    # Nothing to rewrite on an empty line.
                    continue
                fields = stripped.split('\t')
                seq_id = fields[0].split()[0]
                result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    remove_files([temp_result_path])