Ejemplo n.º 1
0
def main():
    """Parse the command line and validate options for the chosen method.

    Reads ``opts.assignment_method`` and checks that the file options that
    method depends on were supplied. Every problem is reported through
    ``option_parser.error``.

    Checks performed in the visible body:

    * ``blast``: an id-to-taxonomy map is required, plus either a blast
      database or a reference sequence file.
    * ``rdp``: the installed RDP version is validated, and the
      id-to-taxonomy map and reference sequences must be given together or
      not at all.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        # "except X as e" is valid on Python 2.6+ and Python 3; the older
        # "except X, e" spelling is a SyntaxError on Python 3.
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # Retraining needs both files; supplying neither is also accepted.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error('A filepath for an id to taxonomy map must be '
                                'specified (via -t) along with the reference '
                                'sequences fp to train the Rdp Classifier.')
Ejemplo n.º 2
0
def main():
    """Parse the command line and validate options for the chosen method.

    The assignment method is taken from ``opts.assignment_method``; any
    missing or inconsistent option is reported via ``option_parser.error``.

    * ``blast`` needs an id-to-taxonomy map and either a blast database or
      a reference sequence file.
    * ``rdp`` validates the RDP version first, then requires the
      id-to-taxonomy map and the reference sequences to be passed as a
      pair (or both omitted).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        # Use the "as" form: it parses on Python 2.6+ and on Python 3,
        # whereas "except RuntimeError, e" only parses on Python 2.
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The two training inputs are only meaningful together.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')
Ejemplo n.º 3
0
def main():
    """Assign taxonomy to the input sequences with the selected method.

    Steps, in order:

    1. Parse the command line and validate the options required by the
       chosen ``assignment_method`` (problems are reported through
       ``option_parser.error``).
    2. Create the output directory (an ``OSError`` from ``mkdir`` is
       ignored) and derive ``<fname>_tax_assignments.txt`` and
       ``<fname>_tax_assignments.log`` from the input file name.
    3. Build the ``params`` dict for the method and construct the taxon
       assigner from ``assignment_method_constructors``.
    4. Run the assigner. For sortmerna the result is written directly to
       the final path; for every other method it is written to a temporary
       file whose first column is then reduced to the bare sequence
       identifier while copying to the final path.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method
    similarity = opts.similarity
    sortmerna_coverage = opts.sortmerna_coverage
    sortmerna_db = opts.sortmerna_db

    if assignment_method == 'sortmerna':
        # similarity must be in the interval (0,1]
        if not 0 < similarity <= 1:
            option_parser.error('--similarity must be between (0,1].')
        # coverage must be in the interval (0,1]
        if not 0 < sortmerna_coverage <= 1:
            option_parser.error('--sortmerna_coverage must be '
                                'between (0,1].')
        # check ID to taxonomy filepath
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with sortmerna.')
        # check reference sequences filepath
        if not opts.reference_seqs_fp:
            option_parser.error(
                'sortmerna always requires --reference_seqs_fp '
                '(with or without sortmerna_db)')
        # check indexed database, if provided (not mandatory)
        elif sortmerna_db and not isfile(sortmerna_db + '.stats'):
            option_parser.error('%s does not exist, make sure you have '
                                'indexed the database using indexdb_rna' %
                                (sortmerna_db + '.stats'))

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection '
                                'of reference sequences (via -r) must be '
                                'passed to assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The two training inputs must be given together or not at all.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error('A filepath for an id to taxonomy map must be '
                                'specified (via -t) along with the reference '
                                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('--reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        # Only read 1 is enforced here, although the message mentions both
        # reads; read 2 is passed through to the assigner as given.
        if opts.read_1_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    # A plain attribute read cannot raise IndexError, so the former
    # try/except fallback to an empty dict was unreachable.
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.blast_e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['similarity'] = similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'sortmerna':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['sortmerna_db'] = sortmerna_db
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        # the command line takes fractions; these params are percentages
        params['min_percent_id'] = float(similarity * 100.0)
        params['min_percent_cov'] = float(sortmerna_coverage * 100.0)
        params['best_N_alignments'] = opts.sortmerna_best_N_alignments
        params['e_value'] = opts.sortmerna_e_value
        params['threads'] = opts.sortmerna_threads

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params[
            'training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    taxon_assigner = taxon_assigner_constructor(params)
    if assignment_method == "sortmerna":
        # sortmerna writes its result straight to the final path, so no
        # temporary file is created (previously one was made and leaked).
        taxon_assigner(input_sequences_filepath,
                       result_path=result_path,
                       log_path=log_path)
    else:
        fd, temp_result_path = mkstemp(prefix='assign-tax')
        close(fd)
        taxon_assigner(input_sequences_filepath,
                       result_path=temp_result_path,
                       log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners (except for sortmerna, which already outputs
        # only the first field for all headers in the Blast tabular output).
        # The output taxonomy maps that are returned by the taxon assigners
        # contain the full sequence headers as the first field (so including
        # "comment" text in the fasta headers), but for consistency with the
        # input taxonomy maps, should only contain the sequence identifier.
        # This modifies those entries to contain only the sequence
        # identifier, discarding any comment information. The formatting of
        # these result files needs to be centralized, and at that stage this
        # processing should happen there rather than here.
        #
        # NOTE(review): mode 'U' is kept for Python 2 universal newlines; it
        # was removed in Python 3.11 -- use 'r' if this runs on Python 3.
        with open(result_path, 'w') as result_f:
            with open(temp_result_path, 'U') as temp_f:
                for line in temp_f:
                    fields = line.strip().split('\t')
                    seq_id = fields[0].split()[0]
                    result_f.write(
                        '%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
        remove_files([temp_result_path])
Ejemplo n.º 4
0
def main():
    """Run taxonomy assignment for the method chosen on the command line.

    The function validates the method-specific options (reporting problems
    with ``option_parser.error``), prepares the output directory and the
    ``_tax_assignments.txt`` / ``_tax_assignments.log`` paths, collects the
    method's parameters into ``params`` and invokes the taxon assigner
    obtained from ``assignment_method_constructors``.

    sortmerna writes directly to the final result path. All other methods
    write to a temporary file; that file is then copied to the final path
    with the first column cut down to the sequence identifier, and removed.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method
    similarity = opts.similarity
    sortmerna_coverage = opts.sortmerna_coverage
    sortmerna_db = opts.sortmerna_db

    if assignment_method == 'sortmerna':
        # similarity must be in the interval (0,1]
        if not 0 < similarity <= 1:
            option_parser.error('--similarity must be between (0,1].')
        # coverage must be in the interval (0,1]
        if not 0 < sortmerna_coverage <= 1:
            option_parser.error('--sortmerna_coverage must be '
                                'between (0,1].')
        # check ID to taxonomy filepath
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with sortmerna.')
        # check reference sequences filepath
        if not opts.reference_seqs_fp:
            option_parser.error(
                'sortmerna always requires --reference_seqs_fp '
                '(with or without sortmerna_db)')
        # check indexed database, if provided (not mandatory)
        elif sortmerna_db and not isfile(sortmerna_db + '.stats'):
            option_parser.error('%s does not exist, make sure you have '
                                'indexed the database using indexdb_rna' %
                                (sortmerna_db + '.stats'))

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection '
                                'of reference sequences (via -r) must be '
                                'passed to assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # Training inputs come as a pair: both present or both absent.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('--reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        # The check covers read 1 only, even though the message names both
        # reads; read 2 is forwarded to the assigner unchanged.
        if opts.read_1_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    # Reading an attribute and building a dict literal cannot raise
    # IndexError, so the old "except IndexError: params = {}" never ran.
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.blast_e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['similarity'] = similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'sortmerna':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['sortmerna_db'] = sortmerna_db
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        # fractions from the command line are converted to percentages
        params['min_percent_id'] = float(similarity * 100.0)
        params['min_percent_cov'] = float(sortmerna_coverage * 100.0)
        params['best_N_alignments'] = opts.sortmerna_best_N_alignments
        params['e_value'] = opts.sortmerna_e_value
        params['threads'] = opts.sortmerna_threads

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params[
            'training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    taxon_assigner = taxon_assigner_constructor(params)
    if assignment_method == "sortmerna":
        # No temporary file is needed on this path; creating one up front
        # (as before) left an orphaned file behind on every sortmerna run.
        taxon_assigner(input_sequences_filepath,
                       result_path=result_path,
                       log_path=log_path)
    else:
        fd, temp_result_path = mkstemp(prefix='assign-tax')
        close(fd)
        taxon_assigner(input_sequences_filepath,
                       result_path=temp_result_path,
                       log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners (except for sortmerna, which already outputs
        # only the first field for all headers in the Blast tabular output).
        # The output taxonomy maps that are returned by the taxon assigners
        # contain the full sequence headers as the first field (so including
        # "comment" text in the fasta headers), but for consistency with the
        # input taxonomy maps, should only contain the sequence identifier.
        # This modifies those entries to contain only the sequence
        # identifier, discarding any comment information. The formatting of
        # these result files needs to be centralized, and at that stage this
        # processing should happen there rather than here.
        #
        # NOTE(review): mode 'U' is kept for Python 2 universal newlines; it
        # was removed in Python 3.11 -- use 'r' if this runs on Python 3.
        with open(result_path, 'w') as result_f:
            with open(temp_result_path, 'U') as temp_f:
                for line in temp_f:
                    fields = line.strip().split('\t')
                    seq_id = fields[0].split()[0]
                    result_f.write(
                        '%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
        remove_files([temp_result_path])
Ejemplo n.º 5
0
def main():
    """Assign taxonomy to the input sequences with the selected method.

    Validates the options each assignment method needs (errors go through
    ``option_parser.error``), creates the output directory (ignoring an
    ``OSError`` from ``mkdir``), fills the ``params`` dict for the method
    and calls the taxon assigner taken from
    ``assignment_method_constructors``.

    The assigner writes to a temporary path from ``get_tmp_filename``; the
    temporary file is then copied to ``<fname>_tax_assignments.txt`` with
    its first column reduced to the sequence identifier, and removed.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # Training inputs must be supplied together or not at all.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error('A filepath for an id to taxonomy map must be '
                                'specified (via -t) along with the reference '
                                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('Option --reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        # Only read 1 is enforced here, although the message mentions both
        # reads; read 2 is handed to the assigner as given.
        if opts.read_1_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    if assignment_method == 'tax2tree':
        if opts.tree_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath to a prebuilt tree (via --tree_fp) containing '
                'both the representative and reference sequences. Check '
                'Tax2Tree documentation for help building a tree.')
        if opts.id_to_taxonomy_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath for an id_to_taxonomy file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    # An attribute read cannot raise IndexError, so the previous
    # try/except fallback to an empty dict was dead code.
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.uclust_min_consensus_fraction
        params['similarity'] = opts.uclust_similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params[
            'training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    elif assignment_method == 'tax2tree':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['tree_fp'] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence
    # identifier. This modifies those entries to contain only the sequence
    # identifier, discarding any comment information. The formatting of these
    # result files needs to be centralized, and at that stage this processing
    # should happen there rather than here.
    #
    # Context managers close both handles even if a line fails to parse.
    # NOTE(review): mode 'U' is kept for Python 2 universal newlines; it was
    # removed in Python 3.11 -- use 'r' if this runs on Python 3.
    with open(result_path, 'w') as result_f:
        with open(temp_result_path, 'U') as temp_f:
            for line in temp_f:
                fields = line.strip().split('\t')
                seq_id = fields[0].split()[0]
                result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    remove_files([temp_result_path])
Ejemplo n.º 6
0
def main():
    """Assign taxonomy to the sequences in ``opts.input_fasta_fp``.

    The function runs in four stages:

    1. Parse the command line and verify that the options needed by the
       selected ``opts.assignment_method`` were supplied. Violations are
       reported through ``option_parser.error``.
    2. Build the ``params`` dict handed to the constructor looked up in
       ``assignment_method_constructors``.
    3. Run the taxon assigner, sending its raw output to a temporary file
       and its log to ``<output_dir>/<input basename>_tax_assignments.log``.
    4. Copy the raw output to
       ``<output_dir>/<input basename>_tax_assignments.txt``, reducing the
       first column of every line to its first whitespace-delimited token.
       The temporary file is removed afterwards, even if stage 3 or 4 fails.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    # ---- Stage 1: per-method validation of required options ----
    if assignment_method == "blast":
        if not opts.id_to_taxonomy_fp:
            option_parser.error("Option --id_to_taxonomy_fp is required when " "assigning with blast.")
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                "Either a blast db (via -b) or a collection of "
                "reference sequences (via -r) must be passed to "
                "assign taxonomy using blast."
            )

    if assignment_method == "rdp":
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The taxonomy map and the reference sequences must be given
        # together or not at all.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    "A filepath for reference sequences must be "
                    "specified (via -r) along with the id_to_taxonomy "
                    "file to train the Rdp Classifier."
                )
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                "A filepath for an id to taxonomy map must be "
                "specified (via -t) along with the reference "
                "sequences fp to train the Rdp Classifier."
            )

    if assignment_method == "uclust":
        if opts.id_to_taxonomy_fp is None:
            option_parser.error("Option --id_to_taxonomy_fp is required when " "assigning with uclust.")
        if opts.reference_seqs_fp is None:
            option_parser.error("Option --reference_seqs_fp is required when " "assigning with uclust.")

    if assignment_method == "rtax":
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                "RTAX classification requires both a filepath for "
                "reference sequences (via -r) and an id_to_taxonomy "
                "file (via -t)."
            )
        # Only read 1 is enforced here; the read 2 check was disabled
        # in the original code.
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                "RTAX classification requires the FASTA files "
                "produced by split_illumina_fastq.py for both reads, "
                "in addition to the cluster representatives.  Pass "
                "these via --read_1_seqs_fp and --read_2_seqs_fp."
            )

    if assignment_method == "mothur":
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                "Mothur classification requires both a filepath for "
                "reference sequences (via -r) and an id_to_taxonomy "
                "file (via -t)."
            )

    if assignment_method == "tax2tree":
        if opts.tree_fp is None:
            option_parser.error(
                "Tax2Tree classification requires a "
                "filepath to a prebuilt tree (via --tree_fp) containing "
                "both the representative and reference sequences. Check "
                "Tax2Tree documentation for help building a tree."
            )
        if opts.id_to_taxonomy_fp is None:
            option_parser.error("Tax2Tree classification requires a " "filepath for an id_to_taxonomy file (via -t).")

    # ---- Stage 2: assemble constructor parameters and output paths ----
    taxon_assigner_constructor = assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    # Reading an attribute and building a dict literal cannot raise
    # IndexError, so the former try/except fallback to {} was dead code.
    params = {"id_to_taxonomy_filepath": opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + "_assigned_taxonomy"
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    # Base name of the input file: directory and extension stripped.
    fname = split(splitext(input_sequences_filepath)[0])[1]
    result_path = output_dir + "/" + fname + "_tax_assignments.txt"
    log_path = output_dir + "/" + fname + "_tax_assignments.log"

    if assignment_method == "blast":
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params["blast_db"] = opts.blast_db
        else:
            params["reference_seqs_filepath"] = opts.reference_seqs_fp
        params["Max E value"] = opts.e_value

    elif assignment_method == "mothur":
        params["Confidence"] = opts.confidence
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp

    elif assignment_method == "uclust":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["min_consensus_fraction"] = opts.uclust_min_consensus_fraction
        params["similarity"] = opts.uclust_similarity
        params["max_accepts"] = opts.uclust_max_accepts

    elif assignment_method == "rdp":
        params["Confidence"] = opts.confidence
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["training_data_properties_fp"] = opts.training_data_properties_fp
        params["max_memory"] = "%sM" % opts.rdp_max_memory

    elif assignment_method == "rtax":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["read_1_seqs_fp"] = opts.read_1_seqs_fp
        params["read_2_seqs_fp"] = opts.read_2_seqs_fp
        params["single_ok"] = opts.single_ok
        params["no_single_ok_generic"] = opts.no_single_ok_generic
        params["header_id_regex"] = opts.header_id_regex
        params["read_id_regex"] = opts.read_id_regex
        params["amplicon_id_regex"] = opts.amplicon_id_regex

    elif assignment_method == "tax2tree":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["tree_fp"] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    # ---- Stage 3 and 4: run the assigner, then normalise its output ----
    # mkstemp creates the file and returns an open descriptor; only the
    # path is needed, so the descriptor is closed right away.
    fd, temp_result_path = mkstemp(prefix="assign-tax")
    close(fd)
    try:
        taxon_assigner = taxon_assigner_constructor(params)
        taxon_assigner(input_sequences_filepath, result_path=temp_result_path, log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners. The output taxonomy maps that are returned by
        # the taxon assigners contain the full sequence headers as the first
        # field (so including "comment" text in the fasta headers), but for
        # consistency with the input taxonomy maps, should only contain the
        # sequence identifier. This modifies those entries to contain only
        # the sequence identifier, discarding any comment information. The
        # formatting of these result files needs to be centralized, and at
        # that stage this processing should happen there rather than here.
        #
        # NOTE: the "U" (universal newlines) mode is kept for Python 2
        # behaviour; it was removed in Python 3.11, where plain "r" is the
        # equivalent.
        with open(temp_result_path, "U") as temp_f:
            with open(result_path, "w") as result_f:
                for line in temp_f:
                    fields = line.strip().split("\t")
                    if not fields[0]:
                        # Blank line: there is no header to shorten, and
                        # indexing the empty split() result would raise
                        # IndexError.
                        continue
                    seq_id = fields[0].split()[0]
                    result_f.write("%s\t%s\n" % (seq_id, "\t".join(fields[1:])))
    finally:
        # The temp file exists from mkstemp onwards, so clean it up on
        # failure as well as on success.
        remove_files([temp_result_path])
Ejemplo n.º 7
0
def main():
    """Assign taxonomy to the sequences in ``opts.input_fasta_fp``.

    The function runs in four stages:

    1. Parse the command line and verify that the options needed by the
       selected ``opts.assignment_method`` were supplied. Violations are
       reported through ``option_parser.error``.
    2. Build the ``params`` dict handed to the constructor looked up in
       ``assignment_method_constructors``.
    3. Run the taxon assigner, sending its raw output to a temporary path
       obtained from ``get_tmp_filename`` and its log to
       ``<output_dir>/<input basename>_tax_assignments.log``.
    4. Copy the raw output to
       ``<output_dir>/<input basename>_tax_assignments.txt``, reducing the
       first column of every line to its first whitespace-delimited token,
       then remove the temporary file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    # ---- Stage 1: per-method validation of required options ----
    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection of '
                                'reference sequences (via -r) must be passed to '
                                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        # The taxonomy map and the reference sequences must be given
        # together or not at all.
        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('Option --reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error('RTAX classification requires both a filepath for '
                                'reference sequences (via -r) and an id_to_taxonomy '
                                'file (via -t).')
        # Only read 1 is enforced here; the read 2 check was disabled
        # in the original code.
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error('RTAX classification requires the FASTA files '
                                'produced by split_illumina_fastq.py for both reads, '
                                'in addition to the cluster representatives.  Pass '
                                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    if assignment_method == 'tax2tree':
        if opts.tree_fp is None:
            option_parser.error('Tax2Tree classification requires a '
                                'filepath to a prebuilt tree (via --tree_fp) containing '
                                'both the representative and reference sequences. Check '
                                'Tax2Tree documentation for help building a tree.')
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Tax2Tree classification requires a '
                                'filepath for an id_to_taxonomy file (via -t).')

    # ---- Stage 2: assemble constructor parameters and output paths ----
    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    # Reading an attribute and building a dict literal cannot raise
    # IndexError, so the former try/except fallback to {} was dead code.
    params = {'id_to_taxonomy_filepath': opts.id_to_taxonomy_fp}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    # Base name of the input file: directory and extension stripped.
    fname = split(splitext(input_sequences_filepath)[0])[1]
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.uclust_min_consensus_fraction
        params['similarity'] = opts.uclust_similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params[
            'training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    elif assignment_method == 'tax2tree':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['tree_fp'] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)

    # ---- Stage 3: run the assigner into a temporary file ----
    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # ---- Stage 4: normalise the first column of the assigner output ----
    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence identifier.
    # This modifies those entries to contain only the sequence identifier,
    # discarding any comment information. The formatting of these result files
    # needs to be centralized, and at that stage this processing should
    # happen there rather than here.
    #
    # Both handles are managed by ``with`` so they are closed even if the
    # rewrite fails part-way.
    # NOTE: the 'U' (universal newlines) mode is kept for Python 2 behaviour;
    # it was removed in Python 3.11, where plain 'r' is the equivalent.
    with open(temp_result_path, 'U') as temp_f:
        with open(result_path, 'w') as result_f:
            for line in temp_f:
                fields = line.strip().split('\t')
                if not fields[0]:
                    # Blank line: there is no header to shorten, and
                    # indexing the empty split() result would raise
                    # IndexError.
                    continue
                seq_id = fields[0].split()[0]
                result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    remove_files([temp_result_path])