コード例 #1
0
def main():
    """Script entry point: split the input sequence file into per-sample files."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Demultiplex the combined sequence file into one file per sample id.
    seqs_f = open(opts.input_seqs_fp, 'U')
    try:
        split_sequence_file_on_sample_ids_to_files(
            seqs_f, opts.file_type, opts.output_dir, opts.buffer_size)
    finally:
        seqs_f.close()
コード例 #2
0
def main():
    """Command-line entry point: demultiplex a sequence file by sample id."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    with open(opts.input_seqs_fp, 'U') as seqs:
        # One output file is written per sample id found in the input.
        split_sequence_file_on_sample_ids_to_files(seqs,
                                                   opts.file_type,
                                                   opts.output_dir,
                                                   opts.buffer_size)
コード例 #3
0
def parse_and_submit_params(key, project_id, seq_file, output_dir,
                            submit_to_server=True):
    '''Split seq_file into per-sample fastas and submit each to MG-RAST.

    key: MG-RAST web-service authorization key
    project_id: MG-RAST project the samples are submitted under
    seq_file: filepath of the combined fasta file to split and upload
    output_dir: directory for the per-sample fastas and the log.html file
    submit_to_server: passed through to post_multipart; presumably a dry-run
     switch when False -- confirm against post_multipart

    Returns the HTML log body that is also written to output_dir/log.html.
    Raises OSError when no internet connection is available.
    '''

    # Verify that the user's computer can connect to the internet.
    try:
        check_internet = urlopen('http://www.google.com')
        check_internet.close()  # the original leaked this handle
    except Exception:  # was a bare except:, which also swallows SystemExit
        raise OSError(
            "This script is having trouble connecting to the internet!")

    # Parse and split the fasta file into individual per-sample fastas.
    with open(seq_file, 'U') as fasta_file:
        split_sequence_file_on_sample_ids_to_files(fasta_file, 'fasta',
                                                   output_dir)

    # MG-RAST host used by QIIME.
    host = 'metagenomics.anl.gov'

    log_data = ['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th><th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')

    # Iterate over the per-sample fasta files written above.
    fasta_filepaths = sorted(glob('%s/*.fasta' % output_dir))
    for fasta_fp in fasta_filepaths:

        # The sample id is the fasta filename without its extension.
        sample_id = os.path.split(os.path.splitext(fasta_fp)[0])[-1]

        # Form parameters for the MG-RAST cgi script.
        params = [('key', key), ('sample', sample_id), ('project', project_id)]

        # Get the full path and short name for the fasta file to be uploaded.
        file_to_submit = os.path.abspath(fasta_fp)
        fasta_shortname = os.path.split(file_to_submit)[-1]

        # Read the file contents for the multipart post body; the original
        # leaked this handle.
        with open(file_to_submit) as submit_f:
            file_contents = submit_f.read()

        # Set the file payload.
        files = [('file', fasta_shortname, file_contents)]

        # Post the file and parameters.
        response = post_multipart(host, params, files, submit_to_server)

        # Extract the job id and md5 with capture groups.  The original used
        # str.strip('<id>'), which strips *characters* from the set
        # {'<', 'i', 'd', '>'} and corrupts any id that begins or ends with
        # one of those characters.
        job = re.findall(r'<id>(.*)</id>', response)
        md5 = re.findall(r'<md5>(.*)</md5>', response)

        # If the job was successful write it to the log html, otherwise post
        # an error message in the log file.
        if job and md5:
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' %
                            (fasta_shortname, job[0], md5[0]))
        else:
            response_error = re.findall(
                r'Can\'t call method "login" ',
                response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s</h3>' %
                                (response))

    log_data.append('</table>')

    log_info = '\n'.join(log_data)
    # Write the log html.  The handle is opened only once all content is
    # ready, so it cannot leak if an upload raises part-way through (the
    # original opened it before the loop and never closed it on error).
    with open(os.path.join(output_dir, 'log.html'), 'w') as log_file:
        log_file.write(log_html % (log_info))

    return log_info
コード例 #4
0
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to the number of false positives (and also
     sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to the
     number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to the number of false positives (and also sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """

    # NOTE: the original carried this as a second triple-quoted string, which
    # is a silent no-op expression statement, not a docstring.
    #
    # Need to cluster sequences de novo first to get 1. abundance information
    # and 2. a consensus sequence for each cluster.  Using dereplication
    # followed by clustering does not appear to automatically update complete
    # cluster size, so raw seqs are clustered directly with the small_mem
    # clustering option.
    #
    # This means without additional parsing steps to recalculate actual
    # cluster sizes, the sizeorder option can't be used for de novo
    # clustering and downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    # Running counts reported by write_usearch61_log.
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            # Parenthesized form is valid in both Python 2 and Python 3;
            # the original bare print statement is Python 2 only.
            print("Splitting fasta according to SampleID...")

        with open(input_seqs_fp, 'U') as full_seqs:
            sep_fastas = split_sequence_file_on_sample_ids_to_files(
                full_seqs,
                'fasta',
                output_dir)

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        # Chimera-check each per-sample fasta and pool the results.
        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
                identify_chimeras_usearch61(curr_fasta, output_dir,
                                            reference_seqs_fp, suppress_usearch61_intermediates,
                                            suppress_usearch61_ref, suppress_usearch61_denovo,
                                            non_chimeras_retention, usearch61_minh, usearch61_xn,
                                            usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                            usearch61_abundance_skew, percent_id_usearch61, minlen,
                                            word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                            log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
            identify_chimeras_usearch61(input_seqs_fp, output_dir,
                                        reference_seqs_fp, suppress_usearch61_intermediates,
                                        suppress_usearch61_ref, suppress_usearch61_denovo,
                                        non_chimeras_retention, usearch61_minh, usearch61_xn,
                                        usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                                        word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                        log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
                        reference_seqs_fp, suppress_usearch61_intermediates,
                        suppress_usearch61_ref, suppress_usearch61_denovo,
                        split_by_sampleid, non_chimeras_retention, usearch61_minh,
                        usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                        word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    # Context managers close the output handles even if a write raises
    # (the original left both files open on error).
    with open(chimeras_fp, "w") as chimeras_f:
        for curr_chimera in chimeras:
            chimeras_f.write("%s\n" % curr_chimera)
    with open(non_chimeras_fp, "w") as non_chimeras_f:
        for curr_non_chimera in non_chimeras:
            non_chimeras_f.write("%s\n" % curr_non_chimera)

    remove_files(files_to_remove)
コード例 #5
0
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to the number of false positives (and also
     sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to the
     number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to the number of false positives (and also sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """

    # NOTE: the original carried this as a second triple-quoted string, which
    # is a silent no-op expression statement, not a docstring.
    #
    # Need to cluster sequences de novo first to get 1. abundance information
    # and 2. a consensus sequence for each cluster.  Using dereplication
    # followed by clustering does not appear to automatically update complete
    # cluster size, so raw seqs are clustered directly with the small_mem
    # clustering option.
    #
    # This means without additional parsing steps to recalculate actual
    # cluster sizes, the sizeorder option can't be used for de novo
    # clustering and downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    # Running counts reported by write_usearch61_log.
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            # Parenthesized form is valid in both Python 2 and Python 3;
            # the original bare print statement is Python 2 only.
            print("Splitting fasta according to SampleID...")

        with open(input_seqs_fp, 'U') as full_seqs:
            sep_fastas = split_sequence_file_on_sample_ids_to_files(
                full_seqs,
                'fasta',
                output_dir)

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        # Chimera-check each per-sample fasta and pool the results.
        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
                identify_chimeras_usearch61(curr_fasta, output_dir,
                                            reference_seqs_fp, suppress_usearch61_intermediates,
                                            suppress_usearch61_ref, suppress_usearch61_denovo,
                                            non_chimeras_retention, usearch61_minh, usearch61_xn,
                                            usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                            usearch61_abundance_skew, percent_id_usearch61, minlen,
                                            word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                            log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
            identify_chimeras_usearch61(input_seqs_fp, output_dir,
                                        reference_seqs_fp, suppress_usearch61_intermediates,
                                        suppress_usearch61_ref, suppress_usearch61_denovo,
                                        non_chimeras_retention, usearch61_minh, usearch61_xn,
                                        usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                                        word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                        log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
                        reference_seqs_fp, suppress_usearch61_intermediates,
                        suppress_usearch61_ref, suppress_usearch61_denovo,
                        split_by_sampleid, non_chimeras_retention, usearch61_minh,
                        usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                        word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    # Context managers close the output handles even if a write raises
    # (the original left both files open on error).
    with open(chimeras_fp, "w") as chimeras_f:
        for curr_chimera in chimeras:
            chimeras_f.write("%s\n" % curr_chimera)
    with open(non_chimeras_fp, "w") as non_chimeras_f:
        for curr_non_chimera in non_chimeras:
            non_chimeras_f.write("%s\n" % curr_non_chimera)

    remove_files(files_to_remove)
コード例 #6
0
def parse_and_submit_params(key,
                            project_id,
                            seq_file,
                            output_dir,
                            submit_to_server=True):
    '''Split seq_file into per-sample fastas and submit each to MG-RAST.

    key: MG-RAST web-service authorization key
    project_id: MG-RAST project the samples are submitted under
    seq_file: filepath of the combined fasta file to split and upload
    output_dir: directory for the per-sample fastas and the log.html file
    submit_to_server: passed through to post_multipart; presumably a dry-run
     switch when False -- confirm against post_multipart

    Returns the HTML log body that is also written to output_dir/log.html.
    Raises OSError when no internet connection is available.
    '''

    # Verify that the user's computer can connect to the internet.
    try:
        check_internet = urlopen('http://www.google.com')
        check_internet.close()  # the original leaked this handle
    except Exception:  # was a bare except:, which also swallows SystemExit
        raise OSError(
            "This script is having trouble connecting to the internet!")

    # Parse and split the fasta file into individual per-sample fastas.
    with open(seq_file, 'U') as fasta_file:
        split_sequence_file_on_sample_ids_to_files(fasta_file, 'fasta',
                                                   output_dir)

    # MG-RAST host used by QIIME.
    host = 'metagenomics.anl.gov'

    log_data = ['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th><th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')

    # Iterate over the per-sample fasta files written above.
    fasta_filepaths = sorted(glob('%s/*.fasta' % output_dir))
    for fasta_fp in fasta_filepaths:

        # The sample id is the fasta filename without its extension.
        sample_id = os.path.split(os.path.splitext(fasta_fp)[0])[-1]

        # Form parameters for the MG-RAST cgi script.
        params = [('key', key), ('sample', sample_id), ('project', project_id)]

        # Get the full path and short name for the fasta file to be uploaded.
        file_to_submit = os.path.abspath(fasta_fp)
        fasta_shortname = os.path.split(file_to_submit)[-1]

        # Read the file contents for the multipart post body; the original
        # leaked this handle.
        with open(file_to_submit) as submit_f:
            file_contents = submit_f.read()

        # Set the file payload.
        files = [('file', fasta_shortname, file_contents)]

        # Post the file and parameters.
        response = post_multipart(host, params, files, submit_to_server)

        # Extract the job id and md5 with capture groups.  The original used
        # str.strip('<id>'), which strips *characters* from the set
        # {'<', 'i', 'd', '>'} and corrupts any id that begins or ends with
        # one of those characters.
        job = re.findall(r'<id>(.*)</id>', response)
        md5 = re.findall(r'<md5>(.*)</md5>', response)

        # If the job was successful write it to the log html, otherwise post
        # an error message in the log file.
        if job and md5:
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' %
                            (fasta_shortname, job[0], md5[0]))
        else:
            response_error = re.findall(r'Can\'t call method "login" ',
                                        response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s</h3>' %
                                (response))

    log_data.append('</table>')

    log_info = '\n'.join(log_data)
    # Write the log html.  The handle is opened only once all content is
    # ready, so it cannot leak if an upload raises part-way through (the
    # original opened it before the loop and never closed it on error).
    with open(os.path.join(output_dir, 'log.html'), 'w') as log_file:
        log_file.write(log_html % (log_info))

    return log_info