def main():
    """Command-line entry point: split a fasta file into per-SampleID files."""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    # Use a context manager so the input handle is closed on all paths
    # (the original leaked it).  MinimalFastaParser consumes the handle
    # lazily, but it is fully consumed inside the `with` block.
    with open(opts.input_fasta_fp, 'U') as input_f:
        split_fasta_on_sample_ids_to_files(MinimalFastaParser(input_f),
                                           opts.output_dir,
                                           opts.buffer_size)
def main():
    """Script entry point: write one fasta file per SampleID to the output dir."""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    # Context manager guarantees the input fasta handle is closed even if
    # the split raises (the original never closed it).
    with open(opts.input_fasta_fp, 'U') as input_f:
        split_fasta_on_sample_ids_to_files(
            MinimalFastaParser(input_f), opts.output_dir,
            opts.buffer_size)
Example #3
0
def parse_and_submit_params(key, project_id, seq_file, output_dir,
                            submit_to_server=True):
    '''Split a fasta file by SampleID and submit each sample to MG-RAST.

    key: MG-RAST web-service authorization key.
    project_id: MG-RAST project to file the samples under.
    seq_file: filepath of the combined input fasta file.
    output_dir: directory where per-sample fastas and log.html are written.
    submit_to_server: when False, post_multipart builds the request without
     contacting the server (presumably for testing — confirm against
     post_multipart's implementation).

    Returns the HTML log text; also writes it to <output_dir>/log.html.
    '''

    # Verify that the user's computer can connect to the internet.
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are not swallowed.
    try:
        urlopen('http://www.google.com')
    except Exception:
        raise OSError(
            "This script is having trouble connecting to the internet!")

    # parse and split fasta file into individual sample fastas; close the
    # input handle once the split has consumed the parser
    seq_f = open(seq_file)
    try:
        split_fasta_on_sample_ids_to_files(parse_fasta(seq_f), output_dir)
    finally:
        seq_f.close()

    # set the MG-RAST link for QIIME
    host = 'metagenomics.anl.gov'

    # open the log html
    log_file = open(os.path.join(output_dir, 'log.html'), 'w')
    log_data = ['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th><th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')
    # iterate over the fasta files in the given directory
    fasta_filepaths = sorted(glob('%s/*.fasta' % output_dir))
    for i in fasta_filepaths:

        # Get the sample id from the fasta filename
        sample_id = os.path.split(os.path.splitext(i)[0])[-1]

        # set the parameters
        params = [('key', key), ('sample', sample_id), ('project', project_id)]

        # get the full path and short name for the fasta file to be uploaded
        file_to_submit = os.path.abspath(i)
        fasta_shortname = os.path.split(file_to_submit)[-1]

        # read the file contents for the post form; close the handle
        # explicitly instead of leaking it until garbage collection
        fasta_f = open(file_to_submit)
        try:
            file_object = fasta_f.read()
        finally:
            fasta_f.close()

        # set the file
        files = [('file', fasta_shortname, file_object)]

        # Post the file and parameters
        response = post_multipart(host, params, files, submit_to_server)

        # Extract the job id / md5 via capture groups.  This fixes the
        # previous `str.strip('<id>')` / `str.strip('<md5>')` misuse:
        # strip() removes a *set of characters*, so an md5 ending in
        # 'd', 'm' or '5' (or an id ending in 'i'/'d') was truncated.
        job = re.findall(r'<id>(.*)</id>', response)
        md5 = re.findall(r'<md5>(.*)</md5>', response)

        # if job successful write to log html otherwise post an error message
        # in the log file
        if job and md5:
            job_id = job[0]
            md5_id = md5[0]
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' %
                            (fasta_shortname, job_id, md5_id))
        else:
            response_error = re.findall(
                r'Can\'t call method "login" ',
                response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s</h3>' %
                                (response))

    log_data.append('</table>')

    log_info = '\n'.join(log_data)
    # write and close the log html
    log_file.write(log_html % (log_info))
    log_file.close()

    return log_info
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """
    # NOTE: sequences must be clustered de novo first to get (1) abundance
    # information and (2) a consensus sequence for each cluster.  Using
    # dereplication followed by clustering does not appear to automatically
    # update complete cluster size, so raw seqs are clustered directly with
    # the small_mem clustering option.  Without additional parsing steps to
    # recalculate actual cluster sizes, the sizeorder option can't be used
    # for de novo clustering and downstream chimera detection.
    # (Converted from a no-op string-literal statement to a real comment.)

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    # Counters accumulated across identify_chimeras_usearch61 calls,
    # written to the log file at the end.
    log_lines = {
        'denovo_chimeras': 0,
        'denovo_non_chimeras': 0,
        'ref_chimeras': 0,
        'ref_non_chimeras': 0
    }

    if split_by_sampleid:
        # Run chimera detection independently on each SampleID's fasta.
        if verbose:
            print("Splitting fasta according to SampleID...")
        full_seqs = open(input_seqs_fp, "U")
        sep_fastas =\
         split_fasta_on_sample_ids_to_files(MinimalFastaParser(full_seqs),
         output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
             identify_chimeras_usearch61(curr_fasta, output_dir,
             reference_seqs_fp, suppress_usearch61_intermediates,
             suppress_usearch61_ref, suppress_usearch61_denovo,
             non_chimeras_retention, usearch61_minh, usearch61_xn,
             usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
             usearch61_abundance_skew, percent_id_usearch61, minlen,
             word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
             log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        # Single pass over the full input fasta.
        chimeras, non_chimeras, files_to_remove, log_lines =\
         identify_chimeras_usearch61(input_seqs_fp, output_dir,
         reference_seqs_fp, suppress_usearch61_intermediates,
         suppress_usearch61_ref, suppress_usearch61_denovo,
         non_chimeras_retention, usearch61_minh, usearch61_xn,
         usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
         usearch61_abundance_skew, percent_id_usearch61, minlen,
         word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
         log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(
        log_fp, input_seqs_fp, output_dir, reference_seqs_fp,
        suppress_usearch61_intermediates, suppress_usearch61_ref,
        suppress_usearch61_denovo, split_by_sampleid, non_chimeras_retention,
        usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs,
        usearch61_mindiv, usearch61_abundance_skew, percent_id_usearch61,
        minlen, word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    # try/finally so the output handles are closed even if a write fails
    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")
    try:
        for curr_chimera in chimeras:
            chimeras_f.write("%s\n" % curr_chimera)
        for curr_non_chimera in non_chimeras:
            non_chimeras_f.write("%s\n" % curr_non_chimera)
    finally:
        chimeras_f.close()
        non_chimeras_f.close()

    remove_files(files_to_remove)
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """
    # Implementation note (was a no-op string literal): sequences must be
    # clustered de novo first to get (1) abundance information and (2) a
    # consensus sequence for each cluster.  Dereplication followed by
    # clustering does not appear to automatically update complete cluster
    # size, so raw seqs are clustered directly with the small_mem option.
    # Without additional parsing steps to recalculate actual cluster sizes,
    # the sizeorder option can't be used for de novo clustering and
    # downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    # Per-category counts, threaded through each identify_chimeras_usearch61
    # call and written to the log at the end.
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        # Detect chimeras separately within each SampleID's sequences.
        if verbose:
            print("Splitting fasta according to SampleID...")
        full_seqs = open(input_seqs_fp, "U")
        sep_fastas =\
         split_fasta_on_sample_ids_to_files(MinimalFastaParser(full_seqs),
         output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
             identify_chimeras_usearch61(curr_fasta, output_dir,
             reference_seqs_fp, suppress_usearch61_intermediates,
             suppress_usearch61_ref, suppress_usearch61_denovo,
             non_chimeras_retention, usearch61_minh, usearch61_xn,
             usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
             usearch61_abundance_skew, percent_id_usearch61, minlen,
             word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
             log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        # Run a single detection pass over the whole input fasta.
        chimeras, non_chimeras, files_to_remove, log_lines =\
         identify_chimeras_usearch61(input_seqs_fp, output_dir,
         reference_seqs_fp, suppress_usearch61_intermediates,
         suppress_usearch61_ref, suppress_usearch61_denovo,
         non_chimeras_retention, usearch61_minh, usearch61_xn,
         usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
         usearch61_abundance_skew, percent_id_usearch61, minlen,
         word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
         log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
     reference_seqs_fp, suppress_usearch61_intermediates,
     suppress_usearch61_ref, suppress_usearch61_denovo,
     split_by_sampleid, non_chimeras_retention, usearch61_minh,
     usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
     usearch61_abundance_skew, percent_id_usearch61, minlen,
     word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    # Ensure the output handles are closed even if a write fails.
    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")
    try:
        for curr_chimera in chimeras:
            chimeras_f.write("%s\n" % curr_chimera)
        for curr_non_chimera in non_chimeras:
            non_chimeras_f.write("%s\n" % curr_non_chimera)
    finally:
        chimeras_f.close()
        non_chimeras_f.close()

    remove_files(files_to_remove)
Example #6
0
def parse_and_submit_params(key,
                            project_id,
                            seq_file,
                            output_dir,
                            submit_to_server=True):
    '''Split a fasta file by SampleID and submit each sample to MG-RAST.

    key: MG-RAST web-service authorization key.
    project_id: MG-RAST project to file the samples under.
    seq_file: filepath of the combined input fasta file.
    output_dir: directory where per-sample fastas and log.html are written.
    submit_to_server: when False, post_multipart builds the request without
     contacting the server (presumably for testing — confirm against
     post_multipart's implementation).

    Returns the HTML log text; also writes it to <output_dir>/log.html.
    '''

    # Verify that the user's computer can connect to the internet.
    # `except Exception` (not a bare except) so KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        urlopen('http://www.google.com')
    except Exception:
        raise OSError(
            "This script is having trouble connecting to the internet!")

    # parse and split fasta file into individual sample fastas; close the
    # input handle once the split has fully consumed the parser
    seq_f = open(seq_file)
    try:
        split_fasta_on_sample_ids_to_files(MinimalFastaParser(seq_f),
                                           output_dir)
    finally:
        seq_f.close()

    # set the MG-RAST link for QIIME
    host = 'metagenomics.anl.gov'

    # open the log html
    log_file = open(os.path.join(output_dir, 'log.html'), 'w')
    log_data = ['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th><th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')
    # iterate over the fasta files in the given directory
    fasta_filepaths = sorted(glob('%s/*.fasta' % output_dir))
    for i in fasta_filepaths:

        # Get the sample id from the fasta filename
        sample_id = os.path.split(os.path.splitext(i)[0])[-1]

        # set the parameters
        params = [('key', key), ('sample', sample_id), ('project', project_id)]

        # get the full path and short name for the fasta file to be uploaded
        file_to_submit = os.path.abspath(i)
        fasta_shortname = os.path.split(file_to_submit)[-1]

        # read the file contents for the post form; close the handle
        # explicitly rather than leaking it
        fasta_f = open(file_to_submit)
        try:
            file_object = fasta_f.read()
        finally:
            fasta_f.close()

        # set the file
        files = [('file', fasta_shortname, file_object)]

        # Post the file and parameters
        response = post_multipart(host, params, files, submit_to_server)

        # Extract id / md5 via capture groups.  This replaces the buggy
        # `strip('<id>')` / `strip('<md5>')`: strip() removes a character
        # *set*, so ids/md5s beginning or ending with 'i', 'd', 'm' or '5'
        # were silently truncated.
        job = re.findall(r'<id>(.*)</id>', response)
        md5 = re.findall(r'<md5>(.*)</md5>', response)

        # if job successful write to log html otherwise post an error message
        # in the log file
        if job and md5:
            job_id = job[0]
            md5_id = md5[0]
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' %
                            (fasta_shortname, job_id, md5_id))
        else:
            response_error = re.findall(r'Can\'t call method "login" ',
                                        response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s</h3>' %
                                (response))

    log_data.append('</table>')

    log_info = '\n'.join(log_data)
    # write and close the log html
    log_file.write(log_html % (log_info))
    log_file.close()

    return log_info