Example 1
def setup_workers(num_cpus,
                  outdir,
                  server_socket,
                  verbose=True,
                  error_profile=None):
    """Start workers waiting for data.
    
    num_cpus: number of cores

    outdir: directory were the workers will work in

    server_socket: an open socket to the server

    verbose: verbose flag passed to the workers

    error_profile: filepath to the error profiles, passed to workers

"""

    qiime_config = load_qiime_config()
    DENOISE_WORKER = join(get_qiime_scripts_dir(), "denoiser_worker.py")
    CLOUD_DISPATCH = join(get_qiime_scripts_dir(), "ec2Dispatch")
    CLOUD_ENV = qiime_config['cloud_environment']
    CLOUD = not CLOUD_ENV == "False"

    workers = []
    client_sockets = []
    tmpname = "".join(sample(list(lowercase),
                             8))  #somewhat unique id for cluster job

    host, port = server_socket.getsockname()

    #TODO: this should be set to a defined wait time using alarm()
    for i in range(num_cpus):
        name = outdir + ("/%sworker%d" % (tmpname, i))
        workers.append(name)
        if CLOUD:
            cmd = "%s %d %s %s -f %s -s %s -p %s" % (
                CLOUD_DISPATCH, i + 1, qiime_config['python_exe_fp'],
                DENOISE_WORKER, name, host, port)
        else:
            cmd = "%s %s -f %s -s %s -p %s" % (qiime_config['python_exe_fp'],
                                               DENOISE_WORKER, name, host,
                                               port)

        if verbose:
            cmd += " -v"
        if error_profile:
            cmd += " -e %s" % error_profile

        submit_jobs([cmd], tmpname)
        #wait until the client connects
        #This might be a race condition -> make the client robust
        client_socket, client_address = server_socket.accept()
        client_sockets.append((client_socket, client_address))

    return workers, client_sockets
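
For context, a minimal caller sketch (not part of the example above): setup_workers() expects an already-listening server socket, so a hypothetical driver could look roughly like this, with the core count, output directory, and socket settings as placeholder values.

import socket

# Hypothetical driver: open a listening socket, then hand it to the
# setup_workers() defined above. All concrete values are placeholders.
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_socket.bind(("", 0))   # let the OS pick a free port
server_socket.listen(16)      # backlog should cover the number of workers

workers, client_sockets = setup_workers(num_cpus=4,
                                        outdir="/tmp/denoiser_run",
                                        server_socket=server_socket,
                                        verbose=False)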
Example 2
def setup_workers(num_cpus, outdir, server_socket, verbose=True,
                  error_profile=None):
    """Start workers waiting for data.

    num_cpus: number of cores

    outdir: directory were the workers will work in

    server_socket: an open socket to the server

    verbose: verbose flag passed to the workers

    error_profile: filepath to the error profiles, passed to workers

"""

    qiime_config = load_qiime_config()
    DENOISE_WORKER = join(get_qiime_scripts_dir(), "denoiser_worker.py")
    CLOUD_DISPATCH = join(get_qiime_scripts_dir(), "ec2Dispatch")
    CLOUD_ENV = qiime_config['cloud_environment']
    CLOUD = not CLOUD_ENV == "False"

    workers = []
    client_sockets = []
    # somewhat unique id for cluster job
    tmpname = "".join(sample(list(lowercase), 8))

    host, port = server_socket.getsockname()

    # TODO: this should be set to a defined wait time using alarm()
    for i in range(num_cpus):
        name = outdir + ("/%sworker%d" % (tmpname, i))
        workers.append(name)
        if CLOUD:
            cmd = "%s %d %s %s -f %s -s %s -p %s" % (CLOUD_DISPATCH, i + 1, qiime_config['python_exe_fp'],
                                                     DENOISE_WORKER, name, host, port)
        else:
            cmd = "%s %s -f %s -s %s -p %s" % (qiime_config['python_exe_fp'],
                                               DENOISE_WORKER, name, host, port)

        if verbose:
            cmd += " -v"
        if error_profile:
            cmd += " -e %s" % error_profile

        submit_jobs([cmd], tmpname)
        # wait until the client connects
        # This might be a race condition -> make the client robust
        client_socket, client_address = server_socket.accept()
        client_sockets.append((client_socket, client_address))

    return workers, client_sockets
Example 3
    def test_denoise_worker(self):
        """denoiser_worker.py is where it belongs and is callable."""

        qiime_config = load_qiime_config()
        PYTHON_BIN = qiime_config['python_exe_fp']

        DENOISE_WORKER = get_qiime_scripts_dir() + "/denoiser_worker.py"

        self.assertTrue(
            exists(DENOISE_WORKER),
            "DENOISE_WORKER is not where it's supposed to be: %s" %
            DENOISE_WORKER)

        # test if it's callable and actually works
        command = "%s %s -h" % (PYTHON_BIN, DENOISE_WORKER)
        proc = Popen(command, shell=True, universal_newlines=True,
                     stdout=PIPE, stderr=STDOUT)

        if proc.wait() != 0:
            self.fail("Calling %s failed. Check permissions and that it is in fact an executable."
                      % DENOISE_WORKER)

        result = proc.stdout.read()
        # check that the help string looks correct
        self.assertTrue(result.startswith("Usage"))
Example 4
def run_make_otu_heatmap_html(otu_table_fp, mapping_fp, output_dir, params,
                              qiime_config,
                              command_handler, tree_fp,
                              status_update_callback=print_to_stdout):
    """ This function calls the make_otu_heatmap_html script """
    
    # define upper-level values
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    commands = []
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # get the user-defined parameters
    try:
        params_str = get_params_str(params['make_otu_heatmap_html'])
    except KeyError:
        params_str = ''

    # Build the make_otu_heatmap_html command
    heatmap_cmd = '%s %s/make_otu_heatmap_html.py -i %s -m %s -t %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, mapping_fp,tree_fp, output_dir, 
      params_str)
    
    commands.append([('OTU Heatmap' , heatmap_cmd)])
     
    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)

    return True
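
All of the workflow examples on this page hand their shell commands to a command_handler with this calling convention. As a rough illustration only (QIIME ships its own handlers; this is not one of them), a minimal handler that merely echoes the queued commands could look like this; it also shows the data structure being built above: a list of lists of (label, command_string) tuples.

def print_only_command_handler(commands, status_update_callback,
                               logger=None, close_logger_on_success=True):
    # Illustrative stand-in for a QIIME command handler: report each label
    # via the callback (assumed to accept a short status string) and print
    # the command instead of executing it.
    for command_group in commands:
        for label, cmd in command_group:
            status_update_callback(label)
            print(cmd)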
Example 5
    def test_main(self):
        """Denoiser should always give same result on test data"""

        expected = """>FS8APND01D3TW3 | cluster size: 94 
CTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC
"""

        expected_map = """FS8APND01EWRS4:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01BSTVP:
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""

        command =  " ".join( ["%s/denoiser.py" % get_qiime_scripts_dir(),
                              "--force", "-o", self.test_dir, "-i",
                              "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME] );

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir+ "centroids.fasta")))
        self.assertEqual(observed, expected)
        
        self.assertEqual(len(list(MinimalFastaParser(open(self.result_dir + "singletons.fasta")))), 6)

        observed = "".join(list(open(self.result_dir+ "denoiser_mapping.txt")))
        self.assertEqual(observed, expected_map)
Example 6
def summarize_otus(processed_dir):
    """Generate (if needed) and parse the per-library OTU stats for a processed study."""
    per_library_stats_file = join(processed_dir, 'gg_97_otus/per_library_stats.txt')

    # Generate the per_library_stats_file if it doesn't already exist
    if not exists(per_library_stats_file):
        qiime_config = load_qiime_config()
        biom_file = join(processed_dir, 'gg_97_otus/exact_uclust_ref_otu_table.biom')
        python_exe_fp = qiime_config['python_exe_fp']
        script_dir = get_qiime_scripts_dir()
        per_library_stats_script = join(script_dir, 'per_library_stats.py')
        command = '{0} {1} -i {2}'.format(python_exe_fp, per_library_stats_script, biom_file)

        # Run the script and produce the per_library_stats.txt
        proc = Popen(command, shell=True, universal_newlines=True, stdout=PIPE, stderr=STDOUT)
        return_value = proc.wait()
        f = open(per_library_stats_file, 'w')
        f.write(proc.stdout.read())
        f.close()

    # File exists, parse out details
    start_lines = ['Seqs/sample detail:']
    header_lines, otu_summary_dict = parse_log_file(per_library_stats_file, start_lines)
    return header_lines, otu_summary_dict
Example 7
 def test_get_qiime_scripts_dir(self):
     """Test that we can find the directory containing QIIME scripts."""
     # get_qiime_scripts_dir will raise an error if it can't find a scripts
     # directory.
     scripts_dir = get_qiime_scripts_dir()
     self.assertTrue(isdir(scripts_dir), "The QIIME scripts directory does "
                     "not exist: %s" % scripts_dir)
Example 8
    def test_main(self):
        """Denoiser should always give same result on test data"""

        expected = """>FS8APND01D3TW3 | cluster size: 94 
CTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC
"""

        expected_map = """FS8APND01EWRS4:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01BSTVP:
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""

        command =  " ".join( ["%s/denoiser.py" % get_qiime_scripts_dir(),
                              "--force", "-o", self.test_dir, "-i",
                              "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME] );

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir+ "centroids.fasta")))
        self.assertEqual(observed, expected)
        
        self.assertEqual(len(list(MinimalFastaParser(open(self.result_dir + "singletons.fasta")))), 6)

        observed = "".join(list(open(self.result_dir+ "denoiser_mapping.txt")))
        self.assertEqual(observed, expected_map)
Example 9
    def test_main_with_titanium_error(self):
        """Denoiser with titanium error should always give same result on test data"""

        command = " ".join(
            [
                "%s/denoiser.py" % get_qiime_scripts_dir(),
                "--force",
                "-o",
                self.test_dir,
                "-i",
                "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME,
                "-f",
                "%s/qiime/support_files/denoiser/TestData/test_set_seqs.fna" % PROJECT_HOME,
                "-e",
                "%s/qiime/support_files/denoiser/Data/Titanium_error_profile.dat" % PROJECT_HOME,
            ]
        )

        result = Popen(command, shell=True, universal_newlines=True, stdout=PIPE, stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir + "centroids.fasta")))
        self.assertEqual(observed, self.expected)

        observed = "".join(list(open(self.result_dir + "denoiser_mapping.txt")))
        self.assertEqual(observed, self.expected_titanium_map_string)
Example 10
 def test_get_qiime_scripts_dir(self):
     """Test that we can find the directory containing QIIME scripts."""
     # get_qiime_scripts_dir will raise an error if it can't find a scripts
     # directory.
     scripts_dir = get_qiime_scripts_dir()
     self.assertTrue(isdir(scripts_dir), "The QIIME scripts directory does "
                     "not exist: %s" % scripts_dir)
Example 11
    def test_main_split_cluster(self):
        """Denoiser on cluster in split mode should always give same result on test data"""

        command = " ".join(
            [
                "%s/denoiser.py" % get_qiime_scripts_dir(),
                "-S",
                "--force",
                "-c",
                "-n 2",
                "-i",
                "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME,
                "-f",
                "%s/qiime/support_files/denoiser/TestData/test_set_seqs.fna" % PROJECT_HOME,
                "-o",
                self.test_dir,
            ]
        )

        result = Popen(command, shell=True, universal_newlines=True, stdout=PIPE, stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        for subdir in ["0/", "1/"]:
            observed = "".join(list(open(self.result_dir + subdir + "centroids.fasta")))
            self.assertEqual(observed, expected_centroids[subdir])

            observed = "".join(list(open(self.result_dir + subdir + "denoiser_mapping.txt")))
            self.assertEqual(observed, expected_map_string_on_cluster[subdir])
Example 12
def run_process_illumina_through_split_lib(study_id, run_prefix, input_fp,
    mapping_fp, output_dir,
    command_handler, params, qiime_config,
    write_to_all_fasta=False,
    status_update_callback=print_to_stdout):
    """ NOTE: Parts of this function are a directly copied from the
        run_qiime_data_preparation function from the workflow.py library file 
        in QIIME.
    
        The steps performed by this function are:
          1) De-multiplex sequences. (split_libraries_fastq.py)
    
    """

    # Prepare some variables for the later steps
    filenames=input_fp.split(',')
    commands = []
    create_dir(output_dir)
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # copy the mapping file
    copied_mapping=split(mapping_fp)[-1]
    mapping_input_fp_copy=join(output_dir, copied_mapping)
    copy_mapping_cmd='cp %s %s' % (mapping_fp,mapping_input_fp_copy)
    commands.append([('CopyMapping', copy_mapping_cmd)])

    # sort the filenames
    filenames.sort()
    
    # determine which file is seq-file and which is barcode-file and associate
    # to mapping file
    if len(filenames) == 1:
        try:
            # Format of sample_id needs to be seqs_<sample_name>.<sequence_prep_id>.fastq
            data_access = data_access_factory(ServerConfig.data_access_type)
            sql = """
            select  s.sample_name || '.' || sp.sequence_prep_id 
            from    sample s 
                    inner join sequence_prep sp 
                    on s.sample_id = sp.sample_id
            where   s.study_id = {0}
                    and sp.run_prefix = '{1}'
            """.format(study_id, run_prefix[:-1])
            sample_and_prep = data_access.dynamicMetadataSelect(sql).fetchone()[0]
            input_str = '-i {0} --sample_id {1}'.format(filenames[0], sample_and_prep)
        except Exception as e:
            error = 'Failed to obtain sample and sequence prep info for study_id {0} and run_prefix {1}\n'.format(study_id, run_prefix)
            error += 'SQL was: \n {0} \n'.format(sql)
            error += 'Original exception was: \n {0}'.format(str(e))
            raise Exception(error)
Example 13
    def test_main_low_mem(self):
        """Denoiser works using low_memory"""

        command =  " ".join( ["%s/denoiser.py" % get_qiime_scripts_dir(),
                              "-f", "%s/qiime/support_files/denoiser/TestData/test_set_seqs.fna" % PROJECT_HOME,
                              "-i", "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME,
                              "-o", self.test_dir, "--low_memory"] )

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir+ "centroids.fasta")))
        self.assertEqual(observed, self.expected)
Example 14
    def test_main_low_mem(self):
        """Denoiser works using low_memory"""

        command =  " ".join( ["%s/denoiser.py" % get_qiime_scripts_dir(),
                              "-f", "%s/qiime/support_files/denoiser/TestData/test_set_seqs.fna" % PROJECT_HOME,
                              "-i", "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME,
                              "-o", self.test_dir, "--low_memory"] )

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir+ "centroids.fasta")))
        self.assertEqual(observed, self.expected)
Example 15
    def test_main_with_fasta(self):
        """Denoiser with fasta file should always give same result on test data"""

        command =  " ".join( ["%s/denoiser.py" % get_qiime_scripts_dir(),
                              "--force", "-o", self.test_dir,
                              "-i", "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME, 
                              "-f", "%s/qiime/support_files/denoiser/TestData/test_set_seqs.fna" % PROJECT_HOME] )

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir+ "centroids.fasta")))
        self.assertEqual(observed, self.expected)

        observed = "".join(list(open(self.result_dir+ "denoiser_mapping.txt")))
        self.assertEqual(observed,self.expected_map_string)
Example 16
File: util.py Project: jb2263/qiime
    def __init__(self,
                 python_exe_fp=qiime_config['python_exe_fp'],
                 cluster_jobs_fp=qiime_config['cluster_jobs_fp'],
                 jobs_to_start=int(qiime_config['jobs_to_start']),
                 poller_fp=join(get_qiime_scripts_dir(), 'poller.py'),
                 retain_temp_files=False,
                 suppress_polling=False,
                 seconds_to_sleep=int(qiime_config['seconds_to_sleep'])):
        """  """

        self._python_exe_fp = python_exe_fp
        self._cluster_jobs_fp = cluster_jobs_fp
        self._jobs_to_start = jobs_to_start
        self._poller_fp = poller_fp
        self._retain_temp_files = retain_temp_files
        self._suppress_polling = suppress_polling
        self._seconds_to_sleep = seconds_to_sleep
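
A side note on the defaults above (general Python behavior, not something specific to this class): expressions such as qiime_config['python_exe_fp'] and join(get_qiime_scripts_dir(), 'poller.py') are evaluated once, when the def statement runs, so changing qiime_config afterwards does not affect already-defined defaults. A tiny self-contained sketch of that behavior:

config = {'jobs_to_start': '2'}

def make_runner(jobs_to_start=int(config['jobs_to_start'])):
    return jobs_to_start

config['jobs_to_start'] = '8'   # changing the dict afterwards...
print(make_runner())            # ...still prints 2: the default was captured at def time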
Example 17
 def __init__(self,
              python_exe_fp=qiime_config['python_exe_fp'],
              cluster_jobs_fp=qiime_config['cluster_jobs_fp'],
              jobs_to_start=int(qiime_config['jobs_to_start']),
              poller_fp=join(get_qiime_scripts_dir(),'poller.py'),
              retain_temp_files=False,
              suppress_polling=False,
              seconds_to_sleep=int(qiime_config['seconds_to_sleep'])):
     """  """
     
     self._python_exe_fp = python_exe_fp
     self._cluster_jobs_fp = cluster_jobs_fp
     self._jobs_to_start = jobs_to_start
     self._poller_fp = poller_fp
     self._retain_temp_files = retain_temp_files
     self._suppress_polling = suppress_polling
     self._seconds_to_sleep = seconds_to_sleep
Example 18
def preprocess_on_cluster(sff_fps,
                          log_fp,
                          fasta_fp=None,
                          out_fp="/tmp/",
                          squeeze=False,
                          verbose=False,
                          primer=STANDARD_BACTERIAL_PRIMER):
    """Call preprocess via cluster_jobs_script on the cluster.

    sff_fps: List of paths to flowgram files.
    
    log_fp: path to log file
    
    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This files is used to filter the flowgrams in sff_fps. Only reads in 
              fasta_fp are pulled from sff_fps.
              
    out_fp: path to output directory
    
    verbose: a binary verbose flag
    
    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.
    
    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing         
    """

    qiime_config = load_qiime_config()
    python_bin = qiime_config['python_exe_fp']

    cmd = "%s %s/denoiser_preprocess.py -i %s -l %s -o %s" %\
        (python_bin, get_qiime_scripts_dir(), ",".join(sff_fps), log_fp, out_fp)
    if fasta_fp:
        cmd += " -f %s" % fasta_fp
    if squeeze:
        cmd += " -s"
    if verbose:
        cmd += " -v"
    if primer:
        cmd += " -p %s" % primer

    submit_jobs([cmd], "pp_" + make_tmp_name(6))

    wait_for_file(out_fp + "/prefix_mapping.txt", 10)
Example 19
def get_html(script_name, column_headers, help_img):
    '''generate the html table rows for a given QIIME script'''
    
    script_dir = get_qiime_scripts_dir()
    dirname,fname=script_path_components(os.path.join(script_dir,script_name))
    script_info_dict=get_script_info(dirname,fname)
    fname,fname_ext=os.path.splitext(script_name)
    
    #get all the required options
    required_html=get_html_for_options(fname,script_info_dict,
                                      'required_options',column_headers,
                                      help_img)
    #get all the optional options
    optional_html=get_html_for_options(fname,script_info_dict,
                                       'optional_options',column_headers,
                                       help_img)

    return required_html,optional_html
Example 20
 def setUp(self):
     """ """
     
     self.headers=['head1','head2','head3']
     self.test_option_file=make_option('-i', '--coord_fname',
             help='Input principal coordinates filepath',
             type='existing_path')
     self.test_option_colorby=make_option('-b', '--colorby', dest='colorby',\
             help='Comma-separated list categories metadata categories' +\
             ' (column headers) [default=color by all]')
     self.test_option_custom_axes=make_option('-a', '--custom_axes',
             help='This is the category from the metadata mapping file' +\
             ' [default: %default]')
     self.test_option_choice=make_option('-k', '--background_color',
             help='Background color to use in the plots.[default: %default]',
             default='black',type='choice',choices=['black','white'])
     self.test_option_float=make_option('--ellipsoid_opacity',
             help='Used only when plotting ellipsoids for jackknifed' +\
             ' beta diversity (i.e. using a directory of coord files' +\
             ' [default=%default]',
             default=0.33,type=float)
     self.test_option_int=make_option('--n_taxa_keep',
             help='Used only when generating BiPlots. This is the number '+\
             ' to display. Use -1 to display all. [default: %default]',
             default=10,type=int)
     self.test_option_true=make_option('--suppress_html_output',
             dest='suppress_html_output',\
             default=False,action='store_true',
             help='Suppress HTML output. [default: %default]')
     self.test_option_false=make_option('--suppress_html_output',
             dest='suppress_html_output',\
             default=True,action='store_false',
             help='Suppress HTML output. [default: %default]')
     
     self.option_labels={'coord_fname':'Principal coordinates filepath',
                                  'colorby': 'Colorby category',
                                  'background_color': 'Background color',
                                  'ellipsoid_opacity':'Ellipsoid opacity',
                                  'n_taxa_keep': '# of taxa to keep',
                                  'custom_axes':'Custom Axis'}
         
     self.script_dir = get_qiime_scripts_dir()
     self.test_script_info=get_script_info(self.script_dir,
                                                 'make_qiime_rst_file')
Example 21
    def test_main_split_cluster(self):
        """Denoiser on cluster in split mode should always give same result on test data"""
        
        command =  " ".join( ["%s/denoiser.py" % get_qiime_scripts_dir(),
                              "-S", "--force", '-c', '-n 2',
                              "-i", "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME,
                              "-f", "%s/qiime/support_files/denoiser/TestData/test_set_seqs.fna" % PROJECT_HOME,
                              "-o", self.test_dir] )

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        for subdir in ["0/","1/"]:
            observed = "".join(list(open(self.result_dir+ subdir+"centroids.fasta")))
            self.assertEqual(observed, expected_centroids[subdir])

            observed = "".join(list(open(self.result_dir+ subdir+"denoiser_mapping.txt")))
            self.assertEqual(observed, expected_map_string_on_cluster[subdir])
Example 22
def preprocess_on_cluster(sff_fps, log_fp, fasta_fp=None, out_fp="/tmp/",
                          squeeze=False, verbose=False,
                          primer=STANDARD_BACTERIAL_PRIMER):
    """Call preprocess via cluster_jobs_script on the cluster.

    sff_fps: List of paths to flowgram files.

    log_fp: path to log file

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequence of the amplification process. This sequence will
            be removed from all reads during the preprocessing.
    """

    qiime_config = load_qiime_config()
    python_bin = qiime_config['python_exe_fp']

    cmd = "%s %s/denoiser_preprocess.py -i %s -l %s -o %s" %\
        (python_bin, get_qiime_scripts_dir(),
         ",".join(sff_fps), log_fp, out_fp)
    if fasta_fp:
        cmd += " -f %s" % fasta_fp
    if squeeze:
        cmd += " -s"
    if verbose:
        cmd += " -v"
    if primer:
        cmd += " -p %s" % primer

    submit_jobs([cmd], "pp_" + make_tmp_name(6))

    wait_for_file(out_fp + "/prefix_mapping.txt", 10)
Example 23
    def test_denoise_worker(self):
        """denoiser_worker.py is where it belongs and is callable."""

        qiime_config = load_qiime_config()
        PYTHON_BIN = qiime_config["python_exe_fp"]

        DENOISE_WORKER = get_qiime_scripts_dir() + "/denoiser_worker.py"

        self.assertTrue(exists(DENOISE_WORKER),
                        "DENOISE_WORKER is not where it's supposed to be: %s" % DENOISE_WORKER)

        # test if it's callable and actually works
        command = "%s %s -h" % (PYTHON_BIN, DENOISE_WORKER)
        proc = Popen(command, shell=True, universal_newlines=True, stdout=PIPE, stderr=STDOUT)

        if proc.wait() != 0:
            self.fail("Calling %s failed. Check permissions and that it is in fact an executable." % DENOISE_WORKER)

        result = proc.stdout.read()
        # check that the help string looks correct
        self.assertTrue(result.startswith("Usage"))
Example 24
def add_taxa_to_biom(input_dir, study):
    """ Add the retrained taxonomy assignments to the biom-table """
    
    # get the study directory
    study_input_dir=join(input_dir,'study_%s' % (str(study)))
    
    # get a list of all processed folders
    processed_folders=listdir(study_input_dir)
    for processed_folder in processed_folders:
        
        # make sure it is processed folder produced by DB
        if processed_folder.startswith('processed'):
            # get the biom file for this particular run
            gg_biom_fp=join(study_input_dir, processed_folder, \
                            'gg_97_otus', 'exact_uclust_ref_otu_table.biom')
            
            # make sure the path exists and if not create the biom-table
            # only applicable to studies processed prior to biom-format
            if not exists(gg_biom_fp):
                # get the classic OTU table path
                gg_otu_table_fp=join(study_input_dir, processed_folder, \
                                     'gg_97_otus', \
                                     'exact_uclust_ref_otu_table.txt')
                                     
                # make sure path exists and convert classic to biom-table
                if exists(gg_otu_table_fp):
                    system("python %s/software/biom-format/scripts/convert_biom.py -i %s -o %s --biom_table_type='otu table'" % \
                    (environ['HOME'],gg_otu_table_fp,gg_biom_fp))
            try:
                # add the taxa to the biom-table
                # if it exists already, this will fail out
                system("python %s/add_taxa.py -i %s -t %s/phpr_taxonomy.txt -o %s" % \
                (get_qiime_scripts_dir(), gg_biom_fp, environ['HOME'], \
                 gg_biom_fp))
            except:
                pass
Example 25
def run_pick_closed_reference_otus(
                              input_fp, 
                              refseqs_fp,
                              output_dir,
                              taxonomy_fp,
                              command_handler,
                              params,
                              qiime_config,
                              parallel=False,
                              logger=None,
                              suppress_md5=False,
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Pick OTUs;
          2) Build an OTU table with optional pre-defined taxonomy.
    
    """
    
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast','uclust_ref','usearch61_ref']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
     "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
     % (otu_picking_method,' '.join(reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,[input_fp,refseqs_fp,taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename)
    if parallel and (otu_picking_method == 'blast' or 
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -r %s -T %s' %\
          (python_exe_fp, 
           script_dir, 
           otu_picking_script,
           input_fp,
           pick_otu_dir,
           refseqs_fp,
           params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str+= ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -r %s -m %s %s' %\
         (python_exe_fp,
          script_dir,
          input_fp,
          pick_otu_dir,
          refseqs_fp,
          otu_picking_method,
          params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_fp, taxonomy_str, otu_table_fp, params_str)
    
    commands.append([('Make OTU table', make_otu_table_cmd)])
    

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
Example 26
def run_pick_de_novo_otus(input_fp, 
                               output_dir, 
                               command_handler,
                               params, 
                               qiime_config,
                               parallel=False,
                               logger=None,
                               suppress_md5=False,
                               status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Pick OTUs;
          2) Pick a representative set;
          3) Align the representative set; 
          4) Assign taxonomy;
          5) Filter the alignment prior to tree building - remove positions
             which are all gaps or specified as 0 in the lanemask;
          6) Build a phylogenetic tree;
          7) Build an OTU table.
    
    """
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    cluster_failures = False
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp])
    
    # Prep the OTU picking command
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust'
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename)
    if parallel and (otu_picking_method == 'blast' or 
                     otu_picking_method == 'uclust_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            del d['otu_picking_method']
        except KeyError:
            pass
        
        if otu_picking_method == 'uclust_ref':
            try:
                suppress_new_clusters = d['suppress_new_clusters']
                del d['suppress_new_clusters']
                cluster_failures = False
            except KeyError:
                cluster_failures = True
                failure_otu_picking_method = 'uclust'
        
        params_str += ' %s' % get_params_str(d)
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -T %s' % (python_exe_fp, 
                                                        script_dir, 
                                                        otu_picking_script,
                                                        input_fp,
                                                        pick_otu_dir,
                                                        params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, input_fp, pick_otu_dir, params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])
    
    if cluster_failures:
        reference_otu_fp = otu_fp
        clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir
        
        try:
            d = params['pick_otus'].copy()
            del d['otu_picking_method']
        except KeyError:
            pass

        if 'uclust_otu_id_prefix' not in d:
            d['uclust_otu_id_prefix'] = 'DeNovoOTU'        
        params_str = ' %s' % get_params_str(d)

        failures_list_fp = '%s/%s_failures.txt' % \
         (pick_otu_dir,input_basename)
        failures_fasta_fp = '%s/%s_failures.fasta' % \
         (pick_otu_dir,input_basename)
        
        filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,failures_list_fp,failures_fasta_fp)
        
        commands.append([('Generate failures fasta file',
                          filter_fasta_cmd)])
        
        # Prep the OTU picking command for
        failure_otu_fp = '%s/%s_failures_otus.txt' % (clustered_failures_dir,input_basename)
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -m %s %s' %\
         (python_exe_fp, script_dir, failures_fasta_fp, clustered_failures_dir, 
          failure_otu_picking_method, params_str)

        commands.append([('Pick de novo OTUs for new clusters', pick_otus_cmd)])
        
        merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
         (reference_otu_fp,failure_otu_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        otu_fp = merged_otu_map_fp

    # Prep the representative set picking command
    rep_set_dir = '%s/rep_set/' % output_dir
    create_dir(rep_set_dir)
    rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir,input_basename)
    rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir,input_basename)
    
    try:
        params_str = get_params_str(params['pick_rep_set'])
    except KeyError:
        params_str = ''
    # Build the representative set picking command
    pick_rep_set_cmd = '%s %s/pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_fp, input_fp, rep_set_log_fp,\
      rep_set_fp, params_str)
    commands.append([('Pick representative set', pick_rep_set_cmd)])
    
    # Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp' or
                     assignment_method == 'blast' or
                     assignment_method == 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the taxonomy assignment parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         '%s %s/parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (python_exe_fp, script_dir, assignment_method, rep_set_fp,\
          assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = '%s %s/assign_taxonomy.py -o %s -i %s %s' %\
         (python_exe_fp, script_dir, assign_taxonomy_dir,\
          rep_set_fp, params_str)
    
    commands.append([('Assign taxonomy',assign_taxonomy_cmd)])
    
    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_fp, taxonomy_fp, otu_table_fp, params_str)
    
    commands.append([('Make OTU table', make_otu_table_cmd)])
    
    if cluster_failures:
        reference_otu_table_fp = '%s/reference_only_otu_table.biom' % output_dir
        # Build the OTU table building command
        make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
         (python_exe_fp, script_dir, reference_otu_fp, taxonomy_fp, 
          reference_otu_table_fp, params_str)
    
        commands.append([('Make reference-only OTU table', make_otu_table_cmd)])
    
    # Prep the pynast alignment command
    try:
        alignment_method = params['align_seqs']['alignment_method']
    except KeyError:
        alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir,alignment_method)
    aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir,input_basename)
    if parallel and alignment_method == 'pynast':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the alignment parameters        
        # Want to find a cleaner strategy for this: the parallel script
        # is method-specific, so doesn't take a --alignment_method
        # option. This works for now though.
        try:
            d = params['align_seqs'].copy()
        except KeyError:
            d = {}
        try:
            del d['alignment_method']
        except KeyError:
            pass
        params_str += ' %s' % get_params_str(d)
        
        # Build the parallel pynast alignment command
        align_seqs_cmd = '%s %s/parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = '%s %s/align_seqs.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])
    
    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = '%s %s/filter_alignment.py -o %s -i %s %s' %\
     (python_exe_fp, script_dir, pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])
    
    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = '%s %s/make_phylogeny.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, filtered_aln_fp, tree_fp,\
     params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    
    return abspath(tree_fp), abspath(otu_table_fp)
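
Throughout these workflow functions, get_params_str() turns the per-script entries of the params dict into extra command-line options. The sketch below is an assumed, simplified stand-in to show the idea, not QIIME's actual implementation.

def params_dict_to_str(params_for_script):
    # Assumed behavior for illustration: flag-like options become bare
    # --flags, everything else becomes "--option value" pairs.
    chunks = []
    for option, value in sorted(params_for_script.items()):
        if value in (True, 'True'):
            chunks.append('--%s' % option)
        else:
            chunks.append('--%s %s' % (option, value))
    return ' '.join(chunks)

# e.g. {'similarity': '0.97', 'enable_rev_strand_match': 'True'}
# -> '--enable_rev_strand_match --similarity 0.97'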
Example 27
def run_summarize_taxa_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     mapping_cat,
                                     sort,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     logger=None,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation for summarizing taxonomies and generating plots
    
        The steps performed by this function are:
          1) Summarize OTU by Category
          2) Summarize Taxonomy
          3) Plot Taxonomy Summary
          
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp])

    # if mapping category not passed via command-line,
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat = params['summarize_otu_by_cat']['mapping_category']
        except:
            mapping_cat = None

    try:
        params_str = get_params_str(params['summarize_otu_by_cat'])
        # Need to remove the mapping category option, since it is defined above.
        # Using this method since we don't want to change the params dict
        split_params = params_str.split('--')
        updated_params_str = []
        for i in split_params:
            if not i.startswith('mapping_category'):
                updated_params_str.append(i)
        params_str = '--'.join(updated_params_str)
    except:
        params_str = ''

    if mapping_cat:
        output_fp = join(output_dir,
                         '%s_otu_table.biom' % (mapping_cat.replace(' ', '-')))
        # Build the summarize otu by category command
        summarize_otu_by_cat_cmd = \
         "%s %s/summarize_otu_by_cat.py -i %s -c %s -o %s -m '%s' %s" %\
         (python_exe_fp, script_dir, mapping_fp, otu_table_fp, output_fp,
          mapping_cat, params_str)

        commands.append(\
         [('Summarize OTU table by Category',summarize_otu_by_cat_cmd)])

        otu_table_fp = output_fp

    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except:
            params_str = ''

        # define output otu table
        sorted_fp = join(output_dir,
                         splitext(split(otu_table_fp)[-1])[0] + '_sorted.biom')

        if mapping_cat or params_str == '':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
             "%s %s/sort_otu_table.py -i %s -o %s" %\
             (python_exe_fp, script_dir, otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
             "%s %s/sort_otu_table.py -i %s -o %s -m %s %s" %\
             (python_exe_fp, script_dir, otu_table_fp, sorted_fp,
              mapping_fp, params_str)

        commands.append([('Sort OTU Table', sort_otu_table_cmd)])

        # redefine otu_table_fp to use
        otu_table_fp = sorted_fp

    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except:
        params_str = ''

    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except:
        sum_taxa_levels = None

    # Build the summarize taxonomy command
    summarize_taxa_cmd = '%s %s/summarize_taxa.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)

    commands.append([('Summarize Taxonomy', summarize_taxa_cmd)])

    sum_taxa_fps = []

    if sum_taxa_levels:
        basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))
    else:
        basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
        # this is the default levels from summarize_taxa, but cannot import
        # script to get these values
        for i in [2, 3, 4, 5, 6]:
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)

    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except:
        params_str = ''
    # Build the plot taxa summary plot command(s)

    plot_taxa_summary_cmd =\
         '%s %s/plot_taxa_summary.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, ','.join(sum_taxa_fps),
          taxa_summary_plots_dir, params_str)

    commands.append(\
         [('Plot Taxonomy Summary',plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
Example 28
def get_flowgram_ali_exe():
    """Return the path to the flowgram alignment prog
    """
    fp = get_qiime_scripts_dir() + "/FlowgramAli_4frame"
    return fp
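
A small sanity check one might add around the returned path (not part of the original function): verify that the flowgram alignment program referenced above actually exists and is executable.

from os import access, X_OK
from os.path import exists

ali_fp = get_flowgram_ali_exe()
assert exists(ali_fp) and access(ali_fp, X_OK), "missing aligner: %s" % ali_fp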
Example 29
def run_process_sff_through_split_lib(study_id, run_prefix, sff_input_fp,
    mapping_fp, output_dir,
    command_handler, params, qiime_config,
    convert_to_flx=False, write_to_all_fasta=False,
    status_update_callback=print_to_stdout):
    """ NOTE: Parts of this function are a directly copied from the
        run_qiime_data_preparation function from the workflow.py library file 
        in QIIME.
    
        The steps performed by this function are:
          1) Process SFFs to generate .fna, .qual and flowgram file.
             (process_sff.py)
          2) De-multiplex sequences. (split_libraries.py)
          
    """

    # Prepare some variables for the later steps
    sff_filenames=sff_input_fp.split(',')
    commands = []
    create_dir(output_dir)
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    # generate a log file
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    make_flowgram=True
    split_lib_fasta_input_files=[]
    split_lib_qual_input_files=[]
    denoise_flow_input_files=[]

    # make a copy of the mapping file
    copied_mapping=split(mapping_fp)[-1]
    mapping_input_fp_copy=join(output_dir, copied_mapping)
    copy_mapping_cmd='cp %s %s' % (mapping_fp,mapping_input_fp_copy)
    commands.append([('CopyMapping', copy_mapping_cmd)])
    
    # iterate over SFFs and match to the mapping file
    for sff_input_fp in sff_filenames:
        # GENERATE THE MD5 HERE AND STORE IN THE DATABASE AFTER FILE 
        # SUCCESSFULLY PROCESSED
        
        # Copy the SFF into the processed files directory
        copied_sff=split(sff_input_fp)[-1]
        sff_input_fp_copy=join(output_dir, copied_sff)

        #Generate filenames for split_libraries
        input_dir, input_filename = split(sff_input_fp)

        if is_gzip(sff_input_fp) and sff_input_fp.endswith('.gz'):
            input_basename, input_ext = splitext(splitext(input_filename)[0])
        else:
            input_basename, input_ext = splitext(input_filename)

        # Convert sff file into fasta, qual and flowgram file
        if convert_to_flx:
            if study_id in ['496','968','969','1069','1002','1066','1194','1195','1457','1458','1460','1536','1918','1962']:
                ### this function is for handling files where the barcode and
                ### linkerprimer are all lowercase (i.e. HMP data or SRA data)
                
                # write process_sff command
                process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s -t --no_trim --use_sfftools' %\
                                  (python_exe_fp, script_dir, sff_input_fp,
                                   output_dir)
                #process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s -t' % (python_exe_fp, script_dir, sff_input_fp, output_dir)
                
                commands.append([('ProcessSFFs', process_sff_cmd)])
                
                # define output fasta from process_sff
                no_trim_fasta_fp=join(output_dir,input_basename + '_FLX.fna')
                
                # define pprospector scripts dir
                pprospector_scripts_dir=join(ServerConfig.home,'software',
                                                 'pprospector','scripts')
                
                # clean fasta - basically converting lowercase to uppercase
                clean_fasta_cmd = '%s %s/clean_fasta.py -f %s -o %s' %\
                                      (python_exe_fp, pprospector_scripts_dir, 
                                       no_trim_fasta_fp,output_dir)
                
                commands.append([('CleanFasta', clean_fasta_cmd)])
                
                # move the cleaned file to be consistent with other processes
                cleaned_fasta_fp=join(output_dir,input_basename + \
                                      '_FLX_filtered.fasta')
                moved_fasta_fp=join(output_dir,input_basename + '_FLX.fna')
                mv_cmd='mv %s %s' %  (cleaned_fasta_fp,moved_fasta_fp)

                commands.append([('RenameFasta',mv_cmd)])
                
                # update the split-lib files to use the cleaned file
                split_lib_fasta_input_files.append(moved_fasta_fp)
                split_lib_qual_input_files.append(join(output_dir,
                                                input_basename + '_FLX.qual'))
                denoise_flow_input_files.append(join(output_dir,
                                                input_basename + '_FLX.txt'))
            else:
                # write process_sff command
                process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s -t' %\
                                  (python_exe_fp, script_dir, sff_input_fp,
                                   output_dir)
                
                commands.append([('ProcessSFFs', process_sff_cmd)])
                
                # get filepaths for generated files
                split_lib_fasta_input_files.append(join(output_dir,
                                                input_basename + '_FLX.fna'))
                split_lib_qual_input_files.append(join(output_dir,
                                                input_basename + '_FLX.qual'))
                denoise_flow_input_files.append(join(output_dir,
                                                input_basename + '_FLX.txt'))
                
                
        else:
            # write process_sff command
            process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s' %\
                                (python_exe_fp, script_dir, sff_input_fp,
                                 output_dir)
            
            commands.append([('ProcessSFFs', process_sff_cmd)])
            
            # get filepaths for generated files
            split_lib_fasta_input_files.append(join(output_dir,input_basename + '.fna'))
            split_lib_qual_input_files.append(join(output_dir,input_basename + '.qual'))
            denoise_flow_input_files.append(join(output_dir,input_basename + '.txt'))
        

    split_lib_fasta_input=','.join(split_lib_fasta_input_files)
    split_lib_qual_input=','.join(split_lib_qual_input_files)
    denoise_flow_input=','.join(denoise_flow_input_files)
    
    # If dataset is metagenomic disable primer check
    data_access = data_access_factory(ServerConfig.data_access_type)
    study_info=data_access.getStudyInfo(study_id,12171)
    if study_info['investigation_type'].lower() == 'metagenome':
        params['split_libraries']['disable_primers']=None
    
    # create split-libraries folder
    split_library_output=join(output_dir,'split_libraries')
    create_dir(split_library_output)
    
    # get params string
    try:
        params_str = get_params_str(params['split_libraries'])
    except KeyError:
        params_str = ''
    
    # Build the split libraries command
    split_libraries_cmd = '%s %s/split_libraries.py -f %s -q %s -m %s -o %s %s'%\
     (python_exe_fp, script_dir, split_lib_fasta_input, split_lib_qual_input,
      mapping_fp, split_library_output, params_str)
    commands.append([('SplitLibraries', split_libraries_cmd)])
        
    input_fp=join(split_library_output,'seqs.fna')
    
    # create per sample fastq files
    fastq_output=join(split_library_output,'per_sample_fastq')
    create_dir(fastq_output)
    try:
        params_str = get_params_str(params['convert_fastaqual_fastq'])
    except KeyError:
        params_str = ''
        
    input_qual_fp=join(split_library_output,'seqs_filtered.qual')
    
    # build the convert fasta/qual to fastq command
    create_fastq_cmd = '%s %s/convert_fastaqual_fastq.py -f %s -q %s -o %s %s'%\
     (python_exe_fp, script_dir, input_fp, input_qual_fp,
      fastq_output, params_str)
      
    commands.append([('Create FASTQ', create_fastq_cmd)])
   
    # Call the command handler on the list of commands
    command_handler(commands,status_update_callback,logger=logger)
    
    # Return the fasta file paths
    return split_lib_fasta_input_files
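
# The double splitext above strips both the '.gz' and the underlying '.sff'
# extension for gzipped inputs. A self-contained sketch of that naming logic,
# with a plain '.gz' suffix check standing in for is_gzip():
from os.path import split, splitext

def sff_basename_sketch(sff_fp):
    input_dir, input_filename = split(sff_fp)
    if input_filename.endswith('.gz'):
        # strip '.gz' first, then the '.sff' extension
        input_basename, input_ext = splitext(splitext(input_filename)[0])
    else:
        input_basename, input_ext = splitext(input_filename)
    return input_basename

# sff_basename_sketch('run1.sff')    -> 'run1'
# sff_basename_sketch('run1.sff.gz') -> 'run1'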
Esempio n. 30
0
                   stdout=PIPE,stderr=STDOUT).stdout.read()
    print result
    if not unittest_good_pattern.search(result):
        if application_not_found_pattern.search(result):
            missing_application_tests.append(unittest_name)
        else:
            bad_tests.append(unittest_name)

# Run through all of QIIME's scripts, and pass -h to each one. If the
# resulting stdout does not begin with the Usage text, that is an
# indicator of something being wrong with the script. Issues that would
# cause that are bad import statements in the script, SyntaxErrors, or
# other failures prior to running qiime.util.parse_command_line_parameters.

try:
    scripts_dir = get_qiime_scripts_dir()
    script_directory_found = True
except AssertionError:
    script_directory_found = False

if script_directory_found:
    script_names = []
    script_names = glob('%s/*py' % scripts_dir)
    script_names.sort()
    bad_scripts = []

    for script_name in script_names:
        script_good_pattern = re.compile('^Usage: %s' % split(script_name)[1])
        print "Testing %s." % script_name
        command = '%s %s -h' % (python_name, script_name)
        result = Popen(command,shell=True,universal_newlines=True,\
Esempio n. 31
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.suppress_unit_tests and opts.suppress_script_tests and opts.suppress_script_usage_tests:
        option_parser.error("You're suppressing all three test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile("OK\s*$")
    application_not_found_pattern = re.compile("ApplicationNotFoundError")
    python_name = "python"
    bad_tests = []
    missing_application_tests = []

    # Run through all of QIIME's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith("test_") and name.endswith(".py"):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith("test_") and fn.endswith(".py"):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = "%s %s -v" % (python_name, unittest_name)
            stdout, stderr, return_value = qiime_system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    bad_scripts = []
    if not opts.suppress_script_tests:
        # Run through all of QIIME's scripts, and pass -h to each one. If the
        # resulting stdout does not begin with the Usage text, that is an
        # indicator of something being wrong with the script. Issues that would
        # cause that are bad import statements in the script, SyntaxErrors, or
        # other failures prior to running qiime.util.parse_command_line_parameters.

        try:
            scripts_dir = get_qiime_scripts_dir()
            script_directory_found = True
        except AssertionError:
            script_directory_found = False

        if script_directory_found:
            script_names = []
            script_names = glob("%s/*py" % scripts_dir)
            script_names.sort()

            for script_name in script_names:
                script_good_pattern = re.compile("^Usage: %s" % split(script_name)[1])
                print "Testing %s." % script_name
                command = "%s %s -h" % (python_name, script_name)
                stdout, stderr, return_value = qiime_system_call(command)
                if not script_good_pattern.search(stdout):
                    bad_scripts.append(script_name)

    num_script_usage_example_failures = 0
    qiime_test_data_dir = qiime_config["qiime_test_data_dir"]
    if not opts.suppress_script_usage_tests and qiime_test_data_dir != None:
        # Run the script usage testing functionality
        script_usage_result_summary, num_script_usage_example_failures = run_script_usage_tests(
            qiime_test_data_dir=qiime_test_data_dir,
            qiime_scripts_dir=qiime_config["qiime_scripts_dir"],
            working_dir=qiime_config["temp_dir"],
            verbose=True,
            tests=None,  # runs all
            failure_log_fp=None,
            force_overwrite=True,
        )

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % "\n".join(bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due " + "to missing external applications.\nDepending on the QIIME features " + "you plan to use, this may not be critical.\n%s" % "\n".join(
                missing_application_tests
            )

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_tests:
        print "\nBasic script test result summary\n--------------------------------\n"
        if not script_directory_found:
            print "Critical error: Failed to test scripts because the script directory could not be found.\n The most likely explanation for this failure is that you've installed QIIME using setup.py, and forgot to specify the qiime_scripts_dir in your qiime_config file. This value shoud be set either to the directory you provided for --install-scripts, or /usr/local/bin if no value was provided to --install-scripts."
        else:
            if bad_scripts:
                print "Failed the following basic script tests.\n%s" % "\n".join(bad_scripts)
            else:
                print "All basic script tests passed successfully.\n"

    qiime_test_data_dir_exists = True
    if not opts.suppress_script_usage_tests:
        if qiime_test_data_dir:
            print "\nScript usage test result summary\n------------------------------------\n"
            print script_usage_result_summary
        else:
            print "\nCould not run script usage tests because qiime_test_data_dir is not defined in your qiime_config."
            qiime_test_data_dir_exists = False
        print ""

    # If any of the unit tests, script tests, or script usage tests fail, or if
    # we have any missing application errors or a missing QIIME test data dir
    # if script usage tests weren't suppressed, use return code 1 (as python's
    # unittest module does to indicate one or more failures).
    return_code = 1
    if (
        len(bad_tests) == 0
        and len(missing_application_tests) == 0
        and len(bad_scripts) == 0
        and num_script_usage_example_failures == 0
        and qiime_test_data_dir_exists
    ):
        return_code = 0
    return return_code
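
# main() above relies on qiime_system_call to capture a command's stdout,
# stderr and return code. A hedged stand-in built directly on subprocess
# (illustrative only; the real helper is defined in qiime.util and may
# differ in detail):
from subprocess import Popen, PIPE

def system_call_sketch(cmd):
    proc = Popen(cmd, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    return stdout, stderr, proc.returncode
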
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #get all the options
    cd_dir=path.join(opts.fs_fp,'arare')
    tmp_prefix=get_tmp_filename('',suffix='').strip()
    output_dir=path.join(opts.fs_fp,'arare','arare_'+tmp_prefix)
    web_fp=path.join(opts.web_fp,'arare','arare_'+tmp_prefix)
    otu_table_fp=opts.otu_table_fp
    mapping_file_fp=opts.mapping_file_fp
    file_name_prefix=opts.fname_prefix
    user_id=int(opts.user_id)
    meta_id=int(opts.meta_id)
    bdiv_rarefied_at=int(opts.bdiv_rarefied_at)
    jobs_to_start=opts.jobs_to_start
    tree_fp=opts.tree_fp
    command_handler=call_commands_serially
    status_update_callback=no_status_updates
    zip_fpath=opts.zip_fpath
    zip_fpath_db=opts.zip_fpath_db
    run_date=opts.run_date
    force=True
    
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"
        pass
        
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.params_path
    
    params=parse_qiime_parameters(parameter_f)
    
    try:
        makedirs(output_dir)
    except OSError:
        if force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            print "Output directory already exists. Please choose "+\
             "a different directory, or force overwrite with -f."
            exit(1)
    
    commands=[]
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # determine whether to run alpha-diversity in serial or parallel
    serial_or_parallel = params['serial_or_parallel']['method']
    if serial_or_parallel=='Serial':
        arare_cmd='%s %s/alpha_rarefaction.py -i %s -m %s -o %s -t %s -p %s -f' %\
            (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, \
             output_dir,tree_fp,opts.params_path)
    else:
        arare_cmd='%s %s/alpha_rarefaction.py -i %s -m %s -o %s -t %s -a -O 50 -p %s -f' %\
            (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, \
             output_dir,tree_fp,opts.params_path)
    
    commands.append([('Alpha-Rarefaction',arare_cmd)])
    
    command_handler(commands, status_update_callback, logger)

    #zip the distance matrices
    cmd_call='cd %s; zip -r %s %s' % (cd_dir,zip_fpath,'arare_'+tmp_prefix)
    system(cmd_call)

    #convert link into web-link
    web_link=path.join(web_fp, 'alpha_rarefaction_plots',
                       'rarefaction_plots.html')
    
    #add the distance matrices
    valid=data_access.addMetaAnalysisFiles(True, int(meta_id), web_link, 
                                           'ARARE', run_date, 'ARARE')
    if not valid:
        raise ValueError, 'There was an issue uploading the filepaths to the DB!'
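
# The parameters file opened above is turned into a nested dict by
# parse_qiime_parameters, e.g. the line 'alpha_diversity:metrics shannon,chao1'
# becomes {'alpha_diversity': {'metrics': 'shannon,chao1'}}. A hedged sketch of
# that parsing, skipping comments and blank lines (illustrative only; the real
# parser may differ in detail):
def parse_parameters_sketch(lines):
    result = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        fields = line.split(None, 1)
        script_name, param_name = fields[0].split(':')
        value = fields[1] if len(fields) > 1 else None
        result.setdefault(script_name, {})[param_name] = value
    return result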
Esempio n. 33
0
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline
    
        The steps performed by this function are:
1. Split input sff.txt file into one file per sample

2. Run scripts required for PyroNoise

3. Run scripts required for SeqNoise

4. Run scripts required for Perseus (chimera removal)

5. Merge output files into one file similar to the output of split_libraries.py

    output_filepath should be absolute
    seqnoise_resolution should be a string
    The environment variable PYRO_LOOKUP_FILE must be set correctly. Be
    careful when passing command handlers that don't spawn child processes,
    as they may not inherit the correct environment variable setting.
    """
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    create_dir(output_dir)

    if seqnoise_resolution == None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no default '
                               'for platform ' + platform)

    if truncate_len == None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no default '
                               'for platform ' + platform)

    sample_names = []  # these are filenames minus extension, and are sample IDs
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # unclear why the primer isn't just taken off now,
        # but that's done later
        # primer += (map_data[i][headers.index('LinkerPrimerSequence')])
        # for char, bases in IUPAC_DNA_ambiguities.items():
        #     primer = primer.replace(char,'['+''.join(bases)+']')

        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir, 'map.csv'), 'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i] + ',' + bc_seqs[i] + '\n')
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus == True:
        fasta_result_names = [
            sample_name + post_pyro_tail + '_seqnoise_cd.fa'
            for sample_name in sample_names
        ]
    else:
        fasta_result_names = [sample_name + '_Good.fa' \
          for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable', cmd)])


    cmd = 'SplitKeys.pl '+one_primer+' map.csv < '+\
        os.path.join(called_dir,sff_txt_fp)+\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the per-sample flow cleaning command
        if platform == 'flx':
            cmd = 'Clean360.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows ' + sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows ' + sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np "+str(numnodes)+" PyroDist -in "+\
          sample_name+".dat -out "+sample_name+ " > "+sample_name+".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+".fdist -out "+sample_name+\
          " > "+sample_name+".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np "+str(numnodes)+" PyroNoise -din "+\
            sample_name+".dat -out "+\
            sample_name+"_pyronoise "+"-lin "+\
            sample_name+".list -s 60.0 -c 0.01 > "+\
            sample_name+"_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl '+bc_seqs[i]+one_primer+' '+truncate_len+' < '+\
            sample_name+'_pyronoise_cd.fa'+' > '+ sample_name+'_'+\
            truncate_len+'.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = "mpirun -np "+str(numnodes)+" SeqDist -in "+\
            sample_name+post_pyro_tail+\
            ".fa > "+sample_name+post_pyro_tail+".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+"fcl > "+\
            sample_name+post_pyro_tail+".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
        # PC.354_pyronoise_cd.snout

        cmd = "mpirun -np "+str(numnodes)+" SeqNoise -in "+\
            sample_name+post_pyro_tail+\
            ".fa -din "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+\
            "_seqnoise -lin "+sample_name+post_pyro_tail+'fcl.list -min '+\
            sample_name+'_pyronoise'+\
            '.mapping -s '+seqnoise_resolution+' -c 0.08 > '+\
            sample_name+post_pyro_tail+'.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if suppress_perseus == False:

            cmd = 'Perseus -sin '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa > ' +\
                sample_name+'.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl '+sample_name+'.per '+\
                str(chimera_alpha) + ' '+ str(chimera_beta)+\
                ' > '+sample_name+'.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa '+\
                sample_name+'.class 0.5 > '+sample_name+'_Chi.fa 2> '+\
                sample_name+'_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
         (python_exe_fp, script_dir, fasta_result_names[i],
         sample_name+'_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
      ' '.join([sample_name+'_unw.fna' for sample_name in sample_names]) +\
      ' > ' + output_filepath # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
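
# The primer handling above expands IUPAC ambiguity codes into regex-style
# character classes before the primer is handed to SplitKeys.pl. A small
# illustration using a hypothetical subset of the ambiguity table (the full
# IUPAC_DNA_ambiguities mapping is imported by the workflow code above):
IUPAC_SUBSET = {'R': ['A', 'G'], 'Y': ['C', 'T'], 'W': ['A', 'T'],
                'N': ['A', 'C', 'G', 'T']}

def expand_primer_sketch(primer, ambiguities=IUPAC_SUBSET):
    for char, bases in ambiguities.items():
        primer = primer.replace(char, '[' + ''.join(bases) + ']')
    return primer

# expand_primer_sketch('GTGNCAGC') -> 'GTG[ACGT]CAGC'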
Esempio n. 34
0
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    status_update_callback=print_to_stdout):
    """
    """

    if categories != None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)
            
    else:
        categories= []
    
    # prep some variables
    if params == None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,'Log files'))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp != None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)
    
    
    bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
    even_dm_fps = run_beta_diversity_through_plots(
     otu_table_fp=biom_fp, 
     mapping_fp=mapping_fp,
     output_dir=bdiv_even_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     sampling_depth=sampling_depth,
     # force suppression of distance histograms - boxplots work better
     # in this context, and are created below.
     histogram_categories=[],
     tree_fp=tree_fp,
     parallel=parallel,
     logger=logger,
     status_update_callback=status_update_callback)
    
    for bdiv_metric, dm_fp in even_dm_fps:
        for category in categories:
            boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
            try:
                params_str = get_params_str(params['make_distance_boxplots'])
            except KeyError:
                params_str = ''
            boxplots_cmd = \
             'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
             (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
            commands.append([('Boxplots (%s)' % category,
                              boxplots_cmd)])
            index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                '%s/%s_Distances.pdf' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                '%s/%s_Stats.txt' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            
        index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Distance matrix (%s)' % bdiv_metric,
                            '%s/%s_dm.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                            '%s/%s_pc.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        
    ## Alpha rarefaction workflow
    arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
    run_qiime_alpha_rarefaction(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=arare_full_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     tree_fp=tree_fp,
     num_steps=arare_num_steps,
     parallel=parallel,
     logger=logger,
     min_rare_depth=arare_min_rare_depth,
     max_rare_depth=sampling_depth,
     status_update_callback=status_update_callback)
    
    index_links.append(('Alpha rarefaction plots',
                        '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                          % arare_full_output_dir,
                        "Alpha rarefaction results"))
                        
    collated_alpha_diversity_fps = \
     glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
    try:
        params_str = get_params_str(params['compare_alpha_diversity'])
    except KeyError:
        params_str = ''
    for c in categories:
        for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
            alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
            alpha_comparison_output_fp = '%s/%s_%s.txt' % \
             (arare_full_output_dir,c,alpha_metric)
            compare_alpha_cmd = \
             'compare_alpha_diversity.py -i %s -m %s -c %s -d %s -o %s -n 999 %s' %\
             (collated_alpha_diversity_fp, mapping_fp, c, 
              sampling_depth, alpha_comparison_output_fp, params_str)
            commands.append([('Compare alpha diversity (%s, %s)' %\
                               (c,alpha_metric),
                              compare_alpha_cmd)])
            index_links.append(
             ('Alpha diversity statistics (%s, %s)' % (c,alpha_metric),
              alpha_comparison_output_fp,
              "Alpha rarefaction results"))
    
    taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
    run_summarize_taxa_through_plots(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=taxa_plots_output_dir,
     mapping_cat=None, 
     sort=True,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     logger=logger, 
     status_update_callback=status_update_callback)
    

    index_links.append(('Taxa summary bar plots',
                        '%s/taxa_summary_plots/bar_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    index_links.append(('Taxa summary area plots',
                        '%s/taxa_summary_plots/area_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    for c in categories:
        taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,c)
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=c, 
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger, 
         status_update_callback=status_update_callback)

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
    
    # OTU category significance
    for category in categories:
        category_significance_fp = \
         '%s/category_significance_%s.txt' % (output_dir, category)
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # Build the OTU category significance command
        category_significance_cmd = \
         'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
         (biom_fp, mapping_fp, category, 
          category_significance_fp, params_str)
        commands.append([('OTU category significance (%s)' % category, 
                          category_significance_cmd)])
                          
        index_links.append(('Category significance (%s)' % category,
                    category_significance_fp,
                    "Category results"))
    
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
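
# index_links above accumulates (link_text, target_fp, section_heading) tuples
# that generate_index_page renders into an HTML index. A hedged sketch of the
# grouping step only (the real function also rewrites the target paths
# relative to the index file):
def group_index_links_sketch(index_links):
    sections = {}
    for link_text, target_fp, heading in index_links:
        sections.setdefault(heading, []).append((link_text, target_fp))
    return sections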
Esempio n. 35
0
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     histogram_categories=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_3d_plots=False,
                                     suppress_2d_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
         1) Compute a beta diversity distance matrix;
         2) Perform a principal coordinates analysis on the result of
          Step 1;
         3) Generate a 3D prefs file for optimized coloring of continuous
          variables;
         4) Generate a 3D plot for all mapping fields with colors
          optimized for continuous data;
         5) Generate a 3D plot for all mapping fields with colors
          optimized for discrete data.
    
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
     parse_mapping_file(open(mapping_fp,'U'))
    if histogram_categories:
        invalid_categories = set(histogram_categories) - set(mapping_header)
        if invalid_categories:
            raise ValueError,\
             "Invalid histogram categories - these must exactly match "+\
             "mapping file column headers: %s" % (' '.join(invalid_categories))
    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
          get_interesting_mapping_fields(mapping_data, mapping_header) or\
          mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
         (output_dir, otu_table_basename,
          sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
         '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
         (python_exe_fp, script_dir, otu_table_fp,
          even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the 3d prefs file generator command
    prefs_fp = '%s/prefs.txt' % output_dir
    try:
        params_str = get_params_str(params['make_prefs_file'])
    except KeyError:
        params_str = ''
    if 'mapping_headers_to_use' not in params.get('make_prefs_file', {}):
        params_str = '%s --mapping_headers_to_use %s' \
         % (params_str,mapping_fields)
    # Build the 3d prefs file generator command
    prefs_cmd = \
     '%s %s/make_prefs_file.py -m %s -o %s %s' %\
     (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
    commands.append([('Build prefs file', prefs_cmd)])

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])


        orig_beta_div_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
         (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate 3d plots
        if not suppress_3d_plots:
            # Prep the continuous-coloring 3d plots command
            continuous_3d_dir = '%s/%s_3d_continuous/' %\
             (output_dir, beta_diversity_metric)
            create_dir(continuous_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            continuous_3d_command = \
             '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_3d_dir,
               mapping_fp, params_str)

            # Prep the discrete-coloring 3d plots command
            discrete_3d_dir = '%s/%s_3d_discrete/' %\
             (output_dir, beta_diversity_metric)
            create_dir(discrete_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 3d plots command
            discrete_3d_command = \
             '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_3d_dir,
               mapping_fp, params_str)

            commands.append([\
              ('Make 3D plots (continuous coloring, %s)' %\
                beta_diversity_metric,continuous_3d_command),\
              ('Make 3D plots (discrete coloring, %s)' %\
                beta_diversity_metric,discrete_3d_command,)])

        # Generate 2d plots
        if not suppress_2d_plots:
            # Prep the continuous-coloring 2d plots command
            continuous_2d_dir = '%s/%s_2d_continuous/' %\
             (output_dir, beta_diversity_metric)
            create_dir(continuous_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 2d plots command
            continuous_2d_command = \
             '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_2d_dir,
               mapping_fp, params_str)

            # Prep the discrete-coloring 2d plots command
            discrete_2d_dir = '%s/%s_2d_discrete/' %\
             (output_dir, beta_diversity_metric)
            create_dir(discrete_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 2d plots command
            discrete_2d_command = \
             '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_2d_dir,
               mapping_fp, params_str)

            commands.append([\
              ('Make 2D plots (continuous coloring, %s)' %\
                beta_diversity_metric,continuous_2d_command),\
              ('Make 2D plots (discrete coloring, %s)' %\
                beta_diversity_metric,discrete_2d_command,)])

        if histogram_categories:
            # Prep the distance histograms command
            histograms_dir = '%s/%s_histograms/' %\
             (output_dir, beta_diversity_metric)
            create_dir(histograms_dir)
            try:
                params_str = get_params_str(params['make_distance_histograms'])
            except KeyError:
                params_str = ''
            # Build the make_distance_histograms command
            distance_histograms_command = \
             '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
              (python_exe_fp, script_dir, beta_div_fp,
               histograms_dir, mapping_fp,
               ','.join(histogram_categories), params_str)

            commands.append([\
              ('Make Distance Histograms (%s)' %\
                beta_diversity_metric,distance_histograms_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
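
# "Interesting" mapping fields (see the comment above) are those with more
# than one distinct value but fewer distinct values than there are samples.
# A minimal sketch of that selection, assuming mapping_data rows are aligned
# with mapping_header:
def interesting_fields_sketch(mapping_data, mapping_header):
    num_samples = len(mapping_data)
    interesting = []
    for i, field in enumerate(mapping_header):
        values = set(row[i] for row in mapping_data)
        if 1 < len(values) < num_samples:
            interesting.append(field)
    return interesting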
Esempio n. 36
0
def run_alpha_rarefaction(otu_table_fp,
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.
    
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth == None:
        min_count, max_count, median_count, mean_count, counts_per_sample =\
         compute_seqs_per_library_stats(parse_biom_table(open(otu_table_fp,'U')))
        max_rare_depth = median_count
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)

    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
         '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
         (python_exe_fp, script_dir, otu_table_fp, min_rare_depth, max_rare_depth,
          step, rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
         '%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
         (python_exe_fp, script_dir, otu_table_fp, min_rare_depth, max_rare_depth,
          step, rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
         "%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
         (python_exe_fp, script_dir, rarefaction_dir, alpha_diversity_dir,
          params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
         "%s %s/alpha_diversity.py -i %s -o %s %s" %\
         (python_exe_fp, script_dir, rarefaction_dir, alpha_diversity_dir,
          params_str)

    commands.append(\
     [('Alpha diversity on rarefied OTU tables',alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, alpha_diversity_dir, \
      alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''

    if 'std_type' in params.get('make_rarefaction_plots', {}) or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)

        # Build the make rarefaction plot command(s)
        #for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
             '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
             (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
              rarefaction_plot_dir, params_str)
        commands.append(\
             [('Rarefaction plot: %s' % 'All metrics',make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)

        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
             '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
             (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
              rarefaction_plot_dir_stddev, params_str)
        commands.append(\
             [('Rarefaction plot: %s' % 'All metrics',make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
             '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
             (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
              rarefaction_plot_dir_stderr, params_str)
        commands.append(\
             [('Rarefaction plot: %s' % 'All metrics',make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
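
# The rarefaction depths above are spaced by
#   step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
# so, for example, min 10, max 110 and 10 steps gives a step size of 10.
# A small helper to preview the candidate depths (a rough sketch; the actual
# depths are produced by (parallel_)multiple_rarefactions.py):
def rarefaction_depths_sketch(min_rare_depth, max_rare_depth, num_steps):
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    return list(range(int(min_rare_depth), int(max_rare_depth) + 1, step))

# rarefaction_depths_sketch(10, 110, 10) -> [10, 20, ..., 110]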
Esempio n. 37
0
def run_jackknifed_beta_diversity(otu_table_fp,
                                  tree_fp,
                                  seqs_per_sample,
                                  output_dir,
                                  command_handler,
                                  params,
                                  qiime_config,
                                  mapping_fp,
                                  parallel=False,
                                  logger=None,
                                  suppress_md5=False,
                                  status_update_callback=print_to_stdout,
                                  master_tree=None):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Compute beta diversity distance matrix from otu table (and
           tree, if applicable)
          2) Build rarefied OTU tables;
          3) Build UPGMA tree from full distance matrix;
          4) Compute distance matrices for rarefied OTU tables;
          5) Build UPGMA trees from rarefied OTU table distance matrices;
          5.5) Build a consensus tree from the rarefied UPGMA trees
          6) Compare rarefied OTU table distance matrix UPGMA trees 
           to the full UPGMA tree and write a support file and newick tree
           with support values as node labels.
           
        master_tree can be 'full' or 'consensus', default full
    """
    # Prepare some variables for the later steps
    if master_tree == None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str, tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    commands.append(\
     [('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics), beta_div_cmd)])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
     '%s %s/multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, seqs_per_sample,
      rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)

        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,
                                            otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = '%s %s/upgma_cluster.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, distance_matrix_fp, full_tree_fp, params_str)
        commands.append(\
         [('UPGMA on full distance matrix: %s' % beta_diversity_metric,\
           hierarchical_cluster_cmd)])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        try:
            d = params['beta_diversity'].copy()
        except KeyError:
            d = {}
        # drop any 'metrics' entry; the metric is passed explicitly below
        d.pop('metrics', None)
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str, tree_fp)
        if parallel:
            params_str += ' %s' % get_params_str(params['parallel'])
            # Build the parallel beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
             '%s %s/parallel_beta_diversity.py -T -i %s -o %s %s' %\
             (python_exe_fp, script_dir, rarefaction_dir, dm_dir, params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
             '%s %s/beta_diversity.py -i %s -o %s %s' %\
             (python_exe_fp, script_dir, rarefaction_dir, dm_dir, params_str)
        commands.append(\
         [('Beta diversity on rarefied OTU tables (%s)' % beta_diversity_metric,
           beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)

        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
         '%s %s/upgma_cluster.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, dm_dir, upgma_dir, params_str)
        commands.append(\
         [('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric,
           hierarchical_cluster_cmd)])

        # Build the consensus tree command
        consensus_tree_cmd =\
         '%s %s/consensus_tree.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, upgma_dir, upgma_dir + "/consensus.tre",
            params_str)
        commands.append(\
         [('consensus on rarefied distance matrices (%s)' % beta_diversity_metric,
           consensus_tree_cmd)])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''

        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = upgma_dir + "/consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' %
                               (master_tree, ))
        tree_compare_cmd = '%s %s/tree_compare.py -s %s -m %s -o %s %s' %\
         (python_exe_fp, script_dir, upgma_dir, master_tree_fp,
          tree_compare_dir, params_str)
        commands.append(\
         [('Tree compare (%s)' % beta_diversity_metric, tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, dm_dir, pcoa_dir, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)])

        # Prep the 2D plots command
        plots_2d_dir = '%s/2d_plots/' % metric_output_dir
        create_dir(plots_2d_dir)
        try:
            params_str = get_params_str(params['make_2d_plots'])
        except KeyError:
            params_str = ''
        # Build the 2d plots command
        plots_2d_cmd = '%s %s/make_2d_plots.py -i %s -o %s -m %s %s' %\
         (python_exe_fp, script_dir, pcoa_dir, plots_2d_dir,
          mapping_fp, params_str)
        commands.append(\
         [('2d plots (%s)' % beta_diversity_metric, plots_2d_cmd)])

        # Prep the 3D plots command
        plots_3d_dir = '%s/3d_plots/' % metric_output_dir
        create_dir(plots_3d_dir)
        try:
            params_str = get_params_str(params['make_3d_plots'])
        except KeyError:
            params_str = ''
        # Build the 3d plots command
        plots_3d_cmd = '%s %s/make_3d_plots.py -i %s -o %s -m %s %s' %\
         (python_exe_fp, script_dir, pcoa_dir, plots_3d_dir,
          mapping_fp, params_str)
        commands.append(\
         [('3d plots (%s)' % beta_diversity_metric, plots_3d_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
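
A minimal usage sketch (placeholder paths; call_commands_serially and load_qiime_config are assumed to be importable as in the later examples in this document):

from qiime.util import load_qiime_config
from qiime.workflow import call_commands_serially  # assumed import path

run_jackknifed_beta_diversity(otu_table_fp='otu_table.biom',
                              tree_fp='rep_set.tre',
                              seqs_per_sample=100,
                              output_dir='jackknifed_beta_div/',
                              command_handler=call_commands_serially,
                              params={},  # empty dict -> per-step defaults via the KeyError branches
                              qiime_config=load_qiime_config(),
                              mapping_fp='map.txt',
                              master_tree='consensus')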
Example n. 38
0
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_emperor_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots
    
        The steps performed by this function are:
         1) Compute a beta diversity distance matrix for each metric
         2) Peform a principal coordinates analysis on the result of step 1
         3) Generate an emperor plot for each result of step 2
    
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
     parse_mapping_file(open(mapping_fp,'U'))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
          get_interesting_mapping_fields(mapping_data, mapping_header) or\
          mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
         (output_dir, otu_table_basename,
          sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
         '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
         (python_exe_fp, script_dir, otu_table_fp,
          even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])


        orig_beta_div_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
         (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir,
                                                        beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            emperor_command = \
             'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp,
                                                       emperor_dir,
                                                       mapping_fp,
                                                       params_str)

            commands.append([
                ('Make emperor plots (%s)' % beta_diversity_metric,
                 emperor_command)
            ])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
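
A minimal usage sketch (placeholder paths; imports assumed as above); the return value pairs each metric with its renamed distance matrix:

from qiime.util import load_qiime_config
from qiime.workflow import call_commands_serially  # assumed import path

dm_fps = run_beta_diversity_through_plots(otu_table_fp='otu_table.biom',
                                          mapping_fp='map.txt',
                                          output_dir='bdiv_even100/',
                                          command_handler=call_commands_serially,
                                          params={},
                                          qiime_config=load_qiime_config(),
                                          sampling_depth=100,
                                          tree_fp='rep_set.tre')
for metric, dm_fp in dm_fps:
    print metric, '->', dm_fp  # e.g. unweighted_unifrac -> bdiv_even100/unweighted_unifrac_dm.txt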
Example n. 39
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if (opts.suppress_unit_tests and opts.suppress_script_usage_tests):
        option_parser.error(
            "You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of QIIME's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qiime_system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    qiime_test_data_dir = join(get_qiime_project_dir(), 'qiime_test_data')
    qiime_test_data_dir_exists = exists(qiime_test_data_dir)
    if not opts.suppress_script_usage_tests and qiime_test_data_dir_exists:
        if opts.script_usage_tests is not None:
            script_usage_tests = opts.script_usage_tests.split(',')
        else:
            script_usage_tests = None

        # Run the script usage testing functionality
        script_usage_result_summary, has_script_usage_example_failures = \
            run_script_usage_tests(
                test_data_dir=qiime_test_data_dir,
                scripts_dir=get_qiime_scripts_dir(),
                working_dir=qiime_config['temp_dir'],
                verbose=True,
                tests=script_usage_tests,
                force_overwrite=True,
                timeout=240)

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due " +\
                "to missing external applications.\nDepending on the QIIME features " +\
                "you plan to use, this may not be critical.\n%s"\
                % '\n'.join(missing_application_tests)

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_usage_tests:
        if qiime_test_data_dir_exists:
            print "\nScript usage test result summary\n--------------------------------\n"
            print script_usage_result_summary
        else:
            print "\nCould not run script usage tests because the directory %s does not exist." % qiime_test_data_dir
        print ""

    # If script usage tests weren't suppressed, the qiime_test_data dir must
    # exist and we can't have any failures.
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  (qiime_test_data_dir_exists and
                                   not has_script_usage_example_failures))

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
            script_usage_tests_success):
        return_code = 0
    return return_code
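
main() returns a unittest-style exit code; a typical entry point for such a script (an assumption, not shown in this snippet) would be:

import sys

if __name__ == "__main__":
    sys.exit(main())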
Example n. 40
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_tax_align_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(
        logger,
        [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)
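    # otus_to_keep is the set of OTU ids that survive the size filter; it is
    # used below to restrict the final rep set and new reference sequences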

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,
                                                   'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
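        # presumably: ids_to_keep = the PyNAST failure sequence ids, followed by
        # positional min/max count and min/max sample bounds (0..inf keeps all),
        # with negate_ids_to_keep=True so that the failures are *removed*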
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0,
            inf,
            0,
            inf,
            negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
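
A minimal usage sketch (placeholder paths; imports assumed as above):

from qiime.util import load_qiime_config
from qiime.workflow import call_commands_serially  # assumed import path

pick_subsampled_open_reference_otus(input_fp='seqs.fna',
                                    refseqs_fp='refseqs.fna',
                                    output_dir='open_ref_otus/',
                                    percent_subsample=0.001,
                                    new_ref_set_id='New',
                                    command_handler=call_commands_serially,
                                    params={},
                                    qiime_config=load_qiime_config())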
Example n. 41
0
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200,
                      sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None,
                      platform='flx', seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline
    
        The steps performed by this function are:
1. Split input sff.txt file into one file per sample

2. Run scripts required for PyroNoise

3. Run scripts required for SeqNoise

4. Run scripts requred for Perseus (chimera removal)

5. Merge output files into one file similar to the output of split_libraries.py

    output_filepath should be absolute
    seqnoise_resolution should be string
    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting
    """
    map_data,headers,comments = parse_mapping_file(open(mapping_fp,'U'))
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no default'
                               ' for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no default'
                               ' for platform ' + platform)

    sample_names = [] # these are filenames minus extension, and are sample IDs
    primer_seqs = [] # same order as sample_names
    bc_seqs = [] # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # note: the primer is not stripped from the reads here;
        # that happens later in the pipeline
        # primer += (map_data[i][headers.index('LinkerPrimerSequence')])
        # for char, bases in IUPAC_DNA_ambiguities.items():
        #     primer = primer.replace(char,'['+''.join(bases)+']')

        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char,'['+''.join(bases)+']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger,[mapping_fp,sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir,'map.csv'),'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i]+','+bc_seqs[i]+'\n')
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_'+truncate_len
    if suppress_perseus:
        fasta_result_names = [sample_name + post_pyro_tail+'_seqnoise_cd.fa'
          for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + '_Good.fa' \
          for sample_name in sample_names]

    cmd = 'cd '+output_dir # see also os.chdir above
    commands.append([('change to output dir', cmd)])
    
    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
        cmd)])


    cmd = 'SplitKeys.pl '+one_primer+' map.csv < '+\
        os.path.join(called_dir,sff_txt_fp)+\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the summarize taxonomy command
        if platform == 'flx':
            cmd = 'Clean360.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows '+sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows '+sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np "+str(numnodes)+" PyroDist -in "+\
          sample_name+".dat -out "+sample_name+ " > "+sample_name+".pdout"
        commands.append([('pyrodist '+sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+".fdist -out "+sample_name+\
          " > "+sample_name+".fcout"
        commands.append([('fcluster pyrodist '+sample_name, cmd)])

# e.g.:
# mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
# PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np "+str(numnodes)+" PyroNoise -din "+\
            sample_name+".dat -out "+\
            sample_name+"_pyronoise "+"-lin "+\
            sample_name+".list -s 60.0 -c 0.01 > "+\
            sample_name+"_pyronoise.pnout"
        commands.append([('pyronoise '+sample_name, cmd)])

        cmd = 'Parse.pl '+bc_seqs[i]+one_primer+' '+truncate_len+' < '+\
            sample_name+'_pyronoise_cd.fa'+' > '+ sample_name+'_'+\
            truncate_len+'.fa'
        commands.append([('truncate '+sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = "mpirun -np "+str(numnodes)+" SeqDist -in "+\
            sample_name+post_pyro_tail+\
            ".fa > "+sample_name+post_pyro_tail+".seqdist"
        commands.append([('seqdist '+sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+"fcl > "+\
            sample_name+post_pyro_tail+".fcout"
        commands.append([('fcluster seqdist '+sample_name, cmd)])

# e.g.:
# mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
# PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
# PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
# PC.354_pyronoise_cd.snout

        cmd = "mpirun -np "+str(numnodes)+" SeqNoise -in "+\
            sample_name+post_pyro_tail+\
            ".fa -din "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+\
            "_seqnoise -lin "+sample_name+post_pyro_tail+'fcl.list -min '+\
            sample_name+'_pyronoise'+\
            '.mapping -s '+seqnoise_resolution+' -c 0.08 > '+\
            sample_name+post_pyro_tail+'.snout'
        commands.append([('seqnoise '+sample_name, cmd)])


        if not suppress_perseus:

            cmd = 'Perseus -sin '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa > ' +\
                sample_name+'.per'
            commands.append([('Perseus '+sample_name, cmd)])

            cmd = 'Class.pl '+sample_name+'.per '+\
                str(chimera_alpha) + ' '+ str(chimera_beta)+\
                ' > '+sample_name+'.class'
            commands.append([('Class.pl '+sample_name, cmd)])

            cmd = 'FilterGoodClass.pl '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa '+\
                sample_name+'.class 0.5 > '+sample_name+'_Chi.fa 2> '+\
                sample_name+'_Good.fa'
            commands.append([('FilterGoodClass '+sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
         (python_exe_fp, script_dir, fasta_result_names[i], 
         sample_name+'_unw.fna', sample_name)
        commands.append([('unweight fasta '+sample_name, cmd)])

    cmd = 'cat ' +\
      ' '.join([sample_name+'_unw.fna' for sample_name in sample_names]) +\
      ' > ' + output_filepath # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
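
A minimal usage sketch (placeholder paths; imports assumed as above; output_filepath must be absolute per the docstring):

from os.path import abspath
from qiime.util import load_qiime_config
from qiime.workflow import call_commands_serially  # assumed import path

run_ampliconnoise(mapping_fp='map.txt',
                  output_dir='anoise_out/',
                  command_handler=call_commands_serially,
                  params={},
                  qiime_config=load_qiime_config(),
                  sff_txt_fp='run1.sff.txt',
                  platform='titanium',
                  suppress_perseus=False,
                  output_filepath=abspath('anoise_out/seqs.fna'))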
           type='string',help='path to store output files '+\
           '[REQUIRED]'),\
]
rdp_classifier_fp = getenv('RDP_JAR_PATH')
script_info['optional_options'] = [\
 make_option('--rdp_classifier_fp',action='store',\
           type='string',help='full path to rdp classifier jar file '+\
           '[default: %default]',\
           default=rdp_classifier_fp),\
 make_option('-c','--confidence',action='store',\
          type='float',help='Minimum confidence to'+\
          ' record an assignment [default: %default]',default=0.80),\
 make_option('-N','--assign_taxonomy_fp',action='store',\
           type='string',help='full path to '+\
           'scripts/assign_taxonomy.py [default: %default]',\
           default=join(get_qiime_scripts_dir(),'assign_taxonomy.py')),\
 make_option('-t','--id_to_taxonomy_fp',action='store',\
           type='string',help='full path to '+\
           'id_to_taxonomy mapping file [REQUIRED]'),\
 make_option('-r','--reference_seqs_fp',action='store',\
        help='Ref seqs to rdp against. [default: %default]'),\
 options_lookup['jobs_to_start'],\
 options_lookup['poller_fp'],\
 options_lookup['retain_temp_files'],\
 options_lookup['suppress_submit_jobs'],\
 options_lookup['poll_directly'],\
 options_lookup['cluster_jobs_fp'],\
 options_lookup['suppress_polling'],\
 options_lookup['job_prefix'],\
 options_lookup['python_exe_fp'],\
 options_lookup['seconds_to_sleep']\
]
def run_process_fasta_through_split_lib(study_id,run_prefix,input_fp,
    mapping_fp, output_dir, 
    command_handler, params, qiime_config,
    write_to_all_fasta=False,
    status_update_callback=print_to_stdout):
    """ NOTE: Parts of this function are a directly copied from the
        run_qiime_data_preparation function from the workflow.py library file 
        in QIIME.
    
        The steps performed by this function are:
          1) Update sequence names using DB accessions
    
    """

    # Prepare some variables for the later steps
    filenames=input_fp.split(',')
    commands = []
    create_dir(output_dir)
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # copy the mapping file
    copied_mapping=split(mapping_fp)[-1]
    mapping_input_fp_copy=join(output_dir, copied_mapping)
    copy_mapping_cmd='cp %s %s' % (mapping_fp,mapping_input_fp_copy)
    commands.append([('CopyMapping', copy_mapping_cmd)])

    # sort filenames
    filenames.sort()
    
    # create split_libraries directory
    split_library_output=join(output_dir,'split_libraries')
    create_dir(split_library_output)
    
    # Call the command handler on the list of commands
    command_handler(commands,status_update_callback,logger=logger)
    
    # define output filepath
    output_fp=join(split_library_output,'seqs.fna')
    
    # re-write the sequence file
    outf=open(output_fp,'w')
    
    # get sample-info from mapping file
    map_data,map_header,map_comments=parse_mapping_file(\
                                        open(mapping_input_fp_copy,'U'))
                                        
    # create dictionary of original sample_ids to new sample_ids
    sample_ids_from_mapping=zip(*map_data)[0]
    sample_id_dict={}
    for sample in sample_ids_from_mapping:
        sample_id_dict['.'.join(sample.split('.')[:-1])]=sample
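    # e.g. a mapping sample ID like 'PC.354.630848' is keyed by 'PC.354' here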
    
    
    # NOTE: we need to be able to pass just a fasta file, since the mapping
    # sample_ids will never match the input fasta file
    '''
    fasta_check=run_fasta_checks(input_fp,mapping_input_fp_copy)
    
    if float(fasta_check['invalid_labels']) > 0:
        raise ValueError, "There are invalid sequence names in the Original sequence file"
    #elif float(fasta_check['barcodes_detected']) > 0:
    #    raise ValueError, "There are barcode sequences found in the Original sequence file"
    elif float(fasta_check['duplicate_labels']) > 0:
        raise ValueError, "There are duplicate sequence names in the Original sequence file"
    elif float(fasta_check['invalid_seq_chars']) > 0:
        raise ValueError, "There are invalid nucleotides in the sequence Original file (i.e. not A,C,G,T or N)"
    #elif float(fasta_check['linkerprimers_detected']) > 0:
    #    raise ValueError, "There are linker primer sequences in the Original sequence file"
    #elif float(fasta_check['nosample_ids_map']) > 0.20:
    #    raise ValueError, "More than 20% of the samples in the mapping file do not have sequences"
    ''' 
    
    # parse the sequences
    sequences = MinimalFastaParser(open(input_fp, 'U'))
    
    # update fasta file with new DB SampleIDs and create new split-lib seqs 
    # file
    num=1
    for seq_name,seq in sequences:
        seq_name_arr=seq_name.split()
        updated_seq_name =sample_id_dict['_'.join(seq_name_arr[0].split('_')[:-1])] + \
                '_' + str(num) + ' ' + ' '.join(seq_name_arr[1:])
        num=num+1
        outf.write('>%s\n%s\n' % (updated_seq_name,seq))
    outf.close()
        
        
    # Return the fasta file paths
    return filenames
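
A small sketch (hypothetical names) of how the renaming above rewrites one fasta header:

sample_id_dict = {'PC.354': 'PC.354.630848'}   # hypothetical mapping-derived entry
seq_name = 'PC.354_42 length=250'              # hypothetical input fasta label
seq_name_arr = seq_name.split()
updated = sample_id_dict['_'.join(seq_name_arr[0].split('_')[:-1])] + \
    '_1 ' + ' '.join(seq_name_arr[1:])
print updated  # PC.354.630848_1 length=250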
Example n. 44
0
script_info[
    'output_description'] = """The output consists of many files (i.e. merged_table.biom, merged_table.log and all intermediate merge tables). The .biom file contains the result of merging the individual BIOM tables. The resulting .log file contains a list of parameters passed to this script along with the output location of the resulting .txt file, the dependency hierarchy and runtime information for each individual merge."""

script_info['required_options'] = [\
 make_option('-i','--input_fps',type='existing_filepaths',
             help='the otu tables in biom format (comma-separated)'),\
 make_option('-o','--output_dir',type='new_dirpath',
             help='the output otu table directory path')]

script_info['optional_options'] = [\
    make_option('-C','--cluster',action='store_true', default=False,
           help="Submit to a torque cluster"),
    make_option('-N','--merge_otus_fp',action='store',\
           type='existing_filepath',help='full path to '+\
           'scripts/merge_otu_tables.py [default: %default]',\
           default=join(get_qiime_scripts_dir(),'merge_otu_tables.py')),\
    options_lookup['python_exe_fp'],
    options_lookup['seconds_to_sleep'],
    options_lookup['job_prefix']]
script_info['version'] = __version__

RANDOM_JOB_PREFIX_CHARS = "abcdefghigklmnopqrstuvwxyz"
RANDOM_JOB_PREFIX_CHARS += RANDOM_JOB_PREFIX_CHARS.upper()
RANDOM_JOB_PREFIX_CHARS += "0123456790"

def get_random_job_prefix(fixed_prefix='',
                          max_job_prefix_len=10,
                          leading_trailing_underscores=True):
    """ Return a string to use as job prefix """

    length = max_job_prefix_len - len(fixed_prefix)
        help="write output rarefied otu tables here makes dir if it doesn't exist [REQUIRED]"),\
 make_option('-m', '--min', type=int,help='min seqs/sample [REQUIRED]'),\
 make_option('-x', '--max', type=int,\
                      help='max seqs/sample (inclusive) [REQUIRED]'),\
 make_option('-s', '--step', type=int,\
                      help='levels: min, min+step... for level <= max [REQUIRED]'),\
]
script_info['optional_options'] = [\
 make_option('-n', '--num-reps', dest='num_reps', default=10, type=int,
        help='num iterations at each seqs/sample level [default: %default]'),\
 make_option('--lineages_included', dest='lineages_included', default=False,
        action="store_true",
        help="""output rarefied otu tables will include taxonomic (lineage) information for each otu, if present in input otu table [default: %default]"""),
 make_option('-N','--single_rarefaction_fp',action='store',\
           type='string',help='full path to scripts/single_rarefaction.py [default: %default]',\
           default=join(get_qiime_scripts_dir(),'single_rarefaction.py')),\
 options_lookup['poller_fp'],\
 options_lookup['retain_temp_files'],\
 options_lookup['suppress_submit_jobs'],\
 options_lookup['poll_directly'],\
 options_lookup['cluster_jobs_fp'],\
 options_lookup['suppress_polling'],\
 options_lookup['job_prefix'],\
 options_lookup['python_exe_fp'],\
 options_lookup['seconds_to_sleep'],\
 options_lookup['jobs_to_start']
]

script_info['version'] = __version__

def main():
from qiime.workflow import print_commands,call_commands_serially,\
                           print_to_stdout, no_status_updates,generate_log_fp,\
                           get_params_str, WorkflowError,WorkflowLogger
from qiime.util import get_qiime_scripts_dir,create_dir,load_qiime_config,\
                       get_qiime_library_version
from cogent.util.misc import get_random_directory_name
from submit_job_to_qiime import submitQiimeJob
from qiime.filter import filter_samples_from_otu_table
from biom.table import SparseOTUTable, DenseOTUTable, table_factory,\
                       get_biom_format_version_string,get_biom_format_url_string
from json import dumps
from numpy import array

# get qiime config and qiime scripts directory
qiime_config = load_qiime_config()
script_dir = get_qiime_scripts_dir()

def combine_map_header_cols(combinecolorby, mapping):
    """Merge two or more mapping columns into one column"""
    
    # create an empty array the size of the mapping file
    combinedmapdata=array([''] * len(mapping), dtype='a1000')
    title=[]
    match=False
    # iterate over columns and see if the columns are supposed to be joined
    for p in range(len(combinecolorby)):                    
        for i in range(len(mapping[0])):
            if str(combinecolorby[p]) == str(mapping[0][i]):
                match=True
                for q in range(len(mapping)):
                    
Example n. 47
0
script_info[
    'output_description'] = """The output consists of many files (i.e. merged_table.biom, merged_table.log and all intermediate merge tables). The .biom file contains the result of merging the individual BIOM tables. The resulting .log file contains a list of parameters passed to this script along with the output location of the resulting .txt file, the dependency hierarchy and runtime information for each individual merge."""

script_info['required_options'] = [
    make_option('-i', '--input_fps', type='existing_filepaths',
                help='the otu tables in biom format (comma-separated)'),
    make_option('-o', '--output_dir', type='new_dirpath',
                help='the output otu table directory path')]

script_info['optional_options'] = [
    make_option('-C', '--cluster', action='store_true', default=False,
                help="Submit to a torque cluster"),
    make_option('-N', '--merge_otus_fp', action='store',
                type='existing_filepath', help='full path to ' +
                'scripts/merge_otu_tables.py [default: %default]',
                default=join(get_qiime_scripts_dir(), 'merge_otu_tables.py')),
    options_lookup['python_exe_fp'],
    options_lookup['seconds_to_sleep'],
    options_lookup['job_prefix']]
script_info['version'] = __version__

RANDOM_JOB_PREFIX_CHARS = "abcdefghigklmnopqrstuvwxyz"
RANDOM_JOB_PREFIX_CHARS += RANDOM_JOB_PREFIX_CHARS.upper()
RANDOM_JOB_PREFIX_CHARS += "0123456790"
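
A minimal sketch (not necessarily the original implementation) of drawing a random prefix from these characters:

from random import choice

def sketch_random_prefix(fixed_prefix='RDP', length=6):
    # hypothetical helper, for illustration only
    return fixed_prefix + '_' + ''.join(choice(RANDOM_JOB_PREFIX_CHARS)
                                        for _ in range(length)) + '_'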


def get_random_job_prefix(fixed_prefix='',
                          max_job_prefix_len=10,
                          leading_trailing_underscores=True):
    """ Return a string to use as job prefix """
        help='input path, must be directory [REQUIRED]'),\
 make_option('-o', '--output_path',
        help='output path, must be directory [REQUIRED]'),
]

script_info['optional_options'] = [
 make_option('-m', '--metrics', default='unweighted_unifrac,weighted_unifrac',
     help='Beta-diversity metric(s) to use. A comma-separated list should be' +\
     ' provided when multiple metrics are specified. [default: %default]'),
 make_option('-t', '--tree_path', default=None,
        help='path to newick tree file, required for phylogenetic metrics'+\
        ' [default: %default]'),\
 make_option('-N','--beta_diversity_fp',action='store',\
           type='string',help='full path to '+\
           'scripts/beta_diversity.py [default: %default]',\
           default=join(get_qiime_scripts_dir(),'beta_diversity.py')),\
 options_lookup['poller_fp'],\
 options_lookup['retain_temp_files'],\
 options_lookup['suppress_submit_jobs'],\
 options_lookup['poll_directly'],\
 options_lookup['cluster_jobs_fp'],\
 options_lookup['suppress_polling'],\
 options_lookup['job_prefix'],\
 options_lookup['python_exe_fp'],\
 options_lookup['seconds_to_sleep'],\
 options_lookup['jobs_to_start'],
 make_option('-f', '--full_tree', action="store_true",
     help='By default, each job calls _fast_unifrac_setup to remove unused parts of the tree. Pass -f if you already have a minimal tree, and this script will run faster'),

]
script_info['version'] = __version__
           'template alignment [default: %default]'),\
    
    make_option('-b','--blast_db',action='store',\
           type='string',help='database to blast against '+\
           '[default: %default]'),\
           
    make_option('--min_aligned_percent',
        help=('Minimum percent of query sequence that can be aligned '
              'to consider a hit (BLAST OTU picker only) [default: %default]'),
              default=0.50,type='float'),
          
    #Define parallel-script-specific parameters
    make_option('-N','--pick_otus_fp',action='store',\
           type='string',help='full path to '+\
           'scripts/pick_otus.py [default: %default]',\
           default=join(get_qiime_scripts_dir(),'pick_otus.py')),\
        
 options_lookup['jobs_to_start'],\
 options_lookup['poller_fp'],\
 options_lookup['retain_temp_files'],\
 options_lookup['suppress_submit_jobs'],\
 options_lookup['poll_directly'],\
 options_lookup['cluster_jobs_fp'],\
 options_lookup['suppress_polling'],\
 options_lookup['job_prefix'],\
 options_lookup['python_exe_fp'],\
 options_lookup['seconds_to_sleep']\
]

script_info['version'] = __version__
Example n. 50
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if (opts.suppress_unit_tests and \
       opts.suppress_script_tests and \
       opts.suppress_script_usage_tests):
        option_parser.error(
            "You're suppressing all three test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of QIIME's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qiime_system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    bad_scripts = []
    if not opts.suppress_script_tests:
        # Run through all of QIIME's scripts, and pass -h to each one. If the
        # resulting stdout does not begin with the Usage text, that is an
        # indicator that something is wrong with the script. Issues that can
        # cause this include bad import statements in the script, SyntaxErrors,
        # or other failures prior to running
        # qiime.util.parse_command_line_parameters.

        try:
            scripts_dir = get_qiime_scripts_dir()
            script_directory_found = True
        except AssertionError:
            script_directory_found = False

        if script_directory_found:
            script_names = glob('%s/*py' % scripts_dir)
            script_names.sort()

            for script_name in script_names:
                script_good_pattern = re.compile('^Usage: %s' %
                                                 split(script_name)[1])
                print "Testing %s." % script_name
                command = '%s %s -h' % (python_name, script_name)
                stdout, stderr, return_value = qiime_system_call(command)
                if not script_good_pattern.search(stdout):
                    bad_scripts.append(script_name)

    num_script_usage_example_failures = 0
    qiime_test_data_dir = qiime_config['qiime_test_data_dir']
    if not opts.suppress_script_usage_tests and qiime_test_data_dir is not None:
        # Run the script usage testing functionality
        script_usage_result_summary, num_script_usage_example_failures = \
            run_script_usage_tests(
                qiime_test_data_dir=qiime_test_data_dir,
                qiime_scripts_dir=qiime_config['qiime_scripts_dir'],
                working_dir=qiime_config['temp_dir'],
                verbose=True,
                tests=None,  # runs all tests
                failure_log_fp=None,
                force_overwrite=True)

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(
                bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
            "to missing external applications.\nDepending on the QIIME features "+\
            "you plan to use, this may not be critical.\n%s"\
             % '\n'.join(missing_application_tests)

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_tests:
        print "\nBasic script test result summary\n--------------------------------\n"
        if not script_directory_found:
            print "Critical error: Failed to test scripts because the script directory could not be found.\n The most likely explanation for this failure is that you've installed QIIME using setup.py, and forgot to specify the qiime_scripts_dir in your qiime_config file. This value shoud be set either to the directory you provided for --install-scripts, or /usr/local/bin if no value was provided to --install-scripts."
        else:
            if bad_scripts:
                print "Failed the following basic script tests.\n%s" % '\n'.join(
                    bad_scripts)
            else:
                print "All basic script tests passed successfully.\n"

    qiime_test_data_dir_exists = True
    if not opts.suppress_script_usage_tests:
        if qiime_test_data_dir:
            print "\nScript usage test result summary\n------------------------------------\n"
            print script_usage_result_summary
        else:
            print "\nCould not run script usage tests because qiime_test_data_dir is not defined in your qiime_config."
            qiime_test_data_dir_exists = False
        print ""

    # If any of the unit tests, script tests, or script usage tests fail, or if
    # we have any missing application errors or a missing QIIME test data dir
    # if script usage tests weren't suppressed, use return code 1 (as python's
    # unittest module does to indicate one or more failures).
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0
            and len(bad_scripts) == 0
            and num_script_usage_example_failures == 0
            and qiime_test_data_dir_exists):
        return_code = 0
    return return_code
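
# main() only returns a status code; presumably the surrounding script (not
# shown in this example) passes it to the interpreter on exit, e.g.:
#
#     if __name__ == "__main__":
#         exit(main())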