Example #1
 def setUp(self):
     
     self.files_to_remove = []
     self.dirs_to_remove = []
     
     # Create example output directory
     tmp_dir = get_qiime_temp_dir()
     self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                      prefix='qiime_parallel_tests_',
                                      suffix='',
                                      result_constructor=str)
     self.dirs_to_remove.append(self.test_out)
     create_dir(self.test_out)
     
     # Create example input file
     self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_inseqs',
                                         suffix='.fasta')
     inseqs1_f = open(self.inseqs1_fp,'w')
     inseqs1_f.write(inseqs1)
     inseqs1_f.close()
     self.files_to_remove.append(self.inseqs1_fp)
     
     # Define number of seconds a test can run for before timing out 
     # and failing
     initiate_timeout(60)
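
The setUp above registers its temporary paths in self.files_to_remove and self.dirs_to_remove, but the matching tearDown is not shown. A minimal sketch using only the standard library (QIIME's own tests use helper functions for this, so the real cleanup may differ):

 def tearDown(self):
     """ Clean up temporary files and directories created by setUp """
     import os
     import shutil
     # Remove registered temp files first, then directories.
     for fp in self.files_to_remove:
         if os.path.exists(fp):
             os.remove(fp)
     for d in self.dirs_to_remove:
         if os.path.exists(d):
             shutil.rmtree(d)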
Example #2
File: split.py Project: ranjit58/qiime
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=""):
    """ Split infile into files with seqs_per_file sequences in each
    
        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        outfile_prefix: string used to create output filepaths - output filepaths
         are <outfile_prefix>.<i>.fasta where i runs from 0 to the number of
         output files - 1
        working_dir: directory to prepend to temp filepaths (defaults to 
         empty string -- files written to cwd)
         
        List of output filepaths is returned.
    
    """
    seq_counter = 0
    out_files = []
    if working_dir:
        if not working_dir.endswith("/"):
            working_dir += "/"
        create_dir(working_dir)

    for seq_id, seq in MinimalFastaParser(infile):
        if seq_counter == 0:
            current_out_fp = "%s%s.%d.fasta" % (working_dir, outfile_prefix, len(out_files))
            current_out_file = open(current_out_fp, "w")
            out_files.append(current_out_fp)
        current_out_file.write(">%s\n%s\n" % (seq_id, seq))
        seq_counter += 1

        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0

    # Close the last file if it holds a partial batch of sequences.
    if seq_counter != 0:
        current_out_file.close()

    return out_files
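
A brief usage sketch for split_fasta ('seqs.fasta', the 'chunk' prefix, and the chunk size below are illustrative values, not part of the original code):

# Split seqs.fasta into files of at most 100 sequences each; the returned
# paths look like split_out/chunk.0.fasta, split_out/chunk.1.fasta, ...
in_f = open('seqs.fasta')
chunk_fps = split_fasta(in_f, seqs_per_file=100, outfile_prefix='chunk',
                        working_dir='split_out')
in_f.close()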
Example #3
File: split.py Project: jb2263/qiime
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each
    
        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        outfile_prefix: string used to create output filepaths - output filepaths
         are <outfile_prefix>.<i>.fasta where i runs from 0 to the number of
         output files - 1
        working_dir: directory to prepend to temp filepaths (defaults to 
         empty string -- files written to cwd)
         
        List of output filepaths is returned.
    
    """
    seq_counter = 0
    out_files = []
    if working_dir:
        if not working_dir.endswith('/'):
            working_dir += '/'
        create_dir(working_dir)

    for seq_id, seq in MinimalFastaParser(infile):
        if seq_counter == 0:
            current_out_fp = '%s%s.%d.fasta' \
              % (working_dir,outfile_prefix,len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1

        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0

    # Close the last file if it holds a partial batch of sequences.
    if seq_counter != 0:
        current_out_file.close()

    return out_files
Example #4
File: utils.py Project: franny911/qiime
def write_checkpoint(current_key, ctr, cluster_mapping,
                     ids, bestscores, order, out_fp):
    """write intermediate results to checkpoint file

    current_key: the identifier of the current denoiser round
    ctr: a uniq counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    order:  a list of ids, which defines the order of which flowgrams are clustered
    bestscores: a dict of
    """

    checkpoint_dir = out_fp + "/checkpoints/"
    if not exists(checkpoint_dir):
        create_dir(checkpoint_dir)
    out_fp = checkpoint_dir + "checkpoint%d.pickle" % ctr
    out_fh = open(out_fp, "wb")
    pickle.dump(
        (current_key,
         ctr,
         cluster_mapping,
         ids,
         bestscores,
         order),
        out_fh)

    return out_fp
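
A checkpoint written this way can be restored with pickle.load. The reader below is a hypothetical counterpart (these examples do not show how the denoiser actually reloads checkpoints):

import pickle

def read_checkpoint(checkpoint_fp):
    # Hypothetical counterpart to write_checkpoint: unpickle the stored
    # (current_key, ctr, cluster_mapping, ids, bestscores, order) tuple.
    checkpoint_fh = open(checkpoint_fp, "rb")
    data = pickle.load(checkpoint_fh)
    checkpoint_fh.close()
    return data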
Example #5
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(
            tmp_dir=tmp_dir,
            prefix='qiime_parallel_blaster_tests_',
            suffix='',
            result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.tmp_seq_filepath = get_tmp_filename(
            tmp_dir=self.test_out,
            prefix='qiime_parallel_blaster_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(blast_test_seqs)
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_blaster_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(blast_ref_seqs)
        self.reference_seqs_file.seek(0)

        initiate_timeout(60)
Example #6
File: denoiser.py Project: hillst/qiime
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp)

    if opts.checkpoint_fp:
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s' % bp_fp)

    # Peek into the sff.txt files to make sure they are parseable;
    # cat_sff_files is lazy and only reads the header.
    flowgrams, header = cat_sff_files(map(open, opts.sff_fp.split(',')))

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are mutually exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(pp_fp):
            parser.error('Specified preprocess directory does not exist: %s' % pp_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta'
                           % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: '
                         'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist' % opts.error_profile)

    if opts.output_dir:
        # Make sure the output dir path always ends with a slash.
        tmpoutdir = opts.output_dir + "/"
    else:
        # Make a random directory in the current working directory.
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_", suffix="/")

    create_dir(tmpoutdir, not opts.force)
    
    log_fp = 'denoiser.log'
    
    if opts.split:
        denoise_per_sample(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.cluster,
                           opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
                           opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter,
                           opts.titanium)
    else:
        denoise_seqs(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.preprocess_fp, opts.cluster,
                     opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer,
                     opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory,
                     opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium,
                     opts.checkpoint_fp)
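
Note that files_exist is called above both on a single path and on one comma-separated string of two paths. A plausible implementation consistent with that usage (an assumption; the helper's real definition is not shown in these examples):

from os.path import exists

def files_exist(fps):
    # Assumed behavior: accept a comma-separated string of file paths and
    # return True only if every listed path exists.
    return all(exists(fp) for fp in fps.split(','))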
Example #7
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_taxonomy_assigner_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.tmp_seq_filepath = get_tmp_filename(tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(blast_test_seqs.toFasta())
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',dir=tmp_dir)
        self.id_to_taxonomy_file.write(blast_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',dir=tmp_dir)
        self.reference_seqs_file.write(blast_reference_seqs.toFasta())
        self.reference_seqs_file.seek(0)

        initiate_timeout(60)
Example #8
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read
    
    for lane in lanes:
        read1_fps = glob('%s/s_%s_%d_*qseq.txt' % (input_dir,
                                                   lane.replace(',', ''),
                                                   read))
        # sort so results will be consistent across different runs (important
        # so amplicon and barcode read headers will match)
        read1_fps.sort()
        # Open the per-lane output once; opening it with 'w' inside the loop
        # below would truncate the file for every input qseq file.
        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        output_f = open(output_fp, 'w')
        for read1_fp in read1_fps:
            for record in iter_split_lines(open(read1_fp, 'U')):
                fastq_s = illumina_data_to_fastq(record,
                                                 number_of_bases=bases)
                output_f.write('%s\n' % fastq_s)
        output_f.close()
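
illumina_data_to_fastq turns one parsed qseq record into a FASTQ entry. A simplified, illustrative stand-in (not QIIME's actual implementation; it assumes the usual qseq layout, where qualities are Phred+64 and '.' marks an unknown base):

def illumina_data_to_fastq_sketch(record, number_of_bases=None):
    # record: (machine, run, lane, tile, x, y, index, read, seq, qual, ...)
    header = '%s_%s:%s:%s:%s:%s#%s/%s' % tuple(record[:8])
    sequence = record[8].replace('.', 'N')
    if number_of_bases is not None:
        sequence = sequence[:number_of_bases]
    # Convert Phred+64 qseq qualities to the Phred+33 FASTQ convention.
    quality = ''.join([chr(ord(c) - 31) for c in record[9][:len(sequence)]])
    return '@%s\n%s\n+\n%s' % (header, sequence, quality)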
Example #9
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """prepare qsub text files.

    command: list of commands

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submision scripts are written

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """

    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix + "_",
                                    suffix=".txt")
        out_fh = open(job_name, "w")

        out_fh.write(QSUB_TEXT % (walltime, ncpus, nodes, queue, job_prefix,
                                  keep_output, command))
        out_fh.close()
        filenames.append(job_name)
    return filenames
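
QSUB_TEXT is a module-level template whose definition is not shown; make_jobs fills it with (walltime, ncpus, nodes, queue, job_prefix, keep_output, command), in that order. An illustrative PBS-style template with the same seven slots might look like:

# Illustrative only; the real QSUB_TEXT in the source module may differ.
QSUB_TEXT = """#!/bin/bash
#PBS -l walltime=%s
#PBS -l ncpus=%d
#PBS -l nodes=%d
#PBS -q %s
#PBS -N %s
#PBS -k %s

cd $PBS_O_WORKDIR
%s
"""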
Example #10
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp)

    if opts.checkpoint_fp:
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s' % bp_fp)

    # Peek into the sff.txt files to make sure they are parseable;
    # cat_sff_files is lazy and only reads the header.
    flowgrams, header = cat_sff_files(map(open, opts.sff_fp.split(',')))

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are mutually exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(pp_fp):
            parser.error('Specified preprocess directory does not exist: %s' % pp_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta'
                           % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: '
                         'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist' % opts.error_profile)

    if opts.output_dir:
        # Make sure the output dir path always ends with a slash.
        tmpoutdir = opts.output_dir + "/"
    else:
        # Make a random directory in the current working directory.
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_", suffix="/")

    create_dir(tmpoutdir, not opts.force)
    
    log_fp = 'denoiser.log'
    
    if opts.split:
        denoise_per_sample(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.cluster,
                           opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
                           opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter,
                           opts.titanium)
    else:
        denoise_seqs(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.preprocess_fp, opts.cluster,
                     opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer,
                     opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory,
                     opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium,
                     opts.checkpoint_fp)
Example #11
File: utils.py Project: Bonder-MJ/qiime
def write_checkpoint(current_key, ctr, cluster_mapping,
                     ids, bestscores, order, out_fp):
    """write intermediate results to checkpoint file

    current_key: the identifier of the current denoiser round
    ctr: a uniq counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    order:  a list of ids, which defines the order of which flowgrams are clustered
    bestscores: a dict of
    """

    checkpoint_dir = out_fp + "/checkpoints/"
    if not exists(checkpoint_dir):
        create_dir(checkpoint_dir)
    out_fp = checkpoint_dir + "checkpoint%d.pickle" % ctr
    out_fh = open(out_fp, "wb")
    pickle.dump(
        (current_key,
         ctr,
         cluster_mapping,
         ids,
         bestscores,
         order),
        out_fh)

    return out_fp
Example #12
    def test_store_cluster(self):
        """store_clusters stores the centroid seqs for each cluster."""

        self.tmpdir = get_tmp_filename(tmp_dir="./", suffix="_store_clusters/")
        create_dir(self.tmpdir)

        self.files_to_remove.append(self.tmpdir+"singletons.fasta")
        self.files_to_remove.append(self.tmpdir+"centroids.fasta")

        #empty map results in empty files
        store_clusters({}, self.tiny_test, self.tmpdir)
        actual_centroids = list(MinimalFastaParser(open(self.tmpdir+"centroids.fasta")))
        self.assertEqual(actual_centroids, [])
        actual_singletons = list(MinimalFastaParser(open(self.tmpdir+"singletons.fasta")))
        self.assertEqual(actual_singletons, [])

        #non-empty map creates non-empty files, centroids sorted by size
        mapping = {'FZTHQMS01B8T1H':[],
                   'FZTHQMS01DE1KN':['FZTHQMS01EHAJG'],
                   'FZTHQMS01EHAJG':[1,2,3]} # content doesn't really matter
        
        centroids = [('FZTHQMS01EHAJG | cluster size: 4', 'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'),
                     ('FZTHQMS01DE1KN | cluster size: 2','CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA')]

        singletons= [('FZTHQMS01B8T1H', 'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAATTAAACCATGCGGTTTTATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCATCACTTA')]

        store_clusters(mapping, self.tiny_test, self.tmpdir)
        actual_centroids = list(MinimalFastaParser(open(self.tmpdir+"centroids.fasta")))
        self.assertEqual(actual_centroids, centroids)
        actual_singletons = list(MinimalFastaParser(open(self.tmpdir+"singletons.fasta")))
        self.assertEqual(actual_singletons,singletons)
Example #13
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.template_fp = get_tmp_filename(tmp_dir=self.test_out,
                                            prefix='qiime_template',
                                            suffix='.fasta')
        template_f = open(self.template_fp, 'w')
        template_f.write(pynast_test1_template_fasta)
        template_f.close()
        self.files_to_remove.append(self.template_fp)

        self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)

        initiate_timeout(60)
Example #14
 def setUp(self):
     
     self.files_to_remove = []
     self.dirs_to_remove = []
     
     # Create example output directory
     tmp_dir = get_qiime_temp_dir()
     self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                      prefix='qiime_parallel_tests_',
                                      suffix='',
                                      result_constructor=str)
     self.dirs_to_remove.append(self.test_out)
     create_dir(self.test_out)
     
     # Create example input file
     self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_inseqs',
                                         suffix='.fasta')
     inseqs1_f = open(self.inseqs1_fp,'w')
     inseqs1_f.write(inseqs1)
     inseqs1_f.close()
     self.files_to_remove.append(self.inseqs1_fp)
     
     # Define number of seconds a test can run for before timing out 
     # and failing
     initiate_timeout(60)
Example #15
 def setUp(self):
     """ """
     self.files_to_remove = []
     self.dirs_to_remove = []
     
     tmp_dir = get_qiime_temp_dir()
     self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                      prefix='qiime_parallel_tests_',
                                      suffix='',
                                      result_constructor=str)
     self.dirs_to_remove.append(self.test_out)
     create_dir(self.test_out)
     
     self.template_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_template',
                                         suffix='.fasta')
     template_f = open(self.template_fp,'w')
     template_f.write(pynast_test1_template_fasta)
     template_f.close()
     self.files_to_remove.append(self.template_fp)
     
     self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_inseqs',
                                         suffix='.fasta')
     inseqs1_f = open(self.inseqs1_fp,'w')
     inseqs1_f.write(inseqs1)
     inseqs1_f.close()
     self.files_to_remove.append(self.inseqs1_fp)
     
     initiate_timeout(60)
Example #16
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read
    ignore_pass_filter = opts.ignore_pass_filter

    for lane in lanes:
        read1_fps = sorted(
            glob('%s/s_%s_%d_*qseq.txt' %
                 (input_dir, lane.replace(',', ''), read)))
        # sort so results will be consistent across different runs (important
        # so amplicon and barcodes read headers will match)
        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        output_f = open(output_fp, 'w')
        for read1_fp in read1_fps:
            for record in iter_split_lines(open(read1_fp, 'U')):
                fastq_s, pass_filter = illumina_data_to_fastq(
                    record, number_of_bases=bases)
                if ignore_pass_filter or pass_filter != 0:
                    output_f.write('%s\n' % fastq_s)
        output_f.close()
Example #17
File: test_utils.py Project: Jorge-C/qiime
    def test_store_cluster(self):
        """store_clusters stores the centroid seqs for each cluster."""

        self.tmpdir = get_tmp_filename(tmp_dir="./", suffix="_store_clusters/")
        create_dir(self.tmpdir)

        self.files_to_remove.append(self.tmpdir+"singletons.fasta")
        self.files_to_remove.append(self.tmpdir+"centroids.fasta")

        #empty map results in empty files
        store_clusters({}, self.tiny_test, self.tmpdir)
        actual_centroids = list(MinimalFastaParser(open(self.tmpdir+"centroids.fasta")))
        self.assertEqual(actual_centroids, [])
        actual_singletons = list(MinimalFastaParser(open(self.tmpdir+"singletons.fasta")))
        self.assertEqual(actual_singletons, [])

        #non-empty map creates non-empty files, centroids sorted by size
        mapping = {'FZTHQMS01B8T1H':[],
                   'FZTHQMS01DE1KN':['FZTHQMS01EHAJG'],
                   'FZTHQMS01EHAJG':[1,2,3]} # content doesn't really matter
        
        centroids = [('FZTHQMS01EHAJG | cluster size: 4', 'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'),
                     ('FZTHQMS01DE1KN | cluster size: 2','CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA')]

        singletons= [('FZTHQMS01B8T1H', 'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAATTAAACCATGCGGTTTTATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCATCACTTA')]

        store_clusters(mapping, self.tiny_test, self.tmpdir)
        actual_centroids = list(MinimalFastaParser(open(self.tmpdir+"centroids.fasta")))
        self.assertEqual(actual_centroids, centroids)
        actual_singletons = list(MinimalFastaParser(open(self.tmpdir+"singletons.fasta")))
        self.assertEqual(actual_singletons,singletons)
Example #18
    def setUp(self):

        
        self.files_to_remove = []
        self.dirs_to_remove = []
        
        # Create example output directory
        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='core_qiime_analyses_test_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)
        
        # Get input data
        self.test_data = get_test_data_fps()
        
        self.qiime_config = load_qiime_config()
        self.qiime_config['jobs_to_start'] = 2
        self.qiime_config['seconds_to_sleep'] = 1
        
        # Suppress stderr during tests: one of the system calls in the
        # workflow prints a warning, and we can't suppress that warning with
        # warnings.filterwarnings here because it comes from within the code
        # executed through the system call. Found this trick here:
        # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python
        self.saved_stderr = sys.stderr
        sys.stderr = StringIO()
        
        # Define number of seconds a test can run for before timing out 
        # and failing
        initiate_timeout(420)
Example #19
    def setUp(self):

        
        self.files_to_remove = []
        self.dirs_to_remove = []
        
        # Create example output directory
        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='core_qiime_analyses_test_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)
        
        # Get input data
        self.test_data = get_test_data_fps()
        
        self.qiime_config = load_qiime_config()
        self.qiime_config['jobs_to_start'] = 2
        self.qiime_config['seconds_to_sleep'] = 1
        
        # Suppress stderr during tests: one of the system calls in the
        # workflow prints a warning, and we can't suppress that warning with
        # warnings.filterwarnings here because it comes from within the code
        # executed through the system call. Found this trick here:
        # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python
        self.saved_stderr = sys.stderr
        sys.stderr = StringIO()
        
        # Define number of seconds a test can run for before timing out 
        # and failing
        initiate_timeout(420)
Example #20
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error("To use --attempt_read_reorientation, one must "
                                "supply a mapping file that contains both LinkerPrimerSequence "
                                "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error("To use input_type of barcode_paired_end, "
                                "a second fastq file must be specified with --fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1, opts.rev_comp_bc2,
                     opts.char_delineator, opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
Example #21
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """prepare qsub text files.

    command: list of commands

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submision scripts are written

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """

    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        fd, job_name = mkstemp(dir=jobs_dir, prefix=job_prefix + "_",
                              suffix=".txt")
        close(fd)
        out_fh = open(job_name, "w")

        out_fh.write(QSUB_TEXT % (walltime, ncpus, nodes, queue, job_prefix,
                                  keep_output, command))
        out_fh.close()
        filenames.append(job_name)
    return filenames
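
A usage sketch (the command strings and queue name below are illustrative): each command becomes its own qsub script under jobs_dir, and the returned paths can then be submitted with qsub:

commands = ['pick_otus.py -i seqs.0.fasta -o otus_0',
            'pick_otus.py -i seqs.1.fasta -o otus_1']
job_fps = make_jobs(commands, job_prefix='otus', queue='friendlyq')
# Each entry in job_fps is a job script wrapping one command.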
Example #22
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode,
        correlation_type=opts.correlation_type,
        tail_type=opts.tail_type,
        num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the same,
    # append a number to each filename.
    same_filenames = False
    if basename(opts.taxa_summary_fps[0]) == \
       basename(opts.taxa_summary_fps[1]):
        same_filenames = True

    for orig_ts_fp, filled_ts_lines, file_num in zip(opts.taxa_summary_fps,
                                                     results[:2], range(0, 2)):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        filled_ts_f = open(join(opts.output_dir, filled_ts_fp), 'w')
        filled_ts_f.write(filled_ts_lines)
        filled_ts_f.close()

    # Write the overall comparison result.
    overall_comp_f = open(join(opts.output_dir, 'overall_comparison.txt'), 'w')
    overall_comp_f.write(results[2])
    overall_comp_f.close()

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        corr_vec_f = open(join(opts.output_dir, 'detailed_comparisons.txt'),
                          'w')
        corr_vec_f.write(results[3])
        corr_vec_f.close()
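
add_filename_suffix returns the input file's basename with a suffix inserted before the extension (which is why the result is joined onto opts.output_dir above). A minimal sketch consistent with that usage (the real helper lives elsewhere in the codebase and may differ):

from os.path import basename, splitext

def add_filename_suffix(fp, suffix):
    # e.g. ('dir/ts1.txt', '_sorted_and_filled') -> 'ts1_sorted_and_filled.txt'
    root, ext = splitext(basename(fp))
    return root + suffix + ext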
Example #23
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")
    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict(
            [(k, v[0]) for k, v in
             fields_to_dict(open(opts.sample_id_map_fp, "U")).items()])
    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, 'U')) for dm_fp in input_dm_fps]

    if opts.method == 'mantel':
        output_f = open(path.join(opts.output_dir, 'mantel_results.txt'), 'w')
        output_f.write(
            run_mantel_test('mantel',
                            input_dm_fps,
                            distmats,
                            opts.num_permutations,
                            opts.tail_type,
                            comment_mantel_pmantel,
                            sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        output_f = open(
            path.join(opts.output_dir, 'partial_mantel_results.txt'), 'w')
        output_f.write(
            run_mantel_test('partial_mantel',
                            input_dm_fps,
                            distmats,
                            opts.num_permutations,
                            opts.tail_type,
                            comment_mantel_pmantel,
                            control_dm_fp=opts.control_dm,
                            control_dm=parse_distmat(open(
                                opts.control_dm, 'U')),
                            sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        output_f = open(
            path.join(opts.output_dir, 'mantel_correlogram_results.txt'), 'w')
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps,
            distmats,
            opts.num_permutations,
            comment_corr,
            opts.alpha,
            sample_id_map=sample_id_map,
            variable_size_distance_classes=opts.variable_size_distance_classes)

        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type),
                         format=opts.image_type)
    output_f.close()
Example #24
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode, correlation_type=opts.correlation_type,
        tail_type=opts.tail_type, num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the same,
    # append a number to each filename.
    same_filenames = False
    if basename(opts.taxa_summary_fps[0]) == \
       basename(opts.taxa_summary_fps[1]):
        same_filenames = True

    for orig_ts_fp, filled_ts_lines, file_num in zip(opts.taxa_summary_fps,
                                                     results[:2], range(0, 2)):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        filled_ts_f = open(join(opts.output_dir, filled_ts_fp), 'w')
        filled_ts_f.write(filled_ts_lines)
        filled_ts_f.close()

    # Write the overall comparison result.
    overall_comp_f = open(join(opts.output_dir, 'overall_comparison.txt'), 'w')
    overall_comp_f.write(results[2])
    overall_comp_f.close()

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        corr_vec_f = open(join(opts.output_dir,
                               'detailed_comparisons.txt'), 'w')
        corr_vec_f.write(results[3])
        corr_vec_f.close()
Example #25
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    out_dir = opts.output_dir
    try:
        create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    map_f = open(opts.mapping_fp, 'U')
    dm_f = open(opts.distance_matrix_fp, 'U')

    fields = map(strip, opts.fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    color_individual_within_by_field = opts.color_individual_within_by_field
    results = make_distance_boxplots(dm_f, map_f, fields, width=opts.width,
            height=opts.height, suppress_all_within=opts.suppress_all_within,
            suppress_all_between=opts.suppress_all_between,
            suppress_individual_within=opts.suppress_individual_within,
            suppress_individual_between=opts.suppress_individual_between,
            y_min=opts.y_min, y_max=opts.y_max,
            whisker_length=opts.whisker_length, box_width=opts.box_width,
            box_color=opts.box_color,
            color_individual_within_by_field=color_individual_within_by_field,
            sort=opts.sort)

    for field, plot_figure, plot_data, plot_labels, plot_colors in results:
        output_plot_fp = join(out_dir, "%s_Distances.%s" %
                              (field, opts.imagetype))
        plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                            transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            sig_tests_f = open(join(out_dir, "%s_Stats.txt" % field), 'w')
            sig_tests_results = all_pairs_t_test(plot_labels, plot_data,
                    tail_type=opts.tail_type,
                    num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)
            sig_tests_f.close()

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(out_dir, "%s_Distances.txt" % field)
            raw_data_f = open(raw_data_fp, 'w')

            for label, data in zip(plot_labels, plot_data):
                raw_data_f.write(label.replace(" ", "_") + "\t")
                raw_data_f.write("\t".join(map(str, data)))
                raw_data_f.write("\n")
            raw_data_f.close()
Example #26
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    out_dir = opts.output_dir
    try:
        create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    map_f = open(opts.mapping_fp, 'U')
    dm_f = open(opts.distance_matrix_fp, 'U')

    fields = map(strip, opts.fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    color_individual_within_by_field = opts.color_individual_within_by_field
    results = make_distance_boxplots(dm_f, map_f, fields, width=opts.width,
            height=opts.height, suppress_all_within=opts.suppress_all_within,
            suppress_all_between=opts.suppress_all_between,
            suppress_individual_within=opts.suppress_individual_within,
            suppress_individual_between=opts.suppress_individual_between,
            y_min=opts.y_min, y_max=opts.y_max,
            whisker_length=opts.whisker_length, box_width=opts.box_width,
            box_color=opts.box_color,
            color_individual_within_by_field=color_individual_within_by_field,
            sort=opts.sort)

    for field, plot_figure, plot_data, plot_labels, plot_colors in results:
        output_plot_fp = join(out_dir, "%s_Distances.%s" %
                              (field, opts.imagetype))
        plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                            transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            sig_tests_f = open(join(out_dir, "%s_Stats.txt" % field), 'w')
            sig_tests_results = all_pairs_t_test(plot_labels, plot_data,
                    tail_type=opts.tail_type,
                    num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)
            sig_tests_f.close()

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(out_dir, "%s_Distances.txt" % field)
            raw_data_f = open(raw_data_fp, 'w')

            for label, data in zip(plot_labels, plot_data):
                raw_data_f.write(label.replace(" ", "_") + "\t")
                raw_data_f.write("\t".join(map(str, data)))
                raw_data_f.write("\n")
            raw_data_f.close()
Example #27
    def setUp(self):
        """
        """
        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_tests_',
                                         suffix='',
                                         result_constructor=str)
        create_dir(self.test_out)
        self.dirs_to_remove = [self.test_out]

        self.output_fp = join(self.test_out, 'fmap.txt')
        self.failure_fp = join(self.test_out, 'fail.txt')
        self.usearch_fp = join(self.test_out, 'out.uc')
        self.bl6_fp = join(self.test_out, 'out.bl6')
        self.log_fp = join(self.test_out, 'fmap.log')
        self.files_to_remove = [
            self.output_fp, self.failure_fp, self.usearch_fp, self.log_fp,
            self.bl6_fp
        ]

        self.refseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                            prefix='qiime_refseqs',
                                            suffix='.fasta')
        refseqs1_f = open(self.refseqs1_fp, 'w')
        refseqs1_f.write(refseqs1)
        refseqs1_f.close()
        self.files_to_remove.append(self.refseqs1_fp)

        self.refseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                            prefix='qiime_refseqs',
                                            suffix='.fasta')
        refseqs2_f = open(self.refseqs2_fp, 'w')
        refseqs2_f.write(refseqs2)
        refseqs2_f.close()
        self.files_to_remove.append(self.refseqs2_fp)

        self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)
        self.inseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs2_f = open(self.inseqs2_fp, 'w')
        inseqs2_f.write(inseqs2)
        inseqs2_f.close()
        self.files_to_remove.append(self.inseqs2_fp)
        initiate_timeout(60)
Example #28
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory " "specified with the -o option.")
    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0]) for k, v in fields_to_dict(open(opts.sample_id_map_fp, "U")).items()])
    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, "U")) for dm_fp in input_dm_fps]

    if opts.method == "mantel":
        output_f = open(path.join(opts.output_dir, "mantel_results.txt"), "w")
        output_f.write(
            run_mantel_test(
                "mantel",
                input_dm_fps,
                distmats,
                opts.num_permutations,
                opts.tail_type,
                comment_mantel_pmantel,
                sample_id_map=sample_id_map,
            )
        )
    elif opts.method == "partial_mantel":
        output_f = open(path.join(opts.output_dir, "partial_mantel_results.txt"), "w")
        output_f.write(
            run_mantel_test(
                "partial_mantel",
                input_dm_fps,
                distmats,
                opts.num_permutations,
                opts.tail_type,
                comment_mantel_pmantel,
                control_dm_fp=opts.control_dm,
                control_dm=parse_distmat(open(opts.control_dm, "U")),
                sample_id_map=sample_id_map,
            )
        )
    elif opts.method == "mantel_corr":
        output_f = open(path.join(opts.output_dir, "mantel_correlogram_results.txt"), "w")
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps, distmats, opts.num_permutations, comment_corr, opts.alpha, sample_id_map=sample_id_map
        )
        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type), format=opts.image_type)
    output_f.close()
Example #29
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c
    
    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:              
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open
        sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename)
        sequence_output_f = open(sequence_output_fp, 'w')
        barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir, input_basename)
        barcode_output_f = open(barcode_output_fp, 'w')
        for line in open_f(input_fp):
            common_fields, sequence, sequence_qual, barcode, barcode_qual =\
             iseq_to_qseq_fields(line, barcode_in_header, barcode_length, barcode_qual_c)
            
            sequence_s, pass_filter_s = illumina_data_to_fastq((common_fields[0],
                                                              common_fields[1],
                                                              common_fields[2],
                                                              common_fields[3],
                                                              common_fields[4],
                                                              common_fields[5],
                                                              common_fields[6],
                                                              common_fields[7],
                                                              sequence,
                                                              sequence_qual))
            
            barcode_s, pass_filter_b = illumina_data_to_fastq((common_fields[0],
                                                             common_fields[1],
                                                             common_fields[2],
                                                             common_fields[3],
                                                             common_fields[4],
                                                             common_fields[5],
                                                             common_fields[6],
                                                             common_fields[7],
                                                             barcode,
                                                             barcode_qual),barcode_length)
            if pass_filter_s != 0:
                sequence_output_f.write('%s\n' % sequence_s)
                barcode_output_f.write('%s\n' % barcode_s)
        sequence_output_f.close()
        barcode_output_f.close()
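
gzip_open above is used interchangeably with the builtin open for '.gz' inputs. A minimal stand-in (an assumption; the real helper may handle modes differently) simply wraps the standard library:

import gzip

def gzip_open(fp):
    # Assumed behavior: open a gzipped text file for reading, mirroring
    # the builtin open(fp) used for uncompressed inputs.
    return gzip.open(fp, 'rb')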
Example #30
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c
    
    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:              
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open
        sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename)
        sequence_output_f = open(sequence_output_fp, 'w')
        barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir, input_basename)
        barcode_output_f = open(barcode_output_fp, 'w')
        for line in open_f(input_fp):
            common_fields, sequence, sequence_qual, barcode, barcode_qual =\
             iseq_to_qseq_fields(line, barcode_in_header, barcode_length, barcode_qual_c)
            
            sequence_s, pass_filter_s = illumina_data_to_fastq((common_fields[0],
                                                              common_fields[1],
                                                              common_fields[2],
                                                              common_fields[3],
                                                              common_fields[4],
                                                              common_fields[5],
                                                              common_fields[6],
                                                              common_fields[7],
                                                              sequence,
                                                              sequence_qual))
            
            barcode_s, pass_filter_b = illumina_data_to_fastq((common_fields[0],
                                                             common_fields[1],
                                                             common_fields[2],
                                                             common_fields[3],
                                                             common_fields[4],
                                                             common_fields[5],
                                                             common_fields[6],
                                                             common_fields[7],
                                                             barcode,
                                                             barcode_qual),barcode_length)
            if pass_filter_s != 0:
                sequence_output_f.write('%s\n' % sequence_s)
                barcode_output_f.write('%s\n' % barcode_s)
        sequence_output_f.close()
        barcode_output_f.close()
Example #31
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="06:00:00", nodes=1, ncpus=16, mem=16, keep_output="oe"):
    """prepare qsub text files.
    
    command: list of commands
    
    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to
    
    jobs_dir: path to directory where job submision scripts are written

    walltime: the maximal walltime 
    
    ncpus: number of cpus
    
    nodes: number of nodes
    
    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """

    filenames = []
    create_dir(jobs_dir)
    job_list_name = get_tmp_filename(tmp_dir=jobs_dir,
                                     prefix=job_prefix + "_joblist_",
                                     suffix=".txt")

    job_log_name = get_tmp_filename(tmp_dir=jobs_dir,
                                    prefix=job_prefix + "_parallel_job_log",
                                    suffix=".txt")

    out_fh_list = open(job_list_name, "w")

    # All commands except the last go into the job list file; the last
    # command is passed to the job script template itself.
    for command in commands[:-1]:
        out_fh_list.write(command + "\n")

    out_fh_list.close()

    job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix + "_",
                                suffix=".txt")

    out_fh = open(job_name, "w")

    # num_nodes = int(math.ceil((len(commands)-1)/8.0))
    # If you use the lab queue, change num_nodes and ncpus as follows:
    num_nodes = 1
    ncpus = len(commands) - 1

    out_fh.write(QSUB_TEXT % (walltime, num_nodes, ncpus, mem, queue,
                              job_prefix, keep_output, job_list_name,
                              len(commands) - 1, job_log_name, commands[-1]))

    out_fh.close()

    filenames.append(job_name)

    return filenames
Example #32
def make_sge_jobs(commands,
                  job_prefix,
                  queue,
                  jobs_dir="jobs/",
                  num_jobs=100,
                  max_hours_per_job=24):
    """prepare qsub text files.

    command: list of commands

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submision scripts are written

    max_hours_per_job: the maximum expected time for each command (this will be multiplied by number of commands per job to get a 'walltime'

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """

    filenames = []
    create_dir(jobs_dir)

    # Calculate the number of commands to put in each job.
    num_commands_per_job = int(ceil(len(commands) / float(num_jobs)))

    # Calculate the walltime (time before the job will be killed by the
    # scheduler if it is still running).
    total_time = max_hours_per_job * num_commands_per_job
    walltime = "{0}:00:00".format(total_time)

    for command_group in grouper(commands, num_commands_per_job, ''):
        job_name = get_tmp_filename(tmp_dir=jobs_dir,
                                    prefix=job_prefix + "_",
                                    suffix=".txt")
        out_fh = open(job_name, "w")

        stderr_fp = job_name + "_stderr"
        stdout_fp = job_name + "_stdout"
        out_fh.write(
            SGE_QSUB_TEXT %
            (walltime, stderr_fp, stdout_fp, "\n".join(command_group)))
        out_fh.close()
        filenames.append(job_name)
    return filenames
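
grouper above batches the command list into fixed-size groups, padding the final group with the supplied fillvalue (here ''). Its definition is not shown; the classic itertools recipe, in Python 2 spelling, is one plausible implementation:

from itertools import izip_longest

def grouper(iterable, n, fillvalue=None):
    # grouper('ABCDEFG', 3, 'x') --> ('A','B','C'), ('D','E','F'), ('G','x','x')
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)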
Example #33
 def setUp(self):
     """
     """
     tmp_dir = get_qiime_temp_dir()
     self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                      prefix='qiime_parallel_tests_',
                                      suffix='',
                                      result_constructor=str)
     create_dir(self.test_out)
     self.dirs_to_remove = [self.test_out]
     
     self.output_fp = join(self.test_out,'fmap.txt')
     self.failure_fp = join(self.test_out,'fail.txt')
     self.usearch_fp = join(self.test_out,'out.uc')
     self.bl6_fp = join(self.test_out,'out.bl6')
     self.log_fp = join(self.test_out,'fmap.log')
     self.files_to_remove = [self.output_fp, self.failure_fp, 
                             self.usearch_fp, self.log_fp, self.bl6_fp]
     
     self.refseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_refseqs',
                                         suffix='.fasta')
     refseqs1_f = open(self.refseqs1_fp,'w')
     refseqs1_f.write(refseqs1)
     refseqs1_f.close()
     self.files_to_remove.append(self.refseqs1_fp)
     
     self.refseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_refseqs',
                                         suffix='.fasta')
     refseqs2_f = open(self.refseqs2_fp,'w')
     refseqs2_f.write(refseqs2)
     refseqs2_f.close()
     self.files_to_remove.append(self.refseqs2_fp)
     
     self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_inseqs',
                                         suffix='.fasta')
     inseqs1_f = open(self.inseqs1_fp,'w')
     inseqs1_f.write(inseqs1)
     inseqs1_f.close()
     self.files_to_remove.append(self.inseqs1_fp)
     self.inseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                         prefix='qiime_inseqs',
                                         suffix='.fasta')
     inseqs2_f = open(self.inseqs2_fp,'w')
     inseqs2_f.write(inseqs2)
     inseqs2_f.close()
     self.files_to_remove.append(self.inseqs2_fp)
     initiate_timeout(60)
def main(commandline_args=None):
    option_parser, opts, args = parse_command_line_parameters(**script_info)
 
    #check for missing files
    required_files = [opts.denoiser_map_file, opts.otu_picker_map_file,
                      opts.fasta_fp, opts.denoised_fasta_fp]
    if (not all(required_files) or not all(map(exists, required_files))):
        option_parser.error('Missing input files.')

    create_dir(opts.output_dir, fail_on_exist=False)
           
    combine_mappings(open(opts.fasta_fp), open(opts.denoiser_map_file),
                     open(opts.denoised_fasta_fp),
                     open(opts.otu_picker_map_file), opts.output_dir)
Example #35
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    out_dir = opts.output_dir
    categories = opts.categories.split(',')

    # Create the output dir if it doesn't already exist.
    try:
        if not exists(out_dir):
            create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory '%s' "
                            "specified with the -o option." % out_dir)

    compare_categories(opts.input_dm, opts.mapping_file, opts.method,
                       categories, opts.num_permutations, out_dir)
Example #37
0
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(
            tmp_dir=tmp_dir,
            prefix='qiime_parallel_taxonomy_assigner_tests_',
            suffix='',
            result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        # Temporary input file
        self.tmp_seq_filepath = get_tmp_filename(
            tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(rdp_test_seqs)
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',
            dir=tmp_dir)
        self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(rdp_reference_seqs)
        self.reference_seqs_file.seek(0)

        jar_fp = getenv('RDP_JAR_PATH')
        jar_basename = basename(jar_fp)
        if '2.2' not in jar_basename:
            raise ApplicationError(
                "RDP_JAR_PATH does not point to version 2.2 of the "
                "RDP Classifier.")

        initiate_timeout(60)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
      
    verbose = opts.verbose
    
    sequence_reads_fp = opts.sequence_reads_fp
    output_dir = opts.output_dir
    proportion_subsample = opts.proportion_subsample
    index_reads_fp = opts.index_reads_fp
    
    if proportion_subsample > 1 or proportion_subsample <= 0:
        raise ValueError('proportion_subsample must be in the interval (0, 1]')
    

    # if the output dir isn't specified, create one
    if not output_dir:
        input_file_basename, input_file_ext = \
         splitext(split(sequence_reads_fp)[1])
        output_dir = '%s_subsample/' % (input_file_basename)
    create_dir(output_dir)

    # prepare output files
    input_file_basename, input_file_ext = \
     splitext(split(sequence_reads_fp)[1])
    output_reads_fp = '%s_subsample%s' % (input_file_basename,
         input_file_ext)

    input_file_basename, input_file_ext = \
     splitext(split(index_reads_fp)[1])
    output_indexs_fp = '%s_subsample%s' % (input_file_basename,
         input_file_ext)

    # randomly subsample
    sequence_reads = open(sequence_reads_fp, "U")
    index_reads = open(index_reads_fp, "U")
    output_reads = open('%s/%s' %(output_dir,output_reads_fp), "w")
    output_indexs = open('%s/%s' %(output_dir,output_indexs_fp), "w")
    for (label, seq, qual), (labelI, seqI, qualI) in izip(
            MinimalFastqParser(sequence_reads, strict=False),
            MinimalFastqParser(index_reads, strict=False)):
        if random() < proportion_subsample:
            output_reads.write('@%s\n%s\n+\n%s\n' % (label, seq, qual))
            output_indexs.write('@%s\n%s\n+\n%s\n' % (labelI, seqI, qualI))
         
    sequence_reads.close()
    index_reads.close()
    output_reads.close()
    output_indexs.close()
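
The core of the script above is the paired subsampling loop: each (read, index) pair is kept with probability proportion_subsample, so the output size is only approximately proportional. A minimal, self-contained sketch of that idea (record contents are illustrative):

import random

def subsample_paired(records1, records2, proportion, rand=random.random):
    """Yield aligned record pairs, keeping each pair with probability
    `proportion`; the expected output size is proportion * input size."""
    for rec1, rec2 in zip(records1, records2):
        if rand() < proportion:
            yield rec1, rec2

reads = [('r1', 'ACGT', 'IIII'), ('r2', 'GGCC', 'JJJJ')]
index = [('r1', 'AAAA', 'IIII'), ('r2', 'CCCC', 'JJJJ')]
for read, idx in subsample_paired(reads, index, 0.5):
    print(read[0], idx[0])   # kept pairs stay aligned by construction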
Example #39
0
def make_sge_jobs(commands,
                  job_prefix,
                  queue,
                  jobs_dir="jobs/",
                  num_jobs=100,
                  max_hours_per_job=24):
    """Prepare SGE qsub text files.

    commands: list of commands

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submission scripts are written

    num_jobs: number of job scripts to distribute the commands across

    max_hours_per_job: the maximum expected time for each command (this will
                 be multiplied by the number of commands per job to get the
                 job's walltime)
    """

    filenames = []
    create_dir(jobs_dir)

    # calculate the number of commands to put in each job
    num_commands_per_job = int(ceil(len(commands) / float(num_jobs)))

    # calculate the walltime (time before the job is killed by the scheduler
    # if it is still running)
    total_time = max_hours_per_job * num_commands_per_job
    walltime = "{0}:00:00".format(total_time)

    for command_group in grouper(commands, num_commands_per_job, ''):
        job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix + "_",
                                    suffix=".txt")
        out_fh = open(job_name, "w")

        stderr_fp = job_name + "_stderr"
        stdout_fp = job_name + "_stdout"
        out_fh.write(SGE_QSUB_TEXT %
                     (walltime, stderr_fp, stdout_fp, "\n".join(command_group)))
        out_fh.close()
        filenames.append(job_name)
    return filenames
Example #40
0
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_taxonomy_assigner_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        # Temporary input file
        self.tmp_seq_filepath = get_tmp_filename(tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(rdp_test_seqs)
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt', dir=tmp_dir)
        self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta', dir=tmp_dir)
        self.reference_seqs_file.write(rdp_reference_seqs)
        self.reference_seqs_file.seek(0)

        jar_fp = getenv('RDP_JAR_PATH')
        jar_basename = basename(jar_fp)
        if '2.2' not in jar_basename:
            raise ApplicationError(
                "RDP_JAR_PATH does not point to version 2.2 of the "
                "RDP Classifier.")

        initiate_timeout(60)
Example #41
0
File: split.py Project: Bonder-MJ/qiime
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        out_fileprefix: string used to create output filepath - output filepaths
         are <out_prefix>.<i>.fasta where i runs from 0 to number of output files
        working_dir: directory to prepend to temp filepaths (defaults to
         empty string -- files written to cwd)

        List of output filepaths is returned.

    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    current_out_file = None
    if working_dir:
        if not working_dir.endswith('/'):
            working_dir += '/'
        create_dir(working_dir)

    for seq_id, seq in parse_fasta(infile):
        if seq_counter == 0:
            current_out_fp = '%s%s.%d.fasta' \
                % (working_dir, outfile_prefix, len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1

        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0

    if current_out_file is not None and not current_out_file.closed:
        current_out_file.close()

    return out_files
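
For reference, the splitting pattern used by split_fasta can be reproduced without the QIIME helpers (parse_fasta, create_dir); the sketch below substitutes a minimal stand-in FASTA parser and splits two sequences per file:

import os
import tempfile

def iter_fasta(lines):
    """Yield (seq_id, seq) from FASTA-formatted lines (a stand-in for the
    QIIME parse_fasta helper used above)."""
    seq_id, chunks = None, []
    for line in lines:
        line = line.strip()
        if line.startswith('>'):
            if seq_id is not None:
                yield seq_id, ''.join(chunks)
            seq_id, chunks = line[1:], []
        elif line:
            chunks.append(line)
    if seq_id is not None:
        yield seq_id, ''.join(chunks)

records = ['>s1', 'ACGT', '>s2', 'GGCC', '>s3', 'TTAA']
out_dir = tempfile.mkdtemp()
out_files, current, count = [], None, 0
for seq_id, seq in iter_fasta(records):
    if count == 0:
        current = open(os.path.join(out_dir,
                                    'split.%d.fasta' % len(out_files)), 'w')
        out_files.append(current.name)
    current.write('>%s\n%s\n' % (seq_id, seq))
    count += 1
    if count == 2:          # seqs_per_file == 2
        current.close()
        count = 0
if current is not None and not current.closed:
    current.close()
print(out_files)            # two files: 2 seqs + 1 seq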
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    output_dir = opts.output_dir
    try:
        create_dir(output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o/--output_dir option.")

    otu_table_fp = opts.otu_table_fp
    with open(otu_table_fp, 'U') as table_f:
        table = parse_biom_table(table_f)

    estimator = ObservationRichnessEstimator(table,
                                             Chao1MultinomialPointEstimator)
    results = estimator(opts.min, opts.max, opts.num_steps,
                        opts.confidence_level)

    out_fp = join(output_dir, 'estimates_table.txt')
    with open(out_fp, 'w') as out_f:
        results.toTable(out_f)
Example #44
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error(
                "To use --attempt_read_reorientation, one must "
                "supply a mapping file that contains both LinkerPrimerSequence "
                "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error(
                "To use input_type of barcode_paired_end, "
                "a second fastq file must be specified with --fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1,
                     opts.rev_comp_bc2, opts.char_delineator,
                     opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
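
extract_barcodes itself is a QIIME function; for the simplest case (a single fastq whose reads start with the barcode), its core operation reduces to slicing each read into a barcode part and a sequence part. A sketch under that assumption, with illustrative names and data:

def split_barcode(seq, qual, bc_len):
    """Split one read into (barcode seq/qual, remaining seq/qual)."""
    return (seq[:bc_len], qual[:bc_len]), (seq[bc_len:], qual[bc_len:])

(bc_seq, bc_qual), (read_seq, read_qual) = split_barcode('ACGTTTTTGGGG',
                                                         'IIIIJJJJKKKK', 4)
print(bc_seq, read_seq)   # ACGT TTTTGGGG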
Example #45
0
    def _get_job_commands(
        self,
        input_fp,
        output_dir,
        params,
        job_prefix,
        working_dir,
        command_prefix="/bin/bash; ",
        command_suffix="; exit",
    ):
        """Generate beta diversity to split single OTU table to multiple jobs
    
        full_tree=True is faster: beta_diversity.py -f will make things
        go faster, but be sure you already have the correct minimal tree.
        """
        commands = []
        result_filepaths = []

        sids = parse_biom_table(open(input_fp, "U")).SampleIds

        if params["full_tree"]:
            full_tree_str = "-f"
        else:
            full_tree_str = ""

        if params["tree_path"]:
            tree_str = "-t %s" % params["tree_path"]
        else:
            tree_str = ""

        metrics = params["metrics"]

        # this is a little bit of an abuse of _merge_to_n_commands, so may
        # be worth generalizing that method - this determines the correct
        # number of samples to process in each command
        sample_id_groups = self._merge_to_n_commands(
            sids, params["jobs_to_start"], delimiter=",", command_prefix="", command_suffix=""
        )

        for i, sample_id_group in enumerate(sample_id_groups):
            working_dir_i = join(working_dir, str(i))
            create_dir(working_dir_i)
            output_dir_i = join(output_dir, str(i))
            create_dir(output_dir_i)
            input_dir, input_fn = split(input_fp)
            input_basename, input_ext = splitext(input_fn)
            sample_id_desc = sample_id_group.replace(",", "_")
            output_fns = ["%s_%s.txt" % (metric, input_basename) for metric in metrics.split(",")]
            rename_command, current_result_filepaths = self._get_rename_command(output_fns, working_dir_i, output_dir_i)

            result_filepaths += current_result_filepaths

            bdiv_command = "%s -i %s -o %s %s -m %s %s -r %s" % (
                self._script_name,
                input_fp,
                working_dir_i,
                tree_str,
                params["metrics"],
                full_tree_str,
                sample_id_group,
            )

            shell_script_fp = "%s/%s%d.sh" % (working_dir_i, job_prefix, i)
            shell_script_commands = [bdiv_command] + rename_command.split(";")
            self._commands_to_shell_script(shell_script_commands, shell_script_fp)
            commands.append("bash %s" % shell_script_fp)

        return commands, result_filepaths
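
_merge_to_n_commands is not shown in this listing; as used above it simply partitions the sample IDs into at most jobs_to_start comma-delimited groups. A rough stand-in illustrating that behavior (it assumes a non-empty list):

import math

def merge_to_n_groups(items, n, delimiter=','):
    """Partition `items` into at most `n` roughly equal groups and join each
    with `delimiter` (an approximation of _merge_to_n_commands above)."""
    per_group = int(math.ceil(len(items) / float(n)))
    return [delimiter.join(items[i:i + per_group])
            for i in range(0, len(items), per_group)]

print(merge_to_n_groups(['s1', 's2', 's3', 's4', 's5'], 2))
# ['s1,s2,s3', 's4,s5']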
Example #46
0
def iterative_pick_subsampled_open_reference_otus(
                              input_fps, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    # if the user has not passed a different reference collection for the pre-filter,
    # use the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp
    
    otu_table_fps = []
    repset_fasta_fps = []
    for i,input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir,i)
        if iteration_output_exists(iteration_output_dir,min_otu_size):
            # if the output from an iteration already exists, skip that 
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger,[input_fp,refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i,input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                     refseqs_fp=refseqs_fp,
                                     output_dir=iteration_output_dir,
                                     percent_subsample=percent_subsample,
                                     new_ref_set_id='.'.join([new_ref_set_id,str(i)]),
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     run_assign_tax=False,
                                     run_align_and_tree=False,
                                     prefilter_refseqs_fp=prefilter_refseqs_fp,
                                     prefilter_percent_id=prefilter_percent_id,
                                     min_otu_size=min_otu_size,
                                     step1_otu_map_fp=step1_otu_map_fp,
                                     step1_failures_fasta_fp=step1_failures_fasta_fp,
                                     parallel=parallel,
                                     suppress_step4=suppress_step4,
                                     logger=logger,
                                     suppress_md5=suppress_md5,
                                     denovo_otu_picking_method=denovo_otu_picking_method,
                                     reference_otu_picking_method=reference_otu_picking_method,
                                     status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir,min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)
    
    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)        
        commands.append([("Merge OTU tables",merge_cmd)])
    
    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,final_repset_fp)
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
    
    logger.close()
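
The key design point of this iterative workflow is the feed-forward of reference sets: each iteration's new_refseqs.fna becomes the refseqs_fp for the next iteration, so later runs recruit to OTUs discovered earlier. Stripped of all the bookkeeping, the loop looks like this (paths are hypothetical and the workflow call is stubbed out):

refseqs_fp = 'initial_refseqs.fna'                 # hypothetical path
for i, input_fp in enumerate(['run_1.fna', 'run_2.fna', 'run_3.fna']):
    iteration_output_dir = 'out/%d/' % i
    # pick_subsampled_open_reference_otus(input_fp, refseqs_fp,
    #                                     iteration_output_dir, ...)
    refseqs_fp = '%snew_refseqs.fna' % iteration_output_dir   # feed forward
    print(i, input_fp, '->', refseqs_fp)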
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir
    
    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase "
                            "--num_fraction_for_core_steps.")
    fractions_for_core = linspace(opts.min_fraction_for_core,
                                  opts.max_fraction_for_core,
                                  opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp
    
    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(open(mapping_fp,'U'),
                                                          valid_states)
        if len(sample_ids) < 1:
            option_parser.error(\
             "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %\
             valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the 
        # samples to work with
        sample_ids = None
    
    input_table = parse_biom_table(open(input_fp,'U'))
    
    otu_counts = []
    summary_figure_fp = join(output_dir,'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used 
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)
        
        # prep output files
        output_fp = join(output_dir,'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir,'core_table_%s.biom' % fraction_for_core_str)
        output_f = open(output_fp,'w')
    
        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write("# No OTUs present in %s %% of samples." % fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue
    
        # write some header information to file
        if sample_ids is None:
            output_f.write("# Core OTUs across %s %% of samples.\n" % fraction_for_core_str)
        else:
            output_f.write(\
             "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %\
              (fraction_for_core_str, valid_states,' '.join(sample_ids)))
    
        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iterObservations():
            output_f.write('%s\t%s\n' % (id_,md[otu_md]))
            otu_count += 1
        output_f.close()
    
        # write the core biom table
        output_table_f = open(output_table_fp,'w')
        output_table_f.write(format_biom_table(core_table))
        output_table_f.close()
        
        # append the otu count to the list of counts
        otu_counts.append(otu_count)
    
    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core),max(fractions_for_core))
    ylim(0,max(otu_counts)+1)
    xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
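
filter_table_to_core is a QIIME helper; conceptually it keeps the observations (OTUs) present in at least fraction_for_core of the selected samples, raising TableException when none qualify. A self-contained sketch of that membership test on a plain dict-of-dicts table:

def core_ids(table, fraction):
    """table: {otu_id: {sample_id: count}}. Return the OTUs observed
    (count > 0) in at least `fraction` of all samples in the table."""
    samples = set()
    for counts in table.values():
        samples.update(counts)
    n_samples = len(samples)
    return [otu for otu, counts in table.items()
            if sum(1 for c in counts.values() if c > 0) >= fraction * n_samples]

table = {'otu1': {'s1': 3, 's2': 1}, 'otu2': {'s1': 2, 's2': 0}}
print(core_ids(table, 1.0))   # ['otu1']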
Example #48
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_tax_align_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(
        logger,
        [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,
                                                   'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0,
            inf,
            0,
            inf,
            negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
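
filter_otus_from_otu_map (used in both workflow variants) writes a copy of the OTU map restricted to OTUs with at least min_otu_size member sequences and returns the surviving OTU ids, which the surrounding code then uses to filter the rep set. A standard-library approximation of that behavior:

def filter_otu_map(map_lines, min_otu_size):
    """Keep OTU-map lines (otu_id<TAB>seq_id<TAB>...) whose OTU has at
    least min_otu_size member sequences; return (kept_lines, kept_otu_ids)."""
    kept_lines, kept_ids = [], set()
    for line in map_lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) - 1 >= min_otu_size:
            kept_lines.append(line)
            kept_ids.add(fields[0])
    return kept_lines, kept_ids

lines = ['otu1\tseqA\tseqB\n', 'otu2\tseqC\n']
kept, ids = filter_otu_map(lines, 2)
print(ids)   # {'otu1'}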
Example #49
0
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))


            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API\n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', %f)"\n\n' % (abspath(step1_failures_fasta_fp),
                                                abspath(step2_input_fasta_fp),
                                                percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append(
        [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
 
    index_links.append(
        ('Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))
    

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp, step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifiers '
                        'excluding OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', %d)"\n\n' % (abspath(otu_fp),
                                                abspath(otu_no_singletons_fp),
                                                min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the new reference' +
                 ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    
    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
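
The singleton filter used above (filter_otus_from_otu_map) operates on QIIME's tab-delimited OTU-map format, where each line holds an OTU identifier followed by the identifiers of its member sequences. The following is a minimal sketch of that behavior rather than QIIME's actual implementation; the name filter_otu_map_sketch is ours:

def filter_otu_map_sketch(otu_map_fp, output_fp, min_otu_size):
    # Each OTU-map line is: otu_id<TAB>seq_id1<TAB>seq_id2 ...
    # Keep an OTU only if it has at least min_otu_size member sequences,
    # and return the retained OTU ids (mirroring how otus_to_keep is used
    # above to filter the representative-sequence fasta files).
    otus_to_keep = set()
    with open(otu_map_fp, 'U') as in_f, open(output_fp, 'w') as out_f:
        for line in in_f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) - 1 >= min_otu_size:
                out_f.write(line)
                otus_to_keep.add(fields[0])
    return otus_to_keep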
Example #50
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants to
    # provide a smaller reference collection, or to use the input reference
    # collection when running in iterative mode (rather than an iteration's
    # new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', %f)"\n\n' %
                 (abspath(step1_failures_fasta_fp),
                  abspath(step2_input_fasta_fp), percent_subsample))

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', %d)"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,
                                                   'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from '
                 'step 2 and step 4 to the final representative set and the '
                 'new reference set (%s and %s respectively)\n\n' %
                 (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
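
step2_input_fasta_fp above comes from qiime.util.subsample_fasta, which retains each failure sequence with probability percent_subsample. A rough sketch of that behavior (assuming the MinimalFastaParser interface used in this example; subsample_fasta_sketch is our name, not QIIME's):

from random import random

def subsample_fasta_sketch(input_fp, output_fp, percent_subsample):
    # Retain each record with probability percent_subsample, so the output
    # holds roughly percent_subsample * N of the N input sequences.
    out_f = open(output_fp, 'w')
    for seq_id, seq in MinimalFastaParser(open(input_fp, 'U')):
        if random() < percent_subsample:
            out_f.write('>%s\n%s\n' % (seq_id, seq))
    out_f.close()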
Example #51
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir
    
    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = linspace(opts.min_fraction_for_core,
                                  opts.max_fraction_for_core,
                                  opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp
    
    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(open(mapping_fp,'U'),
                                                          valid_states)
        if len(sample_ids) < 1:
            option_parser.error(\
             "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %\
             valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the 
        # samples to work with
        sample_ids = None
    
    input_table = parse_biom_table(open(input_fp,'U'))
    
    otu_counts = []
    summary_figure_fp = join(output_dir,'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used 
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)
        
        # prep output files
        output_fp = join(output_dir,'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir,'core_table_%s.biom' % fraction_for_core_str)
        output_f = open(output_fp,'w')
    
        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write("# No OTUs present in %s %% of samples." % fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue
    
        # write some header information to file
        if sample_ids is None:
            output_f.write("# Core OTUs across %s %% of samples.\n" % fraction_for_core_str)
        else:
            output_f.write(\
             "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %\
              (fraction_for_core_str, valid_states,' '.join(sample_ids)))
    
        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iterObservations():
            output_f.write('%s\t%s\n' % (id_,
                                         get_first_metadata_entry(md[otu_md])))
            otu_count += 1
        output_f.close()
    
        # write the core biom table
        output_table_f = open(output_table_fp,'w')
        output_table_f.write(format_biom_table(core_table))
        output_table_f.close()
        
        # append the otu count to the list of counts
        otu_counts.append(otu_count)
    
    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core),max(fractions_for_core))
    ylim(0,max(otu_counts)+1)
    xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
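
filter_table_to_core above retains only OTUs observed in at least fraction_for_core of the selected samples. The test it applies reduces to something like the following sketch over a plain dict of per-sample counts (the data layout and function name are ours, not the BIOM table API):

def core_otu_ids_sketch(otu_to_sample_counts, sample_ids, fraction_for_core):
    # otu_to_sample_counts: {otu_id: {sample_id: count}}
    # An OTU is "core" if it is present (count > 0) in at least
    # fraction_for_core of the samples in sample_ids.
    threshold = fraction_for_core * len(sample_ids)
    return [otu_id for otu_id, counts in otu_to_sample_counts.items()
            if sum(1 for s in sample_ids if counts.get(s, 0) > 0) >= threshold]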
Example #52
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(
            set([
                len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)
            ])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                DNA.rc(k): v
                for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write(
                'Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
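
The qual_writer/fastq_writer definitions above illustrate a pattern worth noting: select the writer implementation once, before the demultiplexing loop, so the per-read path never re-tests store_qual_scores or store_demultiplexed_fastq. A stripped-down version of the same idea:

def make_fasta_writer(out_f):
    # Choose the writer once; the per-record loop then calls it
    # unconditionally, with no flag check on each iteration.
    if out_f is not None:
        def writer(header, seq):
            out_f.write('>%s\n%s\n' % (header, seq))
    else:
        def writer(header, seq):
            pass  # output disabled; keep the call site uniform
    return writer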
Example #53
0
def pick_subsampled_open_reference_otus(input_fp, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])
    
    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants to
    # provide a smaller reference collection, or to use the input reference
    # collection when running in iterative mode (rather than an iteration's
    # new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp
    
    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])
            
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])
            
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            
        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])
        
        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
    
    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])
    
    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])
    
    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])
    
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures', 
                          step3_filter_fasta_cmd)])
        
        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
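        # (Note: the newer revisions of this workflow shown above use '>'
        # rather than '>>' here and in the else branch below, so that
        # re-running with --force overwrites an existing merged map instead
        # of appending to it.)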
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',step4_rep_set_cmd)])
        
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])    
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                      'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])
    
    command_handler(commands,
        status_update_callback,
        logger=logger,
        close_logger_on_success=False)
    commands = []
    
    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)
    
    ## make the final representative seqs file and a new refseqs file that 
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be. 
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be 
    ## reads from this run so we don't hit issues building a tree using 
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write 
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()
    
    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
            
    if close_logger_on_success:
        logger.close()
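Both the taxonomy branch and the alignment/tree branch above use the same resume pattern: the final output counts as done only if it exists and is non-empty, and any leftovers from a partially completed run are cleared before rebuilding. A minimal standalone sketch of that guard (needs_rebuild and clear_partial_outputs are hypothetical names; clear_partial_outputs stands in for QIIME's remove_files(..., error_on_missing=False)):

from os import remove
from os.path import exists, getsize

def needs_rebuild(output_fp):
    # A missing or zero-byte file is treated as debris from a
    # partially completed run, so it triggers a rebuild.
    return not (exists(output_fp) and getsize(output_fp) > 0)

def clear_partial_outputs(filepaths):
    # Delete whatever exists; silently skip what doesn't.
    for fp in filepaths:
        if exists(fp):
            remove(fp)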
예제 #54
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        dist_matrix_header, dist_matrix = parse_distmat(
            open(opts.distance_matrix_fp, 'U'))
    except:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        mapping, mapping_header, mapping_comments = parse_mapping_file(
            open(opts.mapping_fp, 'U'))
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    fields = opts.fields
    if fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")
    fields = map(strip, fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error(
                "The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file." % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header,
                                          dist_matrix,
                                          mapping_header,
                                          mapping,
                                          field,
                                          within=True))
            plot_labels.append("All within %s" % field)
        if not opts.suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header,
                                          dist_matrix,
                                          mapping_header,
                                          mapping,
                                          field,
                                          within=False))
            plot_labels.append("All between %s" % field)
        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                                                 dist_matrix,
                                                 mapping_header,
                                                 mapping,
                                                 field,
                                                 within=True)
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                                                  dist_matrix,
                                                  mapping_header,
                                                  mapping,
                                                  field,
                                                  within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
                "of boxplot labels does not match the number of " +\
                "boxplots."
        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median.
                sorted_data = []
                for label, distribution in zip(plot_labels, plot_data):
                    sorted_data.append(
                        (label, distribution, median(distribution)))
                sorted_data.sort(key=itemgetter(2))
                plot_labels = []
                plot_data = []
                for label, distribution, median_value in sorted_data:
                    plot_labels.append(label)
                    plot_data.append(distribution)

            width = opts.width
            height = opts.height
            if width is None:
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(
                plot_data,
                x_tick_labels=plot_labels,
                title="%s Distances" % field,
                x_label="Grouping",
                y_label="Distance",
                x_tick_labels_orientation='vertical',
                y_min=y_min,
                y_max=y_max,
                whisker_length=opts.whisker_length,
                box_width=opts.box_width,
                box_color=opts.box_color,
                figure_width=width,
                figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp,
                                format=opts.imagetype,
                                transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            sig_tests_f = open(join(opts.output_dir, "%s_Stats.xls" % field),
                               'w')
            sig_tests_results = all_pairs_t_test(
                plot_labels,
                plot_data,
                tail_type=opts.tail_type,
                num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)
            sig_tests_f.close()

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            raw_data_f = open(raw_data_fp, 'w')

            for label, data in zip(plot_labels, plot_data):
                raw_data_f.write(label.replace(" ", "_") + "\t")
                raw_data_f.write("\t".join(map(str, data)))
                raw_data_f.write("\n")
            raw_data_f.close()
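The --sort branch above is a decorate-sort-undecorate pass over the distributions' medians. The same step as a self-contained helper (sort_by_median is a hypothetical name, and median is assumed to be numpy's):

from operator import itemgetter
from numpy import median

def sort_by_median(plot_labels, plot_data):
    # Decorate each (label, distribution) pair with the distribution's
    # median, sort on that median, then strip the decoration.
    decorated = [(label, dist, median(dist))
                 for label, dist in zip(plot_labels, plot_data)]
    decorated.sort(key=itemgetter(2))
    return ([label for label, _, _ in decorated],
            [dist for _, dist, _ in decorated])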
예제 #55
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_tax_align_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # use the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_tax_align_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        final_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
             (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(otu_table_w_tax_fp, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
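The iteration loop above chains reference collections: iteration i clusters against refseqs_fp and writes new_refseqs.fna, which becomes refseqs_fp for iteration i+1. Stripped of the workflow plumbing, the control flow reduces to roughly this (run_iteration is a hypothetical callback standing in for the single-iteration workflow call):

def chain_reference_sets(input_fps, refseqs_fp, output_dir, run_iteration):
    # Each iteration reads the previous iteration's expanded reference
    # set and leaves behind a new_refseqs.fna for the next one.
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        run_iteration(input_fp, refseqs_fp, iteration_output_dir)
        refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
    return refseqs_fp  # the final, fully expanded reference set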
예제 #56
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    
    # if the run is not barcoded, a --sample_id must be provided in
    # place of the barcode reads
    if barcode_type == 'not-barcoded':
        if sample_id is None:
            option_parser.error("If not providing barcode reads (because "
                                "your data is not multiplexed), must provide "
                                "a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")
    
    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error(
             "Only valid phred offsets are: %s" %
             ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file 
        # figure it out...
        phred_to_ascii_f = None
    
    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')
    
    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
         '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)
    
    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None
    
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)
    
    if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode and mapping files must be provided.")
    
    output_dir = opts.output_dir
    create_dir(output_dir)
    
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp,'w')
    qual_fp_temp = '%s/seqs.qual.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    
    if store_qual_scores:
        qual_f = open(qual_fp_temp,'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def qual_writer(h,q):
            qual_f.write('>%s\n%s\n' % (h,q))
    else:
        def qual_writer(h,q):
            pass
    
    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp,'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def fastq_writer(h,s,q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q))
    else:
        def fastq_writer(h,s,q):
            pass
    
    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp,'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp,'w')
    
    for sequence_read_fp, barcode_read_fp, mapping_fp in\
      zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a =\
           check_map(mapping_f, 
                     disable_primer_check=True, 
                      has_barcodes=barcode_read_fp is not None)
        
        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = \
             dict([(DNA.rc(k),v) for k,v in barcode_to_sample_id.items()])
        
        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
             get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay codes. "+\
                "Do they need to be reverse complimented? If these are not "+\
                "golay barcodes pass --barcode_type 12 to disable barcode "+\
                "error correction, or pass --barcode_type # if the barcodes "+\
                "are not 12 base pairs, where # is the size of the barcodes. "+
                "  Invalid codes:\n\t%s" % \
                ' '.join(invalid_golay_barcodes))
        
        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %\
          (mapping_fp,safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %\
          (sequence_read_fp,str(safe_md5(open(sequence_read_fp)).hexdigest())))
       
        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp,'U')
        

        seq_id = start_seq_id
        
        if barcode_read_fp is not None:
            
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %\
              (barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest()))
              
            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp,'U')
            seq_generator = process_fastq_single_end_read_file(
               sequence_read_f,
               barcode_read_f,
               barcode_to_sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp,
               rev_comp_barcode=rev_comp_barcode,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               barcode_correction_fn=barcode_correction_fn,
               max_barcode_errors=max_barcode_errors,
               phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
               sequence_read_f,
               sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               phred_to_ascii_f=phred_to_ascii_f)
        
        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header,sequence))
            qual_writer(fasta_header,quality)
            fastq_writer(fasta_header,sequence,quality)
            
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')
        
    output_f.close()
    rename(output_fp_temp,output_fp)
    
    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp,qual_fp)
    
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp,output_fastq_fp)
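All three outputs above (seqs.fna, seqs.qual, seqs.fastq) are first written to a '.incomplete' path and only rename()d to their final names once writing finishes, so a crashed run cannot leave a truncated file that looks complete. The pattern in isolation (write_all_or_nothing is a hypothetical name):

from os import rename

def write_all_or_nothing(final_fp, records):
    # Write to a sentinel path first; the final path only appears
    # after every record has been written and the file is closed.
    temp_fp = final_fp + '.incomplete'
    out_f = open(temp_fp, 'w')
    try:
        for record in records:
            out_f.write(record)
    finally:
        out_f.close()
    rename(temp_fp, final_fp)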
예제 #57
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        dist_matrix_header, dist_matrix = parse_distmat(
            open(opts.distance_matrix_fp, 'U'))
    except:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")
    try:
        mapping, mapping_header, mapping_comments = parse_mapping_file(
            open(opts.mapping_fp, 'U'))
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    comparison_field_states = opts.comparison_groups
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")
    comparison_field_states = map(strip, comparison_field_states.split(','))
    comparison_field_states = [
        field_state.strip('"').strip("'")
        for field_state in comparison_field_states
    ]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        try:
            num_x = float(x)
            num_y = float(y)
            return int(num_x - num_y)
        except ValueError:
            if x < y:
                return -1
            elif x > y:
                return 1
            else:
                return 0

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = map(float, field_states)
            x_spacing.sort()
        except ValueError:
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = []
    plot_x_axis_labels = []
    for field_state in field_states:
        field_state_data = []
        for comp_field_state in comparison_field_states:
            field_state_data.append(
                comparison_groupings[field_state][comp_field_state])
        plot_data.append(field_state_data)
        plot_x_axis_labels.append(field_state)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB()) \
                       for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must "
                            "be greater than zero.")

    plot_figure = generate_comparative_plots(
        opts.plot_type,
        plot_data,
        x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors,
        x_label=plot_x_label,
        y_label=plot_y_label,
        title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min,
        y_max=y_max,
        whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width,
        figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(
        opts.output_dir,
        "%s_Distance_Comparisons.%s" % (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp,
                        format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        sig_tests_f = open(join(opts.output_dir, "%s_Stats.txt" % field), 'w')

        # Rearrange the plot data into a format suitable for all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data, plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append('%s vs %s' %
                                        (data_point_label, comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(
            sig_tests_labels,
            sig_tests_data,
            tail_type=opts.tail_type,
            num_permutations=opts.num_permutations)
        sig_tests_f.write(sig_tests_results)
        sig_tests_f.close()

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), (
            "The number of labels does not match the number of points along "
            "the x-axis.")
        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        raw_data_f = open(raw_data_fp, 'w')

        raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
        for label, data in zip(plot_x_axis_labels, plot_data):
            assert (len(comparison_field_states) == len(data)), (
                "The number of specified comparison groups does not match "
                "the number of groups found at the current point along "
                "the x-axis.")
            for comp_field_state, comp_grp_data in zip(comparison_field_states,
                                                       data):
                raw_data_f.write(comp_field_state + "\t" + label + "\t" +
                                 "\t".join(map(str, comp_grp_data)) + "\n")
        raw_data_f.close()
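The significance-test block above flattens the nested per-point, per-comparison-group structure into the two parallel lists that all_pairs_t_test expects. That reshaping as a standalone helper (flatten_for_t_test is a hypothetical name):

def flatten_for_t_test(plot_data, point_labels, comparison_states):
    # plot_data[i][j] holds the distances between x-axis point i and
    # comparison group j; emit one labeled distribution per (i, j) pair.
    labels, data = [], []
    for point, point_label in zip(plot_data, point_labels):
        for dists, comp_state in zip(point, comparison_states):
            labels.append('%s vs %s' % (point_label, comp_state))
            data.append(dists)
    return labels, data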
예제 #58
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory " "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        dist_matrix_header, dist_matrix = parse_distmat(open(opts.distance_matrix_fp, "U"))
    except:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option."
        )
    try:
        mapping, mapping_header, mapping_comments = parse_mapping_file(open(opts.mapping_fp, "U"))
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option."
        )

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == "auto":
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number " "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == "auto":
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number " "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    comparison_field_states = opts.comparison_groups
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to " "compare (using the -c option).")
    comparison_field_states = map(strip, comparison_field_states.split(","))
    comparison_field_states = [field_state.strip('"').strip("'") for field_state in comparison_field_states]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field, comparison_field_states
    )

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        try:
            num_x = float(x)
            num_y = float(y)
            return int(num_x - num_y)
        except ValueError:
            if x < y:
                return -1
            elif x > y:
                return 1
            else:
                return 0

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except ValueError:
            option_parser.error(
                "The 'numeric' label type is invalid because "
                "not all field states could be converted into "
                "numbers. Please specify a different label "
                "type."
            )

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = []
    plot_x_axis_labels = []
    for field_state in field_states:
        field_state_data = []
        for comp_field_state in comparison_field_states:
            field_state_data.append(comparison_groupings[field_state][comp_field_state])
        plot_data.append(field_state_data)
        plot_x_axis_labels.append(field_state)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB()) for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must " "be greater than zero.")

    plot_figure = grouped_distributions(
        opts.plot_type,
        plot_data,
        x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors,
        x_label=plot_x_label,
        y_label=plot_y_label,
        title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min,
        y_max=y_max,
        whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width,
        figure_height=height,
    )

    # Save the plot in the specified format.
    output_plot_fp = join(opts.output_dir, "%s_Distance_Comparisons.%s" % (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp, format=opts.imagetype, transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        sig_tests_f = open(join(opts.output_dir, "%s_Stats.txt" % field), "w")

        # Rearrange the plot data into a format suitable for all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data, plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append("%s vs %s" % (data_point_label, comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(
            sig_tests_labels, sig_tests_data, tail_type=opts.tail_type, num_permutations=opts.num_permutations
        )
        sig_tests_f.write(sig_tests_results)
        sig_tests_f.close()

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert len(plot_x_axis_labels) == len(plot_data), (
            "The number of labels does not match the number of points along "
            "the x-axis."
        )
        raw_data_fp = join(opts.output_dir, "%s_Distance_Comparisons.txt" % field)
        raw_data_f = open(raw_data_fp, "w")

        raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
        for label, data in zip(plot_x_axis_labels, plot_data):
            assert len(comparison_field_states) == len(data), (
                "The number of specified comparison groups does not match "
                "the number of groups found at the current point along "
                "the x-axis."
            )
            for comp_field_state, comp_grp_data in zip(comparison_field_states, data):
                raw_data_f.write(comp_field_state + "\t" + label + "\t" + "\t".join(map(str, comp_grp_data)) + "\n")
        raw_data_f.close()
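custom_comparator above is a Python 2 cmp-style function, and list.sort(cmp=...) no longer exists in Python 3. A key-function sketch of the same numbers-sort-numerically, otherwise-lexical intent (numeric_or_lexical_key is a hypothetical name; unlike the cmp version it also yields a consistent total order when numeric and non-numeric states are mixed, by placing numbers first):

def numeric_or_lexical_key(value):
    # Numeric states sort by their numeric value; everything else
    # falls back to plain string order after all numeric states.
    try:
        return (0, float(value), '')
    except ValueError:
        return (1, 0.0, value)

field_states.sort(key=numeric_or_lexical_key)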
예제 #59
0
    def _get_job_commands(self,
                          input_fp,
                          output_dir,
                          params,
                          job_prefix,
                          working_dir,
                          command_prefix='/bin/bash; ',
                          command_suffix='; exit'):
        """Generate beta diversity to split single OTU table to multiple jobs

        full_tree=True is faster: beta_diversity.py -f will make things
        go faster, but be sure you already have the correct minimal tree.
        """
        commands = []
        result_filepaths = []

        sids = parse_biom_table(open(input_fp, 'U')).SampleIds

        if params['full_tree']:
            full_tree_str = '-f'
        else:
            full_tree_str = ''

        if params['tree_path']:
            tree_str = '-t %s' % params['tree_path']
        else:
            tree_str = ''

        metrics = params['metrics']

        # this is a little bit of an abuse of _merge_to_n_commands, so may
        # be worth generalizing that method - this determines the correct
        # number of samples to process in each command
        sample_id_groups = self._merge_to_n_commands(sids,
                                                     params['jobs_to_start'],
                                                     delimiter=',',
                                                     command_prefix='',
                                                     command_suffix='')

        for i, sample_id_group in enumerate(sample_id_groups):
            working_dir_i = join(working_dir, str(i))
            create_dir(working_dir_i)
            output_dir_i = join(output_dir, str(i))
            create_dir(output_dir_i)
            result_filepaths.append(output_dir_i)
            input_dir, input_fn = split(input_fp)
            input_basename, input_ext = splitext(input_fn)
            sample_id_desc = sample_id_group.replace(',', '_')
            output_fns = [
                '%s_%s.txt' % (metric, input_basename)
                for metric in metrics.split(',')
            ]
            rename_command, current_result_filepaths = self._get_rename_command(
                output_fns, working_dir_i, output_dir_i)

            result_filepaths += current_result_filepaths

            bdiv_command = '%s -i %s -o %s %s -m %s %s -r %s' %\
                (self._script_name,
                 input_fp,
                 working_dir_i,
                 tree_str,
                 params['metrics'],
                 full_tree_str,
                 sample_id_group)

            shell_script_fp = '%s/%s%d.sh' % (working_dir_i, job_prefix, i)
            shell_script_commands = [bdiv_command] + rename_command.split(';')
            self._commands_to_shell_script(shell_script_commands,
                                           shell_script_fp)
            commands.append('bash %s' % shell_script_fp)

        commands = self._merge_to_n_commands(commands,
                                             params['jobs_to_start'],
                                             command_prefix=command_prefix,
                                             command_suffix=command_suffix)

        return commands, result_filepaths
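The comment above flags the _merge_to_n_commands call as a mild abuse: here it only needs to chunk sample IDs into at most jobs_to_start comma-separated groups for the -r option. A helper with just that narrower contract might look like this sketch (group_items_for_jobs is a hypothetical name, not QIIME's implementation):

def group_items_for_jobs(items, num_jobs, delimiter=','):
    # Deal the items round-robin into at most num_jobs buckets, then
    # join each bucket into the delimiter-separated form -r expects.
    num_buckets = min(num_jobs, len(items))
    buckets = [[] for _ in range(num_buckets)]
    for i, item in enumerate(items):
        buckets[i % num_buckets].append(item)
    return [delimiter.join(bucket) for bucket in buckets]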