Beispiel #1
0
def equalize_tables_at_rarefaction_point(otu_table_p,name_otu_table,reference_table_p,name_reference_table,seq_number,output_p):

	sample_de_novo = subset_samples_by_seq_number(biom_stats(reference_table_p),seq_number)
	sample_ref = subset_samples_by_seq_number(biom_stats(otu_table_p),seq_number)
	common_sample_ids = sample_de_novo.intersection(sample_ref)


	#filtering from otu table 
	new_otu_table = filter_samples(otu_table_p,common_sample_ids,0,np.inf)
	new_reference_table = filter_samples(reference_table_p,common_sample_ids,0,np.inf)
	
	doc1 = open(output_p+"/"+name_otu_table.replace("/","_")+"_equalized_"+str(seq_number)+'.biom',"w")
	doc1.write(format_biom_table(new_otu_table))
	doc1.close()



	doc2 = open(output_p+"/"+name_reference_table.replace("/","_")+"_equalized_"+str(seq_number)+'.biom',"w")
	doc2.write(format_biom_table(new_reference_table))
	doc2.close()

	print " \nPercentage of Samples  : {}/{} , {}\n".format(
															len(new_otu_table.SampleIds) ,
															len(otu_table_p.SampleIds) ,
															(len(new_otu_table.SampleIds)/len(otu_table_p.SampleIds)) * 100
															)


	return
Beispiel #2
0
def equalize_tables_at_rarefaction_point(otu_table_p, name_otu_table,
                                         reference_table_p,
                                         name_reference_table, seq_number,
                                         output_p):

    sample_de_novo = subset_samples_by_seq_number(
        biom_stats(reference_table_p), seq_number)
    sample_ref = subset_samples_by_seq_number(biom_stats(otu_table_p),
                                              seq_number)
    common_sample_ids = sample_de_novo.intersection(sample_ref)

    #filtering from otu table
    new_otu_table = filter_samples(otu_table_p, common_sample_ids, 0, np.inf)
    new_reference_table = filter_samples(reference_table_p, common_sample_ids,
                                         0, np.inf)

    doc1 = open(
        output_p + "/" + name_otu_table.replace("/", "_") + "_equalized_" +
        str(seq_number) + '.biom', "w")
    doc1.write(format_biom_table(new_otu_table))
    doc1.close()

    doc2 = open(
        output_p + "/" + name_reference_table.replace("/", "_") +
        "_equalized_" + str(seq_number) + '.biom', "w")
    doc2.write(format_biom_table(new_reference_table))
    doc2.close()

    print " \nPercentage of Samples  : {}/{} , {}\n".format(
        len(new_otu_table.SampleIds), len(otu_table_p.SampleIds),
        (len(new_otu_table.SampleIds) / len(otu_table_p.SampleIds)) * 100)

    return
    def setUp(self):
        """Define some test data."""
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []

        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        self.otu_table1 = table_factory(data=array([[2, 0, 0, 1],
                                                   [1, 1, 1, 1],
                                                   [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=list('abcd'),
                                        constructor=DenseOTUTable)
        fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        open(self.otu_table1_fp, 'w').write(
            format_biom_table(self.otu_table1))

        self.otu_table2 = table_factory(data=array([[2, 0, 0, 1],
                                                   [1, 1, 1, 1],
                                                   [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=['a', 'b', 'c', 'd_'],
                                        constructor=DenseOTUTable)
        fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom')
        close(fd)
        open(self.otu_table2_fp, 'w').write(
            format_biom_table(self.otu_table2))

        self.single_sample_otu_table = table_factory(
            data=array([[2, 0, 0, 1]]).T,
            sample_ids=list('X'),
            observation_ids=list(
                'abcd'),
            constructor=DenseOTUTable)
        fd, self.single_sample_otu_table_fp = mkstemp(
            dir=self.tmp_dir,
            prefix='alpha_diversity_tests',
            suffix='.biom')
        close(fd)
        open(self.single_sample_otu_table_fp, 'w').write(
            format_biom_table(self.single_sample_otu_table))

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp,
                                self.single_sample_otu_table_fp]
    def setUp(self):
        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.l19_data = numpy.array([
            [7, 1, 0, 0, 0, 0, 0, 0, 0],
            [4, 2, 0, 0, 0, 1, 0, 0, 0],
            [2, 4, 0, 0, 0, 1, 0, 0, 0],
            [1, 7, 0, 0, 0, 0, 0, 0, 0],
            [0, 8, 0, 0, 0, 0, 0, 0, 0],
            [0, 7, 1, 0, 0, 0, 0, 0, 0],
            [0, 4, 2, 0, 0, 0, 2, 0, 0],
            [0, 2, 4, 0, 0, 0, 1, 0, 0],
            [0, 1, 7, 0, 0, 0, 0, 0, 0],
            [0, 0, 8, 0, 0, 0, 0, 0, 0],
            [0, 0, 7, 1, 0, 0, 0, 0, 0],
            [0, 0, 4, 2, 0, 0, 0, 3, 0],
            [0, 0, 2, 4, 0, 0, 0, 1, 0],
            [0, 0, 1, 7, 0, 0, 0, 0, 0],
            [0, 0, 0, 8, 0, 0, 0, 0, 0],
            [0, 0, 0, 7, 1, 0, 0, 0, 0],
            [0, 0, 0, 4, 2, 0, 0, 0, 4],
            [0, 0, 0, 2, 4, 0, 0, 0, 1],
            [0, 0, 0, 1, 7, 0, 0, 0, 0]
        ])
        self.l19_sample_names = [
            'sam1', 'sam2', 'sam3', 'sam4', 'sam5', 'sam6',
            'sam7', 'sam8', 'sam9', 'sam_middle', 'sam11', 'sam12', 'sam13',
            'sam14', 'sam15', 'sam16', 'sam17', 'sam18', 'sam19']
        self.l19_taxon_names = ['tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon',
                                'tax6', 'tax7', 'tax8', 'tax9']
        self.l19_taxon_names_w_underscore = ['ta_x1', 'tax2', 'tax3', 'tax4',
                                             'endbigtaxon', 'tax6', 'tax7', 'tax8', 'tax9']

        l19_str = format_biom_table(DenseOTUTable(self.l19_data.T,
                                                  self.l19_sample_names,
                                                  self.l19_taxon_names))
        fd, self.l19_fp = mkstemp(dir=self.tmp_dir,
                                prefix='test_bdiv_otu_table', suffix='.blom')
        close(fd)
        open(self.l19_fp, 'w').write(l19_str)

        l19_str_w_underscore = format_biom_table(DenseOTUTable(self.l19_data.T,
                                                               self.l19_sample_names,
                                                               self.l19_taxon_names_w_underscore))
        fd, self.l19_str_w_underscore_fp = mkstemp(dir=self.tmp_dir,
                                                  prefix='test_bdiv_otu_table', suffix='.blom')
        close(fd)
        open(self.l19_str_w_underscore_fp, 'w').write(l19_str_w_underscore)

        self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
 ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
        self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

        self.files_to_remove = [self.l19_fp, self.l19_str_w_underscore_fp]
        self.folders_to_remove = []
    def setUp(self):
        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.l19_data = numpy.array([[7, 1, 0, 0, 0, 0, 0, 0, 0],
                                     [4, 2, 0, 0, 0, 1, 0, 0, 0],
                                     [2, 4, 0, 0, 0, 1, 0, 0, 0],
                                     [1, 7, 0, 0, 0, 0, 0, 0, 0],
                                     [0, 8, 0, 0, 0, 0, 0, 0, 0],
                                     [0, 7, 1, 0, 0, 0, 0, 0, 0],
                                     [0, 4, 2, 0, 0, 0, 2, 0, 0],
                                     [0, 2, 4, 0, 0, 0, 1, 0, 0],
                                     [0, 1, 7, 0, 0, 0, 0, 0, 0],
                                     [0, 0, 8, 0, 0, 0, 0, 0, 0],
                                     [0, 0, 7, 1, 0, 0, 0, 0, 0],
                                     [0, 0, 4, 2, 0, 0, 0, 3, 0],
                                     [0, 0, 2, 4, 0, 0, 0, 1, 0],
                                     [0, 0, 1, 7, 0, 0, 0, 0, 0],
                                     [0, 0, 0, 8, 0, 0, 0, 0, 0],
                                     [0, 0, 0, 7, 1, 0, 0, 0, 0],
                                     [0, 0, 0, 4, 2, 0, 0, 0, 4],
                                     [0, 0, 0, 2, 4, 0, 0, 0, 1],
                                     [0, 0, 0, 1, 7, 0, 0, 0, 0]])
        self.l19_sample_names = ['sam1', 'sam2', 'sam3', 'sam4', 'sam5','sam6',\
        'sam7', 'sam8', 'sam9', 'sam_middle', 'sam11', 'sam12', 'sam13', \
        'sam14', 'sam15', 'sam16', 'sam17', 'sam18', 'sam19']
        self.l19_taxon_names =  ['tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon',\
        'tax6', 'tax7', 'tax8', 'tax9']
        self.l19_taxon_names_w_underscore = [
            'ta_x1', 'tax2', 'tax3', 'tax4', 'endbigtaxon', 'tax6', 'tax7',
            'tax8', 'tax9'
        ]

        l19_str = format_biom_table(
            DenseOTUTable(self.l19_data.T, self.l19_sample_names,
                          self.l19_taxon_names))
        self.l19_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                       prefix='test_bdiv_otu_table',
                                       suffix='.blom')
        open(self.l19_fp, 'w').write(l19_str)

        l19_str_w_underscore = format_biom_table(
            DenseOTUTable(self.l19_data.T, self.l19_sample_names,
                          self.l19_taxon_names_w_underscore))
        self.l19_str_w_underscore_fp = get_tmp_filename(
            tmp_dir=self.tmp_dir, prefix='test_bdiv_otu_table', suffix='.blom')
        open(self.l19_str_w_underscore_fp, 'w').write(l19_str_w_underscore)

        self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
 ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'

        self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

        self.files_to_remove = [self.l19_fp, self.l19_str_w_underscore_fp]
        self.folders_to_remove = []
Beispiel #6
0
    def setUp(self):
        """Define some test data."""
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []

        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        self.otu_table1 = table_factory(data=array([[2, 0, 0, 1], [1, 1, 1, 1],
                                                    [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=list('abcd'),
                                        constructor=DenseOTUTable)
        self.otu_table1_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom',
                                              result_constructor=str)
        open(self.otu_table1_fp,'w').write(\
         format_biom_table(self.otu_table1))

        self.otu_table2 = table_factory(data=array([[2, 0, 0, 1], [1, 1, 1, 1],
                                                    [0, 0, 0, 0]]).T,
                                        sample_ids=list('XYZ'),
                                        observation_ids=['a', 'b', 'c', 'd_'],
                                        constructor=DenseOTUTable)
        self.otu_table2_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                              prefix='alpha_diversity_tests',
                                              suffix='.biom',
                                              result_constructor=str)
        open(self.otu_table2_fp,'w').write(\
         format_biom_table(self.otu_table2))

        self.single_sample_otu_table = table_factory(
            data=array([[2, 0, 0, 1]]).T,
            sample_ids=list('X'),
            observation_ids=list('abcd'),
            constructor=DenseOTUTable)
        self.single_sample_otu_table_fp = get_tmp_filename(
            tmp_dir=self.tmp_dir,
            prefix='alpha_diversity_tests',
            suffix='.biom',
            result_constructor=str)
        open(self.single_sample_otu_table_fp,'w').write(\
         format_biom_table(self.single_sample_otu_table))

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [
            self.otu_table1_fp, self.otu_table2_fp,
            self.single_sample_otu_table_fp
        ]
Beispiel #7
0
    def test_split_otu_table_on_sample_metadata_extra_mapping_entries(self):
        """ split_otu_table_on_sample_metadata functions as expected with extra mapping data """
        actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                         self.mapping_f2,
                                                         "Treatment"))

        actual = [(id_, parse_biom_table(e)) for id_, e in actual]
        exp = [(id_, parse_biom_table(e)) for id_, e in otu_table_exp1]

        actual.sort()
        exp.sort()

        for a, e in zip(actual, exp):
            self.assertEqual(a, e, "OTU tables are not equal:\n%s\n%s" %
                             (format_biom_table(a[1]), format_biom_table(e[1])))
Beispiel #8
0
 def test_split_otu_table_on_sample_metadata_extra_mapping_entries(self):
     """ split_otu_table_on_sample_metadata functions as expected with extra mapping data """
     actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                      self.mapping_f2,
                                                      "Treatment"))
                                                      
     actual = [(id_,parse_biom_table(e)) for id_, e in actual]
     exp = [(id_,parse_biom_table(e)) for id_, e in otu_table_exp1]
     
     actual.sort()
     exp.sort()
     
     for a,e in zip(actual,exp):
         self.assertEqual(a,e,"OTU tables are not equal:\n%s\n%s" % \
          (format_biom_table(a[1]),format_biom_table(e[1])))
Beispiel #9
0
    def setUp(self):

        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.otu_table_data = numpy.array([[2, 1, 0], [0, 5, 0], [0, 3, 0],
                                           [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{
            'domain': 'Archaea'
        }, {
            'domain': 'Bacteria'
        }, {
            'domain': 'Bacteria'
        }, {
            'domain': 'Bacteria'
        }]

        self.otu_table = table_factory(self.otu_table_data, self.sample_names,
                                       self.taxon_names)
        self.otu_table_meta = table_factory(
            self.otu_table_data,
            self.sample_names,
            self.taxon_names,
            observation_metadata=self.otu_metadata)

        self.otu_table_str = format_biom_table(self.otu_table)
        self.otu_table_meta_str = format_biom_table(self.otu_table_meta)

        _, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                       prefix='test_rarefaction',
                                       suffix='.biom')
        close(_)
        _, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                            prefix='test_rarefaction',
                                            suffix='.biom')
        close(_)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir',
                                suffix='')

        open(self.otu_table_fp, 'w').write(self.otu_table_str)
        open(self.otu_table_meta_fp, 'w').write(self.otu_table_meta_str)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]
def generate_full_otu_table(study, study_input_dir, zip_fname, files_to_remove,
                            biom_files,output_dir):
    """ Merge OTU tables """
    
    master = parse_biom_table(open(biom_files[0],'U'))
    # only merge if there is more than 1 biom file
    if len(biom_files) > 1:
        for input_fp in biom_files[1:]:
            master = master.merge(parse_biom_table(open(input_fp,'U')))
        
    # write full biom-table
    full_biom_table_fname='study_%s_closed_reference_otu_table.biom' % \
                                                              (str(study))
    full_biom_table_fp=join(output_dir,full_biom_table_fname)
    # add to list of files to remove
    files_to_remove.append(full_biom_table_fp)
    
    biom_f = open(join(full_biom_table_fp),'w')
    biom_f.write(format_biom_table(master))
    biom_f.close()
    
    # zip the full biom-table file
    #cmd_call='cd %s; tar rzvf %s %s' % (study_input_dir,zip_fname,
    #                               full_biom_table_fname)
    #system(cmd_call)
    
    return files_to_remove
Beispiel #11
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    labels = opts.labels.split(',')
    if opts.all_strings:
        process_fs = [str] * len(labels)
        observation_metadata = parse_taxonomy_to_otu_metadata(\
                            open(opts.taxonomy_fp,'U'),labels=labels,process_fs=process_fs)
    else:
        observation_metadata = parse_taxonomy_to_otu_metadata(\
                            open(opts.taxonomy_fp,'U'),labels=labels)

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    if otu_table.ObservationMetadata != None:
        # if there is already metadata associated with the
        # observations, confirm that none of the metadata names
        # are already present
        existing_keys = otu_table.ObservationMetadata[0].keys()
        for label in labels:
            if label in existing_keys:
                option_parser.error(\
                 "%s is already an observation metadata field."
                 " Can't add it, so nothing is being added." % label)

    otu_table.addObservationMetadata(observation_metadata)

    output_f = open(opts.output_fp, 'w')
    output_f.write(format_biom_table(otu_table))
    output_f.close()
Beispiel #12
0
def split_otu_table_on_taxonomy_to_files(otu_table_fp,
                                         level,
                                         output_dir,
                                         md_identifier='taxonomy',
                                         md_processor=process_md_as_list):
    """ Split OTU table by taxonomic level, writing otu tables to output dir
    """
    results = []
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    create_dir(output_dir)

    def split_f(obs_md):
        try:
            result = md_processor(obs_md, md_identifier, level)
        except KeyError:
            raise KeyError,\
             "Metadata identifier (%s) is not associated with all (or any) observerations. You can modify the key with the md_identifier parameter." % md_identifier
        except TypeError:
            raise TypeError,\
             "Can't correctly process the metadata string. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_as_string."
        except AttributeError:
            raise AttributeError,\
             "Metadata category not found. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_identifier \"Consensus Lineage\"."

        return result

    for bin, sub_otu_table in otu_table.binObservationsByMetadata(split_f):
        output_fp = '%s/otu_table_%s.biom' % (output_dir, bin)
        output_f = open(output_fp, 'w')
        output_f.write(format_biom_table(sub_otu_table))
        output_f.close()
        results.append(output_fp)
    return results
Beispiel #13
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    labels = opts.labels.split(',')
    if opts.all_strings:
        process_fs = [str] * len(labels)
        observation_metadata = parse_taxonomy_to_otu_metadata(\
                            open(opts.taxonomy_fp,'U'),labels=labels,process_fs=process_fs)
    else:
        observation_metadata = parse_taxonomy_to_otu_metadata(\
                            open(opts.taxonomy_fp,'U'),labels=labels)
    

    otu_table = parse_biom_table(open(opts.input_fp,'U'))
    
    if otu_table.ObservationMetadata != None:
        # if there is already metadata associated with the 
        # observations, confirm that none of the metadata names
        # are already present
        existing_keys = otu_table.ObservationMetadata[0].keys()
        for label in labels:
            if label in existing_keys:
                option_parser.error(\
                 "%s is already an observation metadata field." 
                 " Can't add it, so nothing is being added." % label)
    
    otu_table.addObservationMetadata(observation_metadata)
    
    output_f = open(opts.output_fp,'w')
    output_f.write(format_biom_table(otu_table))
    output_f.close()
Beispiel #14
0
def main():
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table,'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp
    
    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp,'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        result = sort_otu_table(otu_table_data,
                        natsort_case_insensitive(otu_table_data.SampleIds))
    
    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp,'w')
    of.write(result_str)
    of.close()
Beispiel #15
0
def main():
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_biom_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data, mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        result = sort_otu_table(
            otu_table_data, natsort_case_insensitive(otu_table_data.SampleIds))

    # format and write the otu table
    result_str = format_biom_table(result)
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
def split_otu_table_on_taxonomy_to_files(otu_table_fp,
                                         level,
                                         output_dir,
                                         md_identifier='taxonomy',
                                         md_processor=process_md_as_list):
    """ Split OTU table by taxonomic level, writing otu tables to output dir
    """
    results = []
    otu_table = parse_biom_table(open(otu_table_fp,'U'))
    create_dir(output_dir)
    
    def split_f(obs_md):
        try:
            result = md_processor(obs_md,md_identifier,level)
        except KeyError:
            raise KeyError,\
             "Metadata identifier (%s) is not associated with all (or any) observerations. You can modify the key with the md_identifier parameter." % md_identifier
        except TypeError:
            raise TypeError,\
             "Can't correctly process the metadata string. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_as_string."
        except AttributeError:
            raise AttributeError,\
             "Metadata category not found. If your input file was generated from QIIME 1.4.0 or earlier you may need to pass --md_identifier \"Consensus Lineage\"."
    
        return result
    
    for bin, sub_otu_table in otu_table.binObservationsByMetadata(split_f):
        output_fp = '%s/otu_table_%s.biom' % (output_dir,bin)
        output_f = open(output_fp,'w')
        output_f.write(format_biom_table(sub_otu_table))
        output_f.close()
        results.append(output_fp)
    return results
Beispiel #17
0
def make_otu_table(otu_map_f,
                   otu_to_taxonomy=None,
                   delim='_',
                   table_id=None,
                   sample_metadata=None,
                   constructor=SparseOTUTable):

    data, sample_ids, otu_ids = parse_otu_map(otu_map_f, delim)

    if otu_to_taxonomy is not None:
        otu_metadata = []
        for o in otu_ids:
            try:
                otu_metadata.append({'taxonomy': otu_to_taxonomy[o]})
            except KeyError:
                otu_metadata.append({'taxonomy': ["None"]})
    else:
        otu_metadata = None

    if sample_metadata is not None:
        raise NotImplementedError(
            "Passing of sample metadata to make_otu_table is not currently supported.")
    try:
        otu_table = table_factory(data, sample_ids, otu_ids,
                                  sample_metadata=sample_metadata,
                                  observation_metadata=otu_metadata,
                                  table_id=table_id,
                                  constructor=constructor,
                                  dtype=int)
    except ValueError as e:
        raise ValueError("Couldn't create OTU table. Is your OTU map empty?"
                         " Original error message: %s" % (str(e)))
    return format_biom_table(otu_table)
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_table = parse_biom_table(open(opts.input_otu_table_fp, 'U'))
    output_table_f = open(opts.output_otu_table_fp, 'w')
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    else:
        positive_taxa = None

    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')
    else:
        negative_taxa = None

    filter_fn = get_otu_ids_from_taxonomy_f(
        positive_taxa,
        negative_taxa,
        metadata_field)
    output_table = input_table.filterObservations(filter_fn)
    output_table_f.write(format_biom_table(output_table))
    output_table_f.close()
Beispiel #19
0
def make_otu_table(otu_map_f,
                   otu_to_taxonomy=None,
                   delim='_',
                   table_id=None,
                   sample_metadata=None,
                   constructor=SparseOTUTable):

    data, sample_ids, otu_ids = parse_otu_map(otu_map_f, delim)

    if otu_to_taxonomy is not None:
        otu_metadata = []
        for o in otu_ids:
            try:
                otu_metadata.append({'taxonomy': otu_to_taxonomy[o]})
            except KeyError:
                otu_metadata.append({'taxonomy': ["None"]})
    else:
        otu_metadata = None

    if sample_metadata is not None:
        raise NotImplementedError(
            "Passing of sample metadata to make_otu_table is not currently supported.")
    try:
        otu_table = table_factory(data, sample_ids, otu_ids,
                                  sample_metadata=sample_metadata,
                                  observation_metadata=otu_metadata,
                                  table_id=table_id,
                                  constructor=constructor,
                                  dtype=int)
    except ValueError as e:
        raise ValueError("Couldn't create OTU table. Is your OTU map empty?"
                         " Original error message: %s" % (str(e)))
    return format_biom_table(otu_table)
Beispiel #20
0
 def _write_rarefaction(self, fname, sub_otu_table):
     """ depth and rep can be numbers or strings
     """
     if sub_otu_table.isEmpty():
         return
     f = open(fname, 'w')
     f.write(format_biom_table(sub_otu_table))
     f.close()
Beispiel #21
0
 def _write_rarefaction(self, fname, sub_otu_table):
     """ depth and rep can be numbers or strings
     """
     if sub_otu_table.isEmpty():
         return
     f = open(fname, 'w')
     f.write(format_biom_table(sub_otu_table))
     f.close()
Beispiel #22
0
    def setUp(self):
        """Define some test data."""
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []

        self.tmp_dir = self.qiime_config["temp_dir"] or "/tmp/"
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        self.otu_table1 = table_factory(
            data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T,
            sample_ids=list("XYZ"),
            observation_ids=list("abcd"),
            constructor=DenseOTUTable,
        )
        self.otu_table1_fp = get_tmp_filename(
            tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests", suffix=".biom", result_constructor=str
        )
        open(self.otu_table1_fp, "w").write(format_biom_table(self.otu_table1))

        self.otu_table2 = table_factory(
            data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T,
            sample_ids=list("XYZ"),
            observation_ids=["a", "b", "c", "d_"],
            constructor=DenseOTUTable,
        )
        self.otu_table2_fp = get_tmp_filename(
            tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests", suffix=".biom", result_constructor=str
        )
        open(self.otu_table2_fp, "w").write(format_biom_table(self.otu_table2))

        self.single_sample_otu_table = table_factory(
            data=array([[2, 0, 0, 1]]).T, sample_ids=list("X"), observation_ids=list("abcd"), constructor=DenseOTUTable
        )
        self.single_sample_otu_table_fp = get_tmp_filename(
            tmp_dir=self.tmp_dir, prefix="alpha_diversity_tests", suffix=".biom", result_constructor=str
        )
        open(self.single_sample_otu_table_fp, "w").write(format_biom_table(self.single_sample_otu_table))

        self.tree1 = parse_newick("((a:2,b:3):2,(c:1,d:2):7);")
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp, self.single_sample_otu_table_fp]
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if (mapping_fp and valid_states):
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if (sample_id_fp is not None):
        sample_id_f_ids = set([
            l.strip().split()[0] for l in open(sample_id_fp, 'U')
            if not l.startswith('#')
        ])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count, max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.SampleIds)
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
Beispiel #24
0
 def test_split_otu_table_on_sample_metadata(self):
     """ split_otu_table_on_sample_metadata functions as expected with valid input """
     actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                      self.mapping_f1,
                                                      "Treatment"))
     for id_, e in actual:
         try:
             parse_biom_table(e)
         except:
             print e
     actual = [(id_,parse_biom_table(e)) for id_, e in actual]
     exp = [(id_,parse_biom_table(e)) for id_, e in otu_table_exp1]
     
     actual.sort()
     exp.sort()
     
     for a,e in zip(actual,exp):
         self.assertEqual(a,e,"OTU tables are not equal:\n%s\n%s" % \
          (format_biom_table(a[1]),format_biom_table(e[1])))
Beispiel #25
0
    def _write_rarefaction(self, depth, rep, sub_otu_table):
        """ depth and rep can be numbers or strings
        """
        if sub_otu_table.isEmpty():
            return

        fname = 'rarefaction_' + str(depth) + '_' + str(rep) + '.biom'
        f = open(os.path.join(self.output_dir, fname), 'w')
        f.write(format_biom_table(sub_otu_table))
        f.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sample_mapping_fp = opts.sample_mapping_fp
    output_fp = opts.output_fp
    verbose = opts.verbose
    
    sample_mapping_file = open(sample_mapping_fp, 'U')
    result = sample_mapping_to_biom_table(sample_mapping_file)
    open(output_fp, 'w').write(format_biom_table(result))
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sample_mapping_fp = opts.sample_mapping_fp
    output_fp = opts.output_fp
    verbose = opts.verbose

    sample_mapping_file = open(sample_mapping_fp, 'U')
    result = sample_mapping_to_biom_table(sample_mapping_file)
    open(output_fp, 'w').write(format_biom_table(result))
Beispiel #28
0
    def _write_rarefaction(self, depth, rep, sub_otu_table):
        """ depth and rep can be numbers or strings
        """
        if sub_otu_table.isEmpty():
            return

        fname = 'rarefaction_'+str(depth)+'_'+str(rep)+'.biom'
        f = open(os.path.join(self.output_dir,fname), 'w')
        f.write(format_biom_table(sub_otu_table))
        f.close()
Beispiel #29
0
    def test_split_otu_table_on_sample_metadata(self):
        """ split_otu_table_on_sample_metadata functions as expected with valid input """
        actual = list(split_otu_table_on_sample_metadata(self.otu_table_f1,
                                                         self.mapping_f1,
                                                         "Treatment"))
        for id_, e in actual:
            try:
                parse_biom_table(e)
            except:
                print e
        actual = [(id_, parse_biom_table(e)) for id_, e in actual]
        exp = [(id_, parse_biom_table(e)) for id_, e in otu_table_exp1]

        actual.sort()
        exp.sort()

        for a, e in zip(actual, exp):
            self.assertEqual(a, e, "OTU tables are not equal:\n%s\n%s" %
                             (format_biom_table(a[1]), format_biom_table(e[1])))
Beispiel #30
0
    def setUp(self):

        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.otu_table_data = numpy.array([[2,1,0],
                                      [0,5,0],
                                      [0,3,0],
                                      [1,2,0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{'domain':'Archaea'},
                          {'domain':'Bacteria'},
                          {'domain':'Bacteria'},
                          {'domain':'Bacteria'}]


        self.otu_table = table_factory(self.otu_table_data,
                                        self.sample_names,
                                        self.taxon_names)
        self.otu_table_meta = table_factory(self.otu_table_data,
          self.sample_names, self.taxon_names, 
          observation_metadata=self.otu_metadata)

        self.otu_table_str = format_biom_table(self.otu_table)
        self.otu_table_meta_str = format_biom_table(self.otu_table_meta)

        self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
          prefix='test_rarefaction',suffix='.biom')
        self.otu_table_meta_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
          prefix='test_rarefaction',suffix='.biom')

        self.rare_dir = get_tmp_filename(tmp_dir=self.tmp_dir,
         prefix='test_rarefaction_dir',suffix='',result_constructor=str)
        os.mkdir(self.rare_dir)

        open(self.otu_table_fp,'w').write(self.otu_table_str)
        open(self.otu_table_meta_fp,'w').write(self.otu_table_meta_str)


        self._paths_to_clean_up=[self.otu_table_fp,self.otu_table_meta_fp]
        self._dirs_to_clean_up=[self.rare_dir]
Beispiel #31
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    input_fps = opts.input_fps

    master = parse_biom_table(open(input_fps[0], 'U'))
    for input_fp in input_fps[1:]:
        master = master.merge(parse_biom_table(open(input_fp, 'U')))

    out_f = open(opts.output_fp, 'w')
    out_f.write(format_biom_table(master))
    out_f.close()
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp
    
    if not ((mapping_fp and valid_states) or 
            min_count != 0 or 
            not isinf(max_count) or
            sample_id_fp != None):
        option_parser.error("No filtering requested. Must provide either "
                     "mapping_fp and valid states, min counts, "
                     "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp,'U'))
    output_f = open(opts.output_fp,'w')
    
    if (mapping_fp and valid_states):
        sample_ids_to_keep = sample_ids_from_metadata_description(
                              open(mapping_fp,'U'),valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds
    
    if (sample_id_fp != None):
        sample_id_f_ids = set([l.strip().split()[0] for l in open(sample_id_fp,'U') if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids
    
    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                        sample_ids_to_keep,
                                                        min_count,
                                                        max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
    
    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp,'U'))
        mapping_headers, mapping_data = \
         filter_mapping_file(mapping_data, mapping_headers, filtered_otu_table.SampleIds)
        open(output_mapping_fp,'w').write(format_mapping_file(mapping_headers,mapping_data))
Beispiel #33
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
 
    if not isfile(opts.input_path):
       raise IOError, \
        "Input path (%s) not valid.  Does it exist?" % opts.input_path
    
    samples, otus, data = parse_trflp(open(opts.input_path,'U'))
    
    output_f = open(opts.output_path, 'w')
    t = table_factory(data,samples,otus)
    output_f.write(format_biom_table(t))
    output_f.close()
Beispiel #34
0
    def setUp(self):

        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config["temp_dir"] or "/tmp/"

        self.otu_table_data = numpy.array([[2, 1, 0], [0, 5, 0], [0, 3, 0], [1, 2, 0]])
        self.sample_names = list("YXZ")
        self.taxon_names = list("bacd")
        self.otu_metadata = [
            {"domain": "Archaea"},
            {"domain": "Bacteria"},
            {"domain": "Bacteria"},
            {"domain": "Bacteria"},
        ]

        self.otu_table = table_factory(self.otu_table_data, self.sample_names, self.taxon_names)
        self.otu_table_meta = table_factory(
            self.otu_table_data, self.sample_names, self.taxon_names, observation_metadata=self.otu_metadata
        )

        self.otu_table_str = format_biom_table(self.otu_table)
        self.otu_table_meta_str = format_biom_table(self.otu_table_meta)

        self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir, prefix="test_rarefaction", suffix=".biom")
        self.otu_table_meta_fp = get_tmp_filename(tmp_dir=self.tmp_dir, prefix="test_rarefaction", suffix=".biom")

        self.rare_dir = get_tmp_filename(
            tmp_dir=self.tmp_dir, prefix="test_rarefaction_dir", suffix="", result_constructor=str
        )
        os.mkdir(self.rare_dir)

        open(self.otu_table_fp, "w").write(self.otu_table_str)
        open(self.otu_table_meta_fp, "w").write(self.otu_table_meta_str)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]
Beispiel #35
0
def simsam_range_to_files(table,
                          tree,
                          simulated_sample_sizes,
                          dissimilarities,
                          output_dir,
                          mapping_f=None,
                          output_table_basename="table",
                          output_map_basename="map"):
    """Applies sim_otu_table over a range of parameters, writing output to file
    
     table: the input table to simulate samples from
     tree: tree related OTUs in input table
     simulated_sample_sizes: a list of ints defining how many
      output samples should be create per input sample
     dissimilarities: a list of floats containing the
      dissimilarities to use in simulating tables
     output_dir: the directory where all output tables and 
      mapping files should be written
     mapping_f: file handle for metadata mapping file, if 
      a mapping file should be created with the samples from
      each simulated table
     output_table_basename: basename for output table files 
      (default: table)
     output_map_basename: basename for output mapping files 
      (default: map)
    """
    create_dir(output_dir)
    for e in simsam_range(table, tree, simulated_sample_sizes, dissimilarities,
                          mapping_f):
        output_table = e[0]
        output_mapping_lines = e[1]
        simulated_sample_size = e[2]
        dissimilarity = e[3]

        output_table_fp = join(
            output_dir, '%s_n%d_d%r.biom' %
            (output_table_basename, simulated_sample_size, dissimilarity))
        output_table_f = open(output_table_fp, 'w')
        output_table_f.write(format_biom_table(output_table))
        output_table_f.close()

        if output_mapping_lines != None:
            output_map_fp = join(
                output_dir, '%s_n%d_d%r.txt' %
                (output_map_basename, simulated_sample_size, dissimilarity))
            output_map_f = open(output_map_fp, 'w')
            output_map_f.write(''.join(output_mapping_lines))
            output_map_f.close()
Beispiel #36
0
def simsam_range_to_files(table,
                          tree,
                          simulated_sample_sizes,
                          dissimilarities,
                          output_dir,
                          mapping_f=None,
                          output_table_basename="table",
                          output_map_basename="map"):
    """Applies sim_otu_table over a range of parameters, writing output to file
    
     table: the input table to simulate samples from
     tree: tree related OTUs in input table
     simulated_sample_sizes: a list of ints defining how many
      output samples should be create per input sample
     dissimilarities: a list of floats containing the
      dissimilarities to use in simulating tables
     output_dir: the directory where all output tables and 
      mapping files should be written
     mapping_f: file handle for metadata mapping file, if 
      a mapping file should be created with the samples from
      each simulated table
     output_table_basename: basename for output table files 
      (default: table)
     output_map_basename: basename for output mapping files 
      (default: map)
    """
    create_dir(output_dir)
    for e in simsam_range(table,tree,simulated_sample_sizes,dissimilarities,mapping_f):
        output_table = e[0]
        output_mapping_lines = e[1]
        simulated_sample_size = e[2]
        dissimilarity = e[3]
        
        output_table_fp = join(output_dir,'%s_n%d_d%r.biom' %
         (output_table_basename, simulated_sample_size, dissimilarity))
        output_table_f = open(output_table_fp, 'w')
        output_table_f.write(format_biom_table(output_table))
        output_table_f.close()
        
        if output_mapping_lines != None:
            output_map_fp = join(output_dir,'%s_n%d_d%r.txt' %
             (output_map_basename, simulated_sample_size, dissimilarity))
            output_map_f = open(output_map_fp, 'w')
            output_map_f.write(''.join(output_mapping_lines))
            output_map_f.close()
Beispiel #37
0
def main():
    option_parser, opts, args =\
     parse_command_line_parameters(**script_info)

    out_fh = open(opts.output_file,'w')
    otu_table_fh = open(opts.otu_table,'U')
    otu_table = parse_biom_table(otu_table_fh)
    tree_fh = open(opts.tree_file,'U')
    tree = DndParser(tree_fh)

    res_sam_names, res_otus, res_otu_mtx, res_otu_metadata = \
     sim_otu_table(otu_table.SampleIds, otu_table.ObservationIds, otu_table.iterSamples(), 
                   otu_table.ObservationMetadata, tree, opts.num, opts.dissim)


    rich_table = table_factory(res_otu_mtx,res_sam_names,res_otus,
    observation_metadata=res_otu_metadata)
    out_fh.write(format_biom_table(rich_table))
Beispiel #38
0
def split_otu_table_on_sample_metadata(otu_table_f,mapping_f,mapping_field):
    """ split otu table into sub otu tables where each represent samples corresponding to only a certain value in mapping_field 
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f,mapping_field)
    otu_table = parse_biom_table(otu_table_f)
    
    for v in mapping_values:
        v_fp_str = v.replace(' ','_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f,valid_states_str="%s:%s" % (mapping_field,v))
        
        try:
            filtered_otu_table = otu_table.filterSamples(
                              lambda values,id_,metadata: id_ in sample_ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, format_biom_table(filtered_otu_table)
Beispiel #39
0
def main():
    option_parser, opts, args =\
     parse_command_line_parameters(**script_info)

    out_fh = open(opts.output_file, 'w')
    otu_table_fh = open(opts.otu_table, 'U')
    otu_table = parse_biom_table(otu_table_fh)
    tree_fh = open(opts.tree_file, 'U')
    tree = DndParser(tree_fh)

    res_sam_names, res_otus, res_otu_mtx, res_otu_metadata = \
     sim_otu_table(otu_table.SampleIds, otu_table.ObservationIds, otu_table.iterSamples(),
                   otu_table.ObservationMetadata, tree, opts.num, opts.dissim)

    rich_table = table_factory(res_otu_mtx,
                               res_sam_names,
                               res_otus,
                               observation_metadata=res_otu_metadata)
    out_fh.write(format_biom_table(rich_table))
Beispiel #40
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample shouldbe placed into
    bin_function = lambda sample_metadata: sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        #Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    table = parse_biom_table(open(otu_table_fp, 'U'))
    table.addSampleMetadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapseSamplesByMetadata(bin_function,
                                             norm=False,
                                             min_group_size=1)

    # normalize the result if requested by the user
    if normalize:
        result = result.normObservationBySample()

    # write a new BIOM file
    f = open(output_fp, 'w')
    f.write(format_biom_table(result))
    f.close()
Beispiel #41
0
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represent samples corresponding to only a certain value in mapping_field
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    otu_table = parse_biom_table(otu_table_f)

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        try:
            filtered_otu_table = otu_table.filterSamples(
                lambda values, id_, metadata: id_ in sample_ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, format_biom_table(filtered_otu_table)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize
    
    # define a function that returns the bin a sample shouldbe placed into
    bin_function = lambda sample_metadata: sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping,headers,comments = parse_mapping_file(open(mapping_fp,'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category    
    if '&&' in mapping_category:
        new_mapping=[]
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        #Create an array using multiple columns from mapping file
        combinecolorby=mapping_category.split('&&')
        mapping=combine_map_label_cols(combinecolorby,new_mapping)
        
    sample_metadata=mapping_file_to_dict(mapping,headers)
    table = parse_biom_table(open(otu_table_fp,'U'))
    table.addSampleMetadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function  
    result = table.collapseSamplesByMetadata(bin_function, norm=False,
        min_group_size=1)
    
    # normalize the result if requested by the user
    if normalize:
        result = result.normObservationBySample()
    
    # write a new BIOM file
    f = open(output_fp,'w')
    f.write(format_biom_table(result))
    f.close()
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')\n\n"' %
                 (abspath(step1_failures_fasta_fp),
                  abspath(step2_input_fasta_fp), percent_subsample))

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,
                                                   'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write(
        '# Write non-singleton otus representative sequences from ' +
        'step 2 and step 4 to the final representative set and the new reference'
        + ' set (%s and %s respectively)\n\n' %
        (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
def pick_subsampled_open_reference_otus(input_fp, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])
    
    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in 
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
       prefilter_refseqs_fp = refseqs_fp
    
    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])
            
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])
            
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            
        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])
        
        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
    
    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])
    
    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])
    
    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])
    
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures', 
                          step3_filter_fasta_cmd)])
        
        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',step4_rep_set_cmd)])
        
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])    
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                      'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])
    
    command_handler(commands,
        status_update_callback,
        logger=logger,
        close_logger_on_success=False)
    commands = []
    
    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)
    
    ## make the final representative seqs file and a new refseqs file that 
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be. 
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be 
    ## reads from this run so we don't hit issues building a tree using 
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write 
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()
    
    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
            
    if close_logger_on_success:
        logger.close()
def iterative_pick_subsampled_open_reference_otus(
                              input_fps, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we 
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp == None:
       prefilter_refseqs_fp = refseqs_fp
    
    otu_table_fps = []
    repset_fasta_fps = []
    for i,input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir,i)
        if iteration_output_exists(iteration_output_dir,min_otu_size):
            # if the output from an iteration already exists, skip that 
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger,[input_fp,refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i,input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                     refseqs_fp=refseqs_fp,
                                     output_dir=iteration_output_dir,
                                     percent_subsample=percent_subsample,
                                     new_ref_set_id='.'.join([new_ref_set_id,str(i)]),
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     run_assign_tax=False,
                                     run_align_and_tree=False,
                                     prefilter_refseqs_fp=prefilter_refseqs_fp,
                                     prefilter_percent_id=prefilter_percent_id,
                                     min_otu_size=min_otu_size,
                                     step1_otu_map_fp=step1_otu_map_fp,
                                     step1_failures_fasta_fp=step1_failures_fasta_fp,
                                     parallel=parallel,
                                     suppress_step4=suppress_step4,
                                     logger=logger,
                                     suppress_md5=suppress_md5,
                                     denovo_otu_picking_method=denovo_otu_picking_method,
                                     reference_otu_picking_method=reference_otu_picking_method,
                                     status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir,min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)
    
    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)        
        commands.append([("Merge OTU tables",merge_cmd)])
    
    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,final_repset_fp)
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
    
    logger.close()
Beispiel #46
0
def iterative_pick_subsampled_open_referenence_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_tax_align_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_referenence_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_tax_align_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        final_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
             (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(otu_table_w_tax_fp, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
Beispiel #47
0
def pick_subsampled_open_referenence_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_tax_align_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(
        logger,
        [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,
                                                   'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0,
            inf,
            0,
            inf,
            negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir
    
    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = linspace(opts.min_fraction_for_core,
                                  opts.max_fraction_for_core,
                                  opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp
    
    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(open(mapping_fp,'U'),
                                                          valid_states)
        if len(sample_ids) < 1:
            option_parser.error(\
             "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %\
             valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the 
        # samples to work with
        sample_ids = None
    
    input_table = parse_biom_table(open(input_fp,'U'))
    
    otu_counts = []
    summary_figure_fp = join(output_dir,'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used 
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)
        
        # prep output files
        output_fp = join(output_dir,'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir,'core_table_%s.biom' % fraction_for_core_str)
        output_f = open(output_fp,'w')
    
        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write("# No OTUs present in %s %% of samples." % fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue
    
        # write some header information to file
        if sample_ids == None:
            output_f.write("# Core OTUs across %s %% of samples.\n" % fraction_for_core_str)
        else:
            output_f.write(\
             "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %\
              (fraction_for_core_str, valid_states,' '.join(sample_ids)))
    
        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iterObservations():
            output_f.write('%s\t%s\n' % (id_,md[otu_md]))
            otu_count += 1
        output_f.close()
    
        # write the core biom table
        output_table_f = open(output_table_fp,'w')
        output_table_f.write(format_biom_table(core_table))
        output_table_f.close()
        
        # append the otu count to the list of counts
        otu_counts.append(otu_count)
    
    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core),max(fractions_for_core))
    ylim(0,max(otu_counts)+1)
    xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
    def setUp(self):
        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.map_file = """#SampleID	Day	time	Description
#This is some comment about the study
1	090809	1200	some description of sample1
2	090809	1800	some description of sample2
3	090909	1200	some description of sample3
4	090909	1800	some description of sample4
5	091009	1200	some description of sample5"""
        self.cat_by_sample = {
            "1": [("Day", "090809"), ("time", "1200")],
            "2": [("Day", "090809"), ("time", "1800")],
            "3": [("Day", "090909"), ("time", "1200")],
            "4": [("Day", "090909"), ("time", "1800")],
            "5": [("Day", "091009"), ("time", "1200")]
        }
        self.sample_by_cat = {
            ("Day", "090809"): ["1", "2"],
            ("Day", "090909"): ["3", "4"],
            ("Day", "091009"): ["5"],
            ("time", "1200"): ["1", "3", "5"],
            ("time", "1800"): ["2", "4"]
        }

        self.num_cats = 2
        self.meta_dict = {
            "1": ["090809	1200", 0],
            "2": ["090809	1800", 0],
            "3": ["090909	1200", 0],
            "4": ["090909	1800", 0],
            "5": ["091009	1200", 0]
        }
        self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
        self.node_labels = [
            "node_name", "node_disp_name", "ntype", "degree",
            "weighted_degree", "consensus_lin", "Day", "time"
        ]
        self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]

        self.otu_table_vals = array([[0, 1, 0, 0, 6], [2, 0, 0, 0, 0],
                                     [0, 0, 3, 1, 0], [0, 0, 0, 0, 5],
                                     [0, 4, 2, 0, 0], [3, 6, 0, 0, 0],
                                     [0, 0, 4, 2, 0], [0, 0, 0, 0, 3],
                                     [2, 0, 0, 5, 0], [0, 2, 0, 4, 0]])

        otu_table_str = format_biom_table(
            table_factory(self.otu_table_vals, ['1', '2', '3', '4', '5'], [
                'otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5', 'otu_6', 'otu_7',
                'otu_8', 'otu_9', 'otu_10'
            ], [None, None, None, None, None], [{
                "taxonomy": ["Bacteria", "Actinobacteria", "Coriobacteridae"]
            }, {
                "taxonomy": [
                    "Bacteria", "Bacteroidetes", "Bacteroidales",
                    "Bacteroidaceae"
                ]
            }, {
                "taxonomy":
                ["Bacteria", "Firmicutes", "Clostridia", "Clostridiales"]
            }, {
                "taxonomy": [
                    "Bacteria", "Spirochaetes", "Spirochaetales",
                    "Spirochaetaceae"
                ]
            }, {
                "taxonomy": [
                    "Bacteria", "Bacteroidetes", "Bacteroidales",
                    "Rikenellaceae"
                ]
            }, {
                "taxonomy": [
                    "Bacteria", "Bacteroidetes", "Bacteroidales",
                    "Dysgonomonaceae"
                ]
            }, {
                "taxonomy": [
                    "Bacteria", "Bacteroidetes", "Bacteroidales",
                    "Odoribacteriaceae"
                ]
            }, {
                "taxonomy": [
                    "Bacteria", "Bacteroidetes", "Bacteroidales",
                    "Dysgonomonaceae", "otu_425"
                ]
            }, {
                "taxonomy": [
                    "Bacteria", "Bacteroidetes", "Bacteroidales",
                    "Dysgonomonaceae", "otu_425"
                ]
            }, {
                "taxonomy": [
                    "Bacteria", "Firmicutes", "Mollicutes",
                    "Clostridium_aff_innocuum_CM970"
                ]
            }]))

        _, self.otu_table_fp = mkstemp(
            dir=self.tmp_dir,
            prefix='test_make_otu_network_otu_table',
            suffix='.biom')
        close(_)
        open(self.otu_table_fp, 'w').write(otu_table_str)

        self.otu_sample_file = """#Full OTU Counts
#OTU ID	1	2	3	4	5	Consensus Lineage
otu_1	0	1	0	0	6	Bacteria; Actinobacteria; Coriobacteridae
otu_2	2	0	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3	0	0	3	1	0	Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4	0	0	0	0	5	Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5	0	4	2	0	0	Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6	3	6	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7	0	0	4	2	0	Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8	0	0	0	0	3	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9	2	0	0	5	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10	0	2	0	4	0	Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

        self.con_by_sample = {
            '1': set(['2', '4']),
            '2': set(['5', '3', '1', '4']),
            '3': set(['4', '2']),
            '4': set(['3', '1', '2']),
            '5': set(['2'])
        }

        self.edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	otu_2	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	otu_4	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "5	otu_8	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	091009	1200",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"
        ]

        self.node_file_str = [
            "1	1	user_node	3	7.0	other	090809	1200",
            "2	2	user_node	4	13.0	other	090809	1800",
            "3	3	user_node	3	9.0	other	090909	1200",
            "4	4	user_node	4	12.0	other	090909	1800",
            "5	5	user_node	3	14.0	other	091009	1200",
            "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
            "otu_2		otu_node	1	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	otu	otu",
            "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
            "otu_4		otu_node	1	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	otu	otu",
            "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
            "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
            "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
            "otu_8		otu_node	1	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
            "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
            "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"
        ]

        self.red_edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	@1	1.0	missed	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	@5	1.0	missed	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"
        ]

        self.red_node_file_str = [
            "1	1	user_node	3	7.0	other	090809	1200",
            "2	2	user_node	4	13.0	other	090809	1800",
            "3	3	user_node	3	9.0	other	090909	1200",
            "4	4	user_node	4	12.0	other	090909	1800",
            "5	5	user_node	3	14.0	other	091009	1200",
            "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
            "@1		otu_collapsed	1	1.0	other	otu	otu",
            "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
            "@5		otu_collapsed	2	2.0	other	otu	otu",
            "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
            "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
            "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
            "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
            "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"
        ]

        self.otu_dc = {1: 3, 2: 7}
        self.sample_dc = {3: 3, 4: 2}
        self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}

        self.num_con_cat = {"Day": 2, "time": 1}
        self.num_con = 6
        self.num_cat = {"Day": 2, "time": 4}
        self.num_cat_less = {"Day": 1, "time": 3}
        self._paths_to_clean_up = [self.otu_table_fp]
        self._dir_to_clean_up = ''
Beispiel #50
0
                sample_metadata=None,
                constructor=SparseOTUTable):
 
 data, sample_ids, otu_ids = parse_otu_map(otu_map_f,delim)
 
 if otu_to_taxonomy != None:
     otu_metadata = []
     for o in otu_ids:
         try:
             otu_metadata.append({'taxonomy':otu_to_taxonomy[o].split(';')})
         except KeyError:
             otu_metadata.append({'taxonomy':["None"]})
 else: 
     otu_metadata = None
 
 if sample_metadata != None:
     raise NotImplementedError,\
      "Passing of sample metadata to make_otu_table is not currently supported."
 try:
     otu_table = table_factory(data, sample_ids, otu_ids, 
                               sample_metadata=sample_metadata, 
                               observation_metadata=otu_metadata, 
                               table_id=table_id, 
                               constructor=constructor,
                               dtype=int)
 except ValueError,e:
     raise ValueError,\
      ("Couldn't create OTU table. Is your OTU map empty?"
       " Original error message: %s" % (str(e)))
 return format_biom_table(otu_table)
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or \
            min_count_fraction != 0 or \
            not isinf(max_count) or \
            otu_ids_to_exclude_fp != None or \
            min_samples !=0 or not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        print otu_table.sum(), min_count

    output_f = open(opts.output_fp, 'w')

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0] for id_, seq in MinimalFastaParser(
                    open(otu_ids_to_exclude_fp, 'U'))
            ])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')
            ])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table, otu_ids_to_keep,
                                                    min_count, max_count,
                                                    min_samples, max_samples,
                                                    negate_ids_to_exclude)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    verbose = opts.verbose

    output_fp = opts.output_fp

    category_mapping_fp = opts.category_mapping_fp
    category_mapping = open(category_mapping_fp, 'U')
    category_mapping = parse_mapping_file(category_mapping)
    individual_column = opts.individual_column
    reference_sample_column = opts.reference_sample_column
    conv_output_fp = opts.converted_otu_table_output_fp
    relative_abundance = opts.relative_abundance

    filter = opts.filter
    test = opts.test
    category = opts.category
    if not category:
        if test != 'paired_T':
            raise ValueError('a category in the category mapping file must be' +\
                ' specified with the -c option for this test')
    threshold = opts.threshold
    if threshold and threshold != 'None':
        threshold = float(threshold)

    otu_include_fp = opts.otu_include_fp
    if otu_include_fp and otu_include_fp != 'None':
        otu_include = open(otu_include_fp)
    else:
        otu_include = None

    otu_table_fp = opts.otu_table_fp
    if not isdir(opts.otu_table_fp):
        # if single file, process normally
        otu_table = open(otu_table_fp, 'U')
        try:
            otu_table = parse_biom_table(otu_table)
        except AttributeError:
            otu_table = parse_biom_table_str(otu_table)
        #synchronize the mapping file with the otu table
        category_mapping, removed_samples = sync_mapping_to_otu_table(otu_table, \
                        category_mapping)
        if removed_samples:
            print "Warning, the following samples were in the category mapping file " +\
                            "but not the OTU table and will be ignored: "
            for i in removed_samples:
                print i + '\n'

        if test == 'longitudinal_correlation' or test == 'paired_T':
            converted_otu_table = longitudinal_otu_table_conversion_wrapper(
                otu_table, category_mapping, individual_column,
                reference_sample_column)
            if conv_output_fp:
                of = open(conv_output_fp, 'w')
                of.write(format_biom_table(converted_otu_table))
                of.close()
            if test == 'longitudinal_correlation':
                #set the otu_include list to all of the OTUs, this effectively
                #deactivates the filter for correlation, because the filtered OTU_list is
                #rewritten with the otu_include list in the test_wrapper
                if not otu_include:
                    otu_include = set(otu_table.ObservationIds)
                output = test_wrapper('correlation', converted_otu_table, \
                    category_mapping, category, threshold, filter, otu_include, \
                    999999999.0, True)
            elif test == 'paired_T':
                output = test_wrapper('paired_T', converted_otu_table, \
                    category_mapping, category, threshold, \
                    filter, otu_include, 999999999.0, True, \
                    individual_column, reference_sample_column)
        else:
            output = test_wrapper(test, otu_table, category_mapping, \
                category, threshold, filter, otu_include, \
                otu_table_relative_abundance=relative_abundance)
    else:
        if test != 'longitudinal_correlation' and test != 'paired_T':
            otu_table_paths = glob('%s/*biom' % otu_table_fp)
            # if directory, get aggregated results
            parsed_otu_tables = []
            for path in otu_table_paths:
                ot = open(path, 'U')
                ot = parse_biom_table(ot)
                parsed_otu_tables.append(ot)

            #synchronize the mapping file with the otu table
            #checks with just the first OTU table and assumes that all otu tables
            #have the same collection of samples
            category_mapping, removed_samples = sync_mapping_to_otu_table(parsed_otu_tables[0], \
                        category_mapping)
            if removed_samples:
                print "Warning, the following samples were in the category mapping file " +\
                            "but not the OTU table and will be ignored: "
                for i in removed_samples:
                    print i + '\n'

            output = test_wrapper_multiple(test, parsed_otu_tables, \
                category_mapping, category, threshold, filter, otu_include,\
                otu_table_relative_abundance=relative_abundance)
        else:
            raise ValueError(
                "the longitudinal_correlation and paired_T options cannot be run on a directory"
            )

    of = open(output_fp, 'w')
    of.write('\n'.join(output))
    of.close()
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or not isinf(max_samples)):
        option_parser.error("No filtering requested. Must provide either "
                            "min counts, max counts, min samples, max samples, min_count_fraction, "
                            "or exclude_fp (or some combination of those).")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        print otu_table.sum(), min_count

    output_f = open(opts.output_fp, 'w')

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([id_.strip().split()[0]
                                      for id_, seq in MinimalFastaParser(open(otu_ids_to_exclude_fp, 'U'))])
        else:
            otu_ids_to_exclude = set([l.strip().split('\t')[0]
                                      for l in open(otu_ids_to_exclude_fp, 'U')])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
Beispiel #54
0
    def setUp(self):
        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.map_file = """#SampleID	Day	time	Description
#This is some comment about the study
1	090809	1200	some description of sample1
2	090809	1800	some description of sample2
3	090909	1200	some description of sample3
4	090909	1800	some description of sample4
5	091009	1200	some description of sample5"""
        self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                              "2": [("Day", "090809"), ("time", "1800")],
                              "3": [("Day", "090909"), ("time", "1200")],
                              "4": [("Day", "090909"), ("time", "1800")],
                              "5": [("Day", "091009"), ("time", "1200")]}
        self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                              ("Day", "090909"): ["3", "4"],
                              ("Day", "091009"): ["5"],
                              ("time", "1200"): ["1", "3", "5"],
                              ("time", "1800"): ["2", "4"]}

        self.num_cats = 2
        self.meta_dict = {"1": ["090809	1200", 0],
                          "2": ["090809	1800", 0],
                          "3": ["090909	1200", 0],
                          "4": ["090909	1800", 0],
                          "5": ["091009	1200", 0]}
        self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
        self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                            "weighted_degree", "consensus_lin", "Day", "time"]
        self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]

        self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                     [2, 0, 0, 0, 0],
                                     [0, 0, 3, 1, 0],
                                     [0, 0, 0, 0, 5],
                                     [0, 4, 2, 0, 0],
                                     [3, 6, 0, 0, 0],
                                     [0, 0, 4, 2, 0],
                                     [0, 0, 0, 0, 3],
                                     [2, 0, 0, 5, 0],
                                     [0, 2, 0, 4, 0]])

        otu_table_str = format_biom_table(table_factory(self.otu_table_vals,
                                                        ['1',
                                                         '2',
                                                         '3',
                                                         '4',
                                                         '5'],
                                                        ['otu_1',
                                                         'otu_2',
                                                         'otu_3',
                                                         'otu_4',
                                                         'otu_5',
                                                         'otu_6',
                                                         'otu_7',
                                                         'otu_8',
                                                         'otu_9',
                                                         'otu_10'],
                                                        [None,
                                                         None,
                                                         None,
                                                         None,
                                                         None],
                                                        [{"taxonomy": ["Bacteria", "Actinobacteria", "Coriobacteridae"]},
                                                         {"taxonomy": ["Bacteria",
                                                                       "Bacteroidetes",
                                                                       "Bacteroidales",
                                                                       "Bacteroidaceae"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Firmicutes",
                                                                          "Clostridia",
                                                                          "Clostridiales"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Spirochaetes",
                                                                          "Spirochaetales",
                                                                          "Spirochaetaceae"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Bacteroidetes",
                                                                          "Bacteroidales",
                                                                          "Rikenellaceae"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Bacteroidetes",
                                                                          "Bacteroidales",
                                                                          "Dysgonomonaceae"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Bacteroidetes",
                                                                          "Bacteroidales",
                                                                          "Odoribacteriaceae"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Bacteroidetes",
                                                                          "Bacteroidales",
                                                                          "Dysgonomonaceae",
                                                                          "otu_425"]},
                                                            {"taxonomy": ["Bacteria",
                                                                          "Bacteroidetes",
                                                                          "Bacteroidales",
                                                                          "Dysgonomonaceae",
                                                                          "otu_425"]},
                                                            {"taxonomy": ["Bacteria", "Firmicutes", "Mollicutes", "Clostridium_aff_innocuum_CM970"]}]))

        self.otu_table_fp = get_tmp_filename(tmp_dir=self.tmp_dir,
                                             prefix='test_make_otu_network_otu_table', suffix='.biom')
        open(self.otu_table_fp, 'w').write(otu_table_str)

        self.otu_sample_file = """#Full OTU Counts
#OTU ID	1	2	3	4	5	Consensus Lineage
otu_1	0	1	0	0	6	Bacteria; Actinobacteria; Coriobacteridae
otu_2	2	0	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3	0	0	3	1	0	Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4	0	0	0	0	5	Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5	0	4	2	0	0	Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6	3	6	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7	0	0	4	2	0	Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8	0	0	0	0	3	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9	2	0	0	5	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10	0	2	0	4	0	Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

        self.con_by_sample = {
            '1': set(['2', '4']), '2': set(['5', '3', '1', '4']),
            '3': set(['4', '2']), '4': set(['3', '1', '2']),
            '5': set(['2'])}

        self.edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	otu_2	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	otu_4	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "5	otu_8	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	091009	1200",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"]

        self.node_file_str = ["1	1	user_node	3	7.0	other	090809	1200",
                              "2	2	user_node	4	13.0	other	090809	1800",
                              "3	3	user_node	3	9.0	other	090909	1200",
                              "4	4	user_node	4	12.0	other	090909	1800",
                              "5	5	user_node	3	14.0	other	091009	1200",
                              "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
                              "otu_2		otu_node	1	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	otu	otu",
                              "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
                              "otu_4		otu_node	1	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	otu	otu",
                              "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
                              "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
                              "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
                              "otu_8		otu_node	1	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                              "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                              "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"]

        self.red_edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	@1	1.0	missed	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	@5	1.0	missed	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"]

        self.red_node_file_str = ["1	1	user_node	3	7.0	other	090809	1200",
                                  "2	2	user_node	4	13.0	other	090809	1800",
                                  "3	3	user_node	3	9.0	other	090909	1200",
                                  "4	4	user_node	4	12.0	other	090909	1800",
                                  "5	5	user_node	3	14.0	other	091009	1200",
                                  "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
                                  "@1		otu_collapsed	1	1.0	other	otu	otu",
                                  "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
                                  "@5		otu_collapsed	2	2.0	other	otu	otu",
                                  "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
                                  "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
                                  "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
                                  "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                                  "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"]

        self.otu_dc = {1: 3, 2: 7}
        self.sample_dc = {3: 3, 4: 2}
        self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}

        self.num_con_cat = {"Day": 2,
                            "time": 1}
        self.num_con = 6
        self.num_cat = {"Day": 2,
                        "time": 4}
        self.num_cat_less = {"Day": 1,
                             "time": 3}
        self._paths_to_clean_up = [self.otu_table_fp]
        self._dir_to_clean_up = ''
Beispiel #55
0
 def test_format_biom_table(self):
     """ Formatting of BIOM table correctly includes "generated-by" information 
     """
     generated_by = "QIIME " + get_qiime_library_version()
     self.assertTrue(generated_by in format_biom_table(self.biom1))
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_include_fp = opts.otu_include_fp
    output_fp = opts.output_fp
    verbose = opts.verbose
    category_mapping_fp = opts.category_mapping_fp
    individual_column = opts.individual_column
    reference_sample_column = opts.reference_sample_column
    conv_output_fp = opts.converted_otu_table_output_fp
    relative_abundance = opts.relative_abundance
    filter = opts.filter
    test = opts.test
    category = opts.category
    threshold = opts.threshold
    collate_results = opts.collate_results

    # check mapping file
    category_mapping = open(category_mapping_fp, 'U')
    category_mapping = parse_mapping_file(category_mapping)
    if not category:
        if test != 'paired_T':
            option_parser.error(
                'a category in the category mapping file must be'
                ' specified with the -c option for this test')

    # set up threshold value for filtering, if any
    if threshold and threshold != 'None':
        threshold = float(threshold)

    # if specifying a list of OTUs to look at specifically
    if otu_include_fp and otu_include_fp != 'None':
        otu_include = open(otu_include_fp)
    else:
        otu_include = None

    # if only passing in a single OTU table
    if isdir(otu_table_fp) is False:
        # raise error if collate option is being passed to single table
        if collate_results is True:
            option_parser.error(
                'Cannot collate the results of only one table.'
                ' Please rerun the command without passing the -w option')
        else:
            #open and parse the biom table fp
            otu_table = parse_biom_table(open(otu_table_fp, 'U'))

            # run the statistical test
            output = test_wrapper(
                test,
                otu_table,
                category_mapping,
                category,
                threshold,
                filter,
                otu_include,
                otu_table_relative_abundance=relative_abundance)

            # write output
            output_file = open(output_fp, 'w')
            output_file.write('\n'.join(output))
            output_file.close()

    # if the user has passed in a directory
    if isdir(otu_table_fp) is True:

        # negate_collate to return an results file on a per table basis
        if collate_results is False:

            # build list of otu tables
            otu_table_paths = glob('%s/*biom' % otu_table_fp)

            # if output dir doesn't exist, then make it
            if exists(output_fp):
                pass
            else:
                makedirs(output_fp)

            for otu_table_fp in otu_table_paths:
                #open and parse the biom table fp
                otu_table = parse_biom_table(open(otu_table_fp, 'U'))

                #synchronize the mapping file with the otu table
                category_mapping, removed_samples = \
                    sync_mapping_to_otu_table(otu_table, category_mapping)
                if removed_samples:
                    print "Warning, the following samples were in the category mapping file " +\
                                    "but not the OTU table and will be ignored: "
                    for i in removed_samples:
                        print i + '\n'

                # create naming convention for output file
                # will look like: otu_table_ANOVA_Treatment.txt
                output_basename = basename(otu_table_fp)
                output_basename = output_basename.replace(".biom", "")
                output_fp_sweep = "%s_%s_%s.txt" % \
                    (output_basename,test,category)

                # if the convert_otu_table_fp is passed, save the converted table
                if test == 'longitudinal_correlation' or test == 'paired_T':
                    converted_otu_table = longitudinal_otu_table_conversion_wrapper(
                        table, category_mapping, individual_column,
                        reference_sample_column)
                    if conv_output_fp:
                        of = open(conv_output_fp, 'w')
                        of.write(format_biom_table(converted_otu_table))
                        of.close()
                    if test == 'longitudinal_correlation':
                        #set the otu_include list to all of the OTUs, this effectively
                        #deactivates the filter for correlation, because the filtered OTU_list is
                        #rewritten with the otu_include list in the test_wrapper
                        if not otu_include:
                            otu_include = set(otu_table.ObservationIds)
                        output = test_wrapper('correlation',
                                              converted_otu_table,
                                              category_mapping, category,
                                              threshold, filter, otu_include,
                                              999999999.0, True)
                    elif test == 'paired_T':
                        output = test_wrapper('paired_T', converted_otu_table,
                                              category_mapping, category,
                                              threshold, filter, otu_include,
                                              999999999.0, True,
                                              individual_column,
                                              reference_sample_column)

                # run test single input table from the directory
                else:
                    output = test_wrapper(
                        test,
                        otu_table,
                        category_mapping,
                        category,
                        threshold,
                        filter,
                        otu_include,
                        otu_table_relative_abundance=relative_abundance)

                # write output file with new naming convention
                output_file = open(join(output_fp, output_fp_sweep), 'w')
                output_file.write('\n'.join(output))
                output_file.close()

        # Use when the input dir contains rarefied OTU tables, and you want
        # to collate the p-values & results into one results file
        if collate_results is True:
            if test != 'longitudinal_correlation' and test != 'paired_T':
                # get biom tables
                otu_table_paths = glob('%s/*biom' % otu_table_fp)

                #get aggregated tables
                parsed_otu_tables = []
                for otu_table_fp in otu_table_paths:
                    otu_table = open(otu_table_fp, 'U')
                    otu_table = parse_biom_table(otu_table)
                    parsed_otu_tables.append(otu_table)

                #synchronize the mapping file with the otu table
                #checks with just the first OTU table and assumes that all otu tables
                #have the same collection of samples
                category_mapping, removed_samples = \
                    sync_mapping_to_otu_table(parsed_otu_tables[0],category_mapping)
                if removed_samples:
                    print "Warning, the following samples were in the category mapping file " +\
                                "but not the OTU table and will be ignored: "
                    for i in removed_samples:
                        print i + '\n'

                # get output from statistical test
                output = test_wrapper_multiple(
                    test,
                    parsed_otu_tables,
                    category_mapping,
                    category,
                    threshold,
                    filter,
                    otu_include,
                    otu_table_relative_abundance=relative_abundance)

                #write out aggregated results
                output_file = open(output_fp, 'w')
                output_file.write('\n'.join(output))
                output_file.close()

            else:
                option_parser.error(
                    "You cannot collate the results obtained from "
                    "using the longitudinal_correlation and paired_T options.")