def main():
    """Filter a BIOM table's observations (OTUs) by taxonomy metadata.

    Reads the input table, builds a taxonomy-based filter function from the
    positive/negative taxa lists, filters in place, and writes the result.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    # Split comma-separated taxa lists; a missing option stays None.
    # (The original "else: x = None" branches were no-ops and are removed.)
    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')
    write_biom_table(input_table, output_table_fp)
def main():
    """Build a BIOM OTU table from an OTU map.

    Optionally attaches per-OTU taxonomy, excludes listed OTU ids, and adds
    per-sample metadata parsed from a QIIME mapping file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    exclude_otus_fp = opts.exclude_otus_fp

    # Optional per-OTU taxonomy assignments. Use a context manager so the
    # taxonomy file is closed when done (the original leaked the handle).
    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        with open(opts.taxonomy_fname, 'U') as infile:
            otu_to_taxonomy = parse_taxonomy(infile)

    # Optional OTU ids to drop; fasta input is detected by extension,
    # anything else is treated as a sequence-id file.
    ids_to_exclude = []
    if exclude_otus_fp:
        with open(exclude_otus_fp, 'U') as exclude_f:
            if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
                ids_to_exclude = get_seq_ids_from_fasta_file(exclude_f)
            else:
                ids_to_exclude = get_seq_ids_from_seq_id_file(exclude_f)

    # Optional per-sample metadata from a QIIME mapping file.
    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
        sample_metadata = mapping_file_to_dict(mapping_data, mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)
    write_biom_table(biom_otu_table, opts.output_biom_fp)
def _call_cleanup(self, input_fp, output_dir, params, job_prefix,
                  poll_directly, suppress_submit_jobs):
    """ Called as the last step in __call__. """
    if not poll_directly:
        # When not polling directly the final observation map won't have
        # been created yet, so the biom file can't be constructed here.
        return

    metadata_fp = params['observation_metadata_fp']
    if metadata_fp is None:
        observation_metadata = None
    else:
        observation_metadata = parse_observation_metadata(
            open(metadata_fp, 'U'))

    biom_fp = join(output_dir, 'observation_table.biom')
    observation_map_f = open(join(output_dir, 'observation_map.txt'), 'U')
    biom_table = make_otu_table(observation_map_f, observation_metadata)
    write_biom_table(biom_table, biom_fp)
def main():
    """Filter a BIOM table's observations by taxonomy metadata.

    Identical to the basic taxonomy filter, but reports a clean CLI error
    when filtering removes every OTU.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    # Split comma-separated taxa lists; a missing option stays None.
    # (The original "else: x = None" branches were no-ops and are removed.)
    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')
    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')
    try:
        write_biom_table(input_table, output_table_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no OTUs remained after filtering.")
def main():
    """Build a BIOM OTU table from an OTU map.

    Optionally attaches per-OTU taxonomy, excludes listed OTU ids, and
    assembles per-sample metadata from a QIIME mapping file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    exclude_otus_fp = opts.exclude_otus_fp

    # Optional per-OTU taxonomy; close the file when done (the original
    # leaked the handle).
    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        with open(opts.taxonomy_fname, 'U') as infile:
            otu_to_taxonomy = parse_taxonomy(infile)

    # Optional OTU ids to drop; fasta input is detected by extension,
    # anything else is treated as a sequence-id file.
    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    # Optional per-sample metadata assembled from a QIIME mapping file.
    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
        sample_metadata = assemble_sample_metadata(mapping_data,
                                                   mapping_header,
                                                   mapping_comments)

    biom_otu_table = make_otu_table(open(opts.otu_map_fp, 'U'),
                                    otu_to_taxonomy=otu_to_taxonomy,
                                    otu_ids_to_exclude=ids_to_exclude,
                                    sample_metadata=sample_metadata)
    write_biom_table(biom_otu_table, opts.output_biom_fp)
def split_otu_table_on_taxonomy_to_files(otu_table_fp, level, output_dir,
                                         md_identifier='taxonomy',
                                         md_processor=process_md_as_list):
    """ Split OTU table by taxonomic level, writing otu tables to output dir

    otu_table_fp : path to the input BIOM table
    level : taxonomic level to split at
    output_dir : directory the per-taxon tables are written to
    md_identifier : observation metadata key holding the taxonomy
    md_processor : callable extracting the bin label from observation metadata

    Returns the list of output file paths.
    """
    results = []
    otu_table = load_table(otu_table_fp)
    create_dir(output_dir)

    def split_f(id_, obs_md):
        # Translate low-level errors from the metadata processor into
        # messages that point the user at the relevant CLI parameter.
        try:
            result = md_processor(obs_md, md_identifier, level)
        except KeyError:
            # Fixed typo in user-facing message: "observerations".
            raise KeyError("Metadata identifier (%s) is not associated with "
                           "all (or any) observations. You can modify the "
                           "key with the md_identifier parameter." %
                           md_identifier)
        except TypeError:
            raise TypeError("Can't correctly process the metadata string. If "
                            "your input file was generated from QIIME 1.4.0 or"
                            " earlier you may need to pass --md_as_string.")
        except AttributeError:
            raise AttributeError("Metadata category not found. If your input "
                                 "file was generated from QIIME 1.4.0 or "
                                 "earlier you may need to pass --md_identifier"
                                 " \"Consensus Lineage\".")
        return result

    # `bin_label` instead of `bin` to avoid shadowing the builtin.
    for bin_label, sub_otu_table in otu_table.partition(split_f,
                                                        axis='observation'):
        output_fp = '%s/otu_table_%s.biom' % (output_dir, bin_label)
        write_biom_table(sub_otu_table, output_fp)
        results.append(output_fp)
    return results
def main():
    """Split a BIOM table and its mapping file on a mapping-file field.

    Writes one mapping file and one BIOM table per distinct field value
    into output_dir.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    mapping_field = opts.mapping_field
    output_dir = opts.output_dir
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    create_dir(output_dir)

    # split mapping file (use context managers so handles are closed;
    # the original leaked every file it opened here)
    with open(mapping_fp, 'U') as mapping_f:
        for fp_str, sub_mapping_s in split_mapping_file_on_field(
                mapping_f, mapping_field):
            mapping_output_fp = join(output_dir, 'mapping_%s.txt' % fp_str)
            with open(mapping_output_fp, 'w') as out_f:
                out_f.write(sub_mapping_s)

    # split otu table
    otu_table_base_name = splitext(split(otu_table_fp)[1])[0]
    otu_table = load_table(otu_table_fp)
    try:
        with open(mapping_fp, 'U') as mapping_f:
            for fp_str, sub_otu_table_s in split_otu_table_on_sample_metadata(
                    otu_table, mapping_f, mapping_field):
                otu_table_output_fp = join(output_dir, '%s_%s.biom' % (
                    otu_table_base_name, fp_str))
                write_biom_table(sub_otu_table_s, otu_table_output_fp)
    except OTUTableSplitError as e:
        option_parser.error(e)
def setUp(self):
    """Write the L19 test tables to temp files and build a matching tree."""
    self.tmp_dir = get_qiime_temp_dir()
    # 19-sample x 9-taxon "L19" abundance matrix (samples as rows here).
    self.l19_data = np.array([[7, 1, 0, 0, 0, 0, 0, 0, 0],
                              [4, 2, 0, 0, 0, 1, 0, 0, 0],
                              [2, 4, 0, 0, 0, 1, 0, 0, 0],
                              [1, 7, 0, 0, 0, 0, 0, 0, 0],
                              [0, 8, 0, 0, 0, 0, 0, 0, 0],
                              [0, 7, 1, 0, 0, 0, 0, 0, 0],
                              [0, 4, 2, 0, 0, 0, 2, 0, 0],
                              [0, 2, 4, 0, 0, 0, 1, 0, 0],
                              [0, 1, 7, 0, 0, 0, 0, 0, 0],
                              [0, 0, 8, 0, 0, 0, 0, 0, 0],
                              [0, 0, 7, 1, 0, 0, 0, 0, 0],
                              [0, 0, 4, 2, 0, 0, 0, 3, 0],
                              [0, 0, 2, 4, 0, 0, 0, 1, 0],
                              [0, 0, 1, 7, 0, 0, 0, 0, 0],
                              [0, 0, 0, 8, 0, 0, 0, 0, 0],
                              [0, 0, 0, 7, 1, 0, 0, 0, 0],
                              [0, 0, 0, 4, 2, 0, 0, 0, 4],
                              [0, 0, 0, 2, 4, 0, 0, 0, 1],
                              [0, 0, 0, 1, 7, 0, 0, 0, 0]])
    self.l19_sample_names = [
        'sam1', 'sam2', 'sam3', 'sam4', 'sam5', 'sam6', 'sam7', 'sam8',
        'sam9', 'sam_middle', 'sam11', 'sam12', 'sam13', 'sam14', 'sam15',
        'sam16', 'sam17', 'sam18', 'sam19'
    ]
    self.l19_taxon_names = [
        'tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon', 'tax6', 'tax7',
        'tax8', 'tax9'
    ]
    # Same taxa, but the first id contains an underscore (id-quoting case).
    self.l19_taxon_names_w_underscore = [
        'ta_x1', 'tax2', 'tax3', 'tax4', 'endbigtaxon', 'tax6', 'tax7',
        'tax8', 'tax9'
    ]

    # Transposed because Table expects observations (taxa) as rows.
    l19 = Table(self.l19_data.T, self.l19_taxon_names,
                self.l19_sample_names)
    # NOTE(review): suffix '.blom' looks like a typo for '.biom' — harmless
    # for a temp file name, but confirm it is intentional.
    fd, self.l19_fp = mkstemp(dir=self.tmp_dir,
                              prefix='test_bdiv_otu_table',
                              suffix='.blom')
    os.close(fd)
    write_biom_table(l19, self.l19_fp)

    l19_w_underscore = Table(self.l19_data.T,
                             self.l19_taxon_names_w_underscore,
                             self.l19_sample_names)
    fd, self.l19_w_underscore_fp = mkstemp(dir=self.tmp_dir,
                                           prefix='test_bdiv_otu_table',
                                           suffix='.blom')
    os.close(fd)
    write_biom_table(l19_w_underscore, self.l19_w_underscore_fp)

    # Newick tree covering the same taxon ids.
    self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
 ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
    self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

    self.files_to_remove = [self.l19_fp, self.l19_w_underscore_fp]
    self.folders_to_remove = []
def main():
    """Filter samples from a BIOM table by metadata description, counts,
    and/or an explicit sample-id list; optionally write a filtered mapping
    file alongside the filtered table."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # At least one filtering criterion is required.
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        # Start from the samples matching the metadata description.
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        # Close the sample-id file when done (the original leaked the
        # handle; the sibling version of this script closes it).
        with open(sample_id_fp, 'U') as sample_id_f:
            sample_id_f_ids = set([l.strip().split()[0]
                                   for l in sample_id_f
                                   if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        with open(output_mapping_fp, 'w') as out_f:
            out_f.write(format_mapping_file(mapping_headers, mapping_data))
def main():
    """Filter samples from a BIOM table by mapping-file metadata, counts,
    and/or an explicit sample-id list; optionally write a filtered mapping
    file."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # --valid_states is meaningless without a mapping file to query.
    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")
    # At least one filtering criterion must be requested.
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those)."
        )
    # Mixing metadata-based and id-list-based selection is rejected.
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        # Keep samples matching the metadata description.
        sample_ids_to_keep = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)
        # Negation only applies to id-list based selection.
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        # First whitespace-delimited token per non-comment line is an id.
        o = open(sample_id_fp, "U")
        sample_id_f_ids = set([l.strip().split()[0] for l in o if not l.startswith("#")])
        o.close()
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count,
        negate_ids_to_keep=negate_sample_id_fp
    )
    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering."
        )

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp, "U"))
        mapping_headers, mapping_data = filter_mapping_file(mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, "w").write(format_mapping_file(mapping_headers, mapping_data))
def _generate_biom_output(self, observation_map_fp, output_biom_fp,
                          observation_metadata_fp):
    """Build a BIOM table from the observation map and write it, attaching
    parsed observation metadata when a metadata file was supplied."""
    observation_metadata = None
    if observation_metadata_fp is not None:
        observation_metadata = parse_taxonomy(
            open(observation_metadata_fp, "U"))
    table = make_otu_table(open(observation_map_fp, "U"),
                           observation_metadata)
    write_biom_table(table, output_biom_fp)
def main():
    """Split a BIOM table (and mapping file) into per-group outputs for every
    non-empty combination of values of the requested mapping fields."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    # NOTE(review): sibling scripts pass an open file handle to
    # parse_mapping_file; passing the bare path here looks suspicious —
    # confirm parse_mapping_file accepts a filepath.
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both.
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample')
                       if s in shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)

    # check that headers in mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields,
                                                              mheaders,
                                                              mdata)
    for sg, vg in zip(sample_groups, value_groups):
        # Build a "__field_value_...__" suffix for the output file names.
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
def main():
    """Filter observations (OTUs) from a BIOM table by counts, sample
    prevalence, and/or an explicit exclusion list."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples
    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    # At least one filtering criterion must be requested.
    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or
            not isinf(max_samples)):
        option_parser.error("No filtering requested. Must provide either "
                            "min counts, max counts, min samples, max samples, min_count_fraction, "
                            "or exclude_fp (or some combination of those).")

    otu_table = load_table(opts.input_fp)

    if min_count_fraction > 0:
        # Interpret the fraction relative to the table's total count.
        min_count = otu_table.sum() * min_count_fraction
        # (removed a leftover Python 2 debug `print` of the totals)

    # Bug fix: biom's Table exposes no `observation_ids` attribute;
    # observation ids are retrieved with ids(axis='observation'), as the
    # sibling version of this script already does.
    otu_ids_to_keep = set(otu_table.ids(axis='observation'))

    if otu_ids_to_exclude_fp:
        # Fasta files contribute their sequence ids; anything else is
        # read as a tab-separated id file.
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set(
                [id_.strip().split()[0] for id_, seq in
                 parse_fasta(open(otu_ids_to_exclude_fp, 'U'))])
        else:
            otu_ids_to_exclude = set(
                [l.strip().split('\t')[0] for l in
                 open(otu_ids_to_exclude_fp, 'U')])
        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    write_biom_table(filtered_otu_table, opts.output_fp)
def _write_rarefaction(self, depth, rep, sub_otu_table):
    """ depth and rep can be numbers or strings """
    if sub_otu_table.is_empty():
        # Nothing worth writing for an empty rarefied table.
        return
    out_path = os.path.join(self.output_dir,
                            'rarefaction_%s_%s.biom' % (depth, rep))
    write_biom_table(sub_otu_table, out_path)
def setUp(self):
    """Write the L19 test tables to temp files and build a matching tree."""
    self.qiime_config = load_qiime_config()
    # Fall back to /tmp when no temp_dir is configured.
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # 19-sample x 9-taxon "L19" abundance matrix (samples as rows here).
    self.l19_data = np.array([
        [7, 1, 0, 0, 0, 0, 0, 0, 0],
        [4, 2, 0, 0, 0, 1, 0, 0, 0],
        [2, 4, 0, 0, 0, 1, 0, 0, 0],
        [1, 7, 0, 0, 0, 0, 0, 0, 0],
        [0, 8, 0, 0, 0, 0, 0, 0, 0],
        [0, 7, 1, 0, 0, 0, 0, 0, 0],
        [0, 4, 2, 0, 0, 0, 2, 0, 0],
        [0, 2, 4, 0, 0, 0, 1, 0, 0],
        [0, 1, 7, 0, 0, 0, 0, 0, 0],
        [0, 0, 8, 0, 0, 0, 0, 0, 0],
        [0, 0, 7, 1, 0, 0, 0, 0, 0],
        [0, 0, 4, 2, 0, 0, 0, 3, 0],
        [0, 0, 2, 4, 0, 0, 0, 1, 0],
        [0, 0, 1, 7, 0, 0, 0, 0, 0],
        [0, 0, 0, 8, 0, 0, 0, 0, 0],
        [0, 0, 0, 7, 1, 0, 0, 0, 0],
        [0, 0, 0, 4, 2, 0, 0, 0, 4],
        [0, 0, 0, 2, 4, 0, 0, 0, 1],
        [0, 0, 0, 1, 7, 0, 0, 0, 0]
    ])
    self.l19_sample_names = [
        'sam1', 'sam2', 'sam3', 'sam4', 'sam5', 'sam6', 'sam7', 'sam8',
        'sam9', 'sam_middle', 'sam11', 'sam12', 'sam13', 'sam14', 'sam15',
        'sam16', 'sam17', 'sam18', 'sam19']
    self.l19_taxon_names = ['tax1', 'tax2', 'tax3', 'tax4', 'endbigtaxon',
                            'tax6', 'tax7', 'tax8', 'tax9']
    # Same taxa, but the first id contains an underscore (id-quoting case).
    self.l19_taxon_names_w_underscore = ['ta_x1', 'tax2', 'tax3', 'tax4',
                                         'endbigtaxon', 'tax6', 'tax7',
                                         'tax8', 'tax9']

    # Transposed because Table expects observations (taxa) as rows.
    l19 = Table(self.l19_data.T, self.l19_taxon_names,
                self.l19_sample_names)
    # NOTE(review): suffix '.blom' looks like a typo for '.biom' — harmless
    # for a temp file name, but confirm it is intentional.
    fd, self.l19_fp = mkstemp(dir=self.tmp_dir,
                              prefix='test_bdiv_otu_table',
                              suffix='.blom')
    os.close(fd)
    write_biom_table(l19, self.l19_fp)

    l19_w_underscore = Table(self.l19_data.T,
                             self.l19_taxon_names_w_underscore,
                             self.l19_sample_names)
    fd, self.l19_w_underscore_fp = mkstemp(dir=self.tmp_dir,
                                           prefix='test_bdiv_otu_table',
                                           suffix='.blom')
    os.close(fd)
    write_biom_table(l19_w_underscore, self.l19_w_underscore_fp)

    # Newick tree covering the same taxon ids.
    self.l19_tree_str = '((((tax7:0.1,tax3:0.2):.98,tax8:.3, tax4:.3):.4,\
 ((tax1:0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, endbigtaxon:.08));'
    self.l19_tree = parse_newick(self.l19_tree_str, PhyloNode)

    self.files_to_remove = [self.l19_fp, self.l19_w_underscore_fp]
    self.folders_to_remove = []
def main():
    """Filter observations (OTUs) from a BIOM table by counts, sample
    prevalence, and/or an explicit exclusion list."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples
    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    # At least one filtering criterion must be requested.
    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or
            not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    otu_table = load_table(opts.input_fp)

    if min_count_fraction > 0:
        # Interpret the fraction relative to the table's total count.
        min_count = otu_table.sum() * min_count_fraction
        # (removed a leftover Python 2 debug `print` of the totals)

    otu_ids_to_keep = set(otu_table.ids(axis='observation'))

    if otu_ids_to_exclude_fp:
        # Fasta files contribute their sequence ids; anything else is
        # read as a tab-separated id file.
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0]
                for id_, seq in parse_fasta(open(otu_ids_to_exclude_fp, 'U'))
            ])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')
            ])
        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    write_biom_table(filtered_otu_table, opts.output_fp)
def main():
    """Merge two or more BIOM tables into one and write the result."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    tables = (load_table(fp) for fp in opts.input_fps)
    # Fold all remaining tables into the first one.
    merged = next(tables)
    for table in tables:
        merged = merged.merge(table)
    write_biom_table(merged, opts.output_fp)
def main():
    """Filter samples from a BIOM table by metadata description, counts,
    and/or a sample-id list; optionally write a matching filtered mapping
    file."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # At least one filtering criterion must be requested.
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        # Keep samples matching the metadata description.
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        # First whitespace-delimited token per non-comment line is an id.
        o = open(sample_id_fp, 'U')
        sample_id_f_ids = set(
            [l.strip().split()[0] for l in o if not l.startswith('#')])
        o.close()
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
def main():
    """Convert a TRFLP text file into a BIOM table."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_path = opts.input_path
    # Fail early with a clear message when the input doesn't exist.
    if not isfile(input_path):
        raise IOError("Input path (%s) not valid. Does it exist?" %
                      input_path)
    samples, otus, data = parse_trflp(open(input_path, 'U'))
    write_biom_table(Table(data, otus, samples), opts.output_path)
def _generate_biom_output(self, observation_map_fp, output_biom_fp,
                          observation_metadata_fp):
    """Write a BIOM table built from the observation map, attaching parsed
    observation metadata when a metadata file was supplied."""
    if observation_metadata_fp is None:
        observation_metadata = None
    else:
        observation_metadata = parse_taxonomy(
            open(observation_metadata_fp, 'U'))
    write_biom_table(
        make_otu_table(open(observation_map_fp, 'U'), observation_metadata),
        output_biom_fp)
def main():
    """Convert a TRFLP text file into a BIOM table."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # Fail early with a clear message when the input doesn't exist.
    if not isfile(opts.input_path):
        raise IOError(
            "Input path (%s) not valid. Does it exist?" % opts.input_path)
    samples, otus, data = parse_trflp(open(opts.input_path, 'U'))
    result = Table(data, otus, samples)
    write_biom_table(result, opts.output_path)
def setUp(self):
    """Write small test tables (with and without metadata) to temp files."""
    self.qiime_config = load_qiime_config()
    # Fall back to /tmp when no temp_dir is configured.
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # 4 taxa x 3 samples toy abundance matrix.
    self.otu_table_data = np.array([[2, 1, 0],
                                    [0, 5, 0],
                                    [0, 3, 0],
                                    [1, 2, 0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{
        'domain': 'Archaea'
    }, {
        'domain': 'Bacteria'
    }, {
        'domain': 'Bacteria'
    }, {
        'domain': 'Bacteria'
    }]

    # Table with empty metadata dicts attached to every axis entry.
    self.otu_table = Table(self.otu_table_data,
                           self.taxon_names,
                           self.sample_names,
                           observation_metadata=[{}, {}, {}, {}],
                           sample_metadata=[{}, {}, {}])
    # Same data, but with real per-observation metadata.
    self.otu_table_meta = Table(self.otu_table_data,
                                self.taxon_names,
                                self.sample_names,
                                observation_metadata=self.otu_metadata)

    fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                    prefix='test_rarefaction',
                                    suffix='.biom')
    close(fd)
    fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='test_rarefaction',
                                         suffix='.biom')
    close(fd)
    self.rare_dir = mkdtemp(dir=self.tmp_dir,
                            prefix='test_rarefaction_dir',
                            suffix='')

    write_biom_table(self.otu_table, self.otu_table_fp)
    write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def simsam_range_to_files(
    table, tree, simulated_sample_sizes, dissimilarities, output_dir,
    mapping_f=None, output_table_basename="table", output_map_basename="map",
):
    """Apply sim_otu_table over a range of parameters, writing output to file.

    table: the input table to simulate samples from
    tree: tree relating OTUs in the input table
    simulated_sample_sizes: ints, output samples to create per input sample
    dissimilarities: floats, dissimilarities to use in simulating tables
    output_dir: directory all output tables and mapping files are written to
    mapping_f: metadata mapping file handle, if mapping files should be
     created for the samples of each simulated table
    output_table_basename: basename for output table files (default: table)
    output_map_basename: basename for output mapping files (default: map)
    """
    create_dir(output_dir)
    sim_results = simsam_range(table, tree, simulated_sample_sizes,
                               dissimilarities, mapping_f)
    for sim_table, map_lines, n_samples, dissim in sim_results:
        table_fp = join(output_dir, "%s_n%d_d%r.biom" %
                        (output_table_basename, n_samples, dissim))
        write_biom_table(sim_table, table_fp)
        if map_lines is not None:
            map_fp = join(output_dir, "%s_n%d_d%r.txt" %
                          (output_map_basename, n_samples, dissim))
            out_f = open(map_fp, "w")
            out_f.write("".join(map_lines))
            out_f.close()
def simsam_range_to_files(table, tree, simulated_sample_sizes,
                          dissimilarities, output_dir,
                          mapping_f=None,
                          output_table_basename="table",
                          output_map_basename="map"):
    """Apply sim_otu_table over a range of parameters, writing output to file.

    table: the input table to simulate samples from
    tree: tree relating OTUs in the input table
    simulated_sample_sizes: ints, output samples to create per input sample
    dissimilarities: floats, dissimilarities to use in simulating tables
    output_dir: directory all output tables and mapping files are written to
    mapping_f: metadata mapping file handle, if mapping files should be
     created for the samples of each simulated table
    output_table_basename: basename for output table files (default: table)
    output_map_basename: basename for output mapping files (default: map)
    """
    create_dir(output_dir)
    for (output_table, output_mapping_lines,
         simulated_sample_size, dissimilarity) in simsam_range(
            table, tree, simulated_sample_sizes, dissimilarities, mapping_f):
        # Encode the parameter combination in each output file name.
        suffix = '_n%d_d%r' % (simulated_sample_size, dissimilarity)
        write_biom_table(
            output_table,
            join(output_dir, output_table_basename + suffix + '.biom'))
        if output_mapping_lines is not None:
            map_fp = join(output_dir,
                          output_map_basename + suffix + '.txt')
            output_map_f = open(map_fp, 'w')
            output_map_f.write(''.join(output_mapping_lines))
            output_map_f.close()
def main():
    """Build a BIOM OTU table from an OTU map, with optional taxonomy,
    per-sequence counts, OTU exclusions, and sample metadata."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    exclude_otus_fp = opts.exclude_otus_fp

    # Optional per-OTU taxonomy assignments.
    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    # Optional per-sequence counts, one "<id> <count>" pair per line.
    if not opts.counts_fname:
        seq_counts = None
    else:
        seq_counts = {}
        with open(opts.counts_fname, 'U') as infile:
            for line in infile:
                (key, val) = line.split()
                # NOTE(review): val is stored as a string, not an int —
                # confirm make_otu_table expects string counts here.
                seq_counts[key] = val

    # Optional OTU ids to drop; fasta input is detected by extension,
    # anything else is treated as a sequence-id file.
    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    # Optional per-sample metadata from a QIIME mapping file.
    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
        sample_metadata = mapping_file_to_dict(mapping_data, mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata,
                                        seq_counts=seq_counts)
    write_biom_table(biom_otu_table, opts.output_biom_fp)
def main():
    """Collapse (summarize) a BIOM table's samples by a mapping-file
    category, optionally normalizing the collapsed table."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed
    # into (a def rather than an assigned lambda, per PEP 8)
    def bin_function(id_, sample_metadata):
        return sample_metadata[mapping_category]

    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)

    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function, norm=False, min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
def setUp(self):
    """Write small test tables (with and without metadata) to temp files."""
    self.qiime_config = load_qiime_config()
    # Fall back to /tmp when no temp_dir is configured.
    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    # 4 taxa x 3 samples toy abundance matrix.
    self.otu_table_data = np.array([[2, 1, 0],
                                    [0, 5, 0],
                                    [0, 3, 0],
                                    [1, 2, 0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{'domain': 'Archaea'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'}]

    # Table with empty metadata dicts attached to every axis entry.
    self.otu_table = Table(self.otu_table_data,
                           self.taxon_names,
                           self.sample_names,
                           observation_metadata=[{}, {}, {}, {}],
                           sample_metadata=[{}, {}, {}])
    # Same data, but with real per-observation metadata.
    self.otu_table_meta = Table(self.otu_table_data,
                                self.taxon_names,
                                self.sample_names,
                                observation_metadata=self.otu_metadata)

    fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                    prefix='test_rarefaction',
                                    suffix='.biom')
    close(fd)
    fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='test_rarefaction',
                                         suffix='.biom')
    close(fd)
    self.rare_dir = mkdtemp(dir=self.tmp_dir,
                            prefix='test_rarefaction_dir',
                            suffix='')

    write_biom_table(self.otu_table, self.otu_table_fp)
    write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def main():
    """Sort a BIOM table's samples by a mapping field, an explicit
    sample-id list, or (by default) natural sort order."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    table = load_table(opts.input_otu_table)
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        # Order samples by the values of a mapping-file field.
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(table, mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        # Order samples by an explicit id list.
        sorted_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(table, sorted_ids)
    else:
        # Default: case-insensitive natural sort of the sample ids.
        result = sort_otu_table(table,
                                natsort_case_insensitive(table.ids()))

    write_biom_table(result, opts.output_fp)
def main():
    """Collapse a BIOM table's samples on one or more mapping-file fields,
    writing both the collapsed table and the collapsed mapping file."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    collapse_fields = opts.collapse_fields.split(',')

    collapsed_metadata, collapsed_table = collapse_samples(
        load_table(opts.input_biom_fp),
        open(opts.mapping_fp, 'U'),
        collapse_fields,
        opts.collapse_mode)

    if opts.normalize:
        # Convert counts to per-sample relative abundances.
        collapsed_table.norm(axis='sample', inplace=True)

    write_biom_table(collapsed_table, opts.output_biom_fp)

    output_map_lines = mapping_lines_from_collapsed_df(collapsed_metadata)
    with open(opts.output_mapping_fp, 'w') as output_mapping_f:
        output_mapping_f.write('\n'.join(output_map_lines))
def main():
    """Build a BIOM OTU table from an OTU map, optionally adding taxonomy
    metadata and excluding listed/failed OTU ids.

    Fix: the taxonomy, exclusion, and OTU-map files were opened without ever
    being closed; they are now managed with ``with`` blocks, matching the
    sample-metadata-aware variant of this script that already does so.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        # parse_taxonomy returns a fully-built mapping, so the file can be
        # closed immediately after the call
        with open(opts.taxonomy_fname, 'U') as infile:
            otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        with open(exclude_otus_fp, 'U') as exclude_f:
            # fasta input: exclude every sequence id found in the file;
            # otherwise treat it as a plain list of sequence ids
            if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
                ids_to_exclude = get_seq_ids_from_fasta_file(exclude_f)
            else:
                ids_to_exclude = get_seq_ids_from_seq_id_file(exclude_f)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude)
    write_biom_table(biom_otu_table, opts.output_biom_fp)
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []

    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)

    def _persist(table):
        # write a table to a fresh temp .biom file and return the path
        fd, fp = mkstemp(dir=self.tmp_dir,
                         prefix='alpha_diversity_tests',
                         suffix='.biom')
        close(fd)
        write_biom_table(table, fp)
        return fp

    self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                        [1, 1, 1, 1],
                                        [0, 0, 0, 0]]).T,
                            sample_ids=list('XYZ'),
                            observation_ids=list('abcd'))
    self.otu_table1_fp = _persist(self.otu_table1)

    # same counts, but one observation id ('d_') needs quoting in newick
    self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                        [1, 1, 1, 1],
                                        [0, 0, 0, 0]]).T,
                            sample_ids=list('XYZ'),
                            observation_ids=['a', 'b', 'c', 'd_'])
    self.otu_table2_fp = _persist(self.otu_table2)

    self.single_sample_otu_table = Table(data=array([[2, 0, 0, 1]]).T,
                                         sample_ids=list('X'),
                                         observation_ids=list('abcd'))
    self.single_sample_otu_table_fp = _persist(self.single_sample_otu_table)

    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

    self.files_to_remove = [self.otu_table1_fp,
                            self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def setUp(self):
    """Define some test data."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []

    self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(self.tmp_dir):
        makedirs(self.tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(self.tmp_dir)

    def _tmp_biom_fp():
        # reserve a fresh temp .biom file and return its path
        fd, fp = mkstemp(dir=self.tmp_dir,
                         prefix='alpha_diversity_tests',
                         suffix='.biom')
        close(fd)
        return fp

    self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                        [1, 1, 1, 1],
                                        [0, 0, 0, 0]]).T,
                            sample_ids=list('XYZ'),
                            observation_ids=list('abcd'))
    self.otu_table1_fp = _tmp_biom_fp()
    write_biom_table(self.otu_table1, self.otu_table1_fp)

    # identical counts; final observation id ('d_') exercises quoted newick
    self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                        [1, 1, 1, 1],
                                        [0, 0, 0, 0]]).T,
                            sample_ids=list('XYZ'),
                            observation_ids=['a', 'b', 'c', 'd_'])
    self.otu_table2_fp = _tmp_biom_fp()
    write_biom_table(self.otu_table2, self.otu_table2_fp)

    self.single_sample_otu_table = Table(data=array([[2, 0, 0, 1]]).T,
                                         sample_ids=list('X'),
                                         observation_ids=list('abcd'))
    self.single_sample_otu_table_fp = _tmp_biom_fp()
    write_biom_table(self.single_sample_otu_table,
                     self.single_sample_otu_table_fp)

    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

    self.files_to_remove = [self.otu_table1_fp,
                            self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def setUp(self):
    """Define some test data."""
    self.tmp_dir = get_qiime_temp_dir()

    def _write_tmp_biom(table):
        # dump a table into a fresh temp .biom file, returning the path
        fd, fp = mkstemp(dir=self.tmp_dir,
                         prefix='alpha_diversity_tests',
                         suffix='.biom')
        close(fd)
        write_biom_table(table, fp)
        return fp

    self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                        [1, 1, 1, 1],
                                        [0, 0, 0, 0]]).T,
                            sample_ids=list('XYZ'),
                            observation_ids=list('abcd'))
    self.otu_table1_fp = _write_tmp_biom(self.otu_table1)

    # same data, but with an observation id ('d_') requiring newick quoting
    self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                        [1, 1, 1, 1],
                                        [0, 0, 0, 0]]).T,
                            sample_ids=list('XYZ'),
                            observation_ids=['a', 'b', 'c', 'd_'])
    self.otu_table2_fp = _write_tmp_biom(self.otu_table2)

    self.single_sample_otu_table = Table(data=array([[2, 0, 0, 1]]).T,
                                         sample_ids=list('X'),
                                         observation_ids=list('abcd'))
    self.single_sample_otu_table_fp = _write_tmp_biom(
        self.single_sample_otu_table)

    self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
    self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

    self.files_to_remove = [self.otu_table1_fp,
                            self.otu_table2_fp,
                            self.single_sample_otu_table_fp]
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
            representative set from step 4 as the reference set.

        NOTE(review): parameter semantics below are inferred from this
        function body; confirm against callers.
          input_fp / refseqs_fp : input sequence and reference collection
            fasta filepaths
          percent_subsample : fraction of step-1 failures subsampled for
            de novo picking (step 2)
          minimum_failure_threshold : steps 2/3 run only when the number of
            step-1 failures exceeds this count
          logger : existing WorkflowLogger; when None a new one is created
            and closed on success
    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        # we created the logger, so we own closing it on success
        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,
                       [input_fp,
                        refseqs_fp,
                        step1_otu_map_fp,
                        step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # downstream steps operate on the prefiltered input from here on
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp,
                                                 step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp,
                                                 parallel,
                                                 params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
            '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp),
                                                        abspath(step2_input_fasta_fp),
                                                        percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                        step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp,
                                        parallel,
                                        params,
                                        logger)

        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

        index_links.append(
            ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
             merged_otu_map_fp,
             _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir

        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp,
                 step3_failures_fasta_fp)

            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            # steps 2/3 were skipped, so there is no step3 map to merge
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' %
                          failures_step, step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)

        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)

        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)

        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (failures_fp,
                                                           output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp, _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp,
                                                            new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp,
                                                            new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
def _write_rarefaction(self, fname, sub_otu_table): """ depth and rep can be numbers or strings """ if sub_otu_table.is_empty(): return write_biom_table(sub_otu_table, fname)
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

        Fix: ``close_logger_on_success`` was computed but never used, so a
        caller-supplied logger was closed unconditionally at the end of the
        run. The close is now guarded, matching the behavior of
        pick_subsampled_open_reference_otus (the caller owns a logger it
        passes in).
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # we created the logger, so we own closing it on success
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback,
                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # only close the logger if this function created it
    if close_logger_on_success:
        logger.close()
def main():
    """Summarize an OTU table at one or more taxonomic levels.

    Command-line entry point. Optionally collapses samples by mapping-file
    categories, normalizes counts unless absolute abundance was requested,
    and writes one classic (TSV) and/or BIOM summary table per requested
    taxonomic level.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    lower_percentage = opts.lower_percentage
    upper_percentage = opts.upper_percentage
    otu_table_fp = opts.otu_table_fp
    otu_table = load_table(otu_table_fp)
    delimiter = opts.delimiter
    mapping_fp = opts.mapping
    md_as_string = opts.md_as_string
    md_identifier = opts.md_identifier
    levels = opts.level.split(',')
    suppress_classic_table_output = opts.suppress_classic_table_output
    suppress_biom_table_output = opts.suppress_biom_table_output

    if upper_percentage is not None and lower_percentage is not None:
        raise ValueError(
            "upper_percentage and lower_percentage are mutually exclusive")

    # BUG FIX: this condition previously referenced the undefined name
    # `mapping` (which is only bound further down, after the mapping file is
    # parsed) instead of `mapping_fp`. Note the check is unreachable as
    # written anyway, because the check above already raises whenever both
    # percentages are supplied; the intent was presumably
    # `(upper or lower) and mapping_fp` -- TODO confirm before changing
    # behavior.
    if upper_percentage is not None and lower_percentage is not None and \
            mapping_fp:
        raise ValueError("upper_percentage and lower_percentage can not be "
                         "using with mapping file")

    if upper_percentage is not None and \
            (upper_percentage < 0 or upper_percentage > 1.0):
        raise ValueError('max_otu_percentage should be between 0.0 and 1.0')

    if lower_percentage is not None and \
            (lower_percentage < 0 or lower_percentage > 1.0):
        raise ValueError('lower_percentage should be between 0.0 and 1.0')

    if mapping_fp:
        mapping_file = open(mapping_fp, 'U')
        mapping, header, comments = parse_mapping_file(mapping_file)

        # use the input Mapping file for producing the output filenames
        map_dir_path, map_fname = split(mapping_fp)
        map_basename, map_fname_ext = splitext(map_fname)
    else:
        if suppress_classic_table_output and suppress_biom_table_output:
            option_parser.error("Both classic and BIOM output formats were "
                                "suppressed.")

    if not opts.absolute_abundance:
        # convert counts to per-sample relative abundances
        otu_table = otu_table.norm(axis='sample', inplace=False)

    # introduced output directory to will allow for multiple outputs
    if opts.output_dir:
        create_dir(opts.output_dir, False)
        output_dir_path = opts.output_dir
    else:
        output_dir_path = './'

    # use the input OTU table to produce the output filenames
    dir_path, fname = split(otu_table_fp)
    basename, fname_ext = splitext(fname)

    # Iterate over the levels and generate a summarized taxonomy for each
    for level in levels:
        if mapping_fp:
            # define output filename
            output_fname = join(output_dir_path,
                                map_basename + '_L%s.txt' % (level))

            summary, tax_order = add_summary_mapping(otu_table,
                                                     mapping,
                                                     int(level),
                                                     md_as_string,
                                                     md_identifier)

            write_add_taxa_summary_mapping(summary, tax_order, mapping,
                                           header, output_fname, delimiter)
        else:
            # define the output filename. The extension will be added to the
            # end depending on the output format
            output_fname = join(output_dir_path, basename + '_L%s' % level)

            summary, header = make_summary(otu_table,
                                           int(level),
                                           upper_percentage,
                                           lower_percentage,
                                           md_as_string,
                                           md_identifier)

            sample_ids = header[1:]

            observation_ids = []
            data = []
            for row in summary:
                # Join taxonomic levels to create an observation ID.
                observation_ids.append(delimiter.join(row[0]))
                data.append(row[1:])

            table = Table(np.asarray(data), observation_ids, sample_ids)
            if opts.transposed_output:
                table = table.transpose()

            if not suppress_classic_table_output:
                with open(output_fname + '.txt', 'w') as outfile:
                    outfile.write(table.to_tsv())

            if not suppress_biom_table_output:
                write_biom_table(table, output_fname + '.biom')
def main():
    """Filter samples from a BIOM table by metadata, counts, or explicit IDs.

    Command-line entry point. Validates that at least one filtering
    criterion was requested, applies the sample filter, writes the filtered
    BIOM table, and optionally writes a correspondingly filtered mapping
    file.

    Improvement over the previous revision: all file handles are now
    managed with `with` blocks (the sample-id file, the mapping file, and
    the output mapping file were previously opened and never reliably
    closed), matching the context-manager style used elsewhere in this
    file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # --valid_states is meaningless without a mapping file to match against
    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    # require at least one actual filtering criterion
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")

    # the two ways of naming samples to keep are mutually exclusive
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")

    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        # keep samples matching the metadata description; negation only
        # applies to an explicit sample-id file, so force it off here
        with open(mapping_fp, 'U') as map_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                map_f, valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            # first whitespace-separated field per line; '#' lines are
            # comments
            with open(sample_id_fp, 'U') as sample_id_f:
                sample_id_f_ids = set(
                    [l.strip().split()[0]
                     for l in sample_id_f if not l.startswith('#')])
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)
    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as map_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(map_f)
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        with open(output_mapping_fp, 'w') as out_map_f:
            out_map_f.write(format_mapping_file(mapping_headers,
                                                mapping_data))
def setUp(self):
    """Build shared fixtures for the make_otu_network tests.

    Creates an in-memory mapping file, a 10-OTU x 5-sample BIOM table
    (also written to a temp .biom file), and the expected network
    edge/node file contents the tests compare against.

    NOTE(review): whitespace inside the literal fixtures below may have
    been tabs in the original source (QIIME mapping/edge/node files are
    tab-delimited) -- confirm against the canonical test data before
    relying on exact bytes.
    """
    self.tmp_dir = get_qiime_temp_dir()

    # QIIME mapping file: 5 samples, two metadata categories (Day, time)
    self.map_file = """#SampleID Day time Description
#This is some comment about the study
1 090809 1200 some description of sample1
2 090809 1800 some description of sample2
3 090909 1200 some description of sample3
4 090909 1800 some description of sample4
5 091009 1200 some description of sample5"""

    # expected per-sample (category, value) pairs parsed from map_file
    self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                          "2": [("Day", "090809"), ("time", "1800")],
                          "3": [("Day", "090909"), ("time", "1200")],
                          "4": [("Day", "090909"), ("time", "1800")],
                          "5": [("Day", "091009"), ("time", "1200")]}

    # inverse mapping: (category, value) -> list of sample ids
    self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                          ("Day", "090909"): ["3", "4"],
                          ("Day", "091009"): ["5"],
                          ("time", "1200"): ["1", "3", "5"],
                          ("time", "1800"): ["2", "4"]}

    # number of metadata categories in map_file
    self.num_cats = 2

    # sample id -> [concatenated category values, counter]
    self.meta_dict = {"1": ["090809 1200", 0],
                      "2": ["090809 1800", 0],
                      "3": ["090909 1200", 0],
                      "4": ["090909 1800", 0],
                      "5": ["091009 1200", 0]}

    # expected column labels for edge and node files
    self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
    self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                        "weighted_degree", "consensus_lin", "Day", "time"]

    # distinct values per category, in category order
    self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]

    # raw counts: 10 OTUs (rows) x 5 samples (columns)
    self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                 [2, 0, 0, 0, 0],
                                 [0, 0, 3, 1, 0],
                                 [0, 0, 0, 0, 5],
                                 [0, 4, 2, 0, 0],
                                 [3, 6, 0, 0, 0],
                                 [0, 0, 4, 2, 0],
                                 [0, 0, 0, 0, 3],
                                 [2, 0, 0, 5, 0],
                                 [0, 2, 0, 4, 0]])

    # BIOM table with per-OTU taxonomy metadata and no sample metadata
    otu_table = Table(self.otu_table_vals,
                      ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
                       'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
                      ['1', '2', '3', '4', '5'],
                      [{"taxonomy": ["Bacteria", "Actinobacteria",
                                     "Coriobacteridae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Bacteroidaceae"]},
                       {"taxonomy": ["Bacteria", "Firmicutes",
                                     "Clostridia", "Clostridiales"]},
                       {"taxonomy": ["Bacteria", "Spirochaetes",
                                     "Spirochaetales", "Spirochaetaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Rikenellaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Odoribacteriaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae",
                                     "otu_425"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae",
                                     "otu_425"]},
                       {"taxonomy": ["Bacteria", "Firmicutes", "Mollicutes",
                                     "Clostridium_aff_innocuum_CM970"]}],
                      [None, None, None, None, None])

    # write the table to a temp .biom file consumed by the tests
    fd, self.otu_table_fp = mkstemp(
        dir=self.tmp_dir,
        prefix='test_make_otu_network_otu_table',
        suffix='.biom')
    close(fd)
    write_biom_table(otu_table, self.otu_table_fp)

    # classic (delimited-text) representation of the same table
    self.otu_sample_file = """#Full OTU Counts
#OTU ID 1 2 3 4 5 Consensus Lineage
otu_1 0 1 0 0 6 Bacteria; Actinobacteria; Coriobacteridae
otu_2 2 0 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3 0 0 3 1 0 Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4 0 0 0 0 5 Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5 0 4 2 0 0 Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6 3 6 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7 0 0 4 2 0 Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8 0 0 0 0 3 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9 2 0 0 5 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10 0 2 0 4 0 Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

    # samples sharing at least one OTU with each sample
    self.con_by_sample = {
        '1': set(['2', '4']),
        '2': set(['5', '3', '1', '4']),
        '3': set(['4', '2']),
        '4': set(['3', '1', '2']),
        '5': set(['2'])}

    # expected edge-file lines: sample, otu, weight, lineage, Day, time
    self.edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 otu_2 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 otu_4 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "5 otu_8 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 091009 1200",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]

    # expected node-file lines; user (sample) nodes first, then otu nodes
    self.node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "otu_2 otu_node 1 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "otu_4 otu_node 1 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_8 otu_node 1 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]

    # reduced ("collapsed") variants: single-sample OTUs are merged into
    # @<sample> pseudo-nodes labeled 'missed'/'otu_collapsed'
    self.red_edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 @1 1.0 missed 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 @5 1.0 missed 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]

    self.red_node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "@1 otu_collapsed 1 1.0 other otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "@5 otu_collapsed 2 2.0 other otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]

    # expected degree -> count histograms for otu and sample nodes
    self.otu_dc = {1: 3, 2: 7}
    self.sample_dc = {3: 3, 4: 2}
    self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}

    # expected connection counts, overall and per category
    self.num_con_cat = {"Day": 2,
                        "time": 1}
    self.num_con = 6
    self.num_cat = {"Day": 2,
                    "time": 4}
    self.num_cat_less = {"Day": 1,
                         "time": 3}

    # bookkeeping for tearDown
    self._paths_to_clean_up = [self.otu_table_fp]
    self._dir_to_clean_up = ''
def main():
    """Compute core OTUs over a range of sample-fraction thresholds.

    Command-line entry point. For each fraction in
    [min_fraction_for_core, max_fraction_for_core] (num steps), writes a
    text file of core OTU ids (with metadata), a core BIOM table, and
    finally a summary plot of core size versus fraction.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    # need >= 2 steps for a meaningful fraction range (and for the plot)
    if opts.num_fraction_for_core_steps < 2:
        option_parser.error(
            "Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = np.linspace(opts.min_fraction_for_core,
                                     opts.max_fraction_for_core,
                                     opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        # restrict the core computation to samples matching --valid_states
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'),
            valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(
            output_dir,
            'core_otus_%s.txt' %
            fraction_for_core_str)
        output_table_fp = join(
            output_dir,
            'core_table_%s.biom' %
            fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            # no OTU passes this threshold: record an empty result and
            # move on to the next fraction
            output_f.write(
                "# No OTUs present in %s %% of samples." %
                fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids is None:
            output_f.write(
                "# Core OTUs across %s %% of samples.\n" %
                fraction_for_core_str)
        else:
            output_f.write(
                "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %
                (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iter(axis='observation'):
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        write_biom_table(core_table, output_table_fp)

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    # summary plot: number of core OTUs per fraction threshold (pylab)
    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel(
        "Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
    and handle processing of the results.

    Each input file is processed in its own numbered subdirectory of
    output_dir; the new_refseqs.fna produced by one iteration becomes the
    reference set for the next. Per-iteration OTU tables are merged, a
    master rep set is built, and (optionally) taxonomy assignment and
    alignment/tree building are run on the merged results.
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    # NOTE(review): close_logger_on_success is computed but never consulted
    # below -- logger.close() at the end runs unconditionally; confirm that
    # is intended when a caller-owned logger is passed in.

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            # run one full subsampled open-reference iteration; taxonomy and
            # alignment/tree are deferred until after the final merge
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                                refseqs_fp=refseqs_fp,
                                                output_dir=iteration_output_dir,
                                                percent_subsample=percent_subsample,
                                                new_ref_set_id='.'.join(
                                                    [new_ref_set_id, str(i)]),
                                                command_handler=command_handler,
                                                params=params,
                                                qiime_config=qiime_config,
                                                run_assign_tax=False,
                                                run_align_and_tree=False,
                                                prefilter_refseqs_fp=prefilter_refseqs_fp,
                                                prefilter_percent_id=prefilter_percent_id,
                                                min_otu_size=min_otu_size,
                                                step1_otu_map_fp=step1_otu_map_fp,
                                                step1_failures_fasta_fp=step1_failures_fasta_fp,
                                                parallel=parallel,
                                                suppress_step4=suppress_step4,
                                                logger=logger,
                                                suppress_md5=suppress_md5,
                                                suppress_index_page=True,
                                                denovo_otu_picking_method=denovo_otu_picking_method,
                                                reference_otu_picking_method=reference_otu_picking_method,
                                                status_update_callback=status_update_callback,
                                                minimum_failure_threshold=minimum_failure_threshold)

        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' %
            (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(table,
                                                            get_seq_ids_from_fasta_file(
                                                                open(pynast_failures_fp, 'U')),
                                                            0, inf, 0, inf,
                                                            negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
def pick_subsampled_open_reference_otus(input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_assign_tax=True, run_align_and_tree=True, prefilter_percent_id=None, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, suppress_index_page=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Run the data preparation steps of Qiime The steps performed by this function are: - Pick reference OTUs against refseqs_fp - Subsample the failures to n sequences. - Pick OTUs de novo on the n failures. - Pick representative sequences for the resulting OTUs. - Pick reference OTUs on all failures using the representative set from step 4 as the reference set. """ # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust'] allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref', 'sortmerna'] assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\ "Unknown de novo OTU picking method: %s. Known methods are: %s"\ % (denovo_otu_picking_method, ','.join(allowed_denovo_otu_picking_methods)) assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\ "Unknown reference OTU picking method: %s. 
Known methods are: %s"\ % (reference_otu_picking_method, ','.join(allowed_reference_otu_picking_methods)) # Prepare some variables for the later steps index_links = [] input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: log_fp = generate_log_fp(output_dir) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) close_logger_on_success = True index_links.append( ('Run summary data', log_fp, _index_headers['run_summary'])) else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp]) # if the user has not passed a different reference collection for the pre-filter, # used the main refseqs_fp. this is useful if the user wants to provide a smaller # reference collection, or to use the input reference collection when running in # iterative mode (rather than an iteration's new refseqs) if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp # Step 1: Closed-reference OTU picking on the input file (if not already # complete) if step1_otu_map_fp and step1_failures_fasta_fp: step1_dir = '%s/step1_otus' % output_dir create_dir(step1_dir) logger.write("Using pre-existing reference otu map and failures.\n\n") else: if prefilter_percent_id is not None: prefilter_dir = '%s/prefilter_otus/' % output_dir prefilter_failures_list_fp = '%s/%s_failures.txt' % \ (prefilter_dir, input_basename) prefilter_pick_otu_cmd = pick_reference_otus( input_fp, prefilter_dir, reference_otu_picking_method, prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id) commands.append( [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)]) prefiltered_input_fp = '%s/prefiltered_%s%s' %\ (prefilter_dir, input_basename, input_ext) filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\ (input_fp, prefiltered_input_fp, prefilter_failures_list_fp) commands.append( 
[('Filter prefilter failures from input', filter_fasta_cmd)]) index_links.append( ('Pre-filtered sequence identifiers ' '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100), prefilter_failures_list_fp, _index_headers['sequences'])) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] input_fp = prefiltered_input_fp input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) if getsize(prefiltered_input_fp) == 0: raise ValueError( "All sequences were discarded by the prefilter. " "Are the input sequences in the same orientation " "in your input file and reference file (you can " "add 'pick_otus:enable_rev_strand_match True' to " "your parameters file if not)? Are you using the " "correct reference file?") # Build the OTU picking command step1_dir = \ '%s/step1_otus' % output_dir step1_otu_map_fp = \ '%s/%s_otus.txt' % (step1_dir, input_basename) step1_pick_otu_cmd = pick_reference_otus( input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger) commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)]) # Build the failures fasta file step1_failures_list_fp = '%s/%s_failures.txt' % \ (step1_dir, input_basename) step1_failures_fasta_fp = \ '%s/failures.fasta' % step1_dir step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (input_fp, step1_failures_list_fp, step1_failures_fasta_fp) commands.append([('Generate full failures fasta file', step1_filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] step1_repset_fasta_fp = \ '%s/step1_rep_set.fna' % step1_dir step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step1_otu_map_fp, step1_repset_fasta_fp, input_fp) commands.append([('Pick rep set', step1_pick_rep_set_cmd)]) 
    # ------------------------------------------------------------------
    # Flush the commands queued so far (step 1), then run steps 2-4:
    # de novo pick OTUs from a subsample of the step 1 failures (step 2),
    # re-pick the remaining failures against that de novo rep set (step 3),
    # and optionally clean up what still fails (step 4).
    # ------------------------------------------------------------------
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    # (mean/std are unused here; only the count gates steps 2 and 3)
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = \
            count_seqs_from_file(step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:
        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
            '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        # record the equivalent API call in the log for reproducibility
        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')\n\n"' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp),
                      percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)
        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # the merged map is produced in every branch below, so it is always
    # linked from the index page
    index_links.append(
        ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir

        if run_step_2_and_3:
            # step 4 cleans up the sequences that still failed in step 3
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp,
                 step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            # steps 2/3 were skipped, so step 4 operates directly on the
            # step 1 failures
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            # empty placeholder so the 'cat' below degenerates gracefully
            # when there is no step 3 map
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            # empty placeholder: no step 3 map exists in this branch
            step3_otu_map_fp = ""
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    # otus_to_keep: ids of OTUs with >= min_otu_size sequences; reused below
    # when selecting representative sequences
    otus_to_keep = filter_otus_from_otu_map(
        otu_fp, otu_no_singletons_fp, min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' %
                 (abspath(otu_fp),
                  abspath(otu_no_singletons_fp),
                  min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    # NOTE(review): final_repset_f and new_refseqs_f are closed further down
    # in this function, after the step2/step4 sequences are appended
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))

    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)

    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    # guard newline in case the copied refseqs file lacks a trailing one
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    # Append step 2 (and step 4) non-singleton rep seqs to both output
    # fasta files opened earlier, then queue the OTU-table build.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    # both files are now complete; close them before downstream commands
    # read them
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    # Choose output table names for the optional taxonomy and
    # alignment/tree stages; the align/tree input depends on whether a
    # taxonomy-annotated table was produced first.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        # tree building filters the taxonomy-annotated table in this case
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        # skip this stage entirely if a non-empty output already exists
        # (supports resuming interrupted runs)
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        # skip this stage entirely if a non-empty output already exists
        # (supports resuming interrupted runs)
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            # negate_ids_to_keep=True drops the listed (failing) ids and
            # keeps everything else; 0/inf bounds disable count filtering
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)