def test_run_pick_de_novo_otus_parallel(self):
    """run_pick_de_novo_otus generates expected results in parallel
    """
    # Configure taxonomy assignment, PyNAST template alignment, and
    # lanemask-based alignment filtering from the test fixtures.
    self.params['assign_taxonomy'] = \
        {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
         'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = \
        {'template_fp': self.test_data['refseqs_aligned'][0]}
    self.params['filter_alignment'] = \
        {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
    actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=True,
        status_update_callback=no_status_updates)

    # Expected output paths derived from the input filename.
    input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % input_file_basename)
    alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % input_file_basename)
    failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                       '%s_rep_set_failures.fasta' % input_file_basename)
    taxonomy_assignments_fp = join(
        self.test_out, 'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % input_file_basename)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    # The workflow reports the paths it actually wrote.
    self.assertEqual(actual_tree_fp, tree_fp)
    self.assertEqual(actual_otu_table_fp, otu_table_fp)

    input_seqs = LoadSeqs(self.test_data['seqs'][0],
                          format='fasta',
                          aligned=False)

    # Number of OTUs falls within a range that was manually
    # confirmed
    otu_map_lines = list(open(otu_map_fp))
    num_otus = len(otu_map_lines)
    otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
    self.assertEqual(num_otus, 14)

    # all otus get taxonomy assignments
    taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
    self.assertEqual(len(taxonomy_assignment_lines), num_otus)

    # number of seqs which aligned + num of seqs which failed to
    # align sum to the number of OTUs
    aln = LoadSeqs(alignment_fp)
    failures = LoadSeqs(failures_fp, aligned=False)
    # BUG FIX: was assertTrue(count, num_otus) -- assertTrue's second
    # argument is the failure *message*, so the check passed for any
    # non-zero count. An equality assertion is intended here.
    self.assertEqual(aln.getNumSeqs() + failures.getNumSeqs(), num_otus)

    # number of tips in the tree equals the number of sequences that
    # aligned
    tree = LoadTree(tree_fp)
    self.assertEqual(len(tree.tips()), aln.getNumSeqs())

    # parse the otu table
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    expected_sample_ids = [
        'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
    ]
    # sample IDs are as expected
    self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
    # otu ids are as expected
    self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
    # number of sequences in the full otu table equals the number of
    # input sequences
    number_seqs_in_otu_table = sum(
        [v.sum() for v in otu_table.iterSampleData()])
    self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)
def test_run_pick_de_novo_otus_muscle(self):
    """run_pick_de_novo_otus w muscle generates expected results
    """
    # Configure taxonomy assignment, muscle alignment, and
    # entropy-based alignment filtering.
    self.params['assign_taxonomy'] = \
        {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
         'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = {'alignment_method': 'muscle'}
    self.params['filter_alignment'] = \
        {'suppress_lane_mask_filter': None,
         'entropy_threshold': '0.10'}
    run_pick_de_novo_otus(self.test_data['seqs'][0],
                          self.test_out,
                          call_commands_serially,
                          self.params,
                          self.qiime_config,
                          parallel=False,
                          status_update_callback=no_status_updates)

    # Expected output paths derived from the input filename.
    input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % input_file_basename)
    alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % input_file_basename)
    taxonomy_assignments_fp = join(
        self.test_out, 'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % input_file_basename)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    input_seqs = LoadSeqs(self.test_data['seqs'][0],
                          format='fasta',
                          aligned=False)

    # Number of OTUs falls within a range that was manually
    # confirmed
    otu_map_lines = list(open(otu_map_fp))
    num_otus = len(otu_map_lines)
    otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
    self.assertEqual(num_otus, 14)

    # all otus get taxonomy assignments
    taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
    self.assertEqual(len(taxonomy_assignment_lines), num_otus)

    # all OTUs align
    aln = LoadSeqs(alignment_fp)
    # BUG FIX: was assertTrue(count, num_otus) -- assertTrue's second
    # argument is the failure *message*, so the check passed for any
    # non-zero count. An equality assertion is intended here.
    self.assertEqual(aln.getNumSeqs(), num_otus)

    # all OTUs in tree
    tree = LoadTree(tree_fp)
    self.assertEqual(len(tree.tips()), num_otus)

    # check that the two final output files have non-zero size
    self.assertTrue(getsize(tree_fp) > 0)
    self.assertTrue(getsize(otu_table_fp) > 0)

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)

    # parse the otu table
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    expected_sample_ids = [
        'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
    ]
    # sample IDs are as expected
    self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
    # expected OTUs
    self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
    # number of sequences in the full otu table equals the number of
    # input sequences
    number_seqs_in_otu_table = sum(
        [v.sum() for v in otu_table.iterSampleData()])
    self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())
def test_run_pick_de_novo_otus_muscle(self):
    """run_pick_de_novo_otus w muscle generates expected results
    """
    # Workflow configuration: default uclust OTU picking, muscle
    # alignment, entropy-based alignment filtering.
    self.params['assign_taxonomy'] = {
        'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
        'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = {'alignment_method': 'muscle'}
    self.params['filter_alignment'] = {
        'suppress_lane_mask_filter': None,
        'entropy_threshold': '0.10'}

    run_pick_de_novo_otus(self.test_data['seqs'][0],
                          self.test_out,
                          call_commands_serially,
                          self.params,
                          self.qiime_config,
                          parallel=False,
                          status_update_callback=no_status_updates)

    # Build the expected output paths from the input filename.
    base_name = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % base_name)
    alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % base_name)
    taxonomy_assignments_fp = join(
        self.test_out, 'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % base_name)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    # Number of OTUs falls within a range that was manually
    # confirmed
    map_lines = list(open(otu_map_fp))
    n_otus = len(map_lines)
    observed_otu_ids = [line.split()[0] for line in map_lines]
    self.assertEqual(n_otus, 14)

    # every OTU received a taxonomy assignment
    assignment_lines = list(open(taxonomy_assignments_fp))
    self.assertEqual(len(assignment_lines), n_otus)

    # all OTUs align
    self.assertEqual(count_seqs(alignment_fp)[0], n_otus)

    # all OTUs in tree
    with open(tree_fp) as f:
        tree = TreeNode.from_newick(f)
    self.assertEqual(len(list(tree.tips())), n_otus)

    # the two final output files have non-zero size
    self.assertTrue(getsize(tree_fp) > 0)
    self.assertTrue(getsize(otu_table_fp) > 0)

    # a non-empty log file was written
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)

    # parse the otu table
    otu_table = load_table(otu_table_fp)
    expected_sample_ids = ['f1', 'f2', 'f3', 'f4', 'p1', 'p2',
                           't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
    # expected OTUs
    self.assertItemsEqual(otu_table.ids(axis='observation'),
                          observed_otu_ids)
    # total sequence count in the otu table matches the input file
    table_seq_count = sum(v.sum() for v in otu_table.iter_data())
    self.assertEqual(table_seq_count,
                     count_seqs(self.test_data['seqs'][0])[0])
def test_run_pick_de_novo_otus_parallel(self):
    """run_pick_de_novo_otus generates expected results in parallel
    """
    # Workflow configuration: PyNAST template alignment with a
    # lanemask filter, uclust taxonomy assignment.
    self.params['assign_taxonomy'] = {
        'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
        'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = {
        'template_fp': self.test_data['refseqs_aligned'][0]}
    self.params['filter_alignment'] = {
        'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}

    actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=True,
        status_update_callback=no_status_updates)

    # Build the expected output paths from the input filename.
    base_name = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % base_name)
    alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % base_name)
    failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                       '%s_rep_set_failures.fasta' % base_name)
    taxonomy_assignments_fp = join(
        self.test_out, 'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % base_name)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    # The workflow reports the paths it actually wrote.
    self.assertEqual(actual_tree_fp, tree_fp)
    self.assertEqual(actual_otu_table_fp, otu_table_fp)

    # Number of OTUs falls within a range that was manually
    # confirmed
    map_lines = list(open(otu_map_fp))
    n_otus = len(map_lines)
    observed_otu_ids = [line.split()[0] for line in map_lines]
    self.assertEqual(n_otus, 14)

    # every OTU received a taxonomy assignment
    assignment_lines = list(open(taxonomy_assignments_fp))
    self.assertEqual(len(assignment_lines), n_otus)

    # number of seqs which aligned + num of seqs which failed to
    # align sum to the number of OTUs
    n_aligned = count_seqs(alignment_fp)[0]
    n_failed = count_seqs(failures_fp)[0]
    self.assertEqual(n_aligned + n_failed, n_otus)

    # number of tips in the tree equals the number of sequences that
    # aligned
    with open(tree_fp) as f:
        tree = TreeNode.from_newick(f)
    self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

    # parse the otu table
    otu_table = load_table(otu_table_fp)
    expected_sample_ids = ['f1', 'f2', 'f3', 'f4', 'p1', 'p2',
                           't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
    # otu ids are as expected
    self.assertItemsEqual(otu_table.ids(axis='observation'),
                          observed_otu_ids)
    # total sequence count in the otu table matches the input file
    table_seq_count = sum(v.sum() for v in otu_table.iter_data())
    self.assertEqual(table_seq_count,
                     count_seqs(self.test_data['seqs'][0])[0])

    # a non-empty log file was written
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)
def test_run_pick_de_novo_otus_swarm_rdp_tax_assign_modify_resolution(
        self):
    """run_pick_de_novo_otus using swarm (d=4 which approximates 97%% id)
       generates expected results with rdp tax assignment
    """
    # Workflow configuration: rdp taxonomy assignment, PyNAST
    # template alignment with a lanemask filter, swarm OTU picking.
    self.params['assign_taxonomy'] = {
        'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
        'reference_seqs_fp': self.test_data['refseqs'][0],
        'assignment_method': 'rdp'}
    self.params['align_seqs'] = {
        'template_fp': self.test_data['refseqs_aligned'][0]}
    self.params['filter_alignment'] = {
        'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
    self.params['pick_otus']['otu_picking_method'] = "swarm"
    # Swarm resolution = 4 is about 97% similarity for the
    # sequences clustered in this test
    self.params['pick_otus']['swarm_resolution'] = "4"

    actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=False,
        status_update_callback=no_status_updates)

    # Build the expected output paths from the input filename.
    base_name = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'swarm_picked_otus',
                      '%s_otus.txt' % base_name)
    alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % base_name)
    failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                       '%s_rep_set_failures.fasta' % base_name)
    taxonomy_assignments_fp = join(
        self.test_out, 'rdp_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % base_name)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    # The workflow reports the paths it actually wrote.
    self.assertEqual(actual_tree_fp, tree_fp)
    self.assertEqual(actual_otu_table_fp, otu_table_fp)

    # Number of OTUs falls within a range that was manually
    # confirmed.
    map_lines = list(open(otu_map_fp))
    n_otus = len(map_lines)
    observed_otu_ids = [line.split()[0] for line in map_lines]
    self.assertEqual(n_otus, 23)

    # every OTU received a taxonomy assignment
    assignment_lines = list(open(taxonomy_assignments_fp))
    self.assertEqual(len(assignment_lines), n_otus)

    # number of seqs which aligned + num of seqs which failed to
    # align sum to the number of OTUs
    n_aligned = count_seqs(alignment_fp)[0]
    n_failed = count_seqs(failures_fp)[0]
    self.assertEqual(n_aligned + n_failed, n_otus)

    # number of tips in the tree equals the number of sequences that
    # aligned
    with open(tree_fp) as f:
        tree = TreeNode.from_newick(f)
    self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

    # parse the otu table
    otu_table = load_table(otu_table_fp)
    expected_sample_ids = ['f1', 'f2', 'f3', 'f4', 'p1', 'p2',
                           't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
    # otu ids are as expected
    self.assertItemsEqual(otu_table.ids(axis='observation'),
                          observed_otu_ids)
    # total sequence count in the otu table matches the input file
    table_seq_count = sum(v.sum() for v in otu_table.iter_data())
    self.assertEqual(table_seq_count,
                     count_seqs(self.test_data['seqs'][0])[0])

    # a non-empty log file was written
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)
def main():
    """Command-line entry point for the de novo OTU picking workflow.

    Parses CLI options, loads optional workflow parameters, creates the
    output directory (refusing to overwrite unless -f is given), then
    dispatches to run_pick_de_novo_otus.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    verbose = opts.verbose
    input_fp = opts.input_fp
    output_dir = opts.output_dir
    print_only = opts.print_only
    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    # if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            # BUG FIX: was the Python-2-only `raise IOError, msg`
            # statement form, which is a syntax error on Python 3.
            raise IOError(
                "Can't open parameters file (%s). Does it exist? Do you have read access?"
                % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error(
                "Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_pick_de_novo_otus(input_fp,
                          output_dir,
                          command_handler=command_handler,
                          params=params,
                          qiime_config=qiime_config,
                          parallel=parallel,
                          status_update_callback=status_update_callback)
def main():
    """Command-line entry point for the de novo OTU picking workflow.

    Parses CLI options, loads optional workflow parameters, creates the
    output directory (refusing to overwrite unless -f is given), then
    dispatches to run_pick_de_novo_otus.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # FIX: removed a duplicate dead `verbose = opts.verbose` assignment.
    verbose = opts.verbose
    input_fp = opts.input_fp
    output_dir = opts.output_dir
    print_only = opts.print_only
    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    # if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            raise IOError(
                "Can't open parameters file (%s). Does it exist? Do you have read access?"
                % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error(
                "Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_pick_de_novo_otus(input_fp,
                          output_dir,
                          command_handler=command_handler,
                          params=params,
                          qiime_config=qiime_config,
                          parallel=parallel,
                          status_update_callback=status_update_callback)