def test_iqtree_model_choice(self):
    """Check that GTR+G and HKY runs yield different tip-to-tip distances.

    Both runs share the same seed so that the substitution model is the
    only setting that differs between them.
    """
    alignment_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')

    distances_by_model = {}
    for model in ('GTR+G', 'HKY'):
        with redirected_stdio(stderr=os.devnull):
            result = iqtree(alignment, seed=1723, substitution_model=model)
        tree = skbio.TreeNode.read(str(result), convert_underscores=False)
        distances_by_model[model] = set(
            tree.tip_tip_distances().to_series())

    # The two sets of distances must not be equivalent.
    self.assertNotEqual(distances_by_model['GTR+G'],
                        distances_by_model['HKY'])
def test_iqtree_model_choice(self):
    """Tip-to-tip distances must differ between the GTR+G and HKY models."""
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')

    def tip_distances(model):
        # Every run uses the same seed; only the model is varied.
        with redirected_stdio(stderr=os.devnull):
            newick = iqtree(alignment, seed=1723, substitution_model=model)
        tree = skbio.TreeNode.read(str(newick), convert_underscores=False)
        return set(tree.tip_tip_distances().to_series())

    # Arguments are evaluated left to right: GTR+G is run before HKY.
    self.assertNotEqual(tip_distances('GTR+G'), tip_distances('HKY'))
def test_join_pairs_some_samples_w_no_joined_seqs(self):
    """Join pairs with a very high ``minmergelen`` and check the counts."""
    # minmergelen is deliberately high so that very few reads get joined.
    with redirected_stdio(stderr=os.devnull):
        joined = join_pairs(self.input_seqs, minmergelen=279)

    self._test_manifest(joined)

    # One fastq file per sample is still expected.
    fastq_views = list(joined.sequences.iter_views(FastqGzFormat))
    self.assertEqual(len(fastq_views), 3)

    # These counts were determined by running vsearch directly.
    expected_counts = {
        'BAQ2687.1_0_L001_R1_001.fastq.gz': 0,
        'BAQ3473.2_1_L001_R1_001.fastq.gz': 2,
        'BAQ4697.2_2_L001_R1_001.fastq.gz': 0,
    }

    for name, path in fastq_views:
        with redirected_stdio(stderr=os.devnull):
            records = list(skbio.io.read(str(path),
                                         format='fastq',
                                         compression='gzip',
                                         constructor=skbio.DNA))
        lengths = np.asarray([len(record) for record in records])
        self.assertEqual(len(lengths), expected_counts[str(name)])
def test_q_score(self):
    """Run the ``q_score`` method twice and check reads and statistics.

    The first run (window 2, min quality 20) and the second run
    (window 1, min quality 33) are each compared against the expected
    fastq lines and the expected per-sample stats frame.
    """
    def read_fastq_lines(dir_fmt):
        # Gather the stripped lines of every fastq.gz file in the
        # directory format; each handle is closed as soon as it is read.
        lines = []
        for _, fp in dir_fmt.sequences.iter_views(FastqGzFormat):
            with gzip.open(str(fp), 'rt') as fh:
                lines.extend(line.strip() for line in fh)
        return lines

    ar = Artifact.load(self.get_data_path('simple.qza'))
    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated', 'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']

    # First run.
    with redirected_stdio(stdout=os.devnull):
        obs_drop_ambig_ar, stats_ar = self.plugin.methods['q_score'](
            ar, quality_window=2, min_quality=20,
            min_length_fraction=0.25)
    obs_drop_ambig = obs_drop_ambig_ar.view(
        SingleLanePerSampleSingleEndFastqDirFmt)
    stats = stats_ar.view(pd.DataFrame)

    exp_drop_ambig = ["@foo_1", "ATGCATGC", "+", "DDDDBBDD"]
    exp_drop_ambig_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                         ('bar', 1., 0., 0., 0., 1.)],
                                        columns=columns)
    exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')

    obs = read_fastq_lines(obs_drop_ambig)
    self.assertEqual(obs, exp_drop_ambig)
    # Align the expected rows with the observed index before comparing.
    pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

    # Second run.
    with redirected_stdio(stdout=os.devnull):
        obs_trunc_ar, stats_ar = self.plugin.methods['q_score'](
            ar, quality_window=1, min_quality=33,
            min_length_fraction=0.25)
    obs_trunc = obs_trunc_ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
    stats = stats_ar.view(pd.DataFrame)

    exp_trunc = ["@foo_1", "ATGCATGC", "+", "DDDDBBDD",
                 "@bar_1", "ATA", "+", "DDD"]
    exp_trunc_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                    ('bar', 1., 1., 1., 0., 0.)],
                                   columns=columns)
    exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

    obs = read_fastq_lines(obs_trunc)
    # Lines are compared sorted, so per-sample file order is irrelevant.
    self.assertEqual(sorted(obs), sorted(exp_trunc))
    pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
def test_iqtree_ultrafast_bootstrap_singlebranch_methods(self):
    """Compare branch-support labels against a manually built tree.

    The reference tree was produced with:
        iqtree -s aligned-dna-sequences-3.fasta -alrt 1500 -lbp 1500
               -abayes -bb 1500 -m 'HKY' -seed 1723
    Only the support labels and the number of values per label are
    checked.
    """
    def support_labels(tree):
        return [node.name for node in tree.non_tips()]

    alignment_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        result = iqtree_ultrafast_bootstrap(alignment,
                                            seed=1723,
                                            substitution_model='HKY',
                                            alrt=1500,
                                            lbp=1500,
                                            abayes=True,
                                            bootstrap_replicates=1500)

    obs_tree = skbio.TreeNode.read(str(result), convert_underscores=False)
    obs_supp = support_labels(obs_tree)
    exp_tree = skbio.TreeNode.read(self.get_data_path('test5.tre'))
    exp_supp = support_labels(exp_tree)

    self.assertEqual(set(obs_supp), set(exp_supp))
    # Each support label should carry 4 '/'-separated values.
    for labels in (obs_supp, exp_supp):
        self.assertEqual(len(labels[0].split('/')), 4)
def test_skip_denovo(self):
    """Open-reference run in which no features are left for de novo.

    feature1 and feature3 cluster into r1, feature2 and feature4 cluster
    into r2 during the closed-reference step; nothing is left
    unclustered, so de novo clustering is skipped.
    """
    expected_counts = np.array([[104, 106, 109],
                                [107, 107, 108]])
    exp_table = biom.Table(expected_counts,
                           ['r1', 'r2'],
                           ['sample1', 'sample2', 'sample3'])

    with redirected_stdio(stderr=os.devnull):
        obs_table, rep_seqs, new_ref_seqs = self.open_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences,
            perc_identity=0.01)

    obs_table = obs_table.view(biom.Table)
    exp_ids = exp_table.ids(axis='observation')
    self.assertEqual(set(obs_table.ids(axis='observation')), set(exp_ids))

    # biom.Table equality is sensitive to identifier order.
    obs_table = obs_table.sort_order(exp_ids, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_rep_seqs = _read_seqs(rep_seqs)
    exp_rep_seqs = [self.input_sequences_list[0],  # feature1
                    self.input_sequences_list[4]]  # feature5
    _relabel_seqs(exp_rep_seqs, ['r1', 'r2'])
    self.assertEqual(obs_rep_seqs, exp_rep_seqs)

    # De novo clustering was skipped, so the "new" reference sequences
    # should match the original reference sequences.
    obs_ref_seqs = _read_seqs(new_ref_seqs)
    exp_ref_seqs = _read_seqs(self.ref_sequences)
    self.assertEqual(obs_ref_seqs, exp_ref_seqs)
def test_min_length(self):
    """Demultiplex with a barcode that spans an entire read."""
    # The third barcode is meant to completely remove the only GGGG
    # coded sequence.
    barcodes = pd.Series(
        ['AAAA', 'CCCC', 'GGGGACGTACGT'],
        name='Barcode',
        index=pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    # sample c is empty: its barcode matched the whole read, which
    # removed everything.
    sample_c = ''
    exp = [sample_a, sample_b, sample_c]

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
            self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
    self.assert_untrimmed_results('', obs_untrimmed_art)
def test_typical(self):
    """Paired-end demultiplexing with two barcoded samples."""
    barcodes = pd.Series(
        ['AAAA', 'CCCC'],
        name='Barcode',
        index=pd.Index(['sample_a', 'sample_b'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    sample_a_fwd = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                    '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_a_rev = ('@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
                    '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n')
    sample_b_fwd = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                    '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                    '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b_rev = ('@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
                    '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
                    '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n')
    exp = [sample_a_fwd, sample_a_rev, sample_b_fwd, sample_b_rev]

    # Forward and reverse records of the read left untrimmed.
    exp_untrimmed = [
        '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
        '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_paired_fn(
            self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
    self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
def test_batch_size(self):
    """Single-end demultiplexing with ``batch_size=1``."""
    barcodes = pd.Series(
        ['AAAA', 'CCCC'],
        name='Barcode',
        index=pd.Index(['sample_a', 'sample_b'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    exp = [sample_a, sample_b]
    exp_untrimmed = '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n'

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
            self.muxed_sequences, metadata, batch_size=1)

    # Batching is not supposed to affect the final results; the
    # expectations are those of the unbatched "typical" case.
    self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
    self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
def test_batch_size_odd_number_of_samples(self):
    """Demultiplex three samples with ``batch_size=2``."""
    barcodes = pd.Series(
        ['AAAA', 'CCCC', 'GGGG'],
        name='Barcode',
        index=pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_c = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
    exp = [sample_a, sample_b, sample_c]

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
            self.muxed_sequences, metadata, batch_size=2)

    self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
    # No untrimmed reads are expected.
    self.assert_untrimmed_results('', obs_untrimmed_art)
def test_extra_barcode_in_metadata(self):
    """Demultiplex when the metadata lists a barcode no read carries."""
    sample_ids = ['sample_a', 'sample_b', 'sample_c', 'sample_d']
    barcode_values = ['AAAA', 'CCCC', 'GGGG', 'TTTT']
    metadata = CategoricalMetadataColumn(
        pd.Series(barcode_values, name='Barcode',
                  index=pd.Index(sample_ids, name='id')))

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_c = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
    # sample d is empty because no reads matched the barcode TTTT.
    sample_d = ''
    exp = [sample_a, sample_b, sample_c, sample_d]

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
            self.muxed_sequences, metadata)

    # Fresh lists are used so this Series shares nothing with the metadata.
    exp_samples_and_barcodes = pd.Series(list(barcode_values),
                                         index=list(sample_ids))
    self.assert_demux_results(exp_samples_and_barcodes, exp,
                              obs_demuxed_art)
    self.assert_untrimmed_results('', obs_untrimmed_art)
def test_variable_length_barcodes(self):
    """Demultiplex reads whose barcodes have different lengths."""
    barcodes = pd.Series(
        ['AAAAA', 'CCCCCC', 'GGGG'],
        name='Barcode',
        index=pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    muxed_sequences = Artifact.import_data(
        'MultiplexedSingleEndBarcodeInSequence',
        self.get_data_path('variable_length.fastq.gz'))

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_c = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
    exp = [sample_a, sample_b, sample_c]

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
            muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
    self.assert_untrimmed_results('', obs_untrimmed_art)
def test_run_rapid_bs_not_verbose(self):
    """Run RAxML rapid bootstrapping quietly and check the tree's tips.

    The command is executed via ``run_command(..., verbose=False)`` and
    the resulting ``RAxML_bipartitions.q2`` tree is read back from the
    temporary working directory.
    """
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
    aligned_fp = str(input_sequences)

    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = ['raxmlHPC',
               '-m', 'GTRGAMMA',
               '-p', '1723',
               '-s', aligned_fp,
               '-w', temp_dir,
               '-n', 'q2',
               '-f', 'a',
               '-x', '9834',
               '-N', '10']

        with redirected_stdio(stderr=os.devnull):
            run_command(cmd, verbose=False)

        # The tree must be read while the temporary directory exists.
        obs_tree_fp = os.path.join(temp_dir, 'RAxML_bipartitions.q2')
        obs_tree = skbio.TreeNode.read(str(obs_tree_fp),
                                       convert_underscores=False)

    # The tree should contain exactly the expected tip ids. The other
    # tests in this file that build a tree from this alignment expect
    # GCA002142615 as well, so it is included here too.
    tip_names = [tip.name for tip in obs_tree.tips()]
    self.assertEqual(set(tip_names),
                     set(['GCA001510755', 'GCA001045515',
                          'GCA000454205', 'GCA000473545',
                          'GCA000196255', 'GCA002142615',
                          'GCA000686145', 'GCA001950115',
                          'GCA001971985', 'GCA900007555']))
def test_run_ultrafast_bs_not_verbose(self):
    """Run IQ-TREE ultrafast bootstrap quietly and check the tree's tips."""
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
    aligned_fp = str(alignment)

    expected_tips = set(['GCA001510755', 'GCA001045515',
                         'GCA000454205', 'GCA000473545',
                         'GCA000196255', 'GCA002142615',
                         'GCA000686145', 'GCA001950115',
                         'GCA001971985', 'GCA900007555'])

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtreeufboot')
        cmd = ['iqtree',
               '-m', 'HKY',
               '-seed', '1723',
               '-bb', '1000',
               '-s', aligned_fp,
               '-pre', run_prefix,
               '-nt', '2']

        with redirected_stdio(stderr=os.devnull):
            run_command(cmd, verbose=False)

        # Read the tree before the temporary directory is removed.
        tree_fp = run_prefix + '.treefile'
        obs_tree = skbio.TreeNode.read(str(tree_fp),
                                       convert_underscores=False)

    observed_tips = set(tip.name for tip in list(obs_tree.tips()))
    self.assertEqual(observed_tips, expected_tips)
def test_raxml_rapid_bootstrap_with_seed(self):
    """Compare a seeded rapid-bootstrap run with a manual RAxML run.

    The reference tree (``test2.tre``) comes from a manual run of:
        raxmlHPC -f a -m GTRGAMMA -p 1723 -x 3871 -N 10
                 -s aligned-dna-sequences-3.fasta -n q2

    Tip-to-tip distances are rounded (``%.4f``) before comparison because
    RAxML may return slightly different rounding errors on different
    systems (and at times, between conda environments).
    """
    def rounded_tip_distances(tree):
        # Sometimes the last digits of long floats are lost, so only
        # four decimals are compared.
        distances = list(tree.tip_tip_distances().to_series())
        return set(['%.4f' % d for d in distances])

    def sorted_supports(tree):
        # list.sort() returns None, which previously made the support
        # comparison trivially true (None == None). sorted() returns the
        # list; key=str keeps unnamed (None) nodes sortable.
        return sorted((node.name for node in tree.non_tips()), key=str)

    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml_rapid_bootstrap(input_sequences, seed=1723,
                                    rapid_bootstrap_seed=3871,
                                    bootstrap_replicates=10)
    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    exp_tree = skbio.TreeNode.read(self.get_data_path('test2.tre'),
                                   convert_underscores=True)

    # Branch lengths, via rounded tip-to-tip distances, must match.
    self.assertEqual(rounded_tip_distances(obs_tree),
                     rounded_tip_distances(exp_tree))

    # Bootstrap supports must match as well.
    self.assertEqual(sorted_supports(obs_tree), sorted_supports(exp_tree))
def test_1_percent_clustering(self):
    """Closed-reference clustering at 1% identity.

    feature1 and feature3 cluster together; feature2 and feature4
    cluster together.
    """
    expected_counts = np.array([[104, 106, 109],
                                [8, 9, 11]])
    exp_table = biom.Table(expected_counts,
                           ['r1', 'r2'],
                           ['sample1', 'sample2', 'sample3'])

    with redirected_stdio(stderr=os.devnull):
        clustered = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=0.01)
    obs_table, matched_seqs, unmatched_seqs = clustered

    # biom.Table equality is sensitive to identifier order.
    obs_table = obs_table.sort_order(exp_table.ids(axis='observation'),
                                     axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # feature1 and feature4 are the representatives of r1 and r2: each
    # has the higher count within its cluster.
    exp_matched_seqs = [self.input_sequences_list[0],  # feature1
                        self.input_sequences_list[3]]  # feature4
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    # Everything matched, so the unmatched-sequences file is empty.
    unmatched_size = os.path.getsize(str(unmatched_seqs))
    self.assertEqual(unmatched_size, 0)
def test_none_matched(self):
    """Demultiplexing raises ValueError when no read matches a barcode."""
    barcodes = pd.Series(['TTTT'], index=['sample_d'], name='Barcode')
    metadata = MetadataCategory(barcodes)

    # Same nesting as two stacked ``with`` statements: stderr is
    # redirected first, then the exception is asserted.
    with redirected_stdio(stderr=os.devnull), \
            self.assertRaisesRegex(ValueError, 'demultiplexed'):
        self.demux_single_fn(self.muxed_sequences, metadata)
def test_duplicate_input_ids(self):
    """``mafft`` raises ValueError for input with duplicate ids."""
    duplicated_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
    unaligned = DNAFASTAFormat(duplicated_fp, mode='r')

    # The error message is expected to mention the offending id.
    with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'), \
            redirected_stdio(stderr=os.devnull):
        mafft(unaligned)
def test_97_percent_clustering(self):
    """Closed-reference clustering at 97% identity.

    feature1 and feature3 cluster together; feature2 doesn't cluster at
    all; feature4 clusters alone.
    """
    expected_counts = np.array([[104, 106, 109],
                                [7, 8, 9]])
    exp_table = biom.Table(expected_counts,
                           ['r1', 'r2'],
                           ['sample1', 'sample2', 'sample3'])

    with redirected_stdio(stderr=os.devnull):
        clustered = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=0.97)
    obs_table, matched_seqs, unmatched_seqs = clustered

    # biom.Table equality is sensitive to identifier order.
    obs_table = obs_table.sort_order(exp_table.ids(axis='observation'),
                                     axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # feature1 represents r1 (it has a higher count than feature3);
    # feature4 represents r2.
    exp_matched_seqs = [self.input_sequences_list[0],  # feature1
                        self.input_sequences_list[3]]  # feature4
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    obs_unmatched_seqs = _read_seqs(unmatched_seqs)
    exp_unmatched_seqs = [self.input_sequences_list[1]]  # feature2
    self.assertEqual(obs_unmatched_seqs, exp_unmatched_seqs)
def test_join_pairs(self):
    """Join pairs with default parameters and check per-sample counts."""
    with redirected_stdio(stderr=os.devnull):
        joined = join_pairs(self.input_seqs)

    self._test_manifest(joined)

    # One fastq file per sample is expected.
    fastq_views = list(joined.sequences.iter_views(FastqGzFormat))
    self.assertEqual(len(fastq_views), 3)

    # These counts come from running vsearch directly with default
    # parameters. Other vsearch versions may give different numbers, in
    # which case this test is too specific and will need adjusting.
    default_exp_sequence_counts = {
        'BAQ2687.1_0_L001_R1_001.fastq.gz': 806,
        'BAQ3473.2_1_L001_R1_001.fastq.gz': 753,
        'BAQ4697.2_2_L001_R1_001.fastq.gz': 711,
    }

    for name, path in fastq_views:
        records = list(skbio.io.read(str(path),
                                     format='fastq',
                                     compression='gzip',
                                     constructor=skbio.DNA))
        lengths = np.asarray([len(record) for record in records])
        self._test_seq_lengths(lengths)

        # The expected number of sequences was joined for this sample.
        self.assertEqual(len(lengths),
                         default_exp_sequence_counts[str(name)])
def test_100_percent_clustering_strand(self):
    """Closed-reference clustering at 100% identity on both strands.

    feature2 and feature3 don't cluster.
    """
    expected_counts = np.array([[100, 101, 103],
                                [7, 8, 9]])
    exp_table = biom.Table(expected_counts,
                           ['r1', 'r2'],
                           ['sample1', 'sample2', 'sample3'])

    with redirected_stdio(stderr=os.devnull):
        clustered = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_2,
            perc_identity=1.0,
            strand='both')
    obs_table, matched_seqs, unmatched_seqs = clustered

    # biom.Table equality is sensitive to identifier order.
    obs_table = obs_table.sort_order(exp_table.ids(axis='observation'),
                                     axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # feature1 and feature4 represent r1 and r2. No other features are
    # in either cluster, so no count-based selection takes place.
    exp_matched_seqs = [self.input_sequences_list[0],  # feature1
                        self.input_sequences_list[3]]  # feature4
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    obs_unmatched_seqs = _read_seqs(unmatched_seqs)
    exp_unmatched_seqs = [self.input_sequences_list[2],  # feature3
                          self.input_sequences_list[1]]  # feature2
    self.assertEqual(obs_unmatched_seqs, exp_unmatched_seqs)
def test_uchime_denovo(self):
    """De novo chimera check where one input feature is a chimera."""
    with redirected_stdio(stderr=os.devnull):
        chime, nonchime, stats = uchime_denovo(
            sequences=self.input_sequences, table=self.input_table)

    seqs = self.input_sequences_list

    obs_chime = _read_seqs(chime)
    self.assertEqual(obs_chime, [seqs[3]])

    # Output sequences are reverse-sorted by abundance.
    obs_nonchime = _read_seqs(nonchime)
    self.assertEqual(obs_nonchime, [seqs[0], seqs[1], seqs[2]])

    with stats.open() as stats_fh:
        stats_text = stats_fh.read()

    # Every feature is mentioned in the stats output ...
    for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
        self.assertTrue(feature_id in stats_text)

    # ... and there are four non-empty lines.
    non_empty_lines = [line for line in stats_text.split('\n')
                       if len(line) > 0]
    self.assertEqual(len(non_empty_lines), 4)
def test_mixed_orientation_success(self):
    """Paired-end demultiplexing with ``mixed_orientation=True``."""
    barcode_series = pd.Series(
        ['AAAA', 'CCCC'],
        name='ForwardBarcode',
        index=pd.Index(['sample_a', 'sample_b'], name='id'))
    forward_barcodes = CategoricalMetadataColumn(barcode_series)

    forward_fp = self.get_data_path('mixed-orientation/forward.fastq.gz')
    reverse_fp = self.get_data_path('mixed-orientation/reverse.fastq.gz')

    # Stage both read files in one directory so they can be imported
    # together as a single artifact.
    with tempfile.TemporaryDirectory() as temp:
        for read_fp in (forward_fp, reverse_fp):
            shutil.copy(read_fp, temp)
        mixed_orientation_sequences = Artifact.import_data(
            'MultiplexedPairedEndBarcodeInSequence', temp)

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = self.demux_paired_fn(
            mixed_orientation_sequences,
            forward_barcodes=forward_barcodes,
            mixed_orientation=True)

    # NOTE(review): other tests in this file call assert_demux_results
    # with an expected-reads list as the second argument; this call
    # passes only two arguments. Confirm against the helper's signature.
    self.assert_demux_results(forward_barcodes.to_series(),
                              obs_demuxed_art)

    # Everything should match, so nothing is left untrimmed.
    self.assert_untrimmed_results([b'', b''], obs_untrimmed_art)
def test_uchime_denovo_no_chimeras(self):
    """De novo chimera check on a table expected to yield no chimeras."""
    counts = np.array([[3, 4, 2],
                       [1, 0, 0],
                       [4, 5, 6],
                       [2, 2, 2]])
    feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
    input_table = biom.Table(counts, feature_ids,
                             ['sample1', 'sample2', 'sample3'])

    with redirected_stdio(stderr=os.devnull):
        chime, nonchime, stats = uchime_denovo(
            sequences=self.input_sequences, table=input_table)

    obs_chime = _read_seqs(chime)
    self.assertEqual(obs_chime, [])

    # Output sequences are reverse-sorted by abundance.
    obs_nonchime = _read_seqs(nonchime)
    exp_nonchime = [self.input_sequences_list[i] for i in (2, 0, 3, 1)]
    self.assertEqual(obs_nonchime, exp_nonchime)

    with stats.open() as stats_fh:
        stats_text = stats_fh.read()

    for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
        self.assertTrue(feature_id in stats_text)

    non_empty_lines = [line for line in stats_text.split('\n')
                       if len(line) > 0]
    self.assertEqual(len(non_empty_lines), 4)
def test_run_rapid_bs_not_verbose(self):
    """Run RAxML rapid bootstrapping quietly and check the tree's tips."""
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
    aligned_fp = str(alignment)

    expected_tips = set(['GCA001510755', 'GCA001045515',
                         'GCA000454205', 'GCA000473545',
                         'GCA000196255', 'GCA002142615',
                         'GCA000686145', 'GCA001950115',
                         'GCA001971985', 'GCA900007555'])

    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = ['raxmlHPC',
               '-m', 'GTRGAMMA',
               '-p', '1723',
               '-s', aligned_fp,
               '-w', temp_dir,
               '-n', 'q2',
               '-f', 'a',
               '-x', '9834',
               '-N', '10']

        with redirected_stdio(stderr=os.devnull):
            run_command(cmd, verbose=False)

        # Read the tree before the temporary directory is removed.
        tree_fp = os.path.join(temp_dir, 'RAxML_bipartitions.q2')
        obs_tree = skbio.TreeNode.read(str(tree_fp),
                                       convert_underscores=False)

    observed_tips = set(tip.name for tip in list(obs_tree.tips()))
    self.assertEqual(observed_tips, expected_tips)
def test_uchime_ref_no_chimeras(self):
    """Reference-based chimera check expected to flag no chimeras."""
    reference = DNAFASTAFormat(
        self.get_data_path('ref-sequences-4.fasta'), mode='r')

    with redirected_stdio(stderr=os.devnull):
        chime, nonchime, stats = uchime_ref(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=reference)

    obs_chime = _read_seqs(chime)
    self.assertEqual(obs_chime, [])

    # Output sequences are reverse-sorted by abundance.
    obs_nonchime = _read_seqs(nonchime)
    exp_nonchime = [self.input_sequences_list[i] for i in (0, 1, 2, 3)]
    self.assertEqual(obs_nonchime, exp_nonchime)

    with stats.open() as stats_fh:
        stats_text = stats_fh.read()

    for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
        self.assertTrue(feature_id in stats_text)

    non_empty_lines = [line for line in stats_text.split('\n')
                       if len(line) > 0]
    self.assertEqual(len(non_empty_lines), 4)
def test_97_percent_clustering_feature4_most_abundant(self):
    """De novo clustering at 97% when feature4 has the highest counts."""
    samples = ['sample1', 'sample2', 'sample3']
    input_counts = np.array([[4, 5, 6],
                             [1, 1, 2],
                             [7, 8, 9],
                             [100, 101, 103]])
    input_table = biom.Table(
        input_counts,
        ['feature1', 'feature2', 'feature3', 'feature4'],
        list(samples))

    expected_counts = np.array([[111, 114, 118],
                                [1, 1, 2]])
    exp_table = biom.Table(expected_counts,
                           ['feature4', 'feature2'],
                           list(samples))

    with redirected_stdio(stderr=os.devnull):
        obs_table, obs_sequences = cluster_features_de_novo(
            sequences=self.input_sequences,
            table=input_table,
            perc_identity=0.97)

    # biom.Table equality is sensitive to identifier order.
    obs_table = obs_table.sort_order(exp_table.ids(axis='observation'),
                                     axis='observation')
    self.assertEqual(obs_table, exp_table)

    # Output sequences are reverse-sorted by abundance.
    obs_seqs = _read_seqs(obs_sequences)
    exp_seqs = [self.input_sequences_list[i] for i in (3, 1)]
    self.assertEqual(obs_seqs, exp_seqs)
def test_run_ultrafast_bs_not_verbose(self):
    """Run IQ-TREE ultrafast bootstrap quietly and check the tree's tips."""
    alignment_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    aligned_fp = str(AlignedDNAFASTAFormat(alignment_fp, mode='r'))

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtreeufboot')
        cmd = ['iqtree']
        cmd += ['-m', 'HKY']
        cmd += ['-seed', '1723']
        cmd += ['-bb', '1000']
        cmd += ['-s', aligned_fp]
        cmd += ['-pre', run_prefix]
        cmd += ['-nt', '2']

        with redirected_stdio(stderr=os.devnull):
            run_command(cmd, verbose=False)

        # The tree file lives in the temporary directory; read it now.
        tree_fp = run_prefix + '.treefile'
        obs_tree = skbio.TreeNode.read(str(tree_fp),
                                       convert_underscores=False)

    # The tree should contain exactly the expected tip ids.
    tip_names = [tip.name for tip in list(obs_tree.tips())]
    expected_tips = set([
        'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
        'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
        'GCA001971985', 'GCA900007555',
    ])
    self.assertEqual(set(tip_names), expected_tips)
def test_dereplicate_sequences_prefix(self):
    """Dereplicate with ``derep_prefix=True`` and check table and seqs."""
    input_sequences = QIIME1DemuxDirFmt(self.get_data_path('seqs-1'), 'r')

    first_id = '4574b947a0159c0da35a1f30f989681a1d9f64ef'
    second_id = '16a1263bde4f2f99422630d1bb87935c4236d1ba'

    exp_table = biom.Table(np.array([[2, 2],
                                     [2, 0]]),
                           [first_id, second_id],
                           ['s2', 'sample1'])

    with redirected_stdio(stderr=os.devnull):
        obs_table, obs_sequences = dereplicate_sequences(
            sequences=input_sequences, derep_prefix=True)

    # biom.Table equality is sensitive to identifier order.
    obs_table = obs_table.sort_order(exp_table.ids(axis='observation'),
                                     axis='observation')
    self.assertEqual(obs_table, exp_table)

    # Output sequences are reverse-sorted by abundance.
    obs_seqs = list(skbio.io.read(str(obs_sequences),
                                  constructor=skbio.DNA,
                                  format='fasta'))
    exp_seqs = [
        skbio.DNA('AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGG',
                  metadata={'id': first_id, 'description': 's2_1'}),
        skbio.DNA('ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCG',
                  metadata={'id': second_id, 'description': 's2_42'}),
    ]
    self.assertEqual(obs_seqs, exp_seqs)
def test_typical(self):
    """Trim a front adapter from paired-end reads and check every read.

    The untrimmed input serves as the expectation: each trimmed read
    must keep its header, be a substring of the input read, no longer
    contain the adapter, and have quality scores trimmed to match.
    """
    demuxed_art = Artifact.import_data(
        'SampleData[PairedEndSequencesWithQuality]',
        self.get_data_path('paired-end'))
    adapter = ['TACGGAGGATCC']

    with redirected_stdio(stdout=os.devnull):
        # The forward and reverse reads are identical in these data
        obs_art, = self.plugin.methods['trim_paired'](demuxed_art,
                                                      front_f=adapter,
                                                      front_r=adapter)

    demuxed = demuxed_art.view(SingleLanePerSampleSingleEndFastqDirFmt)
    demuxed_seqs = demuxed.sequences.iter_views(FastqGzFormat)
    obs = obs_art.view(SingleLanePerSampleSingleEndFastqDirFmt)
    obs_seqs = obs.sequences.iter_views(FastqGzFormat)

    # Iterate over each sample, side-by-side
    for (_, exp_fp), (_, obs_fp) in zip(demuxed_seqs, obs_seqs):
        # Context managers close both handles even when an assertion
        # below fails; explicit close() calls at the end of the loop
        # body would be skipped in that case.
        with gzip.open(str(exp_fp), 'rt') as exp_fh, \
                gzip.open(str(obs_fp), 'rt') as obs_fh:
            # Four references to each handle make zip_longest yield one
            # 4-line fastq record from each file per iteration.
            for records in itertools.zip_longest(*[exp_fh] * 4,
                                                 *[obs_fh] * 4):
                (exp_seq_h, exp_seq, _, exp_qual,
                 obs_seq_h, obs_seq, _, obs_qual) = records
                # Make sure cutadapt hasn't shuffled the read order
                self.assertEqual(exp_seq_h, obs_seq_h)
                self.assertTrue(obs_seq in exp_seq)
                # The adapter should not be present in the trimmed seqs
                self.assertTrue('TACGGAGGATCC' not in obs_seq)
                self.assertTrue(obs_qual in exp_qual)
                # Make sure cutadapt trimmed the quality scores, too
                self.assertEqual(len(obs_seq), len(obs_qual))
def test_raxml_rapid_bootstrap_with_seed(self):
    """Compare a seeded rapid-bootstrap run with a manual RAxML run.

    The reference tree (``test2.tre``) comes from a manual run of:
        raxmlHPC -f a -m GTRGAMMA -p 1723 -x 3871 -N 10
                 -s aligned-dna-sequences-3.fasta -n q2
    (``-N 10`` corresponds to ``bootstrap_replicates=10`` below.)

    Tip-to-tip distances are rounded (``%.4f``) before comparison because
    RAxML may return slightly different rounding errors on different
    systems (and at times, between conda environments).
    """
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml_rapid_bootstrap(input_sequences, seed=1723,
                                    rapid_bootstrap_seed=3871,
                                    bootstrap_replicates=10)
    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    exp_tree = skbio.TreeNode.read(self.get_data_path('test2.tre'),
                                   convert_underscores=True)

    # Branch lengths: compare rounded tip-to-tip distances, since the
    # last digits of long floats are sometimes lost.
    obs_series = set(
        '%.4f' % dist
        for dist in obs_tree.tip_tip_distances().to_series())
    exp_series = set(
        '%.4f' % dist
        for dist in exp_tree.tip_tip_distances().to_series())
    self.assertEqual(obs_series, exp_series)

    # Bootstrap supports: list.sort() returns None, which previously
    # made this comparison trivially true (None == None). sorted()
    # returns the list; key=str keeps unnamed (None) nodes sortable.
    obs_bs = sorted((node.name for node in obs_tree.non_tips()), key=str)
    exp_bs = sorted((node.name for node in exp_tree.non_tips()), key=str)
    self.assertEqual(obs_bs, exp_bs)
def test_multithreaded_mafft(self):
    # Align with n_threads='auto' and require the alignment read back from
    # disk to equal the expected one from the shared fixture helper.
    unaligned, expected = self._prepare_sequence_data()

    with redirected_stdio(stderr=os.devnull):
        alignment = mafft(unaligned, n_threads='auto')

    observed = skbio.io.read(str(alignment),
                             into=skbio.TabularMSA,
                             constructor=skbio.DNA)
    self.assertEqual(observed, expected)
def test_nans_in_unused_column(self):
    # The 'number' column holds a NaN, but the formula only references
    # 'letter'. There is no assertion: the test passes as long as the
    # call does not raise.
    sample_ids = pd.Index(['sample1', 'sample2', 'sample3'], name='id')
    frame = pd.DataFrame([[1, 'a'], [1, 'b'], [np.nan, 'b']],
                         columns=['number', 'letter'],
                         index=sample_ids)
    md = qiime2.Metadata(frame)

    with redirected_stdio(stderr=os.devnull), \
            tempfile.TemporaryDirectory() as temp_dir_name:
        adonis(temp_dir_name, self.dm, md, 'letter+letter')
def test_mafft_parttree_exception(self):
    # Write 1,000,002 short records to a FASTA file and check that mafft
    # rejects it with a ValueError whose message mentions '1 million'.
    input_fp = os.path.join(self.temp_dir.name, 'million.fasta')
    with open(input_fp, "w") as f:
        f.writelines('>%d\nAAGCAAGC\n' % i for i in range(1000002))
    input_sequences = DNAFASTAFormat(input_fp, mode='r')

    with self.assertRaisesRegex(ValueError, '1 million'), \
            redirected_stdio(stderr=os.devnull):
        mafft(input_sequences)
def test_failed_run_not_verbose(self):
    # An invalid mafft flag must surface as CalledProcessError when the
    # command is run with verbose=False.
    input_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
    input_sequences = DNAFASTAFormat(input_fp, mode='r')
    output_alignment = AlignedDNAFASTAFormat()

    cmd = ["mafft", "--not-a-real-parameter", str(input_sequences)]
    aligned_fp = str(output_alignment)

    with self.assertRaises(subprocess.CalledProcessError), \
            redirected_stdio(stderr=os.devnull):
        run_command(cmd, aligned_fp, verbose=False)
def test_build_iqtree_ufbs_command(self):
    # Build an ultrafast-bootstrap IQ-TREE command with every option set
    # and confirm each value shows up at its expected position in the
    # returned command list.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    options = dict(
        seed=1723, n_cores=0, n_runs=5, bootstrap_replicates=2000,
        substitution_model='MFP', dtype='DNA', safe='True',
        allnni='True', alrt=500, abayes=True, lbp=400, bnni=True,
        n_init_pars_trees=200, n_top_init_trees=30,
        n_best_retain_trees=10, stop_iter=300,
        perturb_nni_strength=0.55, spr_radius=8,
        n_max_ufboot_iter=600, n_ufboot_steps=80,
        min_cor_ufboot=0.66, ep_break_ufboot=0.51)

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtreeufboot')
        with redirected_stdio(stderr=os.devnull):
            obs = _build_iqtree_ufbs_command(input_sequences,
                                             run_prefix=run_prefix,
                                             **options)

    # (expected fragment, index into the command list), in check order.
    before_alignment = [('2000', 2), ('DNA', 4), ('5', 6)]
    after_alignment = [
        ('MFP', 10), (str(run_prefix), 12), ('AUTO', 14), ('1723', 16),
        ('-safe', 17), ('-allnni', 18), ('500', 20), ('-abayes', 21),
        ('400', 23), ('-bnni', 24), ('200', 26), ('30', 28), ('10', 30),
        ('300', 32), ('0.55', 34), ('8', 36), ('600', 38), ('80', 40),
        ('0.66', 42), ('0.51', 44),
    ]

    for fragment, position in before_alignment:
        self.assertTrue(fragment in obs[position])
    # The alignment path is compared against the stringified entry.
    self.assertTrue(str(input_sequences) in str(obs[8]))
    for fragment, position in after_alignment:
        self.assertTrue(fragment in obs[position])
def test_failed_run_not_verbose(self):
    # An invalid FastTree flag must surface as CalledProcessError when the
    # command is run with verbose=False.
    input_fp = self.get_data_path('aligned-dna-sequences-1.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
    result = NewickFormat()

    cmd = ['FastTree', '-nt', '-not-a-real-parameter',
           str(input_sequences)]
    tree_fp = str(result)

    with self.assertRaises(subprocess.CalledProcessError), \
            redirected_stdio(stderr=os.devnull):
        run_command(cmd, tree_fp, verbose=False)
def test_mafft(self):
    # Align the two-sequence fixture and compare against the hand-written
    # expected alignment.
    input_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
    input_sequences = DNAFASTAFormat(input_fp, mode='r')

    first = skbio.DNA('AGGGGGG',
                      metadata={'id': 'seq1', 'description': ''})
    second = skbio.DNA('-GGGGGG',
                       metadata={'id': 'seq2', 'description': ''})
    expected = skbio.TabularMSA([first, second])

    with redirected_stdio(stderr=os.devnull):
        alignment = mafft(input_sequences)

    observed = skbio.io.read(str(alignment),
                             into=skbio.TabularMSA,
                             constructor=skbio.DNA)
    self.assertEqual(observed, expected)
def test_fasttree_underscore_ids(self):
    # Only tip ids are checked; the branch lengths can vary with different
    # versions of FastTree.
    input_fp = self.get_data_path('aligned-dna-sequences-2.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = fasttree(input_sequences)

    # Load the resulting tree and confirm it has exactly the expected tips.
    obs_tree = skbio.TreeNode.read(str(obs))
    sorted_tip_names = sorted(tip.name for tip in obs_tree.tips())
    self.assertEqual(sorted_tip_names, ['_s_e_q_1_', '_s_e_q_2_'])
def test_raxml_num_searches(self):
    # Run RAxML with n_searches=5 and a fixed seed, then compare the set
    # of rounded (`%.4f`) tip-to-tip distances with the stored tree.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml(input_sequences, seed=1723, n_searches=5)

    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    exp_tree = skbio.TreeNode.read(self.get_data_path('test3.tre'))

    obs_series = {'%.4f' % dist
                  for dist in obs_tree.tip_tip_distances().to_series()}
    exp_series = {'%.4f' % dist
                  for dist in exp_tree.tip_tip_distances().to_series()}
    self.assertEqual(obs_series, exp_series)
def test_iqtree_safe_allnni(self):
    # Run IQ-TREE with safe='True' and allnni='True' and check that the
    # resulting tree carries exactly the expected tip ids.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = iqtree(input_sequences, safe='True', allnni='True')

    obs_tree = skbio.TreeNode.read(str(obs))
    observed_ids = {tip.name for tip in obs_tree.tips()}
    expected_ids = {
        'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
        'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
        'GCA001971985', 'GCA900007555',
    }
    self.assertEqual(observed_ids, expected_ids)
def test_fasttree_n_threads(self):
    # Only tip ids are checked: the branch lengths can vary with different
    # versions of FastTree, and threading can produce non-deterministic
    # trees.
    input_fp = self.get_data_path('aligned-dna-sequences-1.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = fasttree(input_sequences, n_threads=-1)

    # Load the resulting tree and confirm it has exactly the expected tips.
    obs_tree = skbio.TreeNode.read(str(obs))
    sorted_tip_names = sorted(tip.name for tip in obs_tree.tips())
    self.assertEqual(sorted_tip_names, ['seq1', 'seq2'])
def test_raxml_model_choice(self):
    # Tip-to-tip distances should NOT be identical under different
    # substitution models. The call without an explicit model (GTRGAMMA
    # by default) is compared against GTRGAMMAI and GTRCAT.
    # For the comparison to be meaningful, every run uses the same seed.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    def tip_distances(**model_kwargs):
        # Run RAxML with the shared seed and return the set of tip-to-tip
        # distances of the resulting tree.
        with redirected_stdio(stderr=os.devnull):
            tree_fmt = raxml(input_sequences, seed=1723, **model_kwargs)
            tree = skbio.TreeNode.read(str(tree_fmt),
                                       convert_underscores=False)
            return set(tree.tip_tip_distances().to_series())

    gtrg_td = tip_distances()
    gtrgi_td = tip_distances(substitution_model='GTRGAMMAI')
    gtrcat_td = tip_distances(substitution_model='GTRCAT')

    # No two models may yield the same set of distances.
    self.assertNotEqual(gtrg_td, gtrgi_td)
    self.assertNotEqual(gtrg_td, gtrcat_td)
    self.assertNotEqual(gtrgi_td, gtrcat_td)
def test_rapid_bootstrap_command(self):
    # Build a rapid-bootstrap command and confirm each argument shows up
    # at its expected position in the returned command list.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with tempfile.TemporaryDirectory() as temp_dir:
        with redirected_stdio(stderr=os.devnull):
            obs = _build_rapid_bootstrap_command(
                input_sequences, 1723, 8752, 15, 'GTRGAMMA', temp_dir,
                'bs')

    # The alignment path is compared against the stringified entry.
    self.assertTrue(str(input_sequences) in str(obs[11]))

    # (expected fragment, index into the command list), in check order.
    remaining = [
        ('1723', 5), ('8752', 7), ('15', 9), ('GTRGAMMA', 3),
        (str(temp_dir), 13), ('bs', 15),
    ]
    for fragment, position in remaining:
        self.assertTrue(fragment in obs[position])
def test_raxml(self):
    # Run RAxML with default arguments, read the output tree and compare
    # its tip labels with the expected labels.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml(input_sequences)

    obs_tree = skbio.TreeNode.read(str(obs))
    observed_ids = {tip.name for tip in obs_tree.tips()}
    expected_ids = {
        'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
        'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
        'GCA001971985', 'GCA900007555',
    }
    self.assertEqual(observed_ids, expected_ids)
def test_build_iqtree_command(self):
    # Build an IQ-TREE command with every option set and confirm each
    # value shows up at its expected position in the returned command
    # list.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    options = dict(
        seed=1723, n_cores=0, n_runs=2, substitution_model='MFP',
        dtype='DNA', safe='True', fast='True', alrt=1000, abayes=True,
        lbp=1000, n_init_pars_trees=200, n_top_init_trees=30,
        n_best_retain_trees=10, n_iter=80, stop_iter=300,
        perturb_nni_strength=0.55, spr_radius=8, allnni='True')

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtree')
        with redirected_stdio(stderr=os.devnull):
            obs = _build_iqtree_command(input_sequences,
                                        run_prefix=run_prefix,
                                        **options)

    # (expected fragment, index into the command list), in check order.
    before_alignment = [('DNA', 2), ('2', 4)]
    after_alignment = [
        ('MFP', 8), (str(run_prefix), 10), ('AUTO', 12), ('1723', 14),
        ('-safe', 15), ('-fast', 16), ('1000', 18), ('-abayes', 19),
        ('1000', 21), ('-allnni', 22), ('200', 24), ('30', 26),
        ('10', 28), ('80', 30), ('300', 32), ('0.55', 34), ('8', 36),
    ]

    for fragment, position in before_alignment:
        self.assertTrue(fragment in obs[position])
    # The alignment path is compared against the stringified entry.
    self.assertTrue(str(input_sequences) in str(obs[6]))
    for fragment, position in after_alignment:
        self.assertTrue(fragment in obs[position])
def test_raxml_rapid_bootstrap_n_threads(self):
    # Check that an output tree with the expected tip ids is produced
    # when rapid bootstrapping is invoked with n_threads=2.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml_rapid_bootstrap(input_sequences, n_threads=2)

    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    observed_ids = {tip.name for tip in obs_tree.tips()}
    expected_ids = {
        'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
        'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
        'GCA001971985', 'GCA900007555',
    }
    self.assertEqual(observed_ids, expected_ids)
def test_raxml_underscore_ids(self):
    # Tip ids containing underscores must survive the round trip; some
    # programs and python wrappers may strip underscores. The tree is
    # read back with convert_underscores=False before comparing labels.
    input_fp = self.get_data_path('aligned-dna-sequences-4.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml(input_sequences)

    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    observed_ids = {tip.name for tip in obs_tree.tips()}
    expected_ids = {
        'GCA_001510755_1', 'GCA_001045515_1', 'GCA_000454205_1',
        'GCA_000473545_1', 'GCA_000196255_1', 'GCA_002142615_1',
        'GCA_000686145_1', 'GCA_001950115_1', 'GCA_001971985_1',
        'GCA_900007555_1',
    }
    self.assertEqual(observed_ids, expected_ids)
def test_raxml_with_seed(self):
    # Compare tip-to-tip distances with a tree produced by a manual run
    # of the default command:
    #     raxmlHPC -m GTRGAMMA -p 1723 -s aligned-dna-sequences-3.fasta -n q2
    # NOTE: distances are rounded (`%.4f`) and compared as sets, because
    # RAxML may return slightly different rounding errors on different
    # systems.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = raxml(input_sequences, seed=1723)

    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    exp_tree = skbio.TreeNode.read(self.get_data_path('test.tre'))

    obs_series = {'%.4f' % dist
                  for dist in obs_tree.tip_tip_distances().to_series()}
    exp_series = {'%.4f' % dist
                  for dist in exp_tree.tip_tip_distances().to_series()}
    self.assertEqual(obs_series, exp_series)
def test_iqtree_with_seed(self):
    # Compare tip-to-tip distances with a tree produced by a manual run
    # of the command:
    #     iqtree -seed 1723 -m HKY -s aligned-dna-sequences-3.fasta
    #            -nt 1 -pre q2iqtree
    # NOTE: distances are rounded (`%.4f`) and compared as sets, because
    # IQ-TREE may return slightly different rounding errors on different
    # systems.
    input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
    input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

    with redirected_stdio(stderr=os.devnull):
        obs = iqtree(input_sequences, seed=1723, substitution_model='HKY')

    obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
    exp_tree = skbio.TreeNode.read(self.get_data_path('test4.tre'))

    obs_series = {'%.4f' % dist
                  for dist in obs_tree.tip_tip_distances().to_series()}
    exp_series = {'%.4f' % dist
                  for dist in exp_tree.tip_tip_distances().to_series()}
    self.assertEqual(obs_series, exp_series)
def create_job():
    """Start an asynchronous action run described by the JSON request body.

    The body must provide 'plugin', 'action', 'inputs', 'parameters' and
    'outputs'. Metadata and MetadataColumn parameters are loaded from the
    paths given in the request; everything else is passed through and
    decoded by the action signature. A record for the run is stored in
    ``JOBS`` and the response is a JSON object whose 'job' entry is the
    URL for inspecting it.

    Raises NotImplementedError for an unsupported MetadataColumn variant
    and TypeError when a metadata column has the wrong type.
    """
    # TODO: handle errors in the request body
    payload = request.get_json()
    plugin_name = payload['plugin']
    action_name = payload['action']
    inputs = payload['inputs']
    parameters = payload['parameters']
    outputs = payload['outputs']

    action = PLUGIN_MANAGER.plugins[plugin_name].actions[action_name]

    # TODO: make this better
    # `json_params` is the JSON-friendly view of the parameters kept in the
    # job record: loaded metadata is replaced by a '<metadata>' marker.
    json_params = {}
    for name, spec in action.signature.parameters.items():
        if spec.qiime_type == Metadata:
            if parameters[name] == "":
                parameters[name] = None
                json_params[name] = None
            else:
                parameters[name] = qiime2.Metadata.load(parameters[name])
                json_params[name] = '<metadata>'
            continue

        # TODO is there a better way to check whether `spec.qiime_type` is
        # some kind of `MetadataColumn` subtype using the type system API?
        # The current approach here matches more or less what q2cli is
        # doing.
        if spec.qiime_type.name != 'MetadataColumn':
            json_params[name] = parameters[name]
            continue

        if spec.qiime_type == MetadataColumn[Categorical]:
            column_types = ('categorical',)
        elif spec.qiime_type == MetadataColumn[Numeric]:
            column_types = ('numeric',)
        elif spec.qiime_type == MetadataColumn[Categorical | Numeric]:
            column_types = ('categorical', 'numeric')
        else:
            raise NotImplementedError(
                "Parameter %r is type %r, which is not currently "
                "supported by this interface." % (name, spec.qiime_type))

        # The request supplies a (metadata path, column name) pair; an
        # empty string in either slot means the parameter was not given.
        if parameters[name][0] == "" or parameters[name][1] == "":
            parameters[name] = None
            json_params[name] = None
            continue

        column_name = parameters[name][1]
        loaded = qiime2.Metadata.load(parameters[name][0])
        metadata_column = loaded.get_column(column_name)
        if metadata_column.type not in column_types:
            suffix = ('%s.' % column_types[0]
                      if len(column_types) == 1
                      else 'one of the following types: %s'
                      % ', '.join(column_types))
            raise TypeError(
                "Metadata column %r is %s. Parameter %r expects the "
                "column to be %s" % (column_name, metadata_column.type,
                                     name, suffix))
        parameters[name] = metadata_column
        json_params[name] = '<metadata>'

    parameters = action.signature.decode_parameters(**parameters)
    inputs = load_artifacts(**inputs)

    job_id = str(uuid.uuid4())
    started_ms = int(time.time() * 1000)
    input_uuids = {k: v.uuid for k, v in inputs.items()}
    pending_outputs = {k: None for k in outputs}
    JOBS[job_id] = {
        'uuid': job_id,
        'completed': False,
        'error': False,
        'started': started_ms,
        'finished': None,
        'stdout': None,
        'stderr': None,
        'code': action.source,
        'actionId': action.id,
        'actionName': action.name,
        'inputs': input_uuids,
        'params': json_params,
        'outputs': pending_outputs,
    }

    # Artifacts and decoded parameters are passed to the action together.
    inputs.update(parameters)

    # Add prefix just in case the file isn't unlinked, but we don't need a
    # name either way as the context manager works on file-descriptors
    stdout = tempfile.TemporaryFile(prefix='q2studio-stdout')
    stderr = tempfile.TemporaryFile(prefix='q2studio-stderr')
    with redirected_stdio(stdout=stdout, stderr=stderr):
        future = action.asynchronous(**inputs)

    on_done = _callback_factory(job_id, outputs, stdout, stderr)
    future.add_done_callback(on_done)

    return jsonify({
        'job': url_for('.inspect_job', job_id=job_id)
    })
def create_job():
    """Start an asynchronous action run described by the JSON request body.

    The body must provide 'plugin', 'action', 'inputs', 'parameters' and
    'outputs'. Metadata and MetadataCategory parameters are loaded from
    the paths given in the request; everything else is passed through and
    decoded by the action signature. A record for the run is stored in
    ``JOBS`` and the response is a JSON object whose 'job' entry is the
    URL for inspecting it.
    """
    # TODO: handle errors in the request body
    request_body = request.get_json()
    plugin = request_body['plugin']
    action = request_body['action']
    inputs = request_body['inputs']
    parameters = request_body['parameters']
    outputs = request_body['outputs']

    plugin = PLUGIN_MANAGER.plugins[plugin]
    action = plugin.actions[action]

    # TODO: make this better
    # `json_params` is the JSON-friendly view of the parameters kept in the
    # job record: loaded metadata is replaced by a '<metadata>' marker.
    json_params = {}
    for key, spec in action.signature.parameters.items():
        if spec.qiime_type == qiime2.plugin.Metadata:
            if parameters[key] == "":
                parameters[key] = None
                json_params[key] = None
            else:
                parameters[key] = qiime2.Metadata.load(parameters[key])
                json_params[key] = '<metadata>'
        elif spec.qiime_type == qiime2.plugin.MetadataCategory:
            # The request supplies a (metadata path, category name) pair;
            # an empty string in either slot means "not given".
            if parameters[key][0] == "" or parameters[key][1] == "":
                parameters[key] = None
                json_params[key] = None
            else:
                parameters[key] = qiime2.Metadata.load(
                    parameters[key][0]).get_category(parameters[key][1])
                json_params[key] = '<metadata>'
        else:
            json_params[key] = parameters[key]

    parameters = action.signature.decode_parameters(**parameters)
    inputs = load_artifacts(**inputs)

    job_id = str(uuid.uuid4())
    now = int(time.time() * 1000)
    JOBS[job_id] = {
        'uuid': job_id,
        'completed': False,
        'error': False,
        'started': now,
        'finished': None,
        'stdout': None,
        'stderr': None,
        'code': action.source,
        'actionId': action.id,
        'actionName': action.name,
        'inputs': {k: v.uuid for k, v in inputs.items()},
        'params': json_params,
        'outputs': {k: None for k in outputs}
    }

    # Artifacts and decoded parameters are passed to the action together.
    inputs.update(parameters)

    # `async` is a reserved keyword since Python 3.7, so writing
    # `action.async(...)` no longer parses. Look the launcher up by name
    # instead: prefer `asynchronous` and fall back to the `async`
    # attribute when only that one exists.
    launch = getattr(action, 'asynchronous', None)
    if launch is None:
        launch = getattr(action, 'async')

    # Add prefix just in case the file isn't unlinked, but we don't need a
    # name either way as the context manager works on file-descriptors
    stdout = tempfile.TemporaryFile(prefix='q2studio-stdout')
    stderr = tempfile.TemporaryFile(prefix='q2studio-stderr')
    with redirected_stdio(stdout=stdout, stderr=stderr):
        future = launch(**inputs)

    future.add_done_callback(
        _callback_factory(job_id, outputs, stdout, stderr))

    return jsonify({
        'job': url_for('.inspect_job', job_id=job_id)
    })