def setUp(self):
    """Build the CLI runner plus on-disk metadata, artifact and viz fixtures.

    Everything is written under a fresh temporary directory stored in
    ``self.tempdir``.
    """
    dummy_plugin = get_dummy_plugin()
    self.runner = CliRunner()
    self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

    # Metadata TSV with a numeric and a string column.
    self.metadata_file_mixed_types = os.path.join(
        self.tempdir, 'metadata-mixed-types.tsv')
    with open(self.metadata_file_mixed_types, 'w') as fh:
        fh.write('id\tnumbers\tstrings\n0\t42\tabc\n1\t-1.5\tdef\n')

    # Metadata TSV whose first header cell is 'wrong'.
    self.bad_metadata_file = os.path.join(
        self.tempdir, 'bad-metadata.tsv')
    with open(self.bad_metadata_file, 'w') as fh:
        fh.write('wrong\tnumbers\tstrings\nid1\t42\tabc\nid2\t-1.5\tdef\n')

    # A 'Mapping' artifact saved to disk.
    self.metadata_artifact = os.path.join(self.tempdir, 'metadata.qza')
    mapping = Artifact.import_data('Mapping', {'a': 'dog', 'b': 'cat'})
    mapping.save(self.metadata_artifact)

    # An 'IntSequence1' artifact, both saved (.qza) and exported.
    self.ints1 = os.path.join(self.tempdir, 'ints1.qza')
    int_sequence = Artifact.import_data('IntSequence1', [0, 42, 43], list)
    int_sequence.save(self.ints1)

    self.ints2 = os.path.join(self.tempdir, 'ints')
    int_sequence.export_data(self.ints2)

    # self.viz ends up holding whatever ``save`` returns for the viz.
    self.viz = os.path.join(self.tempdir, 'viz.qzv')
    make_viz = dummy_plugin.actions['most_common_viz']
    viz_results = make_viz(int_sequence)
    self.viz = viz_results.visualization.save(self.viz)
def test_split_ints(self):
    """`split-ints` exits 0 and writes the expected left/right artifacts."""
    command = RootCommand().get_command(ctx=None, name='dummy-plugin')

    # build output file names
    left_path = os.path.join(self.tempdir, 'left.qza')
    right_path = os.path.join(self.tempdir, 'right.qza')

    # TODO: currently must pass `--verbose` to commands invoked by Click's
    # test runner because redirecting stdout/stderr raises an
    # "io.UnsupportedOperation: fileno" error. Likely related to Click
    # mocking a filesystem in the test runner.
    cli_args = [
        'split-ints',
        '--i-ints', self.artifact1_path,
        '--o-left', left_path,
        '--o-right', right_path,
        '--verbose',
    ]
    result = self.runner.invoke(command, cli_args)

    # command completes successfully and creates the correct
    # output files
    self.assertEqual(result.exit_code, 0)
    for produced in (left_path, right_path):
        self.assertTrue(os.path.exists(produced))

    # results are correct
    left_artifact = Artifact.load(left_path)
    right_artifact = Artifact.load(right_path)
    self.assertEqual(left_artifact.view(list), [0])
    self.assertEqual(right_artifact.view(list), [42, 43])
def test_variadic_inputs(self):
    """`variadic-input-method` accepts repeated inputs and parameters."""
    command = RootCommand().get_command(ctx=None, name='dummy-plugin')
    output_path = os.path.join(self.tempdir, 'output.qza')

    def save_artifact(semantic_type, data, filename):
        # Import `data`, save it in the temp dir and return what
        # ``save`` returns (used below as the CLI argument).
        artifact = Artifact.import_data(semantic_type, data)
        return artifact.save(os.path.join(self.tempdir, filename))

    ints1 = save_artifact('IntSequence1', [1, 2, 3], 'ints1.qza')
    ints2 = save_artifact('IntSequence2', [4, 5, 6], 'ints2.qza')
    set1 = save_artifact('SingleInt', 7, 'set1.qza')
    set2 = save_artifact('SingleInt', 8, 'set2.qza')

    cli_args = [
        'variadic-input-method',
        '--i-ints', ints1,
        '--i-ints', ints2,
        '--i-int-set', set1,
        '--i-int-set', set2,
        '--p-nums', '9',
        '--p-nums', '10',
        '--p-opt-nums', '11',
        '--p-opt-nums', '12',
        '--p-opt-nums', '13',
        '--o-output', output_path,
        '--verbose',
    ]
    result = self.runner.invoke(command, cli_args)

    self.assertEqual(result.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))

    combined = Artifact.load(output_path)
    self.assertEqual(combined.view(list), list(range(1, 14)))
def test_repeated_multiple_option(self):
    """`--m-metadata-file` may be given more than once on one command."""
    input_path = os.path.join(self.tempdir, 'ints.qza')
    Artifact.import_data(IntSequence1, [0, 42, 43], list).save(input_path)

    # Two metadata files sharing the same IDs but different columns.
    metadata_path1 = os.path.join(self.tempdir, 'metadata1.tsv')
    with open(metadata_path1, 'w') as fh:
        fh.write('id\tcol1\nid1\tfoo\nid2\tbar\n')

    metadata_path2 = os.path.join(self.tempdir, 'metadata2.tsv')
    with open(metadata_path2, 'w') as fh:
        fh.write('id\tcol2\nid1\tbaz\nid2\tbaa\n')

    output_path = os.path.join(self.tempdir, 'out.qza')

    command = RootCommand().get_command(ctx=None, name='dummy-plugin')
    cli_args = [
        'identity-with-metadata',
        '--i-ints', input_path,
        '--o-out', output_path,
        '--m-metadata-file', metadata_path1,
        '--m-metadata-file', metadata_path2,
        '--verbose',
    ]
    result = self.runner.invoke(command, cli_args)

    self.assertEqual(result.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))
    observed = Artifact.load(output_path).view(list)
    self.assertEqual(observed, [0, 42, 43])
def test_core_metrics_phylogenetic_multiple_jobs(self):
    """Run core_metrics_phylogenetic with ``n_jobs=2`` and check outputs."""
    counts = np.array([[0, 11, 11], [13, 11, 11]])
    biom_table = biom.Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    table = Artifact.import_data('FeatureTable[Frequency]', biom_table)

    newick = io.StringIO('((O1:0.25, O2:0.50):0.25, O3:0.75)root;')
    tree = Artifact.import_data('Phylogeny[Rooted]',
                                skbio.TreeNode.read(newick))

    sample_ids = pd.Index(['S1', 'S2', 'S3'], name='id')
    metadata = Metadata(
        pd.DataFrame({'foo': ['1', '2', '3']}, index=sample_ids))

    results = self.core_metrics_phylogenetic(table, tree, 13, metadata,
                                             n_jobs=2)

    self.assertEqual(len(results), 17)
    self.assertEqual(repr(results.bray_curtis_distance_matrix.type),
                     'DistanceMatrix')
    self.assertEqual(repr(results.jaccard_emperor.type), 'Visualization')

    # pipelines preserve the output's type, in this case, beta_phylogenetic
    # returns this type, and that is passed through to the final output
    # (as long as the type is a subtype of the signature).
    self.assertEqual(
        repr(results.faith_pd_vector.type),
        "SampleData[AlphaDiversity] % Properties(['phylogenetic'])")

    expected = pd.Series({'S1': 1, 'S2': 2, 'S3': 2}, name='observed_otus')
    observed = results[2].view(pd.Series)
    pdt.assert_series_equal(observed, expected)
def test_add_artifacts(self):
    """Artifacts added across two calls are kept in insertion order."""
    # First two artifacts have the same data but different UUIDs.
    first = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
    self.mdc._add_artifacts([first])

    second = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
    third = Artifact.import_data('IntSequence1', [1, 2, 3, 4])
    self.mdc._add_artifacts([second, third])

    expected = (first, second, third)
    self.assertEqual(self.mdc.artifacts, expected)
def test_artifact_mismatch(self):
    """Equal data from two different artifacts must not compare equal."""
    # Metadata created from different artifacts shouldn't compare equal,
    # even if the data is the same.
    source_a = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
    source_b = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})

    md_a = source_a.view(Metadata)
    md_b = source_b.view(Metadata)

    # The underlying tables are identical...
    pdt.assert_frame_equal(md_a.to_dataframe(), md_b.to_dataframe())
    # ...but the Metadata objects are not.
    self.assertReallyNotEqual(md_a, md_b)
def test_add_duplicate_artifact(self):
    """Re-adding an already tracked artifact raises ValueError."""
    mapping = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
    int_sequence = Artifact.import_data('IntSequence1', [1, 2, 3, 4])
    self.mdc._add_artifacts([mapping, int_sequence])

    expected_message = ("Duplicate source artifacts.*DummyMetadataColumn.*"
                        "artifact: Mapping")
    with self.assertRaisesRegex(ValueError, expected_message):
        self.mdc._add_artifacts([mapping])

    # Test that the object hasn't been mutated.
    self.assertEqual(self.mdc.artifacts, (mapping, int_sequence))
def setUp(self):
    """Save an IntSequence1 artifact and a Mapping artifact to a temp dir."""
    get_dummy_plugin()
    self.runner = CliRunner()
    self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

    self.artifact1_path = os.path.join(self.tempdir, 'a1.qza')
    self.mapping_path = os.path.join(self.tempdir, 'mapping.qza')

    int_sequence = Artifact.import_data(IntSequence1, [0, 42, 43])
    int_sequence.save(self.artifact1_path)
    # The artifact's UUID, as a string, is kept for later use by tests.
    self.artifact1_root_dir = str(int_sequence.uuid)

    Artifact.import_data('Mapping', {'foo': '42'}).save(self.mapping_path)
def setUp(self):
    """Create artifacts, metadata TSVs and a command config in a temp dir."""
    get_dummy_plugin()
    self.runner = CliRunner()
    self.plugin_command = RootCommand().get_command(
        ctx=None, name='dummy-plugin')
    self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

    def in_tempdir(filename):
        # Path of `filename` inside this test's temporary directory.
        return os.path.join(self.tempdir, filename)

    def write_text(path, text):
        # Create/overwrite `path` with `text`.
        with open(path, 'w') as fh:
            fh.write(text)

    self.input_artifact = in_tempdir('in.qza')
    int_sequence = Artifact.import_data(IntSequence1, [0, 42, 43], list)
    int_sequence.save(self.input_artifact)
    self.output_artifact = in_tempdir('out.qza')

    self.metadata_file1 = in_tempdir('metadata1.tsv')
    write_text(self.metadata_file1, 'id\tcol1\n0\tfoo\nid1\tbar\n')

    # Same content as metadata1 but with a '#SampleID' ID header.
    self.metadata_file_alt_id_header = in_tempdir(
        'metadata-alt-id-header.tsv')
    write_text(self.metadata_file_alt_id_header,
               '#SampleID\tcol1\n0\tfoo\nid1\tbar\n')

    self.metadata_file2 = in_tempdir('metadata2.tsv')
    write_text(self.metadata_file2, 'id\tcol2\n0\tbaz\nid1\tbaa\n')

    self.metadata_file_mixed_types = in_tempdir('metadata-mixed-types.tsv')
    write_text(self.metadata_file_mixed_types,
               'id\tnumbers\tstrings\nid1\t42\tabc\nid2\t-1.5\tdef\n')

    self.metadata_artifact = in_tempdir('metadata.qza')
    mapping = Artifact.import_data('Mapping', {'a': 'dog', 'b': 'cat'})
    mapping.save(self.metadata_artifact)

    # Config file with one section per command, all pointing at
    # metadata_file1.
    self.cmd_config = in_tempdir('conf.ini')
    config_sections = [
        '[dummy-plugin.identity-with-metadata]\n'
        'm-metadata-file=%s\n' % self.metadata_file1,
        '[dummy-plugin.identity-with-optional-metadata]\n'
        'm-metadata-file=%s\n' % self.metadata_file1,
        '[dummy-plugin.identity-with-metadata-column]\n'
        'm-metadata-file=%s\n'
        'm-metadata-column=col1\n' % self.metadata_file1,
        '[dummy-plugin.identity-with-optional-metadata-column]\n'
        'm-metadata-file=%s\n'
        'm-metadata-column=col1\n' % self.metadata_file1,
    ]
    write_text(self.cmd_config, ''.join(config_sections))
def setUp(self):
    """Look up the pipeline under test and import the input sequences."""
    super().setUp()

    pipelines = self.plugin.pipelines
    self.align_to_tree_mafft_fasttree = pipelines[
        'align_to_tree_mafft_fasttree']

    fasta_path = self.get_data_path('dna-sequences-1.fasta')
    self.input_sequences = Artifact.import_data(
        'FeatureData[Sequence]', fasta_path)
def test_add_non_artifact(self):
    """Passing a non-Artifact value raises TypeError and adds nothing."""
    mapping = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
    candidates = [mapping, 42]

    with self.assertRaisesRegex(TypeError, "Artifact object.*42"):
        self.mdc._add_artifacts(candidates)

    # Test that the object hasn't been mutated.
    self.assertEqual(self.mdc.artifacts, ())
def test_no_optional_artifacts_provided(self):
    """The method runs when only the required input and parameter are set."""
    cli_args = (
        'optional-artifacts-method',
        '--i-ints', self.ints1,
        '--p-num1', 42,
        '--o-output', self.output,
        '--verbose',
    )
    result = self._run_command(*cli_args)

    self.assertEqual(result.exit_code, 0)
    observed = Artifact.load(self.output).view(list)
    self.assertEqual(observed, [0, 42, 43, 42])
def test_with_artifacts(self):
    """``filter_ids`` keeps the source artifacts on the filtered column."""
    artifact1 = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
    artifact2 = Artifact.import_data('Mapping', {'d': '4'})
    sources = [artifact1, artifact2]

    full_series = pd.Series(
        [1, 2, 3], name='col1',
        index=pd.Index(['a', 'b', 'c'], name='id'))
    mdc = DummyMetadataColumn(full_series)
    mdc._add_artifacts(sources)

    obs = mdc.filter_ids({'a', 'c'})

    filtered_series = pd.Series(
        [1, 3], name='col1',
        index=pd.Index(['a', 'c'], name='id'))
    exp = DummyMetadataColumn(filtered_series)
    exp._add_artifacts(sources)

    self.assertEqual(obs, exp)
    self.assertEqual(obs.artifacts, (artifact1, artifact2))
def test_artifacts_mismatch(self):
    """Columns with equal data but different artifacts are not equal."""
    artifact1 = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
    artifact2 = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})

    series = pd.Series(
        [42, 43], name='col1',
        index=pd.Index(['id1', 'id2'], name='id'))

    # No artifacts
    without_artifact = DummyMetadataColumn(series)

    # Has an artifact
    with_artifact1 = DummyMetadataColumn(series)
    with_artifact1._add_artifacts([artifact1])

    # Has a different artifact
    with_artifact2 = DummyMetadataColumn(series)
    with_artifact2._add_artifacts([artifact2])

    self.assertReallyNotEqual(without_artifact, with_artifact1)
    self.assertReallyNotEqual(with_artifact1, with_artifact2)
def setUp(self):
    """Save one IntSequence1 artifact, built via ``_from_view``, to disk."""
    self.runner = CliRunner()
    self.tempdir = tempfile.mkdtemp(prefix='qiime2-test-temp-')
    self.artifact1_path = os.path.join(self.tempdir, 'a1.qza')

    provenance = ImportProvenanceCapture()
    int_sequence = Artifact._from_view(
        IntSequence1, [0, 42, 43], list, provenance_capture=provenance)
    int_sequence.save(self.artifact1_path)

    # The artifact's UUID, as a string, is kept for later use by tests.
    self.artifact1_root_dir = str(int_sequence.uuid)
def test_core_metrics_phylogenetic_rarefy_drops_sample(self):
    """Only S2 and S3 are expected in the observed_otus output."""
    counts = np.array([[0, 11, 11], [12, 11, 11]])
    biom_table = biom.Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    table = Artifact.import_data('FeatureTable[Frequency]', biom_table)

    newick = io.StringIO('((O1:0.25, O2:0.50):0.25, O3:0.75)root;')
    tree = Artifact.import_data('Phylogeny[Rooted]',
                                skbio.TreeNode.read(newick))

    metadata_df = pd.DataFrame({'foo': ['1', '2', '3']},
                               index=['S1', 'S2', 'S3'])
    metadata = Metadata(metadata_df)

    results = self.core_metrics_phylogenetic(table, tree, 13, metadata)

    self.assertEqual(len(results), 17)

    expected = pd.Series({'S2': 2, 'S3': 2}, name='observed_otus')
    observed = results[2].view(pd.Series)
    pdt.assert_series_equal(observed, expected)
def test_artifacts_are_propagated(self):
    """A column pulled from artifact-backed Metadata keeps that artifact."""
    source = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
    metadata = source.view(Metadata)

    column = metadata.get_column('b')

    # TODO update to use MetadataColumn.__eq__
    self.assertEqual(column.artifacts, (source,))

    expected_series = pd.Series(
        ['3'], index=pd.Index(['0'], name='id'), name='b')
    pdt.assert_series_equal(column.to_series(), expected_series)
def test_source_mismatch(self):
    """Artifact-backed Metadata differs from a plain copy of its data."""
    # Metadata created from an artifact vs not shouldn't compare equal,
    # even if the data is the same.
    source = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
    md_from_artifact = source.view(Metadata)
    md_no_artifact = Metadata(md_from_artifact.to_dataframe())

    # Same table contents on both sides...
    artifact_df = md_from_artifact.to_dataframe()
    plain_df = md_no_artifact.to_dataframe()
    pdt.assert_frame_equal(artifact_df, plain_df)

    # ...yet the Metadata objects themselves must differ.
    self.assertReallyNotEqual(md_from_artifact, md_no_artifact)
def test_with_artifacts(self):
    """Merging Metadata collects the artifacts of all merged objects."""
    artifact1 = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
    artifact2 = Artifact.import_data('Mapping', {'d': '4'})

    md_from_artifact1 = artifact1.view(Metadata)
    md_from_artifact2 = artifact2.view(Metadata)

    plain_df = pd.DataFrame(
        {'c': ['3', '42']}, index=pd.Index(['0', '1'], name='id'))
    md_no_artifact = Metadata(plain_df)

    # Merge three metadata objects -- the first has an artifact, the second
    # does not, and the third has an artifact.
    obs_md = md_from_artifact1.merge(md_no_artifact, md_from_artifact2)

    exp_df = pd.DataFrame(
        {'a': '1', 'b': '2', 'c': '3', 'd': '4'},
        index=pd.Index(['0'], name='id'))
    exp_md = Metadata(exp_df)
    exp_md._add_artifacts((artifact1, artifact2))

    self.assertEqual(obs_md, exp_md)
    self.assertEqual(obs_md.artifacts, (artifact1, artifact2))
def test_equality_with_artifact(self):
    """Columns with equal data and the same artifact compare equal."""
    artifact = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})

    def build_column():
        # Fresh column over a fresh Series, tagged with `artifact`.
        series = pd.Series(
            [42, 43], name='col1',
            index=pd.Index(['id1', 'id2'], name='id'))
        column = DummyMetadataColumn(series)
        column._add_artifacts([artifact])
        return column

    mdc1 = build_column()
    mdc2 = build_column()

    self.assertReallyEqual(mdc1, mdc2)
def test_without_inputs_or_parameters(self):
    """`no-input-method` runs with only an output option."""
    command = RootCommand().get_command(ctx=None, name='dummy-plugin')
    output_path = os.path.join(self.tempdir, 'output.qza')

    cli_args = ['no-input-method', '--o-out', output_path, '--verbose']
    result = self.runner.invoke(command, cli_args)

    self.assertEqual(result.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))

    produced = Artifact.load(output_path)
    self.assertEqual(produced.view(dict), {'foo': '42'})
def setUp(self):
    """Prepare the pipeline, a 3x3 distance matrix and a numeric column."""
    super().setUp()
    self.beta_correlation = self.plugin.pipelines['beta_correlation']

    sample_ids = ['sample1', 'sample2', 'sample3']

    distances = [[0, 1, 2],
                 [1, 0, 1],
                 [2, 1, 0]]
    dm = skbio.DistanceMatrix(distances, ids=list(sample_ids))
    self.dm = Artifact.import_data('DistanceMatrix', dm)

    numbers = pd.Series(
        [1, 2, 3], name='number',
        index=pd.Index(list(sample_ids), name='id'))
    self.md = qiime2.NumericMetadataColumn(numbers)
def _assertMetadataOutput(self, result, *, exp_tsv, exp_yaml):
    """Check the provenance written for the output artifact.

    Asserts that `result` exited with code 0, then inspects the
    ``action`` directory under the output artifact's provenance dir:
    ``metadata.tsv`` must be absent when `exp_tsv` is None and otherwise
    match `exp_tsv` exactly, and ``action.yaml`` must contain `exp_yaml`.
    """
    self.assertEqual(result.exit_code, 0)

    output = Artifact.load(self.output_artifact)
    action_dir = output._archiver.provenance_dir / 'action'
    tsv_path = action_dir / 'metadata.tsv'
    yaml_path = action_dir / 'action.yaml'

    if exp_tsv is not None:
        with tsv_path.open() as fh:
            self.assertEqual(fh.read(), exp_tsv)
    else:
        self.assertFalse(tsv_path.exists())

    with yaml_path.open() as fh:
        self.assertIn(exp_yaml, fh.read())
def test_multiple_metadata(self):
    """A category can be selected from several merged metadata sources."""
    commands = ('identity-with-metadata-category',
                'identity-with-optional-metadata-category')

    for command in commands:
        cli_args = (
            command,
            '--i-ints', self.input_artifact,
            '--o-out', self.output_artifact,
            '--m-metadata-file', self.metadata_file1,
            '--m-metadata-file', self.metadata_file2,
            '--m-metadata-file', self.metadata_artifact,
            '--m-metadata-category', 'col2',
            '--verbose',
        )
        result = self._run_command(*cli_args)

        # action.yaml is expected to reference the metadata artifact's UUID.
        artifact_uuid = Artifact.load(self.metadata_artifact).uuid
        exp_yaml = "metadata: !metadata '%s:metadata.tsv'" % (
            artifact_uuid)
        self._assertMetadataOutput(result, exp_tsv='0\tbaz\n',
                                   exp_yaml=exp_yaml)
def test_qza_extension(self):
    """Outputs given without an extension are written with ``.qza``."""
    command = RootCommand().get_command(ctx=None, name='dummy-plugin')

    # build output parameter arguments and expected output file names
    left_path = os.path.join(self.tempdir, 'left')
    right_path = os.path.join(self.tempdir, 'right')
    expected_left_path = os.path.join(self.tempdir, 'left.qza')
    expected_right_path = os.path.join(self.tempdir, 'right.qza')

    cli_args = [
        'split-ints',
        '--i-ints', self.artifact1_path,
        '--o-left', left_path,
        '--o-right', right_path,
        '--verbose',
    ]
    result = self.runner.invoke(command, cli_args)

    # command completes successfully and creates the correct
    # output files
    self.assertEqual(result.exit_code, 0)
    for expected_path in (expected_left_path, expected_right_path):
        self.assertTrue(os.path.exists(expected_path))

    # results are correct
    left_artifact = Artifact.load(expected_left_path)
    right_artifact = Artifact.load(expected_right_path)
    self.assertEqual(left_artifact.view(list), [0])
    self.assertEqual(right_artifact.view(list), [42, 43])
def setUp(self):
    """Create artifacts, metadata TSVs and a command config in a temp dir."""
    get_dummy_plugin()
    self.runner = CliRunner()
    self.plugin_command = RootCommand().get_command(
        ctx=None, name='dummy-plugin')
    self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

    def in_tempdir(filename):
        # Path of `filename` inside this test's temporary directory.
        return os.path.join(self.tempdir, filename)

    def write_text(path, text):
        # Create/overwrite `path` with `text`.
        with open(path, 'w') as fh:
            fh.write(text)

    self.input_artifact = in_tempdir('in.qza')
    int_sequence = Artifact.import_data(IntSequence1, [0, 42, 43], list)
    int_sequence.save(self.input_artifact)
    self.output_artifact = in_tempdir('out.qza')

    self.metadata_file1 = in_tempdir('metadata1.tsv')
    write_text(self.metadata_file1, 'id\tcol1\n0\tfoo\nid1\tbar\n')

    self.metadata_file2 = in_tempdir('metadata2.tsv')
    write_text(self.metadata_file2, 'id\tcol2\n0\tbaz\nid1\tbaa\n')

    self.metadata_artifact = in_tempdir('metadata.qza')
    mapping = Artifact.import_data('Mapping', {'a': 'dog', 'b': 'cat'})
    mapping.save(self.metadata_artifact)

    # Config file with one section per command, all pointing at
    # metadata_file1.
    self.cmd_config = in_tempdir('conf.ini')
    config_sections = [
        '[dummy-plugin.identity-with-metadata]\n'
        'm-metadata-file=%s\n' % self.metadata_file1,
        '[dummy-plugin.identity-with-optional-metadata]\n'
        'm-metadata-file=%s\n' % self.metadata_file1,
        '[dummy-plugin.identity-with-metadata-category]\n'
        'm-metadata-file=%s\n'
        'm-metadata-category=col1\n' % self.metadata_file1,
        '[dummy-plugin.identity-with-optional-metadata-category]\n'
        'm-metadata-file=%s\n'
        'm-metadata-category=col1\n' % self.metadata_file1,
    ]
    write_text(self.cmd_config, ''.join(config_sections))
def test_core_metrics(self):
    """core_metrics yields 10 results with the expected types and values."""
    counts = np.array([[0, 11, 11], [13, 11, 11]])
    biom_table = biom.Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    table = Artifact.import_data('FeatureTable[Frequency]', biom_table)

    metadata_df = pd.DataFrame({'foo': ['1', '2', '3']},
                               index=['S1', 'S2', 'S3'])
    metadata = Metadata(metadata_df)

    results = self.core_metrics(table, 13, metadata)

    self.assertEqual(len(results), 10)
    self.assertEqual(repr(results.bray_curtis_distance_matrix.type),
                     'DistanceMatrix')
    self.assertEqual(repr(results.jaccard_emperor.type), 'Visualization')

    expected = pd.Series({'S1': 1, 'S2': 2, 'S3': 2}, name='observed_otus')
    observed = results[1].view(pd.Series)
    pdt.assert_series_equal(observed, expected)
def test_artifacts_are_propagated(self):
    """``drop_missing_values`` keeps the source artifact on its result."""
    artifact = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})

    with_missing = pd.Series(
        [0.0, np.nan, 3.3, np.nan, np.nan, 4.4], name='col1',
        index=pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], name='sampleid'))
    mdc = DummyMetadataColumn(with_missing)
    mdc._add_artifacts([artifact])

    obs = mdc.drop_missing_values()

    without_missing = pd.Series(
        [0.0, 3.3, 4.4], name='col1',
        index=pd.Index(['a', 'c', 'f'], name='sampleid'))
    exp = DummyMetadataColumn(without_missing)
    exp._add_artifacts([artifact])

    self.assertEqual(obs, exp)
    self.assertEqual(obs.artifacts, (artifact,))
def test_multiple_metadata(self):
    """Several metadata sources are merged into one provenance TSV."""
    commands = ('identity-with-metadata',
                'identity-with-optional-metadata')

    for command in commands:
        cli_args = (
            command,
            '--i-ints', self.input_artifact,
            '--o-out', self.output_artifact,
            '--m-metadata-file', self.metadata_file_alt_id_header,
            '--m-metadata-file', self.metadata_file2,
            '--m-metadata-file', self.metadata_artifact,
            '--verbose',
        )
        result = self._run_command(*cli_args)

        exp_tsv = (
            'id\tcol1\tcol2\ta\tb\n'
            '#q2:types\tcategorical\tcategorical\tcategorical\tcategorical'
            '\n0\tfoo\tbaz\tdog\tcat\n'
        )
        # action.yaml is expected to reference the metadata artifact's UUID.
        artifact_uuid = Artifact.load(self.metadata_artifact).uuid
        exp_yaml = "metadata: !metadata '%s:metadata.tsv'" % (
            artifact_uuid)
        self._assertMetadataOutput(result, exp_tsv=exp_tsv,
                                   exp_yaml=exp_yaml)
def setUp(self):
    """Save three int-sequence artifacts and pick an output path."""
    get_dummy_plugin()
    self.runner = CliRunner()
    self.plugin_command = RootCommand().get_command(
        ctx=None, name='dummy-plugin')
    self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

    def save_ints(semantic_type, values, filename):
        # Import `values`, save them in the temp dir, return the path.
        path = os.path.join(self.tempdir, filename)
        Artifact.import_data(semantic_type, values, list).save(path)
        return path

    self.ints1 = save_ints(IntSequence1, [0, 42, 43], 'ints1.qza')
    self.ints2 = save_ints(IntSequence1, [99, -22], 'ints2.qza')
    self.ints3 = save_ints(IntSequence2, [43, 43], 'ints3.qza')

    self.output = os.path.join(self.tempdir, 'output.qza')
def demultiplex_manifests(fastq_files, primers, regions=None,
                          split_on_header=True, threads=16):
    """Demultiplex fastq files into variable region origins.

    Work is done inside a fresh temporary directory, which is removed
    (and the previous working directory restored) before returning, even
    when an error occurs.

    Parameters
    ----------
    fastq_files : iterable of str
        Paths of fastq files; only those containing ``'_R1.fastq'`` are
        handed to the workers.
    primers : dict
        Mapping keyed by region name.
    regions : iterable of str, optional
        Regions to process. When given, ``primers`` is restricted to these
        keys. When omitted, every key of ``primers`` is processed.
    split_on_header : bool
        If true, run ``seqkit_worker`` for every (R1 file, region) pair;
        otherwise run ``cutadapt_worker`` for every R1 file.
    threads : int
        Size of the multiprocessing pools.

    Returns
    -------
    dict
        Region name -> loaded ``Artifact``, for each region that produced
        a manifest.
    """
    if regions is None:
        # Previously ``None`` crashed below; default to all known regions.
        regions = list(primers)
    else:
        primers = {k: v for k, v in primers.items() if k in regions}

    r1 = [abspath(i) for i in fastq_files if '_R1.fastq' in i]

    rundir = mkdtemp()  # run in a tmpdir
    cwd = os.path.abspath(os.curdir)
    os.chdir(rundir)
    try:
        with mp.Pool(threads) as pool:
            if split_on_header:
                args_iter = itertools.product(r1, regions)
                pool.starmap(seqkit_worker, args_iter)
            else:
                # The original passed an undefined name as ``chunksize``.
                # NOTE(review): assumes cutadapt_worker(fastq, primers) --
                # confirm against the worker's signature.
                pool.starmap(cutadapt_worker, [(fq, primers) for fq in r1])

        manifest_filenames = {}
        for r in regions:
            R1 = glob.glob(join(rundir, '*_{}_R1.fastq'.format(r)))
            df = pandas_manifest(R1)
            if df is not None:
                manifest_fn = r + '_manifest.csv'
                df.to_csv(manifest_fn, index=False, sep='\t')
                manifest_filenames[r] = manifest_fn

        adata = {}
        with mp.Pool(threads) as pool:
            pool.map(import_data_worker, list(manifest_filenames.values()))

        for r, fn in manifest_filenames.items():
            write_message('importing data ({}) from {}'.format(r, fn))
            # The .qza is named after the manifest's prefix up to the
            # first underscore.
            adata[r] = Artifact.load(fn.split('_')[0] + '.qza')
    finally:
        # clean up tmpdir, also on failure
        os.chdir(cwd)
        shutil.rmtree(rundir)

    return adata
def main():
    """Sanity-check Shannon diversity of per-habitat weight artifacts.

    For each habitat, and for both the ``515f-806r`` and ``full_length``
    directories, loads ``<habitat>.qza``, computes Shannon alpha diversity
    and keeps the first value. Then asserts, per directory, that no two
    habitats share a value and that ``average`` has the largest value.

    Raises
    ------
    AssertionError
        If either check fails.
    """
    cdir = Path("./data/silva_138_1")
    habitats = """
        animal-distal-gut animal-surface animal-secretion water-non-saline
        animal-proximal-gut animal-corpus plant-rhizosphere water-saline
        sediment-saline sediment-non-saline plant-corpus plant-surface
        surface-saline soil-non-saline human-stool human-oral average
    """.split()

    v4 = {}
    fl = {}
    for habitat in habitats:
        for ddirs, collection in [("515f-806r", v4), ("full_length", fl)]:
            art = Artifact.load(cdir / ddirs / (habitat + ".qza"))
            alpha_vector = diversity.actions.alpha(
                metric="shannon", table=art)[0].view(Series)
            # Positional access: ``Series[0]`` on a label index relies on
            # a deprecated positional fallback in pandas.
            collection[habitat] = alpha_vector.iloc[0]

    for collection in (v4, fl):
        assert len(collection.values()) == len(set(collection.values())), \
            "WARNING: two sets of weights are the same"

    for collection in (v4, fl):
        for alpha in collection.values():
            assert collection['average'] >= alpha, \
                "WARNING: average weights are not the most diverse"
def test_parse_q2_data_wrong_semantic_type(self):
    """A QZA of the wrong semantic type raises ConfigurationError."""
    resource_filename = self.create_tempfile(suffix='.qza').name

    taxonomy = pd.Series(
        {'feature1': 'k__1', 'feature2': 'k__2'}, name='Taxon')
    taxonomy.index.name = 'Feature ID'

    # the distinction here is that this is not alpha diversity
    wrong_type_artifact = Artifact.import_data("FeatureData[Taxonomy]",
                                               taxonomy)
    wrong_type_artifact.save(resource_filename)

    expected_message = (r"Expected (.*) "
                        r"'SampleData\[AlphaDiversity\]'. "
                        r"Received 'FeatureData\[Taxonomy\]'.")
    with self.assertRaisesRegex(ConfigurationError, expected_message):
        _parse_q2_data(resource_filename,
                       SampleData[AlphaDiversity],
                       view_type=pd.Series)
def _parse_q2_data(filepath, semantic_type, view_type=None,
                   ignore_predicate=True):
    """Load a QZA, check its semantic type and optionally view it.

    Parameters
    ----------
    filepath
        Path handed to ``Artifact.load``.
    semantic_type
        Type the artifact is required to have.
    view_type : optional
        When given, the artifact is returned as ``data.view(view_type)``;
        otherwise the loaded artifact itself is returned.
    ignore_predicate : bool
        When true, the artifact's type is rebuilt from its template and
        fields only before it is compared with ``semantic_type``.

    Raises
    ------
    ConfigurationError
        If loading raises ``ValueError`` (same args, chained to the
        original error) or if the types do not match.
    """
    try:
        data = Artifact.load(filepath)
    except ValueError as e:
        # Chain explicitly so the load failure is recorded as the cause.
        raise ConfigurationError(*e.args) from e

    data_type = data.type
    if ignore_predicate:
        data_type = TypeExp(data_type.template, fields=data_type.fields)

    if data_type != semantic_type:
        # The message reports the artifact's full, unmodified type.
        raise ConfigurationError(f"Expected QZA '{filepath}' to have type "
                                 f"'{semantic_type}'. "
                                 f"Received '{data.type}'.")

    if view_type is not None:
        data = data.view(view_type=view_type)
    return data
def validate_denoise_input(sequence_data):
    """
    Precheck input files prior to running denoise step

    Input:
        - sequence_data: sequence data in QIIME2 artifact format

    Returns:
        - (200, message) when the artifact loads and has type
          SampleData[PairedEndSequencesWithQuality]
        - (400, message) when loading raises ValueError or the type differs
    """
    expected_type = "SampleData[PairedEndSequencesWithQuality]"

    # Check Artifact type
    try:
        q2_artifact = Artifact.load(sequence_data)
        type_matches = str(q2_artifact.type) == expected_type
    except ValueError as err:
        return 400, str(err)

    if not type_matches:
        return 400, ("Input QIIME2 Artifact is not of type "
                     "'SampleData[PairedEndSequencesWithQuality]'!")

    return 200, "Imported data good!"
def import_qiime2_feature_table(feature_table_filepath):
    """
    Convert QIIME2 feature table artifact to compatible format

    Loads the artifact, views it as a DataFrame and returns it transposed,
    with the index named "SampleID" and reset into a column. Raises
    ValueError when the artifact is not one of the two accepted
    FeatureTable types.
    """
    accepted_types = ("FeatureTable[Frequency]",
                      "FeatureTable[RelativeFrequency]")

    artifact = Artifact.load(feature_table_filepath)

    # raise error if not feature table artifact
    if str(artifact.type) not in accepted_types:
        raise ValueError(
            "Input artifact is not of type FeatureTable[Frequency] or FeatureTable[RelativeFrequency]!"
        )

    # return transposed version for better view
    by_sample = artifact.view(pd.DataFrame).T
    by_sample.index.name = "SampleID"
    return by_sample.reset_index()
def convert(artifact_path):
    """
    Converts a QIIME2 artifact to pandas objects, if applicable

    Input:
        - artifact_path: path to QIIME2 artifact (.qza)

    Returns:
        - Dictionary with pandas series or dataframe as values:
            * FeatureTable[Frequency] / FeatureTable[RelativeFrequency]:
              {"feature_table": DataFrame}
            * PCoAResults: {"eigvals", "coordinates",
              "proportion_explained"} taken from the ordination results
            * any other type: {} (a warning is logged)
    """
    artifact = Artifact.load(artifact_path)
    artifact_type = str(artifact.type)

    if artifact_type in ("FeatureTable[Frequency]",
                         "FeatureTable[RelativeFrequency]"):
        return {"feature_table": artifact.view(pd.DataFrame)}

    if artifact_type == "PCoAResults":
        ordination_result = artifact.view(ordination.OrdinationResults)
        # Fixed: the original read `ordination_results` (undefined name)
        # for the last entry, which raised NameError for every PCoA input.
        return {
            "eigvals": ordination_result.eigvals,
            "coordinates": ordination_result.samples,
            "proportion_explained": ordination_result.proportion_explained,
        }

    logger.warning("Could not convert specified QIIME2 artifact.")
    return {}
def setUp(self):
    """Build the ranks artifact plus taxonomy and metabolite columns."""
    feature_ids = list('ABCD')
    metabolite_ids = ['m1', 'm2', 'm3']

    rank_values = [[4.1, 1.3, 2.1],
                   [0.1, 0.3, 0.2],
                   [2.2, 4.3, 3.2],
                   [-6.3, -4.4, 2.1]]
    _ranks = pd.DataFrame(rank_values,
                          index=pd.Index(list(feature_ids), name='id'),
                          columns=list(metabolite_ids))
    self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)

    taxonomy_strings = [
        'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
        'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
        'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
        'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
        'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
        'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
        'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'
    ]
    taxa_series = pd.Series(
        taxonomy_strings,
        index=pd.Index(list(feature_ids), name='feature-id'),
        name='Taxon')
    self.taxa = CategoricalMetadataColumn(taxa_series)

    pathway_series = pd.Series(
        ['amino acid', 'carbohydrate', 'drug metabolism'],
        index=pd.Index(list(metabolite_ids), name='feature-id'),
        name='Super Pathway')
    self.metabolites = CategoricalMetadataColumn(pathway_series)
def merge_data(tables, taxas, sequences, samples):
    """Merge per-region tables, taxonomies and sequences.

    Parameters
    ----------
    tables, taxas, sequences : dict
        Per-region results keyed by region name; ``taxas`` drives the
        iteration order and the other two are indexed with the same keys.
    samples
        Unused here; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(merged_table, merged_taxa, merged_seq, meta)`` where the first
        three are the results returned by the ``feature_table`` merge
        methods and ``meta`` is a ``Metadata`` with one ``region`` column
        naming, per feature, the region(s) it was seen in (joined by
        ``'_'``).
    """
    taxa_list = []
    table_list = []
    seq_list = []
    meta_region = {}

    write_message('merging region results ...')
    # ensure same ordering of dicts
    for r in taxas:
        write_message('collecting data from {}'.format(r))
        taxa_list.append(taxas[r].classification)

        df = tables[r].view(pd.DataFrame)
        # Strip the region tag literally, so the result does not depend on
        # regex metacharacters in `r` or on the pandas default for `regex`.
        df.index = df.index.str.replace('_{}'.format(r), '', regex=False)

        for seq_id in df.columns:
            if seq_id in meta_region:
                meta_region[seq_id] = meta_region[seq_id] + '_' + r
            else:
                meta_region[seq_id] = r

        table_list.append(Artifact.import_data('FeatureTable[Frequency]', df))
        seq_list.append(sequences[r])

    merged_taxa = feature_table.methods.merge_taxa(taxa_list)
    merged_seq = feature_table.methods.merge_seqs(seq_list)
    merged_table = feature_table.methods.merge(table_list,
                                               overlap_method='sum')

    # View the merged table once; the original materialised it twice.
    feature_ids = merged_table.merged_table.view(pd.DataFrame).columns
    meta = pd.DataFrame(
        {'region': [meta_region[seq_id] for seq_id in feature_ids]},
        index=pd.Index(feature_ids, name='feature-id'))
    meta = Metadata(meta)

    return merged_table, merged_taxa, merged_seq, meta
def test__qiime2_rclr(self):
    """Tests q2-rclr matches standalone rclr."""
    # make mock table to write
    n_samples, n_features = self.cdata.shape[0], self.cdata.shape[1]
    samps_ids = ['s%i' % i for i in range(n_samples)]
    feats_ids = ['f%i' % i for i in range(n_features)]
    table_test = Table(self.cdata.T, feats_ids, samps_ids)

    # write table
    in_ = get_data_path('test.biom', subfolder='data')
    out_path = os_path_sep.join(in_.split(os_path_sep)[:-1])
    test_path = os.path.join(out_path, 'rclr-test.biom')
    with biom_open(test_path, 'w') as wf:
        table_test.to_hdf5(wf, "test")

    # run standalone
    runner = CliRunner()
    cli_args = ['--in-biom', test_path, '--output-dir', out_path]
    result = runner.invoke(sdc.commands['rclr'], cli_args)
    out_table = get_data_path('rclr-table.biom', subfolder='data')
    res_table = load_table(out_table)
    standalone_mat = res_table.matrix_data.toarray().T

    # check that exit code was 0 (indicating success)
    try:
        self.assertEqual(0, result.exit_code)
    except AssertionError:
        # Re-raise with the traceback of the exception the command hit.
        ex = result.exception
        error = Exception('Command failed with non-zero exit code')
        raise error.with_traceback(ex.__traceback__)

    # run QIIME2
    q2_table_test = Artifact.import_data("FeatureTable[Frequency]",
                                         table_test)
    q2_results = rclr_transformation(q2_table_test)
    q2_res = q2_results.rclr_table.view(Table)
    q2_res_mat = q2_res.matrix_data.toarray().T

    # check same and check both correct
    npt.assert_allclose(standalone_mat, q2_res_mat)
    npt.assert_allclose(standalone_mat, self.true)
    npt.assert_allclose(q2_res_mat, self.true)
def qiime2PCoA(sample_metadata, df, out_dir, norm=True, scale=False,
               metric='canberra'):
    """Compute a PCoA from peak areas and write an Emperor plot.

    Parameters
    ----------
    sample_metadata : pandas.DataFrame
        Sample metadata with a ``filename`` column. NOTE: modified in
        place (column renamed, whitespace in column names replaced by
        ``_``, index set from the renamed column, which is then dropped).
    df : pandas.DataFrame
        Feature table with a ``row ID`` column and one column per sample
        whose name contains ``' Peak area'``.
    out_dir : str
        Output location. If it contains ``'.qzv'`` the visualization is
        saved there, otherwise it is exported to that path.
    norm : bool
        Divide each sample (row, after transposing) by its sum.
    scale : bool
        Standardise each feature column (subtract mean, divide by std).
    metric : str
        Distance metric passed to ``pdist``.

    Returns
    -------
    The results object returned by ``diversity.methods.pcoa``.
    """
    sample_metadata.rename(index=str, columns={"filename": "#SampleID"},
                           inplace=True)
    # Raw pattern plus explicit regex=True: the default for `regex`
    # differs between pandas versions, and with regex=False the pattern
    # would be matched literally and whitespace left untouched.
    sample_metadata.columns = sample_metadata.columns.str.replace(
        r'\s', '_', regex=True)
    sample_metadata.index = sample_metadata['#SampleID']
    sample_metadata.drop(['#SampleID'], axis=1, inplace=True)
    qsample_metadata = qiime2.metadata.Metadata(sample_metadata)

    # Copy so relabelling below does not write into a slice of `df`.
    peak_columns = df.columns[df.columns.str.contains(' Peak area')]
    df2 = df[peak_columns].copy()
    # Reduce each column name to its leading '<name>.mzML' / '.mzXML' part.
    df2.columns = [re.sub(r'(.+\.mzX?ML) .+', '\\1', a) for a in df2.columns]
    df2.index = df['row ID'].astype(str)
    df2 = df2.T  # rows are now samples

    if norm:
        df2 = df2.apply(lambda a: a / sum(a), axis=1)
    if scale:
        df2 = (df2 - df2.mean()) / df2.std()

    dm1 = squareform(pdist(df2, metric=metric))
    dm1 = skbio.DistanceMatrix(dm1, ids=df2.index.tolist())
    dm1 = Artifact.import_data("DistanceMatrix", dm1)

    pcoa = diversity.methods.pcoa(dm1)
    emperor_plot = emperor.visualizers.plot(pcoa.pcoa, qsample_metadata)

    if '.qzv' in out_dir:
        emperor_plot.visualization.save(out_dir)
    else:
        emperor_plot.visualization.export_data(out_dir)

    return pcoa
def cross_validate_for_weights(ref_taxa, ref_seqs, weights, obs_dir,
                               results_dir, intermediate_dir, n_jobs,
                               log_file, log_level):
    """Classify the simulated test samples of every fold using `weights`.

    Reads ``taxonomy_samples.biom``, ``taxon_defaults.json`` and the
    ``fold-*`` entries from `intermediate_dir`, loads the weights
    artifact, and for each fold trains on the fold's training artifacts,
    classifies the fold's test samples and saves the result via
    ``save_observed``.
    """
    # set up logging
    setup_logging(log_level, log_file)
    logging.info(locals())

    # load taxonomy-level information
    taxonomy_samples = biom.load_table(
        join(intermediate_dir, 'taxonomy_samples.biom'))
    logging.info('Got taxonomy samples')

    # load folds
    with open(join(intermediate_dir, 'taxon_defaults.json')) as fh:
        taxon_defaults = json.load(fh)
    folds = glob.glob(join(intermediate_dir, 'fold-*'))
    logging.info('Got folds')

    # load the weights
    weights_artifact = Artifact.load(weights)

    # for each fold
    for fold in folds:
        # load the simulated test samples
        test_samples = load_simulated_samples(fold, results_dir)

        # generate the training taxa, seqs, ref_seqs, reduced weights
        train_artifacts = get_train_artifacts(
            taxonomy_samples, fold, taxon_defaults, ref_taxa, ref_seqs,
            weights_artifact)
        train_taxa, _train_seqs, ref_seqs_art, fold_weights = train_artifacts

        # train the weighted classifier and classify the test samples
        classification = classify_samples(
            test_samples, train_taxa, ref_seqs_art, 0.7, n_jobs,
            fold_weights)

        # save the classified taxonomy artifacts
        save_observed(results_dir, test_samples, classification, obs_dir)
        logging.info('Done ' + fold)
def setUp(self):
    """Create temp files, sample series, a manager and an update mapping."""
    super().setUp()

    # Two .qza paths; only the first one gets an artifact written to it.
    self.qza_resource_fp = self.create_tempfile(suffix='.qza').name
    self.qza_resource_fp2 = self.create_tempfile(suffix='.qza').name

    # A third temp file is created and closed straight away; its name is
    # kept as the "dne" resource path.
    closed_handle = self.create_tempfile(suffix='.qza')
    self.qza_resource_fh2 = closed_handle
    closed_handle.close()
    self.qza_resource_dne = closed_handle.name

    self.non_qza_resource_fp = self.create_tempfile(
        suffix='.some_ext').name

    sample_ids = ['sample1', 'sample2']
    self.test_series = pd.Series([7.15, 9.04], index=sample_ids,
                                 name='chao1')
    self.test_series2 = pd.Series([7.16, 9.01], index=sample_ids,
                                  name='faith_pd')

    self.resources = ResourceManager(some_key='some_value')

    Artifact.import_data(
        "SampleData[AlphaDiversity]", self.test_series
    ).save(self.qza_resource_fp)

    alpha_resources = {'chao1': self.qza_resource_fp, 'faith_pd': 9}
    self.update_with = {
        'random-value': 7.24,
        'alpha_resources': alpha_resources,
        'other': {'dict': {'of': 'things'}},
    }
def taxon2fasta(taxonomy, sequences, taxon, path):
    """Export the sequences annotated exactly as ``taxon``.

    taxonomy  -- artifact of type FeatureData[Taxonomy]
    sequences -- artifact of type FeatureData[Sequence]
    taxon     -- annotation string of interest; compared for equality
                 against the 'Taxon' column
    path      -- string, where the filtered sequences are exported
    """
    # Taxonomy as a DataFrame, narrowed to rows annotated as ``taxon``.
    taxonomy_table = taxonomy.view(pd.DataFrame)
    is_match = taxonomy_table['Taxon'] == taxon
    matching_ids = taxonomy_table.loc[is_match].index

    # Sequences as a Series, narrowed to the matching feature ids.
    all_sequences = sequences.view(pd.Series)
    selected = all_sequences[matching_ids]

    # Wrap the selection in a new artifact and export it to ``path``.
    Artifact.import_data('FeatureData[Sequence]', selected).export_data(path)
def test_integration(self):
    """Run the sidle steps in sequence on the 'files/integration' fixtures.

    Each stage (region extraction, regional alignment, database
    reconstruction, count reconstruction) is fed the outputs of the
    previous one and compared with the stored artifacts in the 'known'
    directory.
    """
    # This will run through a slightly more complex dataset...
    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'files/integration')
    test_dir = os.path.join(base_dir, 'test')
    known_dir = os.path.join(base_dir, 'known')
    data_dir = os.path.join(base_dir, 'data')
    # Remove any 'test' directory left behind by an earlier run.
    if os.path.exists(test_dir):
        shutil.rmtree(test_dir)

    ### Sequence extraction
    region1_seqs, region1_map = sidle.prepare_extracted_region(
        Artifact.load(os.path.join(data_dir, 'region1-extract-seqs.qza')),
        fwd_primer='TGGCGGACGGGTGAGTAA',
        rev_primer='CTGCTGCCTCCCGTAGGA',
        trim_length=50,
        region='1',
        debug=True,
        )
    known = \
        Artifact.load(os.path.join(known_dir, 'region1-kmer-seqs.qza'))
    pdt.assert_series_equal(
        region1_seqs.view(pd.Series).astype(str),
        known.view(pd.Series).astype(str))
    known = \
        Artifact.load(os.path.join(known_dir, 'region1-kmer-map.qza'))
    # Only the region 1 map is index-sorted on both sides before comparing;
    # the region 2 and 3 maps below are compared in their stored order.
    pdt.assert_frame_equal(
        known.view(pd.DataFrame).sort_index(),
        region1_map.view(pd.DataFrame).sort_index())

    region2_seqs, region2_map = sidle.prepare_extracted_region(
        Artifact.load(os.path.join(data_dir, 'region2-extract-seqs.qza')),
        fwd_primer='CAGCAGCCGCGGTAATAC',
        rev_primer='CGCATTTCACCGCTACAC',
        trim_length=50,
        region='2',
        debug=True,
        )
    known = \
        Artifact.load(os.path.join(known_dir, 'region2-kmer-seqs.qza'))
    pdt.assert_series_equal(
        region2_seqs.view(pd.Series).astype(str),
        known.view(pd.Series).astype(str))
    known = \
        Artifact.load(os.path.join(known_dir, 'region2-kmer-map.qza'))
    pdt.assert_frame_equal(known.view(pd.DataFrame),
                           region2_map.view(pd.DataFrame))

    region3_seqs, region3_map = sidle.prepare_extracted_region(
        Artifact.load(os.path.join(data_dir, 'region3-extract-seqs.qza')),
        fwd_primer='GCACAAGCGGTGGAGCAT',
        rev_primer='CGCTCGTTGCGGGACTTA',
        trim_length=50,
        region='3',
        debug=True,
        )
    known = \
        Artifact.load(os.path.join(known_dir, 'region3-kmer-seqs.qza'))
    pdt.assert_series_equal(
        region3_seqs.view(pd.Series).astype(str),
        known.view(pd.Series).astype(str))
    known = \
        Artifact.load(os.path.join(known_dir, 'region3-kmer-map.qza'))
    pdt.assert_frame_equal(known.view(pd.DataFrame),
                           region3_map.view(pd.DataFrame))

    ### Regional alignment
    # Region 1 is the only alignment run with chunk_size=1; regions 2 and 3
    # leave chunk_size at its default.
    align1 = sidle.align_regional_kmers(
        region1_seqs,
        Artifact.load(os.path.join(data_dir, 'region1-rep-seq.qza')),
        region='1',
        max_mismatch=2,
        debug=True,
        chunk_size=1,
        ).regional_alignment
    known = \
        Artifact.load(os.path.join(known_dir, 'region1-align-map.qza'))
    pdt.assert_frame_equal(
        align1.view(pd.DataFrame).sort_values(['kmer', 'asv']),
        known.view(pd.DataFrame))

    align2 = sidle.align_regional_kmers(
        region2_seqs,
        Artifact.load(os.path.join(data_dir, 'region2-rep-seq.qza')),
        region='2',
        max_mismatch=2,
        debug=True,
        ).regional_alignment
    known = \
        Artifact.load(os.path.join(known_dir, 'region2-align-map.qza'))
    pdt.assert_frame_equal(
        align2.view(pd.DataFrame).sort_values(['kmer', 'asv']),
        known.view(pd.DataFrame))

    align3 = sidle.align_regional_kmers(
        region3_seqs,
        Artifact.load(os.path.join(data_dir, 'region3-rep-seq.qza')),
        region='3',
        max_mismatch=2,
        debug=True,
        ).regional_alignment
    known = \
        Artifact.load(os.path.join(known_dir, 'region3-align-map.qza'))
    pdt.assert_frame_equal(
        align3.view(pd.DataFrame).sort_values(['kmer', 'asv']),
        known.view(pd.DataFrame))

    # Per-region count tables, consumed by reconstruct_counts further down.
    count1 = Artifact.load(os.path.join(data_dir, 'region1-counts.qza'))
    count2 = Artifact.load(os.path.join(data_dir, 'region2-counts.qza'))
    count3 = Artifact.load(os.path.join(data_dir, 'region3-counts.qza'))

    ### Reconstruction
    map_, summary = sidle.reconstruct_database(
        region=['1', '2', '3'],
        kmer_map=[region1_map, region2_map, region3_map],
        regional_alignment=[align1, align2, align3],
        count_degenerates=False,
        debug=True,
        )
    known = \
        Artifact.load(os.path.join(known_dir, 'reconstructed-summary.qza'))
    # ASV mapping was optional in the original sidle. It is tested
    # elsewhere, and handling it here would be painful, so the
    # 'mapped-asvs' column is dropped before comparing.
    pdt.assert_frame_equal(
        known.view(pd.DataFrame),
        summary.view(pd.DataFrame).drop(columns=['mapped-asvs']))
    known = \
        Artifact.load(os.path.join(known_dir, 'sidle-reconstruction.qza'))
    pdt.assert_series_equal(
        known.view(pd.Series).sort_index(),
        map_.view(pd.Series))

    table = sidle.reconstruct_counts(
        region=['1', '2', '3'],
        regional_alignment=[align1, align2, align3],
        regional_table=[count1, count2, count3],
        database_map=map_,
        database_summary=summary,
        debug=True,
        min_counts=100,
        min_abund=1e-5,
        ).reconstructed_table
    known = \
        Artifact.load(os.path.join(known_dir, 'reconstructed-table.qza'))
    pdt.assert_frame_equal(known.view(pd.DataFrame),
                           table.view(pd.DataFrame))
    # NOTE(review): this repeats the summary comparison made right after
    # reconstruct_database; presumably it guards against reconstruct_counts
    # altering the summary - confirm, or drop the duplicate.
    known = \
        Artifact.load(os.path.join(known_dir, 'reconstructed-summary.qza'))
    pdt.assert_frame_equal(
        known.view(pd.DataFrame),
        summary.view(pd.DataFrame).drop(columns=['mapped-asvs']))
def test_reconstruct_fragment_rep_seqs(self):
    """Check representative fragments against hand-built expectations.

    Builds a reconstruction map, a reconstruction summary and an alignment
    in memory, runs ``sidle.reconstruct_fragment_rep_seqs`` on them and
    compares the resulting fragments with the expected series.
    """
    # Reconstruction map: one row per database sequence.
    map_columns = [
        'clean_name', 'first-region', 'first-fwd-primer', 'last-region',
        'last-fwd-primer', 'last-kmer-length'
    ]
    map_rows = np.array([
        ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
        ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
        ['seq03|seq04', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
        ['seq03|seq04', 0, 'CACCTCGTN', 1, 'CACCTCGTN', 15],
        ['seq05', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
    ], dtype=object)
    map_frame = pd.DataFrame(
        data=map_rows,
        index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                       name='db-seq'),
        columns=map_columns,
    )
    recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                     map_frame)

    # Reconstruction summary: one row per reconstructed feature.
    summary_columns = [
        'num-regions', 'total-kmers-mapped', 'mean-kmer-per-region',
        'stdv-kmer-per-region', 'mapped-asvs'
    ]
    summary_frame = pd.DataFrame(
        data=[[1, 2, 2, 0, 'asv01|asv02'],
              [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
              [2, 2, 1, 0, 'asv07|asv08']],
        index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                       name='feature-id'),
        columns=summary_columns,
    )
    recon_summary = Artifact.import_data(
        'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

    # Alignment rows as (id, aligned sequence) pairs.
    alignment_rows = [
        ('seq01',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq02',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq03',
         'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGC-'),
        ('seq04',
         '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGCC'),
        ('seq05',
         'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
         'GCCACTGACGTGCG'),
    ]
    msa = skbio.TabularMSA([
        DNA(aligned, metadata={'id': seq_id})
        for seq_id, aligned in alignment_rows
    ])
    aligned_seqs = Artifact.import_data('FeatureData[AlignedSequence]', msa)

    known = pd.Series(
        data=[
            'GCGAAGCGGCTCAGG',
            'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'
        ],
        index=pd.Index(['seq01|seq02', 'seq03|seq04']),
    )

    results = sidle.reconstruct_fragment_rep_seqs(
        reconstruction_map=recon_map,
        reconstruction_summary=recon_summary,
        aligned_sequences=aligned_seqs,
    )
    observed = results.representative_fragments.view(pd.Series).astype(str)
    pdt.assert_series_equal(known, observed)
def test_mismatch_nested(self):
    """A 'C1[Foo]' input paired with a 'Foo' input must be rejected."""
    nested = Artifact.import_data('C1[Foo]', "element 1", view_type=str)
    plain = Artifact.import_data('Foo', "element 2", view_type=str)
    action_inputs = {'a': nested, 'b': plain}

    with self.assertRaisesRegex(ValueError, 'No solution.*C1'):
        viz, = self.run_action(**action_inputs)
def test_intsequence2(self):
    """An IntSequence2 input must yield an IntSequence2 output."""
    ints_artifact = Artifact.import_data('IntSequence2', [1])

    results = self.run_action(ints=ints_artifact, strs1=['a'], strs2={'a'})

    output_type = results.output.type
    self.assertEqual(output_type, IntSequence2)
tax = re.sub('; ', ';', tax) tax = re.sub(r'[\-| ]+', '_', tax) tax = re.sub(';$', '', tax) return(tax) def ch_col(df, index, val): columns = list(df.columns) columns[index] = val df.columns = columns return df # load otu table from qiime2 Artifact if opt.otu.endswith('.qza'): otu = Artifact.load(opt.otu) otu = otu.view(DataFrame) taxon = Artifact.load(opt.taxon) taxon = taxon.view(DataFrame) # otherwise load otu table from tsv table else: skip_rows = [] with open(opt.otu) as f: for num, line in enumerate(f): if len(line.split('\t')) < 2: skip_rows.append(num) otu = read_csv(opt.otu, sep='\t', skiprows=skip_rows, index_col=0) otu_t = otu.iloc[:, -1] taxon = DataFrame({ 'Taxon': otu_t.copy(), 'Confidence': [0.9] * otu_t.shape[0]
def main(sequence_artifact, seq_samplesize, output_dir):
    """Summarize a demultiplexed artifact and export the visualization.

    ``sequence_artifact`` is the path loaded with ``Artifact.load``,
    ``seq_samplesize`` is passed to the summarizer as ``n``, and the
    resulting visualization is exported into ``output_dir``.
    """
    summary = demux.visualizers.summarize(
        Artifact.load(sequence_artifact),
        n=seq_samplesize,
    )
    summary.visualization.export_data(output_dir)
help="the reference_reads path") parser.add_argument( '-t', '--ref_taxa', dest='reference_taxonomy', type=str, required=False, default="/mnt/d/Lab/TaxaIdentification/classifier/silva-138-99-tax.qza", help="the reference_taxonomy path") args = parser.parse_args() inputDir = os.path.abspath(args.fileDir) outputDir = os.path.abspath(args.OpDir) ref_reads = os.path.abspath(args.reference_reads) ref_taxa = os.path.abspath(args.reference_taxonomy) artifact = Artifact.import_data('FeatureData[Sequence]', inputDir) reference_reads = Artifact.load(ref_reads) reference_taxonomy = Artifact.load(ref_taxa) taxonomy = classify_consensus_vsearch(artifact, reference_reads, reference_taxonomy, threads=8) mafft_alignment = align_to_tree_mafft_fasttree(artifact, 8) Artifact.export_data(taxonomy.classification, outputDir) Artifact.export_data(mafft_alignment.tree, outputDir)
def test_beta_empty_table(self):
    """Running beta on an empty table must raise a ValueError ('empty')."""
    empty_table = Table(np.array([]), [], [])
    empty_artifact = Artifact.import_data('FeatureTable[Frequency]',
                                          empty_table)

    with self.assertRaisesRegex(ValueError, 'empty'):
        self.beta(table=empty_artifact, metric='braycurtis')
def _ensure_parent_dir(path):
    """Create the directory that will hold ``path`` when it is missing."""
    parent = os.path.dirname(str(path))
    # A bare file name has no parent component, so there is nothing to make.
    if parent:
        os.makedirs(parent, exist_ok=True)


def tada(phylogeny: NewickFormat, table: biom.Table,
         meta_data: NumericMetadataColumn = None,
         seed_num: Int = 0, xgen: Int = 0, n_beta: Int = 1, n_binom: Int = 5,
         var_method: Str = 'br_penalized', stat_method: Str = 'binom',
         prior_weight: Float = 0,
         coef: Float = 200, exponent: Float = 0.5,
         pseudo_branch_length: Float = 1e-6, pseudo_cnt: Float = 5,
         normalized: Bool = False, output_log_fp: Str = None,
         original_table: Str = None, augmented_table: Str = None,
         concatenate_meta: Metadata = None,
         sampling_strategy: Str = None) -> (NewickFormat, biom.Table):
    """Augment a feature table with generated samples.

    The inputs are parsed by ``_read_inputs``, a ``SampleGenerator`` is
    fitted on them, and the original and augmented tables are merged.

    Parameters
    ----------
    phylogeny, table, meta_data
        Handed to ``_read_inputs``, which also decides the generation
        strategy.
    seed_num, xgen, n_beta, n_binom, var_method, stat_method, prior_weight,
    coef, exponent, pseudo_branch_length, pseudo_cnt, normalized
        Forwarded unchanged to ``SampleGenerator``
        (``pseudo_branch_length`` is passed as ``pseudo``).
    output_log_fp
        When given, the generator's log file is copied to this path.
    original_table, augmented_table
        When given, the original / augmented table is saved there as a
        ``FeatureTable[Frequency]`` artifact.
    concatenate_meta
        Destination for the concatenated original+augmented labels;
        required when the strategy is ``'balancing'``.
    sampling_strategy
        Forwarded to ``SampleGenerator.fit_transform``.

    Returns
    -------
    tuple
        ``(pruned_phylogeny, concat_biom)``: the pruned phylogeny from
        ``_read_inputs`` and the original table merged with the augmented
        one.

    Raises
    ------
    ValueError
        If balancing is requested without ``concatenate_meta``, if the
        generator's original table differs from ``table``, or if the
        original and augmented tables list observations in a different
        order.
    """
    _table, y, _phylogeny, generate_strategy, pruned_phylogeny = \
        _read_inputs(biom_table=table, phylogeny_fp=phylogeny,
                     meta_data=meta_data)
    # Strings are compared by value; identity (`is`) is not guaranteed to
    # hold for equal strings.
    balancing = generate_strategy == 'balancing'
    if balancing and (concatenate_meta is None):
        raise ValueError(
            "Expected a path to write out the generated and original labels and metadata!"
        )

    tmp = tempfile.mkdtemp()
    try:
        sG = SampleGenerator(seed_num=seed_num, logger=None,
                             generate_strategy=generate_strategy,
                             tmp_dir=tmp, xgen=xgen, n_beta=n_beta,
                             n_binom=n_binom, var_method=var_method,
                             stat_method=stat_method,
                             prior_weight=prior_weight,
                             coef=coef, exponent=exponent,
                             pseudo=pseudo_branch_length,
                             pseudo_cnt=pseudo_cnt, normalized=normalized)
        orig_biom, orig_labels, augm_biom, augm_labels = \
            sG.fit_transform(table=_table, y=y, tree=_phylogeny,
                             sampling_strategy=sampling_strategy)

        # Sum absolute differences: signed differences can cancel out and
        # hide a real mismatch.
        mismatch = abs(orig_biom.matrix_data - table.matrix_data).sum()
        if mismatch > 1e-20:
            raise ValueError(
                "The original biom table doesn't match the "
                "output of generator function! Please double check")

        if balancing:
            orig_pd, augm_pd = make_data_frame(orig_biom, augm_biom,
                                               orig_labels, augm_labels)
            _ensure_parent_dir(concatenate_meta)
            concat_pd = pd.concat([orig_pd, augm_pd])
            concat_meta = qiime2.Metadata(concat_pd)
            concat_meta.save(concatenate_meta)

        if output_log_fp is not None:
            _ensure_parent_dir(output_log_fp)
            shutil.copyfile(sG.log_fp, output_log_fp)

        if not np.array_equal(orig_biom.ids('observation'),
                              augm_biom.ids('observation')):
            raise ValueError(
                "The order of features in original and augmented data "
                "is different. Please make sure that your phylogeny doesn't "
                "have extra features"
            )

        # Each output gets its own parent directory, independently of the
        # other one.
        if original_table:
            _ensure_parent_dir(original_table)
        if augmented_table:
            _ensure_parent_dir(augmented_table)

        if augmented_table is not None:
            augm_qza = \
                Artifact.import_data("FeatureTable[Frequency]", augm_biom)
            augm_qza.save(augmented_table)
        if original_table is not None:
            orig_qza = \
                Artifact.import_data("FeatureTable[Frequency]", orig_biom)
            orig_qza.save(original_table)

        concat_biom = orig_biom
        concat_biom = concat_biom.merge(augm_biom)
    except Exception:
        # Report failures only; the error itself still reaches the caller.
        print("Something went wrong")
        raise
    finally:
        # The scratch directory is only needed while the generator runs.
        shutil.rmtree(tmp, ignore_errors=True)

    return pruned_phylogeny, concat_biom
def test_mismatch(self):
    """An input typed "Foo % Properties('X')" must raise a TypeError."""
    with_property = Artifact.import_data(
        "Foo % Properties('X')", 'element 1', view_type=str)

    with self.assertRaises(TypeError):
        self.run_action(a=with_property)
def test_true(self):
    """With ``b=True`` the single output is typed 'C1[Foo]'."""
    bar = Artifact.import_data('Bar', 'element', view_type=str)

    (output,) = self.run_action(a=bar, b=True)

    observed_type = repr(output.type)
    self.assertEqual(observed_type, 'C1[Foo]')
def run_integration_test(
    input_dir_name,
    output_dir_name,
    ranks_name,
    table_name,
    sample_metadata_name,
    feature_metadata_name=None,
    use_q2=False,
    q2_ranking_tool="songbird",
    expected_unsupported_samples=0,
    expected_unsupported_features=0,
    expect_all_unsupported_samples=False,
    q2_table_biom_format="BIOMV210Format",
    extreme_feature_count=None,
):
    """Run qurro on a test input directory and loosely validate the output.

    The run goes either through the QIIME 2 plugin action (``use_q2``) or
    through the standalone command-line entry point. This helper is a
    high-level sanity check only: it does not cover many of Qurro's corner
    cases, it just confirms the data ends up faithfully represented in the
    generated main.js file.

    Returns ``(rank_json, sample_json, count_json)`` from
    ``validate_main_js``, or ``(None, None)`` when the run is expected to
    fail on invalid input.
    """
    in_dir = os.path.join("qurro", "tests", "input", input_dir_name)
    out_dir = os.path.join("docs", "demos", output_dir_name)

    rloc = os.path.join(in_dir, ranks_name)
    tloc = os.path.join(in_dir, table_name)
    sloc = os.path.join(in_dir, sample_metadata_name)
    if feature_metadata_name is None:
        floc = None
    else:
        floc = os.path.join(in_dir, feature_metadata_name)

    if not use_q2:
        # Standalone mode: invoke the command-line entry point directly.
        cli_args = [
            "--ranks", rloc,
            "--table", tloc,
            "--sample-metadata", sloc,
            "--output-dir", out_dir,
        ]
        if floc is not None:
            cli_args.extend(["--feature-metadata", floc])
        if extreme_feature_count is not None:
            cli_args.extend(
                ["--extreme-feature-count", extreme_feature_count]
            )
        result = CliRunner().invoke(rrvp.plot, cli_args)
        # Check that the expected exit code and output were recorded.
        validate_standalone_result(
            result,
            expected_unsupported_samples=expected_unsupported_samples,
            expect_all_unsupported_samples=expect_all_unsupported_samples,
            expected_unsupported_features=expected_unsupported_features,
        )
    else:
        # QIIME 2 mode: choose the action and rank type for the tool.
        if q2_ranking_tool == "songbird":
            q2_action = q2qurro.actions.differential_plot
            q2_rank_type = "FeatureData[Differential]"
        elif q2_ranking_tool == "DEICODE":
            q2_action = q2qurro.actions.loading_plot
            q2_rank_type = "PCoAResults % Properties(['biplot'])"
        else:
            raise ValueError(
                "Unknown q2_ranking_tool: {}".format(q2_ranking_tool)
            )

        # Bring every input in as a Q2 artifact or metadata object.
        rank_qza = Artifact.import_data(q2_rank_type, rloc)
        table_qza = Artifact.import_data(
            "FeatureTable[Frequency]", tloc, view_type=q2_table_biom_format
        )
        sample_metadata = Metadata.load(sloc)
        feature_metadata = Metadata.load(floc) if floc is not None else None

        # Run the plugin action and dump the visualization into out_dir.
        rrv_qzv = q2_action(
            ranks=rank_qza,
            table=table_qza,
            sample_metadata=sample_metadata,
            feature_metadata=feature_metadata,
            extreme_feature_count=extreme_feature_count,
        )
        rrv_qzv.visualization.export_data(out_dir)

    # When the run is expected to fail on invalid inputs, skip the JSON
    # checks: input validation happens in generate.process_input(), before
    # generate.gen_visualization() writes anything, so there should be no
    # output to inspect.
    if expect_all_unsupported_samples or expected_unsupported_features > 0:
        return None, None

    # JSONs are only validated when no extreme feature count (-x) was given.
    rank_json, sample_json, count_json = validate_main_js(
        out_dir, rloc, tloc, sloc,
        validate_jsons=(extreme_feature_count is None),
    )
    return rank_json, sample_json, count_json
def test_false(self):
    """With ``b=False`` the single output is typed 'Foo'."""
    bar = Artifact.import_data('Bar', 'element', view_type=str)

    (output,) = self.run_action(a=bar, b=False)

    observed_type = repr(output.type)
    self.assertEqual(observed_type, 'Foo')
def run(self, factory):
    """Prepare every configuration of ``factory`` and submit its job array.

    For each configuration the custom preprocessing is run, the resulting
    table and target are saved as artifacts under
    ``dataset/imsms-mlab/<analysis_name>/disease_binary``, the
    hyperparameter search is set up with
    ``orchestrate_hyperparameter_search`` and the generated script is
    submitted with ``qsub``.

    A failure in one configuration is printed together with its traceback
    and does not stop the remaining configurations.
    """
    factory.validate()
    _check_unique_names(factory)

    # Materialize the configurations once; they feed both the batch
    # callback and the loop below.
    configs = list(factory.gen_configurations())
    self.callbacks.batch_info(configs)

    base_dir = "dataset"
    dataset = "imsms-mlab"
    target_name = "disease_binary"

    # Run job indices 1...10 inclusive, letting 5 jobs run at once. Each
    # job has 100 parameter sets, for a total of 1000 parameter sets.
    start = 1
    end = 10
    n_concurrent_jobs = 5
    chunk_size = 100

    for config in configs:
        try:
            print(config.analysis_name)
            # Run this custom preprocessing
            (final_biom, target, _, _) = run_preprocessing(config,
                                                           self.callbacks)

            preparation = config.analysis_name
            algorithm = config.mlab_algorithm
            if algorithm is None:
                algorithm = "RandomForestClassifier"

            # Create the expected file structure
            results_dir = path.join(base_dir, dataset, preparation,
                                    target_name)
            makedirs(results_dir, exist_ok=True)

            # Save table and metadata
            table_fp = path.join(results_dir, "filtered_table.qza")
            table_artifact = Artifact.import_data(
                "FeatureTable[Frequency]", final_biom)
            table_artifact.save(table_fp)

            metadata_fp = path.join(results_dir, "filtered_metadata.qza")
            metadata_artifact = Artifact.import_data(
                "SampleData[Target]", target)
            metadata_artifact.save(metadata_fp)

            (
                script_fp,
                _params_fp,
                _run_info_fp,
            ) = orchestrate_hyperparameter_search(
                dataset=dataset,
                preparation=preparation,
                target=target_name,
                algorithm=algorithm,
                repeats=3,  # num CV repeats
                base_dir=base_dir,  # directory with the mlab structure
                ppn=4,  # processors per node
                memory=32,  # memory in GB
                wall=50,  # walltime in hours
                chunk_size=chunk_size,  # num parameter sets to run per job
                randomize=True,  # randomly shuffle order of parameter sets
                force=False,  # force overwrite of existing results
                dry=False,  # dry runs
                dataset_path=table_fp,
                metadata_path=metadata_fp,
            )
            cmd = [
                "qsub", "-t", f"{start}-{end}%{n_concurrent_jobs}",
                script_fp
            ]
            # check=True routes a failed submission into the handler below
            # instead of letting it pass unnoticed.
            subprocess.run(cmd, check=True)
        except Exception:
            print("TEST FAILURE. CONFIG: " + config.analysis_name)
            traceback.print_exc()
def setUp(self):
    """Import the shared test table as a FeatureTable[Frequency] artifact."""
    semantic_type = "FeatureTable[Frequency]"
    test_table = create_test_table()
    self.q2table = Artifact.import_data(semantic_type, test_table)