def test_split_ints(self):
    """Invoke ``split-ints`` via the CLI and check both resulting artifacts."""
    plugin_command = RootCommand().get_command(ctx=None, name='dummy-plugin')

    # Where the command is asked to write its two outputs.
    left_path, right_path = [
        os.path.join(self.tempdir, file_name)
        for file_name in ('left.qza', 'right.qza')
    ]

    # TODO: currently must pass `--verbose` to commands invoked by Click's
    # test runner because redirecting stdout/stderr raises an
    # "io.UnsupportedOperation: fileno" error. Likely related to Click
    # mocking a filesystem in the test runner.
    cli_args = [
        'split-ints',
        '--i-ints', self.artifact1_path,
        '--o-left', left_path,
        '--o-right', right_path,
        '--verbose',
    ]
    result = self.runner.invoke(plugin_command, cli_args)

    # A zero exit status plus both files on disk means the command worked.
    self.assertEqual(result.exit_code, 0)
    for written_path in (left_path, right_path):
        self.assertTrue(os.path.exists(written_path))

    # Load both halves first, then compare their contents.
    left_half = Artifact.load(left_path)
    right_half = Artifact.load(right_path)
    self.assertEqual(left_half.view(list), [0])
    self.assertEqual(right_half.view(list), [42, 43])
def setUp(self):
    """Prepare the pipeline handle, synthetic metadata frames and fixtures."""
    super().setUp()
    self.preprocess = self.plugin.pipelines['preprocess']

    def _sample_index():
        # A fresh index object per frame, exactly four sample ids named 'id'.
        return pd.Index(['A', 'B', 'C', 'D'], name='id')

    # Numeric-looking string columns, one of them with a missing value.
    continuous_columns = {
        'target': ['1.0', '2.0', '3.0', '4.0'],
        'contain_nan': ['3.3', '3.5', None, '3.9'],
    }
    self.continuous_metadata = pd.DataFrame(continuous_columns,
                                            index=_sample_index())

    # Class-label columns, including an integer one, one with a missing
    # value and one holding mixed/empty strings.
    discrete_columns = {
        'target': ['0', '1', '0', '1'],
        'target_int': [1, 0, 1, 0],
        'contain_nan': ['0', '1', None, '1'],
        'non_encoded': ['10', '2', '', 'b'],
    }
    self.discrete_metadata = pd.DataFrame(discrete_columns,
                                          index=_sample_index())

    # On-disk fixtures live in the ``data`` directory next to this file.
    test_dir = path.dirname(__file__)
    self.mp_sample_metadata = Metadata.load(
        path.join(test_dir, 'data/sample-metadata-binary.tsv'))
    self.mp_table = Artifact.load(path.join(test_dir, 'data/table.qza'))
    self.mp_rooted_tree = Artifact.load(
        path.join(test_dir, 'data/rooted-tree.qza'))
    self.mp_unrooted_tree = Artifact.load(
        path.join(test_dir, 'data/unrooted-tree.qza'))
def test_split_ints(self):
    """Exercise the ``split-ints`` CLI command end to end."""
    root_command = RootCommand()
    dummy_plugin = root_command.get_command(ctx=None, name='dummy-plugin')

    # Output locations handed to the command.
    left_qza = os.path.join(self.tempdir, 'left.qza')
    right_qza = os.path.join(self.tempdir, 'right.qza')

    # TODO: currently must pass `--verbose` to commands invoked by Click's
    # test runner because redirecting stdout/stderr raises an
    # "io.UnsupportedOperation: fileno" error. Likely related to Click
    # mocking a filesystem in the test runner.
    argv = ['split-ints', '--i-ints', self.artifact1_path]
    argv += ['--o-left', left_qza, '--o-right', right_qza]
    argv.append('--verbose')
    outcome = self.runner.invoke(dummy_plugin, argv)

    # The run must succeed and leave both output files behind.
    self.assertEqual(outcome.exit_code, 0)
    self.assertTrue(os.path.exists(left_qza))
    self.assertTrue(os.path.exists(right_qza))

    # The two artifacts must hold the expected halves of the input.
    loaded_left, loaded_right = (Artifact.load(left_qza),
                                 Artifact.load(right_qza))
    self.assertEqual(loaded_left.view(list), [0])
    self.assertEqual(loaded_right.view(list), [42, 43])
def setUp(self):
    """Create the aligned-sequence fixture and load both k-mer maps."""
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.base_dir = os.path.join(this_dir, 'files/little_test')

    # Raw aligned sequences keyed by id; wrapped in DNA objects below.
    raw_alignment = {
        'seq01':
            '-CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC---------------------------------',
        'seq02':
            'ACTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC---------------------------------',
        'seq03':
            'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGCGCTTCTGACGTGCA-',
        'seq04':
            '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGCGCTTCTGACGTGCAC',
        'seq05':
            'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGAGCCACTGACGTGCGG',
    }
    self.aligned_seqs = pd.Series(
        {seq_id: DNA(bases) for seq_id, bases in raw_alignment.items()}
    )

    # Both regional k-mer maps are kept as data frames.
    first_map = Artifact.load(
        os.path.join(self.base_dir, 'frag_r1_db_map.qza'))
    second_map = Artifact.load(
        os.path.join(self.base_dir, 'frag_r2_db_map.qza'))
    self.kmer_map1 = first_map.view(pd.DataFrame)
    self.kmer_map2 = second_map.view(pd.DataFrame)

    # Fixed seed so anything using numpy's global RNG is repeatable.
    np.random.seed(5)
def validate_analysis_input(feature_table, rep_seqs, taxonomy):
    """Check that the input artifacts have the expected semantic types.

    Input:
        - feature_table: Path to QIIME2 artifact of type
          FeatureTable[Frequency]
        - rep_seqs: Path to QIIME2 artifact of type FeatureData[Sequence]
        - taxonomy: accepted for interface compatibility; it is not
          inspected by this function.

    Returns a ``(status_code, message)`` tuple: ``(400, <reason>)`` when a
    ``ValueError`` is raised while loading or checking, otherwise
    ``(200, "Imported data good!")``.
    """
    try:
        # Both artifacts are loaded before either type is examined.
        loaded_table = Artifact.load(feature_table)
        loaded_seqs = Artifact.load(rep_seqs)

        expectations = (
            (loaded_table, "FeatureTable[Frequency]",
             "Input Feature Table is not of type 'FeatureTable[Frequency]'!"),
            (loaded_seqs, "FeatureData[Sequence]",
             "Input Representative Sequences is not of type 'FeatureData[Sequence]'!"),
        )
        for loaded, wanted_type, complaint in expectations:
            if str(loaded.type) != wanted_type:
                raise ValueError(complaint)
    except ValueError as err:
        return 400, str(err)

    return 200, "Imported data good!"
def load_mp_data():
    """Load the moving pictures tutorial data used for visualization.

    Every file is read from the directory named by the PREFIX_DIR global,
    so that directory must be reachable from wherever this function runs;
    a missing directory or file will make the corresponding load fail.

    Returns
    -------
    (tree, table, md, fmd, pcoa)
        tree: Artifact loaded from ``rooted-tree.qza``
            Phylogenetic tree.
        table: Artifact loaded from ``table.qza``
            Feature table.
        md: Metadata loaded from ``sample_metadata.tsv``
            Sample metadata.
        fmd: Metadata
            Feature metadata, obtained by viewing the ``taxonomy.qza``
            artifact as Metadata.
        pcoa: Artifact loaded from ``unweighted_unifrac_pcoa_results.qza``
            Ordination.
    """
    def _prefixed(file_name):
        # All inputs live directly under PREFIX_DIR.
        return os.path.join(PREFIX_DIR, file_name)

    tree = Artifact.load(_prefixed("rooted-tree.qza"))
    table = Artifact.load(_prefixed("table.qza"))
    pcoa = Artifact.load(_prefixed("unweighted_unifrac_pcoa_results.qza"))
    md = Metadata.load(_prefixed("sample_metadata.tsv"))

    # The taxonomy is stored as an artifact, so convert it to Metadata here.
    fmd = Artifact.load(_prefixed("taxonomy.qza")).view(Metadata)

    return tree, table, md, fmd, pcoa
def setUp(self):
    """Load the mock-3 expected artifacts and record the core count."""
    def _load(relative_path):
        # Fixture paths are given relative to the module-level dir_path.
        return Artifact.load(dir_path + relative_path)

    self.exp_demux = _load("/data/mock-3/exp_demux.qza")

    deblurred_150 = _load("/data/mock-3/deblurred_150nt.qza")
    self.exp_deblurred = deblurred_150
    # Keep a biom.Table view of the same artifact for table comparisons.
    self.exp_deblur_biom = deblurred_150.view(biom.Table)

    self.exp_deblurred_pt = _load("/data/mock-3/deblurred_100nt_pt.qza")
    self.num_parallel = NUM_CORES
def _load_trimmed_artifacts(directory, pattern):
    """Load artifacts in ``directory`` whose names match ``pattern``.

    The trim length is the second-to-last field of the file name once
    underscores are treated like dots (``deblurred_pre_100.qza`` -> 100).
    Artifacts are returned ordered from the longest length to the shortest.
    """
    by_length = {}
    for file_name in os.listdir(directory):
        if not re.match(pattern, file_name):
            continue
        fields = file_name.replace("_", ".").split(".")
        by_length[int(fields[-2])] = Artifact.load(directory + "/" + file_name)
    return [by_length[length] for length in sorted(by_length, reverse=True)]


def analysis(input_fp, input_path_file, clps_df, output_fp, trim_incr,
             num_trims, trim_lengths):
    """Load pre-/post-trim artifacts plus collapse data and run analysis_art.

    Parameters
    ----------
    input_fp : str or None
        Directory holding ``deblurred_pre_<n>.qza`` and
        ``deblurred_pt_<n>.qza`` files and a ``collapse.csv``. When given,
        it takes precedence over ``input_path_file`` and ``clps_df``.
    input_path_file : str or None
        Headerless CSV; column 0 lists pre-trim artifact paths and column 1
        post-trim artifact paths (each passed to ``load_artifact``).
    clps_df : str or None
        Path to the collapse CSV; required with ``input_path_file``.
    output_fp : str
        Output location; one trailing ``/`` is stripped.
    trim_incr, num_trims
        Forwarded unchanged to ``analysis_art``.
    trim_lengths : sized collection
        Forwarded to ``analysis_art``; an empty collection becomes ``None``.

    Returns
    -------
    The value returned by ``analysis_art``, or ``None`` when the required
    inputs were not supplied (a message is echoed instead).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start = time.perf_counter()
    if output_fp.endswith('/'):
        output_fp = output_fp[:-1]

    if input_fp is None and input_path_file is None:
        click.echo("No inputs supplied, see --help!")
        return

    if input_fp is not None:
        if input_fp.endswith('/'):
            input_fp = input_fp[:-1]
        # Raw strings: same patterns as before, without relying on the
        # deprecated "\d" escape in a normal string literal.
        pre_artifacts = _load_trimmed_artifacts(
            input_fp, r'deblurred_pre_\d+.qza')
        post_artifacts = _load_trimmed_artifacts(
            input_fp, r'deblurred_pt_\d+.qza')
        clps_df = pd.read_csv(input_fp + "/collapse.csv")
    else:
        if clps_df is None:
            click.echo("Supply collapse path!")
            return
        paths = pd.read_csv(input_path_file, header=None)
        pre_artifacts = [load_artifact(x) for x in paths.iloc[:, 0]]
        post_artifacts = [load_artifact(x) for x in paths.iloc[:, 1]]
        clps_df = pd.read_csv(clps_df)

    elapsed = time.perf_counter() - start
    click.echo("{}s for loading qza's for analysis".format(elapsed))

    if len(trim_lengths) == 0:
        trim_lengths = None

    return analysis_art(pre_artifacts, post_artifacts, clps_df, trim_incr,
                        num_trims, output_fp, trim_lengths)
def test_reconstruct_fragment_rep_seqs(self):
    """Reconstruct fragment representative sequences for two regions."""
    # Database sequence id -> reconstructed feature name.
    recon_frame = pd.DataFrame(
        data=[['seq01|seq02'],
              ['seq01|seq02'],
              ['seq03|seq04'],
              ['seq03|seq04'],
              ['seq05']],
        index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                       name='db-seq'),
        columns=['clean_name'],
    )
    recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                     recon_frame)

    # Per-feature reconstruction statistics.
    summary_frame = pd.DataFrame(
        data=[[1, 2, 2, 0, 'asv01|asv02'],
              [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
              [2, 2, 1, 0, 'asv07|asv08']],
        index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                       name='feature-id'),
        columns=['num-regions', 'total-kmers-mapped',
                 'mean-kmer-per-region', 'stdv-kmer-per-region',
                 'mapped-asvs'],
    )
    recon_summary = Artifact.import_data(
        'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

    # Alignment rows as (id, sequence) pairs, turned into DNA records below.
    alignment_rows = [
        ('seq01',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq02',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq03',
         'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGC-'),
        ('seq04',
         '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGCC'),
        ('seq05',
         'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
         'GCCACTGACGTGCG'),
    ]
    msa = skbio.TabularMSA([
        DNA(bases, metadata={'id': seq_id})
        for seq_id, bases in alignment_rows
    ])
    aligned_seqs = Artifact.import_data('FeatureData[AlignedSequence]', msa)

    known = pd.Series(
        data=['GCGAAGCGGCTCAGG',
              'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'],
        index=pd.Index(['seq01|seq02', 'seq03|seq04']),
    )

    # One k-mer map per region, in the same order as the region names.
    kmer_maps = [
        Artifact.load(os.path.join(self.base_dir, map_name))
        for map_name in ('frag_r1_db_map.qza', 'frag_r2_db_map.qza')
    ]
    results = sidle.reconstruct_fragment_rep_seqs(
        region=['Bludhaven', 'Gotham'],
        kmer_map=kmer_maps,
        reconstruction_map=recon_map,
        reconstruction_summary=recon_summary,
        aligned_sequences=aligned_seqs,
    )
    test = results.representative_fragments

    observed = test.view(pd.Series).astype(str)
    pdt.assert_series_equal(known, observed)
def pre_trims(input_fp, trim_length, trim_incr, num_trims, output_fp,
              num_cores):
    """Load demuxed sequences and hand them to ``pre_trims_art``.

    The descriptions below are carried over from the original
    documentation; the trimming itself happens in ``pre_trims_art``.

    Parameters
    ----------
    input_fp: path
        Path to qza of demuxed sequences
    trim_length: int, optional
        Length to trim to. If not supplied, longest possible length is
        used. This takes a while so do supply if possible
    trim_incr: int, optional
        Percent amount to decrement by.
    num_trims: int, optional
        Number of different lengths to trim to. Each trim_incr % less.
    output_fp: path, optional
        Path to output deblurred qza files; one trailing ``/`` is stripped.
    num_cores: int, optional
        Number of cores to parallelize deblur

    Returns
    -------
    Whatever ``pre_trims_art`` returns (originally documented as a list of
    deblurred sequence artifacts, one per trim length).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start = time.perf_counter()

    click.echo("Importing seq data from " + input_fp)
    input_artifact = Artifact.load(input_fp)

    if output_fp.endswith('/'):
        output_fp = output_fp[:-1]

    elapsed = time.perf_counter() - start
    click.echo("{}s for importing for pre".format(elapsed))

    return pre_trims_art(input_artifact, trim_length, trim_incr, num_trims,
                         output_fp, num_cores)
def post_trims(input_fp, output_fp, trim_lengths, output_name, time_out,
               time_out_append, partition_count, input_biom_fp, save_biom):
    """Load the post-trim input (artifact or biom table) and run post_trims_art.

    Parameters
    ----------
    input_fp : str or None
        Path to a sequence artifact. Used whenever it is not ``None``.
    output_fp : str
        Output location; one trailing ``/`` is stripped.
    input_biom_fp : str or None
        Path to a biom table, loaded only when ``input_fp`` is ``None``.
    trim_lengths, output_name, time_out, time_out_append, partition_count,
    save_biom
        Forwarded unchanged to ``post_trims_art``.

    Returns
    -------
    The value returned by ``post_trims_art``, or ``None`` when neither
    input was supplied (a message is echoed instead).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start = time.perf_counter()
    if output_fp.endswith('/'):
        output_fp = output_fp[:-1]

    if input_fp is None and input_biom_fp is None:
        click.echo("No input given! See --help")
        return

    # Exactly one of the two inputs is loaded; the other stays None.
    if input_fp is None:
        input_artifact = None
        click.echo("Loading biom table")
        input_biom = biom.load_table(input_biom_fp)
    else:
        click.echo("Importing seq data from " + input_fp)
        input_artifact = Artifact.load(input_fp)
        input_biom = None

    elapsed = time.perf_counter() - start
    click.echo("{}s for importing for post_trims".format(elapsed))
    click.echo("partition_count: {}".format(partition_count))

    return post_trims_art(output_fp, input_artifact, trim_lengths,
                          output_name, time_out, time_out_append,
                          partition_count, input_biom, save_biom)
def get_itol_barchart(fdata: pd.DataFrame, table_file: str,
                      metadata_file: str, metadata_column: str,
                      output_file: str):
    '''Collapse a feature table by a metadata category and save it.

    The result is written to ``output_file`` as a FeatureTable[Frequency]
    artifact, intended for use as an iTOL multi-bar chart.

    ``fdata`` is accepted but not used by this function.
    '''
    # Feature table as a BIOM table.
    biom_table = Artifact.load(table_file).view(biom.Table)

    # Sample id -> category, skipping samples with no value in the column.
    sample_metadata = Metadata.load(metadata_file)
    category_column = sample_metadata.get_column(metadata_column)
    category_of = category_column.drop_missing_values().to_series().to_dict()

    def _category_for(sample_id, _sample_md):
        return category_of[sample_id]

    # Collapse samples sharing a category. Per the original author's note,
    # norm=True makes multiple samples in one category contribute their
    # mean -- confirm against the biom documentation.
    collapsed = biom_table.collapse(_category_for, norm=True, axis='sample')

    # Wrap the collapsed table as an artifact and write it out.
    Artifact.import_data('FeatureTable[Frequency]', collapsed).save(
        output_file)
def read_results(path):
    """Read the results from a MICOM simulation.

    Parameters
    ----------
    path : str
        The path to a MicomResults artifact.

    Returns
    -------
    MicomResultsData
        The artifact viewed as ``MicomResultsData``. As originally
        documented, this is a named tuple with the attributes:

        growth_rates : pd.DataFrame
            The growth rates for each taxon and sample.
        exchange_fluxes : pd.DataFrame
            The exchange fluxes for each metabolite, sample and taxon.
            Fluxes that denote transport from and into the environment
            are denoted with the taxon `medium`.
    """
    # Imported lazily so qiime2 is only required when results are read.
    from qiime2 import Artifact

    return Artifact.load(path).view(MicomResultsData)
def load_qiime2_artifact(feature_table):
    """Load a QIIME2 feature table artifact as a pandas DataFrame.

    The file at ``feature_table`` must exist and be an artifact of type
    FeatureTable[Frequency]. An artifact's type can be inspected through
    the ``type`` attribute of the object returned by ``Artifact.load``.

    Raises FileNotFoundError when the path is not an existing file and
    ValueError when the artifact has a different type. Any exception raised
    while loading or converting is logged and then re-raised unchanged.
    """
    # Reject missing inputs up front, before touching QIIME2.
    if not os.path.isfile(feature_table):
        raise FileNotFoundError(
            "Input file '{in_file}' does NOT exist!".format(
                in_file=feature_table))

    try:
        loaded = Artifact.load(feature_table)

        if str(loaded.type) != "FeatureTable[Frequency]":
            raise ValueError(
                "Input QIIME2 Artifact is not of the type 'FeatureTable[Frequency]'!")

        return loaded.view(pd.DataFrame)
    except Exception as err:
        # Covers the ValueError raised above as well as anything else:
        # every failure is logged once and propagated as-is.
        logger.error(err)
        raise
def test_variadic_inputs(self):
    """Pass repeated inputs and parameters to ``variadic-input-method``."""
    plugin_command = RootCommand().get_command(ctx=None, name='dummy-plugin')
    output_path = os.path.join(self.tempdir, 'output.qza')

    def _import_and_save(semantic_type, data, file_name):
        # The value returned by ``save`` is what gets passed on the CLI.
        destination = os.path.join(self.tempdir, file_name)
        return Artifact.import_data(semantic_type, data).save(destination)

    ints1 = _import_and_save('IntSequence1', [1, 2, 3], 'ints1.qza')
    ints2 = _import_and_save('IntSequence2', [4, 5, 6], 'ints2.qza')
    set1 = _import_and_save('SingleInt', 7, 'set1.qza')
    set2 = _import_and_save('SingleInt', 8, 'set2.qza')

    cli_args = [
        'variadic-input-method',
        '--i-ints', ints1,
        '--i-ints', ints2,
        '--i-int-set', set1,
        '--i-int-set', set2,
        '--p-nums', '9',
        '--p-nums', '10',
        '--p-opt-nums', '11',
        '--p-opt-nums', '12',
        '--p-opt-nums', '13',
        '--o-output', output_path,
        '--verbose',
    ]
    result = self.runner.invoke(plugin_command, cli_args)

    self.assertEqual(result.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))

    # Every supplied value, 1 through 13, must appear in the output.
    combined = Artifact.load(output_path)
    self.assertEqual(combined.view(list), list(range(1, 14)))
def convert2table(infile, ofile):
    """Write an artifact's DataFrame view to a tab-separated file.

    The index is converted to strings before writing. Returns the absolute
    path of the written file.
    """
    frame = Artifact.load(infile).view(pd.DataFrame)
    frame.index = frame.index.astype(str)

    destination = abspath(ofile)
    frame.to_csv(destination, sep='\t', index=1)
    return destination
def test_repeated_multiple_option(self):
    """Supply ``--m-metadata-file`` twice to ``identity-with-metadata``."""
    input_path = os.path.join(self.tempdir, 'ints.qza')
    Artifact.import_data(IntSequence1, [0, 42, 43], list).save(input_path)

    # Two small metadata files sharing the same ids.
    metadata_specs = (
        ('metadata1.tsv', 'id\tcol1\nid1\tfoo\nid2\tbar\n'),
        ('metadata2.tsv', 'id\tcol2\nid1\tbaz\nid2\tbaa\n'),
    )
    metadata_paths = []
    for file_name, contents in metadata_specs:
        file_path = os.path.join(self.tempdir, file_name)
        with open(file_path, 'w') as handle:
            handle.write(contents)
        metadata_paths.append(file_path)
    metadata_path1, metadata_path2 = metadata_paths

    output_path = os.path.join(self.tempdir, 'out.qza')

    plugin_command = RootCommand().get_command(ctx=None, name='dummy-plugin')
    cli_args = [
        'identity-with-metadata',
        '--i-ints', input_path,
        '--o-out', output_path,
        '--m-metadata-file', metadata_path1,
        '--m-metadata-file', metadata_path2,
        '--verbose',
    ]
    result = self.runner.invoke(plugin_command, cli_args)

    self.assertEqual(result.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))
    # The output must carry the same integers as the input.
    written = Artifact.load(output_path).view(list)
    self.assertEqual(written, [0, 42, 43])
def test_variadic_inputs(self):
    """Run ``variadic-input-method`` with repeated options of every kind."""
    root_command = RootCommand()
    dummy_plugin = root_command.get_command(ctx=None, name='dummy-plugin')
    output_path = os.path.join(self.tempdir, 'output.qza')

    # (semantic type, data, file name) for each input artifact, in the
    # order they are created and later passed on the command line.
    input_specs = [
        ('IntSequence1', [1, 2, 3], 'ints1.qza'),
        ('IntSequence2', [4, 5, 6], 'ints2.qza'),
        ('SingleInt', 7, 'set1.qza'),
        ('SingleInt', 8, 'set2.qza'),
    ]
    saved = []
    for semantic_type, data, file_name in input_specs:
        imported = Artifact.import_data(semantic_type, data)
        saved.append(imported.save(os.path.join(self.tempdir, file_name)))
    ints1, ints2, set1, set2 = saved

    argv = ['variadic-input-method']
    argv += ['--i-ints', ints1, '--i-ints', ints2]
    argv += ['--i-int-set', set1, '--i-int-set', set2]
    argv += ['--p-nums', '9', '--p-nums', '10']
    argv += ['--p-opt-nums', '11', '--p-opt-nums', '12',
             '--p-opt-nums', '13']
    argv += ['--o-output', output_path, '--verbose']
    outcome = self.runner.invoke(dummy_plugin, argv)

    self.assertEqual(outcome.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))

    output = Artifact.load(output_path)
    self.assertEqual(output.view(list), list(range(1, 14)))
def setUp(self):
    """Open the demultiplexed sequence fixture and the reference artifact."""
    super().setUp()

    # Sample sequences, opened read-only through the directory format.
    sample_dir = self.get_data_path('sample_seqs_other')
    self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(sample_dir, 'r')

    # Reference sequences: keep both the artifact and its FASTA view.
    reference_path = self.get_data_path('../../assets/test_reference.qza')
    self.ref_ar = Artifact.load(reference_path)
    self.ref = self.ref_ar.view(DNAFASTAFormat)
def test_repeated_multiple_option(self):
    """Check that a repeated metadata-file option is accepted by the CLI."""
    def _in_tempdir(file_name):
        return os.path.join(self.tempdir, file_name)

    def _write_text(file_path, text):
        with open(file_path, 'w') as out:
            out.write(text)

    input_path = _in_tempdir('ints.qza')
    source_artifact = Artifact.import_data(IntSequence1, [0, 42, 43], list)
    source_artifact.save(input_path)

    metadata_path1 = _in_tempdir('metadata1.tsv')
    _write_text(metadata_path1, 'id\tcol1\nid1\tfoo\nid2\tbar\n')

    metadata_path2 = _in_tempdir('metadata2.tsv')
    _write_text(metadata_path2, 'id\tcol2\nid1\tbaz\nid2\tbaa\n')

    output_path = _in_tempdir('out.qza')

    root_command = RootCommand()
    dummy_plugin = root_command.get_command(ctx=None, name='dummy-plugin')
    argv = ['identity-with-metadata', '--i-ints', input_path,
            '--o-out', output_path]
    argv += ['--m-metadata-file', metadata_path1]
    argv += ['--m-metadata-file', metadata_path2]
    argv.append('--verbose')
    outcome = self.runner.invoke(dummy_plugin, argv)

    self.assertEqual(outcome.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(Artifact.load(output_path).view(list), [0, 42, 43])
def summarize(input_file, verbose=False):
    """Extract summary or verbose data from an Artifact file.

    The artifact's semantic type decides which parser handles it. The
    supported semantic types are: FeatureTable[Frequency],
    FeatureTable[RelativeFrequency], FeatureData[Sequence],
    FeatureData[AlignedSequence], FeatureData[Taxonomy] and DistanceMatrix.

    Parameters
    ----------
    input_file : str
        Path to the input Artifact file.
    verbose : bool, default: False
        Print a verbose version of the results.

    Raises
    ------
    TypeError
        If the artifact's semantic type is not one of the supported types.
    """
    artifact = Artifact.load(input_file)
    semantic_type = str(artifact.type)

    # Guard clauses: the first matching group handles the artifact.
    if semantic_type in ("FeatureTable[Frequency]",
                         "FeatureTable[RelativeFrequency]"):
        _parse_feature_table(artifact, verbose)
        return

    if semantic_type in ("FeatureData[Sequence]",
                         "FeatureData[AlignedSequence]"):
        _parse_feature_data(artifact, verbose)
        return

    if semantic_type == "FeatureData[Taxonomy]":
        _parse_feature_data2(artifact, verbose)
        return

    if semantic_type == "DistanceMatrix":
        _parse_distance_matrix(artifact, verbose)
        return

    raise TypeError(f"Unsupported Artifact type: '{artifact.type}'")
def convert2otutab(infile, ofile):
    """Write an artifact's Metadata view to a tab-separated file.

    The artifact is viewed as ``qiime2.Metadata`` and converted to a
    DataFrame whose index is cast to strings before writing. Returns the
    absolute path of the written file.
    """
    as_metadata = Artifact.load(infile).view(qiime2.Metadata)
    frame = as_metadata.to_dataframe()
    frame.index = frame.index.astype(str)

    destination = abspath(ofile)
    frame.to_csv(destination, sep='\t', index=1)
    return destination
def convert_qiime2_2_skbio(pcoa_artifact):
    """Convert a QIIME2 PCoA artifact to an skbio OrdinationResults object.

    ``pcoa_artifact`` is the path to the artifact. An AXIOME3Error is
    raised when the artifact's type is not PCoAResults; the type of an
    artifact can be inspected through the ``type`` attribute of the object
    returned by ``Artifact.load``.

    In the returned object the sample coordinates have their index named
    'SampleID' and their columns named 'Axis 1', 'Axis 2', ...
    """
    loaded = Artifact.load(pcoa_artifact)

    # Only PCoA results can be viewed as an ordination.
    if str(loaded.type) != "PCoAResults":
        raise AXIOME3Error(
            "Input QIIME2 Artifact is not of the type 'PCoAResults'!")

    pcoa = loaded.view(ordination.OrdinationResults)

    sample_coords = pcoa.samples
    # Named index so a left join on sample id can be performed later.
    sample_coords.index.names = ['SampleID']
    # One human-readable label per coordinate column, counted from 1.
    axis_count = sample_coords.shape[1]
    sample_coords.columns = [
        'Axis ' + str(axis_number)
        for axis_number in range(1, axis_count + 1)
    ]
    pcoa.samples = sample_coords

    return pcoa
def test_no_optional_artifacts_provided(self):
    """Run ``optional-artifacts-method`` with only the required options."""
    cli_args = (
        'optional-artifacts-method',
        '--i-ints', self.ints1,
        '--p-num1', 42,
        '--o-output', self.output,
        '--verbose',
    )
    result = self._run_command(*cli_args)

    self.assertEqual(result.exit_code, 0)
    produced = Artifact.load(self.output).view(list)
    self.assertEqual(produced, [0, 42, 43, 42])
def load_classifier(c_path):
    """Load a classifier artifact from ``c_path``.

    Parameters
    ----------
    c_path : str
        Path to a ``.qza`` classifier artifact, e.g.
        ``'/home/liaoth/data2/gg-13-8-99-nb-classifier.qza'``.

    Returns
    -------
    The artifact returned by ``Artifact.load``.

    Raises
    ------
    NotImplementedError
        If ``c_path`` does not end with ``.qza``. Training a classifier
        from other inputs is not implemented yet.
    """
    # The original test was inverted: ``.qza`` paths fell into the empty
    # branch and the function then failed on an unbound ``classifier``.
    if c_path.endswith('.qza'):
        return Artifact.load(c_path)

    # todo: implement a classifier training module.
    raise NotImplementedError(
        "Only '.qza' classifier artifacts can be loaded; training a "
        "classifier from '{}' is not implemented.".format(c_path))
def tutorial(dir_name):
    """Rarefy ``<dir_name>/table.qza`` and print the head of the result.

    The table is rarefied to a sampling depth of 100 and shown as a BIOM
    table.
    """
    print('\n', 'Running the Artifact API tutorial section', '\n')

    table = dir_name + '/table.qza'
    rarefied_table = feature_table.methods.rarefy(
        table=Artifact.load(table),
        sampling_depth=100,
    ).rarefied_table

    as_biom = rarefied_table.view(biom.Table)
    print(as_biom.head())
def test_no_optional_artifacts_provided(self):
    """Omit every optional artifact and check the command's output."""
    argv = ['optional-artifacts-method']
    argv += ['--i-ints', self.ints1]
    argv += ['--p-num1', '42']
    argv += ['--o-output', self.output]
    argv.append('--verbose')
    outcome = self._run_command(*argv)

    self.assertEqual(outcome.exit_code, 0)

    written = Artifact.load(self.output)
    self.assertEqual(written.view(list), [0, 42, 43, 42])
def qiime_to_biom(input_fp, output_fp):
    """Convert a .qza file to a JSON-formatted .biom file.

    The artifact at ``input_fp`` is viewed as a ``biom.Table``, serialised
    to JSON (tagged as generated by "deblur-testing") and written to
    ``output_fp``.
    """
    table = Artifact.load(input_fp).view(biom.Table)
    serialized = table.to_json(generated_by="deblur-testing")

    with open(output_fp, "w") as out_handle:
        out_handle.write(serialized)
def load_artifact(artifact):
    '''Load the qiime2 artifact at ``artifact`` and return it as a pandas
    DataFrame.'''
    # Imported lazily, inside the function, as in the original code.
    import pandas as pd
    from qiime2 import Artifact

    loaded = Artifact.load(artifact)
    frame = loaded.view(pd.DataFrame)
    return frame
def setUp(self):
    """Load the expected mock-3 metadata and demultiplexed artifact."""
    print("Importing metadata for expected")

    barcode_metadata = Metadata.load(
        dir_path + "/data/mock-3/sample-metadata.tsv")
    demux = Artifact.load(dir_path + "/data/mock-3/exp_demux.qza")

    self.exp_barcode_metadata = barcode_metadata
    self.exp_demux = demux
    # Expected outputs in the order [demux artifact, barcode metadata].
    self.exp_out = [demux, barcode_metadata]
    self.working_dir_fp = dir_path + "/data/mock-3"
def _parse_input(input, temp_dir):
    """Parse the input QIIME 2 object and export the files.

    ``input`` may be an in-memory ``qiime2.Artifact`` or
    ``qiime2.Visualization``, or a path string ending in ``.qza`` or
    ``.qzv``. In-memory objects are first saved under ``temp_dir`` and then
    handled like a path. The data is exported into ``temp_dir``; any other
    kind of input is ignored.
    """
    # Step 1: turn an in-memory object into a file path under temp_dir.
    if isinstance(input, qiime2.Artifact):
        saved_to = f'{temp_dir}/dokdo-temporary.qza'
        input.save(saved_to)
        input = saved_to
    elif isinstance(input, qiime2.Visualization):
        saved_to = f'{temp_dir}/dokdo-temporary.qzv'
        input.save(saved_to)
        input = saved_to

    # Step 2: only path strings with a known extension are exported.
    if not isinstance(input, str):
        return

    if input.endswith('.qza'):
        Artifact.load(input).export_data(temp_dir)
    elif input.endswith('.qzv'):
        Visualization.load(input).export_data(temp_dir)
def check_artifact_type(artifact_path, artifact_type):
    """Load an artifact and verify its semantic type.

    ``artifact_type`` is a key into ``ARTIFACT_TYPES``, which supplies the
    expected type string. Returns the loaded artifact, or raises
    AXIOME3Error when its type differs from the expected one.
    """
    loaded = Artifact.load(artifact_path)

    if str(loaded.type) == ARTIFACT_TYPES[artifact_type]:
        return loaded

    # Wrong semantic type: report the type that was expected.
    raise AXIOME3Error(
        "Input QIIME2 Artifact is not of the type '{}'".format(
            ARTIFACT_TYPES[artifact_type]))
def cross_validate_classifier(ref_taxa, ref_seqs, classifier_spec, obs_dir,
                              results_dir, intermediate_dir, n_jobs,
                              log_file, log_level, confidence,
                              classifier_directory):
    """Classify the simulated test samples of every fold and save the results.

    For each ``fold-*`` directory under ``intermediate_dir`` the fold's
    weights and training taxonomy are loaded, the simulated samples are
    classified with ``classify_samples_sklearn`` and the observed taxonomy
    is written through ``save_observed``.
    """
    # classifier_spec arrives as a readable object; keep only its contents.
    classifier_spec = classifier_spec.read()

    # Configure logging, then record the arguments this run was given.
    setup_logging(log_level, log_file)
    logging.info(locals())

    # The taxon defaults are read here but not used further in this function.
    with open(join(intermediate_dir, 'taxon_defaults.json')) as fh:
        taxon_defaults = json.load(fh)

    folds = glob.glob(join(intermediate_dir, 'fold-*'))
    logging.info('Got folds')

    # Reference sequences are shared by all folds.
    _, ref_seqs = load_references(ref_taxa, ref_seqs)
    ref_seqs = Artifact.import_data('FeatureData[Sequence]',
                                    DNAIterator(ref_seqs))

    for fold in folds:
        # Simulated test samples for this fold.
        test_samples = load_simulated_samples(fold, results_dir)

        # Fold-specific class weights and training taxonomy.
        weights = Artifact.load(join(fold, 'weights.qza'))
        train_taxa = Artifact.load(join(fold, 'train_taxa.qza'))

        # Train the weighted classifier and classify the test samples.
        classification = classify_samples_sklearn(
            test_samples, train_taxa, ref_seqs, classifier_spec,
            confidence, n_jobs, weights)

        # Persist the classified taxonomy artifacts.
        save_observed(classifier_directory, test_samples, classification,
                      obs_dir)
        logging.info('Done ' + fold)
def cluster_features(query_table: biom.Table,
                     closed_reference_table: biom.Table,
                     query_sequences: DNAFASTAFormat,
                     reference_sequences: pd.Series,
                     thr: float = 0.97,
                     threads: int = 1,
                     output_log_file: str = None) -> (
        biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    """Cluster query features against a reference and merge the tables.

    The query table and sequences are clustered with
    ``cluster_features_closed_reference`` against the reference sequences
    selected by ``get_reference_seqs_from_ids``. The clustered table is then
    merged into ``closed_reference_table`` and restricted to the samples of
    the clustered table.

    Parameters
    ----------
    query_table, query_sequences
        Table and sequences to cluster.
    closed_reference_table
        Table whose observations define the reference features.
    reference_sequences
        Reference sequences, passed to ``get_reference_seqs_from_ids``.
    thr : float
        Passed to the clustering call as ``perc_identity``.
    threads : int
        Passed to the clustering call as ``threads``.
    output_log_file : str, optional
        When truthy, the clustering log is copied to this path.

    Returns
    -------
    (merged table, results[1], results[2]) where ``results`` is what
    ``cluster_features_closed_reference`` returned.

    Raises
    ------
    ValueError
        If a feature of ``closed_reference_table`` is missing from the
        merged table.
    """
    # Local import: only needed to manage the scratch log file.
    import os

    reference_sequences_fasta = get_reference_seqs_from_ids(
        closed_reference_table, reference_sequences)
    results = cluster_features_closed_reference(
        sequences=query_sequences,
        table=query_table,
        reference_sequences=reference_sequences_fasta,
        perc_identity=thr,
        threads=threads)

    clustered_table_biom = results[0]
    clustered_sequences_pd = Artifact.load(str(results[1])).view(pd.Series)
    unmatched_sequences_pd = Artifact.load(str(results[2])).view(pd.Series)

    counts_before = np.sum(query_table.sum())
    counts_after = np.sum(clustered_table_biom.sum())

    # tempfile.mktemp() returns a plain string, which cannot be used in a
    # ``with`` statement, and is documented as unsafe. mkstemp() creates
    # the file securely; the logger reopens it by name.
    fd, tmp_fp = tempfile.mkstemp(suffix='.log')
    os.close(fd)
    try:
        logger_ins = LOG(tmp_fp).get_logger('clustering_features')

        # Each entry is formatted into a single string before logging: the
        # original passed the value as an extra positional argument with
        # no placeholder in the message.
        report = (
            ("The number of OTUs in the reference database is",
             _15(reference_sequences_fasta).size),
            ("The number of unmatched sequence to the reference alignment is",
             unmatched_sequences_pd.size),
            ("The number of matched sequences to the reference alignment is",
             clustered_sequences_pd.size),
            ("Before applying clustering, the total number of counts "
             "in the original feature table was", counts_before),
            ("Before applying clustering, the number of non-zero elements"
             " of the underlying feature table is", query_table.nnz),
            ("After applying clustering, the total number of counts "
             "in the clustered feature table was", counts_after),
            ("After applying clustering, the number of non-zero elements"
             " of the underlying feature table is",
             clustered_table_biom.nnz),
        )
        for label, value in report:
            logger_ins.info("{} {}".format(label, value))

        # Retained share = counts after clustering / counts before it
        # (the original had the ratio upside down).
        logger_ins.info("The percent of total counts retained is {} %".format(
            counts_after / counts_before * 100))

        query_samples = clustered_table_biom.ids('sample')
        closed_reference_features = closed_reference_table.ids('observation')

        clustered_table_biom = closed_reference_table.merge(
            clustered_table_biom)
        clustered_table_biom.filter(ids_to_keep=query_samples, axis='sample',
                                    inplace=True)

        # Compare features with features: the original compared the
        # reference observation ids against the merged table's sample ids.
        merged_features = set(clustered_table_biom.ids('observation'))
        if set(closed_reference_features) - merged_features:
            raise ValueError(
                "Merging two tables failed! There are less features in the final table than expected!"
            )

        if output_log_file:
            shutil.copy(tmp_fp, output_log_file)
    finally:
        # Best-effort cleanup of the scratch log; the logger may still hold
        # the file open on some platforms, so a failed removal is ignored.
        try:
            os.remove(tmp_fp)
        except OSError:
            pass

    return clustered_table_biom, results[1], results[2]
def test_without_inputs_or_parameters(self):
    """Run ``no-input-method``, which takes only an output option."""
    plugin_command = RootCommand().get_command(ctx=None, name='dummy-plugin')
    output_path = os.path.join(self.tempdir, 'output.qza')

    cli_args = ['no-input-method', '--o-out', output_path, '--verbose']
    result = self.runner.invoke(plugin_command, cli_args)

    self.assertEqual(result.exit_code, 0)
    self.assertTrue(os.path.exists(output_path))

    # The method's fixed output is a single-entry mapping.
    produced = Artifact.load(output_path).view(dict)
    self.assertEqual(produced, {'foo': '42'})
def test_multiple_metadata(self):
    """Merge three metadata sources and select category ``col2``."""
    commands = ('identity-with-metadata-category',
                'identity-with-optional-metadata-category')

    for command in commands:
        cli_args = [
            command,
            '--i-ints', self.input_artifact,
            '--o-out', self.output_artifact,
            '--m-metadata-file', self.metadata_file1,
            '--m-metadata-file', self.metadata_file2,
            '--m-metadata-file', self.metadata_artifact,
            '--m-metadata-category', 'col2',
            '--verbose',
        ]
        result = self._run_command(*cli_args)

        # Provenance must reference the metadata artifact by its UUID.
        metadata_uuid = Artifact.load(self.metadata_artifact).uuid
        exp_yaml = "metadata: !metadata '%s:metadata.tsv'" % (metadata_uuid)

        self._assertMetadataOutput(result, exp_tsv='0\tbaz\n',
                                   exp_yaml=exp_yaml)
def _assertMetadataOutput(self, result, *, exp_tsv, exp_yaml):
    """Check the metadata recorded in the output artifact's provenance.

    ``result`` must report a zero exit code. When ``exp_tsv`` is None the
    action directory must not contain ``metadata.tsv``; otherwise that
    file's content must equal ``exp_tsv``. ``exp_yaml`` must occur in
    ``action.yaml``.
    """
    self.assertEqual(result.exit_code, 0)

    loaded = Artifact.load(self.output_artifact)
    action_dir = loaded._archiver.provenance_dir / 'action'
    tsv_path = action_dir / 'metadata.tsv'
    yaml_path = action_dir / 'action.yaml'

    if exp_tsv is not None:
        with tsv_path.open() as tsv_handle:
            self.assertEqual(tsv_handle.read(), exp_tsv)
    else:
        self.assertFalse(tsv_path.exists())

    with yaml_path.open() as yaml_handle:
        self.assertIn(exp_yaml, yaml_handle.read())
def test_qza_extension(self):
    """Output paths given without an extension end up as ``.qza`` files."""
    plugin_command = RootCommand().get_command(ctx=None, name='dummy-plugin')

    # Arguments are passed without an extension; the files are expected
    # to appear with ``.qza`` appended.
    left_path = os.path.join(self.tempdir, 'left')
    expected_left_path = os.path.join(self.tempdir, 'left.qza')
    right_path = os.path.join(self.tempdir, 'right')
    expected_right_path = os.path.join(self.tempdir, 'right.qza')

    cli_args = [
        'split-ints',
        '--i-ints', self.artifact1_path,
        '--o-left', left_path,
        '--o-right', right_path,
        '--verbose',
    ]
    result = self.runner.invoke(plugin_command, cli_args)

    # The command must succeed and write the extension-suffixed files.
    self.assertEqual(result.exit_code, 0)
    for expected_path in (expected_left_path, expected_right_path):
        self.assertTrue(os.path.exists(expected_path))

    # Load both halves first, then compare their contents.
    left_half = Artifact.load(expected_left_path)
    right_half = Artifact.load(expected_right_path)
    self.assertEqual(left_half.view(list), [0])
    self.assertEqual(right_half.view(list), [42, 43])
def test_multiple_metadata(self):
    """Merge three metadata sources and check the recorded TSV and YAML."""
    commands = ('identity-with-metadata', 'identity-with-optional-metadata')

    for command in commands:
        cli_args = [
            command,
            '--i-ints', self.input_artifact,
            '--o-out', self.output_artifact,
            '--m-metadata-file', self.metadata_file_alt_id_header,
            '--m-metadata-file', self.metadata_file2,
            '--m-metadata-file', self.metadata_artifact,
            '--verbose',
        ]
        result = self._run_command(*cli_args)

        # Header row, column-type directive row, then the single data row.
        exp_tsv = (
            'id\tcol1\tcol2\ta\tb\n'
            '#q2:types\tcategorical\tcategorical\tcategorical\tcategorical\n'
            '0\tfoo\tbaz\tdog\tcat\n'
        )

        # Provenance must reference the metadata artifact by its UUID.
        metadata_uuid = Artifact.load(self.metadata_artifact).uuid
        exp_yaml = "metadata: !metadata '%s:metadata.tsv'" % (metadata_uuid)

        self._assertMetadataOutput(result, exp_tsv=exp_tsv,
                                   exp_yaml=exp_yaml)