def deposit_biofilms(output_dir, abs_table1, abs_table2, rel_table1,
                     rel_table2, edges, metadata, sample_id):
    """ Writes down tables, edges and metadata into files.

    Parameters
    ----------
    output_dir : str
        output directory
    abs_table1 : pd.DataFrame
        Absolute abundances of microbes (rows become sample ids,
        columns become observation ids)
    abs_table2 : pd.DataFrame
        Absolute abundances of metabolites (same orientation)
    rel_table1 : pd.DataFrame
        Relative abundances of microbes (same orientation)
    rel_table2 : pd.DataFrame
        Relative abundances of metabolites (same orientation)
    edges : list
        Edge list for ground truthing.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    sample_id : str
        sample id, embedded in every output file name
    """
    def _write_biom(frame, kind, generated_by):
        # The frame is transposed so that its columns are the observation
        # ids and its index holds the sample ids of the biom table.
        path = "%s/table.%s.%s.biom" % (output_dir, kind, sample_id)
        table = Table(frame.values.T, frame.columns, frame.index)
        with biom_open(path, 'w') as f:
            table.to_hdf5(f, generated_by=generated_by)

    # relative abundances
    _write_biom(rel_table1, 'rel.microbes', 'moi1')
    _write_biom(rel_table2, 'rel.metabolites', 'moi2')

    # absolute abundances
    _write_biom(abs_table1, 'abs.microbes', 'moi1')
    _write_biom(abs_table2, 'abs.metabolites', 'moi2')

    output_edges = "%s/edges.%s.txt" % (output_dir, sample_id)
    output_md = "%s/metadata.%s.txt" % (output_dir, sample_id)
    pd.DataFrame(edges).to_csv(output_edges, sep='\t')
    metadata.to_csv(output_md, sep='\t')
def deposit_blocktable(output_dir, abs_table, rel_table, metadata, truth,
                       sample_id):
    """ Writes absolute/relative tables, metadata and ground truth to files.

    Parameters
    ----------
    output_dir : str
        output directory
    abs_table : pd.DataFrame
        Absolute abundances (rows become sample ids, columns become
        observation ids)
    rel_table : pd.DataFrame
        Relative abundances (same orientation)
    metadata : pd.DataFrame
        Dataframe of sample metadata
    truth : pd.DataFrame or pd.Series
        Ground truth, written with ``to_csv``
    sample_id : str
        sample id, embedded in every output file name
    """
    # The file names used to be swapped: the absolute table was written to
    # ``rel_table.*`` and the relative table to ``abs_table.*``.
    output_abstable = "%s/abs_table.%s.biom" % (output_dir, sample_id)
    output_reltable = "%s/rel_table.%s.biom" % (output_dir, sample_id)
    output_metadata = "%s/metadata.%s.txt" % (output_dir, sample_id)
    output_truth = "%s/truth.%s.txt" % (output_dir, sample_id)

    abs_t = Table(abs_table.T.values,
                  abs_table.columns.values,
                  abs_table.index.values)
    with biom_open(output_abstable, 'w') as f:
        abs_t.to_hdf5(f, generated_by='moi')

    rel_t = Table(rel_table.T.values,
                  rel_table.columns.values,
                  rel_table.index.values)
    with biom_open(output_reltable, 'w') as f:
        rel_t.to_hdf5(f, generated_by='moi')

    metadata.to_csv(output_metadata, sep='\t')
    truth.to_csv(output_truth, sep='\t')
def setUp(self):
    """Simulate batch data and write the fixture files into the CWD.

    Produces train/test/valid biom tables (80/10/10 split by row order),
    the sample metadata, the batch priors and the tree basis.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    self.k, self.D, self.N, self.M, self.C = 10, 50, 500, 100000, 3
    self.sims = multinomial_batch_bioms(
        k=self.k, D=self.D, N=self.N, M=self.M, C=self.C)

    counts = self.sims['Y']
    tenth = counts.shape[0] // 10
    samp_ids = [str(i) for i in range(counts.shape[0])]
    obs_ids = [str(j) for j in range(counts.shape[1])]

    # Build every table before touching the disk, then write them in order.
    splits = (
        ('train', slice(None, tenth * 8)),
        ('test', slice(tenth * 8, tenth * 9)),
        ('valid', slice(tenth * 9, None)),
    )
    tables = [(name, Table(counts[rows].T, obs_ids, samp_ids[rows]))
              for name, rows in splits]
    for name, tab in tables:
        with biom_open('%s.biom' % name, 'w') as f:
            tab.to_hdf5(f, name)

    md = pd.DataFrame({'batch_category': self.sims['batch_idx']},
                      index=samp_ids)
    md.index.name = 'sampleid'
    md.to_csv('metadata.txt', sep='\t')

    batch_priors = pd.Series(self.sims['alphaILR'])
    batch_priors.to_csv('batch_priors.txt', sep='\t')
    self.sims['tree'].write('basis.nwk')
def main(args):
    """Simulate multinomial data and write it under ``args.output_dir``.

    The output directory is created first (it must not already exist).
    Rows of the simulated counts are split 80/10/10 into train/test/valid
    biom tables; the tree, eigenvalues, eigenvectors and ``W`` are saved
    alongside them.
    """
    os.mkdir(args.output_dir)
    np.random.seed(args.seed)
    sims = multinomial_bioms(k=args.latent_dim,
                             D=args.input_dim,
                             N=args.samples,
                             M=args.depth)

    counts = sims['Y']
    tenth = counts.shape[0] // 10
    samp_ids = [str(i) for i in range(counts.shape[0])]
    obs_ids = [str(j) for j in range(counts.shape[1])]

    # Build every table before touching the disk, then write them in order.
    splits = (
        ('train', slice(None, tenth * 8)),
        ('test', slice(tenth * 8, tenth * 9)),
        ('valid', slice(tenth * 9, None)),
    )
    tables = [(name, Table(counts[rows].T, obs_ids, samp_ids[rows]))
              for name, rows in splits]

    output_dir = args.output_dir
    for name, tab in tables:
        with biom_open(f'{output_dir}/{name}.biom', 'w') as f:
            tab.to_hdf5(f, name)

    sims['tree'].write(f'{output_dir}/basis.nwk')
    for fname, key in (('eigvals.txt', 'eigs'),
                       ('eigvecs.txt', 'eigvectors'),
                       ('W.txt', 'W')):
        np.savetxt(f'{output_dir}/{fname}', sims[key])
def biom_data_from_vcfs(vcfs, min_position=0, max_position=inf):
    """Build sparse presence data from a collection of VCF files.

    Two passes are made over ``vcfs``. The first collects, per file, every
    ``<chrom>.<pos>`` observation id and keeps only the ids present in all
    files. The second records a ``1`` for each (observation, sample) pair
    whose ALT field is not ``'.'``, whose position lies within
    ``[min_position, max_position]`` and whose id is shared by all files.

    Parameters
    ----------
    vcfs : sequence of str
        Paths of the VCF files. The sequence is iterated twice, so it must
        not be a one-shot iterator.
    min_position : int, optional
        Smallest position (inclusive) to keep.
    max_position : int or float, optional
        Largest position (inclusive) to keep.

    Returns
    -------
    data : dict
        Maps ``(observation_index, sample_index)`` to ``1``.
    ordered_oids : list of str
        Observation ids, in order of first appearance.
    ordered_sids : list of str
        Sample ids, in order of first appearance.

    Raises
    ------
    ValueError
        If a data record is found before the ``#CHROM`` header of its file,
        so that no sample id can be associated with it.
    """
    oids = {}
    ordered_oids = []
    sids = {}
    ordered_sids = []
    data = {}

    # Pass 1: intersect the observation ids of all files. ``None`` marks
    # "no file seen yet"; an empty set is a legitimate (empty) intersection
    # and must not be reset by the next file.
    master_oids = None
    for vcf_fp in vcfs:
        working_oids = set()
        with biom_open(vcf_fp) as vcf:
            for line in vcf:
                fields = line.strip().split('\t')
                if not fields[0] or fields[0].startswith('#'):
                    continue
                working_oids.add('%s.%d' % (fields[0], int(fields[1])))
        if master_oids is None:
            master_oids = working_oids
        else:
            master_oids = master_oids & working_oids
    if master_oids is None:
        master_oids = set()

    # Pass 2: record the variants of each sample.
    for vcf_fp in vcfs:
        sid_index = None
        with biom_open(vcf_fp) as vcf:
            for line in vcf:
                fields = line.strip().split('\t')
                if fields[0] == '#CHROM':
                    # this will differ for human data (when multiple
                    # genomes per vcf):
                    sid = fields[9]
                    if sid not in sids:
                        sids[sid] = len(ordered_sids)
                        ordered_sids.append(sid)
                    sid_index = sids[sid]
                    continue
                if not fields[0] or fields[0].startswith('#'):
                    continue

                pos = int(fields[1])
                oid = '%s.%d' % (fields[0], pos)
                if (fields[4] == '.' or
                        not min_position <= pos <= max_position or
                        oid not in master_oids):
                    continue
                if sid_index is None:
                    raise ValueError(
                        "%s: data record found before the #CHROM header"
                        % vcf_fp)
                if oid not in oids:
                    oids[oid] = len(ordered_oids)
                    ordered_oids.append(oid)
                # this will differ for non-haploid data:
                data[(oids[oid], sid_index)] = 1

    return data, ordered_oids, ordered_sids
def test_biom_open_gz(self):
    """biom_open hands back a GzipFile for ``.gz`` paths (read and write)."""
    with biom_open(get_data_path('test.json.gz')) as handle:
        self.assertIsInstance(handle, gzip.GzipFile)

    with biom_open(get_data_path('test_writing.json.gz'), 'w') as handle:
        self.assertIsInstance(handle, gzip.GzipFile)

    # Drop the file created by the write-mode open above.
    remove(get_data_path('test_writing.json.gz'))
def test_biom_open_hdf5(self):
    """biom_open hands back an h5py.File for ``.biom`` paths (read/write)."""
    with biom_open(get_data_path('test.biom')) as handle:
        self.assertIsInstance(handle, h5py.File)

    with biom_open(get_data_path('test_writing.biom'), 'w') as handle:
        self.assertIsInstance(handle, h5py.File)

    # Drop the file created by the write-mode open above.
    remove(get_data_path('test_writing.biom'))
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, ranks, metadata and model parameters into files.

    Features (columns) that are zero across all samples are dropped from
    both tables before anything is written.

    Parameters
    ----------
    table1 : pd.DataFrame
        Microbe abundances (rows become sample ids, columns become
        observation ids)
    table2 : pd.DataFrame
        Metabolite abundances (same orientation)
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Left factor; ``U @ V`` gives the microbe-by-metabolite ranks
    V : np.array
        Right factor
    edges : np.array
        Written to the ``edges`` file after its columns are subset with the
        non-zero mask of ``table1``. Assumed to be a 2-D array with one
        column per ``table1`` feature -- TODO confirm against callers.
    it : int
        iteration number
    rep : int
        repetition number, used as an index into a-z for the file suffix
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    tag = "%d_%s" % (it, choice[rep])
    output_microbes = "%s/table_microbes.%s.biom" % (output_dir, tag)
    output_metabolites = "%s/table_metabolites.%s.biom" % (output_dir, tag)
    output_md = "%s/metadata.%s.txt" % (output_dir, tag)
    output_U = "%s/U.%s.txt" % (output_dir, tag)
    output_V = "%s/V.%s.txt" % (output_dir, tag)
    output_B = "%s/edges.%s.txt" % (output_dir, tag)
    output_ranks = "%s/ranks.%s.txt" % (output_dir, tag)

    # Masks of the features observed in at least one sample.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks,
                         index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # The original read an unbound local ``B`` here and always crashed;
    # the ``edges`` argument is the only candidate for that value.
    B = edges[:, idx1]
    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, B)
def test_delete_analysis(self):
    """Deleting analysis 1 succeeds even with a branching artifact tree."""
    # adding extra filepaths to make sure the delete works as expected, we
    # basically want 8 -> 9 -> 10 -> 12 -> 14
    #                       -> 11 -> 13
    biom_fps = []
    for _ in range(5):
        handle, path = mkstemp(suffix='_table.biom')
        close(handle)
        biom_fps.append(path)
    for path in biom_fps:
        with biom_open(path, 'w') as f:
            et.to_hdf5(f, "test")
    self._clean_up_files.extend(biom_fps)
    fp10, fp11, fp12, fp13, fp14 = biom_fps

    # copying some processing parameters
    a9 = Artifact(9)
    pp = a9.processing_parameters

    def new_biom(fp, parent):
        # 7: BIOM
        return Artifact.create([(fp, 7)], "BIOM", parents=[parent],
                               processing_parameters=pp)

    a10 = new_biom(fp10, a9)
    a11 = new_biom(fp11, a9)
    a12 = new_biom(fp12, a10)
    new_biom(fp13, a11)
    new_biom(fp14, a12)

    job = self._create_job('delete_analysis', {'analysis_id': 1})
    private_task(job.id)
    self.assertEqual(job.status, 'success')
    with self.assertRaises(QiitaDBUnknownIDError):
        Analysis(1)
def test_between_correls(args, tmpdir):
    """between_correls writes its two result files into ``out_dir``."""
    loc = tmpdir.mkdir("with_correls_test")
    first = simulate_correls()
    second = simulate_correls()
    for fname, table in (("table1.biom", first), ("table2.biom", second)):
        with biom_open(str(loc.join(fname)), 'w') as f:
            table.to_hdf5(f, 'madebyme')

    # The command is run from inside the temporary directory.
    os.chdir(str(loc))
    between_correls(args)

    produced = os.listdir(str(loc)+'/out_dir')
    assert "correls.txt" in produced
    assert "crossnet.gml" in produced
def write_biom_and_meta_data(orig_biom, orig_pd, augm_biom, augm_pd, out_dir, biom_fp, meta_fp):
    """Write the original and augmented biom tables and their metadata.

    The original table keeps the base name of ``biom_fp``; the augmented
    table is written to ``augmented_data.biom``. Metadata is written as
    tab-separated text with the header ``#SampleID``, ``label``.

    NOTE(review): the nesting of the augmented-metadata write is ambiguous
    in the source this was recovered from; it is placed under the
    ``meta_fp is not None`` guard here -- confirm against the callers.
    """
    # Original table, under the input file's own base name.
    with biom_open(out_dir + '/' + os.path.basename(biom_fp), 'w') as f:
        orig_biom.to_hdf5(f, "original biom table")
    # Augmented table, under a fixed name.
    with biom_open(out_dir + '/augmented_data.biom', 'w') as f:
        augm_biom.to_hdf5(f, "augmented biom table")
    if meta_fp is not None:
        orig_pd.to_csv(out_dir + '/' + os.path.basename(meta_fp), sep='\t', header=['#SampleID', 'label'])
        augm_pd.to_csv(out_dir + '/augmented_meta_data.csv', sep='\t', header=['#SampleID', 'label'])
def split_dataset(input_biom, input_metadata, split_ratio, output_dir):
    """Randomly split a biom table and its metadata into train and test.

    Samples missing from the metadata are dropped first. The ``train_`` and
    ``test_`` prefixed copies of both inputs are written to ``output_dir``,
    which is created when missing.

    Parameters
    ----------
    input_biom : str
        Path of the biom table.
    input_metadata : str
        Path of the tab-separated sample metadata (first column is the
        sample id). Dashes in column names are replaced by underscores.
    split_ratio : float
        Fraction of the samples assigned to the training set.
    output_dir : str
        Directory receiving the four output files.
    """
    table = load_table(input_biom)
    metadata = pd.read_table(input_metadata, index_col=0)
    metadata.columns = [x.replace('-', '_') for x in metadata.columns]

    known_ids = set(metadata.index)
    table = table.filter(lambda val, id_, md: id_ in known_ids,
                         axis='sample')
    metadata = metadata.loc[table.ids(axis='sample')]

    sample_ids = metadata.index
    n_samples = table.shape[1]
    train_size = int(n_samples * split_ratio)
    test_size = n_samples - train_size

    # Sample WITHOUT replacement: the default (with replacement) returns
    # duplicates, which silently shrinks the test set below ``test_size``.
    test_samples = set(
        np.random.choice(sample_ids, size=test_size, replace=False))
    test_idx = np.array([(x in test_samples) for x in metadata.index],
                        dtype=bool)
    train_idx = ~test_idx

    # Select each side explicitly instead of relying on the order in which
    # ``Table.partition`` happens to yield its groups.
    test_table = table.filter(lambda val, id_, md: id_ in test_samples,
                              axis='sample', inplace=False)
    train_table = table.filter(lambda val, id_, md: id_ not in test_samples,
                               axis='sample', inplace=False)

    train_metadata = metadata.iloc[train_idx]
    test_metadata = metadata.iloc[test_idx]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    metadata_name = os.path.basename(input_metadata)
    biom_name = os.path.basename(input_biom)
    test_metadata_path = os.path.join(output_dir, 'test_' + metadata_name)
    train_metadata_path = os.path.join(output_dir, 'train_' + metadata_name)
    test_biom_path = os.path.join(output_dir, 'test_' + biom_name)
    train_biom_path = os.path.join(output_dir, 'train_' + biom_name)

    print(train_metadata_path)
    train_metadata.to_csv(train_metadata_path, sep='\t')
    test_metadata.to_csv(test_metadata_path, sep='\t')

    with biom_open(train_biom_path, 'w') as f:
        train_table.to_hdf5(f, "train")
    with biom_open(test_biom_path, 'w') as f:
        test_table.to_hdf5(f, "test")
def main():
    """Split the observations of a biom table into ``n`` chunk files.

    Chunks ``1 .. n-1`` hold ``len(obs_ids) // n`` consecutive observations
    each; chunk ``n`` holds all the remaining ones. Every chunk is written
    to ``<output_dir>/chunk<k>.biom``.

    Raises
    ------
    ValueError
        If the requested number of chunks is smaller than one.
    """
    args = parser.parse_args()
    n = args.n
    input_fp = args.input_fp
    output_dir = args.output_dir

    if n < 1:
        raise ValueError("n must be a positive integer, got {0}".format(n))

    biom_table = load_table(input_fp)
    obs_ids = biom_table.ids(axis='observation')
    total = len(obs_ids)
    print("{0} total ids\n".format(total))

    chunk_size = total // n
    for chunk in range(1, n + 1):
        # Half-open slices [begin_id, end_id): the next chunk starts exactly
        # where this one ends, so no observation is skipped at a boundary.
        begin_id = (chunk - 1) * chunk_size
        end_id = total if chunk == n else chunk * chunk_size
        print("chunk: {0} begin: {1} end: {2}\n".format(
            chunk, begin_id, end_id))

        # A set makes the per-observation membership test O(1).
        sub_ids = set(obs_ids[begin_id:end_id])
        sub_table = biom_table.filter(
            lambda val, id_, md, keep=sub_ids: id_ in keep,
            axis='observation', invert=False, inplace=False)
        with biom_open(join(output_dir, 'chunk{0}.biom'.format(chunk)),
                       'w') as out_f:
            sub_table.to_hdf5(out_f, "split_biom.py")
def test_write_biom(self):
    """write_biom output converts to the expected TSV via ``biom convert``.

    Either ordering of the two OTU rows is accepted.

    NOTE(review): the expected strings below appear on a single line in the
    source this was recovered from; they presumably contained line breaks
    originally -- confirm before relying on this test.
    """
    with tempfile.NamedTemporaryFile(suffix='biom') as biom:
        with biom_open(biom.name, 'w') as f:
            s = Stats_And_Summary()
            s.write_biom(('sample1', 'sample2'), [{
                'readname': ['ab', 'c'],
                'readnameE': ['ab', 'd']
            }, {
                'readname2': ['ab', 'c']
            }], f)
        with tempfile.NamedTemporaryFile(suffix='csv') as biom_out:
            # Only the temporary name is needed: delete the file because
            # otherwise biom complains that the output already exists.
            os.remove(
                biom_out.name)
            subprocess.check_call(
                "biom convert -i %s -o %s --table-type 'OTU table' --to-tsv --header-key taxonomy"
                % (biom.name, biom_out.name),
                shell=True)
            observed = open(biom_out.name).read()
            self.assertTrue(observed in ('''# Constructed from biom file #OTU ID\tsample1\tsample2\ttaxonomy 1\t1.0\t0.0\tab; d 2\t1.0\t1.0\tab; c''', '''# Constructed from biom file #OTU ID\tsample1\tsample2\ttaxonomy 1\t1.0\t1.0\tab; c 2\t1.0\t0.0\tab; d'''), msg=observed)
def biom_artifact_output_translator(artifact):
    """Dump the artifact's biom table into a fresh temporary HDF5 file.

    Returns the path of the temporary file together with the file type
    string ``'biom'``. The caller is responsible for removing the file.
    """
    handle, out_fp = mkstemp(suffix=".biom")
    # Only the path is needed; biom_open reopens the file itself.
    close(handle)
    with biom_open(out_fp, 'w') as f:
        artifact.data.to_hdf5(f, "QIITA-QIIME 2 plugin")
    return out_fp, 'biom'
def test_validate_prefix(self):
    """validate prefixes bare sample ids with the prep template id."""
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/jobs/job-id/step/")
    httpretty.register_uri(
        httpretty.GET,
        "https://test_server.com/qiita_db/prep_template/1/data",
        body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
             '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

    # Table whose sample ids lack the "1." prefix.
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    unprefixed = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    with biom_open(biom_fp, 'w') as f:
        unprefixed.to_hdf5(f, "Test")
    self._clean_up_files.append(biom_fp)

    self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, 'job-id', self.parameters, self.out_dir)

    exp_biom_fp = join(self.out_dir, basename(biom_fp))
    self._clean_up_files.append(exp_biom_fp)

    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
    self.assertEqual(obs_error, "")
    rewritten = load_table(exp_biom_fp)
    self.assertItemsEqual(rewritten.ids(), ["1.S1", "1.S2", "1.S3"])
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list,
            sample_metadata_list):
    """Hash and merge OTU tables, representative sequences and metadata.

    Writes ``sample_metadata.tsv``, ``otu_table.biom`` and
    ``rep_seqs.fasta`` into the current working directory. The ids of the
    merged OTU table must match the hashed sequence ids exactly.
    """
    # Create OTU table
    hashed_tables = [hash_otu_table(unhashed)
                     for unhashed in unhashed_otu_table_list]
    otu_df = pd.concat(hashed_tables, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index),
                      list(otu_df.columns))

    # Create rep seqs; ``rep_seq_ids`` is filled in by hash_rep_seqs
    rep_seq_ids = set()
    seqs = []
    for unhashed in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed, rep_seq_ids))
    assert set(otu_df.index) == rep_seq_ids
    assert len(otu_df.index) == len(rep_seq_ids)

    # Merge sample metadata
    metadata_frames = [pd.read_csv(s, sep="\\t")
                       for s in sample_metadata_list]
    sample_metadata = pd.concat(metadata_frames)

    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid,
                          "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        FastaIO.FastaWriter(fid, wrap=None).write_file(seqs)
def load_category_files(category_files):
    """Loads the category tables as biom files

    INPUTS:
        category_files -- a dictionary that associates the mapping category
                    (key) with the file path to the otu_table summarizing
                    that category

    OUTPUTS:
        category_tables -- a dictionary that associates the mapping category
                    with the summarized otu table for the category.

    Missing files are reported on stdout; a ValueError is raised only when
    none of the supplied files could be found.
    """
    category_tables = {}
    missing = []

    for (category, category_file) in category_files.iteritems():
        if not isfile(category_file):
            missing.append('The summarized OTU table file cannot be found '
                           'for %s. \n%s is not in the file path.'
                           % (category, category_file))
            continue
        with biom_open(category_file, 'U') as fp:
            category_tables[category] = parse_biom_table(fp)

    if len(missing) > 0:
        print('The following category files could not be found: \n%s'
              % '\n'.join(missing))
    if len(missing) == len(category_files):
        raise ValueError('No files could be found for any of the supplied '
                         'categories. \n%s' % '\n'.join(missing))

    return category_tables
def main(argv):
    """Write the high-frequency OTUs of one taxonomic group to a file.

    Parameters
    ----------
    argv : list of str
        Command line arguments (without the program name).
    """
    parser = argparse.ArgumentParser(
        description='Select Gammaproteobacteria (or other group) '
                    'contamination candidates')
    parser.add_argument('-i', '--biom', help='biom file of the experiment')
    parser.add_argument('-o', '--output', help='output file name')
    parser.add_argument(
        '-c', '--classpos',
        help='class of taxonomy name (0-kingdom,1-phylum etc.',
        default=2, type=int)
    parser.add_argument(
        '-t', '--taxonomy',
        help='taxonomy name (including c__ or equivalent)',
        default='c__Gammaproteobacteria')
    parser.add_argument(
        '-l', '--level',
        help='minimal cumulative level to filter (0 to get all)',
        default=0.03, type=float)
    args = parser.parse_args(argv)

    # load the biom table; the handle is closed as soon as it is parsed
    with biom_open(args.biom, 'U') as biom_file:
        biom_table = parse_biom_table(biom_file)

    # find the high freq. OTUs
    result = get_high_freq_otus(biom_table, args.classpos, args.taxonomy,
                                args.level)

    # and write them to the file, one per line
    with open(args.output, 'w') as snames:
        for cstr in result:
            snames.write(cstr + '\n')
def run(self, **kwargs):
    """Validate the table at ``kwargs['table']`` as JSON or HDF5 BIOM.

    When no format version is given it defaults to ``1.0.0`` for JSON and
    ``2.0.0`` for HDF5. Returns the result of the matching validator.
    Raises IOError when an HDF5 table is requested without h5py, and exits
    with status 1 when the file is not actually an HDF5 file.
    """
    is_json = kwargs['is_json']

    if kwargs['format_version'] in (None, 'None'):
        kwargs['format_version'] = '1.0.0' if is_json else '2.0.0'

    # this is not pyqi-appropriate, but how we parse this thing is
    # dependent on runtime options :(
    with biom_open(kwargs['table']) as handle:
        if is_json:
            kwargs['table'] = json.load(handle)
            return self._validate_json(**kwargs)

        if not HAVE_H5PY:
            raise IOError("h5py is not installed, can only validate JSON "
                          "tables")

        import h5py

        kwargs['table'] = handle
        if not isinstance(handle, h5py.File):
            print("Attempting to validate an HDF5 BIOM table, but the "
                  "table does not appear to be in HDF5 format!")
            sys.exit(1)
        return self._validate_hdf5(**kwargs)
def load_hdf5_or_json(fp):
    """Return a parsed JSON object or an HDF5 object"""
    with biom_open(fp) as handle:
        # Objects exposing ``seek`` are treated as text handles holding JSON;
        # anything else is handed back to the caller unchanged.
        if hasattr(handle, 'seek'):
            loaded = json.load(handle)
        else:
            # NOTE(review): this handle is returned after the ``with`` block
            # has exited, so it may already be closed depending on how
            # biom_open cleans up -- confirm.
            loaded = handle
    return loaded
def _get_distance_matrix(self, X):
    """
    computes UniFrac distances with the fitted samples

    Parameters
    ----------
    X : biom.Table
        new samples

    Returns
    -------
    dm : DistanceMatrix
        distances computed over the fitted table merged with ``X``

    """
    # TODO one problem with this approach is that
    #  if any samples in X overlap self.table, the counts will
    #  be doubled
    combined = self.table.merge(X)
    with tempfile.NamedTemporaryFile() as scratch:
        # ssu reads its input from disk, so the merged table is staged in a
        # temporary file that only lives for the duration of the call.
        with biom_open(scratch.name, 'w') as out:
            combined.to_hdf5(out, "merged")
        dm = ssu(
            scratch.name,
            self.tree_path,
            unifrac_method=self.unifrac_method,
            variance_adjust=False,
            alpha=1.0,
            bypass_tips=False,
            threads=1,
        )
    return dm
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    """Build a BIOM OTU table from a classic dense tab-separated table.

    The classic table carries the sample IDs in its first row and the OTU
    IDs in its first column. This is a manual alternative to the
    'biom convert' command, which fails to recognize some OTU tables.
    """
    with open(OTU_table_classic, 'r') as fidin:
        rows = [line.split('\t') for line in fidin.readlines()]

    # Header: drop the corner cell, strip the newline from the last label.
    sample_labels = rows[0][1:]
    sample_labels[-1] = sample_labels[-1].rstrip('\n')

    body = rows[1:]
    OTU_labels = [fields[0] for fields in body]
    nOTUs = len(OTU_labels)
    nSamples = len(sample_labels)

    # Load OTU table row major order
    OTU_table_data = np.zeros((nOTUs, nSamples))
    for row_idx, fields in enumerate(body):
        OTU_table_data[row_idx, :] = fields[1:]

    # Write in BIOM format
    t = Table(OTU_table_data,
              OTU_labels,
              sample_labels,
              observ_metadata=None,
              sample_metadata=None,
              table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        t.to_hdf5(f, "Generated by processing layer", compress=False)
def setUp(self):
    """Prepare a mocked Qiita client, a small biom file and job parameters."""
    # Register the URIs for the QiitaClient
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/authenticate/",
        body='{"access_token": "token", "token_type": "Bearer", '
             '"expires_in": "3600"}')
    self.qclient = QiitaClient('https://test_server.com', 'client_id',
                               'client_secret')

    # Create a biom table
    handle, self.biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    fixture = Table(counts, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
    with biom_open(self.biom_fp, 'w') as f:
        fixture.to_hdf5(f, "Test")

    self.out_dir = mkdtemp()
    self.parameters = {
        'template': 1,
        'files': '{"BIOM": ["%s"]}' % self.biom_fp,
        'artifact_type': 'BIOM'
    }
    self._clean_up_files = [self.biom_fp, self.out_dir]
def write_biom_table(biom_table, biom_table_fp, compress=True,
                     write_hdf5=HAVE_H5PY, format_fs=None):
    """Writes a BIOM table to the specified filepath

    Parameters
    ----------
    biom_table : biom.Table
        The table object to write out
    biom_table_fp : str
        The path to the output file
    compress : bool, optional
        Defaults to ``True``. If True, built-in compression on the output
        HDF5 file will be enabled. This option is only relevant if
        ``write_hdf5`` is ``True``.
    write_hdf5 : bool, optional
        Defaults to ``True`` if H5PY is installed and to ``False`` if H5PY
        is not installed. If ``True`` the output biom table will be written
        as an HDF5 binary file, otherwise it will be a JSON string.
    format_fs : dict, optional
        Formatting functions to be passed to `Table.to_hdf5`

    Notes
    -----
    This code was adapted from QIIME 1.9
    """
    generated_by = "PICRUSt " + __version__

    # JSON fallback first; everything below it is the HDF5 path.
    if not write_hdf5:
        with open(biom_table_fp, 'w') as json_out:
            biom_table.to_json(generated_by, json_out)
        return

    with biom_open(biom_table_fp, 'w') as hdf5_out:
        biom_table.to_hdf5(hdf5_out, generated_by, compress,
                           format_fs=format_fs)
def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
    """Write a random two-OTU biom file and register a validate job for it.

    Returns the biom file path, the id of the new job and the parameters
    the job was created with.
    """
    # Create the BIOM table that needs to be validated
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.random.randint(100, size=(2, len(sample_ids)))
    with biom_open(biom_fp, 'w') as f:
        Table(counts, ['O1', 'O2'], sample_ids).to_hdf5(f, "Test")
    self._clean_up_files.append(biom_fp)

    # Create a new job
    parameters = {
        'template': template,
        'files': dumps({'biom': [biom_fp]}),
        'artifact_type': 'BIOM',
        'analysis': analysis
    }
    payload = {
        'command': dumps(['BIOM type', '2.1.4', 'Validate']),
        'parameters': dumps(parameters),
        'status': 'running'
    }
    job_id = self.qclient.post('/apitest/processing_job/',
                               data=payload)['job']

    return biom_fp, job_id, parameters
def test_execute_job_error(self):
    """A validate job whose sample ids do not match the prep ends in error."""
    # Create a prep template
    prep_info = {'SKB8.640193': {'col': 'val1'},
                 'SKD8.640184': {'col': 'val2'}}
    prep_payload = {'prep_info': dumps(prep_info),
                    'study': 1,
                    'data_type': '16S'}
    template = self.qclient.post(
        '/apitest/prep_template/', data=prep_payload)['prep']

    # Create a new validate job
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.random.randint(100, size=(2, 2))
    with biom_open(biom_fp, 'w') as f:
        Table(counts, ['O1', 'O2'], ['S1', 'S2']).to_hdf5(f, "Test")

    job_parameters = {'files': dumps({'biom': [biom_fp]}),
                      'template': template,
                      'artifact_type': 'BIOM'}
    job_payload = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                   'parameters': dumps(job_parameters),
                   'artifact_type': 'BIOM',
                   'status': 'queued'}
    job_id = self.qclient.post(
        '/apitest/processing_job/', data=job_payload)['job']

    plugin("https://localhost:21174", job_id, self.out_dir)
    self.assertEqual(self._wait_job(job_id), 'error')
def run(self, **kwargs):
    """Validate the table at ``kwargs['table']`` as JSON or HDF5 BIOM.

    The file type is detected with ``is_hdf5_file``. When no format version
    is given it defaults to ``1.0.0`` for JSON and ``2.1`` for HDF5.

    Returns
    -------
    The result of ``_validate_json`` or ``_validate_hdf5``.

    Raises
    ------
    ValueError
        If a JSON table is paired with a format version other than
        ``1.0.0``, or an HDF5 table with a version that is not listed in
        ``self.HDF5FormatVersions``.
    IOError
        If an HDF5 table is given but h5py is not installed.
    """
    is_json = not is_hdf5_file(kwargs['table'])

    if kwargs['format_version'] in [None, 'None']:
        kwargs['format_version'] = '1.0.0' if is_json else '2.1'
    elif is_json:
        # Only an explicit version that differs from 1.0.0 is an error;
        # previously any explicit version was rejected for JSON tables,
        # including the valid "1.0.0".
        if kwargs['format_version'] != "1.0.0":
            raise ValueError("Only format 1.0.0 is valid for JSON")
    else:
        fmt_ver = [int(v) for v in kwargs['format_version'].split('.')]
        if tuple(fmt_ver) not in self.HDF5FormatVersions:
            raise ValueError("Unrecognized format version: %s" %
                             kwargs['format_version'])

    with biom_open(kwargs['table']) as f:
        if is_json:
            kwargs['table'] = json.load(f)
            return self._validate_json(**kwargs)
        elif HAVE_H5PY:
            import h5py

            kwargs['table'] = f
            if not isinstance(f, h5py.File):
                print("Attempting to validate an HDF5 BIOM table, but the "
                      "table does not appear to be in HDF5 format!")
                sys.exit(1)
            return self._validate_hdf5(**kwargs)
        else:
            raise IOError("h5py is not installed, can only validate JSON "
                          "tables")
def write_biom_table(biom_table, biom_table_fp, compress=True,
                     write_hdf5=HAVE_H5PY, format_fs=None):
    """Writes a BIOM table to the specified filepath

    Parameters
    ----------
    biom_table : biom.Table
        The table object to write out
    biom_table_fp : str
        The path to the output file
    compress : bool, optional
        Defaults to ``True``. If True, built-in compression on the output
        HDF5 file will be enabled. This option is only relevant if
        ``write_hdf5`` is ``True``.
    write_hdf5 : bool, optional
        Defaults to ``True`` if H5PY is installed and to ``False`` if H5PY
        is not installed. If ``True`` the output biom table will be written
        as an HDF5 binary file, otherwise it will be a JSON string.
    format_fs : dict, optional
        Formatting functions to be passed to `Table.to_hdf5`

    Notes
    -----
    This code was adapted from QIIME 1.9
    """
    generated_by = "Microbiome Helper"

    # JSON fallback first; everything below it is the HDF5 path.
    if not write_hdf5:
        with open(biom_table_fp, 'w') as json_out:
            biom_table.to_json(generated_by, json_out)
        return

    with biom_open(biom_table_fp, 'w') as hdf5_out:
        biom_table.to_hdf5(hdf5_out, generated_by, compress,
                           format_fs=format_fs)
def load_table(f):
    r"""Load a `Table` from a path

    Parameters
    ----------
    f : str

    Returns
    -------
    Table

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the file
    is either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    """
    # Imported here rather than at module level.
    from biom.util import biom_open

    with biom_open(f) as handle:
        try:
            return parse_table(handle)
        except (IndexError, TypeError):
            # Parser failures are reported uniformly as "not a BIOM file".
            raise TypeError("%s does not appear to be a BIOM file!" % f)
def test_validate_prefix(self):
    """validate prefixes bare sample ids with the prep template id."""
    httpretty.register_uri(
        httpretty.POST,
        "https://test_server.com/qiita_db/jobs/job-id/step/")
    httpretty.register_uri(
        httpretty.GET,
        "https://test_server.com/qiita_db/prep_template/1/data",
        body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
             '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

    # Table whose sample ids lack the "1." prefix.
    handle, biom_fp = mkstemp(suffix=".biom")
    close(handle)
    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    unprefixed = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    with biom_open(biom_fp, 'w') as f:
        unprefixed.to_hdf5(f, "Test")
    self._clean_up_files.append(biom_fp)

    self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, 'job-id', self.parameters, self.out_dir)

    exp_biom_fp = join(self.out_dir, basename(biom_fp))
    self._clean_up_files.append(exp_biom_fp)

    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
    self.assertEqual(obs_error, "")
    rewritten = load_table(exp_biom_fp)
    self.assertItemsEqual(rewritten.ids(), ["1.S1", "1.S2", "1.S3"])
def main():
    """Compute the core observations of every sample category.

    An observation is "core" for a category when the number of samples of
    that category in which it is present exceeds
    ``round(core_pct * number_of_samples)``. For each category the core ids
    are written to ``<category>_80_pct_core_ids.txt`` and, when
    ``args.biom_out`` is set, the filtered table is written to
    ``<category>_80_pct_core.biom``. The output directory is created when
    it does not exist.
    """
    args = prog_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    # convert to presence/absence BIOM table
    biomf_pa = biomf.pa(inplace=False)
    obs_ids = biomf_pa.ids("observation")

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    if args.group_by:
        sid_cat = gather_categories(mdata, mheader, [args.group_by])
    else:
        sid_cat = gather_categories(mdata, mheader)

    # calculate core
    # Plain conditionals replace the former ``assert``-driven control flow,
    # which silently changed behavior when Python ran with ``-O``.
    core_calc = {k: set() for k in sid_cat.keys()}
    for idx in obs_ids:
        for cat, val in sid_cat.items():
            obs_count = sum(
                1 for sid in val.sids
                if biomf_pa.get_value_by_ids(idx, sid) == 1)
            if obs_count > round(args.core_pct * len(val.sids)):
                core_calc[cat].add(idx)

    # Check if output directory exists, if not, create it
    out_dir = os.path.abspath(args.out_fnh)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for k, v in core_calc.items():
        print("{0} core IDs in {1}".format(len(v), k))
        idx_filename = os.path.join(out_dir, k + "_80_pct_core_ids.txt")
        with open(idx_filename, "w") as of:
            of.write("{0}".format("\n".join(sorted(v))))
        if args.biom_out:
            filtered_biomf = biomf.filter(v, axis="observation",
                                          inplace=False)
            biom_filename = os.path.join(out_dir, k + "_80_pct_core.biom")
            with biom_open(biom_filename, "w") as f:
                filtered_biomf.to_hdf5(f, "CORE BIOM")
def deposit(table, groups, truth, output_table, output_groups,
            output_truth):
    """Write the table (biom), the groups (TSV) and the truth (one CSV line).

    ``table`` is transposed on the way out: its columns become the
    observation ids and its index the sample ids of the biom table.
    """
    biom_table = Table(table.T.values,
                       table.columns.values,
                       table.index.values)
    with biom_open(output_table, 'w') as out:
        biom_table.to_hdf5(out, generated_by='moi')

    groups.to_csv(output_groups, sep='\t')

    truth_line = ','.join(truth)
    with open(output_truth, 'w') as out:
        out.write(truth_line)
def noisify(table_file, metadata_file, sigma, output_file):
    """Perturb a table with compositional noise and resample its counts.

    The metadata file is rewritten in place with two extra columns
    (``observed`` and ``unobserved`` feature counts); the noisy table,
    stripped of all-zero features, is written to ``output_file``.
    ``sigma`` is accepted but not used by the current implementation.
    """
    metadata = pd.read_table(metadata_file, index_col=0)
    biom_table = load_table(table_file)

    # Samples on the rows, observations on the columns.
    table = pd.DataFrame(
        np.array(biom_table.matrix_data.todense()).T,
        index=biom_table.ids(axis='sample'),
        columns=biom_table.ids(axis='observation'))

    cov = np.eye(table.shape[1] - 1)
    m_noise = compositional_noise(cov, nsamp=table.shape[0])

    dense = table.values
    perturbed = np.vstack([perturb(row, m_noise[i, :])
                           for i, row in enumerate(dense)])

    # note that this assumes that the column is named `library_size
    noisy = pd.DataFrame(
        multinomial_sample(perturbed, depths=metadata['library_size']))
    noisy.index = table.index
    noisy.columns = list(table.columns)

    feature_totals = noisy.sum(axis=0)
    metadata['observed'] = np.sum(feature_totals > 0)
    metadata['unobserved'] = np.sum(feature_totals == 0)
    metadata.to_csv(metadata_file, sep='\t')

    # drop zeros -- they are not informative
    noisy = noisy.loc[:, noisy.sum(axis=0) > 0]
    out_table = Table(noisy.T.values, noisy.columns.values,
                      noisy.index.values)
    with biom_open(output_file, 'w') as f:
        out_table.to_hdf5(f, generated_by='moi')
def main(argv):
    """Write the high-frequency OTUs of one taxonomic group to a file.

    Parameters
    ----------
    argv : list of str
        Command line arguments (without the program name).
    """
    parser = argparse.ArgumentParser(
        description='Select Gammaproteobacteria (or other group) '
                    'contamination candidates')
    parser.add_argument('-i', '--biom', help='biom file of the experiment')
    parser.add_argument('-o', '--output', help='output file name')
    parser.add_argument(
        '-c', '--classpos',
        help='class of taxonomy name (0-kingdom,1-phylum etc.',
        default=2, type=int)
    parser.add_argument(
        '-t', '--taxonomy',
        help='taxonomy name (including c__ or equivalent)',
        default='c__Gammaproteobacteria')
    parser.add_argument(
        '-l', '--level',
        help='minimal cumulative level to filter (0 to get all)',
        default=0.03, type=float)
    args = parser.parse_args(argv)

    # load the biom table; the handle is closed as soon as it is parsed
    with biom_open(args.biom, 'U') as biom_file:
        biom_table = parse_biom_table(biom_file)

    # find the high freq. OTUs
    result = get_high_freq_otus(biom_table, args.classpos, args.taxonomy,
                                args.level)

    # and write them to the file, one per line
    with open(args.output, 'w') as snames:
        for cstr in result:
            snames.write(cstr + '\n')
def load_table(f):
    r"""Load a `Table` from a path

    Parameters
    ----------
    f : str
        Path of the file to parse.

    Returns
    -------
    Table
        The table parsed from `f`.

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the file is
    either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    """
    with biom_open(f) as handle:
        try:
            return parse_biom_table(handle)
        except (IndexError, TypeError):
            # parser failures are reported uniformly as "not a BIOM file"
            raise TypeError("%s does not appear to be a BIOM file!" % f)
def test_faith_pd_invalid_input(self):
    # Modelled on the skbio tests.  Only missing input files are exercised
    # here; checks for duplicate ids and negative counts are not included
    # yet but should be incorporated.
    newick = ('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)'
              'root;')
    tree = TreeNode.read(StringIO(newick))

    otu_ids = ['OTU%d' % i for i in range(1, 5)]
    u_counts = [1, 1, 0, 0]
    single_sample = Table(np.array([u_counts]).T, otu_ids, ['u'])

    table_fp = os.path.join(gettempdir(), 'table.biom')
    tree_fp = os.path.join(gettempdir(), 'tree.biom')
    for scratch_fp in (table_fp, tree_fp):
        # registered before writing so the files get cleaned up
        self.files_to_delete.append(scratch_fp)

    with biom_open(table_fp, 'w') as fhdf5:
        single_sample.to_hdf5(fhdf5, 'Table for unit testing')
    tree.write(tree_fp)

    # a table path that does not exist
    with self.assertRaises(IOError):
        faith_pd('dne.biom', tree_fp)
    # a tree path that does not exist
    with self.assertRaises(IOError):
        faith_pd(table_fp, 'dne.tre')
def run(self, **kwargs):
    """Validate the table at ``kwargs['table']`` as JSON or HDF5 BIOM.

    The file is treated as JSON whenever `is_hdf5_file` says it is not HDF5.
    ``kwargs['format_version']`` defaults to '1.0.0' for JSON and '2.1' for
    HDF5 when it is None or the string 'None'; otherwise it is checked
    against the versions allowed for the detected format.

    Returns
    -------
    Whatever `_validate_json` or `_validate_hdf5` returns.

    Raises
    ------
    ValueError
        If the requested format version is not valid for the file type.
    IOError
        If the table is HDF5 but h5py is not available.
    """
    table_fp = kwargs['table']
    is_json = not is_hdf5_file(table_fp)
    requested = kwargs['format_version']

    if requested in (None, 'None'):
        # nothing requested: pick the default for the detected format
        kwargs['format_version'] = '1.0.0' if is_json else '2.1'
    elif is_json:
        if requested != "1.0.0":
            raise ValueError("Only format 1.0.0 is valid for JSON")
    else:
        parsed = tuple(int(part) for part in requested.split('.'))
        if parsed not in self.HDF5FormatVersions:
            raise ValueError("Unrecognized format version: %s" % requested)

    with biom_open(table_fp) as f:
        if is_json:
            kwargs['table'] = json.load(f)
            return self._validate_json(**kwargs)

        if not HAVE_H5PY:
            raise IOError("h5py is not installed, can only validate JSON "
                          "tables")

        import h5py

        kwargs['table'] = f
        if not isinstance(f, h5py.File):
            print("Attempting to validate an HDF5 BIOM table, but the "
                  "table does not appear to be in HDF5 format!")
            sys.exit(1)
        return self._validate_hdf5(**kwargs)
def run(self, **kwargs):
    """Subset a BIOM table along one axis to the requested ids.

    Exactly one of ``kwargs['json_table_str']`` and ``kwargs['hdf5_table']``
    must be given.  For JSON input the result is a generator yielding the
    pieces of the subsetted JSON document; for HDF5 input it is the `Table`
    returned by `Table.from_hdf5`.

    Returns
    -------
    dict
        ``{'subsetted_table': (table, format_)}`` where `format_` is
        'json' or 'hdf5'.

    Raises
    ------
    CommandError
        If the axis is invalid, or if zero or two input tables are given.
    """
    json_table_str = kwargs['json_table_str']
    hdf5_biom = kwargs['hdf5_table']
    axis = kwargs['axis']
    ids = kwargs['ids']

    if axis not in self.Axes:
        quoted_axes = ' or '.join("'%s'" % name for name in self.Axes)
        raise CommandError("Invalid axis '%s'. Must be either %s." % (
            axis, quoted_axes))

    if hdf5_biom is None and json_table_str is None:
        raise CommandError("Must specify an input table")
    if hdf5_biom is not None and json_table_str is not None:
        raise CommandError("Can only specify one input table")

    if json_table_str is None:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        return {'subsetted_table': (table, 'hdf5')}

    idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
    new_data = direct_slice_data(json_table_str, idxs, axis)

    # multiple walks over the string. bad form, but easy right now
    # ...should add a yield_and_ignore parser or something.
    def subset_generator():
        yield "{"
        # header keys are copied verbatim, each followed by a comma
        for key in ("id", "format", "format_url", "type", "generated_by",
                    "date", "matrix_type", "matrix_element_type"):
            yield direct_parse_key(json_table_str, key)
            yield ","
        yield new_data
        yield ","
        yield new_axis_md
        yield ","
        # the axis that was not subsetted is taken from the input as-is
        other_axis_key = "columns" if axis == "observation" else "rows"
        yield direct_parse_key(json_table_str, other_axis_key)
        yield "}"

    return {'subsetted_table': (subset_generator(), 'json')}
def merge_biom_tables(master_fp, additional_fp):
    """
    Merge the table at ``additional_fp`` into the table at ``master_fp``.

    Both tables are loaded first; the merged result then overwrites the
    file at ``master_fp`` in HDF5 format.

    :param master_fp: str
    :param additional_fp: str
    :return: None
    """
    base_table = load_table(master_fp)
    extra_table = load_table(additional_fp)
    merged = base_table.merge(extra_table)

    with biom_open(master_fp, 'w') as out_handle:
        merged.to_hdf5(out_handle, "amquery", True)
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    """Subset a BIOM table along `axis` to the given `ids`.

    Exactly one of `hdf5_biom` and `json_table_str` must be provided.

    Returns
    -------
    tuple
        ``(table, format_)``.  For JSON input `table` is a generator that
        yields the pieces of the subsetted JSON document and `format_` is
        'json'; for HDF5 input `table` is the result of `Table.from_hdf5`
        and `format_` is 'hdf5'.

    Raises
    ------
    ValueError
        If `axis` is not 'sample' or 'observation', or if zero or two input
        tables are given.
    """
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    if hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is None:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        return table, 'hdf5'

    idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
    new_data = direct_slice_data(json_table_str, idxs, axis)

    # multiple walks over the string. bad form, but easy right now
    # ...should add a yield_and_ignore parser or something.
    def subset_generator():
        yield "{"
        # header keys are copied verbatim, each followed by a comma
        for key in ("id", "format", "format_url", "type", "generated_by",
                    "date", "matrix_type", "matrix_element_type"):
            yield direct_parse_key(json_table_str, key)
            yield ","
        yield new_data
        yield ","
        yield new_axis_md
        yield ","
        # the axis that was not subsetted is taken from the input as-is
        other_axis_key = "columns" if axis == "observation" else "rows"
        yield direct_parse_key(json_table_str, other_axis_key)
        yield "}"

    return subset_generator(), 'json'
def write_biom_table(table, biom_output_fp):
    """Serialise a BIOM table to an HDF5 file.

    Parameters
    ----------
    table: biom.Table
        the table to write out
    biom_output_fp: str
        destination path; opened in write mode
    """
    with biom_open(biom_output_fp, 'w') as h5_out:
        table.to_hdf5(generated_by="tcga-kraken-translate", h5grp=h5_out)
def main():
    """Split a BIOM table into `n` files along the observation axis.

    Reads ``n`` and ``input_fp`` from the command line, loads the table and
    writes ``chunk1.biom`` ... ``chunk<n>.biom`` into the current directory.
    Every chunk holds ``len(obs_ids) // n`` consecutive observations, except
    the last one, which also receives the remainder.  Every observation ends
    up in exactly one chunk.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    args = parser.parse_args()
    n = args.n
    input_fp = args.input_fp
    if n < 1:
        raise ValueError("n must be at least 1, got {0}".format(n))

    biom_table = load_table(input_fp)
    obs_ids = biom_table.ids(axis='observation')
    total = len(obs_ids)
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("{0} total ids\n".format(total))

    chunk_size = total // n
    for chunk in range(1, n + 1):
        # Slice ends are exclusive, so the next chunk starts exactly where
        # the previous one ended; no id is skipped at the boundary.
        begin_id = (chunk - 1) * chunk_size
        end_id = chunk * chunk_size if chunk < n else total
        print("chunk: {0} begin: {1} end: {2}\n".format(chunk, begin_id,
                                                        end_id))

        # a set gives O(1) membership tests inside the filter callback
        sub_ids = set(obs_ids[begin_id:end_id])
        sub_table = biom_table.filter(
            lambda val, id_, md: id_ in sub_ids,
            axis='observation', invert=False, inplace=False)

        with biom_open('chunk{0}.biom'.format(chunk), 'w') as out_f:
            sub_table.to_hdf5(out_f, "split_biom.py")
def _build_biom_tables(self, samples, rarefaction_depth):
    """Build tables and add them to the analysis

    For every artifact id in `samples` the artifact's BIOM table is loaded,
    reduced to the requested samples that are actually present, tagged with
    study metadata and merged into one table per data type.  Each merged
    table is optionally subsampled to `rarefaction_depth`, written under the
    mountpoint of ``self._table`` and registered through `_add_file`.

    Raises
    ------
    RuntimeError
        If an artifact has no filepath of type 'biom'.
    """
    with qdb.sql_connection.TRN:
        # one merged table per data type; None until the first table arrives
        new_tables = {dt: None for dt in self.data_types}
        base_fp = qdb.util.get_work_base_dir()

        for a_id, samps in viewitems(samples):
            # one biom table attached to each artifact object
            artifact = qdb.artifact.Artifact(a_id)
            table_fp = next(
                (fp for _, fp, fp_type in artifact.filepaths
                 if fp_type == 'biom'),
                None)
            if not table_fp:
                raise RuntimeError(
                    "Artifact %s do not have a biom table associated"
                    % a_id)
            table = load_table(table_fp)

            # HACKY WORKAROUND FOR DEMO. Issue # 246
            # make sure samples not in biom table are not filtered for
            filter_samps = set(table.ids()).intersection(samps)

            # every kept sample is tagged with the study it comes from
            study_meta = {'Study': artifact.study.title,
                          'Processed_id': artifact.id}
            samples_meta = dict.fromkeys(filter_samps, study_meta)

            table.filter(filter_samps, axis='sample', inplace=True)
            table.add_metadata(samples_meta, axis='sample')

            # the first table of a data type seeds the merge, which avoids
            # needing a blank table to start from
            data_type = artifact.data_type
            merged_so_far = new_tables[data_type]
            if merged_so_far is None:
                new_tables[data_type] = table
            else:
                new_tables[data_type] = merged_so_far.merge(table)

        # add the new tables to the analysis
        _, base_fp = qdb.util.get_mountpoint(self._table)[0]
        for dt, biom_table in viewitems(new_tables):
            # rarefy, if specified
            if rarefaction_depth is not None:
                biom_table = biom_table.subsample(rarefaction_depth)

            # write out the file
            biom_name = "%d_analysis_%s.biom" % (self._id, dt)
            with biom_open(join(base_fp, biom_name), 'w') as f:
                biom_table.to_hdf5(f, "Analysis %s Datatype %s" %
                                   (self._id, dt))
            self._add_file(biom_name, "biom", data_type=dt)
def setUp(self):
    """Create the temporary fixture files and the prep template.

    Four temporary files are written (two FASTQ files for a root artifact,
    one FASTA file for a processed artifact and one BIOM table) and their
    paths are collected in ``self._clean_up_files``.  A one-sample prep
    template is created for study 1.
    """
    # Generate some files for a root artifact
    fd, self.fp1 = mkstemp(suffix='_seqs.fastq')
    close(fd)
    with open(self.fp1, 'w') as f:
        f.write("@HWI-ST753:189:D1385ACXX:1:1101:1214:1906 1:N:0:\n"
                "NACGTAGGGTGCAAGCGTTGTCCGGAATNA\n"
                "+\n"
                "#1=DDFFFHHHHHJJJJJJJJJJJJGII#0\n")

    fd, self.fp2 = mkstemp(suffix='_barcodes.fastq')
    close(fd)
    with open(self.fp2, 'w') as f:
        f.write("@HWI-ST753:189:D1385ACXX:1:1101:1214:1906 2:N:0:\n"
                "NNNCNNNNNNNNN\n"
                "+\n"
                "#############\n")
    self.filepaths_root = [(self.fp1, 1), (self.fp2, 3)]

    # Generate some files for a processed artifact
    fd, self.fp3 = mkstemp(suffix='_seqs.fna')
    close(fd)
    with open(self.fp3, 'w') as f:
        f.write(">1.sid_r4_0 M02034:17:000000000-A5U18:1:1101:15370:1394 "
                "1:N:0:1 orig_bc=CATGAGCT new_bc=CATGAGCT bc_diffs=0\n"
                "GTGTGCCAGCAGCCGCGGTAATACGTAGGG\n")
    self.filepaths_processed = [(self.fp3, 4)]

    # Generate some file for a BIOM
    fd, self.fp4 = mkstemp(suffix='_table.biom')
    # mkstemp hands back an open OS-level descriptor; close it like the
    # other fixtures do so it is not leaked on every test run
    close(fd)
    with biom_open(self.fp4, 'w') as f:
        et.to_hdf5(f, "test")
    self.filepaths_biom = [(self.fp4, 7)]

    # Create a new prep template
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'GTCCGCAAGTTA',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'AAAA',
                        'experiment_design_description': 'BBBB'}}
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    self.prep_template = \
        qdb.metadata_template.prep_template.PrepTemplate.create(
            metadata, qdb.study.Study(1), "16S")

    self._clean_up_files = [self.fp1, self.fp2, self.fp3, self.fp4]
def load_biom_table_with_file_contents(biom_fp):
    """Parse a BIOM table and hand back the still-open filehandle with it.

    Returns a ``(table, filehandle)`` tuple.  The filehandle is rewound to
    the start, which is handy when the raw file contents are needed for
    further work such as computing an MD5 sum.

    WARNING: the returned filehandle is left open on purpose.  The caller
    owns it and is responsible for closing it when done!
    """
    handle = biom_open(biom_fp, 'U')
    parsed = parse_biom_table(handle)
    # rewind so the caller can read the file again from the beginning
    handle.seek(0)
    return parsed, handle
def run(self, **kwargs):
    """Validate the table at ``kwargs['table']`` as JSON or HDF5 BIOM.

    ``kwargs['is_json']`` selects the parser.  The ``table`` entry is
    replaced by the loaded JSON object or by the open file object before
    the matching ``_validate_*`` method is called.

    Raises
    ------
    IOError
        If an HDF5 table is given but h5py is not available.
    """
    parse_as_json = kwargs['is_json']

    # this is not pyqi-appropriate, but how we parse this thing is
    # dependent on runtime options :(
    with biom_open(kwargs['table']) as f:
        if parse_as_json:
            kwargs['table'] = json.load(f)
            return self._validate_json(**kwargs)

        if not HAVE_H5PY:
            raise IOError("h5py is not installed, can only validate JSON "
                          "tables")

        kwargs['table'] = f
        return self._validate_hdf5(**kwargs)
def test_rarefy_to_files(self):
    """rarefy_to_files should write valid files"""
    RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1).rarefy_to_files(
        self.rare_dir,
        include_full=True,
        include_lineages=False)

    rarefied_fp = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
    with biom_open(rarefied_fp, 'U') as biom_file:
        rarefied = Table.from_hdf5(biom_file)
        # compared against the first two sample ids of the fixture table
        self.assertItemsEqual(
            rarefied.sample_ids,
            self.otu_table.sample_ids[:2])
def _build_biom_tables(self, samples, rarefaction_depth, conn_handler=None):
    """Build tables and add them to the analysis

    For every processed-data id in `samples` the first filepath of that
    object is loaded as a BIOM table, reduced to the requested samples that
    are actually present, tagged with study metadata and merged into one
    table per data type.  Each merged table is optionally subsampled to
    `rarefaction_depth`, written into the ``analysis`` directory of the
    database files base directory and registered through `_add_file`.  A
    new `SQLConnectionHandler` is created when `conn_handler` is None.
    """
    # one merged table per data type; None until the first table arrives
    new_tables = {dt: None for dt in self.data_types}
    base_fp = get_work_base_dir()

    for pid, samps in viewitems(samples):
        # one biom table attached to each processed data object
        proc_data = ProcessedData(pid)
        proc_data_fp = proc_data.get_filepaths()[0][0]
        table = load_table(join(base_fp, proc_data_fp))

        # HACKY WORKAROUND FOR DEMO. Issue # 246
        # make sure samples not in biom table are not filtered for
        filter_samps = set(table.ids()).intersection(samps)

        # every kept sample is tagged with the study it comes from
        study_meta = {'Study': Study(proc_data.study).title,
                      'Processed_id': proc_data.id}
        samples_meta = dict.fromkeys(filter_samps, study_meta)

        table.filter(filter_samps, axis='sample', inplace=True)
        table.add_metadata(samples_meta, axis='sample')

        # the first table of a data type seeds the merge, which avoids
        # needing a blank table to start from
        data_type = proc_data.data_type()
        merged_so_far = new_tables[data_type]
        if merged_so_far is None:
            new_tables[data_type] = table
        else:
            new_tables[data_type] = merged_so_far.merge(table)

    # add the new tables to the analysis
    if conn_handler is None:
        conn_handler = SQLConnectionHandler()
    base_fp = get_db_files_base_dir(conn_handler)

    for dt, biom_table in viewitems(new_tables):
        # rarefy, if specified
        if rarefaction_depth is not None:
            biom_table = biom_table.subsample(rarefaction_depth)

        # write out the file
        biom_name = "%d_analysis_%s.biom" % (self._id, dt)
        with biom_open(join(base_fp, "analysis", biom_name), 'w') as f:
            biom_table.to_hdf5(f, "Analysis %s Datatype %s" %
                               (self._id, dt))
        self._add_file(biom_name, "biom", data_type=dt,
                       conn_handler=conn_handler)
def test_json_to_hdf5_collapsed_metadata(self):
    """Correctly converts json to HDF5 changing the observation metadata"""
    with biom_open(self.json_collapsed_obs) as f:
        obs = self.cmd(table=parse_biom_table(f), to_hdf5=True,
                       collapsed_observations=True)
    self.assertEqual(obs.keys(), ['table'])

    # rows are observations (phyla), columns are Sample1..Sample6
    counts = np.array([[2., 1., 1., 0., 0., 1.],
                       [0., 0., 1., 4., 0., 2.],
                       [5., 1., 0., 2., 3., 1.],
                       [0., 1., 2., 0., 0., 0.]])

    # (observation id, ids that were collapsed into it)
    collapsed = [('p__Firmicutes', ['GG_OTU_4']),
                 ('p__Euryarchaeota', ['GG_OTU_3']),
                 ('p__Cyanobacteria', ['GG_OTU_2']),
                 ('p__Proteobacteria', ['GG_OTU_1', 'GG_OTU_5'])]

    # (barcode, body site) per sample; all samples share one linker primer
    linker = 'CATGCTGCCTCCCGTAGGAGT'
    per_sample = [('CGCTTATCGAGA', 'gut'),
                  ('CATACCAGTAGC', 'gut'),
                  ('CTCTCTACCTGT', 'gut'),
                  ('CTCTCGGCCTGT', 'skin'),
                  ('CTCTCTACCAAT', 'skin'),
                  ('CTAACTACCAAT', 'skin')]

    exp = Table(
        counts,
        observation_ids=[obs_id for obs_id, _ in collapsed],
        sample_ids=['Sample%d' % i for i in range(1, 7)],
        observation_metadata=[{'collapsed_ids': members}
                              for _, members in collapsed],
        sample_metadata=[{'LinkerPrimerSequence': linker,
                          'BarcodeSequence': barcode,
                          'Description': 'human %s' % site,
                          'BODY_SITE': site}
                         for barcode, site in per_sample],
        type='OTU table')
    self.assertEqual(obs['table'][0], exp)
def rarefy(qclient, job_id, parameters, out_dir):
    """Subsample a BIOM table to a fixed depth and report the new artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The job parameters; 'BIOM table' (artifact id) and 'Sampling depth'
        are read and converted to int
    out_dir : str
        The path to the job's output directory; results go into its
        'rarefy' subdirectory

    Returns
    -------
    boolean, list, str
        Success flag, the list of produced artifacts (None on failure) and
        an error message (empty on success)
    """
    work_dir = join(out_dir, 'rarefy')

    qclient.update_job_step(job_id, "Step 1 of 2: Collecting information")
    artifact_id = int(parameters['BIOM table'])
    depth = int(parameters['Sampling depth'])
    artifact_info = qclient.get("/qiita_db/artifacts/%d/" % artifact_id)

    # getting just the biom file, [0] it should be only one
    source_fp = artifact_info['files']['biom'][0]

    qclient.update_job_step(job_id, "Step 2 of 2: Rarefying")
    full_table = load_table(source_fp)

    if not exists(work_dir):
        mkdir(work_dir)

    subsampled = full_table.subsample(depth)
    if subsampled.sum() == 0:
        # nothing survived the subsampling
        return False, None, "Rarefaction level too high %d" % depth

    subsampled_fp = join(work_dir, 'rarefied.biom')
    with biom_open(subsampled_fp, 'w') as bf:
        subsampled.to_hdf5(bf, "Qiita's Qiime2 plugin")

    artifacts = [
        ArtifactInfo('Rarefied table', 'BIOM', [(subsampled_fp, 'biom')])]
    return True, artifacts, ""
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    """Build a BIOM (HDF5) OTU table from a classic dense OTU table.

    The classic table is tab-separated with sample IDs in the first row and
    OTU IDs in the first column.  This exists because the 'biom convert'
    command fails to recognize some OTU tables, which makes the classic2biom
    method fail on them.

    Parameters
    ----------
    OTU_table_classic : str
        Path of the classic tab-separated OTU table to read.
    OTU_table_biom : str
        Path of the BIOM file to write.
    dataset_ID : str
        Identifier stored as the BIOM table id.
    """
    with open(OTU_table_classic, 'r') as fidin:
        # Strip the line terminator from every line (LF and CRLF alike) so
        # no '\r' or '\n' ends up in the last sample label, and skip blank
        # lines, which would otherwise become empty OTU rows.
        rows = [line.rstrip('\r\n').split('\t')
                for line in fidin if line.strip()]

    sample_labels = rows[0][1:]
    data_rows = rows[1:]
    OTU_labels = [row[0] for row in data_rows]

    # Load OTU table row major order
    OTU_table_data = np.zeros((len(OTU_labels), len(sample_labels)))
    for i, row in enumerate(data_rows):
        OTU_table_data[i, :] = [float(value) for value in row[1:]]

    # Write in BIOM format
    t = Table(OTU_table_data, OTU_labels, sample_labels,
              observation_metadata=None, sample_metadata=None,
              table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        t.to_hdf5(f, "Generated by processing layer", compress=False)
def test_json_to_hdf5_collapsed_samples(self):
    """Correctly converts json to HDF5 changing the sample metadata"""
    with biom_open(self.json_collapsed_samples) as f:
        obs = self.cmd(table=parse_biom_table(f), to_hdf5=True,
                       collapsed_samples=True)
    self.assertEqual(obs.keys(), ['table'])

    # rows are GG_OTU_1..GG_OTU_5, columns are the collapsed samples
    counts = np.array([[0., 1.],
                       [6., 6.],
                       [6., 1.],
                       [1., 4.],
                       [0., 2.]])

    # one taxonomy per observation, in observation order
    # (GG_OTU_1 and GG_OTU_5 carry the same lineage)
    lineages = [
        ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria',
         'o__Enterobacteriales', 'f__Enterobacteriaceae',
         'g__Escherichia', 's__'],
        ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae',
         'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'],
        ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia',
         'o__Methanosarcinales', 'f__Methanosarcinaceae',
         'g__Methanosarcina', 's__'],
        ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia',
         'o__Halanaerobiales', 'f__Halanaerobiaceae',
         'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'],
        ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria',
         'o__Enterobacteriales', 'f__Enterobacteriaceae',
         'g__Escherichia', 's__'],
    ]

    # (collapsed sample id, original samples merged into it)
    groups = [('skin', ['Sample5', 'Sample4', 'Sample6']),
              ('gut', ['Sample1', 'Sample3', 'Sample2'])]

    exp = Table(
        counts,
        observation_ids=['GG_OTU_%d' % i for i in range(1, 6)],
        sample_ids=[group_id for group_id, _ in groups],
        observation_metadata=[{'taxonomy': lineage}
                              for lineage in lineages],
        sample_metadata=[{'collapsed_ids': members}
                         for _, members in groups],
        type='OTU table')
    self.assertEqual(obs['table'][0], exp)
def main():
    """Collapse the samples of an OTU table by a mapping-file category.

    Reads the options from the command line, attaches the mapping-file
    metadata to the OTU table, collapses samples that share the same value
    of the mapping category (several columns can be combined with '&&'),
    optionally normalizes the result per sample and writes it out as a new
    BIOM file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed into
    def bin_function(id_, sample_metadata):
        return sample_metadata[mapping_category]

    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table); the `with`
    # block closes the mapping file instead of leaking the handle
    with open(mapping_fp, 'U') as mapping_f:
        mapping, headers, comments = parse_mapping_file(mapping_f)

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = [headers]
        new_mapping.extend(mapping)
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)

    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function, norm=False, min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
def test_write_biom(self):
    """write_biom output converts to the expected TSV via `biom convert`."""
    with tempfile.NamedTemporaryFile(suffix='biom') as biom:
        with biom_open(biom.name, 'w') as f:
            s = Stats_And_Summary()
            s.write_biom(('sample1', 'sample2'),
                         [
                             {'readname': ['ab', 'c'],
                              'readnameE': ['ab', 'd']},
                             {'readname2': ['ab', 'c']}
                         ],
                         f)

        with tempfile.NamedTemporaryFile(suffix='csv') as biom_out:
            # delete because otherwise biom complains
            os.remove(biom_out.name)
            subprocess.check_call("biom convert -i %s -o %s --table-type 'OTU table' --to-tsv --header-key taxonomy" %
                                  (biom.name, biom_out.name), shell=True)
            # read through a context manager so the handle is closed
            # instead of being left for the garbage collector
            with open(biom_out.name) as converted:
                observed = converted.read()

            # the four rows of the converted table, one per line
            expected = ('# Constructed from biom file\n'
                        '#OTU ID\tsample1\tsample2\ttaxonomy\n'
                        '1\t1.0\t0.0\tab; d\n'
                        '2\t1.0\t1.0\tab; c')
            self.assertEqual(expected, observed)