def test_save_biom(self):
    '''Round-trip an amplicon experiment through save_biom() and compare.

    The experiment is saved into a scratch directory and read back three
    times: with fmt='hdf5', with fmt='txt' (taxonomy ignored in the
    comparison) and with add_metadata=None (taxonomy must be absent).
    '''
    # NOTE: Currently not testing the save biom hdf with taxonomy
    # as there is a bug there!
    exp = ca.read_amplicon(self.test1_biom, self.test1_samp, normalize=None, min_reads=None)
    d = mkdtemp()
    # remove the scratch directory even when one of the assertions fails
    try:
        f = join(d, 'test1.save.biom')

        # test the hdf5 biom format
        exp.save_biom(f, fmt='hdf5')
        newexp = ca.read_amplicon(f, self.test1_samp, normalize=None, min_reads=None)
        assert_experiment_equal(newexp, exp)

        # test the txt biom format
        exp.save_biom(f, fmt='txt')
        newexp = ca.read_amplicon(f, self.test1_samp, normalize=None, min_reads=None)
        assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])

        # test the hdf5 biom format with no taxonomy
        exp.save_biom(f, add_metadata=None)
        newexp = ca.read(f, self.test1_samp, normalize=None)
        self.assertTrue('taxonomy' not in newexp.feature_metadata)
        assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
    finally:
        shutil.rmtree(d)
def setUp(self):
    '''Load the two experiments used by the tests of this class.'''
    super().setUp()

    # simple experiment, loaded as sparse and left un-normalized
    self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)

    # complex experiment, loaded as sparse with normalizing and
    # removing low read samples
    amplicon_options = {'filter_reads': 1000, 'normalize': 10000}
    self.complex = ca.read_amplicon(
        self.timeseries_biom,
        self.timeseries_samp,
        **amplicon_options
    )
def setUp(self):
    '''Create the mock database, the test experiment and a reference sequence.'''
    super().setUp()
    self.mock_db = MockDatabase()
    self.test1 = ca.read_amplicon(
        self.test1_biom,
        self.test1_samp,
        filter_reads=1000,
        normalize=10000,
    )
    # sequence string made available to the tests as self.s1
    self.s1 = 'TACGTATGTCACAAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTGGATTAAGCGTGTTGTGAAATGTAGACGCTCAACGTCTGAATCGCAGCGCGAACTGGTTCACTTGAGTATGCACAACGTAGGCGGAATTCGTCG'
def test_sort_by_data_feature(self):
    '''sort_by_data(axis=1) on the timeseries must match the stored sorted table.'''
    sorted_exp = self.timeseries.sort_by_data(axis=1)

    # reference experiment stored in the test data directory
    reference_biom = join(self.test_data_dir, 'timeseries.sorted.freq.biom')
    reference_samp = join(self.test_data_dir, 'timeseries.sample')
    reference = ca.read_amplicon(reference_biom, reference_samp, normalize=None, min_reads=0)

    self.assert_experiment_equal(sorted_exp, reference, almost_equal=True)
def test_read_amplicon(self):
    '''read_amplicon() with min_reads/normalize must equal read() + manual filter + normalize.'''
    # load a taxonomy biom table, filtering and normalizing in one call
    via_read_amplicon = ca.read_amplicon(self.test1_biom, min_reads=1000, normalize=10000)

    # the same steps done by hand on the raw table
    by_hand = ca.read(self.test1_biom, normalize=None)
    by_hand.filter_by_data(
        'abundance',
        axis=0,
        cutoff=1000,
        inplace=True,
        mean_or_sum='sum',
    )
    by_hand.normalize(inplace=True)

    assert_experiment_equal(via_read_amplicon, by_hand)
    self.assertIn('taxonomy', via_read_amplicon.feature_metadata.columns)
def __init__(self, load_exp=None):
    '''Build the main window and optionally load experiments at startup

    Parameters
    ----------
    load_exp : list of (table_file_name, map_file_name, study_name) or None (optional)
        experiments to load when the gui starts. If study_name is None,
        the table file name is used as the study name.
    '''
    super().__init__()

    # build the widgets from the designer file
    uic.loadUi(get_ui_file_name('CalourGUI.ui'), self)

    # button clicks
    self.wLoad.clicked.connect(self.load)
    self.wPlot.clicked.connect(self.plot)

    # right mouse menu of the experiment list
    self.wExperiments.setContextMenuPolicy(QtCore.Qt.CustomContextMenu)
    self.wExperiments.customContextMenuRequested.connect(self.listItemRightClicked)

    # one (initially empty) action dict per action group
    self.actions = {group: {} for group in ('sample', 'feature', 'analysis')}

    # the buttons of each group, added in this order
    buttons_per_group = (
        ('sample', ['Sort', 'Filter', 'Cluster', 'Join fields',
                    'Filter by original reads', 'Normalize', 'Merge']),
        ('feature', ['Cluster', 'Filter min reads', 'Filter taxonomy',
                     'Filter fasta', 'Sort abundance']),
        ('analysis', ['Diff. abundance', 'Correlation']),
    )
    for group, labels in buttons_per_group:
        self.add_buttons(group, labels)

    # load the experiments supplied by the caller
    if load_exp is not None:
        for cdata in load_exp:
            # fall back to the table file name when no study name is given
            study_name = cdata[0] if cdata[2] is None else cdata[2]
            exp = ca.read_amplicon(cdata[0], cdata[1], normalize=10000, filter_reads=1000)
            exp._studyname = study_name
            self.addexp(exp)

    self.show()
def main(argv):
    '''Command line entry point of the metaanalysis cross-classifier.

    Loads the amplicon experiment, runs classifier_performance_matrix() on
    it and saves the two resulting tables as <name>_roc.csv and
    <name>_accuracy.csv, where <name> is the --output value (plus a uuid
    suffix when --uname is given).

    Parameters
    ----------
    argv : list of str
        the command line arguments to parse (handed to argparse as is)
    '''
    parser = argparse.ArgumentParser(
        description='metaanalysis cross-classifier',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', help='name of input biom table')
    parser.add_argument('-m', '--map', help='name of input mapping file')
    parser.add_argument('-o', '--output', help='name of output file')
    parser.add_argument(
        '--use-subset',
        help="Use only subset of features present in both cohorts for classifier training",
        action='store_true',
        default=True)
    # --use-subset defaults to True, so on its own it can never be turned
    # off; this flag writes False into the same destination.
    parser.add_argument(
        '--no-use-subset',
        dest='use_subset',
        help="Disable --use-subset",
        action='store_false')
    parser.add_argument('--shuffle', help="Shuffle testing labels",
                        action='store_true', default=False)
    parser.add_argument('--shuffle-source', help="Shuffle training labels",
                        action='store_true', default=False)
    parser.add_argument('--uname', help="add unique signature to name",
                        action='store_true', default=False)
    args = parser.parse_args(argv)

    cname = args.output
    if args.uname:
        # make the output name unique so parallel runs do not collide
        cname += '_'
        cname = cname + str(uuid.uuid1())
    print('started processing file %s' % cname)

    # load the experiment
    print('loading experiment %s' % args.input)
    exp = ca.read_amplicon(args.input, args.map, min_reads=1000, normalize=10000)
    print(exp)

    # run the classifier
    print('running the classifier')
    resdf_roc, resdf_accuracy = classifier_performance_matrix(
        exp=exp,
        use_subset_features=args.use_subset,
        shuffle=args.shuffle,
        shuffle_source=args.shuffle_source)

    # save the results
    print('saving to %s' % cname)
    resdf_roc.to_csv(cname + '_roc' + '.csv')
    resdf_accuracy.to_csv(cname + '_accuracy' + '.csv')
def test_read_amplicon(self):
    '''read_amplicon() with filter_reads/normalize must equal read() + manual filter + normalize.'''
    # load a taxonomy biom table, filtering and normalizing in one call
    via_read_amplicon = ca.read_amplicon(self.test1_biom, filter_reads=1000, normalize=10000)

    # the same steps done by hand on the raw table
    by_hand = ca.read(self.test1_biom, normalize=None)
    by_hand.filter_by_data('sum_abundance', cutoff=1000, inplace=True)
    by_hand.normalize(inplace=True)

    assert_experiment_equal(via_read_amplicon, by_hand)
    self.assertIn('taxonomy', via_read_amplicon.feature_metadata)
def load(self):
    '''Show the load dialog and add the selected experiment to the gui.

    The file names, experiment name and experiment type are taken from the
    LoadWindow widgets. Empty mapping / GNPS file fields are passed on as
    None. If reading fails, a warning is logged and nothing is added.
    '''
    win = LoadWindow()
    if win.exec_() != QtWidgets.QDialog.Accepted:
        return

    tablefname = str(win.wTableFile.text())
    # an empty text box means the optional file was not supplied
    mapfname = str(win.wMapFile.text()) or None
    gnpsfname = str(win.wGNPSFile.text()) or None
    expname = str(win.wNewName.text())
    exptype = str(win.wType.currentText())

    # table_kind is only used for the warning message when loading fails
    table_kind = 'biom'
    try:
        if exptype == 'Amplicon':
            table_kind = 'amplicon biom'
            expdat = ca.read_amplicon(tablefname, mapfname, normalize=10000, filter_reads=1000)
        elif exptype == 'Metabolomics (rows are samples)':
            table_kind = 'openms'
            expdat = ca.read_open_ms(tablefname, mapfname, gnps_file=gnpsfname,
                                     normalize=None, rows_are_samples=True)
        elif exptype == 'Metabolomics (rows are features)':
            table_kind = 'openms'
            expdat = ca.read_open_ms(tablefname, mapfname, gnps_file=gnpsfname,
                                     normalize=None, rows_are_samples=False)
        else:
            # any other type uses the generic reader, so expdat is always
            # bound when we get past the try block
            expdat = ca.read(tablefname, mapfname)
    except Exception:
        # deliberately best-effort: a failed load must not crash the gui
        logger.warning('Load for %s table %s map %s failed' % (table_kind, tablefname, mapfname))
        return

    expdat._studyname = expname
    self.addexp(expdat)
def read_qiime2(data_file, sample_metadata_file=None, feature_metadata_file=None, rep_seqs_file=None, **kwargs):
    '''Read a qiime2 generated table (even if it was run without the --p-no-hashed-feature-ids flag)

    This is a wrapper for calour.read_amplicon(), that can unzip and extract
    biom table, feature metadata, rep_seqs_file qza files generated by qiime2

    Parameters
    ----------
    data_file: str
        name of qiime2 deblur/dada2 generated feature table qza or biom table
    sample_metadata_file: str or None, optional
        name of tab separated mapping file
    feature_metadata_file: str or None, optional
        can be the taxonomy qza or tsv generated by qiime2 feature classifier
    rep_seqs_file: str or None, optional
        if not none, name of the qiime2 representative sequences qza file
        (the --o-representative-sequences file name in qiime2 dada2/deblur)
    **kwargs:
        to be passed to calour.read_amplicon

    Returns
    -------
    calour.AmpliconExperiment
        if rep_seqs_file is supplied, the original feature ids are kept in
        the '_orig_id' feature metadata column and the feature metadata is
        re-indexed by the ids read from rep_seqs_file
    '''
    import tempfile

    # everything that reads the extracted files must run inside this block,
    # since the temporary directory is deleted when it exits
    with tempfile.TemporaryDirectory() as tempdir:
        data_file = filename_from_zip(tempdir, data_file, 'data/feature-table.biom')
        feature_metadata_file = filename_from_zip(tempdir, feature_metadata_file, 'data/taxonomy.tsv')
        rep_seqs_file = filename_from_zip(tempdir, rep_seqs_file, 'data/dna-sequences.fasta')
        expdat = ca.read_amplicon(data_file,
                                  sample_metadata_file=sample_metadata_file,
                                  feature_metadata_file=feature_metadata_file,
                                  **kwargs)

        if rep_seqs_file is not None:
            # take the sequence from the header ('>' lines only). strip() drops
            # the line ending, which would otherwise become part of every id.
            # NOTE(review): the entries are assumed to be in the same order as
            # the features of the table - confirm.
            with open(rep_seqs_file) as rsf:
                seqs = [cline[1:].strip() for cline in rsf if cline.startswith('>')]
            expdat.feature_metadata['_orig_id'] = expdat.feature_metadata['_feature_id']
            expdat.feature_metadata['_feature_id'] = seqs
            expdat.feature_metadata = expdat.feature_metadata.set_index('_feature_id')

    return expdat
def setUp(self):
    '''Load the test experiment with min_reads=1000 and normalize=10000.'''
    super().setUp()
    read_options = {'min_reads': 1000, 'normalize': 10000}
    self.test1 = ca.read_amplicon(self.test1_biom, self.test1_samp, **read_options)
"age_prediction/gut_4575/gut_4575_rare_map__cohort_cantonese_sex_female__.txt", "age_prediction/gut_4575/gut_4575_rare_map__cohort_cantonese_sex_male__.txt" ] distmatrix_fp = [ '82-soil/beta-q2/', 'PMI_16s/beta-q2/', 'malnutrition/beta-q2/', 'cider/beta-q2/' ] # In[8]: if (balances): feature_datatype = 'qiime2' exp = ca.read_amplicon(biom_fp[dataset], metadata_fp[dataset], data_file_type='qiime2', min_reads=None, normalize=None) else: #BIOM table input exp = ca.read_amplicon(biom_fp[dataset], metadata_fp[dataset], min_reads=None, normalize=None) #if (dataset!=3): exp = exp.filter_abundance(10) # ## Modify parameter options by shape of data # Create logarithmic scales for ranges of parameter options where valid inputs can be 1<->n_features or n_samples # In[11]:
def setUp(self):
    '''Load the pre-ratio amplicon experiment and the ratio experiment.'''
    super().setUp()

    # amplicon table read with min_reads=10 and no normalization
    self.pre_ratio = ca.read_amplicon(
        self.rat_pre_biom,
        self.rat_pre_samp,
        min_reads=10,
        normalize=None,
    )

    # ratio table, read un-normalized as a RatioExperiment
    self.ratio1 = ca.read(
        self.rat_biom,
        self.rat_samp,
        normalize=None,
        cls=RatioExperiment,
    )
import calour as cl import numpy as np from scipy.stats import sem import pickle cl.set_log_level(40) # input biom table and mapping file cfs = cl.read_amplicon('../data/cfs.biom', '../data/cfs.map.txt', sparse=False, normalize=10000, min_reads=1000) filtlev = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500 ] B = 1000 sig_ds_cfs = [] sig_bh_cfs = [] sig_fbh_cfs = [] err_bh_cfs = [] err_ds_cfs = [] err_fbh_cfs = [] for i in filtlev: print('filter level...: %s' % (i)) sig_ds = []
"fermentation_1976_beer/qiita_v2.txt", "fermentation_1976_wine/qiita_v2.txt", "infant_fecal_11402/filtered_metadata.tsv", "infant_fecal_10918/filtered_metadata.tsv", "infant_fecal_10080/filtered_metadata.tsv", "infant_fecal_11358/filtered_metadata.tsv", "infant_oral_2010/filtered_metadata.tsv", "infant_skin_2010/filtered_metadata.tsv" ] # In[10]: if(knn): exp = ca.read_amplicon(dir_prefixes[dataset]+"/feature-table.biom", metadata_fp[dataset], min_reads=None, normalize=None) else: exp = ca.read_amplicon(biom_fp[dataset], metadata_fp[dataset], min_reads=None, normalize=None) print(exp) # In[11]: target = None #Specify column to predict if (dataset==0): #82-soil target = 'ph' if (dataset==1): target = 'days_since_placement'
from scipy import stats
import scipy
import pickle
import time
import math
import inspect
import operator

pd.set_option('display.max_rows', 10000)

# # Importing data, no TSS normalization performed here since table is already normalized

# In[3]:

allMetab = ca.read_amplicon(
    'PMI_MS1_FeatureTable_Normalized.biom',
    'pmi3_metab_meta.txt',
    min_reads=1,
    normalize=None,
)

# ## Remove controls and samples that grouped with controls on PCoA

# In[4]:

allMetab = (
    allMetab
    .filter_samples('control', 'n')
    .filter_samples('pcoa_removals', 'n')
)
allMetab.sample_metadata.description.value_counts()

# # Split by sampling location (soil v. skin)
# ## Skin sample filtering

# In[5]: