def test_combine_selections_alt_metadata(self):
    df = pd.DataFrame([[42, 88], [3, 88], [99, 88], [np.nan, 88]],
                      index=['a', 'b', 'c', 'd'],
                      columns=['value', 'time-travel-speed-mph'])
    df.index.name = 'id'
    alt_md = qiime2.Metadata(df)
    sel4 = IDSelection(self.sel3.inclusion, alt_md, 'abc')

    sel = combine_selections([self.sel1, self.sel2, sel4])

    exp_inclusion = pd.Series([True, True, True, False],
                              index=['a', 'b', 'c', 'd'],
                              name='inclusion')
    exp_df = pd.DataFrame([['x', 88, 42], ['y', 88, 3], ['z', 88, 99],
                           ['a', 88, np.nan]],
                          index=['a', 'b', 'c', 'd'],
                          columns=['locale', 'time-travel-speed-mph',
                                   'value'])
    exp_df.index.name = 'id'
    exp_md = qiime2.Metadata(exp_df)

    pdt.assert_series_equal(sel.inclusion, exp_inclusion)
    self.assertEqual(sel.metadata, exp_md)
    self.assertEqual(sel.label, 'combined_selections')

def subsample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                           start_date: str = None,
                           samples_per_interval: int = 7,
                           days_per_interval: int = 7,
                           seed: int = None) -> IDSelection:
    window_size = '%dD' % days_per_interval
    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))
    df = df.dropna(axis=0)

    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'subsample_longitudinal')

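# _sample_group is referenced above but not shown in this section. A minimal
# sketch of what it plausibly does (an assumption about its behavior, not the
# module's exact implementation): it returns a per-interval sampler that
# first drops the NaN placeholder row injected above to anchor the window
# start, then keeps at most `n` rows from each interval.
def _sample_group_sketch(n, seed=None):
    def _sampler(group):
        # strip the window-anchor placeholder row (all-NaN)
        group = group.dropna(axis=0)
        if len(group) <= n:
            return group
        return group.sample(n, replace=False, random_state=seed)
    return _sampler
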
def _3(fmt: IDSelectionDirFmt) -> IDSelection:
    md = fmt.metadata.view(IDMetadataFormat).to_metadata()
    inclusion = pd.Series(False, index=md.to_dataframe().index)
    included = fmt.included.view(UNIXListFormat).to_list()
    inclusion[included] = True
    with fmt.label.view(UNIXListFormat).open() as fh:
        label = fh.read().strip()
    return IDSelection(inclusion, md, label)

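# Rough usage sketch for the transformer above (the path is illustrative and
# assumes a populated IDSelectionDirFmt on disk; in normal use the QIIME 2
# framework invokes the transformer, not user code):
#
#   fmt = IDSelectionDirFmt('path/to/id-selection-dir', mode='r')
#   sel = _3(fmt)
#   sel.inclusion.sum()  # number of included IDs
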
def test_combine_selections_inconsistent_metadata(self):
    df = pd.DataFrame([['x'], ['y'], ['w'], ['a']],
                      index=['a', 'b', 'c', 'd'],
                      columns=['locale'])
    df.index.name = 'id'
    alt_md = qiime2.Metadata(df)
    sel4 = IDSelection(self.sel3.inclusion, alt_md, 'abc')

    with self.assertRaisesRegex(ValueError, 'inconsistent metadata'):
        combine_selections([self.sel1, self.sel2, sel4])

def sample_random(ids: qiime2.Metadata, n: int, seed: int = None) \
        -> IDSelection:
    if n > ids.id_count:
        raise ValueError("Value for n is larger than the number of IDs"
                         " present")
    df = ids.to_dataframe()
    samples = df.sample(n, replace=False, random_state=seed)

    inclusion = pd.Series(False, index=df.index)
    inclusion[samples.index] = True

    return IDSelection(inclusion, ids, "sample_random")

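# Quick usage sketch for sample_random (the data is illustrative; a
# zero-column DataFrame with a named index is a valid qiime2.Metadata):
#
#   df = pd.DataFrame(index=pd.Index(['a', 'b', 'c', 'd'], name='id'))
#   md = qiime2.Metadata(df)
#   sel = sample_random(md, n=2, seed=42)
#   sel.inclusion.sum()  # -> 2
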
def subsample_neighbors(focal_seqs: DNAFASTAFormat,
                        context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        samples_per_cluster: int,
                        locale: CategoricalMetadataColumn = None,
                        max_accepts: int = 10,
                        n_threads: int = 1,
                        seed: int = None) -> IDSelection:
    if max_accepts < samples_per_cluster:
        raise ValueError('max_accepts (%d) must be greater than or equal '
                         'to samples_per_cluster (%d), since it determines '
                         'the largest number of samples that could be '
                         'obtained per cluster.'
                         % (max_accepts, samples_per_cluster))

    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')

    if locale is not None:
        locale = locale.filter_ids(inclusion.index).to_series()
        metadata = pd.DataFrame(locale)
    else:
        metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as vsearch_out_f:
        command = [
            'vsearch',
            '--threads', str(n_threads),
            '--usearch_global', str(focal_seqs),
            '--id', str(percent_id),
            '--db', str(context_seqs),
            '--userout', vsearch_out_f.name,
            '--qmask', 'none',
            '--maxaccepts', str(max_accepts),
            '--uc_allhits',
            '--userfields', 'query+target+mism',
        ]
        run_command(command)
        vsearch_out = pd.read_csv(vsearch_out_f.name, sep='\t',
                                  na_values='*',
                                  names=['focal_id', 'context_id',
                                         'n_mismatches'])

    clusters = _clusters_from_vsearch_out(vsearch_out, locale)
    context_seqs_to_keep = _sample_clusters(clusters, samples_per_cluster,
                                            seed=seed)
    inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_neighbors")

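# _clusters_from_vsearch_out and _sample_clusters are module helpers not
# shown in this section. A minimal sketch of the sampling step (an assumption
# about its behavior, not the exact implementation): given an iterable of
# clusters, each a list of context ids, keep at most samples_per_cluster ids
# from each cluster.
def _sample_clusters_sketch(clusters, samples_per_cluster, seed=None):
    rng = np.random.default_rng(seed)
    ids_to_keep = set()
    for cluster in clusters:
        n = min(samples_per_cluster, len(cluster))
        ids_to_keep.update(rng.choice(cluster, size=n, replace=False))
    return list(ids_to_keep)
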
def combine_selections(selections: IDSelection) -> IDSelection:
    output_label = "combined_selections"
    if len(selections) == 1:
        return IDSelection(selections[0].inclusion, selections[0].metadata,
                           label=output_label)

    inclusion = selections[0].inclusion
    inclusion_ids = set(inclusion.index)
    metadata = selections[0].metadata.to_dataframe()
    metadata_ids = set(metadata.index)

    for e in selections[1:]:
        if inclusion_ids != set(e.inclusion.index):
            raise ValueError("Inclusion id sets are not equal. "
                             "Can't combine.")
        inclusion = inclusion.combine(e.inclusion, operator.or_)

        df = e.metadata.to_dataframe()
        if metadata_ids != set(df.index):
            raise ValueError("Metadata id sets are not equal. "
                             "Can't combine.")
        metadata = metadata.combine(df, _combine_df_error_if_not_equal)

    return IDSelection(inclusion, qiime2.Metadata(metadata), output_label)

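# _combine_df_error_if_not_equal is not shown in this section. A sketch
# consistent with the tests in this section (an assumption, not the repo's
# exact code): DataFrame.combine calls it column-wise over the union of
# columns, so a column missing from one frame arrives as an all-NaN Series,
# in which case the other column wins; otherwise the values must match.
def _combine_df_error_if_not_equal_sketch(s1, s2):
    if s1.isna().all():
        return s2
    if s2.isna().all():
        return s1
    if not s1.equals(s2):
        raise ValueError("Can't combine selections with inconsistent "
                         "metadata.")
    return s1
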
def test_error_on_non_equal_inclusion_id_sets(self):
    bad_sel1 = IDSelection(pd.Series([False, False, True],
                                     index=['a', 'b', 'c'],
                                     name='inclusion'),
                           self.md1,
                           label='something')

    with self.assertRaisesRegex(ValueError, "id sets are not equal"):
        combine_selections([self.sel1, bad_sel1])

    with self.assertRaisesRegex(ValueError, "id sets are not equal"):
        combine_selections([bad_sel1, self.sel1])

    with self.assertRaisesRegex(ValueError, "id sets are not equal"):
        combine_selections([self.sel1, self.sel2, self.sel3, bad_sel1])

def setUp(self):
    super().setUp()

    df1 = pd.DataFrame([['x'], ['y'], ['z'], ['a']],
                       index=['a', 'b', 'c', 'd'],
                       columns=['locale'])
    df1.index.name = 'id'
    self.md1 = qiime2.Metadata(df1)

    self.sel1 = IDSelection(pd.Series([True, False, False, False],
                                      index=['a', 'b', 'c', 'd'],
                                      name='inclusion'),
                            self.md1,
                            label='sel1')
    self.sel2 = IDSelection(pd.Series([False, True, False, False],
                                      index=['a', 'b', 'c', 'd'],
                                      name='inclusion'),
                            self.md1,
                            label='sel2')
    self.sel3 = IDSelection(pd.Series([False, False, True, False],
                                      index=['a', 'b', 'c', 'd'],
                                      name='inclusion'),
                            self.md1,
                            label='sel3')

def test_error_on_non_equal_metadata_id_sets(self):
    df = pd.DataFrame([['x'], ['y'], ['z']],
                      index=['a', 'b', 'c'],
                      columns=['locale'])
    df.index.name = 'id'
    bad_md1 = qiime2.Metadata(df)
    bad_sel1 = IDSelection(self.sel1.inclusion, bad_md1, label='something')

    with self.assertRaisesRegex(ValueError, "id sets are not equal"):
        combine_selections([self.sel1, bad_sel1])

    with self.assertRaisesRegex(ValueError, "id sets are not equal"):
        combine_selections([bad_sel1, self.sel1])

    with self.assertRaisesRegex(ValueError, "id sets are not equal"):
        combine_selections([self.sel1, self.sel2, self.sel3, bad_sel1])

def subsample_diversity(context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        max_accepts: int = 10,
                        n_threads: int = 1) -> IDSelection:
    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as uc_out_f:
        command = [
            'vsearch',
            '--threads', str(n_threads),
            '--cluster_fast', str(context_seqs),
            '--id', str(percent_id),
            '--uc', uc_out_f.name,
            '--qmask', 'none',
            '--maxaccepts', str(max_accepts),
        ]
        run_command(command)

        uc = pd.read_csv(uc_out_f.name, sep='\t', na_values='*',
                         names=['type', 'cluster_id', 'length', 'perc_id',
                                'strand', 'BLANK1', 'BLANK2', 'cigar',
                                'query', 'target'])

    # the S lines define the cluster centroids; select their sequence ids
    # (the 'query' column), not their row positions in the .uc file
    context_seqs_to_keep = uc[uc['type'] == 'S']['query']
    inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_diversity")

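# Hedged usage sketch (the path is illustrative; vsearch must be on PATH):
#
#   context = DNAFASTAFormat('context-seqs.fasta', mode='r')
#   sel = subsample_diversity(context, percent_id=0.9995)
#   sel.inclusion.sum()  # number of cluster centroids retained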