def subsample_neighbors(focal_seqs: DNAFASTAFormat,
                        context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        samples_per_cluster: int,
                        locale: CategoricalMetadataColumn = None,
                        max_accepts: int = 10,
                        n_threads: int = 1,
                        seed: int = None) -> IDSelection:
    if max_accepts < samples_per_cluster:
        raise ValueError('max_accepts (%d) must be greater than or equal to '
                         'samples_per_cluster (%d), since it determines '
                         'the largest number of samples that could be '
                         'obtained per cluster.'
                         % (max_accepts, samples_per_cluster))

    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    if locale is not None:
        locale = locale.filter_ids(inclusion.index).to_series()
        metadata = pd.DataFrame(locale)
    else:
        metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as vsearch_out_f:
        # search each focal sequence against the context sequences; every
        # focal sequence defines a cluster of its near neighbors
        command = ['vsearch',
                   '--threads', str(n_threads),
                   '--usearch_global', str(focal_seqs),
                   '--id', str(percent_id),
                   '--db', str(context_seqs),
                   '--userout', vsearch_out_f.name,
                   '--qmask', 'none',
                   '--maxaccepts', str(max_accepts),
                   '--uc_allhits',
                   '--userfields', 'query+target+mism']
        run_command(command)

        vsearch_out = pd.read_csv(
            open(vsearch_out_f.name), sep='\t', na_values='*',
            names=['focal_id', 'context_id', 'n_mismatches'])

        clusters = _clusters_from_vsearch_out(vsearch_out, locale)
        context_seqs_to_keep = \
            _sample_clusters(clusters, samples_per_cluster, seed=seed)
        inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_neighbors")
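

# Usage sketch (illustrative only, not part of the original module). It shows
# how subsample_neighbors might be called directly from Python; the FASTA
# paths, sequence ids, and the 'region' locale column are hypothetical, and
# DNAFASTAFormat is assumed to be importable from q2_types.feature_data as in
# other QIIME 2 plugins.
def _example_subsample_neighbors():
    import pandas as pd
    import qiime2
    from q2_types.feature_data import DNAFASTAFormat

    focal = DNAFASTAFormat('focal-seqs.fasta', mode='r')      # hypothetical path
    context = DNAFASTAFormat('context-seqs.fasta', mode='r')  # hypothetical path
    locale = qiime2.CategoricalMetadataColumn(pd.Series(
        ['North America', 'North America', 'Europe'], name='region',
        index=pd.Index(['c1', 'c2', 'c3'], name='id')))       # hypothetical ids
    selection = subsample_neighbors(focal, context, percent_id=0.98,
                                    samples_per_cluster=3, locale=locale,
                                    seed=42)
    # selection.inclusion is the boolean series over context ids built above,
    # assuming IDSelection is a simple record of (inclusion, metadata, label)
    return selection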


def subsample_diversity(context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        max_accepts: int = 10,
                        n_threads: int = 1) -> IDSelection:
    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as uc_out_f:
        # cluster the context sequences at percent_id and record the
        # cluster assignments in a .uc file
        command = ['vsearch',
                   '--threads', str(n_threads),
                   '--cluster_fast', str(context_seqs),
                   '--id', str(percent_id),
                   '--uc', uc_out_f.name,
                   '--qmask', 'none',
                   '--maxaccepts', str(max_accepts)]
        run_command(command)

        uc = pd.read_csv(uc_out_f.name, sep='\t', na_values='*',
                         names=['type', 'cluster_id', 'length', 'perc_id',
                                'strand', 'BLANK1', 'BLANK2', 'cigar',
                                'query', 'target'])

        # the S records define the cluster centroids; their 'query' field
        # holds the centroid sequence id
        context_seqs_to_keep = uc[uc['type'] == 'S']['query']
        inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_diversity")
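

# Usage sketch (illustrative only, not part of the original module). It shows
# how subsample_diversity might be called to keep one centroid per cluster at
# 99.9% identity; the FASTA path is hypothetical and DNAFASTAFormat is assumed
# to come from q2_types.feature_data.
def _example_subsample_diversity():
    from q2_types.feature_data import DNAFASTAFormat

    context = DNAFASTAFormat('context-seqs.fasta', mode='r')  # hypothetical path
    selection = subsample_diversity(context, percent_id=0.999, n_threads=4)
    # ids of the retained centroids, assuming IDSelection exposes the
    # inclusion series constructed above
    kept_ids = selection.inclusion[selection.inclusion].index
    return kept_ids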


def sample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                        context_seqs: DNAFASTAFormat = None,
                        start_date: str = None,
                        samples_per_interval: int = 7,
                        days_per_interval: int = 7,
                        seed: int = None) -> IDSelection:
    window_size = '%dD' % days_per_interval

    if context_seqs is not None:
        # restrict dates to the ids for which sequence data is available
        ids_to_include = ids_from_fasta(str(context_seqs))
        dates = dates.filter_ids(ids_to_include)

    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this placeholder row is stripped in _sample_group's inner
            # _sampler; its purpose is to force pandas to begin the window
            # at this time instead of at the first observation (by making
            # NaN the first observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))
    df = df.dropna(axis=0)

    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'sample_longitudinal')
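

# Usage sketch (illustrative only, not part of the original module). It shows
# how sample_longitudinal might be called with a date column; the ids and
# dates are hypothetical. Dates are parsed with pd.to_datetime above, so any
# string format it understands (e.g. ISO 8601) should work.
def _example_sample_longitudinal():
    import pandas as pd
    import qiime2

    dates = qiime2.CategoricalMetadataColumn(pd.Series(
        ['2020-03-01', '2020-03-04', '2020-03-12', '2020-03-15'],
        name='date',
        index=pd.Index(['s1', 's2', 's3', 's4'], name='id')))  # hypothetical ids
    # sample at most 2 genomes from each 7-day window, starting 2020-03-01
    selection = sample_longitudinal(dates, start_date='2020-03-01',
                                    samples_per_interval=2,
                                    days_per_interval=7, seed=1)
    return selection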