Example 1
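These snippets are excerpts from a larger module, so their imports are not shown. A minimal sketch of what they assume is below; the q2_types path for DNAFASTAFormat and the qiime2 imports are standard, while IDSelection and the private helpers (ids_from_fasta, run_command, _clusters_from_vsearch_out, _sample_clusters, _sample_group) are assumed to be defined elsewhere in the same package.

import tempfile

import numpy as np
import pandas as pd
import qiime2
from qiime2 import CategoricalMetadataColumn
from q2_types.feature_data import DNAFASTAFormat

# Assumed to come from the surrounding package:
# IDSelection, ids_from_fasta, run_command, _clusters_from_vsearch_out,
# _sample_clusters, _sample_group
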
def subsample_neighbors(focal_seqs: DNAFASTAFormat,
                        context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        samples_per_cluster: int,
                        locale: CategoricalMetadataColumn = None,
                        max_accepts: int = 10,
                        n_threads: int = 1,
                        seed: int = None) -> IDSelection:

    if max_accepts < samples_per_cluster:
        raise ValueError('max_accepts (%d) must be greater than or equal to '
                         'samples_per_cluster (%d), since it determines '
                         'the largest number of samples that could be '
                         'obtained per cluster.' %
                         (max_accepts, samples_per_cluster))

    context_ids = ids_from_fasta(str(context_seqs))

    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    if locale is not None:
        locale = locale.filter_ids(inclusion.index).to_series()
        metadata = pd.DataFrame(locale)
    else:
        metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as vsearch_out_f:
        command = [
            'vsearch', '--threads',
            str(n_threads), '--usearch_global',
            str(focal_seqs), '--id',
            str(percent_id), '--db',
            str(context_seqs), '--userout', vsearch_out_f.name, '--qmask',
            'none', '--maxaccepts',
            str(max_accepts), '--uc_allhits', '--userfields',
            'query+target+mism'
        ]
        run_command(command)

        vsearch_out = pd.read_csv(
            vsearch_out_f.name,
            sep='\t',
            na_values='*',
            names=['focal_id', 'context_id', 'n_mismatches'])

        clusters = _clusters_from_vsearch_out(vsearch_out, locale)
        context_seqs_to_keep = \
            _sample_clusters(clusters, samples_per_cluster, seed=seed)
        inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_neighbors")
Example 2
def subsample_diversity(context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        max_accepts: int = 10,
                        n_threads: int = 1) -> IDSelection:

    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as uc_out_f:
        command = [
            'vsearch',
            '--threads',
            str(n_threads),
            '--cluster_fast',
            str(context_seqs),
            '--id',
            str(percent_id),
            '--uc',
            uc_out_f.name,
            '--qmask',
            'none',
            '--maxaccepts',
            str(max_accepts),
        ]
        run_command(command)

        uc = pd.read_csv(uc_out_f.name,
                         sep='\t',
                         na_values='*',
                         names=[
                             'type', 'cluster_id', 'length', 'perc_id',
                             'strand', 'BLANK1', 'BLANK2', 'cigar', 'query',
                             'target'
                         ])

    # the S lines define the cluster centroids
    context_seqs_to_keep = uc[uc['type'] == 'S']['query']
    inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_diversity")
Example 3
def sample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                        context_seqs: DNAFASTAFormat = None,
                        start_date: str = None,
                        samples_per_interval: int = 7,
                        days_per_interval: int = 7,
                        seed: int = None) -> IDSelection:

    window_size = '%dD' % days_per_interval

    if context_seqs is not None:
        # filter dates to include only the ids for which sequence data is
        # available
        ids_to_include = ids_from_fasta(str(context_seqs))
        dates = dates.filter_ids(ids_to_include)

    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler. The purpose
            # is to force Pandas to begin the window at this timestamp
            # instead of at the first observation (by making a NaN row the
            # first observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    df = df.dropna(axis=0)
    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'sample_longitudinal')
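
Finally, a sketch of calling sample_longitudinal directly with an in-memory metadata column; the ids and dates are placeholder data. The Series index must be named with a recognized ID header (here 'id') for qiime2 to accept it.

# Hypothetical usage; ids and dates are placeholder data.
dates = qiime2.CategoricalMetadataColumn(pd.Series(
    ['2020-03-01', '2020-03-02', '2020-03-09', '2020-03-20'],
    name='date',
    index=pd.Index(['seq1', 'seq2', 'seq3', 'seq4'], name='id')))
selection = sample_longitudinal(dates,
                                samples_per_interval=1,
                                days_per_interval=7,
                                seed=0)
# At most samples_per_interval ids are retained per 7-day window.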