Example 1
def filter_seqs_length_by_taxon(
        sequences: DNAFASTAFormat,
        taxonomy: pd.Series,
        labels: str,
        min_lens: int = None,
        max_lens: int = None,
        global_min: int = None,
        global_max: int = None) -> (DNAFASTAFormat, DNAFASTAFormat):
    # Validate filtering options
    if min_lens is max_lens is None:
        raise ValueError(ERROR_FILTER_OPTIONS + 'min_lens, max_lens.')

    # validate that all seqIDs are present in taxonomy
    # Note we view as DNAIterator to take a first pass (should take a few
    # seconds) as initial validation before performing length filtering.
    seq_ids = {i.metadata['id'] for i in sequences.view(DNAIterator)}
    _index_is_superset(seq_ids, set(taxonomy.index))

    # set filter options
    mins = maxs = None
    if min_lens is not None:
        if len(labels) != len(min_lens):
            raise ValueError(
                'labels and min_lens must contain the same number of elements')
        mins = dict(zip(labels, min_lens))

    if max_lens is not None:
        if len(labels) != len(max_lens):
            raise ValueError(
                'labels and max_lens must contain the same number of elements')
        maxs = dict(zip(labels, max_lens))

    # Stream seqs, apply filter(s)
    result = DNAFASTAFormat()
    failures = DNAFASTAFormat()
    with result.open() as out_fasta, failures.open() as out_failed:
        for seq in sequences.view(DNAIterator):
            # taxon is required, we always use taxon-based filtering
            # grab taxon affiliation for seq
            taxon = taxonomy[seq.metadata['id']]
            # search taxon for filter terms
            # NOTE: we find all matching search terms and pass them all to
            # _seq_length_within_range below; that function determines and
            # applies the most stringent matching length thresholds.
            taxahits = [t for t in labels if t in taxon]
            # if there are no taxahits or global filters, just write out
            if not taxahits and global_min is global_max is None:
                seq.write(out_fasta)
            # if there are taxahits or global filters, always check length
            elif _seq_length_within_range(seq, taxahits, mins, maxs,
                                          global_min, global_max):
                seq.write(out_fasta)
            else:
                seq.write(out_failed)
    return result, failures
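A minimal usage sketch for the function above (the FASTA path, taxonomy strings, and label are hypothetical; the imports mirror what the function itself relies on):

import pandas as pd
from q2_types.feature_data import DNAFASTAFormat

# hypothetical inputs: a reference FASTA plus a taxonomy covering its IDs
seqs = DNAFASTAFormat('ref-seqs.fasta', mode='r')
taxonomy = pd.Series({'seq1': 'k__Bacteria; p__Firmicutes',
                      'seq2': 'k__Bacteria; p__Proteobacteria'})

# keep Firmicutes reads only if they are at least 1200 bp long; reads with
# no matching label (and no global bounds) pass straight through
kept, removed = filter_seqs_length_by_taxon(
    seqs, taxonomy, labels=['p__Firmicutes'], min_lens=[1200])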
Example 2
def exclude_seqs(
    query_sequences: DNAFASTAFormat,
    reference_sequences: DNAFASTAFormat,
    method: str = 'blast',
    perc_identity: float = 0.97,
    evalue: float = None,
    perc_query_aligned: float = 0.97,
    threads: int = 1,
    left_justify: bool = False,
) -> (pd.Series, pd.Series):

    if left_justify and (method not in left_justify_supported_methods):
        raise ValueError("Enabling left_justify is not compatible with "
                         "method=%r, check the documentation for valid "
                         "combinations" % (method, ))
    # BLAST query seqs vs. ref db of contaminants (or targets)
    hit_ids = _search_seqs(query_sequences,
                           reference_sequences,
                           evalue=evalue,
                           perc_identity=perc_identity,
                           threads=threads,
                           perc_query_aligned=perc_query_aligned,
                           method=method,
                           left_justify=left_justify)

    # convert query_sequences to series for filtering
    query_series = query_sequences.view(pd.Series)

    # if no hits are in hit_ids, return empty hits and query_series as misses
    if len(hit_ids) < 1:
        hits_seqs = pd.Series(dtype='string')
        return hits_seqs, query_series
    # if all query seqs are hits, return query_series as hits and empty misses
    elif len(hit_ids) == len(query_series):
        misses_seqs = pd.Series(dtype='string')
        return query_series, misses_seqs
    # otherwise filter seqs from seq file
    else:
        hits_seqs = {}
        misses_seqs = {}
        for seq_id, seq in query_series.items():
            seq = str(seq)
            if seq_id in hit_ids:
                hits_seqs[seq_id] = seq
            else:
                misses_seqs[seq_id] = seq
        return (pd.Series(hits_seqs, dtype='string'),
                pd.Series(misses_seqs, dtype='string'))
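A minimal usage sketch (hypothetical file paths; with method='blast' the function shells out to a local BLAST+ installation, which must be on the PATH):

from q2_types.feature_data import DNAFASTAFormat

query = DNAFASTAFormat('query-seqs.fasta', mode='r')            # hypothetical
reference = DNAFASTAFormat('contaminant-seqs.fasta', mode='r')  # hypothetical

# split the query into sequences that do / do not hit the reference
hits, misses = exclude_seqs(query, reference,
                            method='blast', perc_identity=0.97)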
Example 3
def orient_seqs(
    sequences: DNAFASTAFormat,
    reference_sequences: DNAFASTAFormat,
    perc_identity: float = 0.9,
    query_cov: float = 0.9,
    threads: int = 1,
    left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
    matched_temp, notmatched = DNAFASTAFormat(), DNAFASTAFormat()
    # use vsearch to search query seqs against reference database
    # report orientation of query seqs relative to reference seqs.
    with tempfile.NamedTemporaryFile() as out:
        # note: qmask is disabled because DNAFASTAFormat requires all output
        # seqs to be uppercase. The alternative, keeping masking and looping
        # through the output with skbio to re-uppercase, needs a second pass.
        cmd = [
            'vsearch', '--usearch_global', str(sequences),
            '--matched', str(matched_temp),
            '--notmatched', str(notmatched),
            '--db', str(reference_sequences),
            '--id', str(perc_identity),
            '--maxaccepts', '1',
            '--strand', 'both',
            '--qmask', 'none',
            '--query_cov', str(query_cov),
            '--threads', str(threads),
            '--userfields', 'qstrand',
            '--userout', out.name,
        ]
        if left_justify:
            cmd.append('--leftjust')
        run_command(cmd)
        with open(out.name, 'r') as orient:
            orientations = [line.strip() for line in orient]

    # if any query seqs are in reverse orientation, reverse complement
    if '-' in orientations:
        matched = DNAFASTAFormat()
        with matched.open() as out_fasta:
            for seq, orientation in zip(matched_temp.view(DNAIterator),
                                        orientations):
                if orientation == '+':
                    seq.write(out_fasta)
                elif orientation == '-':
                    seq.reverse_complement().write(out_fasta)
    else:
        matched = matched_temp

    return matched, notmatched
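A minimal usage sketch (hypothetical file paths; vsearch must be installed, since the function shells out to it via run_command):

from q2_types.feature_data import DNAFASTAFormat

seqs = DNAFASTAFormat('asv-seqs.fasta', mode='r')  # hypothetical path
ref = DNAFASTAFormat('ref-seqs.fasta', mode='r')   # hypothetical path

# reorient the ASVs against the reference; reads that matched on the minus
# strand come back reverse complemented in the first output
matched, unmatched = orient_seqs(seqs, ref, perc_identity=0.9, query_cov=0.9)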
Example 4
def prepare_extracted_region(sequences: DNAFASTAFormat,
                             region: str,
                             trim_length: int,
                             fwd_primer: str,
                             rev_primer: str,
                             reverse_complement_rev: bool = True,
                             reverse_complement_result: bool = False,
                             chunk_size: int = 10000,
                             debug: bool = False,
                             n_workers: int = 1,
                             client_address: str = None,
                             ) -> (DNAFASTAFormat, pd.DataFrame):
    """
    Prepares and extracted database for regional alignment

    This function takes an amplified region of the database, expands the
    degenerate sequences and collapses the duplciated sequences under a 
    single id that can be untangled later.

    Parameters
    ----------
    sequences: q2_type.DNAFASTAFormat
        The regional sequences to be collapsed
    region: str
        A unique name for the region being handled
    trim_length : int
        The length of final sequences to matched the trimmed kmers for 
        kmer-based alignment.
    chunk_size: int, optional
        The number of sequences to group for analysis
    debug: bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` superceeds all options
    n_workers: int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster 
        will be able to access all avaliable resources.

    Returns
    -------
    q2_types.DNAFASTAFormat
        The reads with degenerate nucleotides expanded and duplicated 
        sequences collapsed.
    DataFrame
        A mapping between the kmer sequence name and the the full database 
        sequence name, along with regional information
    """

    # Sets up the dask client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Reverse complements the reverse primer
    if reverse_complement_rev:
        rev_primer = str(DNA(rev_primer).reverse_complement())

    # Reads in the sequences
    sequences = sequences.view(DNAIterator)
    seq_blocks = [dask.delayed(_block_seqs)(seq)
                  for seq in _chunks(sequences, int(chunk_size))]
    # Makes the fake extraction position based on the trim length
    fragment = [dask.delayed(_artifical_trim)(seq, trim_length) 
                for seq in seq_blocks]
    # Prepares the amplicon for collapsing
    condensed = dd.from_delayed([
        dask.delayed(_condense_seqs)(seq) for seq in fragment],
        meta=[('amplicon', 'str'), ('seq-name', 'str')]
    )
    # Collapses the duplicated sequences and writes the collapsed fasta
    ff, group2 = _collapse_all_sequences(condensed, reverse_complement_result)
    ids = _expand_ids(group2, fwd_primer, rev_primer, region, trim_length,
                      chunk_size)

    return (ff, ids.compute().set_index('db-seq').sort_index())
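A minimal usage sketch (hypothetical path; the 515F/806R primer pair is only illustrative, and debug=True runs the dask steps without a client):

from q2_types.feature_data import DNAFASTAFormat

extracted = DNAFASTAFormat('v4-extracted.fasta', mode='r')  # hypothetical

# collapse the extracted V4 reads into 250 bp kmers without a dask client
kmers, id_map = prepare_extracted_region(
    extracted, region='V4', trim_length=250,
    fwd_primer='GTGYCAGCMGCCGCGGTAA',   # illustrative 515F primer
    rev_primer='GGACTACNVGGGTWTCTAAT',  # illustrative 806R primer
    debug=True)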
Example 5
def align_regional_kmers(kmers: DNAFASTAFormat,
                         rep_seq: pd.Series,
                         region: str,
                         max_mismatch: int = 2,
                         chunk_size: int = 100,
                         debug: bool = False,
                         n_workers: int = 1,
                         client_address: str = None) -> KmerAlignFormat:
    """
    Performs regional alignment between database "kmers" and ASVs

    Parameters
    ----------
    kmers : DNAFastaFormat
        The set of reference sequences extracted from the database. These are
        assumes to be start in the same position of the 16s rRNA sequence as 
        the sequence being tested and assumed to be the same length as the
        ASVs being aligned.
    rep_seq: DNAFastaFormat
        The representative sequences for the regional ASV table being aligned.
        These are assumed to start at the same position as the kmers and 
        should be trimmed to the same length.
    region: str
        An identifier for the region. Ideally, this matches the identifier 
        used in the reference region map
    max_mismatch: int
        the maximum number of mismatched nucleotides allowed in mapping 
        between a sequence and kmer.
    debug: bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` superceeds all options
    n_workers: int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster 
        will be able to access all avaliable resources.

    Returns
    -------
    DataFrame
        A mapping between the kmer (`kmer`) and the asv (`asv`), including 
        the region (`region`), number of mismatched basepairs (`mismatch`) and 
        the sequence length (`length`).
    DNAFASTAFormat
        The ASVs which could not be aligned to kmers 

    """
    # Sets up the dask client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Converts the representative sequences to a delayed object
    num_asvs, asv_length = _check_read_lengths(rep_seq, 'rep_seq')
    rep_seq_ids = rep_seq.index.values
    rep_seq = dd.from_pandas(rep_seq.astype(str),
                             chunksize=chunk_size)

    ff = KmerAlignFormat()

    # Performs the alignment
    for i, batch in enumerate(_chunks(kmers.view(DNAIterator),
                                      chunk_size * 100)):
        batch = pd.Series({s.metadata['id']: str(s) for s in batch})
        if i == 0:
            num_kmers, kmer_length = _check_read_lengths(batch, 'kmer')
            if kmer_length != asv_length:
                raise ValueError('The kmer and ASV sequences must be the'
                                 ' same length')
        batch = dd.from_pandas(batch, chunksize=chunk_size)

        aligned_batch = np.hstack([
            dask.delayed(_align_kmers)(kmer, asv, max_mismatch)
            for kmer, asv in it.product(batch.to_delayed(),
                                        rep_seq.to_delayed())
            ])

        aligned_batch = pd.concat(axis=0, objs=dask.compute(*aligned_batch))

        aligned_batch['region'] = region
        aligned_batch['max-mismatch'] = max_mismatch
        if i == 0:
            aligned_batch.to_csv(str(ff), sep='\t', index=False, mode='w')
        else:
            aligned_batch.to_csv(str(ff), sep='\t', index=False,
                                 header=False, mode='a')

    return ff
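A minimal usage sketch (hypothetical path and toy sequences; the representative sequences must already be trimmed to the same length as the kmers, and debug=True again skips the dask client):

import pandas as pd
from q2_types.feature_data import DNAFASTAFormat

kmers = DNAFASTAFormat('v4-kmers.fasta', mode='r')  # hypothetical path
rep_seq = pd.Series({'asv01': 'GTGCCAGCAGCCGCGGTAA',    # toy reads, trimmed
                     'asv02': 'GTGCCAGCCGCCGCGGTAA'})   # to the kmer length

# align the regional ASVs against the database kmers, allowing up to two
# mismatches per pairing
alignment = align_regional_kmers(kmers, rep_seq, region='V4',
                                 max_mismatch=2, debug=True)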