def filter_seqs_length_by_taxon(
        sequences: DNAFASTAFormat,
        taxonomy: pd.Series,
        labels: str,
        min_lens: int = None,
        max_lens: int = None,
        global_min: int = None,
        global_max: int = None) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Split sequences into length-filter passes and failures, by taxon.

    Every label found in a sequence's taxonomy entry is handed to
    ``_seq_length_within_range`` together with the per-label thresholds and
    the optional global bounds. A sequence that matches no label is written
    straight to the passing output when no global bound is set.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Sequences to filter. Iterated twice: once to validate IDs, once to
        apply the filters.
    taxonomy : pd.Series
        Taxonomy entries, looked up by sequence ID.
    labels : sequence of str
        Search terms tested with ``in`` against each taxonomy entry.
    min_lens, max_lens : sequence of int, optional
        Per-label thresholds, paired positionally with ``labels``. At least
        one of the two must be given.
    global_min, global_max : int, optional
        Bounds forwarded to ``_seq_length_within_range`` for every sequence
        that gets a length check.

    Returns
    -------
    (DNAFASTAFormat, DNAFASTAFormat)
        Sequences that passed, and sequences that failed the length check.

    Raises
    ------
    ValueError
        If neither ``min_lens`` nor ``max_lens`` is given, or if a supplied
        threshold list differs in length from ``labels``.
    """
    # At least one per-taxon threshold list is mandatory.
    if min_lens is None and max_lens is None:
        raise ValueError(ERROR_FILTER_OPTIONS + 'min_lens, max_lens.')

    # First pass over the sequences: collect IDs and check them against the
    # taxonomy index before doing any real work (the original notes this
    # should only take a few seconds).
    observed_ids = set()
    for record in sequences.view(DNAIterator):
        observed_ids.add(record.metadata['id'])
    _index_is_superset(observed_ids, set(taxonomy.index))

    # Build label -> threshold lookups; a missing list stays None.
    mins = None
    maxs = None
    if min_lens is not None:
        if len(labels) != len(min_lens):
            raise ValueError(
                'labels and min_lens must contain the same number of '
                'elements')
        mins = dict(zip(labels, min_lens))
    if max_lens is not None:
        if len(labels) != len(max_lens):
            raise ValueError(
                'labels and max_lens must contain the same number of '
                'elements')
        maxs = dict(zip(labels, max_lens))

    no_global_bounds = global_min is None and global_max is None

    # Second pass: stream the sequences and route each to one output.
    result = DNAFASTAFormat()
    failures = DNAFASTAFormat()
    with result.open() as passed_fh, failures.open() as failed_fh:
        for record in sequences.view(DNAIterator):
            # Taxon-based filtering is always used, so the taxonomy entry
            # is required for every sequence.
            lineage = taxonomy[record.metadata['id']]
            # All matching labels are forwarded; per the original note,
            # _seq_length_within_range picks the most stringent thresholds.
            matched_labels = [term for term in labels if term in lineage]

            if no_global_bounds and not any(matched_labels):
                # Nothing applies to this sequence: keep it unchecked.
                keep = True
            else:
                keep = _seq_length_within_range(
                    record, matched_labels, mins, maxs,
                    global_min, global_max)

            record.write(passed_fh if keep else failed_fh)

    return result, failures
def exclude_seqs(
        query_sequences: DNAFASTAFormat,
        reference_sequences: DNAFASTAFormat,
        method: str = 'blast',
        perc_identity: float = 0.97,
        evalue: float = None,
        perc_query_aligned: float = 0.97,
        threads: str = 1,
        left_justify: bool = False,
) -> (pd.Series, pd.Series):
    """Split query sequences into reference hits and misses.

    The query sequences are searched against the reference sequences with
    ``_search_seqs``; each query is then placed in the "hits" or "misses"
    series according to whether its ID is among the returned hit IDs.

    Parameters
    ----------
    query_sequences : DNAFASTAFormat
        Sequences to classify as hits or misses.
    reference_sequences : DNAFASTAFormat
        Reference database (contaminants or targets) to search against.
    method : str
        Search method, forwarded to ``_search_seqs``.
    perc_identity, evalue, perc_query_aligned, threads
        Search settings, forwarded unchanged to ``_search_seqs``.
    left_justify : bool
        Forwarded to ``_search_seqs``; only accepted when ``method`` is in
        ``left_justify_supported_methods``.

    Returns
    -------
    (pd.Series, pd.Series)
        ``(hits, misses)``, each keyed by sequence ID.

    Raises
    ------
    ValueError
        If ``left_justify`` is enabled for an unsupported ``method``.
    """
    if left_justify and method not in left_justify_supported_methods:
        raise ValueError("Enabling left_justify is not compatible with "
                         "method=%r, check the documentation for valid "
                         "combinations" % (method, ))

    # Search the query seqs against the reference db.
    hit_ids = _search_seqs(
        query_sequences, reference_sequences, evalue=evalue,
        perc_identity=perc_identity, threads=threads,
        perc_query_aligned=perc_query_aligned, method=method,
        left_justify=left_justify)

    # View the queries as a series so they can be partitioned by ID.
    query_series = query_sequences.view(pd.Series)
    n_hits = len(hit_ids)

    # Nothing hit: every query is a miss.
    if n_hits < 1:
        return pd.Series(dtype='string'), query_series

    # Everything hit: every query is a hit.
    if n_hits == len(query_series):
        return query_series, pd.Series(dtype='string')

    # Mixed outcome: route each query into the matching bucket.
    matched = {}
    unmatched = {}
    for seq_id, seq in query_series.items():
        bucket = matched if seq_id in hit_ids else unmatched
        bucket[seq_id] = str(seq)

    return (pd.Series(matched, dtype='string'),
            pd.Series(unmatched, dtype='string'))
def orient_seqs(
        sequences: DNAFASTAFormat,
        reference_sequences: DNAFASTAFormat,
        perc_identity: float = 0.9,
        query_cov: float = 0.9,
        threads: int = 1,
        left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Orient query sequences to match a reference database.

    Runs ``vsearch --usearch_global`` on both strands and reads back the
    reported strand for each matched query. When any match is reported on
    the minus strand, the matched sequences are rewritten with those
    entries reverse-complemented.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Query sequences to orient.
    reference_sequences : DNAFASTAFormat
        Reference database passed to vsearch as ``--db``.
    perc_identity : float
        Value passed to vsearch ``--id``.
    query_cov : float
        Value passed to vsearch ``--query_cov``.
    threads : int
        Value passed to vsearch ``--threads``.
    left_justify : bool
        When true, ``--leftjust`` is appended to the vsearch command.

    Returns
    -------
    (DNAFASTAFormat, DNAFASTAFormat)
        Matched sequences (re-oriented where needed) and the sequences
        vsearch reported as not matched.
    """
    matched_temp, notmatched = DNAFASTAFormat(), DNAFASTAFormat()

    # vsearch writes one strand symbol per matched query to the temp file.
    with tempfile.NamedTemporaryFile() as strand_file:
        # note: qmask is disabled because DNAFASTAFormat requires all output
        # seqs to be uppercase. The alternative would be to upper-case the
        # output seqs in a loop afterwards.
        cmd = [
            'vsearch',
            '--usearch_global', str(sequences),
            '--matched', str(matched_temp),
            '--notmatched', str(notmatched),
            '--db', str(reference_sequences),
            '--id', str(perc_identity),
            '--maxaccepts', '1',
            '--strand', 'both',
            '--qmask', 'none',
            '--query_cov', str(query_cov),
            '--threads', str(threads),
            '--userfields', 'qstrand',
            '--userout', strand_file.name,
        ]
        if left_justify:
            cmd.append('--leftjust')
        run_command(cmd)

        with open(strand_file.name, 'r') as strand_fh:
            orientations = [entry.strip() for entry in strand_fh]

    # All matches already on the plus strand: vsearch's output is final.
    if '-' not in orientations:
        return matched_temp, notmatched

    # Otherwise rewrite the matches, flipping the minus-strand ones.
    matched = DNAFASTAFormat()
    with matched.open() as out_fasta:
        paired = zip(matched_temp.view(DNAIterator), orientations)
        for seq, strand in paired:
            if strand == '-':
                seq = seq.reverse_complement()
            elif strand != '+':
                # Any other strand symbol is not written out.
                continue
            seq.write(out_fasta)

    return matched, notmatched
def prepare_extracted_region(sequences: DNAFASTAFormat,
                             region: str,
                             trim_length: int,
                             fwd_primer: str,
                             rev_primer: str,
                             reverse_complement_rev: bool = True,
                             reverse_complement_result: bool = False,
                             chunk_size: int = 10000,
                             debug: bool = False,
                             n_workers: int = 1,
                             client_address: str = None,
                             ) -> (DNAFASTAFormat, pd.DataFrame):
    """
    Prepares an extracted database for regional alignment

    This function takes an amplified region of the database, expands the
    degenerate sequences and collapses the duplicated sequences under a
    single id that can be untangled later.

    Parameters
    ----------
    sequences : q2_types.DNAFASTAFormat
        The regional sequences to be collapsed
    region : str
        A unique name for the region being handled
    trim_length : int
        The length of the final sequences, chosen to match the trimmed
        kmers used for kmer-based alignment.
    fwd_primer : str
        The forward primer; passed through to the ID expansion step.
    rev_primer : str
        The reverse primer; passed through to the ID expansion step,
        reverse complemented first when `reverse_complement_rev` is set.
    reverse_complement_rev : bool, optional
        Whether `rev_primer` should be reverse complemented before use.
    reverse_complement_result : bool, optional
        Forwarded to the sequence-collapsing step.
    chunk_size : int, optional
        The number of sequences to group for analysis
    debug : bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all options
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        Address handed to the dask client setup.

    Returns
    -------
    q2_types.DNAFASTAFormat
        The reads with degenerate nucleotides expanded and duplicated
        sequences collapsed.
    DataFrame
        A mapping between the kmer sequence name and the full database
        sequence name, along with regional information. Indexed by
        `db-seq` and sorted on that index.
    """
    # Sets up the client
    _setup_dask_client(debug=debug,
                       cluster_config=None,
                       n_workers=n_workers,
                       address=client_address)

    # The reverse primer is flipped unless the caller opted out
    if reverse_complement_rev:
        rev_primer = str(DNA(rev_primer).reverse_complement())

    # Reads the sequences in chunks, one delayed block per chunk
    records = sequences.view(DNAIterator)
    blocks = [
        dask.delayed(_block_seqs)(chunk)
        for chunk in _chunks(records, int(chunk_size))
    ]

    # Makes the fake extraction position based on the trim length
    trimmed = [
        dask.delayed(_artifical_trim)(block, trim_length)
        for block in blocks
    ]

    # Prepares the amplicons for collapsing
    condensed_parts = [
        dask.delayed(_condense_seqs)(part) for part in trimmed
    ]
    condensed = dd.from_delayed(
        condensed_parts,
        meta=[('amplicon', 'str'), ('seq-name', 'str')],
    )

    # Collapses the sequences, then expands the grouped ids into the map
    ff, grouped = _collapse_all_sequences(condensed,
                                          reverse_complement_result)
    ids = _expand_ids(grouped, fwd_primer, rev_primer, region,
                      trim_length, chunk_size)
    id_map = ids.compute().set_index('db-seq').sort_index()

    return (ff, id_map)
def align_regional_kmers(kmers: DNAFASTAFormat,
                         rep_seq: pd.Series,
                         region: str,
                         max_mismatch: int = 2,
                         chunk_size: int = 100,
                         debug: bool = False,
                         n_workers: int = 1,
                         client_address: str = None) -> KmerAlignFormat:
    """
    Performs regional alignment between database "kmers" and ASVs

    Parameters
    ----------
    kmers : DNAFASTAFormat
        The set of reference sequences extracted from the database. These
        are assumed to start at the same position of the 16s rRNA sequence
        as the sequences being tested and to be the same length as the ASVs
        being aligned.
    rep_seq : pd.Series
        The representative sequences for the regional ASV table being
        aligned. These are assumed to start at the same position as the
        kmers and should be trimmed to the same length.
    region : str
        An identifier for the region. Ideally, this matches the identifier
        used in the reference region map
    max_mismatch : int
        The maximum number of mismatched nucleotides allowed in mapping
        between a sequence and kmer.
    chunk_size : int, optional
        Partition size for the dask dataframes. Kmers are read from disk
        in batches of `chunk_size * 100` sequences.
    debug : bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all options
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        Address handed to the dask client setup.

    Returns
    -------
    KmerAlignFormat
        A tab-separated alignment table, written batch by batch. Each row
        comes from `_align_kmers` with `region` and `max-mismatch` columns
        added. The earlier documentation lists the alignment columns as
        `kmer`, `asv`, `mismatch` and `length` -- TODO confirm against
        `_align_kmers`.

    Raises
    ------
    ValueError
        If the kmer length found in the first batch differs from the ASV
        length.
    """
    # Sets up the client
    _setup_dask_client(debug=debug,
                       cluster_config=None,
                       n_workers=n_workers,
                       address=client_address)

    # Only the ASV length is needed here; the sequence count is discarded.
    _, asv_length = _check_read_lengths(rep_seq, 'rep_seq')
    rep_seq = dd.from_pandas(rep_seq.astype(str), chunksize=chunk_size)

    ff = KmerAlignFormat()

    # Performs the alignment one batch of kmers at a time
    kmer_batches = _chunks(kmers.view(DNAIterator), chunk_size * 100)
    for i, batch in enumerate(kmer_batches):
        first_batch = (i == 0)
        batch = pd.Series({s.metadata['id']: str(s) for s in batch})

        # NOTE: the length check runs on the first batch only; later
        # batches are taken to share the same kmer length.
        if first_batch:
            _, kmer_length = _check_read_lengths(batch, 'kmer')
            if kmer_length != asv_length:
                raise ValueError('The kmer and ASV sequences must be the'
                                 ' same length')

        batch = dd.from_pandas(batch, chunksize=chunk_size)

        # One delayed alignment per (kmer partition, ASV partition) pair
        aligned_batch = np.hstack([
            dask.delayed(_align_kmers)(kmer, asv, max_mismatch)
            for kmer, asv
            in it.product(batch.to_delayed(), rep_seq.to_delayed())
        ])
        aligned_batch = pd.concat(axis=0, objs=dask.compute(*aligned_batch))
        aligned_batch['region'] = region
        aligned_batch['max-mismatch'] = max_mismatch

        # The first batch creates the file with its header; later batches
        # append rows only.
        aligned_batch.to_csv(str(ff),
                             sep='\t',
                             index=False,
                             header=first_batch,
                             mode='w' if first_batch else 'a')

    return ff