def filter_seqs_length_by_taxon(
        sequences: DNAFASTAFormat,
        taxonomy: pd.Series,
        labels: str,
        min_lens: int = None,
        max_lens: int = None,
        global_min: int = None,
        global_max: int = None) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Split sequences into length-filter passes and failures, per taxon.

    Each entry of ``labels`` is a search term that is tested against a
    sequence's taxonomy entry with ``in``. ``min_lens`` / ``max_lens`` are
    paired positionally with ``labels``. All matching terms are handed to
    ``_seq_length_within_range``, which decides whether the sequence
    passes (per the original note, it applies the most stringent matching
    thresholds).

    Parameters:
        sequences: sequences to filter.
        taxonomy: taxonomy entries indexed by sequence ID.
        labels: search terms, one per threshold.
        min_lens: minimum lengths, same number of elements as ``labels``.
        max_lens: maximum lengths, same number of elements as ``labels``.
        global_min: minimum length forwarded to the range check.
        global_max: maximum length forwarded to the range check.

    Returns:
        (passing sequences, failing sequences)

    Raises:
        ValueError: if neither ``min_lens`` nor ``max_lens`` is given, or
            if either has a different number of elements than ``labels``.
    """
    # At least one taxon-specific threshold list is mandatory.
    if min_lens is None and max_lens is None:
        raise ValueError(ERROR_FILTER_OPTIONS + 'min_lens, max_lens.')

    # First pass over the sequences: every sequence ID is checked against
    # the taxonomy index before any filtering work is done.
    seq_ids = {rec.metadata['id'] for rec in sequences.view(DNAIterator)}
    _index_is_superset(seq_ids, set(taxonomy.index))

    # Build the label -> threshold lookups (left as None when not given).
    mins = None
    if min_lens is not None:
        if len(labels) != len(min_lens):
            raise ValueError(
                'labels and min_lens must contain the same number of elements')
        mins = dict(zip(labels, min_lens))

    maxs = None
    if max_lens is not None:
        if len(labels) != len(max_lens):
            raise ValueError(
                'labels and max_lens must contain the same number of elements')
        maxs = dict(zip(labels, max_lens))

    no_global_filter = global_min is None and global_max is None

    # Second pass: stream each sequence into exactly one of the outputs.
    result = DNAFASTAFormat()
    failures = DNAFASTAFormat()
    with result.open() as passed_fh, failures.open() as failed_fh:
        for record in sequences.view(DNAIterator):
            # taxon-based filtering is always used, so look up the taxon
            lineage = taxonomy[record.metadata['id']]
            taxahits = [term for term in labels if term in lineage]
            # With no matching term and no global filter there is nothing
            # to check; otherwise the length check decides.
            unfiltered = not any(taxahits) and no_global_filter
            if unfiltered or _seq_length_within_range(
                    record, taxahits, mins, maxs, global_min, global_max):
                record.write(passed_fh)
            else:
                record.write(failed_fh)
    return result, failures
def scaffold_hybrid_tree_foundation_tree(
        otu_map: OtuMapFormat,
        extension_taxonomy: TSVTaxonomyFormat,
        extension_sequences: DNAFASTAFormat,
        foundation_tree: NewickFormat,
        foundation_taxonomy: TSVTaxonomyFormat,
        graft_level: str = _ghost_tree_defaults['graft_level'],
) -> NewickFormat:
    """Build a hybrid tree by grafting extensions onto a foundation tree.

    Opens every input format, passes the handles to
    ``extensions_onto_foundation`` and writes the first element of its
    return value into a new ``NewickFormat``.

    Parameters:
        otu_map: OTU map for the extension sequences.
        extension_taxonomy: taxonomy of the extension sequences.
        extension_sequences: extension sequences.
        foundation_tree: foundation tree.
        foundation_taxonomy: foundation taxonomy; when falsy, ``None`` is
            passed on in place of a file handle.
        graft_level: forwarded unchanged to ``extensions_onto_foundation``.

    Returns:
        NewickFormat holding the hybrid tree.
    """
    from contextlib import ExitStack

    # ExitStack guarantees that every input handle is closed and the
    # scratch directory removed, even if ghost-tree raises part-way.
    with ExitStack() as stack:
        otu_map_fh = stack.enter_context(otu_map.open())
        extension_taxonomy_fh = stack.enter_context(
            extension_taxonomy.open())
        extension_sequences_fh = stack.enter_context(
            extension_sequences.open())
        foundation_alignment_fh = stack.enter_context(foundation_tree.open())
        if foundation_taxonomy:
            foundation_taxonomy_fh = stack.enter_context(
                foundation_taxonomy.open())
        else:
            foundation_taxonomy_fh = None

        tmp = stack.enter_context(tempfile.TemporaryDirectory())
        # Scratch path handed to ghost-tree; it lives inside the temporary
        # directory so nothing it creates there outlives this block.
        gt_path = os.path.join(tmp, 'ghost_tree')
        thetree = extensions_onto_foundation(otu_map_fh,
                                             extension_taxonomy_fh,
                                             extension_sequences_fh,
                                             foundation_alignment_fh,
                                             gt_path,
                                             graft_level,
                                             foundation_taxonomy_fh)[0]

    # Write into a format-managed file instead of a hand-built path next
    # to the temporary directory, which was never cleaned up.
    result = NewickFormat()
    with result.open() as out_fh:
        out_fh.write(thetree)
    return result
def _process_primers(primer_fwd: Union[str, None],
                     primer_rev: Union[str, None]) -> DNAFASTAFormat:
    """
    Write the supplied primers to a FASTA file as DNA records.

    The reverse primer, when given, is stored reverse-complemented.
    A primer that is ``None`` or empty is left out of the output.

    Arguments:
        primer_fwd (str, None): forward primer
        primer_rev (str, None): reverse primer

    Returns:
        primers_fasta (DNAFASTAFormat): primers in FASTA format
    """
    forward = None
    if primer_fwd:
        forward = DNA(primer_fwd, metadata={'id': 'forward'})

    reverse = None
    if primer_rev:
        reverse = DNA(
            primer_rev, metadata={'id': 'reverse'}).reverse_complement()

    # FASTA is the form in which the primers are later passed to mafft_add
    primers_fasta = DNAFASTAFormat()
    with primers_fasta.open() as out:
        # forward first, then reverse
        for primer in (forward, reverse):
            if primer:
                primer.write(out)
    return primers_fasta
def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """Transform a GISAID DNA FASTA file into a DNA sequences directory.

    The records produced by ``_read_gisaid_dna_fasta`` are written out as
    FASTA and that file is stored in a new directory format.
    """
    records = _read_gisaid_dna_fasta(str(fmt))

    fasta = DNAFASTAFormat()
    with fasta.open() as fasta_fh:
        skbio.io.write(records, format='fasta', into=fasta_fh)

    dir_fmt = DNASequencesDirectoryFormat()
    dir_fmt.file.write_data(fasta, DNAFASTAFormat)
    return dir_fmt
def _split_fasta(sequences, train_ids, test_ids):
    """
    Partition a FeatureData[Sequence] artifact into train and test parts.

    A sequence whose ID is in ``train_ids`` goes to the train output;
    otherwise, if its ID is in ``test_ids``, it goes to the test output.
    Sequences in neither set are dropped.

    sequences: FeatureData[Sequence] Artifact
    train_ids: set
    test_ids: set

    Returns the two FASTA files imported as FeatureData[Sequence]
    artifacts, in the order (train, test).
    """
    train_fasta = DNAFASTAFormat()
    test_fasta = DNAFASTAFormat()
    with train_fasta.open() as train_fh, test_fasta.open() as test_fh:
        for record in sequences.view(DNAIterator):
            seq_id = record.metadata['id']
            # train membership takes precedence over test membership
            if seq_id in train_ids:
                target = train_fh
            elif seq_id in test_ids:
                target = test_fh
            else:
                continue
            target.write('>%s\n%s\n' % (seq_id, str(record)))

    train_artifact = q2.Artifact.import_data(
        'FeatureData[Sequence]', train_fasta)
    test_artifact = q2.Artifact.import_data(
        'FeatureData[Sequence]', test_fasta)
    return train_artifact, test_artifact
def cull_seqs(sequences: DNAIterator,
              num_degenerates: int = 5,
              homopolymer_length: int = 8) -> DNAFASTAFormat:
    """Keep only sequences that pass the degenerate and homopolymer checks.

    A sequence is written out only when both
    ``_filt_seq_with_degenerates`` and ``_filter_homopolymer`` return a
    falsy value for it. The homopolymer check is run only for sequences
    that passed the degenerate check.

    Parameters:
        sequences: sequences to screen.
        num_degenerates: threshold forwarded to the degenerate check.
        homopolymer_length: threshold forwarded to the homopolymer check.

    Returns:
        DNAFASTAFormat containing the retained sequences.
    """
    result = DNAFASTAFormat()
    with result.open() as fasta_out:
        for record in sequences:
            if _filt_seq_with_degenerates(record, num_degenerates):
                continue
            if _filter_homopolymer(record, homopolymer_length):
                continue
            # survived both checks
            record.write(fasta_out)
    return result
def degap_seqs(aligned_sequences: AlignedDNAIterator,
               min_length: int = 1) -> DNAFASTAFormat:
    """Strip gaps from aligned sequences and drop the ones left too short.

    Parameters:
        aligned_sequences: aligned sequences to degap.
        min_length: minimum degapped length a sequence needs in order to
            be written out.

    Returns:
        DNAFASTAFormat containing the degapped sequences that were kept.
    """
    result = DNAFASTAFormat()
    with result.open() as fasta_out:
        for aligned in aligned_sequences:
            ungapped = aligned.degap()
            # An all-gap sequence degaps to length zero, so the default
            # min_length of 1 keeps it out of the output.
            if len(ungapped) < min_length:
                continue
            ungapped.write(fasta_out)
    return result
def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """Transform a GISAID DNA FASTA file into a DNA sequences directory.

    ``_read_gisaid_dna_fasta`` is given a temporary text file handle in
    addition to the input path; its records are written out as FASTA
    while that handle is still open, and the FASTA file is then stored in
    a new directory format.
    """
    fasta = DNAFASTAFormat()
    with fasta.open() as fasta_fh:
        # Parsing and writing both happen inside this block, so the
        # temporary handle stays open for as long as the records are
        # being consumed.
        with tempfile.TemporaryFile(mode='w+') as scratch_fh:
            records = _read_gisaid_dna_fasta(str(fmt), scratch_fh)
            skbio.io.write(records, format='fasta', into=fasta_fh)

    dir_fmt = DNASequencesDirectoryFormat()
    dir_fmt.file.write_data(fasta, DNAFASTAFormat)
    return dir_fmt
def _convert_seq_block_to_dna_fasta_format(seqs):
    """
    Concatenate sequence blocks and write them out as DNA FASTA.

    The blocks are stacked row-wise with ``pd.concat``, missing cells are
    filled with empty strings, and the cells of each row are joined into
    one string. Each row becomes one record whose ID is the row label.
    """
    def _join_row(row):
        return ''.join(row)

    stacked = pd.concat(axis=0, sort=True, objs=seqs).fillna('')
    joined = stacked.apply(_join_row, axis=1)

    fasta = DNAFASTAFormat()
    with fasta.open() as fasta_fh:
        for seq_id, residues in joined.items():
            record = skbio.DNA(residues, metadata={'id': seq_id})
            skbio.io.write(record, format='fasta', into=fasta_fh)
    return fasta
def extensions_cluster(extension_sequences: DNAFASTAFormat,
                       similarity_threshold: str) -> OtuMapFormat:
    """Cluster extension sequences and return the resulting OTU map.

    Runs ``preprocess_extension_tree_sequences`` on the input sequence
    file, letting it write to a scratch path in a temporary directory,
    then copies that output into a new ``OtuMapFormat``.

    Parameters:
        extension_sequences: sequences to cluster.
        similarity_threshold: forwarded (as a string) to the clustering
            step.

    Returns:
        OtuMapFormat holding the OTU map that was produced.
    """
    result = OtuMapFormat()
    with tempfile.TemporaryDirectory() as tmp:
        # Scratch output path inside the temporary directory; removed
        # together with the directory when this block ends.
        gt_path = os.path.join(tmp, 'otu_map')
        # The handle is only needed for its file name; close it promptly
        # instead of leaking it.
        with extension_sequences.open() as extension_sequences_fh:
            preprocess_extension_tree_sequences(
                str(extension_sequences_fh.name),
                str(similarity_threshold),
                gt_path)
        # Copy into the format-managed file rather than to a hand-built
        # path beside the temporary directory, which was never cleaned up.
        copyfile(gt_path, str(result))
    return result
def _dereplicate_taxa(taxa, raw_seqs, derep_seqs, uc, mode):
    """Derive taxonomy (and sequences) for dereplicated clusters.

    Parameters:
        taxa: taxonomy indexed by sequence ID (looked up with ``.loc``).
        raw_seqs: the original sequences; ``str(raw_seqs)`` is read as a
            FASTA file in ``'uniq'`` mode.
        derep_seqs: dereplicated sequences, returned unchanged in every
            mode other than ``'uniq'``.
        uc: DataFrame with ``seqID`` and ``centroidID`` columns. It is
            not modified; a private copy is annotated instead.
        mode: ``'uniq'``, ``'lca'``, ``'super'`` or ``'majority'``. Any
            other value returns, per centroid, the plain list of member
            taxa.

    Returns:
        (derep_taxa, seqs_out), with ``derep_taxa`` indexed by
        ``'Feature ID'``.
    """
    # Work on a private copy: the columns added below must not appear on
    # the caller's frame, and assigning onto a filtered slice triggers
    # pandas' SettingWithCopy warning.
    if mode == 'uniq':
        # only uniq mode needs the centroid IDs and the non-centroid hits
        centroid_ids = set(uc['centroidID'].unique())
        uc = uc[uc['seqID'] != uc['centroidID']].copy()
    else:
        uc = uc.copy()

    # map to taxonomy labels
    uc['Taxon'] = uc['seqID'].apply(lambda x: taxa.loc[x])
    uc['centroidtaxa'] = uc['centroidID'].apply(lambda x: taxa.loc[x])

    if mode == 'uniq':
        # keep only hits whose taxonomy differs from their centroid's
        rereplicates = uc[uc['Taxon'] != uc['centroidtaxa']]
        # drop duplicates that share centroid ID and taxa assignment
        rereplicates = rereplicates.drop_duplicates(['centroidID', 'Taxon'])
        # grab associated seqs
        rereplicate_ids = centroid_ids.union(rereplicates['seqID'].unique())
        # write out seqs for centroids and daughters with unique taxonomies
        seqs_out = DNAFASTAFormat()
        with seqs_out.open() as out_fasta:
            seq_fp = str(raw_seqs)
            for s in skbio.read(seq_fp, format='fasta',
                                constructor=skbio.DNA):
                if s.metadata['id'] in rereplicate_ids:
                    s.write(out_fasta)
        # generate list of dereplicated taxa
        derep_taxa = taxa.reindex(rereplicate_ids)
    else:
        # group seqs that share centroids (this includes the centroid)
        derep_taxa = uc.groupby('centroidID')['Taxon'].apply(
            lambda x: list(x))
        # find LCA within each cluster
        if mode == 'lca':
            derep_taxa = derep_taxa.apply(lambda x: ';'.join(
                _find_lca([y.split(';') for y in x]))).to_frame()
        # find majority superset LCA within each cluster
        elif mode == 'super':
            derep_taxa = derep_taxa.apply(lambda x: ';'.join(
                _find_super_lca([y.split(';') for y in x]))).to_frame()
        # find majority taxon within each cluster
        elif mode == 'majority':
            derep_taxa = derep_taxa.apply(lambda x: _majority(x)).to_frame()
        # these modes do nothing with the seqs
        seqs_out = derep_seqs

    # the index must be named 'Feature ID' to satisfy type validation
    derep_taxa.index.name = 'Feature ID'
    return derep_taxa, seqs_out
def orient_seqs(
        sequences: DNAFASTAFormat,
        reference_sequences: DNAFASTAFormat,
        perc_identity: float = 0.9,
        query_cov: float = 0.9,
        threads: int = 1,
        left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Orient query sequences to match a reference database.

    Runs ``vsearch --usearch_global`` on both strands with one accepted
    hit per query, and reads back the strand reported for each hit. If
    any hit is on the minus strand, the matched sequences are rewritten
    with those entries reverse-complemented; otherwise the vsearch
    matched output is returned as is.

    Parameters:
        sequences: query sequences.
        reference_sequences: reference database.
        perc_identity: value passed to ``--id``.
        query_cov: value passed to ``--query_cov``.
        threads: value passed to ``--threads``.
        left_justify: when true, ``--leftjust`` is added to the command.

    Returns:
        (matched sequences, unmatched sequences)
    """
    raw_matched = DNAFASTAFormat()
    notmatched = DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as strand_file:
        # --qmask none: DNAFASTAFormat requires uppercase output, and
        # disabling masking avoids a separate pass to upper-case the
        # sequences afterwards.
        cmd = ['vsearch',
               '--usearch_global', str(sequences),
               '--matched', str(raw_matched),
               '--notmatched', str(notmatched),
               '--db', str(reference_sequences),
               '--id', str(perc_identity),
               '--maxaccepts', '1',
               '--strand', 'both',
               '--qmask', 'none',
               '--query_cov', str(query_cov),
               '--threads', str(threads),
               '--userfields', 'qstrand',
               '--userout', strand_file.name]
        if left_justify:
            cmd += ['--leftjust']
        run_command(cmd)

        # one reported strand per line of the user-defined output
        with open(strand_file.name, 'r') as strand_fh:
            orientations = [row.strip() for row in strand_fh]

    # Nothing on the minus strand: the vsearch output is already oriented.
    if '-' not in orientations:
        return raw_matched, notmatched

    matched = DNAFASTAFormat()
    with matched.open() as fasta_out:
        paired = zip(raw_matched.view(DNAIterator), orientations)
        for record, strand in paired:
            if strand == '-':
                record = record.reverse_complement()
            elif strand != '+':
                # any other strand value: the record is not written
                continue
            record.write(fasta_out)
    return matched, notmatched
def _rna_to_dna(path):
    """Reverse-transcribe the RNA FASTA at ``path`` into a DNA FASTA file.

    Every record yielded by ``_read_rna_fasta`` is reverse-transcribed
    and written, in order, to a new ``DNAFASTAFormat``.
    """
    dna_fasta = DNAFASTAFormat()
    with dna_fasta.open() as fasta_out:
        for rna_record in _read_rna_fasta(path):
            dna_record = rna_record.reverse_transcribe()
            dna_record.write(fasta_out)
    return dna_fasta