def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
                             perc_identity: float, threads: int = 1
                             ) -> (biom.Table, DNAFASTAFormat):
    """Cluster features de novo with ``vsearch --cluster_size``.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Sequences to cluster; its path is handed to ``_fasta_with_sizes``.
    table : biom.Table
        Feature table. It is passed to ``_fasta_with_sizes`` and is later
        collapsed along its observation axis.
    perc_identity : float
        Value forwarded to vsearch's ``--id`` option.
    threads : int, optional
        Value forwarded to vsearch's ``--threads`` option.

    Returns
    -------
    (biom.Table, DNAFASTAFormat)
        The collapsed table and the file that received vsearch's
        ``--centroids`` output.
    """
    centroid_seqs = DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as sized_fasta, \
            tempfile.NamedTemporaryFile() as uc_file:
        # Presumably writes the input sequences annotated with abundances
        # taken from the table, as --cluster_size expects -- confirm
        # against the helper.
        _fasta_with_sizes(str(sequences), sized_fasta.name, table)

        vsearch_args = [
            'vsearch',
            '--cluster_size', sized_fasta.name,
            '--id', str(perc_identity),
            '--centroids', str(centroid_seqs),
            '--uc', uc_file.name,
            '--qmask', 'none',  # ensures no lowercase DNA chars
            '--xsize',
            '--threads', str(threads),
        ]
        run_command(vsearch_args)

        # vsearch wrote to the file by name; rewind our own handle before
        # handing it to the parser.
        uc_file.seek(0)
        db = _uc_to_sqlite(uc_file)
        table = table.collapse(_collapse_f_from_sqlite(db),
                               norm=False,
                               min_group_size=1,
                               axis='observation',
                               include_collapsed_metadata=False)

    return table, centroid_seqs
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn,
          mode: str) -> biom.Table:
    """Group IDs along one table axis by a categorical metadata column.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        Axis to group. ``'feature'`` is translated to the BIOM axis name
        ``'observation'``; any other value is passed to BIOM unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Column supplying the group value for each ID on the axis.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Grouped table whose axis is ordered by first appearance of each
        group value in the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    column = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                    axis)

    def _group_of(axis_id, _):
        return column.get_value(axis_id)

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = column.to_series().unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    normalize : bool, optional
        Whether normalize per-target counts by number of targets per source.

    Returns
    -------
    biom.Table
        Collapsed BIOM table. If no feature of the table is a key of
        ``mapping``, the (empty) filtered table is returned as is.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    def _is_mapped(data, id_, md):
        return id_ in mapping

    # keep only the features that have at least one mapping entry
    table = table.filter(_is_mapped, axis='observation', inplace=False)
    if table.is_empty():
        return table

    # carry each feature's targets as observation metadata so the collapse
    # callback can read them
    targets_md = {source: dict(part=targets)
                  for source, targets in mapping.items()}
    table.add_metadata(targets_md, axis='observation')

    def _targets(id_, md):
        # each target is paired with itself, as in the original callback
        return zip(md['part'], md['part'])

    mode = 'divide' if normalize else 'add'
    table = table.collapse(_targets,
                           norm=False,
                           one_to_many=True,
                           axis='observation',
                           one_to_many_mode=mode)

    # round to integers
    if normalize:
        round_biom(table)

    # clean up ('Path' is presumably metadata left by the one-to-many
    # collapse -- confirm against BIOM)
    table.del_metadata(keys=['Path'])
    return table
def cluster_features(represenative_seqs: DNAFASTAFormat, table: biom.Table,
                     id: float) -> (biom.Table, DNAFASTAFormat):
    """Cluster sequences with ``vsearch --cluster_fast`` and collapse a table.

    Parameters
    ----------
    represenative_seqs : DNAFASTAFormat
        Sequences to cluster; its path is given to ``--cluster_fast``.
    table : biom.Table
        Feature table to collapse along its observation axis.
    id : float
        Value forwarded to vsearch's ``--id`` option.

    Returns
    -------
    (biom.Table, DNAFASTAFormat)
        The collapsed table and the file that received vsearch's
        ``--centroids`` output.
    """
    centroid_seqs = DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as uc_file:
        vsearch_args = [
            'vsearch',
            '--cluster_fast', str(represenative_seqs),
            '--id', str(id),
            '--centroids', str(centroid_seqs),
            '--uc', uc_file.name,
        ]
        run_command(vsearch_args)

        # vsearch wrote to the file by name; rewind our own handle before
        # handing it to the parser.
        uc_file.seek(0)
        table = table.collapse(_collapse_f_from_uc(uc_file),
                               norm=False,
                               min_group_size=1,
                               axis='observation')

    return table, centroid_seqs
def group(table: biom.Table, axis: str, metadata: qiime2.MetadataCategory,
          mode: str) -> biom.Table:
    """Group IDs along one table axis by a metadata category.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        Axis to group. ``'feature'`` is translated to the BIOM axis name
        ``'observation'``; any other value is passed to BIOM unchanged.
    metadata : qiime2.MetadataCategory
        Category supplying the group value for each ID on the axis.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Grouped table whose axis is ordered by first appearance of each
        group value in the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    group_values = _munge_metadata_category(
        metadata, table.ids(axis=biom_axis), axis)

    def _group_of(axis_id, _):
        return group_values.loc[axis_id]

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    return collapsed.sort_order(group_values.unique(), axis=biom_axis)
def trim_dada2_posthoc(
        table: biom.Table,
        representative_sequences: pd.Series,
        trim_length: int = 0,
        hashed_feature_ids: bool = True) -> (biom.Table, pd.Series):
    """Trim ASV sequences to a common length and merge identical results.

    Parameters
    ----------
    table : biom.Table
        The feature table. It is not modified; filtering and metadata
        annotation are done on a copy.
    representative_sequences : pd.Series
        The sequences which correspond to the ASV table, indexed by
        feature ID.
    trim_length : int
        The length to trim the ASVs. If the length is 0, the minimum
        sequence length will be used.
    hashed_feature_ids : bool
        Whether feature and sequence IDs should be hashed (with
        ``_hash_seq``). When False, the trimmed sequence itself is used
        as the ID.

    Returns
    -------
    biom.Table
        Table collapsed along the observation axis so that features
        sharing the same trimmed sequence are merged.
    pd.Series
        The unique trimmed sequences, re-indexed with the new IDs.

    Warns
    -----
    UserWarning
        If any sequence is shorter than the trim length; such sequences
        are discarded.
    """
    # Trims the sequences
    seq_length = representative_sequences.apply(len)
    if trim_length == 0:
        trim_length = seq_length.min()
    if (seq_length < trim_length).any():
        warnings.warn(
            "There are ASVs shorter than the trim length. "
            "These sequences will be discarded.",
            UserWarning)

    rep_seqs = representative_sequences.astype(str)
    rep_seqs = rep_seqs.loc[seq_length >= trim_length].copy()
    # One-column frame ('sequence') so it can be turned into per-ID
    # metadata dicts below.
    rep_seqs = pd.DataFrame(
        data=[rep_seqs.apply(lambda x: x[:trim_length])],
        index=['sequence']).T

    # Collapses the table based on the trimmed sequences. Work on a copy
    # (inplace=False) so the caller's table is neither filtered nor
    # annotated with the temporary 'sequence' metadata.
    table = table.filter(lambda v, id_, md: id_ in rep_seqs.index,
                         axis='observation',
                         inplace=False)
    table.add_metadata(
        rep_seqs.loc[table.ids(axis='observation')].to_dict(orient='index'),
        axis='observation')
    table2 = table.collapse(lambda id_, md: md['sequence'],
                            norm=False,
                            axis='observation')
    seqs2 = rep_seqs.drop_duplicates()['sequence'].copy()

    if hashed_feature_ids:
        # After the collapse, the observation IDs are the trimmed
        # sequences themselves, so hashing the ID hashes the sequence.
        table2.update_ids(
            {seq_: _hash_seq(seq_)
             for seq_ in table2.ids(axis='observation')},
            axis='observation',
            inplace=True)
        seqs2.rename({id_: _hash_seq(seq_) for id_, seq_ in seqs2.items()},
                     inplace=True)
    else:
        seqs2.rename({id_: seq_ for id_, seq_ in seqs2.items()},
                     inplace=True)

    return table2, seqs2
def cluster_features_closed_reference(
        sequences: DNAFASTAFormat,
        table: biom.Table,
        reference_sequences: DNAFASTAFormat,
        perc_identity: float,
        strand: str = 'plus',
        threads: int = 1) -> (biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    """Cluster features against a reference with ``vsearch --usearch_global``.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Query sequences; their IDs are checked against the table's
        observation IDs with ``_error_on_nonoverlapping_ids``.
    table : biom.Table
        Feature table. It is not modified; filtering is done on a copy.
    reference_sequences : DNAFASTAFormat
        Reference database passed to vsearch's ``--db`` option.
    perc_identity : float
        Value forwarded to vsearch's ``--id`` option.
    strand : str, optional
        Value forwarded to vsearch's ``--strand`` option.
    threads : int, optional
        Value forwarded to vsearch's ``--threads`` option.

    Returns
    -------
    (biom.Table, DNAFASTAFormat, DNAFASTAFormat)
        The table with unmatched features removed and the remaining ones
        collapsed, the matched sequences, and the unmatched sequences.

    Raises
    ------
    VSearchError
        If processing the vsearch ``--uc`` output raises ``ValueError``,
        which is reported as no matches having been identified.
    """
    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {
        e.metadata['id']
        for e in skbio.io.read(str(sequences), constructor=skbio.DNA,
                               format='fasta')
    }
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)

    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = [
            'vsearch',
            '--usearch_global', fasta_with_sizes.name,
            '--id', str(perc_identity),
            '--db', str(reference_sequences),
            '--uc', out_uc.name,
            '--strand', str(strand),
            '--qmask', 'none',  # ensures no lowercase DNA chars
            '--notmatched', tmp_unmatched_seqs.name,
            '--threads', str(threads)
        ]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences --- if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the matched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = [
                'vsearch',
                '--sortbysize', tmp_unmatched_seqs.name,
                '--xsize',
                '--output', str(unmatched_seqs)
            ]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError as err:
            # Chain the original error so the underlying cause stays
            # visible in the traceback.
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.') from err

        # Read the unmatched IDs through a context manager so the file
        # handle is closed deterministically instead of being leaked.
        with open(str(unmatched_seqs)) as unmatched_fh:
            unmatched_ids = [
                e.metadata['id']
                for e in skbio.io.read(unmatched_fh, constructor=skbio.DNA,
                                       format='fasta')
            ]

        # Drop unmatched features on a copy (inplace=False) so the
        # caller's table is left untouched.
        table = table.filter(ids_to_keep=unmatched_ids, invert=True,
                             axis='observation', inplace=False)
        table = table.collapse(collapse_f, norm=False, min_group_size=1,
                               axis='observation',
                               include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether divide per-target counts by number of targets per source.
    field : int, optional
        Index of field to be collapsed in a stratified table. Feature IDs
        are split on "|" and only the field at this index is looked up in
        ``mapping`` and replaced by each target. ``None`` (default) means
        the whole feature ID is mapped; ``0`` selects the first field.

    Returns
    -------
    biom.Table
        Collapsed BIOM table. If no feature can be mapped, the (empty)
        filtered table is returned as is.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # Test against None rather than truthiness: 0 is a valid field index
    # and must not be mistaken for "no field".
    stratified = field is not None

    # generate metadata
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if stratified:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError as err:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.'
                ) from err
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if stratified:
                # substitute the target into the selected field and rebuild
                # the full stratified ID
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation', inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False, one_to_many=True, axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up ('Path' is presumably metadata left by the one-to-many
    # collapse -- confirm against BIOM)
    table.del_metadata(keys=['Path'])
    return table