def filter_samples(demux: _PlotQualView, metadata: Metadata,
                   where: str = None, exclude_ids: bool = False) \
        -> CasavaOneEightSingleLanePerSampleDirFmt:
    results = CasavaOneEightSingleLanePerSampleDirFmt()
    paired = demux.paired
    samples = demux.directory_format

    ids_to_keep = metadata.get_ids(where=where)
    if not ids_to_keep:
        raise ValueError('No filtering requested.')
    manifest = samples.manifest.view(pd.DataFrame)
    if exclude_ids:
        ids_to_keep = set(manifest.index) - set(ids_to_keep)

    try:
        for id in ids_to_keep:
            forward = manifest.loc[id].forward
            duplicate(forward,
                      os.path.join(str(results), os.path.split(forward)[1]))
            if paired:
                reverse = manifest.loc[id].reverse
                duplicate(
                    reverse,
                    os.path.join(str(results), os.path.split(reverse)[1]))
    except KeyError:
        raise ValueError(f'{id!r} is not a sample present in the '
                         'demultiplexed data.')

    return results
def get_ncbi_data(query: str = None, accession_ids: Metadata = None,
                  ranks: list = None, rank_propagation: bool = True,
                  entrez_delay: float = 0.334) -> (DNAIterator, DataFrame):
    if query is None and accession_ids is None:
        raise ValueError('Query or accession_ids must be supplied')
    if ranks is None:
        ranks = _default_ranks

    if query:
        seqs, taxids = get_nuc_for_query(query, entrez_delay)

    if accession_ids:
        accs = accession_ids.get_ids()
        if query and seqs:
            accs = accs - seqs.keys()
            if accs:
                acc_seqs, acc_taxids = get_nuc_for_accs(accs, entrez_delay)
                seqs.update(acc_seqs)
                taxids.update(acc_taxids)
        else:
            seqs, taxids = get_nuc_for_accs(accs, entrez_delay)

    taxa = get_taxonomies(taxids, ranks, rank_propagation, entrez_delay)

    seqs = DNAIterator(DNA(v, metadata={'id': k}) for k, v in seqs.items())
    taxa = DataFrame(taxa, index=['Taxon']).T
    taxa.index.name = 'Feature ID'
    return seqs, taxa
def plot(output_dir, table: biom.Table, metadata: q2.Metadata,
         case_where: str, control_where: str,
         feature_tree: skbio.TreeNode = None):
    if feature_tree is not None:
        # Debug dump of the input tree; guarded because feature_tree
        # defaults to None.
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))
    tip_order = np.asarray(tree_data['names'])[idx]

    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str = None,
                           exclude_ids: bool = False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.get_ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to the output distance matrix
    # because `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its
    # iteration order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
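# Hedged usage sketch (editor addition, not from the source). Assumes a local
# 'distance-matrix.tsv' in lsmat format and a 'sample-metadata.tsv' with a
# 'body-site' column; the file names and where clause are illustrative only.
def _example_filter_distance_matrix():
    import skbio
    import qiime2

    dm = skbio.DistanceMatrix.read('distance-matrix.tsv')
    md = qiime2.Metadata.load('sample-metadata.tsv')
    # get_ids() accepts a SQLite WHERE-clause expression; passing
    # exclude_ids=True would instead drop the matching samples.
    return filter_distance_matrix(dm, md, where="[body-site]='gut'")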
def get_ncbi_data(query: str = None, accession_ids: Metadata = None,
                  ranks: list = None, rank_propagation: bool = True,
                  logging_level: str = None,
                  n_jobs: int = 1) -> (DNAIterator, DataFrame):
    if ranks is None:
        ranks = _default_ranks
    if query is None and accession_ids is None:
        raise ValueError('Query or accession_ids must be supplied')

    manager = LokyManager()
    manager.start()
    request_lock = manager.Lock()

    if query:
        seqs, taxids = get_nuc_for_query(query, logging_level, n_jobs,
                                         request_lock, _entrez_delay)

    if accession_ids:
        accs = accession_ids.get_ids()
        if query and seqs:
            accs = accs - seqs.keys()
            if accs:
                acc_seqs, acc_taxids = get_nuc_for_accs(
                    accs, logging_level, n_jobs, request_lock, _entrez_delay)
                seqs.update(acc_seqs)
                taxids.update(acc_taxids)
        else:
            seqs, taxids = get_nuc_for_accs(accs, logging_level, n_jobs,
                                            request_lock, _entrez_delay)

    taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
                                    logging_level, n_jobs, request_lock,
                                    _entrez_delay)
    for acc in bad_accs:
        del seqs[acc]

    seqs = DNAIterator(DNA(v, metadata={'id': k}) for k, v in seqs.items())
    taxa = DataFrame(taxa, index=['Taxon']).T
    taxa.index.name = 'Feature ID'
    return seqs, taxa
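# Hedged usage sketch (editor addition). Fetches records for a set of GenBank
# accessions supplied as a qiime2 Metadata index; this performs live Entrez
# requests, so it needs network access. The accession strings below are
# placeholders; substitute real GenBank accessions before running.
def _example_get_ncbi_data():
    import pandas as pd
    from qiime2 import Metadata

    accs = Metadata(pd.DataFrame(
        index=pd.Index(['ACCESSION.1', 'ACCESSION.2'], name='id')))
    # Returns a DNAIterator of sequences and a taxonomy DataFrame indexed by
    # 'Feature ID', as produced by the function defined above.
    seqs, taxa = get_ncbi_data(accession_ids=accs)
    return seqs, taxa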
def filter_seqs(data: pd.Series, table: biom.Table = None,
                metadata: qiime2.Metadata = None, where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    if table is not None and metadata is not None:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    elif table is None and metadata is None:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')
    elif table is not None:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    if exclude_ids is True:
        ids_to_keep = set(data.index) - set(ids_to_keep)
    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty is True:
        raise ValueError('All features were filtered out of the data.')
    return filtered
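# Hedged usage sketch (editor addition). Builds a toy pd.Series of sequences
# keyed by feature ID and a small biom.Table, then keeps only the sequences
# whose IDs appear in the table; all IDs and values here are made up.
def _example_filter_seqs():
    import numpy as np
    import pandas as pd
    import biom

    seqs = pd.Series({'f1': 'ACGT', 'f2': 'TTGA', 'f3': 'GGCC'})
    table = biom.Table(np.array([[1, 0], [0, 2]]),
                       observation_ids=['f1', 'f3'],
                       sample_ids=['s1', 's2'])
    # Only 'f1' and 'f3' survive; exclude_ids=True would keep just 'f2'.
    return filter_seqs(seqs, table=table)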
def filter_tree(tree: skbio.TreeNode,
                table: biom.Table = None,
                metadata: qiime2.Metadata = None,
                where: str = None,
                ) -> skbio.TreeNode:
    """
    Prunes a phylogenetic tree to match the input ids
    """
    # Checks the input metadata
    if (table is None) & (metadata is None):
        raise ValueError('A feature table, sequences or metadata must be '
                         'provided for filtering.')
    filter_refs = [table, metadata]
    if np.sum([(ref is not None) for ref in filter_refs]) > 1:
        raise ValueError('Filtering can only be performed using one reference'
                         ' file. Please choose between filtering with a '
                         'feature table, sequences, or metadata.')
    if (where is not None) & (metadata is None):
        raise ValueError("Metadata must be provided if 'where' is specified")

    # Gets the list of IDs to keep
    if table is not None:
        ids_to_keep = table.ids(axis='observation')
    if metadata is not None:
        ids_to_keep = metadata.get_ids(where)

    # Gets the list of tips
    tip_ids = set([t.name for t in tree.tips()])

    # Checks that the filtering IDs are a subset of the tips
    if not tip_ids.issuperset(set(ids_to_keep)):
        raise ValueError('The ids for filtering must be a subset of '
                         'the tips in the tree.')

    sub_tree = tree.shear(ids_to_keep)
    sub_tree.prune()
    return sub_tree
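# Hedged usage sketch (editor addition). Builds a four-tip tree and a minimal
# qiime2.Metadata whose index names the two features to keep; the tree is
# sheared down to just those tips. All IDs are invented for illustration.
def _example_filter_tree():
    import io
    import pandas as pd
    import qiime2
    import skbio

    tree = skbio.TreeNode.read(
        io.StringIO('((f1:1,f2:1):1,(f3:1,f4:1):1);'))
    md = qiime2.Metadata(pd.DataFrame(
        index=pd.Index(['f1', 'f3'], name='feature-id')))
    # Returns a tree containing only tips f1 and f3.
    return filter_tree(tree, metadata=md)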
def _get_ncbi_data(query: str = None, accession_ids: Metadata = None,
                   ranks: list = None, rank_propagation: bool = True,
                   logging_level: str = None, n_jobs: int = 1,
                   db: str = 'nuccore'):
    manager = LokyManager()
    manager.start()
    request_lock = manager.Lock()

    if query:
        seqs, taxids = get_data_for_query(query, logging_level, n_jobs,
                                          request_lock, _entrez_delay, db)

    if accession_ids:
        accs = accession_ids.get_ids()
        if query and seqs:
            accs = accs - seqs.keys()
            if accs:
                acc_seqs, acc_taxids = get_data_for_accs(
                    accs, logging_level, n_jobs, request_lock,
                    _entrez_delay, db)
                seqs.update(acc_seqs)
                taxids.update(acc_taxids)
        else:
            seqs, taxids = get_data_for_accs(accs, logging_level, n_jobs,
                                             request_lock, _entrez_delay, db)

    taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
                                    logging_level, n_jobs, request_lock,
                                    _entrez_delay)
    for acc in bad_accs:
        del seqs[acc]

    return seqs, taxa
print("%s sample pairs matched together" % (len(case_to_control_match.keys()))) for key in case_to_control_match: key_value = case_to_control_match[key] matchDF.at[key, "matched_to"] = str(key_value) matchDF.at[key_value, "matched_to"] = str(key) else: print("%s cases matched" % (len(case_dictionary.keys()))) for case in case_dictionary: for control in case_dictionary[case]: if control in control_dictionary: control_dictionary[control].append(case) else: control_dictionary[control] = [case] matchDF.at[case, "matched_to"] = ", ".join(sorted(case_dictionary[case])) for control in control_dictionary: matchDF.at[control, "matched_to"] = ", ".join( sorted(control_dictionary[control])) matchedMD = Metadata(matchDF) if only_matches: ids = matchedMD.get_ids("matched_to NOT IN ('none')") #shrinks the MD to only have matched samples matchedMD = matchedMD.filter_ids(ids) return matchedMD
def simple_plot(output_dir, table: biom.Table, feature_tree: skbio.TreeNode,
                metadata: q2.Metadata, case_where: str, control_where: str,
                n_transects: int = 10, stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)
    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine
    colors = []

    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))
    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())
        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)
        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(
            os.path.join(layer_dir, 'T_%s.h.png' % distance),
            transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'),
                   transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        fh.write(template.render({
            'legend': list(zip(
                ['original'] + ['T_%s' % d for d in transects]
                + ['trajectory'],
                list(map(to_hex, colors)) + ['red'])),
            'filenames': rank_files
        }))