Example #1
def filter_samples(demux: _PlotQualView, metadata: Metadata,
                   where: str = None, exclude_ids: bool = False) \
                   -> CasavaOneEightSingleLanePerSampleDirFmt:
    results = CasavaOneEightSingleLanePerSampleDirFmt()

    paired = demux.paired
    samples = demux.directory_format

    ids_to_keep = metadata.get_ids(where=where)
    if not ids_to_keep:
        raise ValueError('No filtering requested.')
    manifest = samples.manifest.view(pd.DataFrame)

    if exclude_ids:
        ids_to_keep = set(manifest.index) - set(ids_to_keep)

    try:
        for id in ids_to_keep:
            forward = manifest.loc[id].forward
            duplicate(forward,
                      os.path.join(str(results),
                                   os.path.split(forward)[1]))
            if paired:
                reverse = manifest.loc[id].reverse
                duplicate(
                    reverse,
                    os.path.join(str(results),
                                 os.path.split(reverse)[1]))
    except KeyError:
        raise ValueError(f'{id!r} is not a sample present in the '
                         'demultiplexed data.')

    return results
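
All of the filtering actions in these examples share the same selection idiom: Metadata.get_ids(where=...) evaluates a SQLite WHERE clause against the metadata, and exclude_ids inverts the resulting set. A minimal, runnable sketch of that idiom, with a hypothetical 'site' column and made-up sample IDs:

import pandas as pd
from qiime2 import Metadata

df = pd.DataFrame({'site': ['gut', 'gut', 'skin']},
                  index=pd.Index(['S1', 'S2', 'S3'], name='sample-id'))
metadata = Metadata(df)

ids_to_keep = metadata.get_ids(where="site='gut'")  # {'S1', 'S2'}
# with exclude_ids=True the selection is inverted:
ids_to_exclude = set(df.index) - set(ids_to_keep)   # {'S3'}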
Example #2
def get_ncbi_data(query: str = None,
                  accession_ids: Metadata = None,
                  ranks: list = None,
                  rank_propagation: bool = True,
                  entrez_delay: float = 0.334) -> (DNAIterator, DataFrame):
    if query is None and accession_ids is None:
        raise ValueError('Query or accession_ids must be supplied')
    if ranks is None:
        ranks = _default_ranks

    if query:
        seqs, taxids = get_nuc_for_query(query, entrez_delay)

    if accession_ids:
        accs = accession_ids.get_ids()
        if query and seqs:
            accs = accs - seqs.keys()
            if accs:
                acc_seqs, acc_taxids = get_nuc_for_accs(accs, entrez_delay)
                seqs.update(acc_seqs)
                taxids.update(acc_taxids)
        else:
            seqs, taxids = get_nuc_for_accs(accs, entrez_delay)

    taxa = get_taxonomies(taxids, ranks, rank_propagation, entrez_delay)

    seqs = DNAIterator(DNA(v, metadata={'id': k}) for k, v in seqs.items())
    taxa = DataFrame(taxa, index=['Taxon']).T
    taxa.index.name = 'Feature ID'

    return seqs, taxa
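
The last three lines perform a reshaping step that is easy to miss: a plain dict mapping accession to taxonomy string becomes the one-column 'Taxon' table QIIME 2 expects. The same step in isolation, with invented accessions and lineages:

from pandas import DataFrame

taxa = {'AB011549.2': 'k__Bacteria; p__Firmicutes',
        'AF101234.1': 'k__Bacteria; p__Bacteroidetes'}
taxa = DataFrame(taxa, index=['Taxon']).T
taxa.index.name = 'Feature ID'
# taxa now has one row per accession and a single 'Taxon' column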
Example #3
def plot(output_dir,
         table: biom.Table,
         metadata: q2.Metadata,
         case_where: str,
         control_where: str,
         feature_tree: skbio.TreeNode = None):

    # Guard this debug dump: `feature_tree` is optional and may be None.
    if feature_tree is not None:
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
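
The two jsonp writes at the end follow an identical pattern; a hypothetical helper (write_jsonp is not part of the original module) that captures it:

import json
import os

def write_jsonp(data_dir, filename, callback, payload):
    # Write payload as a JSONP file that invokes the named JS callback.
    with open(os.path.join(data_dir, filename), 'w') as fh:
        fh.write('%s(' % callback)
        fh.write(json.dumps(payload))
        fh.write(');')

# e.g. write_jsonp(data_dir, 'tree.jsonp', 'LOAD_TREE', tree_data)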
Example #4
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str = None,
                           exclude_ids: bool = False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.get_ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
Example #5
def get_ncbi_data(query: str = None,
                  accession_ids: Metadata = None,
                  ranks: list = None,
                  rank_propagation: bool = True,
                  logging_level: str = None,
                  n_jobs: int = 1) -> (DNAIterator, DataFrame):
    if ranks is None:
        ranks = _default_ranks
    if query is None and accession_ids is None:
        raise ValueError('Query or accession_ids must be supplied')

    manager = LokyManager()
    manager.start()
    request_lock = manager.Lock()

    if query:
        seqs, taxids = get_nuc_for_query(query, logging_level, n_jobs,
                                         request_lock, _entrez_delay)

    if accession_ids:
        accs = accession_ids.get_ids()
        if query and seqs:
            accs = accs - seqs.keys()
            if accs:
                acc_seqs, acc_taxids = get_nuc_for_accs(
                    accs, logging_level, n_jobs, request_lock, _entrez_delay)
                seqs.update(acc_seqs)
                taxids.update(acc_taxids)
        else:
            seqs, taxids = get_nuc_for_accs(accs, logging_level, n_jobs,
                                            request_lock, _entrez_delay)

    taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
                                    logging_level, n_jobs, request_lock,
                                    _entrez_delay)
    for acc in bad_accs:
        del seqs[acc]

    seqs = DNAIterator(DNA(v, metadata={'id': k}) for k, v in seqs.items())
    taxa = DataFrame(taxa, index=['Taxon']).T
    taxa.index.name = 'Feature ID'

    return seqs, taxa
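
One detail worth calling out in both NCBI variants: the dedup step accs = accs - seqs.keys() relies on dict view objects supporting set operations. A standalone illustration with toy accessions:

accs = {'A1', 'A2', 'A3'}
seqs = {'A2': 'ACGT'}
accs = accs - seqs.keys()  # dict views behave like sets
print(accs)  # {'A1', 'A3'}: only accessions not already fetched remain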
Example #6
def filter_seqs(data: pd.Series, table: biom.Table = None,
                metadata: qiime2.Metadata = None, where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    if table is not None and metadata is not None:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    elif table is None and metadata is None:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')
    elif table is not None:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    if exclude_ids:
        ids_to_keep = set(data.index) - set(ids_to_keep)
    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty:
        raise ValueError('All features were filtered out of the data.')
    return filtered
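
A minimal sketch of the table-based branch with toy data: only sequences whose ids appear as observations in the feature table survive the filter.

import biom
import numpy as np
import pandas as pd

data = pd.Series({'F1': 'ACGT', 'F2': 'GGCC', 'F3': 'TTAA'})
table = biom.Table(np.array([[1, 0], [0, 2]]),
                   observation_ids=['F1', 'F3'],
                   sample_ids=['S1', 'S2'])
kept = data[data.index.isin(table.ids(axis='observation'))]
print(list(kept.index))  # ['F1', 'F3']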
Example #7
def filter_tree(
    tree: skbio.TreeNode,
    table: biom.Table = None,
    metadata: qiime2.Metadata = None,
    where: str = None,
) -> skbio.TreeNode:
    """
    Prunes a phylogenetic tree to match the input ids
    """
    # Checks the input metadata
    if ((table is None) & (metadata is None)):
        raise ValueError('A feature table, sequences or metadata must be '
                         'provided for filtering.')
    filter_refs = [table, metadata]
    if np.sum([(ref is not None) for ref in filter_refs]).sum() > 1:
        raise ValueError('Filtering can only be performed using one reference'
                         ' file. Please choose between filtering with a '
                         'feature table, sequences, or metadata.')
    if (where is not None) & (metadata is None):
        raise ValueError("Metadata must be provided if 'where' is specified")

    # Gets the list of IDs to keep
    if table is not None:
        ids_to_keep = table.ids(axis='observation')
    if metadata is not None:
        ids_to_keep = metadata.get_ids(where)

    # Gets the set of tip names in the tree
    tip_ids = {t.name for t in tree.tips()}

    # Checks that the filtering ids are a subset of the tree's tips
    if not tip_ids.issuperset(set(ids_to_keep)):
        raise ValueError('The ids for filtering must be a subset of '
                         'the tips in the tree.')
    sub_tree = tree.shear(ids_to_keep)
    sub_tree.prune()

    return sub_tree
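
The shear-and-prune step on its own, with a hand-written five-tip tree; this runs with scikit-bio alone:

import io
import skbio

tree = skbio.TreeNode.read(io.StringIO('((a,b)c,(d,e)f)root;'))
sub_tree = tree.shear(['a', 'd'])
sub_tree.prune()  # collapse internal nodes left with a single child
print([t.name for t in sub_tree.tips()])  # ['a', 'd']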
Example #8
def _get_ncbi_data(query: str = None,
                   accession_ids: Metadata = None,
                   ranks: list = None,
                   rank_propagation: bool = True,
                   logging_level: str = None,
                   n_jobs: int = 1,
                   db: str = 'nuccore'):
    manager = LokyManager()
    manager.start()
    request_lock = manager.Lock()

    if query:
        seqs, taxids = get_data_for_query(query, logging_level, n_jobs,
                                          request_lock, _entrez_delay, db)

    if accession_ids:
        accs = accession_ids.get_ids()
        if query and seqs:
            accs = accs - seqs.keys()
            if accs:
                acc_seqs, acc_taxids = get_data_for_accs(
                    accs, logging_level, n_jobs, request_lock, _entrez_delay,
                    db)
                seqs.update(acc_seqs)
                taxids.update(acc_taxids)
        else:
            seqs, taxids = get_data_for_accs(accs, logging_level, n_jobs,
                                             request_lock, _entrez_delay, db)

    taxa, bad_accs = get_taxonomies(taxids, ranks, rank_propagation,
                                    logging_level, n_jobs, request_lock,
                                    _entrez_delay)
    for acc in bad_accs:
        del seqs[acc]

    return seqs, taxa
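
The manager/lock setup at the top exists so that parallel workers can serialize their Entrez requests. A sketch of the same pattern with the standard-library multiprocessing manager (LokyManager plays this role for the loky backend that joblib workers share):

from multiprocessing import Manager

manager = Manager()
request_lock = manager.Lock()

with request_lock:
    pass  # at most one worker talks to the remote API at a time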
Example #9
        print("%s sample pairs matched together" %
              (len(case_to_control_match.keys())))

        for key in case_to_control_match:
            key_value = case_to_control_match[key]
            matchDF.at[key, "matched_to"] = str(key_value)
            matchDF.at[key_value, "matched_to"] = str(key)
    else:
        print("%s cases matched" % (len(case_dictionary.keys())))
        for case in case_dictionary:
            for control in case_dictionary[case]:
                if control in control_dictionary:
                    control_dictionary[control].append(case)
                else:
                    control_dictionary[control] = [case]
            matchDF.at[case,
                       "matched_to"] = ", ".join(sorted(case_dictionary[case]))

        for control in control_dictionary:
            matchDF.at[control, "matched_to"] = ", ".join(
                sorted(control_dictionary[control]))

    matchedMD = Metadata(matchDF)
    if only_matches:
        ids = matchedMD.get_ids("matched_to NOT IN ('none')")
        # shrink the metadata so only matched samples remain
        matchedMD = matchedMD.filter_ids(ids)

    return matchedMD
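
The final filtering step in isolation: get_ids with a SQL NOT IN clause, then filter_ids to shrink the metadata. The sample data here is invented:

import pandas as pd
from qiime2 import Metadata

matchDF = pd.DataFrame({'matched_to': ['S2', 'S1', 'none']},
                       index=pd.Index(['S1', 'S2', 'S3'], name='sample-id'))
matchedMD = Metadata(matchDF)
ids = matchedMD.get_ids("matched_to NOT IN ('none')")
matchedMD = matchedMD.filter_ids(ids)  # S3 ('none') is dropped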
Example #10
def simple_plot(output_dir,
                table: biom.Table,
                feature_tree: skbio.TreeNode,
                metadata: q2.Metadata,
                case_where: str,
                control_where: str,
                n_transects: int = 10,
                stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)

    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine
    colors = []

    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))

    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())

        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)

        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(os.path.join(layer_dir, 'T_%s.h.png' % distance),
                              transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'), transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        layer_names = ['original'] + ['T_%s' % d for d in transects] \
            + ['trajectory']
        layer_colors = list(map(to_hex, colors)) + ['red']
        fh.write(template.render({
            'legend': list(zip(layer_names, layer_colors)),
            'filenames': rank_files,
        }))