Exemple #1
0
def plot_distance(analyses, metric='braycurtis',
                  title=None, label=None, xlabel=None, ylabel=None,
                  field='readcount_w_children', rank='species', **kwargs):
    """Draw a clustered heatmap of pairwise beta-diversity distances.

    `metric` selects the distance function (Bray-Curtis, Manhattan/cityblock,
    Jaccard or UniFrac); any other value raises `OneCodexException`, as does
    having fewer than two normalized classifications.

    The `style` keyword (default 'darkgrid') is consumed by `sns.set`; every
    other entry in **kwargs is passed to Seaborn's `sns.clustermap`.
    """
    # NOTE: unifrac will not work if taxonomy trees are inconsistent
    if metric in ('braycurtis', 'bray-curtis', 'bray curtis'):
        distance_func = braycurtis
    elif metric in ('manhattan', 'cityblock'):
        distance_func = cityblock
    elif metric == 'jaccard':
        distance_func = jaccard
    elif metric == 'unifrac':
        distance_func = unifrac
    else:
        raise OneCodexException("'metric' must be one of "
                                "braycurtis, manhattan, jaccard, or unifrac")

    normed_classifications, metadata = normalize_classifications(analyses, label=label)
    if len(normed_classifications) < 2:
        raise OneCodexException('`plot_distance` requires 2 or more valid classification results.')

    seaborn_style = kwargs.pop('style', 'darkgrid')
    sns.set(style=seaborn_style)

    # Display names are not guaranteed to be unique, so the table is keyed by
    # uuid first and only relabelled with display names at the very end.
    uuids = {analysis.id: analysis.id for analysis in normed_classifications}
    sample_names = {
        analysis.id: metadata.loc[row, '_display_name']
        for row, analysis in enumerate(normed_classifications)
    }

    result = distance_func(normed_classifications, field=field, rank=rank)
    ids = result.ids
    matrix = result.data
    table = {
        uuids[outer_id]: {
            uuids[inner_id]: matrix[outer_pos][inner_pos]
            for inner_pos, inner_id in enumerate(ids)
        }
        for outer_pos, outer_id in enumerate(ids)
    }
    dists = pd.DataFrame(table).rename(index=sample_names, columns=sample_names)

    # Plot cluster map; ignore new SciPy cluster warnings
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', scipy.cluster.hierarchy.ClusterWarning)
        g = sns.clustermap(dists, **kwargs)

    # keep the row labels horizontal
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    # Optional axis labels and title
    if xlabel is not None:
        plt.gca().set_xlabel(xlabel)
    if ylabel is not None:
        plt.gca().set_ylabel(ylabel)
    if title:
        g.fig.suptitle(title)

    plt.show()
Exemple #2
0
    def __init__(self,
                 file_path,
                 file_size,
                 file_format="fastq",
                 progressbar=None):
        """Open a pair of read files for interleaving.

        Parameters
        ----------
        file_path : sequence of two `str`
            Paths of the left (index 0) and right (index 1) files. If either
            name ends in ".gz" both are opened with gzip; otherwise if either
            ends in ".bz2" both are opened with bz2; otherwise both are opened
            as plain binary files.
        file_size : `int`
            Stored as `self._fsize`.
        file_format : `str`
            Only "fastq" is accepted.
        progressbar : optional
            Stored as `self.progressbar`.

        Raises
        ------
        OneCodexException
            If `file_format` is "fasta" (unsupported) or any other value than
            "fastq".
        """
        # Validate the format before touching the filesystem so that a bad
        # argument can never leave file handles open.
        if file_format == "fasta":
            raise OneCodexException(
                "Interleaving FASTA files is currently unsupported")
        elif file_format != "fastq":
            raise OneCodexException("file_format must be one of: fastq, fasta")
        self._lines_per_record = 4

        if file_path[0].endswith(".gz") or file_path[1].endswith(".gz"):
            opener = gzip.GzipFile
        elif file_path[0].endswith(".bz2") or file_path[1].endswith(".bz2"):
            opener = bz2.BZ2File
        else:
            opener = open

        self._fp_left = opener(file_path[0], mode="rb")
        try:
            self._fp_right = opener(file_path[1], mode="rb")
        except Exception:
            # do not leak the first handle if the second file cannot be opened
            self._fp_left.close()
            raise

        self._tell = 0
        self._fsize = file_size
        self._buf = Buffer()

        self.progressbar = progressbar
        self.mime_type = "text/plain"
Exemple #3
0
    def download(self, path=None):
        """
        Downloads the original reads file (FASTA/FASTQ) from One Codex.

        Note that this may only work from within a notebook session and the file
        is not guaranteed to exist for all One Codex plan types.

        Parameters
        ----------
        path : string, optional
            Full path to save the file to. If omitted, defaults to the original filename
            in the current working directory.

        Raises
        ------
        OneCodexException
            If an `HTTPError` is raised while requesting the download link or
            while fetching the file itself.
        """
        if path is None:
            path = os.path.join(os.getcwd(), self.filename)
        try:
            url_data = self._resource.download_uri()
            resp = requests.get(url_data['download_uri'], stream=True)
            # requests does not raise on 4xx/5xx responses by itself; without
            # this check an error body would be silently written to `path`.
            resp.raise_for_status()
            # TODO: use tqdm or ProgressBar here to display progress?
            with open(path, 'wb') as f_out:
                for data in resp.iter_content(chunk_size=1024):
                    f_out.write(data)
        except HTTPError as exc:
            if exc.response.status_code == 402:
                raise OneCodexException(
                    'You must either have a premium platform account or be in '
                    'a notebook environment to download samples.')
            else:
                raise OneCodexException(
                    'Download failed with an HTTP status code {}.'.format(
                        exc.response.status_code))
Exemple #4
0
    def _compute_distance(self, rank, metric):
        """Return the distances for this collection using `metric` at `rank`.

        `metric` may be a callable, which is invoked as `metric(self, rank=rank)`,
        or one of the supported beta diversity metric names. Raises
        `OneCodexException` when `rank` is None or the metric is not recognised.
        """
        if rank is None:
            raise OneCodexException(
                "Please specify a rank or 'auto' to choose automatically")

        # NOTE: unifrac will not work if taxonomy trees are inconsistent
        if callable(metric):
            return metric(self, rank=rank)

        bray_curtis_names = (BetaDiversityMetric.BrayCurtis, "bray-curtis",
                             "bray curtis")
        if metric in bray_curtis_names:
            return self.beta_diversity(
                metric=BetaDiversityMetric.BrayCurtis, rank=rank)

        city_block_names = ("manhattan", BetaDiversityMetric.CityBlock)
        if metric in city_block_names:
            return self.beta_diversity(
                metric=BetaDiversityMetric.CityBlock, rank=rank)

        if metric == BetaDiversityMetric.Jaccard:
            return self.beta_diversity(metric=BetaDiversityMetric.Jaccard,
                                       rank=rank)

        if metric == BetaDiversityMetric.WeightedUnifrac:
            return self.unifrac(weighted=True, rank=rank)

        if metric == BetaDiversityMetric.UnweightedUnifrac:
            return self.unifrac(weighted=False, rank=rank)

        if metric == BetaDiversityMetric.Aitchison:
            return self.beta_diversity(
                metric=BetaDiversityMetric.Aitchison, rank=rank)

        raise OneCodexException("Metric must be one of: {}".format(
            ", ".join(BetaDiversityMetric.values())))
Exemple #5
0
    def __init__(self, text=None, label=None):
        """Create a citation or look up an existing one.

        References are kept in the IPython session metadata under the
        "references" key, as a mapping of label -> (number, text).

        Parameters
        ----------
        text : `str`, optional
            Citation text. If it matches an already stored reference, that
            reference's number is reused; otherwise a new reference is stored
            under the next number.
        label : `str`, optional
            Label for the citation. With no `text`, the label must already
            exist. When a new reference is created without a label, its number
            is used as the key.

        Raises
        ------
        OneCodexException
            If neither argument is given, if not running inside IPython, if the
            text is already cited under a different label, if the label is
            already used by another citation, or if a label-only lookup fails.
        """
        if text is None and label is None:
            raise OneCodexException(
                "Please specify at least one of: text, label")

        self.text = text or ""
        self.label = label or ""

        try:
            ipy = get_ipython()
            self.ref_list = ipy.meta.get("references", {})
        except NameError:
            raise OneCodexException("Must be run from within IPython")

        if text:
            # has this reference already been cited?
            for ref_label, (ref_num, ref_text) in self.ref_list.items():
                if text == ref_text:
                    if label and label != ref_label:
                        raise OneCodexException(
                            "Citation already in use with label={}".format(
                                ref_label))
                    self.ref_num = ref_num
                    break
            else:
                # reference has not been cited. is the label already in use?
                if label in self.ref_list:
                    raise OneCodexException(
                        "Citation label={} already in use".format(label))

                # create the citation and assign next number
                if not self.ref_list:
                    self.ref_num = 1
                else:
                    self.ref_num = max(
                        x[0] for x in self.ref_list.values()) + 1

                # unlabeled references are keyed by their own number
                ref_label = label if label else self.ref_num

                self.ref_list[ref_label] = (self.ref_num, text)
                ipy.meta["references"] = self.ref_list

        elif label:
            if label not in self.ref_list:
                raise OneCodexException(
                    "Cannot find citation with label={}".format(label))

            self.ref_num = self.ref_list[label][0]
Exemple #6
0
def renderer_settings(svg_or_png=None, save_json=True, enable=True):
    """Configure the "onecodex" Vega/Altair renderer for this IPython session.

    Parameters
    ----------
    svg_or_png : `str` in {"png", "svg"} or `False` to disable saving images
        Image format saved in an output cell. `None` is treated as "svg".
    save_json : `bool`
        Store Altair-generated JSON in output cell.
    enable : `bool`
        If True, the renderer is enabled right after it is registered. If False, user must
        call `altair.renderers.enable("onecodex")` before changes will take effect.

    Raises
    ------
    OneCodexException
        If `svg_or_png` is none of "png", "svg", `None` or `False`.
    """
    want_svg = svg_or_png is None or svg_or_png == "svg"
    want_png = (not want_svg) and svg_or_png == "png"

    if not (want_svg or want_png or svg_or_png is False):
        raise OneCodexException("svg_or_png kwarg must be one of: png, svg")

    configured = partial(onecodex_renderer, svg=want_svg, png=want_png, save_json=save_json)
    alt.renderers.register("onecodex", configured)

    if enable:
        alt.renderers.enable("onecodex")
Exemple #7
0
    def beta_diversity(self, metric="braycurtis", rank="auto"):
        """Compute pairwise beta diversity between the samples in this collection.

        Parameters
        ----------
        metric : {'jaccard', 'braycurtis', 'cityblock'}
            The distance metric to calculate.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If `metric` is not one of the supported names.
        """
        import skbio.diversity

        supported = ("jaccard", "braycurtis", "cityblock")
        if metric not in supported:
            raise OneCodexException(
                "For beta diversity, metric must be one of: jaccard, braycurtis, cityblock"
            )

        df = self.to_df(rank=rank, normalize=self._guess_normalized())

        # one row of abundances per entry of the dataframe index
        counts = [df.loc[c_id].tolist() for c_id in df.index]
        sample_ids = df.index.tolist()

        # NOTE: see #291 for a discussion on using these metrics with normalized read counts.
        # skbio's counts-matrix validation is switched off on purpose so that normalized data
        # can be passed through.
        return skbio.diversity.beta_diversity(metric, counts, sample_ids, validate=False)
Exemple #8
0
    def filter(self, filter_func):
        """Build a new collection holding only the objects accepted by `filter_func`.

        The keyword arguments stored on this instance (`self._kwargs`, e.g. metric or
        skip_missing) are passed on to the constructor of the returned collection.

        Parameters
        ----------
        filter_func : `callable`
            Called once with every object in the collection. Objects for which it returns a
            truthy value are kept; all others are left out of the returned collection.

        Returns
        -------
        A new instance of this collection's class containing only the kept objects.

        Raises
        ------
        OneCodexException
            If `filter_func` is not callable.

        Examples
        --------
        Generate a new collection of Samples that have a specific filename extension:

            new_collection = samples.filter(lambda s: s.filename.endswith('.fastq.gz'))
        """
        if not callable(filter_func):
            raise OneCodexException(
                "Please pass a function to filter: {}".format(
                    type(filter_func).__name__))

        kept = [obj for obj in self if filter_func(obj)]
        return self.__class__(kept, **self._kwargs)
Exemple #9
0
    def _check_valid_resource(self, other, check_for_dupes=True):
        """Validate that `other` holds only objects of this list's model type.

        `other` may be a single object or an iterable of objects. Raises `ValueError`
        on a type mismatch and, when `check_for_dupes` is set, `OneCodexException`
        if the combined ids of `self._resource` and `other` contain duplicates.
        """
        try:
            candidates = iter(other)
        except TypeError:
            # a single, non-iterable object was passed
            candidates = [other]

        other_ids = []
        for candidate in candidates:
            if not isinstance(candidate, self._oc_model):
                raise ValueError(
                    "Expected object of type '{}', got '{}'".format(
                        self._oc_model.__name__,
                        type(candidate).__name__))
            other_ids.append(candidate.id)

        if not check_for_dupes:
            return

        # duplicates are not allowed
        combined_ids = [s.id for s in self._resource] + other_ids
        if len(set(combined_ids)) != len(combined_ids):
            raise OneCodexException(
                "{} cannot contain duplicate objects".format(
                    self.__class__.__name__))
Exemple #10
0
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    Compute the UniFrac beta diversity metric, which takes into account the relative
    relatedness of community members.

    Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence. When
    `strict` is set, every classification must share the job of the first one,
    otherwise `OneCodexException` is raised.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # fold every classification into one shared tree
    merged_tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        merged_tree = generate_skbio_tree(classification, existing_tree=merged_tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the merged tree is hung below an artificial root
    rooted_tree = TreeNode(name='fake root')
    rooted_tree.rank = 'no rank'
    rooted_tree.append(merged_tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(rooted_tree, rank=rank)

    metric_name = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric_name, counts, ids,
                                          tree=rooted_tree, otu_ids=tax_ids)
Exemple #11
0
def write_fastx_record(record, handler):
    """Format `record` and write it to `handler`.

    A 2-item record is written with a '>' prefix and a 4-item record with an '@'
    prefix, with the items joined by newlines. Any other length raises
    `OneCodexException`.
    """
    templates = {
        2: '>{}\n{}',
        4: '@{}\n{}\n{}\n{}',
    }
    template = templates.get(len(record))
    if template is None:
        raise OneCodexException('Unknown FASTX record format', record)
    handler.write(template.format(*record))
Exemple #12
0
def sort_helper(sort, values):
    """Return a sorted list of values for the Altair chart axes.

    Parameters
    ----------
    sort : callable, list, or falsy
        If callable, it is applied to the de-duplicated `values` and its result
        is returned. If a list, it must contain exactly the same items as
        `values` and is returned as-is. Any falsy value means "no explicit
        order".
    values : iterable
        The axis labels present in the dataset.

    Returns
    -------
    The sort order, or `None` when `sort` is falsy.

    Raises
    ------
    OneCodexException
        If `sort` is a list whose items differ from `values`, or is a truthy
        value that is neither a list nor callable.
    """
    if callable(sort):
        return sort(list(set(values)))

    if isinstance(sort, list):
        if set(sort) != set(values):
            raise OneCodexException(
                "sort_x must have the same items as your dataset.")
        return sort

    if sort:
        # Implicit concatenation keeps the message on one logical line; a
        # backslash continuation inside the literal would embed the source
        # indentation in the text shown to the user.
        raise OneCodexException(
            "Please pass either a sorted list of values matching the axis labels "
            "or a function that returns a sorted list of labels")

    return None
Exemple #13
0
    def __init__(self, style=None):
        """Store `style` (empty string when None) and the session's reference list.

        The reference list is read from the IPython session metadata under the
        "references" key. Raises `OneCodexException` when not run inside IPython.
        """
        self.style = style if style is not None else ""

        try:
            session = get_ipython()
            references = session.meta.get("references", {})
        except NameError:
            raise OneCodexException("Must be run from within IPython")

        self.ref_list = references
Exemple #14
0
    def __getattr__(self, key):
        """Resolve `key` against the wrapped resource and its schema.

        When a resource is bound and the schema declares `key`, the value is read
        from the resource and converted before it is returned:

        * a `Resource` is wrapped in the model registered for its URI prefix;
        * a list whose schema items are objects becomes a `ResourceList`;
        * `id` is returned as a string (or None);
        * "date-time" values are parsed and returned as UTC datetimes;
        * anything else is returned unchanged.

        When no resource is bound, `id` and schema-declared fields read as None.

        Raises
        ------
        OneCodexException
            If no registered model route matches the item pattern of a list field.
        AttributeError
            If `key` cannot be resolved by any of the cases above.
        """
        if hasattr(self, "_resource") and hasattr(self.__class__, "_resource"):
            # the schema stores the identifier under "$uri" rather than "id"
            schema_key = key if key != "id" else "$uri"
            schema = self.__class__._resource._schema["properties"].get(
                schema_key)
            if schema is not None:
                value = getattr(self._resource, key)
                if isinstance(value, Resource):
                    # convert potion resources into wrapped ones
                    # (the lookup key is the URI with its last path segment dropped)
                    resource_path = value._uri.rsplit("/", 1)[0]
                    return _model_lookup[resource_path](_resource=value)
                elif isinstance(value, list):
                    if schema["items"]["type"] == "object":
                        # convert lists of potion resources into wrapped ones
                        compiled_re = re.compile(
                            schema["items"]["properties"]["$ref"]["pattern"])

                        # if the list we're returning is empty, we can't just infer what type of
                        # object belongs in this list from its contents. to account for this, we'll
                        # instead try to match the object's URI to those in our lookup table
                        for route, obj in _model_lookup.items():
                            if compiled_re.match(
                                    "{}/dummy_lookup".format(route)):
                                return ResourceList(value, obj)

                        # no registered route matched the schema's reference pattern
                        raise OneCodexException(
                            "No object found for {}".format(
                                compiled_re.pattern))
                    else:
                        # otherwise, just return a regular list
                        return value
                else:
                    if key == "id":
                        # undo the bad coercion from potion_client/resource.py#L111
                        if value is None:
                            return None
                        else:
                            return str(value)
                    if schema.get(
                            "format") == "date-time" and value is not None:
                        datetime_value = parse(value)
                        # naive datetimes are labelled as UTC; aware ones are converted to UTC
                        if datetime_value.tzinfo is None:
                            return pytz.utc.localize(datetime_value)
                        else:
                            return datetime_value.astimezone(pytz.utc)
                    return value
        elif key == "id" or key in self.__class__._resource._schema[
                "properties"]:
            # make fields appear blank if there's no _resource bound to me
            return None

        # reached when the key is not in the schema (or not a known field at all)
        raise AttributeError("'{}' object has no attribute '{}'".format(
            self.__class__.__name__, key))
Exemple #15
0
    def __init__(self, _resource=None, name=None, sample=None):
        """Create a tag, reusing an existing one when `name` matches exactly one.

        With a `name`, existing tags are searched via `self.where(name=name)`: no match
        initialises a new tag from `name` and `sample`, one match binds to that tag's
        resource, and several matches raise `OneCodexException`. Without a `name`, the
        instance is initialised from `_resource`.
        """
        if not name:
            super(Tags, self).__init__(_resource=_resource)
            return

        # try to lookup Tags with a where call using kwargs
        matches = self.where(name=name)
        match_count = len(matches)

        if match_count > 1:
            raise OneCodexException("Multiple matches found for given criteria")
        elif match_count == 1:
            self._resource = matches[0]._resource
        elif match_count == 0:
            super(Tags, self).__init__(name=name, sample=sample)
Exemple #16
0
    def __init__(self, files, progressbar=None):
        """Wrap a pair of FASTQ files as `self.r1` and `self.r2`.

        `files` must hold exactly two FASTQ paths whose names match the R1 and R2
        filename patterns, in either order; the R1 file becomes `self.r1`. Raises
        `OneCodexException` if any of these conditions is not met.
        """
        if len(files) != 2:
            raise OneCodexException(
                "Paired files uploading can only take 2 files")

        if any(get_fastx_format(f) != "fastq" for f in files):
            raise OneCodexException(
                "Interleaving FASTA files is currently unsupported")

        first, second = files[0], files[1]
        if R1_FILENAME_RE.match(first) and R2_FILENAME_RE.match(second):
            forward, reverse = first, second
        elif R2_FILENAME_RE.match(first) and R1_FILENAME_RE.match(second):
            # files were given in R2, R1 order; swap them
            forward, reverse = second, first
        else:
            raise OneCodexException(
                "Paired files need to have _R1/_1 and _R2/_2 in their name")

        self.r1 = FilePassthru(forward, progressbar)
        self.r2 = FilePassthru(reverse, progressbar)
Exemple #17
0
    def _sample_collection_constructor(self,
                                       objects,
                                       skip_missing=True,
                                       field="auto",
                                       metric="auto",
                                       include_host=False,
                                       job=None):
        """Validate `objects` and initialise the collection from their resources.

        Parameters
        ----------
        objects : list
            Wrapped resources (objects with a `_resource` attribute), all of the
            same class.
        skip_missing, metric, include_host, job
            Stored in `self._kwargs` and forwarded to the parent constructor.
        field : `str`
            Deprecated alias for `metric`. When set to anything other than its
            default "auto", a `DeprecationWarning` is issued and its value
            replaces `metric`.

        Raises
        ------
        OneCodexException
            If any object lacks `_resource`, or the objects are of mixed classes.
        """
        # `field` defaults to the truthy string "auto". Only treat it as supplied
        # when it carries a non-default value; otherwise every call would emit the
        # deprecation warning and overwrite the caller's `metric` with "auto".
        if field and field != "auto":
            warnings.warn(
                "The `field` parameter has been renamed to `metric`. Passing `field` to a SampleCollection is deprecated and will be removed in a future release.",
                DeprecationWarning,
            )
            metric = field

        # are they all wrapped potion resources?
        if not all(hasattr(obj, "_resource") for obj in objects):
            raise OneCodexException(
                "SampleCollection can only contain One Codex Samples or Classifications objects"
            )

        # are they all the same model?
        object_classes = {type(obj) for obj in objects}

        if len(object_classes) > 1:
            raise OneCodexException(
                "SampleCollection can contain Samples or Classifications, but not both"
            )

        resources = [obj._resource for obj in objects]
        model = objects[0].__class__

        # kept so that derived collections can be built with the same options
        self._kwargs = {
            "skip_missing": skip_missing,
            "metric": metric,
            "include_host": include_host,
            "job": job,
        }
        super(SampleCollection, self).__init__(resources, model,
                                               **self._kwargs)
Exemple #18
0
    def __init__(self, url, position="left", style=None):
        """Store the logo `url`, its `style` and the CSS class for `position`.

        `position` must be "left", "center" or "right"; anything else raises
        `OneCodexException`. A `style` of None is stored as an empty string.
        """
        self.url = url
        self.style = style if style is not None else ""

        css_classes = (
            ("left", "logo-left"),
            ("center", "logo-center"),
            ("right", "logo-right"),
        )
        for candidate, css_class in css_classes:
            if position == candidate:
                self.classes = css_class
                break
        else:
            raise OneCodexException(
                "position must be one of: left, right, center")
Exemple #19
0
    def unifrac(self, weighted=True, rank="auto"):
        """Compute UniFrac distances between the samples in this collection.

        UniFrac takes into account the relatedness of community members. Weighted UniFrac considers
        abundances, unweighted UniFrac considers presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the data in this collection appears to be normalized.
        """
        import skbio.diversity

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        counts = [df.loc[c_id].tolist() for c_id in df.index]
        tax_ids = df.keys().tolist()

        # build the tree and prune it to the rank the dataframe was generated at
        pruned_tree = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

        # there's a bug (?) in skbio where it expects the root to only have
        # one child, so the pruned tree is hung below an artificial root
        from skbio.tree import TreeNode

        rooted_tree = TreeNode(name="fake root")
        rooted_tree.rank = "no rank"
        rooted_tree.append(pruned_tree)

        # then finally run the calculation and return
        metric_name = "weighted_unifrac" if weighted else "unweighted_unifrac"
        return skbio.diversity.beta_diversity(
            metric_name, counts, df.index.tolist(), tree=rooted_tree, otu_ids=tax_ids
        )
Exemple #20
0
    def _classification_fetch(self, skip_missing=None):
        """Transform a list of Samples or Classifications into a list of Classifications objects.

        Parameters
        ----------
        skip_missing : `bool`, optional
            If an analysis was not successful, exclude it, warn, and keep going. When left as
            None, the `skip_missing` value stored in `self._kwargs` is used.

        Returns
        -------
        None, but stores results in self._cached under the keys "classifications" and
        "is_metagenomic".

        Raises
        ------
        OneCodexException
            If an object in the collection is neither a Samples nor a Classifications instance.
        """
        from onecodex.models import Classifications, Samples

        # Only fall back to the stored default when the caller did not choose; a plain
        # truthiness test would make an explicit skip_missing=False impossible to honour.
        if skip_missing is None:
            skip_missing = self._kwargs["skip_missing"]

        new_classifications = []

        for obj in self._res_list:
            if isinstance(obj, Samples):
                classification = obj.primary_classification
            elif isinstance(obj, Classifications):
                classification = obj
            else:
                raise OneCodexException(
                    "Objects in SampleCollection must be one of: Classifications, Samples"
                )

            if skip_missing and not classification.success:
                warnings.warn(
                    "Classification {} not successful. Skipping.".format(
                        classification.id))
                continue

            new_classifications.append(classification)

        # warn if some of the classifications in this collection are not alike
        job_names = {obj.job.name for obj in new_classifications}

        if len(job_names) > 1:
            # sorted so that the warning text is deterministic
            warnings.warn(
                "SampleCollection contains multiple analysis types: {}".format(
                    ", ".join(sorted(job_names))))

        # metagenomic only when every classification shares one job whose name
        # contains "One Codex Database"
        self._cached["is_metagenomic"] = (
            len(job_names) == 1
            and "One Codex Database" in next(iter(job_names)))

        self._cached["classifications"] = new_classifications
Exemple #21
0
def interleaved_filename(file_path):
    """Derive the filename that represents a pair of paired-end files.

    Assumes Illumina-style naming conventions where each file has _R1_ or _R2_ in its name.
    Only the first element of `file_path` is inspected: when it carries a read marker, the
    marker is stripped from the name; otherwise a warning is issued and the name is returned
    unchanged. Raises `OneCodexException` if `file_path` is not a tuple.
    """
    if not isinstance(file_path, tuple):
        raise OneCodexException(
            "Cannot get the interleaved filename without a tuple.")

    first_name = file_path[0]
    has_read_marker = re.match(".*[._][Rr][12][_.].*", first_name)
    if not has_read_marker:
        warnings.warn(
            "Paired-end filenames do not match--are you sure they are correct?"
        )
        return first_name

    return re.sub("[._][Rr][12]", "", first_name)
Exemple #22
0
    def _compute_distance(self, rank, metric):
        """Return the distances for this collection using `metric` at `rank`.

        `metric` may be a callable, which is invoked as `metric(self, rank=rank)`,
        or one of the supported metric names. Raises `OneCodexException` when
        `rank` is None or the metric name is not recognised.
        """
        if rank is None:
            raise OneCodexException(
                "Please specify a rank or 'auto' to choose automatically")

        # NOTE: unifrac will not work if taxonomy trees are inconsistent
        if callable(metric):
            return metric(self, rank=rank)

        if metric in ("braycurtis", "bray-curtis", "bray curtis"):
            return self.beta_diversity(metric="braycurtis", rank=rank)

        if metric in ("manhattan", "cityblock"):
            return self.beta_diversity(metric="cityblock", rank=rank)

        if metric == "jaccard":
            return self.beta_diversity(metric="jaccard", rank=rank)

        if metric in ("unifrac", "weighted_unifrac"):
            return self.unifrac(weighted=True, rank=rank)

        if metric == "unweighted_unifrac":
            return self.unifrac(weighted=False, rank=rank)

        raise OneCodexException(
            "Metric must be one of: braycurtis, manhattan, jaccard, "
            "weighted_unifrac, unweighted_unifrac")
Exemple #23
0
    def __init__(self, text, heading=None, fignum=None, style=None):
        """Store the legend text, heading, style and figure number.

        When `fignum` is None the number is taken from the IPython session metadata
        ("figure_count", incremented by one and written back); this raises
        `OneCodexException` when not run inside IPython.
        """
        self.heading = "{} ".format(heading) if heading is not None else ""
        self.text = text
        self.style = style if style is not None else ""

        if fignum is not None:
            self.fignum = fignum
            return

        try:
            session = get_ipython()
            self.fignum = session.meta.get("figure_count", 0) + 1
        except NameError:
            raise OneCodexException("Must be run from within IPython")

        # remember the number handed out so the next figure continues from it
        session.meta["figure_count"] = self.fignum
Exemple #24
0
    def alpha_diversity(self, metric="simpson", rank="auto"):
        """Compute an alpha diversity value for every sample in this collection.

        Parameters
        ----------
        metric : {'simpson', 'chao1', 'shannon'}
            The diversity metric to calculate.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        pandas.DataFrame indexed by "classification_id", with one column named after `metric`.

        Raises
        ------
        OneCodexException
            If `metric` is not supported, or the data appears to be normalized.
        """
        import skbio.diversity

        supported = ("simpson", "chao1", "shannon")
        if metric not in supported:
            raise OneCodexException(
                "For alpha diversity, metric must be one of: simpson, chao1, shannon"
            )

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException("Alpha diversity requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        classification_ids = [c_id for c_id in df.index]
        # skbio is called once per sample; the single resulting value is extracted
        diversity_values = [
            skbio.diversity.alpha_diversity(metric, df.loc[c_id].tolist(), [c_id]).values[0]
            for c_id in classification_ids
        ]

        table = pd.DataFrame({"classification_id": classification_ids, metric: diversity_values})
        return table.set_index("classification_id")
Exemple #25
0
    def beta_diversity(self,
                       metric=BetaDiversityMetric.BrayCurtis,
                       rank=Rank.Auto):
        """Compute pairwise beta diversity between the samples in this collection.

        Parameters
        ----------
        metric : {'jaccard', 'braycurtis', 'cityblock', 'manhattan', 'aitchison'}
            The distance metric to calculate.
            Note that 'cityblock' and 'manhattan' are equivalent metrics.
            The weighted and unweighted UniFrac members of `BetaDiversityMetric` are
            delegated to `self.unifrac`, and Aitchison to `self.aitchison_distance`.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If `metric` is not a value of `BetaDiversityMetric`.
        """
        import skbio.diversity

        if not BetaDiversityMetric.has_value(metric):
            valid_names = ", ".join(BetaDiversityMetric.values())
            raise OneCodexException(
                "For beta diversity, metric must be one of: {}".format(
                    valid_names))

        abundances = self.to_df(rank=rank, normalize=self._guess_normalized())

        # metrics that have their own dedicated implementation
        if metric == BetaDiversityMetric.WeightedUnifrac:
            return self.unifrac(weighted=True, rank=rank)
        if metric == BetaDiversityMetric.UnweightedUnifrac:
            return self.unifrac(weighted=False, rank=rank)
        if metric == BetaDiversityMetric.Aitchison:
            return self.aitchison_distance(rank=rank)

        if metric == BetaDiversityMetric.Jaccard:
            # Jaccard requires a boolean matrix, otherwise it throws a warning
            abundances = abundances > 0

        # NOTE: see #291 for a discussion on using these metrics with normalized read counts.
        # skbio's counts-matrix validation is switched off on purpose so that normalized data
        # can be passed through.
        skbio_metric = metric
        if metric == "manhattan":
            skbio_metric = "cityblock"

        return skbio.diversity.beta_diversity(skbio_metric,
                                              abundances.values,
                                              abundances.index,
                                              validate=False)
Exemple #26
0
def _choose_boto3_chunksize(file_obj):
    """Pick the multipart chunk size to use when uploading `file_obj` with boto3.

    Candidate sizes run from 10 MiB to 100 MiB in 10 MiB steps. The smallest candidate that
    splits the file into fewer than 10000 chunks (the maximum) is returned. When the size of
    `file_obj` is unknown, a fixed 25 MiB default is returned instead.

    Parameters
    ----------
    file_obj : `FASTXInterleave`, `FilePassthru`, or a file-like object
        A wrapper around a pair of fastx files (`FASTXInterleave`) or a single fastx file
        (`FilePassthru`). Its size in bytes is read from the `_fsize` attribute; a missing or
        falsy `_fsize` is treated as "size unknown".

    Returns
    -------
    `int`
        The multipart chunk size in bytes.

    Raises
    ------
    OneCodexException
        If the file is too large to be accommodated even by the largest candidate chunk size.
    """
    mebibyte = 1024**2
    chunk_limit = 10000

    size_in_bytes = getattr(file_obj, "_fsize", None)
    if not size_in_bytes:
        # default to 25 mb
        return 25 * mebibyte

    candidates = [step * mebibyte for step in range(10, 110, 10)]
    for candidate in candidates:
        if math.ceil(size_in_bytes / candidate) < chunk_limit:
            # candidates are ascending, so the first fit is the smallest one
            return candidate

    # Nothing fit: report the limit implied by the largest candidate.
    max_file_size = candidates[-1] * chunk_limit
    if isinstance(file_obj, FASTXInterleave):
        uncompressed = "uncompressed "
    else:
        uncompressed = ""

    raise OneCodexException(
        "File is too large to upload ({}size: {}, max: {})".format(
            uncompressed, size_in_bytes, max_file_size))
Exemple #27
0
def get_project(project):
    """
    Get the actual project instance if the argument is not None and not already a Project.

    `None` and existing `Projects` instances are returned unchanged. Any other value is
    used as a lookup key, trying in order: `Projects.get(project)`,
    `Projects.where(name=project)` and `Projects.where(project_name=project)`. The first
    lookup that returns a truthy result wins.

    Returns the matching project: the first element if the winning lookup returned a list,
    otherwise the lookup result itself.

    Raises an exception if the project can't be found.
    """
    if project is None or isinstance(project, Projects):
        return project

    project_search = Projects.get(project)
    if not project_search:
        project_search = Projects.where(name=project)
    if not project_search:
        try:
            project_search = Projects.where(project_name=project)
        except HTTPError:
            # this lookup is best-effort; an HTTP failure counts as "not found"
            project_search = None
    if not project_search:
        raise OneCodexException(
            "{} is not a valid project UUID".format(project))

    if isinstance(project_search, list):
        return project_search[0]

    # A non-list result is the project itself. Return it rather than the raw
    # identifier the caller passed in.
    return project_search
Exemple #28
0
    def alpha_diversity(self,
                        metric=AlphaDiversityMetric.Shannon,
                        rank=Rank.Auto):
        """Calculate the diversity within a community.

        Parameters
        ----------
        metric : {'simpson', 'observed_taxa', 'shannon'}
            The diversity metric to calculate. Passing 'chao1' emits a
            `DeprecationWarning` recommending 'observed_taxa' instead.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        pandas.DataFrame
            The output of `skbio.diversity.alpha_diversity`, wrapped in a DataFrame with a
            single column named after `metric`.

        Raises
        ------
        OneCodexException
            If `metric` is not a value of `AlphaDiversityMetric`.
        """
        import pandas as pd
        import skbio.diversity

        if not AlphaDiversityMetric.has_value(metric):
            allowed_metrics = ", ".join(AlphaDiversityMetric.values())
            raise OneCodexException(
                "For alpha diversity, metric must be one of: {}".format(allowed_metrics))

        if metric == "chao1":
            warnings.warn(
                "`Chao1` is deprecated and will be removed in a future release. Please use `observed_taxa` instead.",
                DeprecationWarning,
            )

        abundances = self.to_df(rank=rank, normalize=self._guess_normalized())

        # skbio is asked for 'observed_otus' when the caller requests 'observed_taxa';
        # every other metric name is passed through unchanged.
        if metric == "observed_taxa":
            skbio_metric = "observed_otus"
        else:
            skbio_metric = metric

        # validate=False turns off skbio's input validation
        # (NOTE(review): presumably so normalized abundances are accepted -- confirm).
        diversity = skbio.diversity.alpha_diversity(
            skbio_metric, abundances.values, abundances.index, validate=False)

        # The column carries the metric name the caller asked for, not skbio's name.
        return pd.DataFrame(diversity, columns=[metric])
Exemple #29
0
def _check_for_ascii_filename(filename, coerce_ascii):
    """Return `filename` if it is ASCII, otherwise coerce or reject it.

    The name is transliterated with `unidecode`. If the result equals the input, the
    input is returned untouched. If it differs, the transliterated name is returned (and a
    warning is logged) when `coerce_ascii` is truthy; otherwise a `OneCodexException`
    is raised.
    """
    try:
        # python2: `unicode` does not exist on python3, where the lookup raises NameError
        transliterated = unidecode(unicode(filename))
    except NameError:
        transliterated = unidecode(filename)

    if filename == transliterated:
        return filename

    if not coerce_ascii:
        raise OneCodexException(
            "Filenames must be ascii. Try using --coerce-ascii")

    # TODO: Consider warnings.warn here instead
    message = "Renaming {} to {}, must be ASCII\n".format(
        filename.encode("utf-8"), transliterated)
    log.warning(message)
    return transliterated
Exemple #30
0
def interleave_palette(domain, palette="ocx"):
    """Return a list of colors with one entry per unique value in `domain`.

    Parameters
    ----------
    domain : iterable
        The values to be colored. Only the number of unique values is used, so the
        values must be hashable.
    palette : str or list, optional
        Either a key of `onecodex.viz.DEFAULT_PALETTES` or an explicit, non-empty list of
        colors. The "ocx" palette is reordered before use so that colors are spread across
        hues first and shades second.

    Returns
    -------
    list
        The palette, repeated or truncated to the number of unique values in `domain`.
        Empty when `domain` is empty.

    Raises
    ------
    OneCodexException
        If `palette` is neither a known palette name nor a non-empty list of colors.
    """
    if isinstance(palette, list):
        # Lists must be handled before the lookup below: they are unhashable, so testing
        # one for membership in a dict raises TypeError instead of returning False.
        if not palette:
            raise OneCodexException(
                "A valid palette name or list of colors must be passed")
        colors = palette
    else:
        from onecodex.viz import DEFAULT_PALETTES

        if palette not in DEFAULT_PALETTES:
            raise OneCodexException(
                "A valid palette name or list of colors must be passed")
        colors = DEFAULT_PALETTES[palette]

    n_rows = len(set(domain))
    if n_rows == 0:
        # Nothing to color; also avoids building an empty "ocx" sub-palette and then
        # dividing by its length below.
        return []

    # We do some shuffling to optimize the range of colours with our own palette
    if palette == "ocx":
        hues, shades = 6, 4

        # Calculate how many shades to show of each hue
        period = min(ceil(n_rows / hues), shades)

        # Save the darkest hue for last
        offset = 1 if period < shades else 0

        # Generate a sub-palette with each hue, at each shade
        sub_palettes = [
            colors[ix::shades] for ix in range(offset, period + offset)
        ]

        # Interleave the sub-palettes so the individuals colors are ordered by hue, then shade
        colors = list(chain.from_iterable(zip(*sub_palettes)))

    # Repeat the palette to extend it to the length of the domain
    full_repeats, remainder = divmod(n_rows, len(colors))
    return colors * full_repeats + colors[:remainder]