def plot_distance(analyses, metric='braycurtis', title=None, label=None,
                  xlabel=None, ylabel=None, field='readcount_w_children',
                  rank='species', **kwargs):
    """Draw a clustered heatmap of pairwise beta diversity distances.

    The distance function is selected by `metric`, evaluated on the
    normalized classifications and rendered with Seaborn's `sns.clustermap`.
    Additional **kwargs are passed to `sns.clustermap`; a `style` keyword,
    if present, is removed first and handed to `sns.set` (default 'darkgrid').

    Parameters
    ----------
    analyses : passed through to `normalize_classifications`
    metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac'}
        'bray-curtis', 'bray curtis' and 'cityblock' are accepted aliases.
    title, xlabel, ylabel : `str`, optional
        Text for the figure title and axis labels.
    label : passed through to `normalize_classifications`
    field, rank : passed through to the distance function

    Raises
    ------
    OneCodexException
        If `metric` is not recognized or fewer than two classification
        results are available after normalization.
    """
    # if taxonomy trees are inconsistent, unifrac will not work
    if metric in ['braycurtis', 'bray-curtis', 'bray curtis']:
        distance_func = braycurtis
    elif metric in ['manhattan', 'cityblock']:
        distance_func = cityblock
    elif metric == 'jaccard':
        distance_func = jaccard
    elif metric == 'unifrac':
        distance_func = unifrac
    else:
        raise OneCodexException("'metric' must be one of "
                                "braycurtis, manhattan, jaccard, or unifrac")

    classifications, metadata = normalize_classifications(analyses, label=label)
    if len(classifications) < 2:
        raise OneCodexException('`plot_distance` requires 2 or more valid classification results.')

    sns.set(style=kwargs.pop('style', 'darkgrid'))

    # there is no uniqueness constraint on metadata names
    # so plot by uuid, then replace the labels in the dataframe with their names
    known_ids = {}
    display_names = {}
    for position, analysis in enumerate(classifications):
        analysis_id = analysis.id
        known_ids[analysis_id] = analysis_id
        display_names[analysis_id] = metadata.loc[position, '_display_name']

    result = distance_func(classifications, field=field, rank=rank)
    ids = result.ids
    matrix = result.data

    # outer keys become the columns of the frame, inner keys the rows
    table = {
        known_ids[col_id]: {
            known_ids[row_id]: matrix[col_pos][row_pos]
            for row_pos, row_id in enumerate(ids)
        }
        for col_pos, col_id in enumerate(ids)
    }
    frame = pd.DataFrame(table).rename(index=display_names, columns=display_names)

    # Plot cluster map; ignore new SciPy cluster warnings
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', scipy.cluster.hierarchy.ClusterWarning)
        g = sns.clustermap(frame, **kwargs)

    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    # Labels
    if xlabel is not None:
        plt.gca().set_xlabel(xlabel)
    if ylabel is not None:
        plt.gca().set_ylabel(ylabel)
    if title:
        g.fig.suptitle(title)

    plt.show()
def __init__(self, file_path, file_size, file_format="fastq", progressbar=None):
    """Open a pair of FASTQ files so they can be read as one interleaved stream.

    Parameters
    ----------
    file_path : sequence of two `str`
        Paths of the left and right read files. If either path ends in
        ".gz" both are opened with gzip; otherwise if either ends in ".bz2"
        both are opened with bz2; otherwise both are opened as plain files.
    file_size : stored as `self._fsize`
    file_format : {"fastq"}
        "fasta" is recognized but rejected as unsupported.
    progressbar : optional
        Stored as `self.progressbar`.

    Raises
    ------
    OneCodexException
        If `file_format` is "fasta" or is not a known format.
    """
    # Validate the format before touching the filesystem so that a rejected
    # format never leaves open file handles behind.
    if file_format == "fasta":
        raise OneCodexException("Interleaving FASTA files is currently unsupported")
    elif file_format != "fastq":
        raise OneCodexException("file_format must be one of: fastq, fasta")
    self._lines_per_record = 4

    left_path, right_path = file_path[0], file_path[1]
    if left_path.endswith(".gz") or right_path.endswith(".gz"):
        opener = gzip.GzipFile
    elif left_path.endswith(".bz2") or right_path.endswith(".bz2"):
        opener = bz2.BZ2File
    else:
        opener = open

    self._fp_left = opener(left_path, mode="rb")
    try:
        self._fp_right = opener(right_path, mode="rb")
    except Exception:
        # do not leak the first handle when the second file cannot be opened
        self._fp_left.close()
        raise

    self._tell = 0
    self._fsize = file_size
    self._buf = Buffer()
    self.progressbar = progressbar
    self.mime_type = "text/plain"
def download(self, path=None):
    """
    Downloads the original reads file (FASTA/FASTQ) from One Codex.

    Note that this may only work from within a notebook session and the file
    is not guaranteed to exist for all One Codex plan types.

    Parameters
    ----------
    path : string, optional
        Full path to save the file to. If omitted, defaults to the original
        filename in the current working directory.

    Raises
    ------
    OneCodexException
        If an HTTP error occurs while requesting the download URI or while
        fetching the file. A 402 status gets a dedicated message.
    """
    if path is None:
        path = os.path.join(os.getcwd(), self.filename)

    try:
        url_data = self._resource.download_uri()
        resp = requests.get(url_data['download_uri'], stream=True)
        try:
            # requests does not raise on 4xx/5xx responses by itself; without
            # this check an error body would be saved as if it were the file
            resp.raise_for_status()

            # TODO: use tqdm or ProgressBar here to display progress?
            with open(path, 'wb') as f_out:
                for data in resp.iter_content(chunk_size=1024):
                    if data:  # skip empty keep-alive chunks
                        f_out.write(data)
        finally:
            # a streamed response holds its connection until closed
            resp.close()
    except HTTPError as exc:
        if exc.response.status_code == 402:
            raise OneCodexException(
                'You must either have a premium platform account or be in '
                'a notebook environment to download samples.')
        else:
            raise OneCodexException(
                'Download failed with an HTTP status code {}.'.format(
                    exc.response.status_code))
def _compute_distance(self, rank, metric):
    """Return the distance matrix for this collection under `metric`.

    Parameters
    ----------
    rank : taxonomic rank forwarded to the distance routine; must not be None
    metric : callable or a `BetaDiversityMetric` value
        A callable is invoked as ``metric(self, rank=rank)``. "bray-curtis",
        "bray curtis" and "manhattan" are accepted as aliases.

    Raises
    ------
    OneCodexException
        If `rank` is None or `metric` is not recognized.
    """
    if rank is None:
        raise OneCodexException(
            "Please specify a rank or 'auto' to choose automatically")

    # if taxonomy trees are inconsistent, unifrac will not work
    if callable(metric):
        return metric(self, rank=rank)

    if metric in (BetaDiversityMetric.BrayCurtis, "bray-curtis", "bray curtis"):
        return self.beta_diversity(metric=BetaDiversityMetric.BrayCurtis, rank=rank)

    if metric in ("manhattan", BetaDiversityMetric.CityBlock):
        return self.beta_diversity(metric=BetaDiversityMetric.CityBlock, rank=rank)

    if metric == BetaDiversityMetric.Jaccard:
        return self.beta_diversity(metric=BetaDiversityMetric.Jaccard, rank=rank)

    if metric == BetaDiversityMetric.WeightedUnifrac:
        return self.unifrac(weighted=True, rank=rank)

    if metric == BetaDiversityMetric.UnweightedUnifrac:
        return self.unifrac(weighted=False, rank=rank)

    if metric == BetaDiversityMetric.Aitchison:
        return self.beta_diversity(metric=BetaDiversityMetric.Aitchison, rank=rank)

    raise OneCodexException("Metric must be one of: {}".format(
        ", ".join(BetaDiversityMetric.values())))
def __init__(self, text=None, label=None):
    """Create a citation, or look up an existing one, in the IPython session.

    References are kept in ``get_ipython().meta["references"]`` as a mapping
    of ``label -> (number, text)``.

    Parameters
    ----------
    text : `str`, optional
        Citation text. If this exact text was cited before, its number is
        reused; otherwise the next number is assigned and the citation stored.
    label : `str`, optional
        Short name for the citation. When `text` is omitted the label must
        refer to an existing citation. A new citation without a label is
        stored under its number.

    Raises
    ------
    OneCodexException
        If neither argument is given, if not running inside IPython, if the
        text is already cited under a different label, if the label is
        already used by another citation, or if a label-only lookup fails.
    """
    if text is None and label is None:
        raise OneCodexException(
            "Please specify at least one of: text, label")

    self.text = text or ""
    self.label = label or ""

    try:
        ipy = get_ipython()
        self.ref_list = ipy.meta.get("references", {})
    except NameError:
        raise OneCodexException("Must be run from within IPython")

    if text:
        # has this reference already been cited?
        for ref_label, (ref_num, ref_text) in self.ref_list.items():
            if text == ref_text:
                if label and label != ref_label:
                    raise OneCodexException(
                        "Citation already in use with label={}".format(
                            ref_label))
                else:
                    self.ref_num = ref_num
                    break
        else:
            # reference has not been cited. is the label already in use?
            if label in self.ref_list:
                raise OneCodexException(
                    "Citation label={} already in use".format(label))

            # create the citation and assign next number
            if not self.ref_list:
                self.ref_num = 1
            else:
                self.ref_num = max(x[0] for x in self.ref_list.values()) + 1

            ref_label = label if label else self.ref_num

            self.ref_list[ref_label] = (self.ref_num, text)
            ipy.meta["references"] = self.ref_list
    elif label:
        if label not in self.ref_list:
            raise OneCodexException(
                "Cannot find citation with label={}".format(label))

        self.ref_num = self.ref_list[label][0]
def renderer_settings(svg_or_png=None, save_json=True, enable=True):
    """Configure the "onecodex" Vega/Altair renderer for this notebook session.

    Parameters
    ----------
    svg_or_png : `str` in {"png", "svg"} or `False` to disable saving images
        Save rendered image in PNG or SVG format in an output cell. `None`
        is treated as "svg".
    save_json : `bool`
        Store Altair-generated JSON in output cell.
    enable : `bool`
        If True, the renderer is enabled right after it is registered. If
        False, user must call `altair.renderers.enable("onecodex")` before
        changes will take effect.

    Raises
    ------
    OneCodexException
        If `svg_or_png` is not one of the accepted values.
    """
    if svg_or_png is None or svg_or_png == "svg":
        svg, png = True, False
    elif svg_or_png == "png":
        svg, png = False, True
    elif svg_or_png is False:
        svg, png = False, False
    else:
        raise OneCodexException("svg_or_png kwarg must be one of: png, svg")

    alt.renderers.register(
        "onecodex",
        partial(onecodex_renderer, svg=svg, png=png, save_json=save_json),
    )

    if enable:
        alt.renderers.enable("onecodex")
def beta_diversity(self, metric="braycurtis", rank="auto"):
    """Compute pairwise beta diversity between the samples in this collection.

    Parameters
    ----------
    metric : {'jaccard', 'braycurtis', 'cityblock'}
        The distance metric to calculate.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If `metric` is not one of the supported names.
    """
    import skbio.diversity

    supported = ("jaccard", "braycurtis", "cityblock")
    if metric not in supported:
        raise OneCodexException(
            "For beta diversity, metric must be one of: jaccard, braycurtis, cityblock"
        )

    abundances = self.to_df(rank=rank, normalize=self._guess_normalized())
    rows = [abundances.loc[sample_id].tolist() for sample_id in abundances.index]

    # NOTE: see #291 for a discussion on using these metrics with normalized read counts. we are
    # explicitly disabling skbio's check for a counts matrix to allow normalized data to make
    # its way into this function.
    return skbio.diversity.beta_diversity(
        metric, rows, abundances.index.tolist(), validate=False
    )
def filter(self, filter_func):
    """Build a new collection holding only the objects `filter_func` accepts.

    The new collection is created with the same class as this one and
    receives the keyword arguments stored in `self._kwargs`.

    Parameters
    ----------
    filter_func : `callable`
        Called once per object in the collection. Objects for which it
        returns a truthy value are kept; the rest are left out.

    Returns
    -------
    A new instance of this collection's class containing the kept objects.

    Raises
    ------
    OneCodexException
        If `filter_func` is not callable.

    Examples
    --------
    Generate a new collection of Samples that have a specific filename extension:

        new_collection = samples.filter(lambda s: s.filename.endswith('.fastq.gz'))
    """
    if not callable(filter_func):
        raise OneCodexException(
            "Please pass a function to filter: {}".format(
                type(filter_func).__name__))

    kept = [item for item in self if filter_func(item)]
    return self.__class__(kept, **self._kwargs)
def _check_valid_resource(self, other, check_for_dupes=True):
    """Validate objects before they are combined with this collection.

    Parameters
    ----------
    other : a single object or an iterable of objects
        A non-iterable value is treated as a one-element list.
    check_for_dupes : `bool`
        When True, also reject the input if the ids already held in
        `self._resource` together with the incoming ids contain a repeat.

    Raises
    ------
    ValueError
        If any object is not an instance of `self._oc_model`.
    OneCodexException
        If duplicates are found and `check_for_dupes` is True.
    """
    try:
        candidates = iter(other)
    except TypeError:
        candidates = [other]

    incoming_ids = []
    for candidate in candidates:
        if isinstance(candidate, self._oc_model):
            incoming_ids.append(candidate.id)
            continue
        raise ValueError(
            "Expected object of type '{}', got '{}'".format(
                self._oc_model.__name__, type(candidate).__name__))

    if not check_for_dupes:
        return

    # duplicates are not allowed
    combined_ids = [held.id for held in self._resource] + incoming_ids
    if len(set(combined_ids)) != len(combined_ids):
        raise OneCodexException(
            "{} cannot contain duplicate objects".format(
                self.__class__.__name__))
def unifrac(classifications, weighted=True, field='readcount_w_children', rank='species', strict=False):
    """
    A beta diversity metric that takes into account the relative relatedness
    of community members. Weighted UniFrac looks at abundances, unweighted
    UniFrac looks at presence.

    A single tree is accumulated over all classifications, placed under an
    artificial root, pruned to `rank`, and handed to
    `skbio.diversity.beta_diversity` together with the counts.

    Raises
    ------
    OneCodexException
        If `strict` is true and a classification's job id differs from the
        first classification's job id.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        tree = generate_skbio_tree(classification, existing_tree=tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    rooted_tree = TreeNode(name='fake root')
    rooted_tree.rank = 'no rank'
    rooted_tree.append(tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(rooted_tree, rank=rank)

    metric_name = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric_name, counts, ids,
                                          tree=rooted_tree, otu_ids=tax_ids)
def write_fastx_record(record, handler):
    """Write a single FASTA or FASTQ record to `handler`.

    A 2-element record is written in FASTA layout (">" header, sequence) and
    a 4-element record in FASTQ layout ("@" header and three further lines).
    No newline is written after the last field.

    Raises
    ------
    OneCodexException
        If `record` has any other number of elements.
    """
    field_count = len(record)
    if field_count not in (2, 4):
        raise OneCodexException('Unknown FASTX record format', record)

    template = '>{}\n{}' if field_count == 2 else '@{}\n{}\n{}\n{}'
    handler.write(template.format(*record))
def sort_helper(sort, values):
    """Return a sorted list of values for the Altair chart axes.

    Parameters
    ----------
    sort : callable, `list`, or a falsy value
        A callable receives the de-duplicated `values` and its return value
        is used as the order. A list is used as-is and must contain exactly
        the same items as `values`. A falsy value requests no explicit order.
    values : iterable
        The axis labels present in the dataset.

    Returns
    -------
    The sort order, or None when `sort` is falsy.

    Raises
    ------
    OneCodexException
        If a list does not match `values`, or if `sort` is truthy but is
        neither a callable nor a list.
    """
    if callable(sort):
        return sort(list(set(values)))

    if isinstance(sort, list):
        if set(sort) != set(values):
            raise OneCodexException(
                "sort_x must have the same items as your dataset.")
        return sort

    if sort:
        # Built from adjacent literals: a backslash continuation inside the
        # string would leak a stray backslash/whitespace run into the message.
        raise OneCodexException(
            "Please pass either a sorted list of values matching the axis labels "
            "or a function that returns a sorted list of labels")

    return None
def __init__(self, style=None):
    """Capture the references recorded so far in the IPython session.

    Parameters
    ----------
    style : `str`, optional
        Stored as `self.style`; None becomes an empty string.

    Raises
    ------
    OneCodexException
        If `get_ipython` is not available (not running inside IPython).
    """
    self.style = style if style is not None else ""

    try:
        shell = get_ipython()
    except NameError:
        raise OneCodexException("Must be run from within IPython")

    self.ref_list = shell.meta.get("references", {})
def __getattr__(self, key):
    """Resolve attributes that normal lookup did not find via the wrapped resource.

    When a `_resource` is available and `key` is described by the class-level
    schema, the value is read from `self._resource` and converted:

    - a `Resource` is wrapped in the model registered in `_model_lookup`
    - a list whose schema item type is "object" becomes a `ResourceList`
    - "id" is returned as a string (or None)
    - "date-time" values are parsed and returned in UTC
    - anything else is returned unchanged

    Without a `_resource`, "id" and schema-declared keys read as None.

    Raises
    ------
    OneCodexException
        If no registered model matches the reference pattern of a list field.
    AttributeError
        If `key` is not described by the schema.
    """
    if hasattr(self, "_resource") and hasattr(self.__class__, "_resource"):
        # the schema describes the identifier under "$uri" rather than "id"
        schema_key = key if key != "id" else "$uri"
        schema = self.__class__._resource._schema["properties"].get(
            schema_key)
        if schema is not None:
            value = getattr(self._resource, key)
            if isinstance(value, Resource):
                # convert potion resources into wrapped ones
                # (the lookup key is the URI with its last path segment removed)
                resource_path = value._uri.rsplit("/", 1)[0]
                return _model_lookup[resource_path](_resource=value)
            elif isinstance(value, list):
                if schema["items"]["type"] == "object":
                    # convert lists of potion resources into wrapped ones
                    compiled_re = re.compile(
                        schema["items"]["properties"]["$ref"]["pattern"])

                    # if the list we're returning is empty, we can't just infer what type of
                    # object belongs in this list from its contents. to account for this, we'll
                    # instead try to match the object's URI to those in our lookup table
                    for route, obj in _model_lookup.items():
                        if compiled_re.match(
                                "{}/dummy_lookup".format(route)):
                            return ResourceList(value, obj)

                    raise OneCodexException(
                        "No object found for {}".format(
                            compiled_re.pattern))
                else:
                    # otherwise, just return a regular list
                    return value
            else:
                if key == "id":
                    # undo the bad coercion from potion_client/resource.py#L111
                    if value is None:
                        return None
                    else:
                        return str(value)
                if schema.get(
                        "format") == "date-time" and value is not None:
                    # NOTE(review): `parse` is presumably dateutil's parser -- confirm
                    datetime_value = parse(value)
                    # naive timestamps are labeled as UTC; aware ones are converted to UTC
                    if datetime_value.tzinfo is None:
                        return pytz.utc.localize(datetime_value)
                    else:
                        return datetime_value.astimezone(pytz.utc)
                return value
        # schema is None here: fall through to the AttributeError below
    elif key == "id" or key in self.__class__._resource._schema[
            "properties"]:
        # make fields appear blank if there's no _resource bound to me
        return None

    raise AttributeError("'{}' object has no attribute '{}'".format(
        self.__class__.__name__, key))
def __init__(self, _resource=None, name=None, sample=None):
    """Create a tag, reusing an existing one when `name` already exists.

    Parameters
    ----------
    _resource : optional
        Forwarded to the parent constructor when no `name` is given.
    name : `str`, optional
        When given, existing tags are searched with ``self.where(name=name)``.
        A single match is reused; no match creates a new tag.
    sample : optional
        Forwarded to the parent constructor when a new tag is created.

    Raises
    ------
    OneCodexException
        If more than one existing tag matches `name`.
    """
    if not name:
        super(Tags, self).__init__(_resource=_resource)
        return

    # try to lookup Tags with a where call using kwargs
    matches = self.where(name=name)
    match_count = len(matches)

    if match_count > 1:
        raise OneCodexException("Multiple matches found for given criteria")

    if match_count == 1:
        self._resource = matches[0]._resource
    else:
        super(Tags, self).__init__(name=name, sample=sample)
def __init__(self, files, progressbar=None):
    """Wrap a pair of paired-end FASTQ files, stored as `self.r1` and `self.r2`.

    The two files may be given in either order; the filename patterns
    `R1_FILENAME_RE` and `R2_FILENAME_RE` decide which one is which.

    Parameters
    ----------
    files : sequence of two file paths
    progressbar : optional
        Passed to the `FilePassthru` wrapper of each file.

    Raises
    ------
    OneCodexException
        If `files` does not hold exactly two entries, if either file is not
        FASTQ, or if the names cannot be recognized as an R1/R2 pair.
    """
    if len(files) != 2:
        raise OneCodexException(
            "Paired files uploading can only take 2 files")

    if any(get_fastx_format(candidate) != "fastq" for candidate in files):
        raise OneCodexException(
            "Interleaving FASTA files is currently unsupported")

    first, second = files[0], files[1]
    if R1_FILENAME_RE.match(first) and R2_FILENAME_RE.match(second):
        forward, reverse = first, second
    elif R2_FILENAME_RE.match(first) and R1_FILENAME_RE.match(second):
        forward, reverse = second, first
    else:
        raise OneCodexException(
            "Paired files need to have _R1/_1 and _R2/_2 in their name")

    self.r1 = FilePassthru(forward, progressbar)
    self.r2 = FilePassthru(reverse, progressbar)
def _sample_collection_constructor(self, objects, skip_missing=True, field="auto", metric="auto", include_host=False, job=None):
    """Validate `objects` and initialize the collection from them.

    Parameters
    ----------
    objects : sequence of wrapped One Codex model objects
        All must carry a `_resource` attribute and be of the same class.
    skip_missing, include_host, job
        Stored in `self._kwargs` and forwarded to the parent constructor.
    field : deprecated alias of `metric`
        Only honored, with a `DeprecationWarning`, when set to something
        other than its default.
    metric
        Stored in `self._kwargs` and forwarded to the parent constructor.

    Raises
    ------
    OneCodexException
        If any object lacks `_resource`, or the objects are of mixed classes.
    """
    # `field` defaults to "auto", so a bare truthiness test fires on every
    # call and silently replaces whatever `metric` the caller passed. Only
    # treat `field` as supplied when it differs from that default.
    if field and field != "auto":
        warnings.warn(
            "The `field` parameter has been renamed to `metric`. Passing `field` to a SampleCollection is deprecated and will be removed in a future release.",
            DeprecationWarning,
        )
        metric = field

    # are they all wrapped potion resources?
    if not all(hasattr(obj, "_resource") for obj in objects):
        raise OneCodexException(
            "SampleCollection can only contain One Codex Samples or Classifications objects"
        )

    # are they all the same model?
    object_classes = [type(obj) for obj in objects]
    if len(set(object_classes)) > 1:
        raise OneCodexException(
            "SampleCollection can contain Samples or Classifications, but not both"
        )

    resources = [obj._resource for obj in objects]
    model = objects[0].__class__

    self._kwargs = {
        "skip_missing": skip_missing,
        "metric": metric,
        "include_host": include_host,
        "job": job,
    }

    super(SampleCollection, self).__init__(resources, model, **self._kwargs)
def __init__(self, url, position="left", style=None):
    """Store a logo URL together with its placement class and inline style.

    Parameters
    ----------
    url : stored as `self.url`
    position : {"left", "center", "right"}
        Selects `self.classes` ("logo-left", "logo-center" or "logo-right").
    style : `str`, optional
        Stored as `self.style`; None becomes an empty string.

    Raises
    ------
    OneCodexException
        If `position` is not one of the accepted values.
    """
    self.url = url
    self.style = style if style is not None else ""

    placements = (
        ("left", "logo-left"),
        ("center", "logo-center"),
        ("right", "logo-right"),
    )
    for allowed, css_class in placements:
        if position == allowed:
            self.classes = css_class
            break
    else:
        raise OneCodexException(
            "position must be one of: left, right, center")
def unifrac(self, weighted=True, rank="auto"):
    """Compute UniFrac beta diversity between the samples in this collection.

    UniFrac takes into account the relatedness of community members. Weighted
    UniFrac considers abundances, unweighted UniFrac considers presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If the collection's data appears to be normalized.
    """
    import skbio.diversity
    from skbio.tree import TreeNode

    # needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)
    counts = [df.loc[c_id].tolist() for c_id in df.index]
    tax_ids = df.keys().tolist()

    pruned = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    rooted_tree = TreeNode(name="fake root")
    rooted_tree.rank = "no rank"
    rooted_tree.append(pruned)

    # then finally run the calculation and return
    metric_name = "weighted_unifrac" if weighted else "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric_name, counts, df.index.tolist(), tree=rooted_tree, otu_ids=tax_ids
    )
def _classification_fetch(self, skip_missing=None):
    """Transform a list of Samples or Classifications into a list of Classifications objects.

    Parameters
    ----------
    skip_missing : `bool`, optional
        If an analysis was not successful, exclude it, warn, and keep going.
        When None, the value given at construction time
        (``self._kwargs["skip_missing"]``) is used.

    Returns
    -------
    None, but stores results in self._cached under "classifications" and
    "is_metagenomic".

    Raises
    ------
    OneCodexException
        If an object is neither a Samples nor a Classifications instance.
    """
    from onecodex.models import Classifications, Samples

    # Fall back to the collection-wide setting only when the caller made no
    # choice; a truthiness test here would discard an explicit False.
    if skip_missing is None:
        skip_missing = self._kwargs["skip_missing"]

    new_classifications = []
    for obj in self._res_list:
        if isinstance(obj, Samples):
            classification = obj.primary_classification
        elif isinstance(obj, Classifications):
            classification = obj
        else:
            raise OneCodexException(
                "Objects in SampleCollection must be one of: Classifications, Samples"
            )

        if skip_missing and not classification.success:
            warnings.warn(
                "Classification {} not successful. Skipping.".format(
                    classification.id))
            continue

        new_classifications.append(classification)

    # warn if some of the classifications in this collection are not alike
    job_names = set(obj.job.name for obj in new_classifications)
    if len(job_names) > 1:
        warnings.warn(
            "SampleCollection contains multiple analysis types: {}".format(
                ", ".join(job_names)))

    # only a collection with a single job name containing
    # "One Codex Database" is flagged as metagenomic
    self._cached["is_metagenomic"] = False
    if len(job_names) == 1 and "One Codex Database" in list(job_names)[0]:
        self._cached["is_metagenomic"] = True

    self._cached["classifications"] = new_classifications
def interleaved_filename(file_path):
    """Derive the filename that represents a set of paired-end files.

    Only the first path of the pair is inspected. If it contains an
    Illumina-style read marker (a "." or "_" followed by R1/R2 in either
    case and another "." or "_"), every ".R1"/"_R1"/".R2"/"_R2" style marker
    is removed from it. Otherwise a warning is issued and the first path is
    returned unchanged.

    Raises
    ------
    OneCodexException
        If `file_path` is not a tuple.
    """
    if not isinstance(file_path, tuple):
        raise OneCodexException(
            "Cannot get the interleaved filename without a tuple.")

    first_path = file_path[0]
    if re.match(".*[._][Rr][12][_.].*", first_path) is None:
        warnings.warn(
            "Paired-end filenames do not match--are you sure they are correct?"
        )
        return first_path

    return re.sub("[._][Rr][12]", "", first_path)
def _compute_distance(self, rank, metric):
    """Return the distance matrix for this collection under `metric`.

    Parameters
    ----------
    rank : taxonomic rank forwarded to the distance routine; must not be None
    metric : callable or `str`
        A callable is invoked as ``metric(self, rank=rank)``. Accepted names
        are braycurtis ("bray-curtis", "bray curtis"), manhattan
        ("cityblock"), jaccard, weighted_unifrac ("unifrac") and
        unweighted_unifrac.

    Raises
    ------
    OneCodexException
        If `rank` is None or `metric` is not recognized.
    """
    if rank is None:
        raise OneCodexException(
            "Please specify a rank or 'auto' to choose automatically")

    # if taxonomy trees are inconsistent, unifrac will not work
    if callable(metric):
        return metric(self, rank=rank)

    # accepted spellings -> name understood by `beta_diversity`
    beta_aliases = (
        (("braycurtis", "bray-curtis", "bray curtis"), "braycurtis"),
        (("manhattan", "cityblock"), "cityblock"),
        (("jaccard",), "jaccard"),
    )
    for spellings, canonical in beta_aliases:
        if metric in spellings:
            return self.beta_diversity(metric=canonical, rank=rank)

    if metric in ("unifrac", "weighted_unifrac"):
        return self.unifrac(weighted=True, rank=rank)

    if metric == "unweighted_unifrac":
        return self.unifrac(weighted=False, rank=rank)

    raise OneCodexException(
        "Metric must be one of: braycurtis, manhattan, jaccard, "
        "weighted_unifrac, unweighted_unifrac")
def __init__(self, text, heading=None, fignum=None, style=None):
    """Store caption text and determine its figure number.

    Parameters
    ----------
    text : stored as `self.text`
    heading : optional
        Stored as `self.heading` followed by one space; None becomes "".
    fignum : optional
        Explicit figure number. When None, the session-wide counter in
        ``get_ipython().meta["figure_count"]`` is incremented and used.
    style : `str`, optional
        Stored as `self.style`; None becomes an empty string.

    Raises
    ------
    OneCodexException
        If `fignum` is None and `get_ipython` is not available.
    """
    self.heading = "{} ".format(heading) if heading is not None else ""
    self.text = text
    self.style = style if style is not None else ""

    if fignum is not None:
        self.fignum = fignum
        return

    try:
        ipy = get_ipython()
        self.fignum = ipy.meta.get("figure_count", 0) + 1
    except NameError:
        raise OneCodexException("Must be run from within IPython")

    # remember the number so the next automatic caption continues from it
    ipy.meta["figure_count"] = self.fignum
def alpha_diversity(self, metric="simpson", rank="auto"):
    """Compute an alpha diversity value for each sample in this collection.

    Parameters
    ----------
    metric : {'simpson', 'chao1', 'shannon'}
        The diversity metric to calculate.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    pandas.DataFrame indexed by "classification_id" with a single column
    named after `metric`.

    Raises
    ------
    OneCodexException
        If `metric` is not supported or the data appears to be normalized.
    """
    import skbio.diversity

    if metric not in ("simpson", "chao1", "shannon"):
        raise OneCodexException(
            "For alpha diversity, metric must be one of: simpson, chao1, shannon"
        )

    # needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("Alpha diversity requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)

    classification_ids = []
    diversity_values = []
    for c_id in df.index:
        per_sample = skbio.diversity.alpha_diversity(metric, df.loc[c_id].tolist(), [c_id])
        classification_ids.append(c_id)
        diversity_values.append(per_sample.values[0])

    table = pd.DataFrame({"classification_id": classification_ids, metric: diversity_values})
    return table.set_index("classification_id")
def beta_diversity(self, metric=BetaDiversityMetric.BrayCurtis, rank=Rank.Auto):
    """Calculate the diversity between two communities.

    Parameters
    ----------
    metric : {'jaccard', 'braycurtis', 'cityblock', 'manhattan', 'aitchison'}
        The distance metric to calculate. Note that 'cityblock' and 'manhattan' are
        equivalent metrics. The UniFrac metrics are delegated to `self.unifrac`
        and 'aitchison' to `self.aitchison_distance`.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If `metric` is not a `BetaDiversityMetric` value.
    """
    import skbio.diversity

    if not BetaDiversityMetric.has_value(metric):
        raise OneCodexException(
            "For beta diversity, metric must be one of: {}".format(
                ", ".join(BetaDiversityMetric.values())))

    # Delegated metrics are dispatched before the abundance table is built,
    # since these branches never use it.
    if metric == BetaDiversityMetric.WeightedUnifrac:
        return self.unifrac(weighted=True, rank=rank)
    elif metric == BetaDiversityMetric.UnweightedUnifrac:
        return self.unifrac(weighted=False, rank=rank)
    elif metric == BetaDiversityMetric.Aitchison:
        return self.aitchison_distance(rank=rank)

    df = self.to_df(rank=rank, normalize=self._guess_normalized())

    if metric == BetaDiversityMetric.Jaccard:
        df = df > 0  # Jaccard requires a boolean matrix, otherwise it throws a warning

    # NOTE: see #291 for a discussion on using these metrics with normalized read counts. we are
    # explicitly disabling skbio's check for a counts matrix to allow normalized data to make
    # its way into this function.
    skbio_metric = "cityblock" if metric == "manhattan" else metric
    return skbio.diversity.beta_diversity(skbio_metric, df.values, df.index, validate=False)
def _choose_boto3_chunksize(file_obj):
    """Pick the multipart chunk size to use when uploading `file_obj`.

    The candidates are 10 MiB to 100 MiB in 10 MiB steps; the smallest one
    that splits the file into fewer than 10000 chunks is chosen. The size is
    read from the object's `_fsize` attribute; when that attribute is
    missing or falsy, 25 MiB is used.

    Parameters
    ----------
    file_obj : `FASTXInterleave`, `FilePassthru`, or a file-like object

    Returns
    -------
    `int`
        The multipart chunk size in bytes.

    Raises
    ------
    OneCodexException
        If even the largest candidate would need 10000 or more chunks.
    """
    mebibyte = 1024**2
    total_size = getattr(file_obj, "_fsize", None)

    if not total_size:
        # default to 25 mb
        return 25 * mebibyte

    candidates = [step * mebibyte for step in range(10, 110, 10)]
    chosen = next(
        (size for size in candidates if math.ceil(total_size / size) < 10000),
        None,
    )
    if chosen is not None:
        return chosen

    max_file_size = candidates[-1] * 10000
    uncompressed = "uncompressed " if isinstance(
        file_obj, FASTXInterleave) else ""
    raise OneCodexException(
        "File is too large to upload ({}size: {}, max: {})".format(
            uncompressed, total_size, max_file_size))
def get_project(project):
    """
    Get the actual project instance if the argument is not None and not already a Project.

    The value is tried, in order, as an argument to `Projects.get`, as a
    `name` filter and as a `project_name` filter. `None` and existing
    `Projects` instances are returned unchanged.

    Raises an exception if the project can't be found.
    """
    if project is None or isinstance(project, Projects):
        return project

    project_search = Projects.get(project)

    if not project_search:
        project_search = Projects.where(name=project)

    if not project_search:
        try:
            project_search = Projects.where(project_name=project)
        except HTTPError:
            project_search = None

    if not project_search:
        raise OneCodexException(
            "{} is not a valid project UUID".format(project))

    if isinstance(project_search, list):
        return project_search[0]

    # A direct `Projects.get` hit is a single object, not a list; return it
    # rather than falling through and handing back the raw identifier.
    return project_search
def alpha_diversity(self, metric=AlphaDiversityMetric.Shannon, rank=Rank.Auto):
    """Compute an alpha diversity value for each sample in this collection.

    Parameters
    ----------
    metric : {'simpson', 'observed_taxa', 'shannon'}
        The diversity metric to calculate. 'chao1' triggers a
        `DeprecationWarning`.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    pandas.DataFrame with a single column named after `metric`.

    Raises
    ------
    OneCodexException
        If `metric` is not an `AlphaDiversityMetric` value.
    """
    import pandas as pd
    import skbio.diversity

    if not AlphaDiversityMetric.has_value(metric):
        supported = ", ".join(AlphaDiversityMetric.values())
        raise OneCodexException(
            "For alpha diversity, metric must be one of: {}".format(supported))

    if metric == "chao1":
        warnings.warn(
            "`Chao1` is deprecated and will be removed in a future release. Please use `observed_taxa` instead.",
            DeprecationWarning,
        )

    df = self.to_df(rank=rank, normalize=self._guess_normalized())

    # "observed_taxa" is passed to skbio as "observed_otus"
    skbio_metric = metric
    if metric == "observed_taxa":
        skbio_metric = "observed_otus"

    values = skbio.diversity.alpha_diversity(skbio_metric, df.values, df.index, validate=False)
    return pd.DataFrame(values, columns=[metric])
def _check_for_ascii_filename(filename, coerce_ascii):
    """Return `filename` if it is ASCII, otherwise coerce it or fail.

    The name is transliterated with `unidecode`. If the result differs from
    the input, the transliterated name is returned (after logging a warning)
    when `coerce_ascii` is set; otherwise an exception is raised.

    Raises
    ------
    OneCodexException
        If the filename is not ASCII and `coerce_ascii` is falsy.
    """
    try:
        # python2
        ascii_fname = unidecode(unicode(filename))
    except NameError:
        ascii_fname = unidecode(filename)

    if filename == ascii_fname:
        return filename

    if not coerce_ascii:
        raise OneCodexException(
            "Filenames must be ascii. Try using --coerce-ascii")

    # TODO: Consider warnings.warn here instead
    log.warning("Renaming {} to {}, must be ASCII\n".format(
        filename.encode("utf-8"), ascii_fname))
    return ascii_fname
def interleave_palette(domain, palette="ocx"):
    """Return a list of colors, one per unique value in `domain`.

    Parameters
    ----------
    domain : iterable of hashable values
        Only the number of distinct values matters.
    palette : `str` or `list`
        Name of an entry in `onecodex.viz.DEFAULT_PALETTES`, or an explicit
        non-empty list of colors. The "ocx" palette is reordered so that
        colors are spread across hues before shades are repeated.

    Returns
    -------
    `list`
        The palette repeated or truncated to the number of distinct values
        in `domain`; empty when `domain` is empty.

    Raises
    ------
    OneCodexException
        If `palette` is neither a known palette name nor a non-empty list.
    """
    from onecodex.viz import DEFAULT_PALETTES

    # Check for a list first: lists are unhashable, so testing one for
    # membership in a dict-based DEFAULT_PALETTES (presumably a dict -- TODO
    # confirm) raises TypeError before the list branch could be reached.
    if isinstance(palette, list):
        colors = palette
    elif palette in DEFAULT_PALETTES:
        colors = DEFAULT_PALETTES[palette]
    else:
        raise OneCodexException(
            "A valid palette name or list of colors must be passed")

    if not colors:
        # an empty list cannot be extended to any length
        raise OneCodexException(
            "A valid palette name or list of colors must be passed")

    n_rows = len(set(domain))
    if n_rows == 0:
        # nothing to color; also avoids building an empty "ocx" sub-palette
        # and dividing by its zero length below
        return []

    # We do some shuffling to optimize the range of colours with our own palette
    if palette == "ocx":
        hues, shades = 6, 4

        # Calculate how many shades to show of each hue
        period = min(ceil(n_rows / hues), shades)

        # Save the darkest hue for last
        offset = 1 if period < shades else 0

        # Generate a sub-palette with each hue, at each shade
        sub_palettes = [
            colors[ix::shades] for ix in range(offset, period + offset)
        ]

        # Interleave the sub-palettes so the individuals colors are ordered by hue, then shade
        colors = list(chain.from_iterable(zip(*sub_palettes)))

    # Repeat the palette to extend it to the length of the domain
    repeats, remainder = divmod(n_rows, len(colors))
    return colors * repeats + colors[:remainder]