def __init__(self, filename="kraken.out"):
    """.. rubric:: **constructor**

    Set up the taxonomy helper in ``self.tax`` and, when *filename* is
    truthy, parse the kraken output straight away.

    :param filename: the input from KrakenAnalysis class

    """
    self.filename = filename
    # Always defined, so that methods reading this flag work even when no
    # filename was given.
    self._data_created = False

    on_rtd = os.environ.get("READTHEDOCS", None) == "True"

    if not on_rtd:
        from biokit import Taxonomy
        self.tax = Taxonomy(verbose=True)
        self.tax._load_flat_file()  # make sure it is available locally
    else:
        # When READTHEDOCS is set, a small stand-in backed by a CSV file
        # shipped with sequana replaces biokit's Taxonomy.
        class Taxonomy(object):
            from sequana import sequana_data  # must be local
            df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
                             index_col=0)

            def get_lineage_and_rank(self, x):
                """Return ``(value, rank)`` pairs for taxon *x*."""
                # Note that we add the name as well here
                ranks = ['kingdom', 'phylum', 'class', 'order',
                         'family', 'genus', 'species', 'name']
                # .loc replaces DataFrame.ix, removed in pandas 1.0
                return [(self.df.loc[x, rank], rank) for rank in ranks]

        self.tax = Taxonomy()

    if filename:
        # This initialises the data
        self._parse_data()
def __init__(self, filename="kraken.out"):
    """.. rubric:: **constructor**

    Set up the taxonomy helper in ``self.tax`` and, when *filename* is
    truthy, parse the kraken output straight away.

    :param filename: the input from KrakenAnalysis class

    """
    self.filename = filename
    # Always defined, so that methods reading this flag work even when no
    # filename was given.
    self._data_created = False

    on_rtd = os.environ.get("READTHEDOCS", None) == "True"

    if not on_rtd:
        from biokit import Taxonomy
        self.tax = Taxonomy(verbose=True)
        self.tax._load_flat_file()  # make sure it is available locally
    else:
        # When READTHEDOCS is set, a small stand-in backed by a CSV file
        # shipped with sequana replaces biokit's Taxonomy.
        class Taxonomy(object):
            from sequana import sequana_data  # must be local
            df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
                             index_col=0)

            def get_lineage_and_rank(self, x):
                """Return ``(value, rank)`` pairs for taxon *x*."""
                # Note that we add the name as well here
                ranks = ['kingdom', 'phylum', 'class', 'order',
                         'family', 'genus', 'species', 'name']
                # .loc replaces DataFrame.ix, removed in pandas 1.0
                return [(self.df.loc[x, rank], rank) for rank in ranks]

        self.tax = Taxonomy()

    if filename:
        # This initialises the data
        self._parse_data()
def test_taxonomy():
    """Check lineage and record lookups on a freshly loaded Taxonomy."""
    taxonomy = Taxonomy()
    taxonomy.load_records()

    # Lineage of taxon 9606
    lineage_9606 = taxonomy.get_lineage(9606)
    assert len(lineage_9606) == 31
    assert 'Mammalia' in lineage_9606

    # Record lookups by identifier and by name must agree
    record = taxonomy.fetch_by_id('10090')
    assert record['name'] == 'Mus musculus'
    matches = taxonomy.fetch_by_name('Mus Musculus')
    assert matches[0]['id'] == '10090'

    # These two calls are only exercised; their results are not inspected
    taxonomy.get_lineage_and_rank(9606)
    taxonomy.get_family_tree(9606)
class KrakenResults(object):
    """Translate Kraken results into a Krona-compatible file

    If you run a kraken analysis with :class:`KrakenAnalysis`, you will end up
    with a file e.g. named kraken.out (by default).

    You could use kraken-translate but then you need extra parsing to convert
    into a Krona-compatible file. Here, we take the output from kraken and
    directly transform it to a krona-compatible file.

    ::

        k = KrakenResults("kraken.out")
        k.kraken_to_krona()

    The expected format looks like::

        C    HISEQ:426:C5T65ACXX:5:2301:18719:16377    1    203    1:71 A:31 1:71
        C    HISEQ:426:C5T65ACXX:5:2301:21238:16397    1    202    1:71 A:31 1:71

    Where each row corresponds to one read.

    ::

        "562:13 561:4 A:31 0:1 562:3" would indicate that:

        the first 13 k-mers mapped to taxonomy ID #562
        the next 4 k-mers mapped to taxonomy ID #561
        the next 31 k-mers contained an ambiguous nucleotide
        the next k-mer was not in the database
        the last 3 k-mers mapped to taxonomy ID #562

    See kraken documentation for details.

    .. note:: a taxon of ID 1 (root) means that the read is classified but in
        a different domain. https://github.com/DerrickWood/kraken/issues/100

    .. note:: This takes care of fetching taxons and the corresponding
        lineages from online web services.

    """

    def __init__(self, filename="kraken.out"):
        """.. rubric:: **constructor**

        Set up the taxonomy helper in ``self.tax`` and, when *filename* is
        truthy, parse the kraken output straight away.

        :param filename: the input from KrakenAnalysis class

        """
        self.filename = filename
        # Always defined, so that plot() and to_js() can read the flag even
        # when no filename was given.
        self._data_created = False

        on_rtd = os.environ.get("READTHEDOCS", None) == "True"

        if not on_rtd:
            from biokit import Taxonomy
            self.tax = Taxonomy(verbose=True)
            self.tax._load_flat_file()  # make sure it is available locally
        else:
            # When READTHEDOCS is set, a small stand-in backed by a CSV file
            # shipped with sequana replaces biokit's Taxonomy.
            class Taxonomy(object):
                from sequana import sequana_data  # must be local
                df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
                                 index_col=0)

                def get_lineage_and_rank(self, x):
                    """Return ``(value, rank)`` pairs for taxon *x*."""
                    # Note that we add the name as well here
                    ranks = ['kingdom', 'phylum', 'class', 'order',
                             'family', 'genus', 'species', 'name']
                    # .loc replaces DataFrame.ix, removed in pandas 1.0
                    return [(self.df.loc[x, rank], rank) for rank in ranks]

            self.tax = Taxonomy()

        if filename:
            # This initialises the data
            self._parse_data()

    def get_taxonomy_biokit(self, ids):
        """Retrieve taxons given a list of taxons

        :param list ids: list of taxons as strings or integers. Could also
            be a single string or a single integer
        :return: a dataframe indexed by taxon (as integers) with one column
            per main rank plus a ``name`` column. An empty dataframe is
            returned when *ids* is empty.

        .. note:: the first call first loads all taxons in memory and takes a
            few seconds but subsequent calls are much faster
        """
        # filter the lineage to keep only information from one of the main
        # rank that is superkingdom, kingdom, phylum, class, order, family,
        # genus and species
        ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                 'species')

        # Normalise the input to a list: a string is a single identifier,
        # any other iterable is a collection of identifiers, and anything
        # that cannot be iterated (int, numpy integer) is a single one.
        if isinstance(ids, str):
            ids = [ids] if ids else []
        else:
            try:
                ids = list(ids)
            except TypeError:
                ids = [ids]

        if not ids:
            return pd.DataFrame()

        logger.info('Retrieving taxon using biokit.Taxonomy')
        lineage = [self.tax.get_lineage_and_rank(x) for x in ids]

        # Now, we filter each lineage to keep only relevant ranks
        # We drop the 'no rank' and create a dictionary
        results = []
        for this in lineage:
            default = dict.fromkeys(ranks, ' ')
            for entry in this:
                if entry[1] in ranks:
                    default[entry[1]] = entry[0]
                elif entry[1] == "superkingdom":
                    default["kingdom"] = entry[0]
            # Scientific name is the last entry tagged as no_rank following
            # species TODO (check this assumption)
            # e.g. 351680 and 151529 have same 7 ranks so to differentiate
            # them, the scientific name should be used.
            # By default, we will take the last one. If species or genus, we
            # repeat the term
            try:
                default['name'] = this[-1][0]
            except (IndexError, TypeError):
                # no usable lineage entry for this taxon
                default['name'] = "root (ambigous kingdom)"
            results.append(default)

        df = pd.DataFrame.from_records(results)
        df.index = ids
        df = df[list(ranks) + ['name']]
        df.index = df.index.astype(int)

        return df

    def _parse_data(self):
        """Read the kraken output and populate the summary attributes.

        Sets ``_df`` (columns status, taxon, length), ``_taxons`` (number of
        reads per taxon, taxon 0 excluded, sorted by decreasing count),
        ``classified`` and ``unclassified``.

        An empty input file gives an empty result with both counters set to
        0 (the total number of reads cannot be recovered from an empty
        file). Errors raised by pandas while parsing are propagated.
        """
        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]

        if os.path.getsize(self.filename) == 0:
            logger.warning("Kraken output file %s is empty" % self.filename)
            self.unclassified = 0
            self.classified = 0
            self._df = pd.DataFrame([], columns=columns)
            self._taxons = pd.Series([], dtype="int64")
            return

        # we select only col 0,2,3 to save memory, which is required on very
        # large files. The file is read by chunks of 1M rows and the chunks
        # are concatenated once. Starting from a fresh list also means that
        # calling this method again does not duplicate the rows.
        reader = pd.read_csv(self.filename, sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1000000)
        self._df = pd.concat(list(reader))
        self._df.columns = columns

        count = int((self._df.taxon == 1).sum())
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        taxons = self._df.groupby("taxon").size()
        if 0 in taxons.index:
            # taxon 0 is dropped from the per-taxon counts
            taxons = taxons.drop(0)
        self._taxons = taxons.sort_values(ascending=False)

        category = self._df.groupby("status").size()
        self.classified = category.get('C', 0)
        self.unclassified = category.get('U', 0)

    def _get_taxons(self):
        """Return the per-taxon read counts, parsing the file if needed."""
        try:
            return self._taxons
        except AttributeError:
            self._parse_data()
            return self._taxons
    taxons = property(_get_taxons)

    def _get_df(self):
        """Return the parsed kraken table, parsing the file if needed."""
        try:
            return self._df
        except AttributeError:
            self._parse_data()
            return self._df
    df = property(_get_df)

    def _get_df_with_taxon(self, dbname):
        """Build a per-taxon table with counts and percentages.

        :param dbname: directory that may contain an ``annotations.csv``
            file. When present, its ``ena``, ``gi`` and ``description``
            columns are merged in and the percentage is formatted as a
            string.
        :return: a dataframe sorted by decreasing percentage, including one
            extra "Unclassified" row with taxon -1.
        """
        # line 14500
        # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome
        df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
        df['count'] = self.taxons.values
        df.reset_index(inplace=True)

        newrow = len(df)
        df.loc[newrow] = "Unclassified"
        df.loc[newrow, 'count'] = self.unclassified
        df.loc[newrow, 'index'] = -1
        df.rename(columns={"index": "taxon"}, inplace=True)
        # The string row above may have turned these columns into object
        # columns; restore integers so arithmetic and lookups are numeric.
        df["taxon"] = df["taxon"].astype(int)
        df["count"] = df["count"].astype(int)
        df["percentage"] = df["count"] / df["count"].sum() * 100

        # Now get back all annotations from the database itself.
        filename = os.path.join(dbname, "annotations.csv")
        annotated = os.path.exists(filename)
        if annotated:
            annotations = pd.read_csv(filename)
            # There are duplicates somehow. Let us keep the first one for now
            annotations = annotations.drop_duplicates(
                subset="taxon", keep="first").set_index("taxon")
            # reindex gives one row per taxon, filled with NaN when the
            # taxon has no annotation
            df2 = annotations.reindex(df["taxon"].values)
            df2 = df2[['ena', 'gi', 'description']]
            df2.index.name = "taxon"
            self.df2 = df2
            self.df1 = df.set_index("taxon")
            df = pd.merge(self.df1, df2, left_index=True, right_index=True)
            df.reset_index(inplace=True)
            starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
            middle = [x for x in df.columns
                      if x not in starter and x != "description"]
            df = df[starter + middle + ["description"]]
            df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
        else:
            starter = ['taxon', 'count', 'percentage']
            df = df[starter + [x for x in df.columns if x not in starter]]

        # Sort while the percentage is still numeric; sorting the formatted
        # strings would order them lexicographically.
        df = df.sort_values(by="percentage", ascending=False)
        if annotated:
            from easydev import precision
            df['percentage'] = [str(precision(x, 2))
                                for x in df['percentage']]
        return df

    def kraken_to_csv(self, filename, dbname):
        """Save the per-taxon table as CSV in *filename* and return it."""
        df = self._get_df_with_taxon(dbname)
        df.to_csv(filename, index=False)
        return df

    def kraken_to_json(self, filename, dbname):
        """Save the per-taxon table as JSON in *filename* and return it."""
        df = self._get_df_with_taxon(dbname)
        df.to_json(filename)
        return df

    def kraken_to_krona(self, output_filename=None, mode=None, nofile=False):
        """Write a Krona-compatible text summary of the kraken results.

        :param output_filename: where to save the summary. Defaults to the
            input filename followed by ``.summary``
        :param mode: if set to "adapters", names are looked up in the
            sequana adapters database instead of the taxonomy
        :param nofile: currently unused
        :return: status: True if everything went fine otherwise False
        """
        if output_filename is None:
            output_filename = self.filename + ".summary"

        taxon_to_find = list(self.taxons.index)
        if not taxon_to_find:
            logger.warning(
                "No reads were identified. You will need a more complete database")
            self.output_filename = output_filename
            with open(output_filename, "w") as fout:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            return False

        # Reads classified as root (1) are kept on purpose.

        if mode != "adapters":
            df = self.get_taxonomy_biokit(taxon_to_find)
            # every column but the last one (name) makes up the lineage
            self.lineage = [";".join(this)
                            for this in df[df.columns[0:-1]].values]
            self.scnames = list(df['name'].values)
        else:
            # Let us get the known adapters and their identifiers
            from sequana.adapters import AdapterDB
            adapters = AdapterDB()
            adapters.load_all()
            known = set(adapters.df.identifier)

            self.scnames = []
            for taxon in self.taxons.index:
                if str(taxon) == "1" or str(taxon) not in known:
                    self.scnames.append('unknown')
                else:
                    self.scnames.append(adapters.get_name(taxon))
            self.lineage = ["Adapters;%s" % x for x in self.scnames]

        assert len(self.lineage) == len(self.taxons)
        assert len(self.scnames) == len(self.taxons)

        # Now save the file
        self.output_filename = output_filename
        unclassified = getattr(self, "unclassified", None)
        with open(output_filename, "w") as fout:
            for taxon, lineage, scname in zip(taxon_to_find, self.lineage,
                                              self.scnames):
                count = self.taxons.loc[taxon]
                line = str(count) + "\t" + "\t".join(lineage.split(';'))
                line += " " + scname
                fout.write(line + '\n')
            if unclassified is not None:
                fout.write("%s\t%s" % (unclassified, "Unclassified"))
        self._data_created = True
        return True

    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
             textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        :param kind: either "pie" or "barh"
        :param cmap: colormap used for the pie chart
        :param threshold: percentage (strictly between 0 and 100). Taxons
            below this value are summed up into an "others" entry; taxons at
            or above it are shown individually
        :param radius: radius of the pie chart
        :param textcolor: color of the pie chart labels
        :param kargs: passed on to the pandas plot call
        :return: None if no taxon were found and a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self.df) == 0:
            return None

        if not self._data_created:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return None

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_biokit(list(self.taxons.index))
        # extra row (label -1) standing for the unclassified reads
        df.loc[-1] = ["Unclassified"] * len(df.columns)

        data = self.taxons.copy()
        data.loc[-1] = self.unclassified
        data = data / data.sum() * 100

        assert threshold > 0 and threshold < 100
        others = data[data < threshold].sum()
        # >= so that a taxon exactly at the threshold is not lost
        data = data[data >= threshold]
        names = df.loc[data.index, 'name']

        data.index = names.values
        data.loc['others'] = others
        data = data.sort_values()

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                           radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                # large, x-small, small, None, x-large, medium, xx-small,
                # smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        return data

    def to_js(self, output="krona.html", onweb=False):
        """Run ktImportText on the krona summary to create *output*.

        The summary is created first with :meth:`kraken_to_krona` if needed.

        :param output: name of the file passed to ktImportText via ``-o``
        :param onweb: if True, call ``easydev.onweb`` on *output*
        """
        if not self._data_created:
            self.kraken_to_krona()
        execute("ktImportText %s -o %s" % (self.output_filename, output))
        if onweb is True:
            import easydev
            easydev.onweb(output)

    def boxplot_classified_vs_read_length(self):
        """Show distribution of the read length grouped by classified or not"""
        self.df[["status", "length"]].groupby('status').boxplot()
class KrakenResults(object):
    """Translate Kraken results into a Krona-compatible file

    If you run a kraken analysis with :class:`KrakenAnalysis`, you will end up
    with a file e.g. named kraken.out (by default).

    You could use kraken-translate but then you need extra parsing to convert
    into a Krona-compatible file. Here, we take the output from kraken and
    directly transform it to a krona-compatible file.

    ::

        k = KrakenResults("kraken.out")
        k.kraken_to_krona()

    The expected format looks like::

        C    HISEQ:426:C5T65ACXX:5:2301:18719:16377    1    203    1:71 A:31 1:71
        C    HISEQ:426:C5T65ACXX:5:2301:21238:16397    1    202    1:71 A:31 1:71

    Where each row corresponds to one read.

    ::

        "562:13 561:4 A:31 0:1 562:3" would indicate that:

        the first 13 k-mers mapped to taxonomy ID #562
        the next 4 k-mers mapped to taxonomy ID #561
        the next 31 k-mers contained an ambiguous nucleotide
        the next k-mer was not in the database
        the last 3 k-mers mapped to taxonomy ID #562

    See kraken documentation for details.

    .. note:: a taxon of ID 1 (root) means that the read is classified but in
        a different domain. https://github.com/DerrickWood/kraken/issues/100

    .. note:: This takes care of fetching taxons and the corresponding
        lineages from online web services.

    """

    def __init__(self, filename="kraken.out"):
        """.. rubric:: **constructor**

        Set up the taxonomy helper in ``self.tax`` and, when *filename* is
        truthy, parse the kraken output straight away.

        :param filename: the input from KrakenAnalysis class

        """
        self.filename = filename
        # Always defined, so that plot() and to_js() can read the flag even
        # when no filename was given.
        self._data_created = False

        on_rtd = os.environ.get("READTHEDOCS", None) == "True"

        if not on_rtd:
            from biokit import Taxonomy
            self.tax = Taxonomy(verbose=True)
            self.tax._load_flat_file()  # make sure it is available locally
        else:
            # When READTHEDOCS is set, a small stand-in backed by a CSV file
            # shipped with sequana replaces biokit's Taxonomy.
            class Taxonomy(object):
                from sequana import sequana_data  # must be local
                df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
                                 index_col=0)

                def get_lineage_and_rank(self, x):
                    """Return ``(value, rank)`` pairs for taxon *x*."""
                    # Note that we add the name as well here
                    ranks = ['kingdom', 'phylum', 'class', 'order',
                             'family', 'genus', 'species', 'name']
                    # .loc replaces DataFrame.ix, removed in pandas 1.0
                    return [(self.df.loc[x, rank], rank) for rank in ranks]

            self.tax = Taxonomy()

        if filename:
            # This initialises the data
            self._parse_data()

    def get_taxonomy_biokit(self, ids):
        """Retrieve taxons given a list of taxons

        :param list ids: list of taxons as strings or integers. Could also
            be a single string or a single integer
        :return: a dataframe indexed by taxon (as integers) with one column
            per main rank plus a ``name`` column. An empty dataframe is
            returned when *ids* is empty.

        .. note:: the first call first loads all taxons in memory and takes a
            few seconds but subsequent calls are much faster
        """
        # filter the lineage to keep only information from one of the main
        # rank that is superkingdom, kingdom, phylum, class, order, family,
        # genus and species
        ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                 'species')

        # Normalise the input to a list: a string is a single identifier,
        # any other iterable is a collection of identifiers, and anything
        # that cannot be iterated (int, numpy integer) is a single one.
        if isinstance(ids, str):
            ids = [ids] if ids else []
        else:
            try:
                ids = list(ids)
            except TypeError:
                ids = [ids]

        if not ids:
            return pd.DataFrame()

        logger.info('Retrieving taxon using biokit.Taxonomy')
        lineage = [self.tax.get_lineage_and_rank(x) for x in ids]

        # Now, we filter each lineage to keep only relevant ranks
        # We drop the 'no rank' and create a dictionary
        results = []
        for this in lineage:
            default = dict.fromkeys(ranks, ' ')
            for entry in this:
                if entry[1] in ranks:
                    default[entry[1]] = entry[0]
                elif entry[1] == "superkingdom":
                    default["kingdom"] = entry[0]
            # Scientific name is the last entry tagged as no_rank following
            # species TODO (check this assumption)
            # e.g. 351680 and 151529 have same 7 ranks so to differentiate
            # them, the scientific name should be used.
            # By default, we will take the last one. If species or genus, we
            # repeat the term
            try:
                default['name'] = this[-1][0]
            except (IndexError, TypeError):
                # no usable lineage entry for this taxon
                default['name'] = "root (ambigous kingdom)"
            results.append(default)

        df = pd.DataFrame.from_records(results)
        df.index = ids
        df = df[list(ranks) + ['name']]
        df.index = df.index.astype(int)

        return df

    def _parse_data(self):
        """Read the kraken output and populate the summary attributes.

        Sets ``_df`` (columns status, taxon, length), ``_taxons`` (number of
        reads per taxon, taxon 0 excluded, sorted by decreasing count),
        ``classified`` and ``unclassified``.

        An empty input file gives an empty result with both counters set to
        0 (the total number of reads cannot be recovered from an empty
        file). Errors raised by pandas while parsing are propagated.
        """
        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]

        if os.path.getsize(self.filename) == 0:
            logger.warning("Kraken output file %s is empty" % self.filename)
            self.unclassified = 0
            self.classified = 0
            self._df = pd.DataFrame([], columns=columns)
            self._taxons = pd.Series([], dtype="int64")
            return

        # we select only col 0,2,3 to save memory, which is required on very
        # large files. The file is read by chunks of 1M rows and the chunks
        # are concatenated once. Starting from a fresh list also means that
        # calling this method again does not duplicate the rows.
        reader = pd.read_csv(self.filename, sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1000000)
        self._df = pd.concat(list(reader))
        self._df.columns = columns

        count = int((self._df.taxon == 1).sum())
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        taxons = self._df.groupby("taxon").size()
        if 0 in taxons.index:
            # taxon 0 is dropped from the per-taxon counts
            taxons = taxons.drop(0)
        self._taxons = taxons.sort_values(ascending=False)

        category = self._df.groupby("status").size()
        self.classified = category.get('C', 0)
        self.unclassified = category.get('U', 0)

    def _get_taxons(self):
        """Return the per-taxon read counts, parsing the file if needed."""
        try:
            return self._taxons
        except AttributeError:
            self._parse_data()
            return self._taxons
    taxons = property(_get_taxons)

    def _get_df(self):
        """Return the parsed kraken table, parsing the file if needed."""
        try:
            return self._df
        except AttributeError:
            self._parse_data()
            return self._df
    df = property(_get_df)

    def _get_df_with_taxon(self, dbname):
        """Build a per-taxon table with counts and percentages.

        :param dbname: directory that may contain an ``annotations.csv``
            file. When present, its ``ena``, ``gi`` and ``description``
            columns are merged in and the percentage is formatted as a
            string.
        :return: a dataframe sorted by decreasing percentage, including one
            extra "Unclassified" row with taxon -1.
        """
        # line 14500
        # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome
        df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
        df['count'] = self.taxons.values
        df.reset_index(inplace=True)

        newrow = len(df)
        df.loc[newrow] = "Unclassified"
        df.loc[newrow, 'count'] = self.unclassified
        df.loc[newrow, 'index'] = -1
        df.rename(columns={"index": "taxon"}, inplace=True)
        # The string row above may have turned these columns into object
        # columns; restore integers so arithmetic and lookups are numeric.
        df["taxon"] = df["taxon"].astype(int)
        df["count"] = df["count"].astype(int)
        df["percentage"] = df["count"] / df["count"].sum() * 100

        # Now get back all annotations from the database itself.
        filename = os.path.join(dbname, "annotations.csv")
        annotated = os.path.exists(filename)
        if annotated:
            annotations = pd.read_csv(filename)
            # There are duplicates somehow. Let us keep the first one for now
            annotations = annotations.drop_duplicates(
                subset="taxon", keep="first").set_index("taxon")
            # reindex gives one row per taxon, filled with NaN when the
            # taxon has no annotation
            df2 = annotations.reindex(df["taxon"].values)
            df2 = df2[['ena', 'gi', 'description']]
            df2.index.name = "taxon"
            self.df2 = df2
            self.df1 = df.set_index("taxon")
            df = pd.merge(self.df1, df2, left_index=True, right_index=True)
            df.reset_index(inplace=True)
            starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
            middle = [x for x in df.columns
                      if x not in starter and x != "description"]
            df = df[starter + middle + ["description"]]
            df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
        else:
            starter = ['taxon', 'count', 'percentage']
            df = df[starter + [x for x in df.columns if x not in starter]]

        # Sort while the percentage is still numeric; sorting the formatted
        # strings would order them lexicographically.
        df = df.sort_values(by="percentage", ascending=False)
        if annotated:
            from easydev import precision
            df['percentage'] = [str(precision(x, 2))
                                for x in df['percentage']]
        return df

    def kraken_to_csv(self, filename, dbname):
        """Save the per-taxon table as CSV in *filename* and return it."""
        df = self._get_df_with_taxon(dbname)
        df.to_csv(filename, index=False)
        return df

    def kraken_to_json(self, filename, dbname):
        """Save the per-taxon table as JSON in *filename* and return it."""
        df = self._get_df_with_taxon(dbname)
        df.to_json(filename)
        return df

    def kraken_to_krona(self, output_filename=None, mode=None, nofile=False):
        """Write a Krona-compatible text summary of the kraken results.

        :param output_filename: where to save the summary. Defaults to the
            input filename followed by ``.summary``
        :param mode: if set to "adapters", names are looked up in the
            sequana adapters database instead of the taxonomy
        :param nofile: currently unused
        :return: status: True if everything went fine otherwise False
        """
        if output_filename is None:
            output_filename = self.filename + ".summary"

        taxon_to_find = list(self.taxons.index)
        if not taxon_to_find:
            logger.warning(
                "No reads were identified. You will need a more complete database")
            self.output_filename = output_filename
            with open(output_filename, "w") as fout:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            return False

        # Reads classified as root (1) are kept on purpose.

        if mode != "adapters":
            df = self.get_taxonomy_biokit(taxon_to_find)
            # every column but the last one (name) makes up the lineage
            self.lineage = [";".join(this)
                            for this in df[df.columns[0:-1]].values]
            self.scnames = list(df['name'].values)
        else:
            # Let us get the known adapters and their identifiers
            from sequana.adapters import AdapterDB
            adapters = AdapterDB()
            adapters.load_all()
            known = set(adapters.df.identifier)

            self.scnames = []
            for taxon in self.taxons.index:
                if str(taxon) == "1" or str(taxon) not in known:
                    self.scnames.append('unknown')
                else:
                    self.scnames.append(adapters.get_name(taxon))
            self.lineage = ["Adapters;%s" % x for x in self.scnames]

        assert len(self.lineage) == len(self.taxons)
        assert len(self.scnames) == len(self.taxons)

        # Now save the file
        self.output_filename = output_filename
        unclassified = getattr(self, "unclassified", None)
        with open(output_filename, "w") as fout:
            for taxon, lineage, scname in zip(taxon_to_find, self.lineage,
                                              self.scnames):
                count = self.taxons.loc[taxon]
                line = str(count) + "\t" + "\t".join(lineage.split(';'))
                line += " " + scname
                fout.write(line + '\n')
            if unclassified is not None:
                fout.write("%s\t%s" % (unclassified, "Unclassified"))
        self._data_created = True
        return True

    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
             textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        :param kind: either "pie" or "barh"
        :param cmap: colormap used for the pie chart
        :param threshold: percentage (strictly between 0 and 100). Taxons
            below this value are summed up into an "others" entry; taxons at
            or above it are shown individually
        :param radius: radius of the pie chart
        :param textcolor: color of the pie chart labels
        :param kargs: passed on to the pandas plot call
        :return: None if no taxon were found and a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self.df) == 0:
            return None

        if not self._data_created:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return None

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_biokit(list(self.taxons.index))
        # extra row (label -1) standing for the unclassified reads
        df.loc[-1] = ["Unclassified"] * len(df.columns)

        data = self.taxons.copy()
        data.loc[-1] = self.unclassified
        data = data / data.sum() * 100

        assert threshold > 0 and threshold < 100
        others = data[data < threshold].sum()
        # >= so that a taxon exactly at the threshold is not lost
        data = data[data >= threshold]
        names = df.loc[data.index, 'name']

        data.index = names.values
        data.loc['others'] = others
        data = data.sort_values()

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                           radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                # large, x-small, small, None, x-large, medium, xx-small,
                # smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        return data

    def to_js(self, output="krona.html", onweb=False):
        """Run ktImportText on the krona summary to create *output*.

        The summary is created first with :meth:`kraken_to_krona` if needed.

        :param output: name of the file passed to ktImportText via ``-o``
        :param onweb: if True, call ``easydev.onweb`` on *output*
        """
        if not self._data_created:
            self.kraken_to_krona()
        execute("ktImportText %s -o %s" % (self.output_filename, output))
        if onweb is True:
            import easydev
            easydev.onweb(output)

    def boxplot_classified_vs_read_length(self):
        """Show distribution of the read length grouped by classified or not"""
        self.df[["status", "length"]].groupby('status').boxplot()