Example #1
    def __init__(self, filename="kraken.out"):
        """.. rubric:: **constructor**

        :param filename: the input from KrakenAnalysis class

        """
        self.filename = filename

        on_rtd = os.environ.get("READTHEDOCS", None) == "True"

        if on_rtd is False:
            from biokit import Taxonomy
            self.tax = Taxonomy(verbose=True)
            self.tax._load_flat_file()  # make sure it is available locally
        else:

            class Taxonomy(object):
                from sequana import sequana_data  # must be local
                df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
                                 index_col=0)

                def get_lineage_and_rank(self, x):
                    # Note that we add the name as well here
                    ranks = [
                        'kingdom', 'phylum', 'class', 'order', 'family',
                        'genus', 'species', 'name'
                    ]
                    return [(self.df.loc[x][rank], rank) for rank in ranks]

            self.tax = Taxonomy()

        if filename:
            # This initialises the data
            self._parse_data()
            self._data_created = False
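
A minimal usage sketch of this constructor (hedged: it assumes sequana is installed and that a kraken.out file produced by KrakenAnalysis is present; the human taxid 9606 below is only an illustration):

from sequana import KrakenResults

# Passing a filename triggers _parse_data() right away
k = KrakenResults("kraken.out")

# k.tax is either the full biokit Taxonomy or the ReadTheDocs stub above;
# both expose get_lineage_and_rank(), returning (name, rank) pairs
for name, rank in k.tax.get_lineage_and_rank(9606):
    print(rank, name)
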
Example #2
def test_taxonomy():
    t = Taxonomy()
    t.load_records()
    lineage = t.get_lineage(9606)
    assert len(lineage) == 31
    assert 'Mammalia' in lineage
    assert t.fetch_by_id('10090')['name'] == 'Mus musculus'
    ret = t.fetch_by_name('Mus Musculus')
    assert ret[0]['id'] == '10090'

    lineage = t.get_lineage_and_rank(9606)

    tree = t.get_family_tree(9606)
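
To make explicit the interface this test relies on, here is a hedged, offline stand-in (a sketch only; FakeTaxonomy is hypothetical and belongs to neither sequana nor biokit) that satisfies the same calls:

class FakeTaxonomy:
    """Offline stand-in mirroring the interface exercised by test_taxonomy."""
    _records = {"10090": {"id": "10090", "name": "Mus musculus"}}

    def load_records(self):
        pass  # the real class loads the NCBI taxonomy records here

    def fetch_by_id(self, taxid):
        return self._records[str(taxid)]

    def fetch_by_name(self, name):
        # case-insensitive lookup returning a list of matching records
        return [r for r in self._records.values()
                if r["name"].lower() == name.lower()]

t = FakeTaxonomy()
t.load_records()
assert t.fetch_by_id("10090")["name"] == "Mus musculus"
assert t.fetch_by_name("Mus Musculus")[0]["id"] == "10090"
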
Example #3
class KrakenResults(object):
    """Translate Kraken results into a Krona-compatible file


    If you run a kraken analysis with :class:`KrakenAnalysis`, you will end up
    with a file named kraken.out by default.

    You could use kraken-translate but then you need extra parsing to convert
    into a Krona-compatible file. Here, we take the output from kraken and
    directly transform it to a krona-compatible file.

    ::

        k = KrakenResults("kraken.out")
        k.kraken_to_krona()

    The expected input format looks like::

        C    HISEQ:426:C5T65ACXX:5:2301:18719:16377    1    203    1:71 A:31 1:71
        C    HISEQ:426:C5T65ACXX:5:2301:21238:16397    1    202    1:71 A:31 1:71

    Where each row corresponds to one read.

    ::

        "562:13 561:4 A:31 0:1 562:3" would indicate that:

        the first 13 k-mers mapped to taxonomy ID #562
        the next 4 k-mers mapped to taxonomy ID #561
        the next 31 k-mers contained an ambiguous nucleotide
        the next k-mer was not in the database
        the last 3 k-mers mapped to taxonomy ID #562


    See kraken documentation for details.

    .. note:: a taxon ID of 1 (root) means that the read is classified but
        maps to different domains. https://github.com/DerrickWood/kraken/issues/100

    .. note:: This takes care of fetching taxons and the corresponding lineages
        from online web services.

    """
    def __init__(self, filename="kraken.out"):
        """.. rubric:: **constructor**

        :param filename: the input from KrakenAnalysis class

        """
        self.filename = filename

        on_rtd = os.environ.get("READTHEDOCS", None) == "True"

        if on_rtd is False:
            from biokit import Taxonomy
            self.tax = Taxonomy(verbose=True)
            self.tax._load_flat_file()  # make sure it is available locally
        else:

            class Taxonomy(object):
                from sequana import sequana_data  # must be local
                df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
                                 index_col=0)

                def get_lineage_and_rank(self, x):
                    # Note that we add the name as well here
                    ranks = [
                        'kingdom', 'phylum', 'class', 'order', 'family',
                        'genus', 'species', 'name'
                    ]
                    return [(self.df.loc[x][rank], rank) for rank in ranks]

            self.tax = Taxonomy()

        if filename:
            # This initialises the data
            self._parse_data()
            self._data_created = False

    def get_taxonomy_biokit(self, ids):
        """Retrieve taxons given a list of taxons

        :param list ids: list of taxons as strings or integers. Could also
            be a single string or a single integer
        :return: a dataframe

        .. note:: the first call loads all taxons in memory, which takes a
            few seconds; subsequent calls are much faster
        """
        # filter the lineage to keep only information from the main ranks,
        # that is superkingdom, kingdom, phylum, class, order, family, genus
        # and species
        ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                 'species')

        if isinstance(ids, int):
            ids = [ids]

        if len(ids) == 0:
            return pd.DataFrame()

        logger.info('Retrieving taxon using biokit.Taxonomy')

        if isinstance(ids, list) is False:
            ids = [ids]

        lineage = [self.tax.get_lineage_and_rank(x) for x in ids]
        # Now, we filter each lineage to keep only relevant ranks
        # We drop the 'no rank' and create a dictionary
        # Not nice but works for now
        results = []
        for i, this in enumerate(lineage):
            default = dict.fromkeys(ranks, ' ')
            for entry in this:
                if entry[1] in ranks:
                    default[entry[1]] = entry[0]
                elif entry[1] == "superkingdom":
                    default["kingdom"] = entry[0]
            # Scientific name is the last entry tagged as no_rank following
            # species. TODO: check this assumption.
            # e.g. 351680 and 151529 have the same 7 ranks, so to
            # differentiate them, the scientific name should be used.
            # By default, we take the last one. If species or genus, we
            # repeat the term
            try:
                default['name'] = this[-1][0]
            except IndexError:
                default['name'] = "root (ambiguous kingdom)"
            results.append(default)

        df = pd.DataFrame.from_records(results)
        df.index = ids
        df = df[list(ranks) + ['name']]
        df.index = df.index.astype(int)

        return df

    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only columns 0, 2, 3 to save memory, which is required on
        # very large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename,
                                 sep="\t",
                                 header=None,
                                 usecols=[0, 2, 3],
                                 chunksize=1000000)
        except pd.errors.ParserError:
            # This branch is meant for the only_classified_output case, when
            # no classified read is found. It is not implemented yet; the
            # intended fallback (with N the size of the input data set) was:
            #     self.unclassified = N
            #     self.classified = 0
            #     self._df = pd.DataFrame([], columns=columns)
            #     self._taxons = self._df.taxon
            raise NotImplementedError

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        # above, we select only columns 0, 2, 3  the column are still labelled
        # 0, 2, 3 in the df
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except KeyError:
            pass  # taxon 0 may not be there
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0

    def _get_taxons(self):
        try:
            return self._taxons
        except AttributeError:
            self._parse_data()
            return self._taxons

    taxons = property(_get_taxons)

    def _get_df(self):
        try:
            return self._df
        except AttributeError:
            self._parse_data()
            return self._df

    df = property(_get_df)

    def _get_df_with_taxon(self, dbname):

        # line 14500
        # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome

        df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
        df['count'] = self.taxons.values
        df.reset_index(inplace=True)
        newrow = len(df)
        df.loc[newrow] = "Unclassified"
        df.loc[newrow, 'count'] = self.unclassified
        df.loc[newrow, 'index'] = -1
        df.rename(columns={"index": "taxon"}, inplace=True)
        df["percentage"] = df["count"] / df["count"].sum() * 100

        # Now get back all annotations from the database itself.
        filename = dbname + os.sep + "annotations.csv"
        if os.path.exists(filename):
            annotations = pd.read_csv(filename)
            annotations.set_index("taxon", inplace=True)

            # .ix was removed from pandas; .loc is the closest replacement,
            # though it raises KeyError for taxons missing from the
            # annotations index (which .ix silently filled with NaN)
            df2 = annotations.loc[df.taxon][['ena', 'gi', 'description']]
            # There are duplicates somehow; keep the first one for now
            df2 = df2.reset_index().drop_duplicates(
                subset="taxon", keep="first").set_index("taxon")
            self.df2 = df2
            self.df1 = df.set_index("taxon")
            df = pd.merge(self.df1, df2, left_index=True, right_index=True)
            df.reset_index(inplace=True)
            starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
            df = df[starter + [
                x
                for x in df.columns if x not in starter and x != "description"
            ] + ["description"]]

            df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
            from easydev import precision
            df['percentage'] = [str(precision(x, 2)) for x in df['percentage']]
        else:
            starter = ['taxon', 'count', 'percentage']
            df = df[starter + [x for x in df.columns if x not in starter]]

        df.sort_values(by="percentage", inplace=True, ascending=False)
        return df

    def kraken_to_csv(self, filename, dbname):
        df = self._get_df_with_taxon(dbname)
        df.to_csv(filename, index=False)
        return df

    def kraken_to_json(self, filename, dbname):
        df = self._get_df_with_taxon(dbname)
        df.to_json(filename)
        return df

    def kraken_to_krona(self, output_filename=None, mode=None, nofile=False):
        """

        :return: status: True if everything went fine, False otherwise
        """
        if output_filename is None:
            output_filename = self.filename + ".summary"
        taxon_to_find = list(self.taxons.index)
        if len(taxon_to_find) == 0:
            logger.warning(
                "No reads were identified. You will need a more complete database"
            )
            self.output_filename = output_filename
            with open(output_filename, "w") as fout:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            return False

        # classified reads as root (1). This block is intentionally disabled:
        # try:
        #     logger.warning("Removing taxon 1 (%s values) " % self.taxons.ix[1])
        #     logger.info("Found %s taxons " % len(taxon_to_find))
        #     taxon_to_find.pop(taxon_to_find.index(1))
        # except:
        #     pass

        if len(taxon_to_find) == 0:
            return False

        if mode != "adapters":
            df = self.get_taxonomy_biokit(taxon_to_find)
            self.lineage = [
                ";".join(this) for this in df[df.columns[0:-1]].values
            ]
            self.scnames = list(df['name'].values)  # do we need a cast ?
        else:
            # Let us get the known adapters and their identifiers
            from sequana.adapters import AdapterDB
            adapters = AdapterDB()
            adapters.load_all()

            self.scnames = []

            for taxon in self.taxons.index:
                if str(taxon) == "1":
                    self.scnames.append('unknown')
                    continue

                if str(taxon) not in list(adapters.df.identifier):
                    self.scnames.append('unknown')
                    continue

                self.scnames.append(adapters.get_name(taxon))
            self.lineage = ["Adapters;%s" % x for x in self.scnames]

            assert len(self.lineage) == len(self.taxons)
            assert len(self.scnames) == len(self.taxons)

        # Now save the file
        self.output_filename = output_filename
        with open(output_filename, "w") as fout:
            for i, this in enumerate(self.lineage):
                taxon = taxon_to_find[i]
                count = self.taxons.loc[taxon]
                line = str(count) + "\t" + "\t".join(this.split(';'))
                line += " " + self.scnames[i]
                fout.write(line + '\n')
            try:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            except AttributeError:
                pass  # unclassified may not exist if everything was classified
        self._data_created = True
        return True

    def plot(self,
             kind="pie",
             cmap="copper",
             threshold=1,
             radius=0.9,
             textcolor="red",
             **kargs):
        """A simple non-interactive plot of taxons

        :return: None if no taxon was found, a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return

        if self._data_created is False:
            status = self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return
        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here

        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.ix[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.ix[-1] = self.unclassified

        data = data / data.sum() * 100
        assert threshold > 0 and threshold < 100
        others = data[data < threshold].sum()
        data = data[data > threshold]
        names = df.ix[data.index]['name']

        data.index = names.values
        data.ix['others'] = others
        try:
            data.sort_values(inplace=True)
        except:
            data.sort(inplace=True)

        # text may be long, so let us increase the figsize a little bit
        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind,
                           cmap=cmap,
                           autopct='%1.1f%%',
                           radius=radius,
                           **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        return data

    def to_js(self, output="krona.html", onweb=False):
        if self._data_created is False:
            status = self.kraken_to_krona()
        execute("ktImportText %s -o %s" % (self.output_filename, output))
        if onweb is True:
            import easydev
            easydev.onweb(output)

    def boxplot_classified_vs_read_length(self):
        """Show distribution of the read length grouped by classified or not"""
        self.df[["status", "length"]].groupby('status').boxplot()