Example #1
    def set_tooltips_to_column(self, tooltips_col, target_col):
        """Hide a column with tooltips and connect it with a column.

        :param str tooltips_col: column with your tooltips.
        :param str target_col: column to connect.
        """
        # hide tooltips
        try:
            self.datatable_columns[tooltips_col]['visible'] = 'false'
        except KeyError:
            logger.warning(
                "KeyError: Column name '{0}' does not exist.".format(
                    tooltips_col))
        # function to add tooltips
        fct = """function(data, type, row, meta){{
            return '<a href="#" data-toggle="tooltip" title="'+row.{0}+'">'+data+'</a>';
        }}
        """.format(tooltips_col)
        try:
            self.datatable_columns[target_col]['render'] = fct
        except KeyError:
            logger.warning(
                "KeyError: Column name '{0}' does not exist.".format(
                    target_col))
Example #2
    def set_links_to_column(self, link_col, target_col):
        """Hide a column with urls and connect it with a column.

        :param str link_col: column with your URLs.
        :param str target_col: column to connect.
        """
        # hide the link column
        try:
            self.datatable_columns[link_col]['visible'] = 'false'
        except KeyError:
            logger.warning(
                "KeyError: Column name '{0}' does not exist.".format(
                    link_col))
        # function to add link
        fct = """function(data, type, row, meta){{
            return '<a href="'+row.{0}+'" target="_blank">'+data+'</a>';
        }}
        """.format(link_col)
        try:
            self.datatable_columns[target_col]['render'] = fct
        except KeyError:
            logger.warning(
                "KeyError: Column name '{0}' does not exist.".format(
                    target_col))
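
Both snippets above follow the same pattern: mark the auxiliary column as hidden and attach a JavaScript render callback to the visible column. The minimal sketch below reproduces that pattern on its own; the datatable_columns dictionary and the standalone set_links_to_column helper are stand-ins written for illustration, not the actual sequana DataTable API.

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("datatable_sketch")

# hypothetical column configuration: one dict of options per column name
datatable_columns = {"url": {}, "name": {}}

def set_links_to_column(link_col, target_col):
    """Hide *link_col* and render *target_col* as an HTML link (sketch)."""
    try:
        datatable_columns[link_col]["visible"] = "false"
    except KeyError:
        logger.warning("KeyError: Column name '%s' does not exist.", link_col)
    fct = """function(data, type, row, meta){{
        return '<a href="'+row.{0}+'" target="_blank">'+data+'</a>';
    }}""".format(link_col)
    try:
        datatable_columns[target_col]["render"] = fct
    except KeyError:
        logger.warning("KeyError: Column name '%s' does not exist.", target_col)

set_links_to_column("url", "name")
print(datatable_columns["name"]["render"])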
Example #3
    def get_df(self):
        import pandas as pd
        data = {}
        for sample, filename in zip(self.sample_names, self.filenames):
            df = pd.read_csv(filename)
            df = df.groupby("kingdom")['percentage'].sum()
            # if a taxon is obsolete, the kingdom is empty.
            # We will set the kingdom as Unclassified and raise a warning
            # if the count is > 5%
            if " " in df.index:
                percent = df.loc[" "]
                if percent > 5:
                    logger.warning(
                        "Found {}% of taxons in obsolete category".format(
                            percent))
                if "Unclassified" in df.index:
                    df.loc['Unclassified'] += df.loc[' ']
                    df.drop(" ", inplace=True)
                else:
                    df.loc['Unclassified'] = df.loc[' ']
                    df.drop(" ", inplace=True)
            data[sample] = df

        df = pd.DataFrame(data)
        #df.to_json(output.data)
        df = df.sort_index(ascending=False)
        return df
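
The merging of the blank kingdom into an Unclassified entry can be checked on a toy Series; the percentages below are invented for illustration and the blank index mimics an obsolete taxon.

import pandas as pd

# toy per-kingdom percentages; the blank index mimics an obsolete taxon
df = pd.Series({"Bacteria": 80.0, "Unclassified": 10.0, " ": 10.0},
               name="percentage")

if " " in df.index:
    if "Unclassified" in df.index:
        df.loc["Unclassified"] += df.loc[" "]
    else:
        df.loc["Unclassified"] = df.loc[" "]
    df.drop(" ", inplace=True)

print(df)   # Bacteria 80.0, Unclassified 20.0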
Example #4
    def _parse_data(self):
        """Parse the YAML file to get the block content (comments)
        before each top-level section. See the doc in the constructor

        Removes all # so that the block of comments can be interpreted as
        a standard docstring in Sequanix
        """
        current_block = []
        current_section = "docstring"

        # if we get a line that starts with #, this is a new comment or
        # part of a block comment. Otherwise, it means the current block
        # comment has ended.

        for this in self.data:
            # Beginning of a new section at top level
            if self.regex_section.findall(this):
                name = self.regex_section.findall(this)[0]
                current_section = name.strip(":")
                self.sections[current_section] = "".join(current_block)
                current_block = []
                current_section = None
            elif this.startswith('#'):    # a comment at top level
                current_block.append(this)
            elif this.strip() == "":      # an empty line
                #this was the main comment, or an isolated comment
                current_block = []
            else:  # a non-empty line to skip
                current_block = []

        for key in self._get_expected_sections():
            if key not in self.sections.keys():
                logger.warning("section %s not dealt by the parsing function" % key)
Example #5
    def _get_specials(self, section):
        """This method extracts data from the docstring

        Lines such as ::

            field_choice__ = ["a", "b"]

        are extracted, where _choice is a special keyword to be
        found.

        """
        if section not in self.sections.keys():
            logger.warning("%s not found in the yaml " % section)
            return
        comments = self.sections[section]
        specials = {}
        for line in comments.split("\n"):
            if "#############" in line:
                pass
            elif any(this in line for this in self._specials):
                line = line[2:]
                key, value = line.split("=", 1)
                key = key.strip().rstrip("__")
                value = value.strip()
                specials[key] = list(eval(value))
        return specials
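
A condensed sketch of the same extraction is shown below. It assumes a single special keyword (_choice) and uses ast.literal_eval instead of eval as a safer way to parse the literal list; both choices are illustrative, not the library's own.

import ast

_specials = ["_choice"]          # assumed special keywords
comments = "# field_choice__ = ['a', 'b']\n# a plain comment line"

specials = {}
for line in comments.split("\n"):
    if any(special in line for special in _specials):
        key, value = line[2:].split("=", 1)      # drop the leading "# "
        key = key.strip().rstrip("_")            # field_choice__ -> field_choice
        # ast.literal_eval is a safer drop-in for eval on literal lists
        specials[key] = list(ast.literal_eval(value.strip()))

print(specials)   # {'field_choice': ['a', 'b']}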
Example #6
    def _parse_data(self):
        """Parse the YAML file to get the block content (comments)
        before each top-level section. See the doc in the constructor

        Removes all # so that the block of comments can be interpreted as
        a standard docstring in Sequanix
        """
        current_block = []
        current_section = "docstring"

        # if we get a line that starts with #, this is a new comment or
        # part of a block comment. Otherwise, it means the current block
        # comment has ended.

        for this in self.data:
            # Beginning of a new section at top level
            if self.regex_section.findall(this):
                name = self.regex_section.findall(this)[0]
                current_section = name.strip(":")
                self.sections[current_section] = "".join(current_block)
                current_block = []
                current_section = None
            elif this.startswith('#'):  # a comment at top level
                current_block.append(this)
            elif this.strip() == "":  # an empty line
                #this was the main comment, or an isolated comment
                current_block = []
            else:  # a non-empty line to skip
                current_block = []

        for key in self._get_expected_sections():
            if key not in self.sections.keys():
                logger.warning("section %s not dealt by the parsing function" %
                               key)
Example #7
    def save_significant_pathways(self,
                                  mode,
                                  cutoff=0.05,
                                  nmax=20,
                                  background=None):  #pragma: no cover
        """mode should be up, down or all"""

        if background is None:
            background = self.background

        # select the relevant pathways
        df = self._enrichr(mode, background).results
        df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
        logger.warning("Found {} pathways to save".format(len(df)))
        if len(df) == nmax:
            logger.warning("Restricted pathways to {}".format(nmax))

        logger.info("saving {} deregulated pathways".format(len(df)))

        summaries = {}
        # save them
        for ID in df['Term']:
            summary = self.save_pathway(ID,
                                        filename="{}_{}.png".format(ID, mode))
            summaries[ID] = summary
        return summaries
Example #8
def get_sequana_adapters(type_, direction):
    """Return path to a list of adapters in FASTA format

    :param type_: PCRFree, Rubicon, Nextera
    :param direction: fwd, rev, revcomp
    :return: path to the adapter filename

    """
    # search possible types
    registered = _get_registered_adapters()
    if type_ not in registered:
        logger.error("This adapter type (%s) is not valid" % type_)
        logger.error("choose one in %s types" % registered)
        raise ValueError

    directions = ["fwd", "rev", "revcomp"]
    if direction not in directions:
        logger.error("This kind of tag (%s) is not valid" % direction)
        logger.error("choose one in %s " % directions)
        raise ValueError
    try:
        this = sequana_data("adapters_%s_%s.fa" % (type_, direction))
        logger.warning("Rename {} (remove the adapters_ prefix)".format(this))
        return this
    except:
        return sequana_data("%s_%s.fa" % (type_, direction))
Example #9
    def df(self):
        # RG: ID read group ??
        # np: number of passes
        # rq ?
        # rs: list 6 numbers ?
        # za:
        # zm ID of the ZMW
        # sn: SNR how is this computed ?
        # zs
        # - sn: list of ACGT SNRs. A, C, G, T in that order
        if self._df is not None:
            return self._df

        logger.info("Scanning input file. Please wait")
        self.reset()
        N = 0

        all_results = []
        # This takes 60%  of the time...could use cython ?
        for read in self.data:
            tags = dict(read.tags) #11% of the time
            res = []
            # count reads
            N += 1
            if (N % 10000) == 0:
                logger.info("Read %d sequences" %N)

            # res[0] = read length
            res.append(read.query_length) # also stored in tags["qe"] - tags["qs"]

            # collections.Counter is slow, let us do it ourselves
            res.append( 100. / read.qlen * sum(
                [read.query_sequence.count(letter) if read.query_sequence
                    else 0 for letter in "CGcgSs"]))

            # res[1:4] contains SNR  stored in tags['sn'] in the order A, C, G, T
            try:
                snr = list(tags['sn'])
            except:
                snr = [None] * 4
            res = res + snr

            # res[6] = ZMW name, also stored in tags["zm"]
            res.append(int(tags['zm']))
            res.append(tags['np'])

            # aggregate results
            all_results.append(res)

        self._df = pd.DataFrame(all_results,
            columns=['read_length','GC_content','snr_A','snr_C','snr_G','snr_T','ZMW',
                     "nb_passes"])
        self._df.ZMW = self._df.ZMW.astype(int)

        if len(self._df.ZMW.unique()) != len(self._df):
            logger.warning("Found non unique ZMW. This may not be a CCS but "
                        "a subread file. Consider using PacbioSubreads class")

        self.reset()
        return self._df
Example #10
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only col 0,2,3 to save memory, which is required on very
        # large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename,
                                 sep="\t",
                                 header=None,
                                 usecols=[0, 2, 3],
                                 chunksize=1000000)
        except pd.parser.CParserError:
            raise NotImplementedError  # this section is for the case
            #only_classified_output when there is no found classified read
            self.unclassified = N  # size of the input data set
            self.classified = 0
            self._df = pd.DataFrame([], columns=columns)
            self._taxons = self._df.taxon
            return

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        # above, we select only columns 0, 2, 3  the column are still labelled
        # 0, 2, 3 in the df
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except:
            pass  # 0 may not be there
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
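
The chunked reading can be demonstrated on an in-memory table. Collecting the chunks in a list and calling pd.concat once avoids the cost of repeated concatenation mentioned in the comment above; the three-read sample below is invented.

import io
import pandas as pd

# in-memory stand-in for a large kraken output (status, read, taxon, length)
raw = "C\tr1\t562\t150\nU\tr2\t0\t150\nC\tr3\t562\t151\n"

reader = pd.read_csv(io.StringIO(raw), sep="\t", header=None,
                     usecols=[0, 2, 3], chunksize=2)
chunks = [chunk for chunk in reader]
df = pd.concat(chunks, ignore_index=True)
df.columns = ["status", "taxon", "length"]

taxons = df.groupby("taxon").size()          # counts per taxon id
print(taxons.drop(0, errors="ignore"))       # taxon 0 (unclassified) removed if present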
Example #11
    def _get_specials(self, section):
        """This method extracts data from the docstring

        Lines such as ::

            field_choice__ = ["a", "b"]

        are extracted, where _choice is a special keyword to be
        found.

        """
        if section not in self.sections.keys():
            logger.warning("%s not found in the yaml " % section)
            return
        comments = self.sections[section]
        specials = {}
        for line in comments.split("\n"):
            if "#############" in line:
                pass
            elif any(this in line for this in self._specials):
                line = line[2:]
                key, value = line.split("=", 1)
                key = key.strip().rstrip("__")
                value = value.strip()
                specials[key] = list(eval(value))
        return specials
Example #12
    def _get_adapter_by_index(self, index_name, prefix):
        """Return adapter corresponding to the unique index

        :param index_name: the unique index name to be found. If several
            sequences match, this is an error meaning the fasta file
            with all adapters is not correctly formatted.
        :return: a list of :class:`Adapter` instances if index_name matches an
            adapter; returns None otherwise

        ::

            from sequana import sequana_data, AdapterReader
            filename = sequana_data("adapters_Nextera_fwd.fa")
            ar = AdapterReader(filename)
            ar.get_adapter_by_identifier("N712")

        """
        # there should be only one
        adapters = []
        for this in self._data:
            if prefix + str(index_name) in this.identifier.split("|"):
                this_adapter = Adapter(identifier=this.identifier,
                                       sequence=this.sequence,
                                       comment=this.comment)
                adapters.append(this_adapter)

        if len(adapters) == 0:
            return None
        elif len(adapters) >= 2:
            logger.warning(
                "Found several adapters matching index {}. This may happen with Nextera adapters"
                .format(index_name))
        return adapters
Example #13
    def switch_header_to_gi(self, acc):
        """Kraken will only accept the GI from NCBI so we need to convert
        the ENA accession to GI numbers"""

        # Accession may have a version .1, .2 hence this try/except first
        # without the version and then with the version.
        # Note also that some accessions differ from an earlier version.
        # For instance, AF525933 is in the virus.txt list from ENA but
        # the new updated accession is AH012103, showing that the list and DB
        # may not be fully synchronised.
        # http://www.ebi.ac.uk/ena/data/search?query=AF525933
        # In such cases, the results attribute will be missing that accession,
        # which needs to be searched for specifically. We cannot know its name
        # before downloading the fasta.
        if acc in self.results.keys():
            res = self.results[acc]
        else:
            try:
                res = self.results[acc.split(".")[0]]
            except:
                logger.warning(
                    "\nUnknown accession (%s). May be an updated version. Checking..."
                    % acc)
                res = self.ena_id_to_gi_number([acc])
                self.results.update(res)
                res = res[acc]
                logger.info('Found %s using GI number' % acc)
        return ">" + res['identifier'] + " " + res['comment']
Example #14
 def exists(self, filename, exit_on_error=True, warning_only=False):
     if os.path.exists(filename) is False:
         if warning_only is False:
             logger.error("{} file does not exists".format(filename))
             if exit_on_error:
                 sys.exit(1)
         elif warning_only is True:
             logger.warning("{} file does not exists".format(filename))
Example #15
 def window_size(self, n):
     if n % 2 == 0:
         logger.warning("Window size must be an odd number.")
         self._window_size = n + 1
         logger.warning("{0} is incremented to {1}".format(
             n, self._window_size))
     else:
         self._window_size = n
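
The snippet is a property setter; a self-contained sketch of the same behaviour could look like the following (the RunningMedian class name is made up for the example).

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("window_sketch")

class RunningMedian:
    """Toy holder enforcing an odd window size, as in the snippet above."""
    def __init__(self, window_size=5):
        self.window_size = window_size

    @property
    def window_size(self):
        return self._window_size

    @window_size.setter
    def window_size(self, n):
        if n % 2 == 0:
            logger.warning("Window size must be an odd number; %s incremented to %s", n, n + 1)
            self._window_size = n + 1
        else:
            self._window_size = n

rm = RunningMedian(4000)
print(rm.window_size)   # 4001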
Example #16
    def run(self):

        # To normalise, one needs to ignore the insertions since they
        # are already included in the ACGT nucleotides
        cols = ["A", "C", "G", "T", "N", "DEL"] 
        df = self.get_bases()
        deletions = self.identify_deletions()

        # consensus without deletions
        dd = df.apply(lambda x: x.idxmax(), axis=1)

        # check that deletions are consistent with the data
        for d in deletions:
            pos = int(d.resume["position"])
            ref = d.resume["reference"]
            # compare the reference of the deletions with the consensus
            if "".join(dd.loc[pos:pos+len(ref)-1]) != ref:
                logger.warning("reference string {} not found in consensus at position {}".format(ref, pos))

        # Now, we insert the deletions removing the reference and then adding
        # the alternate
        # Be aware that some deletions may overlap
        for d in deletions:
            pos = int(d.resume["position"])
            ref = d.resume["reference"]
            alt = d.resume["alternative"]

            # the data up to the position of the reference/alternate SNP
            # indices may not start at zero so we use loc instead of iloc
            dfA = df.loc[0:pos-1]

            # the alternate data needs a dummy dataframe. The indices are
            # e.g. 0,1,2,3,4,5 and
            # we reset the indices to start at the last position of dfA and
            # to be constant. For instance a dataframe dfB of 3 rows to be
            # appended after position 1500 will have the indices 1500,1500,1500
            # This guarantees that the following dataframe dfC has indices greater
            # than those of dfB while allowing the next iteration to use the same
            # consistent indices when searching for the next deletions
            dfB = df.iloc[0:len(alt)].copy()
            dfB.index = [pos] *  len(dfB)
            dfB *= 0
            for i, nucleotide in enumerate(alt):
                dfB.iloc[i][nucleotide] = 10000 

            # the rest of the data
            dfC = df.loc[pos+len(ref):]

            # !! do not reset indices !!! so that the inserted dfB is still sorted
            # and the next accesses with iloc/loc are still correct in the next
            # iteration
            df = dfA.append(dfB).append(dfC)#.reset_index(drop = True)

        # now we can reset the indices
        df.reset_index(drop=True, inplace=True)

        dd = df.apply(lambda x: x.idxmax(), axis=1)
        return dd
Example #17
    def download_taxonomic_file(self, overwrite=False):
        """Loads entire flat file from EBI

        Do not overwrite the file by default.
        """
        import ftplib
        from sequana import sequana_config_path
        if os.path.exists(self.database) and overwrite is False:
            logger.info(
                "Found taxonomy.dat file in sequana your path {}".format(
                    sequana_config_path))
            return
        else:
            logger.info(
                "Downloading and extracting the taxonomy file from the web. Please be patient."
            )

        if self.source == "ena":
            url = 'ftp.ebi.ac.uk'
        else:
            url = 'ftp.ncbi.nlm.nih.gov'

        self.ftp = ftplib.FTP(url)
        self.ftp.login()
        if self.source == "ena":
            # for the EBI ftp only: self.ftp.cwd('databases')
            self.ftp.cwd('pub')
            self.ftp.cwd('databases')
            self.ftp.cwd('taxonomy')
            logger.warning(
                'Downloading and saving in %s. This is from ebi and may be behind the NCBI taxonomy'
                % self.database)
            self.ftp.retrbinary('RETR taxonomy.dat',
                                open(self.database, 'wb').write)
            self.ftp.close()
        else:
            self.ftp.cwd('pub')
            self.ftp.cwd('taxonomy')
            logger.warning('Downloading and saving in %s from ncbi ftp' %
                           self.database)
            import tempfile
            import shutil
            with tempfile.TemporaryDirectory() as tmpdir:
                filename = tmpdir + os.sep + "taxdump.tar.gz"
                self.ftp.retrbinary('RETR taxdump.tar.gz',
                                    open(filename, "wb").write)
                import tarfile
                tf = tarfile.open(filename)
                assert "nodes.dmp" in tf.getnames()
                assert "names.dmp" in tf.getnames()
                tf.extract("nodes.dmp", tmpdir)
                tf.extract("names.dmp", tmpdir)
                ncbi = NCBITaxonomy(tmpdir + os.sep + "names.dmp",
                                    tmpdir + os.sep + "nodes.dmp")
                ncbi.create_taxonomy_file(tmpdir + os.sep + "taxonomy.dat")
                shutil.move(tmpdir + os.sep + "taxonomy.dat", self.database)
            self.ftp.close()
Example #18
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only col 0,2,3 to save memory, which is required on very
        # large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename, sep="\t", header=None,
                               usecols=[0,2,3], chunksize=1000000)
        except pd.parser.CParserError:
            raise NotImplementedError  # this section is for the case
                #only_classified_output when there is no found classified read
            self.unclassified = N # size of the input data set
            self.classified = 0
            self._df = pd.DataFrame([], columns=columns)
            self._taxons = self._df.taxon
            return

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        # above, we select only columns 0, 2, 3  the column are still labelled
        # 0, 2, 3 in the df
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except:
            pass # 0 may not be there
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
Example #19
 def get_cond_from_sample(self, sample_name):
     try:
         candidates = [x for x in self.condition_names if sample_name.startswith(x)]
         if len(candidates) == 1:
             return candidates[0]
         else:
             raise ValueError("ambiguous sample name found in several conditions")
     except:
         logger.warning("{} not found".format(sample_name))
         return None
Example #20
 def _check_if_joint(self):
     try:
         # Needs a try/except for empty VCF files
         line = next(self)
         self.rewind()
         if len(line.samples) > 1:
             return True
     except:
         logger.warning("Your input VCF may be empty")
     return False
Example #21
def copy_config_from_sequana(module, source="config.yaml",
                             target="config.yaml"):
    # identify config name from the requested module
    user_config = module.path + os.sep + source
    if os.path.exists(user_config):
        shutil.copy(user_config, target)
        txt = "copied %s from sequana %s pipeline"
        logger.info(txt % (source, module.name))
    else:
        logger.warning(user_config + " not found")
Example #22
    def get_roi(self):
        """Keep positions with zscore outside of the thresholds range.

        :return: a dataframe from :class:`FilteredGenomeCov`

        .. note:: depends on the :attr:`thresholds` low and high values.
        """
        features = self.bed.feature_dict
        try:
            second_high = self.thresholds.high2
            second_low = self.thresholds.low2
            query = "zscore > @second_high or zscore < @second_low"

            # in the genbank, the names appears as e.g. JB12345
            # but in the fasta or BED files, it may be something like
            # gi|269939526|emb|FN433596.1|
            # so they do not match. We can try to guess it
            alternative = None

            if features:
                if self.chrom_name not in features.keys():
                    msg = """Chromosome name (%s) not found
                        in the genbank. Make sure the chromosome names in
                        the BAM/BED files are compatible with the genbank
                        content. Genbank files contain the following keys """
                    for this in features.keys():
                        msg += "\n                        - %s" % this

                    alternative = [x for x in self.chrom_name.split("|") if x]
                    alternative = alternative[-1] # assume the accession is last
                    alternative = alternative.split('.')[0] # remove version
                    if alternative in features.keys():
                        msg += "\n Guessed the chromosome name to be: %s" % alternative
                    else:
                        features = None
                    logger.warning(msg % self.chrom_name)

            if features:
                if alternative:
                    return FilteredGenomeCov(self.df.query(query), self.thresholds,
                        features[alternative])
                else:
                    return FilteredGenomeCov(self.df.query(query), self.thresholds,
                        features[self.chrom_name])
            else:
                return FilteredGenomeCov(self.df.query(query), self.thresholds)
        except KeyError:
            logger.error("Column zscore is missing in data frame.\n"
                         "You must run compute_zscore before get low coverage."
                         "\n\n", self.__doc__)
            sys.exit(1)
Example #23
 def _get_files(self, pattern):
     filenames = glob.glob(os.sep.join([self.directory, self.phix_directory, 
                                        pattern]))
     if len(filenames) == 4:
         mode = "pe"
     elif len(filenames) == 2:
         mode = "se"
     elif len(filenames) == 0:
         return
     else:
         logger.warning("PhixModule: more than 4 files "
                        "matched the pattern %s" % pattern)
         return
     return filenames, mode
Example #24
 def _get_files(self, pattern):
     filenames = glob.glob(
         os.sep.join([self.directory, self.phix_directory, pattern]))
     if len(filenames) == 4:
         mode = "pe"
     elif len(filenames) == 2:
         mode = "se"
     elif len(filenames) == 0:
         return
     else:
         logger.warning("PhixModule: more than 4 files "
                        "matched the pattern %s" % pattern)
         return
     return filenames, mode
Example #25
    def _download_minikraken(self, verbose=True):
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #26
    def _download_minikraken(self, verbose=True):
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #27
 def _get_files(self, pattern):
     # !! need to sort the files so that R1 appears before R2
     filenames = sorted(glob.glob(self.directory + os.sep + pattern))
     if len(filenames) == 2:
         mode = "pe"
     elif len(filenames) == 1:
         mode = "se"
     elif len(filenames) == 0:
         return
     else:
         logger.warning("FastQStatsModule: more than 2 files "
                        "matched the pattern %s" % pattern)
         return
     return filenames, mode
Example #28
 def _get_files(self, pattern):
     # !! need to sort the files so that R1 appears before R2
     filenames = sorted(glob.glob(self.directory + os.sep + pattern))
     if len(filenames) == 2:
         mode = "pe"
     elif len(filenames) == 1:
         mode = "se"
     elif len(filenames) == 0:
         return
     else:
         logger.warning("FastQStatsModule: more than 2 files "
                        "matched the pattern %s" % pattern)
         return
     return filenames, mode
Example #29
    def kraken_to_krona(self, output_filename=None, nofile=False):
        """

        :return: status: True if everything went fine, False otherwise
        """
        if output_filename is None:
            output_filename = self.filename + ".summary"
        taxon_to_find = list(self.taxons.index)
        if len(taxon_to_find) == 0:
            logger.warning(
                "No reads were identified. You will need a more complete database"
            )
            self.output_filename = output_filename
            with open(output_filename, "w") as fout:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            return False

        # classified reads as root  (1)
        """try:
            logger.warning("Removing taxon 1 (%s values) " % self.taxons.iloc[1])
            logger.info("Found %s taxons " % len(taxon_to_find))
            taxon_to_find.pop(taxon_to_find.index(1))
        except:
            pass
        """

        if len(taxon_to_find) == 0:
            return False

        df = self.get_taxonomy_db(taxon_to_find)
        self.lineage = [";".join(this) for this in df[df.columns[0:-1]].values]
        self.scnames = list(df['name'].values)  # do we need a cast ?

        # Now save the file
        self.output_filename = output_filename
        with open(output_filename, "w") as fout:
            for i, this in enumerate(self.lineage):
                taxon = taxon_to_find[i]
                count = self.taxons.loc[taxon]
                line = str(count) + "\t" + "\t".join(this.split(';'))
                line += " " + self.scnames[i]
                fout.write(line + '\n')
            try:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            except:
                pass  # unclassified may not exist if all reads are classified
        self._data_created = True
        return True
Example #30
    def parse_cutadapt(self):
        d = {}
        # output
        tobefound = self._get_data_tobefound()
        adapters = []

        data = self._rawdata.splitlines()
        # some metadata to extract
        for this in tobefound:
            key, pattern = this
            found = [line for line in data if line.startswith(pattern)]
            if len(found) == 0:
                logger.warning("ReportCutadapt: %s (not found)" % pattern)
            elif len(found) == 1:
                text = found[0].split(":", 1)[1].strip()
                try:
                    this, percent = text.split()
                    self.jinja[key] = this
                    self.jinja[key+'_percent'] = percent
                except:
                    self.jinja[key] = text
                    self.jinja[key+'_percent'] = "?"

        dd = {}
        positions = []
        executable = "cutadapt"
        for pos, this in enumerate(data):
            if "This is Atropos" in this:
                executable = "atropos"
            if "Command line parameters: " in this:
                cmd = this.split("Command line parameters: ")[1]
                self.jinja['command'] = executable + " " + cmd
            if this.startswith("=== ") and "Adapter" in this:
                name = this.split("=== ")[1].split(" ===")[0].strip()
                dd['name'] = name
                continue
            if this.startswith('Sequence:'):
                info = this.split("Sequence:", 1)[1].strip()
                info = info.split(";")
                dd['info'] = {
                    'Sequence': info[0].strip(),
                    'Type': info[1].split(':',1)[1].strip(),
                    'Length': info[2].split(':',1)[1].strip(),
                     'Trimmed': info[3].split(':',1)[1].strip()
                }
                adapters.append(dd.copy())
        self.data["adapters"] = adapters
Example #31
    def parse_cutadapt(self):
        d = {}
        # output
        tobefound = self._get_data_tobefound()
        adapters = []

        data = self._rawdata.splitlines()
        # some metadata to extract
        for this in tobefound:
            key, pattern = this
            found = [line for line in data if line.startswith(pattern)]
            if len(found) == 0:
                logger.warning("ReportCutadapt: %s (not found)" % pattern)
            elif len(found) == 1:
                text = found[0].split(":", 1)[1].strip()
                try:
                    this, percent = text.split()
                    self.jinja[key] = this
                    self.jinja[key + '_percent'] = percent
                except:
                    self.jinja[key] = text
                    self.jinja[key + '_percent'] = "?"

        dd = {}
        positions = []
        executable = "cutadapt"
        for pos, this in enumerate(data):
            if "This is Atropos" in this:
                executable = "atropos"
            if "Command line parameters: " in this:
                cmd = this.split("Command line parameters: ")[1]
                self.jinja['command'] = executable + " " + cmd
            if this.startswith("=== ") and "Adapter" in this:
                name = this.split("=== ")[1].split(" ===")[0].strip()
                dd['name'] = name
                continue
            if this.startswith('Sequence:'):
                info = this.split("Sequence:", 1)[1].strip()
                info = info.split(";")
                dd['info'] = {
                    'Sequence': info[0].strip(),
                    'Type': info[1].split(':', 1)[1].strip(),
                    'Length': info[2].split(':', 1)[1].strip(),
                    'Trimmed': info[3].split(':', 1)[1].strip()
                }
                adapters.append(dd.copy())
        self.data["adapters"] = adapters
Example #32
    def __init__(self, design_filename, adapters):
        """.. rubric:: Constructor

        :param str design_filename: a CSV file that is compatible
            with our :class:`sequana.expdesign.ExpDesignAdapter`
        :param adapters: the type of adapters (PCRFree, Nextera,
            Rubicon, TruSeq, SMARTer, Small)

        The files of adapters are stored in Sequana and accessible with the
        sequana_data function. So, for instance if adapters is set to Nextera,
        the following file is used to identify the adapters::

            sequana_data("adapters_Nextera_fwd.fa")

        New adapters files can be added on request. See resources/data/adapters
        for the full list. You can also use::

            from sequana.adapters import _get_registered_adapters
            _get_registered_adapters()
        """
        from sequana.expdesign import ExpDesignAdapter
        self.design = ExpDesignAdapter(design_filename)

        if self.design.df.index.name == "Sample_ID" or \
            "Sample_ID" in self.design.df.columns:
            self.design.df.set_index("Sample_ID", inplace=True)
        else:
            raise ValueError("Incorrect design file. Missing Sample_ID field")

        self.adapters = adapters

        try:
            file1 = sequana_data("adapters_%s_fwd.fa" % adapters)
            logger.warning("rename your file removing prefix adatper")
        except:
            file1 = sequana_data("%s_fwd.fa" % adapters)

        try:
            file2 = sequana_data("adapters_%s_revcomp.fa" % adapters)
            logger.warning("rename your file removing prefix adatper")
        except:
            file2 = sequana_data("%s_revcomp.fa" % adapters)

        self._adapters_fwd = AdapterReader(file1)
        self._adapters_revc = AdapterReader(file2)  # !!! revcomp
Example #33
 def _block2docstring(self, section):
     if section not in self.sections.keys():
         logger.warning("%s not found in the yaml " % section)
         return
     comments = self.sections[section]
     docstring = []
     for line in comments.split("\n"):
         if "#############" in line:
             pass
         elif sum([this in line for this in self._specials]):
             pass
         else:
             if len(line) < 2:  # an empty line (to keep)
                 docstring.append("")
             else:
                 docstring.append(line[2:])  # strip the "# "characters
     docstring = "\n".join(docstring).strip()
     return docstring
Example #34
 def _block2docstring(self, section):
     if section not in self.sections.keys():
         logger.warning("%s not found in the yaml " % section)
         return
     comments = self.sections[section]
     docstring = []
     for line in comments.split("\n"):
         if "#############" in line:
             pass
         elif sum([this in line for this in self._specials]):
             pass
         else:
             if len(line)<2: # an empty line (to keep)
                 docstring.append("")
             else:
                 docstring.append(line[2:]) # strip the "# "characters
     docstring = "\n".join(docstring).strip()
     return docstring
Example #35
    def scanner(self):
        data = {}
        # shlex removes all white lines and split by return carriage
        # strip is also applied
        rawdata = shlex.split(open(self.filename, "r"))
        for line in rawdata:
            # sometimes, IEM will store the ;;; at the end
            # so we can get [HEADER];;;;;;;;;;;
            if line.startswith('[') and "]" in line:
                line = line.strip(";").strip(",").strip()
                currentkey = line.replace("[", "").replace("]", "")
                data[currentkey] = []
            else:
                data[currentkey].append(line)

        for key in data.keys():
            data[key] = "\n".join(data[key])

        for this in ["Header", "Reads", "Settings", "Data"]:
            if this not in data.keys():
                logger.warning("%s not found in the DesignExpMiSeq file" %
                               this)

        self.data = data
        self.df = pd.read_csv(io.StringIO(data["Data"]))

        ncols = [8, 9, 10, 12]
        if self.df.shape[1] not in ncols:
            self.df = pd.read_csv(io.StringIO(data["Data"]), ";")
            if self.df.shape[1] not in ncols:
                logger.warning(
                    "Data section must have 8, 9, 10 or 12 columns. Check the samplesheet"
                )

        # Fixes https://github.com/sequana/sequana/issues/507
        self.df["Sample_ID"] = self.df["Sample_ID"].astype(str)

        self.df.rename(columns={
            "I7_Index_ID": "Index1_ID",
            "index": "Index1_Seq",
            "I5_Index_ID": "Index2_ID",
            "index2": "Index2_Seq"
        },
                       inplace=True)
Example #36
    def splitter_mapped_unmapped(self, filename, prefix):
        # helpful resources:
        # https://broadinstitute.github.io/picard/explain-flags.html
        logger.info("Creating 2 files (mapped and unmapped reads)")
        data = SAM(filename)

        results = {"flags": [], "mapped": 0, "unmapped": 0, "bad": 0}
        logger.info("Please wait while creating output files")

        with open("{}/{}.unmapped.fastq".format(self.outdir, prefix),
                  "w") as fnosirv:
            with open("{}/{}.mapped.fastq".format(self.outdir, prefix),
                      "w") as fsirv:
                for a in data:
                    if a.flag & 2048:  # suppl
                        # a bad read, we can just drop it
                        results['bad'] += 1
                    elif a.flag & 1024:  # PCR duplicate
                        results['bad'] += 1
                    elif a.flag & 256:  # secondary alignment
                        results["bad"] += 1
                    elif a.flag & 16:  # mapped
                        read = "@{}\n{}\n+\n{}\n".format(
                            a.qname, a.query_sequence, a.qual)
                        assert len(a.query_sequence) == len(a.qual)
                        fsirv.write(read)
                        results["mapped"] += 1
                    elif a.flag & 4:  # unmapped
                        read = "@{}\n{}\n+\n{}\n".format(
                            a.qname, a.query_sequence, a.qual)
                        assert len(a.query_sequence) == len(a.qual)
                        fnosirv.write(read)
                        results["unmapped"] += 1
                    elif a.flag == 0:  # mapped
                        read = "@{}\n{}\n+\n{}\n".format(
                            a.qname, a.query_sequence, a.qual)
                        assert len(a.query_sequence) == len(a.qual)
                        fsirv.write(read)
                        results["mapped"] += 1
                    else:
                        logger.warning("{} flag not handled".format(a.flag))
                    results["flags"].append(a.flag)
        return results
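
The branching relies on the standard SAM flag bits. The pure-Python sketch below names those bits and classifies a few flags in the same spirit; unlike the code above, it tests the unmapped bit (4) before the reverse-strand bit (16), which is the more robust order.

# SAM flag bits used in the splitter above (values from the SAM specification)
FLAG_UNMAPPED = 4
FLAG_REVERSE = 16
FLAG_SECONDARY = 256
FLAG_DUPLICATE = 1024
FLAG_SUPPLEMENTARY = 2048

def classify(flag):
    """Simplified classification of a single integer flag."""
    if flag & (FLAG_SUPPLEMENTARY | FLAG_DUPLICATE | FLAG_SECONDARY):
        return "bad"
    if flag & FLAG_UNMAPPED:
        return "unmapped"
    # flag 0 or reverse-strand reads go to the mapped file
    return "mapped"

for flag in (0, 4, 16, 256, 1024, 2048, 2064):
    print(flag, classify(flag))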
Example #37
    def _scanner(self):

        current_section = None
        data = collections.defaultdict(list)
        with open(self.filename, "r") as fin:
            for line in fin.readlines():
                line = self._line_cleaner(line)
                if len(line) == 0:
                    continue
                if line.startswith("[") and line.endswith("]"):
                    name = line.lstrip("[").rstrip("]")
                    current_section = name
                else:
                    data[current_section] += [line]

        if "Header" not in data.keys():
            logger.warning("Input file must contain [Header]")

        if "Data" not in data.keys():
            logger.warning("Input file must contain [Data]")
        self.data = data
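
A stripped-down version of this scanner, run on a made-up samplesheet, shows the resulting section dictionary and the [Header]/[Data] check.

import collections
import io

# made-up samplesheet-like content with [Section] headers
content = """[Header]
IEMFileVersion,4
[Data]
Sample_ID,index
S1,ACGT
"""

data = collections.defaultdict(list)
current_section = None
for line in io.StringIO(content):
    line = line.strip().strip(";").strip(",")
    if not line:
        continue
    if line.startswith("[") and line.endswith("]"):
        current_section = line.lstrip("[").rstrip("]")
    else:
        data[current_section].append(line)

for required in ("Header", "Data"):
    if required not in data:
        print("Input file must contain [%s]" % required)

print(dict(data))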
Example #38
    def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
        if colors is None:
            colors = [self.colors[k] for k in self.labels]
            if len(colors) != len(Xr):
                colors = ["r"] * len(Xr[:,0])
        else:
            for k in self.labels:
                if k not in colors.keys():
                    logger.warning("No key color for this sample: {}. Set to red".format(k))
                    colors[k] = "r"
            colors = [colors[k] for k in self.labels]

        pylab.scatter(Xr[:,pc1], Xr[:,pc2], c=colors)
        ax = pylab.gca()
        X1, X2 = pylab.xlim()
        dX = X2 - X1
        pylab.xlim([X1 + X1*0.05, X2 + X2*0.05])

        Y1, Y2 = pylab.ylim()
        dY = Y2 - Y1
        pylab.ylim([Y1 + Y1*0.05, Y2 + Y2*0.05])

        count = 0
        if show_labels:
            for x,y in zip(Xr[:,pc1], Xr[:,pc2]):
                x += dX / 40
                y += dY / 40
                ax.annotate(self.labels[count], (x,y))
                count += 1
                if count > 100: 
                    break
        if pca:
            pylab.xlabel("PC{} ({}%)".format(pc1+1,
                round(pca.explained_variance_ratio_[pc1]*100, 2)))
            pylab.ylabel("PC{} ({}%)".format(pc2+1,
                round(pca.explained_variance_ratio_[pc2]*100, 2)))
        pylab.grid(True)
Example #39
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"]
        filenames = [
            "database.idx",
            "database.kdb",
            "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)
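
The download-only-if-needed logic can be written with the standard library alone; the md5 and download_if_needed helpers below are illustrative replacements for sequana's md5 and wget utilities, not the same functions.

import hashlib
import os
import urllib.request

def md5(filename, chunk_size=65536):
    """Return the md5 hex digest of a file, reading it in chunks."""
    digest = hashlib.md5()
    with open(filename, "rb") as fin:
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def download_if_needed(url, filename, md5sum):
    """Download *url* to *filename* unless a file with the expected md5 is already there."""
    if os.path.exists(filename) and md5(filename) == md5sum:
        print("%s already present" % filename)
        return
    urllib.request.urlretrieve(url, filename)

# example call using one entry from the snippet above:
# download_if_needed("https://github.com/sequana/data/raw/master/kraken_toydb/database.idx",
#                    "database.idx", "28661f8baf0514105b0c6957bec0fc6e")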
Example #40
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"
        ]
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)
Example #41
    def set_links_to_column(self, link_col, target_col):
        """Hide a column with urls and connect it with a column.

        :param str link_col: column with your URLs.
        :param str target_col: column to connect.
        """
        # hide the link column
        try:
            self.datatable_columns[link_col]['visible'] = 'false'
        except KeyError:
            logger.warning("KeyError: Column name '{0}' does not exist."
                           .format(link_col))
        # function to add link
        fct = """function(data, type, row, meta){{
            return '<a href="'+row.{0}+'" target="_blank">'+data+'</a>';
        }}
        """.format(link_col)
        try:
            self.datatable_columns[target_col]['render'] = fct
        except KeyError:
            logger.warning("KeyError: Column name '{0}' does not exist."
                           .format(target_col))
Example #42
    def set_tooltips_to_column(self, tooltips_col, target_col):
        """Hide a column with tooltips and connect it with a column.

        :param str tooltips_col: column with your tooltips.
        :param str target_col: column to connect.
        """
        # hide tooltips
        try:
            self.datatable_columns[tooltips_col]['visible'] = 'false'
        except KeyError:
            logger.warning("KeyError: Column name '{0}' does not exist."
                           .format(tooltips_col))
        # function to add tooltips
        fct = """function(data, type, row, meta){{
            return '<a href="#" data-toggle="tooltip" title="'+row.{0}+'">'+data+'</a>';
        }}
        """.format(tooltips_col)
        try:
            self.datatable_columns[target_col]['render'] = fct
        except KeyError:
            logger.warning("KeyError: Column name '{0}' does not exist."
                           .format(target_col))
Example #43
    def kraken_to_krona(self, output_filename=None, mode=None, nofile=False):
        """

        :return: status: True if everything went fine, False otherwise
        """
        if output_filename is None:
            output_filename = self.filename + ".summary"
        taxon_to_find = list(self.taxons.index)
        if len(taxon_to_find) == 0:
            logger.warning("No reads were identified. You will need a more complete database")
            self.output_filename = output_filename
            with open(output_filename, "w") as fout:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            return False

        # classified reads as root  (1)
        """try:
            logger.warning("Removing taxon 1 (%s values) " % self.taxons.ix[1])
            logger.info("Found %s taxons " % len(taxon_to_find))
            taxon_to_find.pop(taxon_to_find.index(1))
        except:
            pass
        """

        if len(taxon_to_find) == 0:
            return False

        if mode != "adapters":
            df = self.get_taxonomy_biokit(taxon_to_find)
            self.lineage = [";".join(this) for this in df[df.columns[0:-1]].values]
            self.scnames = list(df['name'].values)  # do we need a cast ?
        else:
            # Let us get the known adapters and their identifiers
            from sequana.adapters import AdapterDB
            adapters = AdapterDB()
            adapters.load_all()

            self.scnames = []

            for taxon in self.taxons.index:
                if str(taxon) in [1, "1"]:
                    self.scnames.append('unknown')
                    continue

                if str(taxon) not in list(adapters.df.identifier):
                    self.scnames.append('unknown')
                    continue

                self.scnames.append(adapters.get_name(taxon))
            self.lineage = ["Adapters;%s"% x for x in self.scnames]

            assert len(self.lineage) == len(self.taxons)
            assert len(self.scnames) == len(self.taxons)

        # Now save the file
        self.output_filename = output_filename
        with open(output_filename, "w") as fout:
            for i, this in enumerate(self.lineage):
                taxon = taxon_to_find[i]
                count = self.taxons.loc[taxon]
                line = str(count)+"\t"+"\t".join(this.split(';'))
                line += " " +self.scnames[i]
                fout.write(line+'\n')
            try:
                fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
            except:
                pass  # unclassified may not exist if all reads are classified
        self._data_created = True
        return True
Example #44
 def __len__(self):
     if self._N is None:
         logger.warning("Scanning the BAM. Please wait")
         self._N = sum(1 for _ in self._data)
         self.reset()
     return self._N
Example #45
def run_analysis(chrom, options, feature_dict):


    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                        " not optimised for such depth. You may want to "
                        " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning("median window length is too long. \n"
            "    Setting the window length automatically to a fifth of\n"
            "    the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)


    # Print some info related to the fitted mixture models
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
              (round(mu,3), round(sigma,3), round(pi,3)))
    except:
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()

    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. " 
            "Plots in the HTML reports are skipped"))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                options={"W": options.w_median,
                         "k": options.k,
                         "ROIs": ROIs,
                         "circular": options.circular},
                command=" ".join(["sequana_coverage"] + sys.argv[1:]))
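
run_analysis leaves its results under <output_directory>/coverage_reports/<chrom_name>/: a rois.csv table and a sequana_summary_coverage.json file. A rough sketch of inspecting them afterwards is shown below; the paths are hypothetical and nothing beyond the metrics logged above is guaranteed about the exact layout.

import json
import pandas as pd

# Hypothetical path mirroring the layout built in run_analysis above
directory = "my_output/coverage_reports/chr1"

rois = pd.read_csv(directory + "/rois.csv")
print(rois.head())

with open(directory + "/sequana_summary_coverage.json") as fin:
    summary = json.load(fin)
# 'evenness', 'C3' and 'C4' are the metrics logged above; the exact JSON
# structure may vary between sequana versions.
print(summary.get("data", summary))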
Ejemplo n.º 46
0
def main(args=None):

    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exist" % options.genbank

    logger.info("Reading %s. This may take time depending on "
        "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else: 
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)


    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome-1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"])
            logger.info("    {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small, it fits within a single chunk anyway.
        for i, chrom in enumerate(chromosomes):
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                  % (i + 1, len(gc), gc.chrom_names[i]))
            # chr_list is aligned with the selected chromosome names, so the
            # i-th element matches the i-th chromosome/contig being analysed
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = 'multiqc . -m sequana_coverage -f -c {}'.format(pathtocfg)
        import subprocess
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()
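
The heart of main() can also be driven from a script. The sketch below mirrors the GenomeCov construction and the chrom.run() call made above, but every value (file name, thresholds, window size, import path) is an assumption for illustration, not a documented default.

from sequana import GenomeCov   # import path assumed

# Hypothetical BED file produced with: bedtools genomecov -d -ibam sample.bam
gc = GenomeCov("sample.bed", None,          # bedfile, no genbank annotation
               -4, 4,                       # low/high thresholds
               0.5, 0.5,                    # double thresholds
               chunksize=5000000,
               chromosome_list=[])          # empty list = all chromosomes

for chrom in gc.chr_list:
    results = chrom.run(20001, 2, circular=False)   # w_median, k
    rois = results.get_rois()
    print(chrom.chrom_name, "ROIs found:", len(rois.df))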
Ejemplo n.º 47
0
    def __init__(self, filename_fastq, fof_databases, threads=1,
                 output_directory="./kraken_hierarchical/", 
                 keep_temp_files=False, force=False):
        """.. rubric:: **constructor**

        :param filename_fastq: FastQ file to analyse
        :param fof_databases: file that contains a list of database paths
            (one per line). The order is important. Note that you may also
            provide a list of database paths.
        :param threads: number of threads to be used by Kraken
        :param output_directory: name of the output directory
        :param keep_temp_files: bool, if True, will keep intermediate files
            from each Kraken analysis, and save html report at each step
        :param bool force: if the output directory already exists, the
            instantiation fails so that the existing data is not overwritten.
            If you wish to overwrite the existing directory, set this
            parameter to True.
        """
        # When running kraken in paired mode and saving the unclassified reads
        # in a file, the output file (fastq) contains both R1 and R2, which
        # are concatenated in the same file. More precisely, R1 and R2 are
        # concatenated as R1 N R2 (with the letter N as a link).
        # So, in the hierarchical search (paired case), the first iteration has
        # 2 input files, but subsequent iterations have only one file as
        # input, that is the output of the previous run (provided by the
        # --unclassified-out option)
        self.filename_fastq = filename_fastq

        # input databases may be stored in a file
        if isinstance(fof_databases, str) and os.path.exists(fof_databases):
            with open(fof_databases, 'r') as fof:
                self.databases = [line.rstrip('\n') for line in fof.readlines()]
        # or simply provided as a list
        elif isinstance(fof_databases, list):
            self.databases = fof_databases[:]
        else:
            raise TypeError("input databases must be a list of valid kraken "
                            "databases or a file (see documentation)")
        self.threads = threads
        self.output_directory = output_directory
        self.keep_temp_files = keep_temp_files

        # check if the output directory already exist
        try:
            os.mkdir(output_directory)
        except OSError:
            if os.path.isdir(output_directory) and force is False:
                logger.error('Output directory %s already exists' % output_directory)
                raise Exception('Output directory %s already exists; set '
                                'force=True to overwrite it' % output_directory)
            elif force is True:
                logger.warning("Output directory %s already exists. You may "
                    "overwrite existing results" % output_directory)

        # list of input fastq files
        if isinstance(filename_fastq, list) and len(filename_fastq) in [1, 2]:
            self.inputs = filename_fastq[:]
        elif isinstance(filename_fastq, str):
            self.inputs = [filename_fastq]
        else:
            msg = "input file must be a string or list of 2 filenames"
            msg += "\nYou provided {}".format(filename_fastq)
            raise TypeError(msg)
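
A possible instantiation of the class this constructor belongs to is sketched below. The class name is not visible in this snippet; 'KrakenHierarchical', the import path and all file/database paths are assumptions based on the default output directory and the docstring above.

from sequana.kraken import KrakenHierarchical   # assumed class name and import path

databases = [
    "/path/to/kraken_db_viruses",     # hypothetical database paths,
    "/path/to/kraken_db_bacteria",    # searched in this order
]

analysis = KrakenHierarchical(
    ["sample_R1.fastq.gz", "sample_R2.fastq.gz"],   # paired-end input
    databases,
    threads=4,
    output_directory="./kraken_hierarchical/",
    keep_temp_files=False,
    force=True,        # allow reuse of an existing output directory
)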