Example 1
def _dimension_match(kerasmodel, data, layertype):
    """Check if layer dimensions match.
    The function checks whether the keras model is compatible with
    the supplied inputs.

    Parameters
    ----------
    kerasmodel : :class:`keras.Model`
        Object of type keras.Model.
    data : Dataset or list(Dataset)
        Dataset to check compatibility for.
    layertype : str
        Layer type; either 'input_layers' or 'output_layers'.

    Returns
    -------
    boolean :
        Returns True if the keras model is compatible with the data
        and False otherwise.
    """
    if data is None and layertype == 'output_layers':
        return True

    tmpdata = _to_list(data)

    if len(kerasmodel.get_config()[layertype]) != len(tmpdata):
        return False
    # Check if output dims match between model spec and data
    for datum in tmpdata:

        if datum.name not in [
                el[0] for el in kerasmodel.get_config()[layertype]
        ]:
            # If the layer name is not present we end up here
            return False
        layer = kerasmodel.get_layer(datum.name)
        oshape = layer.output_shape
        if isinstance(oshape, list):
            # this case is required for keras 2.4.3 and tf 2
            # which returns a list of tuples
            oshape = oshape[0]
        if not oshape[1:] == datum.shape[1:]:
            # if the layer name is present but the dimensions
            # are incorrect, we end up here.
            return False
    return True
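
A minimal usage sketch for the helper above, assuming a plain Keras model and a
stand-in object that mimics a janggu Dataset (only the name and shape attributes
are needed; the FakeData class is hypothetical):

from keras.layers import Dense, Input
from keras.models import Model


class FakeData:  # hypothetical stand-in for a janggu Dataset
    name = 'dna'
    shape = (100, 4)


inp = Input(shape=(4,), name='dna')
out = Dense(1, name='peaks')(inp)
model = Model(inp, out)

# True only if the layer name exists and the trailing dimensions agree
print(_dimension_match(model, FakeData(), 'input_layers'))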
Example 2
    def __init__(self, inputs, outputs, name=None):

        self.kerasmodel = Model(inputs, outputs, name='janggu')

        if not name:

            hasher = hashlib.md5()
            hasher.update(self.kerasmodel.to_json().encode('utf-8'))
            name = hasher.hexdigest()
            print("Generated model-id: '{}'".format(name))

        if hasattr(outputs, '__len__') and len(outputs) > 1:
            # multiple output layers: sum over all of them
            total_output = K.sum(
                [K.sum(o, axis=-1) for o in self.kerasmodel.output], axis=-1)
        else:
            total_output = K.sum(self.kerasmodel.output, axis=-1)

        grad = K.gradients(total_output, self.kerasmodel.input)
        kinp = self.kerasmodel.input

        kinp = _to_list(kinp)

        self._influence = K.function(kinp, grad)

        self.name = name

        self.outputdir = _get_output_root_directory()

        if not os.path.exists(self.outputdir):  # pragma: no cover
            # this is excluded from unit tests, because the testing
            # framework always provides a directory
            os.makedirs(self.outputdir)

        if not os.path.exists(os.path.join(self.outputdir, 'logs')):
            os.makedirs(os.path.join(self.outputdir, 'logs'))

        logfile = os.path.join(self.outputdir, 'logs', 'janggu.log')

        self.logger = logging.getLogger(self.name)

        logging.basicConfig(filename=logfile,
                            level=logging.DEBUG,
                            format='%(asctime)s:%(name)s:%(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S')
        self.logger.info("Model Summary:")
        self.kerasmodel.summary(print_fn=self.logger.info)
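
The automatically generated model-id above is just an md5 digest of the serialized
model architecture. A stand-alone sketch of the same idea (the JSON string is a
placeholder for what model.to_json() would return):

import hashlib

json_spec = '{"class_name": "Model", "config": {"layers": []}}'  # placeholder spec
hasher = hashlib.md5()
hasher.update(json_spec.encode('utf-8'))
print("Generated model-id: '{}'".format(hasher.hexdigest()))  # deterministic id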
Example 3
    def evaluate(self, inputs=None, outputs=None,  # pylint: disable=too-many-locals
                 batch_size=None,
                 sample_weight=None,
                 steps=None,
                 datatags=None,
                 callbacks=None,
                 use_multiprocessing=False,
                 workers=1):
        """Evaluates the performance.

        This method is used to evaluate a given model.
        All of the parameters are directly delegated to the
        evaluate_generator of the keras model.
        See https://keras.io/models/model/#methods.


        Parameters
        ----------
        inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
            Input Dataset or Sequence to use for evaluating the model.
        outputs :  :code:`Dataset`, list(Dataset) or None
            Output Dataset containing the training targets. If a Sequence
            is used for inputs, outputs will have no effect.
        batch_size : int or None
            Batch size. If set to None a batch size of 32 is used.
        sample_weight : np.array or None
            Sample weights. See https://keras.io.
        steps : int or None
            Number of predict steps. If None, this value is determined from
            the dataset size and the batch_size.
        datatags : list(str) or None
            Tags to annotate the evaluation results. Default: None.
        callbacks : List(:code:`Scorer` or str)
            Scorer instances to be applied on the predictions. Furthermore,
            commonly used scoring metrics can be added by name, including
            'roc', 'auroc', 'prc', 'auprc' for evaluating binary classification
            applications and 'cor' (for Pearson's correlation), 'mae', 'mse'
            and 'var_explained' for regression applications.
        use_multiprocessing : boolean
            Whether to use multiprocessing for the prediction. Default: False.
        workers : int
            Number of workers to use. Default: 1.


        Examples
        --------

        .. code-block:: python

          model.evaluate(DATA, LABELS)

          # binary classification evaluation with callbacks
          model.evaluate(DATA, LABELS, callbacks=['auprc', 'auroc'])

        """

        self.logger.info('Evaluate: %s', self.name)
        if isinstance(inputs, Sequence):
            inputs_ = _convert_data(self.kerasmodel, inputs.inputs, 'input_layers')
            outputs_ = _convert_data(self.kerasmodel, inputs.outputs, 'output_layers')
            self.logger.info('Using custom Sequence.')
            self.logger.info("Input:")
            self.__dim_logging(inputs_)
            self.logger.info("Output:")
            self.__dim_logging(outputs_)
        else:
            inputs_ = _convert_data(self.kerasmodel, inputs, 'input_layers')
            outputs_ = _convert_data(self.kerasmodel, outputs, 'output_layers')
            self.logger.info("Input:")
            self.__dim_logging(inputs_)
            self.logger.info("Output:")
            self.__dim_logging(outputs_)
        self.timer = time.time()

        if not batch_size:
            batch_size = 32

        if isinstance(inputs, Sequence):
            jseq = inputs
        else:
            jseq = JangguSequence(batch_size, inputs_, outputs_, sample_weight)

        try:
            values = self.kerasmodel.evaluate_generator(
                jseq,
                steps=steps,
                use_multiprocessing=use_multiprocessing,
                workers=workers)
        except Exception:  # pragma: no cover
            self.logger.exception('evaluate_generator failed:')
            raise

        self.logger.info('#' * 40)
        values = _to_list(values)

        for i, value in enumerate(values):
            self.logger.info('%s: %f', self.kerasmodel.metrics_names[i], value)
        self.logger.info('#' * 40)

        self.logger.info("Evaluation finished in %1.3f s",
                         time.time() - self.timer)

        preds = self.kerasmodel.predict_generator(jseq, steps=steps,
                                                  use_multiprocessing=use_multiprocessing,
                                                  workers=workers)
        preds = _convert_data(self.kerasmodel, preds, 'output_layers')

        for callback in callbacks or []:
            callback = get_scorer(callback)
            callback.score(self, preds, outputs=outputs_, datatags=datatags)
        return values
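
The named callbacks ('auroc', 'auprc', 'cor', ...) presumably wrap standard
scikit-learn style metrics applied to the collected predictions. A stand-alone
illustration of that kind of scoring, not the library's Scorer implementation:

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

labels = np.array([0, 1, 1, 0, 1])           # toy binary targets
preds = np.array([0.1, 0.8, 0.6, 0.3, 0.9])  # toy predicted probabilities

print('auroc', roc_auc_score(labels, preds))
print('auprc', average_precision_score(labels, preds))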
Example 4
def input_attribution(model, inputs,  # pylint: disable=too-many-locals
                      chrom=None, start=None, end=None):

    """Evaluates the integrated gradients method on the input coverage tracks.

    This allows feature importance values to be attributed to the prediction scores.
    Integrated gradients have been introduced in
    Sundararajan, Taly and Yan, Axiomatic Attribution for Deep Networks.
    PMLR 70, 2017.

    The method can be called by specifying the region of interest directly
    via chrom, start and end. Alternatively, it is possible to specify the
    region by index, e.g. the n-th region of the dataset.

    Parameters
    ----------
    model : Janggu
        Janggu model wrapper
    inputs : :code:`Dataset`, list(Dataset)
        Input Dataset.
    chrom : str or None
        Chromosome name.
    start : int or None
        Region start.
    end : int or None
        Region end.

    Examples
    --------

    .. code-block:: python

      # Suppose DATA is a Bioseq or Cover object
      # To query the input feature importance of a specific genomic region
      # use
      input_attribution(model, DATA, chrom='chr1', start=start, end=end)

    """

    output_chrom, output_start, output_end = chrom, start, end

    inputs = _to_list(inputs)

    # store original gindexer
    gindexers_save = [ip.gindexer for ip in inputs]

    # create new indexers ranging only over the selected region
    # if chrom, start, end was supplied retrieve the respective indices
    index_list = [gi.idx_by_region(include=output_chrom,
                                   start=output_start,
                                   end=output_end) for gi in gindexers_save]

    # first construct the union of indices
    index_set = set()
    for idx_list_el in index_list:
        index_set = index_set | set(idx_list_el)

    # only keep the indices that are present across all inputs;
    # indices that are only present in some of the inputs are discarded.
    for idx_list_el in index_list:
        index_set = index_set & set(idx_list_el)

    idxs = list(index_set)
    idxs.sort()

    subgindexers = [copy.copy(gi) for gi in gindexers_save]
    for subgi in subgindexers:
        subgi.chrs = [subgi.chrs[i] for i in idxs]
        subgi.starts = [subgi.starts[i] for i in idxs]
        subgi.ends = [subgi.ends[i] for i in idxs]
        subgi.strand = [subgi.strand[i] for i in idxs]

    # assign it to the input datasets temporarily
    for inp, _ in enumerate(inputs):
        inputs[inp].gindexer = subgindexers[inp]

    try:
        # allocate output arrays
        output = [np.zeros((1, output_end-output_start,
                            inp.shape[-2], inp.shape[-1])) for inp in inputs]
        resols = [inp.garray.resolution for inp in inputs]

        for igi in range(len(inputs[0])):

            # current influence
            influence = [np.zeros((1,) + inp.shape[1:]) for inp in inputs]

            # get influence for current window with integrated gradient
            x_in = [inp[igi] for inp in inputs]
            for step in range(1, 51):
                grad = model._influence([x*step/50 for x in x_in])
                for iinp, inp in enumerate(x_in):
                    for idim, _ in np.ndenumerate(inp):
                        influence[iinp][idim] += (x_in[iinp][idim]/50)*grad[iinp][idim]

            # scale length to nucleotide resolution
            influence = [np.repeat(influence[i], resols[i],
                                   axis=1) for i, _ in enumerate(inputs)]

            for iout in range(len(output)):
                if influence[iout].shape[1] < inputs[iout].gindexer[igi].length:
                    order = inputs[iout].gindexer[igi].length - influence[iout].shape[1]
                else:
                    order = 0
                # incrementally add the influence results into the output
                # array for all subwindows in the genomic indexer

                if output_start < inputs[iout].gindexer[igi].start:
                    ostart = inputs[iout].gindexer[igi].start - output_start
                    lstart = 0
                else:
                    ostart = 0
                    lstart = output_start - inputs[iout].gindexer[igi].start

                if output_end > inputs[iout].gindexer[igi].end:
                    oend = inputs[iout].gindexer[igi].end - output_start
                    lend = inputs[iout].gindexer[igi].end - inputs[iout].gindexer[igi].start
                else:
                    oend = output_end - output_start
                    lend = output_end - inputs[iout].gindexer[igi].start

                # for mutually overlapping positions, we employ a heuristic
                # that keeps the maximum influence over the overlapping intervals
                # spanning the position
                m = np.zeros((2,) + (1, inputs[iout].gindexer[igi].length, ) \
                             + influence[iout].shape[2:], dtype=influence[iout].dtype)

                m[0][:, lstart:lend, :, :] = output[iout][:, (ostart):(oend), :, :]
                m[1][:, lstart:(lend - order), :, :] = \
                    influence[iout][:, lstart:(lend - order), :, :]
                m = np.abs(m).max(axis=0)
                m = m[:, lstart:lend, :, :]
                output[iout][:, ostart:oend, :, :] = m

        for iout in range(len(output)):
            # finally wrap the output up as coverage track
            output[iout] = Cover.create_from_array('attr_'+inputs[iout].name,
                                                   output[iout],
                                                   GenomicIndexer.create_from_region(
                                                       chrom, start, end, '.',
                                                       binsize=end-start,
                                                       stepsize=1, flank=0),
                                                   conditions=inputs[iout].conditions)

        for inp, _ in enumerate(inputs):
            # restore the initial genomic indexers
            inputs[inp].gindexer = gindexers_save[inp]

    except Exception:  # pragma: no cover
        model.logger.exception('_influence failed:')
        raise

    return output
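
The inner loop above is a 50-step Riemann-sum approximation of integrated gradients
with a zero baseline, i.e. influence_i ≈ x_i * (1/50) * sum_k dF(k*x/50)/dx_i.
A framework-free sketch of the same accumulation on a toy differentiable score:

import numpy as np


def toy_score_gradient(x):
    # analytic gradient of the toy score F(x) = sum(x ** 2)
    return 2 * x


x_in = np.array([0.5, -1.0, 2.0])
steps = 50
influence = np.zeros_like(x_in)
for step in range(1, steps + 1):
    grad = toy_score_gradient(x_in * step / steps)  # gradient along the path from 0 to x
    influence += (x_in / steps) * grad              # same accumulation as in the loop above
print(influence)  # approximately x * integral of the path gradient, here ~ x ** 2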
Example 5
    def create_from_seq(
            cls,
            name,  # pylint: disable=too-many-locals
            fastafile,
            storage='ndarray',
            seqtype='dna',
            order=1,
            fixedlen=None,
            datatags=None,
            cache=False,
            overwrite=False,
            verbose=False):
        """Create a Bioseq class from a biological sequences.

        This constructor loads a set of nucleotide or amino acid sequences.
        By default, the sequence are assumed to be of equal length.
        Alternatively, sequences can be truncated and padded to a fixed length.


        Parameters
        ----------
        name : str
            Name of the dataset
        fastafile : str or list(str) or list(Bio.SeqRecord)
            Fasta file or list of fasta files from which the sequences
            are loaded or a list of Bio.SeqRecord.SeqRecord.
        seqtype : str
            Indicates whether a nucleotide or peptide sequence is loaded
            using 'dna' or 'protein' respectively. Default: 'dna'.
        order : int
            Order for the one-hot representation. Default: 1.
        fixedlen : int or None
            Forces the sequences to be of equal length by truncation or
            zero-padding. If set to None, it will be assumed that the sequences
            are already of equal length. An exception is raised if this is
            not the case. Default: None.
        storage : str
            Storage mode for storing the sequence may be 'ndarray' or 'hdf5'.
            Default: 'ndarray'.
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        overwrite : boolean
            Overwrite the cachefiles. Default: False.
        verbose : boolean
            Verbosity. Default: False
        """
        if storage not in ['ndarray', 'hdf5']:
            raise ValueError(
                'Available storage options for Bioseq are: ndarray or hdf5')

        seqs = []
        fastafile = _to_list(fastafile)

        if not isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
            for fasta in _check_valid_files(fastafile):
                # += is necessary since sequences_from_fasta
                # returns a list
                seqs += sequences_from_fasta(fasta, seqtype)
        else:
            # This is already a list of SeqRecords
            seqs = fastafile

        if fixedlen is not None:
            seqs = sequence_padding(seqs, fixedlen)

        # Check if sequences are equally long
        lens = [len(seq) for seq in seqs]
        assert lens == [len(seqs[0])] * len(seqs), "Input sequences must " + \
            "be of equal length."

        # Chromnames are required to be Unique
        chroms = [seq.id for seq in seqs]
        assert len(set(chroms)) == len(seqs), "Sequence IDs must be unique."
        # now mimic a dataframe representing a bed file

        reglen = lens[0]
        flank = 0
        stepsize = 1

        gindexer = GenomicIndexer(reglen, stepsize, flank, zero_padding=False)
        for chrom in chroms:
            gindexer.add_interval(chrom, 0, reglen, '.')

        garray = cls._make_genomic_array(name,
                                         gindexer,
                                         seqs,
                                         order,
                                         storage,
                                         cache=cache,
                                         datatags=datatags,
                                         overwrite=overwrite,
                                         store_whole_genome=False,
                                         verbose=verbose)

        return cls(name,
                   garray,
                   gindexer,
                   alphabet=seqs[0].seq.alphabet.letters)
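
A hedged usage sketch that builds a Bioseq dataset directly from SeqRecord objects,
as the docstring above allows (assumes janggu together with a janggu-compatible
Biopython version; the exact output shape is not asserted here):

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from janggu.data import Bioseq

# equal-length sequences with unique ids, as required above
records = [SeqRecord(Seq('ACGTACGT'), id='seq1'),
           SeqRecord(Seq('ACGTAAGT'), id='seq2')]

dna = Bioseq.create_from_seq('toy_dna', fastafile=records, order=1)
print(dna.shape)  # 4D one-hot representation of the two sequences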
Example 6
def plotGenomeTrack(tracks,
                    chrom,
                    start,
                    end,
                    figsize=(10, 5),
                    plottypes=None):
    """plotGenomeTrack shows plots of a specific interval from cover objects data.

    It takes one or more cover objects as well as a genomic interval consisting
    of chromosome name, start and end and creates
    a genome browser-like plot.

    Parameters
    ----------
    tracks : janggu.data.Cover, list(Cover), janggu.data.Track or list(Track)
        One or more track objects.
    chrom : str
        chromosome name.
    start : int
        The start of the required interval.
    end : int
        The end of the required interval.
    figsize : tuple(int, int)
        Figure size passed on to matplotlib.
    plottypes : None or list(str)
        Plot types indicate whether to plot coverage tracks as line plots,
        heatmaps, or seqplots using 'line', 'heatmap' or 'seqplot', respectively.
        By default, all coverage objects are depicted as line plots if plottypes=None.
        Otherwise, a list of types must be supplied containing the plot type for each
        coverage object explicitly, for example ['line', 'heatmap', 'seqplot'].
        While 'line' and 'heatmap' can be used for any type of coverage data,
        'seqplot' is reserved for plotting sequence influence on the output. It is
        intended to be used in conjunction with the 'input_attribution' method, which
        determines the importance of particular sequence letters for the output prediction.

    Returns
    -------
    matplotlib Figure
        A matplotlib figure illustrating the genome browser-view of the coverage
        objects for the given interval.
        To depict and save the figure the native matplotlib functions show()
        and savefig() can be used.
    """

    tracks = _to_list(tracks)

    for track in tracks:
        if not isinstance(track, Track):
            warnings.warn(
                'Convert the Dataset object to proper Track objects.'
                ' In the future, only Track objects will be supported.',
                FutureWarning)
            if plottypes is None:
                plottypes = ['line'] * len(tracks)

            assert len(plottypes) == len(tracks), \
                "The number of cover objects must be the same as the number of plottyes."
            break

    def _convert_to_track(cover, plottype):
        if plottype == 'heatmap':
            track = HeatTrack(cover)
        elif plottype == 'seqplot':
            track = SeqTrack(cover)
        else:
            track = LineTrack(cover)
        return track

    tracks_ = []
    for itrack, track in enumerate(tracks):
        if isinstance(track, Track):
            tracks_.append(track)
        else:
            warnings.warn(
                'Convert the Dataset object to proper Track objects.'
                ' In the future, only Track objects will be supported.',
                FutureWarning)
            tracks_.append(_convert_to_track(track, plottypes[itrack]))

    tracks = tracks_
    headertrack = 2
    trackheights = 0
    for track in tracks:
        trackheights += track.height
    spacer = len(tracks) - 1

    grid = plt.GridSpec(headertrack + trackheights + spacer,
                        10,
                        wspace=0.4,
                        hspace=0.3)
    fig = plt.figure(figsize=figsize)

    # title and reference track
    title = fig.add_subplot(grid[0, 1:])

    title.set_title(chrom)
    plt.xlim([0, end - start])
    title.spines['right'].set_visible(False)
    title.spines['top'].set_visible(False)
    title.spines['left'].set_visible(False)
    plt.xticks([0, end - start], [start, end])
    plt.yticks(())

    y_offset = 1
    for track in tracks:
        y_offset += 1

        track.add_side_bar(fig, grid, y_offset)
        track.plot(fig, grid, y_offset, chrom, start, end)
        y_offset += track.height

    return fig
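
A hedged usage sketch that wraps Cover objects in the explicit Track types used
above before plotting (COVERAGE and ATTRIBUTION stand for existing Cover objects;
the import path of the Track classes is an assumption):

from janggu.data import HeatTrack, LineTrack  # import path assumed

fig = plotGenomeTrack([LineTrack(COVERAGE), HeatTrack(ATTRIBUTION)],
                      chrom='chr1', start=10000, end=11000,
                      figsize=(10, 4))
fig.savefig('chr1_10000_11000.png')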