def _dimension_match(kerasmodel, data, layertype):
    """Check if layer dimensions match.

    The function checks whether the kerasmodel is compatible
    with the supplied inputs.

    Parameters
    ----------
    kerasmodel : :class:`keras.Model`
        Object of type keras.Model.
    data : Dataset or list(Dataset)
        Dataset to check compatibility for.
    layertype : str
        Layer type, either 'input_layers' or 'output_layers'.

    Returns
    -------
    boolean :
        Returns True if the keras model is compatible with the data
        and otherwise False.
    """
    if data is None and layertype == 'output_layers':
        return True

    tmpdata = _to_list(data)

    if len(kerasmodel.get_config()[layertype]) != len(tmpdata):
        return False

    # Check if output dims match between model spec and data
    for datum in tmpdata:
        if datum.name not in [el[0] for el in
                              kerasmodel.get_config()[layertype]]:
            # If the layer name is not present we end up here
            return False
        layer = kerasmodel.get_layer(datum.name)

        oshape = layer.output_shape
        if isinstance(oshape, list):
            # this case is required for keras 2.4.3 and tf 2
            # which returns a list of tuples
            oshape = oshape[0]

        if not oshape[1:] == datum.shape[1:]:
            # if the layer name is present but the dimensions
            # are incorrect, we end up here.
            return False
    return True
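
# Hedged usage sketch (illustrative only, not part of the library): shows how
# _dimension_match compares dataset names and shapes against the model config.
# `_ToyData` is a hypothetical stand-in for a janggu Dataset, which exposes
# `.name` and `.shape` in the same way.
def _example_dimension_match():
    from keras.layers import Input, Flatten, Dense
    from keras.models import Model

    class _ToyData:  # hypothetical stand-in for a janggu Dataset
        def __init__(self, name, shape):
            self.name = name
            self.shape = shape

    inp = Input((200, 4), name='dna')
    out = Dense(1, name='peaks')(Flatten()(inp))
    toy = Model(inp, out)

    # matching layer name and trailing dimensions -> True
    assert _dimension_match(toy, _ToyData('dna', (None, 200, 4)), 'input_layers')
    # mismatching length dimension -> False
    assert not _dimension_match(toy, _ToyData('dna', (None, 100, 4)), 'input_layers')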
def __init__(self, inputs, outputs, name=None):

    # Wrap the supplied keras inputs/outputs in a keras Model.
    self.kerasmodel = Model(inputs, outputs, name='janggu')

    if not name:
        # Derive a model-id from the md5 hash of the model architecture
        # if no name was supplied.
        hasher = hashlib.md5()
        hasher.update(self.kerasmodel.to_json().encode('utf-8'))
        name = hasher.hexdigest()
        print("Generated model-id: '{}'".format(name))

    if hasattr(outputs, '__len__') and len(outputs) > 1:
        total_output = K.sum([K.sum(o, axis=-1) for o in self.kerasmodel.output],
                             axis=-1)
    else:
        total_output = K.sum(self.kerasmodel.output, axis=-1)

    # Gradient of the summed output w.r.t. the inputs, used for
    # input attribution (integrated gradients).
    grad = K.gradients(total_output, self.kerasmodel.input)
    kinp = self.kerasmodel.input

    kinp = _to_list(kinp)

    self._influence = K.function(kinp, grad)

    self.name = name

    self.outputdir = _get_output_root_directory()

    if not os.path.exists(self.outputdir):  # pragma: no cover
        # this is excluded from unit tests, because the testing
        # framework always provides a directory
        os.makedirs(self.outputdir)

    if not os.path.exists(os.path.join(self.outputdir, 'logs')):
        os.makedirs(os.path.join(self.outputdir, 'logs'))

    logfile = os.path.join(self.outputdir, 'logs', 'janggu.log')

    self.logger = logging.getLogger(self.name)

    logging.basicConfig(filename=logfile,
                        level=logging.DEBUG,
                        format='%(asctime)s:%(name)s:%(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    self.logger.info("Model Summary:")
    self.kerasmodel.summary(print_fn=self.logger.info)
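
# Hedged usage sketch (assuming the surrounding class is janggu's `Janggu`
# model wrapper): wrapping a small functional keras graph. Layer shapes and
# names are illustrative only.
def _example_janggu_constructor():
    from keras.layers import Input, Flatten, Dense

    xin = Input((200, 4), name='dna')
    out = Dense(1, activation='sigmoid', name='peaks')(Flatten()(xin))
    # if `name` is omitted, an md5 hash of the model architecture is used
    model = Janggu(xin, out, name='toy_model')
    return model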
def evaluate(self, inputs=None, outputs=None,  # pylint: disable=too-many-locals
             batch_size=None,
             sample_weight=None,
             steps=None,
             datatags=None,
             callbacks=None,
             use_multiprocessing=False,
             workers=1):
    """Evaluates the performance.

    This method is used to evaluate a given model.
    All of the parameters are directly delegated to
    the evaluate_generator of the keras model.
    See https://keras.io/models/model/#methods.

    Parameters
    ----------
    inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
        Input Dataset or Sequence to use for evaluating the model.
    outputs : :code:`Dataset`, list(Dataset) or None
        Output Dataset containing the training targets. If a Sequence
        is used for inputs, outputs will have no effect.
    batch_size : int or None
        Batch size. If set to None a batch size of 32 is used.
    sample_weight : np.array or None
        Sample weights. See https://keras.io.
    steps : int, None.
        Number of predict steps. If None, this value is determined from
        the dataset size and the batch_size.
    datatags : list(str) or None
        Tags to annotate the evaluation results. Default: None.
    callbacks : List(:code:`Scorer` or str)
        Scorer instances to be applied on the predictions. Furthermore,
        commonly used scoring metrics can be added by name, including
        'roc', 'auroc', 'prc', 'auprc' for evaluating binary classification
        applications and 'cor' (for Pearson's correlation), 'mae', 'mse'
        and 'var_explained' for regression applications.
    use_multiprocessing : boolean
        Whether to use multiprocessing for the prediction. Default: False.
    workers : int
        Number of workers to use. Default: 1.

    Examples
    --------

    .. code-block:: python

      model.evaluate(DATA, LABELS)

      # binary classification evaluation with callbacks
      model.evaluate(DATA, LABELS, callbacks=['auprc', 'auroc'])

    """
    self.logger.info('Evaluate: %s', self.name)
    if isinstance(inputs, Sequence):
        inputs_ = _convert_data(self.kerasmodel, inputs.inputs, 'input_layers')
        outputs_ = _convert_data(self.kerasmodel, inputs.outputs, 'output_layers')
        self.logger.info('Using custom Sequence.')
        self.logger.info("Input:")
        self.__dim_logging(inputs_)
        self.logger.info("Output:")
        self.__dim_logging(outputs_)
    else:
        inputs_ = _convert_data(self.kerasmodel, inputs, 'input_layers')
        outputs_ = _convert_data(self.kerasmodel, outputs, 'output_layers')
        self.logger.info("Input:")
        self.__dim_logging(inputs_)
        self.logger.info("Output:")
        self.__dim_logging(outputs_)
    self.timer = time.time()

    if not batch_size:
        batch_size = 32

    if isinstance(inputs, Sequence):
        jseq = inputs
    else:
        jseq = JangguSequence(batch_size, inputs_, outputs_, sample_weight)

    try:
        values = self.kerasmodel.evaluate_generator(
            jseq,
            steps=steps,
            use_multiprocessing=use_multiprocessing,
            workers=workers)
    except Exception:  # pragma: no cover
        self.logger.exception('evaluate_generator failed:')
        raise

    self.logger.info('#' * 40)
    values = _to_list(values)

    for i, value in enumerate(values):
        self.logger.info('%s: %f', self.kerasmodel.metrics_names[i], value)
    self.logger.info('#' * 40)

    self.logger.info("Evaluation finished in %1.3f s",
                     time.time() - self.timer)

    preds = self.kerasmodel.predict_generator(
        jseq, steps=steps,
        use_multiprocessing=use_multiprocessing,
        workers=workers)
    preds = _convert_data(self.kerasmodel, preds, 'output_layers')

    for callback in callbacks or []:
        callback = get_scorer(callback)
        callback.score(self, preds, outputs=outputs_, datatags=datatags)
    return values
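
# Hedged usage sketch: evaluation with named scorers and datatags, using only
# the parameters documented above. `DATA` and `LABELS` stand in for janggu
# Datasets matching the model's input and output layers.
def _example_evaluate(model, DATA, LABELS):
    # delegates to evaluate_generator and additionally scores the predictions
    # with area under the ROC and precision-recall curves
    return model.evaluate(DATA, LABELS,
                          batch_size=64,
                          datatags=['test_set'],
                          callbacks=['auroc', 'auprc'])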
def input_attribution(model, inputs,  # pylint: disable=too-many-locals
                      chrom=None, start=None, end=None):

    """Evaluates the integrated gradients method on the input coverage tracks.

    This allows feature importance values to be attributed to the prediction
    scores. Integrated gradients have been introduced in
    Sundararajan, Taly and Yan, Axiomatic Attribution for Deep Networks.
    PMLR 70, 2017.

    The method can be called by specifying the region of interest directly
    via chrom, start and end. Alternatively, it is possible to specify the
    region index, e.g. the n-th region of the dataset.

    Parameters
    ----------
    model : Janggu
        Janggu model wrapper
    inputs : :code:`Dataset`, list(Dataset)
        Input Dataset.
    chrom : str or None
        Chromosome name.
    start : int or None
        Region start.
    end : int or None
        Region end.

    Examples
    --------

    .. code-block:: python

      # Suppose DATA is a Bioseq or Cover object
      # To query the input feature importance of a specific genomic region
      # use
      input_attribution(model, DATA, chrom='chr1', start=start, end=end)

    """
    output_chrom, output_start, output_end = chrom, start, end

    inputs = _to_list(inputs)

    # store original gindexer
    gindexers_save = [ip.gindexer for ip in inputs]

    # create new indexers ranging only over the selected region
    # if chrom, start, end was supplied retrieve the respective indices
    index_list = [gi.idx_by_region(include=output_chrom,
                                   start=output_start,
                                   end=output_end) for gi in gindexers_save]

    # first construct the union of indices
    index_set = set()
    for idx_list_el in index_list:
        index_set = index_set | set(idx_list_el)

    # only keep the indices that remain across all inputs;
    # indices that are only present in some of the inputs are discarded.
    for idx_list_el in index_list:
        index_set = index_set & set(idx_list_el)

    idxs = list(index_set)
    idxs.sort()

    subgindexers = [copy.copy(gi) for gi in gindexers_save]
    for subgi in subgindexers:
        subgi.chrs = [subgi.chrs[i] for i in idxs]
        subgi.starts = [subgi.starts[i] for i in idxs]
        subgi.ends = [subgi.ends[i] for i in idxs]
        subgi.strand = [subgi.strand[i] for i in idxs]

    # assign it to the input datasets temporarily
    for inp, _ in enumerate(inputs):
        inputs[inp].gindexer = subgindexers[inp]

    try:
        # allocate arrays
        output = [np.zeros((1, output_end - output_start,
                            inp.shape[-2], inp.shape[-1])) for inp in inputs]
        resols = [inp.garray.resolution for inp in inputs]

        for igi in range(len(inputs[0])):

            # current influence
            influence = [np.zeros((1,) + inp.shape[1:]) for inp in inputs]

            # get influence for current window with integrated gradients:
            # a 50-step Riemann approximation of the path integral from a
            # zero baseline to the observed input x, i.e.
            # attribution ~= x * mean_k gradient(F)(k*x/50)
            x_in = [inp[igi] for inp in inputs]
            for step in range(1, 51):
                grad = model._influence([x * step / 50 for x in x_in])
                for iinp, inp in enumerate(x_in):
                    for idim, _ in np.ndenumerate(inp):
                        influence[iinp][idim] += (x_in[iinp][idim] / 50) * grad[iinp][idim]

            # scale length to nucleotide resolution
            influence = [np.repeat(influence[i], resols[i],
                                   axis=1) for i, _ in enumerate(inputs)]

            for iout in range(len(output)):
                if influence[iout].shape[1] < inputs[iout].gindexer[igi].length:
                    order = inputs[iout].gindexer[igi].length - influence[iout].shape[1]
                else:
                    order = 0
                # incrementally add the influence results into the output
                # array for all subwindows in the genomic indexer
                if output_start < inputs[iout].gindexer[igi].start:
                    ostart = inputs[iout].gindexer[igi].start - output_start
                    lstart = 0
                else:
                    ostart = 0
                    lstart = output_start - inputs[iout].gindexer[igi].start

                if output_end > inputs[iout].gindexer[igi].end:
                    oend = inputs[iout].gindexer[igi].end - output_start
                    lend = inputs[iout].gindexer[igi].end - inputs[iout].gindexer[igi].start
                else:
                    oend = output_end - output_start
                    lend = output_end - inputs[iout].gindexer[igi].start

                # for mutually overlapping positions, we employ a heuristic
                # that keeps the maximum influence over the overlapping intervals
                # spanning the position
                m = np.zeros((2,) + (1, inputs[iout].gindexer[igi].length, )
                             + influence[iout].shape[2:],
                             dtype=influence[iout].dtype)

                m[0][:, lstart:lend, :, :] = output[iout][:, (ostart):(oend), :, :]
                m[1][:, lstart:(lend - order), :, :] = \
                    influence[iout][:, lstart:(lend - order), :, :]
                m = np.abs(m).max(axis=0)
                m = m[:, lstart:lend, :, :]
                output[iout][:, ostart:oend, :, :] = m

        for iout in range(len(output)):
            # finally wrap the output up as coverage track
            output[iout] = Cover.create_from_array('attr_' + inputs[iout].name,
                                                   output[iout],
                                                   GenomicIndexer.create_from_region(
                                                       chrom, start, end, '.',
                                                       binsize=end - start,
                                                       stepsize=1, flank=0),
                                                   conditions=inputs[iout].conditions)

        for inp, _ in enumerate(inputs):
            # restore the initial genomic indexers
            inputs[inp].gindexer = gindexers_save[inp]

    except Exception:  # pragma: no cover
        model.logger.exception('_influence failed:')
        raise

    return output
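
# Hedged usage sketch: attributing feature importance over a genomic window.
# `model` is assumed to be a trained Janggu wrapper and `DNA` a Bioseq input
# with an associated gindexer; the coordinates are illustrative only.
def _example_input_attribution(model, DNA):
    attr = input_attribution(model, DNA, chrom='chr1', start=15000, end=15200)
    # the result is one Cover object per input dataset, named 'attr_<input name>'
    return attr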
def create_from_seq(cls, name,  # pylint: disable=too-many-locals
                    fastafile,
                    storage='ndarray',
                    seqtype='dna',
                    order=1,
                    fixedlen=None,
                    datatags=None,
                    cache=False,
                    overwrite=False,
                    verbose=False):
    """Create a Bioseq class from biological sequences.

    This constructor loads a set of nucleotide or amino acid sequences.
    By default, the sequences are assumed to be of equal length.
    Alternatively, sequences can be truncated and padded to a fixed length.

    Parameters
    ----------
    name : str
        Name of the dataset
    fastafile : str or list(str) or list(Bio.SeqRecord)
        Fasta file or list of fasta files from which the sequences
        are loaded or a list of Bio.SeqRecord.SeqRecord.
    seqtype : str
        Indicates whether a nucleotide or peptide sequence is loaded
        using 'dna' or 'protein' respectively. Default: 'dna'.
    order : int
        Order for the one-hot representation. Default: 1.
    fixedlen : int or None
        Forces the sequences to be of equal length by truncation or
        zero-padding. If set to None, it will be assumed that the sequences
        are already of equal length. An exception is raised if this is
        not the case. Default: None.
    storage : str
        Storage mode for storing the sequence may be 'ndarray' or 'hdf5'.
        Default: 'ndarray'.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    overwrite : boolean
        Overwrite the cachefiles. Default: False.
    verbose : boolean
        Verbosity. Default: False
    """
    if storage not in ['ndarray', 'hdf5']:
        raise ValueError(
            'Available storage options for Bioseq are: ndarray or hdf5')

    seqs = []
    fastafile = _to_list(fastafile)

    if not isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
        for fasta in _check_valid_files(fastafile):
            # += is necessary since sequences_from_fasta
            # returns a list
            seqs += sequences_from_fasta(fasta, seqtype)
    else:
        # This is already a list of SeqRecords
        seqs = fastafile

    if fixedlen is not None:
        seqs = sequence_padding(seqs, fixedlen)

    # Check if sequences are equally long
    lens = [len(seq) for seq in seqs]
    assert lens == [len(seqs[0])] * len(seqs), "Input sequences must " + \
        "be of equal length."

    # Chromosome names are required to be unique
    chroms = [seq.id for seq in seqs]
    assert len(set(chroms)) == len(seqs), "Sequence IDs must be unique."

    # now mimic a dataframe representing a bed file
    reglen = lens[0]
    flank = 0
    stepsize = 1

    gindexer = GenomicIndexer(reglen, stepsize, flank, zero_padding=False)
    for chrom in chroms:
        gindexer.add_interval(chrom, 0, reglen, '.')

    garray = cls._make_genomic_array(name, gindexer, seqs, order, storage,
                                     cache=cache, datatags=datatags,
                                     overwrite=overwrite,
                                     store_whole_genome=False,
                                     verbose=verbose)

    return cls(name, garray, gindexer,
               alphabet=seqs[0].seq.alphabet.letters)
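
# Hedged usage sketch: loading one-hot encoded sequences from a fasta file.
# The file name is hypothetical and the call via `Bioseq` assumes that this
# constructor belongs to janggu's Bioseq class; the parameters mirror the
# signature documented above.
def _example_create_from_seq():
    dna = Bioseq.create_from_seq('dna',
                                 fastafile='sequences.fa',  # hypothetical file
                                 seqtype='dna',
                                 order=1,
                                 storage='ndarray')
    # one region per fasta record, one-hot encoded along the last axis
    return dna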
def plotGenomeTrack(tracks, chrom, start, end, figsize=(10, 5), plottypes=None):

    """plotGenomeTrack shows plots of a specific interval from cover object data.

    It takes one or more cover objects as well as a genomic interval consisting
    of chromosome name, start and end and creates
    a genome browser-like plot.

    Parameters
    ----------
    tracks : janggu.data.Cover, list(Cover), janggu.data.Track or list(Track)
        One or more track objects.
    chrom : str
        chromosome name.
    start : int
        The start of the required interval.
    end : int
        The end of the required interval.
    figsize : tuple(int, int)
        Figure size passed on to matplotlib.
    plottypes : None or list(str)
        Plot type indicates whether to plot coverage tracks as line plots,
        heatmap, or seqplot using 'line', 'heatmap' or 'seqplot', respectively.
        By default, all coverage objects are depicted as line plots if
        plottypes=None. Otherwise, a list of types must be supplied containing
        the plot types for each coverage object explicitly. For example,
        ['line', 'heatmap', 'seqplot']. While 'line' and 'heatmap'
        can be used for any type of coverage data, 'seqplot' is reserved
        for plotting sequence influence on the output. It is
        intended to be used in conjunction with the 'input_attribution' method
        which determines the importance of particular sequence letters for the
        output prediction.

    Returns
    -------
    matplotlib Figure
        A matplotlib figure illustrating the genome browser-view of the coverage
        objects for the given interval.
        To depict and save the figure the native matplotlib
        functions show() and savefig() can be used.
    """

    tracks = _to_list(tracks)

    for track in tracks:
        if not isinstance(track, Track):
            warnings.warn('Convert the Dataset object to proper Track objects.'
                          ' In the future, only Track objects will be supported.',
                          FutureWarning)
            if plottypes is None:
                plottypes = ['line'] * len(tracks)

            assert len(plottypes) == len(tracks), \
                "The number of cover objects must be the same as the number of plottypes."
            break

    def _convert_to_track(cover, plottype):
        if plottype == 'heatmap':
            track = HeatTrack(cover)
        elif plottype == 'seqplot':
            track = SeqTrack(cover)
        else:
            track = LineTrack(cover)
        return track

    tracks_ = []
    for itrack, track in enumerate(tracks):
        if isinstance(track, Track):
            tracks_.append(track)
        else:
            warnings.warn('Convert the Dataset object to proper Track objects.'
                          ' In the future, only Track objects will be supported.',
                          FutureWarning)
            tracks_.append(_convert_to_track(track, plottypes[itrack]))

    tracks = tracks_
    headertrack = 2
    trackheights = 0
    for track in tracks:
        trackheights += track.height
    spacer = len(tracks) - 1

    grid = plt.GridSpec(headertrack + trackheights + spacer,
                        10, wspace=0.4, hspace=0.3)
    fig = plt.figure(figsize=figsize)

    # title and reference track
    title = fig.add_subplot(grid[0, 1:])

    title.set_title(chrom)
    plt.xlim([0, end - start])
    title.spines['right'].set_visible(False)
    title.spines['top'].set_visible(False)
    title.spines['left'].set_visible(False)
    plt.xticks([0, end - start], [start, end])
    plt.yticks(())

    y_offset = 1
    for track in tracks:
        y_offset += 1

        track.add_side_bar(fig, grid, y_offset)
        track.plot(fig, grid, y_offset, chrom, start, end)
        y_offset += track.height

    return fig
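
# Hedged usage sketch: visualizing a prediction track together with an
# attribution track for a genomic window. `pred_cover` and `attr` stand in
# for Cover objects (e.g. model predictions wrapped as a Cover, and the
# output of input_attribution); names and coordinates are illustrative only.
def _example_plot_genome_track(pred_cover, attr):
    fig = plotGenomeTrack([pred_cover, attr[0]],
                          chrom='chr1', start=15000, end=15200,
                          plottypes=['line', 'seqplot'])
    fig.savefig('genometrack.png')
    return fig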