Beispiel #1
0
    def score(self, input_batch):
        """
        Score every mutated variant of each sample against the prediction
        for the unmutated sample.

        Args:
          input_batch: Input batch that should be scored.

        Returns:
          A list with one entry per sample of the batch. Each entry is the
          nested-list form (`ndarray.tolist()`) of an object array that has
          the shape of that sample's selected model input. A position
          reported by `_mutate_sample_batched` holds a list with one result
          per element of `self.scores`, computed on the model output after
          `output_sel_fn` (if set) has been applied. Positions that were
          never reported stay `None`.
        """

        def select_output(prediction):
            # Apply the output selection function if defined
            if self.output_sel_fn is None:
                return prediction
            return self.output_sel_fn(prediction)

        reference_preds = self.model.predict_on_batch(input_batch)
        batch_len = len(get_model_input(input_batch, self.model_input))

        all_scores = []
        for idx in range(batch_len):
            # the full set of model inputs belonging to this sample
            sample_inputs = get_dataset_item(input_batch, idx)

            # reference output for this sample, restricted to the selection
            ref_pred = select_output(get_dataset_item(reference_preds, idx))

            # the input array that is going to be mutated
            target_input = get_model_input(sample_inputs,
                                           input_id=self.model_input)

            # One object slot per input position. A visited slot receives a
            # list (ordered like `self.scores`) of whatever the scoring
            # functions return for the model output.
            sample_scores = np.empty(target_input.shape, dtype=object)
            sample_scores[:] = None

            for variants, variant_positions in self._mutate_sample_batched(
                    target_input):
                n_variants = len(variants)
                # replicate the sample, then swap in the mutated inputs
                replicated = numpy_collate([sample_inputs] * n_variants)
                mutated_batch = set_model_input(replicated,
                                                numpy_collate(variants),
                                                input_id=self.model_input)
                variant_preds = self.model.predict_on_batch(mutated_batch)
                for j in range(n_variants):
                    alt_pred = select_output(
                        get_dataset_item(variant_preds, j))
                    # Apply scores across all model outputs for ref and alt
                    sample_scores[variant_positions[j]] = [
                        apply_within(ref_pred, alt_pred, scorer)
                        for scorer in self.scores
                    ]
            all_scores.append(sample_scores.tolist())

        return all_scores
Beispiel #2
0
 def from_seqlet_imps(cls, seqlet_imps):
     """Combine several seqlet objects into a single `cls` instance.

     Args:
       seqlet_imps: non-empty sequence of objects exposing `seq`,
         `contrib`, `hyp_contrib`, `profile`, `name` and `attrs`.

     Returns:
       `cls(...)` where `seq` is the `np.stack` of all `seq` attributes,
       `contrib` / `hyp_contrib` / `profile` are the `numpy_collate` of the
       respective attributes, and `name` / `attrs` are taken from the
       first element only.
     """
     from kipoi.data_utils import numpy_collate

     # name and attrs are copied from the first element
     first = seqlet_imps[0]

     stacked_seq = np.stack([imp.seq for imp in seqlet_imps])
     merged_contrib = numpy_collate([imp.contrib for imp in seqlet_imps])
     merged_hyp_contrib = numpy_collate(
         [imp.hyp_contrib for imp in seqlet_imps])
     merged_profile = numpy_collate([imp.profile for imp in seqlet_imps])

     return cls(
         seq=stacked_seq,
         contrib=merged_contrib,
         hyp_contrib=merged_hyp_contrib,
         profile=merged_profile,
         name=first.name,
         attrs=first.attrs
     )
Beispiel #3
0
 def batch_iter(self, batch_size=32, **kwargs):
     """Iterate over `self` and yield its items grouped into batches.

     Args:
       batch_size: number of items collected before a batch is emitted.
       **kwargs: accepted but not used by this implementation.

     Yields:
       `numpy_collate` of each group of `batch_size` consecutive items,
       followed by one smaller final batch if items are left over.
     """
     # TODO - implement this in parallel - add `num_workers` argument
     # https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py#L589
     pending = []
     for item in self:
         pending.append(item)
         if len(pending) != batch_size:
             continue
         collated = numpy_collate(pending)
         # empty the buffer in place before handing the batch out
         del pending[:]
         yield collated
     # Return the rest
     if pending:
         yield numpy_collate(pending)
Beispiel #4
0
def get_example_data(example, layer, writer=None):
    """Compute input gradients for a bundled example and collate them.

    Loads the model and dataloader from ``examples/<example>``, reads the
    dataloader arguments from ``example_files/test.json``, runs
    ``model.input_grad`` on every batch and gathers the prepared output
    batches.

    Args:
      example: name of the sub-directory of ``examples/`` to load.
      layer: passed through as the ``layer`` argument of
        ``model.input_grad``.
      writer: optional object with ``batch_write`` and ``close`` methods.
        Every output batch is written to it, and it is closed before this
        function returns or raises from within the prediction loop.

    Returns:
      ``numpy_collate`` of all output batches (inputs are always kept).
    """
    example_dir = "examples/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")

    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)

    # every argument value is prefixed with the example_files/ directory;
    # the dataloader is created after cd-ing into the model source dir
    dataloader_arguments = {
        key: "example_files/" + value
        for key, value in dataloader_arguments.items()
    }

    outputs = []
    with cd(model.source_dir):
        try:
            dl = Dataloader(**dataloader_arguments)
            it = dl.batch_iter(batch_size=32, num_workers=0)

            # Loop through the data, make predictions, save the output
            for batch in tqdm(it):
                # make the prediction
                pred_batch = model.input_grad(batch['inputs'], avg_func="sum",
                                              layer=layer, final_layer=False)
                # write out the predictions, metadata (, inputs, targets)
                # always keep the inputs so that input*grad can be generated!
                output_batch = prepare_batch(batch, pred_batch,
                                             keep_inputs=True)
                if writer is not None:
                    writer.batch_write(output_batch)
                outputs.append(output_batch)
        finally:
            # close the writer even when a batch fails, so that its
            # underlying resource is not left open
            if writer is not None:
                writer.close()
    return numpy_collate(outputs)
Beispiel #5
0
    def score(self, input_batch, input_ref):
        """
        Calculate DeepLIFT scores of a given input sequence.

        Args:
          input_batch: Model input data.
          input_ref: Reference input handed to the DeepLIFT contribution
            function as `input_references_list`. If `None`, `None` is
            passed instead of a converted reference.

        Returns:
          DeepLIFT scores in the same shape / same containers as the input batch.
        """
        x_standardized = self.model._batch_to_list(input_batch)
        ref_standardized = None
        if input_ref is not None:
            ref_standardized = self.model._batch_to_list(input_ref)

        scores = self.deeplift_contribs_func(
            task_idx=self.task_idx,
            input_data_list=x_standardized,
            input_references_list=ref_standardized,
            batch_size=self.batch_size,
            progress_update=1000)

        # TODO DeepLIFT error when using batched execution.
        # `run_function_in_batches` fails here (the original note does not
        # say for which inputs); the disabled call was:
        #
        #   scores = run_function_in_batches(
        #       func=self.deeplift_contribs_func,
        #       input_data_list=x_standardized,
        #       batch_size=self.batch_size,
        #       progress_update=1000,
        #       task_idx=self.task_idx)

        # DeepLIFT returns all samples as a list of individual samples
        scores = [numpy_collate(el) for el in scores]

        # re-format the list-type input back to how the input_batch was:
        scores = self.model._match_to_input(scores, input_batch)
        return scores