def transform(counts):
  if log_transform:
    counts = lt.log(1.0 + counts)
  selection_dict = {'target': list(counts.axes['target'].labels)}
  aligned_means = lt.select(means, selection_dict)
  aligned_stddevs = lt.select(stddevs, selection_dict)
  return (counts - aligned_means) / aligned_stddevs
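
# Illustrative sketch (not part of the library): the same per-target
# standardization that `transform` above applies with labeled tensors, written
# with plain NumPy and hypothetical statistics. `log_transform`, `means` and
# `stddevs` are assumed to come from the enclosing scope, as in the closure.
def _sketch_standardize_counts():
  import numpy as np

  counts = np.array([[0.0, 3.0], [10.0, 1.0]])  # [batch, target]
  means = np.array([1.2, 0.8])                  # hypothetical per-target means
  stddevs = np.array([0.9, 0.5])                # hypothetical per-target stds
  log_transform = True

  if log_transform:
    counts = np.log1p(counts)                   # log(1 + counts)
  return (counts - means) / stddevs             # z-score each target column
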
  def affinity_loss_per_example_and_target(self, logits, outputs):
    """Calculate loss per example on predicting affinity.

    This calls "predict_affinity", which is assumed to be implemented by the
    current output layer to predict affinity, and calculates the loss against
    the array output.

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: LabeledTensor with dtype=float32 and axes [batch, output_axis].
        These outputs should include everything from the preprocessing, whether
        it is used in the loss or not.

    Returns:
      LabeledTensor with dtype=float32 and axes [batch, target_axis] giving
      loss for each target.
    """
    affinity_pred = _affinities_to_binding_arrays(self.binding_arrays_map,
                                                  self.predict_affinity(logits))

    affinity_pred = lt.rename_axis(affinity_pred, 'output', 'target')
    array_output = lt.rename_axis(
        lt.select(outputs, {'output': list(self.binding_arrays_map.keys())}),
        'output', 'target')

    return self.loss.per_example_and_target_array(affinity_pred, array_output)
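
# Illustrative sketch (hypothetical names, not from the library): per the
# helpers further below, `binding_arrays_map` maps binding-array output names
# to affinity labels. The affinity loss pairs the predicted affinity for each
# protein with the measured array value, roughly as done here with plain dicts
# and a squared-error loss standing in for per_example_and_target_array.
def _sketch_pair_affinities_with_arrays():
  binding_arrays_map = {'array_pool': 'protein_A'}       # hypothetical names
  affinity_pred = {'protein_A': 0.7}                     # from predict_affinity
  outputs = {'array_pool': 0.9, 'round2_count': 12.0}    # measured outputs
  pairs = [(affinity_pred[label], outputs[name])
           for name, label in binding_arrays_map.items()]
  return [(pred - measured) ** 2 for pred, measured in pairs]
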
  def _normed_prev_round_counts(self, input_counts,
                                input_counts_label='output'):
    """Create a Tensor with normalized counts from the previous round.

    Args:
      input_counts: LabeledTensor with dtype=float32 and axes
        [batch, input_counts_label].
      input_counts_label: Name of the axis in input_counts that contains the
        count data to use. For LatentAffinityWithDeps that uses the
        actual count values, input_counts will be the outputs tensor and the
        label will be 'output'. For LatentAffinityWithPredDeps, input_counts
        will be the predictions for these counts and the axis will be 'target'.

    Returns:
      preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
    """
    parent_lookup = {}
    for k, parent in self.parent_count_names.items():
      if parent in input_counts.axes:
        parent_lookup[k] = lt.select(input_counts,
                                     {input_counts_label: parent})
    default_tensor = lt.LabeledTensor(
        tf.zeros_like(input_counts[:, 0]), [input_counts.axes['batch']])
    parent_tensors = [
        parent_lookup.get(k, default_tensor) for k in self.target_axis.labels
    ]
    parent_counts = lt.pack(parent_tensors, self.target_axis, axis_position=1)
    normed_counts = self.deps_normalize(parent_counts)
    return normed_counts
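
# Illustrative sketch (not from the library): the lookup-with-zero-default
# pattern used by _normed_prev_round_counts above, written with NumPy and
# hypothetical column names. Targets whose parent column is missing from the
# input get a zero vector, then everything is stacked along the target axis.
def _sketch_prev_round_counts():
  import numpy as np

  batch_size = 2
  input_counts = {'round1_count': np.array([5.0, 0.0])}   # one column per name
  parent_count_names = {'round2_count': 'round1_count',   # hypothetical names
                        'round0_count': 'missing_parent'}
  target_labels = ['round2_count', 'round0_count']

  default = np.zeros(batch_size)  # used when a parent column is unavailable
  parents = [input_counts.get(parent_count_names[t], default)
             for t in target_labels]
  return np.stack(parents, axis=1)  # [batch, target], like lt.pack above
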
 def predict_additional_output(self, logits):
     if not self.additional_output_axis:
         raise Error(
             'Tried to calculate additional output, but no additional '
             'output axis was specified.')
     return lt.select(logits,
                      {'target': list(self.additional_output_axis.labels)})
  def predict_outputs(self, logits, outputs=None):
    """Predict a score that should correlate with each output.

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: optional LabeledTensor with dtype=float32 and axes [batch,
        output_axis]. Note that different output layers may not be directly
        comparable if they make use of `outputs` from prior rounds of selection
        in predictions.

    Returns:
      LabeledTensor with dtype=float32 and axes [batch, output_axis] giving
      predictions for each count and binding array.
    """
    predicted_counts = lt.rename_axis(
        self.predict_counts(logits, outputs), 'target', 'output')

    if self.binding_arrays_map:
      predicted_affinity = self.predict_affinity(logits)
      predicted_binding_arrays = lt.pack([
          lt.select(predicted_affinity, {'affinity': target})
          for target in self.binding_arrays_map.values()
      ], ('output', list(self.binding_arrays_map.keys())),
                                         axis_position=1)
      preds = lt.concat([predicted_counts, predicted_binding_arrays], 'output')
    else:
      preds = predicted_counts

    if self.additional_output_axis:
      predicted_additional_output = lt.rename_axis(
          self.predict_additional_output(logits), 'target', 'output')
      preds = lt.concat([preds, predicted_additional_output], 'output')
    return preds
  def predict_counts(self, logits, outputs=None):  # pylint: disable=unused-argument
    """Make count predictions from logits and counts.

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: LabeledTensor with dtype=float32 and axes [batch, output_axis].
        Unused by the base class but in the signature for the benefit of
        subclasses that use counts from previous rounds to help predict future
        rounds. It is the responsibility of the implementation using `outputs`
        to ensure that this method respects the causal structure of the
        experiment.

    Returns:
      preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
    """
    # TODO(shoyer): consider using tf.nn.softplus instead of abs here
    weights = abs(self.affinity_weights) * self.selection_signs
    if self.additional_output_axis:
      affinity_logits = lt.rename_axis(
          lt.select(logits, {'target': list(self.affinity_axis.labels)}),
          'target', 'affinity')
    else:
      affinity_logits = lt.rename_axis(logits, 'target', 'affinity')
    preds = lt.matmul(affinity_logits, weights) + self.bias
    return preds
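
# Illustrative sketch (not from the library): the sign-constrained linear map
# used by predict_counts above, in NumPy with hypothetical shapes. Taking
# abs(weights) and multiplying by fixed selection signs keeps each weight's
# sign consistent with the direction of selection.
def _sketch_latent_affinity_counts():
  import numpy as np

  logits = np.random.randn(3, 2)                     # [batch, affinity]
  affinity_weights = np.random.randn(2, 4)           # [affinity, target]
  selection_signs = np.sign(np.random.randn(2, 4))   # fixed +/-1 per entry
  bias = np.zeros(4)                                 # [target]

  weights = np.abs(affinity_weights) * selection_signs
  return logits @ weights + bias                     # [batch, target]
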
 def predict_affinity(self, logits):
   """See method on base class."""
   if self.additional_output_axis:
     return lt.rename_axis(
         lt.select(logits, {'target': list(self.affinity_axis.labels)}),
         'target', 'affinity')
   else:
     return lt.rename_axis(logits, 'target', 'affinity')
 def loss_per_example_and_target(self, logits, outputs, include_array=True):
   """See method on base class."""
   with tf.name_scope('predictions'):
     if self.additional_output_axis:
       affinity_logits = lt.select(logits,
                                   {'target': list(self.affinity_axis.labels)})
       ao_logits = lt.select(logits,
                             {'target':
                              list(self.additional_output_axis.labels)})
       count_preds = self.predict_counts(affinity_logits, outputs)
       preds = lt.concat([count_preds, ao_logits], 'target')
     else:
       preds = self.predict_counts(logits, outputs)
   targets = _targets_from_outputs(outputs, self.all_target_axis)
   loss = self.loss.per_example_and_target(preds, targets)
   if bool(set(self.binding_arrays_map.keys()) &
           set(outputs.axes['output'].labels)) and include_array:
     affinity_loss = self.affinity_loss_per_example_and_target(logits, outputs)
     return lt.concat([loss, affinity_loss], 'target')
   else:
     return loss
    def __init__(self, hps, net, output_layer, experiment_proto, input_paths):
        inputs, outputs = data.input_pipeline(input_paths,
                                              experiment_proto,
                                              hps.mbsz,
                                              hps=hps,
                                              num_threads=8)
        with tf.name_scope('neural_net'):
            logits = net.fprop(inputs, mode='train')
        with tf.name_scope('output_layer'):
            loss_per_target = output_layer.average_loss_per_target(
                logits, outputs, include_array=hps.train_on_array)
            loss = utils.reduce_nanmean(loss_per_target)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        if hps.optimizer == 'momentum':
            optimizer = tf.train.MomentumOptimizer(hps.learn_rate, hps.momentum)
        elif hps.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(hps.learn_rate)
        else:
            raise ValueError('invalid optimizer: %s' % hps.optimizer)
        grads = optimizer.compute_gradients(loss,
                                            net.params + output_layer.params)
        opt_op = optimizer.apply_gradients(grads, global_step=self.global_step)
        self.train_op = tf.with_dependencies([opt_op], loss)

        contrib_deprecated.scalar_summary('loss/mean', loss)
        for target in loss_per_target.axes['target'].labels:
            contrib_deprecated.scalar_summary(
                'loss/' + six.ensure_str(target),
                lt.select(loss_per_target, {'target': target}))
        with tf.name_scope('summarize_grads'):
            slim.learning.add_gradients_summaries(grads)

        tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, self.global_step)
        tf.add_to_collection('train_op', self.train_op)
        tf.add_to_collection('loss', loss)

        self.mbsz = hps.mbsz
        # The log Poisson loss implemented in TensorFlow may sometimes be negative.
        if hps.loss_name in (output_layers.LOSS_POISSON_LOSS,
                             output_layers.LOSS_ZERO_TRUNCATED_POISSON_LOSS):
            self.min_cost = -float('inf')
            self.min_is_inclusive = False
        else:
            self.min_cost = 0
            self.min_is_inclusive = True
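
# Illustrative sketch (not from the library): the train_op above is built so
# that fetching it both applies the optimizer update and returns the loss
# value. In TF1-style code the same effect is commonly written with
# tf.control_dependencies, as below (loss and opt_op are hypothetical inputs).
def _sketch_train_op(loss, opt_op):
  import tensorflow as tf
  with tf.control_dependencies([opt_op]):
    # Fetching this tensor forces opt_op to run first, then yields the loss.
    return tf.identity(loss, name='train_op')
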
 def predict_counts(self, logits, outputs):
   """See method on base class."""
   preds = super(LatentAffinityWithCrossDeps, self).predict_counts(logits,
                                                                   outputs)
   interact_weights = abs(self.logit_by_prev_count) * self.selection_signs
   # We're calling _normed_prev_round_counts a second time here with the same
   # arguments, but that's actually OK because TensorFlow automatically
   # consolidates these calls.
   if self.additional_output_axis:
     affinity_logits = lt.rename_axis(
         lt.select(logits, {'target': list(self.affinity_axis.labels)}),
         'target', 'affinity')
   else:
     affinity_logits = lt.rename_axis(logits, 'target', 'affinity')
   preds += (lt.matmul(affinity_logits, interact_weights) *
             self._normed_prev_round_counts(outputs))
   return preds
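
# Illustrative sketch (not from the library): the cross-dependency term added
# above, in NumPy with hypothetical shapes. The base prediction is adjusted by
# an interaction between the affinity logits and the normalized counts from
# the previous round.
def _sketch_cross_deps_counts():
  import numpy as np

  base_preds = np.zeros((3, 4))                    # [batch, target]
  logits = np.random.randn(3, 2)                   # [batch, affinity]
  logit_by_prev_count = np.random.randn(2, 4)      # [affinity, target]
  selection_signs = np.sign(np.random.randn(2, 4))
  normed_prev_counts = np.random.randn(3, 4)       # [batch, target]

  interact = np.abs(logit_by_prev_count) * selection_signs
  return base_preds + (logits @ interact) * normed_prev_counts
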
  def predict_affinity(self, logits):
    """See method on base class."""

    if not self.affinity_target_lt:
      raise Error(
          'No affinity_target_map has been designated. This FullyObserved '
          'layer cannot calculate the affinity. The FullyObserved layer '
          'must be initialized with an affinity_target_map to be capable '
          'of calculating affinity.')

    # Do a matrix multiply to turn (target) x (target by protein) into a
    # vector of length protein. For proteins with multiple targets, the
    # multiplication takes the sum of the values.
    if self.additional_output_axis:
      count_logits = lt.select(logits,
                               {'target': list(self.target_axis.labels)})
    else:
      count_logits = logits
    output_per_affinity = lt.matmul(count_logits, self.affinity_target_lt)

    return output_per_affinity
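
# Illustrative sketch (not from the library): the (target) x (target, protein)
# matrix multiply described in the comment above, with NumPy and hypothetical
# labels. A protein associated with two targets receives the sum of both
# target values.
def _sketch_targets_to_proteins():
  import numpy as np

  count_logits = np.array([[1.0, 2.0, 3.0]])   # [batch=1, target=3]
  # Indicator matrix: targets 0 and 1 belong to protein_A, target 2 to protein_B.
  affinity_target = np.array([[1.0, 0.0],
                              [1.0, 0.0],
                              [0.0, 1.0]])      # [target, protein]
  return count_logits @ affinity_target         # [[3.0, 3.0]]: per-protein sums
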
 def predict_counts(self, logits, outputs=None):
   """See method on base class."""
   if self.additional_output_axis:
     return lt.select(logits, {'target': list(self.target_axis.labels)})
   else:
     return logits
def _affinities_to_binding_arrays(binding_arrays_map, affinities):
  return lt.pack([
      lt.select(affinities, {'affinity': target})
      for target in binding_arrays_map.values()
  ], ('output', list(binding_arrays_map.keys())),
                 axis_position=1)
def _targets_from_outputs(outputs, target_axis):
  selected = lt.select(outputs, {'output': list(target_axis.labels)})
  targets = lt.reshape(selected, ['output'], [target_axis])
  return targets
  def average_loss_per_target(self, logits, outputs, include_array=True):
    """Calculate averaged over examples.

    This is the loss to use for training. If affinity loss is calculated and
    "include_array" is set to True, the count loss for the novel sequences
    included in the microarray and the affinity loss for the sequences not
    included in the microarray are excluded from the average loss calculation.
    Otherwise, return the average count loss over all samples.

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: LabeledTensor with dtype=float32 and axes [batch, output_axis].
      include_array: Optional boolean variable indicating whether to also
                     compute affinity loss against binding array data.

    Returns:
      LabeledTensor with dtype=float32 and axes [target_axis].
    """
    # should be independent of mini-batch size
    loss_matrix = self.loss_per_example_and_target(logits,
                                                   outputs,
                                                   include_array)

    if bool(set(self.binding_arrays_map.keys()) &
            set(outputs.axes['output'].labels)) and include_array:
      count_loss = lt.select(loss_matrix,
                             {'target': list(self.target_axis.labels)})
      # Only the count loss for the samples with at least one non-zero
      # count output will be kept.
      loss_matrix_keep_idx = lt.reduce_any(
          lt.not_equal(
              lt.select(outputs, {'output': list(self.target_axis.labels)}),
              0.0), 'output')
      loss_matrix_keep = lt.boolean_mask(count_loss, loss_matrix_keep_idx)
      reduce_loss_matrix = utils.reduce_nanmean(loss_matrix_keep, 'batch')

      affinity_loss = lt.select(
          loss_matrix, {'target': list(self.binding_arrays_map.keys())})
      # Only the affinity loss for the samples with at least one non-zero
      # affinity output will be kept.
      affinity_loss_keep_idx = lt.reduce_any(
          lt.not_equal(
              lt.select(outputs,
                        {'output': list(self.binding_arrays_map.keys())}), 0.0),
          'output')
      affinity_loss_keep = lt.boolean_mask(affinity_loss,
                                           affinity_loss_keep_idx)
      reduce_affinity_loss = utils.reduce_nanmean(affinity_loss_keep, 'batch')
      # Count loss and affinity loss are concatenated.
      avg_loss = lt.concat([reduce_loss_matrix, reduce_affinity_loss], 'target')

      # Only the additional output loss for the samples with at least one
      # non-zero output value will be kept.
      if self.additional_output_axis:
        ao_labels = list(self.additional_output_axis.labels)
        af_loss = lt.select(loss_matrix, {'target': ao_labels})
        af_loss_keep_idx = lt.reduce_any(
            lt.not_equal(lt.select(outputs, {'output': ao_labels}), 0.0),
            'output')
        af_loss_keep = lt.boolean_mask(af_loss, af_loss_keep_idx)
        reduce_af_loss = utils.reduce_nanmean(af_loss_keep, 'batch')
        avg_loss = lt.concat([avg_loss, reduce_af_loss], 'target')

    else:
      avg_loss = utils.reduce_nanmean(loss_matrix, 'batch')

    return avg_loss
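
# Illustrative sketch (not from the library): the masking used above, in
# NumPy. Only examples with at least one non-zero value in the relevant output
# columns contribute to the per-target average, and NaN entries are ignored by
# the nan-aware mean, mirroring utils.reduce_nanmean.
def _sketch_masked_average_loss():
  import numpy as np

  loss = np.array([[0.5, np.nan], [1.5, 2.0], [3.0, 1.0]])   # [batch, target]
  outputs = np.array([[0.0, 0.0], [4.0, 0.0], [1.0, 2.0]])   # [batch, output]

  keep = np.any(outputs != 0.0, axis=1)   # drop examples with all-zero outputs
  return np.nanmean(loss[keep], axis=0)   # nan-aware mean over remaining batch
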
 def transform(counts):
   aligned_counts = lt.select(
       total_counts, {'target': list(counts.axes['target'].labels)})
   return counts / lt.cast(aligned_counts, tf.float32)
 def _split_outputs(self, outputs):
     """Split outputs into counts and binding array LabeledTensors."""
     counts = lt.select(outputs, {'output': self.all_count_names})
     binding = lt.select(outputs, {'output': self.binding_array_names})
     return counts, binding
def compute_experiment_statistics(
        experiment_proto,
        input_paths,
        proto_w_stats_path,
        preprocess_mode=data.PREPROCESS_SKIP_ALL_ZERO_COUNTS,
        max_size=None,
        logdir=None,
        save_stats=False):
    """Calculate the mean and standard deviation of counts from input files.

  These statistics are used for normalization. If any statistic is missing or
  save_stats=True, compute the statistics. Save the statistics to
  proto_w_stats_path if save_stats=True.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_paths: list of strings giving paths to sstables of input examples.
    proto_w_stats_path: string path to the validation proto file with stats
    preprocess_mode: optional preprocess mode defined in the `data` module.
    max_size: optional number of examples to examine to compute statistics. By
      default, examines the entire dataset.
    logdir: optional path to a directory in which to log events.
    save_stats: optional boolean indicating whether to update all the statistics
      and save to proto_w_stats_path.

  Returns:
    selection_pb2.Experiment with computed statistics.
  """
    experiment_proto = copy.deepcopy(experiment_proto)

    has_all_statistics = True

    all_reads = {}
    for round_proto in experiment_proto.rounds.values():
        for reads in [round_proto.positive_reads, round_proto.negative_reads]:
            if reads.name:
                all_reads[reads.name] = reads
                if not reads.HasField('statistics'):
                    has_all_statistics = False

    all_ao = {}
    for ao_proto in experiment_proto.additional_output:
        if ao_proto.name:
            all_ao[ao_proto.name] = ao_proto
            if not ao_proto.HasField('statistics'):
                has_all_statistics = False

    if not has_all_statistics or save_stats:
        with tf.Graph().as_default():
            logger.info('Setting up graph for statistics')
            # We only care about outputs, which don't rely on training
            # hyperparameters.
            hps = tf.contrib.training.HParams(
                preprocess_mode=preprocess_mode,
                kmer_k_max=0,
                ratio_random_dna=0.0,
                total_reads_defining_positive=0,
                additional_output=','.join([
                    x.name for x in experiment_proto.additional_output
                ]))
            _, outputs = data.input_pipeline(input_paths,
                                             experiment_proto,
                                             final_mbsz=100000,
                                             hps=hps,
                                             num_epochs=1,
                                             num_threads=1)
            size_op = tf.shape(outputs)[list(
                outputs.axes.keys()).index('batch')]

            all_update_ops = []
            all_value_ops = {}
            for name in all_reads:
                counts = lt.select(outputs, {'output': name})
                log_counts = lt.log(counts + 1.0)
                ops = {
                    'mean': contrib_metrics.streaming_mean(counts),
                    'std_dev': streaming_std(counts),
                    'mean_log_plus_one':
                    contrib_metrics.streaming_mean(log_counts),
                    'std_dev_log_plus_one': streaming_std(log_counts),
                }
                value_ops, update_ops = contrib_metrics.aggregate_metric_map(
                    ops)
                all_update_ops.extend(list(update_ops.values()))
                all_value_ops[name] = value_ops

            for name in all_ao:
                ao = lt.select(outputs, {'output': name})
                log_ao = lt.log(ao + 1.0)
                ops = {
                    'mean': contrib_metrics.streaming_mean(ao),
                    'std_dev': streaming_std(ao),
                    'mean_log_plus_one':
                    contrib_metrics.streaming_mean(log_ao),
                    'std_dev_log_plus_one': streaming_std(log_ao),
                }
                value_ops, update_ops = contrib_metrics.aggregate_metric_map(
                    ops)
                all_update_ops.extend(list(update_ops.values()))
                all_value_ops[name] = value_ops

            logger.info('Running statistics ops')
            sv = tf.train.Supervisor(logdir=logdir)
            with sv.managed_session() as sess:
                total = 0
                for results in run_until_exhausted(sv, sess,
                                                   [size_op] + all_update_ops):
                    total += results[0]
                    if max_size is not None and total >= max_size:
                        break
                all_statistics = {
                    k: sess.run(v)
                    for k, v in all_value_ops.items()
                }

            for reads_name, reads in all_reads.items():
                for name, value in all_statistics[reads_name].items():
                    setattr(reads.statistics, name, value.item())

            for ao_name, ao in all_ao.items():
                for name, value in all_statistics[ao_name].items():
                    setattr(ao.statistics, name, value.item())

            logger.info('Computed statistics: %r', all_statistics)

            if save_stats:
                logger.info('Save the proto with statistics to %s',
                            proto_w_stats_path)
                with open('/tmp/tmp.pbtxt', 'w') as f:
                    f.write(text_format.MessageToString(experiment_proto))
                gfile.Copy('/tmp/tmp.pbtxt',
                           proto_w_stats_path,
                           overwrite=True)
    else:
        logger.info('All the statistics exist. Nothing to compute')
    return experiment_proto
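
# Illustrative sketch (not from the library): the kind of running computation
# performed by streaming_mean / streaming_std above, written as a plain
# Welford-style accumulator so the whole dataset never has to fit in memory.
class _SketchStreamingMoments(object):
  """Running mean / standard deviation accumulator (illustrative only)."""

  def __init__(self):
    self.count = 0
    self.mean = 0.0
    self.m2 = 0.0  # running sum of squared deviations from the mean

  def update(self, value):
    self.count += 1
    delta = value - self.mean
    self.mean += delta / self.count
    self.m2 += delta * (value - self.mean)

  def std_dev(self):
    return (self.m2 / self.count) ** 0.5 if self.count else 0.0
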
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

  Args:
    feature_tensors: Dict[str, tf.Tensor] with parsed featured created by
      `build_features`.
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_features: optional sequence of feature constants defined in this
      module.
    skip_all_zero_counts: some sequences have no counts, e.g., because they were
      created artificially for validation purposes on the binding array. We want
      to skip these sequences for training.
    kmer_k_max: optional integer giving the maximum kmer length to use if
      SEQUENCE_KMER_COUNT is in `input_features`.
    additional_output: optional list of strings naming additional outputs.

  Returns:
    inputs: LabeledTensor with dtype=float32 and axes
      [batch_axis, input_position_axis, input_channel_axis], of one-hot-encoded
      rasterized sequences for input into machine learning models.
    outputs: LabeledTensor with dtype=float32 and axes [batch_axis, output_axis]
      denoting possible output tensors, including counts and binding array
      measurements.
  """

    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequence_tensor = feature_tensors['sequence']
    batch_axis = sequence_tensor.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        seq_indices = custom_ops.dna_sequence_to_indices(
            sequence_tensor, sequence_length)
        tensor = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        axes = [batch_axis, position_axis, channel_axis]
        one_hots = lt.LabeledTensor(tensor, axes)
        inputs[SEQUENCE_ONE_HOT] = one_hots

    if SEQUENCE_KMER_COUNT in input_features:
        raw_counts = custom_ops.count_all_dna_kmers(sequence_tensor,
                                                    kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        counts = lt.LabeledTensor(raw_counts, [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        inputs[SEQUENCE_KMER_COUNT] = (
            (lt.cast(counts, tf.float32) - mean_count) / std_count)

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            raw_pf_tensor = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

    output_names = count_names + array_names
    outputs = [lt.cast(feature_tensors[k], tf.float32) for k in output_names]

    if additional_output and additional_output[0]:
        outputs += [
            lt.cast(feature_tensors[k], tf.float32) for k in additional_output
        ]
        output_names += additional_output
    outputs = lt.pack(outputs, ('output', output_names), axis_position=1)

    if skip_all_zero_counts:
        with tf.name_scope('counts_filtering'):
            counts = lt.select(outputs, {'output': count_names})
            keep = lt.reduce_any(lt.not_equal(counts, 0.0), 'output')
            inputs = {k: lt.boolean_mask(v, keep) for k, v in inputs.items()}
            outputs = lt.boolean_mask(outputs, keep)

    return inputs, outputs
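
# Illustrative sketch (not from the library): one-hot encoding of a DNA
# sequence like the SEQUENCE_ONE_HOT input built above, written with NumPy and
# an assumed A, C, G, T channel order (the example sequence is hypothetical).
def _sketch_one_hot_dna(sequence='ACGT'):
  import numpy as np

  bases = 'ACGT'  # assumed channel order for this illustration
  indices = [bases.index(base) for base in sequence]         # [position]
  one_hot = np.zeros((len(sequence), len(bases)), np.float32)
  one_hot[np.arange(len(sequence)), indices] = 1.0           # [position, channel]
  return one_hot
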