def average_loss_per_target(self, logits, outputs, include_array=True):
    """Calculate averaged over examples.

    This is the loss to use for training. If affinity loss is calculated and
    "include_array" is set to True, the count loss for the novel sequences
    included in the microarray and the affinity loss for the sequences not
    included in the microarray are excluded from the average loss calculation.
    Otherwise, return the average count loss over all samples.

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: LabeledTensor with dtype=float32 and axes [batch, output_axis].
      include_array: Optional boolean indicating whether to also compute the
                     affinity loss against binding array data.

    Returns:
      LabeledTensor with dtype=float32 and axes [output_axis].
    """
    # should be independent of mini-batch size
    loss_matrix = self.loss_per_example_and_target(logits,
                                                   outputs,
                                                   include_array)

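    # When binding-array targets are present in the outputs and requested,
    # average the count loss and the affinity loss separately, each over only
    # the examples that actually have the corresponding measurements.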
    if bool(set(self.binding_arrays_map.keys()) &
            set(outputs.axes['output'].labels)) and include_array:
      count_loss = lt.select(loss_matrix,
                             {'target': list(self.target_axis.labels)})
      # Only the count loss for the samples with at least one non-zero
      # count output will be kept.
      loss_matrix_keep_idx = lt.reduce_any(
          lt.not_equal(
              lt.select(outputs, {'output': list(self.target_axis.labels)}),
              0.0),
          'output')
      loss_matrix_keep = lt.boolean_mask(count_loss, loss_matrix_keep_idx)
      reduce_loss_matrix = utils.reduce_nanmean(loss_matrix_keep, 'batch')

      affinity_loss = lt.select(
          loss_matrix, {'target': list(self.binding_arrays_map.keys())})
      # Only the affinity loss for the samples with at least one non-zero
      # affinity output will be kept.
      affinity_loss_keep_idx = lt.reduce_any(
          lt.not_equal(
              lt.select(outputs,
                        {'output': list(self.binding_arrays_map.keys())}), 0.0),
          'output')
      affinity_loss_keep = lt.boolean_mask(affinity_loss,
                                           affinity_loss_keep_idx)
      reduce_affinity_loss = utils.reduce_nanmean(affinity_loss_keep, 'batch')
      # Concatenate the count loss and the affinity loss along the target axis.
      avg_loss = lt.concat([reduce_loss_matrix, reduce_affinity_loss], 'target')

      # Only the additional output loss for the samples with at least one
      # non-zero output value will be kept.
      if self.additional_output_axis:
        ao_labels = list(self.additional_output_axis.labels)
        af_loss = lt.select(loss_matrix, {'target': ao_labels})
        af_loss_keep_idx = lt.reduce_any(
            lt.not_equal(lt.select(outputs, {'output': ao_labels}), 0.0),
            'output')
        af_loss_keep = lt.boolean_mask(af_loss, af_loss_keep_idx)
        reduce_af_loss = utils.reduce_nanmean(af_loss_keep, 'batch')
        avg_loss = lt.concat([avg_loss, reduce_af_loss], 'target')

    else:
      avg_loss = utils.reduce_nanmean(loss_matrix, 'batch')

    return avg_loss
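

# Usage sketch (assumption, not part of the original code): one way the
# per-target loss above might be reduced to a scalar training objective.
# `model` is a hypothetical instance of the class defining
# average_loss_per_target; `lt` and `tf` are the labeled_tensor and TensorFlow
# modules already used throughout these snippets.
def example_training_op(model, logits, outputs, learning_rate=1e-3):
    per_target_loss = model.average_loss_per_target(
        logits, outputs, include_array=True)
    # Collapse the per-target losses into a single scalar for the optimizer.
    scalar_loss = lt.reduce_mean(per_target_loss, 'target')
    return tf.train.AdamOptimizer(learning_rate).minimize(scalar_loss.tensor)
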
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

    Args:
      feature_tensors: Dict[str, tf.Tensor] with parsed features created by
        `build_features`.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module.
      skip_all_zero_counts: some sequences have no counts, e.g., because they
        were created artificially for validation purposes on the binding array.
        We want to skip these sequences for training.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      additional_output: optional list of strings containing additional output
        names.

    Returns:
      inputs: LabeledTensor with dtype=float32 and axes
        [batch_axis, input_position_axis, input_channel_axis], of
        one-hot-encoded rasterized sequences for input into machine learning
        models.
      outputs: LabeledTensor with dtype=float32 and axes
        [batch_axis, output_axis] denoting possible output tensors, including
        counts and binding array measurements.
    """

    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequence_tensor = feature_tensors['sequence']
    batch_axis = sequence_tensor.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
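        # Map each base to an integer index, then one-hot encode into a
        # [batch, position, channel] float32 tensor over the DNA alphabet.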
        seq_indices = custom_ops.dna_sequence_to_indices(
            sequence_tensor, sequence_length)
        tensor = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        axes = [batch_axis, position_axis, channel_axis]
        one_hots = lt.LabeledTensor(tensor, axes)
        inputs[SEQUENCE_ONE_HOT] = one_hots

    if SEQUENCE_KMER_COUNT in input_features:
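        # Count all kmers up to length kmer_k_max and standardize the counts
        # with their expected mean and standard deviation for sequences of
        # this length.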
        raw_counts = custom_ops.count_all_dna_kmers(sequence_tensor,
                                                    kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        counts = lt.LabeledTensor(raw_counts, [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        inputs[SEQUENCE_KMER_COUNT] = (
            (lt.cast(counts, tf.float32) - mean_count) / std_count)

    if STRUCTURE_PARTITION_FUNCTION in input_features:
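        # Use the log of the precomputed partition function as a single
        # structural feature per sequence.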
        with tf.name_scope('structure_partition_fn'):
            raw_pf_tensor = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

    output_names = count_names + array_names
    outputs = [lt.cast(feature_tensors[k], tf.float32) for k in output_names]

    if additional_output and additional_output[0]:
        outputs += [
            lt.cast(feature_tensors[k], tf.float32) for k in additional_output
        ]
        output_names += additional_output
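    # Pack the per-output tensors into one LabeledTensor with a labeled
    # 'output' axis.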
    outputs = lt.pack(outputs, ('output', output_names), axis_position=1)

    if skip_all_zero_counts:
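        # Drop examples whose counts are all zero (e.g., sequences present
        # only on the binding array), keeping inputs and outputs aligned.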
        with tf.name_scope('counts_filtering'):
            counts = lt.select(outputs, {'output': count_names})
            keep = lt.reduce_any(lt.not_equal(counts, 0.0), 'output')
            inputs = {k: lt.boolean_mask(v, keep) for k, v in inputs.items()}
            outputs = lt.boolean_mask(outputs, keep)

    return inputs, outputs
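

# Usage sketch (assumption, not part of the original code): wiring parsed
# features into model-ready inputs and outputs. `feature_tensors` and
# `experiment_proto` are assumed to come from the surrounding input pipeline
# (e.g. `build_features`, referenced in the docstring above); the feature
# selection here is illustrative rather than the project default.
def example_model_io(feature_tensors, experiment_proto):
    inputs, outputs = create_input_and_outputs(
        feature_tensors,
        experiment_proto,
        input_features=(SEQUENCE_ONE_HOT, SEQUENCE_KMER_COUNT),
        kmer_k_max=4)
    # Models typically consume the one-hot encoding; `outputs` carries counts
    # and binding-array measurements along its 'output' axis.
    return inputs[SEQUENCE_ONE_HOT], inputs[SEQUENCE_KMER_COUNT], outputs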