def _normed_prev_round_counts(self, input_counts,
                                input_counts_label='output'):
    """Create a Tensor with normalized counts from the previous round.

    For every label on `self.target_axis`, the count of its parent (as given
    by `self.parent_count_names`) is looked up in `input_counts`. Targets
    without an entry in `self.parent_count_names`, or whose parent is not
    present in `input_counts`, get an all-zero column instead. The packed
    result is passed through `self.deps_normalize`.

    Args:
      input_counts: LabeledTensor with dtype=float32 and axes
        [batch, input_counts_label].
      input_counts_label: Name of the axis in input_counts that contains the
        count data to use. For LatentAffinityWithDeps that uses the
        actual count values, input_counts will be the outputs tensor and the
        label will be 'output'. For LatentAffinityWithPredDeps, input_counts
        will be the predictions for these counts and the axis will be 'target'.

    Returns:
      preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
    """
    # `input_counts.axes` is keyed by axis *name* (e.g. 'batch'), so testing
    # `parent in input_counts.axes` would never match a count name. The parent
    # has to be looked for among the labels of the count axis itself.
    available_labels = input_counts.axes[input_counts_label].labels
    if available_labels is None:
      available_labels = ()

    parent_lookup = {}
    for k, parent in self.parent_count_names.items():
      if parent in available_labels:
        parent_lookup[k] = lt.select(input_counts,
                                     {input_counts_label: parent})

    # Zero column used for every target that has no usable parent count.
    default_tensor = lt.LabeledTensor(
        tf.zeros_like(input_counts[:, 0]), [input_counts.axes['batch']])
    parent_tensors = [
        parent_lookup.get(k, default_tensor) for k in self.target_axis.labels
    ]
    parent_counts = lt.pack(parent_tensors, self.target_axis, axis_position=1)
    normed_counts = self.deps_normalize(parent_counts)
    return normed_counts
  def predict_outputs(self, logits, outputs=None):
    """Predict a score that should correlate with each output.

    Count predictions come first on the 'output' axis, followed by one entry
    per binding array (when `self.binding_arrays_map` is non-empty) and then
    by any additional outputs (when `self.additional_output_axis` is truthy).

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: optional LabeledTensor with dtype=float32 and axes [batch,
        output_axis]. Note that different output layers may not be directly
        comparable if they make use of `outputs` from prior rounds of
        selection in predictions.

    Returns:
      LabeledTensor with dtype=float32 and axes [batch, output_axis] giving
      predictions for each count and binding array.
    """
    count_preds = lt.rename_axis(
        self.predict_counts(logits, outputs), 'target', 'output')
    preds = count_preds

    if self.binding_arrays_map:
      # Each binding array is predicted by the affinity of its mapped target.
      affinity_preds = self.predict_affinity(logits)
      per_array = [
          lt.select(affinity_preds, {'affinity': target})
          for target in self.binding_arrays_map.values()
      ]
      array_axis = ('output', list(self.binding_arrays_map.keys()))
      array_preds = lt.pack(per_array, array_axis, axis_position=1)
      preds = lt.concat([count_preds, array_preds], 'output')

    if self.additional_output_axis:
      extra_preds = lt.rename_axis(
          self.predict_additional_output(logits), 'target', 'output')
      preds = lt.concat([preds, extra_preds], 'output')

    return preds
def _affinities_to_binding_arrays(binding_arrays_map, affinities):
  """Repack selected affinities along a new 'output' axis.

  Args:
    binding_arrays_map: mapping whose keys become the labels of the new
      'output' axis and whose values are the labels selected from the
      'affinity' axis of `affinities` (presumably binding array name ->
      affinity target name; confirm against callers).
    affinities: LabeledTensor with an 'affinity' axis.

  Returns:
    LabeledTensor in which the selected affinities are packed along an
    'output' axis inserted at position 1.
  """
  per_array = [
      lt.select(affinities, {'affinity': target})
      for target in binding_arrays_map.values()
  ]
  output_axis = ('output', list(binding_arrays_map.keys()))
  return lt.pack(per_array, output_axis, axis_position=1)
Esempio n. 4
0
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

    Args:
      feature_tensors: Dict[str, tf.Tensor] with parsed features created by
        `build_features`.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module.
      skip_all_zero_counts: some sequences have no counts, e.g., because they
        were created artificially for validation purposes on the binding
        array. We want to skip these sequences for training.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      additional_output: optional list of strings naming additional entries of
        `feature_tensors` to append to the outputs. Ignored when empty or when
        its first element is falsy.

    Returns:
      inputs: dict mapping each requested feature constant to a LabeledTensor.
        SEQUENCE_ONE_HOT has axes [batch, position, channel],
        SEQUENCE_KMER_COUNT has axes [batch, kmer] (standardized counts) and
        STRUCTURE_PARTITION_FUNCTION has axes [batch, partition_fn_axis]
        (log of the partition function).
      outputs: LabeledTensor with dtype=float32 and axes [batch, output]
        denoting possible output tensors, including counts and binding array
        measurements.
    """
    seq_len = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequences = feature_tensors['sequence']
    batch_axis = sequences.axes['batch']
    position_axis = ('position', list(range(seq_len)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        base_indices = custom_ops.dna_sequence_to_indices(sequences, seq_len)
        encoded = tf.one_hot(base_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
            encoded, [batch_axis, position_axis, channel_axis])

    if SEQUENCE_KMER_COUNT in input_features:
        kmer_tensor = custom_ops.count_all_dna_kmers(sequences, kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        kmer_counts = lt.LabeledTensor(kmer_tensor, [batch_axis, kmer_axis])
        # Standardize each kmer count with its precomputed mean and std.
        means, stds = _all_kmer_mean_and_std(kmer_k_max, seq_len)
        kmer_mean = lt.constant(means, tf.float32, axes=[kmer_axis])
        kmer_std = lt.constant(stds, tf.float32, axes=[kmer_axis])
        centered = lt.cast(kmer_counts, tf.float32) - kmer_mean
        inputs[SEQUENCE_KMER_COUNT] = centered / kmer_std

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            partition_fn = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(partition_fn)

    output_names = count_names + array_names
    output_columns = [
        lt.cast(feature_tensors[name], tf.float32) for name in output_names
    ]

    if additional_output and additional_output[0]:
        output_columns.extend(
            lt.cast(feature_tensors[name], tf.float32)
            for name in additional_output)
        output_names += additional_output
    outputs = lt.pack(output_columns, ('output', output_names),
                      axis_position=1)

    if not skip_all_zero_counts:
        return inputs, outputs

    with tf.name_scope('counts_filtering'):
        # Keep only the examples with at least one non-zero count.
        count_columns = lt.select(outputs, {'output': count_names})
        has_counts = lt.reduce_any(
            lt.not_equal(count_columns, 0.0), 'output')
        inputs = {
            feature: lt.boolean_mask(tensor, has_counts)
            for feature, tensor in inputs.items()
        }
        outputs = lt.boolean_mask(outputs, has_counts)

    return inputs, outputs
Esempio n. 5
0
def upsample_positives(feature_tensors,
                       count_names,
                       total_reads_defining_positive,
                       min_fraction_positive,
                       seed=None):
    """Returns feature tensors with positives upsampled to the desired rate.

    An example is "positive" when the sum of its counts over `count_names` is
    at least `total_reads_defining_positive`. If the batch has no positives,
    or already has at least `min_fraction_positive` of them, the features are
    returned without resampling; otherwise they are passed through
    `resample.resample_at_rate` with rate 1 for negatives and a rate for
    positives derived from the requested fraction.

    Args:
      feature_tensors: Dict[str, lt.LabeledTensor] with parsed features
        created by `build_features`.
      count_names: A list of labels that are count names.
      total_reads_defining_positive: The minimum number of reads detected
        across all conditions that defines a sequence as being a positive
        example.
      min_fraction_positive: The minimum fraction of positive examples to
        allow in the data.
      seed: The random seed to use in upsampling.

    Returns:
      A dictionary mapping from string feature name to lt.LabeledTensor of
      parsed features created by `build_features` and positive examples
      upsampled to the desired rate.

    Raises:
      ValueError: The minimum positive fraction requested is invalid.
    """
    if min_fraction_positive >= 1 or min_fraction_positive < 0:
        raise ValueError('Invalid fraction positive, must be in [0, 1): %s' %
                         min_fraction_positive)

    with tf.name_scope('upsample_positives'):
        # Split the batch into positives and negatives.
        read_threshold = tf.constant(
            total_reads_defining_positive, dtype=tf.float32)
        target_fraction = tf.constant(min_fraction_positive,
                                      dtype=tf.float32)
        count_columns = [
            lt.cast(feature_tensors[name], tf.float32)
            for name in count_names
        ]
        counts = lt.pack(count_columns, ('sequence_counts', count_names),
                         axis_position=1)
        total_reads = lt.reduce_sum(counts, 'sequence_counts')
        is_positive = total_reads >= read_threshold
        num_pos = lt.reduce_sum(lt.cast(is_positive, tf.int32))
        is_negative = lt.logical_not(is_positive)
        num_neg = lt.reduce_sum(lt.cast(is_negative, tf.int32))

        # Negatives keep a sampling rate of 1. With P positives, N negatives
        # and target fraction F, the positive rate R has to satisfy
        #   P * R / (P * R + N) >= F
        # which rearranges to
        #   R = F * N / (P * (1 - F)).
        numerator = target_fraction * tf.cast(num_neg, tf.float32)
        raw_denom = tf.cast(num_pos, tf.float32) * (1 - target_fraction)
        # A zero denominator means there are no positives, in which case the
        # resampling branch below is never taken; any non-zero stand-in
        # avoids dividing by zero here.
        safe_denom = tf.cond(
            raw_denom > 0.0,
            lambda: raw_denom,
            lambda: tf.constant(1.0, dtype=tf.float32))
        positive_rate = numerator / safe_denom
        batch_size = tf.shape(is_positive)[0]
        negative_rates = tf.ones([batch_size], tf.float32)
        positive_rates = tf.fill([batch_size], positive_rate)
        rates = tf.where(is_positive, positive_rates, negative_rates)

        # Strip the LabeledTensors down to plain tensors, remembering the
        # axes and static shapes needed to rebuild them afterwards.
        ordered_names = sorted(feature_tensors)
        labeled_inputs = [feature_tensors[name] for name in ordered_names]
        plain_tensors = [labeled.tensor for labeled in labeled_inputs]
        axes_per_tensor = [labeled.axes for labeled in labeled_inputs]
        static_shapes = [labeled.get_shape() for labeled in labeled_inputs]

        # Resample unless there is nothing to upsample or the batch already
        # meets the requested positive fraction.
        no_positives = tf.equal(num_pos, 0)
        num_pos_float = tf.cast(num_pos, dtype=tf.float32)
        required_positives = (
            target_fraction * tf.cast(batch_size, dtype=tf.float32))
        enough_positives = num_pos_float >= required_positives
        skip_resampling = tf.logical_or(no_positives, enough_positives)
        resampled_tensors = tf.cond(
            skip_resampling,
            lambda: plain_tensors,
            lambda: resample.resample_at_rate(plain_tensors, rates,
                                              seed=seed))

        # Restore the static shapes, leaving the batch dimension unknown.
        for tensor, shape in zip(resampled_tensors, static_shapes):
            tensor.set_shape([None] + list(shape)[1:])

        # Wrap the results back into LabeledTensors keyed by feature name.
        return {
            name: lt.LabeledTensor(tensor, axes)
            for name, tensor, axes in zip(ordered_names, resampled_tensors,
                                          axes_per_tensor)
        }