Code example #1
def transform(counts):
  aligned_counts = lt.select(
      total_counts, {'target': list(counts.axes['target'].labels)})
  return counts / lt.cast(aligned_counts, tf.float32)
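This snippet closes over `total_counts` from an enclosing scope. A minimal sketch of how such a closure might be built (the factory name `make_normalize_transform` is an assumption for illustration, not from the original source):

import tensorflow as tf
from tensorflow.contrib import labeled_tensor as lt

def make_normalize_transform(total_counts):
  # Hypothetical factory: returns a transform that divides per-target
  # counts by the matching totals selected out of `total_counts`.
  def transform(counts):
    aligned_counts = lt.select(
        total_counts, {'target': list(counts.axes['target'].labels)})
    return counts / lt.cast(aligned_counts, tf.float32)
  return transform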
Code example #2
def transform(counts):
  return lt.cast(counts >= float(threshold), tf.float32)
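Likewise, this transform closes over `threshold`. A hedged sketch of the enclosing factory, reusing the imports above (`make_binarize_transform` is illustrative, not from the source):

def make_binarize_transform(threshold):
  # Hypothetical factory: returns a transform that binarizes counts,
  # mapping counts >= threshold to 1.0 and everything else to 0.0.
  def transform(counts):
    return lt.cast(counts >= float(threshold), tf.float32)
  return transform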
Code example #3
File: data.py  Project: tallamjr/google-research
def upsample_positives(feature_tensors,
                       count_names,
                       total_reads_defining_positive,
                       min_fraction_positive,
                       seed=None):
    """Returns feature tensors with positives upsampled to the desired rate.

    Args:
      feature_tensors: Dict[str, lt.LabeledTensor] with parsed features created
        by `build_features`.
      count_names: A list of labels that are count names.
      total_reads_defining_positive: The minimum number of reads detected
        across all conditions that defines a sequence as being a positive
        example.
      min_fraction_positive: The minimum fraction of positive examples to allow
        in the data.
      seed: The random seed to use in upsampling.

    Returns:
      A dictionary mapping from string feature name to lt.LabeledTensor of
      parsed features created by `build_features`, with positive examples
      upsampled to the desired rate.

    Raises:
      ValueError: The minimum positive fraction requested is invalid.
    """
    # Goal: Find the fraction of all input feature tensors that should be
    # classified as "positive" based on the total_reads_defining_positive.
    # Upsample those using resample.resample_at_rate() until they are at least
    # min_fraction_positive of the entire set.
    if min_fraction_positive < 0 or min_fraction_positive >= 1:
        raise ValueError('Invalid fraction positive, must be in [0, 1): %s' %
                         min_fraction_positive)

    with tf.name_scope('upsample_positives'):
        # Classify the inputs as positive or negative.
        total_reads_defining_positive = tf.constant(
            total_reads_defining_positive, dtype=tf.float32)
        min_fraction_positive = tf.constant(min_fraction_positive,
                                            dtype=tf.float32)
        counts = lt.pack(
            [lt.cast(feature_tensors[k], tf.float32) for k in count_names],
            ('sequence_counts', count_names),
            axis_position=1)
        greater_equal = (lt.reduce_sum(counts, 'sequence_counts') >=
                         total_reads_defining_positive)
        num_pos = lt.reduce_sum(lt.cast(greater_equal, tf.int32))
        less_than = lt.logical_not(greater_equal)
        num_neg = lt.reduce_sum(lt.cast(less_than, tf.int32))

        # With an initial number of positives P and number of negatives N,
        # if we keep the negative sampling rate at 1 (to try to retain negatives),
        # to achieve a total positive input fraction of F, we need a positive
        # sampling rate R that satisfies:
        # P * R / (P * R + N) >= F
        #
        # Solving for R:
        #
        # P * R = F * (P*R + N) = F*P*R + F*N
        # P * R (1 - F) = F * N
        # R = F*N / (P * (1 - F))
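        #
        # For example, with P = 10 positives, N = 990 negatives and F = 0.1,
        # R = 0.1 * 990 / (10 * 0.9) = 11, so each positive is resampled
        # ~11x on average, giving 110 / (110 + 990) = 10% positives.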
        numerator = min_fraction_positive * tf.cast(num_neg, tf.float32)
        denom = tf.cast(num_pos, tf.float32) * (1 - min_fraction_positive)
        denom = tf.cond(
            denom > 0.0,
            lambda: denom,
            # If denom == 0, we can set it to anything we want since the
            # tf.cond below is guaranteed to return the input without
            # resampling.
            lambda: tf.constant(1.0, dtype=tf.float32))
        positive_rate = numerator / denom
        batch_size = tf.shape(greater_equal)[0]
        negative_rates = tf.ones([batch_size], tf.float32)
        positive_rates = tf.fill([batch_size], positive_rate)
        rates = tf.where(greater_equal, positive_rates, negative_rates)

        # Pack the LabeledTensors into normal tensors, keeping relevant information
        # for unpacking back to LabeledTensors available.
        ordered_names = sorted(feature_tensors)
        packed_tensors = []
        tensor_axes = []
        tensor_shapes = []
        for name in ordered_names:
            labeled_tensor = feature_tensors[name]
            packed_tensors.append(labeled_tensor.tensor)
            tensor_axes.append(labeled_tensor.axes)
            tensor_shapes.append(labeled_tensor.get_shape())

        # Perform the resampling.
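        # Resampling is skipped when there are no positives at all, or when
        # positives already make up at least min_fraction_positive of the
        # batch; otherwise every example is resampled at its per-example rate.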
        resampled_tensors = tf.cond(
            tf.logical_or(
                tf.equal(num_pos, 0),
                tf.cast(num_pos, dtype=tf.float32) >=
                (min_fraction_positive *
                 tf.cast(batch_size, dtype=tf.float32))),
            lambda: packed_tensors, lambda: resample.resample_at_rate(
                packed_tensors, rates, seed=seed))

        # Unpack the tensors into a dictionary of LabeledTensors again.
        # First, change the shape so that the batch axis is unknown.
        tensor_shapes = [[None] + list(shape)[1:] for shape in tensor_shapes]
        for tensor, shape in zip(resampled_tensors, tensor_shapes):
            tensor.set_shape(shape)

        unpacked_feature_tensors = {}
        for i, name in enumerate(ordered_names):
            labeled = lt.LabeledTensor(resampled_tensors[i], tensor_axes[i])
            unpacked_feature_tensors[name] = labeled
        return unpacked_feature_tensors
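The rate formula can be checked in isolation. A pure-Python sketch of the same arithmetic (illustrative only; `positive_sampling_rate` is not a function in this project):

def positive_sampling_rate(num_pos, num_neg, min_fraction_positive):
  # Rate R satisfying P*R / (P*R + N) >= F with the negative rate fixed
  # at 1. The num_pos == 0 case mirrors the tf.cond guards above.
  if num_pos == 0:
    return 1.0
  return min_fraction_positive * num_neg / (
      num_pos * (1.0 - min_fraction_positive))

# e.g., positive_sampling_rate(10, 990, 0.1) == 11.0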
Code example #4
File: data.py  Project: tallamjr/google-research
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

    Args:
      feature_tensors: Dict[str, lt.LabeledTensor] with parsed features created
        by `build_features`.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module.
      skip_all_zero_counts: some sequences have no counts, e.g., because they
        were created artificially for validation purposes on the binding array.
        We want to skip these sequences for training.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      additional_output: optional list of strings containing additional outputs.

    Returns:
      inputs: Dict[str, lt.LabeledTensor] mapping each feature in
        `input_features` to a float32 LabeledTensor, e.g., one-hot-encoded
        rasterized sequences with axes
        [batch_axis, input_position_axis, input_channel_axis] for input into
        machine learning models.
      outputs: LabeledTensor with dtype=float32 and axes
        [batch_axis, output_axis] denoting possible output tensors, including
        counts and binding array measurements.
    """

    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequence_tensor = feature_tensors['sequence']
    batch_axis = sequence_tensor.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        seq_indices = custom_ops.dna_sequence_to_indices(
            sequence_tensor, sequence_length)
        tensor = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        axes = [batch_axis, position_axis, channel_axis]
        one_hots = lt.LabeledTensor(tensor, axes)
        inputs[SEQUENCE_ONE_HOT] = one_hots

    if SEQUENCE_KMER_COUNT in input_features:
        raw_counts = custom_ops.count_all_dna_kmers(sequence_tensor,
                                                    kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        counts = lt.LabeledTensor(raw_counts, [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        inputs[SEQUENCE_KMER_COUNT] = (
            (lt.cast(counts, tf.float32) - mean_count) / std_count)

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            raw_pf_tensor = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

    output_names = count_names + array_names
    outputs = [lt.cast(feature_tensors[k], tf.float32) for k in output_names]

    if additional_output and additional_output[0]:
        outputs += [
            lt.cast(feature_tensors[k], tf.float32) for k in additional_output
        ]
        output_names += additional_output
    outputs = lt.pack(outputs, ('output', output_names), axis_position=1)

    if skip_all_zero_counts:
        with tf.name_scope('counts_filtering'):
            counts = lt.select(outputs, {'output': count_names})
            keep = lt.reduce_any(lt.not_equal(counts, 0.0), 'output')
            inputs = {k: lt.boolean_mask(v, keep) for k, v in inputs.items()}
            outputs = lt.boolean_mask(outputs, keep)

    return inputs, outputs
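`custom_ops.dna_sequence_to_indices` is a project-specific op. A rough, self-contained sketch of the same one-hot encoding in plain TF1-style TensorFlow (assuming fixed-length ASCII byte strings; this is not the project's implementation):

import tensorflow as tf

def dna_one_hot(sequences, sequence_length):
  # Decode each fixed-length byte string into uint8 codes: [batch, position].
  codes = tf.reshape(tf.decode_raw(sequences, tf.uint8), [-1, sequence_length])
  # One channel per base, in the order A, C, G, T; any other byte
  # (e.g. 'N') produces an all-zero row.
  channels = [tf.cast(tf.equal(codes, ord(b)), tf.float32) for b in 'ACGT']
  return tf.stack(channels, axis=-1)  # [batch, position, 4]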
Code example #5
File: data.py  Project: tallamjr/google-research
def preprocess(strs,
               experiment_proto,
               input_features=(SEQUENCE_ONE_HOT, ),
               mode=PREPROCESS_SKIP_ALL_ZERO_COUNTS,
               kmer_k_max=4,
               ratio_random_dna=1,
               total_reads_defining_positive=0,
               additional_output=None):
    """Build a small TF graph to preprocess a minibatch of tf.Example protos.

    Args:
      strs: LabeledTensor holding a minibatch of serialized tf.Example protos.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module.
      mode: optional preprocess mode defined in this module.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      ratio_random_dna: optional ratio of random sequences to inject if mode ==
        PREPROCESS_INJECT_RANDOM_SEQUENCES.
      total_reads_defining_positive: optional integer indicating the sum of all
        read counts required to be seen to classify the tensor as a "positive"
        example when balancing input classes.
      additional_output: optional list of strings containing additional outputs.

    Returns:
      inputs: Dict[str, lt.LabeledTensor] mapping each feature in
        `input_features` to a float32 LabeledTensor, e.g., one-hot-encoded
        rasterized sequences for input into machine learning models.
      outputs: LabeledTensor with dtype=float32 and axes
        [batch_axis, output_axis] denoting possible output tensors, including
        counts and binding array measurements.
    """
    with tf.name_scope('preprocess'):
        features = build_features(experiment_proto)
        parsed_feature_tensors = lt.parse_example(strs, features)
        count_names = selection.all_count_names(experiment_proto)

        if mode == PREPROCESS_SKIP_ALL_ZERO_COUNTS:
            skip_all_zero_counts = True
            feature_tensors = parsed_feature_tensors

        elif mode == PREPROCESS_ALL_COUNTS:
            skip_all_zero_counts = False
            feature_tensors = parsed_feature_tensors

        elif mode == PREPROCESS_INJECT_RANDOM_SEQUENCES:
            skip_all_zero_counts = False

            # replace zero counts with NaN in real data
            for count_name in count_names:
                count = parsed_feature_tensors[count_name]
                parsed_feature_tensors[count_name] = lt.LabeledTensor(
                    tf.where(count != 0, tf.cast(count, tf.float32),
                             tf.fill(tf.shape(count), np.float32(np.nan))),
                    count.axes)

            # only random sequences will have a count of zero
            input_batch_size = tf.shape(strs.tensor)[list(
                strs.axes.keys()).index('batch')]
            n_randoms = tf.cast(
                tf.cast(input_batch_size, tf.float32) * ratio_random_dna,
                tf.int32)
            random_feature_tensors = random_dna_features(
                experiment_proto, n_randoms)
            for count_name in count_names:
                random_feature_tensors[count_name] = lt.cast(
                    random_feature_tensors[count_name], tf.float32)

            feature_tensors = {
                k: lt.concat(
                    [random_feature_tensors[k], parsed_feature_tensors[k]],
                    'batch')
                for k in features
            }

            # shuffle random and non-random inputs because preprocess batches get
            # split across many mini-batches for training
            batch_size = tf.shape(feature_tensors['sequence'].tensor)[0]
            order = tf.random_shuffle(tf.range(batch_size, dtype=tf.int32))
            order.set_shape(feature_tensors['sequence'].tensor.get_shape())
            feature_tensors = {
                k: lt.LabeledTensor(tf.gather(v.tensor, order), v.axes)
                for k, v in feature_tensors.items()
            }

        else:
            raise ValueError('unknown mode: %r' % mode)  # pylint: disable=g-doc-exception

        feature_tensors = upsample_positives(
            feature_tensors,
            count_names,
            total_reads_defining_positive=total_reads_defining_positive,
            min_fraction_positive=0.1)

        inputs, outputs = create_input_and_outputs(
            feature_tensors,
            experiment_proto,
            input_features=input_features,
            kmer_k_max=kmer_k_max,
            skip_all_zero_counts=skip_all_zero_counts,
            additional_output=additional_output)

        return inputs, outputs
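Finally, a hedged sketch of how `preprocess` might be wired into a TF1 input pipeline; the reader setup and `experiment_proto` below are placeholders for illustration, not from this file:

filename_queue = tf.train.string_input_producer(['examples.tfrecord'])
reader = tf.TFRecordReader()
_, serialized = reader.read_up_to(filename_queue, num_records=256)
strs = lt.LabeledTensor(serialized, ['batch'])

inputs, outputs = preprocess(
    strs,
    experiment_proto,  # placeholder: a selection_pb2.Experiment instance
    input_features=(SEQUENCE_ONE_HOT,),
    mode=PREPROCESS_SKIP_ALL_ZERO_COUNTS)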