Python LabeledTensor Beispiele, tensorflow.contrib.labeled_tensor.LabeledTensor Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: output_layers.py Projekt: sarvex/google-research

    def __init__(self,
                 experiment_proto,
                 loss,
                 target_names=None,
                 additional_output=None):
        """Initialize a LatentAffinity output layer.

        Builds the labeled axes, the constant selection-sign matrix and the
        trainable `affinity_weights` / `bias` variables used by this layer.
        The two variables are also collected in `self.params`.

        Args:
          experiment_proto: selection_pb2.Experiment describing the
            experiment.
          loss: instance of an AbstractLoss subclass used for computing loss
            on this output layer.
          target_names: optional list of strings giving target names to train
            against.
          additional_output: optional list of strings containing all the
            additional output to predict.

        Raises:
          ValueError: if any target_names are not counts (not checked
            directly in this method; presumably raised by
            `get_target_names` -- confirm).
        """
        self.loss = loss

        # Resolve the label lists that the axes below are built from.
        latent_names = selection.all_target_and_background_names(
            experiment_proto)
        extra_names = get_additional_output_names(experiment_proto,
                                                  additional_output)
        resolved_targets = get_target_names(experiment_proto, target_names)

        self.target_axis = lt.Axis('target', resolved_targets)
        # The additional-output axis only exists when there is something to
        # put on it.
        self.additional_output_axis = (
            lt.Axis('additional_output', extra_names)
            if extra_names else None)
        self.logit_axis = lt.Axis('target', latent_names + extra_names)
        self.affinity_axis = lt.Axis('affinity', latent_names)
        self.all_target_axis = lt.Axis('target',
                                       resolved_targets + extra_names)

        self.all_count_names = selection.all_count_names(experiment_proto)
        self.binding_arrays_map = _binding_arrays_map(experiment_proto)

        # Constant [affinity, target] matrix of selection signs.
        sign_matrix = _get_selection_signs(latent_names, resolved_targets,
                                           experiment_proto)
        signs_constant = tf.constant(
            sign_matrix, dtype=tf.float32, name='selection_signs')
        self.selection_signs = lt.LabeledTensor(
            signs_constant, [self.affinity_axis, self.target_axis])

        # TODO(shoyer): consider if there's a sane way to make lt.Variable
        # Weights start at one (same shape as the sign matrix); the bias
        # starts at zero with one entry per target.
        weights_var = tf.Variable(
            tf.ones_like(sign_matrix, dtype=tf.float32),
            name='affinity_weights')
        bias_var = tf.Variable(
            tf.zeros([self.target_axis.size]), name='bias')
        self.params = [weights_var, bias_var]

        # Labeled views of the raw variables.
        self.affinity_weights = lt.LabeledTensor(
            tf.convert_to_tensor(weights_var),
            [self.affinity_axis, self.target_axis])
        self.bias = lt.LabeledTensor(
            tf.convert_to_tensor(bias_var), [self.target_axis])

Beispiel #2

0

Datei anzeigen

Datei: output_layers.py Projekt: MitchellTesla/google-research

  def _normed_prev_round_counts(self, input_counts,
                                input_counts_label='output'):
    """Build normalized previous-round counts aligned with the target axis.

    For every label on `self.target_axis`, the parent count recorded in
    `self.parent_count_names` is selected from `input_counts`; labels without
    a selected parent get an all-zero column. The packed result is passed
    through `self.deps_normalize`.

    Args:
      input_counts: LabeledTensor with dtype=float32 and axes
        [batch, input_counts_label].
      input_counts_label: Name of the axis in input_counts that contains the
        count data to use. For LatentAffinityWithDeps that uses the
        actual count values, input_counts will be the outputs tensor and the
        label will be 'output'. For LatentAffinityWithPredDeps, input_counts
        will be the predictions for these counts and the axis will be 'target'.

    Returns:
      preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
    """
    # NOTE(review): the membership test below is against `input_counts.axes`;
    # confirm whether it is meant to match labels on the
    # `input_counts_label` axis rather than axis names.
    selected_parents = {
        name: lt.select(input_counts, {input_counts_label: parent_name})
        for name, parent_name in self.parent_count_names.items()
        if parent_name in input_counts.axes
    }

    # Fallback column of zeros with only the batch axis.
    zero_column = lt.LabeledTensor(
        tf.zeros_like(input_counts[:, 0]), [input_counts.axes['batch']])

    per_target = [
        selected_parents.get(label, zero_column)
        for label in self.target_axis.labels
    ]
    stacked = lt.pack(per_target, self.target_axis, axis_position=1)
    return self.deps_normalize(stacked)

Beispiel #3

0

Datei anzeigen

Datei: output_layers.py Projekt: MitchellTesla/google-research

  def __init__(self, experiment_proto, loss, deps_normalize, target_names=None,
               additional_output=None):
    """Initialize a LatentAffinityWithDeps output layer.

    On top of the base-class setup this stores the normalizer and the parent
    count names, and creates a trainable `prev_round_scale` variable (one
    entry per target, initialized to 0.1) that is appended to `self.params`.

    Args:
      experiment_proto: selection_pb2.Experiment describing the experiment.
      loss: instance of an AbstractLoss subclass used for computing loss on this
        output layer.
      deps_normalize: Normalizer instance used for normalizing dependency
        counts.
      target_names: optional list of strings giving target names to train
        against.
      additional_output: optional list of strings containing all the
        additional output to predict.
    """
    super(LatentAffinityWithDeps, self).__init__(
        experiment_proto,
        loss,
        target_names=target_names,
        additional_output=additional_output)

    self.deps_normalize = deps_normalize
    self.parent_count_names = selection.parent_counts(experiment_proto)

    scale_shape = (self.target_axis.size,)
    scale_var = tf.Variable(
        0.1 * tf.ones(scale_shape, dtype=tf.float32),
        name='prev_round_scale')
    self.params.append(scale_var)

    # Labeled view of the raw variable, indexed by target.
    self.prev_round_scale = lt.LabeledTensor(
        tf.convert_to_tensor(scale_var), [self.target_axis])

Beispiel #4

0

Datei anzeigen

Datei: data.py Projekt: tallamjr/google-research

def random_dna_features(experiment_proto, size):
    """Create a dict of feature tensors for random DNA sequences.

    Every feature other than 'sequence' is filled with its default value,
    tiled `size` times along a new 'batch' axis. The 'sequence' feature
    holds `size` random sequences generated from the experiment's template.

    Args:
      experiment_proto: selection_pb2.Experiment describing the experiment.
      size: scalar integer tf.Tensor giving the number of desired sequences.

    Returns:
      Dict[Any, labeled_tensor.LabeledTensor] providing generated features.
    """
    with tf.name_scope('preprocess_random_input'):
        template = selection.get_template_sequence(experiment_proto)
        feature_specs = build_features(experiment_proto)

        feature_tensors = {}
        for name, spec in feature_specs.items():
            if name == 'sequence':
                # Handled separately below with random data.
                continue
            default = lt.constant(spec.default_value, spec.dtype, spec.axes)
            with_batch = lt.expand_dims(default, ['batch'] + spec.axes)
            feature_tensors[name] = lt.tile(with_batch, {'batch': size})

        random_sequences = random_dna_sequence(template, (size, ))
        feature_tensors['sequence'] = lt.LabeledTensor(
            random_sequences, ['batch'])
        return feature_tensors

Beispiel #5

0

Datei anzeigen

Datei: eval_feedforward.py Projekt: MitchellTesla/google-research

    def evaluation_tensors(self, examples, keys=None):
        """Build output and evaluation tensors for a feed forward model.

        Args:
          examples: tf.Tensor with dtype=string and shape=[None] holding
            serialized tf.train.Example protos.
          keys: Optional sequence of string tensor names to evaluate. By
            default, uses all known tensors.

        Returns:
          Dict[str, lt.LabeledTensor] giving all possible tensors to run.
          Entries whose value is None are dropped. When `keys` is not None,
          only those entries are returned, and asking for a key that has no
          available tensor raises KeyError.
        """
        # TODO(shoyer): expose options for injecting/filtering examples?
        labeled_examples = lt.LabeledTensor(examples, ['batch'])
        inputs, outputs = data.preprocess(
            labeled_examples,
            self.experiment_proto,
            kmer_k_max=self.hps.kmer_k_max,
            input_features=self.hps.input_features,
            mode=data.PREPROCESS_ALL_COUNTS,
            additional_output=self.hps.additional_output.split(','))

        actual_counts, actual_binding = self._split_outputs(outputs)

        # TODO(shoyer): encapsulate net/output_layer in a single class that
        # makes predictions without building logits as intermediate output.
        # This will be useful for testing other methods.
        net, output_layer = self._create_net_and_output_layer()

        logits = net.fprop(inputs, mode='test')
        predicted_outputs = output_layer.predict_outputs(logits, outputs)

        # Only build the predicted affinity when it may be requested.
        if keys and PREDICTED_AFFINITY not in keys:
            predicted_affinity = None
        else:
            predicted_affinity = output_layer.predict_affinity(logits)

        loss = output_layer.loss_per_example_and_target(
            logits, outputs, self.hps.train_on_array)

        candidates = {
            ACTUAL_OUTPUTS: outputs,
            ACTUAL_COUNTS: actual_counts,
            ACTUAL_BINDING: actual_binding,
            PREDICTED_OUTPUTS: predicted_outputs,
            PREDICTED_AFFINITY: predicted_affinity,
            LOSS: loss
        }
        candidates.update(inputs)

        # Drop outputs not provided by the given model.
        available = {
            name: tensor
            for name, tensor in candidates.items()
            if tensor is not None
        }
        if keys is None:
            return available
        return {name: available[name] for name in keys}

Beispiel #6

0

Datei anzeigen

Datei: output_layers.py Projekt: MitchellTesla/google-research

  def __init__(self, experiment_proto, loss, deps_normalize, target_names=None,
               additional_output=None):
    """Initialize a LatentAffinityWithCrossDeps output layer.

    Takes the same arguments as the base class constructor (see method on
    base class). Additionally creates a zero-initialized
    `logit_by_prev_count` variable with axes [affinity_axis, target_axis]
    and appends it to `self.params`.
    """
    super(LatentAffinityWithCrossDeps, self).__init__(
        experiment_proto,
        loss,
        deps_normalize,
        target_names=target_names,
        additional_output=additional_output)

    cross_shape = (self.affinity_axis.size, self.target_axis.size)
    cross_var = tf.Variable(
        tf.zeros(cross_shape, dtype=tf.float32), name='logit_by_prev_count')
    self.params.append(cross_var)

    # Labeled view of the raw variable.
    self.logit_by_prev_count = lt.LabeledTensor(
        tf.convert_to_tensor(cross_var),
        [self.affinity_axis, self.target_axis])

Beispiel #7

0

Datei anzeigen

Datei: data.py Projekt: tallamjr/google-research

def input_pipeline(filenames,
                   experiment_proto,
                   final_mbsz,
                   hps,
                   num_epochs=None,
                   num_threads=1):
    """Using Queues create an infinite stream of training minibatches.

    Serialized examples are read in large chunks, run through `preprocess`,
    and then re-batched to `final_mbsz` with `lt.batch`.

    Args:
      filenames: list of paths to sstables tf.Example protos containing
        training data.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      final_mbsz: minibatch size for returned tensors
      hps: tf.HParams with hyper-parameters to pass on to preprocess. Only
        `input_features` is optional (defaults to an empty tuple); the
        other attributes read below must be present.
      num_epochs: optional number of epochs to pass over the data. When set,
        the final batch is allowed to be smaller than `final_mbsz`.
      num_threads: optional number of threads to use for batching output.

    Returns:
      An (inputs, outputs) pair of batched tensors, where `inputs` is a dict
      with the same keys as the inputs returned by `preprocess`.
    """
    # Number of records requested from the reader per preprocessing step.
    read_chunk_size = 8 * 8 * 1024
    with tf.name_scope('input_pipeline'):
        filename_queue = tf.train.string_input_producer(
            filenames, num_epochs=num_epochs)
        reader = tf.SSTableReader()
        _, serialized = reader.read_up_to(filename_queue, read_chunk_size)
        strs = lt.LabeledTensor(serialized, ['batch'])

        inputs, outputs = preprocess(
            strs,
            experiment_proto,
            input_features=getattr(hps, 'input_features', ()),
            kmer_k_max=hps.kmer_k_max,
            ratio_random_dna=hps.ratio_random_dna,
            mode=hps.preprocess_mode,
            total_reads_defining_positive=hps.total_reads_defining_positive,
            additional_output=hps.additional_output.split(','))

        # Batch all input tensors together with the outputs (kept last) so
        # they stay aligned, then split them apart again.
        to_batch = list(inputs.values()) + [outputs]
        batched = lt.batch(
            to_batch,
            batch_size=final_mbsz,
            enqueue_many=True,
            capacity=4 * final_mbsz,
            num_threads=num_threads,
            allow_smaller_final_batch=(num_epochs is not None))

        batched_inputs = dict(zip(inputs.keys(), batched[:-1]))
        batched_outputs = batched[-1]
        return batched_inputs, batched_outputs

Beispiel #8

0

Datei anzeigen

Datei: output_layers.py Projekt: MitchellTesla/google-research

  def __init__(self,
               experiment_proto,
               loss,
               affinity_target_map=None,
               target_names=None,
               additional_output=None):
    """Initialize a FullyObserved output layer.

    Args:
      experiment_proto: selection_pb2.Experiment describing the experiment.
      loss: instance of an AbstractLoss subclass used for computing loss on this
        output layer.
      affinity_target_map: optional key into
        `config.DEFAULT_AFFINITY_TARGET_MAPS`. The looked-up value is a
        dictionary with one entry for each selection target molecule (e.g.
        protein) and the list of target output values to be used to
        calculate that target molecule's affinity. It is optional to create
        this OutputLayer but is required to calculate affinity. (In other
        words, during training it is unnecessary but for inference it is
        usually required.)
      target_names: optional list of strings giving target names to train
        against.
      additional_output: optional list of strings containing all the
        additional output to predict.

    Raises:
      Error: if the affinity_target_map is invalid, i.e. it lists a target
        name that is not among the logit target names.
    """
    self.loss = loss

    target_names = get_target_names(experiment_proto, target_names)
    additional_output = get_additional_output_names(experiment_proto,
                                                    additional_output)
    if additional_output:
      self.additional_output_axis = lt.Axis('additional_output',
                                            additional_output)
    else:
      self.additional_output_axis = None
    self.count_axis = self.target_axis = lt.Axis('target', target_names)
    self.logit_axis = lt.Axis('target', target_names+additional_output)

    self.binding_arrays_map = _binding_arrays_map(experiment_proto)

    # This layer has no trainable parameters of its own.
    self.params = []

    self.affinity_target_axis = self.affinity_target_lt = None
    if affinity_target_map:
      affinity_target_map = config.DEFAULT_AFFINITY_TARGET_MAPS[
          affinity_target_map]
      # make sure that every target in the affinity_target_map is in the logits
      # (otherwise the target is silently ignored, could be dangerous)
      target_names = self.count_axis.labels
      affinity_names = list(affinity_target_map.keys())
      for (affinity,
           desired_target_names) in affinity_target_map.items():
        for desired_name in desired_target_names:
          if desired_name not in target_names:
            # Interpolate the message here: passing the values as extra
            # exception arguments would leave the '%s' placeholders unfilled.
            raise Error(
                'The desired target name %s for the affinity molecule '
                '%s is not found in the logit target names.\n'
                'logit target names: %s\n' %
                (desired_name, affinity, target_names))

      # 0/1 indicator matrix: entry [i, j] is 1 when target j contributes to
      # the affinity of molecule i.
      array = np.zeros((len(affinity_names), len(target_names)), dtype=int)
      for i, affinity in enumerate(affinity_names):
        for j, target in enumerate(target_names):
          if target in affinity_target_map[affinity]:
            array[i, j] = 1
      self.affinity_axis = lt.Axis('affinity', affinity_names)
      self.affinity_target_lt = lt.LabeledTensor(
          tf.constant(
              array, dtype=tf.float32, name='affinity_targets'),
          [self.affinity_axis, self.count_axis])

Beispiel #9

0

Datei anzeigen

Datei: output_layers.py Projekt: MitchellTesla/google-research

 def _calculate(self, preds, targets):
   """Return the zero-truncated log-Poisson loss, labeled like `targets`.

   See method on base class. The raw loss from
   `zero_truncated_log_poisson_loss(targets, preds)` is wrapped in a
   LabeledTensor that reuses the axes of `targets`.
   """
   return lt.LabeledTensor(
       zero_truncated_log_poisson_loss(targets, preds), targets.axes)

Beispiel #10

0

Datei anzeigen

Datei: output_layers.py Projekt: MitchellTesla/google-research

 def _calculate(self, preds, targets):
   """Return the log-Poisson loss, labeled like `targets`.

   See method on base class. The raw loss from
   `tf.nn.log_poisson_loss(targets, preds)` is wrapped in a LabeledTensor
   that reuses the axes of `targets`.
   """
   # TODO(shoyer): replace this with lt.nn.log_poisson_loss when that exists
   return lt.LabeledTensor(
       tf.nn.log_poisson_loss(targets, preds), targets.axes)

Beispiel #11

0

Datei anzeigen

Datei: feedforward.py Projekt: tallamjr/google-research

  def _fprop(self, inputs, mode):
    """Builds the fprop graph from inputs up to logits.

    Args:
      inputs: input LabeledTensor with axes [batch_axis, input_position_axis,
        input_channel_axis].
      mode: either 'test' or 'train', determines whether we add dropout nodes

    Returns:
      Logits tensor with axes [batch_axis, logit_axis].

    Raises:
      ValueError: mode must be 'train' or 'test'
    """
    if mode not in ['test', 'train']:
      raise ValueError('mode must be one of "train" or "test"')
    is_training = mode == 'train'

    inputs_2d, inputs_3d = _stack_inputs_by_rank(inputs)

    if inputs_2d is None and inputs_3d is None:
      raise ValueError('feedforward model has no inputs')

    # Get the batch axis from the actual inputs, because we set up the graph
    # with unknown batch size.
    example_inputs = inputs_3d if inputs_2d is None else inputs_2d
    batch_axis = example_inputs.axes['batch']

    w_initializer = tf.uniform_unit_scaling_initializer
    nonlinearity = nonlinearities[self.config.nonlinearity]

    if inputs_3d is not None:
      conv_args = list(zip(*self._conv_config))
      net = contrib_layers.stack(
          inputs_3d,
          conv1d,
          conv_args,
          scope='conv',
          padding='SAME',
          activation_fn=nonlinearity,
          w_initializer=w_initializer)
      net = contrib_layers.flatten(net)
      if inputs_2d is not None:
        net = tf.concat([net, inputs_2d], 1)
    else:
      net = inputs_2d

    if net.get_shape()[-1].value == 0:
      raise ValueError('feature dimension has size 0')

    keep_probs = [1 - d for d in self.config.dropouts]
    fc_args = list(zip(self.fc_sizes, keep_probs, self.fc_init_factors))

    net = contrib_layers.stack(
        net,
        dropout_and_fully_connected,
        fc_args[:-1],
        scope='fc',
        is_training=is_training,
        activation_fn=nonlinearity,
        w_initializer=w_initializer)

    # the last layer should not have a non-linearity
    net = dropout_and_fully_connected(
        net, *fc_args[-1], scope='fc_final', is_training=is_training,
        activation_fn=None, w_initializer=w_initializer)

    logits = lt.LabeledTensor(net, [batch_axis, self.logit_axis])
    return logits

Beispiel #12

0

Datei anzeigen

Datei: data.py Projekt: tallamjr/google-research

def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

    Args:
      feature_tensors: Dict[str, tf.Tensor] with parsed featured created by
        `build_features`.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module.
      skip_all_zero_counts: some sequences have no counts, e.g., because they
        were created artificially for validation purposes on the binding
        array. We want to skip these sequences for training.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      additional_output: optional list of strings contains additional
        outputs. Ignored when empty or when its first entry is empty.

    Returns:
      inputs: dict keyed by the requested feature constants. The
        SEQUENCE_ONE_HOT entry is a float32 LabeledTensor with axes
        [batch, position, channel] of one-hot-encoded sequences; the
        SEQUENCE_KMER_COUNT entry holds standardized kmer counts with axes
        [batch, kmer]; the STRUCTURE_PARTITION_FUNCTION entry holds the log
        of the partition function feature.
      outputs: LabeledTensor with dtype=float32 and axes [batch, output]
        denoting possible output tensors, including counts and binding array
        measurements.
    """
    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequences = feature_tensors['sequence']
    batch_axis = sequences.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        base_indices = custom_ops.dna_sequence_to_indices(
            sequences, sequence_length)
        one_hot_tensor = tf.one_hot(base_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
            one_hot_tensor, [batch_axis, position_axis, channel_axis])

    if SEQUENCE_KMER_COUNT in input_features:
        raw_kmer_counts = custom_ops.count_all_dna_kmers(
            sequences, kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        kmer_counts = lt.LabeledTensor(raw_kmer_counts,
                                       [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        kmer_mean = lt.constant(means, tf.float32, axes=[kmer_axis])
        kmer_std = lt.constant(stds, tf.float32, axes=[kmer_axis])
        # Standardize each kmer count with its mean and standard deviation.
        centered = lt.cast(kmer_counts, tf.float32) - kmer_mean
        inputs[SEQUENCE_KMER_COUNT] = centered / kmer_std

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            partition_fn = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(partition_fn)

    output_names = count_names + array_names
    output_columns = [
        lt.cast(feature_tensors[name], tf.float32) for name in output_names
    ]

    # The first-entry check skips an `additional_output` whose first entry
    # is empty.
    if additional_output and additional_output[0]:
        output_columns += [
            lt.cast(feature_tensors[name], tf.float32)
            for name in additional_output
        ]
        output_names += additional_output
    outputs = lt.pack(
        output_columns, ('output', output_names), axis_position=1)

    if not skip_all_zero_counts:
        return inputs, outputs

    with tf.name_scope('counts_filtering'):
        # Keep only the rows that have at least one non-zero count.
        count_columns = lt.select(outputs, {'output': count_names})
        keep = lt.reduce_any(lt.not_equal(count_columns, 0.0), 'output')
        inputs = {
            name: lt.boolean_mask(tensor, keep)
            for name, tensor in inputs.items()
        }
        outputs = lt.boolean_mask(outputs, keep)

    return inputs, outputs

Beispiel #13

0

Datei anzeigen

Datei: data.py Projekt: tallamjr/google-research

def upsample_positives(feature_tensors,
                       count_names,
                       total_reads_defining_positive,
                       min_fraction_positive,
                       seed=None):
    """Returns feature tensors with positives upsampled to the desired rate.

    An example is "positive" when the sum of its counts over `count_names`
    is at least `total_reads_defining_positive`. If the batch has no
    positives, or already has at least `min_fraction_positive` of them, the
    input tensors are passed through without resampling.

    Args:
      feature_tensors: Dict[str, lt.LabeledTensor] with parsed featured
        created by `build_features`.
      count_names: A list of labels that are count names.
      total_reads_defining_positive: The minimum number of reads detected
        across all conditions that defines a sequence as being a positive
        example.
      min_fraction_positive: The minimum fraction of positive examples to
        allow in the data.
      seed: The random seed to use in upsampling.

    Returns:
      A dictionary mapping from string feature name to lt.LabeledTensor of
      parsed features created by `build_features` and positive examples
      upsampled to the desired rate.

    Raises:
      ValueError: The minimum positive fraction requested is invalid.
    """
    # Goal: Find the fraction of all input feature tensors that should be
    # classified as "positive" based on the total_reads_defining_positive.
    # Upsample those using resample.resample_at_rate() until they are at least
    # min_fraction_positive of the entire set.
    if min_fraction_positive < 0 or min_fraction_positive >= 1:
        raise ValueError('Invalid fraction positive, must be in [0, 1): %s' %
                         min_fraction_positive)

    with tf.name_scope('upsample_positives'):
        # Classify the inputs as positive or negative.
        read_threshold = tf.constant(
            total_reads_defining_positive, dtype=tf.float32)
        target_fraction = tf.constant(min_fraction_positive,
                                      dtype=tf.float32)
        counts = lt.pack(
            [lt.cast(feature_tensors[k], tf.float32) for k in count_names],
            ('sequence_counts', count_names),
            axis_position=1)
        total_reads = lt.reduce_sum(counts, 'sequence_counts')
        is_positive = total_reads >= read_threshold
        num_pos = lt.reduce_sum(lt.cast(is_positive, tf.int32))
        is_negative = lt.logical_not(is_positive)
        num_neg = lt.reduce_sum(lt.cast(is_negative, tf.int32))

        # With an initial number of positives P and number of negatives N,
        # if we keep the negative sampling rate at 1 (to try to retain
        # negatives), to achieve a total positive input fraction of F, we
        # need a positive sampling rate R that satisfies:
        # P * R / (P * R + N) >= F
        #
        # Solving for R:
        #
        # P * R = F * (P*R + N) = F*P*R + F*N
        # P * R (1 - F) = F * N
        # R = F*N / (P * (1 - F))
        numerator = target_fraction * tf.cast(num_neg, tf.float32)
        raw_denom = tf.cast(num_pos, tf.float32) * (1 - target_fraction)
        safe_denom = tf.cond(
            raw_denom > 0.0,
            lambda: raw_denom,
            # If the denominator is 0, we can set it to anything we want
            # since the tf.cond below is guaranteed to return the input
            # without resampling.
            lambda: tf.constant(1.0, dtype=tf.float32))
        positive_rate = numerator / safe_denom
        batch_size = tf.shape(is_positive)[0]
        negative_rates = tf.ones([batch_size], tf.float32)
        positive_rates = tf.fill([batch_size], positive_rate)
        rates = tf.where(is_positive, positive_rates, negative_rates)

        # Pack the LabeledTensors into normal tensors, keeping relevant
        # information for unpacking back to LabeledTensors available.
        ordered_names = sorted(feature_tensors)
        ordered_features = [feature_tensors[name] for name in ordered_names]
        packed_tensors = [feature.tensor for feature in ordered_features]
        tensor_axes = [feature.axes for feature in ordered_features]
        tensor_shapes = [feature.get_shape() for feature in ordered_features]

        # Perform the resampling, unless there is nothing to upsample or the
        # positives are already frequent enough.
        enough_positives = (
            tf.cast(num_pos, dtype=tf.float32) >=
            (target_fraction * tf.cast(batch_size, dtype=tf.float32)))
        skip_resampling = tf.logical_or(tf.equal(num_pos, 0),
                                        enough_positives)
        resampled_tensors = tf.cond(
            skip_resampling,
            lambda: packed_tensors,
            lambda: resample.resample_at_rate(
                packed_tensors, rates, seed=seed))

        # Unpack the tensors into a dictionary of LabeledTensors again.
        # First, change the shape so that the batch axis is unknown.
        relaxed_shapes = [[None] + list(shape)[1:] for shape in tensor_shapes]
        for tensor, shape in zip(resampled_tensors, relaxed_shapes):
            tensor.set_shape(shape)

        return {
            name: lt.LabeledTensor(resampled_tensors[i], tensor_axes[i])
            for i, name in enumerate(ordered_names)
        }

Beispiel #14

0

Datei anzeigen

Datei: data.py Projekt: tallamjr/google-research

def preprocess(strs,
               experiment_proto,
               input_features=(SEQUENCE_ONE_HOT, ),
               mode=PREPROCESS_SKIP_ALL_ZERO_COUNTS,
               kmer_k_max=4,
               ratio_random_dna=1,
               total_reads_defining_positive=0,
               additional_output=None):
    """Build a small TF graph to preprocess a minibatch of tf.Example protos.

    The serialized examples are parsed, optionally mixed with random DNA
    sequences (depending on `mode`), passed through `upsample_positives`
    with a minimum positive fraction of 0.1, and finally converted to model
    inputs and outputs by `create_input_and_outputs`.

    Args:
      strs: LabeledTensor holding a minibatch of serialized tf.Example protos
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module.
      mode: optional preprocess mode defined in this module.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      ratio_random_dna: optional ratio of random sequences to inject if mode ==
        PREPROCESS_INJECT_RANDOM_SEQUENCES
      total_reads_defining_positive: optional integer indicating the sum of all
        read counts required to be seen to classify the tensor as a "positive"
        example when balancing input classes.
      additional_output: optional list of strings contains additional outputs.

    Returns:
      inputs: dict of input LabeledTensors keyed by the requested feature
        constants, as built by `create_input_and_outputs` (e.g. one-hot-encoded
        rasterized sequences for input into machine learning models).
      outputs: LabeledTensor with dtype=float32 and axes [batch_axis, output_axis]
        denoting possible output tensors, including counts and binding array
        measurements.

    Raises:
      ValueError: if `mode` is not one of the preprocess modes handled below.
    """
    with tf.name_scope('preprocess'):
        features = build_features(experiment_proto)
        parsed_feature_tensors = lt.parse_example(strs, features)
        count_names = selection.all_count_names(experiment_proto)

        if mode == PREPROCESS_SKIP_ALL_ZERO_COUNTS:
            # Rows whose counts are all zero are filtered out later, in
            # create_input_and_outputs.
            skip_all_zero_counts = True
            feature_tensors = parsed_feature_tensors

        elif mode == PREPROCESS_ALL_COUNTS:
            # Use the parsed examples as they are.
            skip_all_zero_counts = False
            feature_tensors = parsed_feature_tensors

        elif mode == PREPROCESS_INJECT_RANDOM_SEQUENCES:
            skip_all_zero_counts = False

            # replace zero counts with NaN in real data
            # (this overwrites the entries of parsed_feature_tensors in place)
            for count_name in count_names:
                count = parsed_feature_tensors[count_name]
                parsed_feature_tensors[count_name] = lt.LabeledTensor(
                    tf.where(count != 0, tf.cast(count, tf.float32),
                             tf.fill(tf.shape(count), np.float32(np.nan))),
                    count.axes)

            # only random sequences will have a count of zero
            # The number of random sequences is the input batch size scaled by
            # ratio_random_dna, truncated to an integer.
            input_batch_size = tf.shape(strs.tensor)[list(
                strs.axes.keys()).index('batch')]
            n_randoms = tf.cast(
                tf.cast(input_batch_size, tf.float32) * ratio_random_dna,
                tf.int32)
            random_feature_tensors = random_dna_features(
                experiment_proto, n_randoms)
            # Cast the random counts to float32 to match the real counts
            # converted above before concatenating.
            for count_name in count_names:
                random_feature_tensors[count_name] = lt.cast(
                    random_feature_tensors[count_name], tf.float32)

            # Random examples come first, followed by the parsed examples.
            feature_tensors = {
                k: lt.concat(
                    [random_feature_tensors[k], parsed_feature_tensors[k]],
                    'batch')
                for k in features
            }

            # shuffle random and non-random inputs because preprocess batches get
            # split across many mini-batches for training
            # The same permutation is applied to every feature so rows stay
            # aligned across features.
            batch_size = tf.shape(feature_tensors['sequence'].tensor)[0]
            order = tf.random_shuffle(tf.range(batch_size, dtype=tf.int32))
            order.set_shape(feature_tensors['sequence'].tensor.get_shape())
            feature_tensors = {
                k: lt.LabeledTensor(tf.gather(v.tensor, order), v.axes)
                for k, v in feature_tensors.items()
            }

        else:
            raise ValueError('unknown mode: %r' % mode)  # pylint: disable=g-doc-exception

        # Applied in every mode, with a fixed minimum positive fraction.
        feature_tensors = upsample_positives(
            feature_tensors,
            count_names,
            total_reads_defining_positive=total_reads_defining_positive,
            min_fraction_positive=0.1)

        inputs, outputs = create_input_and_outputs(
            feature_tensors,
            experiment_proto,
            input_features=input_features,
            kmer_k_max=kmer_k_max,
            skip_all_zero_counts=skip_all_zero_counts,
            additional_output=additional_output)

        return inputs, outputs