def __init__(self,
             experiment_proto,
             loss,
             target_names=None,
             additional_output=None):
  """Set up the axes and trainable parameters of a LatentAffinity layer.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    loss: instance of an AbstractLoss subclass used for computing loss on
      this output layer.
    target_names: optional list of strings giving target names to train
      against.
    additional_output: optional list of strings containing all the
      additional output to predict.

  Raises:
    ValueError: if any target_names are not counts (presumably raised by
      get_target_names; the check is not visible in this method).
  """
  self.loss = loss

  affinity_names = selection.all_target_and_background_names(experiment_proto)
  additional_output = get_additional_output_names(experiment_proto,
                                                  additional_output)
  target_names = get_target_names(experiment_proto, target_names)

  # Axes describing targets, latent affinities and the raw logits.
  self.target_axis = lt.Axis('target', target_names)
  self.additional_output_axis = (
      lt.Axis('additional_output', additional_output)
      if additional_output else None)
  self.logit_axis = lt.Axis('target', affinity_names + additional_output)
  self.affinity_axis = lt.Axis('affinity', affinity_names)
  self.all_target_axis = lt.Axis('target', target_names + additional_output)

  self.all_count_names = selection.all_count_names(experiment_proto)
  self.binding_arrays_map = _binding_arrays_map(experiment_proto)

  # Fixed [affinity, target] sign matrix.
  signs = _get_selection_signs(affinity_names, target_names, experiment_proto)
  signs_constant = tf.constant(
      signs, dtype=tf.float32, name='selection_signs')
  self.selection_signs = lt.LabeledTensor(
      signs_constant, [self.affinity_axis, self.target_axis])

  # TODO(shoyer): consider if there's a sane way to make lt.Variable
  affinity_weights = tf.Variable(
      tf.ones_like(signs, dtype=tf.float32), name='affinity_weights')
  bias = tf.Variable(tf.zeros([self.target_axis.size]), name='bias')
  self.params = [affinity_weights, bias]

  # Labeled views of the raw variables.
  self.affinity_weights = lt.LabeledTensor(
      tf.convert_to_tensor(affinity_weights),
      [self.affinity_axis, self.target_axis])
  self.bias = lt.LabeledTensor(
      tf.convert_to_tensor(bias), [self.target_axis])
def _normed_prev_round_counts(self,
                              input_counts,
                              input_counts_label='output'):
  """Create a Tensor with normalized counts from the previous round.

  For every target in `self.target_axis`, the parent count named by
  `self.parent_count_names` is selected from `input_counts`. Targets without
  a parent, or whose parent is not among the labels of the
  `input_counts_label` axis, get an all-zero column instead.

  Args:
    input_counts: LabeledTensor with dtype=float32 and axes
      [batch, input_counts_label].
    input_counts_label: Name of the axis in input_counts that contains the
      count data to use. For LatentAffinityWithDeps that uses the actual
      count values, input_counts will be the outputs tensor and the label
      will be 'output'. For LatentAffinityWithPredDeps, input_counts will be
      the predictions for these counts and the axis will be 'target'.

  Returns:
    preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
  """
  # `parent` is a count name, i.e. a *label* on the `input_counts_label`
  # axis. Testing `parent in input_counts.axes` would compare it against the
  # axis names ('batch', ...) and never match, so look at the labels of the
  # relevant axis instead.
  axes = input_counts.axes
  if (input_counts_label in axes and
      axes[input_counts_label].labels is not None):
    available_labels = set(axes[input_counts_label].labels)
  else:
    available_labels = set()

  parent_lookup = {}
  for k, parent in self.parent_count_names.items():
    if parent in available_labels:
      parent_lookup[k] = lt.select(input_counts, {input_counts_label: parent})

  # Zero counts stand in for targets that have no usable parent round.
  default_tensor = lt.LabeledTensor(
      tf.zeros_like(input_counts[:, 0]), [input_counts.axes['batch']])
  parent_tensors = [
      parent_lookup.get(k, default_tensor) for k in self.target_axis.labels
  ]
  parent_counts = lt.pack(parent_tensors, self.target_axis, axis_position=1)
  return self.deps_normalize(parent_counts)
def __init__(self,
             experiment_proto,
             loss,
             deps_normalize,
             target_names=None,
             additional_output=None):
  """Build a LatentAffinityWithDeps output layer.

  Extends the base layer with a per-target `prev_round_scale` variable,
  initialized to 0.1 and appended to `self.params`.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    loss: instance of an AbstractLoss subclass used for computing loss on
      this output layer.
    deps_normalize: Normalizer instance used for normalizing dependency
      counts.
    target_names: optional list of strings giving target names to train
      against.
    additional_output: optional list of strings containing all the
      additional output to predict.
  """
  super(LatentAffinityWithDeps, self).__init__(
      experiment_proto,
      loss,
      target_names=target_names,
      additional_output=additional_output)

  self.deps_normalize = deps_normalize
  self.parent_count_names = selection.parent_counts(experiment_proto)

  num_targets = self.target_axis.size
  prev_round_scale = tf.Variable(
      0.1 * tf.ones((num_targets,), dtype=tf.float32),
      name='prev_round_scale')
  self.params.append(prev_round_scale)
  self.prev_round_scale = lt.LabeledTensor(
      tf.convert_to_tensor(prev_round_scale), [self.target_axis])
def random_dna_features(experiment_proto, size):
  """Build feature tensors describing `size` random DNA sequences.

  Every feature other than 'sequence' is filled with that feature's default
  value, tiled along a new 'batch' axis. 'sequence' is drawn by
  `random_dna_sequence` from the experiment's template sequence.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    size: scalar integer tf.Tensor giving the number of desired sequences.

  Returns:
    Dict[Any, labeled_tensor.LabeledTensor] providing generated features.
  """
  with tf.name_scope('preprocess_random_input'):
    template = selection.get_template_sequence(experiment_proto)
    feature_specs = build_features(experiment_proto)

    feature_tensors = {}
    for name, spec in feature_specs.items():
      if name == 'sequence':
        # Filled in below with actual random sequences.
        continue
      constant = lt.constant(spec.default_value, spec.dtype, spec.axes)
      with_batch = lt.expand_dims(constant, ['batch'] + spec.axes)
      feature_tensors[name] = lt.tile(with_batch, {'batch': size})

    random_sequences = random_dna_sequence(template, (size,))
    feature_tensors['sequence'] = lt.LabeledTensor(random_sequences,
                                                   ['batch'])
    return feature_tensors
def evaluation_tensors(self, examples, keys=None):
  """Build output and evaluation tensors for a feed forward model.

  Examples are preprocessed in PREPROCESS_ALL_COUNTS mode, run through the
  network in 'test' mode, and the actual/predicted outputs, loss and input
  tensors are collected in one dictionary.

  Args:
    examples: tf.Tensor with dtype=string and shape=[None] holding
      serialized tf.train.Example protos.
    keys: Optional sequence of string tensor names to evaluate. By default,
      uses all known tensors.

  Returns:
    Dict[str, lt.LabeledTensor] giving all possible tensors to run.
  """
  # TODO(shoyer): expose options for injecting/filtering examples?
  labeled_examples = lt.LabeledTensor(examples, ['batch'])
  inputs, outputs = data.preprocess(
      labeled_examples,
      self.experiment_proto,
      kmer_k_max=self.hps.kmer_k_max,
      input_features=self.hps.input_features,
      mode=data.PREPROCESS_ALL_COUNTS,
      additional_output=self.hps.additional_output.split(','))
  actual_counts, actual_binding = self._split_outputs(outputs)

  # TODO(shoyer): encapsulate net/output_layer in a single class that makes
  # predictions without building logits as intermediate output.
  # This will be useful for testing other methods.
  net, output_layer = self._create_net_and_output_layer()
  logits = net.fprop(inputs, mode='test')
  predicted_outputs = output_layer.predict_outputs(logits, outputs)

  # Only calculate the predicted affinity if it is required.
  if keys and PREDICTED_AFFINITY not in keys:
    predicted_affinity = None
  else:
    predicted_affinity = output_layer.predict_affinity(logits)

  loss = output_layer.loss_per_example_and_target(
      logits, outputs, self.hps.train_on_array)

  candidates = {
      ACTUAL_OUTPUTS: outputs,
      ACTUAL_COUNTS: actual_counts,
      ACTUAL_BINDING: actual_binding,
      PREDICTED_OUTPUTS: predicted_outputs,
      PREDICTED_AFFINITY: predicted_affinity,
      LOSS: loss,
  }
  candidates.update(inputs)

  # Remove outputs not provided by the given model.
  tensors = {
      name: tensor
      for name, tensor in candidates.items()
      if tensor is not None
  }
  if keys is None:
    return tensors
  return {name: tensors[name] for name in keys}
def __init__(self,
             experiment_proto,
             loss,
             deps_normalize,
             target_names=None,
             additional_output=None):
  """Build a LatentAffinityWithCrossDeps layer; see the base class for args.

  On top of the parent initializer this adds a zero-initialized
  [affinity, target] variable `logit_by_prev_count`, which is appended to
  `self.params`.
  """
  super(LatentAffinityWithCrossDeps, self).__init__(
      experiment_proto,
      loss,
      deps_normalize,
      target_names=target_names,
      additional_output=additional_output)

  weight_shape = (self.affinity_axis.size, self.target_axis.size)
  logit_by_prev_count = tf.Variable(
      tf.zeros(weight_shape, dtype=tf.float32), name='logit_by_prev_count')
  self.params.append(logit_by_prev_count)
  self.logit_by_prev_count = lt.LabeledTensor(
      tf.convert_to_tensor(logit_by_prev_count),
      [self.affinity_axis, self.target_axis])
def input_pipeline(filenames,
                   experiment_proto,
                   final_mbsz,
                   hps,
                   num_epochs=None,
                   num_threads=1):
  """Build a queue-based stream of preprocessed training minibatches.

  Serialized examples are read from SSTables in large chunks, preprocessed,
  and then re-batched into minibatches of size `final_mbsz`.

  Args:
    filenames: list of paths to sstables tf.Example protos containing
      training data.
    experiment_proto: selection_pb2.Experiment describing the experiment.
    final_mbsz: minibatch size for returned tensors
    hps: tf.HParams with hyper-parameters to pass on to preprocess. Reads
      kmer_k_max, ratio_random_dna, preprocess_mode,
      total_reads_defining_positive and additional_output (a
      comma-separated string); input_features is optional and defaults to
      ().
    num_epochs: optional number of epochs to pass over the data.
    num_threads: optional number of threads to use for batching output.

  Returns:
    A (inputs, outputs) pair of dequeued tensors, where inputs is a dict
    keyed like the dict returned by preprocess.
  """
  # Number of records to read and preprocess at a time.
  prepro_mbsz = 8 * 8 * 1024

  with tf.name_scope('input_pipeline'):
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs)
    reader = tf.SSTableReader()
    _, raw_strs = reader.read_up_to(filename_queue, prepro_mbsz)
    strs = lt.LabeledTensor(raw_strs, ['batch'])

    inputs, outputs = preprocess(
        strs,
        experiment_proto,
        input_features=getattr(hps, 'input_features', ()),
        kmer_k_max=hps.kmer_k_max,
        ratio_random_dna=hps.ratio_random_dna,
        mode=hps.preprocess_mode,
        total_reads_defining_positive=hps.total_reads_defining_positive,
        additional_output=hps.additional_output.split(','))

    # Batch all input tensors together with the outputs (placed last), then
    # split them apart again afterwards.
    input_names = list(inputs.keys())
    to_batch = list(inputs.values())
    to_batch.append(outputs)
    batched = lt.batch(
        to_batch,
        batch_size=final_mbsz,
        enqueue_many=True,
        capacity=4 * final_mbsz,
        num_threads=num_threads,
        allow_smaller_final_batch=(num_epochs is not None))

    inputs = dict(zip(input_names, batched[:-1]))
    outputs = batched[-1]
    return inputs, outputs
def __init__(self,
             experiment_proto,
             loss,
             affinity_target_map=None,
             target_names=None,
             additional_output=None):
  """Initialize a FullyObserved output layer.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    loss: instance of an AbstractLoss subclass used for computing loss on
      this output layer.
    affinity_target_map: optional key into
      config.DEFAULT_AFFINITY_TARGET_MAPS. The looked-up dictionary has one
      entry for each selection target molecule (e.g. protein) giving the
      list of target output values to be used to calculate that target
      molecule's affinity. This is optional to create this OutputLayer but
      is required to calculate affinity. (In other words, during training
      it is unnecessary but for inference it is usually required.)
    target_names: optional list of strings giving target names to train
      against.
    additional_output: optional list of strings containing all the
      additional output to predict.

  Raises:
    Error: if the affinity_target_map is invalid, i.e. it names a target
      that is not among the logit target names.
  """
  self.loss = loss
  target_names = get_target_names(experiment_proto, target_names)
  additional_output = get_additional_output_names(experiment_proto,
                                                  additional_output)
  if additional_output:
    self.additional_output_axis = lt.Axis('additional_output',
                                          additional_output)
  else:
    self.additional_output_axis = None
  self.count_axis = self.target_axis = lt.Axis('target', target_names)
  self.logit_axis = lt.Axis('target', target_names + additional_output)
  self.binding_arrays_map = _binding_arrays_map(experiment_proto)
  self.params = []
  self.affinity_target_axis = self.affinity_target_lt = None

  if affinity_target_map:
    affinity_target_map = config.DEFAULT_AFFINITY_TARGET_MAPS[
        affinity_target_map]

    # make sure that every target in the affinity_target_map is in the logits
    # (otherwise the target is silently ignored, could be dangerous)
    target_names = self.count_axis.labels
    affinity_names = list(affinity_target_map.keys())
    for affinity, desired_target_names in affinity_target_map.items():
      for desired_name in desired_target_names:
        if desired_name not in target_names:
          # Interpolate explicitly: passing the values as extra exception
          # arguments would leave the '%s' placeholders unformatted.
          raise Error(
              'The desired target name %s for the affinity molecule '
              '%s is not found in the logit target names.\n'
              'logit target names: %s\n' %
              (desired_name, affinity, target_names))

    # Indicator matrix: array[i, j] == 1 iff target j contributes to the
    # affinity of molecule i.
    array = np.zeros((len(affinity_names), len(target_names)), dtype=int)
    for i, affinity in enumerate(affinity_names):
      for j, target in enumerate(target_names):
        if target in affinity_target_map[affinity]:
          array[i, j] = 1

    self.affinity_axis = lt.Axis('affinity', affinity_names)
    self.affinity_target_lt = lt.LabeledTensor(
        tf.constant(array, dtype=tf.float32, name='affinity_targets'),
        [self.affinity_axis, self.count_axis])
def _calculate(self, preds, targets):
  """Compute the zero-truncated log-Poisson loss; see the base class.

  Returns:
    LabeledTensor holding the loss, labeled with the axes of `targets`.
  """
  return lt.LabeledTensor(
      zero_truncated_log_poisson_loss(targets, preds), targets.axes)
def _calculate(self, preds, targets):
  """Compute the log-Poisson loss; see the method on the base class.

  Returns:
    LabeledTensor holding the loss, labeled with the axes of `targets`.
  """
  # TODO(shoyer): replace this with lt.nn.log_poisson_loss when that exists
  return lt.LabeledTensor(
      tf.nn.log_poisson_loss(targets, preds), targets.axes)
def _fprop(self, inputs, mode):
  """Construct the forward graph mapping inputs to logits.

  Rank-3 inputs go through a stack of 1-D convolutions and are flattened;
  rank-2 inputs are concatenated onto the flattened features (or used alone
  if there are no rank-3 inputs). The result is passed through a stack of
  dropout + fully connected layers, the last of which has no non-linearity.

  Args:
    inputs: input LabeledTensor with axes [batch_axis, input_position_axis,
      input_channel_axis].
    mode: either 'test' or 'train', determines whether we add dropout nodes

  Returns:
    Logits tensor with axes [batch_axis, logit_axis].

  Raises:
    ValueError: mode must be 'train' or 'test'; also raised if there are no
      inputs or the feature dimension has size 0.
  """
  if mode not in ('test', 'train'):
    raise ValueError('mode must be one of "train" or "test"')
  is_training = (mode == 'train')

  inputs_2d, inputs_3d = _stack_inputs_by_rank(inputs)
  if inputs_2d is None and inputs_3d is None:
    raise ValueError('feedforward model has no inputs')

  # Get the batch axis from the actual inputs, because we set up the graph
  # with unknown batch size.
  example_inputs = inputs_2d if inputs_2d is not None else inputs_3d
  batch_axis = example_inputs.axes['batch']

  w_initializer = tf.uniform_unit_scaling_initializer
  nonlinearity = nonlinearities[self.config.nonlinearity]

  if inputs_3d is None:
    net = inputs_2d
  else:
    conv_args = list(zip(*self._conv_config))
    net = contrib_layers.stack(
        inputs_3d,
        conv1d,
        conv_args,
        scope='conv',
        padding='SAME',
        activation_fn=nonlinearity,
        w_initializer=w_initializer)
    net = contrib_layers.flatten(net)
    if inputs_2d is not None:
      net = tf.concat([net, inputs_2d], 1)

  if net.get_shape()[-1].value == 0:
    raise ValueError('feature dimension has size 0')

  keep_probs = [1 - dropout for dropout in self.config.dropouts]
  fc_args = list(zip(self.fc_sizes, keep_probs, self.fc_init_factors))
  hidden_args, final_args = fc_args[:-1], fc_args[-1]

  net = contrib_layers.stack(
      net,
      dropout_and_fully_connected,
      hidden_args,
      scope='fc',
      is_training=is_training,
      activation_fn=nonlinearity,
      w_initializer=w_initializer)

  # The last layer should not have a non-linearity.
  net = dropout_and_fully_connected(
      net,
      *final_args,
      scope='fc_final',
      is_training=is_training,
      activation_fn=None,
      w_initializer=w_initializer)

  return lt.LabeledTensor(net, [batch_axis, self.logit_axis])
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT,),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
  """Turn parsed feature tensors into model inputs and outputs.

  Args:
    feature_tensors: Dict[str, tf.Tensor] with parsed featured created by
      `build_features`.
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_features: optional sequence of feature constants defined in this
      module.
    skip_all_zero_counts: some sequences have no counts, e.g., because they
      were created artificially for validation purposes on the binding
      array. We want to skip these sequences for training.
    kmer_k_max: optional integer giving the maximum kmer length to use if
      SEQUENCE_KMER_COUNT is in `input_features`.
    additional_output: optional list of strings contains additional outputs.
      Ignored when empty or when its first entry is empty.

  Returns:
    inputs: dict mapping each requested input feature constant to a
      LabeledTensor, e.g. one-hot-encoded sequences with axes
      [batch, position, channel] for SEQUENCE_ONE_HOT.
    outputs: LabeledTensor with dtype=float32 and axes [batch_axis,
      output_axis] denoting possible output tensors, including counts and
      binding array measurements.
  """
  sequence_length = experiment_proto.sequence_length
  count_names = selection.all_count_names(experiment_proto)
  array_names = selection.binding_array_names(experiment_proto)

  sequence_tensor = feature_tensors['sequence']
  batch_axis = sequence_tensor.axes['batch']
  position_axis = ('position', list(range(sequence_length)))

  inputs = {}

  if SEQUENCE_ONE_HOT in input_features:
    seq_indices = custom_ops.dna_sequence_to_indices(sequence_tensor,
                                                     sequence_length)
    one_hot_tensor = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
    channel_axis = ('channel', list(dna.DNA_BASES))
    inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
        one_hot_tensor, [batch_axis, position_axis, channel_axis])

  if SEQUENCE_KMER_COUNT in input_features:
    raw_counts = custom_ops.count_all_dna_kmers(sequence_tensor, kmer_k_max)
    kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
    kmer_counts = lt.LabeledTensor(raw_counts, [batch_axis, kmer_axis])
    # Standardize the kmer counts with precomputed statistics.
    means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
    mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
    std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
    centered = lt.cast(kmer_counts, tf.float32) - mean_count
    inputs[SEQUENCE_KMER_COUNT] = centered / std_count

  if STRUCTURE_PARTITION_FUNCTION in input_features:
    with tf.name_scope('structure_partition_fn'):
      raw_pf_tensor = lt.expand_dims(feature_tensors['partition_function'],
                                     ['batch', 'partition_fn_axis'])
      inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

  # Counts first, then binding arrays, then any additional outputs.
  output_names = count_names + array_names
  if additional_output and additional_output[0]:
    output_names.extend(additional_output)
  output_columns = [
      lt.cast(feature_tensors[name], tf.float32) for name in output_names
  ]
  outputs = lt.pack(output_columns, ('output', output_names), axis_position=1)

  if skip_all_zero_counts:
    with tf.name_scope('counts_filtering'):
      only_counts = lt.select(outputs, {'output': count_names})
      keep = lt.reduce_any(lt.not_equal(only_counts, 0.0), 'output')
      inputs = {
          name: lt.boolean_mask(tensor, keep)
          for name, tensor in inputs.items()
      }
      outputs = lt.boolean_mask(outputs, keep)

  return inputs, outputs
def upsample_positives(feature_tensors,
                       count_names,
                       total_reads_defining_positive,
                       min_fraction_positive,
                       seed=None):
  """Resample features so positives make up at least the requested fraction.

  An example is "positive" when its counts, summed over `count_names`, reach
  `total_reads_defining_positive`. If the batch contains some positives but
  fewer than `min_fraction_positive` of the batch, examples are resampled
  with resample.resample_at_rate(): negatives at rate 1, positives at a
  higher rate. Otherwise the features pass through unchanged.

  Args:
    feature_tensors: Dict[str, lt.LabeledTensor] with parsed featured
      created by `build_features`.
    count_names: A list of labels that are count names.
    total_reads_defining_positive: The minimum number of reads detected
      across all conditions that defines a sequence as being a positive
      example.
    min_fraction_positive: The minimum fraction of positive examples to
      allow in the data.
    seed: The random seed to use in upsampling.

  Returns:
    A dictionary mapping from string feature name to lt.LabeledTensor of
    parsed features created by `build_features` and positive examples
    upsampled to the desired rate.

  Raises:
    ValueError: The minimum positive fraction requested is invalid.
  """
  if min_fraction_positive < 0 or min_fraction_positive >= 1:
    raise ValueError('Invalid fraction positive, must be in [0, 1): %s' %
                     min_fraction_positive)

  with tf.name_scope('upsample_positives'):
    # Classify the inputs as positive or negative.
    read_threshold = tf.constant(
        total_reads_defining_positive, dtype=tf.float32)
    target_fraction = tf.constant(min_fraction_positive, dtype=tf.float32)

    per_condition_counts = lt.pack(
        [lt.cast(feature_tensors[name], tf.float32) for name in count_names],
        ('sequence_counts', count_names),
        axis_position=1)
    is_positive = (
        lt.reduce_sum(per_condition_counts, 'sequence_counts') >=
        read_threshold)
    num_pos = lt.reduce_sum(lt.cast(is_positive, tf.int32))
    is_negative = lt.logical_not(is_positive)
    num_neg = lt.reduce_sum(lt.cast(is_negative, tf.int32))

    # With P positives and N negatives, sampling negatives at rate 1 and
    # positives at rate R gives a positive fraction of P*R / (P*R + N).
    # Requiring this to be >= F and solving for R:
    #
    #   P * R = F * (P*R + N)
    #   P * R * (1 - F) = F * N
    #   R = F*N / (P * (1 - F))
    numerator = target_fraction * tf.cast(num_neg, tf.float32)
    raw_denom = tf.cast(num_pos, tf.float32) * (1 - target_fraction)
    # When raw_denom == 0 there are no positives, and the tf.cond further
    # down returns the input without resampling, so any finite placeholder
    # works here.
    safe_denom = tf.cond(raw_denom > 0.0,
                         lambda: raw_denom,
                         lambda: tf.constant(1.0, dtype=tf.float32))
    positive_rate = numerator / safe_denom

    batch_size = tf.shape(is_positive)[0]
    negative_rates = tf.ones([batch_size], tf.float32)
    positive_rates = tf.fill([batch_size], positive_rate)
    rates = tf.where(is_positive, positive_rates, negative_rates)

    # Strip the labels off (in a fixed name order) so that plain tensors can
    # be resampled, remembering axes and shapes for re-labeling afterwards.
    ordered_names = sorted(feature_tensors)
    ordered_features = [feature_tensors[name] for name in ordered_names]
    packed_tensors = [feature.tensor for feature in ordered_features]
    tensor_axes = [feature.axes for feature in ordered_features]
    tensor_shapes = [feature.get_shape() for feature in ordered_features]

    # Perform the resampling, unless there is nothing to upsample or the
    # batch already has enough positives.
    skip_resampling = tf.logical_or(
        tf.equal(num_pos, 0),
        tf.cast(num_pos, dtype=tf.float32) >=
        (target_fraction * tf.cast(batch_size, dtype=tf.float32)))
    resampled_tensors = tf.cond(
        skip_resampling,
        lambda: packed_tensors,
        lambda: resample.resample_at_rate(packed_tensors, rates, seed=seed))

    # Resampling changes the batch size, so mark the batch dimension as
    # unknown while keeping the remaining static dimensions.
    for tensor, shape in zip(resampled_tensors, tensor_shapes):
      tensor.set_shape([None] + list(shape)[1:])

    # Re-attach the axis labels.
    return {
        name: lt.LabeledTensor(resampled_tensors[i], tensor_axes[i])
        for i, name in enumerate(ordered_names)
    }
def preprocess(strs,
               experiment_proto,
               input_features=(SEQUENCE_ONE_HOT,),
               mode=PREPROCESS_SKIP_ALL_ZERO_COUNTS,
               kmer_k_max=4,
               ratio_random_dna=1,
               total_reads_defining_positive=0,
               additional_output=None):
  """Build a small TF graph to preprocess a minibatch of tf.Example protos.

  The examples are parsed, optionally mixed with random DNA sequences
  (depending on `mode`), passed through `upsample_positives` with a minimum
  positive fraction of 0.1, and finally converted by
  `create_input_and_outputs`.

  Args:
    strs: LabeledTensor holding a minibatch of serialized tf.Example protos
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_features: optional sequence of feature constants defined in this
      module.
    mode: optional preprocess mode defined in this module.
    kmer_k_max: optional integer giving the maximum kmer length to use if
      SEQUENCE_KMER_COUNT is in `input_features`.
    ratio_random_dna: optional ratio of random sequences to inject if
      mode == PREPROCESS_INJECT_RANDOM_SEQUENCES
    total_reads_defining_positive: optional integer indicating the sum of
      all read counts required to be seen to classify the tensor as a
      "positive" example when balancing input classes.
    additional_output: optional list of strings contains additional outputs.

  Returns:
    inputs, outputs: the pair returned by `create_input_and_outputs` for the
      processed features.

  Raises:
    ValueError: if `mode` is not one of the known preprocess modes.
  """
  with tf.name_scope('preprocess'):
    features = build_features(experiment_proto)
    parsed_feature_tensors = lt.parse_example(strs, features)
    count_names = selection.all_count_names(experiment_proto)

    if mode in (PREPROCESS_SKIP_ALL_ZERO_COUNTS, PREPROCESS_ALL_COUNTS):
      # Both modes use the parsed data as-is and differ only in filtering.
      skip_all_zero_counts = (mode == PREPROCESS_SKIP_ALL_ZERO_COUNTS)
      feature_tensors = parsed_feature_tensors

    elif mode == PREPROCESS_INJECT_RANDOM_SEQUENCES:
      skip_all_zero_counts = False

      # Replace zero counts with NaN in real data, so that only random
      # sequences will have a count of zero.
      for count_name in count_names:
        count = parsed_feature_tensors[count_name]
        is_nonzero = count != 0
        as_float = tf.cast(count, tf.float32)
        all_nan = tf.fill(tf.shape(count), np.float32(np.nan))
        parsed_feature_tensors[count_name] = lt.LabeledTensor(
            tf.where(is_nonzero, as_float, all_nan), count.axes)

      batch_dim = list(strs.axes.keys()).index('batch')
      input_batch_size = tf.shape(strs.tensor)[batch_dim]
      n_randoms = tf.cast(
          tf.cast(input_batch_size, tf.float32) * ratio_random_dna, tf.int32)
      random_feature_tensors = random_dna_features(experiment_proto,
                                                   n_randoms)
      for count_name in count_names:
        random_feature_tensors[count_name] = lt.cast(
            random_feature_tensors[count_name], tf.float32)

      feature_tensors = {}
      for name in features:
        feature_tensors[name] = lt.concat(
            [random_feature_tensors[name], parsed_feature_tensors[name]],
            'batch')

      # Shuffle random and non-random inputs because preprocess batches get
      # split across many mini-batches for training.
      sequence_tensor = feature_tensors['sequence'].tensor
      batch_size = tf.shape(sequence_tensor)[0]
      order = tf.random_shuffle(tf.range(batch_size, dtype=tf.int32))
      order.set_shape(sequence_tensor.get_shape())
      feature_tensors = {
          name: lt.LabeledTensor(tf.gather(feature.tensor, order),
                                 feature.axes)
          for name, feature in feature_tensors.items()
      }

    else:
      raise ValueError('unknown mode: %r' % mode)  # pylint: disable=g-doc-exception

    feature_tensors = upsample_positives(
        feature_tensors,
        count_names,
        total_reads_defining_positive=total_reads_defining_positive,
        min_fraction_positive=0.1)

    return create_input_and_outputs(
        feature_tensors,
        experiment_proto,
        input_features=input_features,
        kmer_k_max=kmer_k_max,
        skip_all_zero_counts=skip_all_zero_counts,
        additional_output=additional_output)