def _normed_prev_round_counts(self, input_counts, input_counts_label='output'):
    """Create a Tensor with normalized counts from the previous round.

    For every label of `self.target_axis`, the count of its parent round (as
    given by `self.parent_count_names`) is selected from `input_counts`.
    Targets without a parent, or whose parent is not present in
    `input_counts`, are filled with zeros. The packed result is then passed
    through `self.deps_normalize`.

    Args:
      input_counts: LabeledTensor with dtype=float32 and axes [batch,
        input_counts_label].
      input_counts_label: Name of the axis in input_counts that contains the
        count data to use. For LatentAffinityWithDeps that uses the actual
        count values, input_counts will be the outputs tensor and the label
        will be 'output'. For LatentAffinityWithPredDeps, input_counts will be
        the predictions for these counts and the axis will be 'target'.

    Returns:
      LabeledTensor with dtype=float32 and axes [batch, target_axis] holding
      the normalized parent counts.
    """
    # Parent count names are *labels* along the counts axis. Testing
    # `parent in input_counts.axes` would compare against axis names
    # ('batch', 'output', ...) and therefore never match, silently replacing
    # every parent count with zeros. Check the labels of the counts axis.
    available_labels = frozenset(
        input_counts.axes[input_counts_label].labels or ())

    parent_lookup = {
        k: lt.select(input_counts, {input_counts_label: parent})
        for k, parent in self.parent_count_names.items()
        if parent in available_labels
    }

    # Zero column (one entry per batch element) used for targets that have no
    # usable parent count.
    default_tensor = lt.LabeledTensor(
        tf.zeros_like(input_counts[:, 0]), [input_counts.axes['batch']])

    parent_tensors = [
        parent_lookup.get(k, default_tensor) for k in self.target_axis.labels
    ]
    parent_counts = lt.pack(parent_tensors, self.target_axis, axis_position=1)
    return self.deps_normalize(parent_counts)
def predict_outputs(self, logits, outputs=None):
    """Predict a score that should correlate with each output.

    The prediction is assembled along the 'output' axis from, in order: the
    predicted counts, the predicted binding arrays (only when
    `self.binding_arrays_map` is non-empty) and the predicted additional
    outputs (only when `self.additional_output_axis` is non-empty).

    Args:
      logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
      outputs: optional LabeledTensor with dtype=float32 and axes [batch,
        output_axis]. Note that different output layers may not be directly
        comparable if they make use of `outputs` from prior rounds of
        selection in predictions.

    Returns:
      LabeledTensor with dtype=float32 and axes [batch, output_axis] giving
      predictions for each count and binding array.
    """
    preds = lt.rename_axis(
        self.predict_counts(logits, outputs), 'target', 'output')

    if self.binding_arrays_map:
        # Reuse the module-level helper instead of repeating the
        # select-and-pack logic inline.
        predicted_binding_arrays = _affinities_to_binding_arrays(
            self.binding_arrays_map, self.predict_affinity(logits))
        preds = lt.concat([preds, predicted_binding_arrays], 'output')

    if self.additional_output_axis:
        predicted_additional_output = lt.rename_axis(
            self.predict_additional_output(logits), 'target', 'output')
        preds = lt.concat([preds, predicted_additional_output], 'output')

    return preds
def _affinities_to_binding_arrays(binding_arrays_map, affinities):
    """Re-label affinity predictions as binding array outputs.

    Args:
      binding_arrays_map: mapping from binding array name to the label on the
        'affinity' axis of `affinities` that should be used for that array.
      affinities: LabeledTensor with an 'affinity' axis.

    Returns:
      LabeledTensor in which the selected affinities are packed along a new
      'output' axis (at position 1) labeled by the keys of
      `binding_arrays_map`.
    """
    array_names = []
    selected = []
    for array_name, affinity_label in binding_arrays_map.items():
        array_names.append(array_name)
        selected.append(lt.select(affinities, {'affinity': affinity_label}))
    return lt.pack(selected, ('output', array_names), axis_position=1)
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT,),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

    Args:
      feature_tensors: dict of parsed features created by `build_features`,
        keyed by feature name. The 'sequence' entry must expose a 'batch'
        axis via `.axes`.
      experiment_proto: selection_pb2.Experiment describing the experiment.
      input_features: optional sequence of feature constants defined in this
        module. Only SEQUENCE_ONE_HOT, SEQUENCE_KMER_COUNT and
        STRUCTURE_PARTITION_FUNCTION are acted upon here.
      skip_all_zero_counts: some sequences have no counts, e.g., because they
        were created artificially for validation purposes on the binding
        array. We want to skip these sequences for training.
      kmer_k_max: optional integer giving the maximum kmer length to use if
        SEQUENCE_KMER_COUNT is in `input_features`.
      additional_output: optional list of strings naming additional outputs.
        It is ignored when empty or when its first element is falsy.

    Returns:
      inputs: dict mapping each requested feature constant to a
        LabeledTensor: one-hot sequences with axes [batch, position,
        channel], standardized kmer counts with axes [batch, kmer], and/or
        the log of the structure partition function.
      outputs: LabeledTensor with dtype=float32 and axes [batch, output]
        holding counts, binding array measurements and any additional
        outputs.
    """
    seq_len = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequences = feature_tensors['sequence']
    batch_axis = sequences.axes['batch']
    position_axis = ('position', list(range(seq_len)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        base_indices = custom_ops.dna_sequence_to_indices(sequences, seq_len)
        one_hot_tensor = tf.one_hot(base_indices, depth=4, dtype=tf.float32)
        inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
            one_hot_tensor,
            [batch_axis, position_axis, ('channel', list(dna.DNA_BASES))])

    if SEQUENCE_KMER_COUNT in input_features:
        raw_kmer_counts = custom_ops.count_all_dna_kmers(sequences, kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        kmer_counts = lt.LabeledTensor(raw_kmer_counts,
                                       [batch_axis, kmer_axis])
        # Standardize each kmer count with the mean and standard deviation
        # returned by _all_kmer_mean_and_std for this k and sequence length.
        means, stds = _all_kmer_mean_and_std(kmer_k_max, seq_len)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        centered = lt.cast(kmer_counts, tf.float32) - mean_count
        inputs[SEQUENCE_KMER_COUNT] = centered / std_count

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            partition_fn = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(partition_fn)

    output_names = count_names + array_names
    output_columns = [
        lt.cast(feature_tensors[name], tf.float32) for name in output_names
    ]
    # The extra check on the first element skips a list such as [''].
    if additional_output and additional_output[0]:
        output_columns.extend(
            lt.cast(feature_tensors[name], tf.float32)
            for name in additional_output)
        output_names.extend(additional_output)
    outputs = lt.pack(output_columns, ('output', output_names),
                      axis_position=1)

    if not skip_all_zero_counts:
        return inputs, outputs

    with tf.name_scope('counts_filtering'):
        # Keep only the examples with at least one non-zero count.
        count_columns = lt.select(outputs, {'output': count_names})
        has_reads = lt.reduce_any(lt.not_equal(count_columns, 0.0), 'output')
        inputs = {
            name: lt.boolean_mask(tensor, has_reads)
            for name, tensor in inputs.items()
        }
        outputs = lt.boolean_mask(outputs, has_reads)

    return inputs, outputs
def upsample_positives(feature_tensors,
                       count_names,
                       total_reads_defining_positive,
                       min_fraction_positive,
                       seed=None):
    """Returns feature tensors with positives upsampled to the desired rate.

    An example is "positive" when the sum of its counts over `count_names` is
    at least `total_reads_defining_positive`. If the batch contains positives
    but fewer than `min_fraction_positive` of the batch, the positives are
    resampled with resample.resample_at_rate() at a rate chosen to reach that
    fraction while negatives keep a rate of 1. Otherwise the features are
    passed through unchanged.

    Args:
      feature_tensors: Dict[str, lt.LabeledTensor] with parsed featured
        created by `build_features`.
      count_names: A list of labels that are count names.
      total_reads_defining_positive: The minimum number of reads detected
        across all conditions that defines a sequence as being a positive
        example.
      min_fraction_positive: The minimum fraction of positive examples to
        allow in the data.
      seed: The random seed to use in upsampling.

    Returns:
      A dictionary mapping from string feature name to lt.LabeledTensor of
      parsed features created by `build_features` and positive examples
      upsampled to the desired rate.

    Raises:
      ValueError: The minimum positive fraction requested is invalid.
    """
    if min_fraction_positive < 0 or min_fraction_positive >= 1:
        raise ValueError('Invalid fraction positive, must be in [0, 1): %s' %
                         min_fraction_positive)

    with tf.name_scope('upsample_positives'):
        # Classify every example as positive or negative.
        read_threshold = tf.constant(
            total_reads_defining_positive, dtype=tf.float32)
        target_fraction = tf.constant(min_fraction_positive, dtype=tf.float32)

        count_columns = [
            lt.cast(feature_tensors[name], tf.float32) for name in count_names
        ]
        counts = lt.pack(
            count_columns, ('sequence_counts', count_names), axis_position=1)
        total_reads = lt.reduce_sum(counts, 'sequence_counts')
        is_positive = total_reads >= read_threshold
        num_pos = lt.reduce_sum(lt.cast(is_positive, tf.int32))
        is_negative = lt.logical_not(is_positive)
        num_neg = lt.reduce_sum(lt.cast(is_negative, tf.int32))

        # With P positives and N negatives, keeping the negative sampling
        # rate at 1 (to try to retain negatives), a positive fraction of at
        # least F needs a positive sampling rate R such that
        #   P * R / (P * R + N) >= F
        # Solving for R:
        #   P * R = F * (P * R + N) = F * P * R + F * N
        #   P * R * (1 - F) = F * N
        #   R = F * N / (P * (1 - F))
        numerator = target_fraction * tf.cast(num_neg, tf.float32)
        raw_denom = tf.cast(num_pos, tf.float32) * (1 - target_fraction)
        # A zero denominator means there are no positives; any value works
        # then because the tf.cond further down returns the input without
        # resampling in that case.
        safe_denom = tf.cond(
            raw_denom > 0.0,
            lambda: raw_denom,
            lambda: tf.constant(1.0, dtype=tf.float32))
        positive_rate = numerator / safe_denom

        batch_size = tf.shape(is_positive)[0]
        negative_rates = tf.ones([batch_size], tf.float32)
        positive_rates = tf.fill([batch_size], positive_rate)
        rates = tf.where(is_positive, positive_rates, negative_rates)

        # Strip the LabeledTensors down to plain tensors, remembering the
        # axes and static shapes needed to rebuild them afterwards.
        ordered_names = sorted(feature_tensors)
        labeled_inputs = [feature_tensors[name] for name in ordered_names]
        packed_tensors = [labeled.tensor for labeled in labeled_inputs]
        tensor_axes = [labeled.axes for labeled in labeled_inputs]
        tensor_shapes = [labeled.get_shape() for labeled in labeled_inputs]

        # Resample only when positives exist and are under-represented.
        no_resampling_needed = tf.logical_or(
            tf.equal(num_pos, 0),
            tf.cast(num_pos, dtype=tf.float32) >=
            (target_fraction * tf.cast(batch_size, dtype=tf.float32)))
        resampled_tensors = tf.cond(
            no_resampling_needed,
            lambda: packed_tensors,
            lambda: resample.resample_at_rate(
                packed_tensors, rates, seed=seed))

        # Rebuild the dictionary of LabeledTensors. The batch dimension is
        # marked unknown because resampling may change the example count.
        unpacked_feature_tensors = {}
        for name, tensor, axes, shape in zip(ordered_names, resampled_tensors,
                                             tensor_axes, tensor_shapes):
            tensor.set_shape([None] + list(shape)[1:])
            unpacked_feature_tensors[name] = lt.LabeledTensor(tensor, axes)
        return unpacked_feature_tensors