def __init__(self,
             experiment_proto,
             loss,
             target_names=None,
             additional_output=None):
  """Set up the axes, sign matrix and trainable parameters of the layer.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    loss: AbstractLoss subclass instance used to compute the loss on this
      output layer.
    target_names: optional list of strings naming the targets to train
      against; resolved through `get_target_names`.
    additional_output: optional list of strings naming the extra outputs to
      predict; resolved through `get_additional_output_names`.

  Raises:
    ValueError: if any target_names are not counts (as stated by the
      original documentation; presumably raised by one of the helpers
      called here -- TODO confirm).
  """
  self.loss = loss

  # Resolve all name lists up front, in the same order as before.
  latent_names = selection.all_target_and_background_names(experiment_proto)
  extra_names = get_additional_output_names(experiment_proto,
                                            additional_output)
  observed_names = get_target_names(experiment_proto, target_names)

  # Axes describing the observed targets, latent affinities and logits.
  self.target_axis = lt.Axis('target', observed_names)
  self.additional_output_axis = (
      lt.Axis('additional_output', extra_names) if extra_names else None)
  self.logit_axis = lt.Axis('target', latent_names + extra_names)
  self.affinity_axis = lt.Axis('affinity', latent_names)
  self.all_target_axis = lt.Axis('target', observed_names + extra_names)

  self.all_count_names = selection.all_count_names(experiment_proto)
  self.binding_arrays_map = _binding_arrays_map(experiment_proto)

  # Constant [affinity, target] matrix of selection signs.
  sign_matrix = _get_selection_signs(latent_names, observed_names,
                                     experiment_proto)
  sign_constant = tf.constant(
      sign_matrix, dtype=tf.float32, name='selection_signs')
  self.selection_signs = lt.LabeledTensor(
      sign_constant, [self.affinity_axis, self.target_axis])

  # TODO(shoyer): consider if there's a sane way to make lt.Variable
  weights_var = tf.Variable(
      tf.ones_like(sign_matrix, dtype=tf.float32), name='affinity_weights')
  bias_var = tf.Variable(tf.zeros([self.target_axis.size]), name='bias')
  self.params = [weights_var, bias_var]

  # Labeled views of the raw variables, for use in labeled computations.
  self.affinity_weights = lt.LabeledTensor(
      tf.convert_to_tensor(weights_var),
      [self.affinity_axis, self.target_axis])
  self.bias = lt.LabeledTensor(
      tf.convert_to_tensor(bias_var), [self.target_axis])
def __init__(self,
             experiment_proto,
             loss,
             affinity_target_map=None,
             target_names=None,
             additional_output=None):
  """Initialize a FullyObserved output layer.

  Args:
    experiment_proto: selection_pb2.Experiment describing the experiment.
    loss: instance of an AbstractLoss subclass used for computing loss on
      this output layer.
    affinity_target_map: optional key into
      `config.DEFAULT_AFFINITY_TARGET_MAPS`. The looked-up value is a
      mapping with one entry for each selection target molecule (e.g.
      protein), giving the list of target output names used to calculate
      that molecule's affinity. It is optional when creating this
      OutputLayer but is required to calculate affinity. (In other words,
      during training it is unnecessary but for inference it is usually
      required.)
    target_names: optional list of strings giving target names to train
      against.
    additional_output: optional list of strings containing all the
      additional output to predict.

  Raises:
    Error: if the affinity_target_map references a target name that is not
      among the count (logit) target names.
  """
  self.loss = loss

  target_names = get_target_names(experiment_proto, target_names)
  additional_output = get_additional_output_names(experiment_proto,
                                                  additional_output)
  if additional_output:
    self.additional_output_axis = lt.Axis('additional_output',
                                          additional_output)
  else:
    self.additional_output_axis = None

  self.count_axis = self.target_axis = lt.Axis('target', target_names)
  self.logit_axis = lt.Axis('target', target_names + additional_output)

  self.binding_arrays_map = _binding_arrays_map(experiment_proto)

  # This layer has no trainable parameters of its own.
  self.params = []

  # NOTE: `self.affinity_axis` is only assigned below, when a map is given.
  self.affinity_target_axis = self.affinity_target_lt = None
  if affinity_target_map:
    affinity_target_map = config.DEFAULT_AFFINITY_TARGET_MAPS[
        affinity_target_map]

    # make sure that every target in the affinity_target_map is in the logits
    # (otherwise the target is silently ignored, could be dangerous)
    target_names = self.count_axis.labels
    affinity_names = list(affinity_target_map)
    for affinity, desired_target_names in affinity_target_map.items():
      for desired_name in desired_target_names:
        if desired_name not in target_names:
          # Interpolate eagerly: passing the values as extra exception
          # arguments would leave the '%s' placeholders unformatted.
          raise Error(
              'The desired target name %s for the affinity molecule '
              '%s is not found in the logit target names.\n'
              'logit target names: %s\n'
              % (desired_name, affinity, target_names))

    # Indicator matrix: array[i, j] == 1 iff target j contributes to the
    # affinity of molecule i.
    array = np.zeros((len(affinity_names), len(target_names)), dtype=int)
    for i, affinity in enumerate(affinity_names):
      contributing = affinity_target_map[affinity]
      for j, target in enumerate(target_names):
        if target in contributing:
          array[i, j] = 1

    self.affinity_axis = lt.Axis('affinity', affinity_names)
    self.affinity_target_lt = lt.LabeledTensor(
        tf.constant(array, dtype=tf.float32, name='affinity_targets'),
        [self.affinity_axis, self.count_axis])
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT,),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
  """Build model inputs and the packed output tensor from parsed features.

  Args:
    feature_tensors: Dict[str, tf.Tensor] of parsed features created by
      `build_features`.
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_features: optional sequence of feature constants defined in this
      module, selecting which inputs to build.
    skip_all_zero_counts: some sequences have no counts, e.g., because they
      were created artificially for validation purposes on the binding
      array. When true, rows whose counts are all zero are dropped from
      both inputs and outputs.
    kmer_k_max: optional integer giving the maximum kmer length to use if
      SEQUENCE_KMER_COUNT is in `input_features`.
    additional_output: optional list of strings naming additional outputs.

  Returns:
    inputs: dict keyed by the requested feature constants. The
      SEQUENCE_ONE_HOT entry is a float32 LabeledTensor with axes
      [batch, position, channel]; the SEQUENCE_KMER_COUNT entry holds
      standardized kmer counts with axes [batch, kmer]; the
      STRUCTURE_PARTITION_FUNCTION entry is the log of the expanded
      partition function feature.
    outputs: float32 LabeledTensor with an 'output' axis at position 1 that
      packs counts, binding array measurements and, when requested, the
      additional outputs.
  """
  seq_len = experiment_proto.sequence_length
  count_names = selection.all_count_names(experiment_proto)
  array_names = selection.binding_array_names(experiment_proto)

  sequences = feature_tensors['sequence']
  batch_axis = sequences.axes['batch']
  position_axis = ('position', list(range(seq_len)))

  inputs = {}

  if SEQUENCE_ONE_HOT in input_features:
    base_indices = custom_ops.dna_sequence_to_indices(sequences, seq_len)
    encoded = tf.one_hot(base_indices, depth=4, dtype=tf.float32)
    channel_axis = ('channel', list(dna.DNA_BASES))
    inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
        encoded, [batch_axis, position_axis, channel_axis])

  if SEQUENCE_KMER_COUNT in input_features:
    kmer_counts = custom_ops.count_all_dna_kmers(sequences, kmer_k_max)
    kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
    labeled_counts = lt.LabeledTensor(kmer_counts, [batch_axis, kmer_axis])

    # Standardize each kmer count with its mean and standard deviation.
    means, stds = _all_kmer_mean_and_std(kmer_k_max, seq_len)
    mean_lt = lt.constant(means, tf.float32, axes=[kmer_axis])
    std_lt = lt.constant(stds, tf.float32, axes=[kmer_axis])
    centered = lt.cast(labeled_counts, tf.float32) - mean_lt
    inputs[SEQUENCE_KMER_COUNT] = centered / std_lt

  if STRUCTURE_PARTITION_FUNCTION in input_features:
    with tf.name_scope('structure_partition_fn'):
      expanded_pf = lt.expand_dims(feature_tensors['partition_function'],
                                   ['batch', 'partition_fn_axis'])
      inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(expanded_pf)

  output_names = count_names + array_names
  outputs = [
      lt.cast(feature_tensors[name], tf.float32) for name in output_names
  ]
  # The extra check on the first element skips a list whose first entry is
  # falsy (for example an empty string).
  if additional_output and additional_output[0]:
    outputs.extend([
        lt.cast(feature_tensors[name], tf.float32)
        for name in additional_output
    ])
    output_names.extend(additional_output)
  outputs = lt.pack(outputs, ('output', output_names), axis_position=1)

  if not skip_all_zero_counts:
    return inputs, outputs

  with tf.name_scope('counts_filtering'):
    count_columns = lt.select(outputs, {'output': count_names})
    has_counts = lt.reduce_any(lt.not_equal(count_columns, 0.0), 'output')
    inputs = {
        key: lt.boolean_mask(value, has_counts)
        for key, value in inputs.items()
    }
    outputs = lt.boolean_mask(outputs, has_counts)

  return inputs, outputs