def dataset_generator(filepath, dataset, chunk_size=1, start_idx=None,
                      end_idx=None):
    """Yield one example dict per record of an h5 dataset split.

    Args:
        filepath: Path of the h5 file; it is opened read-only.
        dataset: Name used to build the three h5 keys that are read:
            "<dataset>_in", "<dataset>_na" and "<dataset>_out".
        chunk_size: Forwarded to DNAEncoder as its chunk_size.
        start_idx: Index of the first record to emit; None means 0.
        end_idx: Index one past the last record to emit; None means the
            length of the input array.

    Yields:
        The dict returned by to_example_dict for each record in range.
    """
    encoder = dna_encoder.DNAEncoder(chunk_size=chunk_size)
    with h5py.File(filepath, "r") as h5_file:
        # Look up the input, mask and output arrays for this dataset name.
        key_templates = ["%s_in", "%s_na", "%s_out"]
        h5_keys = [template % dataset for template in key_templates]
        arrays = [h5_file[key] for key in h5_keys]
        inp_data, mask_data, out_data = arrays
        # All three arrays must hold the same number of records.
        assert len({arr.len() for arr in arrays}) == 1

        first = 0 if start_idx is None else start_idx
        last = inp_data.len() if end_idx is None else end_idx

        for idx in xrange(first, last):
            if idx % 100 == 0:
                print("Generating example %d for %s" % (idx, dataset))
            example = to_example_dict(
                encoder, inp_data[idx], mask_data[idx], out_data[idx])
            # Original data has one output for every 128 input bases. Ensure
            # that the ratio has been maintained given the chunk size and
            # removing EOS (hence the "- 1" on the input length).
            assert len(example["inputs"]) - 1 == (
                (128 // chunk_size) * example["targets_shape"][0])
            yield example
def feature_encoders(self, data_dir):
    """Build the encoders for the "inputs" and "targets" features.

    Args:
        data_dir: Unused; the name is deleted immediately.

    Returns:
        Dict with a DNAEncoder (built with self.chunk_size) under "inputs"
        and a TextEncoder under "targets".
    """
    del data_dir
    encoders = {}
    encoders["inputs"] = dna_encoder.DNAEncoder(chunk_size=self.chunk_size)
    # TODO(rsepassi): RealEncoder?
    encoders["targets"] = text_encoder.TextEncoder()
    return encoders
def preprocess_example(self, example, mode, hparams):
    """Encode the inputs and reshape inputs and targets for the model.

    Args:
        example: Feature dict; its "inputs" and "targets" entries are read
            and then overwritten with the reshaped tensors.
        mode: Not used by this implementation.
        hparams: Not used by this implementation.

    Returns:
        The same dict, with "inputs" shaped [out_size, 1, 1] and "targets"
        shaped [self.num_binary_predictions, 1, 1].
    """
    raw_inputs = example["inputs"]
    raw_targets = example["targets"]
    encoder = dna_encoder.DNAEncoder(self.chunk_size)

    def to_ids(char_codes):
        # Each element is a character code; join them into a string so the
        # encoder can process it.
        sequence = "".join(chr(code) for code in char_codes)
        return np.array(encoder.encode(sequence), dtype=np.int64)

    # py_func is given a one-element output type list, so unpack the single
    # result.
    (encoded_inputs,) = tf.py_func(
        to_ids, [raw_inputs], [tf.int64], stateful=False)

    # Reshape to the [p0, p1, channels] modality convention.
    out_size = int(np.ceil(self.input_sequence_length / self.chunk_size))
    example["inputs"] = tf.reshape(encoded_inputs, [out_size, 1, 1])
    example["targets"] = tf.reshape(
        raw_targets, [self.num_binary_predictions, 1, 1])
    return example
def test_encode_decode(self):
    """Check that decoding an encoded sequence returns the original."""
    sequence = 'TTCGCGGNNNAACCCAACGCCATCTATGTANNTTGAGTTGTTGAGTTAAA'

    # Encoding should be reversible for any reasonable chunk size.
    for size in (1, 2, 4, 6, 8):
        codec = dna_encoder.DNAEncoder(chunk_size=size)
        token_ids = codec.encode(sequence)
        round_tripped = codec.decode(token_ids)
        self.assertEqual(sequence, round_tripped)
def testRecordToExample(self):
    """Check the fields of the dict built by to_example_dict."""
    encoder = dna_encoder.DNAEncoder(chunk_size=2)
    bases = ["A", "C", "G", "N", "C", "T"]

    # Put in numpy arrays in the same format as in the h5 file
    one_hot_inputs = self._one_hot_bases(bases)
    na_mask = np.array([True, False, True])
    target_rows = np.array([
        [1.0, 2.0, 3.0],
        [5.0, 1.0, 0.2],
        [5.1, 2.3, 2.3],
    ])

    # Convert to example dict
    example = gene_expression.to_example_dict(
        encoder, one_hot_inputs, na_mask, target_rows)

    # One id per chunk of 2 bases, plus one trailing id.
    self.assertEqual(len(bases) // 2 + 1, len(example["inputs"]))
    # NOTE(review): the trailing 1 is presumably the EOS id - confirm.
    self.assertAllEqual(encoder.encode(bases) + [1], example["inputs"])
    # The boolean mask is expected back as floats.
    self.assertAllEqual([1.0, 0.0, 1.0], example["targets_mask"])
    # The targets are expected flattened row by row, with the original
    # shape recorded separately.
    flattened_targets = [1.0, 2.0, 3.0, 5.0, 1.0, 0.2, 5.1, 2.3, 2.3]
    self.assertAllEqual(flattened_targets, example["targets"])
    self.assertAllEqual([3, 3], example["targets_shape"])
def hparams(self, defaults, model_hparams):
    """Set the modality and space-id fields on the problem hparams.

    Args:
        defaults: The default hparams for this problem; modified in place.
        model_hparams: The hparams of the model being used. Not used by
            this implementation.

    Returns:
        None.
    """
    # The input vocabulary size depends on the encoder's chunk size.
    vocab_size = dna_encoder.DNAEncoder(self.chunk_size).vocab_size

    defaults.input_modality = {
        "inputs": (registry.Modalities.SYMBOL, vocab_size),
    }
    defaults.target_modality = (
        "%s:binary" % registry.Modalities.CLASS_LABEL, None)
    defaults.input_space_id = problem.SpaceID.DNA
    defaults.target_space_id = problem.SpaceID.GENERIC
def dataset_generator(filepath, dataset, chunk_size=1, start_idx=None,
                      end_idx=None):
    """Yield one example dict per record of an h5 dataset split.

    Args:
        filepath: Path of the h5 file; it is opened read-only.
        dataset: Name used to build the three h5 keys that are read:
            "<dataset>_in", "<dataset>_na" and "<dataset>_out".
        chunk_size: Forwarded to DNAEncoder as its chunk_size.
        start_idx: Index of the first record to emit; None means 0.
        end_idx: Index one past the last record to emit; None means the
            length of the input array.

    Yields:
        The dict returned by to_example_dict for each record in range.
    """
    encoder = dna_encoder.DNAEncoder(chunk_size=chunk_size)
    with h5py.File(filepath, "r") as h5_file:
        # Get input keys from h5_file
        h5_keys = [suffix % dataset for suffix in ("%s_in", "%s_na", "%s_out")]
        arrays = [h5_file[key] for key in h5_keys]
        inp_data, mask_data, out_data = arrays
        # The three arrays must all contain the same number of records.
        assert len({arr.len() for arr in arrays}) == 1

        # Fall back to the full range when a bound is not given.
        first = start_idx if start_idx is not None else 0
        last = end_idx if end_idx is not None else inp_data.len()

        for idx in xrange(first, last):
            if idx % 100 == 0:
                print("Generating example %d for %s" % (idx, dataset))
            yield to_example_dict(
                encoder, inp_data[idx], mask_data[idx], out_data[idx])