コード例 #1
0
def dataset_generator(filepath,
                      dataset,
                      chunk_size=1,
                      start_idx=None,
                      end_idx=None):
    """Yield example dicts built from one dataset of an HDF5 file.

    Reads the "<dataset>_in", "<dataset>_na" and "<dataset>_out" entries of
    the file and converts the records at each index in
    [start_idx, end_idx) with to_example_dict.

    Args:
      filepath: Path of the file, opened read-only with h5py.
      dataset: Prefix of the three keys read from the file; also printed in
        the progress messages.
      chunk_size: Chunk size handed to DNAEncoder and used in the
        input/output ratio check.
      start_idx: First index to generate. None means 0.
      end_idx: Index at which to stop (exclusive). None means the length of
        the "<dataset>_in" entry.

    Yields:
      The dict returned by to_example_dict for each index.

    Raises:
      AssertionError: If the three entries do not all have the same length,
        or if an example fails the input/output ratio check.
    """
    encoder = dna_encoder.DNAEncoder(chunk_size=chunk_size)
    with h5py.File(filepath, "r") as h5_file:
        # The three parallel entries share the dataset name as key prefix.
        inp_data = h5_file["%s_in" % dataset]
        mask_data = h5_file["%s_na" % dataset]
        out_data = h5_file["%s_out" % dataset]
        lengths = {entry.len() for entry in (inp_data, mask_data, out_data)}
        assert len(lengths) == 1

        first = 0 if start_idx is None else start_idx
        last = inp_data.len() if end_idx is None else end_idx

        for idx in xrange(first, last):
            if not idx % 100:
                print("Generating example %d for %s" % (idx, dataset))
            example = to_example_dict(encoder, inp_data[idx], mask_data[idx],
                                      out_data[idx])
            # Original data has one output for every 128 input bases. Check
            # that this ratio still holds for the chunked inputs once the
            # EOS entry is discounted.
            num_inputs = len(example["inputs"]) - 1
            num_outputs = example["targets_shape"][0]
            assert num_inputs == (128 // chunk_size) * num_outputs
            yield example
コード例 #2
0
 def feature_encoders(self, data_dir):
     """Build the encoders for the "inputs" and "targets" features.

     Args:
       data_dir: Unused.

     Returns:
       Dict with a DNAEncoder (using self.chunk_size) under "inputs" and a
       TextEncoder under "targets".
     """
     del data_dir  # Unused.
     inputs_encoder = dna_encoder.DNAEncoder(chunk_size=self.chunk_size)
     # TODO(rsepassi): RealEncoder?
     targets_encoder = text_encoder.TextEncoder()
     return {"inputs": inputs_encoder, "targets": targets_encoder}
コード例 #3
0
    def preprocess_example(self, example, mode, hparams):
        """Preprocess the model inputs.

        Encodes example["inputs"] with a DNAEncoder (through tf.py_func) and
        reshapes both "inputs" and "targets" to three dimensions.

        Args:
          example: Feature dict from feature name to Tensor or SparseTensor.
            Its "inputs" and "targets" entries are replaced in place.
          mode: String. Specifies training, eval, and inference. Not read
            here.
          hparams: The problem hparams. Not read here.

        Returns:
          The same feature dict, with the updated "inputs" and "targets".
        """
        raw_inputs = example["inputs"]
        raw_targets = example["targets"]
        encoder = dna_encoder.DNAEncoder(self.chunk_size)

        def to_ids(char_codes):
            # Turn the character codes back into a string for the encoder.
            sequence = "".join(chr(code) for code in char_codes)
            return np.array(encoder.encode(sequence), dtype=np.int64)

        (encoded,) = tf.py_func(
            to_ids, [raw_inputs], [tf.int64], stateful=False)
        # Reshape to the [p0, p1, channels] modality convention.
        num_chunks = int(
            np.ceil(self.input_sequence_length / self.chunk_size))
        example["inputs"] = tf.reshape(encoded, [num_chunks, 1, 1])
        targets_shape = [self.num_binary_predictions, 1, 1]
        example["targets"] = tf.reshape(raw_targets, targets_shape)
        return example
コード例 #4
0
    def test_encode_decode(self):
        """Check that decode(encode(x)) returns x for several chunk sizes."""
        sequence = 'TTCGCGGNNNAACCCAACGCCATCTATGTANNTTGAGTTGTTGAGTTAAA'

        # The round trip must give back the input for every chunk size here.
        for size in (1, 2, 4, 6, 8):
            codec = dna_encoder.DNAEncoder(chunk_size=size)
            round_tripped = codec.decode(codec.encode(sequence))
            self.assertEqual(sequence, round_tripped)
コード例 #5
0
  def testRecordToExample(self):
    """Check to_example_dict on a small hand-built record."""
    chunk_size = 2
    encoder = dna_encoder.DNAEncoder(chunk_size=chunk_size)
    bases = ["A", "C", "G", "N", "C", "T"]

    # Build numpy arrays in the same format as in the h5 file.
    one_hot = self._one_hot_bases(bases)
    na_mask = np.array([True, False, True])
    out_rows = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]])
    example = gene_expression.to_example_dict(encoder, one_hot, na_mask,
                                              out_rows)

    # One id per chunk plus one trailing id (presumably EOS -- confirm
    # against to_example_dict).
    expected_len = len(bases) // chunk_size + 1
    self.assertEqual(expected_len, len(example["inputs"]))
    expected_ids = encoder.encode(bases) + [1]
    self.assertAllEqual(expected_ids, example["inputs"])
    # The boolean mask is expected back as floats.
    self.assertAllEqual([1.0, 0.0, 1.0], example["targets_mask"])
    # The 3x3 outputs are expected flattened row by row, with the original
    # shape reported separately.
    flat_outputs = [1.0, 2.0, 3.0, 5.0, 1.0, 0.2, 5.1, 2.3, 2.3]
    self.assertAllEqual(flat_outputs, example["targets"])
    self.assertAllEqual([3, 3], example["targets_shape"])
コード例 #6
0
    def hparams(self, defaults, model_hparams):
        """Set this problem's modalities and space ids on `defaults`.

        Args:
          defaults: The default hparams to augment for this problem. Modified
            in place.
          model_hparams: The hparams of the model being used. Not read here.

        Returns:
          None.
        """
        encoder = dna_encoder.DNAEncoder(self.chunk_size)
        # Inputs are symbols drawn from the encoder's vocabulary.
        symbol_modality = (registry.Modalities.SYMBOL, encoder.vocab_size)
        defaults.input_modality = {"inputs": symbol_modality}
        binary_label = "%s:binary" % registry.Modalities.CLASS_LABEL
        defaults.target_modality = (binary_label, None)
        defaults.input_space_id = problem.SpaceID.DNA
        defaults.target_space_id = problem.SpaceID.GENERIC
コード例 #7
0
def dataset_generator(filepath,
                      dataset,
                      chunk_size=1,
                      start_idx=None,
                      end_idx=None):
    """Yield example dicts built from one dataset of an HDF5 file.

    Reads the "<dataset>_in", "<dataset>_na" and "<dataset>_out" entries of
    the file and converts the records at each index in
    [start_idx, end_idx) with to_example_dict.

    Args:
      filepath: Path of the file, opened read-only with h5py.
      dataset: Prefix of the three keys read from the file; also printed in
        the progress messages.
      chunk_size: Chunk size handed to DNAEncoder.
      start_idx: First index to generate. None means 0.
      end_idx: Index at which to stop (exclusive). None means the length of
        the "<dataset>_in" entry.

    Yields:
      The dict returned by to_example_dict for each index.

    Raises:
      AssertionError: If the three entries do not all have the same length.
    """
    encoder = dna_encoder.DNAEncoder(chunk_size=chunk_size)
    with h5py.File(filepath, "r") as h5_file:
        # The three parallel entries share the dataset name as key prefix.
        inp_data = h5_file["%s_in" % dataset]
        mask_data = h5_file["%s_na" % dataset]
        out_data = h5_file["%s_out" % dataset]
        lengths = {entry.len() for entry in (inp_data, mask_data, out_data)}
        assert len(lengths) == 1

        first = 0 if start_idx is None else start_idx
        last = inp_data.len() if end_idx is None else end_idx

        for idx in xrange(first, last):
            if not idx % 100:
                print("Generating example %d for %s" % (idx, dataset))
            yield to_example_dict(encoder, inp_data[idx], mask_data[idx],
                                  out_data[idx])