Code Example #1
  def test_non_padded_dataset(self):
    # Set up test data.
    test_data_directory = os.path.join(
        FLAGS.test_srcdir,
        './testdata'
    )

    label_vocab_array = [
        'EMBL:AE017224', 'RefSeq:WP_002966386.1', 'ProteinModelPortal:P0CB34',
        'SMR:P0CB34', 'EnsemblBacteria:AAX75635', 'GeneID:29595679',
        'KEGG:bmb:BruAb2_0191', 'HOGENOM:HOG000133897', 'KO:K04078',
        'OMA:PGRIDDN', 'Proteomes:UP000000540', 'GO:GO:0005737',
        'GO:GO:0005524', 'GO:GO:0006457', 'CDD:cd00320', 'Gene3D:2.30.33.40',
        'HAMAP:MF_00580', 'InterPro:IPR020818', 'InterPro:IPR037124',
        'InterPro:IPR018369', 'InterPro:IPR011032', 'PANTHER:PTHR10772',
        'Pfam:PF00166', 'PRINTS:PR00297', 'SMART:SM00883', 'SUPFAM:SSF50129',
        'PROSITE:PS00681'
    ]
    with tf.Graph().as_default():
      sess = tf.Session()
      dataset = protein_dataset.non_batched_dataset(
          # Dev fold instead of train fold because the train fold is repeated.
          train_dev_or_test=protein_dataset.DEV_FOLD,
          label_vocab=label_vocab_array,
          data_root_dir=test_data_directory)
      example_itr = dataset.make_initializable_iterator()

      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      sess.run(example_itr.initializer)

      # Compute actual output.
      actual_examples = _dataset_iterator_to_list(example_itr, sess)

      expected_length = 4

      # Compute expected values.
      expected_sequence = 'MADIKFRPLHDRVVVRRVESEAKTAGGIIIPDTAKEKPQEGEVVAAGAGARDEAGKLVPLDVKAGDRVLFGKWSGTEVKIGGEDLLIMKESDILGIVG'
      expected_sequence_indexes = [
          utils.AMINO_ACID_VOCABULARY.index(x) for x in expected_sequence
      ]
      expected_sequence_one_hot = _numpy_one_hot(
          expected_sequence_indexes, depth=len(utils.AMINO_ACID_VOCABULARY))
      # Because the label vocab is exactly the labels in the first example, we
      # just get range(len(label_vocab_array)).
      expected_label_indexes = range(len(label_vocab_array))
      expected_id = b'P0CB34'

      # Assert values are correct.
      self.assertLen(actual_examples, expected_length)
      np.testing.assert_equal(actual_examples[0][protein_dataset.SEQUENCE_KEY],
                              expected_sequence_one_hot)
      np.testing.assert_equal(
          actual_examples[0][protein_dataset.SEQUENCE_LENGTH_KEY],
          len(expected_sequence))
      np.testing.assert_equal(actual_examples[0][protein_dataset.LABEL_KEY],
                              expected_label_indexes)
      np.testing.assert_equal(actual_examples[0][protein_dataset.SEQUENCE_ID_KEY],
                              expected_id)
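
Both examples call the module-level helpers `_numpy_one_hot` and `_dataset_iterator_to_list`, which are not shown in the snippets. The following is a minimal sketch of what such helpers could look like, inferred only from how the tests use them (TF 1.x-style API assumed); it is not the original test module's code.

import numpy as np
import tensorflow as tf  # TF 1.x-style API assumed.


def _numpy_one_hot(indices, depth):
  """Returns a (len(indices), depth) one-hot array for integer `indices`."""
  one_hot = np.zeros((len(indices), depth), dtype=np.float32)
  one_hot[np.arange(len(indices)), indices] = 1.
  return one_hot


def _dataset_iterator_to_list(iterator, sess):
  """Drains a TF 1.x initializable iterator into a list of numpy examples.

  Should be called while the iterator's graph is the default graph, so that
  get_next() adds its op to the same graph the session is running.
  """
  next_element = iterator.get_next()
  examples = []
  while True:
    try:
      examples.append(sess.run(next_element))
    except tf.errors.OutOfRangeError:
      break
  return examples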
Code Example #2
  def test_padded_dataset(self):
    # Set up test data.
    test_data_directory = os.path.join(
        FLAGS.test_srcdir,
        './testdata'
    )

    label_vocab_array = ['EMBL:AE017224']

    batch_size = 3

    with tf.Graph().as_default():
      sess = tf.Session()
      non_padded_dataset = protein_dataset.non_batched_dataset(
          # Dev fold instead of train fold because the train fold is repeated.
          train_dev_or_test=protein_dataset.DEV_FOLD,
          label_vocab=label_vocab_array,
          data_root_dir=test_data_directory)
      batched_dataset = protein_dataset.batched_dataset(
          non_padded_dataset, batch_size=batch_size, bucket_boundaries=[11000])
      batch_itr = batched_dataset.make_initializable_iterator()

      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      sess.run(batch_itr.initializer)

      # Compute actual output.
      actual_examples = _dataset_iterator_to_list(batch_itr, sess)

      # Examine correctness of the first element.
      actual_sequence_batch_shape = actual_examples[0][
          protein_dataset.SEQUENCE_KEY].shape
      expected_longest_sequence_len_in_first_batch = 98
      expected_first_batch_sequence_shape = (
          batch_size, expected_longest_sequence_len_in_first_batch,
          len(utils.AMINO_ACID_VOCABULARY))
      self.assertEqual(actual_sequence_batch_shape,
                       expected_first_batch_sequence_shape)

      actual_label_batch_shape = actual_examples[0][
          protein_dataset.LABEL_KEY].shape
      # Because the label vocab contains the labels in the first example, we
      # get len(label_vocab_array) as the number of labels.
      expected_batch_label_shape = (batch_size, len(label_vocab_array))
      self.assertEqual(actual_label_batch_shape, expected_batch_label_shape)
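
The second test only checks the batch shapes that `protein_dataset.batched_dataset` produces: sequences padded to the longest sequence in each batch, and a label dimension equal to len(label_vocab_array). As a rough illustration of that bucketed, padded batching (not the module's actual implementation, and assuming dict-shaped elements with the same feature keys the tests read back), a sketch built on `tf.data.experimental.bucket_by_sequence_length` could look like this:

import tensorflow as tf  # TF 1.x-style API assumed.


def bucketed_padded_batches(non_padded_dataset, batch_size, bucket_boundaries):
  """Sketch: groups examples of similar length, then pads within each batch."""

  def element_length(example):
    # Assumes each example is a dict holding a scalar sequence length under
    # the same key the tests use.
    return tf.cast(example[protein_dataset.SEQUENCE_LENGTH_KEY], tf.int32)

  return non_padded_dataset.apply(
      tf.data.experimental.bucket_by_sequence_length(
          element_length_func=element_length,
          bucket_boundaries=bucket_boundaries,
          # One batch size per bucket, plus one for the overflow bucket.
          bucket_batch_sizes=[batch_size] * (len(bucket_boundaries) + 1)))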