Example #1
0
  def init(self):
    """Build the input pipeline, embedding lookups and projection layer.

    Lines read from the source/target text files are split on whitespace,
    wrapped in ``<s>`` / ``</s>`` markers, mapped to vocabulary ids and
    paired with their token count. Source and target examples are zipped and
    batched with padding.

    Side effects: stores ``global_step``, ``learning_rate``, ``tgt_eos_id``,
    ``tgt_sos_id``, ``target``, ``target_lengths``, ``source_lengths``,
    ``tgt_embed``, ``src_lookup``, ``tgt_lookup`` and ``projection_layer``
    on ``self``.

    Returns:
      A tuple ``(batched_iterator, src_files, tgt_files)``: the initializable
      iterator over the padded batches and the two string placeholders through
      which the source and target file names are fed.
    """
    # Step counter and the staircase exponential decay schedule driven by it.
    global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_step = global_step
    learning_rate = tf.train.exponential_decay(1e-2, global_step, 500, 0.95, staircase=True)
    self.learning_rate = learning_rate

    # Vocabulary lookup tables; tokens missing from the file map to id 0.
    src_table = tf.contrib.lookup.index_table_from_file('./iwslt15/vocab.en', default_value=0)
    tgt_table = tf.contrib.lookup.index_table_from_file('./iwslt15/vocab.vi', default_value=0)

    # Sizes are fixed constants here instead of being queried via table.size().
    src_table_size = 17191
    tgt_table_size = 7709

    # Ids of the sentence boundary markers.
    src_eos_id = tf.cast(src_table.lookup(tf.constant('</s>')), tf.int64)
    tgt_eos_id = tf.cast(tgt_table.lookup(tf.constant('</s>')), tf.int64)
    tgt_sos_id = tf.cast(tgt_table.lookup(tf.constant('<s>')), tf.int64)
    self.tgt_eos_id = tgt_eos_id
    self.tgt_sos_id = tgt_sos_id

    # Placeholders that receive the lists of input file names.
    src_files = tf.placeholder(tf.string, shape=[None])
    tgt_files = tf.placeholder(tf.string, shape=[None])

    def to_ids_with_length(lines, table):
      """Turn a dataset of text lines into (token ids, token count) pairs."""
      marked = lines.map(
          lambda string: tf.concat([['<s>'], tf.string_split([string]).values, ['</s>']], 0))
      sized = marked.map(lambda words: (words, tf.size(words)))
      return sized.map(lambda words, size: (table.lookup(words), size))

    # One text line per example.
    src_lines = tf.contrib.data.TextLineDataset(src_files)
    tgt_lines = tf.contrib.data.TextLineDataset(tgt_files)

    src_dataset = to_ids_with_length(src_lines, src_table)
    tgt_dataset = to_ids_with_length(tgt_lines, tgt_table)

    # Pair every source example with its target example.
    dataset = tf.contrib.data.Dataset.zip((src_dataset, tgt_dataset))

    # Pad id sequences with the end-of-sentence id; lengths are scalars.
    sequence_shape = tf.TensorShape([None])
    scalar_shape = tf.TensorShape([])
    batched_dataset = dataset.padded_batch(
        self.batch_size,
        padded_shapes=((sequence_shape, scalar_shape), (sequence_shape, scalar_shape)),
        padding_values=((src_eos_id, 0), (tgt_eos_id, 0)))
    batched_iterator = batched_dataset.make_initializable_iterator()
    (source, source_lengths), (target, target_lengths) = batched_iterator.get_next()

    self.target = target
    self.target_lengths = target_lengths
    self.source_lengths = source_lengths

    # Randomly initialised embedding matrices with a fixed 100000 rows each.
    embedding_shape = [100000, self.embed_vector_size]
    src_embed = tf.Variable(tf.random_normal(embedding_shape, stddev=0.1))
    tgt_embed = tf.Variable(tf.random_normal(embedding_shape, stddev=0.1))
    self.tgt_embed = tgt_embed

    self.src_lookup = tf.nn.embedding_lookup(src_embed, source)
    self.tgt_lookup = tf.nn.embedding_lookup(tgt_embed, target)

    # Dense layer with one output unit per target vocabulary entry.
    self.projection_layer = layers_core.Dense(tgt_table_size)

    return batched_iterator, src_files, tgt_files
Example #2
0
 def decode_libsvm(line):
     """Parse one libsvm-style text line into a feature dict and a label.

     The line is split on single spaces. The first token is converted to a
     float32 label; every remaining token is split on ':' and the pieces are
     arranged as a two-column matrix (id column, value column).

     Args:
         line: scalar string tensor holding one line of text.

     Returns:
         A tuple ``(features, label)`` where ``features`` has the keys
         ``"feat_ids"`` (int32) and ``"feat_vals"`` (float32).
     """
     fields = tf.string_split([line], ' ').values
     label = tf.string_to_number(fields[0], out_type=tf.float32)

     pairs = tf.string_split(fields[1:], ':')
     # The reshape relies on every token producing the same number of pieces.
     pair_matrix = tf.reshape(pairs.values, pairs.dense_shape)
     raw_ids, raw_vals = tf.split(pair_matrix, num_or_size_splits=2, axis=1)

     features = {
         "feat_ids": tf.string_to_number(raw_ids, out_type=tf.int32),
         "feat_vals": tf.string_to_number(raw_vals, out_type=tf.float32),
     }
     return features, label
Example #3
0
 def create_char_vectors_from_post(self, raw_post, mxlen):
     """Convert a raw post into character indices.

     The post is optionally lowercased (when ``self.do_lowercase`` is set),
     split into whitespace-separated words, each word is cut to at most
     ``self.mxwlen`` leading positions, and the words are then split into
     single characters that are looked up in ``self.index``.

     Args:
         raw_post: string tensor with the post text.
         mxlen: first dimension of the shape passed to ``reshape_indices``.

     Returns:
         Whatever ``self.reshape_indices`` returns for the looked-up character
         indices and the target shape ``[mxlen, self.mxwlen]``.
     """
     post = self.lowercase(raw_post) if self.do_lowercase else raw_post
     words = tf.string_split(tf.reshape(post, [-1])).values
     # Truncate long words before exploding them into characters.
     clipped_words = tf.substr(words, 0, self.mxwlen)
     chars = tf.string_split(clipped_words, delimiter='')
     char_ids = self.index.lookup(chars)
     return self.reshape_indices(char_ids, [mxlen, self.mxwlen])
    def __init__(self, args, txt_file, num_classes, mode, batch_size, num_preprocess_threads=1, shuffle=True,
                 min_queue_examples=1):
        """Build a queue-based batch pipeline from a text file of image paths.

        Each line of ``txt_file`` is split on whitespace. In 'training' and
        'validation' mode the first token is the RGB image path and the second
        the label image path; both images are flipped left-right together with
        probability 0.5 and then shuffled into batches. In 'test' mode only the
        first token (the RGB image path) is used and batches are not shuffled.
        Any other ``mode`` value creates no batch tensors.

        Args:
            args: object providing ``imageHeight``, ``imageWidth`` and
                ``imageChannels``.
            txt_file: path of the text file listing the image paths.
            num_classes: number of classes; stored as an int.
            mode: 'training', 'validation' or 'test'.
            batch_size: number of examples per batch.
            num_preprocess_threads: number of threads used by the batching queue.
            shuffle: accepted for interface compatibility; not read here.
            min_queue_examples: used for the queue capacity and, in
                training/validation mode, as ``min_after_dequeue``.
        """
        self.args = args
        self.txt_file = txt_file
        self.num_preprocess_threads = num_preprocess_threads
        self.min_queue_examples = min_queue_examples
        self.batch_size = batch_size
        self.mode = mode
        self.imgShape = [self.args.imageHeight, self.args.imageWidth, self.args.imageChannels]
        self.maskShape = tf.stack([self.args.imageHeight, self.args.imageWidth])
        self.num_classes = int(num_classes)

        input_queue = tf.train.string_input_producer([txt_file], shuffle=False)
        line_reader = tf.TextLineReader()
        _, line = line_reader.read(input_queue)
        # Split the line once; both mode branches reuse these tokens instead of
        # adding another identical split op to the graph.
        split_line = tf.string_split([line]).values

        if mode in ('training', 'validation'):
            rgb_image_path = split_line[0]
            label_image_path = split_line[1]

            self.image_o = self.read_image(rgb_image_path, 0)
            self.label_image_o = self.read_image(label_image_path, 1)

            # A single random draw decides the flip for both tensors so the
            # image and its label stay aligned.
            do_flip = tf.random_uniform([], 0, 1)
            self.image = tf.cond(do_flip > 0.5, lambda: tf.image.flip_left_right(self.image_o), lambda: self.image_o)
            self.label_image = tf.cond(do_flip > 0.5, lambda: tf.image.flip_left_right(self.label_image_o),
                                       lambda: self.label_image_o)

            # Static shapes are required by the batching queue.
            self.image.set_shape((self.args.imageHeight, self.args.imageWidth, 3))
            self.label_image.set_shape((self.args.imageHeight, self.args.imageWidth, 1))

            self.img_batch, self.label_batch = tf.train.shuffle_batch([self.image, self.label_image],
                                                                      batch_size=batch_size,
                                                                      num_threads=num_preprocess_threads,
                                                                      capacity=min_queue_examples + 3 * batch_size,
                                                                      min_after_dequeue=min_queue_examples)

        elif mode == 'test':
            print('Generating test Image Batch')

            rgb_image_path = split_line[0]
            self.image = self.read_image(rgb_image_path, 0)

            self.image.set_shape((self.args.imageHeight, self.args.imageWidth, 3))

            self.img_batch = tf.train.batch([self.image],
                                            batch_size=batch_size,
                                            num_threads=num_preprocess_threads,
                                            capacity=min_queue_examples + 1 * batch_size,
                                            )
Example #5
0
 def decode_libsvm(line):
     """Decode a libsvm-formatted line into ``(features, label)``.

     Tokens are separated by single spaces. The leading token becomes the
     float32 label; each following token is split on ':' into an id part and
     a value part.

     Args:
         line: scalar string tensor with one line of text.

     Returns:
         ``({"feat_ids": int32 tensor, "feat_vals": float32 tensor}, label)``.
     """
     tokens = tf.string_split([line], ' ').values
     label = tf.string_to_number(tokens[0], out_type=tf.float32)

     id_value_pairs = tf.string_split(tokens[1:], ':')
     # Dense view of the pairs; requires the same piece count for every token.
     dense_pairs = tf.reshape(id_value_pairs.values, id_value_pairs.dense_shape)
     id_column, value_column = tf.split(dense_pairs, num_or_size_splits=2, axis=1)

     feat_ids = tf.string_to_number(id_column, out_type=tf.int32)
     feat_vals = tf.string_to_number(value_column, out_type=tf.float32)
     return {"feat_ids": feat_ids, "feat_vals": feat_vals}, label
Example #6
0
def _parse_line(line):
    """Parse a tab-separated line into a feature dict and an int32 label.

    The line is split on tabs: column 0 is kept as a string under ``"id"``,
    column 1 is converted to the int32 label, and column 2 is split on commas
    and converted to int32 values stored under ``"words"``.

    Args:
        line: scalar string tensor with one line of text.

    Returns:
        A tuple ``(features, label)``.
    """
    columns = tf.string_split([line], '\t').values
    user_id = columns[0]
    label = tf.string_to_number(columns[1], out_type=tf.int32)
    word_tokens = tf.string_split([columns[2]], ",").values
    features = {
        "words": tf.string_to_number(word_tokens, tf.int32),
        "id": user_id,
    }
    return features, label
Example #7
0
  def _get_labels_builder(self, labels_file):
    """Return the labels dataset and the functions used to process and pad it.

    Args:
      labels_file: path of the text file with one line of tags per example.

    Returns:
      A tuple ``(dataset, process_fn, padded_shapes_fn)``. ``dataset`` yields
      the raw text lines. ``process_fn`` maps a line to a dict with the
      whitespace-separated ``"tags"`` and their vocabulary ids ``"tags_id"``.
      ``padded_shapes_fn`` returns the padded shapes of those two entries.
    """
    labels_vocabulary = tf.contrib.lookup.index_table_from_file(
        self.labels_vocabulary_file,
        vocab_size=self.num_labels)

    dataset = tf.data.TextLineDataset(labels_file)

    def process_fn(x):
      # Split once and reuse the tokens for both outputs instead of creating
      # two identical split ops per element.
      tags = tf.string_split([x]).values
      return {
          "tags": tags,
          "tags_id": labels_vocabulary.lookup(tags)
      }

    def padded_shapes_fn():
      return {
          "tags": [None],
          "tags_id": [None]
      }

    return dataset, process_fn, padded_shapes_fn
Example #8
0
def get_predict_iterator(src_vocab_table, vocab_size, batch_size, max_len=max_sequence):
    """Build the batched input used at prediction time.

    Lines of ``pred_file`` are split on whitespace, optionally truncated to
    ``max_len`` tokens, mapped to int32 vocabulary ids and batched with
    padding.

    Args:
        src_vocab_table: lookup table mapping tokens to ids.
        vocab_size: vocabulary size; ``vocab_size + 1`` is the padding id.
        batch_size: number of sentences per batch.
        max_len: maximum number of tokens kept per sentence. A falsy value
            disables truncation.

    Returns:
        A ``BatchedInput`` whose ``target_input`` is an unused placeholder and
        whose target sequence length equals the source sequence length.
    """
    pred_dataset = tf.contrib.data.TextLineDataset(pred_file)
    pred_dataset = pred_dataset.map(
        lambda src: tf.string_split([src]).values)
    if max_len:
        # Honour the caller-supplied limit rather than the module-level default.
        pred_dataset = pred_dataset.map(lambda src: src[:max_len])

    pred_dataset = pred_dataset.map(
        lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))

    pred_dataset = pred_dataset.map(lambda src: (src, tf.size(src)))

    def batching_func(x):
        return x.padded_batch(
            batch_size,
            padded_shapes=(tf.TensorShape([None]),  # src
                           tf.TensorShape([])),  # src_len
            padding_values=(vocab_size + 1,  # src
                            0))  # src_len -- unused

    batched_dataset = batching_func(pred_dataset)
    batched_iter = batched_dataset.make_initializable_iterator()
    (src_ids, src_seq_len) = batched_iter.get_next()

    # target_input is not needed for prediction, but returning None raises an
    # error, so a placeholder stands in for it. It is never actually used; its
    # second dimension is an arbitrary value.
    placeholder_width = 10
    fake_tag = tf.placeholder(tf.int32, [None, placeholder_width])
    return BatchedInput(
        initializer=batched_iter.initializer,
        source=src_ids,
        target_input=fake_tag,
        source_sequence_length=src_seq_len,
        target_sequence_length=src_seq_len)
Example #9
0
 def lowercase(self, raw_post):
     """Return ``raw_post`` rebuilt character by character through a lookup.

     The text is split into single characters and each one is looked up in
     ``self.upchars_lut``. Characters whose looked-up index is greater than 25
     are kept unchanged; the others are replaced by ``self.lchars[index]``.
     The resulting characters are joined back into one string.

     Args:
         raw_post: string tensor with the text to convert.

     Returns:
         A scalar string tensor with all converted characters concatenated.
     """
     chars = tf.string_split(tf.reshape(raw_post, [-1]), delimiter="").values
     table_ids = self.upchars_lut.lookup(chars)

     def pick_char(pair):
         # pair is (looked-up index, original character).
         return tf.cond(pair[0] > 25,
                        lambda: pair[1],
                        lambda: self.lchars[pair[0]])

     converted = tf.map_fn(pick_char, (table_ids, chars), dtype=tf.string)
     return tf.reduce_join(converted)
    def _decode_and_resize(image_tensor):
      """Load a JPEG given as file path or base64 string and resize it.

      Empty inputs are replaced by ``IMAGE_DEFAULT_STRING``. An input that
      contains at least one '/' character is read from disk as a file path;
      any other input is decoded as base64. The JPEG bytes are decoded with
      3 channels, resized bilinearly to 299x299 and returned as uint8.
      """
      # Target geometry expected by Inception v3.
      target_size = [299, 299]
      num_channels = 3

      image_tensor = tf.where(tf.equal(image_tensor, ''), IMAGE_DEFAULT_STRING, image_tensor)

      # Count '/' characters to tell file paths apart from base64 payloads.
      chars = tf.string_split([image_tensor], delimiter="").values
      slash_count = tf.count_nonzero(tf.equal(chars, '/'))
      is_file_path = tf.cast(slash_count, tf.bool)

      # Both branches are named functions so that each op is created inside
      # its own tf.cond branch.
      def _read_file():
        return tf.read_file(image_tensor)

      def _decode_base64():
        return tf.decode_base64(image_tensor)

      encoded = tf.cond(is_file_path, lambda: _read_file(), lambda: _decode_base64())
      decoded = tf.image.decode_jpeg(encoded, channels=num_channels)
      # resize_bilinear works on batches, so add and then remove a batch axis.
      batched = tf.expand_dims(decoded, 0)
      resized = tf.image.resize_bilinear(batched, target_size, align_corners=False)
      unbatched = tf.squeeze(resized, squeeze_dims=[0])
      return tf.cast(unbatched, dtype=tf.uint8)
Example #11
0
def get_test_iterator(src_dataset, src_vocab_table, batch_size, config):
    """Build the batched source-only input used for testing.

    Every source line is split on whitespace, truncated to
    ``config.src_max_len`` tokens, mapped to int32 vocabulary ids, optionally
    reversed (``config.reverse_src``) and paired with its length. Batches are
    padded with the id of ``config.eos``.

    NOTE(review): the ``batch_size`` argument is not read; batches are sized
    by ``config.batch_size``. Confirm which one callers expect.

    Args:
        src_dataset: dataset of source text lines.
        src_vocab_table: lookup table mapping tokens to ids.
        batch_size: currently unused (see note above).
        config: object providing ``eos``, ``src_max_len``, ``reverse_src`` and
            ``batch_size``.

    Returns:
        A ``BatchedInput`` with all target fields set to ``None``.
    """
    eos_id = tf.cast(src_vocab_table.lookup(tf.constant(config.eos)), tf.int32)

    def tokens_to_ids(tokens):
        return tf.cast(src_vocab_table.lookup(tokens), tf.int32)

    ids = src_dataset.map(lambda src: tf.string_split([src]).values)
    ids = ids.map(lambda src: src[:config.src_max_len])
    ids = ids.map(tokens_to_ids)

    if config.reverse_src:
        ids = ids.map(lambda src: tf.reverse(src, axis=[0]))

    ids_with_length = ids.map(lambda src: (src, tf.size(src)))

    # Sequences are padded with the end-of-sentence id; lengths are scalars.
    batched_dataset = ids_with_length.padded_batch(
        config.batch_size,
        padded_shapes=(tf.TensorShape([None]),
                       tf.TensorShape([])),
        padding_values=(eos_id,
                        0))

    iterator = batched_dataset.make_initializable_iterator()
    src_ids, src_seq_len = iterator.get_next()
    return BatchedInput(
        initializer=iterator.initializer,
        source=src_ids,
        target_input=None,
        target_output=None,
        source_sequence_length=src_seq_len,
        target_sequence_length=None)
def custom_fast_text(features, labels, mode, params):
    """Estimator model function: embedding, global max pooling, dense logits.

    The text feature is split on whitespace, padded with ``commons.PAD_WORD``,
    mapped to vocabulary ids and padded/cut to
    ``commons.CNN_MAX_DOCUMENT_LENGTH`` ids per document. The ids go through an
    embedding layer, global max pooling and a dense layer that produces
    ``commons.TARGET_SIZE`` logits; sigmoid turns them into probabilities.

    A probability counts as class 1 when it exceeds 0.30. The same threshold
    is used for the exported ``class`` output, the ``acc`` summary and the
    evaluation accuracy metric. (Casting the raw probabilities to int32 would
    truncate almost all of them to 0.)

    Args:
        features: dict holding the text under ``commons.FEATURE_COL``.
        labels: label tensor; not used in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: object providing ``N_WORDS``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the PREDICT, TRAIN and EVAL modes.
    """
    # Probability above which an output is assigned to the positive class.
    decision_threshold = 0.30

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.CNN_MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length, then cut
    # every row back to exactly that length.
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.CNN_MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(params.N_WORDS, 20, input_length=commons.CNN_MAX_DOCUMENT_LENGTH)(
        word_id_vector)
    f1 = tf.keras.layers.GlobalMaxPooling1D()(embedded_sequences)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.sigmoid(logits)
    # Vectorised thresholding, shared by every consumer of class decisions.
    predicted_classes = tf.cast(predictions > decision_threshold, dtype=tf.int32)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': tf.squeeze(predicted_classes),
        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs)

    loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels, logits=logits)

    tf.summary.scalar('loss', loss)

    # Compare thresholded decisions, not truncated probabilities, to labels.
    acc = tf.equal(predicted_classes, tf.cast(labels, dtype=tf.int32))
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
Example #13
0
 def decode_libsvm(line):
     """Turn a libsvm text line into a feature dict plus its label.

     The line is split on single spaces; the first piece is parsed as the
     float32 label. The remaining pieces are split on ':' and laid out as a
     matrix whose two columns are parsed as int32 ids and float32 values.

     Args:
         line: scalar string tensor with one line of text.

     Returns:
         ``(features, label)`` with ``features["feat_ids"]`` and
         ``features["feat_vals"]``.
     """
     pieces = tf.string_split([line], ' ').values
     label = tf.string_to_number(pieces[0], out_type=tf.float32)

     sparse_pairs = tf.string_split(pieces[1:], ':')
     # Works only when every piece splits into the same number of parts.
     pair_table = tf.reshape(sparse_pairs.values, sparse_pairs.dense_shape)
     ids_as_text, vals_as_text = tf.split(pair_table, num_or_size_splits=2, axis=1)

     features = {
         "feat_ids": tf.string_to_number(ids_as_text, out_type=tf.int32),
         "feat_vals": tf.string_to_number(vals_as_text, out_type=tf.float32),
     }
     return features, label
def sparse_from_csv(csv):
  """Convert CSV rows of ``id,tag1|tag2|...`` into a sparse tensor of tag ids.

  Args:
    csv: string tensor of CSV records with an integer column followed by a
      string column of '|'-separated tags.

  Returns:
    A ``tf.SparseTensor`` with the indices and dense shape of the split tags
    and, as values, each tag's index in ``TAG_SET`` (-1 for unknown tags).
  """
  _, tags_column = tf.decode_csv(csv, [[-1], [""]])
  # Lookup table from tag string to its position in TAG_SET.
  tag_table = tf.contrib.lookup.index_table_from_tensor(
      mapping=TAG_SET, default_value=-1)
  tags = tf.string_split(tags_column, "|")
  # Replace each tag string by the index found in the table.
  tag_ids = tag_table.lookup(tags.values)
  return tf.SparseTensor(
      indices=tags.indices,
      values=tag_ids,
      dense_shape=tags.dense_shape)
Example #15
0
 def resize_sen(self, raw, mxlen):
     """Keep at most the first ``mxlen`` whitespace-separated tokens of ``raw``.

     Args:
         raw: string tensor with the text to shorten.
         mxlen: maximum number of tokens to keep.

     Returns:
         A scalar string tensor with the kept tokens joined by single spaces.
     """
     tokens = tf.string_split(tf.reshape(raw, [-1])).values
     # Slicing is a no-op when there are fewer than mxlen tokens.
     kept_tokens = tokens[:mxlen]
     return tf.reduce_join(kept_tokens, separator=" ")
  def testStringSplit(self):
    """Splitting with the default delimiter yields one token per word."""
    strings = ["pigs on the wing", "animals"]
    expected_indices = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]
    expected_values = [b"pigs", b"on", b"the", b"wing", b"animals"]
    expected_shape = [2, 4]

    with self.test_session() as sess:
      indices, values, shape = sess.run(tf.string_split(strings))
      self.assertAllEqual(indices, expected_indices)
      self.assertAllEqual(values, expected_values)
      self.assertAllEqual(shape, expected_shape)
  def testStringSplitEmptyToken(self):
    """Surrounding spaces and empty strings produce no tokens."""
    strings = [" hello ", "", "world "]
    expected_indices = [[0, 0], [2, 0]]
    expected_values = [b"hello", b"world"]
    expected_shape = [3, 1]

    with self.test_session() as sess:
      indices, values, shape = sess.run(tf.string_split(strings))
      self.assertAllEqual(indices, expected_indices)
      self.assertAllEqual(values, expected_values)
      self.assertAllEqual(shape, expected_shape)
Example #18
0
  def has_no_question_marks(line):
    """Return a boolean tensor that is True when ``line`` contains no '?'."""
    # An empty delimiter splits the line into its individual characters.
    characters = tf.string_split(line[tf.newaxis], "").values
    contains_question = tf.reduce_any(tf.equal(characters, "?"))
    return tf.logical_not(contains_question)
Example #19
0
 def create_word_vectors_from_post(self, raw_post, mxlen):
     """Convert a raw post into word indices.

     The post is optionally lowercased (when ``self.do_lowercase`` is set),
     split on whitespace and looked up in ``self.index``. The sparse result is
     flattened to one dimension before being handed to ``reshape_indices``.

     Args:
         raw_post: string tensor with the post text.
         mxlen: length passed to ``reshape_indices`` as target shape ``[mxlen]``.

     Returns:
         Whatever ``self.reshape_indices`` returns for the flattened indices.
     """
     post = self.lowercase(raw_post) if self.do_lowercase else raw_post
     sparse_words = tf.string_split(tf.reshape(post, [-1]))
     sparse_ids = self.index.lookup(sparse_words)
     # Collapse the sparse ids into a single dimension.
     flat_ids = tf.sparse_reshape(sparse_ids, shape=[-1])
     return self.reshape_indices(flat_ids, [mxlen])
Example #20
0
    def _create_word_vectors_from_post_mixed_case(self, nraw_post, mxlen):
        """Convert a post into word indices without changing its case.

        The post is split on whitespace, the tokens are looked up in
        ``self.word2index`` and the sparse result is flattened to one
        dimension before being handed to ``_reshape_indices``.

        Args:
            nraw_post: string tensor with the post text.
            mxlen: length passed to ``_reshape_indices`` as target shape
                ``[mxlen]``.

        Returns:
            Whatever ``self._reshape_indices`` returns for the flattened
            indices.
        """
        sparse_tokens = tf.string_split(tf.reshape(nraw_post, [-1]))
        sparse_ids = self.word2index.lookup(sparse_tokens)
        # Collapse the sparse ids into a single dimension.
        flat_ids = tf.sparse_reshape(sparse_ids, shape=[-1])
        return self._reshape_indices(flat_ids, [mxlen])
Example #21
0
  def testStringSplitWithDelimiter(self):
    """Explicit delimiters: list delimiters are rejected, strings are used."""
    strings = ["hello|world", "hello world"]

    with self.test_session() as sess:
      # A delimiter given as a list raises at graph construction time.
      for bad_delimiter in (["|", ""], ["a"]):
        with self.assertRaises(ValueError):
          tf.string_split(strings, delimiter=bad_delimiter)

      # Only '|' separates tokens, so the space stays inside the token.
      indices, values, shape = sess.run(tf.string_split(strings, delimiter="|"))
      self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
      self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
      self.assertAllEqual(shape, [2, 2])

      # Every character of the delimiter string acts as a separator.
      indices, values, shape = sess.run(tf.string_split(strings, delimiter="| "))
      self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
      self.assertAllEqual(values, [b"hello", b"world", b"hello", b"world"])
      self.assertAllEqual(shape, [2, 2])
  def testStringSplitEmptyDelimiter(self):
    """An empty delimiter splits every string into single characters."""
    strings = ["hello", "hola"]
    expected_indices = [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                        [1, 0], [1, 1], [1, 2], [1, 3]]
    expected_values = [b"h", b"e", b"l", b"l", b"o", b"h", b"o", b"l", b"a"]

    with self.test_session() as sess:
      indices, values, shape = sess.run(tf.string_split(strings, delimiter=""))
      self.assertAllEqual(indices, expected_indices)
      self.assertAllEqual(values, expected_values)
      self.assertAllEqual(shape, [2, 5])
  def preprocessing_fn(inputs):
    """Preprocessing function.

    Applies the transform configured for each entry of ``features`` to the
    raw input column named by that entry's ``source_column``.

    Args:
      inputs: dictionary of raw input tensors

    Returns:
      A dictionary of transformed tensors

    Raises:
      ValueError: if a feature names an unknown transform.
    """
    stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_dir, STATS_FILE)).decode())

    result = {}
    for name, transform in six.iteritems(features):
      transform_name = transform['transform']
      source_column = transform['source_column']

      if transform_name == TARGET_TRANSFORM:
        if not keep_target:
          continue
        # A target with a vocabulary file is treated as categorical; one
        # without is passed through unchanged.
        if file_io.file_exists(os.path.join(output_dir, VOCAB_ANALYSIS_FILE % source_column)):
          transform_name = 'one_hot'
        else:
          transform_name = 'identity'

      if transform_name == 'identity':
        result[name] = inputs[source_column]
      elif transform_name == 'scale':
        # Read the raw tensor by its source column, like every other branch
        # and like the stats lookup below; the feature name may differ from
        # the column name.
        result[name] = _scale(
            inputs[source_column],
            min_x_value=stats['column_stats'][source_column]['min'],
            max_x_value=stats['column_stats'][source_column]['max'],
            output_min=transform.get('value', 1) * (-1),
            output_max=transform.get('value', 1))
      elif transform_name in [ONE_HOT_TRANSFORM, MULTI_HOT_TRANSFORM]:
        vocab, _ = read_vocab_file(
            os.path.join(output_dir, VOCAB_ANALYSIS_FILE % source_column))
        if transform_name == MULTI_HOT_TRANSFORM:
          # Multi-hot columns hold several tokens joined by a separator.
          separator = transform.get('separator', ' ')
          tokens = tf.string_split(inputs[source_column], separator)
          result[name] = _string_to_int(tokens, vocab)
        else:
          result[name] = _string_to_int(inputs[source_column], vocab)
      elif transform_name == IMAGE_TRANSFORM:
        make_image_to_vec_fn = _make_image_to_vec_tito(
            name, checkpoint=transform.get('checkpoint', None))
        result[name] = make_image_to_vec_fn(inputs[source_column])
      else:
        raise ValueError('unknown transform %s' % transform_name)
    return result
  def _tokenize(self, sentences):
    """Strip punctuation from ``sentences`` and split them on spaces.

    Args:
      sentences: string tensor of sentences.

    Returns:
      A tuple ``(indices, values, dense_shape)`` describing the sparse tensor
      of tokens.
    """
    # Minimal normalisation: drop every character matching the \pP class.
    without_punctuation = tf.strings.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    tokens = tf.string_split(without_punctuation, " ")

    # Corner case: a sentence with no tokens gets a single "" token.
    tokens, _ = tf.sparse.fill_empty_rows(tokens, tf.constant(""))
    # Corner case: all sentences are empty, so the shape is reset.
    tokens = tf.sparse.reset_shape(tokens)

    return tokens.indices, tokens.values, tokens.dense_shape
Example #25
0
  def testStringSplitEmptyDelimiter(self):
    """An empty delimiter splits strings into single bytes."""
    # The last string is U+1F60E encoded as four UTF-8 bytes.
    strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"]
    expected_indices = [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                        [1, 0], [1, 1], [1, 2], [1, 3],
                        [2, 0], [2, 1], [2, 2], [2, 3]]
    expected_values = np.array(
        ['h', 'e', 'l', 'l', 'o', 'h', 'o', 'l',
         'a', b'\xf0', b'\x9f', b'\x98', b'\x8e'], dtype='|S1')

    with self.test_session() as sess:
      indices, values, shape = sess.run(tf.string_split(strings, delimiter=""))
      self.assertAllEqual(indices, expected_indices)
      self.assertAllEqual(values.tolist(), expected_values)
      self.assertAllEqual(shape, [3, 5])
Example #26
0
def parse(line):
  """Parse a ``color_name,r,g,b`` line from the colors dataset.

  Args:
    line: scalar string tensor with one comma-separated line.

  Returns:
    A tuple ``(rgb, chars, length)``: the three colour values divided by 255,
    the colour name as a sequence of one-hot vectors of depth 256 (one per
    byte of the name), and the int64 number of those vectors.
  """
  fields = tf.string_split([line], ",").values
  # Everything after the name is a colour channel, scaled by 1/255.
  rgb = tf.string_to_number(fields[1:], out_type=tf.float32) / 255.
  # One-hot encode the raw bytes of the colour name.
  name_bytes = tf.decode_raw(fields[0], tf.uint8)
  chars = tf.one_hot(name_bytes, depth=256)
  # The sequence length is needed by the RNN that consumes these examples.
  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)
  return rgb, chars, length
def cnn_model(features, target, mode):
    """Model function: word embeddings, one convolution, linear classifier.

    Titles are split on whitespace, padded with ``PADWORD``, mapped to
    vocabulary ids and padded/cut to ``MAX_DOCUMENT_LENGTH`` ids. The ids are
    embedded, convolved, passed through ReLU and fed to a fully connected
    layer with one output per entry of ``TARGETS``.

    Args:
        features: dict holding the titles under ``'title'``.
        target: integer class labels; used in TRAIN and EVAL modes.
        mode: a ``tf.contrib.learn.ModeKeys`` value.

    Returns:
        A ``ModelFnOps`` with the predictions dict (``'source'``, ``'class'``,
        ``'prob'``) and, in TRAIN and EVAL modes, the loss and train op.
    """
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE, num_oov_buckets=1, default_value=-1)

    # string operations
    titles = tf.squeeze(features['title'], [1])
    words = tf.string_split(titles)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)
    # Pad every row by the maximum length, then cut back to exactly that
    # length, so short and long titles end up equally wide.
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    # Report the sliced ids that the message names, not the sparse tokens.
    print('words_sliced={}'.format(sliced))

    # layer to take the words and convert them into vectors (embeddings)
    embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS, embed_dim=EMBEDDING_SIZE)
    print('words_embed={}'.format(embeds))

    # now do convolution
    conv = tf.contrib.layers.conv2d(embeds, 1, WINDOW_SIZE, stride=STRIDE, padding='SAME')
    conv = tf.nn.relu(conv)
    # Drop the single output channel; kept under its own name so the sparse
    # `words` tensor above is not shadowed.
    conv_features = tf.squeeze(conv, [2])
    print('words_conv={}'.format(conv_features))

    n_classes = len(TARGETS)

    logits = tf.contrib.layers.fully_connected(conv_features, n_classes, activation_fn=None)
    predictions_dict = {
        'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tflearn.ModelFnOps(
        mode=mode,
        predictions=predictions_dict,
        loss=loss,
        train_op=train_op)
Example #28
0
def model_fn(features, labels, mode, params):
    """Estimator model function: three embeddings feeding Conv1D branches.

    The text feature is split on whitespace, padded with ``commons.PAD_WORD``,
    mapped to vocabulary ids and padded/cut to ``commons.MAX_DOCUMENT_LENGTH``
    ids. The ids are embedded with 100, 200 and 300 dimensions; every
    embedding goes through a zero-padded Conv1D + global max pooling branch
    for each filter size (3 and 5). The pooled outputs are concatenated and
    passed through two dense layers to a single logit.

    Args:
        features: dict holding the text under ``commons.FEATURE_COL``.
        labels: label tensor or ``None``; reshaped to ``[-1, 1]`` when given.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: object providing ``N_WORDS``.

    Returns:
        The estimator spec created by ``head.create_estimator_spec``.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    tf.keras.backend.set_learning_phase(is_training)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    sparse_words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(sparse_words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    # Pad every row by the maximum document length, then cut back to exactly
    # that length.
    max_len = commons.MAX_DOCUMENT_LENGTH
    word_ids_padded = tf.pad(word_ids, tf.constant([[0, 0], [0, max_len]]))
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, max_len])

    # One embedding of the same ids per embedding width.
    embeddings = [
        tf.keras.layers.Embedding(params.N_WORDS, width, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
        for width in (100, 200, 300)
    ]

    def conv_pool(text_embedding, filter_size):
        """Zero-pad, convolve and max-pool one embedding for one filter size."""
        edge = filter_size - 1
        padded = tf.keras.layers.ZeroPadding1D((edge, edge))(text_embedding)
        convolved = tf.keras.layers.Conv1D(
            filters=32, kernel_size=filter_size, padding='same', activation='tanh')(padded)
        return tf.keras.layers.GlobalMaxPool1D()(convolved)

    filter_sizes = [3, 5]
    conv_pools = [
        conv_pool(text_embedding, filter_size)
        for text_embedding in embeddings
        for filter_size in filter_sizes
    ]

    merged = tf.keras.layers.Concatenate(axis=1)(conv_pools)
    hidden = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged)
    hidden = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(hidden)

    logits = tf.keras.layers.Dense(1, activation=None)(hidden)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(features=features, labels=labels, mode=mode, logits=logits,
                                      train_op_fn=_train_op_fn)
Example #29
0
  def testStringSplitWithDelimiterTensor(self):
    """A delimiter fed at run time must be a scalar string."""
    strings = ["hello|world", "hello world"]

    with self.test_session() as sess:
      delimiter = tf.placeholder(tf.string)
      tokens = tf.string_split(strings, delimiter=delimiter)

      # Non-scalar delimiters are rejected when the op runs.
      for bad_delimiter in (["a", "b"], ["a"]):
        with self.assertRaises(tf.errors.InvalidArgumentError):
          sess.run(tokens, feed_dict={delimiter: bad_delimiter})

      indices, values, shape = sess.run(tokens, feed_dict={delimiter: "|"})

      self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
      self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
      self.assertAllEqual(shape, [2, 2])
  def decode(self, data, items):
    """Split ``data`` into tokens and return the requested items.

    Args:
      data: scalar string tensor to tokenize on ``self.delimiter``.
      items: names of the items to return; each must be
        ``self.length_feature_name`` or ``self.tokens_feature_name``.

    Returns:
      A list with one tensor per entry of ``items``, in the same order: the
      token tensor and/or its size. The size counts the optional prepended
      and appended special tokens.
    """
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optional special token in front of the sequence.
    if self.prepend_token is not None:
      tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optional special token at the end of the sequence.
    if self.append_token is not None:
      tokens = tf.concat([tokens, [self.append_token]], 0)

    available = {
        self.length_feature_name: tf.size(tokens),
        self.tokens_feature_name: tokens,
    }
    return [available[item] for item in items]
Example #31
0
def model_fn(features, labels, mode, params):
    """Estimator model function for a two-class CNN text classifier.

    The text is tokenized, mapped to ids through the vocabulary file, brought
    to a fixed length, embedded three times (sizes 100/200/300) and passed
    through Conv1D + global max pooling branches that feed two dense layers.

    Args:
        features: Mapping that contains the text under `commons.FEATURE_COL`.
        labels: Class indices used for the loss and accuracy (not used in
            PREDICT mode).
        mode: One of `tf.estimator.ModeKeys`.
        params: Object exposing `N_WORDS`, the embedding vocabulary size.

    Returns:
        A `tf.estimator.EstimatorSpec` for PREDICT, TRAIN or EVAL mode.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    tf.keras.backend.set_learning_phase(is_training)

    vocab_table = lookup.index_table_from_file(
        vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)

    # Tokenize, densify (short rows are filled with PAD_WORD) and map to ids.
    sparse_words = tf.string_split(features[commons.FEATURE_COL])
    dense_words = tf.sparse_tensor_to_dense(
        sparse_words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    # Append MAX_DOCUMENT_LENGTH zeros to every row, then crop back to
    # MAX_DOCUMENT_LENGTH columns so each row has exactly that many ids.
    max_len = commons.MAX_DOCUMENT_LENGTH
    paddings = tf.constant([[0, 0], [0, max_len]])
    word_id_vector = tf.slice(tf.pad(word_ids, paddings), [0, 0],
                              [-1, max_len])

    # Three independent embeddings of the same id sequence.
    embedded_inputs = [
        tf.keras.layers.Embedding(
            params.N_WORDS, dim, input_length=max_len)(word_id_vector)
        for dim in (100, 200, 300)
    ]

    # One conv/pool branch per (embedding, kernel size) pair.
    pooled_branches = []
    for embedded in embedded_inputs:
        for kernel_size in (3, 5):
            margin = kernel_size - 1
            padded = tf.keras.layers.ZeroPadding1D((margin, margin))(embedded)
            convolved = tf.keras.layers.Conv1D(filters=32,
                                               kernel_size=kernel_size,
                                               padding='same',
                                               activation='tanh')(padded)
            pooled_branches.append(
                tf.keras.layers.GlobalMaxPool1D()(convolved))

    hidden = tf.keras.layers.Concatenate(axis=1)(pooled_branches)
    for units in (128, 64):
        hidden = tf.keras.layers.Dense(
            units,
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.01))(hidden)

    logits = tf.keras.layers.Dense(2, activation=None)(hidden)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # The dictionary is only used for the export signature; the spec's
        # `predictions` is the softmax tensor itself.
        prediction_dict = {
            'class': prediction_indices,
            'class_index': prediction_indices,
            'probabilities': predictions
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    # Batch accuracy, reported as a summary only.
    hits = tf.equal(tf.cast(prediction_indices, dtype=tf.int64),
                    tf.cast(labels, dtype=tf.int64))
    batch_accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
    tf.summary.scalar('acc', batch_accuracy)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer().minimize(
            loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          train_op=train_op,
                                          loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy':
            tf.metrics.accuracy(labels=labels, predictions=prediction_indices)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
Example #32
0
def get_training_input(filenames, params):
    """Build the batched training features from a pair of parallel text files.

    Args:
        filenames: A list `[source_filename, target_filename]`.
        params: Hyper-parameters. This function reads `buffer_size`,
            `num_threads`, `eos`, `unk`, `vocabulary`, `mapping`,
            `batch_size`, `max_length`, `mantissa_bits`, `device_list`,
            `length_multiplier` and `constant_batch_size`.

    Returns:
        A dictionary with the keys "source", "target", "source_length" and
        "target_length", each mapped to an int32 tensor.
    """

    def _tokenize(src, tgt):
        # Split each line into tokens using tf.string_split's default delimiter.
        return tf.string_split([src]).values, tf.string_split([tgt]).values

    def _append_eos(src, tgt):
        # Terminate both sequences with the <eos> symbol.
        return (tf.concat([src, [tf.constant(params.eos)]], axis=0),
                tf.concat([tgt, [tf.constant(params.eos)]], axis=0))

    def _to_feature_dict(src, tgt):
        # tf.shape of a 1-D tensor is itself a length-1 vector; the extra
        # axis is squeezed away after batching.
        return {
            "source": src,
            "target": tgt,
            "source_length": tf.shape(src),
            "target_length": tf.shape(tgt)
        }

    with tf.device("/cpu:0"):
        src_lines = tf.data.TextLineDataset(filenames[0])
        tgt_lines = tf.data.TextLineDataset(filenames[1])

        dataset = tf.data.Dataset.zip((src_lines, tgt_lines))
        dataset = dataset.shuffle(params.buffer_size)
        dataset = dataset.repeat()

        for transform in (_tokenize, _append_eos, _to_feature_dict):
            dataset = dataset.map(transform,
                                  num_parallel_calls=params.num_threads)

        features = dataset.make_one_shot_iterator().get_next()

        # String -> id lookup tables; unknown words map to the <unk> id.
        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk])
        tgt_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["target"]),
            default_value=params.mapping["target"][params.unk])

        features["source"] = src_table.lookup(features["source"])
        features["target"] = tgt_table.lookup(features["target"])

        features = batch_examples(features,
                                  params.batch_size,
                                  params.max_length,
                                  params.mantissa_bits,
                                  shard_multiplier=len(params.device_list),
                                  length_multiplier=params.length_multiplier,
                                  constant=params.constant_batch_size,
                                  num_threads=params.num_threads)

        # Cast everything to int32, then drop the extra length axis.
        for key in ("source", "target", "source_length", "target_length"):
            features[key] = tf.to_int32(features[key])
        for key in ("source_length", "target_length"):
            features[key] = tf.squeeze(features[key], 1)

        return features
Example #33
0
    def module_fn():
        """Spec function for a token embedding module.

        Registers two signatures that share one `BidirectionalLanguageModel`
        instance and one set of `weight_layers` variables:

        * "tokens": takes pre-tokenized input (`tokens`) together with an
          explicit `sequence_len`.
        * "default": takes raw `strings`, which are split here with
          `tf.string_split`'s default delimiter.

        Both signatures export the outputs "elmo", "default", "word_emb",
        "lstm_outputs1", "lstm_outputs2" and "version".
        """
        # init
        # Special ids; all are above 255, the largest value that a byte
        # decoded as tf.uint8 (see token2ids) can take.
        _bos_id = 256
        _eos_id = 257
        _bow_id = 258
        _eow_id = 259
        _pad_id = 260

        _max_word_length = 50
        _parallel_iterations = 10
        _max_batch_size = 1024

        id_dtype = tf.int32
        id_nptype = np.int32
        max_word_length = tf.constant(_max_word_length,
                                      dtype=id_dtype,
                                      name='max_word_length')

        # Exported unchanged under the "version" key of both signatures.
        version = tf.constant('from_dp_1', dtype=tf.string, name='version')

        # the character representation of the begin/end of sentence characters
        def _make_bos_eos(c):
            # Row of length _max_word_length: [bow, c, eow, pad, pad, ...].
            r = np.zeros([_max_word_length], dtype=id_nptype)
            r[:] = _pad_id
            r[0] = _bow_id
            r[1] = c
            r[2] = _eow_id
            return tf.constant(r, dtype=id_dtype)

        bos_ids = _make_bos_eos(_bos_id)
        eos_ids = _make_bos_eos(_eos_id)

        def token2ids(token):
            # Turns one string token into a vector of _max_word_length ids:
            # [bow] + bytes of the token + [eow] + padding.
            with tf.name_scope("token2ids_preprocessor"):
                char_ids = tf.decode_raw(token,
                                         tf.uint8,
                                         name='decode_raw2get_char_ids')
                char_ids = tf.cast(char_ids, tf.int32, name='cast2int_token')
                # Keep at most max_word_length - 2 bytes so that the bow/eow
                # markers still fit.
                char_ids = tf.strided_slice(char_ids, [0],
                                            [max_word_length - 2], [1],
                                            name='slice2resized_token')
                ids_num = tf.shape(char_ids)[0]
                # Number of pad ids needed to reach _max_word_length in total.
                fill_ids_num = (_max_word_length - 2) - ids_num
                pads = tf.fill([fill_ids_num], _pad_id)
                bow_token_eow_pads = tf.concat(
                    [[_bow_id], char_ids, [_eow_id], pads],
                    0,
                    name='concat2bow_token_eow_pads')
                return bow_token_eow_pads

        def sentence_tagging_and_padding(sen_dim):
            # sen_dim is a pair (sen, dim): the id rows of one sentence and
            # the number of leading rows that are real tokens.
            with tf.name_scope("sentence_tagging_and_padding_preprocessor"):
                sen = sen_dim[0]
                dim = sen_dim[1]
                # Rows beyond `dim` are dropped by the slice below and added
                # back as zero rows after the eos row.
                extra_dim = tf.shape(sen)[0] - dim
                sen = tf.slice(sen, [0, 0], [dim, max_word_length],
                               name='slice2sen')

                bos_sen_eos = tf.concat([[bos_ids], sen, [eos_ids]],
                                        0,
                                        name='concat2bos_sen_eos')
                # Every id is shifted by one before padding, so the rows
                # added by tf.pad below (mode "CONSTANT", no explicit value)
                # differ from any real id.
                bos_sen_eos_plus_one = bos_sen_eos + 1
                bos_sen_eos_pads = tf.pad(bos_sen_eos_plus_one,
                                          [[0, extra_dim], [0, 0]],
                                          "CONSTANT",
                                          name='pad2bos_sen_eos_pads')
                return bos_sen_eos_pads

        # Input placeholders to the biLM.
        tokens = tf.placeholder(shape=(None, None),
                                dtype=tf.string,
                                name='ph2tokens')
        sequence_len = tf.placeholder(shape=(None, ),
                                      dtype=tf.int32,
                                      name='ph2sequence_len')

        # Flatten to a 1-D list of tokens so token2ids can be mapped over it;
        # the original two leading dimensions are restored afterwards.
        tok_shape = tf.shape(tokens)
        line_tokens = tf.reshape(tokens,
                                 shape=[-1],
                                 name='reshape2line_tokens')

        with tf.device('/cpu:0'):
            tok_ids = tf.map_fn(token2ids,
                                line_tokens,
                                dtype=tf.int32,
                                back_prop=False,
                                parallel_iterations=_parallel_iterations,
                                name='map_fn2get_tok_ids')

        tok_ids = tf.reshape(tok_ids, [tok_shape[0], tok_shape[1], -1],
                             name='reshape2tok_ids')
        with tf.device('/cpu:0'):
            sen_ids = tf.map_fn(sentence_tagging_and_padding,
                                (tok_ids, sequence_len),
                                dtype=tf.int32,
                                back_prop=False,
                                parallel_iterations=_parallel_iterations,
                                name='map_fn2get_sen_ids')

        # Build the biLM graph.
        # `options` and `weight_file` come from the enclosing scope.
        bilm = BidirectionalLanguageModel(options,
                                          str(weight_file),
                                          max_batch_size=_max_batch_size)

        embeddings_op = bilm(sen_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        elmo_output = weight_layers('elmo_output', embeddings_op, l2_coef=0.0)

        weighted_op = elmo_output['weighted_op']
        mean_op = elmo_output['mean_op']
        word_emb = elmo_output['word_emb']
        lstm_outputs1 = elmo_output['lstm_outputs1']
        lstm_outputs2 = elmo_output['lstm_outputs2']

        hub.add_signature("tokens", {
            "tokens": tokens,
            "sequence_len": sequence_len
        }, {
            "elmo": weighted_op,
            "default": mean_op,
            "word_emb": word_emb,
            "lstm_outputs1": lstm_outputs1,
            "lstm_outputs2": lstm_outputs2,
            "version": version
        })

        # #########################Next signature############################# #

        # Input placeholders to the biLM.
        # NOTE(review): `(None)` is just `None` (no trailing comma), so this
        # placeholder has a fully unknown shape rather than shape [None] —
        # confirm whether `(None, )` was intended.
        def_strings = tf.placeholder(shape=(None), dtype=tf.string)
        def_tokens_sparse = tf.string_split(def_strings)
        # Densify the split result; missing positions become empty strings.
        def_tokens_dense = tf.sparse_to_dense(
            sparse_indices=def_tokens_sparse.indices,
            output_shape=def_tokens_sparse.dense_shape,
            sparse_values=def_tokens_sparse.values,
            default_value='')
        # The sequence length of each row is its number of non-empty tokens.
        def_mask = tf.not_equal(def_tokens_dense, '')
        def_int_mask = tf.cast(def_mask, dtype=tf.int32)
        def_sequence_len = tf.reduce_sum(def_int_mask, axis=-1)

        # From here on the pipeline mirrors the "tokens" signature above.
        def_tok_shape = tf.shape(def_tokens_dense)
        def_line_tokens = tf.reshape(def_tokens_dense,
                                     shape=[-1],
                                     name='reshape2line_tokens')

        with tf.device('/cpu:0'):
            def_tok_ids = tf.map_fn(token2ids,
                                    def_line_tokens,
                                    dtype=tf.int32,
                                    back_prop=False,
                                    parallel_iterations=_parallel_iterations,
                                    name='map_fn2get_tok_ids')

        def_tok_ids = tf.reshape(def_tok_ids,
                                 [def_tok_shape[0], def_tok_shape[1], -1],
                                 name='reshape2tok_ids')
        with tf.device('/cpu:0'):
            def_sen_ids = tf.map_fn(sentence_tagging_and_padding,
                                    (def_tok_ids, def_sequence_len),
                                    dtype=tf.int32,
                                    back_prop=False,
                                    parallel_iterations=_parallel_iterations,
                                    name='map_fn2get_sen_ids')

        # Get ops to compute the LM embeddings.
        def_embeddings_op = bilm(def_sen_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # reuse=True: called with the same 'elmo_output' name as the first
        # signature's weight_layers call.
        def_elmo_output = weight_layers('elmo_output',
                                        def_embeddings_op,
                                        l2_coef=0.0,
                                        reuse=True)

        def_weighted_op = def_elmo_output['weighted_op']
        def_mean_op = def_elmo_output['mean_op']
        def_word_emb = def_elmo_output['word_emb']
        def_lstm_outputs1 = def_elmo_output['lstm_outputs1']
        def_lstm_outputs2 = def_elmo_output['lstm_outputs2']

        hub.add_signature("default", {"strings": def_strings}, {
            "elmo": def_weighted_op,
            "default": def_mean_op,
            "word_emb": def_word_emb,
            "lstm_outputs1": def_lstm_outputs1,
            "lstm_outputs2": def_lstm_outputs2,
            "version": version
        })
# NOTE(review): `a` and `sess` are not defined in this fragment; presumably
# they are created earlier in the script — confirm. The expected values below
# match the tf.reduce_join documentation example a = [["a", "b"], ["c", "d"]].
tf.reduce_join(a, [0, 1])  #==> ["acbd"]
tf.reduce_join(a, [1, 0])  #==> ["abcd"]
# NOTE(review): per the tf.reduce_join documentation an empty axis list
# performs no reduction, so this presumably returns `a` unchanged rather
# than ["abcd"] — confirm.
tf.reduce_join(a, [])  #==> ["abcd"]

b = tf.convert_to_tensor(["ac"])
c = tf.convert_to_tensor(["bd"])
# Join b and c element-wise, with a single space between the two parts.
d = tf.string_join([b, c], separator=" ", name=None)
print(sess.run(d))

e = tf.reduce_join(a, 0)
# Not routed through sess.run, so in graph mode this prints the tensor
# object rather than its value.
print(tf.string_to_hash_bucket(e, 2))
print(sess.run(tf.string_to_hash_bucket(e, 5)))

# NOTE(review): `f` is not used anywhere in this fragment.
f = tf.string_to_hash_bucket(e, 2)
hw = tf.convert_to_tensor(["hello worls"])
# Prints the sparse split result (indices, values, dense shape).
print(sess.run(tf.string_split(hw, delimiter=' ')))

### Exercise module_1_4

# Create new string tensors with:
#   a) transform str_1 in a way to get [["name: ", "surname: "], ["Jan", "Idziak"]]
#   a') str_1 with argument ["name: Jan", "surname: Idziak"]
#   b) str_2 with argument [["helo ", "world"], ["tensor", "flow"]]
#   b') str_2 with argument ["helo world", "tensorflow"]
#   c) Create simple string tensors with arguments:
#   c') str_3 - ["My name is:"]
#   c'') str_4 - ["Janek"]
#   c''') string_join to obtain ["My name is: Janek"]
#   c''') string_join to obtain ["My name is:__Janek"]
#   c''') string_join to obtain ["My name is:randomseparatorJanek"]
#
Example #35
0
 def loop_body(i, sp):
     """Loop body: appends the tokens of `utterances[i]` to `sp`.

     `utterances` and `src_max_len` are taken from the enclosing scope. When
     `src_max_len` is truthy, only the first `src_max_len` tokens are kept.

     Returns:
         The pair `(i + 1, sp extended with the tokens)`.
     """
     tokens = tf.string_split([utterances[i]]).values
     tokens = tokens[:src_max_len] if src_max_len else tokens
     next_index = tf.add(i, 1)
     return next_index, tf.concat([sp, tokens], axis=0)
Example #36
0
def tf_str_len(s):
    """Return the length of the scalar tf.string `s`.

    The string is split with an empty delimiter, which `tf.string_split`
    documents as yielding one element per byte, so the result counts bytes.
    """
    per_byte = tf.string_split([s], delimiter="")
    return tf.size(per_byte)
Example #37
0
def model_fn(features, labels, mode, params):
    """Estimator model function for word embeddings trained with NCE loss.

    Args:
        features: String tensor of input words.
        labels: String tensor of target words (not used in PREDICT mode).
        mode: One of `tf.estimator.ModeKeys`.
        params: Dict with the keys 'vocabs', 'vocab_size', 'embedding_size',
            'num_neg_samples' and 'learning_rate'; PREDICT mode additionally
            reads 'vocab_file' and 'pred_top'.

    Returns:
        A `tf.estimator.EstimatorSpec`. In PREDICT mode its predictions map
        "prob" to the `pred_top` most similar vocabulary words per input.
    """
    # Word -> id mapping. Unknown words fall back to the last row of the
    # embedding table.
    with tf.name_scope('vocab'):
        vocab_tensor = tf.convert_to_tensor(params['vocabs'])
        vocab_size = params['vocab_size']
        vocab_table = tf.contrib.lookup.index_table_from_tensor(
            mapping=vocab_tensor,
            num_oov_buckets=0,
            default_value=vocab_size - 1)

    # Embedding matrix (the hidden layer), one row per vocabulary entry.
    with tf.name_scope('hidden'):
        embedding_size = params['embedding_size']
        embeddings = tf.get_variable(
            'embeddings',
            shape=[vocab_size, embedding_size],
            initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))

    # Prediction: look up the most similar words.
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Id -> word table, built from the lines of the vocabulary file.
        vocab_lines = tf.string_split([tf.read_file(params['vocab_file'])],
                                      delimiter='\n')
        index_tensor = tf.squeeze(
            tf.sparse_to_dense(vocab_lines.indices, [1, vocab_size],
                               vocab_lines.values,
                               default_value='unknown'))

        # Scale every embedding row to unit L2 norm so that the dot product
        # below equals the cosine similarity.
        normalized_embeddings = tf.nn.l2_normalize(embeddings, axis=1)
        discret_features = vocab_table.lookup(features)
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  tf.squeeze(discret_features))

        # Larger dot product = smaller angle = more similar vectors.
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)
        # Keep the ids of the top-k most similar words.
        _, nearest_ids = tf.nn.top_k(similarity,
                                     sorted=True,
                                     k=params['pred_top'])
        predictions = {"prob": tf.gather(index_tensor, nearest_ids)}
        serving_key = (
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
        export_outputs = {
            serving_key: tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    discret_labels = vocab_table.lookup(labels)
    discret_features = vocab_table.lookup(features)
    discret_features_embeddings = tf.nn.embedding_lookup(
        embeddings, discret_features)

    # Output-layer weights.
    with tf.name_scope('weights'):
        weight_init = tf.truncated_normal_initializer(
            stddev=1.0 / math.sqrt(embedding_size))
        nce_weights = tf.get_variable('nce_weights',
                                      shape=[vocab_size, embedding_size],
                                      initializer=weight_init)

    # Output-layer biases.
    with tf.name_scope('biases'):
        nce_biases = tf.get_variable('nce_biases',
                                     shape=[vocab_size],
                                     initializer=tf.zeros_initializer)

    # Loss: noise-contrastive estimation, averaged over the batch.
    with tf.name_scope('loss'):
        per_example_loss = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=discret_labels,
            inputs=discret_features_embeddings,
            num_sampled=params['num_neg_samples'],
            num_classes=vocab_size)
        loss = tf.reduce_mean(per_example_loss)

    # Training op: plain gradient descent.
    with tf.name_scope('optimizer'):
        sgd = tf.train.GradientDescentOptimizer(params['learning_rate'])
        train_op = sgd.minimize(loss,
                                global_step=tf.train.get_global_step())

    assert mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL)
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
Example #38
0
 def _text_content_parser(self, text, max_length):
     """Parses a space-separated string of integers.

     Args:
         text: Scalar string of numbers separated by single spaces.
         max_length: Maximum number of ids to keep.

     Returns:
         A pair `(ids, length)`: the first `max_length` numbers as int64,
         and the token count capped at `max_length`.
     """
     tokens = tf.string_split([text], " ")
     ids = tf.string_to_number(tokens.values, out_type=tf.int64)[:max_length]
     # The last dimension of the split result is the number of tokens.
     length = tf.minimum(tf.shape(tokens)[-1], max_length)
     return ids, length
def get_iterator(src_dataset,
                 tgt_dataset,
                 src_vocab_table,
                 tgt_vocab_table,
                 batch_size,
                 sos,
                 eos,
                 random_seed,
                 num_buckets,
                 src_max_len=None,
                 tgt_max_len=None,
                 num_parallel_calls=4,
                 output_buffer_size=None,
                 skip_count=None,
                 num_shards=1,
                 shard_index=0,
                 reshuffle_each_iteration=True,
                 use_char_encode=False):
    """Builds a padded, optionally bucketed, batch iterator over sentence pairs.

    Args:
        src_dataset: Dataset of source lines (one string per element).
        tgt_dataset: Dataset of target lines, aligned with `src_dataset`.
        src_vocab_table: Lookup table mapping source words to ids.
        tgt_vocab_table: Lookup table mapping target words to ids.
        batch_size: Number of sentence pairs per batch.
        sos: Start-of-sentence word, looked up in `tgt_vocab_table`.
        eos: End-of-sentence word, looked up in both tables.
        random_seed: Seed passed to the shuffle.
        num_buckets: Number of length buckets; bucketing is used only when
            this is greater than 1.
        src_max_len: If truthy, source sentences are truncated to this length.
        tgt_max_len: If truthy, target sentences are truncated to this length.
        num_parallel_calls: Parallelism of the `map` transformations.
        output_buffer_size: Shuffle/prefetch buffer size; falls back to
            `batch_size * 1000` when falsy.
        skip_count: If not None, number of leading elements to skip.
        num_shards: Number of shards the data is divided into.
        shard_index: Index of the shard read by this iterator.
        reshuffle_each_iteration: Passed to the shuffle.
        use_char_encode: If True, the source is encoded with
            `vocab_utils.tokens_to_bytes` instead of the word lookup table.

    Returns:
        A `BatchedInput` holding the iterator initializer and the batched
        source ids, target input/output ids and sequence lengths.
    """
    if not output_buffer_size:
        output_buffer_size = batch_size * 1000

    if use_char_encode:
        src_eos_id = vocab_utils.EOS_CHAR_ID
    else:
        src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)),
                             tf.int32)

    tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32)
    tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32)

    src_tgt_dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset))

    src_tgt_dataset = src_tgt_dataset.shard(num_shards, shard_index)
    if skip_count is not None:
        src_tgt_dataset = src_tgt_dataset.skip(skip_count)

    src_tgt_dataset = src_tgt_dataset.shuffle(output_buffer_size, random_seed,
                                              reshuffle_each_iteration)

    # Split every line into tokens.
    src_tgt_dataset = src_tgt_dataset.map(
        lambda src, tgt:
        (tf.string_split([src]).values, tf.string_split([tgt]).values),
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)

    # Filter zero length input sequences.
    src_tgt_dataset = src_tgt_dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0,
                                        tf.size(tgt) > 0))

    # Truncate over-long sentences rather than dropping them.
    if src_max_len:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt: (src[:src_max_len], tgt),
            num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
    if tgt_max_len:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt: (src, tgt[:tgt_max_len]),
            num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)

    # Convert the word strings to ids.  Word strings that are not in the
    # vocab get the lookup table's default_value integer.
    if use_char_encode:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt: (tf.reshape(vocab_utils.tokens_to_bytes(
                src), [-1]), tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)),
            num_parallel_calls=num_parallel_calls)
    else:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32),
                              tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)),
            num_parallel_calls=num_parallel_calls)

    src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size)
    # Create a tgt_input prefixed with <sos> and a tgt_output suffixed with <eos>.
    src_tgt_dataset = src_tgt_dataset.map(
        lambda src, tgt: (src, tf.concat(
            ([tgt_sos_id], tgt), 0), tf.concat((tgt, [tgt_eos_id]), 0)),
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
    # Add in sequence lengths.
    if use_char_encode:
        # The flattened byte source holds DEFAULT_CHAR_MAXLEN entries per
        # token, so divide to get the length in tokens.
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt_in, tgt_out:
            (src, tgt_in, tgt_out,
             tf.to_int32(tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN),
             tf.size(tgt_in)),
            num_parallel_calls=num_parallel_calls)
    else:
        src_tgt_dataset = src_tgt_dataset.map(
            lambda src, tgt_in, tgt_out:
            (src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)),
            num_parallel_calls=num_parallel_calls)

    src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size)

    # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...)
    def batching_func(x):
        return x.padded_batch(
            batch_size,
            # The first three entries are the source and target line rows;
            # these have unknown-length vectors.  The last two entries are
            # the source and target row sizes; these are scalars.
            padded_shapes=(
                tf.TensorShape([None]),  # src
                tf.TensorShape([None]),  # tgt_input
                tf.TensorShape([None]),  # tgt_output
                tf.TensorShape([]),  # src_len
                tf.TensorShape([])),  # tgt_len
            # Pad the source and target sequences with eos tokens.
            # (Though notice we don't generally need to do this since
            # later on we will be masking out calculations past the true
            # sequence.)
            padding_values=(
                src_eos_id,  # src
                tgt_eos_id,  # tgt_input
                tgt_eos_id,  # tgt_output
                0,  # src_len -- unused
                0))  # tgt_len -- unused

    if num_buckets > 1:

        def key_func(unused_1, unused_2, unused_3, src_len, tgt_len):
            # Calculate bucket_width by maximum source sequence length.
            # Pairs with length [0, bucket_width) go to bucket 0, length
            # [bucket_width, 2 * bucket_width) go to bucket 1, etc.  Pairs with length
            # over ((num_bucket-1) * bucket_width) words all go into the last bucket.
            if src_max_len:
                bucket_width = (src_max_len + num_buckets - 1) // num_buckets
            else:
                bucket_width = 10

            # Bucket sentence pairs by the length of their source sentence and target
            # sentence.
            bucket_id = tf.maximum(src_len // bucket_width,
                                   tgt_len // bucket_width)
            # Clamp to num_buckets - 1 so that exactly num_buckets buckets
            # (ids 0 .. num_buckets - 1) exist.  Clamping to num_buckets
            # would allow one extra bucket beyond the last one described
            # above.
            return tf.to_int64(tf.minimum(num_buckets - 1, bucket_id))

        def reduce_func(unused_key, windowed_data):
            return batching_func(windowed_data)

        batched_dataset = src_tgt_dataset.apply(
            tf.contrib.data.group_by_window(key_func=key_func,
                                            reduce_func=reduce_func,
                                            window_size=batch_size))

    else:
        batched_dataset = batching_func(src_tgt_dataset)
    batched_iter = batched_dataset.make_initializable_iterator()
    (src_ids, tgt_input_ids, tgt_output_ids, src_seq_len,
     tgt_seq_len) = (batched_iter.get_next())
    return BatchedInput(initializer=batched_iter.initializer,
                        source=src_ids,
                        target_input=tgt_input_ids,
                        target_output=tgt_output_ids,
                        source_sequence_length=src_seq_len,
                        target_sequence_length=tgt_seq_len)
Example #40
0
 def decode_record(record):
     """Parses one text record into int32 ids.

     Returns:
         A pair `(src, start)`: the numbers found in `record` as an int32
         vector, and a one-element int32 vector holding `SOS`.
     """
     token_strings = tf.string_split([record]).values
     src_ids = tf.string_to_number(token_strings, out_type=tf.int32)
     start_token = tf.constant([SOS], dtype=tf.int32)
     return src_ids, start_token
Example #41
0
def get_iterator(hparams, datasets, max_rows=0, num_parallel_calls=4):
    """Builds a padded, optionally bucketed, batch iterator over sentence pairs.

    Args:
        hparams: Hyper-parameters. This function reads `batch_size`, `sos`,
            `eos`, `pad`, `src_max_len`, `tgt_max_len`, `bucket` and
            `infer_src_max_len`.
        datasets: Six-element sequence whose first four entries are the
            source vocab table, the target vocab table, the source dataset
            and the target dataset; the last two entries are ignored.
        max_rows: If positive, only the first `max_rows` lines of each
            dataset are used.
        num_parallel_calls: Parallelism of the `map` transformations.

    Returns:
        A `BatchedInput` holding the iterator initializer and the batched
        source ids, target input/output ids and sequence lengths.
    """
    output_buffer_size = hparams.batch_size * 1000
    src_vocab, tgt_vocab, src_dataset, tgt_dataset, _, _ = datasets
    if max_rows > 0:
        src_dataset = src_dataset.take(max_rows)
        tgt_dataset = tgt_dataset.take(max_rows)

    # Split every line into tokens.
    src_dataset = src_dataset.map(
        lambda x: tf.string_split([x]).values,
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
    tgt_dataset = tgt_dataset.map(
        lambda x: tf.string_split([x]).values,
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)

    # All special ids, including the pad id used for the source side below,
    # are taken from the target vocabulary.
    sos_id = tf.cast(tgt_vocab.lookup(tf.constant(hparams.sos)), tf.int32)
    eos_id = tf.cast(tgt_vocab.lookup(tf.constant(hparams.eos)), tf.int32)
    pad_id = tf.cast(tgt_vocab.lookup(tf.constant(hparams.pad)), tf.int32)

    # Convert the word strings to ids.
    src_dataset = src_dataset.map(
        lambda x: tf.cast(src_vocab.lookup(x), tf.int32),
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
    tgt_dataset = tgt_dataset.map(
        lambda x: tf.cast(tgt_vocab.lookup(x), tf.int32),
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)

    dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset))
    # Exclude over-long sentence pairs from training: a pair is kept only if
    # the source is shorter than src_max_len and the target is shorter than
    # tgt_max_len.
    if hparams.src_max_len > 0:
        dataset = dataset.filter(lambda src, tgt: tf.logical_and(
            tf.size(src) < hparams.src_max_len,
            tf.size(tgt) < hparams.tgt_max_len))
    dataset = dataset.shuffle(buffer_size=1000)

    # Produce (src, <sos> + tgt, tgt + <eos>, src_len, tgt_len); the target
    # length counts the extra <sos>/<eos> token.
    dataset = dataset.map(
        lambda src, tgt: (src, tf.concat(
            ([sos_id], tgt), axis=0), tf.concat(
                (tgt, [eos_id]), axis=0), tf.size(src), tf.size(tgt) + 1),
        num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)

    def batching_func(x):
        # The three sequences are padded with pad_id to the longest one in
        # the batch; the two lengths are scalars and need no padding.
        return x.padded_batch(hparams.batch_size,
                              padded_shapes=(tf.TensorShape([None]),
                                             tf.TensorShape([None]),
                                             tf.TensorShape([None]),
                                             tf.TensorShape([]),
                                             tf.TensorShape([])),
                              padding_values=(pad_id, pad_id, pad_id, 0, 0))

    num_buckets = hparams.bucket
    if num_buckets > 1:

        def key_func(unused_1, unused_2, unused_3, src_len, tgt_len):
            # Calculate bucket_width by maximum source sequence length.
            # Pairs with length [0, bucket_width) go to bucket 0, length
            # [bucket_width, 2 * bucket_width) go to bucket 1, etc.  Pairs with length
            # over ((num_bucket-1) * bucket_width) words all go into the last bucket.
            # NOTE(review): the width is derived from infer_src_max_len while
            # the filter above uses src_max_len — confirm this is intended.
            bucket_width = (hparams.infer_src_max_len + num_buckets -
                            1) // num_buckets

            # Bucket sentence pairs by the length of their source sentence and target
            # sentence.
            bucket_id = tf.maximum(src_len // bucket_width,
                                   tgt_len // bucket_width)
            # Clamp to num_buckets - 1 so that exactly num_buckets buckets
            # (ids 0 .. num_buckets - 1) exist.  Clamping to num_buckets
            # would allow one extra bucket beyond the last one described
            # above.
            return tf.to_int64(tf.minimum(num_buckets - 1, bucket_id))

        def reduce_func(unused_key, windowed_data):
            return batching_func(windowed_data)

        batched_dataset = dataset.apply(
            tf.contrib.data.group_by_window(key_func=key_func,
                                            reduce_func=reduce_func,
                                            window_size=hparams.batch_size))
    else:
        batched_dataset = batching_func(dataset)

    batched_iter = batched_dataset.make_initializable_iterator()
    (src_ids, tgt_in_ids, tgt_out_ids, src_length,
     tgt_length) = batched_iter.get_next()
    batched_input = BatchedInput(initializer=batched_iter.initializer,
                                 source=src_ids,
                                 target_in=tgt_in_ids,
                                 target_out=tgt_out_ids,
                                 source_length=src_length,
                                 target_length=tgt_length)

    return batched_input
def create_dataset(file):
    """Build a dataset of int32 token vectors from a text file.

    The file is looked up under ``DATA_DIR``. Every line is split with
    ``tf.string_split`` (default delimiter) and each resulting token is
    parsed as an int32 number.

    Args:
        file: File name, joined onto ``DATA_DIR`` with ``pjoin``.

    Returns:
        A dataset yielding one 1-D int32 tensor per input line.
    """

    def _tokenize(line):
        # string_split expects a rank-1 input, hence the wrapping list.
        return tf.string_split([line]).values

    def _to_int32(tokens):
        return tf.string_to_number(tokens, out_type=tf.int32)

    lines = tf.data.TextLineDataset(pjoin(DATA_DIR, file))
    return lines.map(_tokenize).map(_to_int32)
Example #43
0
    def process_dataset(self, *row_parts):
        """Turn one parsed input row into the dictionary of model input tensors.

        The row consists of a target word followed by context strings. Each
        context is split on ',' into three fields (source, path, target), and
        each field is split on '|' into sub-parts that are looked up in the
        corresponding vocabulary table.

        Args:
            *row_parts: The fields of one row. When ``self.use_multilanguage``
                is set, the first element is taken as the language id and the
                second element is used as the remaining row.

        Returns:
            A dict keyed by the ``*_KEY`` constants (plus ``LANGUAGE_ID``)
            holding the target string/indices/length, the source/path/target
            indices, lengths and raw strings, and the valid-context mask.
        """
        row_parts = list(row_parts)
        if self.use_multilanguage:
            language_id = row_parts[0]
            # NOTE(review): this takes a single element, not the slice
            # row_parts[1:]. Presumably row_parts[1] is itself the sequence
            # (word, context_1, ...) - confirm against the caller.
            row_parts = row_parts[1]
        else:
            language_id = None

        word = row_parts[0]  # (, )

        if not self.is_evaluating and self.config.RANDOM_CONTEXTS:
            # Training with random contexts: find the position of the first
            # padding context, then gather MAX_CONTEXTS randomly shuffled
            # indices.
            all_contexts = tf.stack(row_parts[1:])
            # Append one pad entry so that the tf.where below always finds at
            # least one match.
            all_contexts_padded = tf.concat([all_contexts, [self.context_pad]],
                                            axis=-1)
            index_of_blank_context = tf.where(
                tf.equal(all_contexts_padded, self.context_pad))
            num_contexts_per_example = tf.reduce_min(index_of_blank_context)

            # if there are less than self.max_contexts valid contexts, still sample self.max_contexts
            safe_limit = tf.cast(
                tf.maximum(num_contexts_per_example, self.config.MAX_CONTEXTS),
                tf.int32)
            rand_indices = tf.random_shuffle(
                tf.range(safe_limit))[:self.config.MAX_CONTEXTS]
            contexts = tf.gather(all_contexts, rand_indices)  # (max_contexts,)
        else:
            # Otherwise simply keep the first MAX_CONTEXTS contexts in order.
            contexts = row_parts[1:(self.config.MAX_CONTEXTS +
                                    1)]  # (max_contexts,)

        # contexts: (max_contexts, )
        # Split every context into its three comma-separated fields.
        split_contexts = tf.string_split(contexts,
                                         delimiter=',',
                                         skip_empty=False)
        # Rebuild the sparse result with a fixed dense shape so that the
        # dense tensor always has exactly 3 columns.
        sparse_split_contexts = tf.sparse.SparseTensor(
            indices=split_contexts.indices,
            values=split_contexts.values,
            dense_shape=[self.config.MAX_CONTEXTS, 3])
        dense_split_contexts = tf.reshape(
            tf.sparse.to_dense(sp_input=sparse_split_contexts,
                               default_value=Common.PAD),
            shape=[self.config.MAX_CONTEXTS, 3])  # (max_contexts, 3)

        # Split the target word into '|'-separated parts.
        split_target_labels = tf.string_split(tf.expand_dims(word, -1),
                                              delimiter='|')
        # Make the dense row at least MAX_TARGET_PARTS wide and always one
        # wider than the number of parts, so at least one PAD entry exists.
        target_dense_shape = [
            1,
            tf.maximum(tf.to_int64(self.config.MAX_TARGET_PARTS),
                       split_target_labels.dense_shape[1] + 1)
        ]
        sparse_target_labels = tf.sparse.SparseTensor(
            indices=split_target_labels.indices,
            values=split_target_labels.values,
            dense_shape=target_dense_shape)
        dense_target_label = tf.reshape(
            tf.sparse.to_dense(sp_input=sparse_target_labels,
                               default_value=Common.PAD), [-1])
        # The index of the first PAD entry is the number of target parts.
        index_of_blank = tf.where(tf.equal(dense_target_label, Common.PAD))
        target_length = tf.reduce_min(index_of_blank)
        dense_target_label = dense_target_label[:self.config.MAX_TARGET_PARTS]
        clipped_target_lengths = tf.clip_by_value(
            target_length,
            clip_value_min=0,
            clip_value_max=self.config.MAX_TARGET_PARTS)
        # Look up the target parts and append a trailing 0 index.
        target_word_labels = tf.concat(
            [self.target_table.lookup(dense_target_label), [0]],
            axis=-1)  # (max_target_parts + 1) of int

        # --- Column 0 of each context: the source field ---
        path_source_strings = tf.slice(
            dense_split_contexts, [0, 0],
            [self.config.MAX_CONTEXTS, 1])  # (max_contexts, 1)
        flat_source_strings = tf.reshape(path_source_strings,
                                         [-1])  # (max_contexts)
        split_source = tf.string_split(
            flat_source_strings, delimiter='|',
            skip_empty=False)  # (max_contexts, max_name_parts)

        # Widen to at least MAX_NAME_PARTS columns, densify with PAD, then
        # cut back to exactly MAX_NAME_PARTS columns.
        sparse_split_source = tf.sparse.SparseTensor(
            indices=split_source.indices,
            values=split_source.values,
            dense_shape=[
                self.config.MAX_CONTEXTS,
                tf.maximum(tf.to_int64(self.config.MAX_NAME_PARTS),
                           split_source.dense_shape[1])
            ])
        dense_split_source = tf.sparse.to_dense(
            sp_input=sparse_split_source,
            default_value=Common.PAD)  # (max_contexts, max_name_parts)
        dense_split_source = tf.slice(dense_split_source, [0, 0],
                                      [-1, self.config.MAX_NAME_PARTS])
        path_source_indices = self.subtoken_table.lookup(
            dense_split_source)  # (max_contexts, max_name_parts)
        # Length = number of non-PAD entries per context.
        path_source_lengths = tf.reduce_sum(
            tf.cast(tf.not_equal(dense_split_source, Common.PAD), tf.int32),
            -1)  # (max_contexts)

        # --- Column 1 of each context: the path field ---
        path_strings = tf.slice(dense_split_contexts, [0, 1],
                                [self.config.MAX_CONTEXTS, 1])
        flat_path_strings = tf.reshape(path_strings, [-1])
        split_path = tf.string_split(flat_path_strings,
                                     delimiter='|',
                                     skip_empty=False)
        # Unlike source/target, the path uses a fixed dense shape and is not
        # sliced afterwards.
        sparse_split_path = tf.sparse.SparseTensor(
            indices=split_path.indices,
            values=split_path.values,
            dense_shape=[
                self.config.MAX_CONTEXTS, self.config.MAX_PATH_LENGTH
            ])
        dense_split_path = tf.sparse.to_dense(
            sp_input=sparse_split_path,
            default_value=Common.PAD)  # (max_contexts, max_path_length)

        node_indices = self.node_table.lookup(
            dense_split_path)  # (max_contexts, max_path_length)
        path_lengths = tf.reduce_sum(
            tf.cast(tf.not_equal(dense_split_path, Common.PAD), tf.int32),
            -1)  # (max_contexts)

        # --- Column 2 of each context: the target field ---
        path_target_strings = tf.slice(
            dense_split_contexts, [0, 2],
            [self.config.MAX_CONTEXTS, 1])  # (max_contexts, 1)
        flat_target_strings = tf.reshape(path_target_strings,
                                         [-1])  # (max_contexts)
        split_target = tf.string_split(
            flat_target_strings, delimiter='|',
            skip_empty=False)  # (max_contexts, max_name_parts)
        sparse_split_target = tf.sparse.SparseTensor(
            indices=split_target.indices,
            values=split_target.values,
            dense_shape=[
                self.config.MAX_CONTEXTS,
                tf.maximum(tf.to_int64(self.config.MAX_NAME_PARTS),
                           split_target.dense_shape[1])
            ])
        dense_split_target = tf.sparse.to_dense(
            sp_input=sparse_split_target,
            default_value=Common.PAD)  # (max_contexts, max_name_parts)
        dense_split_target = tf.slice(dense_split_target, [0, 0],
                                      [-1, self.config.MAX_NAME_PARTS])
        path_target_indices = self.subtoken_table.lookup(
            dense_split_target)  # (max_contexts, max_name_parts)
        path_target_lengths = tf.reduce_sum(
            tf.cast(tf.not_equal(dense_split_target, Common.PAD), tf.int32),
            -1)  # (max_contexts)

        # A context gets mask value 1.0 when the sum of the per-row maximum
        # source, node and target indices is non-zero, and 0.0 otherwise.
        # Presumably index 0 is the padding index in all three tables - TODO
        # confirm.
        valid_contexts_mask = tf.to_float(
            tf.not_equal(
                tf.reduce_max(path_source_indices, -1) +
                tf.reduce_max(node_indices, -1) +
                tf.reduce_max(path_target_indices, -1), 0))

        return {
            TARGET_STRING_KEY: word,
            TARGET_INDEX_KEY: target_word_labels,
            TARGET_LENGTH_KEY: clipped_target_lengths,
            PATH_SOURCE_INDICES_KEY: path_source_indices,
            NODE_INDICES_KEY: node_indices,
            PATH_TARGET_INDICES_KEY: path_target_indices,
            VALID_CONTEXT_MASK_KEY: valid_contexts_mask,
            PATH_SOURCE_LENGTHS_KEY: path_source_lengths,
            PATH_LENGTHS_KEY: path_lengths,
            PATH_TARGET_LENGTHS_KEY: path_target_lengths,
            PATH_SOURCE_STRINGS_KEY: path_source_strings,
            PATH_STRINGS_KEY: path_strings,
            PATH_TARGET_STRINGS_KEY: path_target_strings,
            LANGUAGE_ID: language_id
        }
Example #44
0
def load_examples():
    """Build the queue-based input pipeline for the images in ``a.input_dir``.

    JPEG files are used when present, otherwise PNG files. Each image's class
    id is parsed from the part of its file name before the first '_'. The
    image is either converted to Lab (``a.lab_colorization``) or split into a
    left and a right half, optionally flipped, resized to ``a.scale_size``
    and randomly cropped to ``CROP_SIZE``, then batched.

    Returns:
        An ``Examples`` record holding the batched paths, inputs, targets,
        real/fake class ids, the number of images and the steps per epoch.

    Raises:
        Exception: if the input directory is missing or holds no images, if
            ``a.which_direction`` is neither "AtoB" nor "BtoA", or if
            ``a.scale_size`` is smaller than ``CROP_SIZE``.
    """
    if a.input_dir is None or not os.path.exists(a.input_dir):
        raise Exception("input_dir does not exist")

    # Prefer JPEG inputs; fall back to PNG only when no JPEG was found.
    input_paths = glob.glob(os.path.join(a.input_dir, "*.jpg"))
    decode = tf.image.decode_jpeg
    if not input_paths:
        input_paths = glob.glob(os.path.join(a.input_dir, "*.png"))
        decode = tf.image.decode_png

    if not input_paths:
        raise Exception("input_dir contains no image files")

    def get_name(path):
        return os.path.splitext(os.path.basename(path))[0]

    # Purely numeric names are ordered by value instead of lexicographically,
    # so that outputs come out in a sensible order in test mode.
    names_are_numeric = all(get_name(p).isdigit() for p in input_paths)
    sort_key = (lambda p: int(get_name(p))) if names_are_numeric else None
    input_paths = sorted(input_paths, key=sort_key)

    with tf.name_scope("load_images"):
        path_queue = tf.train.string_input_producer(input_paths,
                                                    shuffle=a.mode == "train")
        paths, contents = tf.WholeFileReader().read(path_queue)
        image = tf.image.convert_image_dtype(decode(contents),
                                             dtype=tf.float32)

        # The class id is the text before the first '_' of the file name.
        file_name = tf.string_split([paths], delimiter='/').values[-1]
        class_token = tf.string_split([file_name], delimiter='_').values[0]
        classes = tf.string_to_number(class_token, out_type=tf.int32)
        # NOTE: may want to use one hots instead of numbers
        classes_real = classes
        # "Fake" class ids are the real ones shifted by NUM_CLASSES.
        class_shift = tf.constant(NUM_CLASSES,
                                  shape=classes.get_shape().dims)
        classes_fake = tf.add(classes, class_shift)

        channel_check = tf.assert_equal(
            tf.shape(image)[2],
            3,
            message="image does not have 3 channels")
        with tf.control_dependencies([channel_check]):
            image = tf.identity(image)

        image.set_shape([None, None, 3])

        if a.lab_colorization:
            # Lightness is the A side, the two colour channels the B side.
            L_chan, a_chan, b_chan = preprocess_lab(rgb_to_lab(image))
            a_images = tf.expand_dims(L_chan, axis=2)
            b_images = tf.stack([a_chan, b_chan], axis=2)
        else:
            # The file holds an A|B pair side by side: split it down the
            # middle and hand each half to preprocess().
            width = tf.shape(image)[1]  # [height, width, channels]
            a_images = preprocess(image[:, :width // 2, :])
            b_images = preprocess(image[:, width // 2:, :])

    direction_pairs = {
        "AtoB": (a_images, b_images),
        "BtoA": (b_images, a_images),
    }
    if a.which_direction not in direction_pairs:
        raise Exception("invalid direction")
    inputs, targets = direction_pairs[a.which_direction]

    # One shared seed so that input and target receive the same random
    # flip and crop.
    seed = random.randint(0, 2**31 - 1)

    def transform(image):
        result = image
        if a.flip:
            result = tf.image.random_flip_left_right(result, seed=seed)

        # AREA gives a nice downscale (but nearest neighbour on upscale);
        # downscaling is the expected case here.
        result = tf.image.resize_images(result, [a.scale_size, a.scale_size],
                                        method=tf.image.ResizeMethod.AREA)

        max_offset = a.scale_size - CROP_SIZE + 1
        offset = tf.cast(
            tf.floor(tf.random_uniform([2], 0, max_offset, seed=seed)),
            dtype=tf.int32)
        if a.scale_size < CROP_SIZE:
            raise Exception("scale size cannot be less than crop size")
        if a.scale_size > CROP_SIZE:
            result = tf.image.crop_to_bounding_box(result, offset[0],
                                                   offset[1], CROP_SIZE,
                                                   CROP_SIZE)
        return result

    with tf.name_scope("input_images"):
        input_images = transform(inputs)

    with tf.name_scope("target_images"):
        target_images = transform(targets)

    batched = tf.train.batch(
        [paths, input_images, target_images, classes_real, classes_fake],
        batch_size=a.batch_size)
    (paths_batch, inputs_batch, targets_batch, classes_real_batch,
     classes_fake_batch) = batched
    image_count = len(input_paths)
    steps_per_epoch = int(math.ceil(len(input_paths) / a.batch_size))

    return Examples(
        paths=paths_batch,
        inputs=inputs_batch,
        targets=targets_batch,
        classes_real=classes_real_batch,
        classes_fake=classes_fake_batch,
        count=image_count,
        steps_per_epoch=steps_per_epoch,
    )
def train():
    """Trains the model.

    Builds a tf.data pipeline that pairs randomly cropped training images
    with tokenised label lines, runs the images through the analysis
    transform / entropy bottleneck / synthesis transform, and minimises a
    loss made of MSE, bits-per-pixel and an extra MSE term restricted to a
    mask (work in progress: the mask is currently a fixed 10x10 square, not
    the real bounding box).

    Progress is printed and appended to ``Train_Log.txt`` in
    ``MODEL_DIRECTORY`` each time the sub-step counter wraps.
    """

    # Log Input Settings
    logFile = MODEL_DIRECTORY + '/' + 'Train_Log.txt'

    # Set Tensorflow Logging
    tf.logging.set_verbosity(tf.logging.INFO)

    # Create input data pipeline.
    with tf.device('/cpu:0'):
        train_files = glob.glob(TRAIN_DIRECTORY)
        train_labels = glob.glob(LABEL_DIRECTORY)
        train_dataset = tf.data.Dataset.from_tensor_slices(train_files)

        # Labels are read line by line from the label text files and split
        # into tokens. Known limitation: label files may define several
        # bounding boxes, so token counts differ between files; using only
        # the first bounding box (4 values per file) would avoid this.
        label_dataset = tf.data.TextLineDataset(train_labels)
        label_dataset = label_dataset.map(
            lambda token: tf.string_split([token]).values)
        label_dataset = label_dataset.map(lambda token:
                                          (token, extract_char(token)))

        # Images must all have the same size to be batched, so each one is
        # randomly cropped to a PATCHSIZE x PATCHSIZE patch. Open question:
        # with patches, the bounding boxes would need to be mapped into patch
        # coordinates; with resizing, they would need to be rescaled.
        train_dataset = train_dataset.map(
            load_image, num_parallel_calls=PREPROCESS_THREADS)
        train_dataset = train_dataset.map(
            lambda x: tf.random_crop(x, (PATCHSIZE, PATCHSIZE, 3)))

        # This combines the two datasets so they are coordinated.
        total_data = tf.data.Dataset.zip((train_dataset, label_dataset))
        total_data = total_data.shuffle(buffer_size=len(train_files)).repeat()

        # We prefetch some initial batches
        total_data = total_data.batch(BATCH_SIZE)
        total_data = total_data.prefetch(32)

    # Determine number of pixels and print input data info
    num_pixels = BATCH_SIZE * PATCHSIZE**2
    print('Num Train File', len(train_files))
    print('Num_Pix', num_pixels, BATCH_SIZE, PATCHSIZE)

    # Get Data - this includes labels and training images
    x = total_data.make_one_shot_iterator().get_next()

    # We then pass the training images in x[0] to our autoencoder
    y = analysis_transform(x[0], NUM_FILTERS)
    entropy_bottleneck = tfc.EntropyBottleneck()
    y_tilde, likelihoods = entropy_bottleneck(y, training=True)
    x_tilde = synthesis_transform(y_tilde, NUM_FILTERS)

    # Total number of bits divided by number of pixels.
    train_bpp = tf.reduce_sum(tf.log(likelihoods)) / (-np.log(2) * num_pixels)

    # Mean squared error across pixels.
    train_mse = tf.reduce_mean(tf.squared_difference(x[0], x_tilde))
    train_mse *= 255**2  # Multiply by 255^2 to correct for rescaling.

    ######################START TEST DECOTO############################

    # Grab the 4 corners from the label part of the batch.
    corners = [
        tf.string_to_number(x[1][1][1][2]),
        tf.string_to_number(x[1][1][1][3]),
        tf.string_to_number(x[1][1][1][4]),
        tf.string_to_number(x[1][1][1][5])
    ]

    # Build an all-zero mask to multiply with the squared error.
    # Dimension 1 of x[0] is used for both spatial axes (patches are square).
    M = tf.zeros([1, x[0].get_shape()[1], x[0].get_shape()[1], 1])

    # START PENDING - WORK IN PROGRESS
    # Replace the 0's in M with 1's for all areas inside the bounding box.
    indices = []
    values = []
    for i in range(0, 10):  # Replace 0 and 10 w/ the corner values
        for j in range(0, 10):  # Replace 0 and 10 w/ the corner values
            indices.append([0, i, j, 0])  # Indices of values to change
            values.append(1)  # What to change the values at indices to
    shape = M.get_shape()
    delta = tf.SparseTensor(indices, values, shape)
    delta = tf.cast(delta, tf.float32)
    M2 = M + tf.sparse_tensor_to_dense(delta)

    # Printed later as a sanity check (sum of M = 0, sum of M2 > 0).
    sums = [tf.reduce_sum(M), tf.reduce_sum(M2)]

    # END PENDING  - WORK IN PROGRESS

    # Mean Squared Error for the Box Portion Only
    train_mse_box = tf.reduce_mean(
        tf.multiply(tf.squared_difference(x[0], x_tilde), M2))
    train_mse_box *= 255**2

    # Training Loss Including the Bounding Box as a separate loss component
    train_loss = LMBDA * train_mse + train_bpp + LMBDA2 * train_mse_box

    ###################END TEST DECOTO############################

    # Minimize loss and auxiliary loss, and execute update op.
    step = tf.train.create_global_step()
    main_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    main_step = main_optimizer.minimize(train_loss, global_step=step)

    aux_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE * 10)
    aux_step = aux_optimizer.minimize(entropy_bottleneck.losses[0])

    train_op = tf.group(main_step, aux_step, entropy_bottleneck.updates[0])

    tf.summary.scalar("loss", train_loss)
    tf.summary.scalar("bpp", train_bpp)
    tf.summary.scalar("mse", train_mse)

    tf.summary.image("original", quantize_image(x[0]))
    tf.summary.image("reconstruction", quantize_image(x_tilde))

    # Creates summary for the probability mass function (PMF) estimated in the bottleneck.
    entropy_bottleneck.visualize()

    hooks = [
        tf.train.StopAtStepHook(last_step=NUM_STEPS),
        tf.train.NanTensorHook(train_loss)
    ]

    # Everything that gets logged is fetched in ONE run call. Each separate
    # sess.run on a tensor downstream of the iterator pulls a fresh batch, so
    # fetching loss, bpp and mse one by one would report values computed on
    # different batches (and silently consume extra data).
    log_fetches = [train_loss, train_bpp, train_mse, corners, sums]

    ep = 0
    epSub = 0
    scaffold = tf.train.Scaffold(saver=tf.train.Saver(max_to_keep=1))
    with tf.train.MonitoredTrainingSession(
            scaffold=scaffold,
            hooks=hooks,
            checkpoint_dir=MODEL_DIRECTORY,
            save_checkpoint_secs=CHECKPOINT_SAVE,
            save_summaries_secs=CHECKPOINT_SAVE) as sess:
        while not sess.should_stop():
            sess.run(train_op)

            if epSub >= LOG_STEPS:
                epSub = 0
                ep += 1
            # A monitored session refuses further run() calls once a hook has
            # requested a stop, so skip logging after the final train step.
            if epSub == 0 and not sess.should_stop():
                log_step = ep * LOG_STEPS + epSub
                loss_val, bpp_val, mse_val, corner_vals, sum_vals = sess.run(
                    log_fetches)

                print(log_step, 'train loss', loss_val)

                ######################START DECOTO EDITS######################################
                print('Corners', corner_vals)
                print('Sums M and M2', sum_vals)
                ######################END DECOTO EDITS######################################

                with open(logFile, 'a') as f:
                    f.write('step=' + str(log_step) +
                            ',train_loss=' + str(loss_val) +
                            ',train_bpp=' + str(bpp_val) +
                            ',train_mse=' + str(mse_val) + '\n')
            epSub += 1

    print('TRAIN COMPLETED')
Example #46
0
def tensor_predict(words_list):
    """Build the string-in / string-out prediction graph, run it, export it.

    The graph takes space-separated sentences, keeps the first ``num_steps``
    tokens of each for the model, decodes labels with a CRF, labels every
    token beyond ``num_steps`` as 'O', and joins the labels back into strings.
    After restoring the latest checkpoint the graph is run on ``words_list``
    and written to ``ner_graph.pb`` in the current directory.

    Args:
        words_list: Value fed to the 1-D string placeholder
            ``input_sentences``.

    Returns:
        The evaluated ``predict_labels`` tensor, or ``None`` when no
        checkpoint is found under ``FLAGS.checkpoint_path``.
    """
    num_classes = FLAGS.num_classes
    num_layers = FLAGS.num_layers
    num_steps = FLAGS.num_steps
    embedding_size = FLAGS.embedding_size
    hidden_size = FLAGS.hidden_size
    keep_prob = FLAGS.keep_prob
    vocab_size = FLAGS.vocab_size
    vocab_path = FLAGS.vocab_path
    prop_limit = FLAGS.prop_limit
    checkpoint_path = FLAGS.checkpoint_path

    # split 1-D String dense Tensor to words SparseTensor
    sentences = tf.placeholder(dtype=tf.string,
                               shape=[None],
                               name='input_sentences')
    sparse_words = tf.string_split(sentences, delimiter=' ')

    # slice SparseTensor
    # Compare every index entry with num_steps, then keep only the second
    # column (token position) to get a mask of tokens at position < num_steps.
    valid_indices = tf.less(sparse_words.indices,
                            tf.constant([num_steps], dtype=tf.int64))
    valid_indices = tf.reshape(
        tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
    valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

    # Complementary mask: tokens at position >= num_steps.
    excess_indices = tf.greater_equal(sparse_words.indices,
                                      tf.constant([num_steps], dtype=tf.int64))
    excess_indices = tf.reshape(
        tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
    excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

    # sparse to dense
    # Missing positions are filled with the '_PAD' token.
    words = tf.sparse_to_dense(
        sparse_indices=valid_sparse_words.indices,
        output_shape=[valid_sparse_words.dense_shape[0], num_steps],
        sparse_values=valid_sparse_words.values,
        default_value='_PAD')

    # dict words to token ids
    # with open(os.path.join(vocab_path, 'words_vocab.txt'), 'r') as data_file:
    #   words_table_list = [line.strip() for line in data_file if line.strip()]
    # words_table_tensor = tf.constant(words_table_list, dtype=tf.string)
    # words_table = lookup.index_table_from_tensor(mapping=words_table_tensor, default_value=3)
    # Words missing from the vocabulary file map to id 3.
    words_table = lookup.index_table_from_file(os.path.join(
        vocab_path, 'words_vocab.txt'),
                                               default_value=3)
    words_ids = words_table.lookup(words)

    # blstm model predict
    with tf.variable_scope('model', reuse=None):
        logits, _ = model.inference(words_ids,
                                    valid_sparse_words.dense_shape[0],
                                    num_steps,
                                    vocab_size,
                                    embedding_size,
                                    hidden_size,
                                    keep_prob,
                                    num_layers,
                                    num_classes,
                                    is_training=False)

    # using softmax
    # props = tf.nn.softmax(logits)
    # max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
    # predict_scores = tf.reshape(max_prop_values, shape=[-1, num_steps])
    # predict_labels_ids = tf.reshape(max_prop_indices, shape=[-1, num_steps])
    # predict_labels_ids = tf.to_int64(predict_labels_ids)

    # using crf
    logits = tf.reshape(logits, shape=[-1, num_steps, num_classes])
    transition_params = tf.get_variable("transitions",
                                        [num_classes, num_classes])
    # Every sequence is decoded at the full length num_steps.
    # NOTE(review): the shape uses the STATIC batch dimension of logits, which
    # may be undefined after the reshape with -1 above - confirm this builds.
    sequence_length = tf.constant(num_steps,
                                  shape=[logits.get_shape()[0]],
                                  dtype=tf.int64)
    predict_labels_ids, _ = crf_utils.crf_decode(logits, transition_params,
                                                 sequence_length)
    predict_labels_ids = tf.to_int64(predict_labels_ids)
    # The CRF path yields no per-token score here, so a constant 1.0 is used.
    predict_scores = tf.constant(1.0,
                                 shape=predict_labels_ids.get_shape(),
                                 dtype=tf.float32)

    # replace untrusted prop that less than prop_limit
    # With constant scores of 1.0 this replaces labels (with id 4) only when
    # prop_limit is greater than 1.0.
    trusted_prop_flag = tf.greater_equal(
        predict_scores, tf.constant(prop_limit, dtype=tf.float32))
    replace_prop_labels_ids = tf.to_int64(
        tf.fill(tf.shape(predict_labels_ids), 4))
    predict_labels_ids = tf.where(trusted_prop_flag, predict_labels_ids,
                                  replace_prop_labels_ids)

    # dict token ids to labels
    # with open(os.path.join(vocab_path, 'labels_vocab.txt'), 'r') as data_file:
    #   labels_table_list = [line.strip() for line in data_file if line.strip()]
    # labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string)
    # labels_table = lookup.index_to_string_table_from_tensor(mapping=labels_table_tensor, default_value='O')
    # Ids missing from the label vocabulary file map to 'O'.
    labels_table = lookup.index_to_string_table_from_file(os.path.join(
        vocab_path, 'labels_vocab.txt'),
                                                          default_value='O')
    predict_labels = labels_table.lookup(predict_labels_ids)

    # extract real blstm predict label in dense and save to sparse
    # Only positions that held a real input token are kept; padding positions
    # are dropped.
    valid_sparse_predict_labels = tf.SparseTensor(
        indices=valid_sparse_words.indices,
        values=tf.gather_nd(predict_labels, valid_sparse_words.indices),
        dense_shape=valid_sparse_words.dense_shape)

    # create excess label SparseTensor with 'O'
    excess_sparse_predict_labels = tf.SparseTensor(
        indices=excess_sparse_words.indices,
        values=tf.fill(tf.shape(excess_sparse_words.values), 'O'),
        dense_shape=excess_sparse_words.dense_shape)

    # concat SparseTensor
    # The two parts are concatenated by hand and then reordered by index.
    sparse_predict_labels = tf.SparseTensor(
        indices=tf.concat(axis=0,
                          values=[
                              valid_sparse_predict_labels.indices,
                              excess_sparse_predict_labels.indices
                          ]),
        values=tf.concat(axis=0,
                         values=[
                             valid_sparse_predict_labels.values,
                             excess_sparse_predict_labels.values
                         ]),
        dense_shape=excess_sparse_predict_labels.dense_shape)
    sparse_predict_labels = tf.sparse_reorder(sparse_predict_labels)

    # join SparseTensor to 1-D String dense Tensor
    # Known issue: num_split should equal the real number of sentences, but
    # it is fixed to 1 here, so all labels are joined into a single string.
    join_labels_list = []
    slice_labels_list = tf.sparse_split(sp_input=sparse_predict_labels,
                                        num_split=1,
                                        axis=0)
    for slice_labels in slice_labels_list:
        slice_labels = slice_labels.values
        join_labels = tf.reduce_join(slice_labels,
                                     reduction_indices=0,
                                     separator=' ')
        join_labels_list.append(join_labels)
    format_predict_labels = tf.stack(join_labels_list, name='predict_labels')

    saver = tf.train.Saver()
    tables_init_op = tf.tables_initializer()
    with tf.Session() as sess:
        sess.run(tables_init_op)
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found at %s' % checkpoint_path)
            # Bare return: the caller receives None in this case.
            return
        # crf tensor
        predict_labels_list = sess.run(format_predict_labels,
                                       feed_dict={sentences: words_list})
        # save graph into .pb file
        # Variables are frozen into constants; the listed node names are the
        # outputs kept in the exported graph.
        graph = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, ["init_all_tables", "predict_labels"])
        tf.train.write_graph(graph, '.', 'ner_graph.pb', as_text=False)
        return predict_labels_list
Example #47
0
def file_to_dataset(file_holder):
    """Build a dataset of int64 token vectors from the files matching a pattern.

    Args:
        file_holder: Pattern passed to ``tf.train.match_filenames_once``.

    Returns:
        A dataset yielding one 1-D int64 tensor per text line.
    """

    def _split_line(line):
        # 'StringSplit' needs a rank-1 input; the returned values are rank 1.
        return tf.string_split([line]).values

    def _tokens_to_int64(str_tokens):
        numbers = tf.string_to_number(string_tensor=str_tokens)
        return tf.cast(numbers, tf.int64)

    matched_files = tf.train.match_filenames_once(file_holder)
    lines = tf.data.TextLineDataset(matched_files)
    token_dataset = lines.map(_split_line)
    return token_dataset.map(_tokens_to_int64)
Example #48
0
    def __init__(self,
                 args,
                 txt_file,
                 num_classes,
                 mode,
                 batch_size,
                 num_preprocess_threads=1,
                 shuffle=True,
                 min_queue_examples=1):
        """Build the queue-based image/label batch pipeline from a list file.

        Each line of ``txt_file`` is split with ``tf.string_split``. The first
        token is the RGB image path and, in training/validation mode, the
        second token is the label image path.

        Args:
            args: Settings object; ``imageHeight``, ``imageWidth`` and
                ``imageChannels`` are read from it.
            txt_file: Path of the list file to read lines from.
            num_classes: Number of classes (stored as an int).
            mode: 'training' or 'validation' (image + randomly flipped label
                image batches) or 'test' (image + image-path batches). Any
                other value builds no batch tensors.
            batch_size: Number of examples per batch.
            num_preprocess_threads: Threads used by ``tf.train.batch``.
            shuffle: Accepted for interface compatibility; the file queue is
                always created with ``shuffle=False``.
            min_queue_examples: Base value for the batch queue capacity.
        """
        self.args = args
        self.txt_file = txt_file
        self.num_preprocess_threads = num_preprocess_threads
        self.min_queue_examples = min_queue_examples
        self.batch_size = batch_size
        self.mode = mode
        self.imgShape = [
            self.args.imageHeight, self.args.imageWidth,
            self.args.imageChannels
        ]
        self.maskShape = tf.stack(
            [self.args.imageHeight, self.args.imageWidth])
        self.num_classes = int(num_classes)
        input_queue = tf.train.string_input_producer([txt_file], shuffle=False)
        line_reader = tf.TextLineReader()
        _, line = line_reader.read(input_queue)
        # Split once; both modes reuse the same tokens.
        split_line = tf.string_split([line]).values

        if mode in ('training', 'validation'):
            rgb_image_path = split_line[0]
            label_image_path = split_line[1]

            self.image_o = self.read_image(rgb_image_path, 0)
            self.label_image_o = self.read_image(label_image_path, 1)

            # One shared random draw so that the image and its label image
            # are always flipped together.
            do_flip = tf.random_uniform([], 0, 1)
            should_flip = do_flip > 0.5
            self.image = tf.cond(
                should_flip,
                lambda: tf.image.flip_left_right(self.image_o),
                lambda: self.image_o)
            self.label_image = tf.cond(
                should_flip,
                lambda: tf.image.flip_left_right(self.label_image_o),
                lambda: self.label_image_o)

            # tf.train.batch needs fully defined static shapes.
            self.image.set_shape(
                (self.args.imageHeight, self.args.imageWidth, 3))
            self.label_image.set_shape(
                (self.args.imageHeight, self.args.imageWidth, 1))

            self.img_batch, self.label_batch = tf.train.batch(
                [self.image, self.label_image],
                batch_size=batch_size,
                num_threads=num_preprocess_threads,
                capacity=min_queue_examples + 3 * batch_size,
            )

        elif mode == 'test':
            # Parenthesised form works on both Python 2 and Python 3.
            print('Generating test Image Batch')

            rgb_image_path = split_line[0]
            self.image = self.read_image(rgb_image_path, 0)
            # In test mode the "label" is the image path itself.
            self.label = rgb_image_path

            self.image.set_shape(
                (self.args.imageHeight, self.args.imageWidth, 3))

            self.img_batch, self.label_batch = tf.train.batch(
                [self.image, self.label],
                batch_size=batch_size,
                num_threads=num_preprocess_threads,
                capacity=min_queue_examples + 1 * batch_size,
            )
Example #49
0
 def _parse_one_feature(k, x):
     """Split feature ``x`` on ':' and build it as a wide or deep feature.

     When ``k`` equals 'w1' the split result goes to ``_mk_wide``, otherwise
     to ``_mk_deep``. The choice is made inside the graph with ``tf.cond``.
     """
     split_parts = tf.string_split(x, ":")

     # tf.cond requires callables for both branches.
     def _wide_branch():
         return _mk_wide(split_parts)

     def _deep_branch():
         return _mk_deep(split_parts)

     is_wide_key = tf.equal(k, 'w1')
     return tf.cond(pred=is_wide_key,
                    true_fn=_wide_branch,
                    false_fn=_deep_branch)
Example #50
0
def get_iterator(src_dataset,
                 tgt_dataset,
                 src_vocab_table,
                 tgt_vocab_table,
                 batch_size,
                 sos,
                 eos,
                 source_reverse,
                 random_seed,
                 num_buckets,
                 src_max_len=None,
                 tgt_max_len=None,
                 num_threads=4,
                 output_buffer_size=None,
                 skip_count=None):
    """Build an initializable iterator over padded source/target id batches.

    The two line datasets are zipped, optionally skipped, shuffled, split
    into tokens, filtered for empty sides, optionally truncated / reversed,
    mapped to vocabulary ids and finally padded-batched (bucketed by length
    when `num_buckets > 1`).

    Args:
        src_dataset, tgt_dataset: datasets of source / target text lines.
        src_vocab_table, tgt_vocab_table: lookup tables from token to id.
        batch_size: number of pairs per batch (also the bucketing window).
        sos, eos: start / end-of-sentence token strings.
        source_reverse: when truthy, reverse each source token sequence.
        random_seed: seed passed to the dataset shuffle.
        num_buckets: bucket by length when greater than 1.
        src_max_len, tgt_max_len: optional truncation lengths.
        num_threads: parallelism passed to every `map` stage.
        output_buffer_size: shuffle / map buffer size; defaults to
            `batch_size * 1000` when falsy.
        skip_count: optional number of leading pairs to skip.

    Returns:
        A `BatchedInput` holding the iterator initializer and the batch
        tensors (source ids, target input ids, target output ids, source
        lengths, target lengths).
    """
    if not output_buffer_size:
        output_buffer_size = batch_size * 1000

    src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
    tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32)
    tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32)
    print("src_dataset", src_dataset)

    def parallel_map(dataset, fn):
        # Every map stage shares the same threading / buffering settings.
        return dataset.map(fn,
                           num_threads=num_threads,
                           output_buffer_size=output_buffer_size)

    pairs = tf.contrib.data.Dataset.zip((src_dataset, tgt_dataset))
    if skip_count is not None:
        pairs = pairs.skip(skip_count)
    pairs = pairs.shuffle(output_buffer_size, random_seed)

    def tokenize(src, tgt):
        return (tf.string_split([src]).values, tf.string_split([tgt]).values)

    pairs = parallel_map(pairs, tokenize)

    # Drop pairs in which either side has no tokens.
    def both_non_empty(src, tgt):
        return tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0)

    pairs = pairs.filter(both_non_empty)

    if src_max_len:
        pairs = parallel_map(pairs,
                             lambda src, tgt: (src[:src_max_len], tgt))
    if tgt_max_len:
        pairs = parallel_map(pairs,
                             lambda src, tgt: (src, tgt[:tgt_max_len]))
    if source_reverse:
        pairs = parallel_map(
            pairs, lambda src, tgt: (tf.reverse(src, axis=[0]), tgt))

    # Token strings -> int32 ids; out-of-vocabulary tokens receive whatever
    # default value the lookup table was built with.
    def to_ids(src, tgt):
        return (tf.cast(src_vocab_table.lookup(src), tf.int32),
                tf.cast(tgt_vocab_table.lookup(tgt), tf.int32))

    pairs = parallel_map(pairs, to_ids)

    # Decoder input is the target prefixed with <sos>; decoder output is the
    # target suffixed with <eos>.
    def add_sos_eos(src, tgt):
        tgt_in = tf.concat(([tgt_sos_id], tgt), 0)
        tgt_out = tf.concat((tgt, [tgt_eos_id]), 0)
        return (src, tgt_in, tgt_out)

    pairs = parallel_map(pairs, add_sos_eos)

    # Append the lengths: number of source ids and number of ids in tgt_in.
    def add_lengths(src, tgt_in, tgt_out):
        return (src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))

    pairs = parallel_map(pairs, add_lengths)
    print("source target dataset", pairs)

    def pad_and_batch(dataset):
        # Three variable-length id vectors followed by two scalar lengths.
        shapes = (
            tf.TensorShape([None]),  # src
            tf.TensorShape([None]),  # tgt_input
            tf.TensorShape([None]),  # tgt_output
            tf.TensorShape([]),  # src_len
            tf.TensorShape([]),  # tgt_len
        )
        # Sequences are padded with the eos ids; the scalar entries are never
        # padded, so their padding value is unused.
        pads = (
            src_eos_id,  # src
            tgt_eos_id,  # tgt_input
            tgt_eos_id,  # tgt_output
            0,  # src_len -- unused
            0,  # tgt_len -- unused
        )
        return dataset.padded_batch(batch_size,
                                    padded_shapes=shapes,
                                    padding_values=pads)

    if num_buckets > 1:

        def bucket_of(_src, _tgt_in, _tgt_out, src_len, tgt_len):
            # Bucket width follows the maximum source length when known,
            # otherwise a fixed width of 10 is used.
            if src_max_len:
                bucket_width = (src_max_len + num_buckets - 1) // num_buckets
            else:
                bucket_width = 10

            print("src_len", tf.to_int64(src_len), "bucket_width",
                  bucket_width)
            # A pair is bucketed by the longer of its two sides; the id is
            # capped at num_buckets.
            longest_side = tf.maximum(src_len // bucket_width,
                                      tgt_len // bucket_width)
            return tf.to_int64(tf.minimum(num_buckets, longest_side))

        def batch_window(_key, window):
            return pad_and_batch(window)

        batched_dataset = pairs.group_by_window(key_func=bucket_of,
                                                reduce_func=batch_window,
                                                window_size=batch_size)
    else:
        print("num_buckets", num_buckets)
        batched_dataset = pad_and_batch(pairs)

    batched_iter = batched_dataset.make_initializable_iterator()
    batch = batched_iter.get_next()
    src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len = batch
    print("src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len",
          src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len)
    return BatchedInput(initializer=batched_iter.initializer,
                        source=src_ids,
                        target_input=tgt_input_ids,
                        target_output=tgt_output_ids,
                        source_sequence_length=src_seq_len,
                        target_sequence_length=tgt_seq_len)
Example #51
0
def model_fn(features, labels, mode, params):
    """Estimator model function: character-level GRU regressing 3 values.

    The input strings are split into characters, one-hot encoded against a
    fixed " a-z" vocabulary, run through a stack of GRU cells, and the last
    relevant RNN output is fed through sigmoid layers ending in a 3-unit
    "color" prediction.

    Args:
        features: dict with key "input" holding the batch of input strings.
        labels: target tensor compared against the 3-unit predictions
            (only used in TRAIN / EVAL).
        mode: one of the `tf.estimator.ModeKeys` values.
        params: dict with keys "rnn_dropout", "rnn_cells", "output_cells",
            "output_dropout", "learning_rate" and "grad_clip".

    Returns:
        A `tf.estimator.EstimatorSpec` with predictions, export outputs and,
        depending on the mode, the loss and the train op.
    """
    text = features["input"]
    # ModeKeys are plain strings, so they are compared by equality.
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # TODO: put this in a file and use index_table_from_file instead (set up graph using init op)
    vocabulary = tf.constant(list(" abcdefghijklmnopqrstuvwxyz"), name="vocab")

    # use the vocabulary lookup table
    vocab = tf.contrib.lookup.index_table_from_tensor(vocabulary)

    # split input strings into characters
    with tf.name_scope("encoder"):
        split = tf.string_split(text, delimiter='')
        # for each character, lookup the index
        encoded = vocab.lookup(split)

        # Padding positions are filled with -1, which tf.one_hot turns into
        # an all-zero row.
        dense_encoding = tf.sparse_tensor_to_dense(encoded, default_value=-1)
        one_hot = tf.one_hot(dense_encoding, vocabulary.get_shape()[0])

        # Summing the one-hot rows counts the non-padding characters.
        # TODO: better way of computing sequence lengths in the graph?
        lengths = tf.cast(tf.reduce_sum(one_hot, reduction_indices=[1, 2]),
                          tf.int32)

    def rnn_layer(size):
        """Return a GRU cell, wrapped in output dropout while training."""
        keep_prob = 1.0 - params["rnn_dropout"]
        cell = tf.contrib.rnn.GRUCell(size)
        if keep_prob < 1.0 and is_training:
            cell = tf.contrib.rnn.DropoutWrapper(cell,
                                                 output_keep_prob=keep_prob)
        return cell

    rnn_cell_sizes = params["rnn_cells"]
    rnn_layers = [rnn_layer(size) for size in rnn_cell_sizes]

    multi_rnn_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
    rnn_raw_out, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=one_hot,
                                       sequence_length=lengths,
                                       dtype=tf.float32,
                                       scope="rnn_layers")

    with tf.name_scope("rnn_output_relevant"):
        # get the last relevant output from the rnn outputs
        batch = tf.range(0, tf.shape(rnn_raw_out)[0])  # generate 0->batch_size
        # pair every batch index with the index of its last valid time step
        coordinates = tf.stack([batch, lengths - 1], 1)
        rnn_out = tf.gather_nd(rnn_raw_out, coordinates)

    # output sigmoid layers
    output_cell_sizes = params["output_cells"]
    output_dropout = params["output_dropout"]

    def output_layer(last_layer, last_layer_size, layer_size, dropout):
        """Fully connected sigmoid layer; dropout is applied only in training."""
        W = tf.Variable(tf.random_uniform((last_layer_size, layer_size), -1,
                                          1),
                        dtype=tf.float32,
                        name="W")
        b = tf.Variable(tf.random_uniform((1, layer_size), -1, 1), name="b")
        output = tf.sigmoid(tf.matmul(last_layer, W) + b)
        keep_prob = 1.0 - dropout
        # Dropout must not run in EVAL / PREDICT, otherwise evaluation
        # metrics and served predictions become random. keep_prob >= 1 is a
        # no-op and keep_prob <= 0 is invalid, so both are skipped.
        if is_training and 0.0 < keep_prob < 1.0:
            output = tf.nn.dropout(output, keep_prob)

        return output

    last_layer = rnn_out
    last_layer_size = rnn_cell_sizes[-1]
    with tf.name_scope("output_layers"):
        for idx, layer_size in enumerate(output_cell_sizes):
            with tf.name_scope("layer" + str(idx)):
                last_layer = output_layer(last_layer, last_layer_size,
                                          layer_size, output_dropout)
                last_layer_size = layer_size

        # final prediction output (never uses dropout)
        with tf.name_scope("final"):
            predictions = output_layer(last_layer, last_layer_size, 3, 0.0)

    # predict
    predictions_dict = {
        "color": predictions,
    }

    # export outputs
    exports_dict = {
        "color": tf.estimator.export.PredictOutput(predictions_dict),
    }

    loss = None
    train_op = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Per-example Euclidean distance between labels and predictions,
        # averaged over the batch.
        distance = tf.sqrt(
            tf.reduce_sum(tf.square(tf.subtract(labels, predictions)), axis=1))
        loss = tf.reduce_sum(distance) / (tf.cast(
            tf.shape(text)[0], tf.float32))

        # metrics for each mode (train, eval)
        tf.summary.scalar("loss/" + mode, loss)
        tf.summary.histogram("loss/" + mode, distance)

    if is_training:
        learning_rate = params["learning_rate"]

        # set up optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # clip gradients to help with exploding gradients in RNN's;
        # variables that received no gradient (None) are left out because
        # they can be neither clipped nor summarized.
        grad_clip = params["grad_clip"]
        grads_and_vars = [(tf.clip_by_value(g, -grad_clip, grad_clip), v)
                          for g, v in optimizer.compute_gradients(loss)
                          if g is not None]

        # train
        global_step = tf.train.get_global_step()
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        # training summaries (picked up by tf.Estimator)
        tf.summary.scalar("learning_rate", learning_rate)
        for g, v in grads_and_vars:
            tf.summary.histogram("grads/" + v.name.replace(":", "_"), g)
        for v in tf.trainable_variables():
            tf.summary.histogram("vars/" + v.name.replace(":", "_"), v)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions_dict,
        export_outputs=exports_dict,
        loss=loss,
        train_op=train_op,
    )
Example #52
0
    def __init__(self, data_path, filenames_file, params, dataset, mode):
        """Build the queue-based input pipeline for the given mode.

        Each line of `filenames_file` is split on whitespace. In 'test' mode
        only the first field (left image path) is used. In every other mode
        fields 0-3 are image paths (left, right, next left, next right),
        fields 4-10 are seven camera parameters and fields 11 and 12 are the
        original image height and width.

        In 'train' mode the four images are randomly flipped / augmented and
        shuffled into batches; in 'test' mode the batch is the left image
        stacked with its horizontally flipped copy.
        """
        self.data_path = data_path
        self.params = params
        self.dataset = dataset
        self.mode = mode

        # Outputs stay None unless the branch for the current mode sets them.
        self.left_image_batch = None
        self.right_image_batch = None
        self.left_next_image_batch = None
        self.right_next_image_batch = None
        self.cam_params_batch = None

        filename_queue = tf.train.string_input_producer([filenames_file],
                                                        shuffle=False)
        _, line = tf.TextLineReader().read(filename_queue)
        fields = tf.string_split([line]).values

        def full_path(column):
            # Paths in the list file are relative to data_path.
            return tf.string_join([self.data_path, fields[column]])

        # we load only one image for test, except if we trained a stereo model
        if mode == 'test':
            left_image_o = self.read_image(full_path(0))
        else:
            left_path = full_path(0)
            right_path = full_path(1)
            left_next_path = full_path(2)
            right_next_path = full_path(3)
            cam_params = tf.string_to_number(fields[4:11])
            height_o = tf.string_to_number(fields[11])
            width_o = tf.string_to_number(fields[12])

            left_image_o = self.read_image(left_path)
            right_image_o = self.read_image(right_path)
            left_next_image_o = self.read_image(left_next_path)
            right_next_image_o = self.read_image(right_next_path)

            # Camera parameters become a flat vector of the 7 parsed values
            # followed by the height and width ratios (target / original).
            cam_params = tf.expand_dims(tf.reshape(cam_params, [7]), 0)
            h_tensor = tf.expand_dims(
                tf.cast(tf.constant([self.params.height]), tf.float32), 0)
            w_tensor = tf.expand_dims(
                tf.cast(tf.constant([self.params.width]), tf.float32), 0)
            scaled = tf.concat(
                [cam_params, h_tensor / height_o, w_tensor / width_o], 1)
            cam_params = tf.squeeze(scaled)

        if mode == 'train':
            # One random draw decides the flip for all four images.
            do_flip = tf.random_uniform([], 0, 1)

            def flipped_or_original(opposite_side, same_side):
                # Flipping mirrors the opposite camera's image so that the
                # left / right roles stay consistent.
                return tf.cond(
                    do_flip > 0.5,
                    lambda: tf.image.flip_left_right(opposite_side),
                    lambda: same_side)

            left_image = flipped_or_original(right_image_o, left_image_o)
            right_image = flipped_or_original(left_image_o, right_image_o)
            left_next_image = flipped_or_original(right_next_image_o,
                                                  left_next_image_o)
            right_next_image = flipped_or_original(left_next_image_o,
                                                   right_next_image_o)

            # A second random draw decides whether to augment the images.
            do_augment = tf.random_uniform([], 0, 1)

            def augmented():
                return self.augment_image_pair(left_image, right_image,
                                               left_next_image,
                                               right_next_image)

            def unchanged():
                return (left_image, right_image, left_next_image,
                        right_next_image)

            (left_image, right_image, left_next_image,
             right_next_image) = tf.cond(do_augment > 0.5, augmented,
                                         unchanged)

            # Static shapes are required by the batching queue.
            for image in (left_image, right_image, left_next_image,
                          right_next_image):
                image.set_shape([self.params.height, self.params.width, 3])

            # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size
            min_after_dequeue = 2048
            capacity = min_after_dequeue + 4 * params.batch_size
            batches = tf.train.shuffle_batch(
                [
                    left_image, right_image, left_next_image,
                    right_next_image, cam_params
                ],
                batch_size=params.batch_size,
                capacity=capacity,
                min_after_dequeue=min_after_dequeue,
                num_threads=params.num_threads)
            (self.left_image_batch, self.right_image_batch,
             self.left_next_image_batch, self.right_next_image_batch,
             self.cam_params_batch) = batches

        elif mode == 'test':
            # The test batch is the image and its horizontal mirror.
            mirrored = tf.image.flip_left_right(left_image_o)
            self.left_image_batch = tf.stack([left_image_o, mirrored], 0)
            self.left_image_batch.set_shape([2, None, None, 3])
Example #53
0
# source_query_tokens_input = tf.expand_dims(source_query_tokens_ph, 0)
# source_query_len_input = tf.expand_dims(source_query_len_ph, 0)
# source_candidate_tokens_input = tf.expand_dims(source_candidate_tokens_ph, 0)
# source_candidate_len_input = tf.expand_dims(source_candidate_len_ph,0)

# source_query_tokens_ph = tf.string_split(source_query_tokens_ph, " ")
# source_query_len_ph = tf.string_split(source_query_len_ph, " ")
# source_candidate_tokens_ph = tf.string_split(source_candidate_tokens_ph, " ")
# source_candidate_len_ph = tf.string_split(source_candidate_len_ph, " ")

# Prepare the model inputs from the four source tensors (presumably
# placeholders, judging by the `_ph` suffix -- confirm against their
# definition). Each one first gets a new leading axis at position 0.
source_query_tokens_input = tf.expand_dims(source_query_tokens_ph, 0)
source_query_len_input = tf.expand_dims(source_query_len_ph, 0)
source_candidate_tokens_input = tf.expand_dims(source_candidate_tokens_ph, 0)
source_candidate_len_input = tf.expand_dims(source_candidate_len_ph, 0)

# Split the query string(s) into tokens using tf.string_split's default
# delimiter, then densify the sparse result, padding short rows with "".
source_query_tokens_input = tf.string_split(source_query_tokens_input)
source_query_tokens_input = tf.sparse_tensor_to_dense(
    source_query_tokens_input, default_value="")
# source_query_len_input = tf.string_split(source_query_len_input)
# Same split-and-densify treatment for the candidate string(s); the two
# length tensors are left as expanded above.
source_candidate_tokens_input = tf.string_split(source_candidate_tokens_input)
source_candidate_tokens_input = tf.sparse_tensor_to_dense(
    source_candidate_tokens_input, default_value="")
# source_candidate_len_input = tf.string_split(source_candidate_len_input)

model(
    features={
        # "source_tokens": source_query_tokens_ph,
        # "source_len": source_query_len_ph,
        # "source_candidate_tokens": source_candidate_tokens_ph,
        # "source_candidate_len": source_candidate_len_ph
    def _realize_mappings(self):
        """Turn the raw feature columns of `self.X` into wide and deep inputs.

        Every column is split on "|" into a sparse tensor, grouped by the
        side ("wide" or not) of its tag, run through
        `tf.feature_column.input_layer`, and the result is split into
        `self.wide_inputs` and `self.deep_inputs`.
        """
        with tf.device('/cpu:0'), tf.variable_scope('word_embedding'):
            # One tensor per feature column (unstacked along axis 1).
            columns = tf.unstack(self.X, axis=1)

            def to_sparse(column, tag):
                pieces = tf.string_split(column, "|")
                # For the "custom" tag the string values are replaced by the
                # indices looked up in the tag's table; other tags keep the
                # raw split values.
                if tag.tag_name == "custom":
                    values = tag.table.lookup(pieces.values)
                else:
                    values = pieces.values
                return tf.SparseTensor(indices=pieces.indices,
                                       values=values,
                                       dense_shape=pieces.dense_shape)

            # First pass: wide-side features only.
            wide_mappings, wide_tensors = {}, []
            for column, tag in zip(columns, self.tags):
                if tag.wide_or_deep_side == "wide":
                    wide_mappings[tag.tag_name] = to_sparse(column, tag)
                    wide_tensors.append(tag.embedding_res)

            # Second pass: everything that is not on the wide side.
            deep_mappings, deep_tensors = {}, []
            for column, tag in zip(columns, self.tags):
                if tag.wide_or_deep_side == "wide":
                    continue
                sparse = to_sparse(column, tag)
                deep_mappings[tag.tag_name] = sparse
                deep_tensors.append(tag.embedding_res)

                # A sibling tag reuses the same sparse input with its own
                # embedding column.
                sibling = tag.sibling
                if sibling is not None:
                    deep_mappings[sibling.tag_name] = sparse
                    deep_tensors.append(sibling.embedding_res)

            # Wide entries first; deep entries win on a name clash.
            mappings = dict(wide_mappings)
            mappings.update(deep_mappings)
            tensors = wide_tensors + deep_tensors

            wide_and_deep_embedding_res = tf.feature_column.input_layer(
                mappings, tensors)
            print("batch_embedding_res.shape = ",
                  wide_and_deep_embedding_res.shape)

            side_sizes = [
                self.wide_side_dimension_size, self.deep_side_dimension_size
            ]
            wide_part, deep_part = tf.split(wide_and_deep_embedding_res,
                                            side_sizes, 1)

            self.wide_inputs = tf.reshape(
                wide_part, [self.batch_size, self.wide_side_dimension_size])
            self.deep_inputs = tf.reshape(
                deep_part, [self.batch_size, self.deep_side_dimension_size])
            print("wide_inputs.shape = ", self.wide_inputs.shape)
            print("deep_inputs.shape = ", self.deep_inputs.shape)
            '''
Example #55
0
def vectorize(string, vocab, seq_len):
    """Tokenize `string`, map the tokens through `vocab`, keep `seq_len` ids.

    The string is split with tf.string_split's default delimiter; the
    looked-up ids are truncated to at most `seq_len` entries (no padding).
    """
    tokens = tf.string_split([string]).values
    token_ids = vocab.lookup(tokens)
    return token_ids[:seq_len]
Example #56
0
def decode_p1(line):
    """Split `line` on ',' and wrap the fields as a private input.

    The fields are registered through `rtt.PrivateInput` with
    `data_owner=1`.
    """
    csv_fields = tf.string_split([line], ',').values
    return rtt.PrivateInput(csv_fields, data_owner=1)
def ctc_loss(prob,
             labels,
             input_shape,
             alphabet,
             alphabet_codes,
             batch_size,
             n_pools=2 * 2,
             decode=True):
    """Compute the CTC loss and, optionally, decoded words and metrics.

    Args:
        prob: network output passed to `tf.nn.ctc_loss` with
            `time_major=True` (presumably [time, batch, classes] -- confirm
            against the caller).
        labels: batch of label strings.
        input_shape: input image shape; `input_shape[1]` is used as the
            width from which the sequence length is derived.
        alphabet: iterable of characters.
        alphabet_codes: integer code for each character of `alphabet`.
        batch_size: number of samples in the batch.
        n_pools: total width reduction factor applied by the network.
        decode: when True, also run beam-search decoding and build the
            evaluation ops.

    Returns:
        Tuple `(loss_ctc, words, pred_score, CER, accuracy)`. When `decode`
        is False the last four entries are None.
    """
    # Compute seq_len from image width
    # 2x2 pooling in dimension W on layer 1 and 2 -> n-pools = 2*2
    seq_len_inputs = tf.divide(
        [input_shape[1]] * batch_size, n_pools, name='seq_len_input_op') - 1

    # Convert string labels to sparse integer-code labels.
    with tf.name_scope('str2code_conversion'):
        table_str2int = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(
                list(alphabet), alphabet_codes), -1)
        # TODO change string split to utf8 split in next tf version
        splited = tf.string_split(labels, delimiter='')
        codes = table_str2int.lookup(splited.values)
        sparse_code_target = tf.SparseTensor(splited.indices, codes,
                                             splited.dense_shape)

    # Number of label characters per sample, counted from the row indices of
    # the sparse target.
    seq_lengths_labels = tf.bincount(tf.cast(sparse_code_target.indices[:, 0],
                                             tf.int32),
                                     minlength=tf.shape(prob)[1])

    # NOTE(review): the control dependency below only evaluates a comparison;
    # it does not raise when labels are longer than the predictions. Such
    # samples are handled by ignore_longer_outputs_than_inputs=True instead.
    with tf.control_dependencies([
            tf.less_equal(sparse_code_target.dense_shape[1],
                          tf.reduce_max(tf.cast(seq_len_inputs, tf.int64)))
    ]):
        loss_ctc = tf.nn.ctc_loss(
            labels=sparse_code_target,
            inputs=prob,
            sequence_length=tf.cast(seq_len_inputs, tf.int32),
            preprocess_collapse_repeated=False,
            ctc_merge_repeated=True,
            # returns zero gradient in case it happens -> ema loss = NaN
            ignore_longer_outputs_than_inputs=True,
            time_major=True)
        loss_ctc = tf.reduce_mean(loss_ctc)

    if decode:
        with tf.name_scope('code2str_conversion'):
            table_int2str = tf.contrib.lookup.HashTable(
                tf.contrib.lookup.KeyValueTensorInitializer(
                    tf.cast(alphabet_codes, tf.int64), list(alphabet)), '?')

            sparse_code_pred, log_probability = tf.nn.ctc_beam_search_decoder(
                prob,
                sequence_length=tf.cast(seq_len_inputs, tf.int32),
                merge_repeated=False,
                beam_width=100,
                top_paths=2)
            # Score: log-probability margin between the two best paths.
            pred_score = tf.subtract(log_probability[:, 0],
                                     log_probability[:, 1])

            # Only the best path is decoded into words.
            sparse_code_pred = sparse_code_pred[0]

            sequence_lengths_pred = tf.bincount(
                tf.cast(sparse_code_pred.indices[:, 0], tf.int32),
                minlength=tf.shape(prob)[1])

            pred_chars = table_int2str.lookup(sparse_code_pred)
            words = get_words_from_chars(
                pred_chars.values, sequence_lengths=sequence_lengths_pred)

        with tf.name_scope('evaluation'):
            # NOTE(review): this streaming metric is immediately shadowed by
            # the plain mean below. It is kept so that the graph (local
            # variables and op names) stays unchanged.
            CER = tf.metrics.mean(tf.edit_distance(
                sparse_code_pred, tf.cast(sparse_code_target, dtype=tf.int64)),
                                  name='CER')
            CER = tf.reduce_mean(tf.edit_distance(
                sparse_code_pred, tf.cast(sparse_code_target, dtype=tf.int64)),
                                 name='CER')

            # Convert label codes to decoding alphabet to compare predicted
            # and ground-truth words.
            target_chars = table_int2str.lookup(
                tf.cast(sparse_code_target, tf.int64))
            target_words = get_words_from_chars(target_chars.values,
                                                seq_lengths_labels)
            accuracy = tf.metrics.accuracy(target_words,
                                           words,
                                           name='accuracy')
    else:
        # Without decoding there are no words, score or metrics. They must
        # still be bound, otherwise the return statement raises
        # UnboundLocalError.
        words = None
        pred_score = None
        CER = None
        accuracy = None

    return loss_ctc, words, pred_score, CER, accuracy
 def extract_char(token, default_value="<PAD>"):
     """Split every string in `token` into characters as a dense tensor.

     Rows shorter than the longest one are filled with `default_value`.
     """
     sparse_chars = tf.string_split(token, delimiter='')
     return tf.sparse.to_dense(sparse_chars, default_value=default_value)
def extract_raw_value(padded):
    """Return the first tab-separated field of every string in `padded`.

    Fields are produced by `tf.string_split` with a tab delimiter and
    densified with "" as filler before column 0 is selected.
    """
    fields = tf.sparse.to_dense(tf.string_split(padded, delimiter="\t"),
                                default_value="")
    return fields[:, 0]
Example #60
0
def train_input_fn(hparams):
    """Build the training dataset of (features dict, label) batches.

    Reads `hparams.train_file` (skipping its header line), parses each line
    with `parse_line`, tokenizes question and answer, drops pairs with an
    empty side, truncates to `hparams.max_question_len` /
    `hparams.max_answer_len`, maps tokens to ids, pads via
    `padding_string_sequence`, then shuffles, repeats and padded-batches
    with `hparams.batch_size`.

    Returns:
        The batched `tf.data` dataset; each element is a tuple of a feature
        dict ("question", "question_len", "answer", "answer_len") and the
        label.
    """
    # create Dataset by train file and skip header line
    dataset = tf.data.TextLineDataset(hparams.train_file).skip(1)
    # parse csv
    dataset = dataset.map(parse_line)

    def tokenize(question, answer, label):
        return (tf.string_split([question]).values,
                tf.string_split([answer]).values,
                label)

    dataset = dataset.map(tokenize)

    # keep only pairs where both question and answer have tokens
    def both_non_empty(question, answer, label):
        return tf.logical_and(tf.size(question) > 0, tf.size(answer) > 0)

    dataset = dataset.filter(both_non_empty)

    def truncate(question, answer, label):
        return (question[:hparams.max_question_len],
                answer[:hparams.max_answer_len],
                label)

    dataset = dataset.map(truncate)

    # convert word strings to ids
    vocab_table = create_vocab_table(hparams.vocabulary_file)

    def to_ids(question, answer, label):
        return (vocab_table.lookup(question),
                vocab_table.lookup(answer),
                label)

    dataset = dataset.map(to_ids)

    # add in question and answer sequence length
    def with_lengths(question, answer, label):
        return (question, tf.size(question), answer, tf.size(answer), label)

    dataset = dataset.map(with_lengths)

    # padding question and answer
    dataset = dataset.map(padding_string_sequence)

    # convert to features dict and label tuple
    def to_features(question, question_len, answer, answer_len, label):
        features = {
            "question": question,
            "question_len": question_len,
            "answer": answer,
            "answer_len": answer_len,
        }
        return (features, label)

    dataset = dataset.map(to_features)

    # shuffle and repeat
    dataset = dataset.shuffle(1000).repeat()

    # padded batch as question and answer have varying size
    feature_shapes = {
        "question": tf.TensorShape([None]),
        "question_len": tf.TensorShape([]),
        "answer": tf.TensorShape([None]),
        "answer_len": tf.TensorShape([]),
    }
    dataset = dataset.padded_batch(
        hparams.batch_size,
        padded_shapes=(feature_shapes, tf.TensorShape([])))

    return dataset