def init(self): # init self.global_step = global_step = tf.Variable(0, trainable=False, name='global_step') self.learning_rate = learning_rate = tf.train.exponential_decay(1e-2, global_step, 500, 0.95, staircase=True) # Load classes src_table = tf.contrib.lookup.index_table_from_file('./iwslt15/vocab.en', default_value=0) tgt_table = tf.contrib.lookup.index_table_from_file('./iwslt15/vocab.vi', default_value=0) #src_table_size = src_table.size() #tgt_table_size = tgt_table.size() src_table_size = 17191 tgt_table_size = 7709 src_eos_id = tf.cast(src_table.lookup(tf.constant('</s>')), tf.int64) self.tgt_eos_id = tgt_eos_id = tf.cast(tgt_table.lookup(tf.constant('</s>')), tf.int64) self.tgt_sos_id = tgt_sos_id = tf.cast(tgt_table.lookup(tf.constant('<s>')), tf.int64) # file placeholder src_files = tf.placeholder(tf.string, shape=[None]) tgt_files = tf.placeholder(tf.string, shape=[None]) # Read data src_dataset = tf.contrib.data.TextLineDataset(src_files) tgt_dataset = tf.contrib.data.TextLineDataset(tgt_files) # Convert data to word indices src_dataset = src_dataset.map(lambda string: tf.concat([['<s>'], tf.string_split([string]).values, ['</s>']], 0)) src_dataset = src_dataset.map(lambda words: (words, tf.size(words))) src_dataset = src_dataset.map(lambda words, size: (src_table.lookup(words), size)) tgt_dataset = tgt_dataset.map(lambda string: tf.concat([['<s>'], tf.string_split([string]).values, ['</s>']], 0)) tgt_dataset = tgt_dataset.map(lambda words: (words, tf.size(words))) tgt_dataset = tgt_dataset.map(lambda words, size: (tgt_table.lookup(words), size)) # zip data dataset = tf.contrib.data.Dataset.zip((src_dataset, tgt_dataset)) # batch batched_dataset = dataset.padded_batch(self.batch_size, padded_shapes=((tf.TensorShape([None]), tf.TensorShape([])),(tf.TensorShape([None]), tf.TensorShape([]))), padding_values=((src_eos_id, 0), (tgt_eos_id, 0))) batched_iterator = batched_dataset.make_initializable_iterator() ((source, source_lengths), (target, target_lengths)) = batched_iterator.get_next() self.target = target self.target_lengths = target_lengths self.source_lengths = source_lengths # Load embedding (dic limits to 100000) src_embed = tf.Variable(tf.random_normal([100000, self.embed_vector_size], stddev=0.1)) self.tgt_embed = tgt_embed = tf.Variable(tf.random_normal([100000, self.embed_vector_size], stddev=0.1)) self.src_lookup = src_lookup = tf.nn.embedding_lookup(src_embed, source) self.tgt_lookup = tgt_lookup = tf.nn.embedding_lookup(tgt_embed, target) # Projection Layer self.projection_layer = projection_layer = layers_core.Dense(tgt_table_size) return batched_iterator, src_files, tgt_files
def decode_libsvm(line): columns = tf.string_split([line], ' ') labels = tf.string_to_number(columns.values[0], out_type=tf.float32) splits = tf.string_split(columns.values[1:], ':') id_vals = tf.reshape(splits.values,splits.dense_shape) feat_ids, feat_vals = tf.split(id_vals,num_or_size_splits=2,axis=1) feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32) feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32) return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels
def create_char_vectors_from_post(self, raw_post, mxlen): char2index = self.index if self.do_lowercase: raw_post = self.lowercase(raw_post) raw_post = tf.string_split(tf.reshape(raw_post, [-1])) culled_word_token_vals = tf.substr(raw_post.values, 0, self.mxwlen) char_tokens = tf.string_split(culled_word_token_vals, delimiter='') char_indices = char2index.lookup(char_tokens) return self.reshape_indices(char_indices, [mxlen, self.mxwlen])
def __init__(self, args, txt_file, num_classes, mode, batch_size, num_preprocess_threads=1, shuffle=True, min_queue_examples=1): self.args = args self.txt_file = txt_file self.num_preprocess_threads = num_preprocess_threads self.min_queue_examples = min_queue_examples self.batch_size = batch_size self.mode = mode self.imgShape = [self.args.imageHeight, self.args.imageWidth, self.args.imageChannels] self.maskShape = tf.stack([self.args.imageHeight, self.args.imageWidth]) self.num_classes = int(num_classes) input_queue = tf.train.string_input_producer([txt_file], shuffle=False) line_reader = tf.TextLineReader() _, line = line_reader.read(input_queue) split_line = tf.string_split([line]).values if (mode == 'training' or mode == 'validation'): split_line = tf.string_split([line]).values rgb_image_path = split_line[0] label_image_path = split_line[1] self.image_o = self.read_image(rgb_image_path, 0) self.label_image_o = self.read_image(label_image_path, 1) do_flip = tf.random_uniform([], 0, 1) self.image = tf.cond(do_flip > 0.5, lambda: tf.image.flip_left_right(self.image_o), lambda: self.image_o) self.label_image = tf.cond(do_flip > 0.5, lambda: tf.image.flip_left_right(self.label_image_o), lambda: self.label_image_o) self.image.set_shape((self.args.imageHeight, self.args.imageWidth, 3)) self.label_image.set_shape((self.args.imageHeight, self.args.imageWidth, 1)) self.img_batch, self.label_batch = tf.train.shuffle_batch([self.image, self.label_image], batch_size=batch_size, num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size, min_after_dequeue=min_queue_examples) elif (mode == 'test'): print('Generating test Image Batch') split_line = tf.string_split([line]).values rgb_image_path = split_line[0] self.image = self.read_image(rgb_image_path, 0) self.image.set_shape((self.args.imageHeight, self.args.imageWidth, 3)) self.img_batch = tf.train.batch([self.image], batch_size=batch_size, num_threads=num_preprocess_threads, capacity=min_queue_examples + 1 * batch_size, )
def decode_libsvm(line): #columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS) #features = dict(zip(CSV_COLUMNS, columns)) #labels = features.pop(LABEL_COLUMN) columns = tf.string_split([line], ' ') labels = tf.string_to_number(columns.values[0], out_type=tf.float32) splits = tf.string_split(columns.values[1:], ':') id_vals = tf.reshape(splits.values,splits.dense_shape) feat_ids, feat_vals = tf.split(id_vals,num_or_size_splits=2,axis=1) feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32) feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32) return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels
def _parse_line(line): """ _parse_line """ line_arr = tf.string_split([line], '\t').values #print(line_arr[2]) Tensor("strided_slice:0", shape=(), dtype=string) user = line_arr[0] label = tf.string_to_number(line_arr[1], out_type=tf.int32) #print(tf.string_split([line_arr[2]]).values) Tensor("StringSplit_1:1", shape=(?,), dtype=string) features = {} features["words"] = tf.string_to_number(tf.string_split([line_arr[2]], ",").values, tf.int32) features["id"] = user return features, label
def _get_labels_builder(self, labels_file): labels_vocabulary = tf.contrib.lookup.index_table_from_file( self.labels_vocabulary_file, vocab_size=self.num_labels) dataset = tf.data.TextLineDataset(labels_file) process_fn = lambda x: { "tags": tf.string_split([x]).values, "tags_id": labels_vocabulary.lookup(tf.string_split([x]).values) } padded_shapes_fn = lambda: { "tags": [None], "tags_id": [None] } return dataset, process_fn, padded_shapes_fn
def get_predict_iterator(src_vocab_table, vocab_size, batch_size, max_len=max_sequence): pred_dataset = tf.contrib.data.TextLineDataset(pred_file) pred_dataset = pred_dataset.map( lambda src: tf.string_split([src]).values) if max_len: pred_dataset = pred_dataset.map(lambda src: src[:max_sequence]) pred_dataset = pred_dataset.map( lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32)) pred_dataset = pred_dataset.map(lambda src: (src, tf.size(src))) def batching_func(x): return x.padded_batch( batch_size, padded_shapes=(tf.TensorShape([None]), # src tf.TensorShape([])), # src_len padding_values=(vocab_size+1, # src 0)) # src_len -- unused batched_dataset = batching_func(pred_dataset) batched_iter = batched_dataset.make_initializable_iterator() (src_ids, src_seq_len) = batched_iter.get_next() # 这里target_input在预测的时候不需要,但是不能返回None否则报错。这里则用个placeholder代替,但是仍然不会用到。 WAHTEVER = 10 fake_tag = tf.placeholder(tf.int32, [None, WAHTEVER]) return BatchedInput( initializer=batched_iter.initializer, source=src_ids, target_input=fake_tag, source_sequence_length=src_seq_len, target_sequence_length=src_seq_len)
def lowercase(self, raw_post): split_chars = tf.string_split(tf.reshape(raw_post, [-1]), delimiter="").values upchar_inds = self.upchars_lut.lookup(split_chars) return tf.reduce_join(tf.map_fn(lambda x: tf.cond(x[0] > 25, lambda: x[1], lambda: self.lchars[x[0]]), (upchar_inds, split_chars), dtype=tf.string))
def _decode_and_resize(image_tensor): """Decodes jpeg string, resizes it and returns a uint8 tensor.""" # These constants are set by Inception v3's expectations. height = 299 width = 299 channels = 3 image_tensor = tf.where(tf.equal(image_tensor, ''), IMAGE_DEFAULT_STRING, image_tensor) # Fork by whether image_tensor value is a file path, or a base64 encoded string. slash_positions = tf.equal(tf.string_split([image_tensor], delimiter="").values, '/') is_file_path = tf.cast(tf.count_nonzero(slash_positions), tf.bool) # The following two functions are required for tf.cond. Note that we can not replace them # with lambda. According to TF docs, if using inline lambda, both branches of condition # will be executed. The workaround is to use a function call. def _read_file(): return tf.read_file(image_tensor) def _decode_base64(): return tf.decode_base64(image_tensor) image = tf.cond(is_file_path, lambda: _read_file(), lambda: _decode_base64()) image = tf.image.decode_jpeg(image, channels=channels) image = tf.expand_dims(image, 0) image = tf.image.resize_bilinear(image, [height, width], align_corners=False) image = tf.squeeze(image, squeeze_dims=[0]) image = tf.cast(image, dtype=tf.uint8) return image
def get_test_iterator(src_dataset, src_vocab_table, batch_size, config): src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(config.eos)), tf.int32) src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values) src_dataset = src_dataset.map(lambda src: src[:config.src_max_len]) src_dataset = src_dataset.map( lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32)) if config.reverse_src: src_dataset = src_dataset.map(lambda src: tf.reverse(src, axis=[0])) src_dataset = src_dataset.map(lambda src: (src, tf.size(src))) def batching_func(x): return x.padded_batch( config.batch_size, padded_shapes=(tf.TensorShape([None]), tf.TensorShape([])), padding_values=(src_eos_id, 0)) batched_dataset = batching_func(src_dataset) batched_iter = batched_dataset.make_initializable_iterator() src_ids, src_seq_len = batched_iter.get_next() return BatchedInput( initializer=batched_iter.initializer, source=src_ids, target_input=None, target_output=None, source_sequence_length=src_seq_len, target_sequence_length=None)
def custom_fast_text(features, labels, mode, params): vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1) text = features[commons.FEATURE_COL] words = tf.string_split(text) dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD) word_ids = vocab_table.lookup(dense_words) padding = tf.constant([[0, 0], [0, commons.CNN_MAX_DOCUMENT_LENGTH]]) # Pad all the word_ids entries to the maximum document length word_ids_padded = tf.pad(word_ids, padding) word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.CNN_MAX_DOCUMENT_LENGTH]) if mode == tf.estimator.ModeKeys.TRAIN: tf.keras.backend.set_learning_phase(True) else: tf.keras.backend.set_learning_phase(False) embedded_sequences = tf.keras.layers.Embedding(params.N_WORDS, 20, input_length=commons.CNN_MAX_DOCUMENT_LENGTH)( word_id_vector) f1 = tf.keras.layers.GlobalMaxPooling1D()(embedded_sequences) logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1) predictions = tf.nn.sigmoid(logits) if mode == tf.estimator.ModeKeys.PREDICT: prediction_dict = { 'class': tf.cast(tf.map_fn(lambda x: tf.cond(x > 0.30, lambda: 1.0, lambda: 0.0), tf.squeeze(predictions)), dtype=tf.int32), } export_outputs = { 'predictions': tf.estimator.export.PredictOutput(prediction_dict) } return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs) loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels, logits=logits) tf.summary.scalar('loss', loss) acc = tf.equal(tf.cast(predictions, dtype=tf.int32), labels) acc = tf.reduce_mean(tf.cast(acc, tf.float32)) tf.summary.scalar('acc', acc) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.AdamOptimizer() train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss) if mode == tf.estimator.ModeKeys.EVAL: eval_metrics_ops = { 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions) } return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
def decode_libsvm(line): #columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS) #features = dict(zip(CSV_COLUMNS, columns)) #labels = features.pop(LABEL_COLUMN) columns = tf.string_split([line], ' ') labels = tf.string_to_number(columns.values[0], out_type=tf.float32) splits = tf.string_split(columns.values[1:], ':') id_vals = tf.reshape(splits.values,splits.dense_shape) feat_ids, feat_vals = tf.split(id_vals,num_or_size_splits=2,axis=1) feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32) feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32) #feat_ids = tf.reshape(feat_ids,shape=[-1,FLAGS.field_size]) #for i in range(splits.dense_shape.eval()[0]): # feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32)) # feat_vals.append(tf.string_to_number(splits.values[2*i+1])) #return tf.reshape(feat_ids,shape=[-1,field_size]), tf.reshape(feat_vals,shape=[-1,field_size]), labels return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels
def sparse_from_csv(csv): ids, post_tags_str = tf.decode_csv(csv, [[-1], [""]]) table = tf.contrib.lookup.index_table_from_tensor( mapping=TAG_SET, default_value=-1) ## 这里构造了个查找表 ## split_tags = tf.string_split(post_tags_str, "|") return tf.SparseTensor( indices=split_tags.indices, values=table.lookup(split_tags.values), ## 这里给出了不同值通过表查到的index ## dense_shape=split_tags.dense_shape)
def resize_sen(self, raw, mxlen): """ Splits and rejoins a string to ensure that tokens meet the required max len. """ raw_tokens = tf.string_split(tf.reshape(raw, [-1])).values # sentence length > mxlen raw_post = tf.reduce_join(raw_tokens[:mxlen], separator=" ") return raw_post
def testStringSplit(self): strings = ["pigs on the wing", "animals"] with self.test_session() as sess: tokens = tf.string_split(strings) indices, values, shape = sess.run(tokens) self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) self.assertAllEqual(shape, [2, 4])
def testStringSplitEmptyToken(self): strings = [" hello ", "", "world "] with self.test_session() as sess: tokens = tf.string_split(strings) indices, values, shape = sess.run(tokens) self.assertAllEqual(indices, [[0, 0], [2, 0]]) self.assertAllEqual(values, [b"hello", b"world"]) self.assertAllEqual(shape, [3, 1])
def has_no_question_marks(line): """Returns True if the line of text has no question marks.""" # split the line into an array of characters chars = tf.string_split(line[tf.newaxis], "").values # for each character check if it is a question mark is_question = tf.equal(chars, "?") any_question = tf.reduce_any(is_question) no_question = ~any_question return no_question
def create_word_vectors_from_post(self, raw_post, mxlen): # vocab has only lowercase words word2index = self.index if self.do_lowercase: raw_post = self.lowercase(raw_post) word_tokens = tf.string_split(tf.reshape(raw_post, [-1])) word_indices = word2index.lookup(word_tokens) # Reshape them out to the proper length reshaped_words = tf.sparse_reshape(word_indices, shape=[-1]) return self.reshape_indices(reshaped_words, [mxlen])
def _create_word_vectors_from_post_mixed_case(self, nraw_post, mxlen): # vocab has only lowercase words word_tokens = tf.string_split(tf.reshape(nraw_post, [-1])) word_indices = self.word2index.lookup(word_tokens) # Reshape them out to the proper length reshaped_words = tf.sparse_reshape(word_indices, shape=[-1]) x = self._reshape_indices(reshaped_words, [mxlen]) return x
def testStringSplitWithDelimiter(self): strings = ["hello|world", "hello world"] with self.test_session() as sess: self.assertRaises( ValueError, tf.string_split, strings, delimiter=["|", ""]) self.assertRaises(ValueError, tf.string_split, strings, delimiter=["a"]) tokens = tf.string_split(strings, delimiter="|") indices, values, shape = sess.run(tokens) self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]]) self.assertAllEqual(values, [b"hello", b"world", b"hello world"]) self.assertAllEqual(shape, [2, 2]) tokens = tf.string_split(strings, delimiter="| ") indices, values, shape = sess.run(tokens) self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]]) self.assertAllEqual(values, [b"hello", b"world", b"hello", b"world"]) self.assertAllEqual(shape, [2, 2])
def testStringSplitEmptyDelimiter(self): strings = ["hello", "hola"] with self.test_session() as sess: tokens = tf.string_split(strings, delimiter="") indices, values, shape = sess.run(tokens) self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1], [1, 2], [1, 3]]) self.assertAllEqual(values, [b"h", b"e", b"l", b"l", b"o", b"h", b"o", b"l", b"a"]) self.assertAllEqual(shape, [2, 5])
def preprocessing_fn(inputs): """Preprocessing function. Args: inputs: dictionary of raw input tensors Returns: A dictionary of transformed tensors """ stats = json.loads( file_io.read_file_to_string( os.path.join(output_dir, STATS_FILE)).decode()) result = {} for name, transform in six.iteritems(features): transform_name = transform['transform'] source_column = transform['source_column'] if transform_name == TARGET_TRANSFORM: if not keep_target: continue if file_io.file_exists(os.path.join(output_dir, VOCAB_ANALYSIS_FILE % source_column)): transform_name = 'one_hot' else: transform_name = 'identity' if transform_name == 'identity': result[name] = inputs[source_column] elif transform_name == 'scale': result[name] = _scale( inputs[name], min_x_value=stats['column_stats'][source_column]['min'], max_x_value=stats['column_stats'][source_column]['max'], output_min=transform.get('value', 1) * (-1), output_max=transform.get('value', 1)) elif transform_name in [ONE_HOT_TRANSFORM, MULTI_HOT_TRANSFORM]: vocab, ex_count = read_vocab_file( os.path.join(output_dir, VOCAB_ANALYSIS_FILE % source_column)) if transform_name == MULTI_HOT_TRANSFORM: separator = transform.get('separator', ' ') tokens = tf.string_split(inputs[source_column], separator) result[name] = _string_to_int(tokens, vocab) else: result[name] = _string_to_int(inputs[source_column], vocab) elif transform_name == IMAGE_TRANSFORM: make_image_to_vec_fn = _make_image_to_vec_tito( name, checkpoint=transform.get('checkpoint', None)) result[name] = make_image_to_vec_fn(inputs[source_column]) else: raise ValueError('unknown transform %s' % transform_name) return result
def _tokenize(self, sentences): # Perform a minimalistic text preprocessing by removing punctuation and # splitting on spaces. normalized_sentences = tf.strings.regex_replace( input=sentences, pattern=r"\pP", rewrite="") sparse_tokens = tf.string_split(normalized_sentences, " ") # Deal with a corner case: there is one empty sentence. sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant("")) # Deal with a corner case: all sentences are empty. sparse_tokens = tf.sparse.reset_shape(sparse_tokens) return (sparse_tokens.indices, sparse_tokens.values, sparse_tokens.dense_shape)
def testStringSplitEmptyDelimiter(self): strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"] # Last string is U+1F60E with self.test_session() as sess: tokens = tf.string_split(strings, delimiter="") indices, values, shape = sess.run(tokens) self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1], [1, 2], [1, 3], [2, 0], [2, 1], [2, 2], [2, 3]]) expected = np.array( ['h', 'e', 'l', 'l', 'o', 'h', 'o', 'l', 'a', b'\xf0', b'\x9f', b'\x98', b'\x8e'], dtype='|S1') self.assertAllEqual(values.tolist(), expected) self.assertAllEqual(shape, [3, 5])
def parse(line): """Parse a line from the colors dataset.""" # Each line of the dataset is comma-separated and formatted as # color_name, r, g, b # so `items` is a list [color_name, r, g, b]. items = tf.string_split([line], ",").values rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255. # Represent the color name as a one-hot encoded character sequence. color_name = items[0] chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256) # The sequence length is needed by our RNN. length = tf.cast(tf.shape(chars)[0], dtype=tf.int64) return rgb, chars, length
def cnn_model(features, target, mode): table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE, num_oov_buckets=1, default_value=-1) # string operations titles = tf.squeeze(features['title'], [1]) words = tf.string_split(titles) densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD) numbers = table.lookup(densewords) padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]]) padded = tf.pad(numbers, padding) sliced = tf.slice(padded, [0,0], [-1, MAX_DOCUMENT_LENGTH]) print('words_sliced={}'.format(words)) # (?, 20) # layer to take the words and convert them into vectors (embeddings) embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS, embed_dim=EMBEDDING_SIZE) print('words_embed={}'.format(embeds)) # (?, 20, 10) # now do convolution conv = tf.contrib.layers.conv2d(embeds, 1, WINDOW_SIZE, stride=STRIDE, padding='SAME') # (?, 4, 1) conv = tf.nn.relu(conv) # (?, 4, 1) words = tf.squeeze(conv, [2]) # (?, 4) print('words_conv={}'.format(words)) # (?, 4) n_classes = len(TARGETS) logits = tf.contrib.layers.fully_connected(words, n_classes, activation_fn=None) #print('logits={}'.format(logits)) # (?, 3) predictions_dict = { 'source': tf.gather(TARGETS, tf.argmax(logits, 1)), 'class': tf.argmax(logits, 1), 'prob': tf.nn.softmax(logits) } if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL: loss = tf.losses.sparse_softmax_cross_entropy(target, logits) train_op = tf.contrib.layers.optimize_loss( loss, tf.contrib.framework.get_global_step(), optimizer='Adam', learning_rate=0.01) else: loss = None train_op = None return tflearn.ModelFnOps( mode=mode, predictions=predictions_dict, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params): if mode == tf.estimator.ModeKeys.TRAIN: tf.keras.backend.set_learning_phase(True) else: tf.keras.backend.set_learning_phase(False) vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1) text = features[commons.FEATURE_COL] words = tf.string_split(text) dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD) word_ids = vocab_table.lookup(dense_words) padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]]) # Pad all the word_ids entries to the maximum document length word_ids_padded = tf.pad(word_ids, padding) word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH]) f1 = tf.keras.layers.Embedding(params.N_WORDS, 100, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector) f2 = tf.keras.layers.Embedding(params.N_WORDS, 200, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector) f3 = tf.keras.layers.Embedding(params.N_WORDS, 300, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector) filter_sizes = [3, 5] conv_pools = [] for text_embedding in [f1, f2, f3]: for filter_size in filter_sizes: l_zero = tf.keras.layers.ZeroPadding1D((filter_size - 1, filter_size - 1))(text_embedding) l_conv = tf.keras.layers.Conv1D(filters=32, kernel_size=filter_size, padding='same', activation='tanh')(l_zero) l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv) conv_pools.append(l_pool) merged = tf.keras.layers.Concatenate(axis=1)(conv_pools) dense1 = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged) dense2 = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense1) logits = tf.keras.layers.Dense(1, activation=None)(dense2) if labels is not None: labels = tf.reshape(labels, [-1, 1]) optimizer = tf.train.AdamOptimizer() def _train_op_fn(loss): return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) return head.create_estimator_spec(features=features, labels=labels, mode=mode, logits=logits, train_op_fn=_train_op_fn)
def testStringSplitWithDelimiterTensor(self): strings = ["hello|world", "hello world"] with self.test_session() as sess: delimiter = tf.placeholder(tf.string) tokens = tf.string_split(strings, delimiter=delimiter) with self.assertRaises(tf.errors.InvalidArgumentError): sess.run(tokens, feed_dict={delimiter: ["a", "b"]}) with self.assertRaises(tf.errors.InvalidArgumentError): sess.run(tokens, feed_dict={delimiter: ["a"]}) indices, values, shape = sess.run(tokens, feed_dict={delimiter: "|"}) self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]]) self.assertAllEqual(values, [b"hello", b"world", b"hello world"]) self.assertAllEqual(shape, [2, 2])
def decode(self, data, items): decoded_items = {} # Split tokens tokens = tf.string_split([data], delimiter=self.delimiter).values # Optionally prepend a special token if self.prepend_token is not None: tokens = tf.concat([[self.prepend_token], tokens], 0) # Optionally append a special token if self.append_token is not None: tokens = tf.concat([tokens, [self.append_token]], 0) decoded_items[self.length_feature_name] = tf.size(tokens) decoded_items[self.tokens_feature_name] = tokens return [decoded_items[_] for _ in items]
def model_fn(features, labels, mode, params): if mode == tf.estimator.ModeKeys.TRAIN: tf.keras.backend.set_learning_phase(True) else: tf.keras.backend.set_learning_phase(False) vocab_table = lookup.index_table_from_file( vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1) text = features[commons.FEATURE_COL] words = tf.string_split(text) dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD) word_ids = vocab_table.lookup(dense_words) padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]]) # Pad all the word_ids entries to the maximum document length word_ids_padded = tf.pad(word_ids, padding) word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH]) f1 = tf.keras.layers.Embedding( params.N_WORDS, 100, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector) f2 = tf.keras.layers.Embedding( params.N_WORDS, 200, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector) f3 = tf.keras.layers.Embedding( params.N_WORDS, 300, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector) filter_sizes = [3, 5] conv_pools = [] for text_embedding in [f1, f2, f3]: for filter_size in filter_sizes: l_zero = tf.keras.layers.ZeroPadding1D( (filter_size - 1, filter_size - 1))(text_embedding) l_conv = tf.keras.layers.Conv1D(filters=32, kernel_size=filter_size, padding='same', activation='tanh')(l_zero) l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv) conv_pools.append(l_pool) merged = tf.keras.layers.Concatenate(axis=1)(conv_pools) dense1 = tf.keras.layers.Dense( 128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged) dense2 = tf.keras.layers.Dense( 64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense1) logits = tf.keras.layers.Dense(2, activation=None)(dense2) predictions = tf.nn.softmax(logits) prediction_indices = tf.argmax(predictions, axis=1) if mode == tf.estimator.ModeKeys.PREDICT: prediction_dict = { 'class': prediction_indices, # tf.gather(commons.TARGET_LABELS, prediction_indices), 'class_index': prediction_indices, 'probabilities': predictions } export_outputs = { 'predictions': tf.estimator.export.PredictOutput(prediction_dict) } return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs) loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) tf.summary.scalar('loss', loss) acc = tf.equal(tf.cast(prediction_indices, dtype=tf.int64), tf.cast(labels, dtype=tf.int64)) acc = tf.reduce_mean(tf.cast(acc, tf.float32)) tf.summary.scalar('acc', acc) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.AdamOptimizer() train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss) if mode == tf.estimator.ModeKeys.EVAL: eval_metrics_ops = { 'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices) } return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
def get_training_input(filenames, params): """ Get input for training stage Args: filenames: A list contains [source_filename, target_filename] params: Hyper-parameters Returns A dictionary of pair <Key, Tensor> """ with tf.device("/cpu:0"): src_dataset = tf.data.TextLineDataset(filenames[0]) tgt_dataset = tf.data.TextLineDataset(filenames[1]) dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset)) dataset = dataset.shuffle(params.buffer_size) dataset = dataset.repeat() # Split string dataset = dataset.map( lambda src, tgt: (tf.string_split([src]).values, tf.string_split([tgt]).values), num_parallel_calls=params.num_threads) # Append <eos> symbol dataset = dataset.map( lambda src, tgt: (tf.concat([src, [tf.constant(params.eos)]], axis=0), tf.concat([tgt, [tf.constant(params.eos)]], axis=0)), num_parallel_calls=params.num_threads) # Convert to dictionary dataset = dataset.map(lambda src, tgt: { "source": src, "target": tgt, "source_length": tf.shape(src), "target_length": tf.shape(tgt) }, num_parallel_calls=params.num_threads) # Create iterator iterator = dataset.make_one_shot_iterator() features = iterator.get_next() # Create lookup table src_table = tf.contrib.lookup.index_table_from_tensor( tf.constant(params.vocabulary["source"]), default_value=params.mapping["source"][params.unk]) tgt_table = tf.contrib.lookup.index_table_from_tensor( tf.constant(params.vocabulary["target"]), default_value=params.mapping["target"][params.unk]) # String to index lookup features["source"] = src_table.lookup(features["source"]) features["target"] = tgt_table.lookup(features["target"]) # Batching features = batch_examples(features, params.batch_size, params.max_length, params.mantissa_bits, shard_multiplier=len(params.device_list), length_multiplier=params.length_multiplier, constant=params.constant_batch_size, num_threads=params.num_threads) # Convert to int32 features["source"] = tf.to_int32(features["source"]) features["target"] = tf.to_int32(features["target"]) features["source_length"] = tf.to_int32(features["source_length"]) features["target_length"] = tf.to_int32(features["target_length"]) features["source_length"] = tf.squeeze(features["source_length"], 1) features["target_length"] = tf.squeeze(features["target_length"], 1) return features
def module_fn(): """Spec function for a token embedding module.""" # init _bos_id = 256 _eos_id = 257 _bow_id = 258 _eow_id = 259 _pad_id = 260 _max_word_length = 50 _parallel_iterations = 10 _max_batch_size = 1024 id_dtype = tf.int32 id_nptype = np.int32 max_word_length = tf.constant(_max_word_length, dtype=id_dtype, name='max_word_length') version = tf.constant('from_dp_1', dtype=tf.string, name='version') # the charcter representation of the begin/end of sentence characters def _make_bos_eos(c): r = np.zeros([_max_word_length], dtype=id_nptype) r[:] = _pad_id r[0] = _bow_id r[1] = c r[2] = _eow_id return tf.constant(r, dtype=id_dtype) bos_ids = _make_bos_eos(_bos_id) eos_ids = _make_bos_eos(_eos_id) def token2ids(token): with tf.name_scope("token2ids_preprocessor"): char_ids = tf.decode_raw(token, tf.uint8, name='decode_raw2get_char_ids') char_ids = tf.cast(char_ids, tf.int32, name='cast2int_token') char_ids = tf.strided_slice(char_ids, [0], [max_word_length - 2], [1], name='slice2resized_token') ids_num = tf.shape(char_ids)[0] fill_ids_num = (_max_word_length - 2) - ids_num pads = tf.fill([fill_ids_num], _pad_id) bow_token_eow_pads = tf.concat( [[_bow_id], char_ids, [_eow_id], pads], 0, name='concat2bow_token_eow_pads') return bow_token_eow_pads def sentence_tagging_and_padding(sen_dim): with tf.name_scope("sentence_tagging_and_padding_preprocessor"): sen = sen_dim[0] dim = sen_dim[1] extra_dim = tf.shape(sen)[0] - dim sen = tf.slice(sen, [0, 0], [dim, max_word_length], name='slice2sen') bos_sen_eos = tf.concat([[bos_ids], sen, [eos_ids]], 0, name='concat2bos_sen_eos') bos_sen_eos_plus_one = bos_sen_eos + 1 bos_sen_eos_pads = tf.pad(bos_sen_eos_plus_one, [[0, extra_dim], [0, 0]], "CONSTANT", name='pad2bos_sen_eos_pads') return bos_sen_eos_pads # Input placeholders to the biLM. tokens = tf.placeholder(shape=(None, None), dtype=tf.string, name='ph2tokens') sequence_len = tf.placeholder(shape=(None, ), dtype=tf.int32, name='ph2sequence_len') tok_shape = tf.shape(tokens) line_tokens = tf.reshape(tokens, shape=[-1], name='reshape2line_tokens') with tf.device('/cpu:0'): tok_ids = tf.map_fn(token2ids, line_tokens, dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, name='map_fn2get_tok_ids') tok_ids = tf.reshape(tok_ids, [tok_shape[0], tok_shape[1], -1], name='reshape2tok_ids') with tf.device('/cpu:0'): sen_ids = tf.map_fn(sentence_tagging_and_padding, (tok_ids, sequence_len), dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, name='map_fn2get_sen_ids') # Build the biLM graph. bilm = BidirectionalLanguageModel(options, str(weight_file), max_batch_size=_max_batch_size) embeddings_op = bilm(sen_ids) # Get an op to compute ELMo (weighted average of the internal biLM layers) elmo_output = weight_layers('elmo_output', embeddings_op, l2_coef=0.0) weighted_op = elmo_output['weighted_op'] mean_op = elmo_output['mean_op'] word_emb = elmo_output['word_emb'] lstm_outputs1 = elmo_output['lstm_outputs1'] lstm_outputs2 = elmo_output['lstm_outputs2'] hub.add_signature("tokens", { "tokens": tokens, "sequence_len": sequence_len }, { "elmo": weighted_op, "default": mean_op, "word_emb": word_emb, "lstm_outputs1": lstm_outputs1, "lstm_outputs2": lstm_outputs2, "version": version }) # #########################Next signature############################# # # Input placeholders to the biLM. 
def_strings = tf.placeholder(shape=(None), dtype=tf.string) def_tokens_sparse = tf.string_split(def_strings) def_tokens_dense = tf.sparse_to_dense( sparse_indices=def_tokens_sparse.indices, output_shape=def_tokens_sparse.dense_shape, sparse_values=def_tokens_sparse.values, default_value='') def_mask = tf.not_equal(def_tokens_dense, '') def_int_mask = tf.cast(def_mask, dtype=tf.int32) def_sequence_len = tf.reduce_sum(def_int_mask, axis=-1) def_tok_shape = tf.shape(def_tokens_dense) def_line_tokens = tf.reshape(def_tokens_dense, shape=[-1], name='reshape2line_tokens') with tf.device('/cpu:0'): def_tok_ids = tf.map_fn(token2ids, def_line_tokens, dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, name='map_fn2get_tok_ids') def_tok_ids = tf.reshape(def_tok_ids, [def_tok_shape[0], def_tok_shape[1], -1], name='reshape2tok_ids') with tf.device('/cpu:0'): def_sen_ids = tf.map_fn(sentence_tagging_and_padding, (def_tok_ids, def_sequence_len), dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, name='map_fn2get_sen_ids') # Get ops to compute the LM embeddings. def_embeddings_op = bilm(def_sen_ids) # Get an op to compute ELMo (weighted average of the internal biLM layers) def_elmo_output = weight_layers('elmo_output', def_embeddings_op, l2_coef=0.0, reuse=True) def_weighted_op = def_elmo_output['weighted_op'] def_mean_op = def_elmo_output['mean_op'] def_word_emb = def_elmo_output['word_emb'] def_lstm_outputs1 = def_elmo_output['lstm_outputs1'] def_lstm_outputs2 = def_elmo_output['lstm_outputs2'] hub.add_signature("default", {"strings": def_strings}, { "elmo": def_weighted_op, "default": def_mean_op, "word_emb": def_word_emb, "lstm_outputs1": def_lstm_outputs1, "lstm_outputs2": def_lstm_outputs2, "version": version })
tf.reduce_join(a, [0, 1]) #==> ["acbd"] tf.reduce_join(a, [1, 0]) #==> ["abcd"] tf.reduce_join(a, []) #==> ["abcd"] b = tf.convert_to_tensor(["ac"]) c = tf.convert_to_tensor(["bd"]) d = tf.string_join([b, c], separator=" ", name=None) print(sess.run(d)) e = tf.reduce_join(a, 0) print(tf.string_to_hash_bucket(e, 2)) print(sess.run(tf.string_to_hash_bucket(e, 5))) f = tf.string_to_hash_bucket(e, 2) hw = tf.convert_to_tensor(["hello worls"]) print(sess.run(tf.string_split(hw, delimiter=' '))) ### Exercise modelue_1_4 #Create new string tensors with: # a) transform str_1 in a way to get [["name: ", "surname: "], ["Jan", "Idziak"]] # a')str_1 with argument ["name: Jan", "surname: Idziak"] # b) str_2 with argument[["helo ", "world"], ["tensor", "flow"]] # b') str_2 with argument ["helo world","tensorflow"] # c) Create simple string tensors with arguments: # c') str_3 - ["My name is:"] # c'') str_4 - ["Janek"] # c''') string_join to obtain ["My name is: Janek"] # c''') string_join to obtain ["My name is:__Janek"] # c''') string_join to obtain ["My name is:randomseparatorJanek"] #
def loop_body(i, sp): splitted = tf.string_split([utterances[i]]).values if src_max_len: splitted = splitted[:src_max_len] return tf.add(i, 1), tf.concat([sp, splitted], axis=0)
def tf_str_len(s): """ Returns length of tf.string s """ return tf.size(tf.string_split([s],""))
def model_fn(features, labels, mode, params): # 加载词到id的映射 with tf.name_scope('vocab'): vocab_table = tf.contrib.lookup.index_table_from_tensor( mapping=tf.convert_to_tensor(params['vocabs']), num_oov_buckets=0, default_value=params['vocab_size'] - 1) # 单词未定义时,默认指向词向量表的最后一个单词下标 # 定义隐藏层, 词汇扩展1个用来存储未知单词 with tf.name_scope('hidden'): embeddings = tf.get_variable( 'embeddings', shape=[params['vocab_size'], params['embedding_size']], initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)) # 预测相似词 if mode == tf.estimator.ModeKeys.PREDICT: # 将相似id转为词 sparse_index_tensor = tf.string_split( [tf.read_file(params['vocab_file'])], delimiter='\n') index_tensor = tf.squeeze( tf.sparse_to_dense(sparse_index_tensor.indices, [1, params['vocab_size']], sparse_index_tensor.values, default_value='unknown')) # L2正则化,泛化,防止过拟合 normalized_embeddings = tf.nn.l2_normalize(embeddings, axis=1) discret_features = vocab_table.lookup(features) valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, tf.squeeze(discret_features)) # 用向量内积表示余弦值: 内积越大,夹角越小,余弦值越大,向量越相似 similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True) values, preds = tf.nn.top_k(similarity, sorted=True, k=params['pred_top']) # 计算top predictions = {"prob": tf.gather(index_tensor, preds)} export_outputs = { tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput(predictions) } return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs) discret_labels = vocab_table.lookup(labels) discret_features = vocab_table.lookup(features) discret_features_embeddings = tf.nn.embedding_lookup( embeddings, discret_features) #定义输出层权重 with tf.name_scope('weights'): nce_weights = tf.get_variable( 'nce_weights', shape=[params['vocab_size'], params['embedding_size']], initializer=tf.truncated_normal_initializer( stddev=1.0 / math.sqrt(params['embedding_size']))) # 定义输出层偏置 with tf.name_scope('biases'): nce_biases = tf.get_variable('nce_biases', shape=[params['vocab_size']], initializer=tf.zeros_initializer) # 定义损失函数, 采用nce with tf.name_scope('loss'): loss = tf.reduce_mean( tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=discret_labels, inputs=discret_features_embeddings, num_sampled=params['num_neg_samples'], num_classes=params['vocab_size'])) # 训练,采用随机梯度下降优化 with tf.name_scope('optimizer'): optimizer = (tf.train.GradientDescentOptimizer( params['learning_rate']).minimize( loss, global_step=tf.train.get_global_step())) assert mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=optimizer)
def _text_content_parser(self, text, max_length): word_strs = tf.string_split([text], " ") return tf.string_to_number(word_strs.values, out_type=tf.int64)[:max_length], tf.minimum( tf.shape(word_strs)[-1], max_length)
def get_iterator(src_dataset, tgt_dataset, src_vocab_table, tgt_vocab_table, batch_size, sos, eos, random_seed, num_buckets, src_max_len=None, tgt_max_len=None, num_parallel_calls=4, output_buffer_size=None, skip_count=None, num_shards=1, shard_index=0, reshuffle_each_iteration=True, use_char_encode=False): if not output_buffer_size: output_buffer_size = batch_size * 1000 if use_char_encode: src_eos_id = vocab_utils.EOS_CHAR_ID else: src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32) tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32) src_tgt_dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset)) src_tgt_dataset = src_tgt_dataset.shard(num_shards, shard_index) if skip_count is not None: src_tgt_dataset = src_tgt_dataset.skip(skip_count) src_tgt_dataset = src_tgt_dataset.shuffle(output_buffer_size, random_seed, reshuffle_each_iteration) src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (tf.string_split([src]).values, tf.string_split([tgt]).values), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) # Filter zero length input sequences. src_tgt_dataset = src_tgt_dataset.filter( lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0)) if src_max_len: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src[:src_max_len], tgt), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) if tgt_max_len: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src, tgt[:tgt_max_len]), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) # Convert the word strings to ids. Word strings that are not in the # vocab get the lookup table's default_value integer. if use_char_encode: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (tf.reshape(vocab_utils.tokens_to_bytes( src), [-1]), tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), num_parallel_calls=num_parallel_calls) else: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32), tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), num_parallel_calls=num_parallel_calls) src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) # Create a tgt_input prefixed with <sos> and a tgt_output suffixed with <eos>. src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src, tf.concat( ([tgt_sos_id], tgt), 0), tf.concat((tgt, [tgt_eos_id]), 0)), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) # Add in sequence lengths. if use_char_encode: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out, tf.to_int32(tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN), tf.size(tgt_in)), num_parallel_calls=num_parallel_calls) else: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)), num_parallel_calls=num_parallel_calls) src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...) def batching_func(x): return x.padded_batch( batch_size, # The first three entries are the source and target line rows; # these have unknown-length vectors. The last two entries are # the source and target row sizes; these are scalars. padded_shapes=( tf.TensorShape([None]), # src tf.TensorShape([None]), # tgt_input tf.TensorShape([None]), # tgt_output tf.TensorShape([]), # src_len tf.TensorShape([])), # tgt_len # Pad the source and target sequences with eos tokens. 
# (Though notice we don't generally need to do this since # later on we will be masking out calculations past the true sequence. padding_values=( src_eos_id, # src tgt_eos_id, # tgt_input tgt_eos_id, # tgt_output 0, # src_len -- unused 0)) # tgt_len -- unused if num_buckets > 1: def key_func(unused_1, unused_2, unused_3, src_len, tgt_len): # Calculate bucket_width by maximum source sequence length. # Pairs with length [0, bucket_width) go to bucket 0, length # [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length # over ((num_bucket-1) * bucket_width) words all go into the last bucket. if src_max_len: bucket_width = (src_max_len + num_buckets - 1) // num_buckets else: bucket_width = 10 # Bucket sentence pairs by the length of their source sentence and target # sentence. bucket_id = tf.maximum(src_len // bucket_width, tgt_len // bucket_width) return tf.to_int64(tf.minimum(num_buckets, bucket_id)) def reduce_func(unused_key, windowed_data): return batching_func(windowed_data) batched_dataset = src_tgt_dataset.apply( tf.contrib.data.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=batch_size)) else: batched_dataset = batching_func(src_tgt_dataset) batched_iter = batched_dataset.make_initializable_iterator() (src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len) = (batched_iter.get_next()) return BatchedInput(initializer=batched_iter.initializer, source=src_ids, target_input=tgt_input_ids, target_output=tgt_output_ids, source_sequence_length=src_seq_len, target_sequence_length=tgt_seq_len)
def decode_record(record): src = tf.string_split([record]).values src = tf.string_to_number(src, out_type=tf.int32) return src, tf.constant([SOS], dtype=tf.int32)
def get_iterator(hparams, datasets, max_rows=0, num_parallel_calls=4): output_buffer_size = hparams.batch_size * 1000 src_vocab, tgt_vocab, src_dataset, tgt_dataset, _, _ = datasets if max_rows > 0: src_dataset = src_dataset.take(max_rows) tgt_dataset = tgt_dataset.take(max_rows) src_dataset = src_dataset.map( lambda x: tf.string_split([x]).values, num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) tgt_dataset = tgt_dataset.map( lambda x: tf.string_split([x]).values, num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) sos_id = tf.cast(tgt_vocab.lookup(tf.constant(hparams.sos)), tf.int32) eos_id = tf.cast(tgt_vocab.lookup(tf.constant(hparams.eos)), tf.int32) pad_id = tf.cast(tgt_vocab.lookup(tf.constant(hparams.pad)), tf.int32) src_dataset = src_dataset.map( lambda x: tf.cast(src_vocab.lookup(x), tf.int32), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) tgt_dataset = tgt_dataset.map( lambda x: tf.cast(tgt_vocab.lookup(x), tf.int32), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset)) # source,target의 길이가 max_len보다 큰 문장은 학습에서 제외 if hparams.src_max_len > 0: dataset = dataset.filter(lambda src, tgt: tf.logical_and( tf.size(src) < hparams.src_max_len, tf.size(tgt) < hparams.tgt_max_len)) dataset = dataset.shuffle(buffer_size=1000) dataset = dataset.map( lambda src, tgt: (src, tf.concat( ([sos_id], tgt), axis=0), tf.concat( (tgt, [eos_id]), axis=0), tf.size(src), tf.size(tgt) + 1), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) def batching_func(x): return x.padded_batch(hparams.batch_size, padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([])), padding_values=(pad_id, pad_id, pad_id, 0, 0)) num_buckets = hparams.bucket if num_buckets > 1: def key_func(unused_1, unused_2, unused_3, src_len, tgt_len): # Calculate bucket_width by maximum source sequence length. # Pairs with length [0, bucket_width) go to bucket 0, length # [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length # over ((num_bucket-1) * bucket_width) words all go into the last bucket. bucket_width = (hparams.infer_src_max_len + num_buckets - 1) // num_buckets # Bucket sentence pairs by the length of their source sentence and target # sentence. bucket_id = tf.maximum(src_len // bucket_width, tgt_len // bucket_width) return tf.to_int64(tf.minimum(num_buckets, bucket_id)) def reduce_func(unused_key, windowed_data): return batching_func(windowed_data) batched_dataset = dataset.apply( tf.contrib.data.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=hparams.batch_size)) else: batched_dataset = batching_func(dataset) batched_iter = batched_dataset.make_initializable_iterator() src_ids, tgt_in_ids, tgt_out_ids, src_length, tgt_length = batched_iter.get_next( ) batched_input = BatchedInput(initializer=batched_iter.initializer, source=src_ids, target_in=tgt_in_ids, target_out=tgt_out_ids, source_length=src_length, target_length=tgt_length) return batched_input
def create_dataset(file): dataset = tf.data.TextLineDataset(pjoin(DATA_DIR, file)) string_split = dataset.map(lambda string: tf.string_split([string]).values) integer_dataset = string_split.map( lambda x: tf.string_to_number(x, out_type=tf.int32)) return integer_dataset
def process_dataset(self, *row_parts): row_parts = list(row_parts) if self.use_multilanguage: language_id = row_parts[0] row_parts = row_parts[1] else: language_id = None word = row_parts[0] # (, ) if not self.is_evaluating and self.config.RANDOM_CONTEXTS: all_contexts = tf.stack(row_parts[1:]) all_contexts_padded = tf.concat([all_contexts, [self.context_pad]], axis=-1) index_of_blank_context = tf.where( tf.equal(all_contexts_padded, self.context_pad)) num_contexts_per_example = tf.reduce_min(index_of_blank_context) # if there are less than self.max_contexts valid contexts, still sample self.max_contexts safe_limit = tf.cast( tf.maximum(num_contexts_per_example, self.config.MAX_CONTEXTS), tf.int32) rand_indices = tf.random_shuffle( tf.range(safe_limit))[:self.config.MAX_CONTEXTS] contexts = tf.gather(all_contexts, rand_indices) # (max_contexts,) else: contexts = row_parts[1:(self.config.MAX_CONTEXTS + 1)] # (max_contexts,) # contexts: (max_contexts, ) split_contexts = tf.string_split(contexts, delimiter=',', skip_empty=False) sparse_split_contexts = tf.sparse.SparseTensor( indices=split_contexts.indices, values=split_contexts.values, dense_shape=[self.config.MAX_CONTEXTS, 3]) dense_split_contexts = tf.reshape( tf.sparse.to_dense(sp_input=sparse_split_contexts, default_value=Common.PAD), shape=[self.config.MAX_CONTEXTS, 3]) # (batch, max_contexts, 3) split_target_labels = tf.string_split(tf.expand_dims(word, -1), delimiter='|') target_dense_shape = [ 1, tf.maximum(tf.to_int64(self.config.MAX_TARGET_PARTS), split_target_labels.dense_shape[1] + 1) ] sparse_target_labels = tf.sparse.SparseTensor( indices=split_target_labels.indices, values=split_target_labels.values, dense_shape=target_dense_shape) dense_target_label = tf.reshape( tf.sparse.to_dense(sp_input=sparse_target_labels, default_value=Common.PAD), [-1]) index_of_blank = tf.where(tf.equal(dense_target_label, Common.PAD)) target_length = tf.reduce_min(index_of_blank) dense_target_label = dense_target_label[:self.config.MAX_TARGET_PARTS] clipped_target_lengths = tf.clip_by_value( target_length, clip_value_min=0, clip_value_max=self.config.MAX_TARGET_PARTS) target_word_labels = tf.concat( [self.target_table.lookup(dense_target_label), [0]], axis=-1) # (max_target_parts + 1) of int path_source_strings = tf.slice( dense_split_contexts, [0, 0], [self.config.MAX_CONTEXTS, 1]) # (max_contexts, 1) flat_source_strings = tf.reshape(path_source_strings, [-1]) # (max_contexts) split_source = tf.string_split( flat_source_strings, delimiter='|', skip_empty=False) # (max_contexts, max_name_parts) sparse_split_source = tf.sparse.SparseTensor( indices=split_source.indices, values=split_source.values, dense_shape=[ self.config.MAX_CONTEXTS, tf.maximum(tf.to_int64(self.config.MAX_NAME_PARTS), split_source.dense_shape[1]) ]) dense_split_source = tf.sparse.to_dense( sp_input=sparse_split_source, default_value=Common.PAD) # (max_contexts, max_name_parts) dense_split_source = tf.slice(dense_split_source, [0, 0], [-1, self.config.MAX_NAME_PARTS]) path_source_indices = self.subtoken_table.lookup( dense_split_source) # (max_contexts, max_name_parts) path_source_lengths = tf.reduce_sum( tf.cast(tf.not_equal(dense_split_source, Common.PAD), tf.int32), -1) # (max_contexts) path_strings = tf.slice(dense_split_contexts, [0, 1], [self.config.MAX_CONTEXTS, 1]) flat_path_strings = tf.reshape(path_strings, [-1]) split_path = tf.string_split(flat_path_strings, delimiter='|', skip_empty=False) sparse_split_path = tf.sparse.SparseTensor( indices=split_path.indices, 
values=split_path.values, dense_shape=[ self.config.MAX_CONTEXTS, self.config.MAX_PATH_LENGTH ]) dense_split_path = tf.sparse.to_dense( sp_input=sparse_split_path, default_value=Common.PAD) # (batch, max_contexts, max_path_length) node_indices = self.node_table.lookup( dense_split_path) # (max_contexts, max_path_length) path_lengths = tf.reduce_sum( tf.cast(tf.not_equal(dense_split_path, Common.PAD), tf.int32), -1) # (max_contexts) path_target_strings = tf.slice( dense_split_contexts, [0, 2], [self.config.MAX_CONTEXTS, 1]) # (max_contexts, 1) flat_target_strings = tf.reshape(path_target_strings, [-1]) # (max_contexts) split_target = tf.string_split( flat_target_strings, delimiter='|', skip_empty=False) # (max_contexts, max_name_parts) sparse_split_target = tf.sparse.SparseTensor( indices=split_target.indices, values=split_target.values, dense_shape=[ self.config.MAX_CONTEXTS, tf.maximum(tf.to_int64(self.config.MAX_NAME_PARTS), split_target.dense_shape[1]) ]) dense_split_target = tf.sparse.to_dense( sp_input=sparse_split_target, default_value=Common.PAD) # (max_contexts, max_name_parts) dense_split_target = tf.slice(dense_split_target, [0, 0], [-1, self.config.MAX_NAME_PARTS]) path_target_indices = self.subtoken_table.lookup( dense_split_target) # (max_contexts, max_name_parts) path_target_lengths = tf.reduce_sum( tf.cast(tf.not_equal(dense_split_target, Common.PAD), tf.int32), -1) # (max_contexts) valid_contexts_mask = tf.to_float( tf.not_equal( tf.reduce_max(path_source_indices, -1) + tf.reduce_max(node_indices, -1) + tf.reduce_max(path_target_indices, -1), 0)) return { TARGET_STRING_KEY: word, TARGET_INDEX_KEY: target_word_labels, TARGET_LENGTH_KEY: clipped_target_lengths, PATH_SOURCE_INDICES_KEY: path_source_indices, NODE_INDICES_KEY: node_indices, PATH_TARGET_INDICES_KEY: path_target_indices, VALID_CONTEXT_MASK_KEY: valid_contexts_mask, PATH_SOURCE_LENGTHS_KEY: path_source_lengths, PATH_LENGTHS_KEY: path_lengths, PATH_TARGET_LENGTHS_KEY: path_target_lengths, PATH_SOURCE_STRINGS_KEY: path_source_strings, PATH_STRINGS_KEY: path_strings, PATH_TARGET_STRINGS_KEY: path_target_strings, LANGUAGE_ID: language_id }
def load_examples(): if a.input_dir is None or not os.path.exists(a.input_dir): raise Exception("input_dir does not exist") input_paths = glob.glob(os.path.join(a.input_dir, "*.jpg")) decode = tf.image.decode_jpeg if len(input_paths) == 0: input_paths = glob.glob(os.path.join(a.input_dir, "*.png")) decode = tf.image.decode_png if len(input_paths) == 0: raise Exception("input_dir contains no image files") def get_name(path): name, _ = os.path.splitext(os.path.basename(path)) return name # if the image names are numbers, sort by the value rather than asciibetically # having sorted inputs means that the outputs are sorted in test mode if all(get_name(path).isdigit() for path in input_paths): input_paths = sorted(input_paths, key=lambda path: int(get_name(path))) else: input_paths = sorted(input_paths) with tf.name_scope("load_images"): path_queue = tf.train.string_input_producer(input_paths, shuffle=a.mode == "train") reader = tf.WholeFileReader() paths, contents = reader.read(path_queue) #paths = tf.Print(paths, [paths], message="paths:") raw_input = decode(contents) raw_input = tf.image.convert_image_dtype(raw_input, dtype=tf.float32) img_path = tf.string_split([paths], delimiter='/').values[-1] classes = tf.string_to_number(tf.string_split([img_path], delimiter='_').values[0], out_type=tf.int32) # NOTE: may want to use one hots instead of numbers #classes = tf.one_hot(classes, NUM_CLASSES) # or NUM_CLASSES*2 if we want the full real/fake one hot #classes = tf.Print(classes, [img_path, classes], message="one hot", summarize=NUM_CLASSES) shape = classes.get_shape().dims #f.shape(classes) classes_real = classes classes_fake = tf.add(classes, tf.constant(NUM_CLASSES, shape=shape)) assertion = tf.assert_equal(tf.shape(raw_input)[2], 3, message="image does not have 3 channels") with tf.control_dependencies([assertion]): raw_input = tf.identity(raw_input) raw_input.set_shape([None, None, 3]) if a.lab_colorization: # load color and brightness from image, no B image exists here lab = rgb_to_lab(raw_input) L_chan, a_chan, b_chan = preprocess_lab(lab) a_images = tf.expand_dims(L_chan, axis=2) b_images = tf.stack([a_chan, b_chan], axis=2) else: # break apart image pair and move to range [-1, 1] width = tf.shape(raw_input)[1] # [height, width, channels] a_images = preprocess(raw_input[:, :width // 2, :]) b_images = preprocess(raw_input[:, width // 2:, :]) #print(raw_input.shape, len(classes)) if a.which_direction == "AtoB": inputs, targets = [a_images, b_images] elif a.which_direction == "BtoA": inputs, targets = [b_images, a_images] else: raise Exception("invalid direction") # synchronize seed for image operations so that we do the same operations to both # input and output images seed = random.randint(0, 2**31 - 1) def transform(image): r = image if a.flip: r = tf.image.random_flip_left_right(r, seed=seed) # area produces a nice downscaling, but does nearest neighbor for upscaling # assume we're going to be doing downscaling here r = tf.image.resize_images(r, [a.scale_size, a.scale_size], method=tf.image.ResizeMethod.AREA) offset = tf.cast(tf.floor( tf.random_uniform([2], 0, a.scale_size - CROP_SIZE + 1, seed=seed)), dtype=tf.int32) if a.scale_size > CROP_SIZE: r = tf.image.crop_to_bounding_box(r, offset[0], offset[1], CROP_SIZE, CROP_SIZE) elif a.scale_size < CROP_SIZE: raise Exception("scale size cannot be less than crop size") return r with tf.name_scope("input_images"): input_images = transform(inputs) with tf.name_scope("target_images"): target_images = transform(targets) paths_batch, 
inputs_batch, targets_batch, classes_real_batch, classes_fake_batch = tf.train.batch( [paths, input_images, target_images, classes_real, classes_fake], batch_size=a.batch_size) steps_per_epoch = int(math.ceil(len(input_paths) / a.batch_size)) return Examples( paths=paths_batch, inputs=inputs_batch, targets=targets_batch, classes_real=classes_real_batch, classes_fake=classes_fake_batch, count=len(input_paths), steps_per_epoch=steps_per_epoch, )
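# Hedged usage sketch (not part of the original script): the queue-based pipeline
# built by load_examples() only yields data once the queue runners are started
# inside a session. The fetch loop below is an illustrative assumption.
examples = load_examples()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for _ in range(examples.steps_per_epoch):
            paths, inputs = sess.run([examples.paths, examples.inputs])
    finally:
        coord.request_stop()
        coord.join(threads)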
def train(): """Trains the model.""" # Log Input Settings logFile = MODEL_DIRECTORY + '/' + 'Train_Log.txt' # Set Tensorflow Logging tf.logging.set_verbosity(tf.logging.INFO) # Create input data pipeline. with tf.device('/cpu:0'): train_files = glob.glob(TRAIN_DIRECTORY) train_labels = glob.glob(LABEL_DIRECTORY) train_dataset = tf.data.Dataset.from_tensor_slices(train_files) # NEW - The line below seems to be one option for reading information from # text files. However, parsing raw text with TF is surprisingly awkward; # after a lot of searching I have not found it clearly documented (it likely is somewhere). # label_dataset = tf.data.Dataset.from_tensor_slices(train_labels) # This was taken from the cs230 input-pipeline page provided to us. # The only error it throws is that the read-in text files are of # different sizes. That is, some text files define multiple bounding # boxes. I recommend we just use the first bounding box; that would # give us 4 values for each text file and there would be no issue. label_dataset = tf.data.TextLineDataset(train_labels) # label_dataset = tf.data.TextLineDataset.from_tensor_slices(label_dataset) label_dataset = label_dataset.map( lambda token: tf.string_split([token]).values) label_dataset = label_dataset.map(lambda token: (token, extract_char(token))) # NEW - PLEASE REVIEW - we load images here. # Note that TF throws an error if any image is a different size, # so we can either use the patch scheme of Balle, or we can resize # the images. I'm not sure the patch scheme would work, because # when we compute the MSE I don't know if TF first recombines all the patches # or computes the MSE of each patch. If it's per patch, then we would need # a function to check whether a patch includes a portion of a bounding box. # That said, if we resize the images it's unclear what size they should be, # and we would also have to scale the bounding boxes to the new size. train_dataset = train_dataset.map( load_image, num_parallel_calls=PREPROCESS_THREADS) train_dataset = train_dataset.map( lambda x: tf.random_crop(x, (PATCHSIZE, PATCHSIZE, 3))) # label_dataset = label_dataset.map(load_labels, num_parallel_calls=PREPROCESS_THREADS) # This combines the two datasets so they are coordinated. total_data = tf.data.Dataset.zip((train_dataset, label_dataset)) total_data = total_data.shuffle(buffer_size=len(train_files)).repeat() # We prefetch some initial batches total_data = total_data.batch(BATCH_SIZE) total_data = total_data.prefetch(32) # train_labels = train_labels.batch(BATCH_SIZE) # train_labels = train_labels.prefetch(32) # Determine number of pixels and print input data info num_pixels = BATCH_SIZE * PATCHSIZE**2 print('Num Train File', len(train_files)) print('Num_Pix', num_pixels, BATCH_SIZE, PATCHSIZE) # Get Data - this includes labels and training images x = total_data.make_one_shot_iterator().get_next() # We then pass the training images in x[0] to our autoencoder y = analysis_transform(x[0], NUM_FILTERS) entropy_bottleneck = tfc.EntropyBottleneck() y_tilde, likelihoods = entropy_bottleneck(y, training=True) x_tilde = synthesis_transform(y_tilde, NUM_FILTERS) # Total number of bits divided by number of pixels. train_bpp = tf.reduce_sum(tf.log(likelihoods)) / (-np.log(2) * num_pixels) # Mean squared error across pixels. train_mse = tf.reduce_mean(tf.squared_difference(x[0], x_tilde)) train_mse *= 255**2 # Multiply by 255^2 to correct for rescaling.
######################START TEST DECOTO############################ #Grab the 4 corner values corners = [ tf.string_to_number(x[1][1][1][2]), tf.string_to_number(x[1][1][1][3]), tf.string_to_number(x[1][1][1][4]), tf.string_to_number(x[1][1][1][5]) ] #Build a mask of all 0's of the proper shape to multiply with x[0] (shape = 1,256,256,1) M = tf.zeros([1, x[0].get_shape()[1], x[0].get_shape()[1], 1]) #START PENDING - WORK IN PROGRESS #Replace the 0's in M with 1's for all areas inside the bounding box indices = [] values = [] for i in range(0, 10): #Replace 0 and 10 w/ the corner values for j in range(0, 10): #Replace 0 and 10 w/ the corner values indices.append([0, i, j, 0]) #Indices of values to change values.append(1) #What to change the values at those indices to shape = M.get_shape() delta = tf.SparseTensor(indices, values, shape) delta = tf.cast(delta, tf.float32) M2 = M + tf.sparse_tensor_to_dense(delta) sums = [ tf.reduce_sum(M), tf.reduce_sum(M2) ] #Printed later to check this is working (sum of M = 0, sum of M2 > 0) #END PENDING - WORK IN PROGRESS #Mean squared error for the box portion only train_mse_box = tf.reduce_mean( tf.multiply(tf.squared_difference(x[0], x_tilde), M2)) train_mse_box *= 255**2 #Training loss including the bounding box as a separate loss component train_loss = LMBDA * train_mse + train_bpp + LMBDA2 * train_mse_box ###################END TEST DECOTO############################ # Minimize loss and auxiliary loss, and execute update op. step = tf.train.create_global_step() main_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE) main_step = main_optimizer.minimize(train_loss, global_step=step) aux_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE * 10) aux_step = aux_optimizer.minimize(entropy_bottleneck.losses[0]) train_op = tf.group(main_step, aux_step, entropy_bottleneck.updates[0]) tf.summary.scalar("loss", train_loss) tf.summary.scalar("bpp", train_bpp) tf.summary.scalar("mse", train_mse) tf.summary.image("original", quantize_image(x[0])) tf.summary.image("reconstruction", quantize_image(x_tilde)) # Creates summary for the probability mass function (PMF) estimated in the bottleneck. entropy_bottleneck.visualize() hooks = [ tf.train.StopAtStepHook(last_step=NUM_STEPS), tf.train.NanTensorHook(train_loss) ] ep = 0 epSub = 0 scaffold = tf.train.Scaffold(saver=tf.train.Saver(max_to_keep=1)) with tf.train.MonitoredTrainingSession( scaffold=scaffold, hooks=hooks, checkpoint_dir=MODEL_DIRECTORY, save_checkpoint_secs=CHECKPOINT_SAVE, save_summaries_secs=CHECKPOINT_SAVE) as sess: while not sess.should_stop(): sess.run(train_op) if epSub >= LOG_STEPS: epSub = 0 ep += 1 if epSub == 0: print(ep * LOG_STEPS + epSub, 'train loss', sess.run(train_loss)) ######################START DECOTO EDITS###################################### print('Corners', sess.run(corners)) print('Sums M and M2', sess.run(sums)) ######################END DECOTO EDITS###################################### with open(logFile, 'a') as f: f.write('step=' + str(ep * LOG_STEPS + epSub) + ',train_loss=' + str(sess.run(train_loss)) + ',train_bpp=' + str(sess.run(train_bpp)) + ',train_mse=' + str(sess.run(train_mse)) + '\n') epSub += 1 print('TRAIN COMPLETED')
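# Hedged sketch for the "PENDING - WORK IN PROGRESS" mask above: the box mask can be
# built with tensor ops instead of a fixed Python loop, so the corner tensors can be
# used directly. Assumes the four corners are (row_min, row_max, col_min, col_max)
# already scaled to the patch size; the real ordering depends on the label format.
def box_mask(row_min, row_max, col_min, col_max, height, width):
    rows = tf.cast(tf.range(height), tf.float32)
    cols = tf.cast(tf.range(width), tf.float32)
    in_rows = tf.logical_and(rows >= row_min, rows <= row_max)   # (height,)
    in_cols = tf.logical_and(cols >= col_min, cols <= col_max)   # (width,)
    mask = tf.cast(in_rows, tf.float32)[:, None] * tf.cast(in_cols, tf.float32)[None, :]
    return mask[None, :, :, None]  # (1, height, width, 1), same layout as M2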
def tensor_predict(words_list): num_classes = FLAGS.num_classes num_layers = FLAGS.num_layers num_steps = FLAGS.num_steps embedding_size = FLAGS.embedding_size hidden_size = FLAGS.hidden_size keep_prob = FLAGS.keep_prob vocab_size = FLAGS.vocab_size vocab_path = FLAGS.vocab_path prop_limit = FLAGS.prop_limit checkpoint_path = FLAGS.checkpoint_path # split 1-D String dense Tensor to words SparseTensor sentences = tf.placeholder(dtype=tf.string, shape=[None], name='input_sentences') sparse_words = tf.string_split(sentences, delimiter=' ') # slice SparseTensor valid_indices = tf.less(sparse_words.indices, tf.constant([num_steps], dtype=tf.int64)) valid_indices = tf.reshape( tf.split(valid_indices, [1, 1], axis=1)[1], [-1]) valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices) excess_indices = tf.greater_equal(sparse_words.indices, tf.constant([num_steps], dtype=tf.int64)) excess_indices = tf.reshape( tf.split(excess_indices, [1, 1], axis=1)[1], [-1]) excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices) # sparse to dense words = tf.sparse_to_dense( sparse_indices=valid_sparse_words.indices, output_shape=[valid_sparse_words.dense_shape[0], num_steps], sparse_values=valid_sparse_words.values, default_value='_PAD') # dict words to token ids # with open(os.path.join(vocab_path, 'words_vocab.txt'), 'r') as data_file: # words_table_list = [line.strip() for line in data_file if line.strip()] # words_table_tensor = tf.constant(words_table_list, dtype=tf.string) # words_table = lookup.index_table_from_tensor(mapping=words_table_tensor, default_value=3) words_table = lookup.index_table_from_file(os.path.join( vocab_path, 'words_vocab.txt'), default_value=3) words_ids = words_table.lookup(words) # blstm model predict with tf.variable_scope('model', reuse=None): logits, _ = model.inference(words_ids, valid_sparse_words.dense_shape[0], num_steps, vocab_size, embedding_size, hidden_size, keep_prob, num_layers, num_classes, is_training=False) # using softmax # props = tf.nn.softmax(logits) # max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1) # predict_scores = tf.reshape(max_prop_values, shape=[-1, num_steps]) # predict_labels_ids = tf.reshape(max_prop_indices, shape=[-1, num_steps]) # predict_labels_ids = tf.to_int64(predict_labels_ids) # using crf logits = tf.reshape(logits, shape=[-1, num_steps, num_classes]) transition_params = tf.get_variable("transitions", [num_classes, num_classes]) sequence_length = tf.constant(num_steps, shape=[logits.get_shape()[0]], dtype=tf.int64) predict_labels_ids, _ = crf_utils.crf_decode(logits, transition_params, sequence_length) predict_labels_ids = tf.to_int64(predict_labels_ids) predict_scores = tf.constant(1.0, shape=predict_labels_ids.get_shape(), dtype=tf.float32) # replace untrusted prop that less than prop_limit trusted_prop_flag = tf.greater_equal( predict_scores, tf.constant(prop_limit, dtype=tf.float32)) replace_prop_labels_ids = tf.to_int64( tf.fill(tf.shape(predict_labels_ids), 4)) predict_labels_ids = tf.where(trusted_prop_flag, predict_labels_ids, replace_prop_labels_ids) # dict token ids to labels # with open(os.path.join(vocab_path, 'labels_vocab.txt'), 'r') as data_file: # labels_table_list = [line.strip() for line in data_file if line.strip()] # labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string) # labels_table = lookup.index_to_string_table_from_tensor(mapping=labels_table_tensor, default_value='O') labels_table = lookup.index_to_string_table_from_file(os.path.join( vocab_path, 'labels_vocab.txt'), 
default_value='O') predict_labels = labels_table.lookup(predict_labels_ids) # extract the real BLSTM predicted labels from the dense tensor and store them in a SparseTensor valid_sparse_predict_labels = tf.SparseTensor( indices=valid_sparse_words.indices, values=tf.gather_nd(predict_labels, valid_sparse_words.indices), dense_shape=valid_sparse_words.dense_shape) # create a SparseTensor of excess labels filled with 'O' excess_sparse_predict_labels = tf.SparseTensor( indices=excess_sparse_words.indices, values=tf.fill(tf.shape(excess_sparse_words.values), 'O'), dense_shape=excess_sparse_words.dense_shape) # concatenate the SparseTensors sparse_predict_labels = tf.SparseTensor( indices=tf.concat(axis=0, values=[ valid_sparse_predict_labels.indices, excess_sparse_predict_labels.indices ]), values=tf.concat(axis=0, values=[ valid_sparse_predict_labels.values, excess_sparse_predict_labels.values ]), dense_shape=excess_sparse_predict_labels.dense_shape) sparse_predict_labels = tf.sparse_reorder(sparse_predict_labels) # join the SparseTensor into a 1-D dense string Tensor # remaining issue: num_split should equal the actual batch size, but it is limited to 1 here join_labels_list = [] slice_labels_list = tf.sparse_split(sp_input=sparse_predict_labels, num_split=1, axis=0) for slice_labels in slice_labels_list: slice_labels = slice_labels.values join_labels = tf.reduce_join(slice_labels, reduction_indices=0, separator=' ') join_labels_list.append(join_labels) format_predict_labels = tf.stack(join_labels_list, name='predict_labels') saver = tf.train.Saver() tables_init_op = tf.tables_initializer() with tf.Session() as sess: sess.run(tables_init_op) ckpt = tf.train.get_checkpoint_state(checkpoint_path) if ckpt and ckpt.model_checkpoint_path: print('read model from {}'.format(ckpt.model_checkpoint_path)) saver.restore(sess, ckpt.model_checkpoint_path) else: print('No checkpoint file found at %s' % checkpoint_path) return # run CRF prediction predict_labels_list = sess.run(format_predict_labels, feed_dict={sentences: words_list}) # save graph into a .pb file graph = tf.graph_util.convert_variables_to_constants( sess, sess.graph_def, ["init_all_tables", "predict_labels"]) tf.train.write_graph(graph, '.', 'ner_graph.pb', as_text=False) return predict_labels_list
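# Hedged usage sketch: tensor_predict takes a plain Python list of space-separated
# sentences and returns the joined label strings. The sample sentences are
# illustrative only, not from the original project.
if __name__ == '__main__':
    sentences = ['this is a test sentence', 'another example input']
    predicted = tensor_predict(sentences)
    for sentence, labels in zip(sentences, predicted):
        print(sentence)
        print(labels)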
def file_to_dataset(file_holder): files = tf.train.match_filenames_once(file_holder) dataset = tf.data.TextLineDataset(files) # input Shape must be rank 1 for 'StringSplit', output shape is rank 1 return dataset.map(lambda line: tf.string_split([line]).values)\ .map(lambda str_tokens: tf.cast(tf.string_to_number(string_tensor=str_tokens), tf.int64))
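# Hedged usage sketch: tf.train.match_filenames_once creates a local variable, so
# local variables must be initialized before the iterator is used. The file
# pattern below is an illustrative assumption.
dataset = file_to_dataset('./data/*.txt')
iterator = dataset.make_initializable_iterator()
next_ids = iterator.get_next()
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(iterator.initializer)
    print(sess.run(next_ids))  # int64 token ids parsed from the first line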
def __init__(self, args, txt_file, num_classes, mode, batch_size, num_preprocess_threads=1, shuffle=True, min_queue_examples=1): self.args = args self.txt_file = txt_file self.num_preprocess_threads = num_preprocess_threads self.min_queue_examples = min_queue_examples self.batch_size = batch_size self.mode = mode self.imgShape = [ self.args.imageHeight, self.args.imageWidth, self.args.imageChannels ] self.maskShape = tf.stack( [self.args.imageHeight, self.args.imageWidth]) self.num_classes = int(num_classes) input_queue = tf.train.string_input_producer([txt_file], shuffle=False) line_reader = tf.TextLineReader() _, line = line_reader.read(input_queue) split_line = tf.string_split([line]).values if (mode == 'training' or mode == 'validation'): split_line = tf.string_split([line]).values rgb_image_path = split_line[0] label_image_path = split_line[1] self.image_o = self.read_image(rgb_image_path, 0) # self.image = tf.subtract(self.image, VGG_MEAN) self.label_image_o = self.read_image(label_image_path, 1) do_flip = tf.random_uniform([], 0, 1) self.image = tf.cond( do_flip > 0.5, lambda: tf.image.flip_left_right(self.image_o), lambda: self.image_o) self.label_image = tf.cond( do_flip > 0.5, lambda: tf.image.flip_left_right(self.label_image_o), lambda: self.label_image_o) self.image.set_shape( (self.args.imageHeight, self.args.imageWidth, 3)) self.label_image.set_shape( (self.args.imageHeight, self.args.imageWidth, 1)) # self.img_batch, self.label_batch = tf.train.shuffle_batch([self.imageC, self.label], # batch_size=batch_size, # num_threads=num_preprocess_threads, # capacity=min_queue_examples + 3 * batch_size, # min_after_dequeue=min_queue_examples) self.img_batch, self.label_batch = tf.train.batch( [self.image, self.label_image], batch_size=batch_size, num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size, ) elif (mode == 'test'): print('Generating test Image Batch') split_line = tf.string_split([line]).values rgb_image_path = split_line[0] self.image = self.read_image(rgb_image_path, 0) self.label = rgb_image_path self.image.set_shape( (self.args.imageHeight, self.args.imageWidth, 3)) self.img_batch, self.label_batch = tf.train.batch( [self.image, self.label], batch_size=batch_size, num_threads=num_preprocess_threads, capacity=min_queue_examples + 1 * batch_size, )
def _parse_one_feature(k, x): indices = tf.string_split(x, ":") return tf.cond(pred=tf.equal(k, 'w1'), true_fn=lambda: _mk_wide(indices), # lambda is a must as true_fn/false_fn expects a callable false_fn=lambda: _mk_deep(indices))
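# Hedged illustration of the tf.cond pattern used above: true_fn/false_fn must be
# callables (hence the lambdas) and must return tensors of the same dtype. The toy
# feature strings below are assumptions; _mk_wide/_mk_deep are defined elsewhere.
k = tf.constant('w1')
x = tf.constant(['3:0.5', '7:1.0'])
branch = tf.cond(tf.equal(k, 'w1'),
                 true_fn=lambda: tf.string_split(x, ':').values,
                 false_fn=lambda: tf.constant(['<unused>']))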
def get_iterator(src_dataset, tgt_dataset, src_vocab_table, tgt_vocab_table, batch_size, sos, eos, source_reverse, random_seed, num_buckets, src_max_len=None, tgt_max_len=None, num_threads=4, output_buffer_size=None, skip_count=None): if not output_buffer_size: output_buffer_size = batch_size * 1000 src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32) tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32) print("src_dataset", src_dataset) src_tgt_dataset = tf.contrib.data.Dataset.zip((src_dataset, tgt_dataset)) if skip_count is not None: src_tgt_dataset = src_tgt_dataset.skip(skip_count) src_tgt_dataset = src_tgt_dataset.shuffle(output_buffer_size, random_seed) src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (tf.string_split([src]).values, tf.string_split([tgt]).values), num_threads=num_threads, output_buffer_size=output_buffer_size) # Filter zero length input sequences. src_tgt_dataset = src_tgt_dataset.filter( lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0)) if src_max_len: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src[:src_max_len], tgt), num_threads=num_threads, output_buffer_size=output_buffer_size) if tgt_max_len: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src, tgt[:tgt_max_len]), num_threads=num_threads, output_buffer_size=output_buffer_size) if source_reverse: src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (tf.reverse(src, axis=[0]), tgt), num_threads=num_threads, output_buffer_size=output_buffer_size) # Convert the word strings to ids. Word strings that are not in the # vocab get the lookup table's default_value integer. src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32), tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), num_threads=num_threads, output_buffer_size=output_buffer_size) # Create a tgt_input prefixed with <sos> and a tgt_output suffixed with <eos>. src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src, tf.concat( ([tgt_sos_id], tgt), 0), tf.concat((tgt, [tgt_eos_id]), 0)), num_threads=num_threads, output_buffer_size=output_buffer_size) # Add in the word counts. Subtract one from the target to avoid counting # the target_input <eos> tag (resp. target_output <sos> tag). src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)), num_threads=num_threads, output_buffer_size=output_buffer_size) # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...) print("source target dataset", src_tgt_dataset) def batching_func(x): return x.padded_batch( batch_size, # The first three entries are the source and target line rows; # these have unknown-length vectors. The last two entries are # the source and target row sizes; these are scalars. padded_shapes=( tf.TensorShape([None]), # src tf.TensorShape([None]), # tgt_input tf.TensorShape([None]), # tgt_output tf.TensorShape([]), # src_len tf.TensorShape([])), # tgt_len # Pad the source and target sequences with eos tokens. # (Though notice we don't generally need to do this since # later on we will be masking out calculations past the true sequence. padding_values=( src_eos_id, # src tgt_eos_id, # tgt_input tgt_eos_id, # tgt_output 0, # src_len -- unused 0)) # tgt_len -- unused if num_buckets > 1: def key_func(unused_1, unused_2, unused_3, src_len, tgt_len): # Calculate bucket_width by maximum source sequence length. 
# Pairs with length [0, bucket_width) go to bucket 0, length # [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length # over ((num_bucket-1) * bucket_width) words all go into the last bucket. if src_max_len: bucket_width = (src_max_len + num_buckets - 1) // num_buckets else: bucket_width = 10 # Bucket sentence pairs by the length of their source sentence and target # sentence. print("src_len", tf.to_int64(src_len), "bucket_width", bucket_width) bucket_id = tf.maximum(src_len // bucket_width, tgt_len // bucket_width) return tf.to_int64(tf.minimum(num_buckets, bucket_id)) def reduce_func(unused_key, windowed_data): return batching_func(windowed_data) batched_dataset = src_tgt_dataset.group_by_window( key_func=key_func, reduce_func=reduce_func, window_size=batch_size) else: print("num_buckets", num_buckets) batched_dataset = batching_func(src_tgt_dataset) #sess1 = tf.InteractiveSession() #print("batched_dataset",batched_dataset.eval(session=sess1)) #print("Debug", sess1.run(src_dataset)) batched_iter = batched_dataset.make_initializable_iterator() #iterator1 = batched_dataset.make_one_shot_iterator() #next_element = iterator1.get_next() #print("Debug", batched_iter.shape) #print(sess1.run(next_element)) #print("src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len",src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len) (src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len) = (batched_iter.get_next()) print("src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len", src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len) return BatchedInput(initializer=batched_iter.initializer, source=src_ids, target_input=tgt_input_ids, target_output=tgt_output_ids, source_sequence_length=src_seq_len, target_sequence_length=tgt_seq_len)
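# Hedged usage sketch for get_iterator: build the vocab tables and line datasets,
# run the table initializers and the returned iterator initializer, then fetch
# batches. File names and hyperparameters below are illustrative assumptions.
src_vocab_table = tf.contrib.lookup.index_table_from_file('vocab.src', default_value=0)
tgt_vocab_table = tf.contrib.lookup.index_table_from_file('vocab.tgt', default_value=0)
src_dataset = tf.contrib.data.TextLineDataset(['train.src'])
tgt_dataset = tf.contrib.data.TextLineDataset(['train.tgt'])
batched_input = get_iterator(src_dataset, tgt_dataset, src_vocab_table, tgt_vocab_table,
                             batch_size=128, sos='<s>', eos='</s>', source_reverse=False,
                             random_seed=42, num_buckets=5, src_max_len=50, tgt_max_len=50)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    sess.run(batched_input.initializer)
    src_batch = sess.run(batched_input.source)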
def model_fn(features, labels, mode, params): input = features["input"] # TODO: put this in a file and use index_table_from_file instead (set up graph using init op) vocabulary = tf.constant(list(" abcdefghijklmnopqrstuvwxyz"), name="vocab") # use the vocabulary lookup table vocab = tf.contrib.lookup.index_table_from_tensor(vocabulary) # split input strings into characters with tf.name_scope("encoder"): split = tf.string_split(input, delimiter='') # for each character, lookup the index encoded = vocab.lookup(split) # perform one_hot encoding dense_encoding = tf.sparse_tensor_to_dense(encoded, default_value=-1) one_hot = tf.one_hot(dense_encoding, vocabulary.get_shape()[0]) # TODO: better way of computing sequence lengths in the graph? lengths = tf.cast(tf.reduce_sum(one_hot, reduction_indices=[1, 2]), tf.int32) def rnn_layer(size): keep_prob = 1.0 - params["rnn_dropout"] l = tf.contrib.rnn.GRUCell(size) if keep_prob < 1.0 and mode is tf.estimator.ModeKeys.TRAIN: l = tf.contrib.rnn.DropoutWrapper(l, output_keep_prob=keep_prob) return l rnn_layers = [] rnn_cell_sizes = params["rnn_cells"] for size in rnn_cell_sizes: rnn_layers.append(rnn_layer(size)) multi_rnn_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers) rnn_raw_out, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell, inputs=one_hot, sequence_length=lengths, dtype=tf.float32, scope="rnn_layers") with tf.name_scope("rnn_output_relevant"): # get the last relevant output from the rnn outputs batch = tf.range(0, tf.shape(rnn_raw_out)[0]) # generate 0->batch_size coordinates = tf.stack( [batch, lengths - 1], 1) # stack the 0->batch_size sequence with the sequence lengths rnn_out = tf.gather_nd( rnn_raw_out, coordinates) # perform a gather using those coordinates # output sigmoid layers output_cell_sizes = params["output_cells"] output_dropout = params["output_dropout"] def output_layer(last_layer, last_layer_size, layer_size, dropout): W = tf.Variable(tf.random_uniform((last_layer_size, layer_size), -1, 1), dtype=tf.float32, name="W") b = tf.Variable(tf.random_uniform((1, layer_size), -1, 1), name="b") sig = tf.sigmoid(tf.matmul(last_layer, W) + b) output = sig keep_prob = 1.0 - dropout if keep_prob > 0.0: output = tf.nn.dropout(output, 1.0 - dropout) return output last_layer = rnn_out last_layer_size = rnn_cell_sizes[-1] with tf.name_scope("output_layers"): for idx in range(0, len(output_cell_sizes)): with tf.name_scope("layer" + str(idx)): last_layer = output_layer(last_layer, last_layer_size, output_cell_sizes[idx], output_dropout) last_layer_size = output_cell_sizes[idx] # final prediction output with tf.name_scope("final"): predictions = output_layer(last_layer, last_layer_size, 3, False) # predict predictions_dict = { "color": predictions, } # export outputs exports_dict = { "color": tf.estimator.export.PredictOutput(predictions_dict), } loss = None train_op = None if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]: # calculate loss rsme = tf.sqrt( tf.reduce_sum(tf.square(tf.subtract(labels, predictions)), axis=1)) loss = tf.reduce_sum(rsme) / (tf.cast(tf.shape(input)[0], tf.float32)) # metrics for each mode (train, eval) tf.summary.scalar("loss/" + mode, loss) tf.summary.histogram("loss/" + mode, rsme) if mode is tf.estimator.ModeKeys.TRAIN: learning_rate = params["learning_rate"] # set up optimizer optimizer = tf.train.AdamOptimizer(learning_rate) grads_and_vars = optimizer.compute_gradients(loss) # clip gradients to help with exploding gradients in RNN's grad_clip = params["grad_clip"] grads_and_vars = [(tf.clip_by_value(g, 
-grad_clip, grad_clip), v) for g, v in grads_and_vars] # train global_step = tf.train.get_global_step() train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # training summaries (picked up by tf.Estimator) tf.summary.scalar("learning_rate", learning_rate) for g, v in grads_and_vars: tf.summary.histogram("grads/" + v.name.replace(":", "_"), g) for v in tf.trainable_variables(): tf.summary.histogram("vars/" + v.name.replace(":", "_"), v) return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions_dict, export_outputs=exports_dict, loss=loss, train_op=train_op, )
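# Hedged usage sketch: wiring the model_fn above into a tf.estimator.Estimator.
# The hyperparameter values and model_dir are illustrative assumptions; the keys
# mirror those read inside model_fn (rnn_cells, rnn_dropout, output_cells, ...).
params = {
    "rnn_cells": [64, 64],
    "rnn_dropout": 0.2,
    "output_cells": [32],
    "output_dropout": 0.2,
    "learning_rate": 1e-3,
    "grad_clip": 5.0,
}
estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir="/tmp/color_model",
                                   params=params)
# estimator.train(input_fn=train_input_fn, steps=1000)  # train_input_fn defined elsewhere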
def __init__(self, data_path, filenames_file, params, dataset, mode): self.data_path = data_path self.params = params self.dataset = dataset self.mode = mode self.left_image_batch = None self.right_image_batch = None self.left_next_image_batch = None self.right_next_image_batch = None self.cam_params_batch = None input_queue = tf.train.string_input_producer([filenames_file], shuffle=False) line_reader = tf.TextLineReader() _, line = line_reader.read(input_queue) split_line = tf.string_split([line]).values # we load only one image for test, except if we trained a stereo model if mode == 'test': left_image_path = tf.string_join([self.data_path, split_line[0]]) left_image_o = self.read_image(left_image_path) else: left_image_path = tf.string_join([self.data_path, split_line[0]]) right_image_path = tf.string_join([self.data_path, split_line[1]]) left_next_image_path = tf.string_join( [self.data_path, split_line[2]]) right_next_image_path = tf.string_join( [self.data_path, split_line[3]]) cam_params = tf.string_to_number(split_line[4:11]) height_o = tf.string_to_number(split_line[11]) width_o = tf.string_to_number(split_line[12]) left_image_o = self.read_image(left_image_path) right_image_o = self.read_image(right_image_path) left_next_image_o = self.read_image(left_next_image_path) right_next_image_o = self.read_image(right_next_image_path) # set cam_params shape cam_params = tf.reshape(cam_params, [7]) cam_params = tf.expand_dims(cam_params, 0) h_tensor = tf.expand_dims( tf.cast(tf.constant([self.params.height]), tf.float32), 0) w_tensor = tf.expand_dims( tf.cast(tf.constant([self.params.width]), tf.float32), 0) cam_params = tf.squeeze( tf.concat( [cam_params, h_tensor / height_o, w_tensor / width_o], 1)) # print(h_tensor/height_o) if mode == 'train': # randomly flip images do_flip = tf.random_uniform([], 0, 1) left_image = tf.cond( do_flip > 0.5, lambda: tf.image.flip_left_right(right_image_o), lambda: left_image_o) right_image = tf.cond( do_flip > 0.5, lambda: tf.image.flip_left_right(left_image_o), lambda: right_image_o) left_next_image = tf.cond( do_flip > 0.5, lambda: tf.image.flip_left_right(right_next_image_o), lambda: left_next_image_o) right_next_image = tf.cond( do_flip > 0.5, lambda: tf.image.flip_left_right(left_next_image_o), lambda: right_next_image_o) # randomly augment images do_augment = tf.random_uniform([], 0, 1) left_image, right_image, left_next_image, right_next_image = tf.cond( do_augment > 0.5, lambda: self.augment_image_pair( left_image, right_image, left_next_image, right_next_image ), lambda: (left_image, right_image, left_next_image, right_next_image)) # set image shape left_image.set_shape([self.params.height, self.params.width, 3]) right_image.set_shape([self.params.height, self.params.width, 3]) left_next_image.set_shape( [self.params.height, self.params.width, 3]) right_next_image.set_shape( [self.params.height, self.params.width, 3]) # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size min_after_dequeue = 2048 capacity = min_after_dequeue + 4 * params.batch_size self.left_image_batch, self.right_image_batch, self.left_next_image_batch, self.right_next_image_batch, self.cam_params_batch = tf.train.shuffle_batch( [ left_image, right_image, left_next_image, right_next_image, cam_params ], params.batch_size, capacity, min_after_dequeue, params.num_threads) elif mode == 'test': self.left_image_batch = tf.stack( [left_image_o, tf.image.flip_left_right(left_image_o)], 0) self.left_image_batch.set_shape([2, None, None, 3])
# source_query_tokens_input = tf.expand_dims(source_query_tokens_ph, 0) # source_query_len_input = tf.expand_dims(source_query_len_ph, 0) # source_candidate_tokens_input = tf.expand_dims(source_candidate_tokens_ph, 0) # source_candidate_len_input = tf.expand_dims(source_candidate_len_ph,0) # source_query_tokens_ph = tf.string_split(source_query_tokens_ph, " ") # source_query_len_ph = tf.string_split(source_query_len_ph, " ") # source_candidate_tokens_ph = tf.string_split(source_candidate_tokens_ph, " ") # source_candidate_len_ph = tf.string_split(source_candidate_len_ph, " ") source_query_tokens_input = tf.expand_dims(source_query_tokens_ph, 0) source_query_len_input = tf.expand_dims(source_query_len_ph, 0) source_candidate_tokens_input = tf.expand_dims(source_candidate_tokens_ph, 0) source_candidate_len_input = tf.expand_dims(source_candidate_len_ph, 0) source_query_tokens_input = tf.string_split(source_query_tokens_input) source_query_tokens_input = tf.sparse_tensor_to_dense( source_query_tokens_input, default_value="") # source_query_len_input = tf.string_split(source_query_len_input) source_candidate_tokens_input = tf.string_split(source_candidate_tokens_input) source_candidate_tokens_input = tf.sparse_tensor_to_dense( source_candidate_tokens_input, default_value="") # source_candidate_len_input = tf.string_split(source_candidate_len_input) model( features={ # "source_tokens": source_query_tokens_ph, # "source_len": source_query_len_ph, # "source_candidate_tokens": source_candidate_tokens_ph, # "source_candidate_len": source_candidate_len_ph
def _realize_mappings(self): with tf.device('/cpu:0'), tf.variable_scope('word_embedding'): features = tf.unstack( self.X, axis=1 ) # list with FEATURE_NUM elements, each with shape [batch_size] wide_mappings = {} wide_tensors = [] deep_mappings = {} deep_tensors = [] for one_feature, tag in zip(features, self.tags): if tag.wide_or_deep_side != "wide": continue split_tag = tf.string_split(one_feature, "|") one_sparse = tf.SparseTensor( indices=split_tag.indices, values=tag.table.lookup(split_tag.values) if tag.tag_name == "custom" else split_tag.values, ## the lookup maps each distinct value to its index in the table ## dense_shape=split_tag.dense_shape) wide_mappings[tag.tag_name] = one_sparse wide_tensors.append(tag.embedding_res) for one_feature, tag in zip(features, self.tags): if tag.wide_or_deep_side == "wide": continue split_tag = tf.string_split(one_feature, "|") one_sparse = tf.SparseTensor( indices=split_tag.indices, values=tag.table.lookup(split_tag.values) if tag.tag_name == "custom" else split_tag.values, ## the lookup maps each distinct value to its index in the table ## dense_shape=split_tag.dense_shape) deep_mappings[tag.tag_name] = one_sparse deep_tensors.append(tag.embedding_res) if tag.sibling is not None: sibling = tag.sibling # print("sibling.tag_name = ",sibling.tag_name) # print("sibling.embedding_size = ",sibling.embedding_size) deep_mappings[sibling.tag_name] = one_sparse deep_tensors.append(sibling.embedding_res) mappings = {} tensors = [] for key in wide_mappings: mappings[key] = wide_mappings[key] for key in deep_mappings: mappings[key] = deep_mappings[key] tensors = wide_tensors + deep_tensors wide_and_deep_embedding_res = tf.feature_column.input_layer( mappings, tensors) print("batch_embedding_res.shape = ", wide_and_deep_embedding_res.shape) wide_inputs, deep_inputs = tf.split( wide_and_deep_embedding_res, [self.wide_side_dimension_size, self.deep_side_dimension_size], 1) self.wide_inputs = tf.reshape( wide_inputs, [self.batch_size, self.wide_side_dimension_size]) self.deep_inputs = tf.reshape( deep_inputs, [self.batch_size, self.deep_side_dimension_size]) print("wide_inputs.shape = ", self.wide_inputs.shape) print("deep_inputs.shape = ", self.deep_inputs.shape) '''
def vectorize(string, vocab, seq_len): splitted = tf.string_split([string]).values vectorized = vocab.lookup(splitted) vectorized = vectorized[:seq_len] return vectorized
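# Hedged usage sketch for vectorize: the vocabulary and sentence are illustrative;
# tf.tables_initializer() must run before the lookup is evaluated.
vocab = tf.contrib.lookup.index_table_from_tensor(
    tf.constant(['<unk>', 'the', 'cat', 'sat']), default_value=0)
ids = vectorize(tf.constant('the cat sat'), vocab, seq_len=5)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # e.g. [1 2 3]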
def decode_p1(line): fields = tf.string_split([line], ',').values fields = rtt.PrivateInput(fields, data_owner=1) return fields
def ctc_loss(prob, labels, input_shape, alphabet, alphabet_codes, batch_size, n_pools=2 * 2, decode=True): # Compute seq_len from image width # 2x2 pooling in dimension W on layer 1 and 2 -> n_pools = 2*2 seq_len_inputs = tf.divide( [input_shape[1]] * batch_size, n_pools, name='seq_len_input_op') - 1 # Get keys (letters) and values (integer stand-ins for letters) # Alphabet and codes keys = [c for c in alphabet] # the letters themselves values = alphabet_codes # integer representations # Create non-string labels from the keys and values above # Convert string label to code label with tf.name_scope('str2code_conversion'): table_str2int = tf.contrib.lookup.HashTable( tf.contrib.lookup.KeyValueTensorInitializer(keys, values), -1) splited = tf.string_split( labels, delimiter='' ) # TODO change string split to utf8 split in next tf version codes = table_str2int.lookup(splited.values) sparse_code_target = tf.SparseTensor(splited.indices, codes, splited.dense_shape) seq_lengths_labels = tf.bincount(tf.cast(sparse_code_target.indices[:, 0], tf.int32), minlength=tf.shape(prob)[1]) # Use ctc loss on probabilities from lstm output # Loss # ---- # >>> Cannot have longer labels than predictions -> error with tf.control_dependencies([ tf.less_equal(sparse_code_target.dense_shape[1], tf.reduce_max(tf.cast(seq_len_inputs, tf.int64))) ]): loss_ctc = tf.nn.ctc_loss( labels=sparse_code_target, inputs=prob, sequence_length=tf.cast(seq_len_inputs, tf.int32), preprocess_collapse_repeated=False, ctc_merge_repeated=True, ignore_longer_outputs_than_inputs=True, # returns zero gradient if it happens -> EMA loss = NaN time_major=True) loss_ctc = tf.reduce_mean(loss_ctc) # loss_ctc = tf.Print(loss_ctc, [loss_ctc], message='* Loss : ') if decode: with tf.name_scope('code2str_conversion'): keys = tf.cast(alphabet_codes, tf.int64) values = [c for c in alphabet] table_int2str = tf.contrib.lookup.HashTable( tf.contrib.lookup.KeyValueTensorInitializer(keys, values), '?') sparse_code_pred, log_probability = tf.nn.ctc_beam_search_decoder( prob, sequence_length=tf.cast(seq_len_inputs, tf.int32), merge_repeated=False, beam_width=100, top_paths=2) # Score pred_score = tf.subtract(log_probability[:, 0], log_probability[:, 1]) sparse_code_pred = sparse_code_pred[0] sequence_lengths_pred = tf.bincount(tf.cast( sparse_code_pred.indices[:, 0], tf.int32), minlength=tf.shape(prob)[1]) pred_chars = table_int2str.lookup(sparse_code_pred) words = get_words_from_chars( pred_chars.values, sequence_lengths=sequence_lengths_pred) # tf.summary.text('predicted_words', words[:10]) with tf.name_scope('evaluation'): CER = tf.reduce_mean(tf.edit_distance( sparse_code_pred, tf.cast(sparse_code_target, dtype=tf.int64)), name='CER') # Convert label codes to the decoding alphabet to compare predicted and ground-truth words target_chars = table_int2str.lookup( tf.cast(sparse_code_target, tf.int64)) target_words = get_words_from_chars(target_chars.values, seq_lengths_labels) accuracy = tf.metrics.accuracy(target_words, words, name='accuracy') # CER = tf.Print(CER, [CER], message='-- CER : ') # accuracy = tf.Print(accuracy, [accuracy], message='-- Accuracy : ') else: words = None pred_score = None CER = None accuracy = None return loss_ctc, words, pred_score, CER, accuracy
def extract_char(token, default_value="<PAD>"): out = tf.string_split(token, delimiter='') out = tf.sparse.to_dense(out, default_value=default_value) return out
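# Hedged illustration of extract_char: every token is split into characters and
# right-padded with "<PAD>" to the longest token in the batch. Sample tokens are
# illustrative only.
tokens = tf.constant(['cat', 'horse'])
chars = extract_char(tokens)
# chars -> [['c', 'a', 't', '<PAD>', '<PAD>'],
#           ['h', 'o', 'r', 's', 'e']]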
def extract_raw_value(padded): split_stensor = tf.string_split(padded, delimiter="\t") split_tensor = tf.sparse.to_dense(split_stensor, default_value="") raw_value = split_tensor[:, 0] return raw_value
def train_input_fn(hparams): # create Dataset by train file and skip header line dataset = tf.data.TextLineDataset(hparams.train_file).skip(1) # parse csv dataset = dataset.map(parse_line) # split string dataset = dataset.map( lambda question, answer, lable:( tf.string_split([question]).values, tf.string_split([answer]).values, lable)) # filter question and answer length dataset = dataset.filter( lambda question, answer, lable:( tf.logical_and(tf.size(question) > 0, tf.size(answer) > 0))) dataset = dataset.map( lambda question, answer, lable:( question[:hparams.max_question_len], answer[:hparams.max_answer_len], lable)) # convert word strings to ids vocab_table = create_vocab_table(hparams.vocabulary_file) dataset = dataset.map( lambda question, answer, lable:( vocab_table.lookup(question), vocab_table.lookup(answer), lable)) # add in question and answer sequence length dataset = dataset.map( lambda question, answer, lable:( question, tf.size(question), answer, tf.size(answer), lable)) # padding question and answer dataset = dataset.map(padding_string_sequence) # convert to features dict and lable tuple dataset = dataset.map( lambda question, question_len, answer, answer_len, lable:( { "question":question, "question_len":question_len, "answer":answer, "answer_len":answer_len }, lable)) # shuffle and repeat dataset = dataset.shuffle(1000).repeat() # padded batch as question and answer have varying size dataset = dataset.padded_batch( hparams.batch_size, padded_shapes=( { "question":tf.TensorShape([None]), "question_len":tf.TensorShape([]), "answer":tf.TensorShape([None]), "answer_len":tf.TensorShape([]) }, tf.TensorShape([]))) # create features and lable #iterator = dataset.make_initializable_iterator() #tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) """ question, question_len, answer, answer_len, lable = iterator.get_next() features = dict() features["question"] = question features["question_len"] = question_len features["answer"] = answer features["answer_len"] = answer_len """ return dataset
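# Hedged usage sketch: train_input_fn returns a (features, label) dataset, which
# tf.estimator accepts directly as the return value of an input_fn. The model_fn
# name below is a placeholder assumption; parse_line, create_vocab_table and
# padding_string_sequence are helpers assumed to be defined elsewhere.
estimator = tf.estimator.Estimator(model_fn=my_model_fn, params=hparams)
estimator.train(input_fn=lambda: train_input_fn(hparams), steps=10000)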