import tensorflow as tf

# Project-local helpers assumed available from the surrounding code:
# get_batch_data, load_en_vocab, load_zh_vocab, vanilla_transformer,
# label_smoothing, and the hyperparameter bundle `hp`.


class Graph:
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # Decoder inputs: shift the targets right and prepend the <S> token (id 2)
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2: <S>

            # Load vocabulary
            en2idx, idx2en = load_en_vocab()
            zh2idx, idx2zh = load_zh_vocab()

            # Initialize transformer
            transformer = vanilla_transformer(hp, is_training)
            self.enc = transformer.encode(self.x, len(en2idx))

            # Decoder
            self.dec = transformer.decode(self.decoder_inputs, self.enc,
                                          len(zh2idx), hp.maxlen)

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(zh2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))  # mask out <PAD> (id 0)
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget
            ) / tf.reduce_sum(self.istarget)
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss: label-smoothed cross-entropy, averaged over non-padding positions
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(zh2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / tf.reduce_sum(self.istarget)

                # Training scheme
                self.global_step = tf.Variable(0, name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9, beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
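The label_smoothing helper used above is not shown in this listing. A minimal sketch, assuming the standard formulation from the Transformer paper with epsilon = 0.1:

def label_smoothing(inputs, epsilon=0.1):
    # Soften one-hot targets: the true class keeps 1 - epsilon of the mass,
    # and epsilon is spread uniformly over all K classes.
    K = inputs.get_shape().as_list()[-1]  # number of classes (last dimension)
    return ((1 - epsilon) * inputs) + (epsilon / K)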
import codecs
import os

import numpy as np
import tensorflow as tf


def eval(task_name):
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    en2idx, idx2en = load_en_vocab()
    zh2idx, idx2zh = load_zh_vocab()

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            print('Model dir:', hp.logdir)
            mname = '{}'.format(''.join(
                hp.source_test.split('/')[-1].split('.', 3)[:-1])) + open(
                hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name
            print("Model name:", mname)

            ## Inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                scores = []
                print("Iterator:", len(X), hp.batch_size)
                for i in range(len(X) // hp.batch_size):
                    print('Step:\t', i)

                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    ### Autoregressive inference: feed the partial output back in
                    ### and keep only the newly predicted position j at each step
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file, sentence-wise
                    for source, target, pred in zip(sources, targets, preds):
                        got = ' '.join(idx2zh[idx] for idx in pred).split('</S>')[0]
                        if task_name == 'jieba':
                            fout.write("- source: " + source + "\n")
                            fout.write("- expected: " + ' '.join(cut(source, target)) + "\n")
                            fout.write("- got: " + ' '.join(cut(source, got)) + "\n\n")
                        else:
                            fout.write("- source: " + source + "\n")
                            fout.write("- expected: " + target + "\n")
                            fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # Accumulate accuracy against the reference tokenization
                        ref = cut(source, target)
                        hypothesis = cut(source, got)
                        acc = len([w for w in hypothesis if w in ref]) / len(ref)
                        scores.append(min(1, acc))

                fout.write("Tokenization Accuracy = " +
                           str(100 * (sum(scores) / len(scores))))
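A minimal entry point for running the evaluation, assuming this file is executed directly; 'jieba' selects the re-tokenized output branch above, any other value writes the raw text:

if __name__ == '__main__':
    eval(task_name='jieba')  # or any other string for plain, untokenized output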
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9, beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()


if __name__ == '__main__':
    # Load vocabulary
    en2idx, idx2en = load_en_vocab()
    zh2idx, idx2zh = load_zh_vocab()

    # Construct graph
    g = Graph(is_training=True)
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph,
                             logdir=hp.logdir,
                             save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            # NOTE: the original listing was truncated inside this tqdm call;
            # the trailing kwargs and the loop body are an assumption.
            for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70,
                             leave=False, unit='b'):
                sess.run(g.train_op)
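The Adam settings above (beta2=0.98) come from "Attention Is All You Need", but the snippet trains with a fixed hp.lr, whereas the paper warms the learning rate up and then decays it. A sketch of that schedule for comparison (not wired into the code above; d_model and warmup_steps are the paper's defaults):

def noam_lr(step, d_model=512, warmup_steps=4000):
    # lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = max(step, 1)  # avoid 0 ** -0.5 on the first step
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)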
class Graph:
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # Decoder inputs: shift the targets right and prepend the <S> token (id 2)
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2: <S>

            # Load vocabulary
            en2idx, idx2en = load_en_vocab()
            zh2idx, idx2zh = load_zh_vocab()

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                ## Positional encoding: sinusoidal table or learned position embeddings
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(
                        tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                                [tf.shape(self.x)[0], 1]),
                        vocab_size=hp.maxlen,
                        num_units=hp.hidden_units,
                        zero_pad=False,
                        scale=False,
                        scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(self.enc,
                                             rate=hp.dropout_rate,
                                             training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead attention (self-attention)
                        self.enc = multihead_attention(queries=self.enc,
                                                       keys=self.enc,
                                                       num_units=hp.hidden_units,
                                                       num_heads=hp.num_heads,
                                                       dropout_rate=hp.dropout_rate,
                                                       is_training=is_training,
                                                       causality=False)

                        ### Feed forward
                        self.enc = feedforward(self.enc,
                                               num_units=[4 * hp.hidden_units,
                                                          hp.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(zh2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(
                        tf.tile(tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                        vocab_size=hp.maxlen,
                        num_units=hp.hidden_units,
                        zero_pad=False,
                        scale=False,
                        scope="dec_pe")

                ## Dropout
                self.dec = tf.layers.dropout(self.dec,
                                             rate=hp.dropout_rate,
                                             training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead attention (self-attention, causal mask on)
                        self.dec = multihead_attention(queries=self.dec,
                                                       keys=self.dec,
                                                       num_units=hp.hidden_units,
                                                       num_heads=hp.num_heads,
                                                       dropout_rate=hp.dropout_rate,
                                                       is_training=is_training,
                                                       causality=True,
                                                       scope="self_attention")

                        ## Multihead attention (vanilla encoder-decoder attention)
                        self.dec = multihead_attention(queries=self.dec,
                                                       keys=self.enc,
                                                       num_units=hp.hidden_units,
                                                       num_heads=hp.num_heads,
                                                       dropout_rate=hp.dropout_rate,
                                                       is_training=is_training,
                                                       causality=False,
                                                       scope="vanilla_attention")

                        ## Feed forward
                        self.dec = feedforward(self.dec,
                                               num_units=[4 * hp.hidden_units,
                                                          hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(zh2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))  # mask out <PAD> (id 0)
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget
            ) / tf.reduce_sum(self.istarget)
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss: label-smoothed cross-entropy, averaged over non-padding positions
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(zh2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / tf.reduce_sum(self.istarget)

                # Training scheme
                self.global_step = tf.Variable(0, name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9, beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
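The positional_encoding helper called in the hp.sinusoid branches is not shown. A NumPy sketch of the sinusoidal table it is assumed to build, following PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)):

import numpy as np

def sinusoid_table(max_len, num_units):
    # position_enc[pos, i] = pos / 10000^(2 * (i // 2) / num_units)
    position_enc = np.array([
        [pos / np.power(10000, 2.0 * (i // 2) / num_units)
         for i in range(num_units)]
        for pos in range(max_len)])
    position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # even dims: sine
    position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # odd dims: cosine
    return position_enc  # (max_len, num_units), indexed by token position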