def main(_):
    """Build a bucketed test set from the WMT training files and pickle it.

    Reads up to ``FLAGS.test_size`` aligned sentence pairs from the token-id
    files returned by ``data_utils.prepare_wmt_data``, files each pair into
    the first bucket of ``_buckets`` it fits, and writes the resulting list
    of buckets to ``test_set.pkl`` in the current working directory.

    Reading stops early when either input file is exhausted, so the test set
    never contains empty pairs produced by reading past end-of-file.

    Args:
        _: Unused.
    """
    t_size = 0
    test_set = [[] for _ in _buckets]
    en_train, fr_train, _, _, _, _ = data_utils.prepare_wmt_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

    with tf.gfile.GFile(en_train, mode='r') as en_file:
        with tf.gfile.GFile(fr_train, mode='r') as fr_file:
            en_sentence, fr_sentence = en_file.readline(), fr_file.readline()
            counter = 0
            # readline() returns '' at end-of-file; without this check the
            # loop would keep appending empty pairs to the first bucket.
            while en_sentence and fr_sentence and counter < FLAGS.test_size:
                counter += 1
                en_sentence_ids = [int(x) for x in en_sentence.split()]
                fr_sentence_ids = [int(x) for x in fr_sentence.split()]
                # NOTE(review): both sides are compared against the first
                # element of the bucket; the second element is ignored.
                # Confirm this is intended.
                for bucket_id, (size, _) in enumerate(_buckets):
                    if (len(en_sentence_ids) < size
                            and len(fr_sentence_ids) < size):
                        test_set[bucket_id].append(
                            [en_sentence_ids, fr_sentence_ids])
                        t_size += 1
                        break
                en_sentence, fr_sentence = (en_file.readline(),
                                            fr_file.readline())

    print("successful generate test set, now saving")
    with open('test_set.pkl', 'wb') as f:
        pickle.dump(test_set, f)
    print('All set, saved {} sentence pairs'.format(t_size))
def prepare_date(data_dir, vocab_size, sample=True):
    """Prepare the English/French WMT pair and wrap it in data set objects.

    Calls ``data_utils.prepare_wmt_data`` with the same vocabulary size for
    both languages, reads both vocabularies with ``read_vocab`` and the
    sentence ids with ``read_data``.

    Args:
        data_dir: Directory handed to ``data_utils.prepare_wmt_data``.
        vocab_size: Vocabulary size used for both English and French.
        sample: If true (default), read the third and fourth paths returned
            by ``prepare_wmt_data`` (named ``en2013``/``fr2013`` here; other
            callers in this file treat them as the dev set). Otherwise read
            the first two paths, i.e. the full data set.

    Returns:
        A tuple ``(english, french)`` of two ``LanguageDataSet`` instances.
    """
    from dataset import LanguageDataSet

    paths = data_utils.prepare_wmt_data(data_dir, vocab_size, vocab_size)
    (en2_path, fr2_path, en2013_path, fr2013_path,
     en_vocab_path, fr_vocab_path) = paths

    en_index, en_vocab = read_vocab(en_vocab_path)
    fr_index, fr_vocab = read_vocab(fr_vocab_path)
    # FIXME: some non-ascii characters

    if sample:
        en_ids = read_data(en2013_path)
        fr_ids = read_data(fr2013_path)
    else:
        print("reading the full dataset")
        en_ids = read_data(en2_path)
        fr_ids = read_data(fr2_path)

    # TODO: make all sentences the same length (= the max sentence length),
    # padding shorter sentences with zeros.
    return (LanguageDataSet(en_ids, en_vocab, en_index),
            LanguageDataSet(fr_ids, fr_vocab, fr_index))
def train():
    """Train a en->fr translation model using WMT data.

    Builds the model in a fresh session, then loops forever. Each iteration
    samples a bucket in proportion to its share of the training data and
    runs one training step on a batch from it. Every
    ``FLAGS.steps_per_checkpoint`` steps it prints statistics, decays the
    learning rate if the loss has stopped improving, saves a checkpoint under
    ``FLAGS.train_dir`` and reports perplexity on every non-empty dev bucket.

    The loop has no exit condition; the function only returns via an
    exception or external interruption.
    """
    # Prepare WMT data.
    print("Preparing WMT data in %s" % FLAGS.data_dir)
    en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        print ("Reading development and training data (limit: %d)."
               % FLAGS.max_train_data_size)
        dev_set = read_data(en_dev, fr_dev)
        train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that
        # we'll use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            # Both accumulators are running averages over one checkpoint
            # interval, hence the division on every step.
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch. exp() is capped to
                # avoid overflow on very large losses.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print ("global step %d learning rate %.4f step-time %.2f perplexity "
                       "%.2f" % (model.global_step.eval(),
                                 model.learning_rate.eval(),
                                 step_time, perplexity))
                # Decrease learning rate if no improvement was seen over the
                # last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    # Skip buckets with no dev examples, as the other train()
                    # variants in this file do, instead of requesting a batch
                    # from an empty bucket.
                    if not dev_set[bucket_id]:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = (
                        model.get_batch(dev_set, bucket_id))
                    _, eval_loss, _ = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights,
                        bucket_id, True)
                    eval_ppx = (math.exp(eval_loss) if eval_loss < 300
                                else float('inf'))
                    print(" eval: bucket %d perplexity %.2f"
                          % (bucket_id, eval_ppx))
                sys.stdout.flush()
import tensorflow as tf
from tensorflow.models.rnn.translate import data_utils
import translation as tr

# Vocabulary sizes for English/French and the embedding width.
eng_size = 40000
fr_size = 40000
embedding_size = 256

# NOTE(review): every other caller of prepare_wmt_data in this file unpacks
# six return values; unpacking four here would raise ValueError if that is
# the real arity -- confirm.
en_train, fr_train, en_test, fr_test = data_utils.prepare_wmt_data('data', eng_size, fr_size)

encoder_inputs = []
decoder_inputs = []

# Encoder/decoder placeholders.
# NOTE(review): `self` and `i` are not defined anywhere in this script; these
# two statements look lifted from a method body that looped over `i`.
self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
# NOTE(review): `",format(i)` uses a comma where a dot was presumably meant,
# so format(i) is passed as an extra positional argument to tf.placeholder.
self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}",format(i)))

# Attention model.
# NOTE(review): `num_units:embedding_size` is not valid call syntax (probably
# `num_units=embedding_size`), and `seq2seq` is not imported above.
model = seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, num_layers=1, num_units:embedding_size, num_encoder_symbols=eng_size, num_decoder_symbols=fr_size, embedding_size=embedding_size)

with tf.Session() as sess:
    # NOTE(review): `create_model` is not defined or imported in this
    # snippet, this rebinding discards the model built above, and `tf.train`
    # is called as a function here -- confirm the intended training entry
    # point.
    model = create_model(sess)
    tf.train(model)
def train():
    """Train a en->fr translation model using WMT data on a TF cluster.

    The cluster layout comes from ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts`` (comma-separated ``host:port`` lists). A process
    started with ``FLAGS.job_name == "ps"`` only serves variables and blocks
    in ``server.join()``. A ``"worker"`` process builds the model under a
    replica device setter and trains inside a Supervisor-managed session
    until the Supervisor asks it to stop. Any other job name does nothing.

    Every ``FLAGS.steps_per_checkpoint`` local steps the worker prints
    statistics, decays the learning rate if the loss has stopped improving,
    saves a checkpoint under ``FLAGS.train_dir`` and reports perplexity on
    every non-empty dev bucket.
    """
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
        return
    if FLAGS.job_name != "worker":
        return

    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):
        # Prepare WMT data.
        print("Preparing WMT data in %s" % FLAGS.data_dir)
        en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
            FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = seq2seq_model.Seq2SeqModel(
            FLAGS.en_vocab_size, FLAGS.fr_vocab_size, _buckets,
            FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm,
            FLAGS.batch_size, FLAGS.learning_rate,
            FLAGS.learning_rate_decay_factor, forward_only=False)
        # NOTE(review): this variable is separate from model.global_step,
        # which is what the manual checkpoint below passes to the saver.
        # Nothing visible here increments it -- confirm which one the
        # Supervisor should track.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        init_op = tf.initialize_all_variables()

    # The Supervisor initialises (or restores) variables on the chief and
    # makes the other workers wait until the model is ready.
    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                             logdir=FLAGS.train_dir,
                             init_op=init_op,
                             global_step=global_step,
                             save_model_secs=60)

    with sv.managed_session(server.target) as sess:
        # Read data into buckets and compute their sizes.
        print ("Reading development and training data (limit: %d)."
               % FLAGS.max_train_data_size)
        dev_set = read_data(en_dev, fr_dev)
        train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that
        # we'll use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop. Honour stop requests from the
        # Supervisor instead of spinning forever, as the other distributed
        # train() variants in this file do.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while not sv.should_stop():
            # Choose a bucket according to data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            # Running averages over one checkpoint interval.
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch. exp() is capped to
                # avoid overflow on very large losses.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print ("step-time %.2f perplexity "
                       "%.2f" % (step_time, perplexity))
                # Decrease learning rate if no improvement was seen over the
                # last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = (
                        model.get_batch(dev_set, bucket_id))
                    _, eval_loss, _ = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights,
                        bucket_id, True)
                    eval_ppx = (math.exp(eval_loss) if eval_loss < 300
                                else float('inf'))
                    print(" eval: bucket %d perplexity %.2f"
                          % (bucket_id, eval_ppx))
                sys.stdout.flush()
def train(max_steps=100000):
    """Train a en->fr translation model using WMT data via ``TFCluster``.

    Only processes with ``FLAGS.job_name == "worker"`` train; every other
    job returns right after the ``TFCluster.TF_Dist`` object is created.
    The worker builds the model under the cluster's replica device setter
    and trains inside a Supervisor-managed session.

    Every ``FLAGS.steps_per_checkpoint`` local steps it prints statistics,
    decays the learning rate if the loss has stopped improving and reports
    perplexity on every non-empty dev bucket. No checkpoint is written
    explicitly here; the Supervisor is given ``saver`` and the log directory.

    Args:
        max_steps: Maximum number of local training steps before the loop
            exits. Defaults to 100000, the limit that used to be hard-coded.
    """
    tfd = TFCluster.TF_Dist()
    if FLAGS.job_name != "worker":
        return

    with tf.device(tfd.Get_replica_device_setter()):
        print("Creating %d layers of %d units On task %d." %
              (FLAGS.num_layers, FLAGS.size, FLAGS.task_index))
        model = create_model_distributed(False)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        saver = tf.train.Saver()

    sv = tfd.Get_Supervisor(logdir=FLAGS.train_dir,
                            saver=saver,
                            global_step=global_step)

    # Prepare WMT data.
    print("Preparing WMT data in %s" % FLAGS.data_dir)
    en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

    with sv.managed_session(tfd.server.target) as sess:
        print("Worker %d: Session initialization complete." % FLAGS.task_index)
        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        dev_set = read_data(en_dev, fr_dev)
        train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that
        # we'll use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while not sv.should_stop() and current_step < max_steps:
            # Choose a bucket according to data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            # Running averages over one checkpoint interval.
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we print statistics and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch. exp() is capped to
                # avoid overflow on very large losses.
                perplexity = (math.exp(float(loss)) if loss < 300
                              else float("inf"))
                print(
                    "current step %d step-time %.2f sample-per-sec %.2f perplexity "
                    "%.2f loss %.2f " %
                    (current_step, step_time, FLAGS.batch_size / step_time,
                     perplexity, loss))
                # Decrease learning rate if no improvement was seen over the
                # last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Zero timer and loss for the next interval.
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = (
                        model.get_batch(dev_set, bucket_id))
                    _, eval_loss, _ = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights,
                        bucket_id, True)
                    eval_ppx = (math.exp(float(eval_loss)) if eval_loss < 300
                                else float("inf"))
                    print(" eval: bucket %d perplexity %.2f" %
                          (bucket_id, eval_ppx))
                sys.stdout.flush()
def train():
    """Train a en->fr translation model using WMT data on a fixed cluster.

    The parameter-server and worker addresses are hard-coded below. A
    process with ``FLAGS.job_name == "ps"`` only serves variables and blocks
    in ``server.join()``. A ``"worker"`` process builds the model in its own
    graph, pinned to the GPU whose index equals its task index, and trains
    inside a Supervisor-managed session until the Supervisor asks it to stop
    or the model's global step reaches ``FLAGS.max_step``.

    Every ``FLAGS.steps_per_checkpoint`` local steps the worker prints
    statistics and reports perplexity on every non-empty dev bucket. The
    chief additionally writes a checkpoint under ``FLAGS.train_dir`` when
    at least ``FLAGS.save_summaryies_secs`` seconds have passed since the
    last one.
    """
    # Set distributed configs.
    ps_hosts = ["9.91.9.129:2222"]
    worker_hosts = ["9.91.9.130:2223", "9.91.9.130:2224", "9.91.9.130:2225"]
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
        return
    if FLAGS.job_name != "worker":
        return

    # Worker server.
    is_chief = (FLAGS.task_index == 0)
    gpu_num = FLAGS.task_index
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(
                cluster=cluster,
                worker_device="/job:worker/task:%d/gpu:%d"
                % (FLAGS.task_index, gpu_num))):
            # Prepare WMT data.
            print("Preparing WMT data in %s" % FLAGS.data_dir)
            en_train, fr_train, en_dev, fr_dev, _, _ = (
                data_utils.prepare_wmt_data(FLAGS.data_dir,
                                            FLAGS.en_vocab_size,
                                            FLAGS.fr_vocab_size))

            # Create model.
            print("Creating %d layers of %d units."
                  % (FLAGS.num_layers, FLAGS.size))
            logdir = FLAGS.train_dir
            model = seq2seq_model.Seq2SeqModel(
                FLAGS.en_vocab_size, FLAGS.fr_vocab_size, _buckets,
                FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm,
                FLAGS.batch_size, FLAGS.learning_rate,
                FLAGS.learning_rate_decay_factor, forward_only=False,
                is_chief=is_chief, num_workers=FLAGS.num_workers,
                task_index=FLAGS.task_index)
            init_op = tf.initialize_all_variables()

        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=model.saver,
                                 save_model_secs=FLAGS.save_interval_secs,
                                 global_step=model.global_step)
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)

        with sv.managed_session(server.target, config=sess_config) as sess:
            # init_op is deliberately NOT run here: the Supervisor already
            # initialises or restores the variables on the chief, and running
            # it again from each worker would reset the shared parameters.

            # Read data into buckets and compute their sizes.
            print ("Reading development and training data (limit: %d)."
                   % FLAGS.max_train_data_size)
            dev_set = read_data(en_dev, fr_dev)
            train_set = read_data(en_train, fr_train,
                                  FLAGS.max_train_data_size)
            train_bucket_sizes = [len(train_set[b])
                                  for b in xrange(len(_buckets))]
            train_total_size = float(sum(train_bucket_sizes))

            # A bucket scale is a list of increasing numbers from 0 to 1
            # that we'll use to select a bucket. Length of
            # [scale[i], scale[i+1]] is proportional to the size of the
            # i-th training bucket.
            train_buckets_scale = [
                sum(train_bucket_sizes[:i + 1]) / train_total_size
                for i in xrange(len(train_bucket_sizes))]

            # This is the training loop. `current_step` counts this worker's
            # own steps; `step` mirrors the model's global step.
            step_time, loss = 0.0, 0.0
            current_step = 0
            step = 0
            previous_losses = []
            next_summary_time = time.time() + FLAGS.save_summaryies_secs
            print("ready to train")
            while not sv.should_stop() and step < FLAGS.max_step:
                # Choose a bucket according to data distribution. We pick a
                # random number in [0, 1] and use the corresponding interval
                # in train_buckets_scale.
                random_number_01 = np.random.random_sample()
                bucket_id = min([i for i in xrange(len(train_buckets_scale))
                                 if train_buckets_scale[i] > random_number_01])

                # Get a batch and make a step.
                start_time = time.time()
                print("start_time:%.4f" % start_time)
                encoder_inputs, decoder_inputs, target_weights = (
                    model.get_batch(train_set, bucket_id))
                _, step_loss, _ = model.step(
                    sess, encoder_inputs, decoder_inputs, target_weights,
                    bucket_id, False)
                # Running averages over one checkpoint interval.
                step_time += ((time.time() - start_time)
                              / FLAGS.steps_per_checkpoint)
                loss += step_loss / FLAGS.steps_per_checkpoint
                current_step += 1
                # Refresh the global step so the `max_step` bound in the loop
                # condition can actually be reached.
                step = model.global_step.eval(session=sess)
                print("current_step=%d " % current_step)

                # Once in a while, we save checkpoint, print statistics, and
                # run evals. This is keyed on the local step counter because
                # the loss/step-time averages are accumulated per local step.
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    # Print statistics for the previous epoch. exp() is
                    # capped to avoid overflow on very large losses.
                    perplexity = (math.exp(loss) if loss < 300
                                  else float('inf'))
                    print ("current step %d loss %.4f learning rate %.4f step-time %.2f "
                           "perplexity %.2f"
                           % (current_step, loss,
                              model.learning_rate.eval(session=sess),
                              step_time, perplexity))
                    # NOTE(review): unlike the other train() variants in this
                    # file, the learning-rate decay op is not run here; only
                    # the configured factor is logged. Confirm whether that
                    # is intended for the distributed setup.
                    if (len(previous_losses) > 2
                            and loss > max(previous_losses[-3:])):
                        print("learning_rate_decay_factor=%f"
                              % FLAGS.learning_rate_decay_factor)
                    previous_losses.append(loss)

                    if is_chief and next_summary_time < time.time():
                        # Save checkpoint and schedule the next one.
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       "translate.ckpt")
                        model.saver.save(sess, checkpoint_path,
                                         global_step=model.global_step)
                        next_summary_time = (time.time()
                                             + FLAGS.save_summaryies_secs)

                    # Zero timer and loss, then run evals on the development
                    # set and print their perplexity.
                    step_time, loss = 0.0, 0.0
                    for bucket_id in xrange(len(_buckets)):
                        if len(dev_set[bucket_id]) == 0:
                            print(" eval: empty bucket %d" % (bucket_id))
                            continue
                        encoder_inputs, decoder_inputs, target_weights = (
                            model.get_batch(dev_set, bucket_id))
                        _, eval_loss, _ = model.step(
                            sess, encoder_inputs, decoder_inputs,
                            target_weights, bucket_id, True)
                        eval_ppx = (math.exp(eval_loss) if eval_loss < 300
                                    else float('inf'))
                        print(" eval: bucket %d perplexity %.2f"
                              % (bucket_id, eval_ppx))
                    sys.stdout.flush()

            sv.stop()
            print("sv finished")
np.array([encoder_inputs[batch_idx][lenth_idx] for batch_idx in range(batch_size)], dtype=np.int32) ) # Batch decoder inputs are re-indexed decoder_inputs, we create weights. for length_idx in range(decoder_size): batch_decoder_inputs.append( np.array([decoder_inputs[batch_idx][length_idx] for batch_idx in range(batch_size)], dtype=np.int32)) # Create target_weights to be 0 for targets that are padding. batch_weight = np.ones(batch_size, dtype=np.float32) for batch_idx in range(batch_size): # We set weight to 0 if the corresponding target is a PAD symbol. # The corresponding target is decoder_input shifted by 1 forward. if length_idx < decoder_size - 1: target = decoder_inputs[batch_idx][length_idx + 1] if length_idx == decoder_size - 1 or target == data_utils.PAD_ID: batch_weight[batch_idx] = 0.0 batch_weights.append(batch_weight) return batch_encoder_inputs, batch_decoder_inputs, batch_weights if __name__ == '__main__': # data_utils.maybe_download(DATA_DIR, TRAIN_FN, data_utils._WMT_ENFR_TRAIN_URL) # data_utils.maybe_download(DATA_DIR, TRAIN_FN, data_utils._WMT_ENFR_DEV_URL) source_path, target_path, _, _, _, _ = data_utils.prepare_wmt_data(DATA_DIR, 10000, 10000) data_set = read_data(source_path, target_path, 50) batch_encoder_inputs, batch_decoder_inputs, batch_weights = get_batch(data_set , 10,0)
def train(): """Train a en->fr translation model using WMT data.""" # Prepare WMT data. print("Preparing WMT data in %s" % FLAGS.data_dir) en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data( FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size) #with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: #with tf.Session() as sess: # Read data into buckets and compute their sizes. print("Reading development and training data (limit: %d)." % FLAGS.max_train_data_size) dev_set = read_data(en_dev, fr_dev) train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size) train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] train_total_size = float(sum(train_bucket_sizes)) # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to # the size if i-th training bucket, as used later. train_buckets_scale = [ sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes)) ] # This is the training loop. step_time, loss = 0.0, 0.0 current_step = 0 previous_losses = [] # Choose a bucket according to data distribution. We pick a random number # in [0, 1] and use the corresponding interval in train_buckets_scale. 
random_number_01 = np.random.random_sample() bucket_id = min([ i for i in xrange(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01 ]) encoder_inputs = [[0 for x in range(50)] for i in xrange(FLAGS.num_gpus)] decoder_inputs = [[0 for x in range(50)] for i in xrange(FLAGS.num_gpus)] target_weights = [[0 for x in range(50)] for i in xrange(FLAGS.num_gpus)] tower_grads = [] step_losses = [] model_list = [[] for i in xrange(FLAGS.num_gpus)] #sys.stdout.write("FLAGS.num_gpus:",FLAGS.num_gpus) for i in xrange(FLAGS.num_gpus): #sys.stdout.write("FLAGS.num_gpus:",FLAGS.num_gpus) with tf.device('/gpu:%d' % i): with tf.name_scope('TOWER_%d' % (i)) as scope: # Create model. print("Creating %d layers of %d units On Gpu:%d." % (FLAGS.num_layers, FLAGS.size, i)) model_list[i] = create_model2(sess, False) step_losses.append(model_list[i].losses[bucket_id]) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() gradient_norms = [] #updates = [] #for b in xrange(len(_buckets)): params = tf.trainable_variables() #with tf.device('/cpu:0'): gradients = tf.gradients(step_losses[i], params) clipped_gradients, norm = tf.clip_by_global_norm( gradients, FLAGS.max_gradient_norm) gradient_norms.append(norm) # Keep track of the gradients across all towers. 
tower_grads.append(clipped_gradients) #tower_grads.append(model_list[i].clipped_gradients[bucket_id]) #grads = average_gradients(tower_grads) average_grads = average_gradients(tower_grads) params = tf.trainable_variables() opt = tf.train.GradientDescentOptimizer(FLAGS.learning_rate) updates = opt.apply_gradients(zip(average_grads, params), global_step=global_step) #zip(tower_grads[0], params), global_step=global_step) #zip(grads, params), global_step=global_step) train_op = tf.group(updates) ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): print("Reading model parameters from %s" % ckpt.model_checkpoint_path) saver.restore(session, ckpt.model_checkpoint_path) else: print("Created model with fresh parameters.") sess.run(tf.initialize_all_variables()) saver = tf.train.Saver(tf.all_variables()) #input_feed = [{} for i in xrange(FLAGS.num_gpus)] #input_feed = {} # Calculate the gradients for each model tower. while True: input_feed = {} for i in xrange(FLAGS.num_gpus): #sys.stdout.write("FLAGS.num_gpus:",FLAGS.num_gpus) #with tf.name_scope('TOWER_%d' % (i)) as scope: #print ("gpuId:%d\tcurrent_step:%d" % (i,current_step)) #print ("model_list[i].encoder_inputs[l]:%s" % (model_list[i].encoder_inputs[0].name)) # Check if the sizes match. encoder_size, decoder_size = model_list[i].buckets[bucket_id] encoder_inputs[i], decoder_inputs[i], target_weights[ i] = model_list[i].get_batch(train_set, bucket_id) # Input feed: encoder inputs, decoder inputs, target_weights, as provided. for l in xrange(encoder_size): input_feed[model_list[i].encoder_inputs[l]. name] = encoder_inputs[i][l] for l in xrange(decoder_size): input_feed[model_list[i].decoder_inputs[l]. name] = decoder_inputs[i][l] input_feed[model_list[i].target_weights[l]. name] = target_weights[i][l] # Since our targets are decoder inputs shifted by one, we need one more. 
last_target = model_list[i].decoder_inputs[decoder_size].name input_feed[last_target] = np.zeros([model_list[i].batch_size], dtype=np.int32) #print ("___load end") #for GpuIdx in xrange(FLAGS.num_gpus): current_step += 1 # Gradients and SGD update operation for training the model. #params = tf.trainable_variables() start_time = time.time() _, step_loss = sess.run([train_op, step_losses], input_feed) #step_loss = sess.run([step_losses]) step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += step_loss[0] / FLAGS.steps_per_checkpoint #print ("___load end:%f" %(step_time)) # Once in a while, we save checkpoint, print statistics, and run evals. if current_step % FLAGS.steps_per_checkpoint == 0: # Print statistics for the previous epoch. perplexity = math.exp(loss) if loss < 300 else float('inf') print( "global step %d learning rate %.4f step-time %.2f sample-per-sec %.2f perplexity " "%.2f" % (global_step.eval(), model_list[0].learning_rate.eval(), step_time, FLAGS.num_gpus * FLAGS.batch_size / step_time, perplexity)) # Decrease learning rate if no improvement was seen over last 3 times. if len(previous_losses) > 2 and loss > max( previous_losses[-3:]): sess.run(model_list[0].learning_rate_decay_op) previous_losses.append(loss) # Save checkpoint and zero timer and loss. checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt") saver.save(sess, checkpoint_path, global_step=global_step) step_time, loss = 0.0, 0.0 # Run evals on development set and print their perplexity. '''