Example #1
def main(_):
    t_size = 0
    test_set = [[] for _ in _buckets]
    en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
    with tf.gfile.GFile(en_train, mode='r') as en_file:
        with tf.gfile.GFile(fr_train, mode='r') as fr_file:
            en_sentence, fr_sentence = en_file.readline(), fr_file.readline()
            counter = 0
            while en_sentence and fr_sentence and counter < FLAGS.test_size:
                counter += 1
                en_sentence_ids = [int(x) for x in en_sentence.split()]
                fr_sentence_ids = [int(x) for x in fr_sentence.split()]
                for bucket_id, (size, _) in enumerate(_buckets):
                    if len(en_sentence_ids) < size and len(
                            fr_sentence_ids) < size:
                        test_set[bucket_id].append(
                            [en_sentence_ids, fr_sentence_ids])
                        t_size += 1
                        break
                en_sentence, fr_sentence = en_file.readline(
                ), fr_file.readline()
    print("successful generate test set, now saving")
    with open('test_set.pkl', 'wb') as f:
        pickle.dump(test_set, f)
    print('All set, saved {} sentence pairs'.format(t_size))
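
A minimal sketch of reading the pickled test set back (the 'test_set.pkl' filename and the per-bucket [en_ids, fr_ids] layout are taken from the example above; the random sampling is only for illustration):

import pickle
import random

# Load the bucketed test set written above: one list per bucket,
# each entry an [en_sentence_ids, fr_sentence_ids] pair.
with open('test_set.pkl', 'rb') as f:
    test_set = pickle.load(f)

# Pick a random pair from a randomly chosen non-empty bucket.
non_empty_buckets = [b for b, pairs in enumerate(test_set) if pairs]
bucket_id = random.choice(non_empty_buckets)
en_ids, fr_ids = random.choice(test_set[bucket_id])
print("bucket %d: %d source tokens, %d target tokens"
      % (bucket_id, len(en_ids), len(fr_ids)))
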
def prepare_date(data_dir, vocab_size, sample=True):
    """
    Downloads english/fench pair and returns the data set as an instace of LanguagePairDataSet
    """
    from dataset import LanguageDataSet
    paths = data_utils.prepare_wmt_data(data_dir, vocab_size, vocab_size)
    en2_path, fr2_path, en2013_path, fr2013_path, en_vocab_path, fr_vocab_path = paths
    en_index, en_vocab = read_vocab(en_vocab_path)
    fr_index, fr_vocab = read_vocab(fr_vocab_path)
    # FIXME: some non-ASCII characters
    en_vocab_size = len(en_vocab) + 1
    fr_vocab_size = len(fr_vocab) + 1

    if sample:
        en_ids = read_data(en2013_path)
        fr_ids = read_data(fr2013_path)

    else:
        print("reading the full dataset")
        en_ids = read_data(en2_path)
        fr_ids = read_data(fr2_path)

    # Pad to the same length (the maximum sentence length), using zeros for shorter sentences
    return LanguageDataSet(en_ids, en_vocab, en_index), LanguageDataSet(fr_ids, fr_vocab, fr_index)
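
A minimal usage sketch for the helper above (the 'data' directory and the 40,000-word vocabulary size are placeholder values; LanguageDataSet comes from the dataset module imported inside the function):

# Hypothetical call: build English and French LanguageDataSet objects from the
# smaller WMT dev files (sample=True) with a 40,000-word vocabulary per language.
en_data, fr_data = prepare_date('data', 40000, sample=True)
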
Example #3
def train():
  """Train a en->fr translation model using WMT data."""
  # Prepare WMT data.
  print("Preparing WMT data in %s" % FLAGS.data_dir)
  en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
      FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    dev_set = read_data(en_dev, fr_dev)
    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_buckets)):
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              dev_set, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
        sys.stdout.flush()
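
The "bucket scale" used in the training loop above is just the normalized cumulative sum of the bucket sizes, so drawing a uniform number and taking the first index whose cumulative share exceeds it samples buckets in proportion to their size. A standalone sketch of the same idea (the bucket sizes here are made up for illustration):

import numpy as np

# Cumulative shares of each bucket in the training data.
train_bucket_sizes = [1000, 3000, 4000, 2000]
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
# -> [0.1, 0.4, 0.8, 1.0]

# First bucket whose cumulative share exceeds a uniform draw in [0, 1).
random_number_01 = np.random.random_sample()
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
print("picked bucket", bucket_id)
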
import tensorflow as tf
from tensorflow.models.rnn.translate import data_utils
import translation as tr


# params
eng_size = 40000
fr_size = 40000
embedding_size = 256
num_steps = 50  # assumption: number of encoder/decoder placeholders to create
# prepare_wmt_data returns six paths: train, dev and vocabulary files for each language.
en_train, fr_train, en_test, fr_test, _, _ = data_utils.prepare_wmt_data('data', eng_size, fr_size)

encoder_inputs = []
decoder_inputs = []


# encoder/decoder placeholders, one per time step
for i in range(num_steps):
    encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))


# attention: embedding_attention_seq2seq takes an RNN cell, so build a single-layer
# GRU of embedding_size units (replacing the original num_layers/num_units keywords,
# which the function does not accept; tf.nn.seq2seq API from the TF 0.x line this example targets).
cell = tf.nn.rnn_cell.GRUCell(embedding_size)
outputs, states = tf.nn.seq2seq.embedding_attention_seq2seq(
    encoder_inputs, decoder_inputs, cell,
    num_encoder_symbols=eng_size, num_decoder_symbols=fr_size,
    embedding_size=embedding_size)

with tf.Session() as sess:
    model = tr.create_model(sess)  # assumption: provided by the translation module imported above
    tr.train(model)                # assumption: provided by the translation module imported above
Example #5
def train():
  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)

  if FLAGS.job_name == "ps":
      server.join()
  elif FLAGS.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):

      """Train a en->fr translation model using WMT data."""
      # Prepare WMT data.
      print("Preparing WMT data in %s" % FLAGS.data_dir)
      en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
          FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
      
      
      # Create model.
      print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
      #session = tf.Session(config=tf.ConfigProto(log_device_placement=False))
      #model = create_model(sess, False)
      model = seq2seq_model.Seq2SeqModel(
          FLAGS.en_vocab_size, FLAGS.fr_vocab_size, _buckets,
          FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
          FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
          forward_only=False)
      global_step = tf.Variable(0, name='global_step', trainable=False)
      init_op = tf.initialize_all_variables()
    '''
      ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
      if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
      else:
        print("Created model with fresh parameters.")
        session.run(tf.initialize_all_variables())
      #return model
    '''
    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                             #logdir="./dir/train",
                             logdir=FLAGS.train_dir,
                             init_op=init_op,
                             global_step=global_step,
                             save_model_secs=60)
    #with sv.managed_session(server.target,config=tf.ConfigProto(allow_soft_placement=True,log_device_placement=True)) as sess:
    with sv.managed_session(server.target) as sess:
    
      # Read data into buckets and compute their sizes.
      print ("Reading development and training data (limit: %d)."
             % FLAGS.max_train_data_size)
      dev_set = read_data(en_dev, fr_dev)
      train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
      train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
      train_total_size = float(sum(train_bucket_sizes))
    
      # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
      # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
      # the size of the i-th training bucket, as used later.
      train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                             for i in xrange(len(train_bucket_sizes))]
    
      # This is the training loop.
      step_time, loss = 0.0, 0.0
      current_step = 0
      previous_losses = []
      while True:
        # Choose a bucket according to data distribution. We pick a random number
        # in [0, 1] and use the corresponding interval in train_buckets_scale.
        random_number_01 = np.random.random_sample()
        bucket_id = min([i for i in xrange(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number_01])
    
        # Get a batch and make a step.
        start_time = time.time()
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            train_set, bucket_id)
        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, False)
        step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
        loss += step_loss / FLAGS.steps_per_checkpoint
        current_step += 1
    
        # Once in a while, we save checkpoint, print statistics, and run evals.
        if current_step % FLAGS.steps_per_checkpoint == 0:
          # Print statistics for the previous epoch.
          perplexity = math.exp(loss) if loss < 300 else float('inf')
          #print ("global step %d learning rate %.4f step-time %.2f perplexity "
          #       "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
          #                 step_time, perplexity))
          print ("step-time %.2f perplexity "
                 "%.2f" % (
                           step_time, perplexity))
          # Decrease learning rate if no improvement was seen over last 3 times.
          if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
            sess.run(model.learning_rate_decay_op)
          previous_losses.append(loss)
          # Save checkpoint and zero timer and loss.
          checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
          model.saver.save(sess, checkpoint_path, global_step=model.global_step)
          step_time, loss = 0.0, 0.0
          # Run evals on development set and print their perplexity.
          for bucket_id in xrange(len(_buckets)):
            if len(dev_set[bucket_id]) == 0:
              print("  eval: empty bucket %d" % (bucket_id))
              continue
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                dev_set, bucket_id)
            _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
            eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
            print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
          sys.stdout.flush()
def train():
    tfd = TFCluster.TF_Dist()
    if FLAGS.job_name != "worker":
        return
    with tf.device(tfd.Get_replica_device_setter()):
        print("Creating %d layers of %d units On task %d." %
              (FLAGS.num_layers, FLAGS.size, FLAGS.task_index))
        model = create_model_distributed(False)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        saver = tf.train.Saver()
    sv = tfd.Get_Supervisor(logdir=FLAGS.train_dir,
                            saver=saver,
                            global_step=global_step)
    """Train a en->fr translation model using WMT data."""
    # Prepare WMT data.
    print("Preparing WMT data in %s" % FLAGS.data_dir)
    en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
    with sv.managed_session(tfd.server.target) as sess:
        print("Worker %d: Session initialization complete." % FLAGS.task_index)
        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        dev_set = read_data(en_dev, fr_dev)
        train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        #while True:
        while not sv.should_stop() and current_step < 100000:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                print(
                    "current step %d  step-time %.2f sample-per-sec %.2f perplexity "
                    "%.2f loss %.2f " %
                    (current_step, step_time, FLAGS.batch_size / step_time,
                     perplexity, loss))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                #checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                #model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                    eval_ppx = math.exp(
                        float(eval_loss)) if eval_loss < 300 else float("inf")
                    print("  eval: bucket %d perplexity %.2f" %
                          (bucket_id, eval_ppx))
                sys.stdout.flush()
Example #7
def train():

  # set distributed configs
  ps_hosts = ["9.91.9.129:2222"]
  worker_hosts = ["9.91.9.130:2223", "9.91.9.130:2224", "9.91.9.130:2225"]
  #worker_hosts = ["9.91.9.130:2223"]

  cluster = tf.train.ClusterSpec({"ps":ps_hosts, "worker":worker_hosts})
  server = tf.train.Server(cluster,
                            job_name=FLAGS.job_name,
                            task_index=FLAGS.task_index)
  if FLAGS.job_name == "ps":
        server.join()
  elif FLAGS.job_name == "worker":
      # Worker server 
      is_chief = (FLAGS.task_index == 0)      
      gpu_num = FLAGS.task_index
      with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(cluster=cluster,
            worker_device="/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu_num))):
        #with tf.device("/gpu:%d" % FLAGS.task_index):
            """Train a en->fr translation model using WMT data."""
            # Prepare WMT data.
            print("Preparing WMT data in %s" % FLAGS.data_dir)
            en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
                FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

            # Create model.
            print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))


            logdir = FLAGS.train_dir

            model = seq2seq_model.Seq2SeqModel(
                  FLAGS.en_vocab_size, FLAGS.fr_vocab_size, _buckets,
                  FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
                  FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
                  forward_only=False, is_chief=is_chief, 
                  num_workers=FLAGS.num_workers, task_index=FLAGS.task_index)

            init_op = tf.initialize_all_variables()
            summary_op = tf.merge_all_summaries()
            
            sv = tf.train.Supervisor(is_chief=is_chief,
                                    logdir=logdir,
                                    init_op=init_op,
                                    summary_op = None,
                                    saver=model.saver,
                                    save_model_secs=FLAGS.save_interval_secs,
                                    global_step=model.global_step)          



            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False
                )
                

            #sess = sv.prepare_or_wait_for_session(server.target)
            #with sv.prepare_or_wait_for_session(server.target, sess_config) as sess:
            with sv.managed_session(server.target, config=sess_config) as sess:

                sess.run(init_op)              
                """
                queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
                sv.start_queue_runners(sess, queue_runners)
                tf.logging.info('Started %d queues for processing input data.',
                  len(queue_runners))
                
                if FLAGS.sync_replicas and is_chief:

                  sv.start_queue_runners(sess, model.chief_queue_runners)
                  sess.run(model.init_tokens_op)
                """

                """
                if is_chief:
                  ckpt = tf.train.get_checkpoint_state(logdir)
                  if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
                    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
                    sess.run(init_op)
                    model.saver.restore(sess, ckpt.model_checkpoint_path)
                  else:
                    print("Created model with fresh parameters.")
                    tf.gfile.MakeDirs(logdir)
                    sess.run(init_op)
                """    
                
                # Read data into buckets and compute their sizes.
                print ("Reading development and training data (limit: %d)."
                       % FLAGS.max_train_data_size)
                dev_set = read_data(en_dev, fr_dev)
                train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
                train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
                train_total_size = float(sum(train_bucket_sizes))

                # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
                # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
                # the size of the i-th training bucket, as used later.
                train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                                       for i in xrange(len(train_bucket_sizes))]

                # This is the training loop.
                step_time, loss = 0.0, 0.0
                current_step = 0
                previous_losses = []
                next_summary_time = time.time() + FLAGS.save_summaryies_secs
                #with sess.as_default():
                print("ready to train")
                while not sv.should_stop() and current_step < FLAGS.max_step:
                  #print("start training loop:%d" % current_step)

                  # Choose a bucket according to data distribution. We pick a random number
                  # in [0, 1] and use the corresponding interval in train_buckets_scale.
                  random_number_01 = np.random.random_sample()
                  bucket_id = min([i for i in xrange(len(train_buckets_scale))
                                   if train_buckets_scale[i] > random_number_01])
                  # Get a batch and make a step.
                  start_time = time.time()
                  print("start_time:%.4f" % start_time)

                  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                      train_set, bucket_id)


                  _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                               target_weights, bucket_id, False)
                  
                  step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                  loss += step_loss / FLAGS.steps_per_checkpoint
                  current_step += 1
                  
                  #print("global_step=%d" %step)
                  print("current_step=%d " % current_step)
            
                  # Once in a while, we save checkpoint, print statistics, and run evals.
                  if current_step % FLAGS.steps_per_checkpoint == 0:
                    # Print statistics for the previous epoch.
                    perplexity = math.exp(loss) if loss < 300 else float('inf') 
                    print ("current step %d loss %.4f learning rate %.4f step-time %.2f "
                      "perplexity %.2f" % (current_step, loss, 
                                          model.learning_rate.eval(session = sess),
                                          step_time, perplexity))
                    # Decrease learning rate if no improvement was seen over last 3 times.
                    if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                      print("learning_rate_decay_factor=%d" % FLAGS.learning_rate_decay_factor)
                      
                      #_, step = sess.run([model.train_op, model.global_step])

                    previous_losses.append(loss)
                    
                    if is_chief and next_summary_time < time.time():
                      # Save checkpoint and zero timer and loss.
                      checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                      model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                      next_summary_time = time.time() + FLAGS.save_summaryies_secs
                      # Run evals on development set and print their perplexity.
                    
                    """
                    if is_chief:
                      checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                      model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                    """
                    step_time, loss = 0.0, 0.0
                    for bucket_id in xrange(len(_buckets)):
                      if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                          dev_set, bucket_id)
                      _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                   target_weights, bucket_id, True)
                      eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                      print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
    
                      #print("sys.stdout.flush()")
                      sys.stdout.flush()
                  #print(not sv.should_stop())

            sv.stop()
            print("sv finished")
    # Batch encoder inputs are re-indexed encoder_inputs (batch-major).
    for length_idx in range(encoder_size):
        batch_encoder_inputs.append(
            np.array([encoder_inputs[batch_idx][length_idx]
                      for batch_idx in range(batch_size)], dtype=np.int32))

    # Batch decoder inputs are re-indexed decoder_inputs; we also create target weights.
    for length_idx in range(decoder_size):
        batch_decoder_inputs.append(
            np.array([decoder_inputs[batch_idx][length_idx]
                      for batch_idx in range(batch_size)], dtype=np.int32))

        # Create target_weights to be 0 for targets that are padding.
        batch_weight = np.ones(batch_size, dtype=np.float32)
        for batch_idx in range(batch_size):
            # We set weight to 0 if the corresponding target is a PAD symbol.
            # The corresponding target is decoder_input shifted by 1 forward.
            if length_idx < decoder_size - 1:
                target = decoder_inputs[batch_idx][length_idx + 1]
            if length_idx == decoder_size - 1 or target == data_utils.PAD_ID:
                batch_weight[batch_idx] = 0.0
        batch_weights.append(batch_weight)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights


if __name__ == '__main__':
    # data_utils.maybe_download(DATA_DIR, TRAIN_FN, data_utils._WMT_ENFR_TRAIN_URL)
    # data_utils.maybe_download(DATA_DIR, TRAIN_FN, data_utils._WMT_ENFR_DEV_URL)
    source_path, target_path, _, _, _, _ = data_utils.prepare_wmt_data(DATA_DIR, 10000, 10000)
    data_set = read_data(source_path, target_path, 50)
    batch_encoder_inputs, batch_decoder_inputs, batch_weights = get_batch(
        data_set, 10, 0)
def train():
    """Train a en->fr translation model using WMT data."""
    # Prepare WMT data.
    print("Preparing WMT data in %s" % FLAGS.data_dir)
    en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

    #with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        #with tf.Session() as sess:

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        dev_set = read_data(en_dev, fr_dev)
        train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        # Choose a bucket according to data distribution. We pick a random number
        # in [0, 1] and use the corresponding interval in train_buckets_scale.
        random_number_01 = np.random.random_sample()
        bucket_id = min([
            i for i in xrange(len(train_buckets_scale))
            if train_buckets_scale[i] > random_number_01
        ])
        encoder_inputs = [[0 for x in range(50)]
                          for i in xrange(FLAGS.num_gpus)]
        decoder_inputs = [[0 for x in range(50)]
                          for i in xrange(FLAGS.num_gpus)]
        target_weights = [[0 for x in range(50)]
                          for i in xrange(FLAGS.num_gpus)]
        tower_grads = []
        step_losses = []
        model_list = [[] for i in xrange(FLAGS.num_gpus)]
        #sys.stdout.write("FLAGS.num_gpus:",FLAGS.num_gpus)
        for i in xrange(FLAGS.num_gpus):
            #sys.stdout.write("FLAGS.num_gpus:",FLAGS.num_gpus)
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('TOWER_%d' % (i)) as scope:
                    # Create model.
                    print("Creating %d layers of %d units On Gpu:%d." %
                          (FLAGS.num_layers, FLAGS.size, i))
                    model_list[i] = create_model2(sess, False)

                    step_losses.append(model_list[i].losses[bucket_id])
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    gradient_norms = []
                    #updates = []
                    #for b in xrange(len(_buckets)):
                    params = tf.trainable_variables()
                    #with tf.device('/cpu:0'):
                    gradients = tf.gradients(step_losses[i], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, FLAGS.max_gradient_norm)
                    gradient_norms.append(norm)
                    # Keep track of the gradients across all towers.
                    tower_grads.append(clipped_gradients)
                    #tower_grads.append(model_list[i].clipped_gradients[bucket_id])

        #grads = average_gradients(tower_grads)

        average_grads = average_gradients(tower_grads)

        params = tf.trainable_variables()
        opt = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
        updates = opt.apply_gradients(zip(average_grads, params),
                                      global_step=global_step)
        #zip(tower_grads[0], params), global_step=global_step)
        #zip(grads, params), global_step=global_step)
        train_op = tf.group(updates)
        saver = tf.train.Saver(tf.all_variables())
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            sess.run(tf.initialize_all_variables())

        #input_feed = [{} for i in xrange(FLAGS.num_gpus)]
        #input_feed = {}
        # Calculate the gradients for each model tower.
        while True:
            input_feed = {}
            for i in xrange(FLAGS.num_gpus):
                #sys.stdout.write("FLAGS.num_gpus:",FLAGS.num_gpus)
                #with tf.name_scope('TOWER_%d' % (i)) as scope:
                #print ("gpuId:%d\tcurrent_step:%d"  % (i,current_step))
                #print ("model_list[i].encoder_inputs[l]:%s"  % (model_list[i].encoder_inputs[0].name))

                # Check if the sizes match.
                encoder_size, decoder_size = model_list[i].buckets[bucket_id]
                encoder_inputs[i], decoder_inputs[i], target_weights[
                    i] = model_list[i].get_batch(train_set, bucket_id)

                # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
                for l in xrange(encoder_size):
                    input_feed[model_list[i].encoder_inputs[l].
                               name] = encoder_inputs[i][l]
                for l in xrange(decoder_size):
                    input_feed[model_list[i].decoder_inputs[l].
                               name] = decoder_inputs[i][l]
                    input_feed[model_list[i].target_weights[l].
                               name] = target_weights[i][l]

                # Since our targets are decoder inputs shifted by one, we need one more.
                last_target = model_list[i].decoder_inputs[decoder_size].name
                input_feed[last_target] = np.zeros([model_list[i].batch_size],
                                                   dtype=np.int32)

            #print ("___load end")
            #for GpuIdx in xrange(FLAGS.num_gpus):
            current_step += 1
            # Gradients and SGD update operation for training the model.
            #params = tf.trainable_variables()
            start_time = time.time()
            _, step_loss = sess.run([train_op, step_losses], input_feed)
            #step_loss = sess.run([step_losses])
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss[0] / FLAGS.steps_per_checkpoint
            #print ("___load end:%f" %(step_time))

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print(
                    "global step %d learning rate %.4f step-time %.2f sample-per-sec %.2f perplexity "
                    "%.2f" %
                    (global_step.eval(), model_list[0].learning_rate.eval(),
                     step_time, FLAGS.num_gpus * FLAGS.batch_size / step_time,
                     perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model_list[0].learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir,
                                               "translate.ckpt")
                saver.save(sess, checkpoint_path, global_step=global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                '''
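
The multi-GPU loop above averages per-tower gradients with an average_gradients helper that is not shown in this snippet. A minimal sketch of the usual tower-averaging pattern (not necessarily the author's exact implementation; it assumes every tower produced dense gradient tensors for the same variable list in the same order, as the tower loop above does, and uses the old-style tf.concat(dim, values) signature):

def average_gradients(tower_grads):
    # tower_grads: one list of gradient tensors per tower, all in the
    # same variable order.
    average_grads = []
    for per_var_grads in zip(*tower_grads):
        # Stack this variable's gradient from every tower and average over towers.
        expanded = [tf.expand_dims(g, 0) for g in per_var_grads]
        grad = tf.concat(0, expanded)
        grad = tf.reduce_mean(grad, 0)
        average_grads.append(grad)
    return average_grads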