def decode():
    """Interactive decoding loop: read sentences from stdin, print a fix.

    Loads an optional KenLM language model, prepares the NLC data and
    vocabulary, builds the model in a fresh TF session, then loops forever
    prompting for a sentence and printing the model's candidate correction.
    Mutates the module-level ``reverse_vocab``, ``vocab`` and ``lm``.
    """
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        # Optional KenLM model; presumably used elsewhere to rescore
        # candidates — confirm against fix_sent's implementation.
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    # Only vocab_path is used below; the train/dev paths are ignored here.
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))

    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        # REPL: loops forever; terminate with Ctrl-C / EOF.
        while True:
            sent = raw_input("Enter a sentence: ")
            output_sent = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
def decode():
    """Interactive decoding loop: read sentences from stdin, print a fix.

    Loads an optional KenLM language model, prepares the NLC data and
    vocabulary, builds the model in a fresh TF session, then loops forever
    prompting for a sentence and printing the model's best candidate.
    Mutates the module-level ``reverse_vocab``, ``vocab`` and ``lm``.

    Cleanup: removed a large block of commented-out batch-evaluation /
    lambda-rank export code that obscured the function; behavior unchanged.
    """
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    # Only vocab_path is used below; the train/dev paths are ignored here.
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))

    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        # REPL: loops forever; terminate with Ctrl-C / EOF.
        while True:
            sent = raw_input("Enter a sentence: ")
            # fix_sent also returns the candidate list, model probabilities
            # and LM scores; only the top candidate is shown interactively.
            output_sent, _, _, _ = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
def load_vocab():
    """Populate the module-level vocabulary (and optional KenLM model).

    Sets ``vocab``, ``reverse_vocab``, ``vocab_size`` and — when
    ``FLAGS.lmfile`` is given — ``lm`` from files under ``FLAGS.data_dir``.
    """
    # Prepare NLC data.
    global reverse_vocab, vocab, vocab_size, lm

    # Optionally load a KenLM language model.
    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    data_path = FLAGS.data_dir + '/' + FLAGS.tokenizer.lower()
    # Only the vocabulary path matters here; train/dev file paths are unused.
    _, _, _, _, vocab_path = nlc_data.prepare_nlc_data(
        data_path, FLAGS.max_vocab_size, tokenizer=nlc_data.char_tokenizer)

    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)
def setup_batch_decode(sess):
    """Prepare data, vocabulary and model for batched dev-set decoding.

    Mutates the module-level ``reverse_vocab``, ``vocab`` and ``lm``.
    Returns ``(model, x_dev, y_dev)`` for the caller to iterate over.
    """
    # decode for dev-sets, in batches
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        # Optional KenLM language model.
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS),
        # NOTE(review): hard-coded cluster path for an extra dev set —
        # verify this exists in the deployment environment.
        other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")

    # BPE vocabularies are loaded differently from word/char ones.
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(
        vocab_path, bpe=(FLAGS.tokenizer.lower() == "bpe"))
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    return model, x_dev, y_dev
def setup_batch_decode(sess):
    """Prepare data, vocabulary and model for batched dev-set decoding.

    Mutates the module-level ``reverse_vocab``, ``vocab`` and ``lm``.
    Returns ``(model, x_dev, y_dev)`` for the caller to iterate over.
    """
    # decode for dev-sets, in batches
    global reverse_vocab, vocab, lm

    # Optionally load a KenLM language model.
    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    data_path = FLAGS.data_dir + '/' + FLAGS.tokenizer.lower()
    # Train paths are unused here; only the dev set and vocab are needed.
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        data_path, FLAGS.max_vocab_size, tokenizer=get_tokenizer(FLAGS))

    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    nlc_model = create_model(sess, vocab_size, False)
    return nlc_model, x_dev, y_dev
def decode():
    """Decode interactively (FLAGS.interactive) or over test.x.txt.

    Interactive mode reads sentences from stdin until the user types
    'exit'; batch mode runs the model over every line of
    ``<data_dir>/<tokenizer>/test.x.txt`` and prints original/revised
    pairs. Mutates module-level ``reverse_vocab``, ``vocab`` and ``lm``.
    """
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    # Only vocab_path is used below; the train/dev paths are ignored here.
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))

    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        if FLAGS.interactive:
            while True:
                sent = raw_input("Enter a sentence: ")
                if sent == 'exit':
                    exit(0)
                output_sent = fix_sent(model, sess, sent.decode('utf-8'))
                print("Candidate: ", output_sent)
        else:
            test_x_data = os.path.join(FLAGS.data_dir, FLAGS.tokenizer.lower()+'/test.x.txt')
            if not os.path.exists(test_x_data):
                print("Please provide {} to test.".format(test_x_data))
                exit(-1)
            with codecs.open(test_x_data, encoding='utf-8') as fr:
                for sent in fr:
                    print("Original: ", sent.strip().encode('utf-8'))
                    # FIX: was `fix_sent(model, sess, sent1)` — `sent1` is
                    # undefined and raised NameError on the first test line.
                    output_sent = fix_sent(model, sess, sent)
                    print("Revised: ", output_sent.encode('utf-8'))
                    print('*'*30)
def pair_iter(fnamex, fnamey, batch_size, num_layers, FLAGS, sort_and_shuffle=True):
    """Yield (source_tokens, source_mask, target_tokens, target_mask) batches.

    Reads the token-id files ``fnamex``/``fnamey`` in parallel, refilling an
    in-memory batch buffer via ``refill`` until both files are exhausted.
    Targets get SOS/EOS markers via ``add_sos_eos``, both sides are padded,
    and the arrays are transposed to time-major layout (rows = timesteps).
    Also (re)initializes the module-level ``vocab``/``reverse_vocab`` from
    the configured vocabulary file.

    Fix: the two input files were opened but never closed (resource leak);
    they are now managed with a ``with`` block, closed when the generator
    finishes or is garbage-collected.
    """
    global vocab, reverse_vocab
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(
        os.path.join(FLAGS['data_dir'], FLAGS['tokenizer'].lower(), "vocab.dat"))

    with open(fnamex) as fdx, open(fnamey) as fdy:
        batches = []
        while True:
            if len(batches) == 0:
                refill(batches, fdx, fdy, batch_size, FLAGS,
                       sort_and_shuffle=sort_and_shuffle)
            if len(batches) == 0:
                # refill produced nothing: both files are exhausted.
                break

            x_tokens, y_tokens = batches.pop(0)
            y_tokens = add_sos_eos(y_tokens)
            x_padded, y_padded = padded(x_tokens, num_layers), padded(y_tokens, 1)

            # Time-major: rows are timesteps, columns are batch entries.
            source_tokens = np.array(x_padded).T
            source_mask = (source_tokens != nlc_data.PAD_ID).astype(np.int32)
            target_tokens = np.array(y_padded).T
            target_mask = (target_tokens != nlc_data.PAD_ID).astype(np.int32)

            yield (source_tokens, source_mask, target_tokens, target_mask)
def train():
    """Train a translation model using NLC data.

    Trains until FLAGS.epochs (forever if 0), checkpointing after every
    epoch and annealing the learning rate when validation cost exceeds
    the worst of the last three epochs.
    """
    # Prepare NLC data.
    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
        print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        if False:  # disabled: parameter counting evals one op per variable (slow)
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(
                map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print("Number of params: %d (retreival took %f secs)" %
                  (num_params, toc - tic))

        epoch = 0
        previous_losses = []
        # Exponential moving average of the batch cost.
        exp_cost = None
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0
            # EMAs of target length and gradient norm, reset each epoch.
            exp_length = None
            exp_norm = None

            ## Train
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)
                toc = time.time()
                iter_time = toc - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                # Normalize to a per-token cost for reporting.
                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    print(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f'
                        % (epoch, current_step, cost, exp_cost / exp_length,
                           grad_norm, param_norm, iter_time, mean_length, std_length))

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            previous_losses.append(valid_cost)
            # Anneal LR when this epoch is worse than the recent maximum.
            if len(previous_losses) > 2 and valid_cost > max(
                    previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            sys.stdout.flush()
def train():
    """Train a translation model using NLC data.

    Side effects: creates FLAGS.train_dir, attaches a file handler to the
    root logger, dumps the run's flags to flags.json, and writes
    "best.ckpt" checkpoints. When validation cost fails to improve, the
    learning rate is annealed and the best epoch's weights are restored.
    """
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    # Mirror all log output into <train_dir>/log.txt.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
        logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        if False:  # disabled: parameter counting evals one op per variable (slow)
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(
                map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print("Number of params: %d (retreival took %f secs)" %
                  (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        previous_losses = []
        # Exponential moving averages of cost / target length / grad norm.
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0

            ## Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)
                toc = time.time()
                iter_time = toc - tic
                # tps = target tokens processed per second since training start.
                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                # Normalize to a per-token cost for reporting.
                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, length mean/std %f/%f'
                        % (epoch, current_step, cost, exp_cost / exp_length,
                           grad_norm, param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                # Validation got worse: anneal LR and roll back to best epoch.
                logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)
            sys.stdout.flush()
def train():
    """Train a translation model using NLC data.

    Side effects: creates FLAGS.train_dir, attaches a file handler to the
    root logger, dumps the run's flags to flags.json, and writes
    "best.ckpt" checkpoints. When validation cost fails to improve, the
    learning rate is annealed and the best epoch's weights are restored.
    """
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    # Mirror all log output into <train_dir>/log.txt.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
        logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        if False:  # disabled: parameter counting evals one op per variable (slow)
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print ("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        previous_losses = []
        # Exponential moving averages of cost / target length / grad norm.
        exp_cost = None
        exp_length = None
        exp_norm = None
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0

            ## Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)
                toc = time.time()
                iter_time = toc - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99*exp_cost + 0.01*cost
                    exp_length = 0.99*exp_length + 0.01*mean_length
                    exp_norm = 0.99*exp_norm + 0.01*grad_norm

                # Normalize to a per-token cost for reporting.
                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                                 (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length))
            epoch_toc = time.time()

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            logging.info("Epoch %d Validation cost: %f time: %f" % (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                # Validation got worse: anneal LR and roll back to best epoch.
                logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)
            sys.stdout.flush()
def train():
    """Train a translation model using NLC data.

    Per-epoch: trains over all batches, writes anomalous batches to disk
    once past FLAGS.anomaly_epochs, validates, and either checkpoints a
    new best epoch or anneals the LR and restores the previous best.
    Finally dumps [train_costs, valid_costs] to costs_data.pkl.

    Fixes: the per-epoch log message read "time:to %2fs" (typo plus a
    field-width-2 format instead of 2 decimal places) — now
    "time: %.2fs"; the pickle output file handle was never closed — now
    managed with a ``with`` block.
    """
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + os.sep + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=nlc_data.char_tokenizer)
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    # Mirror all log output into <train_dir>/log.txt.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(
            map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
        toc = time.time()
        print("Number of params: %d (retrieval took %f secs)" %
              (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        train_costs = []
        valid_costs = []
        previous_valid_losses = []
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0
            epoch_cost = 0
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)
                # Normalize to a per-token cost.
                cost = cost / mean_length
                epoch_cost += cost
                current_step += 1

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, length mean/std %f/%f' %
                        (epoch, current_step, cost, mean_length, std_length))

                # Dump suspiciously expensive batches for later inspection.
                if (epoch >= FLAGS.anomaly_epochs) and \
                        (cost >= FLAGS.anomaly_threshold):
                    write_anomaly(
                        source_tokens, vocab_path,
                        SOURCE_PATH + '_' + str(epoch) + '_' + str(current_step))
                    write_anomaly(
                        target_tokens, vocab_path,
                        TARGET_PATH + '_' + str(epoch) + '_' + str(current_step))

            # One epoch average train cost
            train_costs.append(epoch_cost / current_step)

            # After one epoch average validate cost
            epoch_toc = time.time()
            epoch_time = epoch_toc - epoch_tic
            valid_cost = validate(model, sess, x_dev, y_dev)
            valid_costs.append(valid_cost)
            # FIX: log message was "time:to %2fs" (typo + wrong format spec).
            logging.info("Epoch %d Validation cost: %f time: %.2fs" %
                         (epoch, valid_cost, epoch_time))

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")
            if len(previous_valid_losses) > 2 and valid_cost > previous_valid_losses[-1]:
                # Validation got worse: anneal LR and roll back to best epoch.
                logging.info("Annealing learning rate by %f" %
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_valid_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)

        # FIX: the open() handle was never closed; use a context manager.
        with open('costs_data.pkl', 'wb') as pkl_file:
            pickle.dump([train_costs, valid_costs], pkl_file)
def train():
    """Train a translation model using NLC data.

    Trains until FLAGS.epochs (forever if 0), checkpointing after every
    epoch and computing a per-token validation cost over the dev set.

    Fixes: ``previous_losses`` was referenced at the end of each epoch but
    never initialized (NameError after the first epoch), and the annealing
    condition compared against an undefined name ``loss`` — it now uses
    the current ``valid_cost``, consistent with the sibling train()
    variants in this file.
    """
    # Prepare NLC data.
    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        epoch = 0
        # FIX: was missing, causing a NameError on the first append below.
        previous_losses = []
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0
            # Exponential moving averages of cost / length / grad norm.
            exp_cost = None
            exp_length = None
            exp_norm = None

            ## Train
            for source_tokens, source_mask, target_tokens, target_mask in PairIter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost = model.train(sess, source_tokens, source_mask,
                                              target_tokens, target_mask)
                toc = time.time()
                iter_time = toc - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99*exp_cost + 0.01*cost
                    exp_length = 0.99*exp_length + 0.01*mean_length
                    exp_norm = 0.99*exp_norm + 0.01*grad_norm

                # Normalize to a per-token cost for reporting.
                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    print('epoch %d, iter %d, cost %f, exp_cost %f, grad_norm %f, batch time %f, length mean/std %f/%f' %
                          (epoch, current_step, cost, exp_cost / exp_length,
                           grad_norm, iter_time, mean_length, std_length))

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

            ## Validate: length-weighted average per-token cost over dev set.
            valid_costs, valid_lengths = [], []
            for source_tokens, source_mask, target_tokens, target_mask in PairIter(
                    x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
                cost, _ = model.test(sess, source_tokens, source_mask,
                                     target_tokens, target_mask)
                valid_costs.append(cost * target_mask.shape[1])
                valid_lengths.append(np.sum(target_mask[1:, :]))
            valid_cost = sum(valid_costs) / float(sum(valid_lengths))
            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            previous_losses.append(valid_cost)
            # FIX: was `loss > max(...)` with `loss` undefined.
            if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            sys.stdout.flush()
## Checkpoint import json import os import logging import pdb import tensorflow as tf import nlc_data import numpy as np from decode import decode_beam, detokenize, create_model, FLAGS # from train import create_model from util import pair_iter vocab, reverse_vocab = nlc_data.initialize_vocabulary("data/char/vocab.dat") best_epoch = 2 vocab_size = 42 checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt") config = tf.ConfigProto( device_count={'GPU': 0} ) with tf.Session(config=config) as sess: logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, vocab_size, False) model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch)) valid_costs, valid_lengths = [], []
def train():
    """Train a translation model using NLC data.

    Trains until FLAGS.epochs (forever if 0), checkpointing after every
    epoch and annealing the learning rate when validation cost exceeds
    the worst of the last three epochs.
    """
    # Prepare NLC data.
    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
        print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        if False:  # disabled: parameter counting evals one op per variable (slow)
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print ("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic))

        epoch = 0
        previous_losses = []
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0
            # EMAs of cost / target length / grad norm, reset each epoch.
            exp_cost = None
            exp_length = None
            exp_norm = None

            ## Train
            for source_tokens, source_mask, target_tokens, target_mask in PairIter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)
                toc = time.time()
                iter_time = toc - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99*exp_cost + 0.01*cost
                    exp_length = 0.99*exp_length + 0.01*mean_length
                    exp_norm = 0.99*exp_norm + 0.01*grad_norm

                # Normalize to a per-token cost for reporting.
                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                          (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length))

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            previous_losses.append(valid_cost)
            # Anneal LR when this epoch is worse than the recent maximum.
            if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            sys.stdout.flush()
def train():
    """Actor-critic training on PTB-formatted NLC data.

    Builds four model copies in separate variable scopes (actor, critic,
    delayed_actor, target_critic), optionally pre-trains the actor with
    supervised seq2seq, syncs the delayed/target copies, then runs critic
    pre-training followed by joint actor-critic training.
    Mutates the module-level ``vocab`` and ``rev_vocab``.
    """
    global vocab, rev_vocab

    print("Preparing data in %s" % FLAGS.data_dir)
    path_2_ptb_data = FLAGS.data_dir + "/ptb_data"
    x_train = "{}/train.ids.x".format(path_2_ptb_data)
    y_train = "{}/train.ids.y".format(path_2_ptb_data)
    x_dev = "{}/valid.ids.x".format(path_2_ptb_data)
    y_dev = "{}/valid.ids.y".format(path_2_ptb_data)
    vocab_path = "{}/vocab.dat".format(path_2_ptb_data)

    # source_tokens and target_tokens are transposed
    source_tokens, source_mask, target_tokens, target_mask = build_data(
        fnamex="{}/train.ids.x".format(path_2_ptb_data),
        fnamey="{}/train.ids.y".format(path_2_ptb_data),
        num_layers=FLAGS.num_layers, max_seq_len=FLAGS.max_seq_len)

    vocab, rev_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))

        # Four copies of the model, one per variable scope.
        with tf.variable_scope("actor") as actor_vs:
            model = create_model(vocab_size, False, actor_vs.name)
            setup_actor_update(model)
        with tf.variable_scope("critic") as critic_vs:
            critic = create_model(vocab_size, False, critic_vs.name)
            setup_loss_critic(critic)
        with tf.variable_scope("delayed_actor") as delayed_actor_vs:
            delayed_actor = create_model(vocab_size, False, delayed_actor_vs.name)
            setup_actor_update(delayed_actor)
        with tf.variable_scope("target_critic") as target_critic_vs:
            target_critic = create_model(vocab_size, False, target_critic_vs.name)
            setup_loss_critic(target_critic)

        # if there is not model to restore, we initialize all of them
        # otherwise, we only need to restore ONCE for everything.
        # TODO: is this saving for critic even just for sup_only?
        if not restore_models(sess, model):
            initialize_models(sess, model)
            # this should initialize all variables..
            # by doing this, we are assigning embeddings as well
            # thinking about how critic's embeddings can make sense

        # Collect the trainable variables of each scope for parameter syncing.
        actor_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=actor_vs.name)
        delayed_actor_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=delayed_actor_vs.name)
        critic_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=critic_vs.name)
        target_critic_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_critic_vs.name)

        # initialized but untrained variables are NOT saved
        # remove this code...
        # if FLAGS.rl_new:
        #     actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=actor_vs.name)
        #     # filter down to Adam
        #     actor_vars = [v for v in actor_vars if "Adam_3" or "_power" in v.name]
        #
        #     all_delayed_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=delayed_actor_vs.name)
        #     all_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=critic_vs.name)
        #     all_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_critic_vs.name)
        #
        #     sess.run([tf.variables_initializer(all_delayed_actor_vars),
        #               tf.variables_initializer(all_critic_vars),
        #               tf.variables_initializer(all_target_critic_vars),
        #               tf.variables_initializer(actor_vars)])
        # sess.run(tf.global_variables_initializer())

        if not FLAGS.rl_only:
            model = train_seq2seq(model, sess, x_dev, y_dev, x_train, y_train)  # pre-train actor

        # assign model's parameter values to delayed_actor
        set_params_values(actor_variables, delayed_actor_variables, sess,
                          "actor", "delayed_actor")
        # assign critic's initial parameter values to target_critic
        set_params_values(critic_variables, target_critic_variables, sess,
                          "critic", "target_critic")

        if not FLAGS.sup_only:
            print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))
            # pre-train critic
            train_critic(sess, model, critic, delayed_actor, target_critic,
                         x_dev, y_dev, x_train, y_train,
                         actor_variables, delayed_actor_variables,
                         critic_variables, target_critic_variables,
                         train_epochs=FLAGS.critic_epochs, pretrain=True)

            # train actor-critic (for a given # of epoch?)
            train_critic(sess, model, critic, delayed_actor, target_critic,
                         x_dev, y_dev, x_train, y_train,
                         actor_variables, delayed_actor_variables,
                         critic_variables, target_critic_variables,
                         train_epochs=FLAGS.rl_epochs, pretrain=False)

            print('Final validation cost: %f' % validate(model, sess, x_dev, y_dev))