def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.Model(FLAGS.lmfile)
    else:
        print("No lmfile provided; consider supplying a kenlm ARPA language model file")

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())

    vocab, reverse_vocab = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")
            output_sent = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
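# A minimal sketch of how the loaded kenlm model can be queried, assuming the
# standard kenlm Python bindings; fix_sent presumably uses the language model to
# rescore candidate corrections. Kept as a comment so the module's behaviour is
# unchanged, and the ARPA path below is purely illustrative:
#
#     lm = kenlm.Model("lm.arpa")
#     log10_prob = lm.score("this is a test", bos=True, eos=True)
#     ppl = lm.perplexity("this is a test")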
def tokenize(sent, vocab, depth=FLAGS.num_layers):
    # Pad the token sequence to a multiple of 2^(depth - 1) and build a matching
    # 0/1 mask (1 for real tokens, 0 for padding).
    align = pow(2, depth - 1)
    token_ids = sentence_to_token_ids(sent, vocab, get_tokenizer())
    ones = [1] * len(token_ids)
    pad = (align - len(token_ids)) % align
    token_ids += [PAD_ID] * pad
    ones += [0] * pad
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
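# Illustration of the padding arithmetic above (a sketch, not part of the module's
# API): with depth = 3, align = 2^(3-1) = 4, so a 6-token sentence gets
# pad = (4 - 6) % 4 = 2 PAD_ID tokens appended and a mask of
# [1, 1, 1, 1, 1, 1, 0, 0]; source and mask both come back with shape
# (len(token_ids) + pad, 1).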
def setup_batch_decode(sess):
    # decode for dev-sets, in batches
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    # , other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")

    vocab, reverse_vocab = initialize_vocabulary(vocab_path,
                                                 bpe=(FLAGS.tokenizer.lower() == "bpe"))
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)
    return model, x_dev, y_dev
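# Hedged usage sketch for setup_batch_decode (the batch decoding loop itself is
# not defined in this section, so the iteration below is only indicative):
#
#     with tf.Session() as sess:
#         model, x_dev, y_dev = setup_batch_decode(sess)
#         # ... iterate over (x_dev, y_dev) in batches and decode each batch ...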
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()

        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            # Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)

                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, '
                        'tps %f, length mean/std %f/%f' %
                        (epoch, current_step, cost, exp_cost / exp_length, grad_norm,
                         param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            # Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path)
            else:
                previous_losses.append(valid_cost)
                model.saver.save(sess, checkpoint_path)
            sys.stdout.flush()