Example #1
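# Context assumed but not shown in this snippet: `import os`, `import
# numpy as np`, `import tensorflow as tf`, an HParams class (e.g. from
# tf.contrib.training), an argparse-style `args` namespace, and the
# project's own newsgroups, tfrecords, model_fn, and input_fn helpers.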
def main(_):

    # tf.estimator will load/reuse anything found in its model_dir, so
    # we make sure to clear its contents before every training run.
    # For predictions, however, we of course want to load the previously
    # trained model from disk.
    if tf.gfile.Exists(args.model_dir) and not args.predict_only:
        tf.gfile.DeleteRecursively(args.model_dir)
    tf.gfile.MakeDirs(args.model_dir)

    hparams = HParams(**vars(args))

    # We will use the 20 newsgroups dataset to train our model.
    # Note that we won't be using the labels, since our model is simply
    # learning to reconstruct its inputs as its output.
    train_file_path = os.path.join(hparams.data_dir,
                                   '20ng-train-all-terms.txt')

    # Define the path to the file that we'll store our vocabulary in.
    # This file will have the same number of lines as our vocab_size.
    # Each line will contain a single word in our vocabulary, listed in
    # order of decreasing frequency seen in our training data.
    vocab_path = os.path.join(hparams.processed_data_dir, 'vocab.txt')

    # Data preparation: getting vocabulary and saving tfrecords format.
    if not tf.gfile.Exists(vocab_path):
        print('Extracting vocab, labels, and tokenized texts from data.')
        vocab, labels, texts = newsgroups.fit_and_extract(
            train_file_path, hparams.vocab_size)
        print('Saving vocabulary to {}.'.format(vocab_path))
        with open(vocab_path, 'w+') as f:
            f.write('\n'.join(vocab))

        tfrecords_path = os.path.join(hparams.processed_data_dir,
                                      'embed.tfrecords')
        print('Saving tfrecords to {}.'.format(tfrecords_path))
        tfrecords.save_tfrecords(out_path=tfrecords_path,
                                 labels=labels,
                                 texts=texts,
                                 vocab=vocab)
    else:
        print('Reading existing vocabulary from {}.'.format(vocab_path))
        with open(vocab_path) as f:
            vocab = [l.strip() for l in f.readlines()]

    hparams.vocab = vocab
    print('Creating autoencoder.')
    autoencoder = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=hparams.model_dir,
        config=tf.estimator.RunConfig(log_step_count_steps=10000),
        params=hparams)

    if not args.predict_only:
        print('Training autoencoder.')
        autoencoder.train(
            input_fn=lambda: input_fn(hparams.processed_data_dir, hparams),
            steps=1000)

    sample_sentences = [
        'i like dogs', 'i am a test sentence',
        'TensorFlow is a fun library to use'
    ]
    pred_inputs = []
    for sent in sample_sentences:
        token_ids = [
            vocab.index(w) for w in sent.split()[:args.max_seq_len]
            if w in vocab
        ]
        # Pad if necessary.
        if len(token_ids) < args.max_seq_len:
            token_ids.extend([0] * (args.max_seq_len - len(token_ids)))
        pred_inputs.append(token_ids)

    pred_inp_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': np.asarray(pred_inputs)}, shuffle=False)
    predictions = autoencoder.predict(input_fn=pred_inp_fn)

    print('Sample predictions:')
    for i, prediction in enumerate(predictions):
        clean_prediction = ' '.join(
            [tok.decode() for tok in prediction if tok != b'_UNK'])
        print('\nExpected:', sample_sentences[i], sep='\t')
        print('Actual:  ', clean_prediction, sep='\t')
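One note on the prediction loop above: `vocab.index(w)` scans the whole vocabulary list for every token, which is O(vocab_size) per lookup. Below is a minimal sketch of the same encoding step using a dict for O(1) lookups; the `word_to_id`, `encode`, and `pad_id` names are introduced here, and the use of 0 as the padding id simply mirrors the original loop.

word_to_id = {w: i for i, w in enumerate(vocab)}

def encode(sentence, max_seq_len, pad_id=0):
    # Keep at most max_seq_len in-vocabulary tokens, then pad to a fixed
    # length so every prediction input shares the same shape.
    token_ids = [word_to_id[w] for w in sentence.split()[:max_seq_len]
                 if w in word_to_id]
    token_ids.extend([pad_id] * (max_seq_len - len(token_ids)))
    return token_ids

pred_inputs = [encode(s, args.max_seq_len) for s in sample_sentences]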
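Both examples also call a project-local `tfrecords.save_tfrecords` whose body is not shown. For orientation, here is a minimal sketch of what such a helper could look like in TF 1.x; the feature names ('label', 'tokens'), the integer-label assumption, and the assumption that each text is already tokenized are guesses, not the project's actual schema.

def save_tfrecords(out_path, labels, texts, vocab):
    word_to_id = {w: i for i, w in enumerate(vocab)}
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for label, text in zip(labels, texts):
            # Assumes each text is already a list of tokens and each
            # label has already been mapped to an integer id.
            token_ids = [word_to_id[w] for w in text if w in word_to_id]
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
                'tokens': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=token_ids)),
            }))
            writer.write(example.SerializeToString())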
Example #2
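# As in Example #1, this snippet assumes `import os` and `import
# tensorflow as tf`, an `args` namespace, an HParams class, and the
# project's newsgroups, tfrecords, model_fn, and input_fn helpers.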
def main():
    # tf.estimator will load/reuse anything found in its model_dir, so
    # we make sure to clear its contents before every training run.
    # (Unlike Example #1, this script has no predict-only path, so the
    # directory is always reset.)
    if tf.gfile.Exists(args.model_dir):
        tf.gfile.DeleteRecursively(args.model_dir)
    tf.gfile.MakeDirs(args.model_dir)
    tf.gfile.MakeDirs(args.processed_data_dir)
    tf.gfile.Copy(os.path.join(args.data_dir, 'labels.txt'),
                  os.path.join(args.processed_data_dir, 'labels.txt'),
                  overwrite=True)

    hparams = HParams(**vars(args))

    # Define the path to the file that we'll store our vocabulary in.
    # This file will have the same number of lines as our vocab_size.
    # Each line will contain a single word in our vocabulary, listed in
    # order of decreasing frequency seen in our training data.
    vocab_path = os.path.join(hparams.processed_data_dir,
                              'vocab_{}.txt'.format(hparams.vocab_size))

    # Data preparation: getting vocabulary and saving tfrecords format.
    if not tf.gfile.Exists(vocab_path):
        for mode in ['train', 'test']:
            data_file_path = os.path.join(hparams.data_dir,
                                          '20ng-{}-all-terms.txt'.format(mode))

            print('Extracting vocab, labels, and tokenized texts from data.')
            if mode == 'train':
                vocab, labels, texts = newsgroups.fit_and_extract(
                    data_file_path, hparams.vocab_size)
                print('Saving vocabulary to {}.'.format(vocab_path))
                with open(vocab_path, 'w+') as f:
                    f.write('\n'.join(vocab))
            else:
                _, labels, texts = newsgroups.fit_and_extract(
                    data_file_path, hparams.vocab_size)

            tfrecords_path = os.path.join(
                hparams.processed_data_dir,
                '20ng_advanced_{}_{}.tfrecords'.format(mode,
                                                       hparams.vocab_size))
            print('Saving tfrecords to {}.'.format(tfrecords_path))
            tfrecords.save_tfrecords(out_path=tfrecords_path,
                                     labels=labels,
                                     texts=texts,
                                     vocab=vocab)
    else:
        print('Reading existing vocabulary from {}.'.format(vocab_path))
        with open(vocab_path) as f:
            vocab = [l.strip() for l in f.readlines()]

    hparams.vocab = vocab
    print('Creating classifier.')
    classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=hparams.model_dir,
                                        config=tf.estimator.RunConfig(
                                            save_summary_steps=100,
                                            save_checkpoints_steps=500,
                                            log_step_count_steps=10000,
                                        ),
                                        params=hparams)

    for _ in range(hparams.num_iter):
        classifier.train(input_fn=lambda: input_fn(hparams, 'train'),
                         hooks=[
                             tf.train.ProfilerHook(
                                 save_steps=100, output_dir=hparams.model_dir)
                         ],
                         steps=hparams.train_steps)
        classifier.evaluate(input_fn=lambda: input_fn(hparams, 'test'),
                            steps=hparams.eval_steps)
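The manual train/evaluate loop above works, but TF 1.x also provides tf.estimator.train_and_evaluate, which interleaves training and evaluation from a single call. A sketch of the equivalent setup, reusing the input_fn, hparams, and classifier from this example:

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(hparams, 'train'),
        max_steps=hparams.num_iter * hparams.train_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(hparams, 'test'),
        steps=hparams.eval_steps,
        throttle_secs=0)  # evaluate whenever a new checkpoint appears
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)

With save_checkpoints_steps=500 in the RunConfig above, this evaluates roughly every 500 training steps rather than once per num_iter chunk.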