def main(_):
    # tf.estimator will load/reuse anything found in its model_dir, so
    # we make sure to clear its contents before every training run.
    # For predictions, however, we of course want to load the previously
    # trained model from disk.
    if tf.gfile.Exists(args.model_dir) and not args.predict_only:
        tf.gfile.DeleteRecursively(args.model_dir)
    tf.gfile.MakeDirs(args.model_dir)

    hparams = HParams(**vars(args))

    # We will use the 20 newsgroups dataset to train our model.
    # Note that we won't be using the labels, since our model is simply
    # learning to reconstruct its inputs as its output.
    train_file_path = os.path.join(hparams.data_dir,
                                   '20ng-train-all-terms.txt')

    # Define the path to the file that we'll store our vocabulary in.
    # This file will have the same number of lines as our vocab_size.
    # Each line will contain a single word in our vocabulary, listed in
    # order of decreasing frequency seen in our training data.
    vocab_path = os.path.join(hparams.processed_data_dir, 'vocab.txt')

    # Data preparation: getting vocabulary and saving tfrecords format.
    if not tf.gfile.Exists(vocab_path):
        print('Extracting vocab, labels, and tokenized texts from data.')
        vocab, labels, texts = newsgroups.fit_and_extract(
            train_file_path, hparams.vocab_size)
        print('Saving vocabulary to {}.'.format(vocab_path))
        with open(vocab_path, 'w+') as f:
            f.write('\n'.join(vocab))

        tfrecords_path = os.path.join(hparams.processed_data_dir,
                                      'embed.tfrecords')
        print('Saving tfrecords to {}.'.format(tfrecords_path))
        tfrecords.save_tfrecords(out_path=tfrecords_path,
                                 labels=labels,
                                 texts=texts,
                                 vocab=vocab)
    else:
        print('Reading existing vocabulary from {}.'.format(vocab_path))
        with open(vocab_path) as f:
            vocab = [l.strip() for l in f.readlines()]

    hparams.vocab = vocab

    print('Creating autoencoder.')
    autoencoder = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=hparams.model_dir,
        config=tf.estimator.RunConfig(log_step_count_steps=10000),
        params=hparams)

    if not args.predict_only:
        print('Training autoencoder.')
        autoencoder.train(
            input_fn=lambda: input_fn(hparams.processed_data_dir, hparams),
            steps=1000)

    sample_sentences = [
        'i like dogs',
        'i am a test sentence',
        'TensorFlow is a fun library to use',
    ]
    pred_inputs = []
    for sent in sample_sentences:
        token_ids = [
            vocab.index(w) for w in sent.split()[:args.max_seq_len]
            if w in vocab
        ]
        # Pad if necessary.
        if len(token_ids) < args.max_seq_len:
            token_ids.extend([0] * (args.max_seq_len - len(token_ids)))
        pred_inputs.append(token_ids)

    pred_inp_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': np.asarray(pred_inputs)}, shuffle=False)
    predictions = autoencoder.predict(input_fn=pred_inp_fn)

    print('Sample predictions:')
    for i, prediction in enumerate(predictions):
        clean_prediction = ' '.join(
            [tok.decode() for tok in prediction if tok != b'_UNK'])
        print('\nExpected:', sample_sentences[i], sep='\t')
        print('Actual: ', clean_prediction, sep='\t')
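Note that `main` leans on several project helpers defined elsewhere: `newsgroups.fit_and_extract`, `tfrecords.save_tfrecords`, `input_fn`, and `model_fn`. As a rough sketch of the data-preparation step only, `save_tfrecords` could look something like the following. The feature names (`label`, `sequence`) and the assumption that `texts` holds pre-tokenized word lists are illustrative guesses, not the project's actual schema; out-of-vocabulary words are simply dropped here, whereas the real pipeline evidently maps them to an `_UNK` token (the one filtered out of the sample predictions above).

import tensorflow as tf

def save_tfrecords(out_path, labels, texts, vocab):
    # Hypothetical sketch: serialize each (label, tokenized text) pair
    # as a tf.train.Example, mapping words to vocabulary ids.
    word_to_id = {w: i for i, w in enumerate(vocab)}
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for label, text in zip(labels, texts):
            # Drop words outside the vocabulary for simplicity.
            token_ids = [word_to_id[w] for w in text if w in word_to_id]
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
                'sequence': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=token_ids)),
            }))
            writer.write(example.SerializeToString())

A more advanced version of `main` (below) reuses the same preprocessing to train a supervised classifier instead: it writes tfrecords for both the train and test splits and alternates between training and evaluation.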
def main(_):
    # tf.estimator will load/reuse anything found in its model_dir, so
    # we make sure to clear its contents before every training run.
    if tf.gfile.Exists(args.model_dir):
        tf.gfile.DeleteRecursively(args.model_dir)
    tf.gfile.MakeDirs(args.model_dir)
    tf.gfile.MakeDirs(args.processed_data_dir)
    tf.gfile.Copy(os.path.join(args.data_dir, 'labels.txt'),
                  os.path.join(args.processed_data_dir, 'labels.txt'),
                  overwrite=True)

    hparams = HParams(**vars(args))

    # Define the path to the file that we'll store our vocabulary in.
    # This file will have the same number of lines as our vocab_size.
    # Each line will contain a single word in our vocabulary, listed in
    # order of decreasing frequency seen in our training data.
    vocab_path = os.path.join(hparams.processed_data_dir,
                              'vocab_{}.txt'.format(hparams.vocab_size))

    # Data preparation: getting vocabulary and saving tfrecords format.
    if not tf.gfile.Exists(vocab_path):
        for mode in ['train', 'test']:
            data_file_path = os.path.join(
                hparams.data_dir, '20ng-{}-all-terms.txt'.format(mode))
            print('Extracting vocab, labels, and tokenized texts from data.')
            if mode == 'train':
                # Only the training split determines the vocabulary.
                vocab, labels, texts = newsgroups.fit_and_extract(
                    data_file_path, hparams.vocab_size)
                print('Saving vocabulary to {}.'.format(vocab_path))
                with open(vocab_path, 'w+') as f:
                    f.write('\n'.join(vocab))
            else:
                _, labels, texts = newsgroups.fit_and_extract(
                    data_file_path, hparams.vocab_size)

            tfrecords_path = os.path.join(
                hparams.processed_data_dir,
                '20ng_advanced_{}_{}.tfrecords'.format(mode,
                                                       hparams.vocab_size))
            print('Saving tfrecords to {}.'.format(tfrecords_path))
            tfrecords.save_tfrecords(out_path=tfrecords_path,
                                     labels=labels,
                                     texts=texts,
                                     vocab=vocab)
    else:
        print('Reading existing vocabulary from {}.'.format(vocab_path))
        with open(vocab_path) as f:
            vocab = [l.strip() for l in f.readlines()]

    hparams.vocab = vocab

    print('Creating classifier.')
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=hparams.model_dir,
        config=tf.estimator.RunConfig(
            save_summary_steps=100,
            save_checkpoints_steps=500,
            log_step_count_steps=10000,
        ),
        params=hparams)

    # Alternate between training and evaluation.
    for _ in range(hparams.num_iter):
        classifier.train(
            input_fn=lambda: input_fn(hparams, 'train'),
            hooks=[
                tf.train.ProfilerHook(save_steps=100,
                                      output_dir=hparams.model_dir)
            ],
            steps=hparams.train_steps)
        classifier.evaluate(input_fn=lambda: input_fn(hparams, 'test'),
                            steps=hparams.eval_steps)
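As in the autoencoder version, `input_fn(hparams, 'train')` and `input_fn(hparams, 'test')` are defined elsewhere in the project. Under the same assumed `label`/`sequence` schema as the `save_tfrecords` sketch above (plus a hypothetical `batch_size` hyperparameter), a minimal `tf.data`-based sketch might be:

import os
import tensorflow as tf

def input_fn(hparams, mode):
    # Hypothetical sketch of the Estimator input pipeline. The filename
    # pattern matches what main() writes above; the feature names and
    # hparams.batch_size are assumptions.
    path = os.path.join(
        hparams.processed_data_dir,
        '20ng_advanced_{}_{}.tfrecords'.format(mode, hparams.vocab_size))

    def parse(serialized):
        features = tf.parse_single_example(
            serialized,
            features={
                'label': tf.FixedLenFeature([], tf.int64),
                'sequence': tf.VarLenFeature(tf.int64),
            })
        # Densify the token ids and truncate to max_seq_len.
        sequence = tf.sparse_tensor_to_dense(features['sequence'])
        sequence = sequence[:hparams.max_seq_len]
        return {'x': sequence}, features['label']

    dataset = tf.data.TFRecordDataset(path).map(parse)
    if mode == 'train':
        dataset = dataset.shuffle(buffer_size=10000).repeat()
    # Pad every sequence in a batch out to the fixed max_seq_len.
    dataset = dataset.padded_batch(
        hparams.batch_size,
        padded_shapes=({'x': [hparams.max_seq_len]}, []))
    return dataset

Padding with zeros here matches the convention used when building `pred_inputs` for the autoencoder's sample predictions.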