def test_train_and_load_embedding(self):
    """Train and save an embedding to a temp file, then read it back.

    The mapping returned by the loader must contain 26 entries, each of
    length 4 (matching ``embedding_size=4`` in the config).
    """
    passwords = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    with tempfile.NamedTemporaryFile(mode='w') as out_file:
        settings = dict(
            alphabet='abcdefghijklmnopqrstuvwxyz',
            password_batch=5,
            embedding_window_size=1,
            batch_size=2,
            embedding_size=4,
            embedding_num_neg_samples=2,
            logging_freq=1,
            num_train_epochs=1,
        )
        cfg = pe.EmbeddingConfig(**settings)
        trainer = pe.EmbeddingTrainer(cfg)
        with self.test_session() as sess:
            trainer.train_and_save(passwords, sess, out_file)
            # Flush so the second handle opened below sees the written data.
            out_file.flush()
        # Re-open by name while the temp file still exists.
        with open(out_file.name, 'r') as in_file:
            reader = pe.CharEmbeddingLoader(cfg)
            loaded = reader.read_from_file(in_file)
            self.assertEqual(26, len(loaded))
            for _, vector in loaded.items():
                self.assertEqual(4, len(vector))
def test_skipgram_randomize(self):
    """Evaluate the randomized skip-gram tensors once in a test session.

    There are no value assertions: the test passes if evaluation completes,
    and an ``OutOfRangeError`` from the run is tolerated.
    """
    passwords = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    cfg = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=5,
        batch_size=10,
        embedding_window_size=3,
    )
    trainer = pe.EmbeddingTrainer(cfg)
    examples, labels = trainer.skipgram(passwords, randomize=True)

    with self.test_session() as sess:
        sess.run([tf.tables_initializer()])
        coordinator = tf.train.Coordinator()
        runner_threads = tf.train.start_queue_runners(coord=coordinator)
        try:
            sess.run([examples, labels])
        except tf.errors.OutOfRangeError:
            # Deliberately ignored, as in the original test.
            pass
        finally:
            # Always stop and join the queue-runner threads.
            coordinator.request_stop()
            coordinator.join(runner_threads)
def test_graph_builds(self):
    """Smoke test: ``build_graph`` completes on a three-password dataset."""
    cfg = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=5,
        embedding_window_size=3,
    )
    passwords = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    trainer = pe.EmbeddingTrainer(cfg)
    with self.test_session() as sess:
        trainer.build_graph(passwords, sess)
def test_train_loop(self):
    """Run ``train`` for one epoch and check the result has shape [26, 4]."""
    settings = dict(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=5,
        embedding_window_size=1,
        batch_size=2,
        embedding_size=4,
        embedding_num_neg_samples=2,
        logging_freq=1,
        num_train_epochs=1,
    )
    cfg = pe.EmbeddingConfig(**settings)
    passwords = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    trainer = pe.EmbeddingTrainer(cfg)
    with self.test_session() as sess:
        trained = trainer.train(passwords, sess)
        self.assertAllEqual([26, 4], trained.shape)
def test_train_save(self):
    """Smoke test: ``train_and_save`` runs against a temporary output file."""
    with tempfile.NamedTemporaryFile(mode='w') as out_file:
        settings = dict(
            alphabet='abcdefghijklmnopqrstuvwxyz',
            password_batch=5,
            embedding_window_size=1,
            batch_size=2,
            embedding_size=4,
            embedding_num_neg_samples=2,
            logging_freq=1,
            num_train_epochs=1,
        )
        cfg = pe.EmbeddingConfig(**settings)
        passwords = tf.contrib.data.Dataset.from_tensor_slices(
            ['passj', 'word', 'db'])
        trainer = pe.EmbeddingTrainer(cfg)
        with self.test_session() as sess:
            trainer.train_and_save(passwords, sess, out_file)
            out_file.flush()
def test_initial_counts_partial_batch(self):
    """Check ``initial_counts`` on 'pass', 'word', 'db' with password_batch=1.

    Expects one count per alphabet letter (listed in a..z order below) and
    a reported sample total of 7.
    """
    passwords = tf.contrib.data.Dataset.from_tensor_slices(
        ['pass', 'word', 'db'])
    cfg = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=1,
        embedding_window_size=1,
    )
    trainer = pe.EmbeddingTrainer(cfg)
    with self.test_session() as sess:
        letter_counts, num_samples = trainer.initial_counts(passwords, sess)

    expected_counts = [
        # a  b  c  d  e  f  g  h  i  j  k  l  m
        1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        # n  o  p  q  r  s  t  u  v  w  x  y  z
        0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0,
    ]
    self.assertAllEqual(expected_counts, letter_counts)
    self.assertEqual(7, num_samples)
def main(args):
    """Train a password character embedding and save it to the output file.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``input_file``, ``output_file``, ``help_config``, ``config`` and
            ``tensorboard_logdir``.

    Returns:
        None. When the required file arguments are missing, an error line is
        written to stderr and the function returns without training. When
        ``help_config`` is set, the ``EmbeddingConfig.__init__`` docstring is
        written to stdout and the function returns without training.

    Raises:
        pe.ConfigurationException, ValueError: Re-raised after being logged
            when reading ``args.config`` fails.
    """
    # --help-config only prints documentation, so it must not require the
    # input/output files; validate them only for a real training run.
    if not args.help_config and not (args.input_file and args.output_file):
        sys.stderr.write('--input-file and --output-file are required\n')
        return

    log_format = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)

    # Project modules are imported only after logging has been configured,
    # preserving the original statement order.
    import pass_embedding as pe
    import pass_dataset as pd

    if args.help_config:
        # __doc__ is None when docstrings are stripped (e.g. python -OO);
        # fall back to an empty string instead of raising TypeError.
        help_text = pe.EmbeddingConfig.__init__.__doc__ or ''
        sys.stdout.write(help_text + "\n")
        return

    logging.info('Called with %s', vars(args))

    if args.config is None:
        config = pe.EmbeddingConfig()
    else:
        logging.info('Reading configuration from %s', args.config)
        try:
            config = pe.EmbeddingConfig.from_config_file(args.config)
        except (pe.ConfigurationException, ValueError) as e:
            # Log for the operator, then let the caller see the failure.
            logging.critical('Error while reading configuration: %s', e)
            raise

    dataset = pd.PasswordDatasetMakerFromFile([args.input_file]).make()
    with tf.Graph().as_default(), tf.Session() as session:
        trainer = pe.EmbeddingTrainer(
            config, tensorboard_logdir=args.tensorboard_logdir)
        trainer.train_and_save(dataset, session, args.output_file)