def test_train_and_load_embedding(self):
    """Round-trip test: train an embedding, write it out, and read it back.

    The loaded embedding must contain one entry per alphabet character,
    each with embedding_size components.
    """
    test_dataset = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    with tempfile.NamedTemporaryFile(mode='w') as ofile:
        config = pe.EmbeddingConfig(
            alphabet='abcdefghijklmnopqrstuvwxyz',
            password_batch=5,
            embedding_window_size=1,
            batch_size=2,
            embedding_size=4,
            embedding_num_neg_samples=2,
            logging_freq=1,
            num_train_epochs=1)
        trainer = pe.EmbeddingTrainer(config)
        with self.test_session() as session:
            trainer.train_and_save(test_dataset, session, ofile)
        ofile.flush()
        # Re-open by name so the loader reads the flushed contents.
        with open(ofile.name, 'r') as ifile:
            loader = pe.CharEmbeddingLoader(config)
            output = loader.read_from_file(ifile)
        self.assertEqual(26, len(output))
        for value in output.values():
            self.assertEqual(4, len(value))
def test_skipgram_randomize(self):
    """Randomized skipgram generation runs until the input is exhausted."""
    test_dataset = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    config = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=5,
        batch_size=10,
        embedding_window_size=3)
    trainer = pe.EmbeddingTrainer(config)
    examples, labels = trainer.skipgram(test_dataset, randomize=True)
    with self.test_session() as session:
        session.run([tf.tables_initializer()])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            session.run([examples, labels])
        except tf.errors.OutOfRangeError:
            # Expected once the tiny dataset runs dry; not a failure.
            pass
        finally:
            # Always shut the queue-runner threads down cleanly.
            coord.request_stop()
            coord.join(threads)
def test_expand_one_hot(self):
    """Training encoding one-hots prefixes and labels over alphabet 'abc'.

    Prefixes are right-padded with 0; characters are given as ord() codes,
    and '\n' marks end-of-password (the extra label column).
    """
    with self.test_session() as sess:
        config = pe.EmbeddingConfig(alphabet='abc', batch_size=6)
        encoder = p_enc.encoder_from_config(config)
        encoder.one_time_tensor_initialize()
        prefix = tf.convert_to_tensor([
            [0, 0],
            [ord('a'), 0],
            [ord('a'), ord('b')],
            [0, 0],
            [ord('b'), 0],
            [ord('b'), ord('c')],
        ])
        labels = tf.convert_to_tensor(
            [ord(c) for c in ['a', 'b', '\n', 'b', 'c', '\n']])
        seq_len = tf.convert_to_tensor([0, 1, 2, 0, 1, 2])
        one_hot_prefix, one_hot_label = encoder.encode_training(
            prefix, labels)
        sess.run(encoder.initializers())
        outputs_out, lab_out, seq_len_out = sess.run(
            [one_hot_prefix, one_hot_label, seq_len])
        # Padding positions encode as all-zero rows.
        self.assertAllClose(
            [[[0, 0, 0], [0, 0, 0]],
             [[1, 0, 0], [0, 0, 0]],
             [[1, 0, 0], [0, 1, 0]],
             [[0, 0, 0], [0, 0, 0]],
             [[0, 1, 0], [0, 0, 0]],
             [[0, 1, 0], [0, 0, 1]]],
            outputs_out)
        # Labels get one extra class (last column) for end-of-password.
        self.assertAllClose(
            [[1, 0, 0, 0],
             [0, 1, 0, 0],
             [0, 0, 0, 1],
             [0, 1, 0, 0],
             [0, 0, 1, 0],
             [0, 0, 0, 1]],
            lab_out)
        self.assertAllEqual([0, 1, 2, 0, 1, 2], seq_len_out)
def test_graph_builds(self):
    """Smoke test: the training graph builds without raising."""
    config = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=5,
        embedding_window_size=3)
    test_dataset = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    trainer = pe.EmbeddingTrainer(config)
    with self.test_session() as session:
        trainer.build_graph(test_dataset, session)
def test_input_prefix_to_tensor_padding(self):
    """Input encoding one-hots characters and leaves padding as zeros.

    97/98/99 are ord('a')/ord('b')/ord('c'); 10 is '\n', which encodes to
    an all-zero row.
    """
    with self.test_session() as sess:
        config = pe.EmbeddingConfig(alphabet='abc')
        encoder = p_enc.encoder_from_config(config)
        encoder.one_time_tensor_initialize()
        encoded = encoder.encode_inputs([[97, 98, 10], [99, 99, 10]])
        sess.run(encoder.initializers())
        expected = [
            [[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]],
            [[0., 0., 1.], [0., 0., 1.], [0., 0., 0.]],
        ]
        self.assertAllEqual(expected, sess.run(encoded))
def test_alphabet_not_float(self):
    """Loading fails when an embedding value is not numeric.

    The 'b' entry holds a string instead of a float, so read_from_file
    must raise LoadException. Uses assertRaises instead of the manual
    errored-flag pattern for clarity.
    """
    ifile = io.StringIO("""{
        "a": [0.4],
        "b" : ["asdf"],
        "c" : [0.2] }""")
    config = pe.EmbeddingConfig(alphabet='ab', embedding_size=1)
    loader = pe.CharEmbeddingLoader(config)
    with self.assertRaises(pe.CharEmbeddingLoader.LoadException):
        loader.read_from_file(ifile)
def test_alphabet_not_equal(self):
    """Loading fails when the file's characters don't cover the alphabet.

    The config expects 26 letters but the file only provides 'a' and 'b',
    so read_from_file must raise LoadException. Uses assertRaises instead
    of the manual errored-flag pattern for clarity.
    """
    config = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz', embedding_size=1)
    ifile = io.StringIO("""{
        "a" : [0.4],
        "b" : [0.1]}""")
    loader = pe.CharEmbeddingLoader(config)
    with self.assertRaises(pe.CharEmbeddingLoader.LoadException):
        loader.read_from_file(ifile)
def test_train_loop(self):
    """Training returns an embedding matrix of shape [alphabet, emb_size]."""
    config = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=5,
        embedding_window_size=1,
        batch_size=2,
        embedding_size=4,
        embedding_num_neg_samples=2,
        logging_freq=1,
        num_train_epochs=1)
    test_dataset = tf.contrib.data.Dataset.from_tensor_slices(
        ['passj', 'word', 'db'])
    trainer = pe.EmbeddingTrainer(config)
    with self.test_session() as session:
        result = trainer.train(test_dataset, session)
    # 26 alphabet characters, 4 embedding dimensions.
    self.assertAllEqual([26, 4], result.shape)
def test_train_save(self):
    """Smoke test: train_and_save writes to a file without raising."""
    with tempfile.NamedTemporaryFile(mode='w') as ofile:
        config = pe.EmbeddingConfig(
            alphabet='abcdefghijklmnopqrstuvwxyz',
            password_batch=5,
            embedding_window_size=1,
            batch_size=2,
            embedding_size=4,
            embedding_num_neg_samples=2,
            logging_freq=1,
            num_train_epochs=1)
        test_dataset = tf.contrib.data.Dataset.from_tensor_slices(
            ['passj', 'word', 'db'])
        trainer = pe.EmbeddingTrainer(config)
        with self.test_session() as session:
            trainer.train_and_save(test_dataset, session, ofile)
        ofile.flush()
def test_initial_counts_partial_batch(self):
    """Character counts are correct when the last batch is partial.

    password_batch=1 with three passwords forces partial/uneven batching;
    the per-character counts and total sample count must still be exact.
    """
    test_dataset = tf.contrib.data.Dataset.from_tensor_slices(
        ['pass', 'word', 'db'])
    config = pe.EmbeddingConfig(
        alphabet='abcdefghijklmnopqrstuvwxyz',
        password_batch=1,
        embedding_window_size=1)
    trainer = pe.EmbeddingTrainer(config)
    with self.test_session() as session:
        counts, samples = trainer.initial_counts(test_dataset, session)
    # a b c d e f g h i j k l m n o p q r s t u v
    expect = [
        1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 0, 0, 0,
        # w x y z
        1, 0, 0, 0
    ]
    self.assertAllEqual(expect, counts)
    self.assertEqual(7, samples)
def main(args):
    """Train a character embedding from a password file and save it.

    Args:
        args: parsed command-line namespace with attributes input_file,
            output_file, help_config, config, and tensorboard_logdir.

    Side effects: configures logging, reads the input password file, and
    writes the trained embedding to args.output_file.
    """
    FORMAT = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    import pass_embedding as pe
    import pass_dataset as pd
    # Handle --help-config before validating other arguments: asking for
    # configuration help should not require --input-file/--output-file.
    if args.help_config:
        sys.stdout.write(pe.EmbeddingConfig.__init__.__doc__ + "\n")
        return
    if not args.input_file or not args.output_file:
        sys.stderr.write('--input-file and --output-file are required\n')
        return
    logging.info('Called with %s', vars(args))
    if args.config is not None:
        logging.info('Reading configuration from %s', args.config)
        try:
            config = pe.EmbeddingConfig.from_config_file(args.config)
        except (pe.ConfigurationException, ValueError) as e:
            # Both exception types mean the configuration file is bad;
            # log and re-raise so the process exits with a traceback.
            logging.fatal('Error while reading configuration: %s', str(e))
            raise
    else:
        config = pe.EmbeddingConfig()
    dataset = pd.PasswordDatasetMakerFromFile([args.input_file]).make()
    with tf.Graph().as_default():
        with tf.Session() as session:
            trainer = pe.EmbeddingTrainer(
                config, tensorboard_logdir=args.tensorboard_logdir)
            trainer.train_and_save(dataset, session, args.output_file)