class TrainIdSplitTest(unittest.TestCase):
    def setUp(self):
        self.input = os.path.join(os.path.dirname(__file__), "identifiers.csv.tar.gz")

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_id_split_train(self):
        try:
            with tempfile.TemporaryDirectory() as tmpdir:
                args = argparse.Namespace(
                    input=self.input, output=tmpdir, model="CNN", devices="-1",
                    test_ratio=0.2, padding="post", optimizer="Adam", batch_size=2,
                    val_batch_size=2, length=10, dim_reduction=2, epochs=1,
                    samples_before_report=10, lr=0.001, final_lr=0.00001, seed=1989,
                    csv_identifier=3, csv_identifier_split=4, stack=2,
                    include_csv_header=True, filters="64,32,16,8",
                    kernel_sizes="2,4,8,16")
                train_id_split(args)
        except Exception as e:
            self.fail("CNN training raised %s with log: %s" % (type(e), str(e)))

class MetricsTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_register_metric(self):
        from sourced.ml.algorithms.id_splitter.nn_model import register_metric, METRICS
        fake_metric = "fake metric"
        register_metric(fake_metric)
        self.assertIn(fake_metric, METRICS)
        METRICS.pop()
        self.assertNotIn(fake_metric, METRICS)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_raise_register_metric(self):
        from sourced.ml.algorithms.id_splitter.nn_model import register_metric, METRICS
        bad_metric = 1
        with self.assertRaises(AssertionError):
            register_metric(bad_metric)
        self.assertNotIn(bad_metric, METRICS)
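
# A minimal usage sketch for register_metric, inferred from the tests above:
# it appends its argument to METRICS, accepting strings and (presumably)
# callables and raising AssertionError for anything else. Assuming it also
# returns its argument, it can be used as a decorator for custom Keras metrics:
#
#     @register_metric
#     def precision(y_true, y_pred):
#         ...  # custom metric body; precision is now listed in METRICS
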
class ModelsTests(unittest.TestCase):
    def setUp(self):
        from sourced.ml.algorithms.id_splitter.nn_model import build_rnn, build_cnn
        self.n_uniq = len(string.ascii_lowercase)
        self.model_rnn = build_rnn(maxlen=5, units=24, stack=2, rnn_layer="LSTM",
                                   optimizer="Adam", dev0="/cpu:0", dev1="/cpu:0")
        self.model_cnn = build_cnn(maxlen=5, filters=[64, 32, 16, 8], output_n_filters=32,
                                   stack=2, kernel_sizes=[2, 4, 8, 16], optimizer="Adam",
                                   device="/cpu:0")

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_rnn(self):
        self.assertTrue(self.model_rnn.built)
        self.assertTrue(self.model_rnn.trainable)
        self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_rnn.get_weights()[0].shape,
                         (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_rnn.uses_learning_phase)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_cnn(self):
        self.assertTrue(self.model_cnn.built)
        self.assertTrue(self.model_cnn.trainable)
        self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_cnn.get_weights()[0].shape,
                         (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_cnn.uses_learning_phase)
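
# Note on the shape assertions above: the first weight matrix being square with
# side n_uniq + 1 is consistent with an input embedding over the 26 lowercase
# letters plus one padding index. A sketch of such a one-hot lookup table (an
# assumption about the models' internals, not verified against build_rnn/build_cnn):
#
#     embedding_matrix = numpy.eye(len(string.ascii_lowercase) + 1)
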
class TensorFlowModelTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_serialize(self):
        import tensorflow as tf
        a = tf.constant([[1, 0], [0, 1]])
        b = tf.constant([[0, 1], [1, 0]])
        c = tf.matmul(a, b)
        gd = tf.get_default_graph().as_graph_def()
        buffer = io.BytesIO()
        TensorFlowModel().construct(graphdef=gd).save(buffer)
        buffer.seek(0)
        model = TensorFlowModel().load(buffer)
        self.assertEqual(gd.node, model.graphdef.node)
        buffer = io.BytesIO()
        with tf.Session() as session:
            TensorFlowModel().construct(session=session, outputs=[c.name[:-2]]).save(buffer)
        buffer.seek(0)
        model = TensorFlowModel().load(buffer)
        self.assertEqual(gd.node, model.graphdef.node)
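
# A hedged sketch of what TensorFlowModel().construct(session=..., outputs=...)
# plausibly does with a live session under TF1: freeze the session's variables
# into constants so the stored GraphDef is self-contained. freeze() and
# output_names are illustrative names, not the library's API:
#
#     import tensorflow as tf
#
#     def freeze(session, output_names):
#         return tf.graph_util.convert_variables_to_constants(
#             session, session.graph.as_graph_def(), output_names)
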
class IdEmbeddingTests(unittest.TestCase):
    def test_preprocess_bad_params(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_preprocess_params(tmpdir, VOCAB)
            args.shard_size = VOCAB + 1
            self.assertRaises(ValueError, lambda: id2vec_preprocess(args))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_preprocess(self):
        import tensorflow as tf
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_preprocess_params(tmpdir, VOCAB)
            with captured_output() as (out, err, log):
                id2vec_preprocess(args)
            self.assertFalse(out.getvalue())
            self.assertFalse(err.getvalue())
            self.assertEqual(sorted(os.listdir(tmpdir)), [
                "col_sums.txt", "col_vocab.txt", "row_sums.txt", "row_vocab.txt",
                "shard-000-000.pb"])
            df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
            self.assertEqual(len(df), VOCAB)
            with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
                col_sums = fin.read()
            with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
                row_sums = fin.read()
            self.assertEqual(col_sums, row_sums)
            with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
                col_vocab = fin.read()
            with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
                row_vocab = fin.read()
            self.assertEqual(col_vocab, row_vocab)
            self.assertEqual(row_vocab.split("\n"), df.tokens())
            for word in row_vocab.split("\n"):
                self.assertGreater(df[word], 0)
            with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
                features = tf.parse_single_example(
                    fin.read(),
                    features={
                        "global_row": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                        "global_col": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                        "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                        "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                        "sparse_value": tf.VarLenFeature(dtype=tf.float32)})
            with tf.Session() as session:
                global_row, global_col, local_row, local_col, value = session.run(
                    [features[n] for n in ("global_row", "global_col", "sparse_local_row",
                                           "sparse_local_col", "sparse_value")])
            self.assertEqual(set(range(VOCAB)), set(global_row))
            self.assertEqual(set(range(VOCAB)), set(global_col))
            nnz = 16001
            self.assertEqual(value.values.shape, (nnz,))
            self.assertEqual(local_row.values.shape, (nnz,))
            self.assertEqual(local_col.values.shape, (nnz,))
            numpy.random.seed(0)
            all_tokens = row_vocab.split("\n")
            chosen_indices = numpy.random.choice(list(range(VOCAB)), 128, replace=False)
            chosen = [all_tokens[i] for i in chosen_indices]
            freqs = numpy.zeros((len(chosen),) * 2, dtype=int)
            index = {w: i for i, w in enumerate(chosen)}
            chosen = set(chosen)
            with asdf.open(args.input) as model:
                matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
                tokens = split_strings(model.tree["tokens"])
                interesting = {i for i, t in enumerate(tokens) if t in chosen}
                for y in interesting:
                    row = matrix[y]
                    yi = index[tokens[y]]
                    for x, v in zip(row.indices, row.data):
                        if x in interesting:
                            freqs[yi, index[tokens[x]]] += v
            matrix = coo_matrix(
                (value.values,
                 ([global_row[row] for row in local_row.values],
                  [global_col[col] for col in local_col.values])),
                shape=(VOCAB, VOCAB))
            matrix = matrix.tocsr()[chosen_indices][:, chosen_indices].todense().astype(int)
            self.assertTrue((matrix == freqs).all())

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_swivel_bad_params_submatrix_cols(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_swivel_args(tmpdir)
            args.submatrix_cols += 1
            self.assertRaises(ValueError, lambda: run_swivel(args))
            args.submatrix_cols -= 1
            args.submatrix_rows += 1
            self.assertRaises(ValueError, lambda: run_swivel(args))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_swivel(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_swivel_args(tmpdir)
            run_swivel(args)
            check_swivel_results(self, tmpdir)

    def test_postprocess(self):
        buffer = BytesIO()
        args = argparse.Namespace(
            swivel_data=os.path.join(os.path.dirname(__file__), "postproc"),
            output=buffer)
        prepare_postproc_files(args.swivel_data)
        id2vec_postprocess(args)
        buffer.seek(0)
        check_postproc_results(self, buffer)
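
# The tests above exercise the id2vec pipeline end to end, in order:
#   1. id2vec_preprocess: co-occurrence matrix -> Swivel shards plus vocab/sum files;
#   2. run_swivel: shards -> row/column embeddings;
#   3. id2vec_postprocess: Swivel output -> the final id2vec model.
# A sketch of chaining them manually, reusing this module's helpers (the exact
# argument wiring between stages is an assumption, not taken from the CLI):
#
#     with tempfile.TemporaryDirectory() as tmpdir:
#         id2vec_preprocess(default_preprocess_params(tmpdir, VOCAB))
#         run_swivel(default_swivel_args(tmpdir))
#         id2vec_postprocess(argparse.Namespace(swivel_data=tmpdir, output="id2vec.asdf"))
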
class IdSplitterTest(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_features(self):
        from sourced.ml.algorithms.id_splitter.features import prepare_features
        # check feature extraction
        text = "a a"
        n_lines = 10
        max_identifier_len = 20
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=n_lines, char_sizes=1, n_cols=2,
                                       text=text)
            feat = prepare_features(csv_path=tmp.name, use_header=True, identifier_col=0,
                                    max_identifier_len=max_identifier_len,
                                    split_identifier_col=1, shuffle=True, test_ratio=0.5,
                                    padding="post")
            x_train, x_test, y_train, y_test = feat
            # test_ratio=0.5, so the train and test shapes should be equal
            self.assertEqual(x_test.shape, x_train.shape)
            self.assertEqual(y_test.shape, y_train.shape)
            # each line contains exactly one split, so train and test should each
            # contain 5 nonzero labels
            self.assertEqual(numpy.sum(y_test), 5)
            self.assertEqual(numpy.sum(y_train), 5)
            # each line contains exactly two characters, so train and test should each
            # contain 10 nonzero features
            self.assertEqual(numpy.count_nonzero(x_test), 10)
            self.assertEqual(numpy.count_nonzero(x_train), 10)
            # y should be a 3-dimensional array
            self.assertEqual(y_test.ndim, 3)
            self.assertEqual(y_train.ndim, 3)
            # x should be a 2-dimensional array
            self.assertEqual(x_test.ndim, 2)
            self.assertEqual(x_train.ndim, 2)
            # check the number of samples
            self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines)
            self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines)
            # check max_identifier_len
            self.assertEqual(x_test.shape[1], max_identifier_len)
            self.assertEqual(x_train.shape[1], max_identifier_len)
            self.assertEqual(y_test.shape[1], max_identifier_len)
            self.assertEqual(y_train.shape[1], max_identifier_len)
        # normal file
        try:
            prepare_features(csv_path=IDENTIFIERS, use_header=True, identifier_col=0,
                             max_identifier_len=max_identifier_len,
                             split_identifier_col=1, shuffle=True, test_ratio=0.5,
                             padding="post")
        except Exception as e:
            self.fail("prepare_features raised %s with log %s" % (type(e), str(e)))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_read_identifiers(self):
        from sourced.ml.algorithms.id_splitter.features import read_identifiers
        # read with header
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5)
            res = read_identifiers(csv_path=tmp.name, use_header=True,
                                   max_identifier_len=10, identifier_col=3,
                                   split_identifier_col=4)
            self.assertEqual(len(res), 10)
        # read without header
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5)
            res = read_identifiers(csv_path=tmp.name, use_header=False,
                                   max_identifier_len=10, identifier_col=3,
                                   split_identifier_col=4)
            self.assertEqual(len(res), 9)
        # read with max_identifier_len equal to 0 -> expect an empty list
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5)
            res = read_identifiers(csv_path=tmp.name, use_header=True,
                                   max_identifier_len=0, identifier_col=3,
                                   split_identifier_col=4)
            self.assertEqual(len(res), 0)
        # generate a temporary file with identifiers of specific lengths and filter
        # them by length
        char_sizes = list(range(1, 11))
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes,
                                       n_cols=5)
            # check the filtering; read the last two columns as identifiers
            for i in range(11):
                res = read_identifiers(csv_path=tmp.name, use_header=True,
                                       max_identifier_len=i, identifier_col=3,
                                       split_identifier_col=4)
                self.assertEqual(len(res), i)
        # read wrong columns
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes,
                                       n_cols=2)
            with self.assertRaises(IndexError):
                read_identifiers(csv_path=tmp.name, use_header=True,
                                 max_identifier_len=10, identifier_col=3,
                                 split_identifier_col=4)
        # normal file
        try:
            read_identifiers(csv_path=IDENTIFIERS, use_header=True, max_identifier_len=10,
                             identifier_col=3, split_identifier_col=4)
        except Exception as e:
            self.fail("read_identifiers raised %s with log %s" % (type(e), str(e)))
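
# For reference, the fixture rows these tests rely on look roughly like this
# (a hedged sketch of write_fake_identifiers output, not its actual code): the
# identifier column holds the raw token and the split column holds the same
# token with spaces at the split points, e.g.
#
#     identifier,identifier_split
#     aa,a a          <- two characters, one split point
#
# which is why test_prepare_features expects 10 nonzero characters and 5
# positive labels in each half of the 10-line, test_ratio=0.5 split.
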
class IdSplitterPipelineTest(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_binarize(self):
        from sourced.ml.algorithms.id_splitter.pipeline import binarize
        thresholds = [0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99]
        n_pos = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
        for th, n_p in zip(thresholds, n_pos):
            vals = numpy.arange(10) / 10
            res = binarize(vals, th)
            self.assertEqual(sum(binarize(vals, th)), n_p)
            if th in (0, 0.99):
                self.assertEqual(numpy.unique(res).shape[0], 1)
            else:
                self.assertEqual(numpy.unique(res).shape[0], 2)
        vals = numpy.arange(10) / 10
        old_vals = vals.copy()
        for th, n_p in zip(thresholds, n_pos):
            res = binarize(vals, th, inplace=False)
            self.assertEqual(sum(res), n_p)
            self.assertTrue(numpy.array_equal(old_vals, vals))
            if th in (0, 0.99):
                self.assertEqual(numpy.unique(res).shape[0], 1)
            else:
                self.assertEqual(numpy.unique(res).shape[0], 2)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_devices(self):
        from sourced.ml.algorithms.id_splitter.nn_model import prepare_devices
        correct_args = ["1", "0,1", "-1"]
        resulted_dev = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"), ("/cpu:0", "/cpu:0")]
        for res, arg in zip(resulted_dev, correct_args):
            self.assertEqual(res, prepare_devices(arg))
        bad_args = ["", "1,2,3"]
        for arg in bad_args:
            with self.assertRaises(ValueError):
                prepare_devices(arg)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_schedule(self):
        from sourced.ml.algorithms.id_splitter.pipeline import build_schedule
        start_lr = 10
        end_lr = 1
        n_epochs = 9
        lr_schedule = build_schedule(lr=start_lr, final_lr=end_lr, n_epochs=n_epochs)
        for i in range(n_epochs):
            self.assertEqual(start_lr - i, lr_schedule(epoch=i))
        with self.assertRaises(AssertionError):
            lr_schedule(-1)
        with self.assertRaises(AssertionError):
            lr_schedule(n_epochs + 1)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_train_generator(self):
        from sourced.ml.algorithms.id_splitter.pipeline import build_train_generator
        batch_size = 3
        # mismatched number of samples
        bad_x = numpy.zeros(3)
        bad_y = numpy.zeros(4)
        with self.assertRaises(AssertionError):
            build_train_generator(bad_x, bad_y, batch_size=batch_size)
        # check the generator with correct inputs
        x = numpy.zeros(5)
        gen = build_train_generator(x, x, batch_size=batch_size)
        expected_n_samples = [3, 2]
        for n_samples in expected_n_samples:
            x_gen, y_gen = next(gen)
            self.assertEqual(x_gen.shape, y_gen.shape)
            self.assertEqual(n_samples, x_gen.shape[0])

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_train_parameters(self):
        from sourced.ml.algorithms.id_splitter.pipeline import create_generator_params
        batch_size = 500
        samples_per_epoch = 10 ** 6
        n_samples = 40 * 10 ** 6
        epochs = 10
        steps_per_epoch_ = samples_per_epoch // batch_size
        n_epochs_ = numpy.ceil(epochs * n_samples / samples_per_epoch)
        steps_per_epoch, n_epochs = create_generator_params(batch_size, samples_per_epoch,
                                                            n_samples, epochs)
        self.assertEqual(steps_per_epoch, steps_per_epoch_)
        self.assertEqual(n_epochs, n_epochs_)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_config_keras(self):
        from keras.backend.tensorflow_backend import get_session
        from sourced.ml.algorithms.id_splitter.pipeline import config_keras
        config_keras()
        sess = get_session()
        self.assertTrue(sess._config.gpu_options.allow_growth)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_callbacks(self):
        from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint
        from sourced.ml.algorithms.id_splitter.pipeline import prepare_callbacks
        with tempfile.TemporaryDirectory() as tmpdir:
            callbacks = prepare_callbacks(tmpdir)
            # TensorBoard
            self.assertIsInstance(callbacks[0], TensorBoard)
            self.assertTrue(callbacks[0].log_dir.startswith(tmpdir))
            # CSVLogger
            self.assertIsInstance(callbacks[1], CSVLogger)
            self.assertTrue(callbacks[1].filename.startswith(tmpdir))
            # ModelCheckpoint
            self.assertIsInstance(callbacks[2], ModelCheckpoint)
            self.assertTrue(callbacks[2].filepath.startswith(tmpdir))
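
# The expectations in test_build_schedule imply a linear decay from lr to
# final_lr over n_epochs, with bounds checking via assertions. A minimal sketch
# consistent with those expectations (not necessarily the actual implementation
# of build_schedule):
#
#     def build_schedule(lr, final_lr, n_epochs):
#         def schedule(epoch):
#             assert 0 <= epoch <= n_epochs
#             return lr - epoch * (lr - final_lr) / n_epochs
#         return schedule
#
# With lr=10, final_lr=1, n_epochs=9 this yields schedule(i) == 10 - i, matching
# the per-epoch values the test asserts.
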