class ModelsTests(unittest.TestCase):
    """Smoke tests for the RNN and CNN builders of the identifier splitter."""

    def setUp(self):
        """Build a small RNN and a small CNN, both placed on the CPU."""
        from sourced.ml.core.algorithms.id_splitter.nn_model import build_rnn, build_cnn

        cpu = "/cpu:0"
        self.n_uniq = len(string.ascii_lowercase)
        self.model_rnn = build_rnn(
            maxlen=5, units=24, stack=2, rnn_layer="LSTM", optimizer="Adam",
            dev0=cpu, dev1=cpu)
        self.model_cnn = build_cnn(
            maxlen=5, filters=[64, 32, 16, 8], output_n_filters=32, stack=2,
            kernel_sizes=[2, 4, 8, 16], optimizer="Adam", device=cpu)

    def _check_built_model(self, model):
        """Run the assertions shared by the RNN and the CNN test on ``model``."""
        side = self.n_uniq + 1
        self.assertTrue(model.built)
        self.assertTrue(model.trainable)
        # The first weight array must be a square numpy array with one row
        # and one column more than there are lowercase ASCII letters.
        self.assertIsInstance(model.get_weights()[0], numpy.ndarray)
        self.assertEqual(model.get_weights()[0].shape, (side, side))
        self.assertTrue(model.uses_learning_phase)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_rnn(self):
        """The RNN from ``setUp`` is built, trainable and has the expected weights."""
        self._check_built_model(self.model_rnn)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_cnn(self):
        """The CNN from ``setUp`` is built, trainable and has the expected weights."""
        self._check_built_model(self.model_cnn)
class NNTokenParserTests(unittest.TestCase):
    """Tests for ``TokenParser`` when it is created with ``use_nn=True``."""

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def setUp(self):
        """Create the parser under test with ``_single_shot`` switched off."""
        self.tp = TokenParser(stem_threshold=4, max_token_length=20, attach_upper=False,
                              use_nn=True)
        self.tp._single_shot = False

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_process_token(self):
        """Check ``process_token`` against a table of (identifier, expected parts).

        Every pair runs in its own ``subTest`` so that a mismatch names the
        offending identifier and the remaining pairs are still checked.
        """
        # Overrides the limit of 20 that setUp passed to the constructor.
        self.tp.max_token_length = 100

        tokens = [
            ("ONLYCAPS", ["only", "caps"]),
            ("nocaps", ["no", "caps"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sql", "thing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foobar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalmsbug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourc", "d", "ml", "algorithm", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", "can", "imagin"]),
            # Another bad example. Parser failed to parse it correctly
            ("SmallIdsToFoOo", ["small", "ids", "to", "fooo"]),
            ("SmallIdFooo", ["small", "id", "foo", "o"]),
            ("ONE_M0re_.__badId.example", ["one", "m", "re", "badid", "exampl"]),
            ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            with self.subTest(token=token):
                res = list(self.tp.process_token(token))
                self.assertEqual(res, correct)
class MetricsTests(unittest.TestCase):
    """Tests for ``register_metric`` and the module-level ``METRICS`` collection."""

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_register_metric(self):
        """A string metric shows up in ``METRICS`` and is gone again after ``pop()``."""
        from sourced.ml.core.algorithms.id_splitter.nn_model import METRICS, register_metric

        name = "fake metric"
        register_metric(name)
        self.assertIn(name, METRICS)

        # Take the entry out again and verify that it really left the collection.
        METRICS.pop()
        self.assertNotIn(name, METRICS)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_raise_register_metric(self):
        """An integer is rejected with ``AssertionError`` and is not registered."""
        from sourced.ml.core.algorithms.id_splitter.nn_model import METRICS, register_metric

        not_a_metric = 1
        with self.assertRaises(AssertionError):
            register_metric(not_a_metric)
        self.assertNotIn(not_a_metric, METRICS)
class TensorFlowModelTests(unittest.TestCase):
    """Save/load round-trip checks for ``TensorFlowModel``."""

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_serialize(self):
        """A model built from a graph def or from a session keeps its graph nodes."""
        import tensorflow as tf

        series = "tensorflow-model"

        # A tiny graph: the product of two constant 2x2 matrices.
        left = tf.constant([[1, 0], [0, 1]])
        right = tf.constant([[0, 1], [1, 0]])
        product = tf.matmul(left, right)
        graph_def = tf.get_default_graph().as_graph_def()

        # Round trip 1: construct the model directly from the graph definition.
        stream = io.BytesIO()
        from_graphdef = TensorFlowModel().construct(graphdef=graph_def)
        from_graphdef.save(stream, series=series)
        stream.seek(0)
        restored = TensorFlowModel().load(stream)
        self.assertEqual(graph_def.node, restored.graphdef.node)

        # Round trip 2: construct the model from a live session; the output is
        # referenced by the tensor name without its last two characters.
        stream = io.BytesIO()
        with tf.Session() as session:
            output_name = product.name[:-2]
            from_session = TensorFlowModel().construct(session=session, outputs=[output_name])
            from_session.save(stream, series=series)
        stream.seek(0)
        restored = TensorFlowModel().load(stream)
        self.assertEqual(graph_def.node, restored.graphdef.node)
class IdSplitterTest(unittest.TestCase):
    """Tests for ``prepare_features`` and ``read_identifiers`` of the id splitter."""

    @staticmethod
    def _write_fake_archive(tmp, **kwargs):
        """Write a tar archive of fake identifiers into the open file ``tmp``.

        :param tmp: Writable binary file object, e.g. a ``NamedTemporaryFile``.
        :param kwargs: Forwarded unchanged to ``write_fake_identifiers``.
        """
        with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
            write_fake_identifiers(tmp_tar, **kwargs)
        # The tests read the archive back through ``tmp.name``, i.e. not through
        # this handle, so everything still buffered here must reach the disk first.
        tmp.flush()

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_features(self):
        """Check shapes and contents of the train/test split built from fake data."""
        from sourced.ml.core.algorithms.id_splitter.features import prepare_features

        # check feature extraction
        text = "a a"
        n_lines = 10
        max_identifier_len = 20
        with tempfile.NamedTemporaryFile() as tmp:
            self._write_fake_archive(tmp, n_lines=n_lines, char_sizes=1, n_cols=2, text=text)
            feat = prepare_features(csv_path=tmp.name, use_header=True, identifier_col=0,
                                    max_identifier_len=max_identifier_len,
                                    split_identifier_col=1, shuffle=True, test_ratio=0.5,
                                    padding="post")
            x_train, x_test, y_train, y_test = feat

            # because of test_ratio=0.5 - shapes should be equal
            self.assertEqual(x_test.shape, x_train.shape)
            self.assertEqual(y_test.shape, y_train.shape)

            # each line contains only one split -> so it should be only 5 nonzero for train/test
            self.assertEqual(numpy.sum(y_test), 5)
            self.assertEqual(numpy.sum(y_train), 5)

            # each line contains only two chars -> so it should be only 10 nonzero for train/test
            self.assertEqual(numpy.count_nonzero(x_test), 10)
            self.assertEqual(numpy.count_nonzero(x_train), 10)

            # y should be 3 dimensional matrix
            self.assertEqual(y_test.ndim, 3)
            self.assertEqual(y_train.ndim, 3)

            # x should be 2 dimensional matrix
            self.assertEqual(x_test.ndim, 2)
            self.assertEqual(x_train.ndim, 2)

            # check number of samples
            self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines)
            self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines)

            # check max_identifier_len
            self.assertEqual(x_test.shape[1], max_identifier_len)
            self.assertEqual(x_train.shape[1], max_identifier_len)
            self.assertEqual(y_test.shape[1], max_identifier_len)
            self.assertEqual(y_train.shape[1], max_identifier_len)

        # normal file: any exception is reported as a test failure
        try:
            prepare_features(csv_path=IDENTIFIERS, use_header=True, identifier_col=0,
                             max_identifier_len=max_identifier_len, split_identifier_col=1,
                             shuffle=True, test_ratio=0.5, padding="post")
        except Exception as e:
            self.fail("prepare_features raised %s with log %s" % (type(e), str(e)))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_read_identifiers(self):
        """Check header handling, length filtering and bad columns of ``read_identifiers``."""
        from sourced.ml.core.algorithms.id_splitter.features import read_identifiers

        # read with header
        with tempfile.NamedTemporaryFile() as tmp:
            self._write_fake_archive(tmp, n_lines=10, char_sizes=1, n_cols=5)
            res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10,
                                   identifier_col=3, split_identifier_col=4)
            self.assertEqual(len(res), 10)

        # read without header
        with tempfile.NamedTemporaryFile() as tmp:
            self._write_fake_archive(tmp, n_lines=10, char_sizes=1, n_cols=5)
            res = read_identifiers(csv_path=tmp.name, use_header=False, max_identifier_len=10,
                                   identifier_col=3, split_identifier_col=4)
            self.assertEqual(len(res), 9)

        # read with max_identifier_len equal to 0 -> expect empty list
        with tempfile.NamedTemporaryFile() as tmp:
            self._write_fake_archive(tmp, n_lines=10, char_sizes=1, n_cols=5)
            res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=0,
                                   identifier_col=3, split_identifier_col=4)
            self.assertEqual(len(res), 0)

        # generate temporary file with identifiers of specific lengths and filter by length
        char_sizes = list(range(1, 11))
        with tempfile.NamedTemporaryFile() as tmp:
            self._write_fake_archive(tmp, n_lines=10, char_sizes=char_sizes, n_cols=5)
            # check filtering
            # read last two columns as identifiers
            for i in range(11):
                # One sub-test per limit so a failure names the limit that broke.
                with self.subTest(max_identifier_len=i):
                    res = read_identifiers(csv_path=tmp.name, use_header=True,
                                           max_identifier_len=i, identifier_col=3,
                                           split_identifier_col=4)
                    self.assertEqual(len(res), i)

        # read wrong columns
        with tempfile.NamedTemporaryFile() as tmp:
            self._write_fake_archive(tmp, n_lines=10, char_sizes=char_sizes, n_cols=2)
            with self.assertRaises(IndexError):
                read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10,
                                 identifier_col=3, split_identifier_col=4)

        # normal file: any exception is reported as a test failure
        try:
            read_identifiers(csv_path=IDENTIFIERS, use_header=True, max_identifier_len=10,
                             identifier_col=3, split_identifier_col=4)
        except Exception as e:
            self.fail("read_identifiers raised %s with log %s" % (type(e), str(e)))
# NOTE(review): the next three assertions are the tail of a test method whose
# ``def`` line (and enclosing class header) is not visible in this chunk; they
# are kept exactly as found.
        self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_rnn.get_weights()[0].shape, (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_rnn.uses_learning_phase)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_cnn(self):
        """Check that ``self.model_cnn`` is built, trainable and has the expected weights."""
        self.assertTrue(self.model_cnn.built)
        self.assertTrue(self.model_cnn.trainable)
        # The first weight array must be a square numpy array of side n_uniq + 1.
        self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_cnn.get_weights()[0].shape, (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_cnn.uses_learning_phase)


@unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
class NNModelTest(unittest.TestCase):
    """Test that loads ``IdentifierSplitterBiLSTM`` and splits a few identifiers."""

    def setUp(self):
        """Prepare the input identifiers, the expected parts and the loaded splitter."""
        from sourced.ml.core.models.id_splitter import IdentifierSplitterBiLSTM
        # Identifiers handed to the splitter in one call.
        self.test_X = [ "networkSocket", "variablename", "loadfile", "blahblah", "foobar" ]
        # Expected result: one flat list with the parts of all identifiers in order.
        self.test_y = [ "network", "socket", "variable", "name", "load", "file", "blah", "blah", "foobar" ]
        self.id_splitter = IdentifierSplitterBiLSTM()
        self.id_splitter.load(ID_SPLITTER_BILSTM)

    def test_load_and_run_model(self):
        """``split`` on the loaded model must return exactly ``self.test_y``."""
        self.assertEqual(self.id_splitter.split(self.test_X), self.test_y)
class IdSplitterPipelineTest(unittest.TestCase):
    """Tests for the helper functions of the id splitter training pipeline."""

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_binarize(self):
        """``binarize`` yields the expected number of positives for each threshold."""
        from sourced.ml.core.algorithms.id_splitter.pipeline import binarize

        cutoffs = [0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99]
        positives = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
        # At these thresholds the result is expected to hold one distinct value only.
        single_valued = (0, 0.99)

        # Default call: a fresh input array is created for every threshold.
        for cutoff, expected in zip(cutoffs, positives):
            values = numpy.arange(10) / 10
            result = binarize(values, cutoff)
            self.assertEqual(sum(binarize(values, cutoff)), expected)
            n_distinct = 1 if cutoff in single_valued else 2
            self.assertEqual(numpy.unique(result).shape[0], n_distinct)

        # inplace=False: one shared input array that must stay untouched.
        values = numpy.arange(10) / 10
        snapshot = values.copy()
        for cutoff, expected in zip(cutoffs, positives):
            result = binarize(values, cutoff, inplace=False)
            self.assertEqual(sum(result), expected)
            self.assertTrue(numpy.array_equal(snapshot, values))
            n_distinct = 1 if cutoff in single_valued else 2
            self.assertEqual(numpy.unique(result).shape[0], n_distinct)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_devices(self):
        """Valid device strings map to device pairs; invalid ones raise ``ValueError``."""
        from sourced.ml.core.algorithms.id_splitter.nn_model import prepare_devices

        valid_args = ["1", "0,1", "-1"]
        expected_pairs = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"), ("/cpu:0", "/cpu:0")]
        for pair, device_arg in zip(expected_pairs, valid_args):
            self.assertEqual(pair, prepare_devices(device_arg))

        for device_arg in ["", "1,2,3"]:
            with self.assertRaises(ValueError):
                prepare_devices(device_arg)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_schedule(self):
        """The schedule drops by one per epoch and asserts on out-of-range epochs."""
        from sourced.ml.core.algorithms.id_splitter.pipeline import build_schedule

        first_lr, last_lr, total_epochs = 10, 1, 9
        lr_schedule = build_schedule(lr=first_lr, final_lr=last_lr, n_epochs=total_epochs)

        for epoch in range(total_epochs):
            self.assertEqual(first_lr - epoch, lr_schedule(epoch=epoch))

        with self.assertRaises(AssertionError):
            lr_schedule(-1)
        with self.assertRaises(AssertionError):
            lr_schedule(total_epochs + 1)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_train_generator(self):
        """Mismatched inputs are rejected; five samples come out as batches of 3 and 2."""
        from sourced.ml.core.algorithms.id_splitter.pipeline import build_train_generator

        batch_size = 3

        # mismatch number of samples
        three_samples = numpy.zeros(3)
        four_samples = numpy.zeros(4)
        with self.assertRaises(AssertionError):
            build_train_generator(three_samples, four_samples, batch_size=batch_size)

        # check generator with correct inputs
        data = numpy.zeros(5)
        batches = build_train_generator(data, data, batch_size=batch_size)
        for expected_size in (3, 2):
            x_batch, y_batch = next(batches)
            self.assertEqual(x_batch.shape, y_batch.shape)
            self.assertEqual(expected_size, x_batch.shape[0])

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_train_parameters(self):
        """``create_generator_params`` matches the values computed by hand here."""
        from sourced.ml.core.algorithms.id_splitter.pipeline import create_generator_params

        batch_size = 500
        samples_per_epoch = 10**6
        n_samples = 40 * 10**6
        epochs = 10

        # Reference values computed independently of the function under test.
        expected_steps = samples_per_epoch // batch_size
        expected_epochs = numpy.ceil(epochs * n_samples / samples_per_epoch)

        steps_per_epoch, n_epochs = create_generator_params(
            batch_size, samples_per_epoch, n_samples, epochs)
        self.assertEqual(steps_per_epoch, expected_steps)
        self.assertEqual(n_epochs, expected_epochs)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_config_keras(self):
        """After ``config_keras`` the Keras session has GPU ``allow_growth`` enabled."""
        from keras.backend.tensorflow_backend import get_session
        from sourced.ml.core.algorithms.id_splitter.pipeline import config_keras

        config_keras()
        session = get_session()
        self.assertTrue(session._config.gpu_options.allow_growth)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_callbacks(self):
        """The first three callbacks have the expected types and write below ``tmpdir``."""
        from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint
        from sourced.ml.core.algorithms.id_splitter.pipeline import prepare_callbacks

        # Expected callback type per position, with the attribute holding its output path.
        expectations = (
            (TensorBoard, "log_dir"),
            (CSVLogger, "filename"),
            (ModelCheckpoint, "filepath"),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            callbacks = prepare_callbacks(tmpdir)
            for position, (callback_type, path_attr) in enumerate(expectations):
                self.assertIsInstance(callbacks[position], callback_type)
                self.assertTrue(getattr(callbacks[position], path_attr).startswith(tmpdir))