def test_get_encoding(self):
    file_raw = os.path.join(self.tmpdir, "encoding_raw.bin")
    file_op = os.path.join(self.tmpdir, "encoding_op.bin")
    with BinaryDs(file_raw, encoded=True) as dataset_raw:
        self.assertTrue(dataset_raw.is_encoded())
    with BinaryDs(file_op, encoded=False) as dataset_op:
        self.assertFalse(dataset_op.is_encoded())
def evaluate_confusion(bs: int, file: str, fixed: int, model_path: str,
                       test_bin) -> None:
    """
    Evaluates the confusion matrix for a given number of features
    :param bs: batch size
    :param file: file where the confusion matrix will be written
    :param fixed: number of features to be considered
    :param model_path: string pointing to the .h5 keras model of the
    network. If empty, it will default to data_dir/model.h5
    :param test_bin: path to the test dataset that will be used
    """
    test = BinaryDs(test_bin, read_only=True).open()
    # with two categories or fewer the model is a binary classifier
    binary = test.get_categories() <= 2
    model = load_model(model_path)
    generator = DataGenerator(test, bs, fake_pad=True, pad_len=fixed,
                              predict=True)
    expected = get_expected(bs, test)
    predicted = model.predict(generator, verbose=1)
    if binary:
        # single sigmoid output: round to the nearest class label
        predicted = np.round(predicted).flatten().astype(np.int8)
    else:
        # softmax output: pick the most likely class
        predicted = np.argmax(predicted, axis=1)
    matrix = np.array(tf.math.confusion_matrix(expected, predicted))
    with open(file, "w") as f:
        np.savetxt(f, X=matrix, fmt="%d")
    test.close()
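# Hedged usage sketch for evaluate_confusion. Every path and value below
# is a placeholder (not part of the original code); it assumes a trained
# .h5 model and a test dataset already exist on disk.
def example_evaluate_confusion():
    evaluate_confusion(bs=256,
                       file="confusion_matrix.txt",  # output file
                       fixed=128,  # pad/cut every sample to 128 features
                       model_path="data/model.h5",  # assumed model path
                       test_bin="data/test.bin")  # assumed test dataset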
def test_truncate_but_one(self):
    file = os.path.join(self.tmpdir, "truncate_1.bin")
    with BinaryDs(file, features=14) as dataset:
        dataset.write(self.data_raw2)
        self.assertGreater(dataset.get_examples_no(), 0)
    with BinaryDs(file, features=14) as dataset:
        dataset.truncate(left=1)
        self.assertEqual(dataset.get_examples_no(), 1)
def test_merge_different_features(self):
    file14 = os.path.join(self.tmpdir, "merge_f14.bin")
    file2k = os.path.join(self.tmpdir, "merge_f2048.bin")
    with BinaryDs(file2k) as ds2k:
        with BinaryDs(file14, features=14) as ds14:
            ds14.write(self.data_raw)
            with self.assertRaises(IOError):
                ds2k.merge(ds14)
def test_merge_different_encoding(self):
    file_op = os.path.join(self.tmpdir, "merge_op.bin")
    file_raw = os.path.join(self.tmpdir, "merge_raw.bin")
    with BinaryDs(file_op, encoded=False, features=14) as ds_op:
        ds_op.write(self.data_raw)
        with BinaryDs(file_raw, encoded=True, features=14) as ds_raw:
            ds_raw.write(self.data_raw)
            with self.assertRaises(IOError):
                ds_raw.merge(ds_op)
def test_balance(self):
    file = os.path.join(self.tmpdir, "balance.bin")
    with BinaryDs(file, features=14) as binary:
        binary.write(self.data_raw2)
        binary.balance()
    with BinaryDs(file, features=14) as binary:
        results = binary.read(0, binary.examples)
        expected = self.data_raw2[:5] + [self.data_raw2[6]]
        self.assertEqual(results, expected)
def test_update_categories(self):
    file = os.path.join(self.tmpdir, "categories.bin")
    with BinaryDs(file, features=14) as dataset:
        self.assertEqual(dataset.get_categories(), 0)
        dataset.write(self.data_raw[:1])
    with BinaryDs(file, features=14) as dataset:
        self.assertEqual(dataset.get_categories(), 1)
        dataset.write(self.data_raw[2:3])
    with BinaryDs(file, features=14) as dataset:
        self.assertEqual(dataset.get_categories(), 3)
def test_multi_read_write(self):
    file = os.path.join(self.tmpdir, "rwmulti.bin")
    with BinaryDs(file, features=14) as binary:
        binary.write(self.data_raw)
    with BinaryDs(file, features=14) as binary:
        binary.write(self.data_raw2)
    expected = [self.data_raw[2]] + self.data_raw2[:3]
    with BinaryDs(file, features=14, read_only=True) as dataset:
        read = dataset.read(2, 4)
        self.assertEqual(read, expected)
def test_truncate_all(self):
    file = os.path.join(self.tmpdir, "truncate.bin")
    dataset = BinaryDs(file, features=14).open()
    dataset.close()
    empty_size = os.path.getsize(file)
    with BinaryDs(file, features=14) as dataset:
        dataset.write(self.data_raw2)
    self.assertGreater(os.path.getsize(file), empty_size)
    with BinaryDs(file, features=14) as dataset:
        dataset.truncate()
    self.assertEqual(os.path.getsize(file), empty_size)
def test_shuffle(self):
    seed = 32000
    # assert that the order is the expected one
    random.seed(seed)
    expected_order = [4, 0, 1, 6, 3, 7, 5, 2]
    file = os.path.join(self.tmpdir, "shuffle.bin")
    with BinaryDs(file, features=14) as binary:
        binary.write(self.data_raw2)
        binary.shuffle(seed)
    with BinaryDs(file, features=14) as binary:
        results = binary.read(0, binary.examples)
        for res_idx, exp_idx in enumerate(expected_order):
            self.assertEqual(results[res_idx], self.data_raw2[exp_idx])
def test_write_to_ro(self):
    file = os.path.join(self.tmpdir, "write_ro.bin")
    dataset = BinaryDs(file, features=14).open()
    dataset.close()
    with BinaryDs(file, features=14, read_only=True) as dataset:
        with self.assertRaises(IOError):
            dataset.write(self.data_raw)
def test_deduplicate(self):
    file = os.path.join(self.tmpdir, "deduplicate.bin")
    with BinaryDs(file, features=14) as dataset:
        dataset.write([self.data_raw[0]])
        dataset.write(self.data_raw)
        dataset.write(self.data_raw2)
        dataset.write(self.data_raw)
        dataset.write(self.data_raw2)
        dataset.write(self.data_raw2)
        dataset.write(self.data_raw2)
        dataset.write(self.data_raw2)
        dataset.write(self.data_raw)
        dataset.write(self.data_raw)
        dataset.write(self.data_raw2)
    with BinaryDs(file, features=14) as dataset:
        dataset.deduplicate()
        data = dataset.read(0, 11)
        self.assertEqual(len(data), len(set(data)))
def test_read_write(self):
    file = os.path.join(self.tmpdir, "rw.bin")
    binary = BinaryDs(file, features=14).open()
    binary.write(self.data_raw)
    binary.close()
    with BinaryDs(file, features=14, read_only=True) as dataset:
        read = dataset.read(0, len(self.data_raw))
        self.assertEqual(read, self.data_raw)
def count_categories(dataset: BinaryDs) -> List[int]:
    """Counts the examples belonging to each class, reading the dataset
    in chunks of 1000 examples to keep memory usage bounded."""
    examples = dataset.get_examples_no()
    amount = 1000
    read_total = int(examples / amount)
    remainder = examples % amount
    categories = []
    for i in range(read_total):
        buffer = dataset.read(i * amount, amount)
        for val in buffer:
            # the first element of each example is its class label
            category = val[0]
            while len(categories) <= category:
                categories.append(0)
            categories[category] += 1
    if remainder > 0:
        # count the final partial chunk
        buffer = dataset.read(read_total * amount, remainder)
        for val in buffer:
            category = val[0]
            while len(categories) <= category:
                categories.append(0)
            categories[category] += 1
    assert len(categories) == dataset.get_categories()
    return categories
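# Hedged usage sketch for count_categories; "train.bin" is a placeholder
# path for an existing dataset file.
def example_count_categories():
    ds = BinaryDs("train.bin", read_only=True).open()
    try:
        per_class = count_categories(ds)  # per_class[i] = examples of class i
        print(per_class)
    finally:
        ds.close()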
def evaluate_incremental(bs: int, file: str, model_path: str,
                         test_bin) -> None:
    """
    Evaluates the accuracy incrementally (first with only 1 feature, then 3,
    then 5, and so on)
    :param bs: batch size
    :param file: file where to write the accuracy (.csv)
    :param model_path: string pointing to the .h5 keras model of the
    network. If empty, it will default to data_dir/model.h5
    :param test_bin: path to the test dataset that will be used
    """
    cut = 1
    test = BinaryDs(test_bin, read_only=True).open()
    model = load_model(model_path)
    features = test.get_features()
    with open(file, "w") as f:
        f.write("features,accuracy\n")
    while cut <= features:
        print(f"Evaluating {cut}")
        generator = DataGenerator(test, bs, fake_pad=True, pad_len=cut)
        score = model.evaluate(generator)
        with open(file, "a") as f:
            f.write(f"{cut},{score[1]}\n")
        # evaluate densely for short cuts, then increasingly sparsely
        if cut < 24:
            cut = cut + 2
        elif cut < 80:
            cut = cut + 22
        elif cut < 256:
            cut = cut + 33
        elif cut < 500:
            cut = cut + 61
        elif cut < features:
            cut = min(cut + 129, features)
        else:
            break
    test.close()
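# Hedged helper (not part of the original code): reproduces the sequence
# of pad lengths that evaluate_incremental visits for a given feature
# count, which can be handy for sanity-checking the generated .csv.
def incremental_cuts(features: int) -> List[int]:
    cuts = []
    cut = 1
    while cut <= features:
        cuts.append(cut)
        if cut < 24:
            cut += 2
        elif cut < 80:
            cut += 22
        elif cut < 256:
            cut += 33
        elif cut < 500:
            cut += 61
        elif cut < features:
            cut = min(cut + 129, features)
        else:
            break
    return cuts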
def test_open_wrong_features(self):
    file = os.path.join(self.tmpdir, "open_wrong_features.bin")
    dataset = BinaryDs(file, features=1024).open()
    dataset.close()
    with self.assertRaises(IOError):
        BinaryDs(file, features=2048).open()
def test_open_wrong_features_readonly(self):
    file = os.path.join(self.tmpdir, "open_wrong_features_readonly.bin")
    dataset = BinaryDs(file, features=1024).open()
    dataset.close()
    with BinaryDs(file, features=2048, read_only=True) as dataset:
        self.assertEqual(dataset.get_features(), 1024)
def test_wrong_encoding(self):
    file = os.path.join(self.tmpdir, "wrongenc.bin")
    dataset = BinaryDs(file, encoded=False).open()
    dataset.close()
    with self.assertRaises(IOError):
        BinaryDs(file, encoded=True).open()
def test_wrong_encoding_readonly(self):
    file = os.path.join(self.tmpdir, "wrongenc_readonly.bin")
    dataset = BinaryDs(file, encoded=False).open()
    dataset.close()
    with BinaryDs(file, encoded=True, read_only=True) as dataset:
        self.assertFalse(dataset.is_encoded())
def test_write_wrong_number_features(self):
    file = os.path.join(self.tmpdir, "write_wrong_features.bin")
    with BinaryDs(file) as dataset:
        with self.assertRaises(ValueError):
            dataset.write(self.data_raw)
def test_open_readonly_not_existing(self):
    file = os.path.join(self.tmpdir, "readonly_not_existing.bin")
    with self.assertRaises(PermissionError):
        BinaryDs(file, True).open()
def test_get_features(self):
    file = os.path.join(self.tmpdir, "features.bin")
    with BinaryDs(file, features=14) as dataset:
        self.assertEqual(dataset.get_features(), 14)
def test_get_examples(self):
    file = os.path.join(self.tmpdir, "examples.bin")
    with BinaryDs(file, features=14) as dataset:
        self.assertEqual(dataset.get_examples_no(), 0)
        dataset.write(self.data_raw)
        self.assertEqual(dataset.get_examples_no(), 3)
def test_split(self):
    file1 = os.path.join(self.tmpdir, "splitA.bin")
    file2 = os.path.join(self.tmpdir, "splitB.bin")
    dataset1 = BinaryDs(file1, features=14).open()
    dataset1.write(self.data_raw2)
    dataset2 = BinaryDs(file2, features=14).open()
    self.assertEqual(dataset1.get_examples_no(), 8)
    self.assertEqual(dataset2.get_examples_no(), 0)
    dataset1.split(dataset2, 0.5)
    self.assertEqual(dataset1.get_examples_no(), 4)
    self.assertEqual(dataset2.get_examples_no(), 4)
    self.assertEqual(dataset1.read(0, 4), self.data_raw2[:4])
    self.assertEqual(dataset2.read(0, 4), self.data_raw2[4:])
    dataset1.close()
    dataset2.close()
def test_merge(self):
    file1 = os.path.join(self.tmpdir, "mergeA.bin")
    file2 = os.path.join(self.tmpdir, "mergeB.bin")
    dataset1 = BinaryDs(file1, features=14).open()
    dataset1.write(self.data_raw)
    dataset2 = BinaryDs(file2, features=14).open()
    dataset2.write(self.data_raw2)
    self.assertEqual(dataset1.get_examples_no(), 3)
    self.assertEqual(dataset2.get_examples_no(), 8)
    dataset1.merge(dataset2)
    self.assertEqual(dataset1.get_examples_no(), 11)
    self.assertEqual(dataset2.get_examples_no(), 0)
    self.assertEqual(dataset1.read(0, 11), self.data_raw + self.data_raw2)
    dataset1.close()
    dataset2.close()
def run_summary(model_dir: str) -> None:
    """
    Prints a summary of the dataset contained in a directory
    :param model_dir: Path to the folder where the train.bin, test.bin and
    validate.bin can be found
    """
    assert os.path.exists(model_dir)
    train_bin = os.path.join(model_dir, "train.bin")
    test_bin = os.path.join(model_dir, "test.bin")
    validate_bin = os.path.join(model_dir, "validate.bin")
    assert os.path.exists(train_bin), "Train dataset does not exist!"
    assert os.path.exists(test_bin), "Test dataset does not exist!"
    assert os.path.exists(validate_bin), "Validation dataset does not exist!"
    train = BinaryDs(train_bin, read_only=True).open()
    train_categories = count_categories(train)
    openc = train.is_encoded()
    features = train.get_features()
    train.close()
    val = BinaryDs(validate_bin, read_only=True).open()
    val_categories = count_categories(val)
    val.close()
    test = BinaryDs(test_bin, read_only=True).open()
    test_categories = count_categories(test)
    test.close()
    print(f"Features: {features}")
    print(f"Number of classes: {len(train_categories)}")
    if openc:
        print("Type: opcode encoded")
    else:
        print("Type: raw values")
    print("--------------------")
    for i in range(len(train_categories)):
        print(f"Training examples for class {i}: {train_categories[i]}")
    for i in range(len(val_categories)):
        print(f"Validation examples for class {i}: {val_categories[i]}")
    for i in range(len(test_categories)):
        print(f"Testing examples for class {i}: {test_categories[i]}")
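# Hedged usage sketch for run_summary; "data/" is a placeholder for a
# directory that already contains train.bin, test.bin and validate.bin
# produced by the preprocessing step.
def example_run_summary():
    run_summary("data/")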