 def test_get_encoding(self):
     file_raw = os.path.join(self.tmpdir, "encoding_raw.bin")
     file_op = os.path.join(self.tmpdir, "encoding_op.bin")
     with BinaryDs(file_raw, encoded=True) as dataset_raw:
         self.assertTrue(dataset_raw.is_encoded())
     with BinaryDs(file_op, encoded=False) as dataset_op:
         self.assertFalse(dataset_op.is_encoded())
def evaluate_confusion(bs: int, file: str, fixed: int, model_path: str,
                       test_bin) -> None:
    """
    Evaluates the confusion matrix for a given number of features
    :param bs: batch size
    :param file: file where the confusion matrix will be written
    :param fixed: number of features to be considered
    :param model_path: string pointing to the .h5 keras model of the network.
    If empty, defaults to data_dir/model.h5
    :param test_bin: path to the test dataset that will be used
    """
    test = BinaryDs(test_bin, read_only=True).open()
    binary = test.get_categories() <= 2
    model = load_model(model_path)
    generator = DataGenerator(test, bs, fake_pad=True, pad_len=fixed,
                              predict=True)
    expected = get_expected(bs, test)
    predicted = model.predict(generator, verbose=1)
    if binary:
        predicted = np.round(predicted).flatten().astype(np.int8)
    else:
        predicted = np.argmax(predicted, axis=1)
    matrix = np.array(tf.math.confusion_matrix(expected, predicted))
    with open(file, "w") as f:
        np.savetxt(f, X=matrix, fmt="%d")
    test.close()
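A minimal invocation sketch for evaluate_confusion; the batch size and every path below are placeholders rather than values from the original project:

# Hypothetical call: evaluate the model at a fixed input length of 128
# features and write the resulting confusion matrix to confusion.txt.
evaluate_confusion(bs=256, file="confusion.txt", fixed=128,
                   model_path="data/model.h5", test_bin="data/test.bin")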
 def test_truncate_but_one(self):
     file = os.path.join(self.tmpdir, "truncate_1.bin")
     with BinaryDs(file, features=14) as dataset:
         dataset.write(self.data_raw2)
     self.assertGreater(dataset.get_examples_no(), 0)
     with BinaryDs(file, features=14) as dataset:
         dataset.truncate(left=1)
     self.assertEqual(dataset.get_examples_no(), 1)
 def test_merge_different_features(self):
     file14 = os.path.join(self.tmpdir, "merge_f14.bin")
     file2k = os.path.join(self.tmpdir, "merge_f2048.bin")
     with BinaryDs(file2k) as ds2k:
         with BinaryDs(file14, features=14) as ds14:
             ds14.write(self.data_raw)
             with self.assertRaises(IOError):
                 ds2k.merge(ds14)
 def test_merge_different_encoding(self):
     file_op = os.path.join(self.tmpdir, "merge_op.bin")
     file_raw = os.path.join(self.tmpdir, "merge_raw.bin")
     with BinaryDs(file_op, encoded=False, features=14) as ds_op:
         ds_op.write(self.data_raw)
         with BinaryDs(file_raw, encoded=True, features=14) as ds_raw:
             ds_raw.write(self.data_raw)
             with self.assertRaises(IOError):
                 ds_raw.merge(ds_op)
 def test_balance(self):
     file = os.path.join(self.tmpdir, "balance.bin")
     with BinaryDs(file, features=14) as binary:
         binary.write(self.data_raw2)
         binary.balance()
     with BinaryDs(file, features=14) as binary:
         results = binary.read(0, binary.examples)
     expected = self.data_raw2[:5] + [self.data_raw2[6]]
     self.assertEqual(results, expected)
 def test_update_categories(self):
     file = os.path.join(self.tmpdir, "categories.bin")
     with BinaryDs(file, features=14) as dataset:
         self.assertEqual(dataset.get_categories(), 0)
         dataset.write(self.data_raw[:1])
     with BinaryDs(file, features=14) as dataset:
         self.assertEqual(dataset.get_categories(), 1)
         dataset.write(self.data_raw[2:3])
     with BinaryDs(file, features=14) as dataset:
         self.assertEqual(dataset.get_categories(), 3)
 def test_multi_read_write(self):
     file = os.path.join(self.tmpdir, "rwmulti.bin")
     with BinaryDs(file, features=14) as binary:
         binary.write(self.data_raw)
     with BinaryDs(file, features=14) as binary:
         binary.write(self.data_raw2)
     expected = [self.data_raw[2]] + self.data_raw2[:3]
     with BinaryDs(file, features=14, read_only=True) as dataset:
         read = dataset.read(2, 4)
     self.assertEqual(read, expected)
 def test_truncate_all(self):
     file = os.path.join(self.tmpdir, "truncate.bin")
     dataset = BinaryDs(file, features=14).open()
     dataset.close()
     empty_size = os.path.getsize(file)
     with BinaryDs(file, features=14) as dataset:
         dataset.write(self.data_raw2)
     self.assertGreater(os.path.getsize(file), empty_size)
     with BinaryDs(file, features=14) as dataset:
         dataset.truncate()
     self.assertEqual(os.path.getsize(file), empty_size)
 def test_shuffle(self):
     seed = 32000
     # assert that the order is the expected one
     random.seed(seed)
     expected_order = [4, 0, 1, 6, 3, 7, 5, 2]
     file = os.path.join(self.tmpdir, "shuffle.bin")
     with BinaryDs(file, features=14) as binary:
         binary.write(self.data_raw2)
         binary.shuffle(seed)
     with BinaryDs(file, features=14) as binary:
         results = binary.read(0, binary.examples)
     for res_idx, exp_idx in enumerate(expected_order):
         self.assertEqual(results[res_idx], self.data_raw2[exp_idx])
 def test_write_to_ro(self):
     file = os.path.join(self.tmpdir, "write_ro.bin")
     dataset = BinaryDs(file, features=14).open()
     dataset.close()
     with BinaryDs(file, features=14, read_only=True) as dataset:
         with self.assertRaises(IOError):
             dataset.write(self.data_raw)
 def test_deduplicate(self):
     file = os.path.join(self.tmpdir, "deduplicate.bin")
     with BinaryDs(file, features=14) as dataset:
         dataset.write([self.data_raw[0]])
         dataset.write(self.data_raw)
         dataset.write(self.data_raw2)
         dataset.write(self.data_raw)
         dataset.write(self.data_raw2)
         dataset.write(self.data_raw2)
         dataset.write(self.data_raw2)
         dataset.write(self.data_raw2)
         dataset.write(self.data_raw)
         dataset.write(self.data_raw)
         dataset.write(self.data_raw2)
     with BinaryDs(file, features=14) as dataset:
         dataset.deduplicate()
         data = dataset.read(0, 11)
         self.assertEqual(len(data), len(set(data)))
 def test_read_write(self):
     file = os.path.join(self.tmpdir, "rw.bin")
     binary = BinaryDs(file, features=14).open()
     binary.write(self.data_raw)
     binary.close()
     with BinaryDs(file, features=14, read_only=True) as dataset:
         read = dataset.read(0, len(self.data_raw))
     self.assertEqual(read, self.data_raw)
def count_categories(dataset: BinaryDs) -> List[int]:
    examples = dataset.get_examples_no()
    amount = 1000
    read_total = examples // amount
    remainder = examples % amount
    categories = []
    for i in range(read_total):
        buffer = dataset.read(i * amount, amount)
        for val in buffer:
            category = val[0]
            while len(categories) <= category:
                categories.append(0)
            categories[category] += 1
    if remainder > 0:
        buffer = dataset.read(read_total * amount, remainder)
        for val in buffer:
            category = val[0]
            while len(categories) <= category:
                categories.append(0)
            categories[category] += 1
    assert len(categories) == dataset.get_categories()
    return categories
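The two loops above implement a chunked histogram over the label byte (val[0]) of each example. The same computation can be sketched with collections.Counter; this version is illustrative only and assumes nothing beyond the BinaryDs methods already used above (read, get_examples_no, get_categories):

from collections import Counter
from typing import List

def count_categories_sketch(dataset: BinaryDs, chunk: int = 1000) -> List[int]:
    # Same chunked traversal as count_categories, with the grow-on-demand
    # list replaced by a Counter keyed on each example's label byte.
    counts = Counter()
    examples = dataset.get_examples_no()
    for start in range(0, examples, chunk):
        buffer = dataset.read(start, min(chunk, examples - start))
        counts.update(val[0] for val in buffer)
    return [counts[i] for i in range(dataset.get_categories())]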
def evaluate_incremental(bs: int, file: str, model_path: str,
                         test_bin) -> None:
    """
    Evaluates the accuracy incrementally (first with only 1 feature, then 3,
    then 5, and so on)
    :param bs: batch size
    :param file: .csv file where the accuracy will be written
    :param model_path: string pointing to the .h5 keras model of the network.
    If empty, defaults to data_dir/model.h5
    :param test_bin: path to the test dataset that will be used
    """
    cut = 1
    test = BinaryDs(test_bin, read_only=True).open()
    model = load_model(model_path)
    features = test.get_features()
    with open(file, "w") as f:
        f.write("features,accuracy\n")
    while cut <= features:
        print(f"Evaluating {cut}")
        generator = DataGenerator(test, bs, fake_pad=True, pad_len=cut)
        score = model.evaluate(generator)
        with open(file, "a") as f:
            f.write(f"{cut},{score[1]}\n")
        if cut < 24:
            cut = cut + 2
        elif cut < 80:
            cut = cut + 22
        elif cut < 256:
            cut = cut + 33
        elif cut < 500:
            cut = cut + 61
        elif cut < features:
            cut = cut + 129
            cut = min(cut, features)
        else:
            break
    test.close()
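The cut schedule above advances in increasingly coarse steps: by 2 up to 24 features, by 22 up to 80, by 33 up to 256, by 61 up to 500, then by 129 clamped to the feature count. A standalone sketch of just that schedule (the function name is made up for illustration), useful for predicting how many rows the .csv will contain:

def incremental_cuts(features: int):
    # Yields the same sequence of cut values that evaluate_incremental
    # feeds to model.evaluate.
    cut = 1
    while cut <= features:
        yield cut
        if cut < 24:
            cut += 2
        elif cut < 80:
            cut += 22
        elif cut < 256:
            cut += 33
        elif cut < 500:
            cut += 61
        elif cut < features:
            cut = min(cut + 129, features)
        else:
            break

# For example, list(incremental_cuts(14)) yields [1, 3, 5, 7, 9, 11, 13].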
 def test_open_wrong_features(self):
     file = os.path.join(self.tmpdir, "open_wrong_features.bin")
     dataset = BinaryDs(file, features=1024).open()
     dataset.close()
     with self.assertRaises(IOError):
         BinaryDs(file, features=2048).open()
 def test_open_wrong_features_readonly(self):
     file = os.path.join(self.tmpdir, "open_wrong_features_readonly.bin")
     dataset = BinaryDs(file, features=1024).open()
     dataset.close()
     with BinaryDs(file, features=2048, read_only=True) as dataset:
         self.assertEqual(dataset.get_features(), 1024)
 def test_wrong_encoding(self):
     file = os.path.join(self.tmpdir, "wrongenc.bin")
     dataset = BinaryDs(file, encoded=False).open()
     dataset.close()
     with self.assertRaises(IOError):
         BinaryDs(file, encoded=True).open()
 def test_wrong_encoding_readonly(self):
     file = os.path.join(self.tmpdir, "wrongenc_readonly.bin")
     dataset = BinaryDs(file, encoded=False).open()
     dataset.close()
     with BinaryDs(file, encoded=True, read_only=True) as dataset:
         self.assertFalse(dataset.is_encoded())
 def test_write_wrong_number_features(self):
     file = os.path.join(self.tmpdir, "write_wrong_features.bin")
     with BinaryDs(file) as dataset:
         with self.assertRaises(ValueError):
             dataset.write(self.data_raw)
 def test_open_readonly_not_existing(self):
     file = os.path.join(self.tmpdir, "readonly_not_existing.bin")
     with self.assertRaises(PermissionError):
         BinaryDs(file, True).open()
 def test_get_features(self):
     file = os.path.join(self.tmpdir, "features.bin")
     with BinaryDs(file, features=14) as dataset:
         self.assertEqual(dataset.get_features(), 14)
 def test_get_examples(self):
     file = os.path.join(self.tmpdir, "examples.bin")
     with BinaryDs(file, features=14) as dataset:
         self.assertEqual(dataset.get_examples_no(), 0)
         dataset.write(self.data_raw)
         self.assertEqual(dataset.get_examples_no(), 3)
 def test_split(self):
     file1 = os.path.join(self.tmpdir, "splitA.bin")
     file2 = os.path.join(self.tmpdir, "splitB.bin")
     dataset1 = BinaryDs(file1, features=14).open()
     dataset1.write(self.data_raw2)
     dataset2 = BinaryDs(file2, features=14).open()
     self.assertEqual(dataset1.get_examples_no(), 8)
     self.assertEqual(dataset2.get_examples_no(), 0)
     dataset1.split(dataset2, 0.5)
     self.assertEqual(dataset1.get_examples_no(), 4)
     self.assertEqual(dataset2.get_examples_no(), 4)
     self.assertEqual(dataset1.read(0, 4), self.data_raw2[:4])
     self.assertEqual(dataset2.read(0, 4), self.data_raw2[4:])
     dataset1.close()
     dataset2.close()
 def test_merge(self):
     file1 = os.path.join(self.tmpdir, "mergeA.bin")
     file2 = os.path.join(self.tmpdir, "mergeB.bin")
     dataset1 = BinaryDs(file1, features=14).open()
     dataset1.write(self.data_raw)
     dataset2 = BinaryDs(file2, features=14).open()
     dataset2.write(self.data_raw2)
     self.assertEqual(dataset1.get_examples_no(), 3)
     self.assertEqual(dataset2.get_examples_no(), 8)
     dataset1.merge(dataset2)
     self.assertEqual(dataset1.get_examples_no(), 11)
     self.assertEqual(dataset2.get_examples_no(), 0)
     self.assertEqual(dataset1.read(0, 11), self.data_raw + self.data_raw2)
     dataset1.close()
     dataset2.close()
def run_summary(model_dir: str) -> None:
    """
    Prints a summary of the datasets contained in a directory
    :param model_dir: Path to the folder where the train.bin, test.bin and
    validate.bin can be found
    """
    assert os.path.exists(model_dir)
    train_bin = os.path.join(model_dir, "train.bin")
    test_bin = os.path.join(model_dir, "test.bin")
    validate_bin = os.path.join(model_dir, "validate.bin")
    assert os.path.exists(train_bin), "Train dataset does not exist!"
    assert os.path.exists(test_bin), "Test dataset does not exist!"
    assert os.path.exists(validate_bin), "Validation dataset does not exist!"
    train = BinaryDs(train_bin, read_only=True).open()
    train_categories = count_categories(train)
    openc = train.is_encoded()
    features = train.get_features()
    train.close()
    val = BinaryDs(validate_bin, read_only=True).open()
    val_categories = count_categories(val)
    val.close()
    test = BinaryDs(test_bin, read_only=True).open()
    test_categories = count_categories(test)
    test.close()
    print(f"Features: {features}")
    print(f"Number of classes: {len(train_categories)}")
    if openc:
        print("Type: opcode encoded")
    else:
        print("Type: raw values")
    print("--------------------")
    for i in range(len(train_categories)):
        print(f"Training examples for class {i}: {train_categories[i]}")
    for i in range(len(val_categories)):
        print(f"Validation examples for class {i}: {val_categories[i]}")
    for i in range(len(test_categories)):
        print(f"Testing examples for class {i}: {test_categories[i]}")