Example #1
import glob

import fbHashB  # the project's fbHash-B module


def doc_weights_from_dir(dir_path):
    # collect all files matching the glob pattern (** descends with recursive=True)
    files = glob.glob(dir_path, recursive=True)
    print(f"Number of files: {len(files)}")

    # compute document weights over the collected corpus
    weights = fbHashB.compute_document_weights(files)
    return weights
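
A minimal usage sketch: despite its name, dir_path is a glob pattern, not a bare directory. The corpus location below is a hypothetical placeholder.

# hypothetical corpus location; any glob pattern works here
weights = doc_weights_from_dir("./corpus/**/*.txt")
print(f"number of weights: {len(weights)}")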
Example #2
import fbHashB  # the project's fbHash-B module


def pick_and_gen_doc_weights(file_path, num, save_path, exclude_type=None):
    # avoid a mutable default argument
    if exclude_type is None:
        exclude_type = []
    # select_ref_files is a project helper defined elsewhere in this module
    ref_files = select_ref_files(file_path, num, exclude_type)
    print("Generate weights from files:")
    print(ref_files)
    w = fbHashB.compute_document_weights(ref_files)
    print(f"number of weights: {len(w)}")
    # debugging aid: print the ten highest-weighted entries
    # for k in sorted(w, key=w.get, reverse=True)[:10]:
    #     print(f"{k}: {w[k]}")
    # persist the weights so they can be reused for hashing
    fbHashB.doc_weights2sqlite(w, save_path)
    print("serialized")
Example #3
import fbHashB  # the project's fbHash-B module


def test_comparison():
    files = [
        "./tests/files/testfile_1.txt",
        "./tests/files/testfile_1_1.txt",
        "./tests/files/testfile_2.txt",
        "./tests/files/testfile_3.txt",
    ]
    doc_w_path = "test_weights.db"
    doc_w = fbHashB.compute_document_weights(files)
    fbHashB.doc_weights2sqlite(doc_w, doc_w_path)
    h1 = fbHashB.hashf(files[0], doc_w_path)
    h1_1 = fbHashB.hashf(files[1], doc_w_path)
    h2 = fbHashB.hashf(files[2], doc_w_path)

    # unrelated files should score 0
    assert fbHashB.compare(h1, h2) == 0

    # similar files should land mid-scale on the 0-100 similarity score
    assert 40 < fbHashB.compare(h1, h1_1) < 60
Example #4
import glob
import os
import random

import numpy as np
import ssdeep

import fbHashB  # the project's fbHash-B module
import hw  # project wrapper around external hashing tools


def fd_kfold(frag_sizes):
    num_folds = 5
    res = []
    # files_path = "./tests/files/t5-corpus/t5/*.text"
    files_path = "/dev/shm/t5-corpus/t5/*.text"
    min_size = 512 * 100  # sdhash needs sufficiently large inputs
    files = list(
        filter(lambda f: os.path.getsize(f) > min_size, glob.glob(files_path)))
    random.shuffle(files)
    print(f"num file pool: {len(files)}")
    # reduce number of files for testing
    # files = files[0:20]

    # gen_kfold_files and fragment_detection are project helpers defined elsewhere
    kf = gen_kfold_files(files, num_folds)
    for train, test in kf:
        # compute doc weights on the training split
        print("generate doc weights")
        docw = fbHashB.compute_document_weights(train)

        def fbHashB_hashd(data):
            return fbHashB.hashd_weights(data, docw)

        sdhash = hw.HashWrapper("sdhash", [""], ["-t", "-1", "-c"],
                                r".*?\|.*?\|(\d{3})")
        schemes = [('fbHashB', fbHashB_hashd, fbHashB.compare, None),
                   ('ssdeep', ssdeep.hash, ssdeep.compare, None),
                   ('sdhash', sdhash.hashd, sdhash.compare, None)]

        # run fragment detection on the held-out split
        print("compute fragment detection")
        r = fragment_detection(schemes, test, frag_sizes)
        res.append(r)
        print(f"res:\n{res}")

    # average the per-fold results
    result = np.array(res)
    result = np.average(result, axis=0)
    return result
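
A hedged usage sketch: the fragment sizes below are illustrative, and their units (percentages versus byte counts) depend on fragment_detection, which is not shown here.

# hypothetical fragment sizes; semantics are an assumption about fragment_detection
scores = fd_kfold([50, 25, 10])
print(scores)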
Example #5
import fbHashB  # the project's fbHash-B module


def test_document_weights():
    files = ["./tests/files/testfile_1.txt", "./tests/files/testfile_2.txt",
             "./tests/files/testfile_3.txt", "./tests/files/testfile_4.txt"]
    doc_w = fbHashB.compute_document_weights(files)
    assert len(doc_w) > 0
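
Putting the pieces together, a minimal sketch of the full workflow using only the calls that appear in the examples above; the file names are placeholders, and the 0-100 reading of the score is inferred from the assertions in Example #3.

import fbHashB

# 1. compute and persist document weights from a reference corpus
weights = fbHashB.compute_document_weights(["ref_a.txt", "ref_b.txt"])
fbHashB.doc_weights2sqlite(weights, "weights.db")

# 2. hash two files against the stored weights and compare them
h1 = fbHashB.hashf("sample_1.txt", "weights.db")
h2 = fbHashB.hashf("sample_2.txt", "weights.db")
print(fbHashB.compare(h1, h2))  # presumably a 0-100 similarity score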