def doc_weights_from_dir(dir_path):
    # get list of all files matching the glob pattern
    files = glob.glob(dir_path, recursive=True)
    print(f"Number of files: {len(files)}")
    # generate doc_weights
    weights = fbHashB.compute_document_weights(files)
    return weights
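

# Usage sketch (not part of the original module): doc_weights_from_dir takes a
# glob pattern, so a recursive call could look like the lines below; the
# pattern and output DB name are assumptions for illustration only.
#
#   weights = doc_weights_from_dir("./tests/files/**/*.txt")
#   fbHashB.doc_weights2sqlite(weights, "dir_weights.db")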


def pick_and_gen_doc_weights(file_path, num, save_path, exclude_type=[]):
    ref_files = select_ref_files(file_path, num, exclude_type)
    print("Generate weights from files:")
    print(ref_files)

    w = fbHashB.compute_document_weights(ref_files)
    print(f"number of weights: {len(w)}")
    # for k in (sorted(w, key=w.get, reverse=True)[:10]):
    #     print(f"{k}: {w[k]}")

    fbHashB.doc_weights2sqlite(w, save_path)
    print("serialized")
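

# Usage sketch (assumption, for illustration only): build a reference-weight
# database from a corpus; the corpus path, sample size, excluded extensions,
# and output name below are made up, not part of the original module.
#
#   pick_and_gen_doc_weights("/dev/shm/t5-corpus/t5/*", 200, "t5_weights.db",
#                            exclude_type=[".jpg", ".gif"])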


def test_comparison():
    files = ["./tests/files/testfile_1.txt",
             "./tests/files/testfile_1_1.txt",
             "./tests/files/testfile_2.txt",
             "./tests/files/testfile_3.txt"]
    doc_w_path = "test_weights.db"

    doc_w = fbHashB.compute_document_weights(files)
    fbHashB.doc_weights2sqlite(doc_w, doc_w_path)

    h1 = fbHashB.hashf(files[0], doc_w_path)
    h1_1 = fbHashB.hashf(files[1], doc_w_path)
    h2 = fbHashB.hashf(files[2], doc_w_path)

    # different files
    assert fbHashB.compare(h1, h2) == 0
    # similar files
    assert 40 < fbHashB.compare(h1, h1_1) < 60


def fd_kfold(frag_sizes):
    num_folds = 5
    res = []
    # files_path = "./tests/files/t5-corpus/t5/*.text"
    files_path = "/dev/shm/t5-corpus/t5/*.text"
    min_size = 512 * 100  # for sdhash

    files = list(filter(lambda f: os.path.getsize(f) > min_size,
                        glob.glob(files_path)))
    random.shuffle(files)
    print(f"num file pool: {len(files)}")
    # reduce number of files for testing
    # files = files[0:20]

    kf = gen_kfold_files(files, num_folds)

    for train, test in kf:
        # compute doc weights
        print("generate doc weights")
        docw = fbHashB.compute_document_weights(train)

        def fbHashB_hashd(data):
            return fbHashB.hashd_weights(data, docw)

        sdhash = hw.HashWrapper("sdhash", [""], ["-t", "-1", "-c"],
                                r".*?\|.*?\|(\d{3})")

        schemes = [('fbHashB', fbHashB_hashd, fbHashB.compare, None),
                   ('ssdeep', ssdeep.hash, ssdeep.compare, None),
                   ('sdhash', sdhash.hashd, sdhash.compare, None)]

        # compute fragment detection
        print("compute fragment detection")
        r = fragment_detection(schemes, test, frag_sizes)
        res.append(r)
        print(f"res:\n{res}")

    # compose results by averaging over the folds
    result = np.array(res)
    result = np.average(result, axis=0)
    return result
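

# Usage sketch (assumption): fd_kfold takes the list of fragment sizes to
# evaluate (in whatever units fragment_detection expects) and returns the
# per-scheme detection results averaged over the folds; the sizes below are
# illustrative only.
#
#   rates = fd_kfold([50, 25, 10, 5, 1])
#   print(rates)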


def test_document_weights():
    files = ["./tests/files/testfile_1.txt",
             "./tests/files/testfile_2.txt",
             "./tests/files/testfile_3.txt",
             "./tests/files/testfile_4.txt"]
    doc_w = fbHashB.compute_document_weights(files)
    assert len(doc_w) > 0