/
minhash_v.py
103 lines (82 loc) · 2.82 KB
/
minhash_v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# minhash_v.py
from binascii import crc32
from numba.typed import List
from tqdm import tqdm
import ctypes
import group
import multiprocessing as mp
import numba as nb
import numpy as np
import os
import random
import re
import sys
# Largest unsigned 32-bit value; upper bound for the random hash coefficients
# drawn by the main script.
MAXHASH = (1 << 32) - 1
# Modulus applied by every hash function in signature_jit (a prime slightly
# above 2**32, so hashed values lie in the range [0, C)).
C = 4294967311
# Number of hash functions, i.e. the number of entries in each signature.
NF = 100
# Similarity threshold: two files match when more than t * NF signature
# entries are equal.
t = 0.7
@nb.njit(fastmath=True)
def signature_jit(shingles, coeffs):
    """Return the MinHash signature of a collection of shingle hashes.

    For each of the NF hash functions ``h_j(x) = (a_j * x + b_j) % C`` the
    minimum hash value over all shingles is computed.

    Args:
        shingles: sequence of integer shingle hashes (must be non-empty,
            since ``np.min`` is taken over it).
        coeffs: indexable pair of rows; ``coeffs[0][j]`` is the multiplier
            ``a_j`` and ``coeffs[1][j]`` the offset ``b_j``.

    Returns:
        A list of NF minima, one per hash function.
    """
    # Convert once up front rather than once per hash function.
    values = np.array(shingles)
    multipliers = coeffs[0]
    offsets = coeffs[1]
    return [np.min((multipliers[j] * values + offsets[j]) % C) for j in range(NF)]
def shingle(i, aux_data):
    """Compute the MinHash signature of file ``i`` and store it in place.

    The file is read as text (undecodable bytes are ignored), lower-cased and
    split on runs of non-word characters or single underscores. Every window
    of three consecutive tokens is joined with spaces and hashed with CRC32
    to form a shingle.

    Args:
        i: index of the file in ``files`` and of its slot in ``signatures``.
        aux_data: tuple ``(signatures, files, coeffs)``; ``signatures[i]`` is
            assigned, ``files[i]`` is the path to read and ``coeffs`` is
            forwarded to ``signature_jit``.

    Returns:
        None. The result is written to ``signatures[i]``.
    """
    signatures, files, coeffs = aux_data
    with open(files[i], 'r', errors='ignore') as fh:
        # Raw string: "\W" in a plain literal is an invalid escape sequence.
        words = re.split(r"\W+|_", fh.read().lower())
    shingles = List()
    for first, second, third in zip(words, words[1:], words[2:]):
        trigram = first + " " + second + " " + third
        shingles.append(crc32(trigram.encode()) & 0xffffffff)
    # build signatures
    if len(shingles) == 0:
        # Fewer than three tokens: store a sentinel. Hash values are taken
        # modulo C, so C + 1 can never be produced by signature_jit.
        signatures[i] = C + 1
    else:
        signatures[i] = signature_jit(shingles, coeffs)
@nb.njit(fastmath=True)
def hashcount_jit(i, signatures):
    """Find earlier signatures that are similar to signature ``i``.

    Signature ``j`` (for ``j < i``) counts as similar when the number of
    positions at which it equals signature ``i`` is greater than ``t * NF``.

    Args:
        i: index of the reference signature.
        signatures: indexable collection of signatures.

    Returns:
        A list of matching indexes, in descending order from ``i - 1`` to 0.
    """
    reference = signatures[i]
    threshold = t * NF
    similar = []
    for j in range(i - 1, -1, -1):
        if np.sum(reference == signatures[j]) > threshold:
            similar.append(j)
    return similar
# Per-process shared state, filled in by initializer() and read by the
# *_wrapper functions.
aux_data = None


def initializer(init_data):
    """Store ``init_data`` in the module-level ``aux_data``.

    Passed to ``mp.Pool`` as the worker initializer so that the wrapper
    functions can reach the shared data without receiving it per task.

    Args:
        init_data: value to publish as ``aux_data``; the wrappers unpack it
            as ``(signatures, files, coeffs)``.
    """
    global aux_data
    aux_data = init_data
def hashcount_wrapper(var_data):
    """Pool task: collect the files whose signature matches file ``var_data``.

    Args:
        var_data: index of the file to compare against all earlier files.

    Returns:
        A set holding the matching earlier indexes plus ``var_data`` itself,
        or None when no earlier file matches.
    """
    signatures = aux_data[0]
    matches = set(hashcount_jit(var_data, signatures))
    if not matches:
        return None
    matches.add(var_data)
    return matches
def shingle_wrapper(var_data):
    """Pool task: compute and store the signature of file ``var_data``.

    Args:
        var_data: index of the file to process.

    Returns:
        Whatever ``shingle`` returns for that index.
    """
    shared = aux_data
    return shingle(var_data, shared)
if __name__ == '__main__':
    # Two rows of NF random coefficients: row 0 holds the multipliers and
    # row 1 the offsets used by signature_jit.
    coeffs = np.array([[random.randint(0, MAXHASH) for j in range(NF)] for i in range(2)])
    # Every file below the current directory.
    files = [os.path.join(root, f) for root, _, fnames in os.walk('.') for f in fnames]
    # Optional first argument: only process the first N files.
    if len(sys.argv) > 1:
        files = files[:int(sys.argv[1])]
    filenum = len(files)
    # One row of NF signature entries per file, backed by a shared RawArray.
    # Use an explicit 64-bit element type: c_ulong is only 32 bits wide on
    # some platforms, while stored values can reach C + 1 (> 2**32).
    # NOTE(review): workers only see each other's writes if they share this
    # buffer with the parent (e.g. the "fork" start method) - confirm for
    # the target platform.
    signatures = np.ctypeslib.as_array(
        mp.RawArray(ctypes.c_uint64, filenum * NF)
    ).reshape(filenum, NF)
    aux_data = (signatures, files, coeffs)
    with mp.Pool(mp.cpu_count(), initializer, (aux_data,)) as p:
        # shingle files and create signatures; the loop only drains the
        # iterator so that every signature is stored before comparing
        shingling = p.imap(shingle_wrapper, range(filenum), chunksize=100)
        for _ in tqdm(shingling, total=filenum, desc="shingling"):
            pass
        # compare signatures
        results = []
        comparing = p.imap_unordered(hashcount_wrapper, range(1, filenum), chunksize=100)
        for s in tqdm(comparing, total=max(filenum - 1, 0), desc="comparing"):
            if not s:
                # No match for this file (None, or nothing to merge).
                continue
            # Merge into the first existing group sharing a member,
            # otherwise start a new group.
            for r in results:
                if not r.isdisjoint(s):
                    r.update(s)
                    break
            else:
                results.append(s)
    results = group.group(results)
    count = 0
    for s in results:
        count += len(s)
        for index in s:
            print(files[index])
        print("---")
    print("%d files in %d groups" % (count, len(results)))