def calculateSimhashes():
    """Calculate simhash values for a number of text files."""
    word_counts = dict()
    hashes = dict()
    files = os.listdir('./text')
    counter = 0

    for file in files:
        counter += 1
        if counter % 1000 == 0:
            print counter
        with open('./text/' + file, 'r') as f:
            lines = f.readlines()
            words = []
            for line in lines:
                for word in re.split(r'\W+', line):
                    if word != '':
                        newWord = word.lower()
                        words.append(newWord)
            hashes[file] = simhash(words)
            word_counts[file] = len(words)

    with open("word_counts.txt", 'w+') as f:
        for (file, count) in word_counts.items():
            f.write(file + '\t' + str(count) + '\n')
    with open("hashes.txt", 'w+') as f:
        for (file, hash) in hashes.items():
            f.write(file + '\t' + str(hash) + '\n')
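calculateSimhashes() above writes one tab-separated file/hash line per document. Below is a minimal companion sketch (not part of the original) that reloads hashes.txt and reports file pairs whose fingerprints lie within a small Hamming distance; it assumes the stored hashes are integers, and the helper name and 3-bit threshold are illustrative.

# Hypothetical companion: reload hashes.txt and list near-duplicate pairs.
# Assumes each stored hash is an integer fingerprint.
def find_near_duplicates(path="hashes.txt", max_distance=3):
    hashes = {}
    with open(path) as f:
        for line in f:
            name, value = line.rstrip('\n').split('\t')
            hashes[name] = int(value)

    names = sorted(hashes)
    pairs = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            # Hamming distance between the two fingerprints
            distance = bin(hashes[names[i]] ^ hashes[names[j]]).count('1')
            if distance <= max_distance:
                pairs.append((names[i], names[j], distance))
    return pairs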
def predeal(pos, next):
    """Convert the money field to a number and write simhash keyword features for worksheet rows [pos, next)."""
    for i in range(pos, next):
        print(i)
        # mon=0
        no = ws.cell(row=i, column=1).value
        cont = ws.cell(row=i, column=2).value
        money = ws.cell(row=i, column=5).value
        type1 = ws.cell(row=i, column=3).value
        type2 = ws.cell(row=i, column=4).value
        # print(money)
        if re.search('万', money):  # amount expressed in units of ten thousand ('万')
            mon = delete(money)
            mon = float(mon) * 10000
        elif re.search('千', money):  # amount expressed in units of one thousand ('千')
            mon = delete(money)
            mon = float(mon) * 1000
        else:
            mon = delete(money)
            mon = float(mon)

            # try:
            #     mon = float(mon)
            # except ValueError:
            #     print(mon)
        money = mon
        hashcode = simhash.simhash(cont, content_list, type=True)
        if hashcode.__str__() == '00':
            continue
        keywords = ''
        for tu in hashcode.tf_idf[:21]:
            keywords += str(tu[0]).replace('\'', '') + ' '
        ws.cell(row=i, column=7, value=keywords)
        print(keywords)
        file2.write(no + ' ' + type1 + ' ' + type2 + ' ' + str(money) + ' ' + keywords + '\n')
Example #3
def similarity(name, content):
    '''
    :param name: defendant's name
    :param content: trial proceedings
    :return:
    '''
    file = []
    predeal = source_file_stan()
    file.append(predeal)
    data = predeal.readlines()
    content_list = get_idf_content()
    result, money = split.split_text(name, content)
    ca = simhash.simhash(result, content_list)
    print(ca.tf_idf)
    simi = {}
    for has in data:
        has = has.strip('\n')
        da = has.split(' ')
        if len(da[4:]) < 10:
            continue
        simila = ca.matrix_dis(da[4:len(da) - 1])
        simi[da[0]] = simila
    simi = sorted(simi.items(), key=lambda x: x[1], reverse=True)
    num = 1
    print('Judgments similar to this case:')
    for tu in simi:
        if num > 10:
            break
        print(tu)
        num += 1
    source_close(file)
Example #4
def test():
    file = []
    cases, predeal = source_file()
    file.append(cases)
    file.append(predeal)
    ws = cases.worksheets[1]
    data = predeal.readlines()
    content_list = get_idf_content()
    candidate = [22, 23, 30, 36, 37, 44, 45, 46, 48, 714, 2, 185, 315, 71, 80]
    name = ws.cell(row=15, column=10).value  # get the defendant's name
    value = ws.cell(row=15, column=22).value  # get the trial proceedings
    result = split.split_text(name, value)
    ca = simhash.simhash(
        result,
        content_list,
        type=True,
    )
    sim = {}
    for has in data:
        has = has.strip('\n')
        da = has.split(' ')
        if int(da[0]) not in candidate:
            continue
        simila = ca.matrix_dis(da[4:len(da) - 1])
        sim[da[0]] = simila
    sim = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    print('Cases similar to judgment No. %d:' % 15)
    for tu in sim:
        print(tu)
    source_close(file)
Example #5
def on_request(ch, method, props, body):
    params = simplejson.loads(body)
    filedesc, conf = params
    LOG.info('({}) Processing {}'.format(PID, filedesc[0]))
    sh = simhash.simhash(filedesc[1],
                         k=conf['k'],
                         lenhash=conf['lenhash'],
                         stopwords=conf['stopwords'])
    response = {'name': filedesc[0], 'sh': sh}
    send_back(ch, method, props, body, response)
Example #6
def on_request(ch, method, props, body):
    params = simplejson.loads(body)
    filedesc, conf = params
    LOG.info('({}) Processing {}'.format(PID, filedesc[0]))
    sh = simhash.simhash(filedesc[1],
                         k=conf['k'],
                         lenhash=conf['lenhash'],
                         stopwords=conf['stopwords'])
    response = {'name': filedesc[0], 'sh': sh}
    send_back(ch, method, props, body, response)
Example #7
def precess(money_dis=False, number=500):
    file = []
    cases, predeal = source_file()
    file.append(cases)
    file.append(predeal)
    ws = cases.worksheets[1]
    data = predeal.readlines()
    count = 1
    content_list = get_idf_content()
    for j in range(40, 100):
        if count > 4:
            break
        name = ws.cell(row=j, column=10).value  # get the defendant's name
        value = ws.cell(row=j, column=22).value  # get the trial proceedings
        if len(name.split('、')) > 1:  # only handle texts with a single defendant
            continue
        result = split.split_text(name, value)
        ca = simhash.simhash(result, content_list, type=True)
        if len(ca.__str__()) < 64:
            continue
        simi = {}
        if money_dis:
            money = project1.distinguish(value)
            money = money_convert(money)
            sim_case = compare_money(float(money), data, number)
            for has in sim_case:
                # start = time.time()
                simi[has[0]] = ca.matrix_dis(has[2])
                # print("total compare time ",time.time()-start)
        else:
            for has in data:
                has = has.strip('\n')
                da = has.split(' ')
                if len(da[-1]) < 64:
                    continue
                # if len(da[4:]) < 10:
                #     continue
                # start = time.time()
                # simila = ca.matrix_dis(da[4:len(da) - 1])
                # print(time.time()-start)
                simi[da[0]] = ca.count_cos(da[-1])
        simi = sorted(simi.items(), key=lambda x: x[1], reverse=True)
        num = 1
        print('Cases similar to judgment No. %d:' % j)
        for tu in simi:
            if num > 10:
                break
            if int(tu[0]) == j:
                continue
            print(tu)
            num += 1
        count += 1
    source_close(file)
Example #8
def train(dic, Corpus):
    corpus = Corpus  # remember to increase this substantially.
    sentences = corpus
    hashedSentences = simhash.simhash(sentences, 32)
    B = hashedSentences

    # you can try simhash directly, maybe it performs better.
    # hashedSentences = simhash.simhash(sentences, 128)
    # dataMat = np.array(hashedSentences)
    # lambda_, eigenVec_ = laplacian_eigenmap.laplacian_eigenmap(dataMat, 15, 32)
    # B=eigenVec_

    input = embedding(dic, corpus)
    input = np.array(input)
    input = input.reshape(-1, 200 * 300)  # [batch_size, 60000]
    output = np.array(B)  # [batch_size,32]
    data = np.concatenate((input, output), axis=1)

    net = Net()
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    print('Start Training CNN')

    for epoch in range(8000):  # loop over the dataset multiple times
        generator = batch_generator(data, 30)
        for inputs, labels in generator:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs, h = net(torch.Tensor(inputs.reshape(-1, 1, 200, 300)))
            loss = criterion(outputs, torch.Tensor(labels.reshape(-1, 32)))
            loss.backward()
            optimizer.step()

            # print statistics
            if epoch % 100 == 0:
                print('loss:{}'.format(loss))

    print('Finished Training')
    torch.save(net, "cnn_model.pkl")
Example #9
def train(dic, Corpus):
    corpus = Corpus  # remember to increase this substantially.
    sentences = corpus
    hashedSentences = simhash.simhash(sentences, 32)
    B = hashedSentences

    # you can try simhash directly, maybe it performs better.
    # hashedSentences = simhash.simhash(sentences, 128)
    # dataMat = np.array(hashedSentences)
    # lambda_, eigenVec_ = laplacian_eigenmap.laplacian_eigenmap(dataMat, 15, 32)
    # B=eigenVec_

    input = embedding(dic, corpus)
    input = np.array(input)
    input = input.reshape(-1, 20 * 300)  # [batch_size, 6000]
    output = np.array(B)  # [batch_size,32]
    data = np.concatenate((input, output), axis=1)

    net = Net()
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    print('Start Training CNN')

    inputs = 0  # keeps a reference to the last batch for add_graph() below
    for epoch in range(800):  # loop over the dataset multiple times
        generator = batch_generator(data, 30)
        cnt = 0
        loss_ = 0
        for inputs, labels in generator:
            cnt += 1
            # zero the parameter gradients
            optimizer.zero_grad()
            inputs = torch.from_numpy(inputs).reshape(-1, 1, 20, 300).float()
            labels = torch.from_numpy(labels).reshape(-1, 32).float()
            # forward + backward + optimize
            outputs, h = net(inputs)
            loss = criterion(outputs, labels)
            loss_ += loss.item()  # accumulate a plain float, not the autograd graph

            loss.backward()
            optimizer.step()

            # print statistics
            if epoch % 100 == 0:
                print('loss:{}'.format(loss))

        for tag, value in net.named_parameters():
            tag = tag.replace('.', '/')
            writer.add_histogram(tag, value.data.cpu().numpy(), epoch + 1)
            #logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
            #logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)

        writer.add_scalar("STCC/loss", loss_ / cnt, epoch)
        cnt = 0
        loss_ = 0

    print('Finished Training')
    torch.save(net, "cnn_model.pkl")
    writer.add_graph(net, input_to_model=(inputs, ))
    writer.close()
Example #10
            else:
                partition_texts = set()

            partition_texts.add(current_id)
            partitions[val] = partition_texts.copy()

    return candidates


if __name__ == '__main__':
    # get texts
    number_of_texts = int(input())
    hashes = []
    for i in range(int(number_of_texts)):
        text = input().strip()
        text_hash = simhash.simhash(text)
        hashes.append(text_hash)

    candidates = lsh(hashes, number_of_texts)

    # get queries
    number_of_queries = int(input())
    for i in range(int(number_of_queries)):
        query = input().strip().split(" ")

        # process the query
        text_index = int(query[0])
        max_distance = int(query[1])
        target_text_hash = hashes[text_index]
        results = []
        if text_index in candidates:
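The lsh() helper that this snippet calls is cut off; the surviving fragment above only shows how texts sharing a band value are grouped into partitions. As a rough sketch of that banding idea (not the original code), assuming integer fingerprints of 128 bits split into 8 bands of 16 bits:

# Sketch of an LSH banding helper (assumed shape; band count and width are guesses):
# texts whose fingerprints agree on any one band become mutual candidates.
def lsh(hashes, number_of_texts, bands=8, band_bits=16):
    candidates = {i: set() for i in range(number_of_texts)}
    for band in range(bands):
        partitions = {}
        for current_id in range(number_of_texts):
            # value of this band of the fingerprint
            val = (hashes[current_id] >> (band * band_bits)) & ((1 << band_bits) - 1)
            partition_texts = partitions.get(val, set())
            for other_id in partition_texts:
                candidates[current_id].add(other_id)
                candidates[other_id].add(current_id)
            partition_texts.add(current_id)
            partitions[val] = partition_texts
    return candidates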
Example #11
def simhash(x):
    import simhash
    return simhash.simhash(x)
Example #12
    def process_IN_CREATE(self, event):
        print "Create file: %s" % os.path.join(event.path, event.name)
        print "simhash of new file is %s" % simhash.simhash(open(os.path.join(event.path, event.name)))
Example #13
#!/usr/bin/env python
# encoding: utf-8

import time
import getpath
import simhash
import os
import cPickle as pickle


time.clock()
rootdir = '/home/ted'
dirlist = getpath.fileso().getpath(rootdir, filter=['txt', 'doc'])
print dirlist
res = []
for i in dirlist:
    with open(i,'r') as f:
        content = f.read()
    res.append([i,str(simhash.simhash(content))])
res.sort(key=lambda a: a[1])
with open(os.path.join(os.getcwd(),'result'),'w') as f:
    f.write(pickle.dumps(res))
print time.clock()
Example #14
import os
import subprocess
import simhash as si
result = os.environ['PATH']
new_result = result + '//liu//text//'
os.environ['PATH'] = new_result  # putenv's arguments were reversed; assigning to os.environ also updates this process
print(os.environ['PATH'])


def find():

    pass


if __name__ == '__main__':
    s = 'To be or not to be ,this is a question'
    hash1 = si.simhash(s.split())

    s = 'whether to be is a question'
    hash2 = si.simhash(s.split())

    s = 'i have a question to say ,not to be or to be'
    hash3 = si.simhash(s.split())

    print(hash1.hamming_distance(hash2), "   ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "   ", hash1.similarity(hash3))
Example #15
def getFeature(dic, Corpus):
    corpus = Corpus[:5000]  # remember to increase this limit for larger corpora.
    sentences = [j[0] for j in corpus]
    hashedSentences = simhash.simhash(sentences, 32)
    B = hashedSentences

    # # you can try simhash directly, maybe it performs better.
    # hashedSentences = simhash.simhash(sentences, 128)
    # dataMat = np.array(hashedSentences)
    # lambda_, eigenVec_ = laplacian_eigenmap.laplacian_eigenmap(dataMat, 15, 32)

    input = getMat(dic, corpus)
    input = np.array(input)
    input = input.reshape(-1, 20 * 300)
    output = np.array(B)
    data = np.concatenate((input, output), axis=1)

    net = cnn_model.Net()
    criterion = cnn_model.nn.MSELoss()
    optimizer = cnn_model.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    print('Start Training CNN')

    i = 0
    running_loss = 0.0
    for epoch in range(50):  # loop over the dataset multiple times

        generator = cnn_model.batch_generator(data, 100)
        for inputs, labels in generator:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs, h = net(torch.Tensor(inputs.reshape(-1, 1, 20, 300)))
            loss = criterion(outputs, torch.Tensor(labels.reshape(-1, 32)))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            i += 1
            if i % 20 == 19:  # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 20))
                running_loss = 0.0

    print('Finished Training')

    represented = []
    length = len(Corpus)
    for i in range(0, length, 500):
        input = getMat(dic, Corpus[i:min(length, i + 500)])
        input = np.array(input)
        input = input.reshape(-1, 1, 20, 300)
        outputs, h = net(torch.Tensor(input))
        represented += h.data.numpy().tolist()
        print('(%d / %d) sentences have been embedded.' % (i, length))

    print('All sentences finished embedding.')

    return represented
Example #16
filter_list = []
for line in open("filter.txt"):
    filter_list.append(line.split("\t")[0])

token = dict()
frq = dict()
(token, frq) = find_all_instances("data-concept-instance-relations.txt",
                                  filter_list)

hash_data = []
simhash_output = open('simhash.txt', 'w')
answer = open('answer.txt', 'w')
inac = open("interaction.txt", 'w')

for name in filter_list:
    sim = simhash(name, token[name], frq[name])
    hash_data.append(sim)
    simhash_output.write(sim.name)
    simhash_output.write("\t")
    simhash_output.write(str(sim.hash))
    simhash_output.write('\n')

print len(hash_data)
for i in range(0, len(hash_data) - 1):
    if (i % 200 == 0):
        print i
    for j in (range(i + 1, len(hash_data))):
        if (hash_data[i].hamming_distance(hash_data[j]) >= LINE):
            sent = hash_data[i].name + '\t' + hash_data[j].name + '\n'
            answer.write(sent)
            find_interaction(inac, hash_data[i].name, hash_data[j].name,
Example #17
 #  a filter of what changes to notify.
 #
 # NB Tim Juchcinski reports that he needed to up
 #  the buffer size to be sure of picking up all
 #  events when a large number of files were
 #  deleted at once.
 #
 results = win32file.ReadDirectoryChangesW (
   hDir,
   1024,
   True,
   win32con.FILE_NOTIFY_CHANGE_FILE_NAME |
    win32con.FILE_NOTIFY_CHANGE_DIR_NAME |
    win32con.FILE_NOTIFY_CHANGE_ATTRIBUTES |
    win32con.FILE_NOTIFY_CHANGE_SIZE |
    win32con.FILE_NOTIFY_CHANGE_LAST_WRITE |
    win32con.FILE_NOTIFY_CHANGE_SECURITY,
   None,
   None
 )
 print results
 if results[0][0] == 1:
   pass
 for action, file in results:
   full_filename = unicode(os.path.join(path_to_watch,file))
   print ACTIONS.get(action, "Unknown")
   if action == 1:
     with open(full_filename,'r') as f:
       content = f.read()
     print u'simhash of the new file is ' + str(simhash.simhash(content))
Example #18
	html1 = getpage('http://www.2345.com/')
	html2 = getpage('http://www.hao123.com')

	st = time()
	t1 = gettags(html1)
	t2 = gettags(html2)
	print time() - st
	print t1
	print t2
	
	

	t1 = ['head', 'base', 'meta', 'title', 'meta', 'meta', 'meta', 'meta', 'meta', 'style', 'script', 'body', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'script', 'script', 'div']
	t2 = ['meta', 'title', 'meta', 'meta', 'meta', 'meta', 'meta', 'style', 'script', 'script', 'body', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'script', 'script', 'div']

	hash1 = simhash.simhash(t1)
	hash2 = simhash.simhash(t2)
	
	print hash1,hash2

	print issim(hash1,hash2)

	domain="11.aaa.com"

	t = time()

	mdomain = "aaa.com"
	ip = "1.2.3.4"
	hash = hash1

	update(mdomain,hash)
Example #19
    print time() - st
    print t1
    print t2

    t1 = [
        'head', 'base', 'meta', 'title', 'meta', 'meta', 'meta', 'meta',
        'meta', 'style', 'script', 'body', 'div', 'div', 'div', 'div', 'div',
        'div', 'div', 'div', 'div', 'div', 'div', 'script', 'script', 'div'
    ]
    t2 = [
        'meta', 'title', 'meta', 'meta', 'meta', 'meta', 'meta', 'style',
        'script', 'script', 'body', 'div', 'div', 'div', 'div', 'div', 'div',
        'div', 'div', 'div', 'div', 'div', 'script', 'script', 'div'
    ]

    hash1 = simhash.simhash(t1)
    hash2 = simhash.simhash(t2)

    print hash1, hash2

    print issim(hash1, hash2)

    domain = "11.aaa.com"

    t = time()

    mdomain = "aaa.com"
    ip = "1.2.3.4"
    hash = hash1

    update(mdomain, hash)
Example #20
# assuming that you have a dictionary with document id as the key and the document as the value:
# documents = { doc_id: doc } you can do:
from simhash import simhash

documents = {
    1: open('first.txt', 'r').read(),
    2: open('second.txt', 'r').read(),
    3: open('Tests/third.txt', 'r').read(),
    4: open('Tests/fourth.txt', 'r').read(),
}

def split_hash(s, num):
    return [s[start:start + num] for start in range(0, len(s), num)]

hashes = {}
for doc_id, doc in documents.items():
    print(doc_id)
    print(doc)
    hash = simhash(doc)

    # you can either use the whole hash for higher precision or split into chunks for higher recall
    hash_chunks = split_hash(hash, 4)

    for chunk in hash_chunks:
        if chunk not in hashes:
            hashes[chunk] = []
        hashes[chunk].append(doc_id)

# now you can print the duplicate documents:
for chunk, doc_list in hashes.items():
    if len(doc_list) > 1:
        print("Duplicate documents: ", doc_list)
Example #21
                for r in result:
                    varst += str(r.name)
                     
    return varst
    
skipexts = ['.gif', '.exe', '.pyc', '.o', '.a','.dll','.lib','.pdb','.mdb']        # ignore binary files
scanexts = ['.inc','.php'] 
strlen = 0 
lexer = phplex.lexer.clone()
lexer.filename = None

p = []
s1 = file('d:\\office\\2008.php').read()
log = Log()
import simhash
hash1 = simhash.simhash(getvarlist(s1))
if __name__ == '__main__':
    
    if os.name =='nt':
        os.system('color 0a')
        os.system('mode con cols=155 lines=300')
    args = sys.argv
    if len(args) == 2:
        scan(args[1])
    elif len(args) == 3:
        scan( args[2])
    elif len(args) == 4:
        scan(args[2])
    else:
        pass
Example #22
import random
import sys

from simhash import simhash

class Permute(object):
    def __init__(self, size, count):
        self.size = size
        self.count = count
        self.permutation = []
        ks = set()
        while len(self.permutation) < count:
            a = [i for i in range(size)]
            random.shuffle(a)
            k = '.'.join([str(x) for x in a])
            if k not in ks:
                self.permutation.append(a)
                ks.add(k)

    def permute(self, s):
        for i in range(self.count):
            yield ''.join([s[self.permutation[i][j]] for j in range(self.size)])

N = 128
sh = simhash(sys.argv[1], N)
print sh

p = Permute(size=7, count=4)
for x in p.permute('1234567'):
    print x
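The Permute class above generates a fixed set of random permutations; in simhash near-duplicate search such permutations are typically used to reorder every fingerprint the same way so that similar fingerprints end up adjacent after sorting. A minimal sketch of that use (not part of the original; it assumes each document's fingerprint is available as a fixed-length bit string, and the distance threshold is illustrative):

# Hypothetical usage: permute every fingerprint with the same stored permutations,
# sort the permuted strings, and only compare neighbours in sorted order.
def candidate_pairs(fingerprints, count=4, max_distance=3):
    size = len(next(iter(fingerprints.values())))
    permuter = Permute(size=size, count=count)
    pairs = set()
    for i in range(count):
        # apply the i-th stored permutation to every fingerprint, then sort
        permuted = sorted(
            (list(permuter.permute(bits))[i], name)
            for name, bits in fingerprints.items()
        )
        for (bits_a, name_a), (bits_b, name_b) in zip(permuted, permuted[1:]):
            # permutations preserve the Hamming distance between bit strings
            if sum(x != y for x, y in zip(bits_a, bits_b)) <= max_distance:
                pairs.add(tuple(sorted((name_a, name_b))))
    return pairs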