Ejemplo n.º 1
0
def hash_all():
    """Hash every eligible HtmlContent row and record its duplicate status.

    Rows are selected with ``status <= 2`` and a non-empty ``content``.
    For each row the tokenized content is hashed with
    ``simhash.hash_tokenpy``; the row's ``status`` becomes 0 when
    ``find_duplicate(c, h)`` returns 0 and 1 otherwise, the hash is stored
    on the row, the row is saved, and the hash is inserted into the
    module-level ``c``.

    NOTE(review): ``c`` is defined outside this function; presumably it is
    the index that ``find_duplicate`` searches -- confirm.
    """
    eligible = HtmlContent.objects.filter(status__lte=2)
    eligible = eligible.filter(~Q(content=''))
    for row in eligible:
        tokens = list(Tokenize(row.content))
        digest = simhash.hash_tokenpy(tokens)
        # The lookup runs before the insert below, so a row is never
        # compared against its own hash.
        row.status = 0 if find_duplicate(c, digest) == 0 else 1
        row.hash = digest
        row.save()
        c.insert(digest)
Ejemplo n.º 2
0
def hash_all():
    """Compute and persist a simhash for each pending HtmlContent record.

    Iterates over records whose ``status`` is at most 2 and whose
    ``content`` is not the empty string. Each record gets:

    * ``hash``   -- ``simhash.hash_tokenpy`` of its tokenized content
    * ``status`` -- 0 if ``find_duplicate(c, hash)`` returned 0, else 1

    The record is then saved and its hash is inserted into the
    module-level ``c`` (defined elsewhere in the file).
    """
    pending = HtmlContent.objects.filter(status__lte=2)
    pending = pending.filter(~Q(content=''))
    for record in pending:
        fingerprint = simhash.hash_tokenpy(list(Tokenize(record.content)))

        # Query the existing hashes first; this record's own hash is only
        # inserted after it has been saved.
        lookup = find_duplicate(c, fingerprint)
        record.status = 0 if lookup == 0 else 1

        record.hash = fingerprint
        record.save()
        c.insert(fingerprint)
Ejemplo n.º 3
0
    sim_server.delete(dels)
# Ad-hoc comparison script (Python 2): hashes the same two records with three
# different hash functions and prints each pair of hashes followed by the
# value of corpus.distance() for that pair.
# The hash_test() call below is currently disabled.
#hash_test()

# Look up the two HtmlContent records to compare, by primary key.
obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
# Materialize the token streams once so every hash function below receives
# the same token lists.
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
# Variant 1: simhashpy(tokens, 64).
# NOTE(review): 64 is presumably the hash width in bits -- confirm.
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1,h2
print corpus.distance(h1,h2)
# Variant 2: simhash.hash_token.
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1,h2
print corpus.distance(h1,h2)
# Variant 3: simhash.hash_tokenpy.
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1,h2
print corpus.distance(h1,h2)

# The triple-quoted block below is a bare string literal, so the code inside
# it never runs. It holds a disabled experiment that hashes two strings made
# of the same words in a different order.
# NOTE(review): inside that snippet h2 is reassigned to hash_token(t1), so it
# would compare t1's hash with itself rather than with t2's -- likely
# unintended; check before re-enabling.
'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhash.hash_token(t1)
h2 = simhash.hash_token(t2)
h2 = simhash.hash_token(t1)
print h1,h2
print corpus.distance(h1,h2)
'''
Ejemplo n.º 4
0
# Ad-hoc comparison script (Python 2): runs three hash functions over the
# same pair of records and prints, for each function, the two hashes and the
# result of corpus.distance() on them.
# The hash_test() call below is currently disabled.
#hash_test()

# Fetch the two HtmlContent records under comparison by primary key.
obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
# Tokenize each record's content once; the resulting lists are reused by all
# three hash variants.
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
# Variant 1: simhashpy(tokens, 64).
# NOTE(review): 64 is presumably the hash width in bits -- confirm.
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)
# Variant 2: simhash.hash_token.
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)
# Variant 3: simhash.hash_tokenpy.
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)
# The triple-quoted block below is only a string literal and is never
# executed. It preserves a disabled experiment on two strings that contain
# the same words in a different order.
# NOTE(review): within that snippet h2 is overwritten with hash_token(t1), so
# the printed distance would be between t1's hash and itself -- likely
# unintended; check before re-enabling.
'''
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhash.hash_token(t1)
h2 = simhash.hash_token(t2)
h2 = simhash.hash_token(t1)
print h1,h2
print corpus.distance(h1,h2)
'''