/
sel_dupl.py
executable file
·116 lines (73 loc) · 2.85 KB
/
sel_dupl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This is the shape in which the results are stored:
# [[query1, [duplicate1, duplicate2]],
#  [query2, [duplicate1, duplicate2]],
#  ...and so on
#
# ]
# Duplicate search!
import psycopg2, time
from pg import escape_string
# Connection settings for the bibliography database.
# NOTE(review): the password is hard-coded in the source; consider moving the
# credentials out of the file (environment variables or ~/.pgpass).
dbname = "bibliography"
user = 'annndrey'
host = "piggy"
passwd = "andreygon"
col = 'source'  # not referenced anywhere in this script; kept for compatibility

# Trigram lookup: every dupl_table.name_orig similar to the bound name, best
# match first.  "%%" is the escaped form of the "%" similarity operator (as
# provided by PostgreSQL's pg_trgm extension, together with similarity()).
SIMILAR_NAMES_SQL = """select distinct name_orig, similarity(name_orig, %s) from dupl_table where name_orig %% %s order by similarity(name_orig, %s) desc"""


def is_candidate(name):
    """Return True when *name* is worth a similarity lookup.

    A name qualifies when it is not None and contains at least one space,
    i.e. it splits into more than one part (which also rules out "").
    """
    return name is not None and " " in name


def format_matches(rows):
    """Render (name, similarity) rows as "name, score" strings."""
    return ["%s, %f" % (row[0], row[1]) for row in rows]


def find_duplicates(cur, names):
    """Collect the similar names found in dupl_table for every candidate.

    cur   -- an open DB-API cursor (needs execute() and fetchall()).
    names -- iterable of article names; non-candidates are skipped.

    Returns a list of [name, ["match, score", ...]] entries, one per name
    whose lookup produced more than one row.  Each entry is also printed as
    soon as it is found.
    """
    results = []
    for name in names:
        if not is_candidate(name):
            continue
        # The driver quotes bound parameters itself, so the raw value is
        # passed; escaping it beforehand would double-escape quotes and
        # backslashes and make such names impossible to match.
        cur.execute(SIMILAR_NAMES_SQL, (name, name, name))
        rows = cur.fetchall()
        # A lone row is presumably the name matching itself; only report
        # names that have at least one more similar entry.
        if len(rows) > 1:
            entry = [name, format_matches(rows)]
            results.append(entry)
            print("%s %s" % (entry[0], entry[1]))
    # The names list is deliberately not modified while it is being walked:
    # removing the current element mid-iteration skipped the following name.
    return results


def main():
    """Scan all distinct article names and print their likely duplicates."""
    conn = psycopg2.connect("dbname='%s' user='%s' host='%s' password='%s'" % (dbname, user, host, passwd))
    try:
        cur = conn.cursor()
        cur.execute("select distinct name_orig from articles")
        # NULL names can never be candidates; dropping them before sorting
        # also keeps the sort from comparing None with strings.
        names = sorted(row[0] for row in cur.fetchall() if row[0] is not None)

        # duplicates.txt is still created/truncated as before.  Nothing is
        # written to it: the earlier code that logged every candidate (rank,
        # uid, name) and a final count to this file is disabled.
        with open("duplicates.txt", "w"):
            pass

        for entry in find_duplicates(cur, names):
            print(entry)
    finally:
        # Always release the connection, even when a query fails.
        conn.close()


if __name__ == "__main__":
    main()