-
Notifications
You must be signed in to change notification settings - Fork 0
/
gexf_change_ref.py
executable file
·191 lines (182 loc) · 6.04 KB
/
gexf_change_ref.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
sys.path.append('lib')
#import networkx as nx
from mog_op import NewMongoOp
from glob import glob
import unicodecsv
import simstring
import xml.etree.ElementTree as ET
import logging
logging.basicConfig(level=logging.DEBUG,\
format="%(asctime)s %(levelname)s %(message)s")
import re
import os
def read_csv():
    """Load the law-abbreviation table from ``abbreviation.csv``.

    Each CSV row is keyed by its first cell; the stored value is the list of
    all non-empty cells of that row (the first cell included).

    Returns:
        dict mapping the first cell of every row to that row's non-empty
        cells.
    """
    law_dict = {}
    # unicodecsv is handed a binary handle; the context manager guarantees
    # the handle is closed even if parsing fails part-way through.
    with open('abbreviation.csv', 'rb') as fh:
        for row in unicodecsv.reader(fh, encoding='utf-8'):
            if not row:
                # A row with no cells has no first column to key on.
                continue
            law_dict[row[0]] = [cell for cell in row if cell != '']
    return law_dict
def get_mp_data(mp, law_dict, db_name='laws_2018-09-24'):
    """Build a name -> href lookup from the ``raw_laws`` collection.

    Every document's ``title`` is mapped to its ``href``; each alias listed
    for that title in ``law_dict`` is mapped to the same ``href``.

    Args:
        mp: object whose ``con`` attribute is indexable by database name and
            yields an object with a ``raw_laws`` collection supporting
            ``find(query, projection)``.
        law_dict: dict mapping a title to a list of alternative names.
        db_name: database to read from; defaults to the snapshot the script
            was originally written against.

    Returns:
        dict mapping titles and aliases to hrefs. Later documents overwrite
        earlier ones when names collide.
    """
    col = mp.con[db_name].raw_laws
    dkt = {}
    for doc in col.find({}, {'title': 1, 'href': 1}):
        title = doc['title']
        href = doc['href']
        dkt[title] = href
        for alias in law_dict.get(title, []):
            # Byte-string aliases are normalised to text so that all keys
            # of the result share one type.
            if isinstance(alias, bytes):
                alias = alias.decode('utf-8')
            dkt[alias] = href
    return dkt
def write_simstring(dkt):
    """Write every key of ``dkt`` into the simstring database.

    The database is created at ``simstring_law/law.db``. Keys are inserted
    as UTF-8 encoded byte strings.

    Args:
        dkt: mapping (or any iterable) whose keys are the strings to index.
    """
    dbpath = 'simstring_law/law.db'
    # Positional arguments are passed straight to simstring.writer.
    # NOTE(review): per the simstring Python binding these look like
    # n-gram size, begin/end marks and unicode mode -- confirm.
    db = simstring.writer(dbpath, 3, False, True)
    try:
        for key in dkt:
            if not isinstance(key, bytes):
                key = key.encode('utf-8')
            db.insert(key)
    finally:
        # Without an explicit close the writer is never finalised.
        db.close()
def read_simstring():
    """Open ``simstring_law/law.db`` for approximate string retrieval.

    The reader is configured with the cosine measure and a threshold of 0.9.

    Returns:
        the configured ``simstring.reader`` instance.
    """
    reader = simstring.reader('simstring_law/law.db')
    reader.measure = simstring.cosine
    reader.threshold = 0.9
    return reader
def read_xml(f, dkt2):
    """Collect ``label -> url`` pairs from the nodes of one GEXF file.

    For every GEXF 1.3 ``node`` element, each ``attvalue`` below it whose
    ``for`` attribute equals ``'url'`` contributes its ``value`` under the
    node's ``label``. If a node carries several such attvalues, the last
    one wins.

    Args:
        f: path of the GEXF file to parse.
        dkt2: dict updated in place with the collected pairs.
    """
    logging.info("read={}".format(f))
    root = ET.ElementTree(file=f).getroot()
    for node in root.findall(".//{http://www.gexf.net/1.3}node"):
        label = node.get('label')
        # Normalise byte-string labels to text so keys share one type.
        if isinstance(label, bytes):
            label = label.decode('utf-8')
        # Element.iter() replaces the deprecated getiterator().
        for attr in node.iter("{http://www.gexf.net/1.3}attvalue"):
            if attr.get('for') == 'url':
                dkt2[label] = attr.get('value')
def write_xml(f, url_match):
    """Rewrite node URLs of one GEXF file and save a copy under ``temp_xml/``.

    For every GEXF 1.3 ``node`` whose ``label`` has a truthy entry in
    ``url_match``, each ``attvalue`` below it with ``for == 'url'`` gets its
    ``value`` replaced by that entry. The input file is not modified; the
    result is written to ``temp_xml/<basename of f>``.

    Args:
        f: path of the GEXF file to read.
        url_match: dict mapping node labels to replacement URLs.
    """
    logging.info("write={}".format(f))
    tree = ET.ElementTree(file=f)
    # Registered so the serialiser emits the GEXF namespaces with these
    # prefixes instead of auto-generated ones.
    ET.register_namespace('', 'http://www.gexf.net/1.3')
    ET.register_namespace('viz', 'http://www.gexf.net/1.3/viz')
    for node in tree.getroot().findall(".//{http://www.gexf.net/1.3}node"):
        label = node.get('label')
        if isinstance(label, bytes):
            label = label.decode('utf-8')
        new_url = url_match.get(label)
        if not new_url:
            continue
        # Element.iter() replaces the deprecated getiterator().
        for attr in node.iter("{http://www.gexf.net/1.3}attvalue"):
            if attr.get('for') == 'url':
                attr.set('value', new_url)
    # Create the output directory on first use rather than failing in
    # tree.write() when it does not exist yet.
    if not os.path.isdir('temp_xml'):
        os.makedirs('temp_xml')
    outf = "temp_xml/{}".format(os.path.basename(f))
    logging.info(outf)
    tree.write(outf, encoding='utf-8')
def combine(dkt, dkt2, db):
    """Match GEXF node labels to new URLs, with a fuzzy fallback.

    For each label in ``dkt2`` the new URL is looked up in ``dkt`` by exact
    name. When that fails, ``db.retrieve`` is queried with the UTF-8 encoded
    label and every returned candidate is looked up in ``dkt``; the last
    candidate that resolves wins.

    Args:
        dkt: dict mapping law names to new URLs.
        dkt2: dict mapping GEXF node labels to their current URLs.
        db: object with a ``retrieve(query)`` method returning candidate
            names for a UTF-8 byte-string query.

    Returns:
        dict mapping each matched label (same key object as in ``dkt2``) to
        its new URL.

    Raises:
        AssertionError: if a matched label's current URL does not contain
            ``http://law.e-gov`` or its new URL does not contain
            ``http://elaws.e-gov``.
    """
    not_match = []
    url_match = {}
    for label in dkt2:
        old_url = dkt2[label]
        new_url = dkt.get(label)
        if not new_url:
            if label is None:
                # Nothing to query the similarity database with.
                not_match.append(label)
                continue
            # Only the query is encoded; the label itself stays untouched so
            # that it can still be used as the result key and in log output.
            if isinstance(label, bytes):
                query = label
            else:
                query = label.encode('utf-8')
            found = False
            for candidate in db.retrieve(query):
                found = True
                # Candidates may come back as UTF-8 bytes while dkt is keyed
                # by text; decode before the lookup.
                if isinstance(candidate, bytes):
                    candidate = candidate.decode('utf-8')
                hit = dkt.get(candidate)
                if hit:
                    new_url = hit
            if not found:
                not_match.append(label)
        if not new_url:
            continue
        # Sanity checks are raised explicitly so they survive ``python -O``.
        if old_url:
            if not re.search(r'http://law\.e-gov', old_url):
                raise AssertionError(
                    u'unexpected old url: {}'.format(old_url))
        else:
            logging.info(u'k={}'.format(label))
        if not re.search(r'http://elaws\.e-gov', new_url):
            raise AssertionError(u'unexpected new url: {}'.format(new_url))
        url_match[label] = new_url
    logging.info(len(not_match))
    return url_match
def modify_link():
    """Rewrite the law URLs of every GEXF data file.

    Steps: load the abbreviation table, build the name -> URL lookup from
    MongoDB, open the simstring database, collect the labels and current
    URLs from all ``*.gexf`` files, match them via ``combine`` and write the
    updated files through ``write_xml``.
    """
    mp = NewMongoOp()
    law_dict = read_csv()
    dkt = get_mp_data(mp, law_dict)
    # The simstring database is only read here; it has to be built
    # beforehand with write_simstring(dkt).
    db = read_simstring()
    # Resolve the file list once so both passes see the same files.
    gexf_files = glob('../visualized_laws_web/app/data/*.gexf')
    dkt2 = {}
    for f in gexf_files:
        read_xml(f, dkt2)
    url_match = combine(dkt, dkt2, db)
    for f in gexf_files:
        write_xml(f, url_match)
class ImgSize(object):
    """Record describing one GEXF node: source file, label, URL and size."""

    def __init__(self, label, f):
        """Store ``label`` and ``f``; ``u0`` starts as None and ``size`` as 0."""
        self.label = label
        self.f = f
        self.size = 0
        self.u0 = None

    def __str__(self):
        """Return a one-line summary of all four fields."""
        template = u"file={} l={} u0={} size={}"
        fields = (self.f, self.label, self.u0, self.size)
        return template.format(*fields)
def write_alone(f):
    """Log large nodes of one GEXF file that still carry an old-style URL.

    A node is logged when one of its ``attvalue`` elements with
    ``for == 'url'`` has a value containing ``http://law.e-gov.go.jp/`` and
    the ``value`` of its first ``viz:size`` element, truncated to an int, is
    greater than 30. Despite the name, nothing is written to disk.

    Args:
        f: path of the GEXF file to inspect.
    """
    root = ET.ElementTree(file=f).getroot()
    ET.register_namespace('', 'http://www.gexf.net/1.3')
    ET.register_namespace('viz', 'http://www.gexf.net/1.3/viz')
    for node in root.findall(".//{http://www.gexf.net/1.3}node"):
        img = ImgSize(node.get('label'), f)
        # Element.iter() replaces the deprecated getiterator().
        for attr in node.iter("{http://www.gexf.net/1.3}attvalue"):
            if attr.get('for') != 'url':
                continue
            u0 = attr.get('value')
            # Every dot is escaped so only the literal host name matches;
            # an attvalue without a value is skipped instead of crashing.
            if u0 and re.search(r'http://law\.e-gov\.go\.jp/', u0):
                img.u0 = u0
        # A node without a viz:size element keeps the default size of 0.
        size_elem = next(node.iter("{http://www.gexf.net/1.3/viz}size"), None)
        size = size_elem.get('value') if size_elem is not None else None
        if size:
            img.size = int(float(size))
        if img.u0 and img.size > 30:
            logging.info(u'img={}'.format(img))
def write_https(f):
    """Switch ``elaws.e-gov.go.jp`` URLs of one GEXF file to https, in place.

    In every ``attvalue`` with ``for == 'url'`` found below a GEXF 1.3
    ``node``, each occurrence of ``http://elaws.e-gov.go.jp/`` in ``value``
    is replaced by ``https://elaws.e-gov.go.jp/``. The file at ``f`` is
    then overwritten with the result.

    Args:
        f: path of the GEXF file to update.
    """
    logging.info("write={}".format(f))
    tree = ET.ElementTree(file=f)
    # Registered so the serialiser emits the GEXF namespaces with these
    # prefixes instead of auto-generated ones.
    ET.register_namespace('', 'http://www.gexf.net/1.3')
    ET.register_namespace('viz', 'http://www.gexf.net/1.3/viz')
    for node in tree.getroot().findall(".//{http://www.gexf.net/1.3}node"):
        # Element.iter() replaces the deprecated getiterator().
        for attr in node.iter("{http://www.gexf.net/1.3}attvalue"):
            if attr.get('for') != 'url':
                continue
            u0 = attr.get('value')
            if u0 is None:
                # No value attribute: nothing to rewrite.
                continue
            u0 = re.sub(r'http://elaws\.e-gov\.go\.jp/',
                        'https://elaws.e-gov.go.jp/', u0)
            attr.set('value', u0)
    tree.write(f, encoding='utf-8')
def gleaning():
    """Run ``write_https`` on every ``*.gexf`` file in the web app data dir."""
    pattern = '../visualized_laws_web/app/data/*.gexf'
    for gexf_path in glob(pattern):
        write_https(gexf_path)
def main():
    """Script entry point; currently delegates to ``gleaning``."""
    gleaning()


if __name__ == '__main__':
    main()