parse_code_2015-06.py
forked from JoshData/dc-code-prototype-tools

import os, os.path, sys, json, time, re
import lxml.etree as etree
from parsers import Parser, _make_node, _para_text_content
from worddoc import open_docx
import matchers
import copy
div_re = re.compile(r'(?P<div>\w+)\.docx$')
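# Matches the trailing word of a division filename, e.g. group('div') is
# 'VIII' for './2015-06/Division VIII.docx'.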

def parse_file(dom, path_to_file, start_para_index):
    # Open the Word file. Use a cached json file if it exists,
    # since that's faster than opening the raw .docx file.
    print('\nparsing {}'.format(path_to_file), file=sys.stderr)
    fhash = _hashfile(path_to_file)
    doc = None
    tmp_doc = "/tmp/doc.cache.{}.json".format(fhash)
    if os.path.exists(tmp_doc):
        print('loading from', tmp_doc, file=sys.stderr)
        with open(tmp_doc) as doccache:
            doc = json.load(doccache)
        # Cached paragraphs already carry their indexes; just advance the
        # running counter so the next file continues the numbering.
        for section in doc['sections']:
            start_para_index += len(section['paragraphs'])
    else:
        print('saving to', tmp_doc, file=sys.stderr)
    if doc is None:
        doc = open_docx(path_to_file, pict=pict_handler)
        div_re.search('./2015-06/Division VIII.docx').group('div')  # result unused
        # Give every paragraph a document-wide index, continuing from the
        # previous file, then cache the extracted document as JSON.
        for section in doc['sections']:
            for para_index, para in enumerate(section["paragraphs"], start_para_index):
                para['index'] = para_index
            start_para_index += len(section['paragraphs'])
        with open(tmp_doc, "w") as doccache:
            json.dump(doc, doccache, indent=2)
    try:
        # Parse each section.
        for section in doc["sections"]:
            parse_doc_section(section, dom)
    except Exception:
        import traceback
        traceback.print_exc()
    return start_para_index

def main():
    # Form the output DOM.
    dom = etree.Element("code")
    _make_node(dom, "heading", "Code of the District of Columbia")
    meta = _make_node(dom, "meta", None)

    # Recency metadata may be passed as the optional second command-line
    # argument; otherwise this default block is used.
    recency = etree.fromstring(sys.argv[2] if len(sys.argv) > 2 else """
        <recency>
          <law>
            <law>20-241</law>
            <effective>2015-04-13</effective>
          </law>
          <emergency>
            <law>20-617</law>
            <effective>2015-01-28</effective>
          </emergency>
          <federal>
            <law>113-235</law>
            <effective>2014-12-16</effective>
          </federal>
        </recency>
        """)
    meta.append(recency)

    start_time = time.time()

    # The first argument is either a directory of .docx files or a single file.
    DIR = sys.argv[1]
    try:
        all_file_names = os.listdir(DIR)
    except NotADirectoryError:
        file_paths = [DIR]
    else:
        file_paths = [os.path.join(DIR, fn) for fn in all_file_names if fn.endswith('.docx')]

    start_para_index = 0
    for fp in file_paths:
        start_para_index = parse_file(dom, fp, start_para_index)
    # print(time.time() - start_time)

    # Output, being careful we get UTF-8 to the byte stream.
    sys.stdout.buffer.write(etree.tostring(dom, pretty_print=True, encoding="utf-8", xml_declaration=True))

def pict_handler(node):
    # Placeholder emitted wherever the .docx contains an embedded picture.
    return "@@PICT@@"


def _hashfile(filepath):
    # SHA-1 of the file contents, used as the cache key for /tmp/doc.cache.*.json.
    import hashlib
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as f:
        sha1.update(f.read())
    return sha1.hexdigest()

def parse_doc_section(section, dom):
    def prep_para(para):
        para['text'] = _para_text_content(para)

        def next_para():
            # Look ahead to the next paragraph in this section, prepping a
            # deep copy of it and skipping over empty paragraphs.
            paras = section['paragraphs']
            next_index = para['index'] - paras[0]['index'] + 1
            if next_index >= len(paras):
                return None
            next_p = prep_para(copy.deepcopy(paras[next_index]))
            if matchers.empty(next_p):
                next_p = next_p['next']()
            return next_p

        para['next'] = next_para
        return para

    parser = Parser(dom)
    unhandled_count = 0
    handled_count = 0
    for para in section["paragraphs"]:
        prep_para(para)
        if not para['text']:
            continue
        success = parser(para)
        if not success and para['text']:
            unhandled_count += 1
            print('unhandled para {}:'.format(para['index']), para, '\n', file=sys.stderr)
        elif success:
            handled_count += 1
    print('handled paras: {}'.format(handled_count), file=sys.stderr)
    print('unhandled paras: {}'.format(unhandled_count), file=sys.stderr)


if __name__ == '__main__':
    main()
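
# Example invocation (a sketch, not taken from the original repository; it
# assumes the division .docx files live in a directory such as ./2015-06/ and
# that the optional second argument is a <recency> XML fragment like the
# default above):
#
#   python3 parse_code_2015-06.py ./2015-06/ > code.xml
#   python3 parse_code_2015-06.py ./2015-06/Division\ VIII.docx "<recency>...</recency>" > code.xml
#
# Progress messages and unhandled-paragraph warnings go to stderr; the
# assembled XML document goes to stdout.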