parse.py (forked from mbijon/parcels)
import csv
import logging
import os
import re
import shutil
import sys

import config
from openaddr.jobs import setup_logger
from utils import fetch, unzip, rlistdir, import_with_fiona, import_csv

_L = logging.getLogger('openaddr.parcels')

# Parcel geometries can be very long WKT strings; raise the csv module's
# default field size limit so they round-trip intact.
csv.field_size_limit(sys.maxsize)

setup_logger()


def parse_source(source, idx, header):
    """
    Import data from a single source based on its data type.
    """
    path = '{}/{}'.format(config.workspace_dir, idx)
    if not os.path.exists(path):
        os.makedirs(path)

    cache_url = source[header.index('cache')]
    cache_filename = re.search(r'/[^/]*$', cache_url).group()
    fetch(cache_url, path + cache_filename)

    # Unpack anything that looks like an archive; some cached files
    # carry a mislabelled extension.
    files = rlistdir(path)
    for f in files:
        if re.match(r'.*\.(zip|obj|exe)$', f):
            unzip(f, path)

    # Import every file fiona can read, plus plain CSV.
    shapes = []
    files = rlistdir(path)
    for f in files:
        if re.match(r'.*\.({})$'.format('|'.join(config.fiona_extensions)), f):
            objs = import_with_fiona(f, source[0])
            for obj in objs:
                shapes.append(obj)
        elif re.match(r'.*\.csv$', f):
            objs = import_csv(f, source[0])
            for obj in objs:
                shapes.append(obj)

    shutil.rmtree(path)

    if not shapes:
        _L.warning('failed to parse source. did not find shapes. '
                   'files in archive: {}'.format(files))
    return shapes
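
# A minimal usage sketch (assumes the state file has already been loaded via
# load_state() below; the row index doubles as the scratch-directory name
# under config.workspace_dir):
#
#   state, header = load_state()
#   shapes = parse_source(state[0], 0, header)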


def writeout(fp, data):
    """
    Write rows of parsed shape data to an open csv file.
    The caller is responsible for closing fp.
    """
    # Every row shares one key set, so the first row defines the header.
    writer = csv.DictWriter(fp, fieldnames=list(data[0].keys()))
    writer.writeheader()
    for row in data:
        writer.writerow(row)
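
# Sketch of the expected input: a non-empty list of dicts sharing one key set.
# The keys below are hypothetical, not a schema this project defines:
#
#   rows = [{'id': '1', 'wkt': 'POLYGON ((0 0, 1 0, 1 1, 0 0))'}]
#   with open('example.csv', 'w', newline='') as fp:
#       writeout(fp, rows)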


def parse_statefile(state, header):
    """
    Import all available data from the state file.

    Errors from individual sources are caught, logged as warnings,
    and skipped, so one bad source cannot halt the whole run.
    """
    ct = 0
    for idx, source in enumerate(state):
        try:
            data = parse_source(source, idx, header)
            if data:
                filename = re.sub(r'\.[^\.]*$', '.csv', source[header.index('source')])
                path = '{}/{}'.format(config.output_dir, re.sub(r'\/[^\/]*$', '', filename))
                if not os.path.exists(path):
                    os.makedirs(path)
                # newline='' stops csv.writer adding blank rows on Windows.
                with open('{}/{}'.format(config.output_dir, filename), 'w', newline='') as wkt_file:
                    writeout(wkt_file, data)
                ct += 1
        except Exception as e:
            _L.warning('error parsing source. {}'.format(e))
        _L.info('parsed {} [{}/{}]'.format(idx + 1, ct, len(state)))
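
# Output layout sketch: each source path from the state file is mirrored under
# config.output_dir with its extension swapped to .csv, e.g. a hypothetical
# source named us/xx/a.json lands at {output_dir}/us/xx/a.csv.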


def load_state():
    """
    Load a python representation of the state file: the data rows and
    the header row that names their columns.
    """
    state = []
    with open(config.statefile_path, 'r') as statefile:
        statereader = csv.reader(statefile, dialect='excel-tab')
        for row in statereader:
            state.append(row)
    header = state.pop(0)
    return state, header
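
# The state file is tab-separated. A sketch of the columns this script relies
# on (names taken from the header lookups used elsewhere in this file; the
# path and URL values are made up, and other columns are ignored):
#
#   source          cache                       geometry type
#   us/xx/a.json    http://example.com/a.zip    Polygon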


def filter_polygons(state, header):
    """
    Remove any non-polygon sources from the state file.

    We are only interested in parsing parcel data, which is marked
    as Polygon in the state file.
    """
    geom_idx = header.index('geometry type')
    return [source for source in state if 'Polygon' in source[geom_idx]]
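
# The substring test keeps multi-polygon sources too:
#
#   >>> 'Polygon' in 'Polygon', 'Polygon' in 'MultiPolygon', 'Polygon' in 'Point'
#   (True, True, False)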


if __name__ == '__main__':
    # Fetch the master state file on first run; it lists every known source.
    if not os.path.isfile(config.statefile_path):
        fetch(config.state_url, config.statefile_path)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    raw_state, header = load_state()
    state = filter_polygons(raw_state, header)
    parse_statefile(state, header)