This repository has been archived by the owner on Jun 19, 2023. It is now read-only.
forked from openspending/dpkg-uk25k
/
cleanup.py
79 lines (66 loc) · 2.54 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sqlaload as sl
from common import *
from common import issue as _issue
import cleanup_dates
import cleanup_numbers
import cleanup_gov
import cleanup_supplier
log = logging.getLogger('cleanup')
def issue(engine, resource_id, resource_hash, message, data=None):
    """Record a cleanup-stage issue for a resource.

    Thin wrapper around ``common.issue`` that pins the stage name to
    ``'cleanup'``.

    :param engine: SQLAlchemy engine used to store the issue.
    :param resource_id: identifier of the resource the issue concerns.
    :param resource_hash: retrieve-hash of the resource at issue time.
    :param message: human-readable issue description.
    :param data: optional extra payload passed through to ``_issue``
        (callers in this file pass a ``repr()`` string or nothing).
    """
    # Fix: the original used a mutable default ``data={}``, which is shared
    # across calls. Use None as the sentinel and materialise a fresh dict so
    # the value handed to _issue is unchanged for callers that omit `data`.
    _issue(engine, resource_id, resource_hash, 'cleanup',
           message, data={} if data is None else data)
def cleanup_sheet(engine, row, sheet_id):
    """Clean up one sheet of a resource's spending data.

    Loads all spending rows for the (resource, sheet) pair, detects the
    date formats used, and — inside a single transaction — deletes the old
    rows and re-inserts them with dates, numbers and government fields
    normalised.

    :param engine: SQLAlchemy engine.
    :param row: source-table row for the resource (dict-like; must contain
        ``resource_id`` and ``retrieve_hash``).
    :param sheet_id: index of the sheet within the resource.
    :return: ``True`` on success, ``False`` if date formats could not be
        detected (an issue is recorded in that case).
    """
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            # Fix: discard the never-used transaction explicitly instead of
            # relying on the implicit rollback at connection.close().
            trans.rollback()
            return False
        sl.delete(connection, spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        # Fix: the original reused the name `row` here, shadowing the
        # resource-row parameter; use a distinct name for the data records.
        for record in data:
            record = cleanup_dates.apply(record, date_formats)
            record = cleanup_numbers.apply(record)
            record = cleanup_gov.apply(record)
            #record = cleanup_supplier.apply(record, engine)
            # Drop the primary key so the re-insert gets a fresh id.
            del record['id']
            sl.add_row(connection, spending_table, record)
        trans.commit()
        return True
    except Exception:
        # Fix: roll back explicitly so a half-done delete/re-insert is not
        # left pending before the connection is closed, then re-raise.
        trans.rollback()
        raise
    finally:
        connection.close()
def cleanup_resource(engine, source_table, row, force):
    """Clean up every sheet of a single resource and record the outcome.

    Skips resources whose combine stage failed, and (unless ``force`` is
    set) resources whose current combine output has already been cleaned.
    The aggregate status written back is True only if every sheet
    succeeded.

    :param engine: SQLAlchemy engine.
    :param source_table: the ``source`` table handle.
    :param row: source-table row for the resource.
    :param force: re-run cleanup even if already done for this hash.
    """
    if not row['combine_status']:
        return
    if not force:
        # Already cleaned at the current combine hash? Then nothing to do.
        done = sl.find_one(engine, source_table,
                           resource_id=row['resource_id'],
                           cleanup_status=True,
                           cleanup_hash=row['combine_hash'])
        if done is not None:
            return
    log.info("Cleanup: %s, Resource %s", row['package_name'], row['resource_id'])
    # Build the full result list first: every sheet is processed even after
    # an earlier failure, and the overall status is the conjunction.
    results = [cleanup_sheet(engine, row, sheet_id)
               for sheet_id in range(row['sheets'])]
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'cleanup_hash': row['combine_hash'],
        'cleanup_status': all(results),
    }, unique=['resource_id'])
def cleanup_all(force=False):
    """Run the cleanup stage over every resource in the source table.

    :param force: when True, re-clean resources even if their current
        combine hash was already cleaned.
    """
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for source_row in sl.find(engine, source_table):
        cleanup_resource(engine, source_table, source_row, force)
if __name__ == '__main__':
    # Script entry point: clean all resources without forcing re-runs.
    cleanup_all(force=False)