forked from dmil/iCorruptionHack
/
ingester.py
179 lines (144 loc) · 7.02 KB
/
ingester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""
Push new data to database.
"""
import datetime, os, re
import hashlib
from peewee import *
from app import db
from models import File, Contribution, ContributionChanges, ContributionHistory
from flask_peewee.utils import get_dictionary_from_model
from dateutil.parser import parse as dateparse
from datadiff import diff
from blessings import Terminal
t = Terminal()
def sha1OfFile(filepath):
    '''Return the hex SHA-1 digest of the file at *filepath*.'''
    digest = hashlib.sha1()
    with open(filepath, 'rb') as fh:
        # Feed the hash in fixed-size chunks so huge dumps don't have to
        # fit in memory at once; the resulting digest is identical.
        for chunk in iter(lambda: fh.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def parse_fec_file(infile):
    '''Parse one pipe-delimited FEC contributions file into row dicts
    suitable for Contribution inserts.'''
    filename = infile.split("/")[-1]
    # Election cycle comes from the filename, e.g. '..._2013_2014.txt' -> '2013-2014'.
    cycle = re.match(r"^.*(\d{4}_\d{4}).\w{3}$", filename, re.I).groups()[0].replace("_", "-")
    # Download date comes from the parent directory name
    # (e.g. 'downloaded_2014_02_17' -> 2014-02-17).
    dirname = os.path.dirname(infile).split("/")[-1]
    date = dateparse(dirname.replace("downloaded_", "").replace("_", "-")).date()
    with open(infile) as f:
        parsed = [parse_line(raw) for raw in f]
    return [row_to_dict(vals, cycle, date) for vals in parsed]
def parse_line(l):
    '''Split one pipe-delimited FEC record into a list of typed values.

    Empty fields become None; file number (col 18) and sub id (col 21)
    become ints, transaction amount (col 15) a float, and the transaction
    date (col 14, MMDDYYYY) a datetime.  All other fields stay strings.
    '''
    vals = l.split('|')
    # Normalize empty strings to None.  Use a list comprehension rather than
    # map(): under Python 3 map() returns a lazy iterator, which would break
    # the index assignments below; the comprehension behaves identically on 2.
    vals = [None if x == '' else x for x in vals]
    int_cols = [18-1, 21-1]  # file num and sub id
    for i in int_cols:
        if vals[i] is not None:
            vals[i] = int(vals[i])
    float_cols = [15-1]  # transaction amount
    for i in float_cols:
        if vals[i] is not None:
            vals[i] = float(vals[i])
    date_cols = [14-1]  # date, formatted MMDDYYYY
    for i in date_cols:
        vl = vals[i]
        if vl is not None:
            vals[i] = datetime.datetime(month=int(vl[0:2]), day=int(vl[2:4]), year=int(vl[4:8]))
    return vals
def row_to_dict(row, cycle, date):
    '''Map a parsed FEC record (see parse_line) onto Contribution field names.

    *row* is the typed list from parse_line, *cycle* the '1111-2222' cycle
    string and *date* the download date; both are stamped onto every row.
    '''
    def u(val):
        # Coerce a raw field to unicode, passing falsy fields through as None
        # (same effect as the original per-field conditional expressions).
        return unicode(val) if val else None
    return {
        "cycle": unicode(cycle),
        "date": date,
        "comittee_id": u(row[0]),      # [sic] spelling matches the model column
        "ammendment_id": u(row[1]),    # [sic] spelling matches the model column
        "report_type": u(row[2]),
        "transaction_pgi": u(row[3]),
        "image_num": u(row[4]),
        "transaction_tp": u(row[5]),
        "entity_tp": u(row[6]),
        "name": u(row[7]),
        "city": u(row[8]),
        "state": u(row[9]),
        "zip_code": u(row[10]),
        "employer": u(row[11]),
        "occupation": u(row[12]),
        "transaction_id": u(row[16]),
        "memo_cd": u(row[18]),
        "memo_text": u(row[19]),
        "other_id": u(row[15]),
        "transaction_date": row[13],
        "transaction_amount": float(row[14]),
        "file_num": row[17],
        "sub_id": int(row[20]),
    }
def ingested(filepath):
'''Return true if file is already ingested, false otherwise'''
# TODO: implement better (just checks if file in table right now)
try:
myfile = File.get(name=filepath)
print "%s already in database." % filepath
return True
except:
return False
def seed_from(filepath):
'''Ingest file into sqlite database'''
print "Ingesting %s" % filepath
rows = parse_fec_file(filepath)
with db.transaction():
myfile, _ = File.get_or_create(
name = os.path.basename(filepath),
years=next(re.finditer(r'\d{4}_\d{4}', os.path.basename(filepath))),
sha1 = sha1OfFile(filepath),
updated = dateparse(os.path.dirname(filepath).split("/")[-1].replace("downloaded_", "").replace("_", "-")).date(),
ingested = False
)
for idx in range(0, len(rows), 500):
print "Inserting row %d of %d from %s" % (idx, len(rows), filepath)
rows_subset = rows[idx:idx+500]
Contribution.insert_many(rows_subset).execute()
myfile.update(ingested=True)
def ingest(filepath):
'''Ingest file into database'''
print "Ingesting %s" % filepath
rows = parse_fec_file(filepath)
# check history table to see if this file is done
with db.transaction():
for idx, row in enumerate(rows):
print "Checking row %d of %d from %s" % (idx, len(rows), filepath)
try:
contribution_in_db = Contribution.get(cycle=row['cycle'], sub_id=row['sub_id'])
except Contribution.DoesNotExist:
contribution_in_db = None
# If the row isn't already there, insert it
if not contribution_in_db:
print t.cyan("\tInserting new row %d of %s" % (idx, filepath))
new_contribution = Contribution.create(**row)
ContributionHistory.create(contribution=new_contribution.id, date=row['date'], cycle=row['cycle'], sub_id=row['sub_id'])
# If the row is there, check for modifications
else:
# If it has not been modified, simply add a ContributionHistory object
contribution_in_db_dict = get_dictionary_from_model(contribution_in_db)
# x = {k:v for k,v in contribution_in_db_dict.iteritems() if k not in ["date", "id"]}
# y = {k:v for k,v in row.iteritems() if k != "date"}
if {k:v for k,v in contribution_in_db_dict.iteritems() if k not in ["date", "id"]} == {k:v for k,v in row.iteritems() if k != "date"}:
print t.white("\tNo changes found in row %d of %s" % (idx, filepath))
ContributionHistory.create(contribution=contribution_in_db.id, date=row['date'], cycle=row['cycle'], sub_id=row['sub_id'])
# If it has been modified, create a new object and give the new object a contribution history
else:
print t.magenta("\tDetected change in row %d of %s" % (idx, filepath))
# print diff(x,y)
# import pdb; pdb.set_trace()
ContributionChanges.create(contribution=contribution_in_db.id, **{k:v for k,v in contribution_in_db_dict.iteritems() if k != "id"})
for k,v in row.iteritems():
if v != getattr(contribution_in_db, k):
setattr(contribution_in_db, k, v)
contribution_in_db.save()
ContributionHistory.create(contribution=contribution_in_db.id, date=row['date'], cycle=row['cycle'], sub_id=row['sub_id'])
myfile, _ = File.get_or_create(
name = os.path.basename(filepath),
years=next(re.finditer(r'\d{4}_\d{4}', os.path.basename(filepath))),
sha1 = sha1OfFile(filepath),
updated = dateparse(os.path.dirname(filepath).split("/")[-1].replace("downloaded_", "").replace("_", "-")).date(),
ingested = True
)
# if __name__ == '__main__':
# filepaths = [
# "data/FEC 2014 2.17.2014/itcont.txt",
# "data/FEC 2014 3.22.2015/itcont.txt",
# "data/FEC 2014 9.14.2014/itcont.txt"
# ]
# for filepath in filepaths:
# if not ingested(filepath):
# ingest(filepath)