forked from dmil/iCorruptionHack
-
Notifications
You must be signed in to change notification settings - Fork 1
/
downloader.py
64 lines (52 loc) · 1.75 KB
/
downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
Check the FEC website to find out whether a new filing has been posted.
If there is a new filing, store it in the data/ folder under a directory named with today's date.
http://www.fec.gov/finance/disclosure/ftpdet.shtml
"""
from app import root
import peewee
import os, subprocess, re
from datetime import date
import hashlib
from models import File
def sha1OfFile(filepath):
    """Return the hexadecimal SHA-1 digest of the file at `filepath`.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    large files are never loaded into memory all at once.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    digest = hashlib.sha1()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns the
        # empty bytes string, which signals end of file.
        for chunk in iter(lambda: f.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def download_files(new_folder_path):
    '''
    Get files from FEC and put them in a tmp folder.

    Runs the `download.sh` script found under `root` with `sh`, passing
    `new_folder_path` as its only argument, and waits for it to finish.

    Returns the script's exit status so that callers can detect a failed
    download; callers that ignore the return value behave as before.
    '''
    script_path = root + '/download.sh'
    return subprocess.call(['sh', script_path, new_folder_path])
def already_downloaded(filepath):
    '''
    Tell whether the file at `filepath` is already recorded in the database.

    Looks up a File row whose `sha1` equals the SHA-1 digest of the file's
    contents. Returns True when such a row exists, False otherwise.
    '''
    digest = sha1OfFile(filepath)
    try:
        File.get(File.sha1 == digest)
    except peewee.DoesNotExist:
        # No row carries this digest, so this version has not been stored.
        return False
    return True
def download():
    '''
    Grabs the latest.

    Downloads files into `data/downloaded_<YYYY_MM_DD>` (today's date), then
    walks that folder. A file whose SHA-1 is already recorded in the database
    is deleted from disk; any other file gets a new File row with
    `ingested = False`.

    Raises ValueError if a new file's name contains no `NNNN_NNNN` year
    range.
    '''
    # Take the date once so the folder name and the stored `updated` value
    # agree even if the run crosses midnight.
    today = date.today()
    new_folder_path = "data/downloaded_%s" % today.strftime("%Y_%m_%d")
    download_files(new_folder_path)

    # Delete files that already exist in the database; record the rest.
    for path, _subdirs, files in os.walk(new_folder_path):
        for f in files:
            filepath = path + '/' + f
            if already_downloaded(filepath):
                print("Didn't save '%s' because it was already in the database." % f)
                os.remove(filepath)
                continue

            years_match = re.search(r'\d{4}_\d{4}', f)
            if years_match is None:
                raise ValueError(
                    "Cannot find a year range like 2015_2016 in file name '%s'" % f
                )

            print("Saved new file '%s/%s'" % (path, f))
            File.create(
                name=f,
                # Store the matched text rather than the match object.
                # NOTE(review): assumes File.years is a text column - confirm
                # against the model definition.
                years=years_match.group(0),
                sha1=sha1OfFile(filepath),
                updated=today,
                ingested=False,
            )
# Module-level call: the download runs whenever this file is executed or
# imported, because there is no `if __name__ == "__main__"` guard.
download()