line = line.strip().split('|') line_dict = {k: v for k, v in zip(mapping, line)} out_file_obj.write(dumps(line_dict)) out_file_obj.write("\n") # For each data element, pull the data itself and the header. Combine to form JSON then upload. if __name__ == "__main__": import sys azure = len(sys.argv) > 1 and sys.argv[1] == "azure" blob_upload = None if azure: print("Upload to azure") from azureblob.upload_file_to_blob import BlobUploader blob_upload = BlobUploader(make_container_public=True) else: print("No azure upload requested. Pulling locally only.") for key, files in sources.items(): print("Downloading " + key) local_zip_file = "{0}/{1}.zip".format(tmp_data_folder, key) local_unzip_dir = "{0}/{1}".format(tmp_data_folder, key) pull_ftp_data(files['data'], local_zip_file) mapping = pull_http_data(files['mapping']).strip() mapping = mapping.split(',') # unzip the data z = ZipFile(local_zip_file) z.extractall(local_unzip_dir)
""" Run the program. """ if len(sys.argv) < 3: print("Usage: python download_fec_filings.py fecdefs.json 20151103 20151104 [azure]") print("This would download and process all FEC data for Nov 3 and Nov 4.") mindate_s = sys.argv[2] maxdate_s = sys.argv[3] blob_upload = None if len(sys.argv) > 4 and sys.argv[4] == 'azure': print("Will upload to Azure") from azureblob.upload_file_to_blob import BlobUploader blob_upload = BlobUploader(make_container_public=True) else: print("No azure upload requested.") print("Running from {0} to {1}".format(mindate_s, maxdate_s)) mindate = datetime.strptime(mindate_s, "%Y%m%d") maxdate = datetime.strptime(maxdate_s, "%Y%m%d") tmp_data_folder = "./tmp_data" clean_data_folder = "./data" try: os.mkdir(tmp_data_folder)
# Build data/zip_to_district.csv: a mapping from ZCTA (zip code tabulation
# area) to congressional district, annotated with the state value taken from
# the Census district reference file. Pass "azure" as the first command-line
# argument to also upload the resulting CSV to blob storage.
import sys

import pandas as pd

zipcodes = 'http://www2.census.gov/geo/relfiles/cdsld13/natl/natl_zccd_delim.txt'
districts = 'http://www2.census.gov/geo/docs/reference/codes/files/national_cd113.txt'
output_csv = "data/zip_to_district.csv"


def build_zip_to_district(df_zip, df_districts):
    """Join the ZCTA/district table with one state row per STATE value.

    Args:
        df_zip: DataFrame with columns ``State``, ``ZCTA``, ``District``
            (in that order).
        df_districts: DataFrame with at least the columns ``STATE`` and
            ``STATEKEY``.

    Returns:
        A DataFrame with columns ``state_code``, ``zcta``, ``district`` and
        ``state``, one row per row of ``df_zip`` (left join, so rows whose
        ``State`` has no matching ``STATEKEY`` get a missing ``state``).
    """
    # Collapse the per-district table to a single row per STATE.
    # NOTE(review): presumably STATEKEY is constant within a STATE, so "max"
    # only de-duplicates -- confirm against the Census file.
    states = df_districts.groupby('STATE').agg({'STATEKEY': 'max'}).reset_index()

    merged = df_zip.merge(states, left_on='State', right_on='STATEKEY', how='left')

    # Keep State, ZCTA, District, STATE; drop the redundant STATEKEY join key.
    # (.iloc replaces DataFrame.icol, which no longer exists in pandas.)
    merged = merged.iloc[:, :4]
    merged.columns = ['state_code', "zcta", "district", "state"]
    return merged


def main(argv):
    """Download the Census files, write the CSV and optionally upload it.

    Args:
        argv: Command-line arguments (``sys.argv`` style); ``argv[1] ==
            'azure'`` enables the upload step.
    """
    # Get data
    df_zip = pd.read_csv(zipcodes, skiprows=1)
    df_zip.columns = ['State', 'ZCTA', 'District']

    # Columns in the district file are separated by runs of 2+ whitespace
    # characters, hence the regex delimiter and the python parsing engine.
    df_districts = pd.read_table(districts, delimiter=r'\s\s+', header=None,
                                 skiprows=1, engine='python')
    df_districts.columns = ['STATE', 'STATEKEY', 'DISTRICT', 'DISTRICTNAME']

    df_zip_withstates = build_zip_to_district(df_zip, df_districts)
    df_zip_withstates.to_csv(output_csv, index=False)

    azure = len(argv) > 1 and argv[1] == 'azure'
    if azure:
        print("Uploading to azure")
        # Imported lazily so the azure dependency is only needed on upload.
        from azureblob.upload_file_to_blob import BlobUploader
        blob_upload = BlobUploader(make_container_public=True)
        with open(output_csv, 'r') as csv_file:
            blob_upload.put_json_file(csv_file, "raw_mappings/zip_to_district.csv")


if __name__ == "__main__":
    main(sys.argv)
""" Download the zcta to county file (many to many relationship) """
import sys

import requests

file_loc = "http://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt"
local_path = "data/zcta_to_county.csv"


def main(argv):
    """Fetch the relationship file, save it locally and optionally upload it.

    Args:
        argv: Command-line arguments (``sys.argv`` style); ``argv[1] ==
            'azure'`` enables the upload step.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    use_azure = len(argv) > 1 and argv[1] == 'azure'

    # Get file locally. The timeout keeps a stalled connection from hanging
    # the script forever; raise_for_status stops an HTTP error page from
    # being written out as if it were the data file.
    r = requests.get(file_loc, timeout=60)
    r.raise_for_status()
    with open(local_path, 'w') as local_file:
        local_file.write(r.text)

    if use_azure:
        print("Uploading to azure")
        # Imported lazily so the azure dependency is only needed on upload.
        from azureblob.upload_file_to_blob import BlobUploader
        blob_upload = BlobUploader(make_container_public=True)
        with open(local_path, 'r') as csv_file:
            blob_upload.put_json_file(csv_file, "raw_mappings/zcta_to_county.csv")


if __name__ == "__main__":
    main(sys.argv)