if __name__ == "__main__": import sys azure = len(sys.argv) > 1 and sys.argv[1] == "azure" blob_upload = None if azure: print("Upload to azure") from azureblob.upload_file_to_blob import BlobUploader blob_upload = BlobUploader(make_container_public=True) else: print("No azure upload requested. Pulling locally only.") for key, files in sources.items(): print("Downloading " + key) local_zip_file = "{0}/{1}.zip".format(tmp_data_folder, key) local_unzip_dir = "{0}/{1}".format(tmp_data_folder, key) pull_ftp_data(files['data'], local_zip_file) mapping = pull_http_data(files['mapping']).strip() mapping = mapping.split(',') # unzip the data z = ZipFile(local_zip_file) z.extractall(local_unzip_dir) for unzipfile in z.namelist(): clean_file = "{0}/{1}.txt".format(clean_data_folder, key) to_json(mapping, "{0}/{1}".format(local_unzip_dir, unzipfile), clean_file) if blob_upload: print("Uploading to azure blob.") blob_upload.put_json_file(open(clean_file, 'r'), "raw_mappings/{0}.json".format(key))
pass
try:
    os.mkdir(clean_data_folder)
except OSError:
    # Can't recreate same dir. (Was WindowsError, which does not exist on
    # non-Windows Python 2 and is just an alias of OSError on Python 3;
    # OSError is the portable spelling of the same catch.)
    pass

# perform pull: one zip of filings per day in [mindate, maxdate], inclusive.
num_days_to_pull = (maxdate - mindate).days
for i in range(num_days_to_pull + 1):
    date_str = datetime.strftime(mindate + timedelta(days=i), "%Y%m%d")
    print("Parsing " + date_str)
    ziplocation, unziplocation = pull_data(date_str, tmp_data_folder)
    # unzip — context manager guarantees the archive handle is closed even
    # if extraction or the local-store step raises.
    with ZipFile(ziplocation) as z:
        z.extractall(unziplocation)
        local_store_unzipped_folder(z)

if blob_upload:
    # filehandles / org_defs are populated elsewhere in this file.
    for filedate, fileobj in filehandles.items():
        print("Uploading main data for {0}".format(filedate))
        blob_upload.put_json_file(fileobj, "raw_filings/{0}.json".format(filedate))
    print("Uploading headers")
    for filedate, fileobj in org_defs.items():
        blob_upload.put_json_file(fileobj, "raw_headers/{0}.json".format(filedate))
"""Build a ZCTA/zip -> congressional district mapping CSV from Census files."""
import sys  # was missing: sys.argv is read below

import pandas as pd

zipcodes = 'http://www2.census.gov/geo/relfiles/cdsld13/natl/natl_zccd_delim.txt'
districts = 'http://www2.census.gov/geo/docs/reference/codes/files/national_cd113.txt'

# Get data
df_zip = pd.read_csv(zipcodes, skiprows=1)
df_zip.columns = ['State', 'ZCTA', 'District']
# raw string for the regex delimiter (two-or-more spaces)
df_districts = pd.read_table(districts, delimiter=r'\s\s+', header=None,
                             skiprows=1, engine='python')
df_districts.columns = ['STATE', 'STATEKEY', 'DISTRICT', 'DISTRICTNAME']

# get just state bits from districts.
states = df_districts.groupby('STATE').agg({'STATEKEY': max}).reset_index()

#
df_zip_withstates = df_zip.merge(states, left_on='State', right_on='STATEKEY',
                                 how='left')
# DataFrame.icol() was removed from pandas; .iloc is the supported positional
# indexer and selects the same first four columns.
df_zip_withstates = df_zip_withstates.iloc[:, :4]
df_zip_withstates.columns = ['state_code', "zcta", "district", "state"]
df_zip_withstates.to_csv("data/zip_to_district.csv", index=False)

azure = len(sys.argv) > 1 and sys.argv[1] == 'azure'
if azure:
    print("Uploading to azure")
    from azureblob.upload_file_to_blob import BlobUploader
    blob_upload = BlobUploader(make_container_public=True)
    # Close the handle after upload instead of leaking it.
    with open("data/zip_to_district.csv", 'r') as csv_fh:
        blob_upload.put_json_file(csv_fh, "raw_mappings/zip_to_district.csv")
""" Download the zcta to county file (many to many relationship) """ import requests import sys use_azure = len(sys.argv) > 1 and sys.argv[1] == 'azure' file_loc = "http://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt" # Get file locally r = requests.get(file_loc) local_file = open("data/zcta_to_county.csv", 'w') local_file.write(r.text) local_file.close() if use_azure: print("Uploading to azure") from azureblob.upload_file_to_blob import BlobUploader blob_upload = BlobUploader(make_container_public=True) blob_upload.put_json_file(open("data/zcta_to_county.csv", 'r'), "raw_mappings/zcta_to_county.csv")