def prep(filename):
    """Normalize a raw delimited voter file into a sorted, load-ready CSV.

    Prepends the canonical header (``rawheader``), cuts it down to the
    configured ``columns`` with csvcut, canonicalizes race/party codes via
    the ``raced``/``partyd`` lookup dicts, strips stray whitespace from every
    field, sorts rows by last name, and writes the result to
    ``<loadbase>/<slug>.csv``.

    :param filename: name of the raw file inside ``rawbase``; its first three
        characters are used as the slug for all derived file names.
    """
    prepstart = datetime.datetime.now()
    slug = filename[:3]  # first three chars of the raw file identify the source
    # renamed from `tempfile` to avoid shadowing the stdlib module of that name
    tmpfile = "%s/%s_temp.csv" % (temp, slug)
    prepfile = "%s/%s_prep.csv" % (prepbase, slug)
    loadfile = "%s/%s.csv" % (loadbase, slug)
    # NOTE(review): shell=True with interpolated paths is injection-prone;
    # acceptable only because filenames here are local, trusted inputs.
    call("cp %s %s" % (rawheader, tmpfile), shell=True)
    call("cat %s/%s >> %s" % (rawbase, filename, tmpfile), shell=True)
    call("csvcut -t -c %s %s > %s" % (columns, tmpfile, prepfile), shell=True)
    biglist = []
    with open(prepfile, 'r') as f:
        reader = cdr(f)
        header = reader.fieldnames
        for row in reader:
            # Unknown race/party codes collapse to empty string.
            RACE = raced.get(row['race'], '')
            PARTY = partyd.get(row['party'], '')
            biglist.append([
                row['lname'].strip(),
                row['fname'].strip(),
                row['mname'].strip(),
                row['suffix'].strip(),
                ' '.join(row['addr1'].split()),  # also strips interior runs of whitespace
                row['addr2'].strip(),
                row['city'].strip(),
                row['zip'].strip(),
                row['gender'].strip(),
                RACE,
                row['birthdate'].strip(),
                PARTY,
                row['areacode'].strip(),
                row['phone'].strip(),
                row['email'].strip(),
                row['voter_ID'].strip()
            ])
    biglist.sort()  # sorts by first field, which is last name
    with open(loadfile, 'w') as lf:
        writer = ckw(lf)
        writer.writerow(header)
        writer.writerows(biglist)
    print("%s ready for loading; prepping took %s" % (
        loadfile, datetime.datetime.now() - prepstart))
def export_spreadsheet(year):
    """Traverse all API pages, collect every school record, and export a CSV.

    Pages through the ``schools_root`` endpoint for the given year, flattens
    each school's dotted field names to underscore form, and writes one row
    per school id to ``schools_<year>.csv``.

    :param year: year used both to build the field list and to name the CSV.
    :returns: the parsed JSON of the last page fetched (or the first page,
        if it contained no ``results`` key).
    """
    starter = datetime.datetime.now()
    fields = build_string(year)
    headings = fields.replace(".", "_").split(",")
    collector = {}

    def absorb(results):
        # Record each school keyed by id; blank-fill so every heading exists
        # even when the API omits a field for a particular school.
        for school in results:
            entry = {key: "" for key in headings}
            for key, value in school.items():
                entry[key.replace(".", "_")] = value
            collector[school["id"]] = entry

    url = "%s?api_key=%s&per_page=%s&fields=%s" % (
        schools_root, api_key, page_max, fields)
    data = json.loads(requests.get(url).text)  # initial pass
    if "results" not in data:
        print("no results")
        return data
    absorb(data["results"])
    next_page = data["metadata"]["page"] + 1
    # harvest rest of pages until an empty results list signals the end
    while True:
        print("getting page %s" % next_page)
        next_url = "%s&page=%s" % (url, next_page)
        nextdata = json.loads(requests.get(next_url).text)
        if not nextdata["results"]:
            print("no more pages; exporting ...")
            break
        absorb(nextdata["results"])
        next_page = nextdata["metadata"]["page"] + 1
    with open("schools_%s.csv" % year, "w") as f:
        writer = ckw(f)
        writer.writerow(headings)
        for school_id in collector:
            writer.writerow([collector[school_id][field] for field in headings])
    print("export_spreadsheet took %s to process schools for the year %s" % (
        datetime.datetime.now() - starter, year))
    return data