def loadbill(fn, maplightid=None):
    """Load one bill XML file into `bill` and rebuild its `position` rows.

    The bill row is inserted; if that raises IntegrityError the existing row
    with the same id is updated instead.  Every `vote` action on the bill
    that carries a `roll` attribute is then resolved to a roll-call document
    under GOVTRACK_CRAWL, and each voter's vote is collected per politician
    (a later roll call overwrites an earlier one for the same politician).

    fn -- location of the bill XML document, passed to xmltramp.load().
    maplightid -- value stored on the bill row (None by default).
    """
    bill = xmltramp.load(fn)
    d = bill2dict(bill)
    d.maplightid = maplightid

    bill_id = d.id
    try:
        db.insert('bill', seqname=False, **d)
    except IntegrityError:
        # Row already exists: update it, leaving the id column itself alone.
        d.pop('id')
        db.update('bill', where="id=" + web.sqlquote(bill_id), **d)

    positions = {}
    for action in bill.actions['vote':]:
        if not action().get('roll'):
            continue
        rolldoc = '/us/%s/rolls/%s%s-%s.xml' % (
            d.session,
            action('where'),
            action('datetime')[:4],
            action('roll'),
        )
        roll = xmltramp.load(GOVTRACK_CRAWL + rolldoc)
        for voter in roll['voter':]:
            politician = govtrackp(voter('id'))
            positions[politician] = fixvote(voter('vote'))

    # Voters that govtrackp() could not resolve all land under the None key.
    positions.pop(None, None)

    with db.transaction():
        db.delete('position', where='bill_id=$bill_id',
                  vars={'bill_id': bill_id})
        for politician_id, vote in positions.iteritems():
            db.insert('position', seqname=False,
                      bill_id=bill_id,
                      politician_id=politician_id,
                      vote=vote)
def load_data(): c = csv.reader(file('../data/crawl/maplight/uniq_map_export_bill_research.csv')) supportdict = {'0': -1, '1': 1, '2': 0 } #0: oppose ; 1: support; 2: not known (from README) with db.transaction(): db.delete('interest_group_bill_support', '1=1') for line in c: if not line[0].startswith('#'): category_id, longname, maplightid, session, measure, support = line support = supportdict[support] if support == 0: continue typenumber = measure.lower().replace(' ', '') r = db.select('interest_group', what="id", where="longname=$longname", vars=locals()) if r: groupid = r[0].id else: groupid = db.insert('interest_group', longname=longname, category_id=category_id) bill_id = 'us/%s/%s' % (session, typenumber) r = db.select('bill', where="id=$bill_id", vars=locals()) if not r: filename = "../data/crawl/govtrack/us/%s/bills/%s.xml" % (session, typenumber) bills.loadbill(filename, maplightid=maplightid) else: db.update('bill', maplightid=maplightid, where="id=$bill_id", vars=locals()) try: #print '\r', bill_id, db.insert('interest_group_bill_support', seqname=False, bill_id=bill_id, group_id=groupid, support=support) except: print '\n Duplicate row with billid %s groupid %s support %s longname %s' % (bill_id, groupid, support, longname) raise
def generate_similarities():
    """
    Generate similarity information for each (interest group, politician)
    pair and store in DB.

    For every bill on which both an interest group's support and a
    politician's position are recorded, counts how often the two match
    (ignoring a support value of 0) and how many such bills there are in
    total.  Only pairs with at least one match are written to
    `group_politician_similarity`.
    """
    rows = db.query(
        'select igbp.group_id, position.politician_id, igbp.support, position.vote'
        ' from interest_group_bill_support igbp, position'
        ' where igbp.bill_id = position.bill_id')

    agreements = {}
    totals = {}
    for row in rows:
        pair = (row.group_id, row.politician_id)
        totals[pair] = totals.get(pair, 0) + 1
        if row.support != 0 and row.support == row.vote:
            agreements[pair] = agreements.get(pair, 0) + 1

    with db.transaction():
        db.delete('group_politician_similarity', '1=1')
        for (group_id, politician_id), agreed in agreements.items():
            db.insert('group_politician_similarity', seqname=False,
                      group_id=group_id,
                      politician_id=politician_id,
                      agreed=agreed,
                      total=totals[(group_id, politician_id)])
def load_soi():
    """Aggregate parsed SOI records per district and insert them into `soi`.

    Phase 1 walks soi.parse_soi().  For each record `z`, get_dist(z.loc)
    yields a mapping keyed by district; its values are used below as a
    multiplier on the record's figures (presumably the share of the location
    that falls in that district -- TODO confirm against get_dist).  Each
    record's brackets are merged, position by position, into the running
    per-district brackets:

      * keys starting with 'n_' or 'tot_', and 'agi', are summed (weighted);
      * keys starting with 'pct_' or 'avg_' are combined as an average
        weighted by the number of filers;
      * any other key is simply overwritten with the latest value.

    Phase 2 inserts one `soi` row per (district, bracket).
    """
    # TODO: not sure how to handle agi and gini values.
    districts = {}
    data = {}  # filled below but never read inside this function
    for z in soi.parse_soi():
        dists_for_data = get_dist(z.loc)
        if dists_for_data:
            data[z.loc] = z
            for d in dists_for_data.keys():  # for each district associated with loc
                if d not in districts:
                    # One accumulator dict per bracket of the first record seen.
                    districts[d] = { 'brackets': [{'n_filers':0} for x in range(len(z.brackets))] }
                for new_data,cur_data in zip(z.brackets, districts[d]['brackets']):
                    #print new_data.n_filers, new_data.n_prepared, new_data.agi, new_data.bracket_low
                    # Snapshot the filer count before the key loop below
                    # updates cur_data['n_filers'] itself (it matches 'n_').
                    n_filers_old = cur_data['n_filers']
                    if not new_data.n_filers: new_data['n_filers'] = 0
                    n_filers_new = n_filers_old + new_data.n_filers * dists_for_data[d]
                    for k in new_data.keys():
                        if k not in cur_data: cur_data[k] = 0
                        if k.startswith('n_') or k.startswith('tot_') or k == 'agi':
                            # Counts and totals: weighted sum.
                            if new_data[k]:
                                cur_data[k] += new_data[k] * dists_for_data[d]
                        elif k.startswith('pct_') or k.startswith('avg_'):
                            # Percentages and averages: filer-weighted mean.
                            # NOTE(review): n_filers_new is 0 when both filer
                            # counts are 0; a non-zero value here would then
                            # divide by zero -- confirm the data rules it out.
                            if new_data[k]:
                                cur_data[k] = (cur_data[k] * n_filers_old + dists_for_data[d] * new_data[k] * new_data.n_filers) / n_filers_new
                        else:
                            #if k in cur_data and cur_data[k] != new_data[k]: print k, cur_data[k], new_data[k]
                            cur_data[k] = new_data[k]
    for d in districts.keys():
        for b in districts[d]['brackets']:
            if d == 'DC': d = 'DC-00'  # HACK: Data has DC which should be DC-00.
            # We can't use None because bracket_low is part of the primary key.
            if isinstance(b['bracket_low'], NoneType): b['bracket_low'] = -1
            db.insert('soi', seqname=False, district_id=d, **b)
def loadroll(fn): roll = web.storage() roll.id = fn.split('/')[-1].split('.')[0] vote = xmltramp.load(fn) if vote['bill':]: b = vote.bill roll.bill_id = 'us/%s/%s%s' % (b('session'), b('type'), b('number')) else: roll.bill_id = None roll.type = str(vote.type) roll.question = str(vote.question) roll.required = str(vote.required) roll.result = str(vote.result) try: db.insert('roll', seqname=False, **roll) except IntegrityError: if not db.update('roll', where="id=" + web.sqlquote(roll.id), bill_id=roll.bill_id): print "\nMissing bill:", roll.bill_id raise NotDone with db.transaction(): db.delete('vote', where="roll_id=$roll.id", vars=locals()) for voter in vote['voter':]: rep = govtrackp(voter('id')) if rep: db.insert('vote', seqname=False, politician_id=rep, roll_id=roll.id, vote=fixvote(voter('vote'))) else: pass #@@!--check again after load_everyone
def load_categories():
    """Rebuild the `category` table from the CRP categories CSV.

    Every data row is expected to have five columns: id, name, industry,
    sector and a trailing column that is ignored.  Blank rows and rows whose
    first cell starts with '#' are skipped.  The table is emptied and
    refilled inside one transaction.
    """
    # `with` guarantees the CSV handle is closed, even if an insert fails.
    with open('../data/crawl/maplight/CRP_Categories.csv') as fh:
        with db.transaction():
            db.delete('category', '1=1')
            for line in csv.reader(fh):
                # csv yields [] for blank lines; line[0] would raise there.
                if not line or line[0].startswith('#'):
                    continue
                cid, cname, industry, sector, empty = line
                db.insert('category', seqname=False,
                          id=cid, name=cname,
                          industry=industry, sector=sector)
def load_data(): c = csv.reader( file('../data/crawl/maplight/uniq_map_export_bill_research.csv')) supportdict = { '0': -1, '1': 1, '2': 0 } #0: oppose ; 1: support; 2: not known (from README) with db.transaction(): db.delete('interest_group_bill_support', '1=1') for line in c: if not line[0].startswith('#'): category_id, longname, maplightid, session, measure, support = line support = supportdict[support] if support == 0: continue typenumber = measure.lower().replace(' ', '') r = db.select('interest_group', what="id", where="longname=$longname", vars=locals()) if r: groupid = r[0].id else: groupid = db.insert('interest_group', longname=longname, category_id=category_id) bill_id = 'us/%s/%s' % (session, typenumber) r = db.select('bill', where="id=$bill_id", vars=locals()) if not r: filename = "../data/crawl/govtrack/us/%s/bills/%s.xml" % ( session, typenumber) bills.loadbill(filename, maplightid=maplightid) else: db.update('bill', maplightid=maplightid, where="id=$bill_id", vars=locals()) try: #print '\r', bill_id, db.insert('interest_group_bill_support', seqname=False, bill_id=bill_id, group_id=groupid, support=support) except: print '\n Duplicate row with billid %s groupid %s support %s longname %s' % ( bill_id, groupid, support, longname) raise
def load_fec_ids():
    """Rebuild `politician_fec_ids` from the FEC/CRP id mapping file.

    The first line of the file is treated as a header and skipped.  Every
    other non-blank line must hold two whitespace-separated fields: an FEC id
    and a CRP id.  When tools.opensecretsp() resolves the CRP id, the pair is
    remembered in `fec2pol` and inserted into the table.
    """
    with db.transaction():
        db.delete('politician_fec_ids', '1=1')
        # `with` closes the file even when an insert fails part-way.
        with open('../data/crawl/opensecrets/FEC_CRP_ID.tsv') as fh:
            for lineno, line in enumerate(fh):
                if lineno == 0:
                    continue  # header line
                fields = line.split()
                if not fields:
                    continue  # blank line
                fec_id, crp_id = fields
                # Resolve once per row instead of three times.
                # NOTE(review): assumes opensecretsp() is a pure lookup.
                politician_id = tools.opensecretsp(crp_id)
                if politician_id:
                    fec2pol[fec_id] = politician_id
                    db.insert('politician_fec_ids', seqname=False,
                              politician_id=politician_id, fec_id=fec_id)
def load_fec_efilings(filepattern=fec_crude_csv.DEFAULT_EFILINGS_FILEPATTERN): for f, schedules in fec_crude_csv.parse_efilings(glob.glob(filepattern)): for s in schedules: if s.get('type') == 'contribution': # XXX all this code for politician_id is currently # dead, does nothing useful politician_id = None if f.get('candidate_fec_id'): fec_id = f['candidate_fec_id'] pol_fec_id = list( db.select('politician_fec_ids', where='fec_id=$fec_id', vars=locals())) if pol_fec_id and len(pol_fec_id) == 1: politician_id = pol_fec_id[0].politician_id elif not politician_id and f.get('candidate'): names = f['candidate'].split(' ') fn, ln = names[0], names[-1] pol = list( db.select('politician', where='lastname=$ln and firstname=$fn', vars=locals())) if pol and len(pol) == 1: politician_id = pol[0].id db.insert('contribution', committee=f['committee'], contrib_date=s['date'], contributor_org=s.get('contributor_org'), contributor=s['contributor'], occupation=s['occupation'], employer=s['employer'], employer_stem=tools.stemcorpname(s['employer']), candidate_name=f.get('candidate'), filer_id=f['filer_id'], report_id=f['report_id'], amount=s['amount']) elif s.get('type') == 'expenditure': db.insert('expenditure', candidate_name=f.get('candidate'), committee=f['committee'], expenditure_date=s['date'], recipient=s['recipient'], filer_id=f['filer_id'], report_id=f['report_id'], amount=s['amount']) else: print "ignoring record of type %s" % \ s['original_data'].get('form_type')
def load_fec_efilings(filepattern=fec_crude_csv.DEFAULT_EFILINGS_FILEPATTERN): for f, schedules in fec_crude_csv.parse_efilings(glob.glob(filepattern)): for s in schedules: if s.get('type') == 'contribution': # XXX all this code for politician_id is currently # dead, does nothing useful politician_id = None if f.get('candidate_fec_id'): fec_id = f['candidate_fec_id'] pol_fec_id = list(db.select('politician_fec_ids', where='fec_id=$fec_id', vars=locals())) if pol_fec_id and len(pol_fec_id) == 1: politician_id = pol_fec_id[0].politician_id elif not politician_id and f.get('candidate'): names = f['candidate'].split(' ') fn, ln = names[0], names[-1] pol = list(db.select('politician', where='lastname=$ln and firstname=$fn', vars=locals())) if pol and len(pol) == 1: politician_id = pol[0].id db.insert('contribution', committee=f['committee'], contrib_date=s['date'], contributor_org=s.get('contributor_org'), contributor=s['contributor'], occupation=s['occupation'], employer=s['employer'], employer_stem=tools.stemcorpname(s['employer']), candidate_name=f.get('candidate'), filer_id=f['filer_id'], report_id=f['report_id'], amount=s['amount']) elif s.get('type') == 'expenditure': db.insert('expenditure', candidate_name=f.get('candidate'), committee=f['committee'], expenditure_date=s['date'], recipient=s['recipient'], filer_id=f['filer_id'], report_id=f['report_id'], amount=s['amount']) else: print "ignoring record of type %s" % \ s['original_data'].get('form_type')
def loadbill(fn, maplightid=None): bill = xmltramp.load(fn) d = bill2dict(bill) if maplightid: d["maplightid"] = maplightid db.insert("bill", seqname=False, **d) print "\r %-25s" % d["id"], sys.stdout.flush() done = [] for vote in bill.actions["vote":]: if not vote().get("roll"): continue if vote("where") in done: continue # don't count veto overrides done.append(vote("where")) votedoc = "%s/rolls/%s%s-%s.xml" % (d["session"], vote("where"), vote("datetime")[:4], vote("roll")) vote = xmltramp.load("../data/crawl/govtrack/us/" + votedoc) yeas = 0 neas = 0 for voter in vote["voter":]: if fixvote(voter("vote")) == 1: yeas += 1 elif fixvote(voter("vote")) == -1: neas += 1 rep = govtrackp(voter("id")) if rep: # UGLY HACK: if a politician (bob_menendez for instance) voted # for the same bill in both chambers of congress the insert # fails. if not db.select("vote", where="bill_id=$d['id'] AND politician_id=$rep", vars=locals()): db.insert("vote", seqname=False, politician_id=rep, bill_id=d["id"], vote=fixvote(voter("vote"))) else: print print "Updating:", votedoc, rep, d["id"], fixvote(voter("vote")) db.update( "vote", where="bill_id=$d['id'] AND politician_id=$rep", vote=fixvote(voter("vote")), vars=locals(), ) db.update("bill", where="id = $d['id']", yeas=yeas, neas=neas, vars=locals())
def load_fec_committees():
    """Reload the `committee` table from the parsed FEC committee records.

    `contribution` is emptied first, then `committee`.  Records come from
    fec_cobol.parse_committees(reverse=True); a record whose insert raises
    psycopg2.IntegrityError is skipped as already imported.
    """
    for table in ('contribution', 'committee'):
        db.delete(table, '1=1')

    for record in fec_cobol.parse_committees(reverse=True):
        f = web.storage(record)
        row = {
            'id': f.committee_id,
            'name': f.committee_name,
            'treasurer': f.treasurer_name,
            'street1': f.street_one,
            'street2': f.street_two,
            'city': f.city,
            'state': f.state,
            'zip': f.zip,
            'connected_org_name': f.connected_org_name,
            'candidate_id': f.candidate_id,
            'type': f.committee_type,
        }
        try:
            db.insert('committee', seqname=False, **row)
        except psycopg2.IntegrityError:
            continue  # already imported
def load_fec_committees():
    """Empty `contribution` and `committee`, then re-import all committees.

    Each record yielded by fec_cobol.parse_committees(reverse=True) is
    mapped onto the `committee` columns and inserted.  An insert that raises
    psycopg2.IntegrityError is ignored (the committee is already imported).
    """
    # committee column -> attribute of the parsed record
    columns = (
        ('id', 'committee_id'),
        ('name', 'committee_name'),
        ('treasurer', 'treasurer_name'),
        ('street1', 'street_one'),
        ('street2', 'street_two'),
        ('city', 'city'),
        ('state', 'state'),
        ('zip', 'zip'),
        ('connected_org_name', 'connected_org_name'),
        ('candidate_id', 'candidate_id'),
        ('type', 'committee_type'),
    )

    db.delete('contribution', '1=1')
    db.delete('committee', '1=1')
    for f in fec_cobol.parse_committees(reverse=True):
        f = web.storage(f)
        values = dict((column, getattr(f, attr)) for column, attr in columns)
        try:
            db.insert('committee', seqname=False, **values)
        except psycopg2.IntegrityError:
            pass  # already imported
def load_fec_contributions(): t = db.transaction() n = 0 db.delete('contribution', '1=1') for f in fec_cobol.parse_contributions(): f = web.storage(f) f.occupation = f.occupation.replace('N/A', '') if '/' in f.occupation: employer, occupation = f.occupation.split('/', 1) else: employer = '' occupation = f.occupation try: datetime.date(*[int(x) for x in f.date.split('-')]) except ValueError: f.date = None db.insert('contribution', fec_record_id=f.get('fec_record_id'), microfilm_loc=f.microfilm_loc, recipient_id=f.filer_id, name=f.name, street=f.get('street'), city=f.city, state=f.state, zip=f.zip, occupation=occupation, employer=employer, employer_stem=tools.stemcorpname(employer), committee=f.from_id or None, sent=f.date, amount=f.amount) n += 1 if n % 10000 == 0: t.commit() t = db.transaction() print n t.commit()
def load_fec_contributions(): t = db.transaction(); n = 0 db.delete('contribution', '1=1') for f in fec_cobol.parse_contributions(): f = web.storage(f) f.occupation = f.occupation.replace('N/A', '') if '/' in f.occupation: employer, occupation = f.occupation.split('/', 1) else: employer = '' occupation = f.occupation try: datetime.date(*[int(x) for x in f.date.split('-')]) except ValueError: f.date = None db.insert('contribution', fec_record_id = f.get('fec_record_id'), microfilm_loc = f.microfilm_loc, recipient_id = f.filer_id, name = f.name, street = f.get('street'), city = f.city, state = f.state, zip = f.zip, occupation = occupation, employer = employer, employer_stem = tools.stemcorpname(employer), committee = f.from_id or None, sent = f.date, amount = f.amount ) n += 1 if n % 10000 == 0: t.commit(); t = db.transaction(); print n t.commit() print "Creating indexes on table `contribution`..." schema.Contribution.create_indexes() print "done."
def generate_similarities():
    """
    Generate similarity information for each (interest group, politician)
    pair and store in DB.

    `total` is the number of bills on which both the group and the
    politician have a recorded stance; `agreed` is how many of those have
    equal, non-zero stances.  Pairs that never agree are not stored.
    """
    query = ('select igbp.group_id, position.politician_id, igbp.support, position.vote'
             ' from interest_group_bill_support igbp, position'
             ' where igbp.bill_id = position.bill_id')

    agreed_by_pair = {}
    total_by_pair = {}
    for rec in db.query(query):
        key = (rec.group_id, rec.politician_id)
        same_stance = rec.support == rec.vote
        if same_stance and rec.support != 0:
            if key in agreed_by_pair:
                agreed_by_pair[key] += 1
            else:
                agreed_by_pair[key] = 1
        if key in total_by_pair:
            total_by_pair[key] += 1
        else:
            total_by_pair[key] = 1

    with db.transaction():
        db.delete('group_politician_similarity', '1=1')
        for key, agreed in agreed_by_pair.items():
            db.insert('group_politician_similarity', seqname=False,
                      group_id=key[0],
                      politician_id=key[1],
                      agreed=agreed,
                      total=total_by_pair[key])
def load_fec_efilings():
    """Insert every schedule of every parsed FEC e-filing into the database.

    Filings come from fec_csv.parse_efilings().  A schedule whose type is
    'contribution' becomes a `contribution` row; every other schedule is
    stored as an `expenditure` row.
    """
    for f in fec_csv.parse_efilings():
        for s in f['schedules']:
            if s['type'] != 'contribution':
                db.insert('expenditure',
                          candidate_name=f['candidate'],
                          committee=f['committee'],
                          expenditure_date=s['date'],
                          recipient=s['recipient'],
                          filer_id=f['filer_id'],
                          report_id=f['report_id'],
                          amount=s['amount'])
                continue

            # politician_id is looked up here but not passed to the insert.
            politician_id = None
            if f['candidate_fec_id']:
                fec_id = f['candidate_fec_id']
                fec_matches = list(db.select('politician_fec_ids',
                                             where='fec_id=$fec_id',
                                             vars={'fec_id': fec_id}))
                if len(fec_matches) == 1:
                    politician_id = fec_matches[0].politician_id
            elif not politician_id and f['candidate']:
                names = f['candidate'].split(' ')
                fn, ln = names[0], names[-1]
                name_matches = list(db.select(
                    'politician',
                    where='lastname=$ln and firstname=$fn',
                    vars={'ln': ln, 'fn': fn}))
                if len(name_matches) == 1:
                    politician_id = name_matches[0].id

            db.insert('contribution',
                      committee=f['committee'],
                      contrib_date=s['date'],
                      contributor_org=s['contributor_org'],
                      contributor=s['contributor'],
                      occupation=s['occupation'],
                      employer=s['employer'],
                      employer_stem=tools.stemcorpname(s['employer']),
                      candidate_name=f['candidate'],
                      filer_id=f['filer_id'],
                      report_id=f['report_id'],
                      amount=s['amount'])