def fetch_senate_committee_meetings(committees, options):
    """Scrape the Senate committee hearing XML feed and return meeting dicts.

    Each dict has keys: chamber, congress, guid, committee, subcommittee,
    occurs_at (ISO 8601), room, topic, bill_ids.  GUIDs are recycled from the
    previously written output file (if any) so a recurring meeting keeps a
    stable identifier across runs.

    committees -- mapping of committee code -> committee record (with a
        "subcommittees" mapping), used to validate scraped codes
    options -- download options; cloned so the caller's dict is not mutated
    """
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("senate")
    if os.path.exists(output_file):
        # Context manager closes the handle promptly; the original left the
        # file object from json.load(open(...)) to the garbage collector.
        with open(output_file) as f:
            existing_meetings = json.load(f)

    options = dict(options)  # clone
    options["binary"] = True
    # options["force"] = True

    meetings = []

    dom = lxml.etree.fromstring(
        utils.download(
            "http://www.senate.gov/general/committee_schedules/hearings.xml",
            "committee_schedule/senate.xml", options))

    # Compile once; the original recompiled this inside the per-meeting loop.
    bill_number_re = re.compile(
        r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)

    for node in dom.xpath("meeting"):
        committee_id = str(node.xpath('string(cmte_code)'))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder

        occurs_at = str(node.xpath('string(date)'))
        room = str(node.xpath('string(room)'))
        topic = str(node.xpath('string(matter)'))

        # Feed date format, e.g. "26-Feb-2014 10:00 AM".
        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        # Collapse runs of whitespace to single spaces.
        topic = re.sub(r"\s+", " ", topic).strip()

        # Validate committee code: letters followed by digits, e.g. "SSAP01".
        try:
            committee_code, subcommittee_code = re.match(
                r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees:
                raise ValueError(committee_code)
            if subcommittee_code == "00":
                subcommittee_code = None  # "00" means the full committee
            if subcommittee_code and subcommittee_code not in committees[
                    committee_code]["subcommittees"]:
                raise ValueError(subcommittee_code)
        except (AttributeError, ValueError):
            # AttributeError: re.match returned None (code didn't parse).
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            print("Invalid committee code", committee_id)
            continue

        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same
        # committee/subcommittee and at the same time.
        for mtg in existing_meetings:
            if mtg["committee"] == committee_code and mtg.get(
                    "subcommittee", None) == subcommittee_code and mtg[
                    "occurs_at"] == occurs_at.isoformat():
                if options.get("debug", False):
                    print("[%s] Reusing GUID." % mtg["guid"])  # fixed "gUID" typo
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            # TODO: Can we make this a human-readable ID?
            guid = str(uuid.uuid4())

        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(
            utils.current_legislative_year(occurs_at))
        bills = []
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            # e.g. ("HR", "123") -> "hr123-113"
            bills.append(bill_match[0].lower() + bill_match[1] + "-" + str(congress))

        # Create the meeting event.
        if options.get("debug", False):
            print("[senate][%s][%s] Found meeting in room %s at %s." %
                  (committee_code, subcommittee_code, room, occurs_at.isoformat()))

        meetings.append({
            "chamber": "senate",
            "congress": congress,
            "guid": guid,
            "committee": committee_code,
            "subcommittee": subcommittee_code,
            "occurs_at": occurs_at.isoformat(),
            "room": room,
            "topic": topic,
            "bill_ids": bills,
        })

    print("[senate] Found %i meetings." % len(meetings))
    return meetings
def run():
    """Match legislators to ICPSR IDs from voteview.com roll-call data.

    Loads the current and historical legislator YAML files, downloads the
    fixed-width Senate/House roll-call files for the --congress flag,
    matches on (last name prefix, state), writes matched ICPSR IDs back
    into the YAML, and logs mismatches to
    cache/errors/mismatch/mismatch_<congress>.csv.
    """
    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    only_bioguide = utils.flags().get('bioguide', None)
    congress = utils.flags().get('congress', None)

    data_files = []
    print("Loading %s..." % "legislators-current.yaml")
    legislators = load_data("legislators-current.yaml")
    data_files.append((legislators, "legislators-current.yaml"))
    print("Loading %s..." % "legislators-historical.yaml")
    legislators = load_data("legislators-historical.yaml")
    data_files.append((legislators, "legislators-historical.yaml"))

    # load roll call data. Will need to be updated (possibly) for 114th+
    # congresses, since it is unclear what the URL format will be
    if congress is None:  # `is None` rather than `== None`
        raise Exception("the --congress flag is required")
    elif congress == "113":
        url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
        url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
    elif 0 < int(congress) < 10:
        # single-digit congresses are zero-padded in the file name
        url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
    elif 10 <= int(congress) < 113:
        url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
    else:
        raise Exception("no data for congress " + congress)

    senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
    senate_data = utils.download(url_senate, senate_destination, force)

    house_destination = "icpsr/source/house_rollcall%s.txt" % congress
    house_data = utils.download(url_house, house_destination, force)

    # BUG FIX: Python 3's csv module requires a text-mode file; opening in
    # "wb" raised TypeError on writerow.  newline="" per the csv docs.
    error_log = csv.writer(
        open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w", newline=""))
    error_log.writerow([
        "error_type", "matches", "icpsr_name", "icpsr_state", "is_territory",
        "old_id", "new_id"
    ])

    # Territories whose delegates have no roll-call records (names truncated
    # to 7 chars to match the `state` slice below).
    territories = {"GUAM", "PUERTO", "VIRGIN", "DISTRIC", "AMERICA",
                   "NORTHER", "PHILIPP"}

    read_files = [(senate_data, "sen"), (house_data, "rep")]
    print("Running for congress " + congress)

    for read_file in read_files:
        for data_file in data_files:
            for legislator in data_file[0]:
                num_matches = 0

                # this can't run unless we've already collected a bioguide
                # for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over
                # everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue

                # if not in currently read chamber, skip
                chamber = legislator['terms'][-1]['type']
                if chamber != read_file[1]:
                    continue

                # only run for selected congress
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(
                        parse_date(legislator['terms'][-1]['start'])))
                if chamber == "sen":
                    # a six-year senate term spans three congresses
                    congresses = [latest_congress, latest_congress + 1,
                                  latest_congress + 2]
                else:
                    congresses = [latest_congress]
                if int(congress) not in congresses:
                    continue

                # pull data to match from yaml: upper-cased last name with
                # apostrophes removed and accents stripped
                last_name_unicode = legislator['name']['last'].upper().strip().replace('\'', '')
                # BUG FIX: decode back to str.  Under Python 3,
                # .encode('ascii', 'ignore') returns bytes, and
                # bytes[:8] == str[:8] is always False, so nothing matched.
                last_name = unicodedata.normalize(
                    'NFD', str(last_name_unicode)).encode('ascii', 'ignore').decode('ascii')
                state = utils.states[legislator['terms'][-1]['state']].upper()[:7].strip()

                # select icpsr source data based on more recent chamber
                write_id = ""
                for line in read_file[0].split('\n'):
                    # parse fixed-width source data
                    icpsr_state = line[12:20].strip()
                    icpsr_name = line[21:].strip().strip(string.digits).strip()
                    icpsr_id = line[3:8].strip()
                    # ensure unique match
                    if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
                        num_matches += 1
                        write_id = icpsr_id

                # skip if icpsr id is currently in data
                if "icpsr" in legislator["id"]:
                    # BUG FIX: the stored ICPSR id is an int while write_id
                    # is a str; compare like types so an unchanged id is
                    # skipped instead of being logged as "Incorrect_ID".
                    if write_id == str(legislator["id"]["icpsr"]) or write_id == "":
                        continue
                    else:
                        error_log.writerow([
                            "Incorrect_ID", "NA", last_name[:8], state, "NA",
                            legislator["id"]["icpsr"], write_id
                        ])
                        print("ID updated for %s" % last_name)

                if num_matches == 1:
                    legislator['id']['icpsr'] = int(write_id)
                elif state in territories:
                    # expected: territories have no roll-call data
                    error_log.writerow([
                        "Non_1_match_number", str(num_matches), last_name[:8],
                        state, "Y", "NA", "NA"
                    ])
                else:
                    print(str(num_matches) + " matches found for " +
                          last_name[:8] + ", " + state + " in congress " +
                          str(congress))
                    error_log.writerow([
                        "Non_1_match_number", str(num_matches), last_name,
                        state, "N", "NA", "NA"
                    ])
            save_data(data_file[0], data_file[1])
# NOTE(review): truncated legacy fragment -- the enclosing function header is
# not visible in this chunk, and the code uses the Python 2 `unicode`
# builtin.  Names `data_files`, `read_file`, `congress`, `only_bioguide`,
# `utils`, `unicodedata` and `parse_date` come from the (unseen) enclosing
# scope; presumably this is the interior of an older `run()` -- confirm
# against the full file before editing.
for data_file in data_files:
    for legislator in data_file[0]:
        num_matches = 0
        #
        # this can't run unless we've already collected a bioguide for this person
        bioguide = legislator["id"].get("bioguide", None)
        # if we've limited this to just one bioguide, skip over everyone else
        if only_bioguide and (bioguide != only_bioguide):
            continue
        # if not in currently read chamber, skip
        chamber = legislator["terms"][len(legislator["terms"]) - 1]["type"]
        if chamber != read_file[1]:
            continue
        # only run for selected congress
        latest_congress = utils.congress_from_legislative_year(
            utils.legislative_year(parse_date(legislator["terms"][len(legislator["terms"]) - 1]["start"]))
        )
        if chamber == "sen":
            # a six-year senate term spans three congresses
            congresses = [latest_congress, latest_congress + 1, latest_congress + 2]
        else:
            congresses = [latest_congress]
        if int(congress) not in congresses:
            continue
        # pull data to match from yaml: upper-cased last name, apostrophes
        # removed, accents stripped via NFD decomposition + ascii-ignore
        last_name_unicode = legislator["name"]["last"].upper().strip().replace("'", "")
        last_name = unicodedata.normalize("NFD", unicode(last_name_unicode)).encode("ascii", "ignore")
        state = utils.states[legislator["terms"][len(legislator["terms"]) - 1]["state"]].upper()[:7].strip()
        # select icpsr source data based on more recent chamber
def run():
    """Assign ICPSR IDs to legislators by matching bioguide IDs against the
    Voteview members CSV for the requested congress.

    Loads the current and historical legislator YAML files, downloads the
    Senate and House `*_members.csv` files, matches each legislator by
    bioguide ID (CSV column 10), writes the ICPSR ID (column 2) back into
    the YAML, and logs mismatches to
    cache/errors/mismatch/mismatch_<congress>.csv.
    """
    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    only_bioguide = utils.flags().get('bioguide', None)
    congress = utils.flags().get('congress', None)

    data_files = []
    print("Loading %s..." % "legislators-current.yaml")
    legislators = load_data("legislators-current.yaml")
    data_files.append((legislators, "legislators-current.yaml"))
    print("Loading %s..." % "legislators-historical.yaml")
    legislators = load_data("legislators-historical.yaml")
    data_files.append((legislators, "legislators-historical.yaml"))

    # load member data from vote view; file names zero-pad the congress
    # number to three digits (S007..., S042..., S113...)
    if congress is None:  # `is None` rather than `== None`
        raise Exception("the --congress flag is required")
    elif 0 < int(congress) < 10:
        url_senate = "https://voteview.com/static/data/out/members/S00%s_members.csv" % congress
        url_house = "https://voteview.com/static/data/out/members/H00%s_members.csv" % congress
    elif 10 <= int(congress) < 100:
        url_senate = "https://voteview.com/static/data/out/members/S0%s_members.csv" % congress
        url_house = "https://voteview.com/static/data/out/members/H0%s_members.csv" % congress
    elif int(congress) >= 100:
        url_senate = "https://voteview.com/static/data/out/members/S%s_members.csv" % congress
        url_house = "https://voteview.com/static/data/out/members/H%s_members.csv" % congress
    else:
        raise Exception("no data for congress " + congress)

    senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
    senate_data = utils.download(url_senate, senate_destination, force)

    house_destination = "icpsr/source/house_rollcall%s.txt" % congress
    house_data = utils.download(url_house, house_destination, force)

    # newline="" is the documented way to hand a file to csv.writer in
    # Python 3 (prevents spurious blank rows on Windows).
    error_log = csv.writer(
        open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w", newline=""))
    error_log.writerow([
        "error_type", "matches", "icpsr_name", "icpsr_state", "is_territory",
        "old_id", "new_id"
    ])

    # Territories whose delegates have no Voteview roll-call records
    # (names truncated to 7 chars to match the `state` slice below).
    territories = {"GUAM", "PUERTO", "VIRGIN", "DISTRIC", "AMERICA",
                   "NORTHER", "PHILIPP"}

    read_files = [("sen", senate_data), ("rep", house_data)]
    print("Running for congress " + congress)

    for read_file_chamber, read_file_content in read_files:
        # PERF FIX: parse the member CSV once per chamber.  The original
        # rebuilt a StringIO and re-parsed the whole file for every single
        # legislator.
        members = list(csv.reader(StringIO(read_file_content), delimiter=','))

        for data_file in data_files:
            for legislator in data_file[0]:
                num_matches = 0
                write_id = ""

                # this can't run unless we've already collected a bioguide
                # for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over
                # everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue

                # if not in currently read chamber, skip
                chamber = legislator['terms'][-1]['type']
                if chamber != read_file_chamber:
                    continue

                # only run for selected congress
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(
                        parse_date(legislator['terms'][-1]['start'])))
                if chamber == "sen":
                    # a six-year senate term spans three congresses
                    congresses = [latest_congress, latest_congress + 1,
                                  latest_congress + 2]
                else:
                    congresses = [latest_congress]
                if int(congress) not in congresses:
                    continue

                # pull data to match from yaml
                last_name = legislator['name']['last'].upper()
                state = utils.states[legislator['terms'][-1]['state']].upper()[:7].strip()

                # loop through congress members in read file, see if one
                # matches the current legislator
                for icpsr_member in members:
                    # ensure unique match based on bioguide id (column 10)
                    if bioguide == icpsr_member[10]:
                        num_matches += 1
                        write_id = int(icpsr_member[2])  # icpsr id column

                # skip if icpsr id is currently in data
                if "icpsr" in legislator["id"]:
                    if write_id == legislator["id"]["icpsr"] or write_id == "":
                        continue
                    else:
                        error_log.writerow([
                            "Incorrect_ID", "NA", last_name[:8], state, "NA",
                            legislator["id"]["icpsr"], write_id
                        ])
                        print("ID updated for %s" % last_name)

                if num_matches == 1:
                    legislator['id']['icpsr'] = int(write_id)
                elif state in territories:
                    # expected: territories have no roll-call data
                    print('error: non 1 match')
                    error_log.writerow([
                        "Non_1_match_number", str(num_matches), last_name[:8],
                        state, "Y", "NA", "NA"
                    ])
                else:
                    print(str(num_matches) + " matches found for " +
                          last_name[:8] + ", " + state + " in congress " +
                          str(congress))
                    error_log.writerow([
                        "Non_1_match_number", str(num_matches), last_name,
                        state, "N", "NA", "NA"
                    ])
            save_data(data_file[0], data_file[1])
def fetch_senate_committee_meetings(existing_meetings, committees, options):
    # Parse the Senate committee meeting XML feed for meetings.
    # To aid users of the data, attempt to assign GUIDs to meetings.
    #
    # NOTE(review): legacy Python 2 code (uses the `unicode` builtin and
    # `print` statements); it will not run under Python 3.
    #
    # Parameters:
    #   existing_meetings -- previously scraped meeting dicts, scanned so a
    #       recurring meeting keeps the same "guid" across runs
    #   committees -- mapping of committee code -> committee record with a
    #       "subcommittees" mapping; used to validate scraped codes
    #   options -- download options dict; cloned so the caller's copy is
    #       not mutated
    # Returns: a list of meeting dicts with keys chamber, congress, guid,
    #   committee, subcommittee, occurs_at (ISO 8601), room, topic, bills.
    options = dict(options)  # clone
    options["binary"] = True
    meetings = []
    dom = lxml.etree.fromstring(utils.download(
        "http://www.senate.gov/general/committee_schedules/hearings.xml",
        "committee_schedule/senate.xml", options))
    for node in dom.xpath("meeting"):
        committee_id = unicode(node.xpath('string(cmte_code)'))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder
        occurs_at = unicode(node.xpath('string(date)'))
        room = unicode(node.xpath('string(room)'))
        topic = unicode(node.xpath('string(matter)'))
        # feed date format, e.g. "26-Feb-2014 10:00 AM"
        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        # collapse runs of whitespace to single spaces
        topic = re.sub(r"\s+", " ", topic).strip()
        # Validate committee code: letters followed by a two-digit
        # subcommittee number, e.g. "SSAP01"; "00" means the full committee.
        try:
            committee_code, subcommittee_code = re.match(r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees: raise ValueError(committee_code)
            if subcommittee_code == "00": subcommittee_code = None
            if subcommittee_code and subcommittee_code not in committees[committee_code]["subcommittees"]: raise ValueError(subcommittee_code)
        except:
            # NOTE(review): bare except also catches the AttributeError from
            # a failed re.match (None.groups()), not just the ValueErrors.
            print "Invalid committee code", committee_id
            continue
        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same committee/subcommittee and
        # at the same time.
        for mtg in existing_meetings:
            if mtg["committee"] == committee_code and mtg.get("subcommittee", None) == subcommittee_code and mtg["occurs_at"] == occurs_at.isoformat():
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            guid = unicode(uuid.uuid4())
        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(utils.current_legislative_year(occurs_at))
        bills = []
        bill_number_re = re.compile(r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            # e.g. ("HR", "123") -> "hr123-113"
            bills.append(bill_match[0].lower() + bill_match[1] + "-" + str(congress))
        # Create the meeting event.
        meetings.append({
            "chamber": "senate",
            "congress": congress,
            "guid": guid,
            "committee": committee_code,
            "subcommittee": subcommittee_code,
            "occurs_at": occurs_at.isoformat(),
            "room": room,
            "topic": topic,
            "bills": bills,
        })
    return meetings
def fetch_senate_committee_meetings(existing_meetings, committees, options):
    # Parse the Senate committee meeting XML feed for meetings.
    # To aid users of the data, attempt to assign GUIDs to meetings.
    #
    # NOTE(review): Python 2 variant (print statements, `unicode` builtin);
    # near-duplicate of another version of this function in this file.
    # Takes previously scraped meetings so stable GUIDs can be reused, the
    # committee lookup table for validation, and a download-options dict;
    # returns a list of meeting dicts.
    options = dict(options)  # clone -- leave the caller's dict untouched
    options["binary"] = True
    meetings = []
    # Download (possibly cached) and parse the Senate hearings feed.
    dom = lxml.etree.fromstring(
        utils.download(
            "http://www.senate.gov/general/committee_schedules/hearings.xml",
            "committee_schedule/senate.xml", options))
    for node in dom.xpath("meeting"):
        committee_id = unicode(node.xpath('string(cmte_code)'))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder
        occurs_at = unicode(node.xpath('string(date)'))
        room = unicode(node.xpath('string(room)'))
        topic = unicode(node.xpath('string(matter)'))
        # feed date format, e.g. "26-Feb-2014 10:00 AM"
        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        topic = re.sub(r"\s+", " ", topic).strip()  # normalize whitespace
        # Validate committee code.
        try:
            committee_code, subcommittee_code = re.match(
                r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees:
                raise ValueError(committee_code)
            if subcommittee_code == "00":
                # "00" designates the full committee, not a subcommittee
                subcommittee_code = None
            if subcommittee_code and subcommittee_code not in committees[
                    committee_code]["subcommittees"]:
                raise ValueError(subcommittee_code)
        except:
            # NOTE(review): bare except also intercepts the AttributeError
            # raised when re.match returns None.
            print "Invalid committee code", committee_id
            continue
        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same committee/subcommittee and
        # at the same time.
        for mtg in existing_meetings:
            if mtg["committee"] == committee_code and mtg.get(
                    "subcommittee", None) == subcommittee_code and mtg[
                    "occurs_at"] == occurs_at.isoformat():
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            guid = unicode(uuid.uuid4())
        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(
            utils.current_legislative_year(occurs_at))
        bills = []
        bill_number_re = re.compile(
            r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            # e.g. ("HR", "123") -> "hr123-113"
            bills.append(bill_match[0].lower() + bill_match[1] + "-" + str(congress))
        # Create the meeting event.
        meetings.append({
            "chamber": "senate",
            "congress": congress,
            "guid": guid,
            "committee": committee_code,
            "subcommittee": subcommittee_code,
            "occurs_at": occurs_at.isoformat(),
            "room": room,
            "topic": topic,
            "bills": bills,
        })
    return meetings
def fetch_senate_committee_meetings(committees, options):
    # Scrape the Senate committee hearing XML feed into meeting dicts,
    # recycling GUIDs from the previously written output file so a meeting
    # keeps a stable identifier across runs.
    #
    # NOTE(review): legacy Python 2 code (print statements, `unicode`
    # builtin).  Unlike the sibling variants in this file, this one loads
    # the existing meetings itself rather than taking them as a parameter,
    # and emits a "bill_ids" key instead of "bills".
    #
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("senate")
    if os.path.exists(output_file):
        # NOTE(review): file handle is never closed explicitly.
        existing_meetings = json.load(open(output_file))
    options = dict(options)  # clone -- don't mutate the caller's dict
    options["binary"] = True
    # options["force"] = True
    meetings = []
    dom = lxml.etree.fromstring(
        utils.download(
            "http://www.senate.gov/general/committee_schedules/hearings.xml",
            "committee_schedule/senate.xml",
            options
        )
    )
    for node in dom.xpath("meeting"):
        committee_id = unicode(node.xpath("string(cmte_code)"))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder
        occurs_at = unicode(node.xpath("string(date)"))
        room = unicode(node.xpath("string(room)"))
        topic = unicode(node.xpath("string(matter)"))
        # feed date format, e.g. "26-Feb-2014 10:00 AM"
        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        topic = re.sub(r"\s+", " ", topic).strip()  # normalize whitespace
        # Validate committee code: letters then digits, e.g. "SSAP01";
        # "00" designates the full committee rather than a subcommittee.
        try:
            committee_code, subcommittee_code = re.match(r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees:
                raise ValueError(committee_code)
            if subcommittee_code == "00":
                subcommittee_code = None
            if subcommittee_code and subcommittee_code not in committees[committee_code]["subcommittees"]:
                raise ValueError(subcommittee_code)
        except:
            # NOTE(review): bare except also catches the AttributeError from
            # a failed re.match (None.groups()).
            print "Invalid committee code", committee_id
            continue
        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same committee/subcommittee and
        # at the same time.
        for mtg in existing_meetings:
            if (
                mtg["committee"] == committee_code
                and mtg.get("subcommittee", None) == subcommittee_code
                and mtg["occurs_at"] == occurs_at.isoformat()
            ):
                if options.get("debug", False):
                    # NOTE(review): "gUID" is a typo in the debug message.
                    print "[%s] Reusing gUID." % mtg["guid"]
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            # TODO: Can we make this a human-readable ID?
            guid = unicode(uuid.uuid4())
        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(utils.current_legislative_year(occurs_at))
        bills = []
        bill_number_re = re.compile(r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            # e.g. ("HR", "123") -> "hr123-113"
            bills.append(bill_match[0].lower() + bill_match[1] + "-" + str(congress))
        # Create the meeting event.
        if options.get("debug", False):
            print "[senate][%s][%s] Found meeting in room %s at %s." % (
                committee_code,
                subcommittee_code,
                room,
                occurs_at.isoformat(),
            )
        meetings.append(
            {
                "chamber": "senate",
                "congress": congress,
                "guid": guid,
                "committee": committee_code,
                "subcommittee": subcommittee_code,
                "occurs_at": occurs_at.isoformat(),
                "room": room,
                "topic": topic,
                "bill_ids": bills,
            }
        )
    print "[senate] Found %i meetings." % len(meetings)
    return meetings
def run():
    """Match legislators to ICPSR IDs from voteview.com roll-call data.

    Duplicate (older formatting) of the .ord-based matcher in this file:
    loads the legislator YAML files, downloads the fixed-width roll-call
    files for --congress, matches on (last name prefix, state), writes the
    ICPSR IDs back, and logs mismatches to a CSV.
    """
    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    only_bioguide = utils.flags().get('bioguide', None)
    congress = utils.flags().get('congress', None)

    data_files = []
    print("Loading %s..." % "legislators-current.yaml")
    legislators = load_data("legislators-current.yaml")
    data_files.append((legislators, "legislators-current.yaml"))
    print("Loading %s..." % "legislators-historical.yaml")
    legislators = load_data("legislators-historical.yaml")
    data_files.append((legislators, "legislators-historical.yaml"))

    # load roll call data. Will need to be updated (possibly) for 114th+
    # congresses, since it is unclear what the URL format will be
    if congress is None:  # `is None` rather than `== None`
        raise Exception("the --congress flag is required")
    elif congress == "113":
        url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
        url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
    elif 0 < int(congress) < 10:
        # single-digit congresses are zero-padded in the file name
        url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
    elif 10 <= int(congress) < 113:
        url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
    else:
        raise Exception("no data for congress " + congress)

    senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
    senate_data = utils.download(url_senate, senate_destination, force)

    house_destination = "icpsr/source/house_rollcall%s.txt" % congress
    house_data = utils.download(url_house, house_destination, force)

    # BUG FIX: Python 3's csv module requires a text-mode file; opening in
    # "wb" raised TypeError on writerow.  newline="" per the csv docs.
    error_log = csv.writer(
        open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w", newline=""))
    error_log.writerow([
        "error_type", "matches", "icpsr_name", "icpsr_state", "is_territory",
        "old_id", "new_id"
    ])

    # Territories whose delegates have no roll-call records (names truncated
    # to 7 chars to match the `state` slice below).
    territories = {"GUAM", "PUERTO", "VIRGIN", "DISTRIC", "AMERICA",
                   "NORTHER", "PHILIPP"}

    read_files = [(senate_data, "sen"), (house_data, "rep")]
    print("Running for congress " + congress)

    for read_file in read_files:
        for data_file in data_files:
            for legislator in data_file[0]:
                num_matches = 0

                # this can't run unless we've already collected a bioguide
                # for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over
                # everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue

                # if not in currently read chamber, skip
                chamber = legislator['terms'][-1]['type']
                if chamber != read_file[1]:
                    continue

                # only run for selected congress
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(
                        parse_date(legislator['terms'][-1]['start'])))
                if chamber == "sen":
                    # a six-year senate term spans three congresses
                    congresses = [latest_congress, latest_congress + 1,
                                  latest_congress + 2]
                else:
                    congresses = [latest_congress]
                if int(congress) not in congresses:
                    continue

                # pull data to match from yaml: upper-cased last name with
                # apostrophes removed and accents stripped
                last_name_unicode = legislator['name']['last'].upper().strip().replace('\'', '')
                # BUG FIX: decode back to str.  Under Python 3,
                # .encode('ascii', 'ignore') returns bytes, and
                # bytes[:8] == str[:8] is always False, so nothing matched.
                last_name = unicodedata.normalize(
                    'NFD', str(last_name_unicode)).encode('ascii', 'ignore').decode('ascii')
                state = utils.states[legislator['terms'][-1]['state']].upper()[:7].strip()

                # select icpsr source data based on more recent chamber
                write_id = ""
                for line in read_file[0].split('\n'):
                    # parse fixed-width source data
                    icpsr_state = line[12:20].strip()
                    icpsr_name = line[21:].strip().strip(string.digits).strip()
                    icpsr_id = line[3:8].strip()
                    # ensure unique match
                    if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
                        num_matches += 1
                        write_id = icpsr_id

                # skip if icpsr id is currently in data
                if "icpsr" in legislator["id"]:
                    # BUG FIX: the stored ICPSR id is an int while write_id
                    # is a str; compare like types so an unchanged id is
                    # skipped instead of being logged as "Incorrect_ID".
                    if write_id == str(legislator["id"]["icpsr"]) or write_id == "":
                        continue
                    else:
                        error_log.writerow([
                            "Incorrect_ID", "NA", last_name[:8], state, "NA",
                            legislator["id"]["icpsr"], write_id
                        ])
                        print("ID updated for %s" % last_name)

                if num_matches == 1:
                    legislator['id']['icpsr'] = int(write_id)
                elif state in territories:
                    # expected: territories have no roll-call data
                    error_log.writerow([
                        "Non_1_match_number", str(num_matches), last_name[:8],
                        state, "Y", "NA", "NA"
                    ])
                else:
                    print(str(num_matches) + " matches found for " +
                          last_name[:8] + ", " + state + " in congress " +
                          str(congress))
                    error_log.writerow([
                        "Non_1_match_number", str(num_matches), last_name,
                        state, "N", "NA", "NA"
                    ])
            save_data(data_file[0], data_file[1])