def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
        possibles = [bioguide]
    else:
        possibles = current_bioguide.keys()

    for bioguide in possibles:
        if media_bioguide.get(bioguide, None) is None:
            to_check.append(bioguide)
        elif media_bioguide[bioguide]["social"].get(service, None) is None:
            to_check.append(bioguide)
        else:
            pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate"])

    for bioguide in to_check:
        candidate = candidate_for(bioguide)
        if candidate:
            url = current_bioguide[bioguide]["terms"][-1].get("url", None)
            writer.writerow([bioguide, current_bioguide[bioguide]['name']['official_full'], url, service, candidate])
            print("\tWrote: %s" % candidate)
def resolveig():
    # in order to preserve the comment block at the top of the file,
    # copy it over into a new RtYamlList instance. We do this because
    # Python list instances can't hold other random attributes.
    import rtyaml
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
        updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/instagram_client_id', 'r')
    client_id = client_id_file.read()

    bioguide = utils.flags().get('bioguide', None)

    for m in media:
        if bioguide and (m['id']['bioguide'] != bioguide):
            updated_media.append(m)
            continue

        social = m['social']
        if 'instagram' not in social and 'instagram_id' not in social:
            updated_media.append(m)
            continue

        instagram_handle = social['instagram']
        query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(query=instagram_handle, client_id=client_id)
        instagram_user_search = requests.get(query_url).json()
        for user in instagram_user_search['data']:
            time.sleep(0.5)
            if user['username'] == instagram_handle:
                m['social']['instagram_id'] = int(user['id'])
                print("matched instagram_id {instagram_id} to {instagram_handle}".format(instagram_id=social['instagram_id'], instagram_handle=instagram_handle))
        updated_media.append(m)

    save_data(updated_media, "legislators-social-media.yaml")
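# Why the RtYamlList wrapper above is needed, as a minimal standalone sketch
# (assumes the rtyaml package is installed; the attribute name comes from the
# function above). Plain list objects have no __dict__, so they reject ad-hoc
# attributes; the list subclass from rtyaml accepts them, which lets the
# leading comment block survive a round trip through save_data().
import rtyaml

plain = []
try:
    plain.__initial_comment_block = "# leading comment"
except AttributeError:
    pass  # built-in lists cannot carry extra attributes

wrapped = rtyaml.RtYamlList()
wrapped.__initial_comment_block = "# leading comment"  # works on the subclass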
def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
        to_check = [bioguide]
    else:
        to_check = media_bioguide.keys()

    for bioguide in to_check:
        entry = media_bioguide[bioguide]
        current = entry['social'].get(service, None)
        if not current:
            continue

        bioguide = entry['id']['bioguide']

        candidate = candidate_for(bioguide)
        if not candidate:
            # if current is in whitelist, and none is on the page, that's okay
            if current.lower() in whitelist[service]:
                continue
            else:
                candidate = ""

        url = current_bioguide[bioguide]['terms'][-1].get('url')
        if current.lower() != candidate.lower():
            print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate))
def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
        possibles = [bioguide]
    else:
        possibles = current_bioguide.keys()

    for bioguide in possibles:
        if media_bioguide.get(bioguide, None) is None:
            to_check.append(bioguide)
        elif media_bioguide[bioguide]["social"].get(service, None) is None:
            to_check.append(bioguide)
        else:
            pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"])

    if len(to_check) > 0:
        email_body = "Social media leads found:\n\n"

        for bioguide in to_check:
            candidate = candidate_for(bioguide)
            if candidate:
                url = current_bioguide[bioguide]["terms"][-1].get("url", None)
                candidate_url = "https://%s.com/%s" % (service, candidate)
                row = [bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url]
                writer.writerow(row)
                print("\tWrote: %s" % candidate)
                email_body += ("%s\n" % row)

        if email_enabled:
            utils.send_email(email_body)
def run():
    options = utils.flags()
    debug = options.get('debug', False)

    filename = "legislators-current.yaml"
    args = utils.args()
    legislators = load_data(filename)

    if len(args) != 0:
        bioguides = args
        print("Fetching contact forms for %s..." % ', '.join(bioguides))
    else:
        bioguides = [member['id']['bioguide'] for member in legislators]
        print("Fetching contact forms for all current members...")

    for legislator in legislators:
        bioguide = legislator['id']['bioguide']
        if bioguide not in bioguides:
            continue
        if bioguide in SKIP_BIOGUIDES:
            continue

        if debug:
            print("Downloading form for %s" % bioguide, flush=True)

        try:
            steps = contact_steps_for(bioguide)
        except LegislatorNotFoundError as e:
            if debug:
                print("skipping, %s..." % e, flush=True)
            continue

        legislator['terms'][-1]['contact_form'] = steps['contact_form']['steps'][0]['visit']

    print("Saving data to %s..." % filename)
    save_data(legislators, filename)
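# A hedged sketch of the shape run() expects back from contact_steps_for(),
# inferred only from the keys accessed above; the URL is a placeholder, not a
# real contact form.
example_steps = {
    "contact_form": {
        "steps": [
            {"visit": "https://example.house.gov/contact"},
        ]
    }
}
contact_form_url = example_steps["contact_form"]["steps"][0]["visit"]
print(contact_form_url)  # https://example.house.gov/contact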
def run():
    # pick either current or historical
    # order is important here, since current defaults to true
    if utils.flags().get('historical', False):
        filename = "legislators-historical.yaml"
    elif utils.flags().get('current', True):
        filename = "legislators-current.yaml"
    else:
        print("No legislators selected.")
        exit(0)

    print("Loading %s..." % filename)
    legislators = load_data(filename)

    # reorient cache to access by bioguide ID
    by_bioguide = {}
    for m in legislators:
        if "bioguide" in m["id"]:
            by_bioguide[m["id"]["bioguide"]] = m

    count = 0

    for id in range(8245, 21131):
        print(id)

        url = "http://history.house.gov/People/Detail/%s" % id
        r = requests.get(url, allow_redirects=False)
        if r.status_code == 200:
            dom = lxml.html.parse(io.StringIO(r.text)).getroot()
            try:
                bioguide_link = dom.cssselect("a.view-in-bioguide")[0].get('href')
                bioguide_id = bioguide_link.split('=')[1]
                by_bioguide[bioguide_id]["id"]["house_history"] = id
                count = count + 1
            except:
                continue
        else:
            continue

    print("Saving data to %s..." % filename)
    save_data(legislators, filename)
    print("Saved %d legislators to %s" % (count, filename))
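# Hedged illustration of the Bioguide-ID extraction above; the href format is
# an assumption based on the split('=') call (everything after the first "="
# is kept).
example_href = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=A000000"
example_bioguide_id = example_href.split('=')[1]
print(example_bioguide_id)  # A000000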
def run():
    house_labels = "labels-113.csv"

    names = utils.flags().get('names', False)

    y = load_data("legislators-current.yaml")
    by_district = {}
    for m in y:
        last_term = m['terms'][-1]
        if last_term['type'] != 'sen':
            full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
            by_district[full_district] = m

    for rec in csv.DictReader(open(house_labels)):
        full_district = rec['113 ST/DIS']

        # empty seat - IL-02
        if full_district not in by_district:
            if full_district == "IL02":
                continue
            else:
                raise Exception("No matching current member for district %s" % full_district)

        rec["MIDDLE"] = rec["MIDDLE"].strip()
        rec["NICK"] = None
        m = re.match('^(.*) \u201c(.*)\u201d$', rec["MIDDLE"])
        if m:
            rec["MIDDLE"] = m.group(1)
            rec["NICK"] = m.group(2)

        by_district[full_district]['terms'][-1]['office'] = rec["ADDRESS"].strip()

        # only set name fields if we've been asked to (as a stopgap)
        if names:
            by_district[full_district]["name"]["first"] = rec["FIRST"].strip()
            if rec["MIDDLE"]:
                by_district[full_district]["name"]["middle"] = rec["MIDDLE"]
            if rec["NICK"]:
                by_district[full_district]["name"]["nickname"] = rec["NICK"]
            by_district[full_district]["name"]["last"] = rec["LAST"].strip()

        if rec["BIOGUIDE ID"] == "G000574":
            # The Clerk has the wrong ID for Alan Grayson!
            rec["BIOGUIDE ID"] = "G000556"

        by_district[full_district]["id"]["bioguide"] = rec["BIOGUIDE ID"]

        print("[%s] Saved" % full_district)

    save_data(y, "legislators-current.yaml")
def main(): regexes = { "youtube": [ "(?:https?:)?//(?:www\\.)?youtube.com/embed/?\?(list=[^\\s\"/\\?#&']+)", "(?:https?:)?//(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)", "(?:https?:)?//(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)" ], "facebook": [ "\\('facebook.com/([^']+)'\\)", "(?:https?:)?//(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)", "(?:https?:)?//(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)" ], "twitter": [ "(?:https?:)?//(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/?]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ], "instagram": ["instagram.com/(\w{3,})"] } email_enabled = utils.flags().get('email', False) debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) do_resolveyt = utils.flags().get('resolveyt', False) do_resolveig = utils.flags().get('resolveig', False) do_resolvetw = utils.flags().get('resolvetw', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache if do_resolveyt: service = "youtube" elif do_resolveig: service = "instagram" elif do_resolvetw: service = "twitter" else: service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook", "instagram"]: print( "--service must be one of twitter, youtube, facebook, or instagram" ) exit(0) # load in members, orient by bioguide ID print("Loading current legislators...") current = load_data("legislators-current.yaml") current_bioguide = {} for m in current: if "bioguide" in m["id"]: current_bioguide[m["id"]["bioguide"]] = m print("Loading blacklist...") blacklist = {'twitter': [], 'facebook': [], 'youtube': [], 'instagram': []} for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print("Loading whitelist...") whitelist = {'twitter': [], 'facebook': [], 'youtube': []} for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print("Loading social media...") media = load_data("legislators-social-media.yaml") media_bioguide = {} for m in media: media_bioguide[m["id"]["bioguide"]] = m def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key', 'r') api_key = api_file.read() bioguide = utils.flags().get('bioguide', None) updated_media = [] for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if ('youtube' in social) or ('youtube_id' in social): if 'youtube' not in social: social['youtube'] = social['youtube_id'] ytid = social['youtube'] profile_url = ("https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: print("Resolving YT info for %s" % social['youtube']) ytreq = requests.get(profile_url) # print "\tFetched with status code %i..." % ytreq.status_code if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. 
try: # Try to scrape the real YouTube username print("\Scraping YouTube username") search_url = ("https://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode( 'ascii', 'ignore') u = re.search( r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>', csearch) if u: print("\t%s maps to %s" % (social['youtube'], u.group(1))) social['youtube'] = u.group(1) profile_url = ( "https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) print("\tFetching GData profile...") ytreq = requests.get(profile_url) print("\tFetched GData profile") else: raise Exception( "Couldn't figure out the username format for %s" % social['youtube']) except: print("\tCouldn't locate YouTube account") raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] print("\tResolved youtube_id to %s" % social['youtube_id']) # even though we have their channel ID, do they also have a username? if ytobj['entry']['yt$username']['$t'] != ytobj['entry'][ 'yt$userId']['$t']: if social['youtube'].lower( ) != ytobj['entry']['yt$username']['$t'].lower(): # YT accounts are case-insensitive. Preserve capitalization if possible. social['youtube'] = ytobj['entry']['yt$username'][ '$t'] print("\tAdded YouTube username of %s" % social['youtube']) else: print( "\tYouTube says they do not have a separate username" ) del social['youtube'] except: print("Unable to get YouTube Channel ID for: %s" % social['youtube']) updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def resolveig(): # in order to preserve the comment block at the top of the file, # copy it over into a new RtYamlList instance. We do this because # Python list instances can't hold other random attributes. import rtyaml updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr( media, '__initial_comment_block') client_id_file = open('cache/instagram_client_id', 'r') client_id = client_id_file.read() bioguide = utils.flags().get('bioguide', None) for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if 'instagram' not in social and 'instagram_id' not in social: updated_media.append(m) continue instagram_handle = social['instagram'] query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format( query=instagram_handle, client_id=client_id) instagram_user_search = requests.get(query_url).json() for user in instagram_user_search['data']: time.sleep(0.5) if user['username'] == instagram_handle: m['social']['instagram_id'] = int(user['id']) print( "matched instagram_id {instagram_id} to {instagram_handle}" .format(instagram_id=social['instagram_id'], instagram_handle=instagram_handle)) updated_media.append(m) save_data(updated_media, "legislators-social-media.yaml") def resolvetw(): """ Does two batch lookups: 1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name as found in the entry's `twitter`. If not, the `twitter` value is updated. 2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and inserts ID. If no profile is found, the `twitter` value is deleted. 
Note: cache/twitter_client_id must be a formatted JSON dict: { "consumer_secret": "xyz", "access_token": "abc", "access_token_secret": "def", "consumer_key": "jk" } """ import rtyaml from social.twitter import get_api, fetch_profiles updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr( media, '__initial_comment_block') client_id_file = open('cache/twitter_client_id', 'r') _c = json.load(client_id_file) api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret']) bioguide = utils.flags().get('bioguide', None) lookups = { 'screen_names': [], 'ids': [] } # store members that have `twitter` or `twitter_id` info for m in media: # we start with appending to updated_media so that we keep the same order of entries # as found in the loaded file updated_media.append(m) if bioguide and (m['id']['bioguide'] != bioguide): continue social = m['social'] # now we add entries to either the `ids` or the `screen_names` list to batch lookup if 'twitter_id' in social: # add to the queue to be batched-looked-up lookups['ids'].append(m) # append elif 'twitter' in social: lookups['screen_names'].append(m) ####################################### # perform Twitter batch lookup for ids: if lookups['screen_names']: arr = lookups['screen_names'] print("Looking up Twitter ids for", len(arr), "names.") tw_names = [m['social']['twitter'] for m in arr] tw_profiles = fetch_profiles(api, screen_names=tw_names) for m in arr: social = m['social'] # find profile that corresponds to a given screen_name twitter_handle = social['twitter'] twp = next( (p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None) if twp: m['social']['twitter_id'] = int(twp['id']) print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle)) else: # Remove errant Twitter entry for now print("No Twitter user profile for:", twitter_handle) m['social'].pop('twitter') print("\t ! removing Twitter handle:", twitter_handle) ########################################## # perform Twitter batch lookup for names by id, to update any renamings: if lookups['ids']: arr = lookups['ids'] print("Looking up Twitter screen_names for", len(arr), "ids.") tw_ids = [m['social']['twitter_id'] for m in arr] tw_profiles = fetch_profiles(api, ids=tw_ids) any_renames_needed = False for m in arr: social = m['social'] # find profile that corresponds to a given screen_name t_id = social['twitter_id'] t_name = social.get('twitter') twp = next((p for p in tw_profiles if int(p['id']) == t_id), None) if twp: # Be silent if there is no change to screen name if t_name and (twp['screen_name'].lower() == t_name.lower()): pass else: any_renames_needed = True m['social']['twitter'] = twp['screen_name'] print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter'])) else: # No entry found for this twitter id print("No Twitter user profile for %s, %s" % (t_id, t_name)) m['social'].pop('twitter_id') print("\t ! 
removing Twitter id:", t_id) if not any_renames_needed: print("No renames needed") # all done with Twitter save_data(updated_media, "legislators-social-media.yaml") def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = list(current_bioguide.keys()) for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \ (media_bioguide[bioguide]["social"].get(service + "_id", None) is None): to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer( open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow([ "bioguide", "official_full", "website", "service", "candidate", "candidate_url" ]) if len(to_check) > 0: rows_found = [] for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get( "url", None) candidate_url = "https://%s.com/%s" % (service, candidate) row = [ bioguide, current_bioguide[bioguide]['name'] ['official_full'].encode('utf-8'), url, service, candidate, candidate_url ] writer.writerow(row) print("\tWrote: %s" % candidate) rows_found.append(row) if email_enabled and len(rows_found) > 0: email_body = "Social media leads found:\n\n" for row in rows_found: email_body += ("%s\n" % row) utils.send_email(email_body) def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = list(media_bioguide.keys()) for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide, current) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)) def update(): for rec in csv.DictReader( open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if bioguide in media_bioguide: media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = current_bioguide[bioguide]['id'].get( "thomas", None) govtrack_id = current_bioguide[bioguide]['id'].get( "govtrack", None) if thomas_id: new_media['id']['thomas'] = thomas_id if govtrack_id: new_media['id']['govtrack'] = govtrack_id new_media['social'][service] = candidate media.append(new_media) print("Saving social media...") save_data(media, "legislators-social-media.yaml") # if it's a youtube update, always do the resolve # if service == "youtube": # resolveyt() def clean(): print("Loading historical legislators...") historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if m["id"]["bioguide"] in media_bioguide: media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print( "Removed %i out of office legislators from social media file..." % count) print("Saving historical legislators...") save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide, current=None): """find the most likely candidate account from the URL. 
If current is passed, the candidate will match it if found otherwise, the first candidate match is returned """ url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print("[%s] No official website, skipping" % bioguide) return None if debug: print("[%s] Downloading..." % bioguide) cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force, {'check_redirects': True}) if not body: return None all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if not current == None and current in all_matches: return current if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print("\tBlacklisted: %s" % candidate) continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() elif do_resolveyt: resolveyt() elif do_resolveig: resolveig() elif do_resolvetw: resolvetw() else: sweep()
def run():
    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    states = []
    current = load_data("legislators-current.yaml")
    by_district = {}
    for m in current:
        last_term = m['terms'][-1]
        if last_term['type'] != 'sen':
            state = last_term['state']

            full_district = "%s%02d" % (state, int(last_term['district']))
            by_district[full_district] = m

            if not state in states:
                # house lists AS (American Samoa) as AQ, awesome
                if state == "AS":
                    state = "AQ"
                states.append(state)

    destination = "legislators/house.html"
    url = "https://www.house.gov/representatives/"
    body = utils.download(url, destination, force)
    if not body:
        print("Couldn't download House listing!")
        exit(0)

    try:
        dom = lxml.html.parse(io.StringIO(body)).getroot()
    except lxml.etree.XMLSyntaxError:
        print("Error parsing House listing!")
        exit(0)

    # process:
    #   go through every state in our records, fetching that state's table
    #   go through every row after the first, pick the district to isolate the member
    #   pluck out the URL, update that member's last term's URL
    count = 0
    for state in states:
        rows = dom.cssselect("h2#state_%s+table tr" % state.lower())

        for row in rows:
            cells = row.cssselect("td")
            if not cells:
                continue

            district = str(cells[0].text_content())
            if district == "At Large":
                district = 0

            url = cells[1].cssselect("a")[0].get("href")

            # The House uses subdomains now, and occasionally the directory
            # uses URLs with some trailing redirected-to page, like /home.
            # We can safely use the subdomain as the root, to be future-proof
            # against redirects changing mid-session.

            # We should still follow any redirects, and not just trust the
            # directory to have the current active subdomain. As an example,
            # the directory lists randyforbes.house.gov, which redirects to
            # forbes.house.gov.
            resp = urllib.request.urlopen(url)
            url = resp.geturl()

            # kill everything after the domain
            url = re.sub(".gov/.*$", ".gov", url)

            if state == "AQ":
                state = "AS"
            full_district = "%s%02d" % (state, int(district))
            if full_district in by_district:
                print("[%s] %s" % (full_district, url))
                by_district[full_district]['terms'][-1]['url'] = url
            else:
                print("[%s] No current legislator" % full_district)

            count += 1

    print("Processed %i people rows on House listing." % count)

    print("Saving data...")
    save_data(current, "legislators-current.yaml")
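# A minimal offline sketch of the URL normalization step above (the sample URL
# is illustrative, not fetched): after following redirects, everything past the
# subdomain root is stripped so the stored URL survives mid-session redirect
# changes.
import re

redirected = "https://forbes.house.gov/home"
root = re.sub(r"\.gov/.*$", ".gov", redirected)
print(root)  # https://forbes.house.gov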
def resolvetw():
    """
    Does two batch lookups:

    1. All entries with `twitter_id`: Checks to see if the corresponding
       Twitter profile has the same screen_name as found in the entry's
       `twitter`. If not, the `twitter` value is updated.

    2. All entries with `twitter` (but not `twitter_id`): fetches the
       corresponding Twitter profile by screen_name and inserts ID. If no
       profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
        {
          "consumer_secret": "xyz",
          "access_token": "abc",
          "access_token_secret": "def",
          "consumer_key": "jk"
        }
    """
    import rtyaml
    from social.twitter import get_api, fetch_profiles
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
        updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/twitter_client_id', 'r')
    _c = json.load(client_id_file)
    api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret'])
    bioguide = utils.flags().get('bioguide', None)
    lookups = {'screen_names': [], 'ids': []}

    # store members that have `twitter` or `twitter_id` info
    for m in media:
        # we start with appending to updated_media so that we keep the same
        # order of entries as found in the loaded file
        updated_media.append(m)
        if bioguide and (m['id']['bioguide'] != bioguide):
            continue
        social = m['social']
        # now we add entries to either the `ids` or the `screen_names` list to batch lookup
        if 'twitter_id' in social:
            # add to the queue to be batched-looked-up
            lookups['ids'].append(m)
        elif 'twitter' in social:
            lookups['screen_names'].append(m)

    #######################################
    # perform Twitter batch lookup for ids:
    if lookups['screen_names']:
        arr = lookups['screen_names']
        print("Looking up Twitter ids for", len(arr), "names.")
        tw_names = [m['social']['twitter'] for m in arr]
        tw_profiles = fetch_profiles(api, screen_names=tw_names)
        for m in arr:
            social = m['social']
            # find profile that corresponds to a given screen_name
            twitter_handle = social['twitter']
            twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None)
            if twp:
                m['social']['twitter_id'] = int(twp['id'])
                print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle))
            else:
                # Remove errant Twitter entry for now
                print("No Twitter user profile for:", twitter_handle)
                m['social'].pop('twitter')
                print("\t ! removing Twitter handle:", twitter_handle)

    ##########################################
    # perform Twitter batch lookup for names by id, to update any renamings:
    if lookups['ids']:
        arr = lookups['ids']
        print("Looking up Twitter screen_names for", len(arr), "ids.")
        tw_ids = [m['social']['twitter_id'] for m in arr]
        tw_profiles = fetch_profiles(api, ids=tw_ids)
        any_renames_needed = False
        for m in arr:
            social = m['social']
            # find profile that corresponds to a given twitter_id
            t_id = social['twitter_id']
            t_name = social.get('twitter')
            twp = next((p for p in tw_profiles if int(p['id']) == t_id), None)
            if twp:
                # Be silent if there is no change to screen name
                if t_name and (twp['screen_name'].lower() == t_name.lower()):
                    pass
                else:
                    any_renames_needed = True
                    m['social']['twitter'] = twp['screen_name']
                    print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter']))
            else:
                # No entry found for this twitter id
                print("No Twitter user profile for %s, %s" % (t_id, t_name))
                m['social'].pop('twitter_id')
                print("\t ! removing Twitter id:", t_id)

        if not any_renames_needed:
            print("No renames needed")

    # all done with Twitter
    save_data(updated_media, "legislators-social-media.yaml")
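# A hedged example of preparing cache/twitter_client_id in the format the
# docstring above describes (the values are placeholders, not real
# credentials).
import json

credentials = {
    "consumer_key": "jk",
    "consumer_secret": "xyz",
    "access_token": "abc",
    "access_token_secret": "def",
}
with open("cache/twitter_client_id", "w") as f:
    json.dump(credentials, f)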
def resolveyt():
    # To avoid hitting quota limits, register for a YouTube 2.0 API key at
    # https://code.google.com/apis/youtube/dashboard
    # and put it below
    api_file = open('cache/youtube_api_key', 'r')
    api_key = api_file.read()

    bioguide = utils.flags().get('bioguide', None)

    updated_media = []
    for m in media:
        if bioguide and (m['id']['bioguide'] != bioguide):
            updated_media.append(m)
            continue

        social = m['social']

        if ('youtube' in social) or ('youtube_id' in social):
            if 'youtube' not in social:
                social['youtube'] = social['youtube_id']

            ytid = social['youtube']

            profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                           "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key))

            try:
                print("Resolving YT info for %s" % social['youtube'])
                ytreq = requests.get(profile_url)
                # print("\tFetched with status code %i..." % ytreq.status_code)

                if ytreq.status_code == 404:
                    # If the account name isn't valid, it's probably a redirect.
                    try:
                        # Try to scrape the real YouTube username
                        print("\tScraping YouTube username")
                        search_url = ("https://www.youtube.com/%s" % social['youtube'])
                        csearch = requests.get(search_url).text.encode('ascii', 'ignore').decode('ascii')

                        u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>', csearch)

                        if u:
                            print("\t%s maps to %s" % (social['youtube'], u.group(1)))
                            social['youtube'] = u.group(1)
                            profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                                           "?v=2&prettyprint=true&alt=json" % social['youtube'])

                            print("\tFetching GData profile...")
                            ytreq = requests.get(profile_url)
                            print("\tFetched GData profile")
                        else:
                            raise Exception("Couldn't figure out the username format for %s" % social['youtube'])
                    except:
                        print("\tCouldn't locate YouTube account")
                        raise

                ytobj = ytreq.json()
                social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
                print("\tResolved youtube_id to %s" % social['youtube_id'])

                # even though we have their channel ID, do they also have a username?
                if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']:
                    if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower():
                        # YT accounts are case-insensitive. Preserve capitalization if possible.
                        social['youtube'] = ytobj['entry']['yt$username']['$t']
                        print("\tAdded YouTube username of %s" % social['youtube'])
                else:
                    print("\tYouTube says they do not have a separate username")
                    del social['youtube']
            except:
                print("Unable to get YouTube Channel ID for: %s" % social['youtube'])

        updated_media.append(m)

    print("Saving social media...")
    save_data(updated_media, "legislators-social-media.yaml")
def main(): regexes = { "youtube": [ "https?://(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)", "https?://(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)" ], "facebook": [ "\\('facebook.com/([^']+)'\\)", "https?://(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)", "https?://(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)" ], "twitter": [ "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ] } email_enabled = utils.flags().get('email', False) debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) do_resolvefb = utils.flags().get('resolvefb', False) do_resolveyt = utils.flags().get('resolveyt', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache if do_resolvefb: service = "facebook" elif do_resolveyt: service = "youtube" else: service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook"]: print("--service must be one of twitter, youtube, or facebook") exit(0) # load in members, orient by bioguide ID print("Loading current legislators...") current = load_data("legislators-current.yaml") current_bioguide = { } for m in current: if "bioguide" in m["id"]: current_bioguide[m["id"]["bioguide"]] = m print("Loading blacklist...") blacklist = { 'twitter': [], 'facebook': [], 'youtube': [] } for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print("Loading whitelist...") whitelist = { 'twitter': [], 'facebook': [], 'youtube': [] } for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print("Loading social media...") media = load_data("legislators-social-media.yaml") media_bioguide = { } for m in media: media_bioguide[m["id"]["bioguide"]] = m def resolvefb(): # in order to preserve the comment block at the top of the file, # copy it over into a new RtYamlList instance. We do this because # Python list instances can't hold other random attributes. 
import rtyaml updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr(media, '__initial_comment_block') for m in media: social = m['social'] if ('facebook' in social and social['facebook']) and ('facebook_id' not in social): graph_url = "https://graph.facebook.com/%s" % social['facebook'] if re.match('\d+', social['facebook']): social['facebook_id'] = social['facebook'] print("Looking up graph username for %s" % social['facebook']) fbobj = requests.get(graph_url).json() if 'username' in fbobj: print("\tGot graph username of %s" % fbobj['username']) social['facebook'] = fbobj['username'] else: print("\tUnable to get graph username") else: try: print("Looking up graph ID for %s" % social['facebook']) fbobj = requests.get(graph_url).json() if 'id' in fbobj: print("\tGot graph ID of %s" % fbobj['id']) social['facebook_id'] = fbobj['id'] else: print("\tUnable to get graph ID") except: print("\tUnable to get graph ID for: %s" % social['facebook']) social['facebook_id'] = None updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key','r') api_key = api_file.read() bioguide = utils.flags().get('bioguide', None) updated_media = [] for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if ('youtube' in social) or ('youtube_id' in social): if 'youtube' not in social: social['youtube'] = social['youtube_id'] ytid = social['youtube'] profile_url = ("http://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: print("Resolving YT info for %s" % social['youtube']) ytreq = requests.get(profile_url) # print "\tFetched with status code %i..." % ytreq.status_code if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. try: # Try to scrape the real YouTube username print("\Scraping YouTube username") search_url = ("http://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode('ascii','ignore') u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch) if u: print("\t%s maps to %s" % (social['youtube'],u.group(1))) social['youtube'] = u.group(1) profile_url = ("http://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) print("\tFetching GData profile...") ytreq = requests.get(profile_url) print("\tFetched GData profile") else: raise Exception("Couldn't figure out the username format for %s" % social['youtube']) except: print("\tCouldn't locate YouTube account") raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] print("\tResolved youtube_id to %s" % social['youtube_id']) # even though we have their channel ID, do they also have a username? if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']: if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower(): # YT accounts are case-insensitive. Preserve capitalization if possible. 
social['youtube'] = ytobj['entry']['yt$username']['$t'] print("\tAdded YouTube username of %s" % social['youtube']) else: print("\tYouTube says they do not have a separate username") del social['youtube'] except: print("Unable to get YouTube Channel ID for: %s" % social['youtube']) updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = list(current_bioguide.keys()) for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \ (media_bioguide[bioguide]["social"].get(service + "_id", None) is None): to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"]) if len(to_check) > 0: rows_found = [] for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get("url", None) candidate_url = "https://%s.com/%s" % (service, candidate) row = [bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url] writer.writerow(row) print("\tWrote: %s" % candidate) rows_found.append(row) if email_enabled and len(rows_found) > 0: email_body = "Social media leads found:\n\n" for row in rows_found: email_body += ("%s\n" % row) utils.send_email(email_body) def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = list(media_bioguide.keys()) for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)) def update(): for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if bioguide in media_bioguide: media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = current_bioguide[bioguide]['id'].get("thomas", None) govtrack_id = current_bioguide[bioguide]['id'].get("govtrack", None) if thomas_id: new_media['id']['thomas'] = thomas_id if govtrack_id: new_media['id']['govtrack'] = govtrack_id new_media['social'][service] = candidate media.append(new_media) print("Saving social media...") save_data(media, "legislators-social-media.yaml") # if it's a youtube update, always do the resolve # if service == "youtube": # resolveyt() def clean(): print("Loading historical legislators...") historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if m["id"]["bioguide"] in media_bioguide: media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print("Removed %i out of office legislators from social media file..." 
% count) print("Saving historical legislators...") save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide): url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print("[%s] No official website, skipping" % bioguide) return None if debug: print("[%s] Downloading..." % bioguide) cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force, {'check_redirects': True}) all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print("\tBlacklisted: %s" % candidate) continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() elif do_resolvefb: resolvefb() elif do_resolveyt: resolveyt() else: sweep()
def run():
    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially...
        "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = {}
    for y in y1 + y2:
        bioguides[y["id"]["bioguide"]] = y

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.

        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue:
                    url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" % (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably

                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue:
                    break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
        if " campaign" in p: continue
        if " (surname)" in p: continue
        if "career of " in p: continue
        if "for Congress" in p: continue
        if p.startswith("List of "): continue
        if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p
        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"})

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores)
        }

        if "CongLinks" in page_content:
            # Parse the key/val pairs in the template.
            m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
            if not m: continue # no template?
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids: continue
            new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue # no template?
            bioguide = m.group(1).upper()

        if not bioguide in bioguides:
            print("Member not found: " + bioguide, p.encode("utf8"), "(Might have been a delegate to the Constitutional Convention.)")
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")
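# A quick offline check of the CongLinks parsing above, using a made-up
# template string (field names come from fieldmap; the values are placeholders,
# not real identifiers).
import re

sample = "{{CongLinks | congbio=A000000 | govtrack=123456 | cspan=99999}}"
m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", sample)
parsed = {}
for arg in m.group(1).split("|"):
    if "=" not in arg: continue
    key, val = arg.split("=", 1)
    parsed[key.strip()] = val.strip()
print(parsed)  # {'congbio': 'A000000', 'govtrack': '123456', 'cspan': '99999'}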
    for n in chunks:
        if isinstance(n, tree.Tree) and n.node == "MATCH":
            people = []
            relationship = None
            for piece in n:
                if piece.node == "RELATIONSHIP":
                    relationship = " ".join([x[0] for x in piece])
                elif piece.node == "NAMES":
                    for name in [x for x in piece if isinstance(x, tree.Tree)]:
                        people.append(" ".join([x[0] for x in name]))
            for person in people:
                relationships.append({"relation": relationship, "name": person})
    return relationships

debug = utils.flags().get('debug', False)

# default to caching
cache = utils.flags().get('cache', True)
force = not cache

# pick either current or historical
# order is important here, since current defaults to true
if utils.flags().get('historical', False):
    filename = "legislators-historical.yaml"
elif utils.flags().get('current', True):
    filename = "legislators-current.yaml"
else:
    print("No legislators selected.")
    exit(0)
def run(): committees_historical = load_data("committees-historical.yaml") # default to not caching flags = utils.flags() cache = flags.get('cache', False) if cache: from scrapelib.cache import FileCache scraper.cache_storage = FileCache('cache') scraper.cache_write_only = False else: raise # map thomas_id's to their dicts committees_historical_ref = {} for cx in committees_historical: committees_historical_ref[cx["thomas_id"]] = cx # pick the range of committees to get single_congress = flags.get('congress', False) if single_congress: start_congress = int(single_congress) end_congress = int(single_congress) + 1 else: start_congress = 113 end_congress = CURRENT_CONGRESS + 1 urls = { 'senate': 'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/s/BILLSTATUS-{congress}-s.zip', 'house': 'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/hr/BILLSTATUS-{congress}-hr.zip' } all_committees = {'house': {}, 'senate': {}} for congress in range(start_congress, end_congress): for chamber, bill_status_url in urls.items(): chamber_committees = all_committees[chamber] url = bill_status_url.format(congress=congress) response = scraper.get(url) with zipfile.ZipFile(io.BytesIO(response.content)) as z: for name in z.namelist(): if name.startswith('BILLSTATUS'): with z.open(name) as xml_file: bill_status = lxml.etree.parse(xml_file) committees = bill_status.xpath( '//billCommittees/item') for committee in committees: code = str( committee.xpath('./systemCode/text()')[0]) name = str(committee.xpath('./name/text()')[0]) if name.endswith(' Committee'): name = name[:-10] if code not in chamber_committees: chamber_committees[code] = { 'names': { congress: name }, 'subcommittees': {} } else: if congress not in chamber_committees[ code]: chamber_committees[code]['names'][ congress] = name subcommittees_d = chamber_committees[code][ 'subcommittees'] for subcommittee in committee.xpath( './subcommittees/item'): code = str( subcommittee.xpath( './systemCode/text()')[0]) name = str( subcommittee.xpath('./name/text()')[0]) if name.endswith(' Subcommittee'): name = name[:-13] if code not in subcommittees_d: subcommittees_d[code] = { congress: name } else: if congress not in subcommittees_d[ code]: subcommittees_d[code][ congress] = name import pprint pprint.pprint(chamber_committees) print(len(chamber_committees)) for chamber, committees in all_committees.items(): for code, committee in committees.items(): id = str(code).upper() id = id[:-2] if id in committees_historical_ref: # Update existing record. cx = committees_historical_ref[id] else: # Create a new record. 
cx = OrderedDict() committees_historical_ref[id] = cx cx['type'] = chamber.lower() if id[0] != "J": # Joint committees show their full name, otherwise they show a partial name cx['name'] = chamber + " Committee on " + name else: cx['name'] = committee['names'][min(committee['names'])] cx['thomas_id'] = id committees_historical.append(cx) for code, subcommittee in committee['subcommittees'].items(): for sx in cx.setdefault('subcommittees', []): if sx['thomas_id'] == code[-2:]: # found existing record break else: # 'break' not executed, so create a new record sx = OrderedDict() sx['name'] = subcommittee[min(subcommittee)] sx['thomas_id'] = code[-2:] cx['subcommittees'].append(sx) sx.setdefault('congresses', []) sx.setdefault('names', {}) for congress, name in subcommittee.items(): if congress not in sx['congresses']: sx['congresses'].append(congress) sx['names'][congress] = name cx.setdefault('congresses', []) cx.setdefault('names', {}) for congress, name in committee['names'].items(): if congress not in cx['congresses']: cx['congresses'].append(congress) cx['names'][congress] = name # TODO # after checking diff on first commit, we should re-sort #committees_historical.sort(key = lambda c : c["thomas_id"]) #for c in committees_historical: # c.get("subcommittees", []).sort(key = lambda s : s["thomas_id"]) save_data(committees_historical, "committees-historical.yaml")
def run(): committee_membership = {} committees_current = load_data("committees-current.yaml") memberships_current = load_data("committee-membership-current.yaml") # default to not caching cache = utils.flags().get('cache', False) force = not cache # map house/senate committee IDs to their dicts house_ref = {} for cx in committees_current: if "house_committee_id" in cx: house_ref[cx["house_committee_id"]] = cx senate_ref = {} for cx in committees_current: if "senate_committee_id" in cx: senate_ref[cx["senate_committee_id"]] = cx # map state/district to current representatives and state/lastname to current senators # since the House/Senate pages do not provide IDs for Members of Congress today = datetime.datetime.now().date() legislators_current = load_data("legislators-current.yaml") congressmen = {} senators = {} for moc in legislators_current: term = moc["terms"][-1] if today < parse_date(term["start"]) or today > parse_date( term["end"]): raise ValueError("Member's last listed term is not current: " + repr(moc) + " / " + term["start"]) if term["type"] == "rep": congressmen["%s%02d" % (term["state"], term["district"])] = moc elif term["type"] == "sen": for n in [moc["name"]] + moc.get("other_names", []): senators[(term["state"], n["last"])] = moc # Scrape clerk.house.gov... def scrape_house_alt(): for id, cx in list(house_ref.items()): scrape_house_committee(cx, cx["thomas_id"], id + "00") def scrape_house(): """The old way of scraping House committees was to start with the committee list at the URL below, but this page no longer has links to the committee info pages even though those pages exist. Preserving this function in case we need it later.""" url = "http://clerk.house.gov/committee_info/index.aspx" body = download(url, "committees/membership/house.html", force) for id, name in re.findall( r'<a href="/committee_info/index.aspx\?comcode=(..)00">(.*)</a>', body, re.I): if id not in house_ref: print("Unrecognized committee:", id, name) continue cx = house_ref[id] scrape_house_committee(cx, cx["thomas_id"], id + "00") def scrape_house_committee(cx, output_code, house_code): # load the House Clerk's committee membership page for the committee # (it is encoded in utf-8 even though the page indicates otherwise, and # while we don't really care, it helps our sanity check that compares # names) url = "http://clerk.house.gov/committee_info/index.aspx?%s=%s" % ( 'comcode' if house_code[-2:] == '00' else 'subcomcode', house_code) body = download(url, "committees/membership/house/%s.html" % house_code, force) dom = lxml.html.parse(io.StringIO(body)).getroot() # update official name metadata if house_code[-2:] == "00": cx["name"] = "House " + str( dom.cssselect("#com_display h3")[0].text_content()) else: cx["name"] = str( dom.cssselect("#subcom_title h4")[0].text_content()) # update address/phone metadata address_info = re.search( r"""Mailing Address:\s*(.*\S)\s*Telephone:\s*(\(202\) .*\S)""", dom.cssselect("#address")[0].text_content(), re.I | re.S) if not address_info: raise Exception("Failed to parse address info in %s." 
% house_code) cx["address"] = address_info.group(1) cx["address"] = re.sub(r"\s+", " ", cx["address"]) cx["address"] = re.sub( r"(.*\S)(Washington, DC \d+)\s*(-\d+)?", lambda m: m.group(1) + "; " + m.group(2) + (m.group(3) if m.group(3) else ""), cx["address"]) cx["phone"] = address_info.group(2) # get the ratio line to use in a sanity check later ratio = dom.cssselect("#ratio") if len(ratio): # some committees are missing ratio = re.search(r"Ratio (\d+)/(\d+)", ratio[0].text_content()) else: ratio = None # scan the membership, which is listed by party for i, party, nodename in ((1, 'majority', 'primary'), (2, 'minority', 'secondary')): ctr = 0 for rank, node in enumerate( dom.cssselect("#%s_group li" % nodename)): ctr += 1 lnk = node.cssselect('a') if len(lnk) == 0: if node.text_content() == "Vacancy": continue raise ValueError("Failed to parse a <li> node.") moc = lnk[0].get('href') m = re.search(r"statdis=([A-Z][A-Z]\d\d)", moc) if not m: raise ValueError("Failed to parse member link: " + moc) if not m.group(1) in congressmen: print("Vacancy discrepancy? " + m.group(1)) continue moc = congressmen[m.group(1)] # Sanity check that the name matches the name in our data. found_name = node.cssselect('a')[0].text_content() found_name = re.sub(r"\s+", " ", found_name) # fix whitespace found_name = found_name.replace("'", "’") # fix smart apos if moc['name'].get("official_full", None) is None: print("No official_full field for %s" % found_name) continue if found_name != moc['name']['official_full']: print( "Name mismatch: %s (in our file) vs %s (on the Clerk page)" % (moc['name']['official_full'], found_name)) entry = OrderedDict() entry["name"] = moc['name']['official_full'] entry["party"] = party entry["rank"] = rank + 1 if rank == 0: entry["title"] = "Chair" if entry[ "party"] == "majority" else "Ranking Member" # not explicit, frown entry.update(ids_from(moc["id"])) committee_membership.setdefault(output_code, []).append(entry) # the .tail attribute has the text to the right of the link m = re.match(r", [A-Z][A-Z](,\s*)?(.*\S)?", lnk[0].tail) if m.group(2): # Chairman, Vice Chair, etc. (all but Ex Officio) started appearing on subcommittees around Feb 2014. # For the chair, this should overwrite the implicit title given for the rank 0 majority party member. if m.group(2) in ("Chair", "Chairman", "Chairwoman"): entry["title"] = "Chair" elif m.group(2) in ("Vice Chair", "Vice Chairman"): entry["title"] = "Vice Chair" elif m.group(2) == "Ex Officio": entry["title"] = m.group(2) else: raise ValueError( "Unrecognized title information '%s' in %s." % (m.group(2), url)) # sanity check we got the right number of nodes if ratio and ctr != int(ratio.group(i)): raise ValueError( "Parsing didn't get the right count of members.") # scan for subcommittees for subcom in dom.cssselect("#subcom_list li a"): m = re.search("subcomcode=(..(\d\d))", subcom.get('href')) if not m: raise ValueError("Failed to parse subcommittee link.") for sx in cx['subcommittees']: if sx["thomas_id"] == m.group(2): break else: print("Subcommittee not found, creating it", output_code, m.group(1)) sx = OrderedDict() sx['name'] = "[not initialized]" # will be set inside of scrape_house_committee sx['thomas_id'] = m.group(2) cx['subcommittees'].append(sx) scrape_house_committee(sx, cx["thomas_id"] + sx["thomas_id"], m.group(1)) # Scrape senate.gov.... 
def scrape_senate(): url = "https://www.senate.gov/pagelayout/committees/b_three_sections_with_teasers/membership.htm" body = download(url, "committees/membership/senate.html", force) for id, name in re.findall( r'value="/general/committee_membership/committee_memberships_(....).htm">(.*?)</option>', body, re.I | re.S): if id not in senate_ref: print("Unrecognized committee:", id, name) continue cx = senate_ref[id] is_joint = (id[0] == "J") # Scrape some metadata on the HTML page first. committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.htm" % id print("[%s] Fetching members for %s (%s)" % (id, name, committee_url)) body2 = download(committee_url, "committees/membership/senate/%s.html" % id, force) if not body2: print("\tcommittee page not good:", committee_url) continue m = re.search( r'<span class="contenttext"><a href="(http://(.*?).senate.gov/)">', body2, re.I) if m: cx["url"] = m.group(1) # Use the XML for the rest. print("\tDownloading XML...") committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.xml" % id body3 = download(committee_url, "committees/membership/senate/%s.xml" % id, force) dom = lxml.etree.fromstring( body3.encode("utf8") ) # must be bytes to parse if there is an encoding declaration inside the string cx["name"] = dom.xpath("committees/committee_name")[0].text if id[0] != "J" and id[0:2] != 'SC': cx["name"] = "Senate " + cx["name"] majority_party = dom.xpath("committees/majority_party")[0].text # update full committee members committee_membership[id] = [] for member in dom.xpath("committees/members/member"): scrape_senate_member(committee_membership[id], member, majority_party, is_joint) # update subcommittees for subcom in dom.xpath("committees/subcommittee"): scid = subcom.xpath("committee_code")[0].text[4:] for sx in cx.get('subcommittees', []): if sx["thomas_id"] == scid: break else: print("Subcommittee not found, creating it", scid, name) sx = OrderedDict() sx['thomas_id'] = scid cx.setdefault('subcommittees', []).append(sx) # update metadata name = subcom.xpath("subcommittee_name")[0].text sx["name"] = name.strip() sx["name"] = re.sub(r"^\s*Subcommittee on\s*", "", sx["name"]) sx["name"] = re.sub(r"\s+", " ", sx["name"]) committee_membership[id + scid] = [] for member in subcom.xpath("members/member"): scrape_senate_member(committee_membership[id + scid], member, majority_party, is_joint) def scrape_senate_member(output_list, membernode, majority_party, is_joint): last_name = membernode.xpath("name/last")[0].text state = membernode.xpath("state")[0].text party = "majority" if membernode.xpath( "party")[0].text == majority_party else "minority" title = membernode.xpath("position")[0].text if title == "Member": title = None if title == "Ranking": title = "Ranking Member" # look up senator by state and last name if (state, last_name) not in senators: print("\t[%s] Unknown member: %s" % (state, last_name)) return None moc = senators[(state, last_name)] entry = OrderedDict() if 'official_full' in moc['name']: entry["name"] = moc['name']['official_full'] else: print("missing name->official_full field for", moc['id']['bioguide']) entry["party"] = party entry["rank"] = len([ e for e in output_list if e["party"] == entry["party"] ]) + 1 # how many have we seen so far in this party, +1 if title: entry["title"] = title entry.update(ids_from(moc["id"])) if is_joint: entry["chamber"] = "senate" output_list.append(entry) # sort by party, then by rank, since we get the nodes in the XML in a 
rough seniority order that ignores party # should be done once at the end, but cleaner to do it here output_list.sort(key=lambda e: (e["party"] != "majority", e["rank"])) # stick to a specific small set of official IDs to cross-link members # this limits the IDs from going out of control in this file, while # preserving flexibility for us to be inclusive of IDs in the main leg files def ids_from(moc): ids = OrderedDict() for id in ["thomas", "bioguide"]: if id in moc: ids[id] = moc[id] if len(ids) == 0: raise ValueError( "Missing an official ID for this legislator, won't be able to link back" ) return ids def restore_house_members_on_joint_committees(): # The House doesn't publish joint committee members, but we're manually gathering # that. Add them back into the output from whatever we have on disk. Put them after # Senate members. for c, mbrs in list(memberships_current.items()): if c[0] != "J": continue for m in mbrs: if m["chamber"] != "house": continue committee_membership[c].append(m) # MAIN scrape_house() scrape_senate() restore_house_members_on_joint_committees() save_data(committee_membership, "committee-membership-current.yaml") save_data(committees_current, "committees-current.yaml")
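# Two small standalone checks of the bookkeeping used above, on fabricated member dicts.
# First, scrape_senate_member's per-party rank: each member's rank is one more than the
# number already seen from the same party. Second, the sort key: False sorts before True,
# so majority members come first, and each party is ordered by rank.
output_list = []
for party in ["majority", "majority", "minority", "majority"]:
    rank = len([e for e in output_list if e["party"] == party]) + 1
    output_list.append({"party": party, "rank": rank})
print(output_list)
# -> majority ranks 1, 2, 3 and minority rank 1, in arrival order

members = [
    {"name": "C", "party": "minority", "rank": 1},
    {"name": "B", "party": "majority", "rank": 2},
    {"name": "A", "party": "majority", "rank": 1},
]
members.sort(key=lambda e: (e["party"] != "majority", e["rank"]))
print([m["name"] for m in members])  # -> ['A', 'B', 'C']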
def main(): regexes = { "youtube": [ "https?://(?:www\\.)?youtube.com/(channel/[^\\s\"/\\?#']+)", "https?://(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)" ], "facebook": [ "\\('facebook.com/([^']+)'\\)", "https?://(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)", "https?://(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)" ], "twitter": [ "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ] } debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) do_resolvefb = utils.flags().get('resolvefb', False) do_resolveyt = utils.flags().get('resolveyt', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache if do_resolvefb: service = "facebook" elif do_resolveyt: service = "youtube" else: service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook"]: print "--service must be one of twitter, youtube, or facebook" exit(0) # load in members, orient by bioguide ID print "Loading current legislators..." current = load_data("legislators-current.yaml") current_bioguide = { } for m in current: if m["id"].has_key("bioguide"): current_bioguide[m["id"]["bioguide"]] = m print "Loading blacklist..." blacklist = { 'twitter': [], 'facebook': [], 'youtube': [] } for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print "Loading whitelist..." whitelist = { 'twitter': [], 'facebook': [], 'youtube': [] } for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print "Loading social media..." media = load_data("legislators-social-media.yaml") media_bioguide = { } for m in media: media_bioguide[m["id"]["bioguide"]] = m def resolvefb(): updated_media = [] for m in media: social = m['social'] if 'facebook' in social and social['facebook']: graph_url = "https://graph.facebook.com/%s" % social['facebook'] if re.match('\d+', social['facebook']): social['facebook_id'] = social['facebook'] fbobj = requests.get(graph_url).json() if 'username' in fbobj: social['facebook'] = fbobj['username'] else: try: social['facebook_id'] = requests.get(graph_url).json()['id'] except: print "Unable to get graph ID for: %s" % social['facebook'] social['facebook_id'] = None updated_media.append(m) print "Saving social media..." 
save_data(updated_media, "legislators-social-media.yaml") def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key','r') api_key = api_file.read() updated_media = [] for m in media: social = m['social'] if 'youtube' in social and (social['youtube'] or social['youtube_id']): if not social['youtube']: social['youtube'] = social['youtube_id'] if re.match('^channel/',social['youtube']): ytid = social['youtube'][8:] else: ytid = social['youtube'] profile_url = ("http://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: ytreq = requests.get(profile_url) if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. try: # Try to scrape the real YouTube username search_url = ("http://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode('ascii','ignore') u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch) if u: print "%s maps to %s" % (social['youtube'],u.group(1)) social['youtube'] = u.group(1) profile_url = ("http://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) ytreq = requests.get(profile_url) else: raise Exception("Couldn't figure out the username format for %s" % social['youtube']) except: print "Search couldn't locate YouTube account for %s" % social['youtube'] raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']: if social['youtube'].lower() != ytobj['entry']['yt$username']['$t']: # YT accounts are case-insensitive. Preserve capitalization if possible. social['youtube'] = ytobj['entry']['yt$username']['$t'] else: del social['youtube'] except: print "Unable to get YouTube Channel ID for: %s" % social['youtube'] updated_media.append(m) print "Saving social media..." 
save_data(updated_media, "legislators-social-media.yaml") def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = current_bioguide.keys() for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif media_bioguide[bioguide]["social"].get(service, None) is None: to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"]) for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get("url", None) candidate_url = "https://%s.com/%s" % (service, candidate) writer.writerow([bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url]) print "\tWrote: %s" % candidate def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = media_bioguide.keys() for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print "[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate) def update(): for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if media_bioguide.has_key(bioguide): media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = current_bioguide[bioguide]['id'].get("thomas", None) if thomas_id: new_media['id']['thomas'] = thomas_id new_media['social'][service] = candidate media.append(new_media) print "Saving social media..." save_data(media, "legislators-social-media.yaml") def clean(): print "Loading historical legislators..." historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if media_bioguide.has_key(m["id"]["bioguide"]): media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print "Removed %i out of office legislators from social media file..." % count print "Saving historical legislators..." save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide): url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print "[%s] No official website, skipping" % bioguide return None if debug: print "[%s] Downloading..." % bioguide cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force) all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print "\tBlacklisted: %s" % candidate continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() elif do_resolvefb: resolvefb() elif do_resolveyt: resolveyt() else: sweep()
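# Standalone check of the twitter pattern from the regexes dict above, run against a
# fabricated snippet of an official-site HTML page; only the pattern comes from the script.
import re
twitter_pattern = r"https?://(?:www\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\s\"'/]+)"
html = '<a href="https://twitter.com/RepExample" target="_blank">Follow me on Twitter</a>'
print(re.findall(twitter_pattern, html, re.I))  # -> ['RepExample']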
# --bioguide: do *only* a single legislator import lxml.html, StringIO import datetime import re import utils from utils import download, load_data, save_data, parse_date def birthday_for(string): pattern = "born(.+?)((?:January|February|March|April|May|June|July|August|September|October|November|December),? \\d{1,2},? \\d{4})" match = re.search(pattern, string, re.I) if match: if len(re.findall(";", match.group(1))) <= 1: return match.group(2).strip() debug = utils.flags().get('debug', False) # default to caching cache = utils.flags().get('cache', True) force = not cache # pick either current or historical # order is important here, since current defaults to true if utils.flags().get('historical', False): filename = "legislators-historical.yaml" elif utils.flags().get('current', True): filename = "legislators-current.yaml" else: print "No legislators selected." exit(0)
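# Standalone illustration of the birthday_for pattern above against a fabricated
# bioguide-style sentence; the sample text is an assumption, the pattern is the one defined here.
import re
pattern = "born(.+?)((?:January|February|March|April|May|June|July|August|September|October|November|December),? \\d{1,2},? \\d{4})"
text = "SMITH, John, a Representative from Ohio; born in Columbus, Ohio, January 5, 1950; attended the public schools"
match = re.search(pattern, text, re.I)
if match and len(re.findall(";", match.group(1))) <= 1:
    print(match.group(2).strip())  # -> January 5, 1950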
def run(): today = datetime.now().date() # default to not caching cache = utils.flags().get('cache', False) force = not cache y = load_data("legislators-current.yaml") for moc in y: try: term = moc["terms"][-1] except IndexError: print("Member has no terms", moc) continue if term["type"] != "rep": continue if today < parse_date(term["start"]) or today > parse_date( term["end"]): print("Member's last listed term is not current", moc, term["start"]) continue # Specify districts e.g. WA-02 on the command line to only update those. # if len(sys.argv) > 1 and ("%s-%02d" % (term["state"], term["district"])) not in sys.argv: continue if "class" in term: del term["class"] url = "http://clerk.house.gov/member_info/mem_contact_info.aspx?statdis=%s%02d" % ( term["state"], term["district"]) cache = "legislators/house/%s%02d.html" % (term["state"], term["district"]) try: # the meta tag say it's iso-8859-1, but... names are actually in utf8... body = download(url, cache, force) dom = lxml.html.parse(io.StringIO(body)).getroot() except lxml.etree.XMLSyntaxError: print("Error parsing: ", url) continue name = str(dom.cssselect("#results h3")[0].text_content()) addressinfo = str(dom.cssselect("#results p")[0].text_content()) # Sanity check that the name is similar. if name != moc["name"].get("official_full", ""): cfname = moc["name"]["first"] + " " + moc["name"]["last"] print("Warning: Are these the same people?", name.encode("utf8"), "|", cfname.encode("utf8")) # Parse the address out of the address p tag. addressinfo = "; ".join(line.strip() for line in addressinfo.split("\n") if line.strip() != "") m = re.match( r"[\w\s]+-(\d+(st|nd|rd|th)|At Large|Delegate|Resident Commissioner), ([A-Za-z]*)(.+); Phone: (.*)", addressinfo, re.DOTALL) if not m: print("Error parsing address info: ", name.encode("utf8"), ":", addressinfo.encode("utf8")) continue address = m.group(4) phone = re.sub( "^\((\d\d\d)\) ", lambda m: m.group(1) + "-", m.group(5) ) # replace (XXX) area code with XXX- for compatibility w/ existing format office = address.split(";")[0].replace("HOB", "House Office Building") moc["name"]["official_full"] = name term["address"] = address term["office"] = office term["phone"] = phone save_data(y, "legislators-current.yaml")
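# Minimal check of the phone reformatting used in run() above: "(202) 225-0000" style
# numbers are rewritten as "202-225-0000" to stay compatible with the existing data format.
import re
phone = "(202) 225-0000"
print(re.sub(r"^\((\d\d\d)\) ", lambda m: m.group(1) + "-", phone))  # -> 202-225-0000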
# Scrape house.gov and senate.gov for current committee membership, # and updates the committees-current.yaml file with metadata including # name, url, address, and phone number. import re, lxml.html, lxml.etree, StringIO, datetime from collections import OrderedDict import utils from utils import download, load_data, save_data, parse_date, CURRENT_CONGRESS committee_membership = {} committees_current = load_data("committees-current.yaml") memberships_current = load_data("committee-membership-current.yaml") # default to not caching cache = utils.flags().get('cache', False) force = not cache # map house/senate committee IDs to their dicts house_ref = {} for cx in committees_current: if "house_committee_id" in cx: house_ref[cx["house_committee_id"]] = cx senate_ref = {} for cx in committees_current: if "senate_committee_id" in cx: senate_ref[cx["senate_committee_id"]] = cx # map state/district to current representatives and state/lastname to current senators # since the House/Senate pages do not provide IDs for Members of Congress today = datetime.datetime.now().date()
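# A hedged sketch of the lookup hinted at by the comment above: keying current senators by
# (state, last name) so the Senate committee XML, which carries no bioguide IDs, can be matched
# back to our records. The sample record is fabricated; the real script builds this from
# legislators-current.yaml.
current = [
    {"id": {"bioguide": "D000000"}, "name": {"last": "Doe"},
     "terms": [{"type": "sen", "state": "WA"}]},
]
senators = {}
for moc in current:
    term = moc["terms"][-1]
    if term["type"] == "sen":
        senators[(term["state"], moc["name"]["last"])] = moc
print(("WA", "Doe") in senators)  # -> True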
def run(options): cache = utils.flags().get('cache', False) force = not cache scrape(options)
def run(): # Field mapping. And which fields should be turned into integers. # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available. fieldmap = { "congbio": "bioguide", #"fec": "fec", # handled specially... "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors) "opensecrets": "opensecrets", "votesmart": "votesmart", "cspan": "cspan", } int_fields = ("govtrack", "votesmart", "cspan") # default to not caching cache = utils.flags().get('cache', False) # Load legislator files and map bioguide IDs. y1 = utils.load_data("legislators-current.yaml") y2 = utils.load_data("legislators-historical.yaml") bioguides = {} for y in y1 + y2: bioguides[y["id"]["bioguide"]] = y # Okay now the Wikipedia stuff... def get_matching_pages(): # Does a Wikipedia API search for pages containing either of the # two templates. Returns the pages. page_titles = set() for template in ("CongLinks", "CongBio"): eicontinue = "" while True: # construct query URL, using the "eicontinue" of the last query to get the next batch url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template if eicontinue: url += "&eicontinue=" + eicontinue # load the XML print("Getting %s pages (%d...)" % (template, len(page_titles))) dom = lxml.etree.fromstring(utils.download( url, None, True)) # can't cache eicontinue probably for pgname in dom.xpath("query/embeddedin/ei/@title"): page_titles.add(pgname) # get the next eicontinue value and loop eicontinue = dom.xpath( "string(query-continue/embeddedin/@eicontinue)") if not eicontinue: break return page_titles # Get the list of Wikipedia pages that use any of the templates we care about. page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles") if cache and os.path.exists(page_list_cache_file): # Load from cache. matching_pages = open(page_list_cache_file).read().split("\n") else: # Query Wikipedia API and save to cache. matching_pages = get_matching_pages() utils.write(("\n".join(matching_pages)), page_list_cache_file) # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon). matching_pages = [p for p in matching_pages if ":" not in p] # Load each page's content and parse the template. for p in sorted(matching_pages): if " campaign" in p: continue if " (surname)" in p: continue if "career of " in p: continue if "for Congress" in p: continue if p.startswith("List of "): continue if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue # Query the Wikipedia API to get the raw page content in XML, # and then use XPath to get the raw page text. url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote( p.encode("utf8")) + "&export&exportnowrap" cache_path = "legislators/wikipedia/pages/" + p dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache)) page_content = dom.xpath( "string(mw:page/mw:revision/mw:text)", namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"}) # Build a dict for the IDs that we want to insert into our files. new_ids = { "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores) } if "CongLinks" in page_content: # Parse the key/val pairs in the template. m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content) if not m: continue # no template? 
for arg in m.group(1).split("|"): if "=" not in arg: continue key, val = arg.split("=", 1) key = key.strip() val = val.strip() if val and key in fieldmap: try: if fieldmap[key] in int_fields: val = int(val) except ValueError: print("invalid value", key, val) continue if key == "opensecrets": val = val.replace("&newMem=Y", "").replace( "&newmem=Y", "").replace("&cycle=2004", "").upper() new_ids[fieldmap[key]] = val if "bioguide" not in new_ids: continue new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm bioguide = new_ids["bioguide"] else: m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content) if not m: continue # no template? bioguide = m.group(1).upper() if not bioguide in bioguides: print( "Member not found: " + bioguide, p, "(Might have been a delegate to the Constitutional Convention.)" ) continue # handle FEC ids specially because they are stored in an array... fec_id = new_ids.get("fec") if fec_id: del new_ids["fec"] member = bioguides[bioguide] member["id"].update(new_ids) # ...finish the FEC id. if fec_id: if fec_id not in bioguides[bioguide]["id"].get("fec", []): bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id) #print p.encode("utf8"), new_ids utils.save_data(y1, "legislators-current.yaml") utils.save_data(y2, "legislators-historical.yaml")
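# A standalone sketch of the CongLinks template parsing above, on a fabricated wikitext
# snippet; the field names and values are illustrative only.
import re
page_content = "{{CongLinks | congbio = S000033 | govtrack = 300009 | cspan = 1549 }}"
m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
ids = {}
if m:
    for arg in m.group(1).split("|"):
        if "=" not in arg:
            continue
        key, val = arg.split("=", 1)
        ids[key.strip()] = val.strip()
print(sorted(ids.items()))
# -> [('congbio', 'S000033'), ('cspan', '1549'), ('govtrack', '300009')]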
def run(): # default to not caching cache = utils.flags().get('cache', False) force = not cache states = [] current = load_data("legislators-current.yaml") by_district = { } for m in current: last_term = m['terms'][-1] if last_term['type'] != 'sen': state = last_term['state'] full_district = "%s%02d" % (state, int(last_term['district'])) by_district[full_district] = m if not state in states: # house lists AS (American Samoa) as AQ, awesome if state == "AS": state = "AQ" states.append(state) destination = "legislators/house.html" url = "http://house.gov/representatives/" body = utils.download(url, destination, force) if not body: print("Couldn't download House listing!") exit(0) try: dom = lxml.html.parse(io.StringIO(body)).getroot() except lxml.etree.XMLSyntaxError: print("Error parsing House listing!") exit(0) # process: # go through every state in our records, fetching that state's table # go through every row after the first, pick the district to isolate the member # pluck out the URL, update that member's last term's URL count = 0 for state in states: rows = dom.cssselect("h2#state_%s+table tr" % state.lower()) for row in rows: cells = row.cssselect("td") if not cells: continue district = str(cells[0].text_content()) if district == "At Large": district = 0 url = cells[1].cssselect("a")[0].get("href") # hit the URL to resolve any redirects to get the canonical URL, # since the listing on house.gov sometimes gives URLs that redirect. resp = urllib.request.urlopen(url) url = resp.geturl() # kill trailing slashes url = re.sub("/$", "", url) if state == "AQ": state = "AS" full_district = "%s%02d" % (state, int(district)) if full_district in by_district: by_district[full_district]['terms'][-1]['url'] = url else: print("[%s] No current legislator" % full_district) count += 1 print("Processed %i people rows on House listing." % count) print("Saving data...") save_data(current, "legislators-current.yaml")
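# Quick illustration of the state+district keys used above to match House table rows to
# members: districts are zero-padded to two digits and "At Large" becomes 0. The
# district_key helper name is hypothetical.
def district_key(state, district):
    if district == "At Large":
        district = 0
    return "%s%02d" % (state, int(district))

print(district_key("WA", "2"))         # -> WA02
print(district_key("MT", "At Large"))  # -> MT00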
def run(): def update_birthday(bioguide, person, main): birthday = birthday_for(main) if not birthday: print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main)) warnings.append(bioguide) return if birthday == "UNKNOWN": return try: birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y") except ValueError: print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main)) warnings.append(bioguide) return birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day) person.setdefault("bio", {})["birthday"] = birthday def birthday_for(string): # exceptions for not-nicely-placed semicolons string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April") string = string.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802") string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967") string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962") string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947") string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968") # look for a date pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})" match = re.search(pattern, string, re.I) if not match or not match.group(1): # specifically detect cases that we can't handle to avoid unnecessary warnings if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN" if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN" return None return match.group(1).strip() def relationships_of(string): # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio # e.g. 
"(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)" pattern = "^\((.*?)\)" match = re.search(pattern, string, re.I) relationships = [] if match and len(match.groups()) > 0: relationship_text = match.group(1).encode("ascii", "replace") # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar from nltk import tree, pos_tag, RegexpParser tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text) pos = pos_tag(tokens) grammar = r""" NAME: {<NNP>+} NAMES: { <IN><NAME>(?:<CC><NAME>)* } RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ } MATCH: { <RELATIONSHIP><NAMES> } """ cp = RegexpParser(grammar) chunks = cp.parse(pos) # iterate through the Relationship/Names pairs for n in chunks: if isinstance(n, tree.Tree) and n.node == "MATCH": people = [] relationship = None for piece in n: if piece.node == "RELATIONSHIP": relationship = " ".join([x[0] for x in piece]) elif piece.node == "NAMES": for name in [x for x in piece if isinstance(x, tree.Tree)]: people.append(" ".join([x[0] for x in name])) for person in people: relationships.append({ "relation": relationship, "name": person}) return relationships # default to caching cache = utils.flags().get('cache', True) force = not cache # pick either current or historical # order is important here, since current defaults to true if utils.flags().get('historical', False): filename = "legislators-historical.yaml" elif utils.flags().get('current', True): filename = "legislators-current.yaml" else: print("No legislators selected.") exit(0) print("Loading %s..." % filename) legislators = load_data(filename) # reoriented cache to access by bioguide ID by_bioguide = { } for m in legislators: if "bioguide" in m["id"]: by_bioguide[m["id"]["bioguide"]] = m # optionally focus on one legislator bioguide = utils.flags().get('bioguide', None) if bioguide: bioguides = [bioguide] else: bioguides = list(by_bioguide.keys()) warnings = [] missing = [] count = 0 families = 0 for bioguide in bioguides: # Download & parse the HTML of the bioguide page. try: dom = fetch_bioguide_page(bioguide, force) except Exception as e: print(e) missing.append(bioguide) continue # Extract the member's name and the biography paragraph (main). try: name = dom.cssselect("p font")[0] main = dom.cssselect("p")[0] except IndexError: print("[%s] Missing name or content!" % bioguide) exit(0) name = name.text_content().strip() main = main.text_content().strip().replace("\n", " ").replace("\r", " ") main = re.sub("\s+", " ", main) # Extract the member's birthday. update_birthday(bioguide, by_bioguide[bioguide], main) # Extract relationships with other Members of Congress. if utils.flags().get("relationships", False): #relationship information, if present, is in a parenthetical immediately after the name. #should always be present if we passed the IndexError catch above after_name = dom.cssselect("p font")[0].tail.strip() relationships = relationships_of(after_name) if len(relationships): families = families + 1 by_bioguide[bioguide]["family"] = relationships count = count + 1 print() if warnings: print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings))) if missing: print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing))) print("Saving data to %s..." 
% filename) save_data(legislators, filename) print("Saved %d legislators to %s" % (count, filename)) if utils.flags().get("relationships", False): print("Found family members for %d of those legislators" % families)
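# Standalone sketch of the date normalization in update_birthday above: the matched
# birthday string is parsed and re-emitted as YYYY-MM-DD.
import datetime
birthday = "January 5, 1962"
parsed = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y")
print("%04d-%02d-%02d" % (parsed.year, parsed.month, parsed.day))  # -> 1962-01-05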
#!/usr/bin/env python # Uses http://house.gov/representatives/ to scrape official member websites. # Only known source. # Assumptions: # member's state and district fields are present and accurate. # member's most recent term in the terms field is their current one. import lxml.html, StringIO, urllib2 import re import utils from utils import download, load_data, save_data, parse_date # default to not caching cache = utils.flags().get("cache", False) force = not cache states = [] current = load_data("legislators-current.yaml") by_district = {} for m in current: last_term = m["terms"][-1] if last_term["type"] != "sen": state = last_term["state"] full_district = "%s%02d" % (state, int(last_term["district"])) by_district[full_district] = m if not state in states:
def run(): CONGRESS_ID = "113th Congress (2013-2014)" # the query string parameter # constants state_names = { "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY" } # default to not caching cache = utils.flags().get('cache', False) force = not cache # load in current members y = load_data("legislators-current.yaml") by_district = {} existing_senator_ids = set() for m in y: last_term = m['terms'][-1] if last_term['type'] == 'rep': full_district = "%s%02d" % (last_term['state'], int(last_term['district'])) by_district[full_district] = m elif last_term['type'] == 'sen': if "thomas" in m["id"]: existing_senator_ids.add(m["id"]["thomas"]) seen_ids = set() for chamber in ("House of Representatives", "Senate"): url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % ( urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber)) cache = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber) try: body = download(url, cache, force) dom = lxml.html.parse(io.StringIO(body)).getroot() except lxml.etree.XMLSyntaxError: print("Error parsing: ", url) continue for node in dom.xpath("//ul[@class='results_list']/li"): thomas_id = "%05d" % int( re.search("/member/.*/(\d+)$", node.xpath('h2/a')[0].get('href')).group(1)) # THOMAS misassigned these 'new' IDs to existing individuals. if thomas_id in ('02139', '02132'): continue name = node.xpath('h2/a')[0].text state = node.xpath( 'div[@class="memberProfile"]/table/tbody/tr[1]/td' )[0].text.strip() state = state_names[state] if chamber == "House of Representatives": # There's enough information to easily pick out which Member this refers to, so write it # directly to the file. district = node.xpath( 'div[@class="memberProfile"]/table/tbody/tr[2]/td' )[0].text.strip() if district == "At Large": district = 0 district = "%02d" % int(district) if state + district not in by_district: print( state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data." ) continue if state + district in seen_ids: print("Congress.gov lists two people for %s%s!" % (state, district)) seen_ids.add(state + district) by_district[state + district]["id"]["thomas"] = thomas_id elif chamber == "Senate": # For senators we'd have to match on name or something else, so that's too difficult. # Just look for new IDs. if thomas_id not in existing_senator_ids: print("Please manually set", thomas_id, "for", name, "from", state) save_data(y, "legislators-current.yaml")
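# A minimal check of how the THOMAS id is derived above from a congress.gov member link:
# the trailing numeric id is zero-padded to five digits. The sample href is fabricated.
import re
href = "/member/jane-doe/1234"
thomas_id = "%05d" % int(re.search(r"/member/.*/(\d+)$", href).group(1))
print(thomas_id)  # -> 01234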
# --cache: load from cache if present on disk (default: true) # --bioguide: load only one legislator, by his/her bioguide ID # --congress: do *only* updates for legislators serving in a specific congress import datetime import re import utils import urllib2 import requests from utils import download, load_data, save_data, parse_date, states, congress_from_legislative_year, legislative_year import json import string import csv import unicodedata debug = utils.flags().get("debug", False) # default to caching cache = utils.flags().get("cache", True) force = not cache only_bioguide = utils.flags().get("bioguide", None) congress = utils.flags().get("congress", None) filename_historical = "legislators-historical.yaml" filename_current = "legislators-current.yaml" data_files = [] print "Loading %s..." % filename_current
def run(): CONGRESS_ID = "113th Congress (2013-2014)" # the query string parameter # constants state_names = {"Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY"} # default to not caching cache = utils.flags().get('cache', False) force = not cache # load in current members y = load_data("legislators-current.yaml") by_district = { } existing_senator_ids = set() for m in y: last_term = m['terms'][-1] if last_term['type'] == 'rep': full_district = "%s%02d" % (last_term['state'], int(last_term['district'])) by_district[full_district] = m elif last_term['type'] == 'sen': if "thomas" in m["id"]: existing_senator_ids.add(m["id"]["thomas"]) seen_ids = set() for chamber in ("House of Representatives", "Senate"): url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % ( urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber)) cache = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber) try: body = download(url, cache, force) dom = lxml.html.parse(io.StringIO(body)).getroot() except lxml.etree.XMLSyntaxError: print("Error parsing: ", url) continue for node in dom.xpath("//ul[@class='results_list']/li"): thomas_id = "%05d" % int(re.search("/member/.*/(\d+)$", node.xpath('h2/a')[0].get('href')).group(1)) # THOMAS misassigned these 'new' IDs to existing individuals. if thomas_id in ('02139', '02132'): continue name = node.xpath('h2/a')[0].text state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip() state = state_names[state] if chamber == "House of Representatives": # There's enough information to easily pick out which Member this refers to, so write it # directly to the file. district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip() if district == "At Large": district = 0 district = "%02d" % int(district) if state + district not in by_district: print(state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data.") continue if state + district in seen_ids: print("Congress.gov lists two people for %s%s!" % (state, district)) seen_ids.add(state+district) by_district[state + district]["id"]["thomas"] = thomas_id elif chamber == "Senate": # For senators we'd have to match on name or something else, so that's too difficult. # Just look for new IDs. if thomas_id not in existing_senator_ids: print("Please manually set", thomas_id, "for", name, "from", state) save_data(y, "legislators-current.yaml")
# Parse the THOMAS advanced search page for a list of all committees # and subcommittees from the 93rd Congress forward and store them in # the committees-historical.yaml file. It will include current committees # as well. import re, itertools from collections import OrderedDict import utils from utils import download, load_data, save_data, CURRENT_CONGRESS committees_historical = load_data("committees-historical.yaml") # default to not caching flags = utils.flags() cache = flags.get('cache', False) force = not cache # map thomas_id's to their dicts committees_historical_ref = { } for cx in committees_historical: committees_historical_ref[cx["thomas_id"]] = cx # pick the range of committees to get single_congress = flags.get('congress', False) if single_congress: start_congress = int(single_congress) end_congress = int(single_congress) + 1 else:
# and updates the committees-current.yaml file with metadata including # name, url, address, and phone number. import re, lxml.html, lxml.etree, StringIO, datetime from collections import OrderedDict import utils from utils import download, load_data, save_data, parse_date, CURRENT_CONGRESS committee_membership = { } committees_current = load_data("committees-current.yaml") memberships_current = load_data("committee-membership-current.yaml") # default to not caching cache = utils.flags().get('cache', False) force = not cache # map house/senate committee IDs to their dicts house_ref = { } for cx in committees_current: if "house_committee_id" in cx: house_ref[cx["house_committee_id"]] = cx senate_ref = { } for cx in committees_current: if "senate_committee_id" in cx: senate_ref[cx["senate_committee_id"]] = cx # map state/district to current representatives and state/lastname to current senators
def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key', 'r') api_key = api_file.read() bioguide = utils.flags().get('bioguide', None) updated_media = [] for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if ('youtube' in social) or ('youtube_id' in social): if 'youtube' not in social: social['youtube'] = social['youtube_id'] ytid = social['youtube'] profile_url = ("https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: print("Resolving YT info for %s" % social['youtube']) ytreq = requests.get(profile_url) # print "\tFetched with status code %i..." % ytreq.status_code if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. try: # Try to scrape the real YouTube username print("\Scraping YouTube username") search_url = ("https://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode( 'ascii', 'ignore') u = re.search( r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>', csearch) if u: print("\t%s maps to %s" % (social['youtube'], u.group(1))) social['youtube'] = u.group(1) profile_url = ( "https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) print("\tFetching GData profile...") ytreq = requests.get(profile_url) print("\tFetched GData profile") else: raise Exception( "Couldn't figure out the username format for %s" % social['youtube']) except: print("\tCouldn't locate YouTube account") raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] print("\tResolved youtube_id to %s" % social['youtube_id']) # even though we have their channel ID, do they also have a username? if ytobj['entry']['yt$username']['$t'] != ytobj['entry'][ 'yt$userId']['$t']: if social['youtube'].lower( ) != ytobj['entry']['yt$username']['$t'].lower(): # YT accounts are case-insensitive. Preserve capitalization if possible. social['youtube'] = ytobj['entry']['yt$username'][ '$t'] print("\tAdded YouTube username of %s" % social['youtube']) else: print( "\tYouTube says they do not have a separate username" ) del social['youtube'] except: print("Unable to get YouTube Channel ID for: %s" % social['youtube']) updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml")
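# Standalone check of the fallback regex in resolveyt above, which recovers the real
# YouTube username from the page a vanity URL redirects to; the HTML snippet is fabricated.
import re
html = '<a class="channel-link" href="https://www.youtube.com/user/RepExample">'
u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>', html)
print(u.group(1) if u else None)  # -> RepExample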
def main(): regexes = { "youtube": [ "https?://(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)", "https?://(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)" ], "facebook": [ "\\('facebook.com/([^']+)'\\)", "https?://(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)", "https?://(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)" ], "twitter": [ "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ] } email_enabled = utils.flags().get('email', False) debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) do_resolvefb = utils.flags().get('resolvefb', False) do_resolveyt = utils.flags().get('resolveyt', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache if do_resolvefb: service = "facebook" elif do_resolveyt: service = "youtube" else: service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook"]: print("--service must be one of twitter, youtube, or facebook") exit(0) # load in members, orient by bioguide ID print("Loading current legislators...") current = load_data("legislators-current.yaml") current_bioguide = {} for m in current: if "bioguide" in m["id"]: current_bioguide[m["id"]["bioguide"]] = m print("Loading blacklist...") blacklist = {'twitter': [], 'facebook': [], 'youtube': []} for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print("Loading whitelist...") whitelist = {'twitter': [], 'facebook': [], 'youtube': []} for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print("Loading social media...") media = load_data("legislators-social-media.yaml") media_bioguide = {} for m in media: media_bioguide[m["id"]["bioguide"]] = m def resolvefb(): # in order to preserve the comment block at the top of the file, # copy it over into a new RtYamlList instance. We do this because # Python list instances can't hold other random attributes. 
import rtyaml updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr( media, '__initial_comment_block') for m in media: social = m['social'] if ('facebook' in social and social['facebook']) and ('facebook_id' not in social): graph_url = "https://graph.facebook.com/%s" % social['facebook'] if re.match('\d+', social['facebook']): social['facebook_id'] = social['facebook'] print("Looking up graph username for %s" % social['facebook']) fbobj = requests.get(graph_url).json() if 'username' in fbobj: print("\tGot graph username of %s" % fbobj['username']) social['facebook'] = fbobj['username'] else: print("\tUnable to get graph username") else: try: print("Looking up graph ID for %s" % social['facebook']) fbobj = requests.get(graph_url).json() if 'id' in fbobj: print("\tGot graph ID of %s" % fbobj['id']) social['facebook_id'] = fbobj['id'] else: print("\tUnable to get graph ID") except: print("\tUnable to get graph ID for: %s" % social['facebook']) social['facebook_id'] = None updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key', 'r') api_key = api_file.read() bioguide = utils.flags().get('bioguide', None) updated_media = [] for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if ('youtube' in social) or ('youtube_id' in social): if 'youtube' not in social: social['youtube'] = social['youtube_id'] ytid = social['youtube'] profile_url = ("http://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: print("Resolving YT info for %s" % social['youtube']) ytreq = requests.get(profile_url) # print "\tFetched with status code %i..." % ytreq.status_code if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. try: # Try to scrape the real YouTube username print("\Scraping YouTube username") search_url = ("http://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode( 'ascii', 'ignore') u = re.search( r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>', csearch) if u: print("\t%s maps to %s" % (social['youtube'], u.group(1))) social['youtube'] = u.group(1) profile_url = ( "http://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) print("\tFetching GData profile...") ytreq = requests.get(profile_url) print("\tFetched GData profile") else: raise Exception( "Couldn't figure out the username format for %s" % social['youtube']) except: print("\tCouldn't locate YouTube account") raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] print("\tResolved youtube_id to %s" % social['youtube_id']) # even though we have their channel ID, do they also have a username? if ytobj['entry']['yt$username']['$t'] != ytobj['entry'][ 'yt$userId']['$t']: if social['youtube'].lower( ) != ytobj['entry']['yt$username']['$t'].lower(): # YT accounts are case-insensitive. Preserve capitalization if possible. 
social['youtube'] = ytobj['entry']['yt$username'][ '$t'] print("\tAdded YouTube username of %s" % social['youtube']) else: print( "\tYouTube says they do not have a separate username" ) del social['youtube'] except: print("Unable to get YouTube Channel ID for: %s" % social['youtube']) updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = list(current_bioguide.keys()) for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \ (media_bioguide[bioguide]["social"].get(service + "_id", None) is None): to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer( open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow([ "bioguide", "official_full", "website", "service", "candidate", "candidate_url" ]) if len(to_check) > 0: rows_found = [] for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get( "url", None) candidate_url = "https://%s.com/%s" % (service, candidate) row = [ bioguide, current_bioguide[bioguide]['name'] ['official_full'].encode('utf-8'), url, service, candidate, candidate_url ] writer.writerow(row) print("\tWrote: %s" % candidate) rows_found.append(row) if email_enabled and len(rows_found) > 0: email_body = "Social media leads found:\n\n" for row in rows_found: email_body += ("%s\n" % row) utils.send_email(email_body) def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = list(media_bioguide.keys()) for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)) def update(): for rec in csv.DictReader( open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if bioguide in media_bioguide: media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = current_bioguide[bioguide]['id'].get( "thomas", None) govtrack_id = current_bioguide[bioguide]['id'].get( "govtrack", None) if thomas_id: new_media['id']['thomas'] = thomas_id if govtrack_id: new_media['id']['govtrack'] = govtrack_id new_media['social'][service] = candidate media.append(new_media) print("Saving social media...") save_data(media, "legislators-social-media.yaml") # if it's a youtube update, always do the resolve # if service == "youtube": # resolveyt() def clean(): print("Loading historical legislators...") historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if m["id"]["bioguide"] in media_bioguide: media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print( "Removed %i out of office legislators from social media file..." 
% count) print("Saving historical legislators...") save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide): url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print("[%s] No official website, skipping" % bioguide) return None if debug: print("[%s] Downloading..." % bioguide) cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force, {'check_redirects': True}) all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print("\tBlacklisted: %s" % candidate) continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() elif do_resolvefb: resolvefb() elif do_resolveyt: resolveyt() else: sweep()
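# A small standalone sketch of the blacklist filtering inside candidate_for above: the first
# match that does not hit any blacklist pattern wins. The sample matches and patterns are made up.
import re
blacklist_patterns = ["^sharethis$", "^intent$"]
all_matches = ["ShareThis", "RepExample"]
candidate = None
for possible in all_matches:
    if any(re.search(p, possible, re.I) for p in blacklist_patterns):
        continue  # blacklisted, keep looking
    candidate = possible
    break
print(candidate)  # -> RepExample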
def main(): regexes = { "youtube": [ "(?:https?:)?//(?:www\\.)?youtube.com/embed/?\?(list=[^\\s\"/\\?#&']+)", "(?:https?:)?//(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)", "(?:https?:)?//(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)" ], "facebook": [ "\\('facebook.com/([^']+)'\\)", "(?:https?:)?//(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)", "(?:https?:)?//(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)" ], "twitter": [ "(?:https?:)?//(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ], "instagram": [ "instagram.com/(\w{3,})" ] } email_enabled = utils.flags().get('email', False) debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) do_resolveyt = utils.flags().get('resolveyt', False) do_resolveig = utils.flags().get('resolveig', False) do_resolvetw = utils.flags().get('resolvetw', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache if do_resolveyt: service = "youtube" elif do_resolveig: service = "instagram" elif do_resolvetw: service = "twitter" else: service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook", "instagram"]: print("--service must be one of twitter, youtube, facebook, or instagram") exit(0) # load in members, orient by bioguide ID print("Loading current legislators...") current = load_data("legislators-current.yaml") current_bioguide = { } for m in current: if "bioguide" in m["id"]: current_bioguide[m["id"]["bioguide"]] = m print("Loading blacklist...") blacklist = { 'twitter': [], 'facebook': [], 'youtube': [], 'instagram': [] } for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print("Loading whitelist...") whitelist = { 'twitter': [], 'facebook': [], 'youtube': [] } for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print("Loading social media...") media = load_data("legislators-social-media.yaml") media_bioguide = { } for m in media: media_bioguide[m["id"]["bioguide"]] = m def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key','r') api_key = api_file.read() bioguide = utils.flags().get('bioguide', None) updated_media = [] for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if ('youtube' in social) or ('youtube_id' in social): if 'youtube' not in social: social['youtube'] = social['youtube_id'] ytid = social['youtube'] profile_url = ("https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: print("Resolving YT info for %s" % social['youtube']) ytreq = requests.get(profile_url) # print "\tFetched with status code %i..." % ytreq.status_code if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. 
try: # Try to scrape the real YouTube username print("\Scraping YouTube username") search_url = ("https://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode('ascii','ignore') u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch) if u: print("\t%s maps to %s" % (social['youtube'],u.group(1))) social['youtube'] = u.group(1) profile_url = ("https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) print("\tFetching GData profile...") ytreq = requests.get(profile_url) print("\tFetched GData profile") else: raise Exception("Couldn't figure out the username format for %s" % social['youtube']) except: print("\tCouldn't locate YouTube account") raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] print("\tResolved youtube_id to %s" % social['youtube_id']) # even though we have their channel ID, do they also have a username? if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']: if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower(): # YT accounts are case-insensitive. Preserve capitalization if possible. social['youtube'] = ytobj['entry']['yt$username']['$t'] print("\tAdded YouTube username of %s" % social['youtube']) else: print("\tYouTube says they do not have a separate username") del social['youtube'] except: print("Unable to get YouTube Channel ID for: %s" % social['youtube']) updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def resolveig(): # in order to preserve the comment block at the top of the file, # copy it over into a new RtYamlList instance. We do this because # Python list instances can't hold other random attributes. import rtyaml updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr(media, '__initial_comment_block') client_id_file = open('cache/instagram_client_id','r') client_id = client_id_file.read() bioguide = utils.flags().get('bioguide', None) for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if 'instagram' not in social and 'instagram_id' not in social: updated_media.append(m) continue instagram_handle = social['instagram'] query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(query=instagram_handle,client_id=client_id) instagram_user_search = requests.get(query_url).json() for user in instagram_user_search['data']: time.sleep(0.5) if user['username'] == instagram_handle: m['social']['instagram_id'] = int(user['id']) print("matched instagram_id {instagram_id} to {instagram_handle}".format(instagram_id=social['instagram_id'],instagram_handle=instagram_handle)) updated_media.append(m) save_data(updated_media, "legislators-social-media.yaml") def resolvetw(): """ Does two batch lookups: 1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name as found in the entry's `twitter`. If not, the `twitter` value is updated. 2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and inserts ID. If no profile is found, the `twitter` value is deleted. 
Note: cache/twitter_client_id must be a formatted JSON dict: { "consumer_secret": "xyz", "access_token": "abc", "access_token_secret": "def", "consumer_key": "jk" } """ import rtyaml from social.twitter import get_api, fetch_profiles updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr(media, '__initial_comment_block') client_id_file = open('cache/twitter_client_id', 'r') _c = json.load(client_id_file) api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret']) bioguide = utils.flags().get('bioguide', None) lookups = {'screen_names': [], 'ids': []} # store members that have `twitter` or `twitter_id` info for m in media: # we start with appending to updated_media so that we keep the same order of entries # as found in the loaded file updated_media.append(m) if bioguide and (m['id']['bioguide'] != bioguide): continue social = m['social'] # now we add entries to either the `ids` or the `screen_names` list to batch lookup if 'twitter_id' in social: # add to the queue to be batched-looked-up lookups['ids'].append(m) # append elif 'twitter' in social: lookups['screen_names'].append(m) ####################################### # perform Twitter batch lookup for ids: if lookups['screen_names']: arr = lookups['screen_names'] print("Looking up Twitter ids for", len(arr), "names.") tw_names = [m['social']['twitter'] for m in arr] tw_profiles = fetch_profiles(api, screen_names = tw_names) for m in arr: social = m['social'] # find profile that corresponds to a given screen_name twitter_handle = social['twitter'] twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None) if twp: m['social']['twitter_id'] = int(twp['id']) print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle)) else: # Remove errant Twitter entry for now print("No Twitter user profile for:", twitter_handle) m['social'].pop('twitter') print("\t ! removing Twitter handle:", twitter_handle) ########################################## # perform Twitter batch lookup for names by id, to update any renamings: if lookups['ids']: arr = lookups['ids'] print("Looking up Twitter screen_names for", len(arr), "ids.") tw_ids = [m['social']['twitter_id'] for m in arr] tw_profiles = fetch_profiles(api, ids = tw_ids) any_renames_needed = False for m in arr: social = m['social'] # find profile that corresponds to a given screen_name t_id = social['twitter_id'] t_name = social.get('twitter') twp = next((p for p in tw_profiles if int(p['id']) == t_id), None) if twp: # Be silent if there is no change to screen name if t_name and (twp['screen_name'].lower() == t_name.lower()): pass else: any_renames_needed = True m['social']['twitter'] = twp['screen_name'] print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter'])) else: # No entry found for this twitter id print("No Twitter user profile for %s, %s" % (t_id, t_name)) m['social'].pop('twitter_id') print("\t ! 
removing Twitter id:", t_id) if not any_renames_needed: print("No renames needed") # all done with Twitter save_data(updated_media, "legislators-social-media.yaml") def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = list(current_bioguide.keys()) for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \ (media_bioguide[bioguide]["social"].get(service + "_id", None) is None): to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"]) if len(to_check) > 0: rows_found = [] for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get("url", None) candidate_url = "https://%s.com/%s" % (service, candidate) row = [bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url] writer.writerow(row) print("\tWrote: %s" % candidate) rows_found.append(row) if email_enabled and len(rows_found) > 0: email_body = "Social media leads found:\n\n" for row in rows_found: email_body += ("%s\n" % row) utils.send_email(email_body) def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = list(media_bioguide.keys()) for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide, current) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)) def update(): for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if bioguide in media_bioguide: media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = current_bioguide[bioguide]['id'].get("thomas", None) govtrack_id = current_bioguide[bioguide]['id'].get("govtrack", None) if thomas_id: new_media['id']['thomas'] = thomas_id if govtrack_id: new_media['id']['govtrack'] = govtrack_id new_media['social'][service] = candidate media.append(new_media) print("Saving social media...") save_data(media, "legislators-social-media.yaml") # if it's a youtube update, always do the resolve # if service == "youtube": # resolveyt() def clean(): print("Loading historical legislators...") historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if m["id"]["bioguide"] in media_bioguide: media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print("Removed %i out of office legislators from social media file..." % count) print("Saving historical legislators...") save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide, current = None): """find the most likely candidate account from the URL. 
If current is passed, the candidate will match it if found otherwise, the first candidate match is returned """ url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print("[%s] No official website, skipping" % bioguide) return None if debug: print("[%s] Downloading..." % bioguide) cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force, {'check_redirects': True}) if not body: return None all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if not current == None and current in all_matches: return current if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print("\tBlacklisted: %s" % candidate) continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() elif do_resolveyt: resolveyt() elif do_resolveig: resolveig() elif do_resolvetw: resolvetw() else: sweep()
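# --- Illustrative sketch (not part of the original script) ---
# A minimal, self-contained version of the candidate_for() matching logic above:
# scan downloaded page HTML with per-service regexes, skip anything matching a
# blacklist pattern, and prefer the currently recorded handle when it is still on
# the page. The regex and blacklist values below are examples, not the project's
# data files.
import re

def find_handle(body, patterns, blacklist, current=None):
    matches = []
    for pattern in patterns:
        matches.extend(re.findall(pattern, body, re.I))
    if current is not None and current in matches:
        return current                      # keep the known handle if still present
    for candidate in matches:
        if any(re.search(b, candidate, re.I) for b in blacklist):
            continue                        # skip blacklisted patterns
        return candidate
    return None

if __name__ == "__main__":
    html = '<a href="https://twitter.com/ExampleRep">Follow me</a>'
    print(find_handle(html,
                      [r"https?://(?:www\.)?twitter\.com/@?([^\s\"'/]+)"],
                      [r"^intent$", r"^share$"]))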
def run(): today = datetime.now().date() # default to not caching cache = utils.flags().get('cache', False) force = not cache y = load_data("legislators-current.yaml") for moc in y: try: term = moc["terms"][-1] except IndexError: print("Member has no terms", moc) continue if term["type"] != "rep": continue if today < parse_date(term["start"]) or today > parse_date(term["end"]): print("Member's last listed term is not current", moc, term["start"]) continue # Specify districts e.g. WA-02 on the command line to only update those. # if len(sys.argv) > 1 and ("%s-%02d" % (term["state"], term["district"])) not in sys.argv: continue if "class" in term: del term["class"] url = "http://clerk.house.gov/member_info/mem_contact_info.aspx?statdis=%s%02d" % (term["state"], term["district"]) cache = "legislators/house/%s%02d.html" % (term["state"], term["district"]) try: # the meta tag say it's iso-8859-1, but... names are actually in utf8... body = download(url, cache, force) dom = lxml.html.parse(io.StringIO(body)).getroot() except lxml.etree.XMLSyntaxError: print("Error parsing: ", url) continue name = str(dom.cssselect("#results h3")[0].text_content()) addressinfo = str(dom.cssselect("#results p")[0].text_content()) # Sanity check that the name is similar. if name != moc["name"].get("official_full", ""): cfname = moc["name"]["first"] + " " + moc["name"]["last"] print("Warning: Are these the same people?", name.encode("utf8"), "|", cfname.encode("utf8")) # Parse the address out of the address p tag. addressinfo = "; ".join(line.strip() for line in addressinfo.split("\n") if line.strip() != "") m = re.match(r"[\w\s]+-(\d+(st|nd|rd|th)|At Large|Delegate|Resident Commissioner), ([A-Za-z]*)(.+); Phone: (.*)", addressinfo, re.DOTALL) if not m: print("Error parsing address info: ", name.encode("utf8"), ":", addressinfo.encode("utf8")) continue address = m.group(4) phone = re.sub("^\((\d\d\d)\) ", lambda m : m.group(1) + "-", m.group(5)) # replace (XXX) area code with XXX- for compatibility w/ existing format office = address.split(";")[0].replace("HOB", "House Office Building") moc["name"]["official_full"] = name term["address"] = address term["office"] = office term["phone"] = phone save_data(y, "legislators-current.yaml")
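# --- Illustrative sketch (not part of the original script) ---
# The run() above normalizes the Clerk's "(202) 225-0000" style phone numbers to
# "202-225-0000" for compatibility with the existing data format, and takes the
# office from the first semicolon-separated address segment. A standalone version
# of those two steps, with made-up sample input:
import re

def normalize_phone(phone):
    # replace a leading "(XXX) " area code with "XXX-"
    return re.sub(r"^\((\d{3})\) ", lambda m: m.group(1) + "-", phone)

def office_from_address(address):
    # first address segment, with the HOB abbreviation expanded
    return address.split(";")[0].replace("HOB", "House Office Building")

print(normalize_phone("(202) 225-3121"))                              # 202-225-3121
print(office_from_address("1234 Longworth HOB; Washington DC 20515"))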
def resolvetw(): """ Does two batch lookups: 1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name as found in the entry's `twitter`. If not, the `twitter` value is updated. 2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and inserts ID. If no profile is found, the `twitter` value is deleted. Note: cache/twitter_client_id must be a formatted JSON dict: { "consumer_secret": "xyz", "access_token": "abc", "access_token_secret": "def", "consumer_key": "jk" } """ import rtyaml from social.twitter import get_api, fetch_profiles updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr( media, '__initial_comment_block') client_id_file = open('cache/twitter_client_id', 'r') _c = json.load(client_id_file) api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret']) bioguide = utils.flags().get('bioguide', None) lookups = { 'screen_names': [], 'ids': [] } # store members that have `twitter` or `twitter_id` info for m in media: # we start with appending to updated_media so that we keep the same order of entries # as found in the loaded file updated_media.append(m) if bioguide and (m['id']['bioguide'] != bioguide): continue social = m['social'] # now we add entries to either the `ids` or the `screen_names` list to batch lookup if 'twitter_id' in social: # add to the queue to be batched-looked-up lookups['ids'].append(m) # append elif 'twitter' in social: lookups['screen_names'].append(m) ####################################### # perform Twitter batch lookup for ids: if lookups['screen_names']: arr = lookups['screen_names'] print("Looking up Twitter ids for", len(arr), "names.") tw_names = [m['social']['twitter'] for m in arr] tw_profiles = fetch_profiles(api, screen_names=tw_names) for m in arr: social = m['social'] # find profile that corresponds to a given screen_name twitter_handle = social['twitter'] twp = next( (p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None) if twp: m['social']['twitter_id'] = int(twp['id']) print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle)) else: # Remove errant Twitter entry for now print("No Twitter user profile for:", twitter_handle) m['social'].pop('twitter') print("\t ! removing Twitter handle:", twitter_handle) ########################################## # perform Twitter batch lookup for names by id, to update any renamings: if lookups['ids']: arr = lookups['ids'] print("Looking up Twitter screen_names for", len(arr), "ids.") tw_ids = [m['social']['twitter_id'] for m in arr] tw_profiles = fetch_profiles(api, ids=tw_ids) any_renames_needed = False for m in arr: social = m['social'] # find profile that corresponds to a given screen_name t_id = social['twitter_id'] t_name = social.get('twitter') twp = next((p for p in tw_profiles if int(p['id']) == t_id), None) if twp: # Be silent if there is no change to screen name if t_name and (twp['screen_name'].lower() == t_name.lower()): pass else: any_renames_needed = True m['social']['twitter'] = twp['screen_name'] print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter'])) else: # No entry found for this twitter id print("No Twitter user profile for %s, %s" % (t_id, t_name)) m['social'].pop('twitter_id') print("\t ! 
removing Twitter id:", t_id) if not any_renames_needed: print("No renames needed") # all done with Twitter save_data(updated_media, "legislators-social-media.yaml")
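# --- Illustrative sketch (not part of the original scripts) ---
# resolvetw() above matches fetched Twitter profiles back to YAML entries either
# by case-insensitive screen_name (to fill in twitter_id) or by numeric id (to
# pick up renames). A small standalone version of just that matching step, using
# hypothetical profile dicts in place of the fetch_profiles() results:
def match_by_name(profiles, handle):
    return next((p for p in profiles
                 if p["screen_name"].lower() == handle.lower()), None)

def match_by_id(profiles, twitter_id):
    return next((p for p in profiles if int(p["id"]) == twitter_id), None)

profiles = [{"id": "123456", "screen_name": "SenExample"}]
print(match_by_name(profiles, "senexample"))   # matched despite case difference
print(match_by_id(profiles, 123456))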
def run(): options = utils.flags() options['urllib'] = True # disable scrapelib for this debug = options.get('debug', False) # default to NOT caching cache = options.get('cache', False) force = not cache only_bioguide = options.get('bioguide', None) # pick either current or historical # order is important here, since current defaults to true if utils.flags().get('historical', False): filename = "legislators-historical.yaml" elif utils.flags().get('current', True): filename = "legislators-current.yaml" else: print("No legislators selected.") exit(0) print("Loading %s..." % filename) legislators = load_data(filename) api_file = open('cache/sunlight_api_key.txt','r') api_key = api_file.read() for m in legislators: # this can't run unless we've already collected a bioguide for this person bioguide = m["id"].get("bioguide", None) if not bioguide: continue # if we've limited this to just one bioguide, skip over everyone else if only_bioguide and (bioguide != only_bioguide): continue url_BG = "http://transparencydata.com/api/1.0/entities/id_lookup.json?bioguide_id=" url_BG += bioguide url_BG += "&apikey="+api_key destination = "legislators/influence_explorer/lookups/%s.json" % bioguide if debug: print("[%s] Looking up ID..." % bioguide) body = utils.download(url_BG, destination, force, options) if not body: print("[%s] Bad request, skipping" % bioguide) continue jsondata = json.loads(body) if (jsondata != []): IE_ID = jsondata[0]['id'] url_CRP = "http://transparencydata.com/api/1.0/entities/" url_CRP += IE_ID url_CRP += ".json?apikey=" + api_key destination = "legislators/influence_explorer/entities/%s.json" % IE_ID body = utils.download(url_CRP, destination, force, options) jsondata = json.loads(body) opensecrets_id = None fec_ids = [] for external in jsondata['external_ids']: if external["namespace"].startswith("urn:crp"): opensecrets_id = external['id'] elif external["namespace"].startswith("urn:fec"): fec_ids.append(external['id']) if opensecrets_id: m["id"]["opensecrets"] = opensecrets_id # preserve existing FEC IDs, but don't duplicate them if len(fec_ids) > 0: if m["id"].get("fec", None) is None: m["id"]["fec"] = [] for fec_id in fec_ids: if fec_id not in m["id"]["fec"]: m["id"]["fec"].append(fec_id) print("[%s] Added opensecrets ID of %s" % (bioguide, opensecrets_id)) else: print("[%s] NO DATA" % bioguide) print("Saving data to %s..." % filename) save_data(legislators, filename)
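# --- Illustrative sketch (not part of the original script) ---
# The run() above pulls the `opensecrets` id and any FEC ids out of Influence
# Explorer's external_ids list, and merges new FEC ids without duplicating ones
# already on the member. A standalone version of that merge, with a made-up
# API response in place of the downloaded JSON:
def merge_external_ids(member_ids, external_ids):
    fec_ids = []
    for ext in external_ids:
        if ext["namespace"].startswith("urn:crp"):
            member_ids["opensecrets"] = ext["id"]
        elif ext["namespace"].startswith("urn:fec"):
            fec_ids.append(ext["id"])
    if fec_ids:
        member_ids.setdefault("fec", [])
        for fec_id in fec_ids:
            if fec_id not in member_ids["fec"]:
                member_ids["fec"].append(fec_id)
    return member_ids

sample = [{"namespace": "urn:crp:recipient", "id": "N00000001"},
          {"namespace": "urn:fec:candidate", "id": "H0XX00000"}]
print(merge_external_ids({"bioguide": "X000000", "fec": ["H0XX00000"]}, sample))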
def run(): today = datetime.now().date() # default to not caching cache = utils.flags().get('cache', False) force = not cache y = load_data("legislators-current.yaml") # Map bioguide IDs to dicts. Reference the same dicts # in y so we are updating y when we update biogiude. bioguide = {} by_name = {} for m in y: if "bioguide" in m["id"]: bioguide[m["id"]["bioguide"]] = m party = m["terms"][-1]["party"][0] state = m["terms"][-1]["state"] last_name = m["name"]["last"] member_full = "%s (%s-%s)" % (last_name, party, state) by_name[member_full] = m print("Fetching general Senate information from senators_cfm.xml...") url = "http://www.senate.gov/general/contact_information/senators_cfm.xml" body = download(url, "legislators/senate.xml", force) dom = lxml.etree.parse( io.BytesIO(body.encode("utf8")) ) # file has an <?xml declaration and so must be parsed as a bytes array for node in dom.xpath("member"): bioguide_id = str(node.xpath("string(bioguide_id)")).strip() member_full = node.xpath("string(member_full)") if bioguide_id == "": print("Someone has an empty bioguide ID!") print(lxml.etree.tostring(node)) continue print("[%s] Processing Senator %s..." % (bioguide_id, member_full)) # find member record in our YAML, either by bioguide_id or member_full if bioguide_id in bioguide: member = bioguide[bioguide_id] else: if member_full in by_name: member = by_name[member_full] else: print("Bioguide ID '%s' and full name '%s' not recognized." % (bioguide_id, member_full)) exit(0) try: term = member["terms"][-1] except IndexError: print("Member has no terms", bioguide_id, member_full) continue if today < parse_date(term["start"]) or today > parse_date( term["end"]): print("Member's last listed term is not current", bioguide_id, member_full, term["start"]) continue if term["type"] != "sen": print("Member's last listed term is not a Senate term", bioguide_id, member_full) continue if term["state"] != str(node.xpath("string(state)")): print("Member's last listed term has the wrong state", bioguide_id, member_full) continue if "district" in term: del term["district"] full_name = str(node.xpath("string(first_name)")) suffix = None if ", " in full_name: full_name, suffix = full_name.split(", ") full_name += " " + str(node.xpath("string(last_name)")) if suffix: full_name += ", " + suffix member["name"]["official_full"] = full_name member["id"]["bioguide"] = bioguide_id term["class"] = { "Class I": 1, "Class II": 2, "Class III": 3 }[node.xpath("string(class)")] term["party"] = { "D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent" }[node.xpath("string(party)")] url = str(node.xpath("string(website)")).strip() if not url.startswith("/"): # temporary home pages for new senators are relative links? # hit the URL to resolve any redirects to get the canonical URL, # since the listing on house.gov sometimes gives URLs that redirect. 
try: resp = urllib.request.urlopen(url) url = resp.geturl() except Exception as e: print(url, e) # kill trailing slash url = re.sub("/$", "", url) term["url"] = url term["address"] = str(node.xpath("string(address)")).strip().replace( "\n ", " ") term["office"] = string.capwords( term["address"].upper().split(" WASHINGTON ")[0]) phone = str(node.xpath("string(phone)")).strip() term["phone"] = phone.replace("(", "").replace(")", "").replace(" ", "-") #contact_form = str(node.xpath("string(email)")).strip().replace(".Senate.gov", ".senate.gov") #if contact_form: # can be blank # term["contact_form"] = contact_form print( "\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...") url = "http://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml" body = download(url, "legislators/senate_cvc.xml", force) dom = lxml.etree.parse(io.StringIO(body)) for node in dom.getroot(): if node.tag == "lastUpdate": date, time = node.getchildren() print("Last updated: %s, %s" % (date.text, time.text)) continue bioguide_id = str(node.xpath("string(bioguideId)")).strip() if bioguide_id == "": print("Someone has an empty bioguide ID!") print(lxml.etree.tostring(node)) continue last_name = node.xpath("string(name/last)") party = node.xpath("string(party)") state = node.xpath("string(state)") member_full = "%s (%s-%s)" % (last_name, party, state) print("[%s] Processing Senator %s..." % (bioguide_id, member_full)) # find member record in our YAML, either by bioguide_id or member_full if bioguide_id in bioguide: member = bioguide[bioguide_id] else: if member_full in by_name: member = by_name[member_full] else: print( "Bioguide ID '%s' and synthesized official name '%s' not recognized." % (bioguide_id, member_full)) exit(0) try: term = member["terms"][-1] except IndexError: print("Member has no terms", bioguide_id, member_full) continue if "id" not in member: member["id"] = {} member["id"]["lis"] = node.attrib["lis_member_id"] state_rank = node.xpath("string(stateRank)") if state_rank == '1': term["state_rank"] = "senior" elif state_rank == '2': term["state_rank"] = "junior" print("Saving data...") save_data(y, "legislators-current.yaml")
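# --- Illustrative sketch (not part of the original script) ---
# run() above reads fields such as class and party out of senators_cfm.xml with
# lxml and maps the Senate's labels onto the values stored in the YAML. A minimal
# standalone version of that mapping, on a tiny made-up XML fragment:
import lxml.etree

SAMPLE = b"""<contact_information>
  <member>
    <bioguide_id>X000000</bioguide_id>
    <class>Class II</class>
    <party>ID</party>
  </member>
</contact_information>"""

CLASSES = {"Class I": 1, "Class II": 2, "Class III": 3}
PARTIES = {"D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent"}

dom = lxml.etree.fromstring(SAMPLE)
for node in dom.xpath("member"):
    print(str(node.xpath("string(bioguide_id)")).strip(),
          CLASSES[node.xpath("string(class)")],
          PARTIES[node.xpath("string(party)")])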
def main(): regexes = { "youtube": [ "https?://(?:www\\.)?youtube.com/(?:user/)?([^\\s\"']+)" ], "facebook": [ "https?://(?:www\\.)?facebook.com/(?:home\\.php#!)?(?:#!)?(?:people/)?/?([^\\s\"']+)" ], "twitter": [ "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ] } debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook"]: print "--service must be one of twitter, youtube, or facebook" exit(0) # load in members, orient by bioguide ID print "Loading current legislators..." current = load_data("legislators-current.yaml") current_bioguide = { } for m in current: if m["id"].has_key("bioguide"): current_bioguide[m["id"]["bioguide"]] = m print "Loading blacklist..." blacklist = { 'twitter': [], 'facebook': [], 'services': [] } for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print "Loading whitelist..." whitelist = { 'twitter': [], 'facebook': [], 'services': [] } for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print "Loading social media..." media = load_data("legislators-social-media.yaml") media_bioguide = { } for m in media: media_bioguide[m["id"]["bioguide"]] = m def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = current_bioguide.keys() for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif media_bioguide[bioguide]["social"].get(service, None) is None: to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow(["bioguide", "official_full", "website", "service", "candidate"]) for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get("url", None) writer.writerow([bioguide, current_bioguide[bioguide]['name']['official_full'], url, service, candidate]) print "\tWrote: %s" % candidate def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = media_bioguide.keys() for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print "[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate) def update(): for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if media_bioguide.has_key(bioguide): media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = 
current_bioguide[bioguide]['id'].get("thomas", None) if thomas_id: new_media['id']['thomas'] = thomas_id new_media['social'][service] = candidate media.append(new_media) print "Saving social media..." save_data(media, "legislators-social-media.yaml") def clean(): print "Loading historical legislators..." historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if media_bioguide.has_key(m["id"]["bioguide"]): media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print "Removed %i out of office legislators from social media file..." % count print "Saving historical legislators..." save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide): url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print "[%s] No official website, skipping" % bioguide return None if debug: print "[%s] Downloading..." % bioguide cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force) all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print "\tBlacklisted: %s" % candidate continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() else: sweep()
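# --- Illustrative sketch (not part of the original script) ---
# update() above reads the candidates CSV back and either fills in the service
# handle on an existing social-media entry or creates a new skeleton entry keyed
# by bioguide. A standalone version of that merge step, with made-up rows in
# place of the CSV file:
def apply_candidates(rows, media_by_bioguide, media, service):
    for rec in rows:
        bioguide, candidate = rec["bioguide"], rec["candidate"]
        if bioguide in media_by_bioguide:
            media_by_bioguide[bioguide]["social"][service] = candidate
        else:
            entry = {"id": {"bioguide": bioguide}, "social": {service: candidate}}
            media.append(entry)
            media_by_bioguide[bioguide] = entry

media, by_id = [], {}
apply_candidates([{"bioguide": "X000000", "candidate": "RepExample"}],
                 by_id, media, "twitter")
print(media)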
def run(): committee_membership = { } committees_current = load_data("committees-current.yaml") memberships_current = load_data("committee-membership-current.yaml") # default to not caching cache = utils.flags().get('cache', False) force = not cache # map house/senate committee IDs to their dicts house_ref = { } for cx in committees_current: if "house_committee_id" in cx: house_ref[cx["house_committee_id"]] = cx senate_ref = { } for cx in committees_current: if "senate_committee_id" in cx: senate_ref[cx["senate_committee_id"]] = cx # map state/district to current representatives and state/lastname to current senators # since the House/Senate pages do not provide IDs for Members of Congress today = datetime.datetime.now().date() legislators_current = load_data("legislators-current.yaml") congressmen = { } senators = { } for moc in legislators_current: term = moc["terms"][-1] if today < parse_date(term["start"]) or today > parse_date(term["end"]): raise ValueError("Member's last listed term is not current: " + repr(moc) + " / " + term["start"]) if term["type"] == "rep": congressmen["%s%02d" % (term["state"], term["district"])] = moc elif term["type"] == "sen": for n in [moc["name"]] + moc.get("other_names", []): senators[(term["state"], n["last"])] = moc # Scrape clerk.house.gov... def scrape_house_alt(): for id, cx in list(house_ref.items()): scrape_house_committee(cx, cx["thomas_id"], id + "00") def scrape_house(): """The old way of scraping House committees was to start with the committee list at the URL below, but this page no longer has links to the committee info pages even though those pages exist. Preserving this function in case we need it later.""" url = "http://clerk.house.gov/committee_info/index.aspx" body = download(url, "committees/membership/house.html", force) for id, name in re.findall(r'<a href="/committee_info/index.aspx\?comcode=(..)00">(.*)</a>', body, re.I): if id not in house_ref: print("Unrecognized committee:", id, name) continue cx = house_ref[id] scrape_house_committee(cx, cx["thomas_id"], id + "00") def scrape_house_committee(cx, output_code, house_code): # load the House Clerk's committee membership page for the committee # (it is encoded in utf-8 even though the page indicates otherwise, and # while we don't really care, it helps our sanity check that compares # names) url = "http://clerk.house.gov/committee_info/index.aspx?%s=%s" % ('comcode' if house_code[-2:] == '00' else 'subcomcode', house_code) body = download(url, "committees/membership/house/%s.html" % house_code, force) dom = lxml.html.parse(io.StringIO(body)).getroot() # update official name metadata if house_code[-2:] == "00": cx["name"] = "House " + str(dom.cssselect("#com_display h3")[0].text_content()) else: cx["name"] = str(dom.cssselect("#subcom_title h4")[0].text_content()) # update address/phone metadata address_info = re.search(r"""Mailing Address:\s*(.*\S)\s*Telephone:\s*(\(202\) .*\S)""", dom.cssselect("#address")[0].text_content(), re.I | re.S) if not address_info: raise Exception("Failed to parse address info in %s." 
% house_code) cx["address"] = address_info.group(1) cx["address"] = re.sub(r"\s+", " ", cx["address"]) cx["address"] = re.sub(r"(.*\S)(Washington, DC \d+)\s*(-\d+)?", lambda m : m.group(1) + "; " + m.group(2) + (m.group(3) if m.group(3) else ""), cx["address"]) cx["phone"] = address_info.group(2) # get the ratio line to use in a sanity check later ratio = dom.cssselect("#ratio") if len(ratio): # some committees are missing ratio = re.search(r"Ratio (\d+)/(\d+)", ratio[0].text_content()) else: ratio = None # scan the membership, which is listed by party for i, party, nodename in ((1, 'majority', 'primary'), (2, 'minority', 'secondary')): ctr = 0 for rank, node in enumerate(dom.cssselect("#%s_group li" % nodename)): ctr += 1 lnk = node.cssselect('a') if len(lnk) == 0: if node.text_content() == "Vacancy": continue raise ValueError("Failed to parse a <li> node.") moc = lnk[0].get('href') m = re.search(r"statdis=([A-Z][A-Z]\d\d)", moc) if not m: raise ValueError("Failed to parse member link: " + moc) if not m.group(1) in congressmen: print("Vacancy discrepancy? " + m.group(1)) continue moc = congressmen[m.group(1)] found_name = node.cssselect('a')[0].text_content().replace(", ", "") if moc['name'].get("official_full", None) is None: print("No official_full field for %s" % found_name) continue if found_name != moc['name']['official_full']: print("Name mismatch: %s (in our file) vs %s (on the Clerk page)" % (moc['name']['official_full'], node.cssselect('a')[0].text_content())) entry = OrderedDict() entry["name"] = moc['name']['official_full'] entry["party"] = party entry["rank"] = rank+1 if rank == 0: entry["title"] = "Chair" if entry["party"] == "majority" else "Ranking Member" # not explicit, frown entry.update(ids_from(moc["id"])) committee_membership.setdefault(output_code, []).append(entry) # the .tail attribute has the text to the right of the link m = re.match(r", [A-Z][A-Z](,\s*)?(.*\S)?", lnk[0].tail) if m.group(2): # Chairman, Vice Chair, etc. (all but Ex Officio) started appearing on subcommittees around Feb 2014. # For the chair, this should overwrite the implicit title given for the rank 0 majority party member. if m.group(2) in ("Chair", "Chairman", "Chairwoman"): entry["title"] = "Chair" elif m.group(2) in ("Vice Chair", "Vice Chairman"): entry["title"] = "Vice Chair" elif m.group(2) == "Ex Officio": entry["title"] = m.group(2) else: raise ValueError("Unrecognized title information '%s' in %s." % (m.group(2), url)) # sanity check we got the right number of nodes if ratio and ctr != int(ratio.group(i)): raise ValueError("Parsing didn't get the right count of members.") # scan for subcommittees for subcom in dom.cssselect("#subcom_list li a"): m = re.search("subcomcode=(..(\d\d))", subcom.get('href')) if not m: raise ValueError("Failed to parse subcommittee link.") for sx in cx['subcommittees']: if sx["thomas_id"] == m.group(2): break else: print("Subcommittee not found, creating it", output_code, m.group(1)) sx = OrderedDict() sx['name'] = "[not initialized]" # will be set inside of scrape_house_committee sx['thomas_id'] = m.group(2) cx['subcommittees'].append(sx) scrape_house_committee(sx, cx["thomas_id"] + sx["thomas_id"], m.group(1)) # Scrape senate.gov.... 
def scrape_senate(): url = "https://www.senate.gov/pagelayout/committees/b_three_sections_with_teasers/membership.htm" body = download(url, "committees/membership/senate.html", force) for id, name in re.findall(r'value="/general/committee_membership/committee_memberships_(....).htm">(.*?)</option>', body, re.I | re.S): if id not in senate_ref: print("Unrecognized committee:", id, name) continue cx = senate_ref[id] is_joint = (id[0] == "J") # Scrape some metadata on the HTML page first. committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.htm" % id print("[%s] Fetching members for %s (%s)" % (id, name, committee_url)) body2 = download(committee_url, "committees/membership/senate/%s.html" % id, force) if not body2: print("\tcommittee page not good:", committee_url) continue m = re.search(r'<span class="contenttext"><a href="(http://(.*?).senate.gov/)">', body2, re.I) if m: cx["url"] = m.group(1) # Use the XML for the rest. print("\tDownloading XML...") committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.xml" % id body3 = download(committee_url, "committees/membership/senate/%s.xml" % id, force) dom = lxml.etree.fromstring(body3.encode("utf8")) # must be bytes to parse if there is an encoding declaration inside the string cx["name"] = dom.xpath("committees/committee_name")[0].text if id[0] != "J" and id[0:2] != 'SC': cx["name"] = "Senate " + cx["name"] majority_party = dom.xpath("committees/majority_party")[0].text # update full committee members committee_membership[id] = [] for member in dom.xpath("committees/members/member"): scrape_senate_member(committee_membership[id], member, majority_party, is_joint) # update subcommittees for subcom in dom.xpath("committees/subcommittee"): scid = subcom.xpath("committee_code")[0].text[4:] for sx in cx.get('subcommittees', []): if sx["thomas_id"] == scid: break else: print("Subcommittee not found, creating it", scid, name) sx = OrderedDict() sx['thomas_id'] = scid cx.setdefault('subcommittees', []).append(sx) # update metadata name = subcom.xpath("subcommittee_name")[0].text sx["name"] = name.strip() sx["name"] = re.sub(r"^\s*Subcommittee on\s*", "", sx["name"]) sx["name"] = re.sub(r"\s+", " ", sx["name"]) committee_membership[id + scid] = [] for member in subcom.xpath("members/member"): scrape_senate_member(committee_membership[id + scid], member, majority_party, is_joint) def scrape_senate_member(output_list, membernode, majority_party, is_joint): last_name = membernode.xpath("name/last")[0].text state = membernode.xpath("state")[0].text party = "majority" if membernode.xpath("party")[0].text == majority_party else "minority" title = membernode.xpath("position")[0].text if title == "Member": title = None if title == "Ranking": title = "Ranking Member" # look up senator by state and last name if (state, last_name) not in senators: print("\t[%s] Unknown member: %s" % (state, last_name)) return None moc = senators[(state, last_name)] entry = OrderedDict() if 'official_full' in moc['name']: entry["name"] = moc['name']['official_full'] else: print("missing name->official_full field for", moc['id']['bioguide']) entry["party"] = party entry["rank"] = len([e for e in output_list if e["party"] == entry["party"]]) + 1 # how many have we seen so far in this party, +1 if title: entry["title"] = title entry.update(ids_from(moc["id"])) if is_joint: entry["chamber"] = "senate" output_list.append(entry) # sort by party, then by rank, since we get the nodes in the XML in a rough 
seniority order that ignores party # should be done once at the end, but cleaner to do it here output_list.sort(key = lambda e : (e["party"] != "majority", e["rank"])) # stick to a specific small set of official IDs to cross-link members # this limits the IDs from going out of control in this file, while # preserving us flexibility to be inclusive of IDs in the main leg files def ids_from(moc): ids = {} for id in ["bioguide", "thomas"]: if id in moc: ids[id] = moc[id] if len(ids) == 0: raise ValueError("Missing an official ID for this legislator, won't be able to link back") return ids def restore_house_members_on_joint_committees(): # The House doesn't publish joint committee members, but we're manaually gathering # that. Add them back into the output from whatever we have on disk. Put them after # Senate members. for c, mbrs in list(memberships_current.items()): if c[0] != "J": continue for m in mbrs: if m["chamber"] != "house": continue committee_membership[c].append(m) # MAIN scrape_house() scrape_senate() restore_house_members_on_joint_committees() save_data(committee_membership, "committee-membership-current.yaml") save_data(committees_current, "committees-current.yaml")
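# --- Illustrative sketch (not part of the original script) ---
# scrape_senate_member() above sorts each committee roster so that majority-party
# members come first and each party block is ordered by rank. A standalone
# version of that sort key on made-up entries:
members = [
    {"name": "C", "party": "minority", "rank": 1},
    {"name": "B", "party": "majority", "rank": 2},
    {"name": "A", "party": "majority", "rank": 1},
]
members.sort(key=lambda e: (e["party"] != "majority", e["rank"]))
print([m["name"] for m in members])   # ['A', 'B', 'C']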
#!/usr/bin/env python

# Use the House's member labels file to update some basic info, including
# bioguide IDs, for members. Assumes state and district are already present.

import csv, re

import utils
from utils import download, load_data, save_data, parse_date

house_labels = "labels-113.csv"

# default to not caching
cache = utils.flags().get("cache", False)
force = not cache

names = utils.flags().get("names", False)

y = load_data("legislators-current.yaml")
by_district = {}
for m in y:
    last_term = m["terms"][-1]
    if last_term["type"] != "sen":
        full_district = "%s%02d" % (last_term["state"], int(last_term["district"]))
        by_district[full_district] = m

for rec in csv.DictReader(open(house_labels)):
    full_district = rec["113 ST/DIS"]
    # empty seat - IL-02
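# --- Illustrative sketch (not part of the original script) ---
# The labels script above keys current House members by state plus a zero-padded
# district number (e.g. "IL02") so CSV rows can be matched back to YAML entries.
# A standalone version of that keying, on made-up records:
def district_key(state, district):
    return "%s%02d" % (state, int(district))

members = [{"name": "Example Rep", "state": "IL", "district": "2"}]
by_district_example = {district_key(m["state"], m["district"]): m for m in members}
print(by_district_example["IL02"]["name"])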
def run(): today = datetime.now().date() # default to not caching cache = utils.flags().get('cache', False) force = not cache y = load_data("legislators-current.yaml") # Map bioguide IDs to dicts. Reference the same dicts # in y so we are updating y when we update biogiude. bioguide = { } by_name = { } for m in y: if "bioguide" in m["id"]: bioguide[m["id"]["bioguide"]] = m party = m["terms"][-1]["party"][0] state = m["terms"][-1]["state"] last_name = m["name"]["last"] member_full = "%s (%s-%s)" % (last_name, party, state) by_name[member_full] = m print("Fetching general Senate information from senators_cfm.xml...") url = "http://www.senate.gov/general/contact_information/senators_cfm.xml" body = download(url, "legislators/senate.xml", force) dom = lxml.etree.parse(io.StringIO(body)) for node in dom.xpath("member"): bioguide_id = str(node.xpath("string(bioguide_id)")).strip() member_full = node.xpath("string(member_full)") if bioguide_id == "": print("Someone has an empty bioguide ID!") print(lxml.etree.tostring(node)) continue print("[%s] Processing Senator %s..." % (bioguide_id, member_full)) # find member record in our YAML, either by bioguide_id or member_full if bioguide_id in bioguide: member = bioguide[bioguide_id] else: if member_full in by_name: member = by_name[member_full] else: print("Bioguide ID '%s' and full name '%s' not recognized." % (bioguide_id, member_full)) exit(0) try: term = member["terms"][-1] except IndexError: print("Member has no terms", bioguide_id, member_full) continue if today < parse_date(term["start"]) or today > parse_date(term["end"]): print("Member's last listed term is not current", bioguide_id, member_full, term["start"]) continue if term["type"] != "sen": print("Member's last listed term is not a Senate term", bioguide_id, member_full) continue if term["state"] != str(node.xpath("string(state)")): print("Member's last listed term has the wrong state", bioguide_id, member_full) continue if "district" in term: del term["district"] full_name = str(node.xpath("string(first_name)")) suffix = None if ", " in full_name: full_name, suffix = full_name.split(", ") full_name += " " + str(node.xpath("string(last_name)")) if suffix: full_name += ", " + suffix member["name"]["official_full"] = full_name member["id"]["bioguide"] = bioguide_id term["class"] = { "Class I": 1, "Class II": 2, "Class III": 3}[ node.xpath("string(class)") ] term["party"] = { "D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent"}[ node.xpath("string(party)") ] url = str(node.xpath("string(website)")).strip() # kill trailing slashes and force hostname to lowercase since around December 2013 they started uppercasing "Senate.gov" url = re.sub("/$", "", url).replace(".Senate.gov", ".senate.gov") if not url.startswith("/"): term["url"] = url # temporary home pages for new senators term["address"] = str(node.xpath("string(address)")).strip().replace("\n ", " ") term["office"] = string.capwords(term["address"].upper().split(" WASHINGTON ")[0]) phone = str(node.xpath("string(phone)")).strip() term["phone"] = phone.replace("(", "").replace(")", "").replace(" ", "-") contact_form = str(node.xpath("string(email)")).strip().replace(".Senate.gov", ".senate.gov") if contact_form: # can be blank term["contact_form"] = contact_form print("\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...") url = "http://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml" body = download(url, "legislators/senate_cvc.xml", force) dom = lxml.etree.parse(io.StringIO(body)) for 
node in dom.getroot(): if node.tag == "lastUpdate": date, time = node.getchildren() print("Last updated: %s, %s" % (date.text, time.text)) continue bioguide_id = str(node.xpath("string(bioguideId)")).strip() if bioguide_id == "": print("Someone has an empty bioguide ID!") print(lxml.etree.tostring(node)) continue last_name = node.xpath("string(name/last)") party = node.xpath("string(party)") state = node.xpath("string(state)") member_full = "%s (%s-%s)" % (last_name, party, state) print("[%s] Processing Senator %s..." % (bioguide_id, member_full)) # find member record in our YAML, either by bioguide_id or member_full if bioguide_id in bioguide: member = bioguide[bioguide_id] else: if member_full in by_name: member = by_name[member_full] else: print("Bioguide ID '%s' and synthesized official name '%s' not recognized." % (bioguide_id, member_full)) exit(0) try: term = member["terms"][-1] except IndexError: print("Member has no terms", bioguide_id, member_full) continue if "id" not in member: member["id"] = {} member["id"]["lis"] = node.attrib["lis_member_id"] state_rank = node.xpath("string(stateRank)") if state_rank == '1': term["state_rank"] = "senior" elif state_rank == '2': term["state_rank"] = "junior" print("Saving data...") save_data(y, "legislators-current.yaml")
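# --- Illustrative sketch (not part of the original script) ---
# This variant of run() normalizes senator website URLs by stripping a trailing
# slash and lower-casing the ".Senate.gov" hostname capitalization that started
# appearing in the source feed. A standalone version of that cleanup:
import re

def clean_senate_url(url):
    return re.sub(r"/$", "", url).replace(".Senate.gov", ".senate.gov")

print(clean_senate_url("http://www.example.Senate.gov/"))  # http://www.example.senate.gov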
# gets CRP id for every member with a bioguide ID:
# options:
#  --cache: load from cache if present on disk (default: false)
#  --current: do *only* current legislators (default: true)
#  --historical: do *only* historical legislators (default: false)

import datetime
import re
import utils
import urllib2
import requests
from utils import download, load_data, save_data, parse_date
import json

options = utils.flags()
options['urllib'] = True  # disable scrapelib for this
debug = options.get('debug', False)

# default to NOT caching
cache = options.get('cache', False)
force = not cache

only_bioguide = options.get('bioguide', None)

# pick either current or historical
# order is important here, since current defaults to true
if utils.flags().get('historical', False):
def run(): def update_birthday(bioguide, person, main): birthday = birthday_for(main) if not birthday: print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main.encode("utf8"))) warnings.append(bioguide) return if birthday == "UNKNOWN": return try: birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y") except ValueError: print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main.encode("utf8"))) warnings.append(bioguide) return birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day) person.setdefault("bio", {})["birthday"] = birthday def birthday_for(string): # exceptions for not-nicely-placed semicolons string = string.replace( "born in Cresskill, Bergen County, N. J.; April", "born April") string = string.replace( "FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802") string = string.replace( "CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967") string = string.replace( "CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962") string = string.replace( "SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947") string = string.replace( 'KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968") # look for a date pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})" match = re.search(pattern, string, re.I) if not match or not match.group(1): # specifically detect cases that we can't handle to avoid unnecessary warnings if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN" if re.search( "born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN" return None return match.group(1).strip() def relationships_of(string): # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio # e.g. 
"(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)" pattern = "^\((.*?)\)" match = re.search(pattern, string, re.I) relationships = [] if match and len(match.groups()) > 0: relationship_text = match.group(1).encode("ascii", "replace") # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar from nltk import tree, pos_tag, RegexpParser tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text) pos = pos_tag(tokens) grammar = r""" NAME: {<NNP>+} NAMES: { <IN><NAME>(?:<CC><NAME>)* } RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ } MATCH: { <RELATIONSHIP><NAMES> } """ cp = RegexpParser(grammar) chunks = cp.parse(pos) # iterate through the Relationship/Names pairs for n in chunks: if isinstance(n, tree.Tree) and n.node == "MATCH": people = [] relationship = None for piece in n: if piece.node == "RELATIONSHIP": relationship = " ".join([x[0] for x in piece]) elif piece.node == "NAMES": for name in [ x for x in piece if isinstance(x, tree.Tree) ]: people.append(" ".join([x[0] for x in name])) for person in people: relationships.append({ "relation": relationship, "name": person }) return relationships # default to caching cache = utils.flags().get('cache', True) force = not cache # pick either current or historical # order is important here, since current defaults to true if utils.flags().get('historical', False): filename = "legislators-historical.yaml" elif utils.flags().get('current', True): filename = "legislators-current.yaml" else: print("No legislators selected.") exit(0) print("Loading %s..." % filename) legislators = load_data(filename) # reoriented cache to access by bioguide ID by_bioguide = {} for m in legislators: if "bioguide" in m["id"]: by_bioguide[m["id"]["bioguide"]] = m # optionally focus on one legislator bioguide = utils.flags().get('bioguide', None) if bioguide: bioguides = [bioguide] else: bioguides = list(by_bioguide.keys()) warnings = [] missing = [] count = 0 families = 0 for bioguide in bioguides: # Download & parse the HTML of the bioguide page. try: dom = fetch_bioguide_page(bioguide, force) except Exception as e: print(e) missing.append(bioguide) continue # Extract the member's name and the biography paragraph (main). try: name = dom.cssselect("p font")[0] main = dom.cssselect("p")[0] except IndexError: print("[%s] Missing name or content!" % bioguide) exit(0) name = name.text_content().strip() main = main.text_content().strip().replace("\n", " ").replace("\r", " ") main = re.sub("\s+", " ", main) # Extract the member's birthday. update_birthday(bioguide, by_bioguide[bioguide], main) # Extract relationships with other Members of Congress. if utils.flags().get("relationships", False): #relationship information, if present, is in a parenthetical immediately after the name. #should always be present if we passed the IndexError catch above after_name = dom.cssselect("p font")[0].tail.strip() relationships = relationships_of(after_name) if len(relationships): families = families + 1 by_bioguide[bioguide]["family"] = relationships count = count + 1 print() if warnings: print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings))) if missing: print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing))) print("Saving data to %s..." 
% filename) save_data(legislators, filename) print("Saved %d legislators to %s" % (count, filename)) if utils.flags().get("relationships", False): print("Found family members for %d of those legislators" % families)
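# --- Illustrative sketch (not part of the original script) ---
# run() above pulls a "born <Month> <day>, <year>" date out of the bioguide
# biography text and reformats it as YYYY-MM-DD. A standalone version of that
# extraction, on a made-up biography string:
import datetime
import re

PATTERN = (r"born [^;]*?((?:January|February|March|April|May|June|July|August|"
           r"September|October|November|December),? \d{1,2},? \d{4})")

def extract_birthday(bio_text):
    match = re.search(PATTERN, bio_text, re.I)
    if not match:
        return None
    parsed = datetime.datetime.strptime(match.group(1).replace(",", ""), "%B %d %Y")
    return "%04d-%02d-%02d" % (parsed.year, parsed.month, parsed.day)

print(extract_birthday("EXAMPLE, Jane, a Senator from Ohio; born January 5, 1962; ..."))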