def fetch_nomination(nomination_id, options=None):
    """Download, parse and write out a single nomination.

    Args:
        nomination_id: Identifier accepted by ``utils.split_nomination_id``.
        options: Optional dict of run options. Only ``download_only`` is read
            directly here; the dict is also forwarded to the committee-name,
            download, parse and output helpers.

    Returns:
        A dict with boolean ``ok`` and ``saved`` flags, plus a ``reason``
        string whenever nothing was saved.
    """
    # Build a fresh dict per call. The previous ``options={}`` default was a
    # single dict shared by every call and by every helper it was passed to.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch nomination details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"),
        options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into subnominations
    #
    # Also, the splitting process is nonsense:
    # http://thomas.loc.gov/home/PN/split.htm
    if "split into two or more parts" in body:
        return {'saved': False, 'ok': True, 'reason': 'was split'}

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
def fetch_nomination(nomination_id, options=None):
    """Download, parse and write out a single nomination.

    Args:
        nomination_id: Identifier accepted by ``utils.split_nomination_id``.
        options: Optional dict of run options. Only ``download_only`` is read
            directly here; the dict is also forwarded to the committee-name,
            download, parse and output helpers.

    Returns:
        A dict with boolean ``ok`` and ``saved`` flags, plus a ``reason``
        string whenever nothing was saved.
    """
    # Build a fresh dict per call. The previous ``options={}`` default was a
    # single dict shared by every call and by every helper it was passed to.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch nomination details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"),
        options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into sub nominations because
    #   of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
def run(options):
    """Fetch one nomination, or every nomination listed for a congress.

    Options read here:
        nomination_id: fetch only this nomination.
        congress: congress to list nominations for when no ``nomination_id``
            is given; defaults to ``utils.current_congress()``.
        fast: when the id list comes back empty, log it as "nothing changed"
            (warning) instead of as a failure (error).
        limit: fetch at most this many nominations.

    Returns:
        None on every path.
    """
    nomination_id = options.get('nomination_id', None)

    if nomination_id:
        # Only the congress is needed here, for the progress message below.
        _, _, congress = utils.split_nomination_id(nomination_id)
        to_fetch = [nomination_id]
    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = nomination_ids_for(congress, options)
        if not to_fetch:
            if options.get("fast", False):
                logging.warning("No nominations changed.")
            else:
                logging.error("Error figuring out which nominations to download, aborting.")
            return None

    limit = options.get('limit', None)
    if limit:
        to_fetch = to_fetch[:int(limit)]

    # logging.warn is a deprecated alias; logging.warning is the supported name.
    logging.warning("Going to fetch %i nominations from congress #%s",
                    len(to_fetch), congress)

    utils.process_set(to_fetch, nomination_info.fetch_nomination, options)
def nomination_url_for(nomination_id):
    """Build the thomas.loc.gov query URL for a nomination's page."""
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # A number is either plain ("63") or carries a part suffix ("64-01");
    # plain numbers are given the suffix "00".
    pieces = number.split("-")
    suffix = pieces[1] if len(pieces) > 1 else "00"
    url_number = "%05d%s" % (int(pieces[0]), suffix)

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (
        int(congress), nomination_type.upper(), url_number)
def parse_nomination(nomination_id, body, options):
    """Scrape the HTML of a nomination page into a dict.

    Args:
        nomination_id: Stored in the result under ``nomination_id``.
        body: HTML text of the nomination page.
        options: Not used in the visible body.

    Returns:
        A dict holding ``nomination_id``, ``actions`` (a list of
        ``(label, date_text, action_text)`` tuples), one lower-cased key per
        other label found on the page, and the derived ``date``, ``congress``
        and trimmed ``nominee`` entries.

    Raises:
        KeyError: the page has no "Date Received" or "Nominee" label.
        ValueError: the received date is not in "%B %d, %Y" form.
        AttributeError: no congress ordinal is found in the received date.
    """
    # NOTE(review): these three values and `facts` are not consumed in the
    # visible body; kept in case a later part of the function relies on them.
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    doc = fromstring(body)

    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        # pair.text can be None (e.g. the label sits inside a child element);
        # fall back to the element's full text instead of crashing.
        raw_label = pair.text or pair.text_content()
        label, data = raw_label.replace(':', '').strip(), pair.tail.strip()

        if label.split(" ")[-1] == "Action":
            # Split once only, so an action whose own text contains " - " is
            # kept whole; an action with no separator gets an empty text
            # rather than raising IndexError.
            pieces = re.split(r"\s+\-\s+", data, maxsplit=1)
            action_text = pieces[1] if len(pieces) > 1 else ""
            info['actions'].append((label, pieces[0], action_text))
        else:
            info[label.lower()] = data

    # Some of the data is structured fine as is (e.g. Organization,
    # Referred to, Reported by). Some needs processing, like date and nominee.
    received = info["date received"]

    # Doc format is: "January 04, 1995 (104th Congress)"
    info["date"] = datetime.strptime(
        received.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

    # Note: Will break with the 1000th congress in year 3789
    info["congress"] = int(re.search(r"(\d{2,3})[stndhr]{2}", received).group(1))

    # remove final clause if there
    info["nominee"] = info["nominee"].split(", vice")[0]

    # get overview from the text of the nomination. The last group is greedy:
    # a trailing lazy "(.+?)" would capture only a single character.
    overview = re.search(r"(.+?), of (.+?), to be (.+)", info["nominee"])
    if overview:
        (name, state, position) = overview.groups()
    else:
        logging.error("Couldn't parse %s" % info["nominee"])
        (name, state, position) = ("", "", "")

    # NOTE(review): the original body ended here without returning anything,
    # and name/state/position are not stored anywhere visible. fetch_nomination
    # hands this function's result to output_nomination, so the parsed dict is
    # returned; confirm against the full original function.
    return info
def fetch_nomination(nomination_id, options=None):
    """Download, parse and write out a single nomination.

    Args:
        nomination_id: Identifier accepted by ``utils.split_nomination_id``.
        options: Optional dict of run options. Only ``download_only`` is read
            directly here; the dict is also forwarded to the committee-name,
            download, parse and output helpers.

    Returns:
        A dict with boolean ``ok`` and ``saved`` flags, plus a ``reason``
        string whenever nothing was saved.
    """
    # Build a fresh dict per call. The previous ``options={}`` default was a
    # single dict shared by every call and by every helper it was passed to.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {
            'saved': False,
            'ok': False,
            'reason': "Couldn't parse %s" % nomination_id,
        }

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch nomination details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"),
        options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {
            'saved': False,
            'ok': True,
            'reason': "requested download only",
        }

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into sub nominations because
    #   of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
def parse_nomination(nomination_id, body, options):
    """Scrape the HTML of a nomination page into a dict.

    Args:
        nomination_id: Stored in the result under ``nomination_id``.
        body: HTML text of the nomination page.
        options: Not used in the visible body.

    Returns:
        A dict holding ``nomination_id``, ``actions`` (a list of
        ``(label, date_text, action_text)`` tuples), one lower-cased key per
        other label found on the page, and the derived ``date``, ``congress``
        and trimmed ``nominee`` entries.

    Raises:
        KeyError: the page has no "Date Received" or "Nominee" label.
        ValueError: the received date is not in "%B %d, %Y" form.
        AttributeError: no congress ordinal is found in the received date.
    """
    # NOTE(review): these three values and `facts` are not consumed in the
    # visible body; kept in case a later part of the function relies on them.
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    doc = fromstring(body)

    info = {
        'nomination_id': nomination_id,
        'actions': [],
    }

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        # pair.text can be None (e.g. the label sits inside a child element);
        # fall back to the element's full text instead of crashing.
        raw_label = pair.text or pair.text_content()
        label, data = raw_label.replace(':', '').strip(), pair.tail.strip()

        if label.split(" ")[-1] == "Action":
            # Split once only, so an action whose own text contains " - " is
            # kept whole; an action with no separator gets an empty text
            # rather than raising IndexError.
            pieces = re.split(r"\s+\-\s+", data, maxsplit=1)
            action_text = pieces[1] if len(pieces) > 1 else ""
            info['actions'].append((label, pieces[0], action_text))
        else:
            info[label.lower()] = data

    # Some of the data is structured fine as is (e.g. Organization,
    # Referred to, Reported by). Some needs processing, like date and nominee.
    received = info["date received"]

    # Doc format is: "January 04, 1995 (104th Congress)"
    info["date"] = datetime.strptime(
        received.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

    # Note: Will break with the 1000th congress in year 3789
    info["congress"] = int(re.search(r"(\d{2,3})[stndhr]{2}", received).group(1))

    # remove final clause if there
    info["nominee"] = info["nominee"].split(", vice")[0]

    # get overview from the text of the nomination. The last group is greedy:
    # a trailing lazy "(.+?)" would capture only a single character.
    overview = re.search(r"(.+?), of (.+?), to be (.+)", info["nominee"])
    if overview:
        (name, state, position) = overview.groups()
    else:
        logging.error("Couldn't parse %s" % info["nominee"])
        (name, state, position) = ("", "", "")

    # NOTE(review): the original body ended here without returning anything,
    # and name/state/position are not stored anywhere visible. fetch_nomination
    # hands this function's result to output_nomination, so the parsed dict is
    # returned; confirm against the full original function.
    return info
def run(options):
    """Fetch one nomination, or every nomination listed for a congress.

    Options read here:
        nomination_id: fetch only this nomination.
        congress: congress to list nominations for when no ``nomination_id``
            is given; defaults to ``utils.current_congress()``.
        fast: when the id list comes back empty, log it as "nothing changed"
            (warning) instead of as a failure (error).
        limit: fetch at most this many nominations.

    Returns:
        None on every path.
    """
    nomination_id = options.get('nomination_id', None)

    if nomination_id:
        # Only the congress is needed here, for the progress message below.
        _, _, congress = utils.split_nomination_id(nomination_id)
        to_fetch = [nomination_id]
    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = nomination_ids_for(congress, options)
        if not to_fetch:
            if options.get("fast", False):
                logging.warning("No nominations changed.")
            else:
                logging.error("Error figuring out which nominations to download, aborting.")
            return None

    limit = options.get('limit', None)
    if limit:
        to_fetch = to_fetch[:int(limit)]

    # logging.warn is a deprecated alias; logging.warning is the supported name.
    logging.warning("Going to fetch %i nominations from congress #%s",
                    len(to_fetch), congress)

    utils.process_set(to_fetch, nomination_info.fetch_nomination, options)
def nomination_cache_for(nomination_id, file):
    """Return the cache path "<congress>/nominations/<number>/<file>"."""
    _, number, congress = utils.split_nomination_id(nomination_id)
    path_parts = {"congress": congress, "number": number, "file": file}
    return "%(congress)s/nominations/%(number)s/%(file)s" % path_parts
def parse_nomination(nomination_id, body, options):
    """Scrape the HTML of a nomination page into a dict.

    Args:
        nomination_id: Stored in the result and checked against the page's
            "Nomination" label.
        body: HTML text of the nomination page.
        options: Not used in the visible body.

    Returns:
        A dict holding ``nomination_id`` and ``actions`` (a list of dicts with
        ``type``, ``location``, ``acted_at`` and ``text``), plus whichever of
        ``organization``, ``reported_by``, ``congress``, ``received_on`` and
        ``nominees`` the page's labels provide.

    Raises:
        Exception: on a mismatched nomination id, an unparseable received
            date or nominee entry, missing position/state comments, or an
            unrecognized label.
        ValueError: a date on the page is not in "%B %d, %Y" form.
    """
    # NOTE(review): these three values and `committees` are not consumed in
    # the visible body; kept in case a later part of the function uses them.
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    committee_names = []
    committees = []

    doc = fromstring(body)

    info = {
        'nomination_id': nomination_id,
        'actions': [],
    }

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        raw_label = pair.text or pair.text_content()
        label, data = raw_label.replace(':', '').strip(), pair.tail.strip()
        lowered = label.lower()

        # handle actions separately
        if label.split(" ")[-1] == "Action":
            pieces = re.split(r"\s+\-\s+", data)

            # use 'acted_at', even though it's always a date, to be consistent
            # with acted_at field on bills and amendments
            acted_at = datetime.strptime(pieces[0], "%B %d, %Y").strftime("%Y-%m-%d")

            info['actions'].append({
                "type": "action",
                "location": label.split(" ")[0].lower(),
                "acted_at": acted_at,
                # join rest back together (in case action itself has a hyphen)
                "text": " - ".join(pieces[1:]),
            })

        # let's handle the remaining labels one by one
        elif label == "Organization":
            info["organization"] = data

        elif label == "Control Number":
            # this doesn't seem useful
            pass

        elif lowered == "referred to":
            committee_names.append(data)

        elif label == "Reported by":
            info["reported_by"] = data

        elif label == "Nomination":
            # sanity check - verify nomination_id matches
            if nomination_id != data:
                raise Exception("Whoa! Mismatched nomination ID.")

        elif label == "Date Received":
            # Note: Will break with the 1000th congress in year 3789
            match = re.search(r"(\d{2,3})[stndhr]{2}", data)
            if not match:
                raise Exception("Choked, couldn't find Congress in \"%s\"" % data)
            info["congress"] = int(match.group(1))

            # Doc format is: "January 04, 1995 (104th Congress)"
            info["received_on"] = datetime.strptime(
                data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

        elif label == "Nominee":
            name = data.split(", vice")[0]
            name_match = re.search(r"(.+?),", name)
            if not name_match:
                raise Exception("Couldn't parse nominee entry: %s" % name)
            name = name_match.group(1)

            # and grab the state and position out of the comment facts.
            # Check the length first: indexing a too-short list raised a bare
            # IndexError before the intended error could be reported.
            if len(facts) < 5 or not facts[-5]:
                raise Exception("Couldn't find the position in the comments.")
            if len(facts) < 6:
                raise Exception("Couldn't find the state in the comments.")

            info["nominees"] = [{
                "name": name,
                "position": facts[-5],
                "state": facts[-6][2:],
            }]

        elif lowered == "nominees":
            pass

        elif lowered == "list of nominees":
            # step through each sibling, collecting each br's stripped tail
            # for names as we go; stop when we get to a strong or span
            # (next label)
            nominees = []
            current_position = None
            for sibling in pair.itersiblings():
                if sibling.tag in ("strong", "span"):
                    break
                if sibling.tag != "br" or not sibling.tail:
                    continue

                entry = sibling.tail.strip()
                if entry[0:5].lower() == "to be":
                    current_position = entry[6:].strip()
                elif entry:
                    nominees.append({
                        "name": entry,
                        "position": current_position,
                    })

            info["nominees"] = nominees

        else:
            # choke, I think we handle all of them now
            raise Exception("Unrecognized label: %s" % label)

    # NOTE(review): the original body ended here without returning anything,
    # and committee_names is collected but not stored anywhere visible.
    # fetch_nomination hands this function's result to output_nomination, so
    # the parsed dict is returned; confirm against the full original function.
    return info
def nomination_cache_for(nomination_id, file):
    """Return the cache path "<congress>/nominations/<number>/<file>"."""
    _, number, congress = utils.split_nomination_id(nomination_id)
    path_parts = {"congress": congress, "number": number, "file": file}
    return "%(congress)s/nominations/%(number)s/%(file)s" % path_parts
def output_for_nomination(nomination_id, format):
    """Return the path of the nomination's "data.<format>" output file.

    The path is rooted at ``utils.data_dir()`` and laid out as
    "<data_dir>/<congress>/nominations/<number>/data.<format>".
    """
    _, number, congress = utils.split_nomination_id(nomination_id)
    root = utils.data_dir()
    filename = "data.%s" % format
    return "%s/%s/nominations/%s/%s" % (root, congress, number, filename)
def parse_nomination(nomination_id, body, options):
    """Scrape the HTML of a nomination page into a dict.

    Args:
        nomination_id: Stored in the result and checked against the page's
            "Nomination" label.
        body: HTML text of the nomination page.
        options: Not used in the visible body.

    Returns:
        A dict holding ``nomination_id`` and ``actions`` (a list of dicts with
        ``type``, ``location``, ``acted_at`` and ``text``), plus whichever of
        ``organization``, ``reported_by``, ``congress``, ``received_on`` and
        ``nominees`` the page's labels provide.

    Raises:
        Exception: on a mismatched nomination id, an unparseable received
            date or nominee entry, missing position/state comments, or an
            unrecognized label.
        ValueError: a date on the page is not in "%B %d, %Y" form.
    """
    # NOTE(review): these three values and `committees` are not consumed in
    # the visible body; kept in case a later part of the function uses them.
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    committee_names = []
    committees = []

    doc = fromstring(body)

    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        raw_label = pair.text or pair.text_content()
        label, data = raw_label.replace(':', '').strip(), pair.tail.strip()
        lowered = label.lower()

        # handle actions separately
        if label.split(" ")[-1] == "Action":
            pieces = re.split(r"\s+\-\s+", data)

            # use 'acted_at', even though it's always a date, to be consistent
            # with acted_at field on bills and amendments
            acted_at = datetime.strptime(
                pieces[0], "%B %d, %Y").strftime("%Y-%m-%d")

            info['actions'].append({
                "type": "action",
                "location": label.split(" ")[0].lower(),
                "acted_at": acted_at,
                # join rest back together (in case action itself has a hyphen)
                "text": " - ".join(pieces[1:]),
            })

        # let's handle the remaining labels one by one
        elif label == "Organization":
            info["organization"] = data

        elif label == "Control Number":
            # this doesn't seem useful
            pass

        elif lowered == "referred to":
            committee_names.append(data)

        elif label == "Reported by":
            info["reported_by"] = data

        elif label == "Nomination":
            # sanity check - verify nomination_id matches
            if nomination_id != data:
                raise Exception("Whoa! Mismatched nomination ID.")

        elif label == "Date Received":
            # Note: Will break with the 1000th congress in year 3789
            match = re.search(r"(\d{2,3})[stndhr]{2}", data)
            if not match:
                raise Exception(
                    "Choked, couldn't find Congress in \"%s\"" % data)
            info["congress"] = int(match.group(1))

            # Doc format is: "January 04, 1995 (104th Congress)"
            info["received_on"] = datetime.strptime(
                data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

        elif label == "Nominee":
            name = data.split(", vice")[0]
            name_match = re.search(r"(.+?),", name)
            if not name_match:
                raise Exception("Couldn't parse nominee entry: %s" % name)
            name = name_match.group(1)

            # and grab the state and position out of the comment facts.
            # Check the length first: indexing a too-short list raised a bare
            # IndexError before the intended error could be reported.
            if len(facts) < 5 or not facts[-5]:
                raise Exception(
                    "Couldn't find the position in the comments.")
            if len(facts) < 6:
                raise Exception("Couldn't find the state in the comments.")

            info["nominees"] = [{
                "name": name,
                "position": facts[-5],
                "state": facts[-6][2:],
            }]

        elif lowered == "nominees":
            pass

        elif lowered == "list of nominees":
            # step through each sibling, collecting each br's stripped tail
            # for names as we go; stop when we get to a strong or span
            # (next label)
            nominees = []
            current_position = None
            for sibling in pair.itersiblings():
                if sibling.tag in ("strong", "span"):
                    break
                if sibling.tag != "br" or not sibling.tail:
                    continue

                entry = sibling.tail.strip()
                if entry[0:5].lower() == "to be":
                    current_position = entry[6:].strip()
                elif entry:
                    nominees.append({
                        "name": entry,
                        "position": current_position,
                    })

            info["nominees"] = nominees

        else:
            # choke, I think we handle all of them now
            raise Exception("Unrecognized label: %s" % label)

    # NOTE(review): the original body ended here without returning anything,
    # and committee_names is collected but not stored anywhere visible.
    # fetch_nomination hands this function's result to output_nomination, so
    # the parsed dict is returned; confirm against the full original function.
    return info
def nomination_url_for(nomination_id):
    """Build the thomas.loc.gov query URL for a nomination's page.

    Plain numbers ("63") get the part suffix "00", exactly as before.
    Part-suffixed numbers ("64-01") previously made ``int(number)`` raise
    ValueError; they now use their own part as the suffix.

    Raises:
        ValueError: the number is neither an integer nor "<integer>-<part>".
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)

    try:
        url_number = "%05d00" % int(number)
    except ValueError:
        # Not a plain integer: treat it as "<number>-<part>". A number with no
        # valid integer before the dash still raises ValueError here.
        base, _, part = number.partition("-")
        url_number = "%05d%s" % (int(base), part or "00")

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (
        int(congress), nomination_type.upper(), url_number)
def output_for_nomination(nomination_id, format):
    """Return the path of the nomination's "data.<format>" output file.

    The path is rooted at ``utils.data_dir()`` and laid out as
    "<data_dir>/<congress>/nominations/<number>/data.<format>".
    """
    _, number, congress = utils.split_nomination_id(nomination_id)
    root = utils.data_dir()
    filename = "data.%s" % format
    return "%s/%s/nominations/%s/%s" % (root, congress, number, filename)
def nomination_url_for(nomination_id):
    """Build the thomas.loc.gov query URL for a nomination's page.

    Plain numbers ("63") get the part suffix "00", exactly as before.
    Part-suffixed numbers ("64-01") previously made ``int(number)`` raise
    ValueError; they now use their own part as the suffix.

    Raises:
        ValueError: the number is neither an integer nor "<integer>-<part>".
    """
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    try:
        url_number = "%05d00" % int(number)
    except ValueError:
        # Not a plain integer: treat it as "<number>-<part>". A number with no
        # valid integer before the dash still raises ValueError here.
        base, _, part = number.partition("-")
        url_number = "%05d%s" % (int(base), part or "00")

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (
        int(congress), nomination_type.upper(), url_number)