Esempio n. 1
0
def fetch_nomination(nomination_id, options=None):
    """Download, parse and save a single nomination.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``.
        options: optional dict of run options. ``download_only`` is read
            here; the dict is also passed on to the download, parse and
            output helpers.

    Returns:
        A dict with boolean ``ok`` and ``saved`` keys, plus a ``reason``
        string whenever nothing was saved.
    """
    # Avoid a shared mutable default: a ``{}`` default is created once and
    # would be shared by every call that omits ``options``.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch nomination details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"), options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into subnominations
    #
    # Also, the splitting process is nonsense:
    # http://thomas.loc.gov/home/PN/split.htm

    # Pages for split nominations are skipped rather than parsed.
    if "split into two or more parts" in body:
        return {'saved': False, 'ok': True, 'reason': 'was split'}

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
Esempio n. 2
0
def fetch_nomination(nomination_id, options=None):
    """Download, parse and save a single nomination.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``.
        options: optional dict of run options. ``download_only`` is read
            here; the dict is also passed on to the download, parse and
            output helpers.

    Returns:
        A dict with boolean ``ok`` and ``saved`` keys, plus a ``reason``
        string whenever nothing was saved.
    """
    # Avoid a shared mutable default: a ``{}`` default is created once and
    # would be shared by every call that omits ``options``.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch nomination details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"), options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    #   - detect group nominations, particularly for military promotions
    #   - detect when a group nomination is split into sub nominations
    #     because of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
Esempio n. 3
0
def run(options):
    """Fetch one nomination, or the nominations of a whole congress.

    Args:
        options: dict of run options. Keys read here are ``nomination_id``,
            ``congress``, ``fast`` and ``limit``; the dict is also handed to
            ``nomination_ids_for`` and ``utils.process_set``.

    Returns:
        None on every visible path (explicitly when there is nothing to
        fetch, implicitly after processing).
    """
    nomination_id = options.get('nomination_id', None)

    if nomination_id:
        # Only the congress is needed here, for the log line below.
        _type, _number, congress = utils.split_nomination_id(nomination_id)
        to_fetch = [nomination_id]
    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = nomination_ids_for(congress, options)
        if not to_fetch:
            if options.get("fast", False):
                logging.warning("No nominations changed.")
            else:
                logging.error(
                    "Error figuring out which nominations to download, aborting."
                )
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    # logging.warn is a deprecated alias; logging.warning is the supported name.
    logging.warning("Going to fetch %i nominations from congress #%s",
                    len(to_fetch), congress)

    saved_nominations = utils.process_set(to_fetch,
                                          nomination_info.fetch_nomination,
                                          options)
Esempio n. 4
0
def nomination_url_for(nomination_id):
    """Build the thomas.loc.gov query URL for a nomination.

    The nomination number may be plain ("63") or carry a part suffix
    ("64-01"); a missing suffix is encoded as "00".
    """
    kind, raw_number, congress = utils.split_nomination_id(nomination_id)

    pieces = raw_number.split("-")
    # default the part suffix when the number has no "-NN" tail
    suffix = pieces[1] if len(pieces) > 1 else "00"
    url_number = "%05d%s" % (int(pieces[0]), suffix)

    url_fields = (int(congress), kind.upper(), url_number)
    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % url_fields
Esempio n. 5
0
def nomination_url_for(nomination_id):
    """Return the thomas.loc.gov lookup URL for the given nomination.

    Handles both plain numbers ("63") and numbers with a part suffix
    ("64-01"); plain numbers get the suffix "00".
    """
    kind, raw_number, congress = utils.split_nomination_id(nomination_id)

    # Pad with a default "00" suffix, then keep just the base and the suffix.
    base, suffix = (raw_number.split("-") + ["00"])[:2]
    url_number = "%05d%s" % (int(base), suffix)

    template = "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/"
    return template % (int(congress), kind.upper(), url_number)
Esempio n. 6
0
def parse_nomination(nomination_id, body, options):
    """Extract labelled fields and actions from a nomination's HTML page.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``.
        body: HTML source of the nomination page.
        options: run options (not read in the visible code).

    NOTE(review): no return statement is visible in this excerpt; the
    function may continue beyond what is shown here.
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    doc = fromstring(body)
    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        # pair.text is None when the label sits inside nested markup
        text = pair.text or pair.text_content()
        label, data = text.replace(':', '').strip(), pair.tail.strip()

        if label.split(" ")[-1] == "Action":
            # "<date> - <description>"; re-join the remainder so a hyphen
            # inside the description is kept, and tolerate a missing one
            pieces = re.split(r"\s+\-\s+", data)
            info['actions'].append((label, pieces[0], " - ".join(pieces[1:])))
        else:
            info[label.lower()] = data

    # Some of the data is structured fine as is (e.g. Organization,
    # Referred to, Reported by). Some needs processing, like date and nominee.

    # Doc format is: "January 04, 1995 (104th Congress)"
    info["date"] = datetime.strptime(info["date received"].split(" (")[0],
                                     "%B %d, %Y").strftime("%Y-%m-%d")
    # Note: Will break with the 1000th congress in year 3789
    info["congress"] = int(
        re.search(r"(\d{2,3})[stndhr]{2}", info["date received"]).group(1))

    # remove final clause if there
    info["nominee"] = info["nominee"].split(", vice")[0]

    # get overview from the text of the nomination; the last group is greedy
    # so the whole position is captured rather than its first character
    match = re.search(r"(.+?), of (.+?), to be (.+)", info["nominee"])
    if match:
        (name, state, position) = match.groups()
    else:
        logging.error("Couldn't parse %s" % info["nominee"])
        (name, state, position) = ("", "", "")
Esempio n. 7
0
def fetch_nomination(nomination_id, options=None):
    """Download, parse and save a single nomination.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``.
        options: optional dict of run options. ``download_only`` is read
            here; the dict is also passed on to the download, parse and
            output helpers.

    Returns:
        A dict with boolean ``ok`` and ``saved`` keys, plus a ``reason``
        string whenever nothing was saved.
    """
    # Avoid a shared mutable default: a ``{}`` default is created once and
    # would be shared by every call that omits ``options``.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)
    if not number:
        return {
            'saved': False,
            'ok': False,
            'reason': "Couldn't parse %s" % nomination_id
        }

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch nomination details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"), options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {
            'saved': False,
            'ok': True,
            'reason': "requested download only"
        }

    # TODO:
    #   - detect group nominations, particularly for military promotions
    #   - detect when a group nomination is split into sub nominations
    #     because of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
Esempio n. 8
0
def parse_nomination(nomination_id, body, options):
    """Extract labelled fields and actions from a nomination's HTML page.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``.
        body: HTML source of the nomination page.
        options: run options (not read in the visible code).

    NOTE(review): no return statement is visible in this excerpt; the
    function may continue beyond what is shown here.
    """
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    doc = fromstring(body)
    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        # pair.text is None when the label sits inside nested markup
        text = pair.text or pair.text_content()
        label, data = text.replace(':', '').strip(), pair.tail.strip()

        if label.split(" ")[-1] == "Action":
            # "<date> - <description>"; re-join the remainder so a hyphen
            # inside the description is kept, and tolerate a missing one
            pieces = re.split(r"\s+\-\s+", data)
            info['actions'].append((label, pieces[0], " - ".join(pieces[1:])))
        else:
            info[label.lower()] = data

    # Some of the data is structured fine as is (e.g. Organization,
    # Referred to, Reported by). Some needs processing, like date and nominee.

    # Doc format is: "January 04, 1995 (104th Congress)"
    received = info["date received"]
    info["date"] = datetime.strptime(received.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")
    # Note: Will break with the 1000th congress in year 3789
    info["congress"] = int(re.search(r"(\d{2,3})[stndhr]{2}", received).group(1))

    # remove final clause if there
    info["nominee"] = info["nominee"].split(", vice")[0]

    # get overview from the text of the nomination; the last group is greedy
    # so the whole position is captured rather than its first character
    match = re.search(r"(.+?), of (.+?), to be (.+)", info["nominee"])
    if match:
        (name, state, position) = match.groups()
    else:
        logging.error("Couldn't parse %s" % info["nominee"])
        (name, state, position) = ("", "", "")
Esempio n. 9
0
def run(options):
    """Fetch one nomination, or the nominations of a whole congress.

    Args:
        options: dict of run options. Keys read here are ``nomination_id``,
            ``congress``, ``fast`` and ``limit``; the dict is also handed to
            ``nomination_ids_for`` and ``utils.process_set``.

    Returns:
        None on every visible path (explicitly when there is nothing to
        fetch, implicitly after processing).
    """
    nomination_id = options.get('nomination_id', None)

    if nomination_id:
        # Only the congress is needed here, for the log line below.
        _type, _number, congress = utils.split_nomination_id(nomination_id)
        to_fetch = [nomination_id]
    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = nomination_ids_for(congress, options)
        if not to_fetch:
            if options.get("fast", False):
                logging.warning("No nominations changed.")
            else:
                logging.error("Error figuring out which nominations to download, aborting.")
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    # logging.warn is a deprecated alias; logging.warning is the supported name.
    logging.warning("Going to fetch %i nominations from congress #%s", len(to_fetch), congress)

    saved_nominations = utils.process_set(to_fetch, nomination_info.fetch_nomination, options)
Esempio n. 10
0
def nomination_cache_for(nomination_id, file):
    """Return the cache path for one of a nomination's files.

    The path has the form ``<congress>/nominations/<number>/<file>``.
    """
    _kind, number, congress = utils.split_nomination_id(nomination_id)
    segments = (congress, number, file)
    return "%s/nominations/%s/%s" % segments
Esempio n. 11
0
def parse_nomination(nomination_id, body, options):
    """Extract nomination details from a nomination's HTML page.

    Labelled fields (bold or ``elabel`` spans followed by text) are read
    into an ``info`` dict; "... Action" labels become entries in
    ``info['actions']`` and referral targets are collected separately.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``;
            also checked against the page's own "Nomination" field.
        body: HTML source of the nomination page.
        options: run options (not read in the visible code).

    Raises:
        Exception: on a mismatched nomination ID, an unparseable received
            date or nominee, missing comment facts, or an unknown label.

    NOTE(review): no return statement is visible in this excerpt, and
    ``committee_names`` / ``committees`` are not consumed here; the function
    may continue beyond what is shown.
    """
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    committee_names = []
    committees = []

    doc = fromstring(body)
    info = {
        'nomination_id': nomination_id, 'actions': []
    }

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        text = pair.text or pair.text_content()
        label, data = text.replace(':', '').strip(), pair.tail.strip()

        # handle actions separately
        if label.split(" ")[-1] == "Action":
            pieces = re.split(r"\s+\-\s+", data)

            location = label.split(" ")[0].lower()

            # use 'acted_at', even though it's always a date, to be consistent
            # with acted_at field on bills and amendments
            acted_at = datetime.strptime(pieces[0], "%B %d, %Y").strftime("%Y-%m-%d")

            # join rest back together (in case action itself has a hyphen)
            text = " - ".join(pieces[1:])

            info['actions'].append({
                "type": "action",
                "location": location,
                "acted_at": acted_at,
                "text": text
            })

        # let's handle the remaining labels one by one
        elif label == "Organization":
            info["organization"] = data

        elif label == "Control Number":
            # this doesn't seem useful
            pass

        elif label.lower() == "referred to":
            committee_names.append(data)

        elif label == "Reported by":
            info["reported_by"] = data

        elif label == "Nomination":
            # sanity check - verify nomination_id matches
            if nomination_id != data:
                raise Exception("Whoa! Mismatched nomination ID.")

        elif label == "Date Received":
            # Note: Will break with the 1000th congress in year 3789
            match = re.search(r"(\d{2,3})[stndhr]{2}", data)
            if match:
                info["congress"] = int(match.group(1))
            else:
                raise Exception("Choked, couldn't find Congress in \"%s\"" % data)

            # Doc format is: "January 04, 1995 (104th Congress)"
            info["received_on"] = datetime.strptime(data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

        elif label == "Nominee":
            name = data.split(", vice")[0]

            # the name is everything before the first comma
            match = re.search(r"(.+?),", name)
            if not match:
                raise Exception("Couldn't parse nominee entry: %s" % name)
            name = match.group(1)

            # and grab the state and position out of the comment facts;
            # check the length first so a short list gives a clear error
            # instead of a bare IndexError
            position = facts[-5] if len(facts) >= 5 else None
            if not position:
                raise Exception("Couldn't find the position in the comments.")
            if len(facts) < 6:
                raise Exception("Couldn't find the state in the comments.")

            info["nominees"] = [{
                "name": name,
                "position": position,
                "state": facts[-6][2:]
            }]

        elif label.lower() == "nominees":
            pass

        elif label.lower() == "list of nominees":
            # step through each sibling, collecting each br's stripped tail
            # for names as we go; stop when we get to a strong or span
            # (next label)
            nominees = []

            current_position = None
            for sibling in pair.itersiblings():
                if sibling.tag in ("strong", "span"):
                    break
                if sibling.tag != "br" or not sibling.tail:
                    continue

                name = sibling.tail.strip()
                if name[0:5].lower() == "to be":
                    # a "to be <position>" line applies to the names after it
                    current_position = name[6:].strip()
                elif name:
                    nominees.append({
                        "name": name,
                        "position": current_position
                    })

            info["nominees"] = nominees

        else:
            # choke, I think we handle all of them now
            raise Exception("Unrecognized label: %s" % label)
Esempio n. 12
0
def nomination_cache_for(nomination_id, file):
    """Build the cache path for a file belonging to a nomination.

    Layout: ``<congress>/nominations/<number>/<file>``.
    """
    parsed = utils.split_nomination_id(nomination_id)
    _kind, number, congress = parsed
    template = "%s/nominations/%s/%s"
    return template % (congress, number, file)
Esempio n. 13
0
def output_for_nomination(nomination_id, format):
    """Return the output path of a nomination's data file in a given format.

    Layout: ``<data_dir>/<congress>/nominations/<number>/data.<format>``.
    """
    _kind, number, congress = utils.split_nomination_id(nomination_id)
    root = utils.data_dir()
    filename = "data.%s" % format
    return "%s/%s/nominations/%s/%s" % (root, congress, number, filename)
Esempio n. 14
0
def parse_nomination(nomination_id, body, options):
    """Extract nomination details from a nomination's HTML page.

    Labelled fields (bold or ``elabel`` spans followed by text) are read
    into an ``info`` dict; "... Action" labels become entries in
    ``info['actions']`` and referral targets are collected separately.

    Args:
        nomination_id: identifier accepted by ``utils.split_nomination_id``;
            also checked against the page's own "Nomination" field.
        body: HTML source of the nomination page.
        options: run options (not read in the visible code).

    Raises:
        Exception: on a mismatched nomination ID, an unparseable received
            date or nominee, missing comment facts, or an unknown label.

    NOTE(review): no return statement is visible in this excerpt, and
    ``committee_names`` / ``committees`` are not consumed here; the function
    may continue beyond what is shown.
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall(r"<!--(.+?)-->", body)
    body = re.sub(r"<!--.+?-->", "", body)

    committee_names = []
    committees = []

    doc = fromstring(body)
    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a
    # heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if not pair.tail:
            continue

        text = pair.text or pair.text_content()
        label, data = text.replace(':', '').strip(), pair.tail.strip()

        # handle actions separately
        if label.split(" ")[-1] == "Action":
            pieces = re.split(r"\s+\-\s+", data)

            location = label.split(" ")[0].lower()

            # use 'acted_at', even though it's always a date, to be consistent
            # with acted_at field on bills and amendments
            acted_at = datetime.strptime(pieces[0],
                                         "%B %d, %Y").strftime("%Y-%m-%d")

            # join rest back together (in case action itself has a hyphen)
            text = " - ".join(pieces[1:])

            info['actions'].append({
                "type": "action",
                "location": location,
                "acted_at": acted_at,
                "text": text
            })

        # let's handle the remaining labels one by one
        elif label == "Organization":
            info["organization"] = data

        elif label == "Control Number":
            # this doesn't seem useful
            pass

        elif label.lower() == "referred to":
            committee_names.append(data)

        elif label == "Reported by":
            info["reported_by"] = data

        elif label == "Nomination":
            # sanity check - verify nomination_id matches
            if nomination_id != data:
                raise Exception("Whoa! Mismatched nomination ID.")

        elif label == "Date Received":
            # Note: Will break with the 1000th congress in year 3789
            match = re.search(r"(\d{2,3})[stndhr]{2}", data)
            if match:
                info["congress"] = int(match.group(1))
            else:
                raise Exception(
                    "Choked, couldn't find Congress in \"%s\"" % data)

            # Doc format is: "January 04, 1995 (104th Congress)"
            info["received_on"] = datetime.strptime(
                data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

        elif label == "Nominee":
            name = data.split(", vice")[0]

            # the name is everything before the first comma
            match = re.search(r"(.+?),", name)
            if not match:
                raise Exception("Couldn't parse nominee entry: %s" %
                                name)
            name = match.group(1)

            # and grab the state and position out of the comment facts;
            # check the length first so a short list gives a clear error
            # instead of a bare IndexError
            position = facts[-5] if len(facts) >= 5 else None
            if not position:
                raise Exception(
                    "Couldn't find the position in the comments.")
            if len(facts) < 6:
                raise Exception(
                    "Couldn't find the state in the comments.")

            info["nominees"] = [{
                "name": name,
                "position": position,
                "state": facts[-6][2:]
            }]

        elif label.lower() == "nominees":
            pass

        elif label.lower() == "list of nominees":
            # step through each sibling, collecting each br's stripped tail
            # for names as we go; stop when we get to a strong or span
            # (next label)
            nominees = []

            current_position = None
            for sibling in pair.itersiblings():
                if sibling.tag in ("strong", "span"):
                    break
                if sibling.tag != "br" or not sibling.tail:
                    continue

                name = sibling.tail.strip()
                if name[0:5].lower() == "to be":
                    # a "to be <position>" line applies to the names after it
                    current_position = name[6:].strip()
                elif name:
                    nominees.append({
                        "name": name,
                        "position": current_position
                    })

            info["nominees"] = nominees

        else:
            # choke, I think we handle all of them now
            raise Exception("Unrecognized label: %s" % label)
Esempio n. 15
0
def nomination_url_for(nomination_id):
    """Build the thomas.loc.gov query URL for a nomination.

    The nomination number may be plain ("63") or carry a part suffix
    ("64-01"). A plain number is encoded with the suffix "00", which gives
    the same URL as before; a suffixed number no longer fails in ``int()``.
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)

    # numbers can be either of the form "63" or "64-01"; formatting with %s
    # first keeps non-string numbers working as they did with int(number)
    number_pieces = ("%s" % (number,)).split("-")
    suffix = number_pieces[1] if len(number_pieces) > 1 else "00"
    url_number = "%05d%s" % (int(number_pieces[0]), suffix)

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (
        int(congress), nomination_type.upper(), url_number)
Esempio n. 16
0
def output_for_nomination(nomination_id, format):
    """Build the path where a nomination's data is written for a format.

    Layout: ``<data_dir>/<congress>/nominations/<number>/data.<format>``.
    """
    _kind, number, congress = utils.split_nomination_id(nomination_id)
    base_dir = utils.data_dir()
    data_file = "data.%s" % format
    path_parts = (base_dir, congress, number, data_file)
    return "%s/%s/nominations/%s/%s" % path_parts
Esempio n. 17
0
def nomination_url_for(nomination_id):
    """Build the thomas.loc.gov query URL for a nomination.

    The nomination number may be plain ("63") or carry a part suffix
    ("64-01"). A plain number is encoded with the suffix "00", which gives
    the same URL as before; a suffixed number no longer fails in ``int()``.
    """
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # numbers can be either of the form "63" or "64-01"; formatting with %s
    # first keeps non-string numbers working as they did with int(number)
    number_pieces = ("%s" % (number,)).split("-")
    suffix = number_pieces[1] if len(number_pieces) > 1 else "00"
    url_number = "%05d%s" % (int(number_pieces[0]), suffix)

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (int(congress), nomination_type.upper(), url_number)