Example #1
0
def save_bill_search_state(saved_bills, search_state):
    # For --fast mode, cache the current search result listing (in search_state)
    # to disk so we can detect major changes to the bill through the search
    # listing rather than having to parse the bill.
    for bill_id in saved_bills:
        if bill_id in search_state:
            fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
            new_state = search_state[bill_id]
            utils.write(new_state, fast_cache_path)
Example #2
0
def save_bill_search_state(saved_bills, search_state):
    # For --fast mode, cache the current search result listing (in search_state)
    # to disk so we can detect major changes to the bill through the search
    # listing rather than having to parse the bill.
    for bill_id in saved_bills:
        if bill_id in search_state:
            fast_cache_path = utils.cache_dir(
            ) + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
            new_state = search_state[bill_id]
            utils.write(new_state, fast_cache_path)
Example #3
0
def bill_ids_for(congress, options, bill_states={}):

    # override if we're actually using this method to get amendments
    doing_amendments = options.get('amendments', False)

    bill_ids = []

    bill_type = options.get(
        'amendment_type' if doing_amendments else 'bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:

        # This sub is re-used for pulling amendment IDs too.
        if (bill_type in ('samdt', 'hamdt', 'supamdt')) != doing_amendments:
            continue

        # match only links to landing pages of this bill type
        # it shouldn't catch stray links outside of the confines of the 100 on the page,
        # but if it does, no big deal
        link_pattern = "^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(page_for(congress, bill_type, offset),
                                  page_cache_for(congress, bill_type, offset),
                                  options)

            if not page:
                logging.error(
                    "Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_id = "%s-%s" % (code, congress)

                if options.get("fast", False):
                    fast_cache_path = utils.cache_dir(
                    ) + "/" + bill_info.bill_cache_for(bill_id,
                                                       "search_result.html")
                    old_state = utils.read(fast_cache_path)

                    # Compare all of the output in the search result's <p> tag, which
                    # has last major action, number of cosponsors, etc. to a cache on
                    # disk to see if any major information about the bill changed.
                    parent_node = link.getparent(
                    )  # the <p> tag containing the whole search hit
                    parent_node.remove(
                        parent_node.xpath("b")[0]
                    )  # remove the <b>###.</b> node that isn't relevant for comparison
                    new_state = etree.tostring(
                        parent_node)  # serialize this tag

                    if old_state == new_state:
                        logging.info("No change in search result listing: %s" %
                                     bill_id)
                        continue

                    bill_states[bill_id] = new_state

                bill_ids.append(bill_id)

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)
Example #4
0
def bill_ids_for(congress, options, bill_states={}):

    # override if we're actually using this method to get amendments
    doing_amendments = options.get('amendments', False)

    bill_ids = []

    bill_type = options.get('amendment_type' if doing_amendments else 'bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:

        # This sub is re-used for pulling amendment IDs too.
        if (bill_type in ('samdt', 'hamdt', 'supamdt')) != doing_amendments:
            continue

        # match only links to landing pages of this bill type
        # it shouldn't catch stray links outside of the confines of the 100 on the page,
        # but if it does, no big deal
        link_pattern = "^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(
                page_for(congress, bill_type, offset),
                page_cache_for(congress, bill_type, offset),
                options)

            if not page:
                logging.error("Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_id = "%s-%s" % (code, congress)

                if options.get("fast", False):
                    fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
                    old_state = utils.read(fast_cache_path)

                    # Compare all of the output in the search result's <p> tag, which
                    # has last major action, number of cosponsors, etc. to a cache on
                    # disk to see if any major information about the bill changed.
                    parent_node = link.getparent()  # the <p> tag containing the whole search hit
                    parent_node.remove(parent_node.xpath("b")[0])  # remove the <b>###.</b> node that isn't relevant for comparison
                    new_state = etree.tostring(parent_node)  # serialize this tag

                    if old_state == new_state:
                        logging.info("No change in search result listing: %s" % bill_id)
                        continue

                    bill_states[bill_id] = new_state

                bill_ids.append(bill_id)

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)