Example #1
0
def update_bill_version_list(only_congress):
    bill_versions = {}

    # Which sitemap years should we look at?
    if not only_congress:
        sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
    else:
        # If --congress=X is specified, only look at the relevant years.
        sitemap_files = [
            utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
            for year in utils.get_congress_years(only_congress)
        ]
        sitemap_files = [f for f in sitemap_files if os.path.exists(f)]

    # For each year-by-year BILLS sitemap...
    for year_sitemap in sitemap_files:
        dom = etree.parse(year_sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each bill text version...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # get URL and last modified date
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))

            # extract bill congress, type, number, and version from the URL
            m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched bill document URL: " + url)
            congress, bill_type, bill_number, version_code = m.groups()
            congress = int(congress)
            if bill_type not in utils.thomas_types:
                raise Exception("Invalid bill type: " + url)

            # If --congress=XXX is specified, only look at those bills.
            if only_congress and congress != only_congress:
                continue

            # Track the documents by congress, bill type, etc.
            bill_versions.setdefault(congress, {}).setdefault(bill_type, {}).setdefault(bill_number, {})[
                version_code
            ] = {"url": url, "lastmod": lastmod}

    # Output the bill version info. We can't do this until the end because we need to get
    # the complete list of versions for a bill before we write the file, and the versions
    # may be split across multiple sitemap files.

    for congress in bill_versions:
        for bill_type in bill_versions[congress]:
            for bill_number in bill_versions[congress][bill_type]:
                utils.write(
                    json.dumps(
                        bill_versions[congress][bill_type][bill_number],
                        sort_keys=True,
                        indent=2,
                        default=utils.format_datetime,
                    ),
                    output_for_bill(congress, bill_type, bill_number, "text-versions.json"),
                )
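A minimal sketch of the nested structure the setdefault chain above builds; the congress, bill, and lastmod values are made up for illustration, and each text-versions.json file receives the innermost dict for one bill, keyed by version code.

# Hypothetical contents of bill_versions after processing one sitemap entry.
bill_versions = {
    113: {                      # congress (int)
        "hr": {                 # bill type
            "1234": {           # bill number
                "ih": {         # version code
                    "url": "http://www.gpo.gov/fdsys/pkg/BILLS-113hr1234ih/content-detail.html",
                    "lastmod": "2013-05-01T04:12:49Z",
                },
            },
        },
    },
}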
Example #2
0
def update_bill_version_list(only_congress):
  bill_versions = { }
  
  # Which sitemap years should we look at?
  if not only_congress:
    sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
  else:
    # If --congress=X is specified, only look at the relevant years.
    sitemap_files = [utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml" for year in utils.get_congress_years(only_congress)]
    sitemap_files = [f for f in sitemap_files if os.path.exists(f)]
  
  # For each year-by-year BILLS sitemap...
  for year_sitemap in sitemap_files:
    dom = etree.parse(year_sitemap).getroot()
    if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type.")
    
    # Loop through each bill text version...
    for file_node in dom.xpath("x:url", namespaces=ns):
      # get URL and last modified date
      url = str(file_node.xpath("string(x:loc)", namespaces=ns))
      lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
      
      # extract bill congress, type, number, and version from the URL
      m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
      if not m: raise Exception("Unmatched bill document URL: " + url)
      congress, bill_type, bill_number, version_code = m.groups()
      congress = int(congress)
      if bill_type not in utils.thomas_types: raise Exception("Invalid bill type: " + url)
      
      # If --congress=XXX is specified, only look at those bills. 
      if only_congress and congress != only_congress:
        continue
      
      # Track the documents by congress, bill type, etc.
      bill_versions\
        .setdefault(congress, { })\
        .setdefault(bill_type, { })\
        .setdefault(bill_number, { })\
        [version_code] = {
          "url": url,
          "lastmod": lastmod,
        }
        
  # Output the bill version info. We can't do this until the end because we need to get
  # the complete list of versions for a bill before we write the file, and the versions
  # may be split across multiple sitemap files.
  
  for congress in bill_versions:
    for bill_type in bill_versions[congress]:
      for bill_number in bill_versions[congress][bill_type]:
        utils.write(
          json.dumps(bill_versions[congress][bill_type][bill_number],
            sort_keys=True, indent=2, default=utils.format_datetime), 
          output_for_bill(congress, bill_type, bill_number, "text-versions.json")
        )
Example #3
0
def update_sitemap(url, current_lastmod, how_we_got_here, options):
    """Updates the local cache of a sitemap file."""

    # Skip if the year or congress flags are set and this sitemap is
    # not for that year or Congress.
    if should_skip_sitemap(url, options):
        return []

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Get the file paths to cache:
    # * the sitemap XML for future runs
    # * its <lastmod> date (which comes from the parent sitemap) so we know if we need to re-download it now
    # * the <lastmod> dates of the packages listed in this sitemap so we know if we need to re-download any package files
    cache_file = get_sitemap_cache_file(url)
    cache_file = os.path.join("govinfo/sitemap", cache_file, "sitemap.xml")
    lastmod_cache_file = cache_file.replace(".xml", "-lastmod.yaml")
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)
    if not os.path.exists(lastmod_cache_file):
        lastmod_cache = { }
    else:
        with open(lastmod_cache_file) as f:
            lastmod_cache = rtyaml.load(f)

    try:
        return update_sitemap2(url, current_lastmod, how_we_got_here, options, lastmod_cache, cache_file)
    finally:
        # Write the updated last modified dates to disk so we know the next time whether
        # we need to fetch the files. If we didn't download anything, no need to write an
        # empty file.
        with utils.NoInterrupt():
            with open(lastmod_cache_file, "w") as f:
                rtyaml.dump(lastmod_cache, f)
Example #4
0
def update_sitemap(url, current_lastmod, how_we_got_here, options):
    """Updates the local cache of a sitemap file."""

    # Skip if the year or congress flags are set and this sitemap is
    # not for that year or Congress.
    if should_skip_sitemap(url, options):
        return []

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Get the file paths to cache:
    # * the sitemap XML for future runs
    # * its <lastmod> date (which comes from the parent sitemap) so we know if we need to re-download it now
    # * the <lastmod> dates of the packages listed in this sitemap so we know if we need to re-download any package files
    cache_file = get_sitemap_cache_file(url)
    cache_file = os.path.join("govinfo/sitemap", cache_file, "sitemap.xml")
    lastmod_cache_file = cache_file.replace(".xml", "-lastmod.yaml")
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)
    if not os.path.exists(lastmod_cache_file):
        lastmod_cache = {}
    else:
        with open(lastmod_cache_file) as f:
            lastmod_cache = rtyaml.load(f)

    try:
        return update_sitemap2(url, current_lastmod, how_we_got_here, options,
                               lastmod_cache, cache_file)
    finally:
        # Write the updated last modified dates to disk so we know the next time whether
        # we need to fetch the files. If we didn't download anything, no need to write an
        # empty file.
        with utils.NoInterrupt():
            with open(lastmod_cache_file, "w") as f:
                rtyaml.dump(lastmod_cache, f)
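The try/finally in update_sitemap guarantees that the lastmod cache is written back even if update_sitemap2 raises partway through, so work already recorded is not repeated on the next run. A minimal sketch of that pattern in isolation, using the standard-library json module in place of rtyaml; the cache path and the work callback are hypothetical.

import json
import os

def process_with_persistent_cache(cache_path, work):
    # Load the previous cache if it exists, else start empty.
    cache = {}
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            cache = json.load(f)
    try:
        # work() may update the cache in place and may raise partway through.
        return work(cache)
    finally:
        # Whatever happened, persist what we have so the next run can skip
        # anything already recorded as up to date.
        with open(cache_path, "w") as f:
            json.dump(cache, f)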
Example #5
0
def get_sitemap(year, collection, lastmod, options):
  """Gets a single sitemap, downloading it if the sitemap has changed.
  
  Downloads the root sitemap (year==None, collection==None), or
  the sitemap for a year (collection==None), or the sitemap for
  a particular year and collection. Pass lastmod, the file's current
  modification time according to its parent sitemap; that is how the
  function knows whether it can return a cached copy.
  
  Returns the sitemap parsed into a DOM.
  """
  
  # Construct the URL and the path to where to cache the file on disk.
  if year == None:
    url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
    path = "fdsys/sitemap/sitemap.xml"
  elif collection == None:
    url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
    path = "fdsys/sitemap/%s/sitemap.xml" % year
  else:
    url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
    path = "fdsys/sitemap/%s/%s.xml" % (year, collection)
    
  # Should we re-download the file?
  lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
  if options.get("cached", False):
    # If --cached is used, don't hit the network.
    force = False
  elif not lastmod:
    # No *current* lastmod date is known for this file (because it is the master
    # sitemap file, probably), so always download.
    force = True
  else:
    # If the file is out of date or --force is used, download the file.
    cache_lastmod = utils.read(lastmod_cache_file)
    force = (lastmod != cache_lastmod) or options.get("force", False)
    
  if force:
    logging.warn("Downloading: %s" % url)
    
  body = utils.download(url, path, utils.merge(options, {
    'force': force, 
    'binary': True
  }))
  
  if not body:
    raise Exception("Failed to download %s" % url)
      
  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the file.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file)
  
  try:
    return etree.fromstring(body)
  except etree.XMLSyntaxError as e:
    raise Exception("XML syntax error in %s: %s" % (url, str(e)))
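A rough sketch of how get_sitemap might be driven from the root sitemap down, assuming the same "x"-prefixed namespace mapping (ns) used throughout these examples; the options dict and the year filtering are illustrative.

ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}

def crawl_year(year, options):
  # The root sitemap has no parent, so no lastmod is known: get_sitemap
  # always downloads it (unless --cached is used).
  root = get_sitemap(None, None, None, options)
  for node in root.xpath("x:sitemap", namespaces=ns):
    loc = str(node.xpath("string(x:loc)", namespaces=ns))
    lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
    if ("sitemap_%s/" % year) not in loc:
      continue
    # The parent's <lastmod> lets get_sitemap decide whether its cached
    # copy of the year sitemap is still current.
    return get_sitemap(year, None, lastmod, options)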
Example #6
0
def save_bill_search_state(saved_bills, search_state):
    # For --fast mode, cache the current search result listing (in search_state)
    # to disk so we can detect major changes to the bill through the search
    # listing rather than having to parse the bill.
    for bill_id in saved_bills:
        if bill_id in search_state:
            fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
            new_state = search_state[bill_id]
            utils.write(new_state, fast_cache_path)
Example #7
0
def save_bill_search_state(saved_bills, search_state):
    # For --fast mode, cache the current search result listing (in search_state)
    # to disk so we can detect major changes to the bill through the search
    # listing rather than having to parse the bill.
    for bill_id in saved_bills:
        if bill_id in search_state:
            fast_cache_path = utils.cache_dir(
            ) + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
            new_state = search_state[bill_id]
            utils.write(new_state, fast_cache_path)
def run(options):
    # Download the TSV file.
    cache_zip_path = "adler-wilkerson-bills.zip"
    utils.download(
        "http://congressionalbills.org/billfiles/bills80-92.zip",
        cache_zip_path,
        utils.merge(options, {'binary': True, 'needs_content': False}))

    # Unzip in memory and process the records.
    zfile = zipfile.ZipFile(utils.cache_dir() + "/" + cache_zip_path)
    csvreader = csv.DictReader(zfile.open("bills80-92.txt"), delimiter="\t")
    for record in csvreader:
        rec = process_bill(record)

        import pprint
        pprint.pprint(rec)
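Under Python 3, ZipFile.open() returns a binary file object while csv.DictReader expects text, so a small wrapper is needed when reading the tab-delimited file above. A minimal sketch, with a hypothetical local path standing in for the cached download and the encoding assumed:

import csv
import io
import zipfile

with zipfile.ZipFile("adler-wilkerson-bills.zip") as zfile:
    with zfile.open("bills80-92.txt") as raw:
        reader = csv.DictReader(io.TextIOWrapper(raw, encoding="utf-8"),
                                delimiter="\t")
        for record in reader:
            print(record)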
Example #9
0
def get_sitemap(year, collection, lastmod, options):
    # Construct the URL and the path to where to cache the file on disk.
    if year == None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
        path = "fdsys/sitemap/sitemap.xml"
    elif collection == None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
        path = "fdsys/sitemap/%s/sitemap.xml" % year
    else:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
        path = "fdsys/sitemap/%s/%s.xml" % (year, collection)

    # Should we re-download the file?
    lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
    if options.get("cached", False):
        # If --cached is used, don't hit the network.
        force = False
    elif not lastmod:
        # No *current* lastmod date is known for this file (because it is the master
        # sitemap file, probably), so always download.
        force = True
    else:
        # If the file is out of date or --force is used, download the file.
        cache_lastmod = utils.read(lastmod_cache_file)
        force = (lastmod != cache_lastmod) or options.get("force", False)

    if force:
        logging.warn("Downloading: %s" % url)

    body = utils.download(url, path, utils.merge(options, {"force": force, "xml": True}))

    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)

    return etree.fromstring(body)
Example #10
0
def run():

    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially...
        "govtrack":
        "govtrack",  # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = {}
    for y in y1 + y2:
        bioguides[y["id"]["bioguide"]] = y

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.

        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue: url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" %
                      (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(
                    url, None, True))  # can't cache eicontinue probably

                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath(
                    "string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue: break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(),
                                        "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
        if " campaign" in p: continue
        if " (surname)" in p: continue
        if "career of " in p: continue
        if "for Congress" in p: continue
        if p.startswith("List of "): continue
        if p in ("New York in the American Civil War",
                 "Upper Marlboro, Maryland"):
            continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(
            p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p
        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath(
            "string(mw:page/mw:revision/mw:text)",
            namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"})

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia":
            p  # Wikipedia page name, with spaces for spaces (not underscores)
        }

        if "CongLinks" in page_content:
            # Parse the key/val pairs in the template.
            m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
            if not m: continue  # no template?
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets":
                        val = val.replace("&newMem=Y", "").replace(
                            "&newmem=Y", "").replace("&cycle=2004",
                                                     "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids: continue
            new_ids["bioguide"] = new_ids["bioguide"].upper()  # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue  # no template?
            bioguide = m.group(1).upper()

        if not bioguide in bioguides:
            print(
                "Member not found: " + bioguide, p,
                "(Might have been a delegate to the Constitutional Convention.)"
            )
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")
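A small, self-contained illustration of the CongLinks parsing above, run against made-up wikitext (the identifier values are not real):

import re

page_content = "{{CongLinks | congbio = z000000 | govtrack = 400999 | cspan = 12345 }}"
m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
if m:
    for arg in m.group(1).split("|"):
        if "=" not in arg:
            continue
        key, val = (s.strip() for s in arg.split("=", 1))
        print(key, "->", val)
# prints: congbio -> z000000, govtrack -> 400999, cspan -> 12345 (one per line)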
Example #11
0
def update_sitemap(url, current_lastmod, how_we_got_here, options, listing):
    """Updates the local cache of a sitemap file."""

    # What is this sitemap for?
    subject = extract_sitemap_subject_from_url(url, how_we_got_here)

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Does the user want to process this sitemap?
    if skip_sitemap(subject, options):
        return

    # Where to cache the sitemap and a file where we store its current <lastmod> date
    # (which comes from a parent sitemap)?
    (cache_file, lastmod_cache_file) = get_sitemap_cache_files(subject)
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)

    # Download anew if the current_lastmod doesn't match the stored lastmod
    # in our cache, and if --cache is not specified. Or if --force is given.
    # If we're not downloading it, load it from disk because we still have
    # to process each sitemap to ensure we've downloaded all of the package
    # files the user wants.
    download = should_download_sitemap(lastmod_cache_file, current_lastmod, options)

    # Download, or just retrieve from cache.
    if download:
        logging.warn("Downloading: %s" % url)
    body = utils.download(
        url,
        cache_file,
        utils.merge(options, {
            'force': download,
            'binary': True
        }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file --- if we just downloaded it.
    if download and current_lastmod:
        utils.write(current_lastmod, lastmod_cache_file)

    # Load the XML.
    try:
        sitemap = etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

    # Process the entries.
    if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":

        # This is a sitemap index. Process the sitemaps listed in this
        # sitemapindex recursively.
        for node in sitemap.xpath("x:sitemap", namespaces=ns):
            # Get URL and lastmod date of the sitemap.
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
            update_sitemap(url, lastmod, how_we_got_here, options, listing)
    
    elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":

        # This is a regular sitemap with content items listed.

        # For the --list command, remember that this sitemap had some data.
        # And then return --- don't download any package files.
        if options.get("list"):
            listing.append(subject)
            return

        # Process the items.
        for node in sitemap.xpath("x:url", namespaces=ns):
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))

            if not subject.get("bulkdata"):
                # This is a regular collection item.
                #
                # Get the "package" name, i.e. a particular document (which has
                # one or more file formats within it).
                m = re.match("https://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched package URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                package_name = m.group(1)
                if options.get("filter") and not re.search(options["filter"], package_name): continue
                mirror_package(subject, package_name, lastmod, url, options)

            else:
                # This is a bulk data item. Extract components of the URL.
                m = re.match(re.escape(BULKDATA_BASE_URL) + re.escape(subject["collection"]) + "/(.+)", url)
                if not m:
                    raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                item_path = m.group(1)
                if options.get("filter") and not re.search(options["filter"], item_path): continue
                mirror_bulkdata_file(subject, url, item_path, lastmod, options)
    
    else:
        raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))
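A standalone demonstration of the tag check that drives the recursion above, using an inline sitemap index; the URLs are placeholders, and ns is the namespace mapping these examples assume.

from lxml import etree

ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/child-sitemap.xml</loc>
    <lastmod>2018-01-01T00:00:00Z</lastmod>
  </sitemap>
</sitemapindex>"""

root = etree.fromstring(body)
if root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
    # A sitemap index: recurse into each child sitemap it lists.
    for node in root.xpath("x:sitemap", namespaces=ns):
        print(str(node.xpath("string(x:loc)", namespaces=ns)),
              str(node.xpath("string(x:lastmod)", namespaces=ns)))
elif root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
    # A leaf sitemap: its x:url entries point at packages or bulk data files.
    pass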
Example #12
0
def bill_ids_for(congress, options, bill_states={}):

    # override if we're actually using this method to get amendments
    doing_amendments = options.get('amendments', False)

    bill_ids = []

    bill_type = options.get(
        'amendment_type' if doing_amendments else 'bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:

        # This sub is re-used for pulling amendment IDs too.
        if (bill_type in ('samdt', 'hamdt', 'supamdt')) != doing_amendments:
            continue

        # match only links to landing pages of this bill type
        # it shouldn't catch stray links outside of the confines of the 100 on the page,
        # but if it does, no big deal
        link_pattern = r"^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(page_for(congress, bill_type, offset),
                                  page_cache_for(congress, bill_type, offset),
                                  options)

            if not page:
                logging.error(
                    "Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_id = "%s-%s" % (code, congress)

                if options.get("fast", False):
                    fast_cache_path = utils.cache_dir(
                    ) + "/" + bill_info.bill_cache_for(bill_id,
                                                       "search_result.html")
                    old_state = utils.read(fast_cache_path)

                    # Compare all of the output in the search result's <p> tag, which
                    # has last major action, number of cosponsors, etc. to a cache on
                    # disk to see if any major information about the bill changed.
                    parent_node = link.getparent(
                    )  # the <p> tag containing the whole search hit
                    parent_node.remove(
                        parent_node.xpath("b")[0]
                    )  # remove the <b>###.</b> node that isn't relevant for comparison
                    new_state = etree.tostring(
                        parent_node)  # serialize this tag

                    if old_state == new_state:
                        logging.info("No change in search result listing: %s" %
                                     bill_id)
                        continue

                    bill_states[bill_id] = new_state

                bill_ids.append(bill_id)

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)
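A minimal sketch of the --fast comparison above: serialize the search-hit <p> tag, minus its leading <b> counter, and compare it with the copy cached on a previous run. The HTML snippet and the cached value here are made up.

from lxml import etree, html

snippet = '<p><b>42.</b><a href="#">H.R.1234</a> Latest Major Action: Referred to committee.</p>'
doc = html.document_fromstring(snippet)
link = doc.xpath("//a")[0]

parent_node = link.getparent()                  # the <p> containing the whole hit
parent_node.remove(parent_node.xpath("b")[0])   # drop the "<b>42.</b>" counter
new_state = etree.tostring(parent_node)

old_state = None  # the real code reads this with utils.read(fast_cache_path)
if old_state == new_state:
    print("No change in search result listing")
else:
    print("Search listing changed; re-parse this bill")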
			
Example #14
0
def mirror_packages(fetch_collections, options):
  """Create a local mirror of FDSys document files. Only downloads
  changed files, according to the sitemap. Run update_sitemap_cache first.
  
  Pass fetch_collections as None, or, to restrict the update to
  particular FDSys collections, a set of collection names.
  
  Set options["store"] to a comma-separated list of file types (pdf,
  mods, text, xml, zip).
  """
  
  # For determining whether we need to process a sitemap file again on a later
  # run, we need to make a key out of the command line arguments that affect
  # which files we are downloading.
  cache_options_key = repr(tuple(sorted(kv for kv in options.items() if kv[0] in ("store", "year", "congress", "granules", "cached"))))
  
  file_types = options["store"].split(",")

  # Process each FDSys sitemap...
  for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
    # Should we process this file?
    year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
    if "year" in options and year != options["year"]: continue
    if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])): continue 
    if fetch_collections and collection not in fetch_collections: continue
    
    # Has this sitemap changed since the last successful mirror?
    #
    # The sitemap's last modification time is stored in ...-lastmod.txt,
    # which comes from the sitemap's parent sitemap's lastmod listing for
    # the file.
    #
    # Compare that to the lastmod value of when we last did a successful mirror.
    # This function can be run to fetch different sets of files, so get the
    # lastmod value corresponding to the current run arguments.
    sitemap_store_state_file = re.sub(r"\.xml$", "-store-state.json", sitemap)
    sitemap_last_mod = open(re.sub(r"\.xml$", "-lastmod.txt", sitemap)).read()
    if os.path.exists(sitemap_store_state_file):
      sitemap_store_state = json.load(open(sitemap_store_state_file))
      if sitemap_store_state.get(cache_options_key) == sitemap_last_mod:
        # sitemap hasn't changed since the last time
        continue
    
    logging.info("scanning " + sitemap + "...")
    
    # Load the sitemap for this year & collection, and loop through each document.
    for package_name, lastmod in get_sitemap_entries(sitemap):
      # Add this package to the download list.
      file_list = []
      
      if not options.get("granules", False):
        # Doing top-level package files (granule==None).
        file_list.append(None)

      else:
        # In some collections, like STATUTE, each document has subparts which are not
        # described in the sitemap. Load the main HTML page and scrape for the sub-files.
        # In the STATUTE collection, the MODS information in granules is redundant with
        # information in the top-level package MODS file. But the only way to get granule-
        # level PDFs is to go through the granules.
        content_detail_url = "http://www.gpo.gov/fdsys/pkg/%s/content-detail.html" % package_name
        content_index = utils.download(content_detail_url,
            "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
            utils.merge(options, {
            'binary': True, 
          }))
        if not content_index: raise Exception("Failed to download %s" % content_detail_url)
        for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
          if link.text == "More":
            m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
            if not m or m.group(1) != package_name: raise Exception("Unmatched granule URL %s" % link.get("href"))
            granule_name = m.group(2)
            file_list.append(granule_name)
        
      # Download the files of the desired types.
      for granule_name in file_list:
        mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options)
        
    # If we got this far, we successfully downloaded all of the files in this year/collection.
    # To speed up future updates, save the lastmod time of this sitemap in a file indicating
    # what we downloaded. The store-state file contains a JSON mapping of command line options
    # to the most recent lastmod value for this sitemap.
    sitemap_store_state = { }
    if os.path.exists(sitemap_store_state_file):
      sitemap_store_state = json.load(open(sitemap_store_state_file))
    sitemap_store_state[cache_options_key] = sitemap_last_mod
    json.dump(sitemap_store_state, open(sitemap_store_state_file, "w"))
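mirror_packages is driven entirely by its options dict; a hypothetical invocation, with illustrative collection names and flag values, might look like this:

options = {
  "store": "pdf,mods,xml",   # required: comma-separated file types to mirror
  "congress": "113",         # optional: restrict to the years of one Congress
  "granules": False,         # top-level package files only
}
mirror_packages({"BILLS", "STATUTE"}, options)   # or None to mirror all collections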
def run():

	# Field mapping. And which fields should be turned into integers.
	# See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
	fieldmap = {
		"congbio": "bioguide",
		#"fec": "fec", # handled specially...
		"govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
		"opensecrets": "opensecrets",
		"votesmart": "votesmart",
		"cspan": "cspan",
	}
	int_fields = ("govtrack", "votesmart", "cspan")

	# default to not caching
	cache = utils.flags().get('cache', False)

	# Load legislator files and map bioguide IDs.
	y1 = utils.load_data("legislators-current.yaml")
	y2 = utils.load_data("legislators-historical.yaml")
	bioguides = { }
	for y in y1+y2:
	  bioguides[y["id"]["bioguide"]] = y

	# Okay now the Wikipedia stuff...

	def get_matching_pages():
		# Does a Wikipedia API search for pages containing either of the
		# two templates. Returns the pages.

		page_titles = set()

		for template in ("CongLinks", "CongBio"):
			eicontinue = ""
			while True:
				# construct query URL, using the "eicontinue" of the last query to get the next batch
				url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
				if eicontinue: url += "&eicontinue=" + eicontinue

				# load the XML
				print("Getting %s pages (%d...)" % (template, len(page_titles)))
				dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably

				for pgname in dom.xpath("query/embeddedin/ei/@title"):
					page_titles.add(pgname)

				# get the next eicontinue value and loop
				eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
				if not eicontinue: break

		return page_titles

	# Get the list of Wikipedia pages that use any of the templates we care about.
	page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
	if cache and os.path.exists(page_list_cache_file):
		# Load from cache.
		matching_pages = open(page_list_cache_file).read().split("\n")
	else:
		# Query Wikipedia API and save to cache.
		matching_pages = get_matching_pages()
		utils.write(("\n".join(matching_pages)), page_list_cache_file)

	# Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
	matching_pages = [p for p in matching_pages if ":" not in p]

	# Load each page's content and parse the template.
	for p in sorted(matching_pages):
		if " campaign" in p: continue
		if " (surname)" in p: continue
		if "career of " in p: continue
		if "for Congress" in p: continue
		if p.startswith("List of "): continue
		if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue

		# Query the Wikipedia API to get the raw page content in XML,
		# and then use XPath to get the raw page text.
		url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
		cache_path = "legislators/wikipedia/pages/" + p
		dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
		page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" })

		# Build a dict for the IDs that we want to insert into our files.
		new_ids = {
			"wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores)
		}

		if "CongLinks" in page_content:
			# Parse the key/val pairs in the template.
			m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
			if not m: continue # no template?
			for arg in m.group(1).split("|"):
				if "=" not in arg: continue
				key, val = arg.split("=", 1)
				key = key.strip()
				val = val.strip()
				if val and key in fieldmap:
					try:
						if fieldmap[key] in int_fields: val = int(val)
					except ValueError:
						print("invalid value", key, val)
						continue

					if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
					new_ids[fieldmap[key]] = val

			if "bioguide" not in new_ids: continue
			new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
			bioguide = new_ids["bioguide"]

		else:
			m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
			if not m: continue # no template?
			bioguide = m.group(1).upper()


		if not bioguide in bioguides:
			print("Member not found: " + bioguide, p.encode("utf8"), "(Might have been a delegate to the Constitutional Convention.)")
			continue

		# handle FEC ids specially because they are stored in an array...
		fec_id = new_ids.get("fec")
		if fec_id: del new_ids["fec"]

		member = bioguides[bioguide]
		member["id"].update(new_ids)

		# ...finish the FEC id.
		if fec_id:
			if fec_id not in bioguides[bioguide]["id"].get("fec", []):
				bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

		#print p.encode("utf8"), new_ids

	utils.save_data(y1, "legislators-current.yaml")
	utils.save_data(y2, "legislators-historical.yaml")
Example #17
0
def bill_ids_for(congress, options, bill_states={}):

    # override if we're actually using this method to get amendments
    doing_amendments = options.get('amendments', False)

    bill_ids = []

    bill_type = options.get('amendment_type' if doing_amendments else 'bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:

        # This sub is re-used for pulling amendment IDs too.
        if (bill_type in ('samdt', 'hamdt', 'supamdt')) != doing_amendments:
            continue

        # match only links to landing pages of this bill type
        # it shouldn't catch stray links outside of the confines of the 100 on the page,
        # but if it does, no big deal
        link_pattern = r"^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(
                page_for(congress, bill_type, offset),
                page_cache_for(congress, bill_type, offset),
                options)

            if not page:
                logging.error("Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_id = "%s-%s" % (code, congress)

                if options.get("fast", False):
                    fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
                    old_state = utils.read(fast_cache_path)

                    # Compare all of the output in the search result's <p> tag, which
                    # has last major action, number of cosponsors, etc. to a cache on
                    # disk to see if any major information about the bill changed.
                    parent_node = link.getparent()  # the <p> tag containing the whole search hit
                    parent_node.remove(parent_node.xpath("b")[0])  # remove the <b>###.</b> node that isn't relevant for comparison
                    new_state = etree.tostring(parent_node)  # serialize this tag

                    if old_state == new_state:
                        logging.info("No change in search result listing: %s" % bill_id)
                        continue

                    bill_states[bill_id] = new_state

                bill_ids.append(bill_id)

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)
Example #18
0
def cache(inspector, path):
  return os.path.join(utils.cache_dir(), inspector, path)
Example #19
0
def update_sitemap(url, current_lastmod, how_we_got_here, options, listing):
    """Updates the local cache of a sitemap file."""

    # Return a list of files we downloaded.
    results = []

    # What is this sitemap for?
    subject = extract_sitemap_subject_from_url(url, how_we_got_here)

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Does the user want to process this sitemap?
    if skip_sitemap(subject, options):
        return

    # Where to cache the sitemap and a file where we store its current <lastmod> date
    # (which comes from a parent sitemap)?
    (cache_file, lastmod_cache_file) = get_sitemap_cache_files(subject)
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)

    # Download anew if the current_lastmod doesn't match the stored lastmod
    # in our cache, and if --cache is not specified. Or if --force is given.
    # If we're not downloading it, load it from disk because we still have
    # to process each sitemap to ensure we've downloaded all of the package
    # files the user wants.
    download = should_download_sitemap(lastmod_cache_file, current_lastmod, options)

    # Download, or just retrieve from cache.
    if download:
        logging.warn("Downloading: %s" % url)
    body = utils.download(
        url,
        cache_file,
        utils.merge(options, {
            'force': download,
            'binary': True
        }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file --- if we just downloaded it.
    if download and current_lastmod:
        utils.write(current_lastmod, lastmod_cache_file)

    # Load the XML.
    try:
        sitemap = etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

    # Process the entries.
    if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":

        # This is a sitemap index. Process the sitemaps listed in this
        # sitemapindex recursively.
        for node in sitemap.xpath("x:sitemap", namespaces=ns):
            # Get URL and lastmod date of the sitemap.
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
            sitemap_results = update_sitemap(url, lastmod, how_we_got_here, options, listing)
            if sitemap_results is not None:
                results = results + sitemap_results

    elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":

        # This is a regular sitemap with content items listed.

        # For the --list command, remember that this sitemap had some data.
        # And then return --- don't download any package files.
        if options.get("list"):
            listing.append(subject)
            return

        # Process the items.
        for node in sitemap.xpath("x:url", namespaces=ns):
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))

            if not subject.get("bulkdata"):
                # This is a regular collection item.
                #
                # Get the "package" name, i.e. a particular document (which has
                # one or more file formats within it).
                m = re.match("https://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched package URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                package_name = m.group(1)
                if options.get("filter") and not re.search(options["filter"], package_name): continue
                mirror_results = mirror_package(subject, package_name, lastmod, url, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

            else:
                # This is a bulk data item. Extract components of the URL.
                m = re.match(re.escape(BULKDATA_BASE_URL) + re.escape(subject["collection"]) + "/(.+)", url)
                if not m:
                    raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                item_path = m.group(1)
                if options.get("filter") and not re.search(options["filter"], item_path): continue
                mirror_results = mirror_bulkdata_file(subject, url, item_path, lastmod, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

    else:
        raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))

    return results
Example #20
0
def mirror_files(fetch_collections, options):
    # Locally mirror certain file types for the specified collections.

    file_types = options["store"].split(",")

    for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
        # Should we process this file?
        year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
        if "year" in options and year != options["year"]:
            continue
        if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])):
            continue
        if fetch_collections and collection not in fetch_collections:
            continue

        logging.warn(sitemap + "...")

        # Load the sitemap for this year & collection.
        dom = etree.parse(sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each document in the collection in this year...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # Get URL and last modified timestamp.
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
            if not url.endswith("/content-detail.html"):
                raise Exception("Unrecognized file pattern.")

            # Get the package name.
            m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched document URL")
            package_name = m.group(1)

            # Where to store the document files?
            # The path will depend a bit on the collection.
            if collection == "BILLS":
                # Store with the other bill data.
                m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched bill document URL: " + url)
                congress, bill_type, bill_number, version_code = m.groups()
                congress = int(congress)
                if "congress" in options and congress != int(options["congress"]):
                    continue
                path = output_for_bill(congress, bill_type, bill_number, "text-versions/" + version_code)
            else:
                # Store in fdsys/COLLECTION/YEAR/PKGNAME.
                path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)

            # Do we need to update this record?
            lastmod_cache_file = path + "/lastmod.txt"
            cache_lastmod = utils.read(lastmod_cache_file)
            force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

            # Add this package to the download list.
            file_list = []
            file_list.append((None, path))

            if options.get("granules", False):
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # Josh originally thought the STATUTE granule files (individual statutes) were
                # useful, but then it turned out the information is redundant with information
                # in the top-level package MODS file.
                content_index = utils.download(
                    url,
                    "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
                    utils.merge(
                        options,
                        {
                            "xml": True,  # it's not XML but this avoids unescaping HTML, which fails if there are unicode characters
                            "force": force,
                        },
                    ),
                )
                if not content_index:
                    raise Exception("Failed to download %s" % url)
                for link in html.fromstring(content_index).cssselect(
                    "table.page-details-data-table td.rightLinkCell a"
                ):
                    if link.text == "More":
                        m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                        if not m or m.group(1) != package_name:
                            raise Exception("Unmatched granule URL %s" % link.get("href"))
                        granule_name = m.group(2)
                        file_list.append((granule_name, path + "/" + granule_name))

            # Download the files of the desired types.
            for granule_name, path in file_list:
                targets = get_package_files(package_name, granule_name, path)
                for file_type in file_types:
                    if file_type not in targets:
                        raise Exception("Invalid file type: %s" % file_type)
                    f_url, f_path = targets[file_type]

                    if force:
                        logging.warn(f_path)
                    data = utils.download(
                        f_url, f_path, utils.merge(options, {"xml": True, "force": force, "to_cache": False})
                    )

                    if not data:
                        raise Exception("Failed to download %s" % url)

            # Write the current last modified date to disk so we know the next time whether
            # we need to fetch the file.
            if lastmod and not options.get("cached", False):
                utils.write(lastmod, lastmod_cache_file)