Example #1
0
def get_output_path(collection, package_name, options):
    """Return the directory path where a package's document files are stored,
    or None when the package's congress doesn't match options["congress"]."""

    # Bills are stored alongside the rest of the bill data:
    # [congress]/bills/[billtype]/[billtype][billnumber]/text-versions/[version].
    if collection == "BILLS":
        bill_info = get_bill_id_for_package(
            package_name,
            with_version=False,
            restrict_to_congress=options.get("congress"))
        if not bill_info:
            return None  # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = bill_info
        return output_for_bill(bill_id,
                               "text-versions/" + version_code,
                               is_data_dot=False)

    # Committee reports go in [congress]/crpt/[reporttype]/[reporttype][number].
    if collection == "CRPT":
        match = re.match(r"(\d+)([hse]rpt)(\d+)$", package_name)
        if match is None:
            raise ValueError(package_name)
        congress, report_type, report_number = match.groups()
        if options.get("congress") and congress != options.get("congress"):
            return None  # congress number does not match options["congress"]
        return "/".join([utils.data_dir(), congress, collection.lower(),
                         report_type, report_type + report_number])

    # Everything else goes under govinfo/COLLECTION/PKGNAME.
    return "%s/govinfo/%s/%s" % (utils.data_dir(), collection, package_name)
Example #2
0
def get_output_path(collection, package_name, options):
    """Compute where a package's document files should be stored on disk.

    Returns None when the package belongs to a congress other than the one
    requested in options["congress"]."""

    if collection == "BILLS":
        # Bills live with the other bill data:
        # [congress]/bills/[billtype]/[billtype][billnumber]/text-versions/[version].
        parsed = get_bill_id_for_package(package_name, with_version=False, restrict_to_congress=options.get("congress"))
        if not parsed:
            return None  # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = parsed
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)

    if collection == "CRPT":
        # Committee reports: [congress]/crpt/[reporttype]/[reporttype][number].
        parts = re.match(r"(\d+)([hse]rpt)(\d+)$", package_name)
        if parts is None:
            raise ValueError(package_name)
        congress, report_type, report_number = parts.groups()
        if options.get("congress") and congress != options.get("congress"):
            return None  # congress number does not match options["congress"]
        return "{0}/{1}/{2}/{3}/{4}".format(utils.data_dir(), congress, collection.lower(), report_type, report_type + report_number)

    # Default layout: govinfo/COLLECTION/PKGNAME.
    return "{0}/govinfo/{1}/{2}".format(utils.data_dir(), collection, package_name)
Example #3
0
def get_output_path(sitemap, package_name, granule_name, options):
    """Return the directory path for a package's (or granule's) document files,
    or None when the package's congress doesn't match options["congress"]."""

    collection = sitemap["collection"]

    if collection == "BILLS":
        # Bills are stored with the other bill data.
        info = get_bill_id_for_package(
            package_name,
            with_version=False,
            restrict_to_congress=options.get("congress"))
        if not info:
            return None  # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = info
        return output_for_bill(bill_id,
                               "text-versions/" + version_code,
                               is_data_dot=False)

    # Everything else: fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
    path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection,
                                  sitemap["year"], package_name)
    return path + "/" + granule_name if granule_name else path
Example #4
0
def mirror_bulkdata_file(sitemap, url, item_path, lastmod, options):
    """Download one bulk data file if it changed since the last run.

    Returns a list of files written, or None when the file was skipped
    (already up to date, or --cached) or the download failed."""

    downloaded = []

    # Default storage location: fdsys/COLLECTION/ITEM_PATH.
    path = "%s/fdsys/%s/%s" % (utils.data_dir(), sitemap["collection"],
                               item_path)

    # BILLSTATUS files are stored with the rest of the bill status data instead.
    if sitemap["collection"] == "BILLSTATUS":
        from bills import output_for_bill
        package = os.path.splitext(os.path.basename(item_path))[0]
        bill_id, version_code = get_bill_id_for_package(package,
                                                        with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME,
                               is_data_dot=False)

    # The sitemap's lastmod is cached beside the file so that a later run
    # can tell whether the file has changed.
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    # Skip if the cached lastmod matches (unless --force).
    if not options.get("force", False) and os.path.exists(lastmod_cache_file):
        if utils.read(lastmod_cache_file) == lastmod:
            return

    # With --cached, never re-download a file already on disk.
    if options.get("cached", False) and os.path.exists(path):
        return

    # Fetch the file.
    logging.warn("Downloading: " + path)
    download_options = utils.merge(options, {
        'binary': True,
        'force': True,  # the decision to cache was made above
        'to_cache': False,
    })
    data = utils.download(url, path, download_options)
    downloaded.append(path)

    if not data:
        # The download failed.
        return

    # Persist the lastmod so the next run can skip an unchanged file.
    utils.write(lastmod, lastmod_cache_file)

    return downloaded
Example #5
0
def mirror_bulkdata_file(collection, url, item_path, lastmod, options):
    """Mirror a single govinfo bulk data file to disk.

    Returns a list containing the downloaded file's path, or None when the
    file was skipped (up to date / --cached) or the download failed."""

    results = []

    # Store under govinfo/COLLECTION/ITEM_PATH by default.
    path = "%s/govinfo/%s/%s" % (utils.data_dir(), collection, item_path)

    # BILLSTATUS data is kept with the rest of the bill status data.
    if collection == "BILLSTATUS":
        from bills import output_for_bill
        stem = os.path.splitext(
            os.path.basename(item_path.replace("BILLSTATUS-", "")))[0]
        bill_id, version_code = get_bill_id_for_package(stem,
                                                        with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME,
                               is_data_dot=False)

    # Cache the sitemap's lastmod next to the file so we can detect
    # whether the file changed since the previous run.
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    # Already current? (--force overrides this check.)
    if os.path.exists(lastmod_cache_file) and not options.get("force", False):
        previous_lastmod = utils.read(lastmod_cache_file)
        if previous_lastmod == lastmod:
            return

    # With --cached, skip anything already downloaded.
    if os.path.exists(path) and options.get("cached", False):
        return

    # Fetch the file.
    logging.warn("Downloading: " + path)
    fetch_options = utils.merge(options, {
        'binary': True,
        'force': True,  # decision to cache was made above
        'to_cache': False,
    })
    data = utils.download(url, path, fetch_options)
    results.append(path)

    if not data:
        # Something failed.
        return

    # Remember the lastmod for the next run's up-to-date check.
    utils.write(lastmod, lastmod_cache_file)

    return results
Example #6
0
def fetch_floor_week(for_the_week, options):
    """Fetch and parse the House floor schedule XML for one week.

    for_the_week: a YYYYMMDD string identifying the week's schedule file
    at docs.house.gov.
    options: scraper options dict; "download" also fetches the attached
    files to disk, "force" controls re-downloading of the schedule itself.

    Returns a dict with 'congress', 'week_of', and 'upcoming' keys, or
    None when the week's schedule could not be downloaded.
    """
    base_url = 'https://docs.house.gov/floor/Download.aspx?file=/billsthisweek/'
    week_url = base_url + '%s/%s.xml' % (for_the_week, for_the_week)

    # Turn on 'force' to re-download the schedules, by default, since the content
    # changes frequently and we're scanning weeks that might have 404'd previously
    # when we looked ahead. We leave 'force' off for downloading the file attachments.
    options2 = dict(options)
    if "force" not in options2:
        options2["force"] = True

    body = utils.download(week_url,
                          'upcoming_house_floor/%s.xml' % for_the_week,
                          options2)
    # Bug fix: utils.download returns a falsy value on failure; checking
    # substring membership on it would raise a TypeError.
    if not body: return None
    if "was not found" in body: return None
    dom = lxml.etree.fromstring(body)

    # can download the actual attached files to disk, if asked
    download = options.get("download", False)

    # always present at the feed level
    congress = int(dom.xpath('//floorschedule')[0].get('congress-num'))

    # week of this day, e.g. '2013-01-21'
    legislative_day = for_the_week[0:4] + '-' + for_the_week[
        4:6] + '-' + for_the_week[6:]

    upcoming = []

    for node in dom.xpath('//floorschedule/category/floor-items/floor-item'):
        # Bug fix: a floor item may have no legis-num node at all; indexing
        # [0] on an empty result raised IndexError before the "skip
        # non-bills" check below could run.
        legis_num = node.xpath('legis-num//text()')
        bill_number = legis_num[0] if legis_num else None

        # TODO: fetch non-bills too
        if not bill_number:
            # Bug fix: this previously logged `description`, which is not
            # assigned until below, raising a NameError instead of skipping.
            logging.warn("Skipping item, not a bill: %s" % node.get('id'))
            continue

        description = node.xpath('floor-text//text()')[0]

        # how is this bill being considered?
        category = next(node.iterancestors("category")).get('type')
        if "suspension" in category:
            consideration = "suspension"
        elif "pursuant" in category:
            consideration = "rule"
        else:
            consideration = "unknown"

        logging.warn("[%s]" % bill_number)

        # todo: establish most recent date from a combo of added, published, updates
        date = date_for(node.get('publish-date'))

        # all items will have this
        bill = {
            'description': description,
            'floor_item_id': node.get('id'),
            'consideration': consideration,
            'published_at': date_for(node.get('publish-date')),
            'added_at': date_for(node.get('add-date')),
        }

        # treat drafts and numbered bills a bit differently
        if "_" in bill_number:
            draft_bill_id = draft_bill_id_for(bill_number, date, congress)
            bill['item_type'] = 'draft_bill'
            bill['draft_bill_id'] = draft_bill_id
        else:
            # Amendments to amendments are phrased e.g. "Concur in the Senate
            # Amendment to H.R. 123"; pull out the underlying bill number.
            m = re.match(
                "(Concur(ring)? in )?(?P<type>((the )?(Senate|House) Amendments? (with an amendment )?to )+)(?P<bill>.*)",
                bill_number, re.I)
            if m:
                amendment_type = m.group("type").split("to")[0]
                if "Senate" in amendment_type and "House" not in amendment_type:
                    bill['item_type'] = 'senate_amendment'
                elif "House" in amendment_type and "Senate" not in amendment_type:
                    bill['item_type'] = 'house_amendment'
                else:
                    raise ValueError(bill_number)
                bill_number = m.group("bill")

            elif re.match("Conference report to accompany ", bill_number,
                          re.I):
                bill['item_type'] = 'conference_report'
                bill_number = bill_number.lower().replace(
                    "conference report to accompany ", '')
            else:
                bill['item_type'] = 'bill'

            # In one case we got "H. Res. 497 (H. Rept. 116-125)".
            # Stop at parens.
            bill_number = re.sub(r"\(.*", "", bill_number)

            try:
                bill['bill_id'] = bill_id_for(bill_number.strip(), congress)
            except ValueError:
                logging.error("Could not parse bill from: %s" % bill_number)
                continue

        bill['files'] = []
        for file in node.xpath('files/file'):
            file_url = file.get('doc-url')
            filename = file_url.split('/')[-1]
            file_format = file.get('doc-type').lower()

            logging.warn("\t%s file for %s: %s" %
                         (file_format.upper(), bill_number, filename))

            file_field = {
                'url': file_url,
                'format': file_format,
                'added_at': date_for(file.get('add-date')),
                'published_at': date_for(file.get('publish-date'))
            }

            bill['files'].append(file_field)

            # now try downloading the file to disk and linking it to the data
            if not download: continue
            try:
                file_path = 'upcoming_house_floor/%s/%s' % (for_the_week,
                                                            filename)
                try:
                    os.makedirs(
                        os.path.join(utils.data_dir(),
                                     os.path.dirname(file_path)))
                except OSError:
                    pass  # directory exists
                options3 = dict(options)
                options3["to_cache"] = False  # put in the actual specified directory
                options3["binary"] = True  # force binary mode, no file escaping
                utils.download(file_url,
                               os.path.join(utils.data_dir(), file_path),
                               options3)
                file_field['path'] = file_path
            except IOError:
                logging.error(
                    "Omitting 'path', couldn't download file %s from House floor for the week of %s"
                    % (file_field['url'], for_the_week))
                continue

            # if it's a PDF, convert to text and extract XML
            if file_format == "pdf" and file_path.endswith(".pdf"):
                # extract text
                text_path = file_path.replace(".pdf", ".txt")
                if subprocess.call([
                        "pdftotext", "-layout",
                        os.path.join(utils.data_dir(), file_path),
                        os.path.join(utils.data_dir(), text_path)
                ],
                                   universal_newlines=True) != 0:
                    raise Exception("pdftotext failed on %s" % file_path)
                file_field['text_path'] = text_path

                # extract embedded XML, if any attachment is an .xml file
                for line in subprocess.check_output(
                    [
                        "pdfdetach", "-list",
                        os.path.join(utils.data_dir(), file_path)
                    ],
                        universal_newlines=True).split("\n"):
                    m = re.match(r"(\d+):\s*(.*)", line)
                    if m:
                        attachment_n, attachment_fn = m.groups()
                        if attachment_fn.endswith(".xml"):
                            text_path = file_path.replace(".pdf", ".xml")
                            subprocess.check_call([
                                "pdfdetach",
                                os.path.join(utils.data_dir(), file_path),
                                "-save", attachment_n, "-o",
                                os.path.join(utils.data_dir(), text_path)
                            ],
                                                  universal_newlines=True)
                            file_field['xml_path'] = text_path

        upcoming.append(bill)

        if "bill_id" in bill:
            # Save this bill data to the bill's bill text directory.
            text_data_path = output_for_bill(
                bill['bill_id'],
                os.path.join("text-versions",
                             "dhg-" + bill["floor_item_id"] + ".json"),
                is_data_dot=False)
            try:
                os.makedirs(
                    os.path.join(utils.data_dir(),
                                 os.path.dirname(text_data_path)))
            except OSError:
                pass  # directory exists
            utils.write(
                json.dumps(bill,
                           sort_keys=True,
                           indent=2,
                           default=utils.format_datetime), text_data_path)

    # Create and return the house floor file data.
    house_floor = {
        'congress': congress,
        'week_of': legislative_day,
        'upcoming': upcoming
    }

    return house_floor