Exemple #1
0
def parse_bill_number(q, congress=None, not_exist_ok=False):
    m = bill_number_re.match(
        q.replace(" ", "").replace(".", "").replace("-", ""))
    if m == None: return None
    search_type_flag = None
    if m.group(3) != None:
        cn = int(m.group(4))
        search_type_flag = "bill-with-congress"
    elif congress != None:
        try:
            cn = int(congress)
        except:
            cn = CURRENT_CONGRESS
        search_type_flag = "bill-default-congress"
    else:
        cn = CURRENT_CONGRESS
        search_type_flag = "bill-guessed-congress"
    try:
        b = Bill.objects.get(congress=cn,
                             bill_type=BillType.by_slug(m.group(1).lower()),
                             number=int(m.group(2)))
        b.search_type_flag = search_type_flag
        return b
    except Bill.DoesNotExist:
        if not_exist_ok:
            # Return a dummy bill indicating that string matched the regex.
            b = Bill(congress=cn,
                     bill_type=BillType.by_slug(m.group(1).lower()),
                     number=int(m.group(2)))
            b.search_type_flag = search_type_flag
            return b
        return None
Exemple #2
0
def load_docs_house_gov(options, bill_index):
    # Look at the three most recent JSON files by looking at the lexicographically last ones,
    # which possibly cover the current week, the next week, and the week after that.
    for fn in sorted(os.listdir("data/congress/upcoming_house_floor"))[-3:]:
        data = json.load(open("data/congress/upcoming_house_floor/" + fn))
        for billinfo in data.get("upcoming", []):
            if "bill_id" not in billinfo: continue

            m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
            if not m:
                log.error('Could not parse bill_id "%s" in docs.house.gov.' %
                          billinfo["bill_id"])
                continue

            bt = BillType.by_slug(m.group(1))
            try:
                bill = Bill.objects.get(congress=int(m.group(3)),
                                        bill_type=bt,
                                        number=int(m.group(2)))
            except Exception as e:
                log.error('Could not get bill "%s" in docs.house.gov: %s.' %
                          (billinfo["bill_id"], str(e)))
                continue

            bill.docs_house_gov_postdate = BillProcessor.parse_datetime(
                billinfo["published_at"])
            if bill.senate_floor_schedule_postdate is None or bill.docs_house_gov_postdate > bill.senate_floor_schedule_postdate:
                bill.scheduled_consideration_date = BillProcessor.parse_datetime(
                    data["week_of"])
            bill.save()
            if bill_index: bill.update_index(bill_index)
            if not options.disable_events: bill.create_events()
def load_docs_house_gov(options, bill_index):
    # Get most recent JSON file by looking at the lexicographically last one.
    fn = sorted(os.listdir("data/congress/upcoming_house_floor"))[-1]
    data = json.load(open("data/congress/upcoming_house_floor/" + fn))
    for billinfo in data.get("upcoming", []):
        if "bill_id" not in billinfo: continue

        m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
        if not m:
            log.error('Could not parse bill_id "%s" in docs.house.gov.' %
                      billinfo["bill_id"])
            continue

        bt = BillType.by_slug(m.group(1))
        try:
            bill = Bill.objects.get(congress=int(m.group(3)),
                                    bill_type=bt,
                                    number=int(m.group(2)))
        except Exception as e:
            log.error('Could not get bill "%s" in docs.house.gov: %s.' %
                      (billinfo["bill_id"], str(e)))
            continue

        bill.docs_house_gov_postdate = BillProcessor.parse_datetime(
            billinfo["published_at"])
        bill.save()
        if bill_index: bill.update_index(bill_index)
        if not options.disable_events: bill.create_events()
Exemple #4
0
def bill_advocacy_tips(request, congress, type_slug, number):
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    return { "bill": bill }
Exemple #5
0
    def load_bill_from_url(congress, type_slug, number):
        try:
            bill_type = BillType.by_slug(type_slug)
        except BillType.NotFound:
            raise Http404("Invalid bill type: " + type_slug)

        return get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
Exemple #6
0
def bill_details_user_view(request, congress, type_slug, number):
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    
    ret = { }
    if request.user.is_staff:
        admin_panel = """
            {% load humanize %}
            <div class="clear"> </div>
            <div style="margin-top: 1.5em; padding: .5em; background-color: #EEE; ">
                <b>ADMIN</b> - <a href="{% url "bill_go_to_summary_admin" %}?bill={{bill.id}}">Edit Summary</a>
                <br/>Tracked by {{feed.tracked_in_lists.count|intcomma}} users
                ({{feed.tracked_in_lists_with_email.count|intcomma}} w/ email).
            </div>
            """
        from django.template import Template, Context, RequestContext, loader
        ret["admin_panel"] = Template(admin_panel).render(RequestContext(request, {
            'bill': bill,
            "feed": Feed.BillFeed(bill),
            }))
    
    from person.views import render_subscribe_inline
    ret.update(render_subscribe_inline(request, Feed.BillFeed(bill)))
    
    return ret
Exemple #7
0
def bill_advocacy_tips(request, congress, type_slug, number):
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    return { "bill": bill }
def load_docs_house_gov(options, bill_index):
    # Look at the three most recent JSON files by looking at the lexicographically last ones,
    # which possibly cover the current week, the next week, and the week after that.
    if not os.path.exists(settings.CONGRESS_DATA_PATH + "/upcoming_house_floor"):
        print("No upcoming_house_floor data.")
        return
    for fn in sorted(os.listdir(settings.CONGRESS_DATA_PATH + "/upcoming_house_floor"))[-3:]:
        data = json.load(open(settings.CONGRESS_DATA_PATH + "/upcoming_house_floor/" + fn))
        for billinfo in data.get("upcoming", []):
            if "bill_id" not in billinfo: continue
    
            m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
            if not m:
                log.error('Could not parse bill_id "%s" in docs.house.gov.' % billinfo["bill_id"])
                continue
 
            bt = BillType.by_slug(m.group(1))
            try:
                bill = Bill.objects.get(congress=int(m.group(3)), bill_type=bt, number=int(m.group(2)))
            except Exception as e:
                log.error('Could not get bill "%s" in docs.house.gov: %s.' % (billinfo["bill_id"], str(e)))
                continue

            bill.docs_house_gov_postdate = BillProcessor.parse_datetime(billinfo["published_at"])
            if bill.senate_floor_schedule_postdate is None or bill.docs_house_gov_postdate > bill.senate_floor_schedule_postdate: bill.scheduled_consideration_date = BillProcessor.parse_datetime(data["week_of"])
            bill.save()
            if bill_index: bill.update_index(bill_index)
            if not options.disable_events: bill.create_events()
Exemple #9
0
def bill_text(request, congress, type_slug, number, version=None):
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from .billtext import load_bill_text, get_bill_text_versions
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    is_latest = True
    if textdata:
        alternates = []
        for v in get_bill_text_versions(bill):
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                pass
        alternates.sort(key = lambda mods : mods["docdate"])
        if len(alternates) > 0:
            is_latest = False
            if textdata["doc_version"] == alternates[-1]["doc_version"]:
                is_latest = True

    # Get a list of related bills.
    from .billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if not (rb, rbv) in related_bills: related_bills.append((rb, rbv))
        except IOError:
            pass # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))

    return {
        "bill_subpage": "Text",
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "is_latest": is_latest,
        "alternates": alternates,
        "related_bills": related_bills,
        "days_old": (datetime.datetime.now().date() - bill.current_status_date).days,
        "is_on_bill_text_page": True, # for the header tabs
    }
Exemple #10
0
def bill_text(request, congress, type_slug, number, version=None):
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from .billtext import load_bill_text, get_bill_text_versions
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    is_latest = True
    if textdata:
        alternates = []
        for v in get_bill_text_versions(bill):
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                pass
        alternates.sort(key = lambda mods : mods["docdate"])
        if len(alternates) > 0:
            is_latest = False
            if textdata["doc_version"] == alternates[-1]["doc_version"]:
                is_latest = True

    # Get a list of related bills.
    from .billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if not (rb, rbv) in related_bills: related_bills.append((rb, rbv))
        except IOError:
            pass # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))

    return {
        "bill_subpage": "Text",
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "is_latest": is_latest,
        "alternates": alternates,
        "related_bills": related_bills,
        "days_old": (datetime.datetime.now().date() - bill.current_status_date).days,
        "is_on_bill_text_page": True, # for the header tabs
    }
Exemple #11
0
def load_bill_from_url(congress, type_slug, number):
    # not sure why we were trying this
    #if type_slug.isdigit():
    #    bill_type = type_slug
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)

    return get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
Exemple #12
0
def load_bill_from_url(congress, type_slug, number):
    # not sure why we were trying this
    #if type_slug.isdigit():
    #    bill_type = type_slug
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)

    return get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
def bill_text(request, congress, type_slug, number, version=None):
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for v in bill_gpo_status_codes:
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                pass
        alternates.sort(key = lambda mods : mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if not (rb, rbv) in related_bills: related_bills.append((rb, rbv))
        except IOError:
            pass # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
Exemple #14
0
def bill_text(request, congress, type_slug, number, version=None):
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for v in bill_gpo_status_codes:
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                pass
        alternates.sort(key = lambda mods : mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if not (rb, rbv) in related_bills: related_bills.append((rb, rbv))
        except IOError:
            pass # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
Exemple #15
0
def parse_bill_number(q, congress=None):
    m = bill_number_re.match(q.replace(" ", "").replace(".", "").replace("-", ""))
    if m == None: return None
    if m.group(3) != None:
        cn = int(m.group(4))
    elif congress != None:
        try:
            cn = int(congress)
        except:
            cn = CURRENT_CONGRESS
    else:
        cn = CURRENT_CONGRESS
    try:
        return Bill.objects.get(congress=cn, bill_type=BillType.by_slug(m.group(1).lower()), number=int(m.group(2)))
    except Bill.DoesNotExist:
        return None
def parse_bill_number(q, congress=None):
    m = bill_number_re.match(q.replace(" ", "").replace(".", "").replace("-", ""))
    if m == None: return None
    if m.group(3) != None:
        cn = int(m.group(4))
    elif congress != None:
        try:
            cn = int(congress)
        except:
            cn = CURRENT_CONGRESS
    else:
        cn = CURRENT_CONGRESS
    try:
        return Bill.objects.get(congress=cn, bill_type=BillType.by_slug(m.group(1).lower()), number=int(m.group(2)))
    except Bill.DoesNotExist:
        return None
Exemple #17
0
def bill_text(request, congress, type_slug, number, version=None):
    if int(congress) < 103:
        raise Http404("Bill text is not available before the 103rd congress.")

    if version == "":
        version = None
    
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    
    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for v in bill_gpo_status_codes:
            fn = "data/us/bills.text/%s/%s/%s%d%s.mods.xml" % (bill.congress, BillType.by_value(bill.bill_type).xml_code, BillType.by_value(bill.bill_type).xml_code, bill.number, v)
            if os.path.exists(fn):
                alternates.append(load_bill_text(bill, v, mods_only=True))
        alternates.sort(key = lambda mods : mods["docdate"])

    # Get a list of related bills.
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        if not (rb, "") in related_bills: related_bills.append((rb, ""))
    for btc in BillTextComparison.objects.filter(bill1=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
def get_bill_for_page(page):
    for template in mwparserfromhell.parse(page["text"]).filter_templates():
        if template.name.strip() == "Infobox U.S. legislation":
            # print page["title"].encode("utf8")
            billref = get_bill_from_infobox(template)
            if billref:
                try:
                    if billref[0] == "PL":
                        # Get by pulic law number.
                        return Bill.objects.get(congress=billref[1], sliplawpubpriv="PUB", sliplawnum=billref[2])
                    elif billref[0] == "BILL":
                        # It's a bill number.
                        return Bill.objects.get(
                            congress=billref[1], bill_type=BillType.by_slug(billref[2]), number=billref[3]
                        )
                except Bill.DoesNotExist:
                    return None
    return None
def get_bill_for_page(page):
	for template in mwparserfromhell.parse(page["text"]).filter_templates():
		if template.name.strip() == "Infobox U.S. legislation":
			#print page["title"].encode("utf8")
			try:
				billref = get_bill_from_infobox(template)
			except Exception as e:
				print(page["pageid"], e)
				billref = None
			if billref:
				try:
					if billref[0] == "PL":
						# Get by pulic law number.
						return Bill.objects.get(congress=billref[1], sliplawpubpriv="PUB", sliplawnum=billref[2])
					elif billref[0] == "BILL":
						# It's a bill number.
						return Bill.objects.get(congress=billref[1], bill_type=BillType.by_slug(billref[2]), number=billref[3])
				except Bill.DoesNotExist:
					return None
	return None
Exemple #20
0
def bill_details(request, congress, type_slug, number):
    if type_slug.isdigit():
        bill_type = type_slug
    else:
        try:
            bill_type = BillType.by_slug(type_slug)
        except BillType.NotFound:
            raise Http404("Invalid bill type: " + type_slug)
    
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    
    from person.name import get_person_name
    sponsor_name = None if not bill.sponsor else \
        get_person_name(bill.sponsor, role_date=bill.introduced_date, firstname_position='before', show_suffix=True)
    
    def get_reintroductions():
        reintro_prev = None
        reintro_next = None
        for reintro in bill.find_reintroductions():
            if reintro.congress < bill.congress: reintro_prev = reintro
            if reintro.congress > bill.congress and not reintro_next: reintro_next = reintro
        return reintro_prev, reintro_next
        
    def get_text_info():
        from billtext import load_bill_text
        try:
            return load_bill_text(bill, None, mods_only=True)
        except IOError:
            return None

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "subtitle": get_secondary_bill_title(bill, bill.titles),
        "sponsor_name": sponsor_name,
        "reintros": get_reintroductions, # defer so we can use template caching
        "current": bill.congress == CURRENT_CONGRESS,
        "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious,
        "feed": Feed.BillFeed(bill),
        "text": get_text_info,
    }
def load_docs_house_gov(options, bill_index):
    # Get most recent JSON file by looking at the lexicographically last one.
    fn = sorted(os.listdir("data/congress/upcoming_house_floor"))[-1]
    data = json.load(open("data/congress/upcoming_house_floor/" + fn))
    for billinfo in data.get("upcoming", []):
        m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
        if not m:
            log.error('Could not parse bill_id "%s" in docs.house.gov.' % billinfo["bill_id"])
            continue

        bt = BillType.by_slug(m.group(1))
        try:
            bill = Bill.objects.get(congress=int(m.group(3)), bill_type=bt, number=int(m.group(2)))
        except Exception as e:
            log.error('Could not get bill "%s" in docs.house.gov: %s.' % (billinfo["bill_id"], str(e)))
            continue

        bill.docs_house_gov_postdate = BillProcessor.parse_datetime(billinfo["published_at"])
        bill.save()
        if bill_index: bill.update_index(bill_index)
        if not options.disable_events: bill.create_events()
Exemple #22
0
def bill_details(request, congress, type_slug, number):
    if type_slug.isdigit():
        bill_type = type_slug
    else:
        try:
            bill_type = BillType.by_slug(type_slug)
        except BillType.NotFound:
            raise Http404("Invalid bill type: " + type_slug)
    
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    
    from person.name import get_person_name
    sponsor_name = None if not bill.sponsor else \
        get_person_name(bill.sponsor, role_date=bill.introduced_date, firstname_position='before', show_suffix=True)
    
    def get_reintroductions():
        reintro_prev = None
        reintro_next = None
        for reintro in bill.find_reintroductions():
            if reintro.congress < bill.congress: reintro_prev = reintro
            if reintro.congress > bill.congress and not reintro_next: reintro_next = reintro
        return reintro_prev, reintro_next
        
    def get_text_info():
        from models import USCSection
        from billtext import load_bill_text
        from search import parse_slip_law_number
        import re
        try:
            metadata = load_bill_text(bill, None, mods_only=True)
            
            # do interesting stuff with citations
            if "citations" in metadata:
                slip_laws = []
                statutes = []
                usc = { }
                other = []
                usc_other = USCSection(name="Other Citations", ordering=99999)
                for cite in metadata["citations"]:
                    if cite["type"] == "slip_law":
                        slip_laws.append(cite)
                        cite["bill"] = parse_slip_law_number(cite["text"])
                    elif cite["type"] == "statutes_at_large":
                        statutes.append(cite)
                    elif cite["type"] == "usc":
                        # build a normalized citation and a link to LII
                        cite_norm = "usc/" + cite["title"]
                        cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"]
                        if cite["section"]:
                            cite_link += "/" + cite["section"]
                            cite_norm += "/" + cite["section"]
                        if cite["paragraph"]: cite_link += "#" + "_".join(re.findall(r"\(([^)]+)\)", cite["paragraph"]))
                        
                        # Build a tree of title-chapter-...-section nodes so we can
                        # display the citations in context.
                        try:
                            sec_obj = USCSection.objects.get(citation=cite_norm)
                        except: # USCSection.DoesNotExist and MultipleObjectsReturned both possible
                            # the 'id' field is set to make these objects properly hashable
                            sec_obj = USCSection(id=cite["text"], name=cite["text"], parent_section=usc_other)
                        
                        sec_obj.link = cite_link
                        
                        if "range_to_section" in cite:
                            sec_obj.range_to_section = cite["range_to_section"]
                        
                        # recursively go up to the title
                        path = [sec_obj]
                        while sec_obj.parent_section:
                            sec_obj = sec_obj.parent_section
                            path.append(sec_obj)
                            
                        # now pop off from the path to put the node at the right point in a tree
                        container = usc
                        while path:
                            p = path.pop(-1)
                            if p not in container: container[p] = { }
                            container = container[p]
                        
                    else:
                        other.append(cite)
                        
                slip_laws.sort(key = lambda x : (x["congress"], x["number"]))
                
                # restructure data format
                def ucfirst(s): return s[0].upper() + s[1:]
                def rebuild_usc_sec(seclist, indent=0):
                    ret = []
                    seclist = sorted(seclist.items(), key=lambda x : x[0].ordering)
                    for sec, subparts in seclist:
                        ret.append({
                            "text": (ucfirst(sec.level_type + ((" " + sec.number) if sec.number else "") + (": " if sec.name else "")) if sec.level_type else "") + (sec.name if sec.name else ""),
                            "link": getattr(sec, "link", None),
                            "range_to_section": getattr(sec, "range_to_section", None),
                            "indent": indent,
                        })
                        ret.extend(rebuild_usc_sec(subparts, indent=indent+1))
                    return ret
                usc = rebuild_usc_sec(usc)
                
                metadata["citations"] = {
                    "slip_laws": slip_laws, "statutes": statutes, "usc": usc, "other": other,
                    "count": len(slip_laws)+len(statutes)+len(usc)+len(other) }
            return metadata
        except IOError:
            return None

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "subtitle": get_secondary_bill_title(bill, bill.titles),
        "sponsor_name": sponsor_name,
        "reintros": get_reintroductions, # defer so we can use template caching
        "current": bill.congress == CURRENT_CONGRESS,
        "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious,
        "feed": Feed.BillFeed(bill),
        "text": get_text_info,
    }
def parse_committee_schedules(options):
    log.info('Processing committee schedule')
    meeting_processor = CommitteeMeetingProcessor()
    loaded_meetings = set()
    processed_all_meetings = True
    for chamber in ("house", "senate"):
    	meetings_file = settings.CONGRESS_DATA_PATH + '/committee_meetings_%s.json' % chamber
    	file_changed = File.objects.is_changed(meetings_file)
    
    	if not file_changed and not options.force:
    		log.info('File %s was not changed' % meetings_file)
    		processed_all_meetings = False
    	else:
    		meetings = json.load(open(meetings_file))
    		
    		# Process committee event nodes
    		for meeting in meetings:
    			try:
    				# Associate it with an existing meeting object if GUID is already known.
    				# Must get it like this, vs just assigning the ID as we do in other parsers,
    				# because of the auto_now_add created field, which otherwise misbehaves.
    				try:
    					mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
    				except CommitteeMeeting.DoesNotExist:
    					mobj = CommitteeMeeting()
    				
    				# Parse.
    				mobj = meeting_processor.process(mobj, meeting)
    				
    				# Attach the meeting to the subcommittee if set.
    				if mobj.subcommittee:
    					mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)
    				
    				mobj.save()
    				loaded_meetings.add(mobj.id)
    				
    				mobj.bills.clear()
    				for bill in meeting["bill_ids"]:
    				    try:
    				        bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
    				        bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
    				        mobj.bills.add(bill)
    				    except AttributeError:
    				        pass # regex failed
    				    except common.enum.NotFound:
    				        pass # invalid bill type code in source data
    				    except Bill.DoesNotExist:
    				        pass # we don't know about bill yet
    			except Committee.DoesNotExist:
    				log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))
    
    		File.objects.save_file(meetings_file)
    	
    if processed_all_meetings:
        # Drop any future meetings that are no longer in the source data.
        obsolete_mtgs = CommitteeMeeting.objects.exclude(id__in=loaded_meetings).filter(when__gt=datetime.now())
        if obsolete_mtgs.count() > 0:
           log.error("Deleting %d obsolete meetings." % obsolete_mtgs.count())
           obsolete_mtgs.delete()

    if not options.disable_events:
        for committee in Committee.objects.filter(obsolete=False):
            log.info('Generating events for %s.' % committee)
            committee.create_events()
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        if not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = "data/us/bills.text/%s/%s/%s%s.txt" % (m.group(1), m.group(2), m.group(2), m.group(3))
                if (bill_index and not options.disable_events) and os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill") # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        skip_stuff = False
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    print fname
                    raise
            else:
                m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
                bill = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))), BillStatus.by_xml_code(axn.xpath("string(@state)")), axn.xpath("string(text)")) )
                
                if actions[-1][1] in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
            	print bill
            	raise
            if bill_index: bill_index.update_object(bill, using="bill")
            
            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids).delete()
        
    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt): return re.escape(bt[1]).replace(r"\.", "\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = re.match("\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + ")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1)):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index: bill_index.update_object(bill, using="bill")
                            
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % billname)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
Exemple #25
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type),
                                          subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type),
                                    subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' %
                              term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type),
                                              subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type),
                                        subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' %
                                  term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' %
                          options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' %
                 options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(
                                         m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(
                        textfile):
                    bill_index.update_object(
                        b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(
                        bill_processor.parse_datetime(
                            axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ))

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get(
                    "type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index: bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"",
                  dhg_html)
    if not m:
        log.error(
            'No docs.house.gov download link found at http://docs.house.gov.')
    else:

        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(
                urllib.urlopen("http://docs.house.gov/floor/" +
                               m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None: continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType) +
                r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error(
                        'Could not parse legis-num "%s" in docs.house.gov.' %
                        billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS,
                                                    bill_type=bt[0],
                                                    number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(
                                item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error(
                                'Could not find bill "%s" in docs.house.gov.' %
                                billname)
                        break
                else:
                    log.error(
                        'Could not parse legis-num bill type "%s" in docs.house.gov.'
                        % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting",
                        sfs).group(1)
        for congress, bill_type, number in re.findall(
                r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)",
                sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress,
                                    bill_type=bill_type,
                                    number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                    days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
	def handle(self, *args, **options):
		if len(args) != 1:
			print "Missing argument."
			return

		min_count = int(args[0])

		p = apachelog.parser(logformat)
		
		spider = { }
		
		for line in sys.stdin:
			# Parse the acess log line.
			try:
				data = p.parse(line)
			except:
				continue
			
			# Is it a request to a bill page?
			path = data["%r"].split(" ")[1]
			m = re_bill.match(path)
			if not m: continue
			
			# Who is the referrer?
			ref = data["%{Referer}i"]
			if ref in ("", "-") or "govtrack.us" in ref:
				continue
			
			url = urlparse.urlparse(ref)
			hostname = url.hostname
			qs = urlparse.parse_qs(url.query)
			
			if not hostname: continue
			
			# Filter out known useless domains.
			if hostname in ("t.co", "longurl.org", "ow.ly", "bit.ly", "www.facebook.com", "www.weblinkvalidator.com", "static.ak.facebook.com", "info.com", "altavista.com", "tumblr.com", "www.freerepublic.com", "www.reddit.com"): continue
			if hostname.endswith(".ru"): continue
			
			# For referrals from Google, look at the 'q' argument to see how
			# people are searching for this page.
			if hostname.replace("www.", "").replace("search.", "") in ("google.com", "bing.com", "aol.com", "yahoo.com"):
				# todo, some use q= some use query=
				#print qs.get("q", [""])[0]
				continue
				
			# Filter out other domains if the link has a 'q' argument since it's probs
			# a search engine.
			if "q" in qs or "pid" in qs: continue
			
			# Filter out common paths for message boards.
			if "/threads/" in ref or "/forum/" in ref or "viewtopic.php" in ref: continue
				
			key = (m.groups(), url)
			spider[key] = spider.get(key, 0) + 1
			
		###
		
		first_print = True
		
		spider = spider.items()
		spider.sort(key = lambda kv : kv[1])
		for (bill_info, referral_url), count in spider:
			if count < min_count: continue # filter out referrers that occurred too infrequently
			
			bill_type = BillType.by_slug(bill_info[1])
			bill = Bill.objects.get(congress=bill_info[0], bill_type=bill_type, number=bill_info[2])
			
			lnk, is_new = BillLink.objects.get_or_create(
				bill=bill,
				url=referral_url.geturl(),
				defaults={
					"title": "Title Not Set"
				})
			
			# Additional processing for new entries.
			
			if not is_new: continue
			
			try:
				stream = urllib.urlopen(referral_url.geturl())
				if stream.getcode() != 200: continue
				dom = lxml.etree.parse(stream, lxml.etree.HTMLParser())
			except:
				continue
				
			title = dom.xpath('string(head/title)').strip()
			if title == "": continue
			
			# set the title of the scraped page
			lnk.title = title
			
			# white-list some domains, provided we were able to
			# get a title
			if referral_url.hostname in ("en.wikipedia.org", "www.truthorfiction.com", "www.theatlantic.com", "www.snopes.com", "arstechnica.com"):
				lnk.approved = True
			else:
				if first_print:
					print "Links pending approval:"
					print
					first_print = False
				print referral_url.geturl()
				print title.encode("utf8")
				print unicode(bill).encode("utf8")
				print
			
			lnk.save()
Exemple #27
0
	def handle(self, *args, **options):
		if len(args) != 1:
			print "Missing argument."
			return

		min_count = int(args[0])

		p = apachelog.parser(logformat)
		
		spider = { }
		
		for line in sys.stdin:
			# Parse the acess log line.
			try:
				data = p.parse(line)
			except:
				continue
			
			# Is it a request to a bill page?
			path = data["%r"].split(" ")[1]
			m = re_bill.match(path)
			if not m: continue
			
			# Who is the referrer?
			ref = data["%{Referer}i"]
			if ref in ("", "-") or "govtrack.us" in ref:
				continue
			
			url = urlparse.urlparse(ref)
			hostname = url.hostname
			qs = urlparse.parse_qs(url.query)
			
			if not hostname: continue
			
			# Filter out known useless domains.
			if hostname in ("t.co", "longurl.org", "ow.ly", "bit.ly", "www.facebook.com", "www.weblinkvalidator.com", "static.ak.facebook.com", "info.com", "altavista.com", "tumblr.com", "www.freerepublic.com", "www.reddit.com"): continue
			if hostname.endswith(".ru"): continue
			
			# For referrals from Google, look at the 'q' argument to see how
			# people are searching for this page.
			if hostname.replace("www.", "").replace("search.", "") in ("google.com", "bing.com", "aol.com", "yahoo.com"):
				# todo, some use q= some use query=
				#print qs.get("q", [""])[0]
				continue
				
			# Filter out other domains if the link has a 'q' argument since it's probs
			# a search engine.
			if "q" in qs or "pid" in qs: continue
			
			# Filter out common paths for message boards.
			if "/threads/" in ref or "/forum/" in ref or "viewtopic.php" in ref: continue
				
			key = (m.groups(), url)
			spider[key] = spider.get(key, 0) + 1
			
		###
		
		first_print = True
		
		spider = spider.items()
		spider.sort(key = lambda kv : kv[1])
		for (bill_info, referral_url), count in spider:
			if count < min_count: continue # filter out referrers that occurred too infrequently
			
			bill_type = BillType.by_slug(bill_info[1])
			bill = Bill.objects.get(congress=bill_info[0], bill_type=bill_type, number=bill_info[2])
			
			lnk, is_new = BillLink.objects.get_or_create(
				bill=bill,
				url=referral_url.geturl(),
				defaults={
					"title": "Title Not Set"
				})
			
			# Additional processing for new entries.
			
			if not is_new: continue
			
			try:
				stream = urllib.urlopen(referral_url.geturl())
				if stream.getcode() != 200: continue
				dom = lxml.etree.parse(stream, lxml.etree.HTMLParser())
			except:
				continue
				
			title = dom.xpath('string(head/title)').strip()
			if title == "": continue
			
			# set the title of the scraped page
			lnk.title = title
			
			# white-list some domains, provided we were able to
			# get a title
			if referral_url.hostname in ("en.wikipedia.org", "www.truthorfiction.com", "www.theatlantic.com", "www.snopes.com", "arstechnica.com"):
				lnk.approved = True
			else:
				if first_print:
					print "Links pending approval:"
					print
					first_print = False
				print referral_url.geturl()
				print title.encode("utf8")
				print unicode(bill).encode("utf8")
				print
			
			lnk.save()
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info("Processing old bill terms")
    TERMS_FILE = "data/us/liv.xml"
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath("/liv/top-term"):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath("./term"):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error("Duplicated term %s" % term_processor.display_node(subnode))

    log.info("Processing new bill terms")
    for FILE in ("data/us/liv111.xml", "data/us/crsnet.xml"):
        tree = etree.parse(FILE)
        for node in tree.xpath("/liv/top-term"):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath("./term"):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error("Duplicated term %s" % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex

        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob("data/congress/%s/bills/*/*/*.xml" % options.congress)
        log.info("Parsing unitedstates/congress bills of only congress#%s" % options.congress)
    elif options.congress:
        files = glob.glob("data/us/%s/bills/*.xml" % options.congress)
        log.info("Parsing bills of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/bills/*.xml")

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info("Processing bills: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (
            (not options.congress or options.congress > 42)
            and (bill_index and not options.disable_events)
            and not File.objects.is_changed(fname)
            and not options.force
        ):
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath("/bill"):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append(
                    (
                        repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                        BillStatus.by_xml_code(axn.xpath("string(@state)")),
                        axn.xpath("string(text)"),
                        etree.tostring(axn),
                    )
                )

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601

    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error("No docs.house.gov download link found at http://docs.house.gov.")
    else:

        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + r")(\d+)\s*(\[Conference Report\]\s*)?$",
                billname,
                re.I,
            )
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(
            r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs
        ):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                days=7
            ):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error("Could not parse Senate Floor Schedule: " + repr(e))
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
               
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
                
            progress.tick()
            
        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)
        
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]
        
        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')
        
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            
            progress.tick()

        File.objects.save_file(MEMBERS_FILE)
        
    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
		meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
		file_changed = File.objects.is_changed(meetings_file)
	
		if not file_changed and not options.force:
			log.info('File %s was not changed' % meetings_file)
		else:
			meetings = json.load(open(meetings_file))
			
			# Process committee event nodes
			for meeting in meetings:
				try:
					# Associate it with an existing meeting object if GUID is already known.
					# Must get it like this, vs just assigning the ID as we do in other parsers,
					# because of the auto_now_add created field, which otherwise misbehaves.
					try:
						mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
					except CommitteeMeeting.DoesNotExist:
						mobj = CommitteeMeeting()
					
					# Parse.
					mobj = meeting_processor.process(mobj, meeting)
					
					# Attach the meeting to the subcommittee if set.
					if mobj.subcommittee:
						mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)
					
					mobj.save()
					
					mobj.bills.clear()
					for bill in meeting["bill_ids"]:
					    try:
					        bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
					        bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
					        mobj.bills.add(bill)
					    except AttributeError:
					        pass # regex failed
					    except common.enum.NotFound:
					        pass # invalid bill type code in source data
					    except Bill.DoesNotExist:
					        pass # we don't know about bill yet
				except Committee.DoesNotExist:
					log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))
	
			for committee in Committee.objects.all():
				if not options.disable_events:
					committee.create_events()
				
			File.objects.save_file(meetings_file)
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed: bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (
                	repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                	BillStatus.by_xml_code(axn.xpath("string(@state)")),
                	axn.xpath("string(text)"),
                    etree.tostring(axn),
                	) )
                
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()
                
        File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b
        
    # The rest is for current only...
    
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return
        
    # Load docs.house.gov data for what might be coming up this week.
    load_docs_house_gov(options, bill_index)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/legislative/schedule/floor_schedule.htm").read()
    try:
        sfs = re.search(r"([\w\W]*)<i>Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill.update_index(bill_index)
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type),
                                          subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type),
                                    subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' %
                              term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type),
                                              subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type),
                                        subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' %
                                  term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH +
                          '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' %
                 options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH +
                          '/*/bills/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress) > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.match(
                re.escape(settings.CONGRESS_DATA_PATH) +
                r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml',
                fname)

            try:
                b = Bill.objects.get(congress=int(m.group("congress")),
                                     bill_type=BillType.by_slug(
                                         m.group("bill_type")),
                                     number=m.group("number"))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(
                        textfile):
                    b.update_index(bill_index)  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print(fname)
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED":
                    continue  # we don't track this state
                actions.append((
                    repr(
                        bill_processor.parse_datetime(
                            axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                ))

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get(
                    "type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)

    # The rest is for current only...

    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
Exemple #32
0
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH

    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(
            id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code
                                                 for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(
                    id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(
                            guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(
                            code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(
                                r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(
                                congress=bill_cong,
                                bill_type=BillType.by_slug(bill_type),
                                number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error(
                        'Could not load Committee object for meeting %s' %
                        meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
def parse_committee_schedules(options):
    log.info('Processing committee schedule')
    meeting_processor = CommitteeMeetingProcessor()
    loaded_meetings = set()
    processed_all_meetings = True
    for chamber in ("house", "senate"):
        meetings_file = settings.CONGRESS_DATA_PATH + '/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
            processed_all_meetings = False
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(
                            guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(
                            code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()
                    loaded_meetings.add(mobj.id)

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(
                                r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(
                                congress=bill_cong,
                                bill_type=BillType.by_slug(bill_type),
                                number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error(
                        'Could not load Committee object for meeting %s' %
                        meeting_processor.display_node(meeting))

            File.objects.save_file(meetings_file)

    if processed_all_meetings:
        # Drop any future meetings that are no longer in the source data.
        obsolete_mtgs = CommitteeMeeting.objects.exclude(
            id__in=loaded_meetings).filter(when__gt=datetime.now())
        if obsolete_mtgs.count() > 0:
            log.error("Deleting %d obsolete meetings." % obsolete_mtgs.count())
            obsolete_mtgs.delete()

    if not options.disable_events:
        for committee in Committee.objects.filter(obsolete=False):
            log.info('Generating events for %s.' % committee)
            committee.create_events()
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/*/bills/*/*/data.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress)>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.match(re.escape(settings.CONGRESS_DATA_PATH) + r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml', fname)

            try:
                b = Bill.objects.get(congress=int(m.group("congress")), bill_type=BillType.by_slug(m.group("bill_type")), number=m.group("number"))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print(fname)
                raise
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED": continue # we don't track this state
                actions.append( (
                	repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                	BillStatus.by_xml_code(axn.xpath("string(@state)")),
                	axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                	) )
                
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()
                
        File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)
        
    # The rest is for current only...
    
    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return
        
    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)