def parse_bill_number(q, congress=None, not_exist_ok=False):
    m = bill_number_re.match(q.replace(" ", "").replace(".", "").replace("-", ""))
    if m is None:
        return None
    search_type_flag = None
    if m.group(3) is not None:
        cn = int(m.group(4))
        search_type_flag = "bill-with-congress"
    elif congress is not None:
        try:
            cn = int(congress)
        except (ValueError, TypeError):
            cn = CURRENT_CONGRESS
        search_type_flag = "bill-default-congress"
    else:
        cn = CURRENT_CONGRESS
        search_type_flag = "bill-guessed-congress"
    try:
        b = Bill.objects.get(congress=cn,
                             bill_type=BillType.by_slug(m.group(1).lower()),
                             number=int(m.group(2)))
        b.search_type_flag = search_type_flag
        return b
    except Bill.DoesNotExist:
        if not_exist_ok:
            # Return a dummy bill indicating that the string matched the regex.
            b = Bill(congress=cn,
                     bill_type=BillType.by_slug(m.group(1).lower()),
                     number=int(m.group(2)))
            b.search_type_flag = search_type_flag
            return b
    return None
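# A minimal usage sketch, assuming this module's bill_number_re and
# CURRENT_CONGRESS are in scope and the Django ORM is configured; the query
# string below is a hypothetical example of input bill_number_re accepts.
def _demo_parse_bill_number():
    bill = parse_bill_number("H.R. 1234", congress=117)  # hypothetical input
    if bill is None:
        print("Not recognized as a bill number.")
    else:
        # search_type_flag records how the congress number was determined.
        print("%s (%s)" % (bill, bill.search_type_flag))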
def load_docs_house_gov(options, bill_index):
    # Look at the three most recent JSON files by looking at the lexicographically last ones,
    # which possibly cover the current week, the next week, and the week after that.
    for fn in sorted(os.listdir("data/congress/upcoming_house_floor"))[-3:]:
        data = json.load(open("data/congress/upcoming_house_floor/" + fn))
        for billinfo in data.get("upcoming", []):
            if "bill_id" not in billinfo:
                continue
            m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
            if not m:
                log.error('Could not parse bill_id "%s" in docs.house.gov.' % billinfo["bill_id"])
                continue
            bt = BillType.by_slug(m.group(1))
            try:
                bill = Bill.objects.get(congress=int(m.group(3)), bill_type=bt, number=int(m.group(2)))
            except Exception as e:
                log.error('Could not get bill "%s" in docs.house.gov: %s.' % (billinfo["bill_id"], str(e)))
                continue
            bill.docs_house_gov_postdate = BillProcessor.parse_datetime(billinfo["published_at"])
            if bill.senate_floor_schedule_postdate is None or bill.docs_house_gov_postdate > bill.senate_floor_schedule_postdate:
                bill.scheduled_consideration_date = BillProcessor.parse_datetime(data["week_of"])
            bill.save()
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()
def load_docs_house_gov(options, bill_index):
    # Get the most recent JSON file by looking at the lexicographically last one.
    fn = sorted(os.listdir("data/congress/upcoming_house_floor"))[-1]
    data = json.load(open("data/congress/upcoming_house_floor/" + fn))
    for billinfo in data.get("upcoming", []):
        if "bill_id" not in billinfo:
            continue
        m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
        if not m:
            log.error('Could not parse bill_id "%s" in docs.house.gov.' % billinfo["bill_id"])
            continue
        bt = BillType.by_slug(m.group(1))
        try:
            bill = Bill.objects.get(congress=int(m.group(3)), bill_type=bt, number=int(m.group(2)))
        except Exception as e:
            log.error('Could not get bill "%s" in docs.house.gov: %s.' % (billinfo["bill_id"], str(e)))
            continue
        bill.docs_house_gov_postdate = BillProcessor.parse_datetime(billinfo["published_at"])
        bill.save()
        if bill_index:
            bill.update_index(bill_index)
        if not options.disable_events:
            bill.create_events()
def bill_advocacy_tips(request, congress, type_slug, number):
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    return { "bill": bill }
def load_bill_from_url(congress, type_slug, number):
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    return get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
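# A minimal sketch of how load_bill_from_url might back a view, assuming the
# project's convention of returning a template context dict; the view name and
# its wiring into a URLconf are hypothetical.
def _demo_bill_view(request, congress, type_slug, number):
    bill = load_bill_from_url(congress, type_slug, number)  # raises Http404 on bad input
    return { "bill": bill }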
def bill_details_user_view(request, congress, type_slug, number):
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    ret = { }

    if request.user.is_staff:
        admin_panel = """
            {% load humanize %}
            <div class="clear"> </div>
            <div style="margin-top: 1.5em; padding: .5em; background-color: #EEE; ">
            <b>ADMIN</b> - <a href="{% url "bill_go_to_summary_admin" %}?bill={{bill.id}}">Edit Summary</a>
            <br/>Tracked by {{feed.tracked_in_lists.count|intcomma}} users ({{feed.tracked_in_lists_with_email.count|intcomma}} w/ email).
            </div>
            """
        from django.template import Template, Context, RequestContext, loader
        ret["admin_panel"] = Template(admin_panel).render(RequestContext(request, {
            'bill': bill,
            "feed": Feed.BillFeed(bill),
        }))

    from person.views import render_subscribe_inline
    ret.update(render_subscribe_inline(request, Feed.BillFeed(bill)))

    return ret
def load_docs_house_gov(options, bill_index):
    # Look at the three most recent JSON files by looking at the lexicographically last ones,
    # which possibly cover the current week, the next week, and the week after that.
    if not os.path.exists(settings.CONGRESS_DATA_PATH + "/upcoming_house_floor"):
        print("No upcoming_house_floor data.")
        return
    for fn in sorted(os.listdir(settings.CONGRESS_DATA_PATH + "/upcoming_house_floor"))[-3:]:
        data = json.load(open(settings.CONGRESS_DATA_PATH + "/upcoming_house_floor/" + fn))
        for billinfo in data.get("upcoming", []):
            if "bill_id" not in billinfo:
                continue
            m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
            if not m:
                log.error('Could not parse bill_id "%s" in docs.house.gov.' % billinfo["bill_id"])
                continue
            bt = BillType.by_slug(m.group(1))
            try:
                bill = Bill.objects.get(congress=int(m.group(3)), bill_type=bt, number=int(m.group(2)))
            except Exception as e:
                log.error('Could not get bill "%s" in docs.house.gov: %s.' % (billinfo["bill_id"], str(e)))
                continue
            bill.docs_house_gov_postdate = BillProcessor.parse_datetime(billinfo["published_at"])
            if bill.senate_floor_schedule_postdate is None or bill.docs_house_gov_postdate > bill.senate_floor_schedule_postdate:
                bill.scheduled_consideration_date = BillProcessor.parse_datetime(data["week_of"])
            bill.save()
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()
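# A minimal sketch of an upcoming_house_floor JSON file consistent with the
# fields this loader reads ("upcoming", "bill_id", "published_at", "week_of");
# the file path, exact schema, and values are assumptions for illustration.
import json

def _demo_write_sample_floor_file(path="upcoming_house_floor/20240101.json"):  # hypothetical path
    sample = {
        "week_of": "2024-01-01T00:00:00",  # parsed by BillProcessor.parse_datetime
        "upcoming": [
            {"bill_id": "hr1234-118",  # type+number-congress
             "published_at": "2024-01-01T12:00:00"},
        ],
    }
    with open(path, "w") as f:
        json.dump(sample, f)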
def bill_text(request, congress, type_slug, number, version=None):
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from .billtext import load_bill_text, get_bill_text_versions
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    is_latest = True
    if textdata:
        alternates = []
        for v in get_bill_text_versions(bill):
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                pass
        alternates.sort(key=lambda mods: mods["docdate"])
        if len(alternates) > 0:
            is_latest = False
            if textdata["doc_version"] == alternates[-1]["doc_version"]:
                is_latest = True

    # Get a list of related bills.
    from .billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if (rb, rbv) not in related_bills:
                related_bills.append((rb, rbv))
        except IOError:
            pass  # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if (btc.bill2, btc.ver2) not in related_bills:
            related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if (btc.bill1, btc.ver1) not in related_bills:
            related_bills.append((btc.bill1, btc.ver1))

    return {
        "bill_subpage": "Text",
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "is_latest": is_latest,
        "alternates": alternates,
        "related_bills": related_bills,
        "days_old": (datetime.datetime.now().date() - bill.current_status_date).days,
        "is_on_bill_text_page": True,  # for the header tabs
    }
def load_bill_from_url(congress, type_slug, number):
    # not sure why we were trying this
    #if type_slug.isdigit():
    #    bill_type = type_slug
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    return get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
def bill_text(request, congress, type_slug, number, version=None):
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for v in bill_gpo_status_codes:
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                pass
        alternates.sort(key=lambda mods: mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if (rb, rbv) not in related_bills:
                related_bills.append((rb, rbv))
        except IOError:
            pass  # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if (btc.bill2, btc.ver2) not in related_bills:
            related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if (btc.bill1, btc.ver1) not in related_bills:
            related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
def parse_bill_number(q, congress=None):
    m = bill_number_re.match(q.replace(" ", "").replace(".", "").replace("-", ""))
    if m is None:
        return None
    if m.group(3) is not None:
        cn = int(m.group(4))
    elif congress is not None:
        try:
            cn = int(congress)
        except (ValueError, TypeError):
            cn = CURRENT_CONGRESS
    else:
        cn = CURRENT_CONGRESS
    try:
        return Bill.objects.get(congress=cn,
                                bill_type=BillType.by_slug(m.group(1).lower()),
                                number=int(m.group(2)))
    except Bill.DoesNotExist:
        return None
def bill_text(request, congress, type_slug, number, version=None):
    if int(congress) < 103:
        raise Http404("Bill text is not available before the 103rd congress.")

    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for v in bill_gpo_status_codes:
            fn = "data/us/bills.text/%s/%s/%s%d%s.mods.xml" % (
                bill.congress,
                BillType.by_value(bill.bill_type).xml_code,
                BillType.by_value(bill.bill_type).xml_code,
                bill.number,
                v)
            if os.path.exists(fn):
                alternates.append(load_bill_text(bill, v, mods_only=True))
        alternates.sort(key=lambda mods: mods["docdate"])

    # Get a list of related bills.
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        if (rb, "") not in related_bills:
            related_bills.append((rb, ""))
    for btc in BillTextComparison.objects.filter(bill1=bill):
        if (btc.bill2, btc.ver2) not in related_bills:
            related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill):
        if (btc.bill1, btc.ver1) not in related_bills:
            related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
def get_bill_for_page(page):
    for template in mwparserfromhell.parse(page["text"]).filter_templates():
        if template.name.strip() == "Infobox U.S. legislation":
            # print page["title"].encode("utf8")
            billref = get_bill_from_infobox(template)
            if billref:
                try:
                    if billref[0] == "PL":
                        # Get by public law number.
                        return Bill.objects.get(congress=billref[1], sliplawpubpriv="PUB", sliplawnum=billref[2])
                    elif billref[0] == "BILL":
                        # It's a bill number.
                        return Bill.objects.get(congress=billref[1],
                                                bill_type=BillType.by_slug(billref[2]),
                                                number=billref[3])
                except Bill.DoesNotExist:
                    return None
    return None
def get_bill_for_page(page):
    for template in mwparserfromhell.parse(page["text"]).filter_templates():
        if template.name.strip() == "Infobox U.S. legislation":
            #print page["title"].encode("utf8")
            try:
                billref = get_bill_from_infobox(template)
            except Exception as e:
                print(page["pageid"], e)
                billref = None
            if billref:
                try:
                    if billref[0] == "PL":
                        # Get by public law number.
                        return Bill.objects.get(congress=billref[1], sliplawpubpriv="PUB", sliplawnum=billref[2])
                    elif billref[0] == "BILL":
                        # It's a bill number.
                        return Bill.objects.get(congress=billref[1],
                                                bill_type=BillType.by_slug(billref[2]),
                                                number=billref[3])
                except Bill.DoesNotExist:
                    return None
    return None
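# A minimal sketch of calling get_bill_for_page, assuming a wiki-page dict with
# raw wikitext under "text" as mwparserfromhell expects; the page content here
# is a hypothetical example.
def _demo_get_bill_for_page():
    page = {
        "pageid": 12345,
        "title": "Example Act",
        "text": "{{Infobox U.S. legislation}} article body...",
    }
    print(get_bill_for_page(page))  # None unless the infobox yields a usable bill reference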
def bill_details(request, congress, type_slug, number):
    if type_slug.isdigit():
        bill_type = type_slug
    else:
        try:
            bill_type = BillType.by_slug(type_slug)
        except BillType.NotFound:
            raise Http404("Invalid bill type: " + type_slug)

    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from person.name import get_person_name
    sponsor_name = None if not bill.sponsor else \
        get_person_name(bill.sponsor, role_date=bill.introduced_date,
                        firstname_position='before', show_suffix=True)

    def get_reintroductions():
        reintro_prev = None
        reintro_next = None
        for reintro in bill.find_reintroductions():
            if reintro.congress < bill.congress:
                reintro_prev = reintro
            if reintro.congress > bill.congress and not reintro_next:
                reintro_next = reintro
        return reintro_prev, reintro_next

    def get_text_info():
        from billtext import load_bill_text
        try:
            return load_bill_text(bill, None, mods_only=True)
        except IOError:
            return None

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "subtitle": get_secondary_bill_title(bill, bill.titles),
        "sponsor_name": sponsor_name,
        "reintros": get_reintroductions,  # defer so we can use template caching
        "current": bill.congress == CURRENT_CONGRESS,
        "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious,
        "feed": Feed.BillFeed(bill),
        "text": get_text_info,
    }
def load_docs_house_gov(options, bill_index):
    # Get the most recent JSON file by looking at the lexicographically last one.
    fn = sorted(os.listdir("data/congress/upcoming_house_floor"))[-1]
    data = json.load(open("data/congress/upcoming_house_floor/" + fn))
    for billinfo in data.get("upcoming", []):
        m = re.match(r"([hrsjconres]+)(\d+)-(\d+)", billinfo["bill_id"])
        if not m:
            log.error('Could not parse bill_id "%s" in docs.house.gov.' % billinfo["bill_id"])
            continue
        bt = BillType.by_slug(m.group(1))
        try:
            bill = Bill.objects.get(congress=int(m.group(3)), bill_type=bt, number=int(m.group(2)))
        except Exception as e:
            log.error('Could not get bill "%s" in docs.house.gov: %s.' % (billinfo["bill_id"], str(e)))
            continue
        bill.docs_house_gov_postdate = BillProcessor.parse_datetime(billinfo["published_at"])
        bill.save()
        if bill_index:
            bill.update_index(bill_index)
        if not options.disable_events:
            bill.create_events()
def bill_details(request, congress, type_slug, number):
    if type_slug.isdigit():
        bill_type = type_slug
    else:
        try:
            bill_type = BillType.by_slug(type_slug)
        except BillType.NotFound:
            raise Http404("Invalid bill type: " + type_slug)

    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from person.name import get_person_name
    sponsor_name = None if not bill.sponsor else \
        get_person_name(bill.sponsor, role_date=bill.introduced_date,
                        firstname_position='before', show_suffix=True)

    def get_reintroductions():
        reintro_prev = None
        reintro_next = None
        for reintro in bill.find_reintroductions():
            if reintro.congress < bill.congress:
                reintro_prev = reintro
            if reintro.congress > bill.congress and not reintro_next:
                reintro_next = reintro
        return reintro_prev, reintro_next

    def get_text_info():
        from models import USCSection
        from billtext import load_bill_text
        from search import parse_slip_law_number
        import re
        try:
            metadata = load_bill_text(bill, None, mods_only=True)

            # do interesting stuff with citations
            if "citations" in metadata:
                slip_laws = []
                statutes = []
                usc = { }
                other = []

                usc_other = USCSection(name="Other Citations", ordering=99999)

                for cite in metadata["citations"]:
                    if cite["type"] == "slip_law":
                        slip_laws.append(cite)
                        cite["bill"] = parse_slip_law_number(cite["text"])
                    elif cite["type"] == "statutes_at_large":
                        statutes.append(cite)
                    elif cite["type"] == "usc":
                        # build a normalized citation and a link to LII
                        cite_norm = "usc/" + cite["title"]
                        cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"]
                        if cite["section"]:
                            cite_link += "/" + cite["section"]
                            cite_norm += "/" + cite["section"]
                        if cite["paragraph"]:
                            cite_link += "#" + "_".join(re.findall(r"\(([^)]+)\)", cite["paragraph"]))

                        # Build a tree of title-chapter-...-section nodes so we can
                        # display the citations in context.
                        try:
                            sec_obj = USCSection.objects.get(citation=cite_norm)
                        except:  # USCSection.DoesNotExist and MultipleObjectsReturned both possible
                            # the 'id' field is set to make these objects properly hashable
                            sec_obj = USCSection(id=cite["text"], name=cite["text"], parent_section=usc_other)

                        sec_obj.link = cite_link

                        if "range_to_section" in cite:
                            sec_obj.range_to_section = cite["range_to_section"]

                        # recursively go up to the title
                        path = [sec_obj]
                        while sec_obj.parent_section:
                            sec_obj = sec_obj.parent_section
                            path.append(sec_obj)

                        # now pop off from the path to put the node at the right point in a tree
                        container = usc
                        while path:
                            p = path.pop(-1)
                            if p not in container:
                                container[p] = { }
                            container = container[p]
                    else:
                        other.append(cite)

                slip_laws.sort(key=lambda x: (x["congress"], x["number"]))

                # restructure data format
                def ucfirst(s):
                    return s[0].upper() + s[1:]

                def rebuild_usc_sec(seclist, indent=0):
                    ret = []
                    seclist = sorted(seclist.items(), key=lambda x: x[0].ordering)
                    for sec, subparts in seclist:
                        ret.append({
                            "text": (ucfirst(sec.level_type + ((" " + sec.number) if sec.number else "") + (": " if sec.name else "")) if sec.level_type else "") + (sec.name if sec.name else ""),
                            "link": getattr(sec, "link", None),
                            "range_to_section": getattr(sec, "range_to_section", None),
                            "indent": indent,
                        })
                        ret.extend(rebuild_usc_sec(subparts, indent=indent + 1))
                    return ret
                usc = rebuild_usc_sec(usc)

                metadata["citations"] = {
                    "slip_laws": slip_laws,
                    "statutes": statutes,
                    "usc": usc,
                    "other": other,
                    "count": len(slip_laws) + len(statutes) + len(usc) + len(other),
                }

            return metadata
        except IOError:
            return None

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "subtitle": get_secondary_bill_title(bill, bill.titles),
        "sponsor_name": sponsor_name,
        "reintros": get_reintroductions,  # defer so we can use template caching
        "current": bill.congress == CURRENT_CONGRESS,
        "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious,
        "feed": Feed.BillFeed(bill),
        "text": get_text_info,
    }
def parse_committee_schedules(options):
    log.info('Processing committee schedule')
    meeting_processor = CommitteeMeetingProcessor()

    loaded_meetings = set()
    processed_all_meetings = True

    for chamber in ("house", "senate"):
        meetings_file = settings.CONGRESS_DATA_PATH + '/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)
        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
            processed_all_meetings = False
        else:
            meetings = json.load(open(meetings_file))
            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()
                    loaded_meetings.add(mobj.id)

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong,
                                                    bill_type=BillType.by_slug(bill_type),
                                                    number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            File.objects.save_file(meetings_file)

    if processed_all_meetings:
        # Drop any future meetings that are no longer in the source data.
        obsolete_mtgs = CommitteeMeeting.objects.exclude(id__in=loaded_meetings).filter(when__gt=datetime.now())
        if obsolete_mtgs.count() > 0:
            log.error("Deleting %d obsolete meetings." % obsolete_mtgs.count())
            obsolete_mtgs.delete()

    if not options.disable_events:
        for committee in Committee.objects.filter(obsolete=False):
            log.info('Generating events for %s.' % committee)
            committee.create_events()
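# A standalone sketch of the "type+number-congress" bill_id format that the
# parsers above split apart; the sample value is hypothetical.
import re

def _demo_split_bill_id(bill_id="hr1234-117"):
    m = re.match(r"([a-z]+)(\d+)-(\d+)$", bill_id)
    if m:
        bill_type, bill_num, bill_cong = m.groups()
        print("type=%s number=%d congress=%d" % (bill_type, int(bill_num), int(bill_cong)))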
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % subterm)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        if not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = "data/us/bills.text/%s/%s/%s%s.txt" % (m.group(1), m.group(2), m.group(2), m.group(3))
                if (bill_index and not options.disable_events) and os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        skip_stuff = False

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    print fname
                    raise
            else:
                m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
                bill = Bill.objects.get(congress=m.group(1),
                                        bill_type=BillType.by_xml_code(m.group(2)),
                                        number=m.group(3))

            seen_bill_ids.append(bill.id)  # don't delete me later

            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                ))
                if actions[-1][1] in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])
            bill.major_actions = actions

            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids).delete()

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = re.match(r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                         + "|".join(bt_re(bt) for bt in BillType)
                         + r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate is None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % subterm)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress > 42) and (bill_index and not options.disable_events) \
                and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ))
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
            bill.major_actions = actions

            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                         + "|".join(bt_re(bt) for bt in BillType)
                         + r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate is None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
def handle(self, *args, **options):
    if len(args) != 1:
        print "Missing argument."
        return
    min_count = int(args[0])

    p = apachelog.parser(logformat)
    spider = { }
    for line in sys.stdin:
        # Parse the access log line.
        try:
            data = p.parse(line)
        except:
            continue

        # Is it a request to a bill page?
        path = data["%r"].split(" ")[1]
        m = re_bill.match(path)
        if not m:
            continue

        # Who is the referrer?
        ref = data["%{Referer}i"]
        if ref in ("", "-") or "govtrack.us" in ref:
            continue
        url = urlparse.urlparse(ref)
        hostname = url.hostname
        qs = urlparse.parse_qs(url.query)
        if not hostname:
            continue

        # Filter out known useless domains.
        if hostname in ("t.co", "longurl.org", "ow.ly", "bit.ly", "www.facebook.com",
                        "www.weblinkvalidator.com", "static.ak.facebook.com", "info.com",
                        "altavista.com", "tumblr.com", "www.freerepublic.com", "www.reddit.com"):
            continue
        if hostname.endswith(".ru"):
            continue

        # For referrals from Google, look at the 'q' argument to see how
        # people are searching for this page.
        if hostname.replace("www.", "").replace("search.", "") in ("google.com", "bing.com", "aol.com", "yahoo.com"):
            # todo, some use q= some use query=
            #print qs.get("q", [""])[0]
            continue

        # Filter out other domains if the link has a 'q' argument since it's probably
        # a search engine.
        if "q" in qs or "pid" in qs:
            continue

        # Filter out common paths for message boards.
        if "/threads/" in ref or "/forum/" in ref or "viewtopic.php" in ref:
            continue

        key = (m.groups(), url)
        spider[key] = spider.get(key, 0) + 1

    ###

    first_print = True

    spider = spider.items()
    spider.sort(key=lambda kv: kv[1])

    for (bill_info, referral_url), count in spider:
        if count < min_count:
            continue  # filter out referrers that occurred too infrequently

        bill_type = BillType.by_slug(bill_info[1])
        bill = Bill.objects.get(congress=bill_info[0], bill_type=bill_type, number=bill_info[2])

        lnk, is_new = BillLink.objects.get_or_create(
            bill=bill,
            url=referral_url.geturl(),
            defaults={ "title": "Title Not Set" })

        # Additional processing for new entries.
        if not is_new:
            continue

        try:
            stream = urllib.urlopen(referral_url.geturl())
            if stream.getcode() != 200:
                continue
            dom = lxml.etree.parse(stream, lxml.etree.HTMLParser())
        except:
            continue

        title = dom.xpath('string(head/title)').strip()
        if title == "":
            continue

        # set the title of the scraped page
        lnk.title = title

        # white-list some domains, provided we were able to
        # get a title
        if referral_url.hostname in ("en.wikipedia.org", "www.truthorfiction.com",
                                     "www.theatlantic.com", "www.snopes.com", "arstechnica.com"):
            lnk.approved = True
        else:
            if first_print:
                print "Links pending approval:"
                print
                first_print = False
            print referral_url.geturl()
            print title.encode("utf8")
            print unicode(bill).encode("utf8")
            print

        lnk.save()
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info("Processing old bill terms")
    TERMS_FILE = "data/us/liv.xml"
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath("/liv/top-term"):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath("./term"):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error("Duplicated term %s" % term_processor.display_node(subnode))

    log.info("Processing new bill terms")
    for FILE in ("data/us/liv111.xml", "data/us/crsnet.xml"):
        tree = etree.parse(FILE)
        for node in tree.xpath("/liv/top-term"):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath("./term"):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % subterm)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error("Duplicated term %s" % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob("data/congress/%s/bills/*/*/*.xml" % options.congress)
        log.info("Parsing unitedstates/congress bills of only congress#%s" % options.congress)
    elif options.congress:
        files = glob.glob("data/us/%s/bills/*.xml" % options.congress)
        log.info("Parsing bills of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/bills/*.xml")

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info("Processing bills: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (
            (not options.congress or options.congress > 42)
            and (bill_index and not options.disable_events)
            and not File.objects.is_changed(fname)
            and not options.force
        ):
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath("/bill"):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append(
                    (
                        repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                        BillStatus.by_xml_code(axn.xpath("string(@state)")),
                        axn.xpath("string(text)"),
                        etree.tostring(axn),
                    )
                )
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
            bill.major_actions = actions

            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error("No docs.house.gov download link found at http://docs.house.gov.")
    else:
        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + r")(\d+)\s*(\[Conference Report\]\s*)?$",
                billname,
                re.I,
            )
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate is None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error("Could not parse Senate Floor Schedule: " + repr(e))
def main(options):
    """
    Process committees, subcommittees and members of current congress
    committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong,
                                                    bill_type=BillType.by_slug(bill_type),
                                                    number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
            seen_bill_ids.append(bill.id) # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ) )
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Load docs.house.gov data for what might be coming up this week.
    load_docs_house_gov(options, bill_index)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/legislative/schedule/floor_schedule.htm").read()
    try:
        sfs = re.search(r"([\w\W]*)<i>Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill.update_index(bill_index)
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
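# get_bill_text_metadata is called by main() above but is not defined in this
# excerpt. A minimal sketch of its apparent contract, inferred only from the
# call sites (the real helper lives elsewhere in the codebase and does the
# actual lookup):
def get_bill_text_metadata(bill, version_code):
    """Return the text metadata dict (including a "text_file" path) for the
    given bill at the given version, or for the most recent version when
    version_code is None; return None when no text is available yet."""
    raise NotImplementedError("placeholder; see the real bill-text helper")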
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % subterm)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/*/bills/*/*/data.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress) > 42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.match(re.escape(settings.CONGRESS_DATA_PATH) + r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml', fname)
            try:
                b = Bill.objects.get(congress=int(m.group("congress")), bill_type=BillType.by_slug(m.group("bill_type")), number=m.group("number"))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print(fname)
                raise
            seen_bill_ids.append(bill.id) # don't delete me later

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED":
                    continue # we don't track this state
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                ))
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)

    # The rest is for current only...
    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
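# load_senate_floor_schedule is called by the revised main() above but is not
# defined in this excerpt. A minimal sketch, assuming it simply factors out the
# inline Senate floor-schedule parsing from the older version of main(), ported
# to Python 3's urllib.request; the actual implementation may differ:
def load_senate_floor_schedule(options, bill_index):
    # Parse Senate.gov's "Floor Schedule" blurb for what's coming up tomorrow.
    import urllib.request
    now = datetime.now()
    sfs = urllib.request.urlopen("http://www.senate.gov/legislative/schedule/floor_schedule.htm").read().decode("utf8")
    try:
        # Only look at the part of the page before the "Previous Meeting" section.
        sfs = re.search(r"([\w\W]*)<i>Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            # Re-stamp the post date only if it is unset or stale (older than a week).
            if bill.senate_floor_schedule_postdate is None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill.update_index(bill_index)
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))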
def main(options):
    """
    Process committees, subcommittees and members of current congress
    committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass # regex failed
                        except common.enum.NotFound:
                            pass # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
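# TYPE_MAPPING and ROLE_MAPPING are used above but not defined in this
# excerpt. A plausible sketch, mapping the values that appear in the
# congress-legislators YAML files onto committee-model enums; the enum names
# (CommitteeType, CommitteeMemberRole) and the exact set of titles are
# assumptions here, not confirmed by this excerpt:
TYPE_MAPPING = {
    "house": CommitteeType.house,
    "senate": CommitteeType.senate,
    "joint": CommitteeType.joint,
}
ROLE_MAPPING = {
    "Chairman": CommitteeMemberRole.chairman,
    "Vice Chairman": CommitteeMemberRole.vice_chairman,
    "Ranking Member": CommitteeMemberRole.ranking_member,
    "Member": CommitteeMemberRole.member,
}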
def parse_committee_schedules(options):
    log.info('Processing committee schedule')

    meeting_processor = CommitteeMeetingProcessor()

    loaded_meetings = set()
    processed_all_meetings = True
    for chamber in ("house", "senate"):
        meetings_file = settings.CONGRESS_DATA_PATH + '/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
            processed_all_meetings = False
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()
                    loaded_meetings.add(mobj.id)

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass # regex failed
                        except common.enum.NotFound:
                            pass # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            File.objects.save_file(meetings_file)

    if processed_all_meetings:
        # Drop any future meetings that are no longer in the source data.
        obsolete_mtgs = CommitteeMeeting.objects.exclude(id__in=loaded_meetings).filter(when__gt=datetime.now())
        if obsolete_mtgs.count() > 0:
            log.error("Deleting %d obsolete meetings." % obsolete_mtgs.count())
            obsolete_mtgs.delete()

    if not options.disable_events:
        for committee in Committee.objects.filter(obsolete=False):
            log.info('Generating events for %s.' % committee)
            committee.create_events()
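# None of the parsers above construct the `options` object they receive. A
# minimal sketch of the flags the code above actually reads; argparse is an
# assumption (the real scripts may build options differently), but every flag
# listed here corresponds to an attribute used in the functions above:
import argparse

def get_options():
    p = argparse.ArgumentParser(description="Run the bill/committee parsers.")
    p.add_argument("--congress", help="only process this congress, e.g. 117")
    p.add_argument("--filter", help="regex; only process matching data files")
    p.add_argument("--force", action="store_true", help="reprocess files even if unchanged on disk")
    p.add_argument("--slow", action="store_true", help="sleep between files to reduce load")
    p.add_argument("--disable-events", dest="disable_events", action="store_true", help="skip feed/event generation")
    p.add_argument("--disable-indexing", dest="disable_indexing", action="store_true", help="skip full-text index updates")
    return p.parse_args()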