def compare_bills(b1, b2):
    # Debugging helper (Python 2 era): print a text-similarity comparison
    # between two Bill model instances.
    #
    # b1, b2: Bill objects; b1's text is prepared as the comparison base and
    # b2's text is compared against it.
    # Raises TypeError/KeyError if get_bill_text_metadata returns None or the
    # metadata has no 'xml_file' entry — callers are expected to pass bills
    # with XML text available.
    from bill.billtext import get_bill_text_metadata
    fn1 = get_bill_text_metadata(b1, None)['xml_file']
    fn2 = get_bill_text_metadata(b2, None)['xml_file']
    text1 = extract_text(fn1)
    # prepare_text1 builds reusable comparison state from the base text so the
    # expensive preparation is not repeated per comparison.
    state = prepare_text1(text1)
    print b1.id, unicode(b1).encode("utf8")
    print b2.id, unicode(b2).encode("utf8")
    ratio1, ratio2, text = compare_text(extract_text(fn2), *state)
    # ratio1/ratio2 are the two directional similarity ratios; their product
    # is printed as a combined score.
    print ratio1, ratio2, ratio1 * ratio2
    # Show only a preview of the matched text.
    print(text[:1000].encode("utf8"))
    print
def compare_bills(b1, b2):
    """Print a text-similarity comparison between two bills (debugging aid).

    b1's XML text is prepared as the comparison base; b2's text is compared
    against it. Prints the bill ids/titles, the two directional similarity
    ratios and their product, and a 1000-character preview of the matched
    text.

    Raises TypeError/KeyError when a bill has no text metadata or no
    'xml_file' entry.
    """
    # Fixed: this block used Python 2 `print` statements and `unicode()`,
    # which are invalid under Python 3; the file's sibling variant of this
    # function is already Python 3, so this one is modernized to match.
    from bill.billtext import get_bill_text_metadata
    fn1 = get_bill_text_metadata(b1, None)['xml_file']
    fn2 = get_bill_text_metadata(b2, None)['xml_file']
    text1 = extract_text(fn1)
    # Build reusable comparison state from the base text once.
    state = prepare_text1(text1)
    print(b1.id, str(b1))
    print(b2.id, str(b2))
    ratio1, ratio2, text = compare_text(extract_text(fn2), *state)
    print(ratio1, ratio2, ratio1 * ratio2)
    # Preview only — matched text can be very long.
    print(text[:1000])
    print()
def compare_bills(b1, b2):
    """Compare the full text of two bills and print similarity diagnostics.

    Prepares b1's text as the comparison base, runs the comparison against
    b2's text, and prints the ids/titles, ratios, matched text, and the
    is_text_incorporated verdict.
    """
    from bill.billtext import get_bill_text_metadata
    meta1 = get_bill_text_metadata(b1, None)
    meta2 = get_bill_text_metadata(b2, None)
    # Expensive preparation happens once on the base bill's text.
    base_state = prepare_text1(extract_text(meta1['xml_file']))
    print(b1.id, str(b1))
    print(b2.id, str(b2))
    ratio1, ratio2, text = compare_text(extract_text(meta2['xml_file']), *base_state)
    print(ratio1, ratio2, len(text))
    print(text)
    print(is_text_incorporated(ratio1, ratio2, len(text)))
    print()
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            # KeyError: term not seen before — create it and reset subterms.
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        # NOTE(review): this logs `term`, not `subterm` — the
                        # parallel "old terms" loop above logs `subterm`; looks
                        # like a copy/paste slip, but message text is unchanged.
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    # Any cached term not seen in the XML files is removed from the database.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/*/bills/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress) > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.match(
                re.escape(settings.CONGRESS_DATA_PATH) +
                r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml',
                fname)
            try:
                b = Bill.objects.get(congress=int(m.group("congress")),
                                     bill_type=BillType.by_slug(m.group("bill_type")),
                                     number=m.group("number"))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    # Warn only for modern congresses where text is expected,
                    # and only after a two-week grace period.
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index)  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print(fname)
                raise
            seen_bill_ids.append(bill.id)  # don't delete me later

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED":
                    continue  # we don't track this state
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                ))

            # Slip-law numbering, e.g. "112-34" -> public/private + number.
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)

    # The rest is for current only...
    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
def main(options):
    """
    Process bill terms and bills
    """
    # NOTE(review): Python 2 era variant of main() — uses `print` statements,
    # `unicode()`, and the old data/us directory layout.

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            # KeyError: new term — create it and reset its subterm links.
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    # Remove terms no longer present in any of the XML files.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    # Congresses <= 42 come from the unitedstates/congress scraper layout;
    # later congresses use the legacy data/us layout.
    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        # NOTE(review): `options.congress>42` compares a string to an int here
        # (other variants use int(options.congress)) — TODO confirm intended.
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    # Warn only where text is expected (modern congresses,
                    # after a two-week grace period).
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index)  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
            seen_bill_ids.append(bill.id)  # don't delete me later

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append(
                    (
                        repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                        BillStatus.by_xml_code(axn.xpath("string(@state)")),
                        axn.xpath("string(text)"),
                        etree.tostring(axn),
                    )
                )

            # Slip-law numbering, e.g. "112-34" -> public/private + number.
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
existing_comps = set() if os.path.exists(csv_fn): for row in csv.reader(open(csv_fn)): timestamp, b1_id, b1_versioncode, b1_ratio, b2_id, b2_versioncode, b2_ratio, cmp_text_len, cmp_text \ = row existing_comps.add( ((b1_id, b1_versioncode), (b2_id, b2_versioncode))) writer.writerow(row) # For each enacted bill.. for b1 in tqdm(enacted_bills): # Load the enacted bill's text. # Loads current metadata and text for the bill. try: md1 = get_bill_text_metadata(b1, None) text1 = extract_text(md1['xml_file']) except TypeError: # no bill text at all (accessing [...] of None) continue except KeyError: # no xml_file continue except ValueError: # xml is bad continue state = prepare_text1(text1) # Use Solr's More Like This query to get a preliminary list of # bills textually similar to each enacted bill, which lets us # cut down on the number of comparisons that we need to run # by a factor of around 100. Pull between 10 and 50 similar # bills -- depending on how large of a bill the enacted bill
csv_fn = "data/us/%d/text_comparison.csv" % congress existing_comps = set() if os.path.exists(csv_fn): for row in csv.reader(open(csv_fn)): timestamp, b1_id, b1_versioncode, b1_ratio, b2_id, b2_versioncode, b2_ratio, cmp_text_len, cmp_text \ = row existing_comps.add( ((b1_id, b1_versioncode), (b2_id, b2_versioncode) ) ) writer.writerow(row) # For each enacted bill.. for b1 in tqdm.tqdm(enacted_bills): # Load the enacted bill's text. # Loads current metadata and text for the bill. try: md1 = get_bill_text_metadata(b1, None) text1 = extract_text(md1['xml_file']) except KeyError: # no xml_file continue except ValueError: # xml is bad continue state = prepare_text1(text1) # Use Solr's More Like This query to get a preliminary list of # bills textually similar to each enacted bill, which lets us # cut down on the number of comparisons that we need to run # by a factor of around 100. Pull between 10 and 50 similar # bills -- depending on how large of a bill the enacted bill # is. An authorization bill can have lots of bills incorporated # into it, but a short bill could not have very many.
def main(options):
    """
    Process bill terms and bills
    """
    # NOTE(review): Python 2 era variant of main() that also inlines the
    # docs.house.gov and Senate floor-schedule scraping at the end (the newer
    # variants factor those into load_docs_house_gov / load_senate_floor_schedule).

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    # Remove terms no longer present in any of the XML files.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    # Congresses <= 42 come from the unitedstates/congress scraper layout.
    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        # NOTE(review): `options.congress > 42` compares a string to an int
        # here (other variants use int(options.congress)) — TODO confirm.
        if (not options.congress or options.congress > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
            seen_bill_ids.append(bill.id)  # don't delete me later

            # Tag the provenance of the bill data by era of congress.
            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                # Statutes-at-Large data can't distinguish how a bill was
                # enacted, so downgrade enacted_signed to enacted_unknown.
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ))

            # Slip-law numbering, e.g. "112-34" -> public/private + number.
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        # Build a regex fragment that matches a bill-type label with optional
        # dots/whitespace (e.g. "H.R." or "HR").
        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(
                urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?(" +
                "|".join(bt_re(bt) for bt in BillType) +
                r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS,
                                                    bill_type=bt[0],
                                                    number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(
                                item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    # for-else: no bill type matched the captured label.
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(
                r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            # Only re-stamp the postdate if it is unset or stale (>7 days).
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        # Best-effort scrape: page layout changes shouldn't kill the run.
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
def main(options):
    """
    Process bill terms and bills
    """
    # NOTE(review): oldest Python 2 variant of main() in this file — data/us
    # layout only, `plain_text_file` metadata key, a `skip_stuff` debugging
    # switch, and inlined docs.house.gov / Senate schedule scraping.

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    # Remove terms no longer present in any of the XML files.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # Skip unchanged metadata files, but still check the text file so
        # text-is-available events and full-text indexing stay current.
        if not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    # Two-week grace period before warning about missing text.
                    if b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["plain_text_file"]
                if (bill_index and not options.disable_events) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        # Debugging switch: when True, skip XML processing and file-change
        # bookkeeping and just reload the Bill from the database.
        skip_stuff = False

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    print fname
                    raise
            else:
                m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
                bill = Bill.objects.get(congress=m.group(1),
                                        bill_type=BillType.by_xml_code(m.group(2)),
                                        number=m.group(3))

            seen_bill_ids.append(bill.id)  # don't delete me later

            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                actions.append(
                    (repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                     BillStatus.by_xml_code(axn.xpath("string(@state)")),
                     axn.xpath("string(text)"))
                )
                # Slip-law numbering comes off the enactment action itself,
                # e.g. number="112-34" -> public/private + 34.
                if actions[-1][1] in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids).delete()

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        # Regex fragment matching a bill-type label with optional dots/spaces.
        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", "\.?\s*")

        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = re.match("\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?(" +
                         "|".join(bt_re(bt) for bt in BillType) +
                         ")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1)):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS,
                                                    bill_type=bt[0],
                                                    number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    # for-else: no bill type matched the captured label.
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % billname)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            # Only re-stamp the postdate if it is unset or stale (>7 days).
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        # Best-effort scrape: page layout changes shouldn't kill the run.
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
if os.path.exists(csv_fn): for row in csv.reader(open(csv_fn)): timestamp, b1_id, b1_versioncode, b1_ratio, b2_id, b2_versioncode, b2_ratio, cmp_text_len, cmp_text \ = row existing_comps.add( ((b1_id, b1_versioncode), (b2_id, b2_versioncode))) writer.writerow(row) # For each enacted bill, find the set of bills to compare it to. comps = [] for b1 in tqdm(enacted_bills, desc="Finding comparison pairs"): # Load the enacted bill's text. # Loads current metadata for the bill. try: md1 = get_bill_text_metadata(b1, None) if 'xml_file' not in md1: continue # no xml text text1 = extract_text(md1['xml_file']) except TypeError: # no bill text at all (accessing [...] of None) continue except ValueError: # xml is bad continue # Use Solr's More Like This query to get a preliminary list of # bills textually similar to each enacted bill, which lets us # cut down on the number of comparisons that we need to run # by a factor of around 100. Pull between 10 and 300 similar # bills -- depending on how large of a bill the enacted bill # is. An authorization bill can have lots of bills incorporated # into it, but a short bill could not have very many. from haystack.query import SearchQuerySet