def tweet_a_bill_action(self):
		# Tweet an interesting action on a bill.
		from bill.models import Bill, BillStatus
		from bill.status import get_bill_really_short_status_string
		bills = list(Bill.objects.filter(
			current_status_date__gte=timezone.now().date()-timedelta(days=1),
			current_status_date__lt=timezone.now().date(),
		).exclude(
			current_status=BillStatus.introduced,
		))
		if len(bills) == 0: return

		# Choose bill with the most salient status, breaking ties with the highest proscore.
		bills.sort(key = lambda b : (BillStatus.by_value(b.current_status).sort_order, b.proscore()), reverse=True)
		for bill in bills:
			status = BillStatus.by_value(bill.current_status).xml_code
			if "Providing for consideration" in bill.title: continue
			text = get_bill_really_short_status_string(status)
			if text == "": continue
			bill_number = bill.display_number
			if bill.sponsor and bill.sponsor.twitterid: bill_number += " by @" + bill.sponsor.twitterid
			text = text % (bill_number, "yesterday")
			text += " " + bill.title_no_number
			self.post_tweet(
				bill.current_status_date.isoformat() + ":bill:%s:status:%s" % (bill.congressproject_id, status),
				text,
				"https://www.govtrack.us" + bill.get_absolute_url())
def bill_search_manager():
    sm = SearchManager(Bill, connection="bill")
    
    sm.add_option('similar_to', type="text", label="similar to (enter bill number)", visible_if=lambda form : False, filter=similar_to)
    sm.add_option('usc_cite', type="text", label="cites", visible_if=lambda form : False, orm_field_name='usc_citations_uptree', filter=usc_cite)
    
    sm.add_option('text', label='search title & full text', type="text", choices="NONE")
    sm.add_option('congress', type="select", formatter=format_congress_number, sort="KEY-REVERSE")
    sm.add_option('sponsor', type="select", sort="LABEL", formatter=lambda p : p.sortname)
    sm.add_option('current_status', label="current status", sort=lambda s : BillStatus.by_value(s).sort_order)
    sm.add_option('cosponsors', label="cosponsor", type="select", sort="LABEL", formatter=lambda p : p.sortname)
    sm.add_option('committees', label="committee", type="select", sort="LABEL", formatter=lambda c : c.shortname)
    sm.add_option('terms', type="select", label="subject", choices=get_terms(BillTerm.objects.exclude(parents__id__gt=0)))
    sm.add_option('terms2', type="select", label="subject 2", choices=sub_terms, visible_if=lambda post:"terms" in post, filter=sub_term_filter)
    sm.add_option('sponsor_party', label="party of sponsor", type="select")
    sm.add_option('bill_type', label="bill or resolution type")
    
    #sm.add_sort("Popularity", "-total_bets", default=True)
    sm.add_sort("Secret Sauce", "-proscore", default=True)
    sm.add_sort("Introduced Date (Newest First)", "-introduced_date")
    sm.add_sort("Introduced Date (Oldest First)", "introduced_date")
    sm.add_sort("Last Major Action (Recent First)", "-current_status_date")

    #def safe_strftime(date, format):
    #    return date.replace(year=3456).strftime(format).replace("3456", str(date.year)).replace(" 12:00AM", "")
    
    sm.set_template("""
    	<a href="{{object.get_absolute_url}}" style="font-size: 15px">{{object|truncatewords_html:50}}</a>
    	{% if object.sponsor %}<div>Sponsor: {{object.sponsor}}</div>{% endif %}
    	{% if object.source != "statutesatlarge" %}<div>Introduced: {{object.introduced_date}}</div>{% endif %}
    	{% if object.source != "americanmemory" %}<div>{% if object.source != "statutesatlarge" %}{{object.get_current_status_display}}{% else %}Enacted/Agreed to{% endif %}: {{object.current_status_date}}</div>{% endif %}
	""")
    
    return sm
Example 3
def bill_search_manager():
    sm = SearchManager(Bill, connection="bill", bulk_loader=bill_bulk_loader)
    
    sm.add_option('similar_to', type="text", label="similar to (enter bill number)", visible_if=lambda form : False, filter=similar_to)
    sm.add_option('usc_cite', type="text", label="cites", visible_if=lambda form : False, orm_field_name='usc_citations_uptree', filter=usc_cite)
    
    sm.add_option('text', label='search title & full text', type="text", choices="NONE")
    sm.add_option('congress', type="select", formatter=format_congress_number, sort="KEY-REVERSE")
    sm.add_option('sponsor', type="select", sort="LABEL", formatter=lambda p : p.sortname)
    sm.add_option('current_status', label="current status", sort=lambda s : BillStatus.by_value(s).sort_order)
    sm.add_option('enacted_ex', type="boolean", label="Enacted \u2014 Including by Incorporation into Other Bills")
    sm.add_option('cosponsors', label="cosponsor", type="select", sort="LABEL", formatter=lambda p : p.sortname)
    sm.add_option('committees', label="committee", type="select", sort="LABEL", formatter=lambda c : c.shortname)
    sm.add_option('terms', type="select", label="subject", choices=get_terms(BillTerm.objects.exclude(parents__id__gt=0)))
    sm.add_option('terms2', type="select", label="subject 2", choices=sub_terms, visible_if=lambda post:"terms" in post, filter=sub_term_filter)
    sm.add_option('sponsor_party', label="party of sponsor", type="select")
    sm.add_option('bill_type', label="bill or resolution type")
    
    #sm.add_sort("Popularity", "-total_bets", default=True)
    # default sort order is handled by the view
    sm.add_sort("Relevance of Title/Text", "relevance", func=lambda x : x) # no-op to use Solr default
    sm.add_sort("Secret Sauce", "-proscore")
    sm.add_sort("Introduced Date (Newest First)", "-introduced_date")
    sm.add_sort("Introduced Date (Oldest First)", "introduced_date")
    sm.add_sort("Last Major Action (Recent First)", "-current_status_date")
    sm.add_sort("Cosponsors (Most First)", "-cosponsor_count")
    sm.add_sort("Cosponsors (Fewest First)", "cosponsor_count")

    #def safe_strftime(date, format):
    #    return date.replace(year=3456).strftime(format).replace("3456", str(date.year)).replace(" 12:00AM", "")
    
    sm.set_template("""
	<div class="row">
		<div class="col-xs-2 col-md-1" style="padding-right: 0">
			<img src="{{object.get_absolute_url}}/thumbnail?aspect=1.2&width=125" class="img-responsive"/>
		</div>
		<div class="col-xs-10 col-md-11">
    	<div style="margin-bottom: 3px"><a href="{{object.get_absolute_url}}" style="font-size: 15px; line-height: 125%;">{{object|truncatewords_html:50}}</a></div>
		<div style="font-size: 90%">
    	{% if object.sponsor %}<div style="margin-bottom: 3px">Sponsor: {{object.sponsor_name}}</div>{% endif %}
		<table width="100%"><tr valign="top">
    	{% if object.source != "statutesatlarge" %}<td width="25%" style="padding-right: 1.5em">Introduced<br>{{object.introduced_date}}</td>{% else %}<td/>{% endif %}
    	{% if object.source != "americanmemory" and object.get_current_status_display_simple != "Introduced" %}<td width="25%" style="padding-right: 1.5em">{% if object.source != "statutesatlarge" %}{{object.get_current_status_display_simple}}{% else %}Enacted/Agreed to{% endif %}<br>{{object.current_status_date}}</td>{% else %}<td/>{% endif %}
		{% if 1 %}<td width="25%" style="padding-right: 1.5em">Cosponsors<br>{{object.cosponsor_counts_summary}}</td>{% else %}<td/>{% endif %}
		{% if object.is_alive and object.get_prognosis %}<td width="25%" style="padding-right: 1.5em">Prognosis<br>{{object.get_prognosis.prediction|floatformat:0}}%</td>{% else %}<td/>{% endif %}
		</tr></table>
        {% with b_list=object.was_enacted_ex %}
        {% for b in b_list %}
            {% if b and b != object %}
                <div>Enacted via <a href="{{b.get_absolute_url}}" style="text-decoration: none">{{b.title}}</a></div>
            {% endif %}
        {% endfor %}
		</div>
		</div>
	</div>
        {% endwith %}
	""")
    
    return sm
Example 4
 def process_current_status(self, obj, node):
     elem = node.xpath('./state')[0]
     obj.current_status_date = self.parse_datetime(elem.get('datetime'))
     obj.current_status = BillStatus.by_xml_code(elem.text)
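# --- Illustrative sketch (not part of the original) ---
# A hypothetical <state> fragment of the kind this handler consumes: the element's
# datetime attribute becomes current_status_date and its text is mapped through
# BillStatus.by_xml_code().
from lxml import etree
demo_node = etree.fromstring('<bill><state datetime="2013-01-03">INTRODUCED</state></bill>')
demo_elem = demo_node.xpath('./state')[0]
print(demo_elem.get('datetime') + " " + demo_elem.text)   # -> 2013-01-03 INTRODUCED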
Example 5
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        if not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = "data/us/bills.text/%s/%s/%s%s.txt" % (m.group(1), m.group(2), m.group(2), m.group(3))
                if (bill_index and not options.disable_events) and os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill") # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        skip_stuff = False
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    print fname
                    raise
            else:
                m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
                bill = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))), BillStatus.by_xml_code(axn.xpath("string(@state)")), axn.xpath("string(text)")) )
                
                if actions[-1][1] in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index: bill_index.update_object(bill, using="bill")
            
            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids).delete()
        
    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt): return re.escape(bt[1]).replace(r"\.", r"\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = re.match("\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + ")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index: bill_index.update_object(bill, using="bill")
                            
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % billname)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
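# --- Illustrative sketch (not part of the original) ---
# How the legis-num matching above works, with made-up inputs. bt_re() loosens a
# bill-type abbreviation (taken from index 1 of each BillType entry in the code
# above) so that periods and trailing spaces become optional, and the combined
# pattern then splits the label into a type prefix and a number.
import re
def bt_re_demo(abbrev): return re.escape(abbrev).replace(r"\.", r"\.?\s*")
demo_abbrevs = ["H.R. ", "S. "]   # hypothetical abbreviations for the demo
demo_pattern = r"\s*(" + "|".join(bt_re_demo(a) for a in demo_abbrevs) + r")(\d+)\s*$"
demo_match = re.match(demo_pattern, "H.R. 1234", re.I)
print(demo_match.group(1) + "|" + demo_match.group(2))   # -> H.R. |1234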
Example 6
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                    ) )
                
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()
                
        File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b
        
    # The rest is for current only...
    
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return
        
    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
Example 7
 def process_current_status(self, obj, node):
     elem = node.xpath('./state')[0]
     status = elem.text
     if status == "REFERRED": status = "INTRODUCED"
     obj.current_status_date = self.parse_datetime(elem.get('datetime'))
     obj.current_status = BillStatus.by_xml_code(status)
#!script

from django.db.models import Count
import sys, csv
from bill.models import Cosponsor, Bill, BillStatus

bills_most_cosponsored = \
	Cosponsor.objects \
	 .filter(bill__congress=114) \
	 .filter(withdrawn=None) \
	 .values("bill") \
	 .annotate(count=Count('id')) \
	 .order_by("-count") #\
	#[0:200]

w = csv.writer(sys.stdout)
for bill_and_count in bills_most_cosponsored:
	bill = Bill.objects.get(id=bill_and_count["bill"])
	w.writerow([
		bill_and_count["count"],
		str(bill).encode("utf8"),
		BillStatus.by_value(bill.current_status).label.encode("utf8"),
		"https://www.govtrack.us" + bill.get_absolute_url(),
	])
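# --- Usage note (not part of the original) ---
# The commented-out "[0:200]" slice above would cap the report at the 200 most
# cosponsored bills; applied to the queryset it would look roughly like this:
top_200 = bills_most_cosponsored[0:200]   # hypothetical; still a lazy Django queryset slice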
Example 9
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info("Processing old bill terms")
    TERMS_FILE = "data/us/liv.xml"
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath("/liv/top-term"):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath("./term"):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error("Duplicated term %s" % term_processor.display_node(subnode))

    log.info("Processing new bill terms")
    for FILE in ("data/us/liv111.xml", "data/us/crsnet.xml"):
        tree = etree.parse(FILE)
        for node in tree.xpath("/liv/top-term"):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath("./term"):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error("Duplicated term %s" % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex

        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob("data/congress/%s/bills/*/*/*.xml" % options.congress)
        log.info("Parsing unitedstates/congress bills of only congress#%s" % options.congress)
    elif options.congress:
        files = glob.glob("data/us/%s/bills/*.xml" % options.congress)
        log.info("Parsing bills of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/bills/*.xml")

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info("Processing bills: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (
            (not options.congress or options.congress > 42)
            and (bill_index and not options.disable_events)
            and not File.objects.is_changed(fname)
            and not options.force
        ):
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath("/bill"):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append(
                    (
                        repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                        BillStatus.by_xml_code(axn.xpath("string(@state)")),
                        axn.xpath("string(text)"),
                        etree.tostring(axn),
                    )
                )

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601

    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error("No docs.house.gov download link found at http://docs.house.gov.")
    else:

        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + r")(\d+)\s*(\[Conference Report\]\s*)?$",
                billname,
                re.I,
            )
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(
            r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs
        ):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                days=7
            ):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error("Could not parse Senate Floor Schedule: " + repr(e))
Example 10
 def process_current_status(self, obj, node):
     elem = node.xpath('./state')[0]
     obj.current_status_date = self.parse_datetime(elem.get('datetime'))
     obj.current_status = BillStatus.by_xml_code(elem.text)
Example 11
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed: bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
            actions.append( (
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                    ) )
                
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()
                
        File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b
        
    # The rest is for current only...
    
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return
        
    # Load docs.house.gov data for what might be coming up this week.
    load_docs_house_gov(options, bill_index)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/legislative/schedule/floor_schedule.htm").read()
    try:
        sfs = re.search(r"([\w\W]*)<i>Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill.update_index(bill_index)
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
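# --- Illustrative sketch (not part of the original) ---
# The Senate Floor Schedule scrape above keys off hdl.loc.gov handle URLs, whose path
# encodes the congress number, bill-type slug, and bill number. A standalone demo
# with a made-up page snippet:
import re
sfs_demo = 'see <a href="http://hdl.loc.gov/loc.uscongress/legislation.114hr2029">H.R. 2029</a>'
for demo_congress, demo_type, demo_number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs_demo):
    print(demo_congress + " " + demo_type + " " + demo_number)   # -> 114 hr 2029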