def test_vote_event_bill_clearing():
    """Re-importing a corrected vote event must replace, not duplicate, it.

    Vote events must not pile up on a bill forever just because a later
    scrape (e.g. with a fixed typo in the motion text) looks like a
    brand-new event.
    """
    jurisdiction = create_jurisdiction()
    session = jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(
        id='org-id',
        name='House',
        classification='lower',
        jurisdiction=jurisdiction,
    )
    target_bill = Bill.objects.create(
        id='bill-1',
        identifier='HB 1',
        legislative_session=session,
        from_organization=house,
    )
    # A second, unrelated bill that the import should leave untouched.
    Bill.objects.create(
        id='bill-2',
        identifier='HB 2',
        legislative_session=session,
        from_organization=house,
    )

    org_importer = OrganizationImporter('jid')
    mock_importer = DumbMockImporter()
    bill_importer = BillImporter('jid', mock_importer, org_importer)

    event_with_typo = ScrapeVoteEvent(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        motion_text='a vote on somthing',  # typo intentional
        bill=target_bill.identifier,
        bill_chamber='lower',
        chamber='lower',
    )
    other_event = ScrapeVoteEvent(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        motion_text='a vote on something else',
        bill=target_bill.identifier,
        bill_chamber='lower',
        chamber='lower',
    )

    # have to use import_data so postimport is called
    payload = [event_with_typo.as_dict(), other_event.as_dict()]
    VoteEventImporter('jid', mock_importer, org_importer, bill_importer).import_data(payload)
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 vote events now
    event_with_typo.motion_text = 'a vote on something'
    payload = [event_with_typo.as_dict(), other_event.as_dict()]
    VoteEventImporter('jid', mock_importer, org_importer, bill_importer).import_data(payload)
    assert VoteEvent.objects.count() == 2
def test_vote_event_bill_clearing():
    """Importing twice after a motion-text fix keeps the vote-event count stable.

    Guards against stale vote events lingering on a bill: an edited scrape
    must not look like an additional, third event.
    """
    juris = create_jurisdiction()
    sess = juris.legislative_sessions.create(name='1900', identifier='1900')
    lower_house = Organization.objects.create(
        id='org-id', name='House', classification='lower', jurisdiction=juris
    )
    hb1 = Bill.objects.create(
        id='bill-1', identifier='HB 1', legislative_session=sess, from_organization=lower_house
    )
    # Extra bill present in the DB; it takes no part in the vote import.
    Bill.objects.create(
        id='bill-2', identifier='HB 2', legislative_session=sess, from_organization=lower_house
    )

    orgs = OrganizationImporter('jid')
    dumb = DumbMockImporter()
    bills = BillImporter('jid', dumb, orgs)

    shared_kwargs = dict(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        bill=hb1.identifier,
        bill_chamber='lower',
        chamber='lower',
    )
    first = ScrapeVoteEvent(
        motion_text='a vote on somthing',  # typo intentional
        **shared_kwargs,
    )
    second = ScrapeVoteEvent(
        motion_text='a vote on something else',
        **shared_kwargs,
    )

    def run_import():
        # have to use import_data so postimport is called
        VoteEventImporter('jid', dumb, orgs, bills).import_data(
            [first.as_dict(), second.as_dict()]
        )

    run_import()
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 vote events now
    first.motion_text = 'a vote on something'
    run_import()
    assert VoteEvent.objects.count() == 2
def handle_page(self):
    """Parse a Virginia bill-detail page: abstract, versions, amendments,
    actions, and vote events.

    Generator method: yields vote events (via ``add_pupa_id``) as they are
    found; all other extracted data is attached directly to ``self.obj``
    (the bill being scraped). Assumes ``self.doc`` is a parsed HTML document
    exposing ``xpath()`` (lxml-style) — confirm against the page class setup.
    """
    # --- abstract -----------------------------------------------------
    summary = self.doc.xpath(
        "/".join(
            [
                '//h4[starts-with(text(), "SUMMARY")]',
                "/following-sibling::p",
                "text()",
            ]
        )
    )
    if summary and summary[0].strip():
        self.obj.add_abstract(abstract=summary[0].strip(), note="summary")

    # --- versions -----------------------------------------------------
    for va in self.doc.xpath(
        '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'
    ):
        # Link text looks like:
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(u" \xa0")
        # NOTE(review): the result of this rsplit is discarded, so `desc`
        # is NOT actually trimmed despite the comment — the intent was
        # presumably `desc = desc.rsplit(" ", 1)[0]`; confirm before fixing.
        desc.rsplit(" ", 1)[0]  # chop off last part
        link = va.get("href")
        if "http" not in link:
            # Relative URL: make it absolute against the site base.
            link = "{}{}".format(BASE_URL, link)
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()
        # budget bills in VA are searchable but no full text available
        if "+men+" in link:
            logging.getLogger("va").warning(
                "not adding budget version, bill text not available"
            )
        else:
            # VA duplicates reprinted bills, lets keep the original name
            self.obj.add_version_link(
                desc, link, date=date, media_type="text/html", on_duplicate="ignore"
            )

    # --- amendments ---------------------------------------------------
    # Only adopted/engrossed amendments are kept; explicit "not adopted" /
    # "not engrossed" texts are excluded.
    for va in self.doc.xpath(
        '//h4[text()="AMENDMENTS"]/following-sibling::ul[1]/li/a[1]'
    ):
        version_name = va.xpath("string(.)")
        if (
            (
                "adopted" in version_name.lower()
                or "engrossed" in version_name.lower()
            )
            and "not adopted" not in version_name.lower()
            and "not engrossed" not in version_name.lower()
        ):
            version_url = va.xpath("@href")[0]
            self.obj.add_version_link(
                version_name,
                version_url,
                media_type="text/html",
                on_duplicate="ignore",
            )

    # --- actions & votes ------------------------------------------------
    # History entries are walked pairwise (current, next) because VA often
    # publishes one vote as two consecutive history lines; when the next
    # line is merged into the current vote, `seen_next` skips it.
    seen_next = False
    for ali, next_ali in pairwise(
        self.doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/li')
    ):
        # If we've used this action text before, we don't need to parse it again
        if seen_next:
            seen_next = False
            continue
        date, action = ali.text_content().split(u" \xa0")
        try:
            actor, action = action.split(": ", 1)
        except ValueError:
            # No "Actor: " prefix with a trailing space — tolerated only
            # when the text still starts with a known actor (e.g. an
            # empty action after the colon); anything else is a hard fail.
            assert any(
                [action.startswith("{}:".format(x)) for x in self.actor_map.keys()]
            ), "Unparseable action text found: '{}'".format(action)
            logging.getLogger("va").warning(
                "Skipping apparently-null action: '{}'".format(action)
            )
            continue

        # Bill history entries purely in parentheses tend to be
        # notes and not actions, so we'll skip them.
        if action.startswith("(") and action.endswith(")"):
            continue

        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(), "%m/%d/%y").date()

        # if action ends in (##-Y ##-N) remove that part
        vrematch = self.vote_strip_re.match(action)
        # The following conditional logic is messy to handle
        # Virginia's crazy and inconsistently formatted bill
        # histories. Someone less harried and tired than me
        # could probably make this much cleaner. - alo
        if vrematch:
            vote_action, y, n, o = vrematch.groups()
            y = int(y)
            n = int(n)
            # Set default count for "other" votes to 0. We have to
            # do this explicitly as it's excluded from the action
            # text when there were no abstentions (the only type of
            # "other" vote encountered thus far).
            o = int(o) if o else 0
            vote_url = ali.xpath("a/@href")

            # Finds relevant information from the current action if
            # vote count encountered, then searches for the presence
            # of identical counts in the next entry (we assume that
            # it's probably there). If matching votes are found, it
            # merges data in both to create a unified vote record.
            #
            # This is because Virginia usually publishes two lines
            # of history data for a single vote, without guaranteed
            # order, so we unsafely attempt to match on identical
            # vote counts in the next line.
            vote = VoteEvent(
                start_date=date,
                chamber=actor,
                motion_text=vote_action.strip(),
                result="pass" if y > n else "fail",
                classification="passage",
                bill=self.obj,
            )
            vote.set_count("yes", y)
            vote.set_count("no", n)
            vote.set_count("other", o)

            try:
                # Same "date \xa0Actor: action" shape as the current line.
                next_action = (
                    next_ali.text_content().split(" \xa0")[1].split(": ", 1)[1]
                )
            except ValueError:
                next_action = ""
            vrematch_next = self.vote_strip_re.match(next_action)
            if vrematch_next:
                vote_action_next, y_next, n_next, o_next = vrematch_next.groups()
                y_next = int(y_next)
                n_next = int(n_next)
                o_next = int(o_next) if o_next else 0
                vote_url_next = next_ali.xpath("a/@href")
                # Check that the vote counts match and that only one action
                # has a URL (otherwise, they're probably different votes).
                if [y_next, n_next, o_next] == [y, n, o] and len(vote_url) != len(
                    vote_url_next
                ):
                    seen_next = True
                    if not vote_url:
                        # Take the URL from the companion line...
                        vote_url = vote_url_next
                    else:
                        # ...or the motion text/action, whichever we lacked.
                        vote.motion_text = vote_action_next.strip()
                        action = next_action

            if vote_url:
                # Scrape the roll-call detail page into this vote event.
                list(
                    self.scrape_page_items(VotePage, url=vote_url[0], obj=vote)
                )
                vote.add_source(vote_url[0])
            else:
                vote.add_source(self.url)

            # add_pupa_id presumably stamps/derives a stable ID before
            # yielding — confirm against its definition.
            yield from add_pupa_id(vote)

        # categorize actions
        for pattern, atype in ACTION_CLASSIFIERS:
            if re.match(pattern, action):
                break
        else:
            atype = None

        # if matched a 'None' atype, don't add the action
        if atype != SKIP:
            self.obj.add_action(action, date, chamber=actor, classification=atype)