def main(options): """ Process amendments """ if options.congress: files = glob.glob('data/us/%s/bills.amdt/*.xml' % options.congress) log.info('Parsing amendments of only congress#%s' % options.congress) else: files = glob.glob('data/us/*/bills.amdt/*.xml') if options.filter: files = [f for f in files if re.match(options.filter, f)] log.info('Processing amendments: %d files' % len(files)) total = len(files) progress = Progress(total=total, name='files', step=100) amendment_processor = AmendmentProcessor() seen_amdt_ids = [] for fname in files: progress.tick() if not File.objects.is_changed(fname) and not options.force: m = re.match(r"data/us/(\d+)/bills.amdt/([sh])(\d+).xml", fname) if not m: print "Invalid file name", fname else: amdt = Amendment.objects.get(congress=m.group(1), amendment_type=AmendmentType.by_slug(m.group(2)), number=m.group(3)) seen_amdt_ids.append(amdt.id) # don't delete me later continue tree = etree.parse(fname) node = tree.xpath('/amendment')[0] try: amdt = amendment_processor.process(Amendment(), node) except: print fname raise if not amdt: # Amendments to treaties. Can't process. continue # update if already in db try: amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id except Amendment.DoesNotExist: pass # a new amendment seen_amdt_ids.append(amdt.id) # don't delete me later try: amdt.save() except: print amdt raise # For House votes on amendments, the only way to associate the vote with the # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML # has an amendment-num field but its meaning is ambiguous, so it is useless. # When we parse a House amendment with an action line referencing a roll call vote, # save this amendment as that vote's related_amendment, then mark the vote as # 'missing data' (below) so that on the next parse of votes its title gets updated. 
if amdt.amendment_type == AmendmentType.house_amendment: for vote in node.xpath("actions/vote[@how='roll']"): v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote.get('datetime')).date()) v_roll = int(vote.get("roll")) try: vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll) vote.related_amendment = amdt vote.save() except Vote.DoesNotExist: print "Missing vote data in", fname # If this amendment is related to a vote, mark the vote as missing data because # we may need to update the vote title if the amendment title has changed. Vote.objects.filter(related_amendment=amdt).update(missing_data=True) File.objects.save_file(fname) # Are any amendments in the database no longer on disk? if options.congress and not options.filter: missing = Amendment.objects.filter(congress=options.congress).exclude(id__in = seen_amdt_ids) if missing.exists(): print "Amendments should be deleted: ", missing
def chamber_handler(self, value):
    """Return the AmendmentType that AmendmentType.by_slug gives for `value`."""
    amendment_type = AmendmentType.by_slug(value)
    return amendment_type
def main(options): """ Parse rolls. """ # Setup XML processors vote_processor = VoteProcessor() option_processor = VoteOptionProcessor() voter_processor = VoterProcessor() voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all()) # The pattern which the roll file matches # Filename contains info which should be placed to DB # along with info extracted from the XML file re_path = re.compile("data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml") chamber_mapping = {"s": CongressChamber.senate, "h": CongressChamber.house} if options.filter: files = glob.glob(options.filter) log.info("Parsing rolls matching %s" % options.filter) elif options.congress: files = glob.glob("data/us/%s/rolls/*.xml" % options.congress) log.info("Parsing rolls of only congress#%s" % options.congress) else: files = glob.glob("data/us/*/rolls/*.xml") log.info("Processing votes: %d files" % len(files)) total = len(files) progress = Progress(total=total, name="files", step=10) def log_delete_qs(qs): if qs.count() == 0: return print "Deleting obsoleted records: ", qs # if qs.count() > 3: # print "Delete skipped..." # return qs.delete() seen_obj_ids = set() had_error = False for fname in files: progress.tick() match = re_path.search(fname) try: existing_vote = Vote.objects.get( congress=match.group(1), chamber=chamber_mapping[match.group(2)], session=match.group(3), number=match.group(4), ) except Vote.DoesNotExist: existing_vote = None if ( not File.objects.is_changed(fname) and not options.force and existing_vote != None and not existing_vote.missing_data ): seen_obj_ids.add(existing_vote.id) continue try: tree = etree.parse(fname) ## Look for votes with VP tie breakers. 
# if len(tree.xpath("/roll/voter[@VP='1']")) == 0: # had_error = True # prevent delete at the end # continue # Process role object for roll_node in tree.xpath("/roll"): vote = vote_processor.process(Vote(), roll_node) if existing_vote: vote.id = existing_vote.id match = re_path.search(fname) vote.congress = int(match.group(1)) vote.chamber = chamber_mapping[match.group(2)] vote.session = match.group(3) vote.number = int(match.group(4)) # Get related bill & amendment. for bill_node in roll_node.xpath("bill"): related_bill_num = bill_node.get("number") if 9 <= vote.congress <= 42 and vote.session in ("1", "2"): # Bill numbering from the American Memory colletion is different. The number combines # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to # the 9th congress numbering seems to be wholly assigned by us and not related to # actual numbering, so we skip matching those bills. related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0) try: vote.related_bill = Bill.objects.get( congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=related_bill_num, ) except Bill.DoesNotExist: if vote.congress >= 93: vote.missing_data = True for amdt_node in roll_node.xpath("amendment"): if amdt_node.get("ref") == "regular" and vote.related_bill is not None: try: vote.related_amendment = Amendment.objects.get( congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]), number=amdt_node.get("number")[1:], ) except Amendment.DoesNotExist: if vote.congress >= 93: print "Missing amendment", fname vote.missing_data = True elif amdt_node.get("ref") == "bill-serial": # It is impossible to associate House votes with amendments just from the House # vote XML because the amendment-num might correspond either with the A___ number # or with the "An amendment, numbered ____" number from the amendment purpose, # and there's really no way to figure out 
which. Maybe we can use the amendment # sponsor instead? # vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number")) # Instead, we set related_amendment from the amendment parser. Here, we have to # preserve the related_amendment if it is set. if existing_vote: vote.related_amendment = existing_vote.related_amendment # clean up some question text and use the question_details field if ( vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill ): # For passage votes, set the question to the bill title and put the question # details in the details field. vote.question = truncatewords(vote.related_bill.title, 20) vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display() elif vote.category == VoteCategory.amendment and vote.related_amendment: # For votes on amendments, make a better title/explanation. vote.question = truncatewords(vote.related_amendment.title, 20) vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display() elif vote.related_bill and vote.question.startswith( "On the Cloture Motion " + vote.related_bill.display_number ): vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20) elif vote.related_bill and vote.question.startswith( "On Cloture on the Motion to Proceed " + vote.related_bill.display_number ): vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20) vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display() elif vote.related_bill and vote.question.startswith( "On the Motion to Proceed " + vote.related_bill.display_number ): vote.question = "Motion to Proceed on " + truncatewords(vote.related_bill.title, 20) elif vote.related_amendment and vote.question.startswith( "On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number) ): vote.question = "Cloture on 
" + truncatewords(vote.related_amendment.title, 20) vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display() # weird House foratting of bill numbers ("H RES 123 Blah blah") if vote.related_bill: vote.question = re.sub( "(On [^:]+): " + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper() + " .*", r"\1: " + truncatewords(vote.related_bill.title, 15), vote.question, ) vote.save() seen_obj_ids.add(vote.id) # don't delete me later # Process roll options, overwrite existing options where possible. seen_option_ids = set() roll_options = {} for option_node in roll_node.xpath("./option"): option = option_processor.process(VoteOption(), option_node) option.vote = vote if existing_vote: try: option.id = VoteOption.objects.filter(vote=vote, key=option.key)[ 0 ].id # get is better, but I had the database corruption problem except IndexError: pass option.save() roll_options[option.key] = option seen_option_ids.add(option.id) log_delete_qs( VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids) ) # may cascade and delete the Voters too? # Process roll voters, overwriting existing voters where possible. if existing_vote: existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id")) seen_voter_ids = set() voters = list() for voter_node in roll_node.xpath("./voter"): voter = voter_processor.process(roll_options, Voter(), voter_node) voter.vote = vote voter.created = vote.created # for VP votes, load the actual person... if voter.voter_type == VoterType.vice_president: try: r = PersonRole.objects.get( role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created ) voter.person_role = r voter.person = r.person except: # overlapping roles? missing data? 
log.error("Could not resolve vice president in %s" % fname, exc_info=ex) if existing_vote and voter.person: try: voter.id = existing_voters[voter.person.id] except KeyError: pass voters.append(voter) if voter.voter_type == VoterType.unknown and not vote.missing_data: vote.missing_data = True vote.save() # pre-fetch the role of each voter load_roles_at_date([x.person for x in voters if x.person != None], vote.created) for voter in voters: voter.person_role = voter.person.role if voter.person_role is None: log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created)) vote.missing_data = True vote.save() # save all of the records (inserting/updating) for voter in voters: voter.save() seen_voter_ids.add(voter.id) # remove obsolete voter records log_delete_qs( Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids) ) # possibly already deleted by cascade above # pre-calculate totals vote.calculate_totals() if not options.disable_events: vote.create_event() File.objects.save_file(fname) except Exception, ex: log.error("Error in processing %s" % fname, exc_info=ex) had_error = True
def main(options): """ Parse rolls. """ # Setup XML processors vote_processor = VoteProcessor() option_processor = VoteOptionProcessor() voter_processor = VoterProcessor() voter_processor.PERSON_CACHE = dict( (x.pk, x) for x in Person.objects.all()) # The pattern which the roll file matches # Filename contains info which should be placed to DB # along with info extracted from the XML file re_path = re.compile('data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml') chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house} if options.filter: files = glob.glob(options.filter) log.info('Parsing rolls matching %s' % options.filter) elif options.congress: files = glob.glob('data/us/%s/rolls/*.xml' % options.congress) log.info('Parsing rolls of only congress#%s' % options.congress) else: files = glob.glob('data/us/*/rolls/*.xml') log.info('Processing votes: %d files' % len(files)) total = len(files) progress = Progress(total=total, name='files', step=10) def log_delete_qs(qs): if qs.count() == 0: return print "Deleting obsoleted records: ", qs #if qs.count() > 3: # print "Delete skipped..." # return qs.delete() seen_obj_ids = set() had_error = False for fname in files: progress.tick() match = re_path.search(fname) try: existing_vote = Vote.objects.get( congress=match.group(1), chamber=chamber_mapping[match.group(2)], session=match.group(3), number=match.group(4)) except Vote.DoesNotExist: existing_vote = None if not File.objects.is_changed( fname ) and not options.force and existing_vote != None and not existing_vote.missing_data: seen_obj_ids.add(existing_vote.id) continue try: tree = etree.parse(fname) ## Look for votes with VP tie breakers. #if len(tree.xpath("/roll/voter[@VP='1']")) == 0: # had_error = True # prevent delete at the end # continue # Process role object roll_node = tree.xpath('/roll')[0] # Sqlite is much faster when lots of saves are wrapped in a transaction, # and we do a lot of saves because it's a lot of voters. 
from django.db import transaction with transaction.atomic(): vote = vote_processor.process(Vote(), roll_node) if existing_vote: vote.id = existing_vote.id match = re_path.search(fname) vote.congress = int(match.group(1)) vote.chamber = chamber_mapping[match.group(2)] vote.session = match.group(3) vote.number = int(match.group(4)) # Get related bill & amendment. for bill_node in roll_node.xpath("bill"): related_bill_num = bill_node.get("number") if 9 <= vote.congress <= 42 and vote.session in ('1', '2'): # Bill numbering from the American Memory colletion is different. The number combines # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to # the 9th congress numbering seems to be wholly assigned by us and not related to # actual numbering, so we skip matching those bills. related_bill_num = "%d%04d%d" % (int( vote.session), int(bill_node.get("number")), 0) try: vote.related_bill = Bill.objects.get( congress=bill_node.get("session"), bill_type=BillType.by_xml_code( bill_node.get("type")), number=related_bill_num) except Bill.DoesNotExist: if vote.congress >= 93: vote.missing_data = True for amdt_node in roll_node.xpath("amendment"): if amdt_node.get( "ref" ) == "regular" and vote.related_bill is not None: try: vote.related_amendment = Amendment.objects.get( congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug( amdt_node.get("number")[0]), number=amdt_node.get("number")[1:]) except Amendment.DoesNotExist: if vote.congress >= 93: print "Missing amendment", fname vote.missing_data = True elif amdt_node.get("ref") == "bill-serial": # It is impossible to associate House votes with amendments just from the House # vote XML because the amendment-num might correspond either with the A___ number # or with the "An amendment, numbered ____" number from the amendment purpose, # and there's really no way to figure out which. Maybe we can use the amendment # sponsor instead? 
#vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number")) # Instead, we set related_amendment from the amendment parser. Here, we have to # preserve the related_amendment if it is set. if existing_vote: vote.related_amendment = existing_vote.related_amendment # clean up some question text and use the question_details field if vote.category in ( VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill: # For passage votes, set the question to the bill title and put the question # details in the details field. vote.question = vote.related_bill.title vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display( ) elif vote.category == VoteCategory.amendment and vote.related_amendment: # For votes on amendments, make a better title/explanation. vote.question = vote.related_amendment.title vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display( ) elif vote.related_bill and vote.question.startswith( "On the Cloture Motion " + vote.related_bill.display_number): vote.question = "Cloture on " + vote.related_bill.title elif vote.related_bill and vote.question.startswith( "On Cloture on the Motion to Proceed " + vote.related_bill.display_number): vote.question = "Cloture on " + vote.related_bill.title vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display( ) elif vote.related_bill and vote.question.startswith( "On the Motion to Proceed " + vote.related_bill.display_number): vote.question = "Motion to Proceed on " + vote.related_bill.title elif vote.related_amendment and vote.question.startswith( "On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)): vote.question = "Cloture on " + vote.related_amendment.title vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display( ) # weird House foratting of bill numbers 
("H RES 123 Blah blah") if vote.related_bill: vote.question = re.sub( "(On [^:]+): " + vote.related_bill.display_number.replace( ". ", " ").replace(".", " ").upper() + " .*", r"\1: " + truncatewords(vote.related_bill.title, 15), vote.question) vote.save() seen_obj_ids.add(vote.id) # don't delete me later # Process roll options, overwrite existing options where possible. seen_option_ids = set() roll_options = {} for option_node in roll_node.xpath('./option'): option = option_processor.process(VoteOption(), option_node) option.vote = vote if existing_vote: try: option.id = VoteOption.objects.filter( vote=vote, key=option.key )[0].id # get is better, but I had the database corruption problem except IndexError: pass option.save() roll_options[option.key] = option seen_option_ids.add(option.id) log_delete_qs( VoteOption.objects.filter(vote=vote).exclude( id__in=seen_option_ids) ) # may cascade and delete the Voters too? # Process roll voters, overwriting existing voters where possible. if existing_vote: existing_voters = dict( Voter.objects.filter(vote=vote).values_list( "person", "id")) seen_voter_ids = set() voters = list() for voter_node in roll_node.xpath('./voter'): voter = voter_processor.process(roll_options, Voter(), voter_node) voter.vote = vote voter.created = vote.created # for VP votes, load the actual person & role... if voter.voter_type == VoterType.vice_president: try: r = PersonRole.objects.get( role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created) voter.person_role = r voter.person = r.person except PersonRole.DoesNotExist: # overlapping roles? missing data? 
log.error( 'Could not resolve vice president in %s' % fname) if existing_vote and voter.person: try: voter.id = existing_voters[voter.person.id] except KeyError: pass voters.append(voter) if voter.voter_type == VoterType.unknown and not vote.missing_data: vote.missing_data = True vote.save() # pre-fetch the role of each voter load_roles_at_date( [x.person for x in voters if x.person != None], vote.created, vote.congress) for voter in list(voters): if voter.voter_type != VoterType.vice_president: voter.person_role = voter.person.role # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting, # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office. if voter.person_role is None: if vote.source == VoteSource.keithpoole and voter.option.key == "0": # Drop this record. voters.remove(voter) else: log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created)) vote.missing_data = True vote.save() # save all of the records (inserting/updating) for voter in voters: voter.save() seen_voter_ids.add(voter.id) # remove obsolete voter records log_delete_qs( Voter.objects.filter(vote=vote).exclude( id__in=seen_voter_ids) ) # possibly already deleted by cascade above # pre-calculate totals vote.calculate_totals() if not options.disable_events: vote.create_event() File.objects.save_file(fname) except Exception, ex: log.error('Error in processing %s' % fname, exc_info=ex) had_error = True
def amendment_type_handler(self, value):
    """Return the AmendmentType that AmendmentType.by_slug gives for `value`."""
    amendment_type = AmendmentType.by_slug(value)
    return amendment_type
def main(options):
    """
    Process amendments.

    Globs the amendment data.xml files under
    CONGRESS_DATA_PATH/<congress>/amendments/<type>/<type><number>/
    (narrowed by options.congress and options.filter), runs every file that
    needs processing through AmendmentProcessor and saves the resulting
    Amendment, reusing the id of an existing row that matches on
    (congress, amendment_type, number). House amendments are additionally
    attached to the roll call votes referenced by their action lines.

    `options` must provide the attributes `congress`, `filter` and `force`.

    Raises ValueError when a file's path does not follow the expected
    layout. Exceptions raised while processing or saving an amendment are
    re-raised after the offending file name / amendment has been printed.
    """
    if options.congress:
        files = glob.glob(CONGRESS_DATA_PATH + '/{congress}/amendments/*/*/data.xml'.format(congress=options.congress))
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob(CONGRESS_DATA_PATH + '/*/amendments/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    # Compiled once; the congress, amendment type and number all come from the path.
    path_pattern = re.compile(
        re.escape(CONGRESS_DATA_PATH)
        + r'/(?P<congress>\d+)/amendments/(?P<amendment_type>[a-z]+)/(?P<amendment_type2>[a-z]+)(?P<number>[0-9]+)/data.xml')

    amendment_processor = AmendmentProcessor()
    seen_amdt_ids = []
    for fname in files:
        progress.tick()

        m = path_pattern.match(fname)
        if not m:
            # Checked for every file, not only unchanged ones: the match is
            # also needed below to read the amendment type from the path.
            raise ValueError("Invalid file name", fname)

        if not File.objects.is_changed(fname) and not options.force:
            try:
                amdt = Amendment.objects.get(
                    congress=int(m.group("congress")),
                    amendment_type=AmendmentType.by_slug(m.group("amendment_type")),
                    number=int(m.group("number")))
            except Amendment.DoesNotExist:
                # The file is unchanged but its amendment is not in the
                # database: fall through and parse it as normal instead of
                # aborting the whole run.
                pass
            else:
                seen_amdt_ids.append(amdt.id)  # don't delete me later
                continue

        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]
        node.set("amendment_type", m.group("amendment_type"))  # move from the filename to a place where we can see it in the XML

        try:
            amdt = amendment_processor.process(Amendment(), node)
        except Exception:
            print(fname)
            raise

        if not amdt:
            # Amendments to treaties. Can't process.
            continue

        # update if already in db
        try:
            amdt.id = Amendment.objects.get(
                congress=amdt.congress,
                amendment_type=amdt.amendment_type,
                number=amdt.number).id
        except Amendment.DoesNotExist:
            pass  # a new amendment

        try:
            amdt.save()
        except Exception:
            print(amdt)
            raise

        # Record the id only after save(): for a new amendment the id is not
        # set until the row has been written (assuming an auto-assigned
        # primary key), so recording it earlier would store None and the new
        # amendment would be listed as "should be deleted" below.
        seen_amdt_ids.append(amdt.id)  # don't delete me later

        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote_node in node.xpath("actions/vote[@how='roll']"):
                v_congress, v_session = get_session_from_date(
                    XmlProcessor.parse_datetime(vote_node.get('datetime')).date())
                v_roll = int(vote_node.get("roll"))
                try:
                    vote = Vote.objects.get(
                        congress=v_congress,
                        chamber=CongressChamber.house,
                        session=v_session,
                        number=v_roll)
                    vote.related_amendment = amdt
                    vote.save()
                except Vote.DoesNotExist:
                    print("Missing vote data in", fname)

        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)

        File.objects.save_file(fname)

    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in=seen_amdt_ids)
        if missing.exists():
            print("Amendments should be deleted: ", missing)
def main(options):
    """
    Parse rolls.

    Globs vote data.xml files (options.filter, else options.congress, else
    every congress under settings.CONGRESS_DATA_PATH), and for each file that
    needs processing builds/updates the Vote, its VoteOptions and its Voters
    inside a single database transaction. Any exception raised while
    processing a single file is logged and the run continues with the next
    file. When a whole congress was parsed without a filter and without
    errors, Vote rows of that congress that were not seen on disk are
    deleted.

    `options` must provide the attributes `filter`, `congress`, `force` and
    `disable_events`.
    """
    # Imported once here rather than once per file inside the loop.
    from django.db import transaction

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/votes/*/*/data.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        # Use the configured data root here too, like the per-congress branch.
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/*/votes/*/*/data.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    # The congress, session, chamber and number all come from the tail of the
    # path. The pattern is anchored at the end only, so it works for whatever
    # data root the files were globbed from.
    path_pattern = re.compile(
        r"(?:^|/)(?P<congress>\d+)/votes/(?P<session>[ABC0-9]+)/(?P<chamber>[hs])(?P<number>\d+)/data.xml$")

    def log_delete_qs(qs):
        # Print and then delete every record in the queryset, if there are any.
        if qs.count() == 0:
            return
        print("Deleting obsoleted records: ", qs)
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = path_pattern.search(fname)
        if match is None:
            # options.filter is an arbitrary glob, so a file may not follow
            # the vote naming scheme. Report it instead of crashing on
            # match.group() outside the try block below, and flag the error
            # so that nothing is deleted at the end.
            log.error('Unrecognized vote file name %s' % fname)
            had_error = True
            continue

        try:
            existing_vote = Vote.objects.get(
                congress=int(match.group("congress")),
                chamber=chamber_mapping[match.group("chamber")],
                session=match.group("session"),
                number=int(match.group("number")))
        except Vote.DoesNotExist:
            existing_vote = None

        if (not File.objects.is_changed(fname)
                and not options.force
                and existing_vote is not None
                and not existing_vote.missing_data):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process role object
            roll_node = tree.xpath('/roll')[0]

            # Sqlite is much faster when lots of saves are wrapped in a transaction,
            # and we do a lot of saves because it's a lot of voters.
            with transaction.atomic():

                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    vote.id = existing_vote.id
                vote.congress = int(match.group("congress"))
                vote.chamber = chamber_mapping[match.group("chamber")]
                vote.session = match.group("session")
                vote.number = int(match.group("number"))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                        # Bill numbering from the American Memory collection is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(bill_node.get("type")),
                            number=related_bill_num)
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]+"amdt"),
                                number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment", fname)
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.related_bill and vote.question.startswith("On the Cloture Motion " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith("On Cloture on the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title

                elif vote.related_amendment and vote.question.startswith("On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): " + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids))  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            log.error('Could not resolve vice president in %s' % fname)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person is not None], vote.created, vote.congress)
                # Iterate over a copy because records may be removed from voters below.
                for voter in list(voters):
                    if voter.voter_type != VoterType.vice_president:
                        voter.person_role = voter.person.role

                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.

                    # At the start of each Congress, the House does a Call by States and Election of the Speaker, before swearing
                    # in. In the 116th Congress, these votes had a Not Voting for Walter Jones who had not yet made it to DC, and
                    # then omitted Jones in the votes after swearing in. In those cases, look for a role coming up.
                    if voter.person_role is None and voter.option.key == "0" and vote.question in ("Call by States", "Election of the Speaker"):
                        voter.person_role = voter.person.roles.filter(startdate__gt=vote.created, startdate__lt=vote.created+timedelta(days=30)).first()

                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids))  # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True

    # delete vote objects that are no longer represented on disk
    if options.congress and not options.filter and not had_error:
        log_delete_qs(Vote.objects.filter(congress=options.congress).exclude(id__in=seen_obj_ids))
def main(options):
    """
    Parse roll call vote XML files and store them as Vote, VoteOption and
    Voter records.

    Attributes read from ``options``:

    * ``filter`` -- if set, a glob pattern selecting the files to parse.
    * ``congress`` -- otherwise, if set, only the files matching
      ``data/us/<congress>/rolls/*.xml`` are parsed. With neither set, the
      rolls of every congress are parsed.
    * ``force`` -- re-parse a file even when File.objects.is_changed()
      reports it unchanged.
    * ``disable_events`` -- skip the call to vote.create_event().

    An exception raised while processing one file is logged and recorded
    in ``had_error``; the remaining files are still processed.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict(
        (x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches.
    # The file name carries the congress, chamber, session and roll number,
    # which are stored in the DB along with the info extracted from the XML.
    re_path = re.compile(r'data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')

    chamber_mapping = {'s': CongressChamber.senate,
                       'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        """
        Delete the records in ``qs`` after announcing them on stdout.
        Nothing is deleted when the queryset holds more than three records.
        """
        # The print calls take a single pre-formatted argument so that they
        # parse (and print the same text) whether print is a statement or a
        # function.
        if qs.count() > 0:
            try:
                print("Deleting:  %s" % (qs,))
            except Exception as e:
                # Rendering the queryset itself failed; report why instead.
                print("Deleting [%s]..." % str(e))
            if qs.count() > 3:
                print("Delete skipped...")
                return
            qs.delete()

    # NOTE(review): seen_obj_ids and had_error are only written in this part
    # of the function; presumably a cleanup step after the loop reads them to
    # decide which stale votes to delete -- confirm.
    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)
        if match is None:
            # options.filter is an arbitrary glob, so a file may not follow
            # the roll naming scheme. Skip it rather than crash the run.
            log.error('Error in processing %s: file name does not match '
                      'the roll file pattern' % fname)
            had_error = True
            continue
        congress, chamber_code, session, number = match.groups()
        chamber = chamber_mapping[chamber_code]

        try:
            existing_vote = Vote.objects.get(
                congress=congress,
                chamber=chamber,
                session=session,
                number=number)
        except Vote.DoesNotExist:
            existing_vote = None

        # Nothing to do for an unchanged file whose vote is already complete.
        if (not File.objects.is_changed(fname)
                and not options.force
                and existing_vote is not None
                and not existing_vote.missing_data):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            for roll_node in tree.xpath('/roll'):
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    # Reuse the primary key so save() updates the record.
                    vote.id = existing_vote.id
                vote.congress = int(congress)
                vote.chamber = chamber
                vote.session = session
                vote.number = int(number)

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(
                                bill_node.get("type")),
                            number=bill_node.get("number"))
                    except Bill.DoesNotExist:
                        vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular":
                        # The number attribute is a one-letter amendment
                        # type slug followed by the amendment number.
                        # NOTE(review): this dereferences vote.related_bill
                        # even when the bill lookup above failed -- confirm
                        # whether that case can occur.
                        try:
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(
                                    amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            print("Missing amendment %s" % fname)
                            vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (
                        VoteCategory.passage,
                        VoteCategory.passage_suspension,
                        VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = truncatewords(vote.related_bill.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                        "On the Cloture Motion " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_bill.title, 20)
                elif vote.related_bill and vote.question.startswith(
                        "On Cloture on the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_bill.title, 20)
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                        "On the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + truncatewords(
                        vote.related_bill.title, 20)
                elif vote.related_amendment and vote.question.startswith(
                        "On the Cloture Motion "
                        + vote.related_amendment.get_amendment_type_display()
                        + " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    house_number = vote.related_bill.display_number.replace(
                        ". ", " ").replace(".", " ").upper()
                    short_title = truncatewords(vote.related_bill.title, 15)
                    # A callable replacement keeps backslashes in the title
                    # from being read as escapes or group references.
                    vote.question = re.sub(
                        "(On [^:]+): " + re.escape(house_number) + " .*",
                        lambda m: m.group(1) + ": " + short_title,
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            # get is better, but I had the database corruption problem
                            option.id = VoteOption.objects.filter(
                                vote=vote, key=option.key)[0].id
                        except IndexError:
                            pass  # no stored option with this key yet
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                # may cascade and delete the Voters too?
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(
                        id__in=seen_option_ids))

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    # Maps person id -> stored Voter id for this vote.
                    existing_voters = dict(
                        Voter.objects.filter(vote=vote).values_list(
                            "person", "id"))
                seen_voter_ids = set()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(),
                                                    voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident,
                                startdate__lte=vote.created,
                                enddate__gte=vote.created)
                            voter.person = r.person
                        except Exception as vp_err:
                            # overlapping roles? missing data?
                            # Best effort: log it and keep the voter record.
                            log.error(
                                'Could not resolve vice president in %s' % fname,
                                exc_info=vp_err)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass  # this person has no stored record yet

                    voter.save()

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                    seen_voter_ids.add(voter.id)

                # possibly already deleted by cascade above
                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(
                        id__in=seen_voter_ids))

                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True