def main(options):
    """
    Process amendments.

    Collects amendment XML files (only those of ``options.congress`` when it
    is set, optionally narrowed by the ``options.filter`` regular expression),
    and for each file creates or updates the matching Amendment record.

    Files that ``File.objects.is_changed`` reports as unchanged are skipped
    unless ``options.force`` is set; their amendments are still recorded as
    seen.  For House amendments, roll call votes referenced in the action
    lines get this amendment stored as their ``related_amendment``.  Every
    vote related to a saved amendment is flagged ``missing_data``.

    When a single congress is processed without a filter, amendments of that
    congress that are in the database but were not seen are printed.
    Exceptions raised while processing or saving an amendment are re-raised
    after printing the file name / amendment.
    """

    if options.congress:
        files = glob.glob('data/us/%s/bills.amdt/*.xml' % options.congress)
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills.amdt/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    amendment_processor = AmendmentProcessor()
    seen_amdt_ids = set()
    for fname in files:
        progress.tick()

        if not File.objects.is_changed(fname) and not options.force:
            # The file is unchanged: just remember which amendment it belongs to.
            m = re.match(r"data/us/(\d+)/bills\.amdt/([sh])(\d+)\.xml", fname)
            if not m:
                print("Invalid file name %s" % fname)
            else:
                amdt = Amendment.objects.get(congress=m.group(1), amendment_type=AmendmentType.by_slug(m.group(2)), number=m.group(3))
                seen_amdt_ids.add(amdt.id)  # don't delete me later
            continue

        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]

        try:
            amdt = amendment_processor.process(Amendment(), node)
        except Exception:
            # Say which file failed before propagating the error.
            print(fname)
            raise

        if not amdt:
            # Amendments to treaties. Can't process.
            continue

        # update if already in db
        try:
            amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id
        except Amendment.DoesNotExist:
            pass  # a new amendment

        try:
            amdt.save()
        except Exception:
            print(amdt)
            raise

        # Record the id only after saving: a brand new amendment has no id
        # until then, and would otherwise be reported below as one that
        # should be deleted.
        seen_amdt_ids.add(amdt.id)  # don't delete me later

        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote_node in node.xpath("actions/vote[@how='roll']"):
                v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote_node.get('datetime')).date())
                v_roll = int(vote_node.get("roll"))
                try:
                    related_vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll)
                    related_vote.related_amendment = amdt
                    related_vote.save()
                except Vote.DoesNotExist:
                    print("Missing vote data in %s" % fname)

        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)

        File.objects.save_file(fname)

    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in=seen_amdt_ids)
        if missing.exists():
            print("Amendments should be deleted:  %s" % (missing,))
 def chamber_handler(self, value):
     """Return the AmendmentType that ``AmendmentType.by_slug`` yields for *value*."""
     amendment_type = AmendmentType.by_slug(value)
     return amendment_type
def main(options):
    """
    Parse rolls.

    Collects roll call XML files (the ``options.filter`` glob if given, else
    the directory of ``options.congress``, else every congress) and, for each
    file, creates or updates the Vote together with its VoteOption and Voter
    records.

    A file is skipped when ``File.objects.is_changed`` reports it unchanged,
    ``options.force`` is not set, and the vote already exists without its
    ``missing_data`` flag.  Congress, chamber, session and roll number are
    taken from the file name.  Any exception raised while processing a file
    is logged and processing continues with the next file.  Events are
    created for each vote unless ``options.disable_events`` is set.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    re_path = re.compile(r"data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml")

    chamber_mapping = {"s": CongressChamber.senate, "h": CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info("Parsing rolls matching %s" % options.filter)
    elif options.congress:
        files = glob.glob("data/us/%s/rolls/*.xml" % options.congress)
        log.info("Parsing rolls of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/rolls/*.xml")
    log.info("Processing votes: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=10)

    def log_delete_qs(qs):
        """Print and delete the records in *qs*; do nothing if it is empty."""
        if qs.count() == 0:
            return
        print("Deleting obsoleted records:  %s" % (qs,))
        # if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)
        if match is None:
            # A filter glob can yield files that do not follow the roll file
            # naming scheme; report them instead of crashing the whole run.
            log.error("Invalid roll file name %s" % fname)
            had_error = True
            continue

        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4),
            )
        except Vote.DoesNotExist:
            existing_vote = None

        if (
            not File.objects.is_changed(fname)
            and not options.force
            and existing_vote is not None
            and not existing_vote.missing_data
        ):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            # if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            for roll_node in tree.xpath("/roll"):
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    vote.id = existing_vote.id
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ("1", "2"):
                        # Bill numbering from the American Memory collection is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(bill_node.get("type")),
                            number=related_bill_num,
                        )
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:],
                            )
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment %s" % fname)
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        # vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if (
                    vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override)
                    and vote.related_bill
                ):
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = truncatewords(vote.related_bill.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.related_bill and vote.question.startswith(
                    "On the Cloture Motion " + vote.related_bill.display_number
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20)
                elif vote.related_bill and vote.question.startswith(
                    "On Cloture on the Motion to Proceed " + vote.related_bill.display_number
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20)
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                    "On the Motion to Proceed " + vote.related_bill.display_number
                ):
                    vote.question = "Motion to Proceed on " + truncatewords(vote.related_bill.title, 20)

                elif vote.related_amendment and vote.question.startswith(
                    "On the Cloture Motion "
                    + vote.related_amendment.get_amendment_type_display()
                    + " "
                    + str(vote.related_amendment.number)
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): "
                        + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper()
                        + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question,
                    )

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath("./option"):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[
                                0
                            ].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath("./voter"):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created
                            )
                            voter.person_role = r
                            voter.person = r.person
                        except Exception:
                            # overlapping roles? missing data?
                            # exc_info=True logs the exception being handled; no
                            # exception variable is bound in this handler.
                            log.error("Could not resolve vice president in %s" % fname, exc_info=True)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person is not None], vote.created)
                for voter in voters:
                    voter.person_role = voter.person.role
                    if voter.person_role is None:
                        log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                        vote.missing_data = True
                        vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            # Keep going with the remaining files; remember that something failed.
            log.error("Error in processing %s" % fname, exc_info=ex)
            had_error = True
Exemple #4
0
def main(options):
    """
    Parse rolls.

    Collects roll call XML files (the ``options.filter`` glob if given, else
    the directory of ``options.congress``, else every congress) and, for each
    file, creates or updates the Vote together with its VoteOption and Voter
    records inside one database transaction per file.

    A file is skipped when ``File.objects.is_changed`` reports it unchanged,
    ``options.force`` is not set, and the vote already exists without its
    ``missing_data`` flag.  Congress, chamber, session and roll number are
    taken from the file name.  Any exception raised while processing a file
    is logged and processing continues with the next file.  Events are
    created for each vote unless ``options.disable_events`` is set.
    """

    # Imported once here rather than on every loop iteration.
    from django.db import transaction

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict(
        (x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    re_path = re.compile(r'data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')

    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        """Print and delete the records in *qs*; do nothing if it is empty."""
        if qs.count() == 0:
            return
        print("Deleting obsoleted records:  %s" % (qs,))
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)
        if match is None:
            # A filter glob can yield files that do not follow the roll file
            # naming scheme; report them instead of crashing the whole run.
            log.error('Invalid roll file name %s' % fname)
            had_error = True
            continue

        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4))
        except Vote.DoesNotExist:
            existing_vote = None

        if (not File.objects.is_changed(fname)
                and not options.force
                and existing_vote is not None
                and not existing_vote.missing_data):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            roll_node = tree.xpath('/roll')[0]

            # Sqlite is much faster when lots of saves are wrapped in a transaction,
            # and we do a lot of saves because it's a lot of voters.
            with transaction.atomic():

                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    vote.id = existing_vote.id
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                        # Bill numbering from the American Memory collection is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (
                            int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(
                                bill_node.get("type")),
                            number=related_bill_num)
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(
                                    amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment %s" % fname)
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (
                        VoteCategory.passage, VoteCategory.passage_suspension,
                        VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.related_bill and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith(
                        "On Cloture on the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                        "On the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title

                elif vote.related_amendment and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_amendment.get_amendment_type_display() +
                        " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): " +
                        vote.related_bill.display_number.replace(
                            ". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(),
                                                      option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(
                                vote=vote, key=option.key
                            )[0].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(
                        id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(
                        Voter.objects.filter(vote=vote).values_list(
                            "person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(),
                                                    voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident,
                                startdate__lte=vote.created,
                                enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            # NOTE(review): only the "no role found" case is
                            # handled here; any other error from the lookup
                            # propagates to the per-file handler below.
                            log.error(
                                'Could not resolve vice president in %s' %
                                fname)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date(
                    [x.person for x in voters if x.person is not None],
                    vote.created, vote.congress)
                # Iterate over a copy because records may be dropped from `voters` below.
                for voter in list(voters):
                    if voter.voter_type != VoterType.vice_president:
                        voter.person_role = voter.person.role
                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.
                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s." %
                                      (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(
                        id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            # Keep going with the remaining files; remember that something failed.
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True
 def amendment_type_handler(self, value):
     """Return the AmendmentType that ``AmendmentType.by_slug`` yields for *value*."""
     amendment_type = AmendmentType.by_slug(value)
     return amendment_type
def main(options):
    """
    Process amendments.

    Collects the amendment ``data.xml`` files under CONGRESS_DATA_PATH (only
    those of ``options.congress`` when it is set, further narrowed by the
    ``options.filter`` regular expression when given) and, for every file that
    changed since the last run (or every file when ``options.force`` is set),
    creates or updates the corresponding Amendment record.

    For House amendments, roll call votes referenced in the amendment's
    actions are linked to the amendment, and every vote related to a processed
    amendment is flagged ``missing_data`` so the vote parser refreshes it.

    When a whole congress was processed without a filter, amendments of that
    congress that were not seen on disk are reported (not deleted).

    Raises:
        ValueError: if a file path does not match the expected
            ``<congress>/amendments/<type>/<type><number>/data.xml`` layout.
    """

    if options.congress:
        files = glob.glob(CONGRESS_DATA_PATH + '/{congress}/amendments/*/*/data.xml'.format(congress=options.congress))
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob(CONGRESS_DATA_PATH + '/*/amendments/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    # The congress, amendment type and number are only available from the path.
    # Compile the pattern once instead of rebuilding it for every file.
    path_re = re.compile(re.escape(CONGRESS_DATA_PATH) + r'/(?P<congress>\d+)/amendments/(?P<amendment_type>[a-z]+)/(?P<amendment_type2>[a-z]+)(?P<number>[0-9]+)/data.xml')

    amendment_processor = AmendmentProcessor()
    seen_amdt_ids = []
    for fname in files:
        progress.tick()

        # Validate the path up front: both branches below need the match, and
        # previously a changed file with a bad name failed with an
        # AttributeError on m.group() instead of this explicit error.
        m = path_re.match(fname)
        if not m:
            raise ValueError("Invalid file name", fname)

        if not File.objects.is_changed(fname) and not options.force:
            # Unchanged file: just remember the existing record.
            amdt = Amendment.objects.get(congress=int(m.group("congress")), amendment_type=AmendmentType.by_slug(m.group("amendment_type")), number=int(m.group("number")))
            seen_amdt_ids.append(amdt.id) # don't delete me later
            continue

        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]
        node.set("amendment_type", m.group("amendment_type")) # move from the filename to a place where we can see it in the XML

        try:
            amdt = amendment_processor.process(Amendment(), node)
        except:
            # Identify the offending file, then let the error propagate.
            print(fname)
            raise

        if not amdt:
            # Amendments to treaties. Can't process.
            continue

        # update if already in db
        try:
            amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id
        except Amendment.DoesNotExist:
            pass # a new amendment

        try:
            amdt.save()
        except:
            # Identify the offending record, then let the error propagate.
            print(amdt)
            raise

        # Record the id only after save(): a brand new amendment has no id
        # before it is saved, so recording it earlier stored None and the new
        # record was later reported as one that "should be deleted".
        seen_amdt_ids.append(amdt.id) # don't delete me later

        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote_node in node.xpath("actions/vote[@how='roll']"):
                v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote_node.get('datetime')).date())
                v_roll = int(vote_node.get("roll"))
                try:
                    roll_vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll)
                    roll_vote.related_amendment = amdt
                    roll_vote.save()
                except Vote.DoesNotExist:
                    print("Missing vote data in", fname)

        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)

        File.objects.save_file(fname)

    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in = seen_amdt_ids)
        if missing.exists():
            print("Amendments should be deleted: ", missing)
 def chamber_handler(self, value):
     """Look up and return the AmendmentType whose slug is ``value``.

     NOTE(review): despite the "chamber" name this returns an AmendmentType,
     presumably because the source's chamber code doubles as the amendment
     type slug -- confirm against the field mapping.
     """
     amendment_type = AmendmentType.by_slug(value)
     return amendment_type
def main(options):
    """
    Parse rolls.

    Selects roll call vote ``data.xml`` files (by the ``options.filter`` glob
    if given, else for ``options.congress``, else for all congresses) and, for
    each one, rebuilds the Vote together with its VoteOption and Voter records
    inside a database transaction.

    A file is skipped when it is unchanged, ``options.force`` is not set, and a
    Vote already exists for it that is not flagged ``missing_data``.

    An exception raised while processing a single file is logged and the loop
    moves on to the next file. When ``options.congress`` is given without a
    filter and no file failed, Vote records of that congress that were not
    seen on disk are deleted.
    """
    
    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    # Load every Person once, keyed by primary key, for the voter processor.
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    # Chamber letter used in the file path -> CongressChamber value.
    chamber_mapping = {'s': CongressChamber.senate,
                       'h': CongressChamber.house}

    if options.filter:
        # Here the filter is a glob pattern, not a regular expression.
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/votes/*/*/data.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        # NOTE(review): this branch hard-codes 'data/congress' while the branch
        # above uses settings.CONGRESS_DATA_PATH -- confirm they are the same.
        files = glob.glob('data/congress/*/votes/*/*/data.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    # Print and delete the records in a queryset; does nothing when it is empty.
    def log_delete_qs(qs):
        if qs.count() == 0: return
        print("Deleting obsoleted records: ", qs)
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    # Ids of the Vote records that correspond to a file on disk.
    seen_obj_ids = set()
    # Set when any file fails, which disables the final cleanup delete.
    had_error = False

    for fname in files:
        progress.tick()

        # The congress, session, chamber and roll number come from the path.
        # NOTE(review): the pattern is anchored on the literal 'data/congress/'
        # prefix; for any other path `match` is None and the lookups below
        # fail outside the per-file try block -- confirm paths always match.
        match = re.match(r"data/congress/(?P<congress>\d+)/votes/(?P<session>[ABC0-9]+)/(?P<chamber>[hs])(?P<number>\d+)/data.xml$", fname)
        
        try:
            existing_vote = Vote.objects.get(congress=int(match.group("congress")), chamber=chamber_mapping[match.group("chamber")], session=match.group("session"), number=int(match.group("number")))
        except Vote.DoesNotExist:
            existing_vote = None
        
        # Skip files that have not changed, unless forced or the stored vote is
        # flagged as having missing data (in which case it is parsed again).
        if not File.objects.is_changed(fname) and not options.force and existing_vote != None and not existing_vote.missing_data:
            seen_obj_ids.add(existing_vote.id)
            continue
            
        try:
            tree = etree.parse(fname)
            
            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue
            
            # Process roll object
            roll_node = tree.xpath('/roll')[0]

            # Sqlite is much faster when lots of saves are wrapped in a transaction,
            # and we do a lot of saves because it's a lot of voters.
            from django.db import transaction
            with transaction.atomic():

                vote = vote_processor.process(Vote(), roll_node)
                # Reuse the existing primary key so save() updates that row.
                if existing_vote: vote.id = existing_vote.id
                vote.congress = int(match.group("congress"))
                vote.chamber = chamber_mapping[match.group("chamber")]
                vote.session = match.group("session")
                vote.number = int(match.group("number"))
                
                # Get related bill & amendment.
                
                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                         # Bill numbering from the American Memory collection is different. The number combines
                         # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                         # the 9th congress numbering seems to be wholly assigned by us and not related to
                         # actual numbering, so we skip matching those bills.
                         related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=related_bill_num)
                    except Bill.DoesNotExist:
                        # Only votes from the 93rd congress on are flagged when the bill is absent.
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            # The first character of the number plus "amdt" is the amendment
                            # type slug; the rest is the amendment number.
                            vote.related_amendment = Amendment.objects.get(congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]+"amdt"), number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment", fname)
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote: vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field
                
                if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                    
                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                    
                elif vote.related_bill and vote.question.startswith("On the Cloture Motion " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith("On Cloture on the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title
                    
                elif vote.related_amendment and vote.question.startswith("On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                
                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    # Replace everything after "On ...: <BILL NUMBER> " with the first
                    # 15 words of the bill title.
                    vote.question = re.sub(
                        "(On [^:]+): " + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)
                    
                vote.save()
                
                seen_obj_ids.add(vote.id) # don't delete me later
                
                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                # Option key -> saved VoteOption, passed to the voter processor below.
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)) # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    # Person id -> Voter id for the voters already stored for this vote.
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created
                        
                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            log.error('Could not resolve vice president in %s' % fname)
                        
                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass
                        
                    voters.append(voter)
                    
                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()
                        
                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person != None], vote.created, vote.congress)
                # Iterate over a copy because records may be removed from `voters` below.
                for voter in list(voters):
                    if voter.voter_type != VoterType.vice_president:
                        # NOTE(review): presumably `person.role` is populated by
                        # load_roles_at_date above -- confirm.
                        voter.person_role = voter.person.role
                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.
                    # At the start of each Congress, the House does a Call by States and Election of the Speaker, before swearing
                    # in. In the 116th Congress, these votes had a Not Voting for Walter Jones who had not yet made it to DC, and
                    # then omitted Jones in the votes after swearing in. In those cases, look for a role coming up.
                    if voter.person_role is None and voter.option.key == "0" and vote.question in ("Call by States", "Election of the Speaker"):
                        voter.person_role = voter.person.roles.filter(startdate__gt=vote.created, startdate__lt=vote.created+timedelta(days=30)).first()
                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)
                    
                # remove obsolete voter records
                log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)) # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()
                    
            # Reached only when the transaction block finished without raising.
            File.objects.save_file(fname)

        except Exception as ex:
            # Log the failure for this file and carry on with the next one.
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True
        
    # delete vote objects that are no longer represented on disk
    if options.congress and not options.filter and not had_error:
        log_delete_qs(Vote.objects.filter(congress=options.congress).exclude(id__in = seen_obj_ids))
def main(options):
    """
    Parse rolls.

    Selects roll call XML files (by the ``options.filter`` glob if given, else
    for ``options.congress``, else for all congresses) and, for each one,
    rebuilds the Vote together with its VoteOption and Voter records.

    A file is skipped when it is unchanged, ``options.force`` is not set, and a
    Vote already exists for it that is not flagged ``missing_data``.

    A file whose name does not match the expected pattern, or whose processing
    raises, is logged and the loop moves on to the next file.

    The print calls pass a single pre-formatted string and the handlers use
    ``except ... as`` so the block parses under both Python 2.6+ and Python 3.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    # Load every Person once, keyed by primary key, for the voter processor.
    voter_processor.PERSON_CACHE = dict(
        (x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    # (raw string: same pattern, without relying on unknown string escapes)
    re_path = re.compile(r'data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')

    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        # Report the records in `qs` and delete them, but only when there are
        # at most three; larger sets are reported and left in place.
        if qs.count() > 0:
            try:
                print("Deleting:  %s" % (qs,))
            except Exception as e:
                # Rendering the queryset failed; report that instead.
                print("Deleting [%s]..." % str(e))
            if qs.count() > 3:
                print("Delete skipped...")
                return
            qs.delete()

    # Ids of the Vote records that correspond to a file on disk.
    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)
        if match is None:
            # Without the congress/chamber/session/number from the name the
            # file cannot be processed; record the failure like any other
            # per-file error instead of crashing on match.group().
            log.error('Unrecognized roll file name %s' % fname)
            had_error = True
            continue

        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4))
        except Vote.DoesNotExist:
            existing_vote = None

        # Skip files that have not changed, unless forced or the stored vote is
        # flagged as having missing data (in which case it is parsed again).
        if (not File.objects.is_changed(fname)
                and not options.force
                and existing_vote is not None
                and not existing_vote.missing_data):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            for roll_node in tree.xpath('/roll'):
                vote = vote_processor.process(Vote(), roll_node)
                # Reuse the existing primary key so save() updates that row.
                if existing_vote:
                    vote.id = existing_vote.id
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(
                                bill_node.get("type")),
                            number=bill_node.get("number"))
                    except Bill.DoesNotExist:
                        vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    # The amendment lookup needs the bill's congress, so it is
                    # only attempted when the related bill was resolved.
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(
                                    amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            print("Missing amendment %s" % fname)
                            vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (
                        VoteCategory.passage, VoteCategory.passage_suspension,
                        VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = truncatewords(vote.related_bill.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = truncatewords(
                        vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.related_bill and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_bill.title, 20)
                elif vote.related_bill and vote.question.startswith(
                        "On Cloture on the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_bill.title, 20)
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                        "On the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + truncatewords(
                        vote.related_bill.title, 20)

                elif vote.related_amendment and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_amendment.get_amendment_type_display() +
                        " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    # Replace everything after "On ...: <BILL NUMBER> " with the
                    # first 15 words of the bill title.
                    vote.question = re.sub(
                        "(On [^:]+): " +
                        vote.related_bill.display_number.replace(
                            ". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                # Option key -> saved VoteOption, passed to the voter processor below.
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(),
                                                      option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(
                                vote=vote, key=option.key
                            )[0].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(
                        id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    # Person id -> Voter id for the voters already stored for this vote.
                    existing_voters = dict(
                        Voter.objects.filter(vote=vote).values_list(
                            "person", "id"))
                seen_voter_ids = set()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(),
                                                    voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident,
                                startdate__lte=vote.created,
                                enddate__gte=vote.created)
                            voter.person = r.person
                        except Exception as vp_error:
                            # overlapping roles? missing data?
                            # Bind the exception here: the handler previously
                            # referenced a name this clause never defined.
                            log.error(
                                'Could not resolve vice president in %s' %
                                fname,
                                exc_info=vp_error)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voter.save()

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                    seen_voter_ids.add(voter.id)

                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(
                        id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above

                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            # Reached only when the file was processed without raising.
            File.objects.save_file(fname)

        except Exception as ex:
            # Log the failure for this file and carry on with the next one.
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True