コード例 #1
0
def parse_committee_members(options):
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map Bioguide IDs to GovTrack IDs
        y = yaml_load(settings.CONGRESS_PROJECT_PATH +
                      "/congress-legislators/legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "bioguide" in m["id"]:
                person_id_map[m["id"]["bioguide"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print("Committee not found:", committee)
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(
                    id=person_id_map[member["bioguide"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)
コード例 #2
0
def parse_committee_members(options):
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map Bioguide IDs to GovTrack IDs
        y = yaml_load(settings.CONGRESS_PROJECT_PATH + "/congress-legislators/legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "bioguide" in m["id"]:
                person_id_map[m["id"]["bioguide"]] = m["id"]["govtrack"]
        
        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')
        
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print("Committee not found:", committee)
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["bioguide"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            
            progress.tick()

        File.objects.save_file(MEMBERS_FILE)
コード例 #3
0
def parse_committee_names(options):
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print("New committee:", committee["thomas_id"])
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print("New subcommittee:", code)
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(
            id__in=seen_committees)
        if len(other_committees) > 0:
            print("Marking obsolete:",
                  ", ".join(c.code for c in other_committees))
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)
コード例 #4
0
def parse_committee_names(options):
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print("New committee:", committee["thomas_id"])
                cobj = Committee(code=committee["thomas_id"])
               
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print("New subcommittee:", code)
                    sobj = Committee(code=code)
                
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
                
            progress.tick()
            
        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print("Marking obsolete:", ", ".join(c.code for c in other_committees))
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)
コード例 #5
0
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
               
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
                
            progress.tick()
            
        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)
        
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]
        
        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')
        
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            if committee[0] == "H": continue # House data is out of date
            
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            
            progress.tick()

        File.objects.save_file(MEMBERS_FILE)
        
    return

    log.info('Processing committee schedule')
    SCHEDULE_FILE = 'data/us/112/committeeschedule.xml'
    file_changed = File.objects.is_changed(SCHEDULE_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % SCHEDULE_FILE)
    else:
        tree = etree.parse(SCHEDULE_FILE)
        
        # We have to clear out all CommitteeMeeting objects when we refresh because
        # we have no unique identifier in the upstream data for a meeting. We might use
        # the meeting's committee & date as an identifier, but since meeting times can
        # change this might have awkward consequences for the end user if we even
        # attempted to track that.

        CommitteeMeeting.objects.all().delete()

        # Process committee event nodes
        for meeting in tree.xpath('/committee-schedule/meeting'):
            try:
                mobj = meeting_processor.process(CommitteeMeeting(), meeting)
                mobj.save()
                
                mobj.bills.clear()
                for bill in meeting.xpath('bill'):
                    bill = Bill.objects.get(congress=bill.get("session"), bill_type=BillType.by_xml_code(bill.get("type")), number=int(bill.get("number")))
                    mobj.bills.add(bill)
            except Committee.DoesNotExist:
                log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

        for committee in Committee.objects.all():
            if not options.disable_events:
                committee.create_events()
            
        File.objects.save_file(SCHEDULE_FILE)
コード例 #6
0
def main(options):
    """
    Update Person and PersonRole models.
    
    Do safe update: touch only those records
    which have been changed.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = [
        'legislators-current', 'legislators-historical',
        'legislators-social-media', 'executive'
    ]  # order matters

    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.

    had_error = False

    # Get combined data.
    legislator_data = {}
    leg_id_map = {}
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]:
                    r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]:
                    r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k, v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k, v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend(m["terms"])
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." %
                                 (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person,
                                            person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()

            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(
                    termnode)  # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]

                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now
                    # Because of date overlaps at noon transition dates, ensure that only the last term that covers
                    # today is current --- reset past roles to not current. Doesn't handle turning off retirning people tho.
                    for r in new_roles:
                        r.current = False

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat():
                        continue  # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node[
                            "end"] <= role.startdate.isoformat():
                        continue  # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(
                            role.role_type).congress_chamber.lower():
                        continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we don't have a primry key
            # in the source data, we have to match on the record values. But because of errors in data,
            # term start/end dates can change, so matching has to be a little fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []

            def run_match_rule(rule):
                import itertools
                for new_role, existing_role in itertools.product(
                        new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles:
                        continue  # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type: continue
                    if new_role.state != existing_role.state: continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)

            # First match exactly, then exact on just one date, then on contractions and expansions.
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           == existing_role.startdate and new_role.enddate ==
                           existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           == existing_role.startdate or new_role.enddate ==
                           existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           >= existing_role.startdate and new_role.enddate <=
                           existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           <= existing_role.startdate and new_role.enddate >=
                           existing_role.enddate)

            # Update the database entries that correspond with records in the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                new_role.id = existing_role.id
                if role_processor.changed(existing_role,
                                          new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have mutliple records on disk that didn't match and multiple records in the database
            # that didn't match, then we don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True

            # And likewise for any existing roles that are left over.
            for pr in existing_roles:
                print(pr.person.id, pr)
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been loaded
                # because we don't create events for ends of terms and
                # starts of terms that are adjacent. Refresh the list to get
                # the roles in order.
                role_list = list(
                    PersonRole.objects.filter(
                        person=person).order_by('startdate'))
                for i in range(len(role_list)):
                    role_list[i].create_events(
                        role_list[i - 1] if i > 0 else None,
                        role_list[i + 1] if i < len(role_list) - 1 else None)

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"):
                delattr(person, "role")  # clear the cached info
            person._most_recent_role = None  # clear cache here too
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()

    log.info('Processed persons: %d' % len(processed_persons))
    log.info('Created persons: %d' % len(created_persons))

    if not had_error:
        # Remove person which were not found in XML file
        removed_persons = existing_persons - processed_persons
        for pk in removed_persons:
            p = Person.objects.get(pk=pk)
            if p.roles.all().count() > 0:
                log.warn("Missing? Deleted? %d: %s" % (p.id, p))
            else:
                log.warn("Deleting... %d: %s (remember to prune_index!)" %
                         (p.id, p))
                raise Exception("Won't delete!")
                p.delete()
        log.info('Missing/deleted persons: %d' % len(removed_persons))

        # Mark the files as processed.
        for p in SRC_FILES:
            f = BASE_PATH + p + ".yaml"
            File.objects.save_file(f)

    update_twitter_list()
コード例 #7
0
def main(options):
    """
    Update Person and PersonRole models.
    
    Do safe update: touch only those records
    which have been changed.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")
                
                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break
            
            if not govtrack_id:
                print "No GovTrack ID:"
                pprint.pprint(m)
                had_error = True
                continue
                
            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way.")
    
    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)
            
            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
                    
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person
            roles = list(PersonRole.objects.filter(person=person))
            existing_roles = set(PersonRole.objects.filter(person=person).values_list('pk', flat=True))
            processed_roles = set()
            role_list = []
            for role in node['terms']:
                role = role_processor.process(PersonRole(), role)
                role.person = person
                
                role.current = role.startdate <= datetime.now().date() and role.enddate >= datetime.now().date() # \
                        #and CURRENT_CONGRESS in role.congress_numbers()
                
                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break
                        
                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break
                        
                if ex_role:    
                    # These roles correspond.
                    processed_roles.add(ex_role.id)
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        role_list.append(role)
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role.
                    if len([r for r in roles if r.role_type == role.role_type]) > 0:
                        print role, "is one of these?"
                        for ex_role in roles:
                            print "\t", ex_role
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    role_list.append(role)
                        
            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent.
            if not options.disable_events:
                for i in xrange(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                        )
            
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print pr.person.id, pr
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()
            
            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()
            
        except Exception, ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
コード例 #8
0
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH

    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(
            id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code
                                                 for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(
                    id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(
                            guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(
                            code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(
                                r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(
                                congress=bill_cong,
                                bill_type=BillType.by_slug(bill_type),
                                number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error(
                        'Could not load Committee object for meeting %s' %
                        meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
コード例 #9
0
def main(options):
    """
    Update Person and PersonRole models.
    
    Do safe update: touch only those records
    which have been changed.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")
                
                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break
            
            if not govtrack_id:
                print "No GovTrack ID:"
                pprint.pprint(m)
                had_error = True
                continue
                
            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." % (govtrack_id, p))
    
    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)
            
            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
                    
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person
            roles = list(PersonRole.objects.filter(person=person))
            existing_roles = set(PersonRole.objects.filter(person=person).values_list('pk', flat=True))
            processed_roles = set()
            role_list = []
            for role in node['terms']:
                role = role_processor.process(PersonRole(), role)
                role.person = person
                
                role.current = role.startdate <= datetime.now().date() and role.enddate >= datetime.now().date() # \
                        #and CURRENT_CONGRESS in role.congress_numbers()

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue
                    role.leadership_title = leadership_node["title"]
                
                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break
                        
                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break
                        
                if ex_role:    
                    # These roles correspond.
                    processed_roles.add(ex_role.id)
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        role_list.append(role)
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role.
                    if len([r for r in roles if r.role_type == role.role_type]) > 0:
                        print role, "is one of these?"
                        for ex_role in roles:
                            print "\t", ex_role
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    role_list.append(role)
                        
            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent.
            if not options.disable_events:
                for i in xrange(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                        )
            
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print pr.person.id, pr
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()
            
            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()
            
        except Exception, ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
コード例 #10
0
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
               
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
                
            progress.tick()
            
        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)
        
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]
        
        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')
        
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            
            progress.tick()

        File.objects.save_file(MEMBERS_FILE)
        
    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
		meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
		file_changed = File.objects.is_changed(meetings_file)
	
		if not file_changed and not options.force:
			log.info('File %s was not changed' % meetings_file)
		else:
			meetings = json.load(open(meetings_file))
			
			# Process committee event nodes
			for meeting in meetings:
				try:
					# Associate it with an existing meeting object if GUID is already known.
					# Must get it like this, vs just assigning the ID as we do in other parsers,
					# because of the auto_now_add created field, which otherwise misbehaves.
					try:
						mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
					except CommitteeMeeting.DoesNotExist:
						mobj = CommitteeMeeting()
					
					# Parse.
					mobj = meeting_processor.process(mobj, meeting)
					
					# Attach the meeting to the subcommittee if set.
					if mobj.subcommittee:
						mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)
					
					mobj.save()
					
					mobj.bills.clear()
					for bill in meeting["bill_ids"]:
					    try:
					        bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
					        bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
					        mobj.bills.add(bill)
					    except AttributeError:
					        pass # regex failed
					    except common.enum.NotFound:
					        pass # invalid bill type code in source data
					    except Bill.DoesNotExist:
					        pass # we don't know about bill yet
				except Committee.DoesNotExist:
					log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))
	
			for committee in Committee.objects.all():
				if not options.disable_events:
					committee.create_events()
				
			File.objects.save_file(meetings_file)
コード例 #11
0
def main(options):
    """
    Update Person and PersonRole models.
    
    Do safe update: touch only those records
    which have been changed.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]: r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]: r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")
                
                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break
            
            if not govtrack_id:
                print "No GovTrack ID:"
                pprint.pprint(m)
                had_error = True
                continue
                
            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." % (govtrack_id, p))
    
    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)
            
            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
                    
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(termnode) # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]

                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now
                    # Because of date overlaps at noon transition dates, ensure that only the last term that covers
                    # today is current --- reset past roles to not current. Doesn't handle turning off retirning people tho.
                    for r in new_roles: r.current = False

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we don't have a primry key
            # in the source data, we have to match on the record values. But because of errors in data,
            # term start/end dates can change, so matching has to be a little fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []
            def run_match_rule(rule):
                import itertools
                for new_role, existing_role in itertools.product(new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles: continue # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type: continue
                    if new_role.state != existing_role.state: continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)

            # First match exactly, then exact on just one date, then on contractions and expansions.
            run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate and new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate or new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate >= existing_role.startdate and new_role.enddate <= existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate <= existing_role.startdate and new_role.enddate >= existing_role.enddate)

            # Update the database entries that correspond with records in the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                new_role.id = existing_role.id
                if role_processor.changed(existing_role, new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have mutliple records on disk that didn't match and multiple records in the database
            # that didn't match, then we don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True
            
            # And likewise for any existing roles that are left over.
            for pr in existing_roles:
                print pr.person.id, pr
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()
            
            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been loaded
                # because we don't create events for ends of terms and
                # starts of terms that are adjacent. Refresh the list to get
                # the roles in order.
                role_list = list(PersonRole.objects.filter(person=person).order_by('startdate'))
                for i in xrange(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                        )
            
            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()
            
        except Exception, ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()