Ejemplo n.º 1
0
def import_event(data):
    event = None
    data = normalize_dates(data)

    if '_guid' in data:
        event = db.events.find_one({
            settings.LEVEL_FIELD:
            data[settings.LEVEL_FIELD],
            '_guid':
            data['_guid']
        })

    if not event:
        event = db.events.find_one({
            settings.LEVEL_FIELD:
            data[settings.LEVEL_FIELD],
            'when':
            data['when'],
            'end':
            data['end'],
            'type':
            data['type'],
            'description':
            data['description']
        })

    data = apply_filters(filters, data)

    if not event:
        data['created_at'] = datetime.datetime.utcnow()
        data['updated_at'] = data['created_at']
        _insert_with_id(data)
    else:
        update(event, data, db.events)
Ejemplo n.º 2
0
def import_event(data):
    event = None
    data = normalize_dates(data)

    if "_guid" in data:
        event = db.events.find_one({settings.LEVEL_FIELD: data[settings.LEVEL_FIELD], "_guid": data["_guid"]})

    if not event:
        event = db.events.find_one(
            {
                settings.LEVEL_FIELD: data[settings.LEVEL_FIELD],
                "when": data["when"],
                "end": data["end"],
                "type": data["type"],
                "description": data["description"],
            }
        )

    data = apply_filters(filters, data)

    if not event:
        data["created_at"] = datetime.datetime.utcnow()
        data["updated_at"] = data["created_at"]
        _insert_with_id(data)
    else:
        update(event, data, db.events)
Ejemplo n.º 3
0
def import_event(data):
    event = None
    data = normalize_dates(data)

    if '_guid' in data:
        event = db.events.find_one({settings.LEVEL_FIELD:
                                    data[settings.LEVEL_FIELD],
                                    '_guid': data['_guid']})

    if not event:
        event = db.events.find_one({settings.LEVEL_FIELD:
                                    data[settings.LEVEL_FIELD],
                                    'when': data['when'],
                                    'end': data['end'],
                                    'type': data['type'],
                                    'description': data['description']})

    data = apply_filters(filters, data)

    if not event:
        data['created_at'] = datetime.datetime.utcnow()
        data['updated_at'] = data['created_at']
        _insert_with_id(data)
    else:
        update(event, data, db.events)
Ejemplo n.º 4
0
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately
        categorizer - SubjectCategorizer (None - no categorization)
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    vote_flags = {
        "bill:passed", "bill:failed", "bill:veto_override:passed",
        "bill:veto_override:failed", "amendment:passed", "amendment:failed",
        "committee:passed", "committee:passed:favorable",
        "committee:passed:unfavorable", "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'], action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
              and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20)
                        and vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        elasticsearch_push(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        elasticsearch_push(bill)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
Ejemplo n.º 5
0
def import_legislator(data):
    data = prepare_obj(data)

    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']

    abbr = data[settings.LEVEL_FIELD]

    spec = {
        settings.LEVEL_FIELD: abbr,
        'type': scraped_role['type'],
        'term': scraped_term
    }
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one({
        settings.LEVEL_FIELD: abbr,
        '_scraped_name': data['_scraped_name'],
        'roles': {
            '$elemMatch': spec
        }
    })

    # legislator with a matching old_role
    if not leg:
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {
                '$elemMatch': spec
            }
        })

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        spec.pop('term', None)
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'roles': {
                '$elemMatch': spec
            }
        })
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)

    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
Ejemplo n.º 6
0
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately
        categorizer - SubjectCategorizer (None - no categorization)
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
Ejemplo n.º 7
0
def import_legislator(data):
    data = prepare_obj(data)

    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']

    abbr = data[settings.LEVEL_FIELD]

    spec = {settings.LEVEL_FIELD: abbr,
            'type': scraped_role['type'],
            'term': scraped_term}
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one(
        {settings.LEVEL_FIELD: abbr,
         '_scraped_name': data['_scraped_name'],
         'roles': {'$elemMatch': spec}})

    # legislator with a matching old_role
    if not leg:
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {'$elemMatch': spec}
        })

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
             # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        spec.pop('term', None)
        leg = db.legislators.find_one(
            {settings.LEVEL_FIELD: abbr,
             '_scraped_name': data['_scraped_name'],
             'roles': {'$elemMatch': spec}})
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)

    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
Ejemplo n.º 8
0
from billy.core import db
from billy_settings import LEGISLATOR_FILTERS
from billy.importers.filters import apply_filters
from dictdiffer import diff


filters = LEGISLATOR_FILTERS


for leg in db.legislators.find():
    d1 = leg
    leg = leg.copy()
    d2 = apply_filters(filters, leg)
    changes = list(diff(d1, d2))
    if changes != []:
        print leg['_id'], changes