Beispiel #1
0
def main():
    '''Clean up the bill references stored in events' ``related_bills``.

    Abbreviations are taken from the command line; with no arguments every
    ``abbreviation`` found in ``db.metadata`` is processed.  For each
    matching event, every related-bill entry is normalized:

    - a ``bill_id`` that starts with a big id (two capital letters, ``B``,
      eight digits) is moved to the ``id`` key;
    - any other ``bill_id`` is run through ``fix_bill_id``;
    - a ``_scraped_bill_id`` is run through ``fix_bill_id`` and stored as
      ``bill_id``.

    Events with at least one touched entry are saved back to ``db.events``.
    '''
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)
    tally = defaultdict(Counter)

    # Compiled once; the pattern is the same for every related bill.
    big_id_pattern = re.compile(r'[A-Z]{2}B\d{8}')

    for abbr in abbrs:
        # Key the counters by the abbreviation being processed (the original
        # used the literal string 'abbr', merging all jurisdictions).
        abbr_tally = tally[abbr]
        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {
                '$exists': True,
                '$ne': []
            },
        }
        for event in db.events.find(spec):
            fixed = False
            for bill in event['related_bills']:

                bill_id = bill.get('bill_id')
                if bill_id is not None:

                    # If "bill_id" is a big id, rename it.
                    if big_id_pattern.match(bill_id):
                        bill['id'] = bill.pop('bill_id')
                        logger.info('Renamed "bill_id" to "id"')
                        abbr_tally['bill_id --> id'] += 1

                    # If it's something else, do fix_bill_id to
                    # fix screwed up old ids.
                    else:
                        bill['bill_id'] = fix_bill_id(bill['bill_id'])
                        logger.info('Fixed an un-fixed bill_id')
                        abbr_tally['fix_bill_id'] += 1

                    fixed = True

                if '_scraped_bill_id' in bill:
                    bill_id = fix_bill_id(bill.pop('_scraped_bill_id'))
                    bill['bill_id'] = bill_id
                    logger.info('Renamed "_scraped_bill_id" to "bill_id"')
                    abbr_tally['_scraped_bill_id --> bill_id'] += 1

                    fixed = True

            if fixed:
                # Only write back events that actually changed.
                logger.debug('Updating related_bills on event %r.',
                             event['_id'])
                db.events.save(event)

        logger.info(abbr)
def _normalize_related_bill(bill, counts, logger):
    '''Fix the id keys of one ``related_bills`` entry in place.

    ``counts`` is a Counter-like mapping incremented once per fix applied.
    Returns True when the entry was modified, False otherwise.
    '''
    changed = False

    bill_id = bill.get('bill_id')
    if bill_id is not None:
        if re.match(r'[A-Z]{2}B\d{8}', bill_id):
            # "bill_id" holds a big id: it belongs under "id".
            bill['id'] = bill.pop('bill_id')
            logger.info('Renamed "bill_id" to "id"')
            counts['bill_id --> id'] += 1
        else:
            # Anything else is an old-style id to run through fix_bill_id.
            bill['bill_id'] = fix_bill_id(bill['bill_id'])
            logger.info('Fixed an un-fixed bill_id')
            counts['fix_bill_id'] += 1
        changed = True

    if '_scraped_bill_id' in bill:
        bill['bill_id'] = fix_bill_id(bill.pop('_scraped_bill_id'))
        logger.info('Renamed "_scraped_bill_id" to "bill_id"')
        counts['_scraped_bill_id --> bill_id'] += 1
        changed = True

    return changed


def main():
    '''Normalize the ``related_bills`` entries of stored events.

    Processes the abbreviations given on the command line, or every
    ``abbreviation`` in ``db.metadata`` when none are given.  Each event
    with a non-empty ``related_bills`` list has its entries fixed, and is
    saved back to ``db.events`` if any entry changed.
    '''
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)
    tally = defaultdict(Counter)

    for abbr in abbrs:
        # Counters are kept per abbreviation; the original indexed the
        # tally with the literal string 'abbr', so all jurisdictions shared
        # a single counter.
        abbr_tally = tally[abbr]
        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {'$exists': True, '$ne': []},
        }
        for event in db.events.find(spec):
            fixed = False
            for bill in event['related_bills']:
                if _normalize_related_bill(bill, abbr_tally, logger):
                    fixed = True

            if fixed:
                logger.debug('Updating related_bills on event %r.',
                             event['_id'])
                db.events.save(event)

        logger.info(abbr)
Beispiel #3
0
def document(request, abbr, session, bill_id, doc_id):
    '''
    Render one version (document) of a bill.

    The ``bill_id`` from the URL is passed through ``fix_bill_id``; when the
    fixed id with spaces removed differs from the URL's id, the request is
    redirected to the URL built from the fixed id.  Raises Http404 when the
    bill does not exist or has no version whose ``doc_id`` matches.

    Context:
        - abbr
        - session
        - bill
        - version
        - metadata
        - nav_active

    Templates:
        - billy/web/public/document.html
    '''
    # get fixed version
    fixed_bill_id = fix_bill_id(bill_id)
    # redirect if URL's id isn't fixed id without spaces
    if fixed_bill_id.replace(' ', '') != bill_id:
        return redirect('document', abbr=abbr, session=session,
                        bill_id=fixed_bill_id.replace(' ', ''), doc_id=doc_id)

    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr, 'session': session,
                              'bill_id': fixed_bill_id})

    # find_one returns None for an unknown bill; answer with a 404 instead of
    # failing on the subscript below.
    if not bill:
        raise Http404('No such bill.')

    # for/else: the else branch runs only when no version matched.
    for version in bill['versions']:
        if version['doc_id'] == doc_id:
            break
    else:
        raise Http404('No such document.')

    return render(request, templatename('document'),
                  dict(abbr=abbr, session=session, bill=bill, version=version,
                       metadata=bill.metadata, nav_active='bills'))
Beispiel #4
0
def import_events(abbr, data_dir, import_actions=False):
    '''Import every ``events/*.json`` file found under ``data_dir/abbr``.

    For each file the participants are resolved to ids (committees and
    legislators; other participant types get ``None`` and a warning), each
    related bill's ``bill_id`` is replaced by the ``_id`` of the matching
    bill document (``None`` if no bill matches) while the scraped id is
    kept under ``_scraped_bill_id``, and the event is handed to
    ``import_event``.  ``ensure_indexes`` is called once at the end.

    ``import_actions`` is accepted but not used here.
    '''
    event_glob = os.path.join(data_dir, abbr, 'events', '*.json')

    for event_path in glob.iglob(event_glob):
        with open(event_path) as handle:
            data = prepare_obj(json.load(handle))

        def _resolve_ctty(committee):
            return get_committee_id(
                data[settings.LEVEL_FIELD],
                committee['chamber'],
                committee['participant'],
            )

        def _resolve_leg(leg):
            # Only 'upper' and 'lower' are passed on as a chamber.
            chamber = leg['chamber']
            if chamber not in ('upper', 'lower'):
                chamber = None
            return get_legislator_id(
                abbr, data['session'], chamber, leg['participant'])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}

        for participant in data['participants']:
            kind = participant['participant_type']
            if kind in resolvers:
                resolved = resolvers[kind](participant)
            else:
                logger.warning("I don't know how to resolve a %s" % kind)
                resolved = None
            participant['id'] = resolved

        for related in data['related_bills']:
            scraped_id = related['bill_id']
            related['_scraped_bill_id'] = scraped_id
            bill_id = fix_bill_id(scraped_id)
            # Placeholder; replaced with the bill's _id once it is looked up.
            related['bill_id'] = ""

            # Match either the primary id or one of the alternate ids
            # within the same jurisdiction and session.
            scope = {settings.LEVEL_FIELD: abbr, 'session': data['session']}
            db_bill = db.bills.find_one({
                "$or": [
                    dict(scope, bill_id=bill_id),
                    dict(scope, alternate_bill_ids=bill_id),
                ]
            })

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {'_id': None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related['bill_id'] = db_bill['_id']

        import_event(data)

    ensure_indexes()
Beispiel #5
0
def bill(request, abbr, session, bill_id):
    '''
    Show the detail page for a single bill.

    The URL's ``bill_id`` is run through ``fix_bill_id``; if the fixed id
    without spaces differs from it, the request is redirected to the URL
    built from the fixed id.  Raises Http404 when no bill matches.  At most
    EVENT_PAGE_COUNT related events (sorted by "when", descending) are
    passed to the template.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - events
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    url_bill_id = fixed_bill_id.replace(' ', '')
    if url_bill_id != bill_id:
        # The URL does not carry the fixed, space-free id: redirect to it.
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=url_bill_id)

    bill_spec = {
        settings.LEVEL_FIELD: abbr,
        'session': session,
        'bill_id': fixed_bill_id,
    }
    found = db.bills.find_one(bill_spec)
    if found is None:
        message = u'no bill found {0} {1} {2}'.format(abbr, session, bill_id)
        raise Http404(message)

    event_spec = {
        settings.LEVEL_FIELD: abbr,
        "related_bills.bill_id": found['_id'],
    }
    cursor = db.events.find(event_spec).sort("when", -1)
    # Keep only the first page worth of events.
    events = list(cursor)[:EVENT_PAGE_COUNT]

    popularity.counter.inc('bills', found['_id'], abbr=abbr, session=session)

    show_all_sponsors = request.GET.get('show_all_sponsors')
    sponsors = found.sponsors_manager
    if not show_all_sponsors:
        sponsors = sponsors.first_fifteen

    template = templatename('bill')
    context = {
        'vote_preview_row_template': templatename('vote_preview_row'),
        'abbr': abbr,
        'metadata': Metadata.get_object(abbr),
        'bill': found,
        'events': events,
        'show_all_sponsors': show_all_sponsors,
        'sponsors': sponsors,
        'sources': found['sources'],
        'nav_active': 'bills',
    }
    return render(request, template, context)
Beispiel #6
0
def bill(request, abbr, session, bill_id):
    '''
    Bill detail view.

    Redirects to the URL built from the fixed bill id when the id in the
    URL is not already the fixed id with spaces removed, raises Http404 for
    an unknown bill, and otherwise renders the bill together with up to
    EVENT_PAGE_COUNT of its related events and its sponsors (all of them
    when the ``show_all_sponsors`` GET parameter is set, else the first
    fifteen).

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - events
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    '''
    fixed_id = fix_bill_id(bill_id)
    compact_id = fixed_id.replace(' ', '')
    if compact_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=compact_id)

    bill_doc = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': session,
        'bill_id': fixed_id,
    })
    if bill_doc is None:
        raise Http404(
            u'no bill found {0} {1} {2}'.format(abbr, session, bill_id))

    # Newest events first, truncated to a single page.
    related_events = db.events.find({
        settings.LEVEL_FIELD: abbr,
        "related_bills.bill_id": bill_doc['_id'],
    })
    events = list(related_events.sort("when", -1))
    events = events[:EVENT_PAGE_COUNT]

    popularity.counter.inc(
        'bills', bill_doc['_id'], abbr=abbr, session=session)

    show_all_sponsors = request.GET.get('show_all_sponsors')
    manager = bill_doc.sponsors_manager
    sponsors = manager if show_all_sponsors else manager.first_fifteen

    template = templatename('bill')
    context = dict(
        vote_preview_row_template=templatename('vote_preview_row'),
        abbr=abbr,
        metadata=Metadata.get_object(abbr),
        bill=bill_doc,
        events=events,
        show_all_sponsors=show_all_sponsors,
        sponsors=sponsors,
        sources=bill_doc['sources'],
        nav_active='bills',
    )
    return render(request, template, context)
Beispiel #7
0
def bill(request, abbr, session, bill_id):
    """
    Render the page for one bill.

    A request whose ``bill_id`` is not the fixed id (spaces stripped) is
    redirected to the "bill" URL for the fixed id.  Http404 is raised when
    the bill cannot be found.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - events
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    """
    normalized = fix_bill_id(bill_id)
    normalized_no_spaces = normalized.replace(" ", "")
    if normalized_no_spaces != bill_id:
        return redirect(
            "bill", abbr=abbr, session=session, bill_id=normalized_no_spaces
        )

    query = {
        settings.LEVEL_FIELD: abbr,
        "session": session,
        "bill_id": normalized,
    }
    record = db.bills.find_one(query)
    if record is None:
        raise Http404(
            u"no bill found {0} {1} {2}".format(abbr, session, bill_id)
        )

    events_query = {
        settings.LEVEL_FIELD: abbr,
        "related_bills.bill_id": record["_id"],
    }
    # Sorted by "when" descending; only the first page is kept.
    events = list(db.events.find(events_query).sort("when", -1))
    events = events[:EVENT_PAGE_COUNT]

    popularity.counter.inc("bills", record["_id"], abbr=abbr, session=session)

    show_all_sponsors = request.GET.get("show_all_sponsors")
    sponsors = record.sponsors_manager
    if not show_all_sponsors:
        sponsors = sponsors.first_fifteen

    template = templatename("bill")
    context = {
        "vote_preview_row_template": templatename("vote_preview_row"),
        "abbr": abbr,
        "metadata": Metadata.get_object(abbr),
        "bill": record,
        "events": events,
        "show_all_sponsors": show_all_sponsors,
        "sponsors": sponsors,
        "sources": record["sources"],
        "nav_active": "bills",
    }
    return render(request, template, context)
Beispiel #8
0
 def func(request, abbr, session, bill_id, key):
     """Render the ``bill_all_<key>`` template for one bill.

     Redirects to the "bill" URL for the fixed id when the URL's ``bill_id``
     is not the fixed id with spaces removed; raises Http404 when the bill
     is not found.
     """
     canonical_id = fix_bill_id(bill_id)
     compact_id = canonical_id.replace(" ", "")
     if compact_id != bill_id:
         return redirect("bill", abbr=abbr, session=session, bill_id=compact_id)

     lookup = {
         settings.LEVEL_FIELD: abbr,
         "session": session,
         "bill_id": canonical_id,
     }
     bill = db.bills.find_one(lookup)
     if bill is None:
         message = "no bill found {0} {1} {2}".format(abbr, session, bill_id)
         raise Http404(message)

     template = templatename("bill_all_%s" % key)
     context = {
         "abbr": abbr,
         "metadata": Metadata.get_object(abbr),
         "bill": bill,
         "sources": bill["sources"],
         "nav_active": "bills",
     }
     return render(request, template, context)
Beispiel #9
0
def import_events(abbr, data_dir, import_actions=False):
    """Import each ``events/*.json`` file found under ``data_dir/abbr``.

    Participants of type "committee" or "legislator" are resolved to ids;
    any other type is logged and given ``None``.  Every related bill gets
    its ``bill_id`` replaced by the fixed id and an ``id`` key holding the
    ``_id`` of the matching bill document (``None`` when nothing matches).
    The resulting data is passed to ``import_event``.

    ``import_actions`` is accepted but not used here.
    """
    events_pattern = os.path.join(data_dir, abbr, "events", "*.json")

    for json_path in glob.iglob(events_pattern):
        with open(json_path) as fh:
            data = prepare_obj(json.load(fh))

        def _resolve_ctty(committee):
            level = data[settings.LEVEL_FIELD]
            return get_committee_id(level, committee["chamber"], committee["participant"])

        def _resolve_leg(leg):
            # Chambers other than upper/lower are not passed to the lookup.
            chamber = leg["chamber"]
            if chamber not in ("upper", "lower"):
                chamber = None
            return get_legislator_id(abbr, data["session"], chamber, leg["participant"])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}

        for participant in data["participants"]:
            participant_type = participant["participant_type"]
            if participant_type in resolvers:
                resolved_id = resolvers[participant_type](participant)
            else:
                logger.warning("I don't know how to resolve a %s" % participant_type)
                resolved_id = None
            participant["id"] = resolved_id

        for related in data["related_bills"]:
            bill_id = fix_bill_id(related["bill_id"])

            # Look the bill up by primary id or by any alternate id.
            common = {settings.LEVEL_FIELD: abbr, "session": data["session"]}
            alternatives = [
                dict(common, bill_id=bill_id),
                dict(common, alternate_bill_ids=bill_id),
            ]
            db_bill = db.bills.find_one({"$or": alternatives})

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {"_id": None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related["id"] = db_bill["_id"]
            related["bill_id"] = bill_id

        import_event(data)
Beispiel #10
0
def document(request, abbr, session, bill_id, doc_id):
    '''
    Show one version (document) of a bill.

    Redirects to the "document" URL for the fixed bill id when the URL's
    ``bill_id`` is not the fixed id with spaces removed.  Raises Http404
    when the bill is missing or none of its versions has the requested
    ``doc_id``.  When ``settings.ENABLE_DOCUMENT_VIEW`` has no truthy entry
    for ``abbr``, redirects to the version's ``url`` instead of rendering.

    Context:
        - abbr
        - session
        - bill
        - version
        - metadata
        - nav_active

    Templates:
        - billy/web/public/document.html
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    spaceless_id = fixed_bill_id.replace(' ', '')
    if spaceless_id != bill_id:
        return redirect('document', abbr=abbr, session=session,
                        bill_id=spaceless_id, doc_id=doc_id)

    lookup = {
        settings.LEVEL_FIELD: abbr,
        'session': session,
        'bill_id': fixed_bill_id,
    }
    bill = db.bills.find_one(lookup)
    if not bill:
        raise Http404('No such bill.')

    # First version with the requested doc_id, or None if there is none.
    version = next(
        (v for v in bill['versions'] if v['doc_id'] == doc_id), None)
    if version is None:
        raise Http404('No such document.')

    view_enabled = settings.ENABLE_DOCUMENT_VIEW.get(abbr, False)
    if not view_enabled:
        return redirect(version['url'])

    template = templatename('document')
    context = {
        'abbr': abbr,
        'session': session,
        'bill': bill,
        'version': version,
        'metadata': bill.metadata,
        'nav_active': 'bills',
    }
    return render(request, template, context)
Beispiel #11
0
 def func(request, abbr, session, bill_id, key):
     '''Render the ``bill_all_<key>`` page of a bill.

     The request is redirected to the "bill" URL for the fixed id unless
     ``bill_id`` already equals the fixed id with spaces removed.  Raises
     Http404 when no bill matches.
     '''
     fixed_id = fix_bill_id(bill_id)
     url_id = fixed_id.replace(' ', '')
     if url_id != bill_id:
         return redirect('bill', abbr=abbr, session=session, bill_id=url_id)

     spec = {settings.LEVEL_FIELD: abbr}
     spec['session'] = session
     spec['bill_id'] = fixed_id
     bill = db.bills.find_one(spec)
     if bill is None:
         not_found = 'no bill found {0} {1} {2}'.format(
             abbr, session, bill_id)
         raise Http404(not_found)

     template = templatename('bill_all_%s' % key)
     context = dict(
         abbr=abbr,
         metadata=Metadata.get_object(abbr),
         bill=bill,
         sources=bill['sources'],
         nav_active='bills',
     )
     return render(request, template, context)
Beispiel #12
0
def load_standalone_votes(data_dir):
    '''Load the vote JSON files under ``data_dir/votes``.

    Returns a ``defaultdict(list)`` mapping
    ``(bill_chamber, session, fixed bill_id)`` to the list of vote dicts
    for that bill.  The ``bill_id`` key is removed from each vote dict.
    '''
    vote_paths = glob.glob(os.path.join(data_dir, 'votes', '*.json'))
    grouped = defaultdict(list)

    for vote_path in vote_paths:
        with open(vote_path) as handle:
            vote = prepare_obj(json.load(handle))

        # The id has to match the form already stored in the database.
        fixed_id = fix_bill_id(vote.pop('bill_id'))
        group_key = (vote['bill_chamber'], vote['session'], fixed_id)
        grouped[group_key].append(vote)

    logger.info('imported %s vote files' % len(vote_paths))
    return grouped
Beispiel #13
0
 def func(request, abbr, session, bill_id, key):
     '''View for the ``bill_all_<key>`` template of a single bill.

     Redirects to the "bill" URL when the URL's id differs from the fixed
     id with spaces removed, and raises Http404 for an unknown bill.
     '''
     normalized_id = fix_bill_id(bill_id)
     id_without_spaces = normalized_id.replace(' ', '')
     if id_without_spaces != bill_id:
         return redirect(
             'bill', abbr=abbr, session=session, bill_id=id_without_spaces)

     bill = db.bills.find_one({
         settings.LEVEL_FIELD: abbr,
         'session': session,
         'bill_id': normalized_id,
     })
     if bill is None:
         raise Http404(
             'no bill found {0} {1} {2}'.format(abbr, session, bill_id))

     template_name = templatename('bill_all_%s' % key)
     template_context = {
         'abbr': abbr,
         'metadata': Metadata.get_object(abbr),
         'bill': bill,
         'sources': bill['sources'],
         'nav_active': 'bills',
     }
     return render(request, template_name, template_context)
Beispiel #14
0
def load_standalone_votes(data_dir):
    '''Read every ``votes/*.json`` file in ``data_dir`` and group the votes.

    Each file is loaded, passed through ``prepare_obj`` and filed under the
    key ``(bill_chamber, session, bill_id)``, where ``bill_id`` is the
    popped value run through ``fix_bill_id``.  Returns the resulting
    ``defaultdict(list)``.
    '''
    files = glob.glob(os.path.join(data_dir, 'votes', '*.json'))
    by_bill = defaultdict(list)

    for filename in files:
        with open(filename) as source:
            raw = json.load(source)
        record = prepare_obj(raw)

        # need to match bill_id already in the database
        matched_id = fix_bill_id(record.pop('bill_id'))

        chamber = record['bill_chamber']
        session = record['session']
        by_bill[(chamber, session, matched_id)].append(record)

    logger.info('imported %s vote files' % len(files))
    return by_bill
Beispiel #15
0
def bill(request, abbr, session, bill_id):
    '''
    Render a bill's detail page.

    A URL whose ``bill_id`` is not the fixed id with spaces removed is
    redirected to the "bill" URL for the fixed id; an unknown bill raises
    Http404.  All sponsors are shown when the ``show_all_sponsors`` GET
    parameter is set, otherwise only the first fifteen.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    stripped_id = fixed_bill_id.replace(' ', '')
    if stripped_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=stripped_id)

    spec = {
        settings.LEVEL_FIELD: abbr,
        'session': session,
        'bill_id': fixed_bill_id,
    }
    bill_obj = db.bills.find_one(spec)
    if bill_obj is None:
        raise Http404(
            u'no bill found {0} {1} {2}'.format(abbr, session, bill_id))

    show_all_sponsors = request.GET.get('show_all_sponsors')
    sponsors = bill_obj.sponsors_manager
    if not show_all_sponsors:
        sponsors = sponsors.first_fifteen

    template = templatename('bill')
    context = {
        'vote_preview_row_template': templatename('vote_preview_row'),
        'abbr': abbr,
        'metadata': Metadata.get_object(abbr),
        'bill': bill_obj,
        'show_all_sponsors': show_all_sponsors,
        'sponsors': sponsors,
        'sources': bill_obj['sources'],
        'nav_active': 'bills',
    }
    return render(request, template, context)
Beispiel #16
0
def _build_mongo_filter(request, keys, icase=True):
    _filter = {}
    keys = set(keys) - set(['fields'])

    for key in keys:
        value = request.GET.get(key)
        if value:
            if key in _lower_fields:
                _filter[key] = value.lower()
            elif key.endswith('__in'):
                values = value.split('|')
                _filter[key[:-4]] = values
            elif key == 'bill_id':
                _filter[key] = fix_bill_id(value.upper())
            else:
                # We use regex queries to get case insensitive search - this
                # means they won't use any indexes for now. Real case
                # insensitive queries are coming eventually:
                # http://jira.mongodb.org/browse/SERVER-90
                _filter[key] = re.compile('^%s$' % value, re.IGNORECASE)

    return _filter
Beispiel #17
0
def _build_mongo_filter(request, keys, icase=True):
    _filter = {}
    keys = set(keys) - set(['fields'])

    for key in keys:
        value = request.GET.get(key)
        if value:
            if key in _lower_fields:
                _filter[key] = value.lower()
            elif key.endswith('__in'):
                values = value.split('|')
                _filter[key[:-4]] = values
            elif key == 'bill_id':
                _filter[key] = fix_bill_id(value.upper())
            else:
                # We use regex queries to get case insensitive search - this
                # means they won't use any indexes for now. Real case
                # insensitive queries are coming eventually:
                # http://jira.mongodb.org/browse/SERVER-90
                _filter[key] = re.compile('^%s$' % value, re.IGNORECASE)

    return _filter
Beispiel #18
0
def document(request, abbr, session, bill_id, doc_id):
    """
    Display a single version (document) of a bill.

    If the URL's ``bill_id`` differs from the fixed id with spaces removed,
    the request is redirected to the "document" URL for the fixed id.
    Http404 is raised when the bill is not found or has no version with the
    given ``doc_id``.  Unless ``settings.ENABLE_DOCUMENT_VIEW`` holds a
    truthy value for ``abbr``, the response is a redirect to the version's
    ``url``.

    Context:
        - abbr
        - session
        - bill
        - version
        - metadata
        - nav_active

    Templates:
        - billy/web/public/document.html
    """
    canonical_id = fix_bill_id(bill_id)
    canonical_no_spaces = canonical_id.replace(" ", "")
    if canonical_no_spaces != bill_id:
        return redirect(
            "document",
            abbr=abbr,
            session=session,
            bill_id=canonical_no_spaces,
            doc_id=doc_id,
        )

    bill = db.bills.find_one(
        {
            settings.LEVEL_FIELD: abbr,
            "session": session,
            "bill_id": canonical_id,
        }
    )
    if not bill:
        raise Http404("No such bill.")

    # Stop at the first version carrying the requested doc_id.
    matching = (v for v in bill["versions"] if v["doc_id"] == doc_id)
    version = next(matching, None)
    if version is None:
        raise Http404("No such document.")

    if not settings.ENABLE_DOCUMENT_VIEW.get(abbr, False):
        return redirect(version["url"])

    template = templatename("document")
    context = {
        "abbr": abbr,
        "session": session,
        "bill": bill,
        "version": version,
        "metadata": bill.metadata,
        "nav_active": "bills",
    }
    return render(request, template, context)
Beispiel #19
0
def search_by_bill_id(abbr, search_text):
    '''Find bills with ids like "HB1234".

    ``search_text`` is only treated as a bill id when it contains a digit.
    It is then run through ``fix_bill_id``, upper-cased and looked up
    exactly; if nothing matches, bills whose ``bill_id`` contains that text
    are searched instead.  ``abbr`` restricts the search to one
    jurisdiction unless it is ``'all'``.

    Returns the matching bills sorted by ``session`` in descending order,
    or ``None`` when the input has no digit or nothing matched.
    '''
    # Input without a digit is not treated as a bill id.
    if not re.search(r'\d', search_text):
        return None

    bill_id = fix_bill_id(search_text).upper()
    collection = db.bills
    spec = {'bill_id': bill_id}

    if abbr != 'all':
        spec[settings.LEVEL_FIELD] = abbr

    docs = collection.find(spec)

    # No exact match: fall back to a substring (regex) search. The text is
    # user input, so it is escaped to be matched literally.
    if 0 == docs.count():
        spec['bill_id'] = {'$regex': re.escape(bill_id)}
        docs = collection.find(spec)

    # If there were actual results, return them, latest session first.
    if 0 < docs.count():
        return sorted(docs,
                      key=operator.itemgetter('session'),
                      reverse=True)

    return None
Beispiel #20
0
def search_by_bill_id(abbr, search_text):
    '''Find bills with ids like "HB1234".

    Returns a list of bill documents sorted by ``session`` (descending), or
    ``None`` when ``search_text`` contains no digit or no bill matches.

    The text is passed through ``fix_bill_id`` and upper-cased.  An exact
    ``bill_id`` match is tried first; when it finds nothing, a regex search
    for the same text is used.  Unless ``abbr`` is ``'all'``, only bills of
    that jurisdiction are considered.
    '''
    looks_like_bill_id = re.search(r'\d', search_text)
    if not looks_like_bill_id:
        return None

    bill_id = fix_bill_id(search_text).upper()
    collection = db.bills

    spec = {}
    spec.update(bill_id=bill_id)
    if abbr != 'all':
        spec[settings.LEVEL_FIELD] = abbr

    docs = collection.find(spec)

    if 0 == docs.count():
        # Nothing matched exactly, so search for ids containing the text.
        # Escape it first: it originates from user input and must not be
        # interpreted as regex syntax.
        spec['bill_id'] = {'$regex': re.escape(bill_id)}
        docs = collection.find(spec)

    if 0 == docs.count():
        return None

    by_session = operator.itemgetter('session')
    return sorted(docs, key=by_session, reverse=True)
Beispiel #21
0
def import_bill(data, standalone_votes, categorizer):
    """
        Insert or update a bill.

        Normalizes bill ids, looks up companion bills, matches document,
        sponsor and action-entity ids, links votes to actions, computes
        ``action_dates``, ``_term`` and ``alternate_titles``, then either
        inserts the bill or updates the stored bill with the same
        level/session/chamber/bill_id.

        data - raw bill JSON (dict); modified in place
        standalone_votes - votes scraped separately, keyed by
            (chamber, session, bill_id); the entry for this bill is
            popped out of it
        categorizer - SubjectCategorizer (None - no categorization)

        Returns "insert" when no stored bill matched, otherwise "update".
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        # a falsy chamber is dropped so it doesn't restrict the lookup
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent
    # (ids are learned from the stored bill, when there is one, before
    # ids are set on the incoming versions and documents)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        # NOTE(review): takes the second whitespace-separated token, so a
        # bill_id without whitespace raises IndexError here
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    # do id matching and other vote prep
    # (the stored bill's _id is passed when one exists, None otherwise)
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    # milestone dates, filled in while walking the actions below
    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    # action types for which a matching vote is looked up
    vote_flags = {
        "bill:passed", "bill:failed", "bill:veto_override:passed",
        "bill:veto_override:failed", "amendment:passed", "amendment:failed",
        "committee:passed", "committee:passed:favorable",
        "committee:passed:unfavorable", "committee:passed:failed"
    }
    # vote_ids attached to some action so far
    already_linked = set()
    # vote_ids attached to more than one action; stripped after the loop
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        # resolvers are defined per action because they read the current
        # action's actor
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'], action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    # (the entity is skipped and gets no 'id')
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        # (only the first qualifying action sets each date, and a single
        # action sets at most one of them because of the elif chain)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
              and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # a vote matches when it is less than 20 hours from the
                # action and its chamber equals the action's actor
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20)
                        and vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        # seen on an earlier action too: mark it for
                        # removal from every action after the loop
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    # a set is used so duplicate titles collapse
    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    # no stored bill: the incoming data is inserted; otherwise the stored
    # bill is updated from data and the stored object is what gets passed
    # on to the search push, git and vote-saving steps
    if not bill:
        insert_with_id(data)
        elasticsearch_push(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        elasticsearch_push(bill)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
Beispiel #22
0
    def search(query=None, abbr=None, chamber=None, subjects=None,
               bill_id=None, bill_id__in=None, search_window=None,
               updated_since=None, sponsor_id=None, bill_fields=None,
               status=None, type_=None, session=None):
        """
            Search bills, returning the result of a ``db.bills.find`` call.

            A Mongo filter is built from the keyword arguments.  When
            ``query`` is given and ``settings.ENABLE_ELASTICSEARCH`` is on,
            a query containing digits is first tried as a bill id lookup;
            otherwise (or if that finds nothing) the query is run as a
            full-text search and the Mongo filter is restricted to the
            returned document ids.  Without Elasticsearch, ``query`` is
            matched case-insensitively against the bill title.

            query - full-text query / bill id text
            abbr - value matched against ``settings.LEVEL_FIELD``
            chamber, bill_id, session, type_ - matched as-is
            subjects - matched as-is in Mongo; the full-text path reads
                ``subjects['$all']``, so it assumes a ``{'$all': [...]}``
                dict there
            bill_id__in - accepted but not used by this implementation
            search_window - 'session', 'term', 'all', 'session:<name>' or
                'term:<name>'
            updated_since - date string parsed by ``parse_param_dt``
            sponsor_id - matched against ``sponsors.leg_id``
            bill_fields - passed to ``find`` as ``fields``
            status - dict merged into the filter, e.g.
                {'action_dates.signed': {'$ne': None}}

            Raises ValueError for an unknown ``search_window`` or an
            unparseable ``updated_since``.
        """
        _filter = {}
        for key, value in [(settings.LEVEL_FIELD, abbr),
                           ('chamber', chamber),
                           ('subjects', subjects),
                           ('bill_id', bill_id),
                          ]:
            if value is not None:
                _filter[key] = value

        if search_window:
            if search_window == 'session':
                _filter['_current_session'] = True
            elif search_window == 'term':
                _filter['_current_term'] = True
            elif search_window.startswith('session:'):
                _filter['session'] = search_window.split('session:')[1]
            elif search_window.startswith('term:'):
                _filter['_term'] = search_window.split('term:')[1]
            elif search_window != 'all':
                raise ValueError('invalid search_window. valid choices are '
                                 ' "term", "session", "all"')

        if updated_since:
            try:
                _filter['updated_at'] = {'$gte': parse_param_dt(updated_since)}
            except ValueError:
                raise ValueError('invalid updated_since parameter. '
                                 'please supply date in YYYY-MM-DD format')
        if sponsor_id:
            _filter['sponsors.leg_id'] = sponsor_id

        if status:
            # Status is slightly different: it's a dict like--
            # {'action_dates.signed': {'$ne': None}}
            _filter.update(**status)

        if type_:
            _filter['type'] = type_

        # an explicit session overrides any 'session:<name>' search_window
        if session:
            _filter['session'] = session

        # process full-text query
        if query and settings.ENABLE_ELASTICSEARCH:
            # block spammers, possibly move to a BANNED_SEARCH_LIST setting
            # (presumably no bill has a null level field, so this yields an
            # empty result -- TODO confirm)
            if '<a href' in query:
                return db.bills.find({settings.LEVEL_FIELD: None})

            # if query contains a digit try it as a bill id first
            if re.search(r'\d', query):
                _id_filter = dict(_filter)
                normalized_id = fix_bill_id(query).upper()

                # if query is entirely numeric make it a regex
                if not re.search(r'\D', query):
                    _id_filter['bill_id'] = {'$regex': normalized_id}
                else:
                    _id_filter['bill_id'] = normalized_id

                # check for a result; fall through to full-text otherwise
                result = db.bills.find(_id_filter, fields=bill_fields)
                if result.count():
                    return result

            es_query = {"query_string": {"fields": ["text", "title"],
                                         "default_operator": "AND",
                                         "query": query}}
            es_search = pyes.Search(es_query, fields=[])

            # take terms from mongo query
            # (each one is popped so it is applied by the search engine
            # only, not again by Mongo)
            es_terms = []
            if settings.LEVEL_FIELD in _filter:
                es_terms.append(pyes.TermFilter(
                    settings.LEVEL_FIELD, _filter.pop(settings.LEVEL_FIELD)))
            if 'session' in _filter:
                es_terms.append(pyes.TermFilter('session',
                                                _filter.pop('session')))
            if 'chamber' in _filter:
                es_terms.append(pyes.TermFilter('chamber',
                                                _filter.pop('chamber')))
            if 'subjects' in _filter:
                es_terms.append(pyes.TermFilter(
                    'subjects', _filter.pop('subjects')['$all']))
            if 'sponsors.leg_id' in _filter:
                es_terms.append(pyes.TermFilter(
                    'sponsors', _filter.pop('sponsors.leg_id')))

            # add terms
            if es_terms:
                es_search.filter = pyes.ANDFilter(es_terms)

            # page size is a guess, could use tweaks
            es_result = elasticsearch.search(es_search, search_type='scan',
                                             scroll='3m', size=250)
            doc_ids = [r.get_id() for r in es_result]
            _filter['versions.doc_id'] = {'$in': doc_ids}
        elif query:
            _filter['title'] = {'$regex': query, '$options': 'i'}

        # return query
        return db.bills.find(_filter, fields=bill_fields)
Beispiel #23
0
    def search(query=None,
               abbr=None,
               chamber=None,
               subjects=None,
               bill_id=None,
               search_window=None,
               updated_since=None,
               last_action_since=None,
               sponsor_id=None,
               status=None,
               type_=None,
               session=None,
               bill_fields=None,
               sort=None,
               limit=None):
        """
            Build a bill search and return it as a ``BillSearchResults``.

            Every argument is turned into either an Elasticsearch term or
            a Mongo filter entry.  Elasticsearch is used only when
            ``query`` is given, ``settings.ENABLE_ELASTICSEARCH`` is on and
            the query contains no digit; a query with digits becomes a
            Mongo ``bill_id`` filter instead.

            query - full-text query / bill id text
            abbr - jurisdiction abbreviation
            chamber, type_, session - simple equality terms
            subjects - iterable of subjects, all of which must match
            bill_id - a single id, or a list of ids (Mongo ``$in``)
            search_window - 'session', 'term', 'all', 'session:<name>' or
                'term:<name>'
            updated_since, last_action_since - date strings; parsed with
                ``parse_param_dt`` on the Mongo path
            sponsor_id - legislator id of a sponsor
            status - list of ``action_dates`` keys that must be set
            bill_fields - passed through to ``BillSearchResults``
            sort - an ``action_dates`` key, 'updated_at' or 'created_at';
                anything else sorts on 'action_dates.last'
            limit - accepted but not used by this implementation

            Raises PermissionDenied when the query contains '<a href', and
            ValueError for an unknown ``search_window`` or an unparseable
            date on the Mongo path.
        """

        use_elasticsearch = False
        numeric_query = False
        mongo_filter = {}
        es_terms = []

        if status is None:
            status = []

        if query:
            use_elasticsearch = settings.ENABLE_ELASTICSEARCH

            # spammers get a 400
            if '<a href' in query:
                raise PermissionDenied('html detected')

            # if query is numeric convert to an id filter
            #   (TODO: maybe this should be an $or)
            if re.search(r'\d', query):
                normalized_id = fix_bill_id(query).upper()
                # if query is entirely numeric make it a regex and hit mongo
                if not re.search(r'\D', query):
                    mongo_filter['bill_id'] = {'$regex': normalized_id}
                else:
                    mongo_filter['bill_id'] = normalized_id
                use_elasticsearch = False
                numeric_query = True

        # handle abbr
        if abbr and use_elasticsearch:
            es_terms.append({'term': {'jurisdiction': abbr}})
        elif abbr:
            mongo_filter[settings.LEVEL_FIELD] = abbr

        # sponsor_id
        if sponsor_id and use_elasticsearch:
            es_terms.append({'term': {'sponsor_ids': sponsor_id}})
        elif sponsor_id:
            mongo_filter['sponsors.leg_id'] = sponsor_id

        # handle simple term arguments (chamber, bill_id, type, session)
        if isinstance(bill_id, list) and not use_elasticsearch:
            bill_id = {'$in': bill_id}
        simple_args = {
            'chamber': chamber,
            'bill_id': bill_id,
            'type': type_,
            'session': session
        }
        if search_window:
            if search_window == 'session':
                simple_args['_current_session'] = True
            elif search_window == 'term':
                simple_args['_current_term'] = True
            elif search_window.startswith('session:'):
                simple_args['session'] = search_window.split('session:')[1]
            elif search_window.startswith('term:'):
                simple_args['_term'] = search_window.split('term:')[1]
            elif search_window != 'all':
                raise ValueError('invalid search_window. valid choices are '
                                 ' "term", "session", "all"')
        # .items() rather than the Python-2-only .iteritems()
        for key, value in simple_args.items():
            if value is not None:
                if use_elasticsearch:
                    es_terms.append({'term': {key: value}})
                else:
                    # note: an explicit bill_id replaces the id filter
                    # derived from a numeric query above
                    mongo_filter[key] = value

        if subjects and use_elasticsearch:
            for subject in subjects:
                es_terms.append({'term': {'subjects': subject}})
        elif subjects:
            # drop empty entries; built as a real list so the value is a
            # list on every Python version
            mongo_filter['subjects'] = {'$all': [s for s in subjects if s]}

        if updated_since and use_elasticsearch:
            es_terms.append({'range': {'updated_at': {'gte': updated_since}}})
        elif updated_since:
            try:
                mongo_filter['updated_at'] = {
                    '$gte': parse_param_dt(updated_since)
                }
            except ValueError:
                raise ValueError('invalid updated_since parameter. '
                                 'please supply date in YYYY-MM-DD format')

        if last_action_since and use_elasticsearch:
            es_terms.append(
                {'range': {
                    'action_dates.last': {
                        'gte': last_action_since
                    }
                }})
        elif last_action_since:
            try:
                mongo_filter['action_dates.last'] = {
                    '$gte': parse_param_dt(last_action_since)
                }
            except ValueError:
                raise ValueError('invalid last_action_since parameter. '
                                 'please supply date in YYYY-MM-DD format')

        # Status comes in as a list and needs to become:
        # {'action_dates.signed': {'$ne': None}}
        status_spec = [{'action_dates.%s' % _status: {'$ne': None}}
                       for _status in status]

        if len(status_spec) == 1:
            status_spec = status_spec[0]
        elif len(status_spec) > 1:
            status_spec = {'$and': status_spec}

        if status_spec and use_elasticsearch:
            # NOTE(review): the bare status key is used as the field name
            # here while the Mongo path uses 'action_dates.<key>' --
            # confirm against the search index mapping
            for key in status:
                es_terms.append({'exists': {'field': key}})
        elif status_spec:
            mongo_filter.update(**status_spec)

        # preprocess sort
        if sort in ('first', 'last', 'signed', 'passed_lower', 'passed_upper'):
            sort = 'action_dates.' + sort
        elif sort not in ('updated_at', 'created_at'):
            sort = 'action_dates.last'

        # do the actual ES query
        if query and use_elasticsearch:
            search = {
                'query': {
                    "query_string": {
                        "fields": ["text", "title"],
                        "default_operator": "AND",
                        "query": query
                    }
                }
            }
            if es_terms:
                # wrap the text query and the term filters together
                search['filter'] = {'and': es_terms}
                search = {'query': {'filtered': search}}
            search['fields'] = []
            return BillSearchResults(search, None, sort, bill_fields)

        elif query and not numeric_query:
            mongo_filter['title'] = {'$regex': query, '$options': 'i'}

        return BillSearchResults(None, mongo_filter, sort, bill_fields)
Beispiel #24
0
def import_bill(data, standalone_votes, categorizer):
    """
        Insert or update a bill.

        Normalizes bill ids, looks up companion bills, matches document,
        sponsor and action-entity ids, links votes to actions, computes
        ``action_dates``, ``_term`` and ``alternate_titles``, tracks each
        version, then either inserts the bill or updates the stored bill
        with the same level/session/chamber/bill_id.

        data - raw bill JSON (dict); modified in place
        standalone_votes - votes scraped separately, keyed by
            (chamber, session, bill_id); the entry for this bill is
            popped out of it
        categorizer - SubjectCategorizer (None - no categorization)

        Returns "insert" when no stored bill matched, otherwise "update".
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        # a falsy chamber is dropped so it doesn't restrict the lookup
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    # (ids are learned from the stored bill, when there is one, before
    # ids are set on the incoming versions and documents)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        # NOTE(review): takes the second whitespace-separated token, so a
        # bill_id without whitespace raises IndexError here
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    # (the stored bill's _id is passed when one exists, None otherwise)
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    # milestone dates, filled in while walking the actions below
    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    # action types for which a matching vote is looked up
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    # vote_ids attached to some action so far
    already_linked = set()
    # vote_ids attached to more than one action; stripped after the loop
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        # resolvers are defined per action because they read the current
        # action's actor
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    # (the entity is skipped and gets no 'id')
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        # (only the first qualifying action sets each date, and a single
        # action sets at most one of them because of the elif chain)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # a vote matches when it is less than 20 hours from the
                # action and its chamber equals the action's actor
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        # seen on an earlier action too: mark it for
                        # removal from every action after the loop
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    # a set is used so duplicate titles collapse
    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    # no stored bill: the incoming data is inserted; otherwise the stored
    # bill is updated from data and the stored object is what gets passed
    # on to the git and vote-saving steps
    if not bill:
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
Beispiel #25
0
    def search(query=None,
               abbr=None,
               chamber=None,
               subjects=None,
               bill_id=None,
               bill_id__in=None,
               search_window=None,
               updated_since=None,
               sponsor_id=None,
               bill_fields=None,
               status=None,
               type_=None,
               session=None):
        """
            Search bills, returning the result of a ``db.bills.find`` call.

            A Mongo filter is built from the keyword arguments.  When
            ``query`` is given and ``settings.ENABLE_ELASTICSEARCH`` is on,
            a query containing digits is first tried as an exact bill id
            lookup; if that matches nothing the query is run as a
            full-text search and the Mongo filter is restricted to the
            returned document ids.  Without Elasticsearch, ``query`` is
            matched case-insensitively against the bill title.

            query - full-text query / bill id text
            abbr - value matched against ``settings.LEVEL_FIELD``
            chamber, bill_id, session, type_ - matched as-is
            subjects - matched as-is in Mongo; the full-text path reads
                ``subjects['$all']``, so it assumes a ``{'$all': [...]}``
                dict there
            bill_id__in - accepted but not used by this implementation
            search_window - 'session', 'term', 'all', 'session:<name>' or
                'term:<name>'
            updated_since - date string parsed by ``parse_param_dt``
            sponsor_id - matched against ``sponsors.leg_id``
            bill_fields - field selection passed to ``find``
            status - dict merged into the filter, e.g.
                {'action_dates.signed': {'$ne': None}}

            Raises ValueError for an unknown ``search_window`` or an
            unparseable ``updated_since``.
        """
        _filter = {}
        for key, value in [
            (settings.LEVEL_FIELD, abbr),
            ('chamber', chamber),
            ('subjects', subjects),
            ('bill_id', bill_id),
        ]:
            if value is not None:
                _filter[key] = value

        if search_window:
            if search_window == 'session':
                _filter['_current_session'] = True
            elif search_window == 'term':
                _filter['_current_term'] = True
            elif search_window.startswith('session:'):
                _filter['session'] = search_window.split('session:')[1]
            elif search_window.startswith('term:'):
                _filter['_term'] = search_window.split('term:')[1]
            elif search_window != 'all':
                raise ValueError('invalid search_window. valid choices are '
                                 ' "term", "session", "all"')

        if updated_since:
            try:
                _filter['updated_at'] = {'$gte': parse_param_dt(updated_since)}
            except ValueError:
                raise ValueError('invalid updated_since parameter. '
                                 'please supply date in YYYY-MM-DD format')
        if sponsor_id:
            _filter['sponsors.leg_id'] = sponsor_id

        if status:
            # Status is slightly different: it's a dict like--
            # {'action_dates.signed': {'$ne': None}}
            _filter.update(**status)

        if type_:
            _filter['type'] = type_

        # an explicit session overrides any 'session:<name>' search_window
        if session:
            _filter['session'] = session

        # process full-text query
        if query and settings.ENABLE_ELASTICSEARCH:
            # block spammers, possibly move to a BANNED_SEARCH_LIST setting
            # (presumably no bill has a null level field, so this yields an
            # empty result -- TODO confirm)
            if '<a href' in query:
                return db.bills.find({settings.LEVEL_FIELD: None})

            # if query contains a digit try it as a bill id first
            if re.search(r'\d', query):
                _id_filter = dict(_filter)
                _id_filter['bill_id'] = fix_bill_id(query).upper()
                result = db.bills.find(_id_filter, bill_fields)
                # The cursor object itself is truthy even when nothing
                # matched, so the match count has to be checked; otherwise
                # the full-text search below would never run for queries
                # that contain a digit.
                if result.count():
                    return result

            es_query = {
                "query_string": {
                    "fields": ["text", "title"],
                    "default_operator": "AND",
                    "query": query
                }
            }
            es_search = pyes.Search(es_query, fields=[])

            # take terms from mongo query
            # (each one is popped so it is applied by the search engine
            # only, not again by Mongo)
            es_terms = []
            if settings.LEVEL_FIELD in _filter:
                es_terms.append(
                    pyes.TermFilter(settings.LEVEL_FIELD,
                                    _filter.pop(settings.LEVEL_FIELD)))
            if 'session' in _filter:
                es_terms.append(
                    pyes.TermFilter('session', _filter.pop('session')))
            if 'chamber' in _filter:
                es_terms.append(
                    pyes.TermFilter('chamber', _filter.pop('chamber')))
            if 'subjects' in _filter:
                es_terms.append(
                    pyes.TermFilter('subjects',
                                    _filter.pop('subjects')['$all']))
            if 'sponsors.leg_id' in _filter:
                es_terms.append(
                    pyes.TermFilter('sponsors',
                                    _filter.pop('sponsors.leg_id')))

            # add terms
            if es_terms:
                es_search.filter = pyes.ANDFilter(es_terms)

            # page size is a guess, could use tweaks
            es_result = elasticsearch.search(es_search,
                                             search_type='scan',
                                             scroll='3m',
                                             size=250)
            doc_ids = [r.get_id() for r in es_result]
            _filter['versions.doc_id'] = {'$in': doc_ids}
        elif query:
            _filter['title'] = {'$regex': query, '$options': 'i'}

        # return query
        return db.bills.find(_filter, bill_fields)
Beispiel #26
0
def import_events(abbr, data_dir, import_actions=False):
    """Load every scraped event file for ``abbr`` and import it.

    Each ``<data_dir>/<abbr>/events/*.json`` file is parsed, passed through
    ``prepare_obj``, and then annotated in place:

    * every entry of ``data['participants']`` receives an ``'id'`` key,
      resolved through ``get_committee_id`` / ``get_legislator_id`` depending
      on its ``participant_type`` (``None`` for any other type);
    * every entry of ``data['related_bills']`` receives an ``'id'`` key
      holding the ``_id`` of the matching bill document, or ``None`` when no
      bill matches.

    The annotated document is handed to ``import_event``; ``ensure_indexes``
    runs once after all files have been processed.

    ``import_actions`` is accepted but is not used by this function.
    """
    events_glob = os.path.join(data_dir, abbr, 'events', '*.json')

    for event_path in glob.iglob(events_glob):
        with open(event_path) as handle:
            data = prepare_obj(json.load(handle))

        # Dispatch table keyed by participant_type.  The lambdas read
        # ``data`` lazily, so e.g. data['session'] is only touched when a
        # legislator actually has to be resolved.
        lookup_by_kind = {
            "committee": lambda who: get_committee_id(
                data[settings.LEVEL_FIELD],
                who['chamber'],
                who['participant']),
            "legislator": lambda who: get_legislator_id(
                abbr,
                data['session'],
                # anything other than upper/lower is passed on as "no chamber"
                who['chamber'] if who['chamber'] in ('upper', 'lower')
                else None,
                who['participant']),
        }

        for participant in data['participants']:
            kind = participant['participant_type']
            lookup = lookup_by_kind.get(kind)
            if lookup is None:
                logger.warning("I don't know how to resolve a %s" % kind)
                resolved_id = None
            else:
                resolved_id = lookup(participant)
            participant['id'] = resolved_id

        for bill in data['related_bills']:
            bill_id = fix_bill_id(bill['bill_id'])

            # Match on either the primary or an alternate bill id within the
            # same jurisdiction and session.
            scope = {settings.LEVEL_FIELD: abbr, 'session': data['session']}
            found = db.bills.find_one({
                "$or": [
                    dict(scope, bill_id=bill_id),
                    dict(scope, alternate_bill_ids=bill_id),
                ]
            })

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            if found:
                bill['id'] = found['_id']
            else:
                logger.warning("Error: Can't find %s" % bill_id)
                bill['id'] = None

        import_event(data)

    ensure_indexes()
Beispiel #27
0
    def search(query=None, abbr=None, chamber=None, subjects=None,
               bill_id=None, search_window=None, updated_since=None,
               last_action_since=None, sponsor_id=None, status=None,
               type_=None, session=None, bill_fields=None,
               sort=None, limit=None):
        """Translate search parameters into a ``BillSearchResults`` object.

        Depending on the query and ``settings.ENABLE_ELASTICSEARCH`` the
        search is expressed either as an Elasticsearch request body or as a
        MongoDB filter document; exactly one of the two is passed to
        ``BillSearchResults`` (the other argument is ``None``).

        Parameters:
            query: free-text query.  A query containing any digit is treated
                as a bill id lookup against Mongo (a regex match when the
                query is entirely numeric, an exact match otherwise) and
                never goes to Elasticsearch.  Any other query is a full-text
                search on "text"/"title" when Elasticsearch is enabled, or a
                case-insensitive regex on the title otherwise.
            abbr: jurisdiction abbreviation to restrict to.
            chamber, type_, session: simple equality filters.
            bill_id: bill id, or a list of ids (turned into ``$in`` for
                Mongo).  When given, it replaces any bill id filter derived
                from ``query``.
            subjects: iterable of subjects that must all be present.
            search_window: "session", "term", "all", "session:<name>" or
                "term:<name>".
            updated_since, last_action_since: lower bounds; for Mongo they
                are parsed with ``parse_param_dt``.
            sponsor_id: legislator id of a sponsor.
            status: list of ``action_dates`` keys that must be set.
            bill_fields: passed through to ``BillSearchResults`` unchanged.
            sort: "first", "last", "signed", "passed_lower", "passed_upper"
                (prefixed with "action_dates."), "updated_at" or
                "created_at"; anything else sorts on "action_dates.last".
            limit: accepted for interface compatibility; not used here.

        Raises:
            PermissionDenied: the query contains "<a href".
            ValueError: invalid ``search_window``, or a date parameter that
                ``parse_param_dt`` rejects.
        """
        use_elasticsearch = False
        numeric_query = False
        mongo_filter = {}
        es_terms = []

        if status is None:
            status = []

        if query:
            use_elasticsearch = settings.ENABLE_ELASTICSEARCH

            # reject queries that contain an HTML link (spam)
            if '<a href' in query:
                raise PermissionDenied('html detected')

            # if query is numeric convert to an id filter
            #   (TODO: maybe this should be an $or)
            if re.search(r'\d', query):
                # if query is entirely numeric make it a regex and hit mongo
                if not re.search(r'\D', query):
                    mongo_filter['bill_id'] = {'$regex':
                                               fix_bill_id(query).upper()}
                else:
                    mongo_filter['bill_id'] = fix_bill_id(query).upper()
                use_elasticsearch = False
                numeric_query = True

        # handle abbr
        if abbr and use_elasticsearch:
            es_terms.append({'term': {'jurisdiction': abbr}})
        elif abbr:
            mongo_filter[settings.LEVEL_FIELD] = abbr

        # sponsor_id
        if sponsor_id and use_elasticsearch:
            es_terms.append({'term': {'sponsor_ids': sponsor_id}})
        elif sponsor_id:
            mongo_filter['sponsors.leg_id'] = sponsor_id

        # handle simple term arguments (chamber, bill_id, type, session)
        if isinstance(bill_id, list) and not use_elasticsearch:
            bill_id = {'$in': bill_id}
        simple_args = {'chamber': chamber, 'bill_id': bill_id, 'type': type_,
                       'session': session}
        if search_window:
            if search_window == 'session':
                simple_args['_current_session'] = True
            elif search_window == 'term':
                simple_args['_current_term'] = True
            elif search_window.startswith('session:'):
                # an explicit window overrides the ``session`` argument
                simple_args['session'] = search_window.split('session:')[1]
            elif search_window.startswith('term:'):
                simple_args['_term'] = search_window.split('term:')[1]
            elif search_window != 'all':
                raise ValueError('invalid search_window. valid choices are '
                                 ' "term", "session", "all"')
        # .items() (rather than .iteritems()) works on Python 2 and 3 alike
        for key, value in simple_args.items():
            if value is not None:
                if use_elasticsearch:
                    es_terms.append({'term': {key: value}})
                else:
                    mongo_filter[key] = value

        if subjects and use_elasticsearch:
            for subject in subjects:
                es_terms.append({'term': {'subjects': subject}})
        elif subjects:
            # Build a real list: on Python 3 ``filter()`` is a lazy iterator,
            # which cannot be sent as the operand of ``$all``.
            mongo_filter['subjects'] = {
                '$all': [subject for subject in subjects if subject]}

        if updated_since and use_elasticsearch:
            es_terms.append({'range': {'updated_at': {'gte': updated_since}}})
        elif updated_since:
            try:
                mongo_filter['updated_at'] = {'$gte':
                                              parse_param_dt(updated_since)}
            except ValueError:
                raise ValueError('invalid updated_since parameter. '
                                 'please supply date in YYYY-MM-DD format')

        if last_action_since and use_elasticsearch:
            es_terms.append({'range': {'action_dates.last':
                                       {'gte': last_action_since}}})
        elif last_action_since:
            try:
                mongo_filter['action_dates.last'] = {
                    '$gte': parse_param_dt(last_action_since)}
            except ValueError:
                raise ValueError('invalid last_action_since parameter. '
                                 'please supply date in YYYY-MM-DD format')

        # Status comes in as a list and needs to become:
        # {'action_dates.signed': {'$ne': None}}
        status_spec = []
        for _status in status:
            status_spec.append({'action_dates.%s' % _status: {'$ne': None}})

        # one status -> the bare clause; several -> a single $and clause
        if len(status_spec) == 1:
            status_spec = status_spec[0]
        elif len(status_spec) > 1:
            status_spec = {'$and': status_spec}

        if status_spec and use_elasticsearch:
            # NOTE(review): this uses the bare status name as the field,
            # while the Mongo branch uses 'action_dates.<status>' -- confirm
            # against the Elasticsearch mapping.
            for key in status:
                es_terms.append({'exists': {'field': key}})
        elif status_spec:
            mongo_filter.update(**status_spec)

        # preprocess sort
        if sort in ('first', 'last', 'signed', 'passed_lower', 'passed_upper'):
            sort = 'action_dates.' + sort
        elif sort not in ('updated_at', 'created_at'):
            sort = 'action_dates.last'

        # do the actual ES query
        if query and use_elasticsearch:
            search = {'query': {"query_string": {"fields": ["text", "title"],
                                                 "default_operator": "AND",
                                                 "query": query}}}
            if es_terms:
                # wrap the text query and the term filters in a "filtered"
                # query
                search['filter'] = {'and': es_terms}
                search = {'query': {'filtered': search}}
            search['fields'] = []
            return BillSearchResults(search, None, sort, bill_fields)

        elif query and not numeric_query:
            mongo_filter['title'] = {'$regex': query, '$options': 'i'}

        return BillSearchResults(None, mongo_filter, sort, bill_fields)