def get_markers(db, client_name, clip_id, congress, chamber):
    """Fetch the ordered markers for a video and extract references from each.

    Queries the archive API for all markers on `clip_id` (sorted by offset),
    then runs each marker's caption text through the rtc_utils extractors to
    pull out legislator names/IDs, bill IDs, and roll call IDs.

    Returns a 5-tuple (clips, bill_ids, legislators, legislator_ids, roll_ids)
    where each list is deduplicated across all markers; returns
    (None, None, None, None, None) when the video has no markers.
    """
    api_url = API_PREFIX + client_name + '?type=marker&size=100000'
    data = '{"filter": { "term": { "video_id": %s}}, "sort": [{"offset":{"order":"asc"}}]}' % clip_id
    markers = query_api(db, api_url, data)

    clips = []
    bill_ids = []
    legislators = []
    legislator_ids = []
    roll_ids = []

    if not markers:
        db.warning('There are no markers for video id: %s' % clip_id)
        return (None, None, None, None, None)

    for i, m in enumerate(markers):
        m_new = m['_source']
        c = {
            'offset': m_new['offset'],
            'events': [htmlentitydecode(m_new['name']).strip(),],
            'time': m_new['datetime']
        }

        # A marker's duration runs until the next marker starts; the last
        # marker gets no duration. Using the enumerate index instead of
        # markers.index(m) avoids an O(n^2) rescan per iteration and is
        # correct even if two markers happen to compare equal.
        if i < len(markers) - 1:
            c['duration'] = markers[i + 1]['_source']['offset'] - m_new['offset']

        year = dateparse(m_new['datetime']).year
        legis, bio_ids = rtc_utils.extract_legislators(c['events'][0], chamber, db)
        b = rtc_utils.extract_bills(c['events'][0], congress)
        r = rtc_utils.extract_rolls(c['events'][0], chamber, year)

        if legis:
            c['legislator_names'] = legis
            for l in legis:
                if l not in legislators:
                    legislators.append(l)

        if bio_ids:
            c['legislator_ids'] = bio_ids
            for bi in bio_ids:
                if bi not in legislator_ids:
                    legislator_ids.append(bi)

        if r:
            c['roll_ids'] = r
            for ro in r:
                if ro not in roll_ids:
                    roll_ids.append(ro)

        if b:
            c['bill_ids'] = b
            for bill in b:
                if bill not in bill_ids:
                    bill_ids.append(bill)

        clips.append(c)

    return (clips, bill_ids, legislators, legislator_ids, roll_ids)
def run(db, es, options = {}):
    """Scrape the Senate committee hearings XML feed and upsert each meeting
    into the 'hearings' Mongo collection.

    NOTE(review): mutable default `options` — harmless here since it is never
    read or mutated, but worth confirming the task runner's calling convention.
    """
    try:
        page = urllib2.urlopen("http://www.senate.gov/general/committee_schedules/hearings.xml")
    except:
        # Best-effort scrape: a feed outage is noted, not fatal.
        db.note("Couldn't load Senate hearings feed, can't proceed")
    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()
        count = 0
        for meeting in meetings:
            # Placeholder entries ("No ... scheduled") carry no hearing data.
            if re.search("^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            # Committee codes look like "SSAP01": leading letters identify the
            # committee, trailing digits the subcommittee ("00" = full committee).
            full_id = meeting.cmte_code.contents[0].strip()
            match = re.search("^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None
            if (subcommittee_id == "00") or (subcommittee_id == None):
                subcommittee_id = None
            else:
                # Store the full code (committee + digits) as the subcommittee id.
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            else:
                if committee_id == "JCC":
                    chamber = "joint"
                else:
                    db.warning("Couldn't locate committee by committee_id %s" % committee_id, {'committee_id': committee_id})
                    continue

            committee_url = meeting.committee['url']  # NOTE(review): unused below

            # Feed times appear to be US Eastern, e.g. "25-Feb-2011 10:00 AM" —
            # rtc_utils.EST() is attached as the tzinfo.
            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=rtc_utils.EST())
            congress = rtc_utils.current_congress(occurs_at.year)

            document = None  # NOTE(review): extracted but unused below
            if meeting.document:
                document = meeting.document.contents[0].strip()

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')
            # content is double-escaped, e.g. 
"
            description = parser.unescape(parser.unescape(description))

            bill_ids = rtc_utils.extract_bills(description, congress)

            # Match an existing hearing by time OR identical description so
            # re-runs update records instead of duplicating them.
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [{ 'occurs_at': occurs_at },{ 'description': description }]
            })

            hearing = None
            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = { 'chamber': chamber, 'committee_id': committee_id }
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id

            hearing['updated_at'] = datetime.datetime.now()
            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids
            })
            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)
            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)
def run(db, es, options={}): try: page = urllib2.urlopen( "http://www.senate.gov/general/committee_schedules/hearings.xml") except: db.note("Couldn't load Senate hearings feed, can't proceed") else: soup = BeautifulStoneSoup(page) meetings = soup.findAll('meeting') parser = HTMLParser.HTMLParser() count = 0 for meeting in meetings: if re.search("^No.*?scheduled\.?$", meeting.matter.contents[0]): continue full_id = meeting.cmte_code.contents[0].strip() committee_id, subcommittee_id = re.search("^([A-Z]+)(\d+)$", full_id).groups() if subcommittee_id == "00": subcommittee_id = None else: subcommittee_id = full_id committee = committee_for(db, committee_id) # Don't warn if it's a bill-specific conference committee if committee: chamber = committee['chamber'] else: if committee_id != "JCC": chamber = "joint" else: db.warning( "Couldn't locate committee by committee_id %s" % committee_id, {'committee_id': committee_id}) continue committee_url = meeting.committee['url'] date_string = meeting.date.contents[0].strip() occurs_at = datetime.datetime(*time.strptime( date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=rtc_utils.EST()) congress = rtc_utils.current_congress(occurs_at.year) document = None if meeting.document: document = meeting.document.contents[0].strip() room = meeting.room.contents[0].strip() description = meeting.matter.contents[0].strip().replace('\n', '') # content is double-escaped, e.g. 
" description = parser.unescape(parser.unescape(description)) bill_ids = rtc_utils.extract_bills(description, congress) documents = db['hearings'].find({ 'chamber': chamber, 'committee_id': committee_id, "$or": [{ 'occurs_at': occurs_at }, { 'description': description }] }) hearing = None if documents.count() > 0: hearing = documents[0] else: hearing = {'chamber': chamber, 'committee_id': committee_id} hearing['created_at'] = datetime.datetime.now() if subcommittee_id: hearing['subcommittee_id'] = subcommittee_id hearing['updated_at'] = datetime.datetime.now() hearing.update({ 'congress': congress, 'occurs_at': occurs_at, 'room': room, 'description': description, 'dc': True, 'bill_ids': bill_ids }) if committee: hearing['committee'] = committee db['hearings'].save(hearing) count += 1 db.success("Updated or created %s Senate committee hearings" % count)
def run(db, es, options={}): try: page = urllib2.urlopen("http://www.senate.gov/general/committee_schedules/hearings.xml") except: db.note("Couldn't load Senate hearings feed, can't proceed") else: soup = BeautifulStoneSoup(page) meetings = soup.findAll("meeting") parser = HTMLParser.HTMLParser() count = 0 for meeting in meetings: if re.search("^No.*?scheduled\.?$", meeting.matter.contents[0]): continue full_id = meeting.cmte_code.contents[0].strip() committee_id, subcommittee_id = re.search("^([A-Z]+)(\d+)$", full_id).groups() if subcommittee_id == "00": subcommittee_id = None else: subcommittee_id = full_id committee = committee_for(db, committee_id) # Don't warn if it's a bill-specific conference committee if committee: chamber = committee["chamber"] else: if committee_id != "JCC": chamber = "joint" else: db.warning( "Couldn't locate committee by committee_id %s" % committee_id, {"committee_id": committee_id} ) continue committee_url = meeting.committee["url"] date_string = meeting.date.contents[0].strip() occurs_at = datetime.datetime(*time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=rtc_utils.EST()) congress = rtc_utils.current_congress(occurs_at.year) document = None if meeting.document: document = meeting.document.contents[0].strip() room = meeting.room.contents[0].strip() description = meeting.matter.contents[0].strip().replace("\n", "") # content is double-escaped, e.g. 
" description = parser.unescape(parser.unescape(description)) bill_ids = rtc_utils.extract_bills(description, congress) documents = db["hearings"].find( { "chamber": chamber, "committee_id": committee_id, "$or": [{"occurs_at": occurs_at}, {"description": description}], } ) hearing = None if documents.count() > 0: hearing = documents[0] else: hearing = {"chamber": chamber, "committee_id": committee_id} hearing["created_at"] = datetime.datetime.now() if subcommittee_id: hearing["subcommittee_id"] = subcommittee_id hearing["updated_at"] = datetime.datetime.now() hearing.update( { "congress": congress, "occurs_at": occurs_at, "room": room, "description": description, "dc": True, "bill_ids": bill_ids, } ) if committee: hearing["committee"] = committee db["hearings"].save(hearing) count += 1 db.success("Updated or created %s Senate committee hearings" % count)
def get_clips_for_senate(db, clip_id, congress, duration, year):
    """Split a Senate floor video into fixed 5-minute clips and extract
    legislator/bill/roll references from each clip's captions.

    `duration` is the full video length in seconds. Returns a 5-tuple
    (clips, bill_ids, legislators, legislator_ids, roll_ids) with the
    reference lists deduplicated across all clips.
    """
    #go with 5 minute clips?
    chamber = "senate"
    clip_segment = 5 * 60
    # Integer division (Python 2): number of whole segments plus a remainder clip.
    clip_number = (duration / clip_segment) + 1

    clips = []
    bill_ids = []
    legislators = []
    legislator_ids = []
    roll_ids = []

    caps = get_captions('floor.senate.gov', clip_id)

    offset = 0
    for clip_num in range(1, clip_number + 1):
        start = offset
        # BUG FIX: range() tops out at clip_number, so the old test
        # (clip_num == clip_number + 1) never fired and the final clip's
        # duration was never truncated to the real remaining video length.
        if clip_num == clip_number:  # last clip
            dur = duration - offset
        else:
            dur = clip_segment
        c = {
            'offset': start,
            'duration': dur
        }

        events = ''
        captions = get_senate_clip_captions(caps, start, start + clip_segment)
        legis, bio_ids = rtc_utils.extract_legislators(captions, chamber, db)
        b = rtc_utils.extract_bills(captions, congress)
        r = rtc_utils.extract_rolls(captions, chamber, year)

        if legis:
            c['legislator_names'] = legis
            events += 'Legislators mentioned in this clip: '
            for l in legis:
                if l not in legislators:
                    legislators.append(l)
                events += l
                # BUG FIX: the separator test compared against the cumulative
                # legislators list, so once earlier clips had appended names,
                # the last name of THIS clip could get a stray '; ' (or a
                # middle name could lose one). Compare within this clip's list.
                if l != legis[-1]:
                    events += '; '

        if bio_ids:
            c['legislator_ids'] = bio_ids
            for bi in bio_ids:
                if bi not in legislator_ids:
                    legislator_ids.append(bi)

        if r:
            c['roll_ids'] = r
            for ro in r:
                if ro not in roll_ids:
                    roll_ids.append(ro)

        if b:
            c['bill_ids'] = b
            events += 'Bills mentioned in this clip: '
            for bill in b:
                if bill not in bill_ids:
                    bill_ids.append(bill)
                bill_name = db['bills'].find_one({'bill_id': bill})
                # Prefer the short title; fall back to e.g. "HR1234".
                if bill_name and bill_name.has_key('short_title') and bill_name['short_title'] and bill_name['short_title'] != '':
                    events += bill_name['short_title'] + '; '
                elif bill_name:
                    events += ("%s%s" % (bill_name['bill_type'], bill_name['number'])).upper() + '; '

        if events == '':
            events = "No description for clip number %s" % clip_num

        c['events'] = [events,]
        clips.append(c)
        offset = offset + clip_segment

    return (clips, bill_ids, legislators, legislator_ids, roll_ids)