コード例 #1
0
ファイル: videos.py プロジェクト: eres805/congress
def get_markers(db, client_name, clip_id, congress, chamber):
    """Fetch the markers of one video and convert them into clip records.

    The marker index is queried for every marker whose ``video_id`` is
    ``clip_id``, sorted by ascending offset.  Each marker becomes a clip dict
    with ``offset``, ``events`` (a one-element list holding the decoded,
    stripped marker name) and ``time``.  Every clip except the last also gets
    a ``duration``: the distance to the next marker's offset.  The marker
    name is scanned for legislators, bills and roll calls; whatever is found
    is stored on the clip and merged into the video-wide lists.

    Args:
        db: Handle passed to ``query_api`` and ``extract_legislators``; also
            used to report a missing marker list.
        client_name: Appended to ``API_PREFIX`` to build the query URL.
        clip_id: Video id substituted into the query filter.
        congress: Passed through to ``python_utils.extract_bills``.
        chamber: Passed through to the legislator and roll extractors.

    Returns:
        ``(clips, bill_ids, legislators, legislator_ids, roll_ids)``.  The
        last four are de-duplicated lists in first-seen order.  When the
        query yields no markers, a warning is recorded and a tuple of five
        ``None`` values is returned instead.
    """
    api_url = API_PREFIX + client_name + '?type=marker&size=100000'
    data = '{"filter": { "term": { "video_id": %s}}, "sort": [{"offset":{"order":"asc"}}]}' % clip_id
    markers = query_api(db, api_url, data)

    if not markers:
        db.warning('There are no markers for video id: %s' % clip_id)
        return (None, None, None, None, None)

    clips = []
    bill_ids = []
    legislators = []
    legislator_ids = []
    roll_ids = []

    def add_unique(seen, values):
        # Append only unseen values so the aggregate keeps first-seen order.
        for value in values:
            if value not in seen:
                seen.append(value)

    last_index = len(markers) - 1
    for index, marker in enumerate(markers):
        source = marker['_source']
        clip = {
            'offset': source['offset'],
            'events': [htmlentitydecode(source['name']).strip()],
            'time': source['datetime'],
        }

        # Locate the successor by position.  Comparing markers by value
        # (and looking them up with list.index) picks the wrong neighbour
        # whenever two markers are equal, and rescans the list every time.
        if index != last_index:
            next_offset = markers[index + 1]['_source']['offset']
            clip['duration'] = next_offset - source['offset']

        year = dateparse(source['datetime']).year
        text = clip['events'][0]

        legis, bio_ids = python_utils.extract_legislators(text, chamber, db)
        bills = python_utils.extract_bills(text, congress)
        rolls = python_utils.extract_rolls(text, chamber, year)

        if legis:
            clip['legislator_names'] = legis
            add_unique(legislators, legis)
        if bio_ids:
            clip['legislator_ids'] = bio_ids
            add_unique(legislator_ids, bio_ids)
        if rolls:
            clip['roll_ids'] = rolls
            add_unique(roll_ids, rolls)
        if bills:
            clip['bill_ids'] = bills
            add_unique(bill_ids, bills)

        clips.append(clip)

    return (clips, bill_ids, legislators, legislator_ids, roll_ids)
コード例 #2
0
def run(db, es, options={}):
    """Load the Senate committee hearing feed and upsert each hearing.

    The hearings XML feed is downloaded and parsed; every ``meeting`` element
    that is not a "No ... scheduled" placeholder is turned into a hearing
    document and saved to ``db['hearings']``.  An existing document is reused
    when one matches the same chamber and committee and either the same
    start time or the same description; otherwise a new document is created
    with an MD5 ``hearing_id`` derived from the start time, committee id and
    subcommittee id.

    Args:
        db: Project database handle; used for the ``hearings`` collection,
            committee lookup and for reporting notes/warnings/success.
        es: Not used by this function.
        options: Not used by this function (the mutable default is kept for
            interface compatibility and is never mutated here).

    Returns:
        None.  If the feed cannot be loaded, a note is recorded and nothing
        else happens.
    """
    try:
        page = urllib2.urlopen(
            "http://www.senate.gov/general/committee_schedules/hearings.xml")
    except Exception:
        # Still best-effort for any load failure, but unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        db.note("Couldn't load Senate hearings feed, can't proceed")
        return

    soup = BeautifulStoneSoup(page)
    parser = HTMLParser.HTMLParser()
    count = 0

    for meeting in soup.findAll('meeting'):
        # Placeholder entries carry no real hearing.
        if re.search(r"^No.*?scheduled\.?$", meeting.matter.contents[0]):
            continue

        full_id = meeting.cmte_code.contents[0].strip()

        # Codes look like LETTERS + DIGITS; the digits name the subcommittee.
        match = re.search(r"^([A-Z]+)(\d+)$", full_id)
        if match:
            committee_id, sub_number = match.groups()
        else:
            committee_id, sub_number = full_id, None

        # A "00" suffix (or no suffix) is treated as the full committee;
        # otherwise the subcommittee is identified by the full code.
        if sub_number is None or sub_number == "00":
            subcommittee_id = None
        else:
            subcommittee_id = full_id

        committee = committee_for(db, committee_id)

        # Don't warn if it's a bill-specific conference committee
        if committee:
            chamber = committee['chamber']
        elif committee_id == "JCC":
            chamber = "joint"
        else:
            db.warning(
                "Couldn't locate committee by committee_id %s" % committee_id,
                {'committee_id': committee_id})
            continue

        date_string = meeting.date.contents[0].strip()
        occurs_at = datetime.datetime(
            *time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6],
            tzinfo=python_utils.EST())
        congress = python_utils.current_congress(occurs_at.year)

        room = meeting.room.contents[0].strip()
        description = meeting.matter.contents[0].strip().replace('\n', '')

        # content is double-escaped (e.g. &amp;quot;), so unescape twice
        description = parser.unescape(parser.unescape(description))

        bill_ids = python_utils.extract_bills(description, congress)

        # Build the id from the re-formatted timestamp rather than the raw
        # feed text, so the ids are more reproducible.
        sub = '00' if subcommittee_id is None else str(subcommittee_id)
        id_string = (occurs_at.strftime("%d-%b-%Y %I:%M %p") +
                     str(committee_id) + sub).encode("utf-8")
        hearing_id = hashlib.md5(id_string).hexdigest()

        documents = db['hearings'].find({
            'chamber': chamber,
            'committee_id': committee_id,
            "$or": [
                {'occurs_at': occurs_at},
                {'description': description},
            ],
        })

        now = datetime.datetime.now()
        if documents.count() > 0:
            hearing = documents[0]
        else:
            hearing = {
                'chamber': chamber,
                'committee_id': committee_id,
                'hearing_id': hearing_id,
                'created_at': now,
            }

        if subcommittee_id:
            hearing['subcommittee_id'] = subcommittee_id
        hearing['updated_at'] = now

        hearing.update({
            'congress': congress,
            'occurs_at': occurs_at,
            'room': room,
            'description': description,
            'dc': True,
            'bill_ids': bill_ids,
        })

        if committee:
            hearing['committee'] = committee

        db['hearings'].save(hearing)
        count += 1

    db.success("Updated or created %s Senate committee hearings" % count)
コード例 #3
0
def get_markers(db, client_name, clip_id, congress, chamber):
    """Fetch the markers of one video and convert them into clip records.

    The marker index is queried for every marker whose ``video_id`` is
    ``clip_id``, sorted by ascending offset.  Each marker becomes a clip dict
    with ``offset``, ``events`` (a one-element list holding the decoded,
    stripped marker name) and ``time``.  Every clip except the last also gets
    a ``duration``: the distance to the next marker's offset.  The marker
    name is scanned for legislators, bills and roll calls; whatever is found
    is stored on the clip and merged into the video-wide lists.

    Args:
        db: Handle passed to ``query_api`` and ``extract_legislators``; also
            used to report a missing marker list.
        client_name: Appended to ``API_PREFIX`` to build the query URL.
        clip_id: Video id substituted into the query filter.
        congress: Passed through to ``python_utils.extract_bills``.
        chamber: Passed through to the legislator and roll extractors.

    Returns:
        ``(clips, bill_ids, legislators, legislator_ids, roll_ids)``.  The
        last four are de-duplicated lists in first-seen order.  When the
        query yields no markers, a note is recorded and a tuple of five
        ``None`` values is returned instead.
    """
    api_url = API_PREFIX + client_name + '?type=marker&size=100000'
    data = '{"filter": { "term": { "video_id": %s}}, "sort": [{"offset":{"order":"asc"}}]}' % clip_id
    markers = query_api(db, api_url, data)

    if not markers:
        db.note('There are no markers for video id: %s' % clip_id)
        return (None, None, None, None, None)

    clips = []
    bill_ids = []
    legislators = []
    legislator_ids = []
    roll_ids = []

    def add_unique(seen, values):
        # Append only unseen values so the aggregate keeps first-seen order.
        for value in values:
            if value not in seen:
                seen.append(value)

    last_index = len(markers) - 1
    for index, marker in enumerate(markers):
        source = marker['_source']
        clip = {
            'offset': source['offset'],
            'events': [htmlentitydecode(source['name']).strip()],
            'time': source['datetime'],
        }

        # Locate the successor by position.  Comparing markers by value
        # (and looking them up with list.index) picks the wrong neighbour
        # whenever two markers are equal, and rescans the list every time.
        if index != last_index:
            next_offset = markers[index + 1]['_source']['offset']
            clip['duration'] = next_offset - source['offset']

        year = dateparse(source['datetime']).year
        text = clip['events'][0]

        legis, bio_ids = python_utils.extract_legislators(text, chamber, db)
        bills = python_utils.extract_bills(text, congress)
        rolls = python_utils.extract_rolls(text, chamber, year)

        if legis:
            clip['legislator_names'] = legis
            add_unique(legislators, legis)
        if bio_ids:
            clip['legislator_ids'] = bio_ids
            add_unique(legislator_ids, bio_ids)
        if rolls:
            clip['roll_ids'] = rolls
            add_unique(roll_ids, rolls)
        if bills:
            clip['bill_ids'] = bills
            add_unique(bill_ids, bills)

        clips.append(clip)

    return (clips, bill_ids, legislators, legislator_ids, roll_ids)
コード例 #4
0
def get_clips_for_senate(db, clip_id, congress, duration, year):
    """Cut a Senate floor video into five-minute clips described by captions.

    The captions for ``clip_id`` are fetched once, then the video is walked
    in 300-second steps.  For each step the captions in that window are
    scanned for legislators, bills and roll calls; the findings are stored
    on the clip and merged into the video-wide lists.  Each clip also gets a
    one-element ``events`` list holding a human-readable summary of the
    legislators and bills mentioned (or a "No description" placeholder).

    Args:
        db: Project database handle; used for legislator extraction and to
            look up bills in ``db['bills']``.
        clip_id: Video id passed to ``get_captions``.
        congress: Passed through to ``python_utils.extract_bills``.
        duration: Total video length, in the same unit as the 300-unit
            clip segment (presumably seconds — confirm against the caller).
        year: Passed through to ``python_utils.extract_rolls``.

    Returns:
        ``(clips, bill_ids, legislators, legislator_ids, roll_ids)``, the
        last four de-duplicated in first-seen order.  If the captions could
        not be fetched, a message is printed and a tuple of five ``None``
        values is returned.
    """
    #go with 5 minute clips?
    chamber = "senate"
    clip_segment = 5 * 60
    # Floor division keeps the clip count an integer on every Python version.
    clip_number = int(duration // clip_segment) + 1

    caps = get_captions('floor.senate.gov', clip_id)
    if caps is None:
        # Call form with a single argument prints identically on 2 and 3.
        print("Server error while fetching captions, skipping.")
        return None, None, None, None, None

    clips = []
    bill_ids = []
    legislators = []
    legislator_ids = []
    roll_ids = []

    def add_unique(seen, values):
        # Append only unseen values so the aggregate keeps first-seen order.
        for value in values:
            if value not in seen:
                seen.append(value)

    for clip_num in range(1, clip_number + 1):
        start = (clip_num - 1) * clip_segment

        # The final clip only covers what is left of the video.  (The check
        # must be against clip_number: the loop never reaches
        # clip_number + 1.)  When duration is an exact multiple of the
        # segment length, this trailing clip has zero length.
        if clip_num == clip_number:
            dur = duration - start
        else:
            dur = clip_segment

        clip = {'offset': start, 'duration': dur}
        captions = get_senate_clip_captions(caps, start, start + clip_segment)

        legis, bio_ids = python_utils.extract_legislators(
            captions, chamber, db)
        bills = python_utils.extract_bills(captions, congress)
        rolls = python_utils.extract_rolls(captions, chamber, year)

        sections = []

        if legis:
            clip['legislator_names'] = legis
            add_unique(legislators, legis)
            # Join this clip's own names; testing against the tail of the
            # video-wide list dropped the separators between names.
            sections.append(
                'Legislators mentioned in this clip: ' + '; '.join(legis))

        if bio_ids:
            clip['legislator_ids'] = bio_ids
            add_unique(legislator_ids, bio_ids)

        if rolls:
            clip['roll_ids'] = rolls
            add_unique(roll_ids, rolls)

        if bills:
            clip['bill_ids'] = bills
            add_unique(bill_ids, bills)

            bill_text = 'Bills mentioned in this clip: '
            for bill in bills:
                bill_doc = db['bills'].find_one({'bill_id': bill})
                if not bill_doc:
                    continue
                # Prefer the short title; fall back to e.g. "HR1234".
                if bill_doc.get('short_title'):
                    label = bill_doc['short_title']
                else:
                    label = ("%s%s" % (bill_doc['bill_type'],
                                       bill_doc['number'])).upper()
                bill_text += label + '; '
            sections.append(bill_text)

        # Keep the two sections from running into each other.
        events = '. '.join(sections)
        if events == '':
            events = "No description for clip number %s" % clip_num

        clip['events'] = [events]
        clips.append(clip)

    return (clips, bill_ids, legislators, legislator_ids, roll_ids)
コード例 #5
0
ファイル: videos.py プロジェクト: eres805/congress
def get_clips_for_senate(db, clip_id, congress, duration, year):
    """Cut a Senate floor video into five-minute clips described by captions.

    The captions for ``clip_id`` are fetched once, then the video is walked
    in 300-second steps.  For each step the captions in that window are
    scanned for legislators, bills and roll calls; the findings are stored
    on the clip and merged into the video-wide lists.  Each clip also gets a
    one-element ``events`` list holding a human-readable summary of the
    legislators and bills mentioned (or a "No description" placeholder).

    Args:
        db: Project database handle; used for legislator extraction and to
            look up bills in ``db['bills']``.
        clip_id: Video id passed to ``get_captions``.
        congress: Passed through to ``python_utils.extract_bills``.
        duration: Total video length, in the same unit as the 300-unit
            clip segment (presumably seconds — confirm against the caller).
        year: Passed through to ``python_utils.extract_rolls``.

    Returns:
        ``(clips, bill_ids, legislators, legislator_ids, roll_ids)``, the
        last four de-duplicated in first-seen order.  If the captions could
        not be fetched, a message is printed and a tuple of five ``None``
        values is returned.
    """
    #go with 5 minute clips?
    chamber = "senate"
    clip_segment = 5 * 60
    # Floor division keeps the clip count an integer on every Python version.
    clip_number = int(duration // clip_segment) + 1

    caps = get_captions('floor.senate.gov', clip_id)
    if caps is None:
        # Call form with a single argument prints identically on 2 and 3.
        print("Server error while fetching captions, skipping.")
        return None, None, None, None, None

    clips = []
    bill_ids = []
    legislators = []
    legislator_ids = []
    roll_ids = []

    def add_unique(seen, values):
        # Append only unseen values so the aggregate keeps first-seen order.
        for value in values:
            if value not in seen:
                seen.append(value)

    for clip_num in range(1, clip_number + 1):
        start = (clip_num - 1) * clip_segment

        # The final clip only covers what is left of the video.  (The check
        # must be against clip_number: the loop never reaches
        # clip_number + 1.)  When duration is an exact multiple of the
        # segment length, this trailing clip has zero length.
        if clip_num == clip_number:
            dur = duration - start
        else:
            dur = clip_segment

        clip = {
            'offset': start,
            'duration': dur
        }
        captions = get_senate_clip_captions(caps, start, start + clip_segment)

        legis, bio_ids = python_utils.extract_legislators(captions, chamber, db)
        bills = python_utils.extract_bills(captions, congress)
        rolls = python_utils.extract_rolls(captions, chamber, year)

        sections = []

        if legis:
            clip['legislator_names'] = legis
            add_unique(legislators, legis)
            # Join this clip's own names; testing against the tail of the
            # video-wide list dropped the separators between names.
            sections.append(
                'Legislators mentioned in this clip: ' + '; '.join(legis))

        if bio_ids:
            clip['legislator_ids'] = bio_ids
            add_unique(legislator_ids, bio_ids)

        if rolls:
            clip['roll_ids'] = rolls
            add_unique(roll_ids, rolls)

        if bills:
            clip['bill_ids'] = bills
            add_unique(bill_ids, bills)

            bill_text = 'Bills mentioned in this clip: '
            for bill in bills:
                bill_doc = db['bills'].find_one({'bill_id': bill})
                if not bill_doc:
                    continue
                # Prefer the short title; fall back to e.g. "HR1234".
                if bill_doc.get('short_title'):
                    label = bill_doc['short_title']
                else:
                    label = ("%s%s" % (bill_doc['bill_type'], bill_doc['number'])).upper()
                bill_text += label + '; '
            sections.append(bill_text)

        # Keep the two sections from running into each other.
        events = '. '.join(sections)
        if events == '':
            events = "No description for clip number %s" % clip_num

        clip['events'] = [events,]
        clips.append(clip)

    return (clips, bill_ids, legislators, legislator_ids, roll_ids)
コード例 #6
0
def run(db, es, options = {}):
    """Load the Senate committee hearing feed and upsert each hearing.

    The hearings XML feed is downloaded and parsed; every ``meeting`` element
    that is not a "No ... scheduled" placeholder is turned into a hearing
    document and saved to ``db['hearings']``.  An existing document is reused
    when one matches the same chamber and committee and either the same
    start time or the same description; otherwise a new document is created
    with an MD5 ``hearing_id`` derived from the start time, committee id and
    subcommittee id.

    Args:
        db: Project database handle; used for the ``hearings`` collection,
            committee lookup and for reporting notes/warnings/success.
        es: Not used by this function.
        options: Not used by this function (the mutable default is kept for
            interface compatibility and is never mutated here).

    Returns:
        None.  If the feed cannot be loaded, a note is recorded and nothing
        else happens.
    """
    try:
        page = urllib2.urlopen("http://www.senate.gov/general/committee_schedules/hearings.xml")
    except Exception:
        # Still best-effort for any load failure, but unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        db.note("Couldn't load Senate hearings feed, can't proceed")
        return

    soup = BeautifulStoneSoup(page)
    parser = HTMLParser.HTMLParser()
    count = 0

    for meeting in soup.findAll('meeting'):
        # Placeholder entries carry no real hearing.
        if re.search(r"^No.*?scheduled\.?$", meeting.matter.contents[0]):
            continue

        full_id = meeting.cmte_code.contents[0].strip()

        # Codes look like LETTERS + DIGITS; the digits name the subcommittee.
        match = re.search(r"^([A-Z]+)(\d+)$", full_id)
        if match:
            committee_id, sub_number = match.groups()
        else:
            committee_id, sub_number = full_id, None

        # A "00" suffix (or no suffix) is treated as the full committee;
        # otherwise the subcommittee is identified by the full code.
        if sub_number is None or sub_number == "00":
            subcommittee_id = None
        else:
            subcommittee_id = full_id

        committee = committee_for(db, committee_id)

        # Don't warn if it's a bill-specific conference committee
        if committee:
            chamber = committee['chamber']
        elif committee_id == "JCC":
            chamber = "joint"
        else:
            db.warning("Couldn't locate committee by committee_id %s" % committee_id, {'committee_id': committee_id})
            continue

        date_string = meeting.date.contents[0].strip()
        occurs_at = datetime.datetime(*time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=python_utils.EST())
        congress = python_utils.current_congress(occurs_at.year)

        room = meeting.room.contents[0].strip()
        description = meeting.matter.contents[0].strip().replace('\n', '')

        # content is double-escaped (e.g. &amp;quot;), so unescape twice
        description = parser.unescape(parser.unescape(description))

        bill_ids = python_utils.extract_bills(description, congress)

        # Build the id from the re-formatted timestamp rather than the raw
        # feed text, so the ids are more reproducible.
        sub = '00' if subcommittee_id is None else str(subcommittee_id)
        id_string = (occurs_at.strftime("%d-%b-%Y %I:%M %p") + str(committee_id) + sub).encode("utf-8")
        hearing_id = hashlib.md5(id_string).hexdigest()

        documents = db['hearings'].find({
            'chamber': chamber,
            'committee_id': committee_id,

            "$or": [
                {'occurs_at': occurs_at},
                {'description': description},
            ]
        })

        now = datetime.datetime.now()
        if documents.count() > 0:
            hearing = documents[0]
        else:
            hearing = {
                'chamber': chamber,
                'committee_id': committee_id,
                'hearing_id': hearing_id,
                'created_at': now
            }

        if subcommittee_id:
            hearing['subcommittee_id'] = subcommittee_id
        hearing['updated_at'] = now

        hearing.update({
            'congress': congress,
            'occurs_at': occurs_at,
            'room': room,

            'description': description,
            'dc': True,

            'bill_ids': bill_ids
        })

        if committee:
            hearing['committee'] = committee

        db['hearings'].save(hearing)
        count += 1

    db.success("Updated or created %s Senate committee hearings" % count)