Example 1
0
def run(db, es, options=None):
    """Scrape the Senate committee hearings XML feed and upsert each
    hearing into the 'hearings' collection.

    Args:
        db: task database wrapper (note/warning/success plus collection
            access via db['hearings']).
        es: elasticsearch connection (unused here; kept for the common
            task interface).
        options: optional dict of task options (currently unused).
            Default changed from a mutable ``{}`` to ``None`` to avoid
            the shared-mutable-default pitfall; behavior is unchanged.
    """
    if options is None:
        options = {}

    try:
        page = urllib2.urlopen(
            "http://www.senate.gov/general/committee_schedules/hearings.xml")
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate. Feed failures are best-effort: note and stop.
        db.note("Couldn't load Senate hearings feed, can't proceed")

    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()

        count = 0

        for meeting in meetings:
            # Skip placeholder entries like "No hearings scheduled."
            if re.search(r"^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            full_id = meeting.cmte_code.contents[0].strip()

            # Codes look like "SSFR01": letters identify the committee,
            # trailing digits the subcommittee ("00" = full committee).
            match = re.search(r"^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            if (subcommittee_id == "00") or (subcommittee_id is None):
                subcommittee_id = None
            else:
                # Subcommittees keep the full code as their id.
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            else:
                if committee_id == "JCC":
                    chamber = "joint"
                else:
                    db.warning(
                        "Couldn't locate committee by committee_id %s" %
                        committee_id, {'committee_id': committee_id})
                    continue

            # NOTE(review): parsed but never used; kept so a missing 'url'
            # attribute still fails loudly as before.
            committee_url = meeting.committee['url']

            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(
                date_string, "%d-%b-%Y %I:%M %p")[0:6],
                                          tzinfo=python_utils.EST())
            congress = python_utils.current_congress(occurs_at.year)

            # NOTE(review): parsed but never stored on the hearing record.
            document = None
            if meeting.document:
                document = meeting.document.contents[0].strip()

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')

            # content is double-escaped, e.g. "
            description = parser.unescape(parser.unescape(description))

            bill_ids = python_utils.extract_bills(description, congress)

            # Reproducible id: normalized date + committee + subcommittee
            # ("00" stands in for the full committee).
            sub = '00' if subcommittee_id is None else str(subcommittee_id)
            date_string = occurs_at.strftime("%d-%b-%Y %I:%M %p")
            id_string = (date_string + str(committee_id) + sub).encode("utf-8")
            hearing_id = hashlib.md5(id_string).hexdigest()

            # Match an existing hearing on chamber+committee and either the
            # exact time or the exact description, so reschedules update
            # the record instead of duplicating it.
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [
                    {'occurs_at': occurs_at},
                    {'description': description},
                ]
            })

            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {
                    'chamber': chamber,
                    'committee_id': committee_id,
                    'hearing_id': hearing_id,
                }
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id
            hearing['updated_at'] = datetime.datetime.now()

            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids,
            })

            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)

            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)
Example 2
0
def get_videos(db, es, client_name, chamber, archive=False, captions=False):
    api_url = API_PREFIX + client_name + '?type=video'
    data = '{ "sort": [ {"datetime": {"order": "desc" }} ]  }'
    if archive:
        api_url += '&size=100000'
    else:
        api_url += '&size=2'
    videos = query_api(db, api_url, data)

    if not videos:
        db.warning("Granicus API appears to be down", {'errors': PARSING_ERRORS})
        sys.exit()

    vcount = 0
    for vid in videos:

        v = vid['_source']

        legislative_day = dateparse(v['datetime'])

        video_id = chamber + '-' + str(int(timey.mktime(legislative_day.timetuple())))
        new_vid = db.get_or_initialize('videos', {'video_id': video_id})

        #initialize arrays and dicts so we don't have to worry about it later
        if not new_vid.has_key('clip_urls'): new_vid['clip_urls'] = {}
        if not new_vid.has_key('bill_ids'): new_vid['bill_ids'] = []
        if not new_vid.has_key('legislator_ids'): new_vid['legislator_ids'] = []
        if not new_vid.has_key('legislator_names'): new_vid['legislator_names'] = []

        if not new_vid.has_key('created_at'): new_vid['created_at'] = datetime.now()
        new_vid['updated_at'] = datetime.now()
        #video id, clips array, legislators array, bills array

        new_vid = try_key(v, 'id', 'clip_id', new_vid)
        new_vid = try_key(v, 'duration', 'duration', new_vid)
        new_vid = try_key(v, 'datetime', 'published_at', new_vid)

        # normalize timestamp format to RFC3339 in UTC
        new_vid['published_at'] = rfc3339(dateparse(new_vid['published_at']))


        new_vid['clip_urls'] = try_key(v, 'http', 'mp4', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'hls', 'hls', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'rtmp', 'rtmp', new_vid['clip_urls'])

        new_vid['legislative_day'] = legislative_day.strftime('%Y-%m-%d')
        new_vid['chamber'] = chamber
        new_vid['congress'] =  python_utils.current_congress(legislative_day.year)

        if chamber == 'house':
            new_vid['clips'], new_vid['bill_ids'], new_vid['legislator_names'], new_vid['legislator_ids'], new_vid['roll_ids'] = get_markers(db, client_name, new_vid['clip_id'], new_vid['congress'], chamber)
        elif chamber == 'senate':
            new_vid['clips'], new_vid['bill_ids'], new_vid['legislator_names'], new_vid['legislator_ids'], new_vid['roll_ids'] = get_clips_for_senate(db, new_vid['clip_id'], new_vid['congress'], new_vid['duration'], dateparse(new_vid['published_at']).year)

        if new_vid['clips'] is None:
            print "Couldn't fetch information for video, skipping."
            continue

        #make sure the last clip has a duration
        if new_vid['clips'] and len(new_vid['clips']) > 0:
            new_vid['clips'][-1]['duration'] = new_vid['duration'] - new_vid['clips'][-1]['offset']

        if captions:
            new_vid['captions'], new_vid['caption_srt_file'] = get_captions(client_name, new_vid['clip_id'])

        db['videos'].save(new_vid)
        vcount += 1

        #index clip objects in elastic search

        if captions and new_vid.has_key('clips') and new_vid['clips'] is not None and len(new_vid['clips']) > 0:
            for c in new_vid['clips']:
                clip = {
                        'id': "%s-%s" % (new_vid['video_id'], new_vid['clips'].index(c)),
                        'video_id': new_vid['video_id'],
                        'video_clip_id': new_vid['clip_id'],
                        'offset': c['offset'],
                        'duration': c['duration'],
                        'legislative_day': new_vid['legislative_day'],
                        'published_at': new_vid['published_at'],
                        'clip_urls': new_vid['clip_urls']
                }
                clip = try_key(c, 'legislator_names', 'legislator_names', clip)
                clip = try_key(c, 'roll_ids', 'roll_ids', clip)
                clip = try_key(c, 'events', 'events', clip)
                clip = try_key(c, 'bill_ids', 'bill_ids', clip)
                clip = try_key(c, 'legislator_ids', 'legislator_ids', clip)

                if new_vid.has_key('caption_srt_file'):
                    clip['srt_link'] = new_vid['caption_srt_file'],

                if new_vid.has_key('captions'):
                    clip['captions'] = get_clip_captions(new_vid, c, c == new_vid['clips'][0] ) #pass a boolean if this is the first clip

                resp = es.save(clip, 'clips', clip['id'])
        print "Successfully processed %s" % new_vid['clip_id']

    es.connection.indices.refresh()

    db.success("Updated or created %s legislative days for %s video" % (client_name, vcount))
Example 3
0
def get_videos(db, es, client_name, chamber, archive=False, captions=False):
    api_url = API_PREFIX + client_name + '?type=video'
    data = '{ "sort": [ {"datetime": {"order": "desc" }} ]  }'
    if archive:
        api_url += '&size=100000'
    else:
        api_url += '&size=2'
    videos = query_api(db, api_url, data)

    if not videos:
        db.warning("Granicus API appears to be down",
                   {'errors': PARSING_ERRORS})
        sys.exit()

    vcount = 0
    for vid in videos:

        v = vid['_source']

        legislative_day = dateparse(v['datetime'])

        video_id = chamber + '-' + str(
            int(timey.mktime(legislative_day.timetuple())))
        new_vid = db.get_or_initialize('videos', {'video_id': video_id})

        #initialize arrays and dicts so we don't have to worry about it later
        if not new_vid.has_key('clip_urls'): new_vid['clip_urls'] = {}
        if not new_vid.has_key('bill_ids'): new_vid['bill_ids'] = []
        if not new_vid.has_key('legislator_ids'):
            new_vid['legislator_ids'] = []
        if not new_vid.has_key('legislator_names'):
            new_vid['legislator_names'] = []

        if not new_vid.has_key('created_at'):
            new_vid['created_at'] = datetime.now()
        new_vid['updated_at'] = datetime.now()
        #video id, clips array, legislators array, bills array

        new_vid = try_key(v, 'id', 'clip_id', new_vid)
        new_vid = try_key(v, 'duration', 'duration', new_vid)
        new_vid = try_key(v, 'datetime', 'published_at', new_vid)

        # normalize timestamp format to RFC3339 in UTC
        new_vid['published_at'] = rfc3339(dateparse(new_vid['published_at']))

        new_vid['clip_urls'] = try_key(v, 'http', 'mp4', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'hls', 'hls', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'rtmp', 'rtmp', new_vid['clip_urls'])

        new_vid['legislative_day'] = legislative_day.strftime('%Y-%m-%d')
        new_vid['chamber'] = chamber
        new_vid['congress'] = python_utils.current_congress(
            legislative_day.year)

        if chamber == 'house':
            new_vid['clips'], new_vid['bill_ids'], new_vid[
                'legislator_names'], new_vid['legislator_ids'], new_vid[
                    'roll_ids'] = get_markers(db, client_name,
                                              new_vid['clip_id'],
                                              new_vid['congress'], chamber)
        elif chamber == 'senate':
            new_vid['clips'], new_vid['bill_ids'], new_vid[
                'legislator_names'], new_vid['legislator_ids'], new_vid[
                    'roll_ids'] = get_clips_for_senate(
                        db, new_vid['clip_id'], new_vid['congress'],
                        new_vid['duration'],
                        dateparse(new_vid['published_at']).year)

        if new_vid['clips'] is None:
            print "Couldn't fetch information for video, skipping."
            continue

        #make sure the last clip has a duration
        if new_vid['clips'] and len(new_vid['clips']) > 0:
            new_vid['clips'][-1]['duration'] = new_vid['duration'] - new_vid[
                'clips'][-1]['offset']

        if captions:
            new_vid['captions'], new_vid['caption_srt_file'] = get_captions(
                client_name, new_vid['clip_id'])

        db['videos'].save(new_vid)
        vcount += 1

        #index clip objects in elastic search

        if captions and new_vid.has_key(
                'clips') and new_vid['clips'] is not None and len(
                    new_vid['clips']) > 0:
            for c in new_vid['clips']:
                clip = {
                    'id':
                    "%s-%s" % (new_vid['video_id'], new_vid['clips'].index(c)),
                    'video_id':
                    new_vid['video_id'],
                    'video_clip_id':
                    new_vid['clip_id'],
                    'offset':
                    c['offset'],
                    'duration':
                    c['duration'],
                    'legislative_day':
                    new_vid['legislative_day'],
                    'published_at':
                    new_vid['published_at'],
                    'clip_urls':
                    new_vid['clip_urls']
                }
                clip = try_key(c, 'legislator_names', 'legislator_names', clip)
                clip = try_key(c, 'roll_ids', 'roll_ids', clip)
                clip = try_key(c, 'events', 'events', clip)
                clip = try_key(c, 'bill_ids', 'bill_ids', clip)
                clip = try_key(c, 'legislator_ids', 'legislator_ids', clip)

                if new_vid.has_key('caption_srt_file'):
                    clip['srt_link'] = new_vid['caption_srt_file'],

                if new_vid.has_key('captions'):
                    clip['captions'] = get_clip_captions(
                        new_vid, c, c == new_vid['clips']
                        [0])  #pass a boolean if this is the first clip

                resp = es.save(clip, 'clips', clip['id'])
        print "Successfully processed %s" % new_vid['clip_id']

    es.connection.indices.refresh()

    db.success("Updated or created %s legislative days for %s video" %
               (client_name, vcount))
Example 4
0
def run(db, es, options=None):
    """Scrape the Senate committee hearings XML feed and upsert each
    hearing into the 'hearings' collection.

    Args:
        db: task database wrapper (note/warning/success plus collection
            access via db['hearings']).
        es: elasticsearch connection (unused here; kept for the common
            task interface).
        options: optional dict of task options (currently unused).
            Default changed from a mutable ``{}`` to ``None`` to avoid
            the shared-mutable-default pitfall; behavior is unchanged.
    """
    if options is None:
        options = {}

    try:
        page = urllib2.urlopen(
            "http://www.senate.gov/general/committee_schedules/hearings.xml")
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate. Feed failures are best-effort: note and stop.
        db.note("Couldn't load Senate hearings feed, can't proceed")

    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()

        count = 0

        for meeting in meetings:
            # Skip placeholder entries like "No hearings scheduled."
            if re.search(r"^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            full_id = meeting.cmte_code.contents[0].strip()

            # Codes look like "SSFR01": letters identify the committee,
            # trailing digits the subcommittee ("00" = full committee).
            match = re.search(r"^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            if (subcommittee_id == "00") or (subcommittee_id is None):
                subcommittee_id = None
            else:
                # Subcommittees keep the full code as their id.
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            else:
                if committee_id == "JCC":
                    chamber = "joint"
                else:
                    db.warning(
                        "Couldn't locate committee by committee_id %s" %
                        committee_id, {'committee_id': committee_id})
                    continue

            # NOTE(review): parsed but never used; kept so a missing 'url'
            # attribute still fails loudly as before.
            committee_url = meeting.committee['url']

            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(
                *time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6],
                tzinfo=python_utils.EST())
            congress = python_utils.current_congress(occurs_at.year)

            # NOTE(review): parsed but never stored on the hearing record.
            document = None
            if meeting.document:
                document = meeting.document.contents[0].strip()

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')

            # content is double-escaped, e.g. "
            description = parser.unescape(parser.unescape(description))

            bill_ids = python_utils.extract_bills(description, congress)

            # Reproducible id: normalized date + committee + subcommittee
            # ("00" stands in for the full committee).
            sub = '00' if subcommittee_id is None else str(subcommittee_id)
            date_string = occurs_at.strftime("%d-%b-%Y %I:%M %p")
            id_string = (date_string + str(committee_id) + sub).encode("utf-8")
            hearing_id = hashlib.md5(id_string).hexdigest()

            # Match an existing hearing on chamber+committee and either the
            # exact time or the exact description, so reschedules update
            # the record instead of duplicating it.
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [
                    {'occurs_at': occurs_at},
                    {'description': description},
                ]
            })

            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {
                    'chamber': chamber,
                    'committee_id': committee_id,
                    'hearing_id': hearing_id,
                }
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id
            hearing['updated_at'] = datetime.datetime.now()

            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids,
            })

            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)

            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)