Beispiel #1
0
    def get_events(self):
        """Yield an Event for every meeting row on the Boston results page.

        Each matching table row must hold exactly three cells: the meeting
        name, its date in ``%m/%d/%Y`` form, and a cell of anchors.  Every
        anchor becomes a link on the event; anchors are collected in a dict
        keyed by their text, so anchors sharing the same text collapse to one.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"

        page = self.lxmlize(url)
        for entry in page.xpath(
                "//tr[@style='font-family: Verdana; font-size: 12px;']"):
            name_cell, date_cell, links_cell = entry.xpath(".//td")
            name = name_cell.text.strip().replace(u"\xc2\xa0", "")
            when = date_cell.text.strip().replace(u"\xc2\xa0", "")
            when = dt.datetime.strptime(when, "%m/%d/%Y")
            links = {a.text: a.attrib['href']
                     for a in links_cell.xpath(".//a")}
            e = Event(name=name,
                      session=self.session,
                      when=when,
                      location='unknown')

            e.add_source(url)
            # The loop variable must not be called ``url``: rebinding it would
            # make every later row record a document link as its source
            # instead of the page that was scraped.
            for note, href in links.items():
                e.add_link(note=note, url=href)

            yield e
Beispiel #2
0
def test_basic_event():
    """Check that a minimal event validates before and after adding links."""
    now = dt.datetime.utcnow()
    event = Event(name="get-together", when=now, location="Joe's Place")

    # A source is attached before the first validation.
    event.add_source(url='foobar')
    event.validate()

    # The same URL is added twice, once bare and once with a note.
    link_url = "http://foobar.baz"
    event.add_link(link_url)
    event.add_link(link_url, note="foo")
    event.validate()

    assert len(event.links) == 2
Beispiel #3
0
    def get_events(self):
        """Yield an Event for every scheduled row of the Chicago calendar.

        Rows are read from the first ``rgMasterTable`` table, skipping its
        first row.  Each data row must contain exactly ten cells.  Rows whose
        time cell contains "Deferred" are skipped; rows with an empty time
        cell are parsed from the date alone.  Anchors found in the agenda and
        summary cells are attached to the event as links.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://chicago.legistar.com/Calendar.aspx/"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]
        for row in rows:
            if "No records were found." in row.text_content():
                self.warning("Hum. They don't seem to have events?")
                continue

            # _ nom's the image next to the date on the page.  The details,
            # notice and video cells are unpacked only to keep the ten-cell
            # shape check; their contents are not used.
            (name, date, _, time, where, _details, _notice,
             agenda, summary, _video) = row.xpath(".//td")

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()

            if "Deferred" in time:
                continue

            day = date.text.strip()
            if time == "":
                # No time of day given: parse the date alone.
                when = dt.datetime.strptime(day, "%m/%d/%Y")
            else:
                when = dt.datetime.strptime("%s %s" % (day, time),
                                            "%m/%d/%Y %I:%M %p")

            event = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)
            event.add_source(url)

            # Agenda links first, then summary links, as before.
            for cell in (agenda, summary):
                for anchor in cell.xpath(".//a[@href]"):
                    event.add_link(anchor.text, anchor.attrib['href'])

            yield event
Beispiel #4
0
    def migrate_events(self, state):
        """Convert event records from ``billy_db`` into Events and save them.

        Args:
            state: when truthy, only records whose ``state`` field equals this
                value are read; otherwise every record in
                ``billy_db.events`` is read.

        Records that end up with no sources are skipped without being saved.
        """
        spec = {}
        if state:
            spec['state'] = state

        # Loop-invariant lookup tables, built once instead of per record.
        # Record keys that must not be copied into ``e.extras``.
        blacklist = frozenset([
            "description", "when", "location", "session",
            "updated_at", "created_at", "end", "sources",
            "documents", "related_bills", "state", "+link",
            "link", "level", "participants", "country",
            "_all_ids", "type",
        ])
        # Known misspellings of status values and their replacements.
        typos = {
            "canceled": "cancelled"
        }
        # Maps the record's participant_type onto the type passed to
        # add_participant.
        participant_types = {
            "committee": "organization",
            "legislator": "person",
            "person": "person",
        }

        for entry in self.billy_db.events.find(spec, timeout=False):

            e = Event(
                name=entry['description'],
                when=entry['when'],
                location=entry['location'],
                session=entry['session'],
                updated_at=entry['updated_at'],
                created_at=entry['created_at'],
                type=entry['type'],
            )
            e.identifiers = [{'scheme': 'openstates',
                             'identifier': entry['_id']}]
            e._openstates_id = entry['_id']
            if entry.get('+location_url'):
                e.add_location_url(entry['+location_url'])

            link = entry.get('link', entry.get("+link"))
            if link:
                e.add_link(link, 'link')

            status = entry.get('status')
            e.status = typos.get(status, status)

            # Carry over every remaining non-empty, non-private field.
            for key, value in entry.items():
                if key in blacklist or not value or key.startswith("_"):
                    continue
                e.extras[key] = value

            if entry.get('end'):
                end = entry['end']
                try:
                    end = dt.datetime.fromtimestamp(end)
                except TypeError:
                    # Not a numeric timestamp; keep the stored value as-is.
                    pass

                e.end = end

            for source in entry['sources']:
                e.add_source(url=source['url'])

            if e.sources == []:
                continue  # XXX: print warning

            for document in entry.get('documents', []):
                # Use the declared mimetype ("mimetype", then "+mimetype");
                # when neither key is present, fall back to a generic
                # application/octet-stream.
                e.add_document(name=document.get('name'),
                               document_id=document.get('doc_id'),
                               url=document['url'],
                               mimetype=document.get(
                                   "mimetype", document.get(
                                       "+mimetype",
                                       "application/octet-stream")))

            # The agenda item is created lazily, on the first related bill.
            agenda = None
            for bill in entry.get('related_bills', []):
                if agenda is None:
                    agenda = e.add_agenda_item(
                        description="Bills up for Consideration"
                    )

                hcid = _hot_cache.get(bill.get('id', None), None)
                bid = bill['bill_id']
                if bid is None:
                    continue

                agenda.add_bill(bill=bid, id=hcid)

            for who in entry.get('participants', []):
                participant_type = who.get('participant_type', 'committee')
                # I've gone through the backlog of OpenStates data, they are
                # all committees of some sort.

                who_chamber = who.get('chamber')
                if who_chamber is None:
                    # Fall back to the alternate chamber keys, first truthy
                    # value wins.
                    for chamber in ["_chamber", "+chamber"]:
                        f = who.get(chamber)
                        if f:
                            who_chamber = f
                            break

                if who_chamber is None:
                    # Freak of nature ...
                    continue

                hcid = _hot_cache.get(who.get('id', None), None)

                e.add_participant(
                    name=who['participant'],
                    type=participant_types[participant_type],
                    id=hcid,
                    note=who['type'],
                    chamber=who_chamber)

            self.save_object(e)
Beispiel #5
0
    def get_events(self):
        """Yield an Event for each row of the Arlington meeting tables.

        Both the ``#archive`` and ``#upcoming`` tables on
        ``ARLINGTON_MEETING_PAGE`` are read.  For every row the title and date
        (a Unix timestamp in a ``span``) become the event; an agenda link, if
        present, is attached and its page is fetched to collect
        ``metaviewer.php`` documents.  Archive rows are additionally scanned
        for video, audio and minutes links.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    # Still best-effort, but no longer swallows
                    # KeyboardInterrupt / SystemExit like a bare except.
                    continue

                meeting_title = cell_mapping['title'].text
                meeting_date = datetime.datetime.fromtimestamp(
                    int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date,
                          session=self.session, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_links = cell_mapping['agenda'].cssselect('a')
                if agenda_links:
                    meeting_agenda_url = agenda_links[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

                # Media and minutes are only looked up for the archive table.
                # This is a guarded section rather than a ``continue`` so that
                # upcoming meetings still reach the ``yield`` below.
                if meeting_type != 'upcoming':
                    # detect video
                    # TODO: extract actual mp4 files
                    video_cell = cell_mapping['video'].cssselect('a')
                    if video_cell:
                        # Capture the URL without the closing quote that
                        # terminates it inside the onclick handler.
                        video_url_match = re.search(
                            r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                        if video_url_match is not None:
                            e.add_media_link(name="Video", url=video_url_match.group(1), mimetype='text/html')

                    # detect audio
                    audio_cell = cell_mapping['audio'].cssselect('a')
                    if audio_cell:
                        e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                    # detect minutes
                    minutes_cell = cell_mapping['minutes'].cssselect('a')
                    if minutes_cell:
                        e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e