Esempio n. 1
0
    def get_events(self):
        """Yield Event objects scraped from the council calendar list.

        The list interleaves date headers (<a><h2> inside an <li>) with
        plain <li> entries describing the meetings held on the most
        recently seen date.
        """
        page = self.lxmlize(PAGE)
        events = page.xpath("//div[@class='col-middle']//ul/li")
        # Most recently seen date header; carried across iterations so
        # meeting entries can be attached to it.
        when = None
        for event in events:
            h3 = event.xpath("./a/h2")
            h3 = h3[0] if h3 else None
            if h3 is not None:
                # Date-header entry: remember the date text and move on.
                when = h3.text
            else:
                if when is None:
                    # Meeting entry before any date header; can't place it.
                    self.warning("Ungrok!")
                    continue

                # Entry body is <p><b>title</b> ... <i>details</i></p>.
                b, _, i = event.xpath("./p/*")
                title = b.text_content()
                # NOTE: ``event`` is rebound here from the <li> element to
                # the italic details text.
                event = i.text_content()

                if "NO MEETING" in event:
                    continue

                # Title is "<m/d/Y> - <title>" (the %m/%d/%Y parse below
                # relies on the date coming first).
                day, title = (x.strip() for x in title.split("-", 1))

                where = "Council Chambers"  # default unless " in <place>" given

                # Multiple meetings on one day are separated by ";".
                for subevent in (x.strip() for x in event.split(";")):
                    if " in " in subevent:
                        subevent, where = subevent.rsplit(" in ", 1)
                    subevent = subevent.replace(u'\xa0', ' ')  # nbsp -> space

                    if "NO" in subevent and "MEETING" in subevent:
                        continue

                    # "... to follow" entries carry no absolute time.
                    if "to follow" in subevent:
                        continue

                    info = EVENT_RE.match(subevent).groupdict()
                    # ``event`` is rebound again, to the extracted name.
                    event, time = [info[x] for x in ['event', 'time']]

                    # Normalize "a.m."/"p.m." so strptime's %p matches.
                    ampm = {
                        "a.m.": "AM",
                        "p.m.": "PM",
                    }

                    for old, new in ampm.items():
                        time = time.replace(old, new)

                    dtstring = ", ".join([day, time])

                    # Times appear either as "7:00 PM" or bare "7PM".
                    try:
                        etime = dt.datetime.strptime(dtstring,
                                                     "%m/%d/%Y, %I:%M %p")
                    except ValueError:
                        etime = dt.datetime.strptime(dtstring,
                                                     "%m/%d/%Y, %I%p")

                    e = Event(name=event, when=etime, location=where)
                    e.add_source(PAGE)
                    yield e
Esempio n. 2
0
def event_obj():
    """Build, source, and validate a minimal Event fixture."""
    fixture = Event(
        name="get-together",
        when=dt.datetime.utcnow(),
        location="Joe's Place",
    )
    fixture.add_source(url='foobar')
    fixture.validate()
    return fixture
Esempio n. 3
0
    def get_events(self):
        """Yield Event objects from the NYC Legistar calendar table."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://legistar.council.nyc.gov/Calendar.aspx"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]  # skip the header row
        for row in rows:
            els = row.xpath(".//td")
            if len(els) <= 2:
                continue  # Odd one-off row without the full cell set.

            (name, date, _, time, where, topic, details, agenda, minutes,
             media) = els
            # _ nom's the image of the cal next to the meeting date.

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()
            topic = topic.text_content().strip()

            # Deferred meetings have no usable time; skip them.
            if "Deferred" in time:
                continue

            # Empty time cell -> all-day event (midnight timestamp).
            # NOTE(review): all_day is computed but never attached to the
            # event.
            all_day = False
            if time == "":
                all_day = True
                when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
            else:
                when = dt.datetime.strptime(
                    "%s %s" % (date.text.strip(), time), "%m/%d/%Y %I:%M %p")

            event = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)
            event.add_source(url)

            # Attach linked documents: detail pages as HTML, agendas and
            # minutes as PDFs.
            details = details.xpath(".//a[@href]")
            for detail in details:
                event.add_document(detail.text,
                                   detail.attrib['href'],
                                   mimetype='text/html')

            agendas = agenda.xpath(".//a[@href]")
            for a in agendas:
                event.add_document(a.text,
                                   a.attrib['href'],
                                   mimetype='application/pdf')

            minutes = minutes.xpath(".//a[@href]")
            for minute in minutes:
                event.add_document(minute.text,
                                   minute.attrib['href'],
                                   mimetype='application/pdf')

            yield event
Esempio n. 4
0
    def scrape_event_page(self, event):
        """Scrape a single event detail page and yield an Event.

        ``event`` is an anchor element whose href points at the detail
        page.  Untitled, canceled, and recurring entries yield nothing.
        """
        url = event.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath("//h2[@class='evlist_header']")
        title = title[0].text.strip() if title else None
        if title is None:
            return
        if "CANCELED" in title:
            return

        # The info pane is a series of <div> blocks, each holding a
        # <label>/<div> pair such as
        # "When:" / "April 1, 2014\n@ 7:00 PM\n- 9:00 PM".
        info = page.xpath(
            "//div[@style='position:relative;margin-right:40px;']")[0]
        blocks = info.xpath(".//div")
        ret = {}
        for block in blocks:
            els = block.xpath("./*")
            if not els:
                continue
            le = els[0]

            if le.tag != 'label':
                continue

            label, div = els

            ltex = label.text_content().strip()
            dtex = div.text_content().strip()
            ret[ltex] = dtex

        # "When:" splits into date, "@ start", "- end".
        date, start, end = (x.strip() for x in ret['When:'].split("\n"))
        start = re.sub(r"^@", "", start).strip()
        end = end.replace("-", "").strip()

        # Expand month abbreviations so strptime's %B matches.
        replace = [
            ('Apr', 'April'),
        ]

        # Recurring events can't be pinned to one date; skip them.
        skip = ["Occurs every"]

        for k, v in replace:
            date = date.replace(k, v).strip()

        if any(x in end for x in skip):
            return

        start = "%s %s" % (date, start)
        end = "%s %s" % (date, end)
        start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                      for x in (start, end))

        event = Event(session=self.session,
                      name=title,
                      location=ret['Where:'],
                      when=start,
                      end=end)
        event.add_source(url)
        yield event
    def scrape_event_page(self, event):
        """Scrape a single event detail page and yield an Event.

        ``event`` is an anchor element whose href points at the detail
        page.  Untitled, canceled, and recurring entries yield nothing.
        """
        url = event.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath("//h2[@class='evlist_header']")
        title = title[0].text.strip() if title else None
        if title is None:
            return
        if "CANCELED" in title:
            return

        # The info pane is a list of <div>s, each a <label>/<div> pair,
        # e.g. "When:" / "April 1, 2014\n@ 7:00 PM\n- 9:00 PM".
        info = page.xpath("//div[@style='position:relative;margin-right:40px;']")[0]
        blocks = info.xpath(".//div")
        ret = {}
        for block in blocks:
            els = block.xpath("./*")
            if not els:
                continue
            le = els[0]

            if le.tag != 'label':
                continue

            label, div = els

            ltex = label.text_content().strip()
            dtex = div.text_content().strip()
            ret[ltex] = dtex

        # "When:" splits into date, "@ start", "- end".
        date, start, end = (x.strip() for x in ret['When:'].split("\n"))
        start = re.sub(r"^@", "", start).strip()
        end = end.replace("-", "").strip()

        # Expand month abbreviations so strptime's %B matches.
        replace = [
            ('Apr', 'April'),
        ]

        # Recurring events can't be pinned to one date; skip them.
        skip = ["Occurs every"]

        for k, v in replace:
            date = date.replace(k, v).strip()

        if any(x in end for x in skip):
            return

        start = "%s %s" % (date, start)
        end = "%s %s" % (date, end)
        start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p") for x in (start, end))

        event = Event(
            session=self.session,
            name=title,
            location=ret['Where:'],
            when=start,
            end=end)
        event.add_source(url)
        yield event
Esempio n. 6
0
    def get_events(self):
        """Yield Event objects from the Section1 calendar page.

        Header tags (<h*>) set the current date; following <p> paragraphs
        that link to an Archive.aspx agenda describe meetings on that date.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        curdate = None  # most recently seen date header
        page = self.lxmlize(CAL_PAGE)
        for el in page.xpath("//div[@id='Section1']/*"):
            if el.tag[0] == 'h':  # any header level: h1, h2, ...
                when = WHEN.findall(el.text_content())
                when = when[0] if when else None
                if when is None:
                    continue
                curdate = " ".join(when)

            if (el.tag == 'p'): # and el.attrib.get('class') == 'MsoNormal'):
                # Only paragraphs linking to an agenda archive are meetings.
                agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
                agenda = agenda[0] if agenda else None
                if agenda is None:
                    continue

                info = self.cleanup(el.text_content())
                when = DT.findall(info)
                when = when[0] if when else None
                if when is None:
                    continue

                time, ampm = when

                if curdate is None:
                    self.warning("Can't scrape, since I don't know what date it is")
                    continue

                tbuf = " ".join([curdate, time, ampm])
                obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

                # The location follows an en dash at the end of the line.
                try:
                    _, where = info.rsplit(u"–", 1)
                except ValueError:
                    continue

                where = where.replace(u" ", " ")
                where = re.sub(r"\s+", " ", where).strip()
                where = re.sub(r"agenda$", "", where).strip()

                event = Event(name=info,
                              session=self.session,
                              when=obj,
                              location=where)
                event.add_source(CAL_PAGE)
                yield event
    def get_events(self):
        """Yield committee events in a rolling window (10 days back to
        20 days ahead of now)."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        start = dt.datetime.utcnow()
        start = start - dt.timedelta(days=10)
        end = start + dt.timedelta(days=30)

        url = URL.format(**{"from": start.strftime("%Y/%m/%d"),
                            "til": end.strftime("%Y/%m/%d")})

        page = self.lxmlize(url)
        events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

        for event in events:
            # The <span> onclick carries the popover id needed to fetch
            # the event details.
            po = CLICK_INFO.match(event.xpath(".//span")[0].attrib['onclick'])
            if po is None:
                continue

            poid = po.groupdict()['info_id']  # This is used to get more deetz on

            popage = self.popOverUrl(poid)
            when = dt.datetime.strptime(popage.xpath("//strong")[0].text,
                                        "%B %d, %Y @ %I:%M %p")
            who = popage.xpath("//h1")[0].text
            related = []

            # Collect ordinance references mentioned anywhere in the popover.
            for item in popage.xpath("//div"):
                t = item.text
                if t is None:
                    continue

                t = t.strip()
                for related_entity in ORD_INFO.findall(t):
                    related.append({
                        "ord_no": related_entity,
                        "what": t
                    })

            e = Event(name=who,
                      session=self.session,
                      when=when,
                      location='unknown')
            e.add_source(url)

            for o in related:
                i = e.add_agenda_item(o['what'])
                i.add_bill(o['ord_no'], note='consideration')

            yield e
Esempio n. 8
0
    def get_events(self):
        """Yield committee events in a rolling window (10 days back to
        20 days ahead of now)."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        start = dt.datetime.utcnow()
        start = start - dt.timedelta(days=10)
        end = start + dt.timedelta(days=30)

        url = URL.format(**{
            "from": start.strftime("%Y/%m/%d"),
            "til": end.strftime("%Y/%m/%d")
        })

        page = self.lxmlize(url)
        events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

        for event in events:
            # The <span> onclick carries the popover id needed to fetch
            # the event details.
            po = CLICK_INFO.match(event.xpath(".//span")[0].attrib['onclick'])
            if po is None:
                continue

            poid = po.groupdict()[
                'info_id']  # This is used to get more deetz on

            popage = self.popOverUrl(poid)
            when = dt.datetime.strptime(
                popage.xpath("//strong")[0].text, "%B %d, %Y @ %I:%M %p")
            who = popage.xpath("//h1")[0].text
            related = []

            # Collect ordinance references mentioned anywhere in the popover.
            for item in popage.xpath("//div"):
                t = item.text
                if t is None:
                    continue

                t = t.strip()
                for related_entity in ORD_INFO.findall(t):
                    related.append({"ord_no": related_entity, "what": t})

            e = Event(name=who,
                      session=self.session,
                      when=when,
                      location='unknown')
            e.add_source(url)

            for o in related:
                i = e.add_agenda_item(o['what'])
                i.add_bill(o['ord_no'], note='consideration')

            yield e
Esempio n. 9
0
def test_basic_agenda():
    """An agenda item can be added and the event still validates."""
    event = Event(name="get-together",
                  when=dt.datetime.utcnow(),
                  location="Joe's Place")
    event.add_source(url='foobar')
    event.validate()

    item = event.add_agenda_item("foo bar")
    assert item
    event.validate()
Esempio n. 10
0
    def get_events(self):
        """Yield Event objects from the NYC Legistar calendar table."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://legistar.council.nyc.gov/Calendar.aspx"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]  # skip the header row
        for row in rows:
            els = row.xpath(".//td")
            if len(els) <= 2:
                continue  # Odd one-off row without the full cell set.

            (name, date, _, time, where, topic, details, agenda, minutes, media) = els
            # _ nom's the image of the cal next to the meeting date.

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()
            topic = topic.text_content().strip()

            # Deferred meetings have no usable time; skip them.
            if "Deferred" in time:
                continue

            # Empty time cell -> all-day event (midnight timestamp).
            # NOTE(review): all_day is computed but never attached to the event.
            all_day = False
            if time == "":
                all_day = True
                when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
            else:
                when = dt.datetime.strptime("%s %s" % (date.text.strip(), time), "%m/%d/%Y %I:%M %p")

            event = Event(name=name, session=self.session, when=when, location=location)
            event.add_source(url)

            # Attach linked documents: details as HTML, agendas/minutes as PDFs.
            details = details.xpath(".//a[@href]")
            for detail in details:
                event.add_document(detail.text, detail.attrib["href"], mimetype="text/html")

            agendas = agenda.xpath(".//a[@href]")
            for a in agendas:
                event.add_document(a.text, a.attrib["href"], mimetype="application/pdf")

            minutes = minutes.xpath(".//a[@href]")
            for minute in minutes:
                event.add_document(minute.text, minute.attrib["href"], mimetype="application/pdf")

            yield event
Esempio n. 11
0
    def scrape_event(self, href):
        """Scrape one event detail page and yield an Event.

        ``href`` is an anchor element linking to the detail page.
        """
        page = self.lxmlize(href.attrib['href'])
        what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text
        info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:]
        ret = {
            "Location:": "Unknown"  # default if no location row is present
        }
        for tr in info:
            tds = tr.xpath(".//td")
            if len(tds) < 2:
                continue
            # First two cells form a "Label:" / value pair.  Use a distinct
            # name here: the original reused ``what``, clobbering the event
            # title read above, so Event(name=...) got the last row label.
            key, data = [tds.pop(0).text_content().strip() for x in range(2)]
            ret[key] = data

        agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]")
        if agendas:
            for agenda in agendas:
                print("Agenda:", agenda.attrib['href'])

        # "Time:" is either "HH:MM AM" or "HH:MM AM - HH:MM PM".
        t = ret['Time:']
        start_time, end_time = t, None
        if "-" in t:
            start_time, end_time = (x.strip() for x in t.split("-", 1))

        start_time = "%s %s" % (ret['Date:'], start_time)
        dts = "%B %d, %Y %I:%M %p"
        start = dt.datetime.strptime(start_time, dts)

        end = None
        if end_time:
            end = "%s %s" % (ret['Date:'], end_time)
            end = dt.datetime.strptime(end, dts)

        # Only pass ``end`` when the page actually provided one.
        kwargs = {}
        if end:
            kwargs['end'] = end

        e = Event(name=what, session=self.session, location=ret['Location:'], when=start,
                  **kwargs)
        e.add_source(href.attrib['href'])
        yield e
Esempio n. 12
0
    def get_events(self):
        """Yield events from the Boston meeting-records index page."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"

        page = self.lxmlize(url)
        for entry in page.xpath(
                "//tr[@style='font-family: Verdana; font-size: 12px;']"):
            name, when, links = entry.xpath(".//td")
            # Strip mis-decoded non-breaking spaces (UTF-8 nbsp read as
            # two latin-1 chars).
            name = name.text.strip().replace(u"\xc2\xa0", "")
            when = when.text.strip().replace(u"\xc2\xa0", "")
            when = dt.datetime.strptime(when, "%m/%d/%Y")
            links = links.xpath(".//a")
            links = {x.text: x.attrib['href'] for x in links}
            e = Event(name=name,
                      session=self.session,
                      when=when,
                      location='unknown')

            e.add_source(url)
            # Use a name other than ``url`` here: the original reused
            # ``url`` as the loop variable, clobbering the page URL, so
            # every event after the first was sourced to a document link
            # instead of the calendar page.
            for note, href in links.items():
                e.add_link(note=note, url=href)

            yield e
Esempio n. 13
0
    def get_events(self):
        """Yield Event objects from the Chicago Legistar calendar table."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://chicago.legistar.com/Calendar.aspx/"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]  # skip the header row
        for row in rows:
            if "No records were found." in row.text_content():
                self.warning("Hum. They don't seem to have events?")
                continue

            (name, date, _, time, where, details, notice,
             agenda, summary, video) = row.xpath(".//td")
            # _ nom's the image next to the date on the page.

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()

            # Deferred meetings have no usable time; skip them.
            if "Deferred" in time:
                continue

            # Empty time cell -> all-day event (midnight timestamp).
            # NOTE(review): all_day is computed but never attached to the event.
            all_day = False
            if time == "":
                all_day = True
                when = dt.datetime.strptime(date.text.strip(),
                                            "%m/%d/%Y")
            else:
                when = dt.datetime.strptime("%s %s" % (date.text.strip(), time),
                                            "%m/%d/%Y %I:%M %p")

            event = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)
            event.add_source(url)

            # Attach agenda and summary links; the details/notice/video
            # cells are currently unused.
            agendas = agenda.xpath(".//a[@href]")
            for a in agendas:
                event.add_link(a.text, a.attrib['href'])

            summary = summary.xpath(".//a[@href]")
            for minute in summary:
                event.add_link(minute.text, minute.attrib['href'])

            yield event
Esempio n. 14
0
    def get_events(self):
        """Yield events parsed out of the Legistar master table via
        self.parseDataTable."""
        for page in self.eventPages(EVENTSPAGE) :
            events_table = page.xpath("//table[@class='rgMasterTable']")[0]
            for events, headers, rows in self.parseDataTable(events_table) :
                # NOTE(review): debug print left in.
                print(events)
                # Location cells look like
                # "<room>--<address>--Chicago, Illinois[<status>]".
                location_string = events[u'Meeting\xa0Location']
                location_list = location_string.split('--')
                location = ', '.join(location_list[0:2])

                # Anything after "Chicago, Illinois" is a status annotation;
                # unrecognized annotations fall back to 'confirmed'.
                status_string = location_list[-1].split('Chicago, Illinois')
                if len(status_string) > 1 and status_string[1] :
                    status = status_string[1].lower()
                    if status not in ['cancelled', 'tentative', 'confirmed', 'passed'] :
                        print(status)
                        status = 'confirmed'
                else :
                    status = 'confirmed'

                # Meeting\xa0Date is presumably already a datetime here —
                # .replace(hour=...) below requires it; TODO confirm against
                # parseDataTable.
                when = events[u'Meeting\xa0Date']
                time_string = events[u'Meeting\xa0Time']
                event_time = datetime.datetime.strptime(time_string,
                                                        "%I:%M %p")
                # NOTE(review): only the hour is copied onto the date;
                # minutes from the time cell are dropped.
                when = when.replace(hour=event_time.hour)

                e = Event(name=events["Name"]["label"],
                          session=self.session,
                          when=when,
                          location=location,
                          status=status)
                e.add_source(EVENTSPAGE)
                if events['Video'] != u'Not\xa0available' :
                    print(events['Video'])

                yield e
Esempio n. 15
0
    def get_events(self):
        """Yield New York executive orders as Event objects."""
        # get list of executive orders
        url = 'http://www.governor.ny.gov/sl2/ExecutiveOrderindex'
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Extract the governor's name.  str.lstrip strips a *character
        # set*, not a prefix, so lstrip('Governor ') would also eat any
        # leading letters of the name drawn from "Governr " (e.g.
        # "George ..." -> "ge ...").  Strip the literal prefix instead.
        gov = page.xpath("(//div[@class='section-header']/div/div/div/a/div/h2)[1]")[0]
        governor_name = gov.text
        if governor_name.startswith('Governor '):
            governor_name = governor_name[len('Governor '):]

        # scrape each executive order
        for eo_par in page.xpath("//div[@class='content']/p"):
            for link in eo_par.xpath(".//a"):

                url = link.get('href').lower()
                if url.endswith('.pdf'):
                    continue

                # Normalize the EO page to lowercase, single-spaced words
                # so the date can be pulled out of the
                # "given ... by the governor" clause.
                eo_page = self.urlopen(url)
                eo_page = lxml.html.fromstring(eo_page)
                eo_page = re.sub(r'(\r*\n|\W)', ' ', eo_page.xpath('string()').lower())
                eo_page = re.sub(r'\s+', ' ', eo_page)
                date_par = re.search('(?:g i v e n)(.*)(?:by the governor)', eo_page).groups()[0]
                date_comp = [s.strip() for s in
                             re.match('(?:.*this)(.*)(?:day of)(.*)(?:in the year)(.*)', date_par).groups()]
                # Wtn presumably parses spelled-out numbers ("twenty
                # first") to ints — TODO confirm.
                eo_date = dt.datetime.strptime(' '.join(
                    (str(Wtn.parse(date_comp[0])), date_comp[1], str(Wtn.parse(date_comp[2])))), '%d %B %Y')

                # build yield object
                eo_number = eo_par.xpath('string()').split(':', 1)[0]
                eo = Event(eo_number, eo_date, 'New York')
                eo.add_person(governor_name, 'governor')
                eo.description = link.text
                eo.add_document(eo_number, url, 'text/html')
                eo.add_source(url)

                yield eo

        # TODO: get list of press statements
Esempio n. 16
0
    def scrape_event_page(self, page):
        """Yield Events from a listing page's data-grid rows.

        Each row's cells are label/value blocks; the second element of the
        first usable block is treated as the event title.
        """
        for entry in page.xpath(
                "//table[@id='Listview1_DataGrid1']//tr[@class='mainText']"):
            title = None
            ret = {}
            for block in entry.xpath(".//td[@class='mainText']"):
                entries = block.xpath("./*")
                # Skip cells that contain nested tables.
                if "table" in (x.tag for x in entries):
                    continue
                info = [self.cleanup(x.text_content()) for x in entries]
                # The first usable block carries the title in its second
                # element; later blocks are key/value data.
                if title is None:
                    title = info[1]
                    continue
                key = info.pop(0)
                val = None
                if "Time: " in key:
                    # "... Time: <start> - <end>" carries both times in
                    # the key cell itself.
                    _, val = key.split("Time: ", 1)
                    start, end = val.split(" - ", 1)
                    val = {"start": start, "end": end}
                    key = "time"
                else:
                    val = info.pop(0) if info else None

                ret[key] = val
                # Each block should contribute exactly one pair.
                if info != []:
                    raise Exception("Erm. odd scrape.")

            if title is None:
                continue

            ret['title'] = title
            # Convert the raw start/end strings to datetimes.
            start, end = self.get_start_end(ret)
            ret['time']['start'], ret['time']['end'] = start, end

            # NOTE(review): unlike the other scrapers, no add_source()
            # call here — events are yielded without a source.
            event = Event(name=ret['Description:'] or "TBA",
                          session=self.session,
                          location=ret['Location:'],
                          when=ret['time']['start'],
                          end=ret['time']['end'])
            yield event
Esempio n. 17
0
    def handle_buffer(self, buf):
        """Parse one text buffer into an Event, if it contains a date.

        The first date found sets the day; a time following the year (if
        any) sets the hour, otherwise the event is treated as all-day.
        The remainder of the buffer becomes the event name.
        """
        found = DATE_FINDER.findall(buf)
        if found == []:
            return
        month, day, year = found[0]
        # Keep only the text after the year; the name and time live there.
        _, buf = buf.split(year, 1)

        times = TIME_FINDER.findall(buf)
        time = times[0] if times else None
        all_day = time is None

        # Assemble the strptime input and its matching format string.
        if all_day:
            tbuf = "%s %s %s" % (month, day, year)
            fmt = "%B %d %Y"
        else:
            tbuf = "%s %s %s %s" % (month, day, year, time)
            fmt = "%B %d %Y %I:%M %p"

        # Normalize "Noon" so %p can parse it.
        tbuf = tbuf.replace("Noon", "PM")

        # Clean the leftover text used as the event name: unify dashes,
        # then drop a leading " - " separator.
        buf = re.sub("–", "-", buf)
        buf = re.sub(r"^\s+\-\s+", "", buf)
        buf = buf.strip()

        when = dt.datetime.strptime(tbuf, fmt)
        event = Event(name=buf,
                      session=self.session,
                      when=when,
                      location="City Hall")
        yield event
Esempio n. 18
0
    def get_events(self):
        """Yield executive orders for the current governor's session.

        Parses the NJ executive-order index table: single-cell rows name a
        governor (starting a new administration's section); three-cell
        rows are individual orders.  Regex patterns are raw strings — the
        original used plain strings with invalid escapes like '\\W', '\\s'.
        """
        # get list of executive orders
        url = 'http://nj.gov/infobank/circular/eoindex.htm'
        page = self.urlopen(url)
        page = lxml_html.fromstring(page)
        page.make_links_absolute(url)

        # state variables for parser
        governor_name = None
        gov_session_name = None

        # parse the table of executive orders
        for eo_row in page.xpath('//table[@border>0]//tr'):

            cols = eo_row.xpath('.//td')

            # extract governor's name
            if len(cols) == 1:
                # remove things like "'s"
                governor_name = re.sub(r'\W\w\s', ' ', eo_row.xpath('string()'))
                governor_name = re.sub(r'\r*\n|\W', ' ', governor_name)
                governor_name = re.sub(r'\s+', ' ', governor_name)
                governor_name = re.search("executive order.*governor(.*)administration",
                                          governor_name, re.IGNORECASE).groups()[0].strip()
                gov_session_name = re.sub(r'\s+', '_', governor_name)

            # extract executive order
            elif len(cols) == 3:
                if self.session == gov_session_name:
                    eo_num = cols[0].xpath('string()').strip()
                    # Skip header/filler rows whose first cell isn't a number.
                    try:
                        float(eo_num)
                    except ValueError:
                        continue

                    eo_title = re.sub(r'\r*\n', ' ', cols[1].xpath('string()'))
                    eo_title = re.sub(r'\s+', ' ', eo_title)
                    eo_title = re.sub(r'\[.*pdf.*\]', '', eo_title).strip()
                    if eo_title == '' or eo_title is None:
                        continue

                    eo_date = re.search(r'([0-9]{1,2}).*/([0-9]{1,2}).*/([0-9]{4}|[0-9]{2})',
                                        cols[2].xpath('string()'))
                    if eo_date is None:
                        continue
                    eo_date = '/'.join(eo_date.groups())
                    # Dates appear with both two- and four-digit years.
                    try:
                        eo_date = dt.datetime.strptime(eo_date, '%m/%d/%y')
                    except ValueError:
                        eo_date = dt.datetime.strptime(eo_date, '%m/%d/%Y')

                    eo_source = cols[0].xpath('.//a')[0].get('href').lower()
                    mime_type = MimeTypes().guess_type(eo_source)[0]
                    if mime_type is None:
                        mime_type = 'text/html'

                    # build yield object
                    eo = Event(eo_num, eo_date, 'New Jersey', gov_session_name)
                    eo.add_person(governor_name, 'governor')
                    eo.description = eo_title
                    eo.add_document(eo_num, eo_source, mime_type)
                    eo.add_source(eo_source)

                    yield eo
Esempio n. 19
0
    def get_events(self):
        """Yield Toronto council meetings with attendance and agenda items.

        Downloads one attendance CSV per member into a temp directory,
        then walks the meeting-schedule CSV, attaching attendees and
        agenda items whose date matches each meeting.
        """
        # Report endpoints used below:
        # .../getAdminReport.do?function=prepareMeetingScheduleReport
        # .../getAdminReport.do?function=prepareMemberAttendanceReport

        tmpdir = tempfile.mkdtemp()

        # -- scrape attendance --
        page = lxmlize(
            "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport"
        )
        members = page.xpath(
            '//td[@class="inputText"]/select[@name="memberId"]/option')
        for member in members:
            post = {
                'function': 'getMemberAttendanceReport',
                'download': 'csv',
                'exportPublishReportId': 1,
                'termId': 4,
                'memberId': member.attrib['value'],
                'decisionBodyId': 0,
            }
            r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                              data=post)
            # Members without a report come back as HTML, not Excel/CSV.
            if r.headers['content-type'] != 'application/vnd.ms-excel':
                continue

            # Context manager closes the handle even if write() fails.
            with open(tmpdir + '/' + member.text + '.csv', 'w') as attendance_file:
                attendance_file.write(r.text)

        # -- scrape events --
        post = {
            'function': 'getMeetingScheduleReport',
            'download': 'csv',
            'exportPublishReportId': 3,
            'termId': 4,
            'decisionBodyId': 0,
        }

        r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                          data=post)
        empty = []  # rows with no attendees (kept for debugging)

        with open('meetings.csv', 'w') as meeting_file:
            meeting_file.write(r.text)
        # csv.reader needs text mode on Python 3; 'rb' handed it bytes.
        with open('meetings.csv', 'r') as csvfile:
            csvfile = csv.reader(csvfile, delimiter=',')
            next(csvfile)  # skip the header row

            committee = ''
            agenda_items = []

            for row in csvfile:
                name = row[0]
                when = row[2]
                when = dt.datetime.strptime(when, "%Y-%m-%d")
                location = row[5]

                # Rows are grouped by committee; only refetch agenda items
                # when the committee name changes.
                if name != committee:
                    committee = name
                    agenda_items = find_items(committee)

                e = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)

                # Look up attendees once instead of twice per row.
                attendees = find_attendees(tmpdir, row)
                if len(attendees) == 0:
                    empty.append(row)
                for attendee in attendees:
                    e.add_person(attendee)
                e.add_source(
                    "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport"
                )

                for item in agenda_items:
                    if item['date'].date() == when.date():
                        i = e.add_agenda_item(item['description'])
                        i.add_committee(committee)
                        i['order'] = item['order']

                        for link in item['links']:
                            i.add_media_link(link['name'],
                                             link['url'],
                                             on_duplicate='ignore')

                        if 'notes' in item:
                            i['notes'] = [item['notes']]

                yield e

        shutil.rmtree(tmpdir)
        os.remove('meetings.csv')
Esempio n. 20
0
  def get_events(self):
    """Yield Toronto council meetings with attendance and agenda items.

    Downloads one attendance CSV per member into a temp directory, then
    walks the meeting-schedule CSV, attaching attendees and agenda items
    whose date matches each meeting.
    """
    # Report endpoints (no-op expression statements kept as in-source notes):
    "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport"
    "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport"

    # -- scrape attendance --

    tmpdir = tempfile.mkdtemp()

    page = lxmlize("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport")
    members = page.xpath('//td[@class="inputText"]/select[@name="memberId"]/option')
    for member in members:
      post = {
        'function': 'getMemberAttendanceReport',
        'download': 'csv',
        'exportPublishReportId': 1,
        'termId': 4,
        'memberId': member.attrib['value'],
        'decisionBodyId': 0,
      }
      r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
      # Members without a report come back as HTML, not Excel/CSV.
      if r.headers['content-type'] != 'application/vnd.ms-excel':
        continue

      attendance_file = open(tmpdir + '/' + member.text + '.csv', 'w')
      attendance_file.write(r.text)
      attendance_file.close()

    # -- scrape events --
    post = {
      'function': 'getMeetingScheduleReport',
      'download': 'csv',
      'exportPublishReportId': 3,
      'termId': 4,
      'decisionBodyId': 0,
    }

    r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
    empty = []  # rows with no attendees; collected but not used downstream

    meeting_file = open('meetings.csv', 'w')
    meeting_file.write(r.text)
    meeting_file.close()
    # NOTE(review): 'rb' hands bytes to csv.reader, which fails on
    # Python 3 — this looks written for Python 2; confirm before porting.
    with open('meetings.csv', 'rb') as csvfile:
      csvfile = csv.reader(csvfile, delimiter=',')
      next(csvfile)  # skip the header row

      committee = ''
      agenda_items = []

      for row in csvfile:
        name = row[0]
        when = row[2]
        when = dt.datetime.strptime(when, "%Y-%m-%d")
        location = row[5]

        # Rows are grouped by committee; agenda items are refetched only
        # when the committee name changes.
        if name != committee:
          committee = name
          agenda_items = find_items(committee)

        e = Event(name=name,
                  session=self.session,
                  when=when,
                  location=location
                  )

        # NOTE(review): find_attendees is called twice per row.
        attendees = find_attendees(tmpdir, row)
        if len(attendees) == 0:
          empty.append(row)
        for attendee in find_attendees(tmpdir, row):
          e.add_person(attendee)
        e.add_source("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport")

        for item in agenda_items:
          if item['date'].date() == when.date():
            i = e.add_agenda_item(item['description'])
            i.add_committee(committee)
            i['order'] = item['order']

            for link in item['links']:
              i.add_media_link(link['name'], link['url'], on_duplicate='ignore')

            if 'notes' in item:
              i['notes'] = [item['notes']]

        yield e

    shutil.rmtree(tmpdir)
    os.remove('meetings.csv')
Esempio n. 21
0
def test_basic_event():
    """Smoke-test Event: construct, source, link, and validate."""
    event = Event(
        name="get-together",
        when=dt.datetime.utcnow(),
        location="Joe's Place",
    )
    event.add_source(url='foobar')
    event.validate()

    # The same URL may be added twice; the noted copy is a distinct link.
    event.add_link("http://foobar.baz")
    event.add_link("http://foobar.baz", note="foo")
    event.validate()

    assert len(event.links) == 2
Esempio n. 22
0
    def get_events(self):
        """Yield Event objects scraped from the meeting listing page.

        The page alternates header rows (which carry the date context) with
        detail rows; each detail row may describe several meetings separated
        by semicolons.
        """
        page = self.lxmlize(PAGE)
        date_header = None

        for node in page.xpath("//div[@class='col-middle']//ul/li"):
            headings = node.xpath("./a/h2")
            if headings:
                # A heading row establishes the date context for the rows
                # that follow it.
                date_header = headings[0].text
                continue

            if date_header is None:
                # Detail row before any heading -- nothing to anchor it to.
                self.warning("Ungrok!")
                continue

            bold, _, italic = node.xpath("./p/*")
            title_text = bold.text_content()
            detail_text = italic.text_content()

            if "NO MEETING" in detail_text:
                continue

            # The bold title is "<day> - <rest>"; only the day portion is used.
            day, _rest = (part.strip() for part in title_text.split("-", 1))

            # Location sticks across sub-events until overridden by " in ".
            where = "Council Chambers"

            for piece in (chunk.strip() for chunk in detail_text.split(";")):
                if " in " in piece:
                    piece, where = piece.rsplit(" in ", 1)
                piece = piece.replace(u'\xa0', ' ')

                if "NO" in piece and "MEETING" in piece:
                    continue
                if "to follow" in piece:
                    continue

                matched = EVENT_RE.match(piece).groupdict()
                name = matched['event']
                clock = matched['time'].replace("a.m.", "AM").replace("p.m.", "PM")

                dtstring = ", ".join([day, clock])

                # Times appear either as "7:30 PM" or the terser "7PM".
                try:
                    starts = dt.datetime.strptime(dtstring,
                                                  "%m/%d/%Y, %I:%M %p")
                except ValueError:
                    starts = dt.datetime.strptime(dtstring,
                                                  "%m/%d/%Y, %I%p")

                e = Event(name=name,
                          when=starts,
                          location=where)
                e.add_source(PAGE)
                yield e
Esempio n. 23
0
    def migrate_events(self, state):
        """Migrate legacy event records into pupa-style Event objects.

        Reads event documents from ``self.billy_db.events`` (optionally
        filtered by *state*), converts each one into an Event, and persists
        it via ``self.save_object``. Events without any source are skipped.
        """
        spec = {}
        if state:
            spec['state'] = state

        for entry in self.billy_db.events.find(spec, timeout=False):

            e = Event(
                name=entry['description'],
                when=entry['when'],
                location=entry['location'],
                session=entry['session'],
                updated_at=entry['updated_at'],
                created_at=entry['created_at'],
                type=entry['type'],
            )
            # Preserve the legacy id so the old and new records stay linked.
            e.identifiers = [{'scheme': 'openstates',
                             'identifier': entry['_id']}]
            e._openstates_id = entry['_id']
            if entry.get('+location_url'):
                e.add_location_url(entry['+location_url'])

            # Legacy data stores the link under either 'link' or '+link'.
            link = entry.get('link', entry.get("+link"))
            if link:
                e.add_link(link, 'link')

            # Keys handled explicitly above/below, or internal bookkeeping;
            # any other non-empty key is copied verbatim into e.extras.
            blacklist = ["description", "when", "location", "session",
                         "updated_at", "created_at", "end", "sources",
                         "documents", "related_bills", "state", "+link",
                         "link", "level", "participants", "country",
                         "_all_ids", "type"]

            e.status = entry.get('status')
            # Normalize a known spelling variant in the source data.
            typos = {
                "canceled": "cancelled"
            }
            if e.status in typos:
                e.status = typos[e.status]

            for key, value in entry.items():
                if key in blacklist or not value or key.startswith("_"):
                    continue
                e.extras[key] = value

            if entry.get('end'):
                end = entry['end']
                try:
                    end = dt.datetime.fromtimestamp(end)
                except TypeError:
                    # presumably 'end' is already a datetime here rather than
                    # a timestamp -- keep it as-is. TODO confirm.
                    pass

                e.end = end

            for source in entry['sources']:
                e.add_source(url=source['url'])

            # Source-less events are dropped entirely.
            if e.sources == []:
                continue  # XXX: print warning

            for document in entry.get('documents', []):
                e.add_document(name=document.get('name'),
                               document_id=document.get('doc_id'),
                               url=document['url'],
                               mimetype=document.get(
                                   "mimetype", document.get(
                                       "+mimetype",
                                       "application/octet-stream")))
                # Try to add the mimetype. If it fails, fall back to a generic
                # undeclared application/octet-stream.

            # Create a single agenda item lazily, only if related bills exist,
            # and attach every bill to it.
            agenda = None
            for bill in entry.get('related_bills', []):
                if agenda is None:
                    agenda = e.add_agenda_item(
                        description="Bills up for Consideration"
                    )

                hcid = _hot_cache.get(bill.get('id', None), None)
                bid = bill['bill_id']
                if bid is None:
                    continue

                agenda.add_bill(bill=bid, id=hcid)

            for who in entry.get('participants', []):
                participant_type = who.get('participant_type', 'committee')
                # I've gone through the backlog of OpenStates data, they are
                # all committees of some sort.

                # Chamber may live under legacy key spellings.
                who_chamber = who.get('chamber')
                if who_chamber is None:
                    for chamber in ["_chamber", "+chamber"]:
                        f = who.get(chamber)
                        if f:
                            who_chamber = f
                            break

                if who_chamber is None:
                    # Freak of nature ...
                    continue

                hcid = _hot_cache.get(who.get('id', None), None)

                # Map the legacy participant type onto the new vocabulary.
                e.add_participant(
                    name=who['participant'],
                    type={
                        "committee": "organization",
                        "legislator": "person",
                        "person": "person",
                    }[participant_type],
                    id=hcid,
                    note=who['type'],
                    chamber=who_chamber)

            self.save_object(e)
Esempio n. 24
0
    def get_events(self):
        """Yield Event objects scraped from the Arlington meeting page.

        Walks both the 'archive' and 'upcoming' meeting tables; each row
        becomes an Event with its agenda documents, and archive rows
        additionally get video/audio/minutes media links.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    # narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are no longer swallowed
                    continue

                meeting_title = cell_mapping['title'].text
                # the date cell carries a unix timestamp inside a <span>
                meeting_date = datetime.datetime.fromtimestamp(int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, session=self.session, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_links = cell_mapping['agenda'].cssselect('a')
                if len(agenda_links) > 0:
                    meeting_agenda_url = agenda_links[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not len(link_url):
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

                # skip everything below here for the 'upcoming' table
                # NOTE(review): 'upcoming' rows are built but never yielded
                # (the yield below is only reached for 'archive' rows) --
                # confirm this is intentional.
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if len(video_cell) > 0:
                    # the URL is embedded in the onclick handler between single
                    # quotes; rebuild it from group(1) -- group(0) would include
                    # the closing apostrophe in the URL.
                    video_url_match = re.search(r"http://(.*?)'", video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url='http://' + video_url_match.group(1), mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if len(audio_cell) > 0:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if len(minutes_cell) > 0:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e