Exemple #1
0
    def scrape_committee_events(self, code, name):
        """Yield Event objects from a CT committee's fullcalendar JSON feed."""
        events_url = (
            "http://www.cga.ct.gov/basin/fullcalendar/commevents.php?"
            "comm_code={}".format(code))
        raw = self.get(events_url, verify=False).text
        date_fmt = "%Y-%m-%dT%H:%M:%SZ"

        for info in json.loads(raw):
            title = info["title"]

            # Untitled entries carry no usable data; skip them.
            if title is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            # Cancellations are flagged in the title text.
            if title.startswith("CANCELLED:"):
                self.info(
                    "Cancelled event found; it will be skipped: {}".format(
                        title))
                continue

            start = datetime.datetime.strptime(info["start"], date_fmt)
            location = "{0} {1}".format(info["building"].strip(),
                                        info["location"].strip())

            event = Event(
                start_date=self._tz.localize(start),
                location_name=location,
                name=title,
                description=title,
            )
            event.add_source(events_url)

            yield event
    def scrape_upper(self):
        """Scrape OK Senate meeting notices and yield one Event per notice.

        The page is a flat text listing: each meeting block starts with a
        full date ("Monday, January 1, 2020") followed by TIME:/PLACE: lines.
        """
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        text = page.text_content()
        _, text = text.split("MEETING NOTICES")
        re_date = r"[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}"
        # Pair each date match with the text chunk that follows it.
        chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

        for match, data in chunks:
            when = match.group()
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y")

            # BUG FIX: filter() returns an iterator in Python 3, so the
            # previous `lines[0]` below raised TypeError; build a real list
            # of the non-blank stripped lines instead.
            lines = [x.strip() for x in data.splitlines() if x.strip()]
            # \x96 is the en-dash separator after the time on this page.
            time_ = re.search(r"^\s*TIME:\s+(.+?)\s+\x96", data, re.M).group(1)
            time_ = time_.replace("a.m.", "AM").replace("p.m.", "PM")
            time_ = time.strptime(time_, "%I:%M %p")
            when += datetime.timedelta(hours=time_.tm_hour,
                                       minutes=time_.tm_min)

            # First non-blank line of the block is the meeting title.
            title = lines[0]

            where = re.search(r"^\s*PLACE:\s+(.+)", data, re.M).group(1)
            where = where.strip()

            event = Event(name=title,
                          start_date=self._tz.localize(when),
                          location_name=where)
            event.add_source(url)

            yield event
Exemple #3
0
def event_obj():
    """Return a minimal fixture Event starting now (UTC), for tests."""
    timestamp = datetime.datetime.utcnow().isoformat().split(".")[0] + "Z"
    event = Event(
        name="get-together",
        start_date=timestamp,
        location_name="Joe's Place",
    )
    event.add_source(url="http://example.com/foobar")
    return event
    def scrape_chamber(self, chamber):
        """Scrape one PA chamber's meeting-detail tables and yield Events."""
        url = utils.urls["events"][chamber]
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for table in page.xpath(
                '//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
            # The date lives on an anchor of the enclosing day container.
            date_string = table.xpath(
                'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
            for row in table.xpath("tr"):
                time_string = row.xpath(
                    'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
                description = (
                    row.xpath('td[@class="CMS-MeetingDetail-Agenda"]/div/div')
                    [-1].text_content().strip())
                location = (row.xpath('td[@class="CMS-MeetingDetail-Location"]'
                                      )[0].text_content().strip())
                committees = row.xpath(
                    './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
                )
                bills = row.xpath('.//a[contains(@href, "billinfo")]')

                try:
                    start_date = datetime.datetime.strptime(
                        "{} {}".format(date_string, time_string),
                        "%m/%d/%Y %I:%M %p")
                except ValueError:
                    # Unparseable time (e.g. "Call of Chair") — give up on
                    # the rest of this table, as the original logic did.
                    break

                event = Event(
                    name=description,
                    start_date=self._tz.localize(start_date),
                    location_name=location,
                )
                event.add_source(url)

                if bills or committees:
                    item = event.add_agenda_item(description)
                    for bill in bills:
                        parsed = urllib.parse.urlparse(bill.get("href"))
                        qs = urllib.parse.parse_qs(parsed.query)
                        # BUG FIX: parse_qs maps each key to a *list* of
                        # values; take the first element instead of
                        # formatting the list (which yielded "['H']['B'] ...").
                        item.add_bill("{}{} {}".format(qs["body"][0],
                                                       qs["type"][0],
                                                       qs["bn"][0]))
                    for committee in committees:
                        parsed = urllib.parse.urlparse(committee.get("href"))
                        qs = urllib.parse.parse_qs(parsed.query)
                        # Same list-valued fix for the committee code.
                        code = qs.get("Code")
                        item.add_committee(
                            re.sub(r" \([S|H]\)$", "", committee.text),
                            id=code[0] if code else None,
                        )

                yield event
Exemple #5
0
    def scrape_lower_event(self, url):
        """Scrape a single House committee-meeting page into an Event."""
        html = self.get(url).text

        if "not meeting" in html.lower():
            self.info(f"Skipping {url}, not meeting")
            return

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        heading = page.xpath('//div[contains(@class,"sectionhead")]/h1')[0]
        com = heading.text_content().strip()
        com = f"House {com}"

        start = self.tz.localize(
            dateutil.parser.parse(self.get_meeting_row(page, "Start Date")))

        # End date is optional on these pages.
        end = None
        end_raw = self.get_meeting_row(page, "End Date")
        if end_raw:
            end = self.tz.localize(dateutil.parser.parse(end_raw))
        location = self.get_meeting_row(page, "Location")

        summary = ""
        if page.xpath('//div[contains(text(),"Meeting Overview")]'):
            summary = (page.xpath(
                '//div[div[contains(text(),"Meeting Overview")]]/div[contains(@class,"ml-3")]'
            )[0].text_content().strip())

        # Only pass end_date when the page actually provides one.
        kwargs = {
            "name": com,
            "start_date": start,
            "location_name": location,
            "description": summary,
        }
        if end:
            kwargs["end_date"] = end
        event = Event(**kwargs)
        event.add_source(url)

        for header in page.xpath(
                '//div[contains(@class,"meeting-actions-bills")]/h5'):
            event.add_agenda_item(header.text_content().strip())
            for li in header.xpath("following-sibling::ul/li"):
                agenda_text = li.text_content().strip()
                agenda_text = re.sub(r"\s+\u2013\s+", " - ", agenda_text)
                item = event.add_agenda_item(agenda_text)
                bill_ids = re.findall(r"H.*\s+\d+", agenda_text)
                if bill_ids:
                    item.add_bill(bill_ids[0])

        yield event
Exemple #6
0
    def scrape_upper_com(self, url, com, session):
        """Scrape a Senate committee's meeting table and yield Events.

        `url` is the committee listing base URL; `session` is appended to it.
        """
        url = f"{url}{session}"
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        com = f"Senate {com}"

        for row in page.xpath('//table[@id="meetingsTbl"]/tbody/tr'):
            day = row.xpath("td[1]")[0].text_content().strip()
            time_ = row.xpath("td[2]")[0].text_content().strip()
            notice = row.xpath("td[3]")[0].text_content().strip()
            location = "See Agenda"  # it's in the PDFs but not the web page

            date = dateutil.parser.parse(f"{day} {time_}")
            date = self.tz.localize(date)

            if notice.lower() == "not meeting" or "cancelled" in notice.lower(
            ):
                continue

            event = Event(name=com, start_date=date, location_name=location)

            agenda_classes = [
                "mtgrecord_notice",
                "mtgrecord_expandedAgenda",
                "mtgrecord_attendance",
            ]

            for agenda_class in agenda_classes:
                # BUG FIX: use a relative xpath (".//") — the previous
                # absolute "//a[...]" searched the whole document from the
                # root, attaching the same first-match link to every row.
                links = row.xpath(f".//a[@class='{agenda_class}']")
                if links:
                    doc_url = links[0].xpath("@href")[0]
                    doc_name = links[0].text_content().strip()
                    event.add_document(doc_name,
                                       doc_url,
                                       media_type="application/pdf")

            for link in row.xpath("td[7]/a"):
                media_url = link.xpath("@href")[0]
                doc_name = link.text_content().strip()
                event.add_media_link(doc_name, media_url, "audio/mpeg")

            for link in row.xpath("td[9]/a"):
                media_url = link.xpath("@href")[0]
                doc_name = link.text_content().strip()
                event.add_media_link(doc_name, media_url, "text/html")

            # BUG FIX: source the listing page itself; previously `url`
            # had been clobbered by the last document/media href above.
            event.add_source(url)
            yield event
Exemple #7
0
    def scrape(self, start=None):
        """Scrape the GA meetings API from `start` (YYYY-MM-DD; default today)."""
        if start is None:
            start = datetime.datetime.today()
        else:
            start = datetime.datetime.strptime(start, "%Y-%m-%d")

        date_slug = start.strftime("%a %b %d %Y")
        url = f"https://www.legis.ga.gov/api/meetings?startDate={date_slug}"

        for row in self.get(url).json():
            status = "tentative"
            title = row["subject"]

            # Prefix the chamber name unless it's a joint meeting.
            if "joint" not in title.lower():
                prefix = {1: "House", 2: "Senate"}.get(row["chamber"])
                if prefix:
                    title = f"{prefix} {title}"

            start = dateutil.parser.parse(row["start"])

            if start < self.tz.localize(datetime.datetime.now()):
                status = "passed"

            if "cancelled" in title.lower() or "canceled" in title.lower():
                status = "cancelled"
                # try to replace all variants of "[optional dash] cancel[l]ed [optional dash]"
                # so we can match up events to their pre-cancellation occurrence
                title = re.sub(r"-?\s*cancell?ed\s*-?\s*", " ", title, flags=re.I)

            where = row["location"]
            where = f"206 Washington St SW, Atlanta, Georgia, {where}"

            event = Event(
                name=title,
                start_date=start,
                location_name=where,
                classification="committee-meeting",
                status=status,
            )

            if row["agendaUri"] != "":
                event.add_document(
                    "Agenda", row["agendaUri"], media_type="application/pdf"
                )

            if row["livestreamUrl"] is not None:
                event.add_media_link(
                    "Video", row["livestreamUrl"], media_type="text/html"
                )

            event.add_source("https://www.legis.ga.gov/schedule/all")

            yield event
Exemple #8
0
    def scrape_chamber(self, chamber):
        """Parse the MS chamber schedule (a line-oriented text page) into Events.

        A weekday header line starts a new day; each "HH:MM ROOM COMMITTEE"
        line starts a new meeting; subsequent free-text lines accumulate
        into that meeting's description. Each completed meeting is yielded
        when the next one begins (and once more at EOF for the last one).
        """
        chamber_abbr = self.chamber_abbrs[chamber]
        event_url = f"http://billstatus.ls.state.ms.us/htms/{chamber_abbr}_sched.htm"
        text = self.get(event_url).text
        event = None

        # Parser state carried across lines of the listing.
        when, time, room, com, desc = None, None, None, None, None

        for line in text.splitlines():
            # new date
            if re.match(
                r"^(MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY)",
                line,
                re.IGNORECASE,
            ):
                # NOTE(review): `day` is first bound here; a timestamp line
                # appearing before any weekday header would raise NameError
                # below — presumably the feed always leads with a date.
                day = line.split("   ")[0].strip()
            # timestamp, start of a new event
            if re.match(r"^\d{2}:\d{2}", line) or re.match(r"^(BC|AR|AA|TBA)\+", line):
                # if there's an event from the previous lines, yield it
                if when and room and com:
                    event = Event(
                        name=com,
                        start_date=when,
                        location_name=room,
                        classification="committee-meeting",
                        description=desc,
                    )
                    event.add_source(event_url)
                    yield event

                (time, room, com) = re.split(r"\s+", line, maxsplit=2)

                # if it's an after recess/adjourn
                # we can't calculate the time so just leave it empty
                if re.match(r"^(BC|AR|AA|TBA)\+", line):
                    time = ""

                com = com.strip()
                when = dateutil.parser.parse(f"{day} {time}")
                when = self._tz.localize(when)

                # reset the description so we can populate it w/
                # upcoming lines (if any)
                desc = ""
            elif when and room and com:
                # Continuation line: append to the current meeting's description.
                if line.strip():
                    desc += "\n" + line.strip()

        # don't forget about the last event, which won't get triggered by a new date
        if when and room and com:
            event = Event(
                name=com,
                start_date=when,
                location_name=room,
                classification="committee-meeting",
                description=desc,
            )
            event.add_source(event_url)
            yield event
Exemple #9
0
    def scrape_upper(self, session_id):
        """Scrape VA Senate meeting rows from the LIS listing page."""
        list_url = f"https://lis.virginia.gov/cgi-bin/legp604.exe?{session_id}+oth+MTG&{session_id}+oth+MTG"
        page = self.get(list_url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(list_url)

        date = None
        # note the [td] at the end, they have some empty tr-s so skip them
        for row in page.xpath("//div[@id='mainC']/center/table/tr[td]"):
            # A blank first cell means "same date as the previous row".
            if row.xpath("td[1]/text()")[0].strip() != "":
                date = row.xpath("td[1]/text()")[0].strip()

            description = row.xpath("td[3]/text()")[0].strip()

            # data on the house page is better
            if "senate" not in description.lower():
                continue

            time = row.xpath("td[2]/text()")[0].strip()

            status = "tentative"
            # BUG FIX: compare lowercase to lowercase — the previous check
            # ("CANCELLED" in time.lower()) could never be true.
            if "cancelled" in time.lower():
                status = "cancelled"

            try:
                when = dateutil.parser.parse(f"{date} {time}")
            except dateutil.parser._parser.ParserError:
                # Time cell isn't a time (e.g. "Cancelled"); fall back
                # to the date alone.
                when = dateutil.parser.parse(date)

            when = self._tz.localize(when)

            # TODO: Post covid figure out how they post locations
            if "virtual" in description.lower():
                location = "Virtual"
            else:
                location = "Unknown"

            event = Event(
                name=description,
                start_date=when,
                classification="committee-meeting",
                location_name=location,
                status=status,
            )

            event.add_source(list_url)
            yield event
Exemple #10
0
    def scrape_upper(self):
        """Scrape the NH Senate calendar web service and yield Events."""
        # http://gencourt.state.nh.us/dynamicdatafiles/Committees.txt?x=20201216031749
        url = "http://gencourt.state.nh.us/senate/schedule/CalendarWS.asmx/GetEvents"
        page = self.get(
            url,
            headers={
                # BUG FIX: the header *value* previously repeated the
                # "Accept: " header name, producing an invalid value.
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Content-Type": "application/json; charset=utf-8",
                "Referer":
                "http://gencourt.state.nh.us/senate/schedule/dailyschedule.aspx",
                "User-Agent":
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
            },
        )

        page = json.loads(page.content)
        # real data is double-json encoded string in the 'd' key
        page = json.loads(page["d"])

        event_root = "http://gencourt.state.nh.us/senate/schedule"

        for row in page:
            event_url = "{}/{}".format(event_root, row["url"])

            start = dateutil.parser.parse(row["start"])
            start = self._tz.localize(start)
            end = dateutil.parser.parse(row["end"])
            end = self._tz.localize(end)

            title = row["title"].strip()

            event = Event(
                name=title,
                start_date=start,
                end_date=end,
                location_name="See Source",
            )

            event.add_source(event_url)

            # Agenda/bill details live on each event's own page.
            self.scrape_upper_details(event, event_url)
            yield event
Exemple #11
0
    def scrape_lower_event(self, url):
        """Scrape an Assembly committee agenda page into a single Event."""
        root = lxml.html.fromstring(self.get(url).content)
        root.make_links_absolute(url)

        table = root.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
        meta = table.xpath("tr[1]/td[1]/text()")

        # careful, the committee name in the page #committee_div
        # is getting inserted via JS
        # so use the one from the table, and strip the chair name
        com_name = re.sub(r"\(.*\)", "", meta[0])
        com_name = f"Assembly {com_name}"

        start = self._tz.localize(dateutil.parser.parse(meta[1]))
        location = meta[2]

        event = Event(
            name=com_name,
            start_date=start,
            location_name=location,
        )
        event.add_participant(com_name, type="committee", note="host")
        event.add_source(url)

        bill_links = table.xpath('.//a[contains(@href, "/leg/")]')
        if bill_links:
            agenda = event.add_agenda_item("Bills under Consideration")
            for link in bill_links:
                agenda.add_bill(link.text_content().strip())

        yield event
Exemple #12
0
    def scrape_event_page(self, session, chamber, url, datetime):
        """Parse a TX hearing-notice page into a single Event."""
        page = self.lxmlize(url)
        metainfo = {}
        plaintext = ""
        # Collect "KEY: value" pairs from every paragraph, keeping the
        # flattened full text for agenda/bill extraction.
        for p in page.xpath("//p"):
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()

        committee = metainfo["COMMITTEE"]
        where = metainfo["PLACE"]
        # The chair is sometimes glued onto the PLACE line.
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainfo["PLACE"] = where.strip()
            metainfo["CHAIR"] = chair.strip()

        chair = metainfo.get("CHAIR")

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        bills = re.findall(r"(S|J|H)(B|M|R) (\d+)", plaintext)

        event = Event(name=committee,
                      start_date=self._tz.localize(datetime),
                      location_name=where)
        event.dedupe_key = url

        event.add_source(url)
        event.add_participant(committee, type="committee", note="host")
        if chair is not None:
            event.add_participant(chair, type="legislator", note="chair")

        # add a single agenda item, attach all bills
        agenda = event.add_agenda_item(plaintext)

        for prefix, bill_type, number in bills:
            agenda.add_bill("%s%s %s" % (prefix, bill_type, number))

        yield event
Exemple #13
0
    def scrape(self):
        """Scrape AL interim-meeting listings and yield Events."""
        EVENTS_URL = (
            "http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx"
        )
        rows = self.lxmlize(EVENTS_URL).xpath(
            '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
        for row in rows[1:]:
            date = row.xpath("td")[0].text_content().strip()
            time = row.xpath("td")[1].text_content().strip()
            details = row.xpath("td")[4].text_content().strip()

            if time != "":
                date_with_time = "{} {}".format(date, time)
            else:
                # sometimes the time is the first part of the decription
                match = re.match(r"\s*\d+:\d+\s*[ap]m", details, flags=re.I)
                if match:
                    date_with_time = "{} {}".format(date, match.group())
                else:
                    # BUG FIX: previously fell through here with
                    # date_with_time either unbound (NameError on the first
                    # row) or stale from the previous row; skip rows with
                    # no discernible time instead.
                    self.warning(
                        "Skipping meeting with no time on {}".format(date))
                    continue

            location = row.xpath("td")[2].text_content().strip()

            # 11 South Union Street, Montgomery, Alabama, United States
            # TODO: IF location is "room (X)" add state house
            # TODO: REplace "state house" with address

            # 32°22′37.294″N 86°17′57.991″W

            # host = row.xpath('td')[3].text_content().strip()
            name = row.xpath("td")[3].text_content().strip()
            if name == "":
                continue

            event = Event(
                start_date=self._TZ.localize(
                    dateutil.parser.parse(date_with_time)),
                name=name,
                location_name=location,
                description=details,
            )

            event.add_source(EVENTS_URL)

            yield event
Exemple #14
0
    def scrape(self, start=None, end=None):
        """Scrape the calendar feed between start and end (default: today to +3 months)."""
        start = dt.datetime.today() if start is None else dateutil.parser.parse(start)
        end = start + relativedelta(months=+3) if end is None else dateutil.parser.parse(end)

        start = start.strftime("%Y-%m-%d")
        end = end.strftime("%Y-%m-%d")

        url = f"{self.base_url}calendar-data?start={start}&end={end}"
        data = json.loads(self.scraper.get(url).content)

        for item in data:
            name = item["title"].strip()
            lowered = name.lower()
            if "canceled" in lowered:
                continue

            # Floor sessions are not committee events; skip them.
            if "house session" in lowered or "senate session" in lowered:
                continue

            url = f"{self.base_url}{item['url']}"

            when = self._tz.localize(dateutil.parser.parse(item["start"]))

            detail = lxml.html.fromstring(self.scraper.get(url).content)

            location = detail.xpath(
                '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
            )[0].strip()
            agenda_url = detail.xpath(
                '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
            )[0]

            event = Event(
                name=name,
                start_date=when,
                location_name=location,
            )

            event.add_participant(name, type="committee", note="host")
            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")
            event.add_source(url)

            yield event
Exemple #15
0
    def upper_parse_agenda_item(self, item):
        """Expand one Senate agenda listing into Events (one per matching addendum)."""
        response = self.api_client.get(
            "meeting",
            year=item["agendaId"]["year"],
            agenda_id=item["agendaId"]["number"],
            committee=item["committeeId"]["name"],
        )

        data = response["result"]

        committee_id = data["committee"]["committeeId"]
        chamber = committee_id["chamber"].title()
        com_code = committee_id["name"]
        com_name = f"{chamber} {com_code}"

        # each "meeting" is actually a listing page of multiple meetings of the same committee
        # broken out by different addendumId
        for addendum in data["committee"]["addenda"]["items"]:
            if addendum["addendumId"] != item["addendum"]:
                continue

            meeting = addendum["meeting"]

            when = self._tz.localize(
                dateutil.parser.parse(meeting["meetingDateTime"]))

            location = meeting["location"]
            description = meeting["notes"]

            if location == "":
                location = "See Committee Site"

            if "canceled" in description.lower():
                continue

            event = Event(
                name=com_name,
                start_date=when,
                location_name=location,
                description=description,
            )

            event.add_participant(com_name, type="committee", note="host")

            # Build the nysenate.gov committee slug from the committee name.
            com_code = (com_code.lower().replace("'", "")
                        .replace(" ", "-").replace(",", ""))
            event.add_source(f"https://www.nysenate.gov/committees/{com_code}")

            bills = addendum["bills"]["items"]

            if bills:
                agenda = event.add_agenda_item("Bills under consideration")
                for bill in bills:
                    agenda.add_bill(bill["billId"]["printNo"])

            yield event
Exemple #16
0
    def scrape_upper(self):
        """Scrape MO Senate hearing notices and yield one Event per notice."""
        listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split("<hr />")[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, "Date:")
            when_time = self.row_content(page, "Time:")
            room = self.row_content(page, "Room:")

            location = "{}, {}".format(
                room, "201 W Capitol Ave, Jefferson City, MO 65101")

            # com = self.row_content(page, 'Committee:')
            com = page.xpath(
                '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
            )[0]
            com = com.split(", Senator")[0].strip()

            start_date = self._TZ.localize(
                dateutil.parser.parse("{} {}".format(when_date, when_time)))

            event = Event(start_date=start_date,
                          name=com,
                          location_name=location)
            event.add_source(listing_url)
            event.add_participant(com, type="committee", note="host")

            for bill_table in page.xpath(
                    '//table[@width="85%" and @border="0"]'):
                has_bill = bool(bill_table.xpath(self.bill_link_xpath))
                # Tables with a bill link keep the agenda text in row 2;
                # otherwise it's in row 1.
                agenda_row = "tr[2]" if has_bill else "tr[1]"
                agenda_line = bill_table.xpath(
                    "string({})".format(agenda_row)).strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
                if has_bill:
                    bill_link = bill_table.xpath(
                        self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)

            yield event
Exemple #17
0
    def scrape_cal_page(self, url):
        """Scrape one calendar listing page, recursing through pagination."""
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for row in page.xpath("//article[contains(@class,'accordion')]"):
            when = row.xpath(".//time/@datetime")[0]
            when = dateutil.parser.parse(when)

            title = row.xpath(
                ".//h3[contains(@class,'heading-link')]/text()")[0].strip()

            description = row.xpath(
                "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
            )[0].text_content()

            # fix special chars
            description = (description.replace("\n\u2013", " ").replace(
                "\n", " ").replace("\u203a", ""))
            description = description.replace("More about this event",
                                              "").strip()

            location = row.xpath(
                "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
            )[0].text_content()

            event = Event(
                name=title,
                description=description,
                start_date=when,
                location_name=location,
            )

            agenda_url = row.xpath(
                ".//a[contains(text(),'More about this event')]/@href")
            if agenda_url != []:
                event.add_document("Details and Agenda",
                                   agenda_url[0],
                                   media_type="text/html")

            if "committee meeting" in title.lower():
                com_name = title.replace("Committee Meeting", "").strip()
                # BUG FIX: participant type was misspelled "commitee",
                # which breaks downstream filtering by participant type.
                event.add_participant(com_name, type="committee", note="host")

            event.add_source(url)

            yield event

        # Follow the "Upcoming Events" pagination link, if present.
        if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
            next_url = page.xpath(
                "//a[contains(text(), 'Upcoming Events')]/@href")[0]
            yield from self.scrape_cal_page(next_url)
Exemple #18
0
    def scrape_events(self, page):
        """Parse the NE hearing-schedule HTML and yield committee Events.

        Raises EmptyScrape when the page reports no hearings in range.
        """
        page = lxml.html.fromstring(page)

        if page.xpath(
                "//h3[contains(text(),'There are no hearings for the date range')]"
        ):
            # BUG FIX: removed the unreachable `return` that followed this
            # raise.
            raise EmptyScrape

        for meeting in page.xpath('//div[@class="card mb-4"]'):
            com = meeting.xpath(
                'div[contains(@class, "card-header")]/text()')[0].strip()
            details = meeting.xpath(
                'div[contains(@class, "card-header")]/small/text()')[0].strip(
                )

            (location, time) = details.split(" - ")

            # turn room numbers into the full address
            if location.lower().startswith("room"):
                location = "1445 K St, Lincoln, NE 68508, {}".format(location)

            day = meeting.xpath(
                "./preceding-sibling::h2[@class='text-center']/text()"
            )[-1].strip()

            # Thursday February 27, 2020 1:30 PM
            date = "{} {}".format(day, time)
            event_date = self._tz.localize(
                datetime.datetime.strptime(date, "%A %B %d, %Y %I:%M %p"))

            event = Event(
                name=com,
                start_date=event_date,
                classification="committee-meeting",
                description="Committee Meeting",
                location_name=location,
            )

            event.add_committee(com, note="host")

            for row in meeting.xpath("div/table/tr"):
                if not row.xpath("td[3]"):
                    continue
                agenda_desc = row.xpath("td[3]/text()")[0].strip()
                agenda_item = event.add_agenda_item(description=agenda_desc)

                if row.xpath("td[1]/a"):
                    # bill link
                    agenda_item.add_bill(
                        row.xpath("td[1]/a/text()")[0].strip())

            event.add_source(
                "https://nebraskalegislature.gov/calendar/calendar.php")

            yield event
Exemple #19
0
    def scrape_upper_events(self):
        """Scrape the FL Senate daily-calendar RSS feed into Events."""
        url = "https://www.flsenate.gov/Tracker/RSS/DailyCalendar"
        feed = feedparser.parse(self.get(url).text)
        for entry in feed["entries"]:
            # The feed breaks the RSS standard by making the pubdate the
            # actual event's date, not the RSS item publish date
            when = pytz.utc.localize(
                datetime.datetime(*entry["published_parsed"][:6]))

            parts = entry["summary"].split(" - ")
            desc = parts[0]
            location = parts[1]

            event = Event(name=desc,
                          start_date=when,
                          description=desc,
                          location_name=location)

            event.add_source(entry["link"])
            yield event
Exemple #20
0
    def scrape_meeting_notice(self, item, url):
        """Yield one Event built from a Delaware meeting-notice record.

        item -- dict decoded from the legislature's JSON meeting feed
        url  -- listing URL (not referenced; the notice page is the source)
        """
        # Since Event Name is not provided for all mettings.
        if "Joint" in str(item["CommitteeName"]):
            event_name = str(item["CommitteeName"])
        else:
            event_name = "{} {}".format(str(item["CommitteeTypeName"]),
                                        str(item["CommitteeName"]))
        # 04/25/2012 03:00:00 PM
        # NOTE(review): the sample above (4-digit year, with seconds) would
        # NOT parse with this format string ("%y" is a 2-digit year, and no
        # "%S" is present) -- confirm the feed's actual date shape.
        fmt = "%m/%d/%y %I:%M %p"
        start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
        location_name = str(item["AddressAliasNickname"])
        event = Event(
            location_name=location_name,
            start_date=self._tz.localize(start_time),
            name=event_name,
            description="Committee Meeting Status: {}".format(
                item["CommitteeMeetingStatusName"]),
        )

        event.add_committee(name=str(item["CommitteeName"]),
                            id=item["CommitteeId"])

        html_url = f'https://legis.delaware.gov/MeetingNotice?committeeMeetingId={item["CommitteeMeetingId"]}'
        event.add_source(html_url)

        page_url = f'https://legis.delaware.gov/json/MeetingNotice/GetCommitteeMeetingItems?committeeMeetingId={item["CommitteeMeetingId"]}'

        page_data = []
        try:
            page_data = self.post(page_url).json()["Data"]
        except json.decoder.JSONDecodeError:
            # No agenda items
            self.info(f"POST returned nothing on {page_url}")

        # Fix: the original loop reused the name `item`, shadowing the
        # meeting-record parameter; a distinct name avoids misreads.
        for agenda_row in page_data:
            a = event.add_agenda_item(description=str(agenda_row["ItemDescription"]))
            if agenda_row["LegislationDisplayText"] is not None:
                a.add_bill(agenda_row["LegislationDisplayText"])

            event.add_person(
                name=str(agenda_row["PrimarySponsorShortName"]),
                id=str(agenda_row["PrimarySponsorPersonId"]),
                note="Sponsor",
            )

        yield event
Exemple #21
0
    def scrape(self, chamber=None):
        """Yield committee-meeting Events from the OK House notices page.

        chamber -- accepted for caller compatibility; not used here.
        """
        # we need to GET the page once to set up the ASP.net vars
        # then POST to it to set it to monthly
        url = "https://www.okhouse.gov/Committees/MeetingNotices.aspx"

        params = {
            "__EVENTTARGET": "ctl00$ContentPlaceHolder1$cbMonthly",
            "ctl00$ScriptManager1":
            "ctl00$ContentPlaceHolder1$ctl00$ContentPlaceHolder1$RadAjaxPanel1Panel|ctl00$ContentPlaceHolder1$cbMonthly",
            "ctl00_FormDecorator1_ClientState": "",
            "ctl00_RadToolTipManager1_ClientState": "",
            "ctl00_mainNav_ClientState": "",
            "ctl00$ContentPlaceHolder1$cbToday": "on",
            "ctl00$ContentPlaceHolder1$cbMonthly": "on",
            "ctl00_ContentPlaceHolder1_dgrdNotices_ClientState": "",
            "__ASYNCPOST": "true",
            "RadAJAXControlID": "ctl00_ContentPlaceHolder1_RadAjaxPanel1",
        }

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        html = self.asp_post(url, page, params)
        page = lxml.html.fromstring(html)

        for row in page.xpath('//tr[contains(@id,"_dgrdNotices_")]'):
            status = "tentative"
            agenda_link = row.xpath('.//a[@id="hlMeetAgenda"]')[0]
            title = agenda_link.xpath("text()")[0].strip()
            agenda_url = agenda_link.xpath("@href")[0]
            location = row.xpath("td[3]")[0].text_content().strip()

            # swap in a space for the <br/>
            when = row.xpath("td[4]")[0]
            for br in when.xpath(".//br"):
                br.tail = " " + br.tail if br.tail else " "

            when = when.text_content().strip()
            if "cancelled" in when.lower():
                status = "cancelled"

            # Fix: re.sub's 4th positional argument is `count`, so the
            # original passed re.IGNORECASE (== 2) as a count and matched
            # case-sensitively -- lowercase "cancelled" was never stripped
            # and could break the date parse below. Pass it as flags=.
            when = re.sub("CANCELLED", "", when, flags=re.IGNORECASE)
            when = self._tz.localize(dateutil.parser.parse(when))

            event = Event(
                name=title,
                location_name=location,
                start_date=when,
                classification="committee-meeting",
                status=status,
            )

            event.add_source(url)

            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")

            yield event
    def scrape_meetings(self, meetings, group):
        """Yield Events built from a list of meeting elements.

        meetings -- lxml elements, one per meeting
        group    -- which formatting the legislature site applied; one of
                    'house', 'senate', or 'commission'
        """
        for node in meetings:
            start = self.get_date(node)
            summary = self.get_description(node)
            venue = self.get_location(node)

            # All three fields are required; skip incomplete entries.
            if not (start and summary and venue):
                continue

            event = Event(
                name=summary,
                start_date=start.replace(tzinfo=self.tz),
                description=summary,
                location_name=venue,
            )
            agenda = self.get_agenda(node)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined in this method -- it must
            # come from module/class scope or this raises NameError; confirm.
            event.add_source(url)
            yield event
Exemple #23
0
    def scrape(self):
        """Yield interim-meeting Events from the Alabama ISD meetings page."""
        EVENTS_URL = (
            "http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx"
        )
        table_rows = self.lxmlize(EVENTS_URL).xpath(
            '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')

        # First row is the table header.
        for row in table_rows[1:]:
            cells = row.xpath("td")
            meeting_date = cells[0].text_content().strip()
            meeting_time = cells[1].text_content().strip()
            where = cells[2].text_content().strip()

            # 11 South Union Street, Montgomery, Alabama, United States
            # TODO: IF location is "room (X)" add state house
            # TODO: REplace "state house" with address

            # 32°22′37.294″N 86°17′57.991″W

            # host = row.xpath('td')[3].text_content().strip()
            title = cells[3].text_content().strip()
            details = cells[4].text_content().strip()

            start = self._TZ.localize(
                datetime.datetime.strptime(
                    "{} {}".format(meeting_date, meeting_time),
                    self._DATETIME_FORMAT))

            event = Event(
                start_date=start,
                name=title,
                location_name=where,
                description=details,
            )
            event.add_source(EVENTS_URL)
            yield event
Exemple #24
0
    def create_event(self, committee, agenda_document):
        """Build (but do not yield) a committee-meeting Event.

        committee       -- dict carrying the committee's "FullName"
        agenda_document -- dict whose "DocumentDate" is a parseable date
        """
        # All SD committee meetings take place at the state capitol.
        capitol_address = "500 E Capitol Ave, Pierre, SD 57501"

        return Event(
            name=committee["FullName"],
            start_date=dateutil.parser.parse(agenda_document["DocumentDate"]),
            location_name=capitol_address,
            classification="committee-meeting",
        )
Exemple #25
0
    def scrape(self, session=None):
        """Scrape ND interim committee meeting summaries into Events.

        Collects events keyed by URL into self.events and the months seen
        into self.event_months, runs scrape_calendar() for each month, then
        yields every collected event.
        """

        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        # figuring out starting year from metadata
        # NOTE(review): if no legislative_sessions entry matches `session`,
        # start_year is never bound and the f-string below raises NameError.
        for item in self.jurisdiction.legislative_sessions:
            if item["identifier"] == session:
                start_year = item["start_date"][:4]
                self.year = start_year
                break

        url = f"https://www.legis.nd.gov/assembly/{session}-{start_year}/committees/interim/committee-meeting-summary"

        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # One table per committee; caption link holds the committee name.
        for table in page.xpath('//table[contains(@class,"views-table")]'):
            com = table.xpath("caption/a")[0].text_content().strip()
            for row in table.xpath("tbody/tr"):
                date_link = row.xpath("td[1]/strong/a")[0]
                event_url = date_link.xpath("@href")[0]

                date = date_link.xpath("span")[0].text_content().strip()
                date = dateutil.parser.parse(date)
                date = self._tz.localize(date)

                # Record the month so scrape_calendar() can be run for it.
                self.event_months.add(date.strftime("%Y-%m"))

                location = "See Agenda"

                event = Event(name=com,
                              start_date=date,
                              location_name=location)

                event.add_source(event_url)

                for link in row.xpath("td[2]//a"):
                    link_text = link.text_content().strip()

                    # skip live broadcast links
                    if "video.legis" in link_text:
                        continue

                    event.add_document(link_text,
                                       link.xpath("@href")[0],
                                       media_type="application/pdf")

                self.events[event_url] = event

        for year_month in self.event_months:
            self.scrape_calendar(year_month)

        for key in self.events:
            yield self.events[key]
    def parse_event(self, row, chamber):
        """Yield an Event parsed from one AK <Meeting> XML row.

        row     -- lxml element for the meeting
        chamber -- 'upper' or 'lower'; used to resolve committee names
        """
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath("string(Sponsor)").strip()

        if committee_code in self.COMMITTEES[chamber]:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                self.COMMITTEES[chamber][committee_code]["name"],
            )
        else:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                "MISCELLANEOUS",
            )

        title = row.xpath("string(Title)").strip()
        name = "{} {}".format(self.COMMITTEES_PRETTY[chamber], title)

        # If the title is missing, fall back to "<CHAMBER> <COMMITTEE NAME>".
        # Fix: the original compared the already-prefixed `name` to "", which
        # could never match (it always contains the chamber prefix plus a
        # space), so the fallback was dead code; test the raw title instead.
        if title == "":
            name = committee_name

        location = row.xpath("string(Location)").strip()

        # events with no location all seem to be committee hearings
        if location == "":
            location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

        start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
        # todo: do i need to self._TZ.localize() ?

        event = Event(start_date=start_date, name=name, location_name=location)

        event.add_source("http://w3.akleg.gov/index.php#tab4")

        if committee_code in self.COMMITTEES[chamber]:
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")

        for item in row.xpath("Agenda/Item"):
            agenda_desc = item.xpath("string(Text)").strip()
            if agenda_desc != "":
                agenda_item = event.add_agenda_item(description=agenda_desc)
                if item.xpath("BillRoot"):
                    bill_id = item.xpath("string(BillRoot)")
                    # AK Bill ids have a bunch of extra spaces
                    bill_id = re.sub(r"\s+", " ", bill_id)
                    agenda_item.add_bill(bill_id)

        yield event
Exemple #27
0
    def scrape_chamber(self, chamber):
        """Yield Events for one CA chamber from the hearings database.

        Groups hearings by (location, date) and emits one committee-meeting
        Event per group, with each hearing's bill as an agenda item.
        """
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = (self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description)

            date = self._tz.localize(hearing.hearing_date)

            # Location descriptions start with "Asm"/"Sen"; that prefix
            # determines which chamber the hearing belongs to.
            chamber_abbr = location[0:3]
            event_chamber = {"Asm": "lower", "Sen": "upper"}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.items():

            # Get list of bill_ids from the database.
            # DB ids look like "<session digits><type letters><number>";
            # reformat to "<type> <number>", e.g. "AB 123".
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = [
                "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = "More than one committee meeting at (location, date) %r"
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = "Committee Meeting: " + committee_name
            event = Event(name=desc,
                          start_date=date,
                          location_name=committee_name)
            for bill_id in bills:
                # "B" in the type letters marks a bill; anything else is
                # treated as a resolution.
                if "B" in bill_id:
                    type_ = "bill"
                else:
                    type_ = "resolution"
                item = event.add_agenda_item("consideration")
                item.add_bill(bill_id, note=type_)

            event.add_person(committee_name + " Committee", note="host")
            event.add_source("https://downloads.leginfo.legislature.ca.gov/")

            yield event
Exemple #28
0
    def scrape_lower_item(self, page):
        """Yield one Event parsed from a single MO House hearing page."""
        # print(lxml.etree.tostring(page, pretty_print=True))
        com = self.table_row_content(page, "Committee:")
        when_date = self.table_row_content(page, "Date:")
        when_time = self.table_row_content(page, "Time:")
        location = self.table_row_content(page, "Location:")

        if "house hearing room" in location.lower():
            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(" :", ":")
        # a.m. and p.m. seem to confuse dateutil.parser
        when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")

        # some times have extra info after the AM/PM
        if "upon" in when_time:
            when_time = when_time.split("AM", 1)[0].split("PM", 1)[0]

        # fix '- Upcoming', '- In Progress'  in dates
        when_date = re.sub(r"- (.*)", "", when_date).strip()

        try:
            start = dateutil.parser.parse(f"{when_date} {when_time}")
        except dateutil.parser._parser.ParserError:
            # Time was unusable; fall back to the date alone.
            start = dateutil.parser.parse(when_date)

        event = Event(
            start_date=self._TZ.localize(start),
            name=com,
            location_name=location,
        )
        event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")
        event.add_participant(com, type="committee", note="host")

        # different from general MO link xpath due to the <b>
        bill_xpath = ('.//a[contains(@href, "Bill.aspx") '
                      'or contains(@href, "bill.aspx")]/b/text()')

        for bill_title in page.xpath(bill_xpath):
            bill_no = bill_title.split("--")[0].strip().replace("HCS", "").strip()
            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
Exemple #29
0
    def scrape_chamber(self, chamber, session, start, end):
        """Yield WA committee-meeting Events for one chamber in [start, end]."""
        page = self.get_xml(start, end)

        for row in xpath(page, "//wa:CommitteeMeeting"):
            # Cancelled meetings are skipped entirely.
            if xpath(row, "string(wa:Cancelled)") == "true":
                continue

            if self.chambers[xpath(row, "string(wa:Agency)")] != chamber:
                continue

            when = self._tz.localize(
                datetime.datetime.strptime(
                    xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S"))

            committee = xpath(row, "string(wa:Committees/wa:Committee/wa:LongName)")
            agenda_id = xpath(row, "string(wa:AgendaId)")
            notes = xpath(row, "string(wa:Notes)")

            # XML has a wa:Address but it seems useless
            where = "{}, {}, {} {}".format(
                xpath(row, "string(wa:Room)"),
                xpath(row, "string(wa:Building)"),
                xpath(row, "string(wa:City)"),
                xpath(row, "string(wa:State)"),
            )

            event = Event(
                name=committee,
                start_date=when,
                location_name=where,
                description=notes,
            )

            event.add_source(
                "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
                    agenda_id))

            event.add_participant(committee, type="committee", note="host")

            event.extras["agendaId"] = agenda_id

            self.scrape_agenda_items(agenda_id, event)

            yield event
    def scrape_page(self, url, session, chamber):
        """Scrape one IL hearing-notice page and return the Event.

        url -- hearing notice URL
        session, chamber -- accepted for caller compatibility; unused here
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        # First table holds key/value metadata rows.
        tables = doc.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf["Location:"]
        subject_matter = metainf["Subject Matter:"]
        description = "{}, {}".format(ctty_name, subject_matter)

        # Collapse whitespace and wedge a space before AM/PM so strptime
        # can parse values like "Mar 12, 2020 10:00AM". (Renamed from
        # `datetime`, which shadowed the common module name.)
        when_str = re.sub(r"\s+", " ", metainf["Scheduled Date:"])
        repl = {"AM": " AM", "PM": " PM"}  # Space shim.
        for r in repl:
            when_str = when_str.replace(r, repl[r])
        when = self.localize(
            dt.datetime.strptime(when_str, "%b %d, %Y %I:%M %p"))

        event = Event(description, start_date=when, location_name=where)
        event.add_source(url)

        # Fix: str.replace returns a new string; the original discarded the
        # result, so the "Hearing Notice For" prefix was never removed.
        if ctty_name.startswith("Hearing Notice For"):
            ctty_name = ctty_name.replace("Hearing Notice For", "")
        event.add_participant(ctty_name, "organization")

        # Second table lists the bills on the agenda.
        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            agenda_item = event.add_agenda_item(bill_id)
            agenda_item.add_bill(bill_id)

        return event