Example #1
    def scrape(self, chamber=None):

        # we need to GET the page once to set up the ASP.net vars
        # then POST to it to set it to monthly
        url = "https://www.okhouse.gov/Committees/MeetingNotices.aspx"

        params = {
            "__EVENTTARGET": "ctl00$ContentPlaceHolder1$cbMonthly",
            "ctl00$ScriptManager1":
            "ctl00$ContentPlaceHolder1$ctl00$ContentPlaceHolder1$RadAjaxPanel1Panel|ctl00$ContentPlaceHolder1$cbMonthly",
            "ctl00_FormDecorator1_ClientState": "",
            "ctl00_RadToolTipManager1_ClientState": "",
            "ctl00_mainNav_ClientState": "",
            "ctl00$ContentPlaceHolder1$cbToday": "on",
            "ctl00$ContentPlaceHolder1$cbMonthly": "on",
            "ctl00_ContentPlaceHolder1_dgrdNotices_ClientState": "",
            "__ASYNCPOST": "true",
            "RadAJAXControlID": "ctl00_ContentPlaceHolder1_RadAjaxPanel1",
        }

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        html = self.asp_post(url, page, params)
        page = lxml.html.fromstring(html)

        for row in page.xpath('//tr[contains(@id,"_dgrdNotices_")]'):
            status = "tentative"
            agenda_link = row.xpath('.//a[@id="hlMeetAgenda"]')[0]
            title = agenda_link.xpath("text()")[0].strip()
            agenda_url = agenda_link.xpath("@href")[0]
            location = row.xpath("td[3]")[0].text_content().strip()

            # swap in a space for the <br/>
            when = row.xpath("td[4]")[0]
            for br in when.xpath(".//br"):
                br.tail = " " + br.tail if br.tail else " "

            when = when.text_content().strip()
            if "cancelled" in when.lower():
                status = "cancelled"

            # the 4th positional arg of re.sub is count, not flags
            when = re.sub("CANCELLED", "", when, flags=re.IGNORECASE)
            when = self._tz.localize(dateutil.parser.parse(when))

            event = Event(
                name=title,
                location_name=location,
                start_date=when,
                classification="committee-meeting",
                status=status,
            )

            event.add_source(url)

            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")

            yield event
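
The asp_post helper used above isn't part of this listing; a minimal sketch of the usual ASP.NET postback pattern, assuming a requests-style self.post (names and details are guesses, not the scraper's actual implementation):

    def asp_post(self, url, page, extra_params):
        # ASP.NET requires the page's hidden state fields (__VIEWSTATE,
        # __EVENTVALIDATION, ...) to be echoed back on the POST
        params = {}
        for hidden in page.xpath('//input[@type="hidden"]'):
            if hidden.get("name"):
                params[hidden.get("name")] = hidden.get("value") or ""
        params.update(extra_params)
        return self.post(url, data=params).content
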
Example #2
    def scrape(self, start=None):
        if start is None:
            start = datetime.datetime.today()
        else:
            start = datetime.datetime.strptime(start, "%Y-%m-%d")

        date_format = "%a %b %d %Y"
        date_slug = start.strftime(date_format)

        url = f"https://www.legis.ga.gov/api/meetings?startDate={date_slug}"

        page = self.get(url).json()

        for row in page:
            status = "tentative"

            title = row["subject"]

            if "joint" not in title.lower():
                if row["chamber"] == 2:
                    title = f"Senate {title}"
                elif row["chamber"] == 1:
                    title = f"House {title}"

            start = dateutil.parser.parse(row["start"])

            if start < self.tz.localize(datetime.datetime.now()):
                status = "passed"

            if "cancelled" in title.lower() or "canceled" in title.lower():
                status = "cancelled"
                # try to replace all variants of "[optional dash] cancel[l]ed [optional dash]"
                # so we can match up events to their pre-cancellation occurrence
                title = re.sub(r"-?\s*cancell?ed\s*-?\s*", " ", title, flags=re.I)

            where = row["location"]
            where = f"206 Washington St SW, Atlanta, Georgia, {where}"

            event = Event(
                name=title,
                start_date=start,
                location_name=where,
                classification="committee-meeting",
                status=status,
            )

            if row["agendaUri"] != "":
                event.add_document(
                    "Agenda", row["agendaUri"], media_type="application/pdf"
                )

            if row["livestreamUrl"] is not None:
                event.add_media_link(
                    "Video", row["livestreamUrl"], media_type="text/html"
                )

            event.add_source("https://www.legis.ga.gov/schedule/all")

            yield event
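
A quick illustration of the cancellation-stripping regex above, showing how cancelled titles collapse back to their original names:

    import re

    pattern = r"-?\s*cancell?ed\s*-?\s*"
    for title in ["CANCELLED - Appropriations", "Judiciary - Canceled"]:
        print(re.sub(pattern, " ", title, flags=re.I).strip())
    # -> Appropriations
    # -> Judiciary
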
Example #3
    def scrape_events(self, session, start_date):
        session_key = SESSION_KEYS[session]

        if start_date is None:
            start_date = datetime.date.today()
        else:
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")

        committees_by_code = {}

        committees_response = self.api_client.get("committees",
                                                  session=session_key)
        for committee in committees_response:
            committees_by_code[
                committee["CommitteeCode"]] = committee["CommitteeName"]

        meetings_response = self.api_client.get(
            "committee_meetings",
            start_date=start_date.strftime(self._DATE_FORMAT),
            session=session_key,
        )

        if len(meetings_response) == 0:
            raise EmptyScrape

        for meeting in meetings_response:
            event_date = self._TZ.localize(
                datetime.datetime.strptime(meeting["MeetingDate"],
                                           self._DATE_FORMAT))
            com_name = committees_by_code[meeting["CommitteeCode"]]

            event = Event(start_date=event_date,
                          name=com_name,
                          location_name=meeting["Location"])

            event.add_source(meeting["AgendaUrl"])

            event.extras["meeting_guid"] = meeting["MeetingGuid"]
            event.extras["committee_code"] = committee["CommitteeCode"]

            event.add_participant(com_name, type="committee", note="host")

            for row in meeting["CommitteeAgendaItems"]:
                agenda = None
                if row["Comments"] is not None:
                    agenda = event.add_agenda_item(row["Comments"])

                if row["MeasureNumber"] is not None:
                    bill_id = "{} {}".format(row["MeasurePrefix"],
                                             row["MeasureNumber"])
                    # items can carry a bill without comments; don't reuse the
                    # previous iteration's agenda item in that case
                    if agenda is None:
                        agenda = event.add_agenda_item(bill_id)
                    agenda.add_bill(bill_id)

            for row in meeting["CommitteeMeetingDocuments"]:
                event.add_document(
                    note=row["ExhibitTitle"],
                    url=row["DocumentUrl"],
                    on_duplicate="ignore",
                )
            yield event
Example #4
    def scrape(self, session=None):

        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        # figuring out starting year from metadata
        for item in self.jurisdiction.legislative_sessions:
            if item["identifier"] == session:
                start_year = item["start_date"][:4]
                self.year = start_year
                break

        url = f"https://www.legis.nd.gov/assembly/{session}-{start_year}/committees/interim/committee-meeting-summary"

        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for table in page.xpath('//table[contains(@class,"views-table")]'):
            com = table.xpath("caption/a")[0].text_content().strip()
            for row in table.xpath("tbody/tr"):
                date_link = row.xpath("td[1]/strong/a")[0]
                event_url = date_link.xpath("@href")[0]

                date = date_link.xpath("span")[0].text_content().strip()
                date = dateutil.parser.parse(date)
                date = self._tz.localize(date)

                self.event_months.add(date.strftime("%Y-%m"))

                location = "See Agenda"

                event = Event(name=com,
                              start_date=date,
                              location_name=location)

                event.add_source(event_url)

                for link in row.xpath("td[2]//a"):
                    link_text = link.text_content().strip()

                    # skip live broadcast links
                    if "video.legis" in link_text:
                        continue

                    event.add_document(link_text,
                                       link.xpath("@href")[0],
                                       media_type="application/pdf")

                self.events[event_url] = event

        for year_month in self.event_months:
            self.scrape_calendar(year_month)

        for key in self.events:
            yield self.events[key]
Example #5
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(".//a[contains(@title,'Committee Details')]")
            if len(comit_url) != 1:
                continue

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib["href"])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib["href"]
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            if cttie:
                cttie = cttie.replace("Committee on", "").strip()
                cttie = f"{chamber} {cttie}"
                name = cttie

            event = Event(
                name=name, location_name=where, start_date=self._tz.localize(when)
            )

            event.add_source(calurl)

            event.add_committee(cttie, note="host")

            event.add_document("notice", notice, media_type="application/pdf")

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith("AB") or entry.startswith("SB"):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing["name"])

            yield event
Example #6
    def scrape(self, start=None, end=None):
        if start is None:
            start = dt.datetime.today()
        else:
            start = dateutil.parser.parse(start)

        if end is None:
            end = start + relativedelta(months=+3)
        else:
            end = dateutil.parser.parse(end)

        start = start.strftime("%Y-%m-%d")
        end = end.strftime("%Y-%m-%d")

        url = f"{self.base_url}calendar-data?start={start}&end={end}"
        data = json.loads(self.scraper.get(url).content)

        for item in data:
            name = item["title"].strip()
            if "canceled" in name.lower():
                continue

            if "house session" in name.lower(
            ) or "senate session" in name.lower():
                continue

            url = f"{self.base_url}{item['url']}"

            when = dateutil.parser.parse(item["start"])
            when = self._tz.localize(when)

            page = self.scraper.get(url).content
            page = lxml.html.fromstring(page)

            location = page.xpath(
                '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
            )[0].strip()
            agenda_url = page.xpath(
                '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
            )[0]

            event = Event(
                name=name,
                start_date=when,
                location_name=location,
            )

            event.add_participant(name, type="committee", note="host")
            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")
            event.add_source(url)

            yield event
Example #7
    def scrape_cal_page(self, url):
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for row in page.xpath("//article[contains(@class,'accordion')]"):
            when = row.xpath(".//time/@datetime")[0]
            when = dateutil.parser.parse(when)

            title = row.xpath(
                ".//h3[contains(@class,'heading-link')]/text()")[0].strip()

            description = row.xpath(
                "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
            )[0].text_content()

            # fix special chars
            description = (description.replace("\n\u2013", " ").replace(
                "\n", " ").replace("\u203a", ""))
            description = description.replace("More about this event",
                                              "").strip()

            location = row.xpath(
                "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
            )[0].text_content()

            event = Event(
                name=title,
                description=description,
                start_date=when,
                location_name=location,
            )

            agenda_url = row.xpath(
                ".//a[contains(text(),'More about this event')]/@href")
            if agenda_url != []:
                event.add_document("Details and Agenda",
                                   agenda_url[0],
                                   media_type="text/html")

            if "committee meeting" in title.lower():
                com_name = title.replace("Committee Meeting", "").strip()
                event.add_participant(com_name, type="committee", note="host")

            event.add_source(url)

            yield event

        if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
            next_url = page.xpath(
                "//a[contains(text(), 'Upcoming Events')]/@href")[0]
            yield from self.scrape_cal_page(next_url)
Example #8
    def scrape_upper_com(self, url, com, session):
        url = f"{url}{session}"
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        com = f"Senate {com}"

        for row in page.xpath('//table[@id="meetingsTbl"]/tbody/tr'):
            day = row.xpath("td[1]")[0].text_content().strip()
            time = row.xpath("td[2]")[0].text_content().strip()
            notice = row.xpath("td[3]")[0].text_content().strip()
            location = "See Agenda"  # it's in the PDFs but not the web page

            date = dateutil.parser.parse(f"{day} {time}")
            date = self.tz.localize(date)

            if notice.lower() == "not meeting" or "cancelled" in notice.lower():
                continue

            event = Event(name=com, start_date=date, location_name=location)

            agenda_classes = [
                "mtgrecord_notice",
                "mtgrecord_expandedAgenda",
                "mtgrecord_attendance",
            ]

            for agenda_class in agenda_classes:
                # row-relative xpath (.//a); a bare //a searches the whole page
                if row.xpath(f".//a[@class='{agenda_class}']"):
                    doc_url = row.xpath(f".//a[@class='{agenda_class}']/@href")[0]
                    doc_name = (row.xpath(f".//a[@class='{agenda_class}']")
                                [0].text_content().strip())
                    event.add_document(doc_name,
                                       doc_url,
                                       media_type="application/pdf")

            for link in row.xpath("td[7]/a"):
                url = link.xpath("@href")[0]
                doc_name = link.text_content().strip()
                event.add_media_link(doc_name, url, "audio/mpeg")

            for link in row.xpath("td[9]/a"):
                url = link.xpath("@href")[0]
                doc_name = link.text_content().strip()
                event.add_media_link(doc_name, url, "text/html")

            event.add_source(url)
            yield event
Example #9
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(start_date=start_date,
                      end_date=end_date,
                      name=title,
                      location_name=location)

        event.add_source(
            "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx")

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath("@href")[0],
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link("Video of Hearing",
                                 video[0].xpath("@href")[0], "text/html")

        if "subcommittee" in title.lower():
            subcom = title.split("-")[0].strip()
            event.add_participant(subcom, type="committee", note="host")
        else:
            event.add_participant(com, type="committee", note="host")
        yield event
Example #10
    def scrape_event(self, row):
        date_td = row.xpath("td[1]")[0]
        info_td = row.xpath("td[2]")[0]

        date = date_td.xpath("b")[0].text.strip()
        time = date_td.xpath("b/following-sibling::text()")[0].strip()

        date_and_time = "{} {}".format(date, time)
        start_date = datetime.datetime.strptime(date_and_time, "%m/%d/%y %I:%M %p")

        title = info_td.xpath("font[1]/strong")[0].text.strip()

        all_text = info_td.xpath("descendant-or-self::*/text()")
        notes = (line.strip() for line in all_text if line.strip())
        notes = list(notes)

        if len(notes) > 1:
            # Skip the first line, which is the title
            notes = notes[1:]
            # Split out the address
            address = notes[0]
            notes = notes[1:]
            # The rest just becomes the description
            notes = "\n".join(notes)
        else:
            address = "TBD"
            notes = notes[0]

        event = Event(
            start_date=self._TZ.localize(start_date),
            name=title,
            location_name=address,
            description=notes,
        )

        event.add_source(self.URL)

        if info_td.xpath('a[contains(font/text(),"agenda")]'):
            # take the href of the agenda link itself, not just the first <a>
            agenda_url = info_td.xpath('a[contains(font/text(),"agenda")]/@href')[0]
            event.add_document("Agenda", url=agenda_url)

        yield event
Example #11
    def scrape_web_json(self, url):
        web_events = self.get(url).json()

        for web_event in web_events:
            event_start = dateutil.parser.parse(web_event["start"])
            event_start = self._tz.localize(event_start)
            event_end = dateutil.parser.parse(web_event["end"])
            event_end = self._tz.localize(event_end)

            event_desc = ""

            if "longtitle" in web_event and web_event["longtitle"] != "":
                event_title = web_event["longtitle"]
            else:
                event_title = web_event["title"]

            event_loc = web_event["body"]
            if event_loc in ["H", "S", "I"]:
                event_loc = "1700 W. Washington St., Phoenix, Arizona, 85007"

            if not event_loc:
                event_loc = "See Agenda"

            event = Event(
                name=event_title,
                location_name=event_loc,
                start_date=event_start,
                end_date=event_end,
                description=event_desc,
            )

            if "PDFFile" in web_event:
                pdf_url = f"https://www.azleg.gov{web_event['PDFFile']}"
                event.add_document("Agenda",
                                   pdf_url,
                                   media_type="application/pdf")

            event.add_source("https://www.azleg.gov/Alis-Today/")

            yield event
Example #12
    def scrape_lower(self):
        url = "https://www.house.leg.state.mn.us/Schedules/All"
        page = self.lxmlize(url)

        for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
            # skip floor sessions and unlinked events
            if not row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
            ):
                continue

            # skip joint ones, we'll get those from the senate API
            if row.xpath('div[contains(@class,"card-header bg-joint")]'):
                continue

            # top-level committee
            com = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            )[0].strip()
            com_link = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
            )[0]

            when = (row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0].replace("\r\n", "").strip())
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            if row.xpath('.//b[.="Location:"]'):
                where = row.xpath(
                    './/b[.="Location:"]/following-sibling::text()[1]'
                )[0].strip()
            else:
                where = "See committee page"

            if row.xpath('.//b[.="Agenda:"]'):
                desc = "\n".join(
                    row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                              )).strip()
            else:
                desc = "See committee page"

            event = Event(
                name=com,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
                description=desc,
            )

            event.add_source(com_link)

            for bill in get_bill_ids(desc):
                event.add_bill(bill)

            if row.xpath(
                    ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
            ):
                agenda = event.add_agenda_item("Bills")
                for bill_id in row.xpath(
                        ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
                ):
                    agenda.add_bill(bill_id.strip())

            for attachment in row.xpath(".//ul/li/div/a"):
                doc_url = attachment.xpath("@href")[0]
                doc_name = attachment.xpath("text()")[0].strip()
                # if they don't provide a name just use the filename
                if doc_name == "":
                    parsed_url = urlparse(doc_url)
                    doc_name = os.path.basename(parsed_url.path)

                # sometimes broken links to .msg files (emails?) are attached,
                # they always 404.
                if doc_url.endswith(".msg"):
                    continue
                media_type = get_media_type(doc_url)
                event.add_document(doc_name,
                                   doc_url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

            for committee in row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            ):
                event.add_participant(committee, type="committee", note="host")

            yield event
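
get_bill_ids is defined elsewhere in this scraper; a minimal sketch, assuming it extracts Minnesota-style HF/SF identifiers from free text (the real helper may differ):

    import re

    def get_bill_ids(text):
        # pull "HF 123" / "SF 45" style identifiers out of agenda text
        return ["{} {}".format(m.group(1).upper(), m.group(2))
                for m in re.finditer(r"\b(HF|SF)\s*(\d+)", text, flags=re.I)]
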
Example #13
    def scrape_upper(self):
        url = "https://www.senate.mn/api/schedule/upcoming"
        data = self.get(url).json()

        for row in data["events"]:
            com = row["committee"]["committee_name"]
            start = dateutil.parser.parse(row["hearing_start"])
            start = self._tz.localize(start)

            if (row["hearing_room"] and "hearing_building" in row
                    and row["hearing_building"]):
                where = f"{row['hearing_building']} {row['hearing_room']}"
            elif "hearing_building" in row and row["hearing_building"]:
                where = row["hearing_building"]
            else:
                where = "TBD"

            description = ""

            if "hearing_notes" in row and row["hearing_notes"]:
                description = row["hearing_notes"]

            event = Event(
                name=com,
                location_name=where,
                start_date=start,
                classification="committee-meeting",
                description=description,
            )

            for bill in get_bill_ids(description):
                event.add_bill(bill)

            if "lrl_schedule_link" in row:
                event.add_source(row["lrl_schedule_link"])
            else:
                if "link" in row["committee"]:
                    if row["committee"]["link"].startswith("http"):
                        event.add_source(row["committee"]["link"])
                    elif row["committee"]["link"].startswith("www"):
                        event.add_source(f"http://{row['committee']['link']}")
                    else:
                        event.add_source(
                            f"https://www.senate.mn/{row['committee']['link']}"
                        )
                elif "senate_chair_link" in row["committee"]:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                    )

            if "agenda" in row:
                for agenda_row in row["agenda"]:
                    if (agenda_row["description"] is None
                            or agenda_row["description"].strip() == ""):
                        # sometimes they have blank agendas but bills or files
                        agenda_row["description"] = "Agenda"
                    agenda = event.add_agenda_item(agenda_row["description"])
                    if "bill_type" in agenda_row:
                        agenda.add_bill("{} {}".format(
                            agenda_row["bill_type"].replace(".", ""),
                            agenda_row["bill_number"],
                        ))

                    if "files" in agenda_row:
                        for file_row in agenda_row["files"]:
                            doc_name = file_row["filename"]
                            doc_url = file_row["file_path"]

                            # if they don't provide a name just use the filename
                            if doc_name == "":
                                parsed_url = urlparse(doc_url)
                                doc_name = os.path.basename(parsed_url.path)

                            event.add_document(
                                doc_name,
                                f"https://www.senate.mn/{doc_url}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )

            if "video_link" in row:
                event.add_media_link("Video", row["video_link"], "text/html")

            if "audio_link" in row:
                event.add_media_link("Audio", row["audio_link"], "text/html")

            yield event
Example #14
    def house_meeting(self, xml, source_url):

        title = xml.xpath("string(//meeting-details/meeting-title)")

        meeting_date = xml.xpath("string(//meeting-date/calendar-date)")
        start_time = xml.xpath("string(//meeting-date/start-time)")
        end_time = xml.xpath("string(//meeting-date/end-time)")

        start_dt = datetime.datetime.strptime(
            "{} {}".format(meeting_date, start_time), "%Y-%m-%d %H:%M:%S")

        start_dt = self._TZ.localize(start_dt)

        end_dt = None

        if end_time != "":
            end_dt = datetime.datetime.strptime(
                "{} {}".format(meeting_date, end_time), "%Y-%m-%d %H:%M:%S")
            end_dt = self._TZ.localize(end_dt)

        building = xml.xpath(
            "string(//meeting-details/meeting-location/capitol-complex/building)"
        )

        address = "US Capitol"
        if building != "Select one":
            if self.buildings.get(building):
                building = self.buildings.get(building)

            room = xml.xpath(
                "string(//meeting-details/meeting-location/capitol-complex/room)"
            )
            address = "{}, Room {}".format(building, room)

        event = Event(start_date=start_dt, name=title, location_name=address)

        event.add_source(source_url)

        coms = xml.xpath(
            "//committees/committee-name | //subcommittees/committee-name")
        for com in coms:
            com_name = com.xpath("string(.)")
            com_name = "House {}".format(com_name)
            event.add_participant(
                com_name,
                type="committee",
                note="host",
            )

        docs = xml.xpath("//meeting-documents/meeting-document")
        for doc in docs:
            doc_name = doc.xpath("string(description)")
            doc_files = doc.xpath("files/file")
            for doc_file in doc_files:
                media_type = self.media_types[doc_file.get("doc-type")]
                url = doc_file.get("doc-url")

                if doc.get("type") in ["BR", "AM", "CA"]:
                    if doc_name == "":
                        doc_name = doc.xpath("string(legis-num)").strip()
                    # [\w.] not [\w|.]: the pipe is a literal inside a class
                    matches = re.findall(r"([\w.]+)\s+(\d+)", doc_name)

                    if matches:
                        match = matches[0]
                        bill_type = match[0].replace(".", "")
                        bill_number = match[1]
                        bill_name = "{} {}".format(bill_type, bill_number)
                        agenda = event.add_agenda_item(description=bill_name)
                        agenda.add_bill(bill_name)

                if doc_name == "":
                    try:
                        doc_name = self.hearing_document_types[doc.get("type")]
                    except KeyError:
                        self.warning("Unable to find document type: {}".format(
                            doc.get("type")))

                event.add_document(doc_name,
                                   url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

        yield event
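
For reference, the legis-num parsing above turns a document name like "H.R. 2617" into a normalized bill name:

    import re

    matches = re.findall(r"([\w.]+)\s+(\d+)", "H.R. 2617")
    bill_type, bill_number = matches[0]
    print("{} {}".format(bill_type.replace(".", ""), bill_number))  # HR 2617
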
Example #15
    def scrape(self):

        get_short_codes(self)
        page = self.lxmlize(URL)

        if page.xpath("//td[contains(string(.),'No Hearings')]"):
            raise EmptyScrape

        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()

            # Multi-committee events will be CODE1/CODE2/CODE3
            if "/" in committee:
                coms = committee.split("/")
                com_names = []
                for com in coms:
                    com_names.append("{} {}".format(
                        self.chambers[self.short_ids[com]["chamber"]],
                        self.short_ids[com]["name"],
                    ))
                descr = ", ".join(com_names)
            elif self.short_ids.get(committee):
                descr = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            else:
                descr = [x.text_content() for x in tds[1].xpath(".//span")]
                if len(descr) != 1:
                    raise Exception("expected exactly one description span")
                descr = descr[0].replace(".", "").strip()

            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib["href"]
            notice_name = notice.text

            # the listing page shows the same hearing in multiple rows.
            # combine these -- get_related_bills() will take care of adding the bills
            # and descriptions
            if notice_href in self.seen_hearings:
                continue
            else:
                self.seen_hearings.append(notice_href)

            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = TIMEZONE.localize(when)
            event = Event(
                name=descr,
                start_date=when,
                classification="committee-meeting",
                description=descr,
                location_name=where,
            )

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee and committee in self.short_ids:
                    committee = "{} {}".format(
                        self.chambers[self.short_ids[committee]["chamber"]],
                        self.short_ids[committee]["name"],
                    )
                event.add_committee(committee, note="host")

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type="text/html")
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill["descr"].strip())
                bill["bill_id"] = bill["bill_id"].split(",")[0]
                a.add_bill(bill["bill_id"], note=bill["type"])
            yield event
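
The CODE1/CODE2/CODE3 handling above relies on self.short_ids, which get_short_codes populates elsewhere; a stubbed illustration of the lookup (codes and names invented for the demo):

    short_ids = {"JDC": {"chamber": "upper", "name": "Judiciary"},
                 "FIN": {"chamber": "lower", "name": "Finance"}}
    chambers = {"upper": "Senate", "lower": "House"}

    committee = "JDC/FIN"
    names = ["{} {}".format(chambers[short_ids[c]["chamber"]],
                            short_ids[c]["name"])
             for c in committee.split("/")]
    print(", ".join(names))  # Senate Judiciary, House Finance
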
Example #16
    def scrape_agenda(self, url):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf["DATE:"]
        time = metainf["TIME:"]
        where = metainf["PLACE:"]

        # check for duration in time
        if " - " in time:
            start, end = time.split(" - ")
            am_pm_srch = re.search("(?i)(am|pm)", end)
            if am_pm_srch:
                time = " ".join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"
        ]

        event_desc = "Meeting Notice"
        if "Rise" in time:
            when = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            when = "%s %s" % (date, time)
        if "CANCELLED" in when.upper():
            return

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            when = when.replace(trans, transtable[trans])

        when = when.strip()

        for fmt in fmts:
            try:
                when = dt.datetime.strptime(when, fmt)
                break
            except ValueError:
                continue

        event = Event(name=event_desc,
                      start_date=self._tz.localize(when),
                      location_name=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib["href"]
            event.add_document(bill.text_content(),
                               bill_ft,
                               media_type="application/pdf")
            root = bill.xpath("../../*")
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = (bill.getparent().getparent().getparent().getnext().
                     getnext().text_content())

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            item = event.add_agenda_item(descr)
            # add the normalized bill_id built above, not the raw link text
            item.add_bill(bill_id)

        committee = page.xpath("//span[@id='lblSession']")[0].text_content()

        event.add_participant(committee, "committee", note="host")

        yield event
Example #17
    def scrape(self, start=None, end=None):
        if start is None:
            start_date = datetime.datetime.now().strftime(self.date_format)
        else:
            # assumes CLI-style YYYY-MM-DD input; reformat for the site
            start_date = datetime.datetime.strptime(
                start, "%Y-%m-%d").strftime(self.date_format)

        # default to 90 days if no end
        if end is None:
            dtdelta = datetime.timedelta(days=90)
            end_date = datetime.datetime.now() + dtdelta
            end_date = end_date.strftime(self.date_format)
        else:
            end_date = datetime.datetime.strptime(
                end, "%Y-%m-%d").strftime(self.date_format)

        url = f"https://www.arkleg.state.ar.us/Calendars/Meetings?tbType=&meetingStartDate={start_date}&meetingEndDate={end_date}"

        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for row in page.xpath(
                "//div[@id='meetingBodyWrapper']/div[contains(@class,'row')]"):
            row_class = row.xpath("@class")[0]
            if "tableSectionHeader" in row_class:
                day = row.xpath("div/text()")[0].strip()
                continue

            time = row.xpath(
                "div[contains(@class,'timeRow')]/b/text()")[0].strip()
            if "no meeting" in time.lower() or "cancelled" in time.lower():
                continue

            if "upon adjournment" in time.lower():
                time = "1:00 PM"

            title = row.xpath("div[2]/b")[0].text_content().strip()

            if "call of the chair" in time.lower():
                time = ""
            else:
                times = re.findall(r"\d+:\d+\s*[AP]M", time)
                time = times[0]

            when = dateutil.parser.parse(f"{day} {time}")
            when = self._tz.localize(when)

            location = row.xpath("div[2]/text()")[1].strip()

            event = Event(
                name=title,
                start_date=when,
                location_name=location,
                description="",
            )
            event.add_source(
                "https://www.arkleg.state.ar.us/Calendars/Meetings")

            if row.xpath(".//a[@aria-label='Agenda']"):
                agenda_url = row.xpath(".//a[@aria-label='Agenda']/@href")[0]
                event.add_document("Agenda",
                                   agenda_url,
                                   media_type="application/pdf")

            if row.xpath(".//a[@aria-label='Play Video']"):
                video_url = row.xpath(
                    ".//a[@aria-label='Play Video']/@href")[0]
                event.add_media_link("Video of Hearing",
                                     video_url,
                                     media_type="text/html")

            if row.xpath(".//a[@aria-label='Referred']"):
                bill_url = row.xpath(".//a[@aria-label='Referred']/@href")[0]
                self.scrape_referred_bills(event, bill_url)

            yield event
Example #18
    def scrape_chamber(self, chamber):
        session = self.latest_session()
        session_id = session_metadata.session_id_meta_data[session]

        chamber_abbr = self.chamber_codes[chamber]

        com_url = (
            "https://apps.azleg.gov/api/Committee/?includeOnlyCommitteesWithAgendas=true"
            "&legislativeBody={}&sessionId={}&standingOnly=true&interimOnly=false&jointCommitteesOnly=false"
        )
        com_url = com_url.format(chamber_abbr, session_id)

        coms = self.get(com_url).json()

        for com in coms:
            # joint committees get returned by both endpoints, so skip one
            if com["LegislativeBody"] != chamber_abbr:
                continue

            #  https://apps.azleg.gov/api/Agenda/?showPassed=true&sessionId=123
            #  &isInterimAgenda=false&body=S&includeItems=false&committeeId=1960
            events_url = (
                "https://apps.azleg.gov/api/Agenda/?includeItems=true&showPassed=true"
                "&sessionId={}&isInterimAgenda=false&body={}&committeeId={}")
            events_url = events_url.format(session_id, chamber_abbr,
                                           com["CommitteeId"])
            events_list = self.get(events_url).json()

            for row in events_list:
                if (row["AgendaCanceled"] is True
                        or "not meeting" in row["Time"].lower()):
                    continue

                title = "{} {}".format(self.code_chambers[chamber_abbr],
                                       row["CommitteeName"])

                # fix for dateutil parser confusion
                row["Time"] = row["Time"].replace("A.M.",
                                                  "AM").replace("P.M.", "PM")

                if "upon rec" not in row["Time"].lower():
                    time = re.findall(r"(\d+:\d+\s+[A|P]M)", row["Time"])
                    if len(time) == 0:
                        self.warning(
                            f"Unable to get time for {row['Time']} on {title}")
                        time = "00:00:00"
                    else:
                        time = time[0]

                    time = time.replace(r"\s+", " ")
                else:
                    time = ""

                when = dateutil.parser.parse(f"{row['Date']} {time}")
                when = self._tz.localize(when)

                where = "{}, Room {}".format(self.address, row["Room"])

                description = ""

                event = Event(
                    name=title,
                    location_name=where,
                    start_date=when,
                    description=description,
                )

                event.add_document("Agenda",
                                   row["HttpPath"],
                                   media_type="text/html")
                event.add_document("Agenda",
                                   row["HttpPdfPath"],
                                   media_type="application/pdf")

                event.add_participant(row["CommitteeName"],
                                      type="committee",
                                      note="host")

                for item in row["Items"]:
                    agenda_item = event.add_agenda_item(item["Description"])
                    bill_id = re.findall(r"^(.*?)\s", item["Description"])
                    # not every agenda item leads with a bill id
                    if bill_id:
                        agenda_item.add_bill(bill_id[0])

                    for speaker in item["RequestsToSpeak"]:
                        speaker_title = speaker["Name"]
                        if speaker["Representing"] != "Self":
                            speaker_title = (
                                f"{speaker['Name']} ({speaker['Representing']})"
                            )

                        event.add_participant(speaker_title,
                                              type="person",
                                              note="speaker")

                event.add_source(
                    "https://apps.azleg.gov/BillStatus/AgendaSearch")
                yield event
Example #19
    def scrape(self, chamber=None, session=None):
        """
        Scrape the events data from all dates from the sc meetings page,
        then create and yield the events objects from the data.
        :param chamber:
        :param session:
        :return: yielded Event objects
        """

        chambers = {
            "upper": {"name": "Senate", "title": "Senator"},
            "lower": {"name": "House", "title": "Representative"},
        }
        if chamber == "other":
            return

        if chamber is None:
            self.info("no chamber specified, using Joint Committee Meeting Schedule")
            events_url = "http://www.scstatehouse.gov/meetings.php"
        else:
            events_url = "http://www.scstatehouse.gov/meetings.php?chamber=%s" % (
                chambers[chamber]["name"].upper()[0]
            )

        page = self.get_page_from_url(events_url)

        meeting_year = page.xpath('//h2[@class="barheader"]/span')[0].text_content()
        meeting_year = re.search(
            r"Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})", meeting_year
        ).group(1)

        dates = page.xpath("//div[@id='contentsection']/ul")

        for date in dates:
            date_string = date.xpath("span")

            if len(date_string) == 1:
                date_string = date_string[0].text_content()
            else:
                continue

            # If a event is in the next calendar year, the date_string
            # will have a year in it
            if date_string.count(",") == 2:
                event_year = date_string[-4:]
                date_string = date_string[:-6]
            elif date_string.count(",") == 1:
                event_year = meeting_year
            else:
                raise AssertionError(
                    "This is not a valid date: '{}'".format(date_string)
                )

            for meeting in date.xpath("li"):
                time_string = meeting.xpath("span")[0].text_content()

                if (
                    time_string == "CANCELED"
                    or len(meeting.xpath('.//span[contains(text(), "CANCELED")]')) > 0
                ):
                    continue

                time_string = normalize_time(time_string)
                date_time = datetime.datetime.strptime(
                    event_year + " " + date_string + " " + time_string,
                    "%Y %A, %B %d %I:%M %p",
                )

                date_time = self._tz.localize(date_time)
                meeting_info = meeting.xpath("br[1]/preceding-sibling::node()")[1]
                location, description = re.search(
                    r"-- (.*?) -- (.*)", meeting_info
                ).groups()

                # if re.search(r'committee', description, re.I):
                #     meeting_type = 'committee:meeting'
                # else:
                #     meeting_type = 'other:meeting'

                event = Event(
                    name=description,  # Event Name
                    start_date=date_time,  # When the event will take place
                    location_name=location,
                )  # Where the event will be

                event.add_source(events_url)

                agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")

                if agenda_url:
                    agenda_url = agenda_url[0].attrib["href"]
                    event.add_source(agenda_url)
                    event.add_document(
                        note="Agenda", url=agenda_url, media_type="application/pdf"
                    )

                    agenda_page = self.get_page_from_url(agenda_url)

                    for bill in agenda_page.xpath(
                        ".//a[contains(@href,'billsearch.php')]"
                    ):
                        # bill_url = bill.attrib['href']
                        bill_id = bill.text_content().replace(".", "").replace(" ", "")
                        # bill_description = self.get_bill_description(bill_url)

                        event.add_bill(bill_id)

                yield event
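
normalize_time isn't shown in this listing; a plausible sketch that coerces the page's time strings into the "%I:%M %p" shape strptime expects (an assumption, not the scraper's actual code):

    import re

    def normalize_time(time_string):
        time_string = time_string.lower().replace(".", "").strip()
        if "noon" in time_string:
            return "12:00 pm"
        # "2 pm" -> "2:00 pm"
        if ":" not in time_string:
            time_string = re.sub(r"(\d+)\s*([ap]m)", r"\1:00 \2", time_string)
        return time_string
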
Example #20
    def scrape_committee_page(self, url):
        page = self.get(url, headers=self.cf_headers).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        com = page.xpath(
            '//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()

        for row in page.xpath('//div[contains(@id, "agenda-item")]'):
            # status = "tentative"
            meta = row.xpath(
                'div[contains(@class,"accordion-heading-agenda")]/a')[0]

            date = meta.xpath("text()")[0].strip()

            time_and_loc = meta.xpath("span/text()")[0].strip()
            time_and_loc = time_and_loc.split("\n")
            time = time_and_loc[0]
            loc = time_and_loc[1]

            if loc == "":
                loc = "See Agenda"

            com = com.replace("(S)", "Senate").replace("(H)", "House")

            # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections"
            # so just remove the time component if it won't parse, and the user can go to the agenda
            try:
                when = dateutil.parser.parse(f"{date} {time}")
            except dateutil.parser.ParserError:
                when = dateutil.parser.parse(date)
            when = self._tz.localize(when)

            if "cancelled" in time.lower():
                continue

            event = Event(
                name=com,
                start_date=when,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_source(url)
            event.add_participant(com, type="committee", note="host")

            if row.xpath('.//a[contains(text(), "View Agenda")]'):
                agenda_url = row.xpath(
                    './/a[contains(text(), "View Agenda")]/@href')[0]
                event.add_document("Agenda",
                                   agenda_url,
                                   media_type="application/pdf")

            if row.xpath('.//a[contains(text(), "Watch")]'):
                vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0]
                event.add_media_link("Video of Hearing",
                                     vid_url,
                                     media_type="text/html")

            if row.xpath('.//tr[contains(@class,"bill-container")]/td'):
                agenda = event.add_agenda_item("Bills under consideration")
                for bill_row in row.xpath(
                        './/tr[contains(@class,"bill-container")]'):
                    bill_id = bill_row.xpath(
                        ".//a[contains(@class,'bill-name-link')]/text()")[0]
                    agenda.add_bill(bill_id)

            yield event
Example #21
    def scrape(self, chamber=None):
        url = "https://le.utah.gov/CalServ/CalServ?month={}&year={}"

        year = datetime.datetime.today().year

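        # the CalServ month parameter appears to be 0-indexed; note the
        # int(day_row['month']) + 1 when the date is assembled below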
        for i in range(0, 12):
            page = self.get(url.format(i, year)).json()
            if "days" in page:
                for day_row in page["days"]:
                    for row in day_row["events"]:
                        # ignore 'note', 'housefloor', 'senatefloor'
                        if row["type"] == "meeting":
                            status = "tentative"
                            title = row["desc"]
                            where = row["location"]

                            when = dateutil.parser.parse(
                                f"{day_row['year']}-{str(int(day_row['month'])+1)}-{day_row['day']} {row['time']}"
                            )

                            when = self._tz.localize(when)

                            if "status" in row and row["status"] == "C":
                                status = "cancelled"

                            event = Event(
                                name=title,
                                location_name=where,
                                start_date=when,
                                classification="committee-meeting",
                                status=status,
                            )

                            if "agenda" in row:
                                event.add_document(
                                    "Agenda",
                                    f"{self.base_url}{row['agenda']}",
                                    media_type="text/html",
                                    on_duplicate="ignore",
                                )

                            if "minutes" in row:
                                event.add_document(
                                    "Minutes",
                                    f"{self.base_url}{row['minutes']}",
                                    media_type="text/html",
                                    on_duplicate="ignore",
                                )

                            if "mediaurl" in row:
                                event.add_media_link(
                                    "Media",
                                    f"{self.base_url}{row['mediaurl']}",
                                    media_type="text/html",
                                    on_duplicate="ignore",
                                )
                                if re.findall(r"mtgID=(\d+)", row["mediaurl"]):
                                    hearing_id = re.findall(
                                        r"mtgID=(\d+)", row["mediaurl"])[0]
                                    docs_url = f"https://glen.le.utah.gov/committees/meeting/{hearing_id}/1234"
                                    docs_page = self.get(docs_url).json()
                                    if "meetingMaterials" in docs_page:
                                        for mat in docs_page[
                                                "meetingMaterials"]:
                                            agenda = event.add_agenda_item(
                                                mat["description"])
                                            event.add_document(
                                                mat["description"],
                                                f"{self.base_url}{mat['docUrl']}",
                                                media_type="application/pdf",
                                                on_duplicate="ignore",
                                            )

                                            for bill_row in re.findall(
                                                    r"(\w{2,3}\d{4})",
                                                    mat["description"]):
                                                agenda.add_bill(bill_row)

                                    # NOTE: The following data appears to be duped on the meetingMaterials endpoint
                                    # but leaving this in place commented out, in case that ever changes.
                                    #
                                    # rather than return an empty object this page just times out if there are no bills
                                    # so don't retry, and pass on failure
                                    # bills_url = f"https://glen.le.utah.gov/agencal/{hearing_id}/1234"
                                    # self.retry_attempts = 0
                                    # try:
                                    #     bills_page = self.get(bills_url, timeout=3).json()
                                    #     if 'agendaitems' in bills_page:
                                    #         for bill_row in bills_page['agendaitems']:

                                    #             agenda = event.add_agenda_item(bill_row['description'])
                                    #             if 'bill' in bill_row:
                                    #                 agenda.add_bill(bill_row['bill'])
                                    #                 print(bill_row)
                                    # except requests.exceptions.ReadTimeout:
                                    #     pass

                                    # then reset the retry attempts to normal for other requests
                                    # (also commented out, since the block above that set them to 0 is disabled)
                                    # self.retry_attempts = 3

                            source_url = f"{self.base_url}{row['itemurl']}"
                            event.add_source(source_url)

                            yield event
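
The bill references in Utah's meeting materials are free text, so anything shaped like a bill number is pulled out with re.findall(r"(\w{2,3}\d{4})", ...). A quick standalone check of that pattern; the sample description is hypothetical:

    import re

    # hypothetical material description in the Utah format
    description = "HB0057 Campaign Finance Amendments and SJR0003 rules review"
    print(re.findall(r"(\w{2,3}\d{4})", description))
    # ['HB0057', 'SJR0003']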
Beispiel #22
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
            self.info("no session specified, using latest")

        for i in self.jurisdiction.legislative_sessions:
            if i["identifier"] == session:
                session_slug = i["_scraped_name"]

        url = (
            "http://laws.leg.mt.gov/legprd/LAW0240W$CMTE.ActionQuery?P_SESS={session_slug}"
            "&P_COM_NM=&P_ACTN_DTM={start}&U_ACTN_DTM={end}&Z_ACTION2=Find")

        start = datetime.datetime.today()
        # this month and the next 2 months
        end = start + relativedelta.relativedelta(months=+2)

        url = url.format(
            session_slug=session_slug,
            start=start.strftime("%m/01/%Y"),
            end=end.strftime("%m/%d/%Y"),
        )

        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for row in page.xpath("//table[@border]/tr"):
            # skip table headers
            if not row.xpath("td[1]/a"):
                continue
            day = row.xpath("td[2]/text()")[0].strip()
            time = row.xpath("td[3]/text()")[0].strip()
            room = row.xpath("td[4]")[0].text_content().strip()
            bill = row.xpath("td[5]/a[1]/text()")[0].strip()
            bill_title = row.xpath("td[6]/text()")[0].strip()

            com = row.xpath("td[1]/a[1]/text()")[0].strip()
            com = com.replace("(H)", "House").replace("(S)", "Senate")

            when = parser.parse(f"{day} {time}")
            when = self._tz.localize(when)

            # %M gives minutes; %I would repeat the hour in 12-hour form
            when_slug = when.strftime("%Y%m%d%H%M")
            if com not in self.events:
                self.events[com] = {}

            if when_slug not in self.events[com]:
                event = Event(
                    name=com,
                    location_name=room,
                    start_date=when,
                    classification="committee-meeting",
                )
                event.add_source(row.xpath("td[1]/a[1]/@href")[0])
            else:
                event = self.events[com][when_slug]

            agenda = event.add_agenda_item(bill_title)
            agenda.add_bill(bill)

            if row.xpath('.//a[contains(@href,"/billhtml/")]'):
                bill_url = row.xpath(
                    './/a[contains(@href,"/billhtml/")]/@href')[0]
                event.add_document(bill_title,
                                   bill_url,
                                   media_type="text/html")
            if row.xpath('.//a[contains(@href,"/billpdf/")]'):
                bill_url = row.xpath(
                    './/a[contains(@href,"/billpdf/")]/@href')[0]
                event.add_document(bill_title,
                                   bill_url,
                                   media_type="application/pdf")

            self.events[com][when_slug] = event

        for com in self.events:
            for date in self.events[com]:
                yield self.events[com][date]
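
Montana lists one bill hearing per table row, so rows are folded into a single Event per committee and start time, keyed by when_slug. A quick check of the slug format (%M gives minutes; %I would be the 12-hour hour and collide on minutes):

    import datetime

    when = datetime.datetime(2021, 3, 19, 14, 30)
    # one bucket per committee per start time
    print(when.strftime("%Y%m%d%H%M"))  # 202103191430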
Beispiel #23
    def scrape(self):
        url = "https://apps.legislature.ky.gov/legislativecalendar"

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        for time_row in page.xpath(
                '//div[contains(@class,"TimeAndLocation")]'):
            date = (time_row.xpath(
                'preceding-sibling::div[contains(@class,"DateHeading")][1]')
                    [0].text_content().strip())

            status = "tentative"

            if time_row.xpath('div[contains(@class,"Cancelled")]'):
                status = "cancelled"

            row_text = time_row.text_content()
            row_text = row_text.replace("Noon", "PM")
            # upon recess (of House|Senate)
            row_text = re.sub(r"Upon Recess(\sof\s)?(House|Senate)?", "",
                              row_text)
            # keep the meridiem so afternoon times don't parse as AM
            meridiem = re.search(r"AM|PM", row_text)
            parts = re.split(r",|AM|PM", row_text)
            time = parts[0].strip()
            if meridiem:
                time = f"{time} {meridiem.group()}"
            location = " ".join(
                x.replace("\xa0", " ").strip() for x in parts[1:]).strip()

            when = f"{date} {time}"
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            if not time_row.xpath(
                    'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
            ):
                continue

            com_name = (time_row.xpath(
                'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
            )[0].text_content().strip())

            event = Event(
                name=com_name,
                start_date=when,
                classification="committee-meeting",
                location_name=location,
                status=status,
            )

            if time_row.xpath(
                    'following-sibling::div[contains(@class,"Agenda")][1]'):
                agenda_row = time_row.xpath(
                    'following-sibling::div[contains(@class,"Agenda")][1]')[0]
                agenda_text = agenda_row.text_content().strip()

                agenda = event.add_agenda_item(agenda_text)

                for bill_link in agenda_row.xpath(
                        './/a[contains(@href,"/record/")]'):
                    agenda.add_bill(bill_link.text_content().strip())

            event.add_participant(com_name, note="host", type="committee")

            com_page_link = time_row.xpath(
                'following-sibling::div[contains(@class,"CommitteeName")][1]/a/@href'
            )[0]

            docs = self.scrape_com_docs(com_page_link)
            lookup_date = when.strftime("%Y-%m-%d")

            if lookup_date in docs["mats"]:
                for mat in docs["mats"][lookup_date]:
                    event.add_document(mat["text"],
                                       mat["url"],
                                       on_duplicate="ignore")

            if lookup_date in docs["minutes"]:
                for mat in docs["minutes"][lookup_date]:
                    event.add_document(mat["text"],
                                       mat["url"],
                                       on_duplicate="ignore")

            event.add_source(url)

            yield event
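
The Kentucky heading mixes time and location in one string, which the split above takes apart. A sketch of that parse on a hypothetical heading:

    import re

    row_text = "1:00 PM, Annex Room 154"  # hypothetical heading text
    meridiem = re.search(r"AM|PM", row_text)
    parts = re.split(r",|AM|PM", row_text)
    time = parts[0].strip()
    if meridiem:
        time = f"{time} {meridiem.group()}"
    location = " ".join(
        x.replace("\xa0", " ").strip() for x in parts[1:]).strip()
    print(time, "|", location)  # 1:00 PM | Annex Room 154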
Beispiel #24
    def scrape(self):
        today = datetime.datetime.today()

        url = "https://web.wyoleg.gov/LsoService/api/Calendar/Events/{}{}01"

        # this month and the next 2 months
        for add in [0, 1, 2]:
            test_date = today + relativedelta.relativedelta(months=+add)
            month_url = url.format(str(test_date.year),
                                   str(test_date.month).zfill(2))
            page = self.get(month_url).json()
            for row in page:
                if row["meetingKind"] == 2:
                    com = f"{row['meetingType']} {row['committee']['fullName']}"
                    # skip state holidays or other non-committee hearings
                    if com.strip() == "":
                        continue

                    start = parser.parse(row["startDate"])
                    start = self._tz.localize(start)
                    end = parser.parse(row["endTime"])
                    end = self._tz.localize(end)

                    where = row["address1"]

                    if where == "":
                        where = "TBD"

                    desc = row["purpose"]

                    event = Event(
                        name=com,
                        location_name=where,
                        start_date=start,
                        end_date=end,
                        classification="committee-meeting",
                        description=desc,
                    )

                    for media in row["meetingMedias"]:
                        # all of these observed so far claim to be octet-stream, but are actually YouTube links
                        event.add_media_link(
                            media["documentType"],
                            media["filePath"],
                            "text/html",
                            on_duplicate="ignore",
                        )

                    for doc in row["meetingDocuments"]:
                        event.add_document(
                            doc["title"],
                            f"{self.base_url}{doc['documentUrl']}",
                            on_duplicate="ignore",
                        )

                    for item in row["meetingAgendas"]:
                        self.parse_agenda_item(event, item)

                    bills_agenda_item = None
                    for bill in row["sessionMeetingBills"]:
                        if bills_agenda_item is None:
                            bills_agenda_item = event.add_agenda_item(
                                "Bills under Consideration")
                        bills_agenda_item.add_bill(bill["billNumber"])

                    web_url = "https://www.wyoleg.gov/Calendar/{year}{month}01/Meeting?type=committee&id={meeting_id}"
                    web_url = web_url.format(
                        year=str(test_date.year),
                        month=str(test_date.month).zfill(2),
                        meeting_id=row["id"],
                    )

                    event.add_source(web_url)
                    yield event
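
Wyoming's calendar API is paged by month, so the loop requests the first of the current month plus the next two. A sketch of the URLs generated for a hypothetical run date:

    import datetime

    from dateutil import relativedelta

    url = "https://web.wyoleg.gov/LsoService/api/Calendar/Events/{}{}01"
    today = datetime.datetime(2021, 11, 15)  # hypothetical run date
    for add in [0, 1, 2]:
        test_date = today + relativedelta.relativedelta(months=+add)
        print(url.format(str(test_date.year), str(test_date.month).zfill(2)))
    # .../Events/20211101, .../Events/20211201, .../Events/20220101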
Beispiel #25
    def scrape(self, session=None, start=None, end=None):

        if session is None:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        # testimony url, we'll need it later in a loop

        # the testimony query looks gnarly but breaks down to:
        # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
        # $orderby: LastName,FirstName,Organization
        # $expand: Request
        # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
        # PresentedDate,FileSize,Topic

        testimony_url_base = (
            "http://legislature.maine.gov/backend/"
            "breeze/data/CommitteeTestimony?"
            "$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and"
            "%20(Request%2FLegislature%20eq%20{})"
            "&$orderby=LastName%2CFirstName%2COrganization&"
            "$expand=Request&$select=Id%2CFileType%2CNamePrefix"
            "%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic"
        )

        if start is None:
            start_date = datetime.datetime.now().isoformat()
        else:
            start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
            start_date = start_date.isoformat()

        # default to 30 days if no end
        if end is None:
            dtdelta = datetime.timedelta(days=30)
            end_date = datetime.datetime.now() + dtdelta
            end_date = end_date.isoformat()
        else:
            end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
            end_date = end_date.isoformat()

        bills_by_event = {}

        bills_url = ("http://legislature.maine.gov/backend/breeze/data/"
                     "getCalendarEventsBills?startDate={}&endDate={}")
        bills_url = bills_url.format(start_date, end_date)
        page = json.loads(self.get(bills_url).content)

        for row in page:
            bills_by_event.setdefault(row["EventId"], [])
            bills_by_event[row["EventId"]].append(row)

        # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
        url = ("http://legislature.maine.gov/backend/breeze/data/"
               "getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true")
        url = url.format(start_date, end_date)

        page = json.loads(self.get(url).content)

        for row in page:
            if row["Cancelled"] is True or row["Postponed"] is True:
                continue

            start_date = self._TZ.localize(
                dateutil.parser.parse(row["FromDateTime"]))
            end_date = self._TZ.localize(
                dateutil.parser.parse(row["ToDateTime"]))

            name = row["CommitteeName"]

            if name is None:
                name = row["Host"]

            address = row["Location"]
            address = address.replace(
                "Cross Building",
                "Cross Office Building, 111 Sewall St, Augusta, ME 04330",
            )

            address = address.replace(
                "State House",
                "Maine State House, 210 State St, Augusta, ME 04330")

            event = Event(
                start_date=start_date,
                end_date=end_date,
                name=name,
                location_name=address,
            )

            event.add_source(
                "http://legislature.maine.gov/committee/#Committees/{}".format(
                    row["CommitteeCode"]))

            if bills_by_event.get(row["Id"]):
                for bill in bills_by_event[row["Id"]]:
                    description = "LD {}: {}".format(bill["LD"], bill["Title"])
                    agenda = event.add_agenda_item(description=description)
                    agenda.add_bill("LD {}".format(bill["LD"]))

                    if bill["TestimonyCount"] > 0:
                        test_url = testimony_url_base.format(
                            bill["PaperNumber"], session)
                        test_page = json.loads(self.get(test_url).content)
                        for test in test_page:
                            title = "{} {} - {}".format(
                                test["FirstName"],
                                test["LastName"],
                                test["Organization"],
                            )
                            if test["NamePrefix"] is not None:
                                title = "{} {}".format(test["NamePrefix"],
                                                       title)

                            test_url = (
                                "http://legislature.maine.gov/backend/app/services"
                                "/getDocument.aspx?doctype=test&documentId={}".
                                format(test["Id"]))

                            if test["FileType"] == "pdf":
                                media_type = "application/pdf"

                            event.add_document(note=title,
                                               url=test_url,
                                               media_type=media_type)
            yield event
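
The hand-encoded testimony template above can be assembled more readably with urllib.parse.quote. A sketch, assuming the Maine backend accepts standard percent-encoding (quote() also encodes the parentheses and apostrophes, which decodes to the same filter server-side):

    from urllib.parse import quote

    def testimony_url(paper_number, legislature):
        # hypothetical helper mirroring the hand-encoded template above
        filt = (f"(Request/PaperNumber eq '{paper_number}') "
                f"and (Request/Legislature eq {legislature})")
        select = ("Id,FileType,NamePrefix,FirstName,LastName,"
                  "Organization,PresentedDate,FileSize,Topic")
        return (
            "http://legislature.maine.gov/backend/breeze/data/CommitteeTestimony?"
            f"$filter={quote(filt, safe='')}"
            "&$orderby=LastName,FirstName,Organization"
            "&$expand=Request"
            f"&$select={quote(select, safe='')}"
        )

    print(testimony_url("SP0219", 129))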
Beispiel #26
    def scrape_agenda(self, chamber, url):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf["DATE:"]
        time = metainf["TIME:"]
        where = metainf["PLACE:"]

        # check for duration in time
        if " - " in time:
            start, end = time.split(" - ")
            am_pm_srch = re.search("(?i)(am|pm)", end)
            if am_pm_srch:
                time = " ".join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"
        ]

        event_desc = "Meeting Notice"
        if "Rise" in time:
            datetime = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime.upper() or "CANCELED" in datetime.upper():
            return

        if page.xpath("//span[@id='lblSession']"):
            event_desc = (page.xpath("//span[@id='lblSession']")
                          [0].text_content().strip())

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(name=event_desc,
                      start_date=self._tz.localize(datetime),
                      location_name=where)

        event.add_document("Agenda",
                           url,
                           media_type="text/html",
                           on_duplicate="ignore")
        event.add_source(url)

        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib["href"]
            event.add_document(
                bill.text_content(),
                bill_ft,
                media_type="application/pdf",
                on_duplicate="ignore",
            )
            root = bill.xpath("../../*")
            root = [x.text_content() for x in root]
            bill_id = "".join(root).replace("\u00a0", "")

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().text_content().replace(
                "\u00a0", " ")

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            item = event.add_agenda_item(descr)
            item.add_bill(bill_id)

        # sometimes bill references are just plain links or plain text.
        bill_links = page.xpath('//a[contains(@href,"/BillText/")]/@href')
        linked_bills = set()
        for bill_link in bill_links:
            bill_nums = re.findall(r"\/(\w+\d+)\.pdf",
                                   bill_link,
                                   flags=re.IGNORECASE)
            for bill_num in bill_nums:
                linked_bills.add(bill_num)

        # sometimes (H 1234) ends up in the title or somewhere else unlinked
        text_bill_nums = re.findall(r"\((\w{1,3}\s?\d+)\)",
                                    page.text_content(),
                                    flags=re.IGNORECASE)
        for bill_num in text_bill_nums:
            bill_num = bill_num.replace(" ", "")
            linked_bills.add(bill_num)

        if len(linked_bills) != 0:
            item = event.add_agenda_item("Bills under consideration")
            for bill in linked_bills:
                item.add_bill(bill)

        if page.xpath("//span[@id='lblSession']"):
            committee = page.xpath(
                "//span[@id='lblSession']")[0].text_content()
            event.add_participant(committee, "committee", note="host")

        yield event
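
Unlinked references like "(H 1234)" are caught by the parenthetical regex near the end of the example. A quick check on hypothetical agenda text:

    import re

    text = "Consideration of (H 5678) and (SB 210) at the rise of the Senate"
    nums = [m.replace(" ", "")
            for m in re.findall(r"\((\w{1,3}\s?\d+)\)", text, flags=re.IGNORECASE)]
    print(nums)  # ['H5678', 'SB210']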
Beispiel #27
    def scrape_house_weekly_schedule(self):
        url = "https://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath("./td[1]")[0].text_content().replace("\xa0", "")
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and "Not Meeting" not in row.xpath("./td[2]")[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath("./td/a[descendant::img[contains(@src,"
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee_name = meeting.xpath("./td[1]/text()")[0].strip()
            meeting_string = meeting.xpath("./td[2]")[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(",") if s] + [None] * 3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r"\d{1,2}:\d{2} (AM|PM)", date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, "")

            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = " ".join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                                              "%b %d %Y %I:%M %p")
            when = self._tz.localize(when)

            description = "Committee Meeting: {}".format(committee_name)
            self.logger.debug(description)

            event = Event(
                name=description,
                start_date=when,  # when is already localized above
                location_name=location,
            )
            event.add_source(url)
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")
            event.add_document(note="Agenda",
                               url=guid,
                               text="agenda",
                               media_type="application/pdf")

            yield event
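
The Louisiana meeting string is "date, time, location", but trailing parts can be missing; padding the split with Nones keeps the three-way unpack safe. A quick check on hypothetical strings:

    for meeting_string in ["Jun 3, 01:30 PM, Room 5", "Jun 3"]:
        date, time, location = (
            [s.strip() for s in meeting_string.split(",") if s] + [None] * 3)[:3]
        print(date, time, location)
    # Jun 3 01:30 PM Room 5
    # Jun 3 None None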
Beispiel #28
    def scrape_lower(self):
        list_url = (
            "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
        )

        page = self.get(list_url).content
        page = lxml.html.fromstring(page)

        page.make_links_absolute(list_url)

        for row in page.xpath("//table[contains(@class, 'CODayTable')]/tbody/tr"):

            # TODO: it would be nice to go back in and update the record to mark it as cancelled,
            # but since there's no ics link it makes the day logic way more complicated
            if row.xpath(".//span[contains(@class, 'COCancelled')]"):
                continue

            # fallback for unlinked events
            source = (
                "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
            )

            if row.xpath(".//a[1]/text()"):
                title = row.xpath(".//a[1]/text()")[0].strip()
                source = row.xpath(".//a[1]/@href")[0]
                event_type = "committee-meeting"
            else:
                # skip unlinked misc events
                if row.xpath("td[contains(@class, 'COCommType')]/text()"):
                    title = row.xpath("td[contains(@class, 'COCommType')]/text()")[
                        0
                    ].strip()
                    event_type = "other"
                else:
                    continue

            date_link = row.xpath(".//a[@title='Add to Calendar']/@href")[0]
            parsed = parse.parse_qs(parse.urlparse(date_link).query)
            date_raw = parsed["dt"][0]
            location = parsed["loc"][0]

            start = dateutil.parser.parse(date_raw, tzinfos=self.tzinfos)

            # If there's a chair in parentheticals, remove them from the title
            # and add as a person instead
            chair_note = re.findall(r"\(.*\)", title)
            chair = None
            for chair_str in chair_note:
                title = title.replace(chair_str, "").strip()
                # drop the outer parens
                chair = chair_str[1:-1]

            event = Event(
                name=title,
                start_date=start,
                location_name=location,
                classification=event_type,
            )
            event.add_source(source)

            if chair is not None:
                event.add_participant(chair, type="person", note="chair")

            if event_type == "committee-meeting":
                event.add_participant(title, type="committee", note="host")

            if row.xpath(".//a[contains(@class,'COAgendaLink')]"):
                agenda_url = row.xpath(".//a[contains(@class,'COAgendaLink')]/@href")[0]
                event.add_document("Agenda", agenda_url, media_type="text/html")
                self.scrape_lower_agenda(event, agenda_url)

            yield event
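
Virginia's start time and location ride along in the "Add to Calendar" link's query string rather than in the table cells. A sketch of that extraction on a hypothetical link:

    from urllib import parse

    date_link = ("https://example.com/addToCalendar.php"
                 "?dt=2022-01-20%2009:00&loc=House%20Room%20C")  # hypothetical
    parsed = parse.parse_qs(parse.urlparse(date_link).query)
    print(parsed["dt"][0])   # 2022-01-20 09:00
    print(parsed["loc"][0])  # House Room C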
Beispiel #29
    def scrape_chamber(self, chamber):
        if chamber == "upper":
            url = "https://legislature.idaho.gov/sessioninfo/agenda/sagenda/"
        elif chamber == "lower":
            url = "https://legislature.idaho.gov/sessioninfo/agenda/hagenda/"

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        for row in page.xpath('//div[@id="ai1ec-container"]/div'):
            month = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/text()"
            )[0].strip()
            day = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/span/text()"
            )[0].strip()

            time_and_loc = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'abbr')]/h2/text()"
            )
            time = time_and_loc[0].strip()
            loc = time_and_loc[1].strip()

            if "not meet" in time.lower():
                continue

            try:
                start = dateutil.parser.parse(f"{month} {day} {time}")
            except dateutil.parser._parser.ParserError:
                start = dateutil.parser.parse(f"{month} {day}")

            start = self._tz.localize(start)

            com = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'day')]/h2/a/text()"
            )[0].strip()

            event = Event(
                name=com,
                start_date=start,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_participant(com, type="committee", note="host")

            agenda_url = row.xpath(
                './/a[contains(text(), "Full Agenda")]/@href')[0]
            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")

            agenda_rows = row.xpath(
                './/div[contains(@class,"card")]/div[contains(@id, "Agenda")]/div/table/tbody/tr'
            )[1:]

            for agenda_row in agenda_rows:
                subject = agenda_row.xpath("string(td[1])").strip()
                description = agenda_row.xpath("string(td[2])").strip()
                presenter = agenda_row.xpath("string(td[3])").strip()
                if presenter != "":
                    agenda_text = (
                        f"{subject} {description} Presenter: {presenter}".
                        strip())
                    event.add_participant(agenda_text,
                                          type="person",
                                          note="Presenter")
                else:
                    agenda_text = f"{subject} {description}".strip()

                agenda = event.add_agenda_item(agenda_text)

                if agenda_row.xpath(
                        'td[1]/a[contains(@href,"/legislation/")]'):
                    agenda.add_bill(
                        agenda_row.xpath(
                            'td[1]/a[contains(@href,"/legislation/")]/text()')
                        [0].strip())

            event.add_source(url)
            yield event
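
Some Idaho rows carry no parseable time (e.g. "Upon Adjournment"), hence the date-only fallback in the try/except above. A small check of that pattern:

    import dateutil.parser

    for time in ["8:00 AM", "Upon Adjournment"]:  # hypothetical values
        try:
            start = dateutil.parser.parse(f"January 17 2022 {time}")
        except dateutil.parser._parser.ParserError:
            start = dateutil.parser.parse("January 17 2022")
        print(start)
    # 2022-01-17 08:00:00
    # 2022-01-17 00:00:00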
Beispiel #30
    def scrape_meeting_page(self, com_id, chamber, com_name, meeting_date,
                            meeting_time, location):
        # http://www.kslegislature.org/li/b2021_22/committees/ctte_s_jud_1/documents/?date_choice=2021-03-19
        meeting_page_url = (
            f"http://www.kslegislature.org/li/{self.slug}/"
            f"committees/{com_id}/documents/?date_choice={meeting_date}")

        page = self.get(meeting_page_url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(meeting_page_url)

        try:
            start_date = dateutil.parser.parse(
                f"{meeting_date} {meeting_time}")
        except dateutil.parser._parser.ParserError:
            start_date = dateutil.parser.parse(meeting_date)

        start_date = self.tz.localize(start_date)

        pretty_chamber = self.chamber_names[chamber].title()

        event = Event(
            start_date=start_date,
            name=f"{pretty_chamber} {com_name}",
            location_name=location,
        )

        event.add_participant(f"{pretty_chamber} {com_name}",
                              type="committee",
                              note="host")

        # Agendas & Minutes
        for row in page.xpath(
                "//table[.//h4[contains(text(), 'Agendas')]]/table[contains(@class,'bottom')]/tbody/tr"
        ):
            doc_name = row.xpath("td[1]")[0].text_content()
            doc_url = row.xpath("td[2]/a/@href")[0]
            event.add_document(doc_name, doc_url, media_type="application/pdf")

        # Witness testimony
        for row in page.xpath("//tr[td[ul[@id='testimony-docs']]]"):

            doc_type = row.xpath("td[1]")[0].text_content()
            meta = row.xpath("td[2]/ul[@id='testimony-docs']")[0]

            witness = meta.xpath(
                "li[strong[contains(text(),'Presenter')]]/text()")[0].strip()

            org = ""
            if meta.xpath(
                    "li[strong[contains(text(),'Organization')]]/text()"):
                org = meta.xpath(
                    "li[strong[contains(text(),'Organization')]]/text()"
                )[0].strip()

            topic = meta.xpath(
                "li[strong[contains(text(),'Topic')]]/text()")[0].strip()

            if org:
                doc_name = f"{doc_type} - {witness} ({org}) - {topic}"
            else:
                doc_name = f"{doc_type} - {witness} - {topic}"

            agenda = event.add_agenda_item(doc_name)
            if meta.xpath("li[strong[contains(text(),'Measure')]]/text()"):
                bill_id = meta.xpath(
                    "li[strong[contains(text(),'Measure')]]/text()")[0].strip()
                agenda.add_bill(bill_id)

        event.add_source(meeting_page_url)

        yield event
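
The witness metadata above lives in a ul#testimony-docs list whose li items are labeled by a strong tag. A sketch of that XPath on a hypothetical fragment:

    import lxml.html

    # hypothetical fragment shaped like the testimony list above
    html = (
        "<ul id='testimony-docs'>"
        "<li><strong>Presenter:</strong> Jane Doe</li>"
        "<li><strong>Organization:</strong> Kansas Bar Association</li>"
        "<li><strong>Topic:</strong> SB 12</li>"
        "</ul>"
    )
    meta = lxml.html.fromstring(html)
    witness = meta.xpath(
        "li[strong[contains(text(),'Presenter')]]/text()")[0].strip()
    print(witness)  # Jane Doe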