Example #1
    def scrape(self, start=None):
        if start is None:
            start = datetime.datetime.today()
        else:
            start = datetime.datetime.strptime(start, "%Y-%m-%d")

        date_format = "%a %b %d %Y"
        date_slug = start.strftime(date_format)

        url = f"https://www.legis.ga.gov/api/meetings?startDate={date_slug}"

        page = self.get(url).json()

        for row in page:
            status = "tentative"

            title = row["subject"]

            if "joint" not in title.lower():
                if row["chamber"] == 2:
                    title = f"Senate {title}"
                elif row["chamber"] == 1:
                    title = f"House {title}"

            start = dateutil.parser.parse(row["start"])

            if start < self.tz.localize(datetime.datetime.now()):
                status = "passed"

            if "cancelled" in title.lower() or "canceled" in title.lower():
                status = "cancelled"
                # try to replace all variants of "[optional dash] cancel[l]ed [optional dash]"
                # so we can match up events to their pre-cancellation occurrence
                title = re.sub(r"-?\s*cancell?ed\s*-?\s*", " ", title, flags=re.I)

            where = row["location"]
            where = f"206 Washington St SW, Atlanta, Georgia, {where}"

            event = Event(
                name=title,
                start_date=start,
                location_name=where,
                classification="committee-meeting",
                status=status,
            )

            if row["agendaUri"] != "":
                event.add_document(
                    "Agenda", row["agendaUri"], media_type="application/pdf"
                )

            if row["livestreamUrl"] is not None:
                event.add_media_link(
                    "Video", row["livestreamUrl"], media_type="text/html"
                )

            event.add_source("https://www.legis.ga.gov/schedule/all")

            yield event
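
Note: the status check above compares against a localized "now" because mixing naive and offset-aware datetimes raises a TypeError. A minimal sketch of the pattern, assuming self.tz is a pytz timezone (which the localize() call suggests):

    import datetime

    import pytz

    tz = pytz.timezone("America/New_York")  # stand-in for self.tz
    aware_now = tz.localize(datetime.datetime.now())  # attach a zone to the naive "now"
    # a parsed, offset-aware start date can now be compared safely
    start = datetime.datetime(2024, 3, 1, 10, 0, tzinfo=datetime.timezone.utc)
    print("passed" if start < aware_now else "tentative")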
Example #2
    def scrape_upper_com(self, url, com, session):
        url = f"{url}{session}"
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        com = f"Senate {com}"

        for row in page.xpath('//table[@id="meetingsTbl"]/tbody/tr'):
            day = row.xpath("td[1]")[0].text_content().strip()
            time = row.xpath("td[2]")[0].text_content().strip()
            notice = row.xpath("td[3]")[0].text_content().strip()
            location = "See Agenda"  # it's in the PDFs but not the web page

            date = dateutil.parser.parse(f"{day} {time}")
            date = self.tz.localize(date)

            if notice.lower() == "not meeting" or "cancelled" in notice.lower():
                continue

            event = Event(name=com, start_date=date, location_name=location)

            agenda_classes = [
                "mtgrecord_notice",
                "mtgrecord_expandedAgenda",
                "mtgrecord_attendance",
            ]

            for agenda_class in agenda_classes:
                # .// keeps the search inside this row (// would scan the whole page),
                # and doc_url avoids clobbering the page url used by add_source below
                if row.xpath(f".//a[@class='{agenda_class}']"):
                    doc_url = row.xpath(f".//a[@class='{agenda_class}']/@href")[0]
                    doc_name = (
                        row.xpath(f".//a[@class='{agenda_class}']")[0]
                        .text_content()
                        .strip()
                    )
                    event.add_document(doc_name, doc_url, media_type="application/pdf")

            for link in row.xpath("td[7]/a"):
                media_url = link.xpath("@href")[0]
                doc_name = link.text_content().strip()
                event.add_media_link(doc_name, media_url, "audio/mpeg")

            for link in row.xpath("td[9]/a"):
                media_url = link.xpath("@href")[0]
                doc_name = link.text_content().strip()
                event.add_media_link(doc_name, media_url, "text/html")

            event.add_source(url)
            yield event
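
Note: two fixes worth flagging in this example. In lxml, an XPath beginning with // always searches from the document root, even when called on a single row element, so per-row lookups need the relative .// form; and reusing the url variable for document links would have made the later add_source(url) point at whatever link was processed last rather than the page itself. A quick illustration of the XPath context rule:

    import lxml.html

    doc = lxml.html.fromstring(
        "<table><tr><td><a href='a.pdf'>A</a></td></tr>"
        "<tr><td><a href='b.pdf'>B</a></td></tr></table>"
    )
    row = doc.xpath("//tr")[1]
    print(row.xpath("//a/@href"))   # ['a.pdf', 'b.pdf'] - searched the whole tree
    print(row.xpath(".//a/@href"))  # ['b.pdf'] - searched just this row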
Example #3
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(start_date=start_date,
                      end_date=end_date,
                      name=title,
                      location_name=location)

        event.add_source(
            "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx")

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath("@href")[0],
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link("Video of Hearing",
                                 video[0].xpath("@href")[0], "text/html")

        if "subcommittee" in title.lower():
            subcom = title.split("-")[0].strip()
            event.add_participant(subcom, type="committee", note="host")
        else:
            event.add_participant(com, type="committee", note="host")
        yield event
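
Note: parse_gcal is not shown in this excerpt. Assuming cal_link is a standard Google Calendar "add to calendar" template URL (action=TEMPLATE with text, dates, and location query parameters), a plausible reconstruction looks like this; the parameter names and the "See agenda" fallback are assumptions, not the author's code:

    from urllib.parse import parse_qs, urlparse

    import dateutil.parser

    def parse_gcal(self, cal_link):
        # typical template: .../render?action=TEMPLATE&text=TITLE&dates=START/END&location=WHERE
        params = parse_qs(urlparse(cal_link).query)
        title = params["text"][0]
        location = params.get("location", ["See agenda"])[0]  # assumed fallback
        start_str, end_str = params["dates"][0].split("/")
        return (
            title,
            location,
            dateutil.parser.parse(start_str),
            dateutil.parser.parse(end_str),
        )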
Example #4
    def scrape_committee_page(self, url):
        page = self.get(url, headers=self.cf_headers).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        com = page.xpath(
            '//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()

        for row in page.xpath('//div[contains(@id, "agenda-item")]'):
            # status = "tentative"
            meta = row.xpath(
                'div[contains(@class,"accordion-heading-agenda")]/a')[0]

            date = meta.xpath("text()")[0].strip()

            time_and_loc = meta.xpath("span/text()")[0].strip()
            time_and_loc = time_and_loc.split("\n")
            time = time_and_loc[0]
            loc = time_and_loc[1]

            if loc == "":
                loc = "See Agenda"

            com = com.replace("(S)", "Senate").replace("(H)", "House")

            # skip cancelled meetings before bothering to parse their time
            if "cancelled" in time.lower():
                continue

            # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections"
            # so just remove the time component if it won't parse, and the user can go to the agenda
            try:
                when = dateutil.parser.parse(f"{date} {time}")
            except dateutil.parser.ParserError:  # public alias; avoids the private _parser module
                when = dateutil.parser.parse(date)
            when = self._tz.localize(when)

            event = Event(
                name=com,
                start_date=when,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_source(url)
            event.add_participant(com, type="committee", note="host")

            if row.xpath('.//a[contains(text(), "View Agenda")]'):
                agenda_url = row.xpath(
                    './/a[contains(text(), "View Agenda")]/@href')[0]
                event.add_document("Agenda",
                                   agenda_url,
                                   media_type="application/pdf")

            if row.xpath('.//a[contains(text(), "Watch")]'):
                vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0]
                event.add_media_link("Video of Hearing",
                                     vid_url,
                                     media_type="text/html")

            if row.xpath('.//tr[contains(@class,"bill-container")]/td'):
                agenda = event.add_agenda_item("Bills under consideration")
                for bill_row in row.xpath(
                        './/tr[contains(@class,"bill-container")]'):
                    bill_id = bill_row.xpath(
                        ".//a[contains(@class,'bill-name-link')]/text()")[0]
                    agenda.add_bill(bill_id)

            yield event
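
Note: the parse-with-fallback idiom above (fixed to use the public dateutil.parser.ParserError rather than the private _parser module) is useful anywhere a source mixes real times with free text. A standalone version, with an illustrative name:

    import dateutil.parser

    def parse_when(date_str, time_str):
        """Parse "date time", falling back to the date alone when the time is free text."""
        try:
            return dateutil.parser.parse(f"{date_str} {time_str}")
        except dateutil.parser.ParserError:
            return dateutil.parser.parse(date_str)

    print(parse_when("Jan 5, 2024", "1:30 PM"))                            # 2024-01-05 13:30:00
    print(parse_when("Jan 5, 2024", "15 mins after adj. of elections"))    # 2024-01-05 00:00:00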
Example #5
    def scrape_upper(self):
        url = "https://www.senate.mn/api/schedule/upcoming"
        data = self.get(url).json()

        for row in data["events"]:
            com = row["committee"]["committee_name"]
            start = dateutil.parser.parse(row["hearing_start"])
            start = self._tz.localize(start)

            # .get() guards against missing keys as well as empty values
            if row.get("hearing_room") and row.get("hearing_building"):
                where = f"{row['hearing_building']} {row['hearing_room']}"
            elif row.get("hearing_building"):
                where = row["hearing_building"]
            else:
                where = "TBD"

            description = ""

            if "hearing_notes" in row and row["hearing_notes"]:
                description = row["hearing_notes"]

            event = Event(
                name=com,
                location_name=where,
                start_date=start,
                classification="committee-meeting",
                description=description,
            )

            for bill in get_bill_ids(description):
                event.add_bill(bill)

            if "lrl_schedule_link" in row:
                event.add_source(row["lrl_schedule_link"])
            else:
                if "link" in row["committee"]:
                    if row["committee"]["link"].startswith("http"):
                        event.add_source(row["committee"]["link"])
                    elif row["committee"]["link"].startswith("www"):
                        event.add_source(f"http://{row['committee']['link']}")
                    else:
                        event.add_source(
                            f"https://www.senate.mn/{row['committee']['link']}"
                        )
                elif "senate_chair_link" in row["committee"]:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                    )

            if "agenda" in row:
                for agenda_row in row["agenda"]:
                    if (agenda_row["description"] is None
                            or agenda_row["description"].strip() == ""):
                        # sometimes they have blank agendas but bills or files
                        agenda_row["description"] = "Agenda"
                    agenda = event.add_agenda_item(agenda_row["description"])
                    if "bill_type" in agenda_row:
                        agenda.add_bill("{} {}".format(
                            agenda_row["bill_type"].replace(".", ""),
                            agenda_row["bill_number"],
                        ))

                    if "files" in agenda_row:
                        for file_row in agenda_row["files"]:
                            doc_name = file_row["filename"]
                            doc_url = file_row["file_path"]

                            # if they don't provide a name just use the filename
                            if doc_name == "":
                                parsed_url = urlparse(doc_url)
                                doc_name = os.path.basename(parsed_url.path)

                            event.add_document(
                                doc_name,
                                f"https://www.senate.mn/{doc_url}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )

            if "video_link" in row:
                event.add_media_link("Video", row["video_link"], "text/html")

            if "audio_link" in row:
                event.add_media_link("Audio", row["audio_link"], "text/html")

            yield event
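
Note: get_bill_ids is a helper not shown here (and the loop now adds bill rather than the whole description, fixing an apparent typo). Assuming Minnesota-style identifiers such as "S.F. 1234" and "H.F. 567", a plausible regex-based sketch:

    import re

    def get_bill_ids(text):
        # matches variants like "S.F. 1234", "SF1234", "h.f. 567"
        return [
            f"{prefix.replace('.', '').upper()} {number}"
            for prefix, number in re.findall(r"\b([SH]\.?F\.?)\s*(\d+)", text, flags=re.I)
        ]

    print(get_bill_ids("Hearing on S.F. 1234 and H.F. 567"))  # ['SF 1234', 'HF 567']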
Example #6
    def scrape(self, start=None, end=None):
        # explicit dates are assumed to arrive as YYYY-MM-DD, as in the other scrapers here;
        # without else branches, start_date/end_date were undefined whenever args were passed
        if start is None:
            start_date = datetime.datetime.now().strftime(self.date_format)
        else:
            start_date = datetime.datetime.strptime(start, "%Y-%m-%d").strftime(
                self.date_format
            )

        # default to 90 days if no end
        if end is None:
            end_date = datetime.datetime.now() + datetime.timedelta(days=90)
            end_date = end_date.strftime(self.date_format)
        else:
            end_date = datetime.datetime.strptime(end, "%Y-%m-%d").strftime(
                self.date_format
            )
        url = f"https://www.arkleg.state.ar.us/Calendars/Meetings?tbType=&meetingStartDate={start_date}&meetingEndDate={end_date}"

        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for row in page.xpath(
                "//div[@id='meetingBodyWrapper']/div[contains(@class,'row')]"):
            row_class = row.xpath("@class")[0]
            if "tableSectionHeader" in row_class:
                day = row.xpath("div/text()")[0].strip()
                continue

            time = row.xpath(
                "div[contains(@class,'timeRow')]/b/text()")[0].strip()
            if "no meeting" in time.lower() or "cancelled" in time.lower():
                continue

            if "upon adjournment" in time.lower():
                time = "1:00 PM"

            title = row.xpath("div[2]/b")[0].text_content().strip()

            if "call of the chair" in time.lower():
                time = ""
            else:
                times = re.findall(r"\d+:\d+\s*[A|P]M", time)
                time = times[0]

            when = dateutil.parser.parse(f"{day} {time}")
            when = self._tz.localize(when)

            location = row.xpath("div[2]/text()")[1].strip()

            event = Event(
                name=title,
                start_date=when,
                location_name=location,
                description="",
            )
            event.add_source(
                "https://www.arkleg.state.ar.us/Calendars/Meetings")

            if row.xpath(".//a[@aria-label='Agenda']"):
                agenda_url = row.xpath(".//a[@aria-label='Agenda']/@href")[0]
                event.add_document("Agenda",
                                   agenda_url,
                                   media_type="application/pdf")

            if row.xpath(".//a[@aria-label='Play Video']"):
                video_url = row.xpath(
                    ".//a[@aria-label='Play Video']/@href")[0]
                event.add_media_link("Video of Hearing",
                                     video_url,
                                     media_type="text/html")

            if row.xpath(".//a[@aria-label='Referred']"):
                bill_url = row.xpath(".//a[@aria-label='Referred']/@href")[0]
                self.scrape_referred_bills(event, bill_url)

            yield event
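
Note: one regex subtlety fixed above: inside a character class, | is a literal pipe rather than alternation, so the original [A|P]M also matched the string "|M". Compare:

    import re

    print(re.findall(r"\d+:\d+\s*[A|P]M", "3:15 |M"))            # ['3:15 |M'] - the bug
    print(re.findall(r"\d+:\d+\s*[AP]M", "10:00 AM, Room 151"))  # ['10:00 AM']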
Example #7
    def scrape(self, chamber=None):
        url = "https://le.utah.gov/CalServ/CalServ?month={}&year={}"

        year = datetime.datetime.today().year

        for i in range(0, 12):
            page = self.get(url.format(i, year)).json()
            if "days" in page:
                for day_row in page["days"]:
                    for row in day_row["events"]:
                        # ignore 'note', 'housefloor', 'senatefloor'
                        if row["type"] == "meeting":
                            status = "tentative"
                            title = row["desc"]
                            where = row["location"]

                            when = dateutil.parser.parse(
                                f"{day_row['year']}-{str(int(day_row['month'])+1)}-{day_row['day']} {row['time']}"
                            )

                            when = self._tz.localize(when)

                            if "status" in row and row["status"] == "C":
                                status = "cancelled"

                            event = Event(
                                name=title,
                                location_name=where,
                                start_date=when,
                                classification="committee-meeting",
                                status=status,
                            )

                            if "agenda" in row:
                                event.add_document(
                                    "Agenda",
                                    f"{self.base_url}{row['agenda']}",
                                    media_type="text/html",
                                    on_duplicate="ignore",
                                )

                            if "minutes" in row:
                                event.add_document(
                                    "Minutes",
                                    f"{self.base_url}{row['minutes']}",
                                    media_type="text/html",
                                    on_duplicate="ignore",
                                )

                            if "mediaurl" in row:
                                event.add_media_link(
                                    "Media",
                                    f"{self.base_url}{row['mediaurl']}",
                                    media_type="text/html",
                                    on_duplicate="ignore",
                                )
                                if re.findall(r"mtgID=(\d+)", row["mediaurl"]):
                                    hearing_id = re.findall(
                                        r"mtgID=(\d+)", row["mediaurl"])[0]
                                    docs_url = f"https://glen.le.utah.gov/committees/meeting/{hearing_id}/1234"
                                    docs_page = self.get(docs_url).json()
                                    if "meetingMaterials" in docs_page:
                                        for mat in docs_page["meetingMaterials"]:
                                            agenda = event.add_agenda_item(
                                                mat["description"])
                                            event.add_document(
                                                mat["description"],
                                                f"{self.base_url}{mat['docUrl']}",
                                                media_type="application/pdf",
                                                on_duplicate="ignore",
                                            )

                                            for bill_row in re.findall(
                                                    r"(\w{2,3}\d{4})",
                                                    mat["description"]):
                                                agenda.add_bill(bill_row)

                                    # NOTE: The following data appears to be duped on the meetingMaterials endpoint
                                    # but leaving this in place commented out, in case that ever changes.
                                    #
                                    # rather than return an empty object this page just times out if there are no bills
                                    # so don't retry, and pass on failure
                                    # bills_url = f"https://glen.le.utah.gov/agencal/{hearing_id}/1234"
                                    # self.retry_attempts = 0
                                    # try:
                                    #     bills_page = self.get(bills_url, timeout=3).json()
                                    #     if 'agendaitems' in bills_page:
                                    #         for bill_row in bills_page['agendaitems']:

                                    #             agenda = event.add_agenda_item(bill_row['description'])
                                    #             if 'bill' in bill_row:
                                    #                 agenda.add_bill(bill_row['bill'])
                                    #                 print(bill_row)
                                    # except requests.exceptions.ReadTimeout:
                                    #     pass

                                    # then reset the retry attempts to normal for other requests
                                    self.retry_attempts = 3

                            source_url = f"{self.base_url}{row['itemurl']}"
                            event.add_source(source_url)

                            yield event
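
Note: the CalServ API evidently uses 0-indexed months (the loop requests month=0..11, and the date math adds 1 to day_row['month']). Instead of gluing the parts into a string and re-parsing, the date can be built directly, which makes that offset explicit; a sketch assuming row['time'] parses on its own:

    import datetime

    import dateutil.parser

    day_row = {"year": "2024", "month": "0", "day": "15"}  # month is 0-indexed
    row = {"time": "2:30 PM"}

    meeting_date = datetime.date(
        int(day_row["year"]), int(day_row["month"]) + 1, int(day_row["day"])
    )
    meeting_time = dateutil.parser.parse(row["time"]).time()
    print(datetime.datetime.combine(meeting_date, meeting_time))  # 2024-01-15 14:30:00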
Example #8
    def scrape(self):
        today = datetime.datetime.today()

        url = "https://web.wyoleg.gov/LsoService/api/Calendar/Events/{}{}01"

        # this month and the next 2 months
        for add in [0, 1, 2]:
            test_date = today + relativedelta.relativedelta(months=+add)
            month_url = url.format(str(test_date.year),
                                   str(test_date.month).zfill(2))
            page = self.get(month_url).json()
            for row in page:
                if row["meetingKind"] == 2:
                    com = f"{row['meetingType']} {row['committee']['fullName']}"
                    # skip state holidays or other non-committee hearings
                    if com.strip() == "":
                        continue

                    start = parser.parse(row["startDate"])
                    start = self._tz.localize(start)
                    end = parser.parse(row["endTime"])
                    end = self._tz.localize(end)

                    where = row["address1"]

                    if where == "":
                        where = "TBD"

                    desc = row["purpose"]

                    event = Event(
                        name=com,
                        location_name=where,
                        start_date=start,
                        end_date=end,
                        classification="committee-meeting",
                        description=desc,
                    )

                    for media in row["meetingMedias"]:
                        # all these i've seen say they're octet stream but are actually youtube links
                        event.add_media_link(
                            media["documentType"],
                            media["filePath"],
                            "text/html",
                            on_duplicate="ignore",
                        )

                    for doc in row["meetingDocuments"]:
                        event.add_document(
                            doc["title"],
                            f"{self.base_url}{doc['documentUrl']}",
                            on_duplicate="ignore",
                        )

                    for item in row["meetingAgendas"]:
                        self.parse_agenda_item(event, item)

                    bills_agenda_item = None
                    for bill in row["sessionMeetingBills"]:
                        if bills_agenda_item is None:
                            bills_agenda_item = event.add_agenda_item(
                                "Bills under Consideration")
                        bills_agenda_item.add_bill(bill["billNumber"])

                    web_url = "https://www.wyoleg.gov/Calendar/{year}{month}01/Meeting?type=committee&id={meeting_id}"
                    web_url = web_url.format(
                        year=str(test_date.year),
                        month=str(test_date.month).zfill(2),
                        meeting_id=row["id"],
                    )

                    event.add_source(web_url)
                    yield event
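
Note: stepping with relativedelta(months=+add) is the reliable way to walk calendar months, since a plain timedelta cannot represent "one month" and naive day arithmetic overflows short months; relativedelta clamps to the last valid day instead:

    import datetime

    from dateutil import relativedelta

    today = datetime.date(2024, 1, 31)
    for add in [0, 1, 2]:
        d = today + relativedelta.relativedelta(months=+add)
        print(d, "->", d.strftime("%Y%m01"))
    # 2024-01-31 -> 20240101
    # 2024-02-29 -> 20240201   (clamped from the 31st)
    # 2024-03-31 -> 20240301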
Example #9
    def scrape_chamber(self, chamber, session):
        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        if chamber == "upper":
            chamber_abbrev = "S"
        else:
            chamber_abbrev = "H"

        url = (
            "http://www.legis.iowa.gov/committees/meetings/meetingsList"
            f"Chamber?chamber={chamber_abbrev}"
            f"&bDate={start_date.month:02d}/{start_date.day:02d}/{start_date.year}"
            f"&eDate={end_date.month:02d}/{end_date.day:02d}/{end_date.year}"
        )

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        for link in page.xpath(
            "//div[contains(@class, 'meetings')]/table[1]/"
            "tbody/tr[not(contains(@class, 'hidden'))]"
        ):
            # comm, desc, and pretty_name are all assigned unconditionally below
            status = "tentative"

            comm = link.xpath("string(./td[2]/a[1]/span/text())").strip()
            if comm == "":
                comm = link.xpath("string(./td[2]/a[1]/text())").strip()
            desc = comm + " Committee Hearing"

            location = link.xpath("string(./td[3]/span/text())").strip()
            if location == "":
                location = link.xpath("string(./td[3]/text())").strip()

            when = link.xpath("string(./td[1]/span[1]/text())").strip()
            if when == "":
                when = link.xpath("string(./td[1]/text())").strip()

            if "cancelled" in when.lower() or "upon" in when.lower():
                status = "cancelled"
            if "To Be Determined" in when:
                continue

            # sometimes they say cancelled, sometimes they do a red strikethrough
            if link.xpath("./td[1]/span[contains(@style,'line-through')]"):
                status = "cancelled"
            if "cancelled" in link.xpath("@class")[0]:
                status = "cancelled"

            junk = ["Reception"]
            for key in junk:
                when = when.replace(key, "")

            pretty_name = f"{self.chambers[chamber]} {desc}"

            when = re.sub(r"\s+", " ", when).strip()
            if "tbd" in when.lower():
                # OK. This is a partial date of some sort.
                when = datetime.datetime.strptime(when, "%m/%d/%Y TIME - TBD %p")
            else:
                try:
                    when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
                except ValueError:
                    try:
                        when = datetime.datetime.strptime(when, "%m/%d/%Y %I %p")
                    except ValueError:
                        self.warning(f"error parsing timestamp {when} on {pretty_name}")
                        continue

            event = Event(
                name=pretty_name,
                description=desc,
                start_date=self._tz.localize(when),
                location_name=location,
                status=status,
            )

            if link.xpath("td[4]/span/a"):
                video_link = link.xpath("td[4]/span/a/@href")[0]
                event.add_media_link("Video of Hearing", video_link, "text/html")

            if status != "cancelled" and link.xpath('.//a[contains(text(),"Agenda")]'):
                agenda_rows = link.xpath(
                    'following-sibling::tr[1]/td/div[contains(@class,"agenda")]/p'
                )

                for agenda_row in agenda_rows:
                    agenda_text = agenda_row.xpath("string(.)")
                    if agenda_text.strip() != "":
                        agenda = event.add_agenda_item(agenda_text)

                        for bill_row in agenda_row.xpath(
                            './/a[contains(@href, "/BillBook")]/text()'
                        ):
                            agenda.add_bill(bill_row)

            event.add_source(url)
            event.add_participant(comm, note="host", type="committee")

            yield event
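
Note: the nested try/except ValueError chain above works but grows unwieldy as formats accumulate; the usual refactor is a loop over candidate formats (a sketch returning None in place of the scraper's warn-and-continue):

    import datetime

    def parse_first_format(text, formats):
        """Return the first successful strptime result, or None."""
        for fmt in formats:
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    formats = ["%m/%d/%Y %I:%M %p", "%m/%d/%Y %I %p"]
    print(parse_first_format("01/05/2024 1:30 PM", formats))  # 2024-01-05 13:30:00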