def scrape_upper(self):
    """Scrape OK Senate committee meeting notices into Events.

    Yields one Event per dated notice found on the meeting-notices page.
    """
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    text = page.text_content()
    # Everything of interest follows the "MEETING NOTICES" header.
    _, text = text.split("MEETING NOTICES")

    # Dates look like "Monday, January 1, 2018"; pair each date header
    # with the chunk of text that follows it.
    re_date = r"[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}"
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

    for match, data in chunks:
        when = datetime.datetime.strptime(match.group(), "%A, %B %d, %Y")

        # BUG FIX: filter() returns a lazy iterator in Python 3, so the
        # old `lines = filter(None, ...)` made `lines[0]` below raise
        # TypeError; materialize the non-empty lines into a list instead.
        lines = [x.strip() for x in data.splitlines() if x.strip()]

        # e.g. "TIME: 10:30 a.m. \x96 ..." (\x96 is the cp1252 en dash).
        time_ = re.search(r"^\s*TIME:\s+(.+?)\s+\x96", data, re.M).group(1)
        time_ = time_.replace("a.m.", "AM").replace("p.m.", "PM")
        time_ = time.strptime(time_, "%I:%M %p")
        when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)

        # The first non-empty line of the chunk is the meeting title.
        title = lines[0]

        where = re.search(r"^\s*PLACE:\s+(.+)", data, re.M).group(1)
        where = where.strip()

        event = Event(
            name=title,
            start_date=self._tz.localize(when),
            location_name=where,
        )
        event.add_source(url)
        yield event
def scrape_committee_events(self, code, name):
    """Yield Events from a single CT committee's JSON calendar feed."""
    events_url = (
        "http://www.cga.ct.gov/basin/fullcalendar/commevents.php?"
        "comm_code={}".format(code)
    )
    # NOTE(review): verify=False suggests the host's TLS cert is unreliable.
    raw = self.get(events_url, verify=False).text
    listings = json.loads(raw)

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    for info in listings:
        title = info["title"]
        if title is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        if title.startswith("CANCELLED:"):
            self.info(
                "Cancelled event found; it will be skipped: {}".format(title)
            )
            continue

        when = datetime.datetime.strptime(info["start"], timestamp_format)
        # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
        where = "{0} {1}".format(
            info["building"].strip(), info["location"].strip()
        )

        # end_time=self._tz.localize(end),
        event = Event(
            start_date=self._tz.localize(when),
            location_name=where,
            name=title,
            description=title,
        )
        event.add_source(events_url)
        yield event
def scrape_meeting_notice(self, chamber, item, url):
    """Build an Event from one DE committee-meeting JSON record.

    Also fetches the meeting's agenda-items endpoint to attach agenda
    entries and their primary sponsors.
    """
    # Event Name is not provided for all meetings, so fall back to the
    # committee name.
    event_name = str(item["CommitteeName"])

    # MeetingDateTime looks like "04/25/2012 03:00:00 PM".
    # BUG FIX: the old format "%m/%d/%y %I:%M %p" used a two-digit year
    # and omitted seconds, so strptime raised ValueError on records like
    # the documented sample. Try the full format first, then a
    # seconds-less variant in case some records omit them.
    raw_dt = str(item["MeetingDateTime"])
    start_time = None
    for fmt in ("%m/%d/%Y %I:%M:%S %p", "%m/%d/%Y %I:%M %p"):
        try:
            start_time = dt.datetime.strptime(raw_dt, fmt)
            break
        except ValueError:
            continue
    if start_time is None:
        raise ValueError("unparseable MeetingDateTime: {!r}".format(raw_dt))

    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_source(url)
    event.add_committee(name=str(item["CommitteeName"]), id=item["CommitteeId"])

    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item["CommitteeMeetingId"]))
    event.add_source(page_url)

    page_data = self.post(page_url).json()["Data"]
    # Renamed loop variable: the original shadowed the `item` parameter.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row["ItemDescription"]))
        event.add_person(
            name=str(agenda_row["PrimarySponsorShortName"]),
            id=str(agenda_row["PrimarySponsorPersonId"]),
            note="Sponsor",
        )

    yield event
def scrape_lower_event(self, url):
    """Scrape a single Assembly hearing page into an Event."""
    doc = lxml.html.fromstring(self.get(url).content)
    doc.make_links_absolute(url)

    table = doc.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
    meta = table.xpath("tr[1]/td[1]/text()")

    # The committee name shown in #committee_div is injected by JS, so
    # take it from the table instead and strip the trailing chair name.
    com_name = "Assembly " + re.sub(r"\(.*\)", "", meta[0])

    start = self._tz.localize(dateutil.parser.parse(meta[1]))

    event = Event(
        name=com_name,
        start_date=start,
        location_name=meta[2],
    )
    event.add_participant(com_name, type="committee", note="host")
    event.add_source(url)

    bill_links = table.xpath('.//a[contains(@href, "/leg/")]')
    if bill_links:
        agenda = event.add_agenda_item("Bills under Consideration")
        for link in bill_links:
            agenda.add_bill(link.text_content().strip())

    yield event
def scrape(self, chamber=None):
    """Scrape OK House committee meeting notices.

    The page is ASP.NET-driven: GET it once to pick up the viewstate
    fields, then POST back to flip the notices grid into monthly view.
    """
    url = "https://www.okhouse.gov/Committees/MeetingNotices.aspx"
    params = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctl00$ContentPlaceHolder1$RadAjaxPanel1Panel|ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00_FormDecorator1_ClientState": "",
        "ctl00_RadToolTipManager1_ClientState": "",
        "ctl00_mainNav_ClientState": "",
        "ctl00$ContentPlaceHolder1$cbToday": "on",
        "ctl00$ContentPlaceHolder1$cbMonthly": "on",
        "ctl00_ContentPlaceHolder1_dgrdNotices_ClientState": "",
        "__ASYNCPOST": "true",
        "RadAJAXControlID": "ctl00_ContentPlaceHolder1_RadAjaxPanel1",
    }
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    html = self.asp_post(url, page, params)
    page = lxml.html.fromstring(html)

    for row in page.xpath('//tr[contains(@id,"_dgrdNotices_")]'):
        status = "tentative"
        agenda_link = row.xpath('.//a[@id="hlMeetAgenda"]')[0]
        title = agenda_link.xpath("text()")[0].strip()
        agenda_url = agenda_link.xpath("@href")[0]
        location = row.xpath("td[3]")[0].text_content().strip()

        # Swap in a space for each <br/> so date and time don't run together.
        when = row.xpath("td[4]")[0]
        for br in when.xpath(".//br"):
            br.tail = " " + br.tail if br.tail else " "
        when = when.text_content().strip()

        if "cancelled" in when.lower():
            status = "cancelled"
            # BUG FIX: re.IGNORECASE was previously passed positionally,
            # where re.sub expects `count`, so the flag was silently
            # ignored (and replacements were capped). Pass it as flags=.
            when = re.sub("CANCELLED", "", when, flags=re.IGNORECASE)

        when = self._tz.localize(dateutil.parser.parse(when))

        event = Event(
            name=title,
            location_name=location,
            start_date=when,
            classification="committee-meeting",
            status=status,
        )
        event.add_source(url)
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        yield event
def scrape_chamber(self, chamber):
    """Parse the MS legislature's plain-text hearing schedule for one chamber.

    The feed is line-oriented: a weekday header starts a new date, a time
    stamp (or recess marker) starts a new event, and following lines are
    the event's description.
    """
    chamber_abbr = self.chamber_abbrs[chamber]
    event_url = f"http://billstatus.ls.state.ms.us/htms/{chamber_abbr}_sched.htm"
    text = self.get(event_url).text

    day_re = re.compile(
        r"^(MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY)",
        re.IGNORECASE,
    )
    clock_re = re.compile(r"^\d{2}:\d{2}")
    recess_re = re.compile(r"^(BC|AR|AA|TBA)\+")

    when, start_time, room, com, desc = None, None, None, None, None
    for line in text.splitlines():
        if day_re.match(line):
            # New date header.
            day = line.split(" ")[0].strip()
        if clock_re.match(line) or recess_re.match(line):
            # A new event begins; flush the one accumulated so far.
            if when and room and com:
                ev = Event(
                    name=com,
                    start_date=when,
                    location_name=room,
                    classification="committee-meeting",
                    description=desc,
                )
                ev.add_source(event_url)
                yield ev

            start_time, room, com = re.split(r"\s+", line, maxsplit=2)
            # After-recess / after-adjourn markers carry no usable clock
            # time, so leave the time component empty.
            if recess_re.match(line):
                start_time = ""
            com = com.strip()
            when = self._tz.localize(dateutil.parser.parse(f"{day} {start_time}"))
            # Reset so subsequent lines populate this event's description.
            desc = ""
        elif when and room and com:
            if line.strip():
                desc += "\n" + line.strip()

    # Emit the trailing event, which no later header will flush.
    if when and room and com:
        ev = Event(
            name=com,
            start_date=when,
            location_name=room,
            classification="committee-meeting",
            description=desc,
        )
        ev.add_source(event_url)
        yield ev
def event_obj():
    """Return a minimal Event fixture starting now (UTC) at Joe's Place."""
    start = datetime.datetime.utcnow().isoformat().split(".")[0] + "Z"
    e = Event(
        name="get-together",
        start_date=start,
        location_name="Joe's Place",
    )
    e.add_source(url="http://example.com/foobar")
    return e
def upper_parse_agenda_item(self, item):
    """Expand one Senate agenda listing into Events.

    Each API "meeting" response actually lists several meetings of the
    same committee, distinguished by addendumId; only the addendum named
    in `item` is emitted.
    """
    response = self.api_client.get(
        "meeting",
        year=item["agendaId"]["year"],
        agenda_id=item["agendaId"]["number"],
        committee=item["committeeId"]["name"],
    )
    data = response["result"]

    chamber = data["committee"]["committeeId"]["chamber"].title()
    com_code = data["committee"]["committeeId"]["name"]
    com_name = f"{chamber} {com_code}"

    for addendum in data["committee"]["addenda"]["items"]:
        if addendum["addendumId"] != item["addendum"]:
            continue

        meeting = addendum["meeting"]
        when = self._tz.localize(dateutil.parser.parse(meeting["meetingDateTime"]))

        location = meeting["location"]
        if location == "":
            location = "See Committee Site"

        description = meeting["notes"]
        if "canceled" in description.lower():
            continue

        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_participant(com_name, type="committee", note="host")

        # Slugify the committee code for its nysenate.gov page URL.
        slug = (
            com_code.lower().replace("'", "").replace(" ", "-").replace(",", "")
        )
        event.add_source(f"https://www.nysenate.gov/committees/{slug}")

        bills = addendum["bills"]["items"]
        if bills:
            agenda = event.add_agenda_item("Bills under consideration")
            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])

        yield event
def scrape(self, session=None):
    """Scrape ND interim committee meeting summaries, then their calendars."""
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # Derive the session's starting year from jurisdiction metadata.
    for meta in self.jurisdiction.legislative_sessions:
        if meta["identifier"] == session:
            start_year = meta["start_date"][:4]
            self.year = start_year
            break

    url = f"https://www.legis.nd.gov/assembly/{session}-{start_year}/committees/interim/committee-meeting-summary"
    root = lxml.html.fromstring(self.get(url).content)
    root.make_links_absolute(url)

    for table in root.xpath('//table[contains(@class,"views-table")]'):
        com = table.xpath("caption/a")[0].text_content().strip()
        for row in table.xpath("tbody/tr"):
            date_link = row.xpath("td[1]/strong/a")[0]
            event_url = date_link.xpath("@href")[0]

            when = dateutil.parser.parse(
                date_link.xpath("span")[0].text_content().strip()
            )
            when = self._tz.localize(when)
            self.event_months.add(when.strftime("%Y-%m"))

            event = Event(name=com, start_date=when, location_name="See Agenda")
            event.add_source(event_url)

            for link in row.xpath("td[2]//a"):
                link_text = link.text_content().strip()
                # Skip links to the live broadcast.
                if "video.legis" in link_text:
                    continue
                event.add_document(
                    link_text,
                    link.xpath("@href")[0],
                    media_type="application/pdf",
                )

            self.events[event_url] = event

    for year_month in self.event_months:
        self.scrape_calendar(year_month)

    yield from self.events.values()
def scrape(self, start=None, end=None):
    """Scrape the calendar JSON feed between start and end (default: 3 months)."""
    if start is None:
        start = dt.datetime.today()
    else:
        start = dateutil.parser.parse(start)
    if end is None:
        end = start + relativedelta(months=+3)
    else:
        end = dateutil.parser.parse(end)

    start_str = start.strftime("%Y-%m-%d")
    end_str = end.strftime("%Y-%m-%d")

    listing_url = f"{self.base_url}calendar-data?start={start_str}&end={end_str}"
    data = json.loads(self.scraper.get(listing_url).content)

    for item in data:
        name = item["title"].strip()
        lowered = name.lower()
        if "canceled" in lowered:
            continue
        # Floor sessions are not committee events; skip them.
        if "house session" in lowered or "senate session" in lowered:
            continue

        detail_url = f"{self.base_url}{item['url']}"
        when = self._tz.localize(dateutil.parser.parse(item["start"]))

        detail = lxml.html.fromstring(self.scraper.get(detail_url).content)
        location = detail.xpath(
            '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
        )[0].strip()
        agenda_url = detail.xpath(
            '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
        )[0]

        event = Event(
            name=name,
            start_date=when,
            location_name=location,
        )
        event.add_participant(name, type="committee", note="host")
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        event.add_source(detail_url)
        yield event
def scrape(self):
    """Scrape the agenda-body table into Events with agenda items and hosts."""
    page = self.lxmlize(calurl)

    # Skip the header row of the agenda table.
    for row in page.xpath("//table[@class='agenda-body']//tr")[1:]:
        detail_links = row.xpath(".//a[contains(@title,'Committee Details')]")
        if len(detail_links) != 1:
            continue
        who = self.scrape_participants(detail_links[0].attrib["href"])

        cells = row.xpath("./*")
        date = cells[0].text_content().strip()
        cttie = cells[1].text_content().strip()
        chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]

        info = cells[2]
        link = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = link.attrib["href"]
        name = link.text
        time, where = info.xpath("./i/text()")

        what = cells[3].text_content().replace("Items: ", "")
        if "(None)" in what:
            continue
        entries = [x.strip() for x in what.split(";")]

        # The page omits the year, so assume the current one.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        if cttie:
            cttie = cttie.replace("Committee on", "").strip()
            cttie = f"{chamber} {cttie}"
            name = cttie

        event = Event(
            name=name, location_name=where, start_date=self._tz.localize(when)
        )
        event.add_source(calurl)
        event.add_committee(cttie, note="host")
        event.add_document("notice", notice, media_type="application/pdf")

        for entry in entries:
            item = event.add_agenda_item(entry)
            if entry.startswith(("AB", "SB")):
                item.add_bill(entry)

        for participant in who:
            event.add_person(participant["name"])

        yield event
def scrape_chamber(self, chamber):
    """Scrape PA committee meeting schedules for one chamber.

    Yields an Event per meeting row, with agenda items linking the bills
    and committees referenced in the row.
    """
    url = utils.urls["events"][chamber]
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor in an ancestor container.
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name'
        )[0]
        for row in table.xpath("tr"):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()'
            )[0].strip()
            description = (
                row.xpath('td[@class="CMS-MeetingDetail-Agenda"]/div/div')[-1]
                .text_content()
                .strip()
            )
            location = (
                row.xpath('td[@class="CMS-MeetingDetail-Location"]')[0]
                .text_content()
                .strip()
            )
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
            )
            bills = row.xpath('.//a[contains(@href, "billinfo")]')

            try:
                start_date = datetime.datetime.strptime(
                    "{} {}".format(date_string, time_string),
                    "%m/%d/%Y %I:%M %p",
                )
            except ValueError:
                # Unparseable time ("upon adjournment" etc.): skip the table.
                break

            event = Event(
                name=description,
                start_date=self._tz.localize(start_date),
                location_name=location,
            )
            event.add_source(url)

            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # BUG FIX: parse_qs maps each key to a *list* of values,
                    # so formatting the lists directly produced ids like
                    # "['H']['B'] ['123']". Use the first value of each.
                    item.add_bill(
                        "{}{} {}".format(
                            qs["body"][0], qs["type"][0], qs["bn"][0]
                        )
                    )
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs values are lists here too; keep None when the
                    # Code parameter is absent.
                    com_code = qs.get("Code")
                    item.add_committee(
                        # BUG FIX: "[S|H]" also matched a literal "|";
                        # the intended class is just S or H.
                        re.sub(r" \([SH]\)$", "", committee.text),
                        id=com_code[0] if com_code else None,
                    )

            yield event
def scrape_upper(self):
    """Scrape MO Senate hearings.

    The page has no per-event container element, which makes xpath
    awkward; split the raw HTML on <hr /> and parse each fragment.
    """
    listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"
    html = self.get(listing_url).text

    for fragment in html.split("<hr />")[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, "Date:")
        when_time = self.row_content(page, "Time:")
        room = self.row_content(page, "Room:")
        location = "{}, {}".format(
            room, "201 W Capitol Ave, Jefferson City, MO 65101"
        )

        # com = self.row_content(page, 'Committee:')
        # The committee cell also names the chair; keep only the committee.
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        )[0]
        com = com.split(", Senator")[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(when_date, when_time))
        )

        event = Event(start_date=start_date, name=com, location_name=location)
        event.add_source(listing_url)
        event.add_participant(com, type="committee", note="host")

        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            links = bill_table.xpath(self.bill_link_xpath)
            if links:
                agenda_line = bill_table.xpath("string(tr[2])").strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
                agenda_item.add_bill(links[0].strip())
            else:
                agenda_line = bill_table.xpath("string(tr[1])").strip()
                event.add_agenda_item(description=agenda_line)

        yield event
def scrape_cal_page(self, url):
    """Scrape one calendar listing page, recursing into the next page.

    Yields an Event per accordion article; follows the "Upcoming Events"
    link for pagination.
    """
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    for row in page.xpath("//article[contains(@class,'accordion')]"):
        when = dateutil.parser.parse(row.xpath(".//time/@datetime")[0])
        title = row.xpath(
            ".//h3[contains(@class,'heading-link')]/text()"
        )[0].strip()

        description = row.xpath(
            "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
        )[0].text_content()
        # Normalize special characters (en dash, single guillemet) and
        # strip the boilerplate link text.
        description = (description.replace("\n\u2013", " ").replace(
            "\n", " ").replace("\u203a", ""))
        description = description.replace("More about this event", "").strip()

        location = row.xpath(
            "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
        )[0].text_content()

        event = Event(
            name=title,
            description=description,
            start_date=when,
            location_name=location,
        )

        agenda_url = row.xpath(
            ".//a[contains(text(),'More about this event')]/@href"
        )
        if agenda_url != []:
            event.add_document(
                "Details and Agenda", agenda_url[0], media_type="text/html"
            )

        if "committee meeting" in title.lower():
            com_name = title.replace("Committee Meeting", "").strip()
            # BUG FIX: the participant type was misspelled "commitee",
            # so these hosts were not tagged as committees.
            event.add_participant(com_name, type="committee", note="host")

        event.add_source(url)
        yield event

    if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
        next_url = page.xpath(
            "//a[contains(text(), 'Upcoming Events')]/@href"
        )[0]
        yield from self.scrape_cal_page(next_url)
def parse_event(self, row, chamber):
    """Build an Event from one AK meeting XML row.

    Sample rows available at http://www.akleg.gov/apptester.html
    """
    committee_code = row.xpath("string(Sponsor)").strip()
    known = self.COMMITTEES[chamber]
    prefix = self.COMMITTEES_PRETTY[chamber]

    if committee_code in known:
        committee_name = "{} {}".format(prefix, known[committee_code]["name"])
    else:
        committee_name = "{} {}".format(prefix, "MISCELLANEOUS")

    name = "{} {}".format(prefix, row.xpath("string(Title)").strip())
    # If the title is missing, fall back to "<CHAMBER> <COMMITTEE NAME>".
    if name == "":
        name = committee_name

    location = row.xpath("string(Location)").strip()
    # Events with no location all seem to be committee hearings.
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?
    event = Event(start_date=start_date, name=name, location_name=location)
    event.add_source("http://w3.akleg.gov/index.php#tab4")

    if committee_code in known:
        event.add_participant(committee_name, type="committee", note="host")

    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc == "":
            continue
        agenda_item = event.add_agenda_item(description=agenda_desc)
        if item.xpath("BillRoot"):
            # AK bill ids carry runs of extra whitespace; collapse them.
            bill_id = re.sub(r"\s+", " ", item.xpath("string(BillRoot)"))
            agenda_item.add_bill(bill_id)

    yield event
def scrape_chamber(self, chamber):
    """Group CA DB hearing rows by (location, date) and emit one Event each."""
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = (
            self.session.query(CALocation)
            .filter_by(location_code=hearing.location_code)[0]
            .description
        )
        date = self._tz.localize(hearing.hearing_date)

        # The location's first three characters encode the chamber.
        event_chamber = {"Asm": "lower", "Sen": "upper"}[location[0:3]]
        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for (location, date), hearings in grouped_hearings.items():
        # Turn raw bill ids of the form <digits><letters><digits> into
        # display names like "AB 123" (leading digits dropped).
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = [
            "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
            for bill in bill_ids
        ]

        # Every hearing in a group must belong to the same committee.
        msg = "More than one committee meeting at (location, date) %r"
        msg = msg % ((location, date),)
        assert len({hearing.committee_nr for hearing in hearings}) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = "Committee Meeting: " + committee_name
        event = Event(
            name=desc, start_date=date, location_name=committee_name
        )
        for bill_id in bills:
            type_ = "bill" if "B" in bill_id else "resolution"
            item = event.add_agenda_item("consideration")
            item.add_bill(bill_id, note=type_)

        event.add_person(committee_name + " Committee", note="host")
        event.add_source("https://downloads.leginfo.legislature.ca.gov/")
        yield event
def scrape_chamber(self, chamber, session, start, end):
    """Scrape WA committee meetings from the legislature's XML service."""
    page = self.get_xml(start, end)

    for row in xpath(page, "//wa:CommitteeMeeting"):
        if xpath(row, "string(wa:Cancelled)") == "true":
            continue

        agency = xpath(row, "string(wa:Agency)")
        if self.chambers[agency] != chamber:
            continue

        event_date = datetime.datetime.strptime(
            xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S"
        )
        event_date = self._tz.localize(event_date)

        event_com = xpath(row, "string(wa:Committees/" "wa:Committee/wa:LongName)")
        agenda_id = xpath(row, "string(wa:AgendaId)")
        notes = xpath(row, "string(wa:Notes)")

        # The XML also carries a wa:Address, but it seems useless; build
        # the location from its component fields instead.
        location = "{}, {}, {} {}".format(
            xpath(row, "string(wa:Room)"),
            xpath(row, "string(wa:Building)"),
            xpath(row, "string(wa:City)"),
            xpath(row, "string(wa:State)"),
        )

        event = Event(
            name=event_com,
            start_date=event_date,
            location_name=location,
            description=notes,
        )
        event.add_source(
            "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
                agenda_id
            )
        )
        event.add_participant(event_com, type="committee", note="host")
        event.extras["agendaId"] = agenda_id

        self.scrape_agenda_items(agenda_id, event)
        yield event
def parse_div(self, row, chamber, com):
    """Turn one MD hearing-schedule row into an Event with agenda and docs."""
    cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
    title, location, start_date, end_date = self.parse_gcal(cal_link)

    event = Event(
        start_date=start_date,
        end_date=end_date,
        name=title,
        location_name=location,
    )
    event.add_source(
        "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx"
    )

    # Plain agenda items with no attachments.
    for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(description=item.xpath("string(.)").strip())

    # Agenda items that link to a PDF document.
    for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        description = item.xpath("string(.)").strip()
        event.add_agenda_item(description=description)
        event.add_document(
            description,
            item.xpath("@href")[0],
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # Agenda items tied to a specific bill.
    for item in row.xpath(
        './/div[contains(@class,"ItemContainer")]'
        '[./div[@class="col-xs-1 Item"]]'
    ):
        agenda = event.add_agenda_item(
            description=item.xpath("string(.)").strip()
        )
        bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda.add_bill(bill)

    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link(
            "Video of Hearing", video[0].xpath("@href")[0], "text/html"
        )

    # Subcommittee meetings name the subcommittee before a dash in the title.
    host = com
    if "subcommittee" in title.lower():
        host = title.split("-")[0].strip()
    event.add_participant(host, type="committee", note="host")

    yield event
def scrape(self, session=None, chamber=None):
    """Scrape scheduled meetings from AR's pipe-delimited FTP feed."""
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    reader = csv.reader(StringIO(self.get(url).text), delimiter="|")

    LINE_LENGTH = 11
    session_year = int(session[:4])

    for row in reader:
        # Embedded newline characters split a record across csv rows;
        # stitch the pieces back together until the record is complete.
        while len(row) < LINE_LENGTH:
            row += next(reader)

        desc = row[7].strip()
        match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
        if not match:
            continue

        comm = re.sub(r"\s+", " ", match.group(1).strip())
        location = row[5].strip() or "Unknown"
        when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")
        when = self._tz.localize(when)

        # Only assign events to a session if they are in the same year:
        # session metadata overlaps and some end dates are missing, so
        # this is the best option available.
        if session_year != when.year:
            continue

        description = "%s MEETING" % comm
        event = Event(
            name=description,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_source(url)
        event.add_participant(comm, type="committee", note="host")

        # time = row[3].strip()
        # if time in _TIMECODES:
        #     event['notes'] = TIMECODES[time]

        yield event
def scrape_upper(self, session_id):
    """Scrape VA Senate meeting notices from the LIS listing page."""
    list_url = f"https://lis.virginia.gov/cgi-bin/legp604.exe?{session_id}+oth+MTG&{session_id}+oth+MTG"
    page = lxml.html.fromstring(self.get(list_url).content)
    page.make_links_absolute(list_url)

    date = None
    # Note the [td] predicate at the end: some tr-s are empty, so skip them.
    for row in page.xpath("//div[@id='mainC']/center/table/tr[td]"):
        # The date cell is only populated on the first row of each day.
        if row.xpath("td[1]/text()")[0].strip() != "":
            date = row.xpath("td[1]/text()")[0].strip()

        description = row.xpath("td[3]/text()")[0].strip()
        # Data on the House page is better, so only keep Senate rows here.
        if "senate" not in description.lower():
            continue

        time = row.xpath("td[2]/text()")[0].strip()
        status = "tentative"
        # BUG FIX: the old check looked for the uppercase literal
        # "CANCELLED" inside a lowercased string, which can never match,
        # so cancelled meetings were always emitted as tentative.
        if "cancelled" in time.lower():
            status = "cancelled"

        try:
            when = dateutil.parser.parse(f"{date} {time}")
        except dateutil.parser._parser.ParserError:
            # Unparseable time component: fall back to the date alone.
            when = dateutil.parser.parse(date)
        when = self._tz.localize(when)

        # TODO: Post covid figure out how they post locations
        if "virtual" in description.lower():
            location = "Virtual"
        else:
            location = "Unknown"

        event = Event(
            name=description,
            start_date=when,
            classification="committee-meeting",
            location_name=location,
            status=status,
        )
        event.add_source(list_url)
        yield event
def scrape(self, session=None):
    """Scrape NJ committee agendas out of the legislative Access DB dump."""
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # NJ sessions are two-year periods; derive the starting year.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)

    # Keep record of all events
    records = self.access_to_csv("Agendas")
    for record in records:
        if record["Status"] != "Scheduled":
            continue

        # Bill references like "A-1234" / "S1234" embedded in the comments
        # become related bills on the agenda item.
        comments = record["Comments"]
        related_bills = []
        for bill in re.findall(r"(A|S)(-)?(\d{4})", comments):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": comments,
            })

        date_time = "%s %s" % (record["Date"], record["Time"])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record["CommHouse"]]
        except KeyError:
            self.warning("unknown committee code %s, skipping",
                         record["CommHouse"])
            # BUG FIX: without this continue, hr_name stayed unbound and
            # the next line crashed with NameError instead of skipping
            # the record as the warning promises.
            continue

        description = "Meeting of the {}".format(hr_name)
        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record["Location"] or "Statehouse",
        )

        # All related bills hang off a single agenda item.
        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill["bill_id"])

        # Add committee to event
        event.add_committee(hr_name, id=record["CommHouse"], note="host")
        event.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield event