Example #1
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        table = response.xpath(
            '//*[@id="page_content_wrapper"]/div/div/div/div[1]/div[3]')

        self._validate_meeting_times(table)
        location = self._parse_location(table)

        meeting_year = None
        for item in table.xpath(".//h3 | .//p"):
            if "<h3" in item.get():
                meeting_year = item.xpath(
                    "descendant-or-self::text()").re_first(r"\d{4}")
                continue
            if "<p>" in item.get():
                split_items = [
                    Selector(text=section)
                    for section in re.split(r"<br>", item.get())
                ]
                for split_string in split_items:
                    meeting_date_match = split_string.re_first(
                        r"([A-Z]\w{2,}\s\d\d?)")
                    if not (meeting_date_match and meeting_year):
                        continue
                    converted_date = self._convert_date(
                        meeting_date_match, meeting_year)

                    meeting = Meeting(
                        title="Commission",
                        description="",
                        classification=COMMISSION,
                        start=self._parse_start(converted_date),
                        end=self._parse_end(converted_date),
                        all_day=self._parse_all_day(item),
                        time_notes="",
                        location=location,
                        links=self._parse_links(split_string),
                        source=self._parse_source(response),
                    )

                    meeting["status"] = self._get_status(meeting)
                    meeting["id"] = self._get_id(meeting)

                    yield meeting
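The `_convert_date` helper referenced above is not shown. A minimal sketch, assuming the match looks like "March 14", `meeting_year` is a four-digit string, and the `datetime` class is imported at module level as in the other snippets (the helper name comes from the snippet; this body is an illustration, not the original):

    def _convert_date(self, date_str, year_str):
        """Combine a 'Month day' match with a year string into a datetime."""
        # e.g. "March 14" + "2021" -> datetime(2021, 3, 14, 0, 0)
        return datetime.strptime(
            "{} {}".format(date_str, year_str), "%B %d %Y")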
Example #2
    def parse(self, response):
        msg = BytesParser(policy=default).parsebytes(response.body)
        attachments = list(msg.iter_attachments())
        pdf_list = [
            a for a in attachments if a.get_content_type() == "application/pdf"
        ]
        # List of tuples of filename, match string
        match_list = []

        for pdf_obj in pdf_list:
            pdf_text = self._parse_pdf_text(pdf_obj.get_payload(decode=True))
            meeting_match = re.search(
                r"Senior Citizens\s+Commission\n.*?(?=\n\n)",
                pdf_text,
                flags=re.I | re.M | re.DOTALL,
            )
            if meeting_match:
                match_list.append(
                    (pdf_obj.get_filename(), meeting_match.group()))

        if len(match_list) == 0:
            raise ValueError("Meeting not found in {} PDFs".format(
                len(pdf_list)))

        for pdf_name, meeting_str in match_list:
            # Pull the year from the PDF's filename
            year_match = re.search(r"\d{4}", pdf_name)
            year_str = None
            if year_match:
                year_str = year_match.group()
            start, end = self._parse_times(meeting_str, year_str)
            if not start:
                continue
            meeting = Meeting(
                title="Senior Citizens Commission",
                description="",
                classification=COMMISSION,
                start=start,
                end=end,
                all_day=False,
                time_notes="",
                location=self._parse_location(meeting_str),
                links=[],
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting, text=meeting_str)
            meeting["id"] = self._get_id(meeting)

            yield meeting
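`_parse_pdf_text` is not shown; one plausible implementation uses pdfminer.six (the method name is from the snippet, the body is an assumption):

    # module-level imports assumed:
    # from io import BytesIO
    # from pdfminer.high_level import extract_text

    def _parse_pdf_text(self, pdf_bytes):
        """Extract plain text from the decoded PDF attachment bytes."""
        return extract_text(BytesIO(pdf_bytes))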
Example #3
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        last_year = datetime.today().replace(year=datetime.today().year - 1)
        for meeting_group in response.css(
                ".page-full-description table[cellspacing]"):
            year_str = meeting_group.xpath(
                "preceding::strong[1]/text()").re_first(r"\d{4}")
            for column in meeting_group.css("td").extract():
                for item_str in re.split(r"\<br\s*\/?\>", column):
                    item = Selector(text=item_str)
                    start = self._parse_start(item, year_str)
                    if start is None or (start < last_year
                                         and not self.settings.getbool(
                                             "CITY_SCRAPERS_ARCHIVE")):
                        continue
                    links = self._parse_links(item, response)
                    detail_links = [
                        link["href"] for link in links
                        if link["href"].endswith(".html")
                        and "postpone" not in link["title"].lower()
                    ]
                    if len(detail_links) > 0:
                        for link in detail_links:
                            yield response.follow(
                                link,
                                callback=self._parse_detail,
                                cb_kwargs={"start": start},
                            )
                        continue
                    meeting = Meeting(
                        title="Commission",
                        description="",
                        classification=COMMISSION,
                        start=start,
                        end=None,
                        time_notes="",
                        all_day=False,
                        location=self.location,
                        source=response.url,
                        links=links,
                    )
                    meeting["id"] = self._get_id(meeting)
                    meeting["status"] = self._get_status(meeting)
                    yield meeting
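Scrapy passes `cb_kwargs` entries to the callback as keyword arguments, so the `_parse_detail` used above receives `start` directly. A hypothetical sketch of such a callback (only the name and the `start` kwarg come from the snippet):

    def _parse_detail(self, response, start):
        """Finish building the Meeting once the detail page loads."""
        meeting = Meeting(
            title="Commission",
            description="",
            classification=COMMISSION,
            start=start,
            end=None,
            all_day=False,
            time_notes="",
            location=self.location,
            links=[],  # detail-page links would be parsed here
            source=response.url,
        )
        meeting["id"] = self._get_id(meeting)
        meeting["status"] = self._get_status(meeting)
        yield meeting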
Example #4
 def _set_meeting_defaults(response):
     return Meeting(
         title='Board of Directors',
         description='',
         classification=BOARD,
         end=None,
         time_notes='',
         all_day=False,
         location={
             'name': 'DEGC, Guardian Building',
             'address': '500 Griswold St, Suite 2200, Detroit, MI 48226',
         },
         links=[],
         source=response.url,
     )
Example #5
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """

        if '122 South Michigan Avenue, 19th Floor' not in response.text:
            raise ValueError('Meeting address has changed')

        upcoming_meetings = self._parse_upcoming_meetings(response)
        past_meetings = self._parse_past_meetings(response)
        links = self._parse_links(response)

        # Create a master dictionary with one key per meeting date; copy so
        # that updating it doesn't mutate upcoming_meetings
        reconciled_meetings = dict(upcoming_meetings)
        only_past_dates = set(past_meetings.keys()).difference(
            upcoming_meetings.keys())
        only_past_meetings = {
            key: value
            for key, value in past_meetings.items() if key in only_past_dates
        }
        reconciled_meetings.update(only_past_meetings)

        for key in sorted(reconciled_meetings.keys(), reverse=True):
            item = reconciled_meetings[key]
            meeting = Meeting(
                title=self._parse_title(item),
                description='',
                classification=BOARD,
                start=self._parse_start(item),
                end=None,
                all_day=False,
                time_notes='',
                location={
                    'name': 'Chicago Lottery Office',
                    'address': '122 South Michigan Avenue, 19th Floor, Chicago, IL 60603',
                },
                source=response.url,
            )
            meeting['id'] = self._get_id(meeting)
            meeting['status'] = self._get_status(meeting, text=item)
            meeting['links'] = links.get(meeting['start'].date(), [])

            yield meeting
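Since `dict.update` overwrites shared keys, the reconciliation above keeps the upcoming entry whenever a date appears in both maps. The same merge can be written in one expression:

        # Past meetings first, so upcoming ones win on shared dates
        reconciled_meetings = {**past_meetings, **upcoming_meetings}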
Example #6
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        page_content = response.css("#content .field-items .field-item")[0]
        bold_text = " ".join(page_content.css("strong *::text").extract())
        year_match = re.search(r"\d{4}(?= Agenda)", bold_text)
        if year_match:
            year_str = year_match.group()
        else:
            year_str = str(datetime.now().year)
        design_review_committees = re.split(r"\<hr.*?\>",
                                            page_content.extract())[1:]
        for committee in design_review_committees:
            committee_item = Selector(text=committee)
            title = self._parse_title(committee_item)
            if not title:
                continue
            location = self._parse_location(committee_item)
            time_str = self._parse_time_str(committee_item)
            for row in committee_item.css(".report tr"):
                month_str = row.css(
                    "td:first-child::text").extract_first().replace(".", "")
                for date_cell in row.css("td:not(:first-child)"):
                    start = self._parse_start(date_cell, year_str, month_str,
                                              time_str)
                    if not start:
                        continue
                    meeting = Meeting(
                        title=title,
                        description="",
                        classification=ADVISORY_COMMITTEE,
                        start=start,
                        end=None,
                        all_day=False,
                        time_notes="",
                        location=location,
                        links=self._parse_links(date_cell, response),
                        source=response.url,
                    )

                    meeting["status"] = self._get_status(meeting)
                    meeting["id"] = self._get_id(meeting)

                    yield meeting
Example #7
    def parse(self, response):
        # location_info describes the normal location of these meetings. If
        # that changes, an error should be thrown.
        location_info = response.xpath('//*[@id="article"]/div/div/div/p').get()
        if self.LOCATION_DESCRIPTION not in location_info:
            raise ValueError("Meeting location has changed")

        # The DOM splits meetings into 2020, 2019, and previous years.
        # Here we combine those separate buckets into one big list:
        meeting_soup = response.xpath('//*[@id="article"]/div/div/div').get()
        # We split the soup by the collapsing accordions
        # The [1:-1] slice is to get rid of the first and last elements of the list,
        # which were not relevant to our needs.
        meetings_split_by_accordions = meeting_soup.split("collapsing-content")[1:-1]

        # This represents all columns in all years containing meetings. Each index
        # represents a column in the DOM.
        all_columns = []
        for year_soup in meetings_split_by_accordions:
            year_columns = year_soup.split("col-lg-4")[1:]
            for year_column in year_columns:
                all_columns.append(year_column)

        # Now we go through each column and split it into meetings:
        meetings = []
        for column in all_columns:
            for meeting in column.split(r"<p><strong>")[1:]:
                meetings.append(meeting)

        for item in meetings:
            meeting = Meeting(
                title=self._parse_title(item),
                description=self._parse_description(item),
                classification=self._parse_classification(item),
                start=self._parse_start(item),
                end=self._parse_end(item),
                all_day=self._parse_all_day(item),
                time_notes=self._parse_time_notes(item),
                location=self._parse_location(item),
                links=self._parse_links(item),
                source=self._parse_source(response),
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting
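Because this spider slices the page into raw HTML strings, each `item` passed to the `_parse_*` helpers is text rather than a selector. A hypothetical `_parse_title` under that assumption (the helper name is from the snippet; the regex-based body is a guess, and it assumes the module-level `re` import used elsewhere):

    def _parse_title(self, item):
        """Pull the bolded heading from a raw meeting fragment."""
        # `item` begins right after the "<p><strong>" marker, so the
        # title runs up to the closing </strong> tag
        title_match = re.search(r"^(.*?)</strong>", item, flags=re.S)
        if title_match:
            return title_match.group(1).strip()
        return ""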
Example #8
    def _parse_event(self, response):
        """
        `_parse_event` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        data = json.loads(
            response.css(
                "script[type='application/ld+json']::text").extract_first())
        detail = {}
        script_str = " ".join(
            response.css(
                "head script[type='text/javascript']::text").extract())
        json_match = re.search(r"(?<=window\.tkf = ).*?(?=;\n)", script_str)
        if json_match:
            detail = json.loads(json_match.group())
            events = (detail.get("bootdata",
                                 {}).get("query",
                                         {}).get("detail",
                                                 {}).get("events", []))
            if len(events) > 0:
                detail = events[0]

        title = self._parse_title(data)
        classification = self._parse_classification(title)
        start = self._parse_dt(data["startDate"])
        links = []
        if classification == BOARD:
            links = self.date_link_map[start.strftime("%B %Y")]
        meeting = Meeting(
            title=title,
            description="",
            classification=classification,
            start=start,
            end=self._parse_dt(data["endDate"]),
            all_day=False,
            time_notes="",
            location=self._parse_location(detail),
            links=links + self._parse_links(detail),
            source=response.url,
        )

        meeting["status"] = self._get_status(meeting, text=data["name"])
        meeting["id"] = self._get_id(meeting)

        yield meeting
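`_parse_dt` converts the JSON-LD `startDate` and `endDate` strings; assuming they are ISO-8601 values, a sketch with `dateutil` (helper name from the snippet, body an assumption):

    # module-level import assumed:
    # from dateutil.parser import isoparse

    def _parse_dt(self, dt_str):
        """Parse an ISO-8601 string into a naive datetime."""
        # JSON-LD dates may carry a UTC offset; drop tzinfo to match
        # the naive datetimes used elsewhere in the spider
        return isoparse(dt_str).replace(tzinfo=None)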
Example #9
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        meetings = response.css(".field-items strong *::text").getall()
        dates = []
        for i, item in enumerate(meetings):
            if "Time" in item:
                times = re.findall(
                    "(1[0-2]|0?[1-9]):([0-5]\\d)\\s*([AaPp][Mm])", item)
                start_time = times[0]
                end_time = times[1]
            if "Location" in item:
                venue = item.split(": ")[1]
                street = meetings[i + 1] + "; " + meetings[i + 2]
                address = {
                    "name": venue,
                    "address": street.replace(u'\xa0', u' ')
                }
            if re.match(
                    "((Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day), "
                    "((Jan|Febr)uary|March|April|May|June|July|August|"
                    "(Septem|Octo|Novem|Decem)ber) "
                    "[1-3]*\\d, "
                    "\\d{4}", item):
                dates.append(item)

        for i, d in enumerate(dates):
            meeting = Meeting(
                title=self._parse_title(response),
                description=self._parse_description(i, response),
                classification=ADVISORY_COMMITTEE,
                start=self._parse_start(d, start_time),
                end=self._parse_end(d, end_time),
                time_notes='',
                all_day=False,
                location=address,
                links=self._parse_links(d),
                source=self._parse_source(response),
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting
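With three capturing groups, `re.findall` returns tuples like `('1', '30', 'PM')`, so `_parse_start` has to stitch a tuple back together with the matched date line. A sketch under that assumption (name from the snippet, body illustrative, module-level `datetime` import assumed):

    def _parse_start(self, date_str, time_tuple):
        """Combine 'Tuesday, March 9, 2021' with ('1', '30', 'PM')."""
        hour, minute, meridiem = time_tuple
        return datetime.strptime(
            "{} {}:{} {}".format(date_str, hour, minute, meridiem.upper()),
            "%A, %B %d, %Y %I:%M %p",
        )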
Example #10
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        self._validate_location(response)
        links_map = self._parse_links(response)

        for meeting in self._parse_upcoming_meetings(response, links_map):
            yield meeting

        last_year = datetime.today().replace(year=datetime.today().year - 1)
        for item in response.xpath(
                "//h2[text()='Agendas']/following-sibling::ul"
                "[not(preceding-sibling::h2[text()='Minutes'])]"
                "/li/p"):
            start = self._parse_start(item)
            if start < last_year and not self.settings.getbool(
                    "CITY_SCRAPERS_ARCHIVE"):
                continue
            meeting = Meeting(
                title=self._parse_title(item),
                description="",
                classification=self._parse_classification(item),
                start=start,
                end=None,
                all_day=False,
                time_notes="Confirm start time with agency.",
                location={
                    "name": "James R. Thompson Center",
                    "address": "100 W. Randolph St., Room 16-503, Chicago, Illinois",
                },
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            meeting_type = self._parse_meeting_type(meeting["title"])
            meeting["links"] = links_map.get(
                (meeting["start"].date(), meeting_type), [])

            yield meeting
Example #11
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        meeting_types = [
            "admin-opp-committee-meeting",
            "audit-committee",
            "board-meeting",
        ]

        data = json.loads(response.text)
        for item in data:
            if item.get("category") != [] and item.get(
                    "category")[0] in meeting_types:
                title, dt_time = self._parse_title_time(item["title"])
                start = self._parse_dt_time(
                    self._parse_datetime(item["start"]), dt_time)
                end = self._parse_dt_time(self._parse_datetime(item["end"]),
                                          dt_time)
                if end <= start or end.day != start.day:
                    end = None
                meeting = Meeting(
                    title=title,
                    description="",
                    classification=self._parse_classification(
                        item.get("category")[0]),
                    start=start,
                    end=end,
                    time_notes="",
                    all_day=False,
                    source=self._parse_source(item),
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)

                # Request each relevant event page, including current data in meta attr
                req = scrapy.Request(
                    item["url"],
                    callback=self._parse_event,
                    dont_filter=True,
                )
                req.meta["meeting"] = meeting
                req.meta["category"] = item["category"]
                yield req
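Data stored on `req.meta` travels with the request, so the detail callback can pull the partially built Meeting back out, finish it, and yield it. A sketch of what `_parse_event` might do here (the callback name and meta keys are from the snippet; the link selector is a placeholder):

    def _parse_event(self, response):
        """Finish the Meeting started in `parse` with detail-page links."""
        meeting = response.meta["meeting"]
        meeting["links"] = [
            {
                "title": link.css("*::text").extract_first("").strip(),
                "href": response.urljoin(link.attrib["href"]),
            }
            for link in response.css(".post-content a[href]")
        ]
        yield meeting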
Example #12
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        page_content = response.css("#content .field-items .field-item")[0]
        bold_text = " ".join(page_content.css("strong *::text").extract())
        year_match = re.search(r"\d{4}(?= Agenda)", bold_text)
        if year_match:
            year_str = year_match.group()
        else:
            raise ValueError("Year not found")

        content = scrapy.Selector(
            text=" ".join(re.split(r"\<hr.*?\>", page_content.extract())[:2])
        )
        self._validate_start_time(content)
        self._validate_location(content)

        for row in content.css(".report tr"):
            month_str = row.css("td:first-child::text").extract_first().replace(".", "")
            for date_cell in row.css("td:not(:first-child)"):
                start = self._parse_start(date_cell, year_str, month_str)
                if not start:
                    continue
                meeting = Meeting(
                    title="City Planning Commission",
                    description="",
                    classification=COMMISSION,
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="",
                    location=self.location,
                    links=self._parse_links(date_cell, response),
                    source=response.url,
                )

                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)

                yield meeting
Example #13
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        self._validate_location(response)

        # Only parse the first few sections to avoid returning the whole archive each time
        for header in response.css(".page-full-description h3")[:3]:
            header_text = header.css("*::text").extract_first()
            if "Schedule" not in header_text:
                continue
            year_str = re.search(r"\d{4}", header_text).group()
            for column in header.xpath("./following-sibling::table[1]").css(
                    "td"):
                # Use the immediate child p element instead of the td el if it exists
                if len(column.css("p")) > 0:
                    column = column.css("p")[0]
                # Because the markup is irregular and based on br tags, split the HTML content on br
                # tags and then create separate selectors for each one
                column_str = column.extract()
                if isinstance(column_str, list):
                    column_str = " ".join(column_str)
                for item_str in re.split(r"\<br[\s\/]*?\>", column_str):
                    item = Selector(text=re.sub(r"\s+", " ", item_str).strip())
                    start = self._parse_start(item, year_str)
                    if not start:
                        continue
                    meeting = Meeting(
                        title="Commission",
                        description="",
                        classification=COMMISSION,
                        start=start,
                        end=None,
                        time_notes="See details to confirm time",
                        all_day=False,
                        location=self.location,
                        links=self._parse_links(item, response),
                        source=response.url,
                    )
                    meeting["id"] = self._get_id(meeting)
                    meeting["status"] = self._get_status(meeting,
                                                         text=item_str)
                    yield meeting
Example #14
 def _parse_calendar(self, response):
     """Parse items on the main calendar page"""
     items = []
     for item in response.css('.day-with-date:not(.no-events), .current-day:not(.no-events)'):
         title = self._parse_title(item)
         description = self._parse_description(item)
         items.append(
             Meeting(
                 title=title,
                 description=description,
                 classification=self._parse_classification(title),
                 all_day=False,
                 links=[],
                 source=self._parse_source(item, response.url),
             )
         )
     return items
Example #15
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        # TODO: Committees
        year_str = None  # set once a year heading is found
        for item in response.css(".page-full-description .col-xs-12 > *"):
            if item.css("strong"):
                year_match = re.search(
                    r"\d{4}",
                    item.css("strong::text").extract_first())
                if year_match:
                    year_str = year_match.group()
            date_map = {}
            active_key = None
            for content in item.css("td::text, td a"):
                if isinstance(content.root, str) and content.root.strip():
                    date_map[content.root] = []
                    active_key = content.root
                elif active_key:
                    date_map[active_key].append(content)

            for date_str, links in date_map.items():
                start = self._parse_start(date_str, year_str)
                if not start:
                    continue
                meeting = Meeting(
                    title="Board of Directors",
                    description="",
                    # TODO: Figure out committees
                    classification=BOARD,
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="See agenda to confirm time",
                    location=self.location,
                    links=self._parse_links(links, response),
                    source=response.url,
                )

                meeting["status"] = self._get_status(meeting, text=date_str)
                meeting["id"] = self._get_id(meeting)

                yield meeting
Example #16
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_id`, `_parse_name`, etc methods to fit your scraping
        needs.
        """

        # Address of Pace Headquarters. Where all meetings seem to be held
        hq_address = "550 W. Algonquin Rd., Arlington Heights, IL 60005"

        # Current year of meetings listed
        year = (response.xpath("//th[@class='rowheader']/em/strong/text()").re(
            r'(\d\d\d\d) Meetings')[0].strip())

        # Get rows of meeting table
        meeting_rows = response.xpath(
            "//tr/td[@class='rowy2']/parent::* | "
            "//tr/td[@class='rowl2']/parent::*")

        for item in meeting_rows:
            meeting = Meeting(
                title=self._parse_title(item),
                description="",  # No description
                # classification -- do after based on title
                start=self._parse_start(item, year),
                end=None,  # No end time
                all_day=False,  # Probably not, usually starts in evening
                time_notes=None,
                location=self._parse_location(item, hq_address),
                # links -- do this after based on title and date,
                source=self.start_urls[0],
            )

            # Figure out classification from meeting title
            meeting['classification'] = self._parse_classification(
                title=meeting['title'])

            # Figure out meeting documents from title and date
            meeting['links'] = self._parse_links(title=meeting['title'],
                                                 date=meeting['start'])

            meeting["status"] = self._get_status(
                meeting, text=" ".join(item.css("*::text").extract()))
            meeting["id"] = self._get_id(meeting)

            yield meeting
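`_parse_classification` maps the meeting title onto the shared classification constants. A hypothetical version (the constants come from `city_scrapers_core.constants`; the keyword mapping is a guess):

    def _parse_classification(self, title):
        """Map a meeting title onto a classification constant."""
        if "committee" in title.lower():
            return COMMITTEE
        if "board" in title.lower():
            return BOARD
        return NOT_CLASSIFIED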
Example #17
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        date_link_map = defaultdict(list)

        for link in response.css("#sidebar a"):
            if (
                "agenda" not in link.attrib["href"].lower()
                and "minutes" not in link.attrib["href"].lower()
            ):
                continue
            link_title = (
                "Agenda" if "agenda" in link.attrib["href"].lower() else "Minutes"
            )
            start = self._parse_start(link)
            if not start:
                continue
            date_link_map[start].append(
                {"title": link_title, "href": response.urljoin(link.attrib["href"])}
            )

        start_list = sorted([k for k in date_link_map.keys() if k], reverse=True)
        # Use the most recent 20 meetings
        for start in start_list[:20]:
            meeting = Meeting(
                title="Board of Control",
                description="",
                classification=BOARD,
                start=start,
                end=None,
                all_day=False,
                time_notes="See agenda to confirm details",
                location=self.location,
                links=date_link_map[start],
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting
Example #18
    def _parse_minutes(self, response):
        meeting_titles = []
        meeting_selects = []
        for idx, row in enumerate(response.css(".padding table tr")):
            check_idx = idx + 1
            # Ignore every third row
            if check_idx % 3 == 0:
                continue
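            # Rows repeat in groups of three (titles, dropdowns, spacer);
            # compress the surviving indices back onto an odd/even pattern
            # so title rows stay odd and dropdown rows stay even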
            if check_idx > 3:
                check_idx = check_idx - math.floor(check_idx / 3)
            if check_idx % 2 == 1:
                meeting_titles.extend([
                    t.strip() for t in row.css("td *::text").extract()
                    if t.strip()
                ])
            elif check_idx % 2 == 0:
                meeting_selects.extend(row.css("select"))
        for title, select in zip(meeting_titles, meeting_selects):
            # Use most recent 3 meetings per category
            for option in select.css("option:not([selected])")[:3]:
                minutes_link = option.attrib["value"]
                if minutes_link == "#":
                    continue
                date_str = option.css("*::text").extract_first().strip()
                start_date = datetime.strptime(date_str, "%m/%d/%Y").date()
                meeting = Meeting(
                    title=title.replace("Faith Based", "Faith-based"),
                    description="",
                    classification=self._parse_classification(title),
                    start=datetime.combine(start_date, time(16)),
                    end=None,
                    all_day=False,
                    time_notes="See meeting source to confirm",
                    location=self.location,
                    links=[{
                        "title": "Minutes",
                        "href": response.urljoin(minutes_link)
                    }],
                    source=response.url,
                )

                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)

                yield meeting
Example #19
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        self._validate_location(response)
        last_parsed_date = ""
        for item in response.css("div#element106 font"):
            """
            The date and times are contained within sibling divs that are identicals, so we have to
            continue the loop and only create the meeting until both date and times have been
            parsed.
            """
            if not last_parsed_date:
                last_parsed_date = self._parse_date(item)
                continue
            else:
                start_and_end = self._parse_time(item)
                if not start_and_end:
                    continue
                start = last_parsed_date + " " + start_and_end[0].strip()
                start = datetime.strptime(start, "%B %d, %Y %I:%M%p")
                end = last_parsed_date + " " + start_and_end[1].strip()
                end = datetime.strptime(end, "%B %d, %Y %I:%M%p")
                last_parsed_date = ""

            meeting = Meeting(
                title="Commission",
                description="",
                classification=COMMISSION,
                start=start,
                end=end,
                all_day=False,
                time_notes="",
                location=self.location,
                links=[],
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting
Example #20
 def parse(self, response):
     for item in self._parse_entries(response):
         meeting = Meeting(
             title=self.meeting_name,
             description=self.description,
             classification=self.classification,
             start=self._parse_start(item),
             end=None,
             time_notes='',
             all_day=False,
             location=self.location,
             links=self._parse_links(item, response),
             source=response.url,
         )
         meeting['id'] = self._get_id(meeting)
         status_str = ' '.join(item.xpath('.//td//text()').extract())
         meeting['status'] = self._get_status(meeting, text=status_str)
         yield meeting
Example #21
 def parse_legistar(self, events):
     for event, _ in events:
         meeting = Meeting(
             title=event["Name"],
             description="",
             classification=BOARD,
             start=self.legistar_start(event),
             end=None,
             time_notes="",
             all_day=False,
             location=self._parse_location(event),
             links=self.legistar_links(event),
             source=self.legistar_source(event),
         )
         meeting["status"] = self._get_status(meeting,
                                              event["Meeting Location"])
         meeting["id"] = self._get_id(meeting)
         yield meeting
Example #22
    def _parse_event(self, response):
        """Parse the event page"""
        meeting = Meeting(
            title=self._parse_title(response),
            description="",
            classification=COMMISSION,
            start=self._parse_start(response),
            end=self._parse_end(response),
            all_day=False,
            time_notes=self._parse_time_notes(response),
            location=self._parse_location(response),
            links=self._parse_links(response),
            source=response.url,
        )

        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        return meeting
Example #23
 def _parse_upcoming_meetings(self, response):
     for item in response.css(".MeetingListFuture .meeting"):
         title = self._parse_title(item)
         meeting = Meeting(
             title=title,
             description="",
             classification=self._parse_classification(title),
             start=self._parse_start(item),
             end=None,
             time_notes="",
             all_day=False,
             location=self._parse_location(item),
             links=self._parse_links(response, item),
             source=self._parse_source(response, item),
         )
         meeting["status"] = self._parse_status(meeting, item)
         meeting["id"] = self.get_id(meeting)
         yield meeting
Example #24
    def _parse_detail(self, response):
        start = self._parse_start(response)
        meeting = Meeting(
            title=self._parse_title(response),
            description="",
            classification=COMMISSION,
            start=start,
            end=self._parse_end(response),
            all_day=False,
            time_notes="",
            location=self._parse_location(response),
            links=self.link_date_map[start.date()],
            source=response.url,
        )

        meeting["status"] = self._get_status(meeting, text="TODO")
        meeting["id"] = self._get_id(meeting)
        yield meeting
Example #25
    def _parse_meeting(self, response):
        meeting = Meeting(
            title=self._parse_title(response),
            description=self._parse_description(response),
            classification=BOARD,
            start=self._parse_start(response),
            end=self._parse_end(response),
            all_day=False,
            time_notes="",
            location=self._parse_location(response),
            links=self._parse_links(response),
            source=response.url,
        )

        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)

        yield meeting
Example #26
    def _parse_detail_api(self, response):
        item = loads(response.text)
        meeting = Meeting(
            title=self._parse_title(item["Event"]),
            description=self._parse_description(item["Event"]),
            classification=self._parse_classification(item),
            start=self._parse_start(item),
            end=self._parse_end(item),
            all_day=self._parse_all_day(item),
            time_notes=self._parse_time_notes(item),
            location=self._parse_location(item),
            links=self._parse_links(item),
            source=self._parse_source(item),
        )

        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
Example #27
 def _parse_event(self, response):
     """Parse the event page."""
     title = self._parse_title(response)
     meeting = Meeting(
         title=title,
         description=self._parse_description(response),
         classification=BOARD,
         start=self._parse_start(response),
         end=self._parse_end(response),
         time_notes="",
         all_day=self._parse_all_day(response),
         location=self._parse_location(response),
         links=self._parse_links(response),
         source=response.url,
     )
     meeting["id"] = self._get_id(meeting)
     meeting["status"] = self._get_status(meeting)
     return meeting
Example #28
 def parse_legistar(self, events):
     for event, _ in events:
         meeting = Meeting(
             title=event['Name'],
             description='',
             classification=BOARD,
             start=self.legistar_start(event),
             end=None,
             time_notes='',
             all_day=False,
             location=self._parse_location(event),
             links=self.legistar_links(event),
             source=self.legistar_source(event),
         )
         meeting['status'] = self._get_status(meeting,
                                              event['Meeting Location'])
         meeting['id'] = self._get_id(meeting)
         yield meeting
Example #29
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """

        # The default location and time aren't listed with each meeting;
        # they appear elsewhere on the page in a paragraph, so they are
        # hard-coded here.

        DEFAULT_LOCATION = [
            "Pittsburgh International Airport",
            "Conference Room A, 4th Flr Mezzanine, Landside Terminal",
            "Pittsburgh International Airport"
        ]
        DEFAULT_TIME = [11, 30, 0]

        print("\n\n\n\n\nBEGIN SPIDER\n\n\n\n")
        # takes page HTML and and parses into date time and location
        datetimeLocationList = self.responseProcessing(response, DEFAULT_LOCATION, DEFAULT_TIME)
        print(datetimeLocationList)
        print("\n\n\n\n\n")
        for dtL in datetimeLocationList:
            print("IN LOOP")
            meeting = Meeting(
                title="Allegheny County Airport Authority Board Meeting",
                description="",
                classification=self._parse_classification(dtL),
                start=dtL[0],
                end=None,
                all_day=self._parse_all_day(dtL),
                time_notes=self._parse_time_notes(dtL),
                location=self._parse_location(dtL),
                links=self._parse_links(response),
                source=self._parse_source(response),
            )
            # Set status and id with the shared helpers, as the other
            # spiders do
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting
Example #30
    def parse(self, response):
        """
        `parse` should always `yield` Meeting items.

        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
        needs.
        """
        for item in response.css(".meeting-row"):
            agenda_link = item.css(
                "td:last-child a::attr(href)").extract_first()
            if agenda_link:
                agenda_link = agenda_link.replace(
                    "Meetings/ViewMeeting?i", "Documents/ViewAgenda?meetingI")
                pdf_link = re.sub(
                    r"downloadfile",
                    "ViewDocument",
                    item.css("td:last-child a::attr(href)").extract()[-1],
                    flags=re.I,
                )
                yield response.follow(
                    agenda_link,
                    callback=self._parse_detail,
                    cb_kwargs={
                        "links": [{
                            "title": "Agenda",
                            "href": response.urljoin(pdf_link)
                        }]
                    },
                    dont_filter=True,
                )
            else:
                start_str = item.css(
                    "[data-sortable-type='mtgTime']::text").extract_first().strip()
                meeting = Meeting(
                    title="City Council",
                    start=datetime.strptime(start_str, "%m/%d/%Y %I:%M:%S %p"),
                    links=[],
                    source=response.url,
                    **self.meeting_defaults,
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)

                yield meeting
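The `**self.meeting_defaults` unpacking fills in the fields shared by every meeting. A plausible definition of that attribute (the name comes from the snippet; the values, including the placeholder location, are assumptions):

    meeting_defaults = {
        "description": "",
        "classification": CITY_COUNCIL,
        "end": None,
        "all_day": False,
        "time_notes": "",
        "location": {"name": "City Hall", "address": ""},  # placeholder
    }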