def parse(self, response):
    """`parse` should always `yield` Meeting items.

    Parses a schedule table in which h3 elements hold a year and the
    following p elements hold <br>-separated meeting dates for that year.
    """
    table = response.xpath(
        '//*[@id="page_content_wrapper"]/div/div/div/div[1]/div[3]')
    self._validate_meeting_times(table)
    location = self._parse_location(table)
    # A year header (h3) applies to every following p until the next h3.
    # Initialize before the loop so a p appearing before any h3 is skipped
    # by the guard below instead of raising NameError.
    meeting_year = None
    for item in table.xpath(".//h3 | .//p"):
        if "h3" in item.get():
            meeting_year = item.xpath(
                "descendant-or-self::text()").re_first(r"\d{4}")
            continue
        if "<p>" in item.get():
            # Each p contains several dates separated by <br> tags
            split_items = [
                Selector(text=section)
                for section in re.split(r"<br>", item.get())
            ]
            for split_string in split_items:
                meeting_date_match = split_string.re_first(
                    r"([A-Z]\w{2,}\s\d\d?)")
                # Skip fragments without a date or before any year header
                if not (meeting_date_match and meeting_year):
                    continue
                converted_date = self._convert_date(
                    meeting_date_match, meeting_year)
                meeting = Meeting(
                    title="Commission",
                    description="",
                    classification=COMMISSION,
                    start=self._parse_start(converted_date),
                    end=self._parse_end(converted_date),
                    all_day=self._parse_all_day(item),
                    time_notes="",
                    location=location,
                    links=self._parse_links(split_string),
                    source=self._parse_source(response),
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)
                yield meeting
def parse(self, response):
    """Yield Meeting items scraped from PDF email attachments.

    The response body is a MIME email message; each PDF attachment is
    searched for a Senior Citizens Commission meeting notice.
    """
    msg = BytesParser(policy=default).parsebytes(response.body)
    attachments = list(msg.iter_attachments())
    pdf_list = [
        a for a in attachments if a.get_content_type() == "application/pdf"
    ]
    # List of tuples of (filename, matched meeting text)
    match_list = []
    for pdf_obj in pdf_list:
        pdf_text = self._parse_pdf_text(pdf_obj.get_payload(decode=True))
        meeting_match = re.search(
            r"Senior Citizens\s+Commission\n.*?(?=\n\n)",
            pdf_text,
            flags=re.I | re.M | re.DOTALL,
        )
        if meeting_match:
            match_list.append(
                (pdf_obj.get_filename(), meeting_match.group()))
    if len(match_list) == 0:
        raise ValueError("Meeting not found in {} PDFs".format(
            len(pdf_list)))
    for pdf_name, meeting_str in match_list:
        # Pull the year from this match's own filename; previously this
        # always read pdf_list[0], giving the wrong year for later PDFs
        year_match = re.search(r"\d{4}", pdf_name)
        year_str = None
        if year_match:
            year_str = year_match.group()
        start, end = self._parse_times(meeting_str, year_str)
        if not start:
            # Skip only this match instead of aborting all remaining PDFs
            continue
        meeting = Meeting(
            title="Senior Citizens Commission",
            description="",
            classification=COMMISSION,
            start=start,
            end=end,
            all_day=False,
            time_notes="",
            location=self._parse_location(meeting_str),
            links=[],
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting, text=meeting_str)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    # Cutoff for ignoring old meetings unless archiving is enabled
    last_year = datetime.today().replace(year=datetime.today().year - 1)
    for meeting_group in response.css(
            ".page-full-description table[cellspacing]"):
        # The year for each table is in the nearest preceding bold text
        year_str = meeting_group.xpath(
            "preceding::strong[1]/text()").re_first(r"\d{4}")
        for column in meeting_group.css("td").extract():
            # Cells hold multiple entries separated by <br> tags
            for item_str in re.split(r"\<br\s*\/?\>", column):
                item = Selector(text=item_str)
                start = self._parse_start(item, year_str)
                # Skip unparseable dates and old meetings (unless archiving)
                if start is None or (start < last_year
                                     and not self.settings.getbool(
                                         "CITY_SCRAPERS_ARCHIVE")):
                    continue
                links = self._parse_links(item, response)
                # Detail pages are .html links that aren't postponements
                detail_links = [
                    link["href"] for link in links
                    if link["href"].endswith(".html")
                    and "postpone" not in link["title"].lower()
                ]
                if len(detail_links) > 0:
                    # Follow detail pages instead of yielding directly
                    for link in detail_links:
                        yield response.follow(
                            link,
                            callback=self._parse_detail,
                            cb_kwargs={"start": start},
                        )
                    continue
                meeting = Meeting(
                    title="Commission",
                    description="",
                    classification=COMMISSION,
                    start=start,
                    end=None,
                    time_notes="",
                    all_day=False,
                    location=self.location,
                    source=response.url,
                    links=links,
                )
                meeting["id"] = self._get_id(meeting)
                meeting["status"] = self._get_status(meeting)
                yield meeting
def _set_meeting_defaults(response):
    """Return a Meeting pre-filled with the fields shared by every DEGC
    Board of Directors meeting; start/status/id are filled in elsewhere."""
    shared_fields = {
        'title': 'Board of Directors',
        'description': '',
        'classification': BOARD,
        'end': None,
        'time_notes': '',
        'all_day': False,
        'location': {
            'name': 'DEGC, Guardian Building',
            'address': '500 Griswold St, Suite 2200, Detroit, MI 48226',
        },
        'links': [],
        'source': response.url,
    }
    return Meeting(**shared_fields)
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    # Scraper assumes a fixed meeting address; fail loudly if it changes
    if '122 South Michigan Avenue, 19th Floor' not in response.text:
        raise ValueError('Meeting address has changed')
    upcoming_meetings = self._parse_upcoming_meetings(response)
    past_meetings = self._parse_past_meetings(response)
    links = self._parse_links(response)
    # create a master dictionary with one key per meeting date
    # NOTE(review): this aliases (does not copy) `upcoming_meetings`, so
    # the update below mutates it too — harmless here since it isn't
    # reused afterwards
    reconciled_meetings = upcoming_meetings
    # Merge in dates only present in past meetings; when a date appears
    # in both, the upcoming entry wins
    only_past_dates = set(past_meetings.keys()).difference(
        upcoming_meetings.keys())
    only_past_meetings = {
        key: value
        for key, value in past_meetings.items() if key in only_past_dates
    }
    reconciled_meetings.update(only_past_meetings)
    # Yield newest meetings first
    for key in sorted(reconciled_meetings.keys(), reverse=True):
        item = reconciled_meetings[key]
        meeting = Meeting(
            title=self._parse_title(item),
            description='',
            classification=BOARD,
            start=self._parse_start(item),
            end=None,
            all_day=False,
            time_notes='',
            location={
                'name': 'Chicago Lottery Office',
                'address':
                    '122 South Michigan Avenue, 19th Floor, Chicago, IL 60603'
            },
            source=response.url,
        )
        meeting['id'] = self._get_id(meeting)
        meeting['status'] = self._get_status(meeting, text=item)
        # Attach documents by matching the meeting's date
        meeting['links'] = links.get(meeting['start'].date(), [])
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    page_content = response.css("#content .field-items .field-item")[0]
    # The schedule year appears in bold text like "2020 Agenda"
    bold_text = " ".join(page_content.css("strong *::text").extract())
    year_match = re.search(r"\d{4}(?= Agenda)", bold_text)
    if year_match:
        year_str = year_match.group()
    else:
        # Fall back to the current year when none is listed
        year_str = str(datetime.now().year)
    # Committee sections are separated by <hr> tags; [1:] drops the
    # page intro before the first <hr>
    design_review_committees = re.split(r"\<hr.*?\>",
                                        page_content.extract())[1:]
    for committee in design_review_committees:
        committee_item = Selector(text=committee)
        title = self._parse_title(committee_item)
        if not title:
            continue
        location = self._parse_location(committee_item)
        time_str = self._parse_time_str(committee_item)
        # Each schedule row starts with an abbreviated month name (the
        # trailing period is stripped)
        for row in committee_item.css(".report tr"):
            month_str = row.css(
                "td:first-child::text").extract_first().replace(".", "")
            for date_cell in row.css("td:not(:first-child)"):
                start = self._parse_start(date_cell, year_str, month_str,
                                          time_str)
                if not start:
                    continue
                meeting = Meeting(
                    title=title,
                    description="",
                    classification=ADVISORY_COMMITTEE,
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="",
                    location=location,
                    links=self._parse_links(date_cell, response),
                    source=response.url,
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)
                yield meeting
def parse(self, response):
    """Yield Meeting items parsed from the article's accordion markup.

    The markup is irregular, so the article HTML is split on known marker
    strings rather than parsed with selectors.
    """
    # location_info describes the normal location of these meetings. If
    # that changes, an error should be thrown.
    location_info = response.xpath('//*[@id="article"]/div/div/div/p').get()
    if self.LOCATION_DESCRIPTION not in location_info:
        raise ValueError("Meeting location has changed")
    # The DOM splits meetings into 2020, 2019, and previous years.
    # Here we combine those separate buckets into one big list:
    meeting_soup = response.xpath('//*[@id="article"]/div/div/div').get()
    # We split the soup by the collapsing accordions
    # The [1:-1] slice is to get rid of the first and last elements of the list,
    # which were not relevant to our needs.
    meetings_split_by_accordions = meeting_soup.split("collapsing-content")[1:-1]
    # This represents all columns in all years containing meetings. Each index
    # represents a column in the DOM.
    all_columns = []
    for year_soup in meetings_split_by_accordions:
        year_columns = year_soup.split("col-lg-4")[1:]
        for year_column in year_columns:
            all_columns.append(year_column)
    # Now we go through each column and split it into meetings:
    # (each meeting fragment begins at a <p><strong> heading)
    meetings = []
    for column in all_columns:
        for meeting in column.split(r"<p><strong>")[1:]:
            meetings.append(meeting)
    # `item` is a raw HTML fragment string; the _parse_* helpers operate
    # on that text
    for item in meetings:
        meeting = Meeting(
            title=self._parse_title(item),
            description=self._parse_description(item),
            classification=self._parse_classification(item),
            start=self._parse_start(item),
            end=self._parse_end(item),
            all_day=self._parse_all_day(item),
            time_notes=self._parse_time_notes(item),
            location=self._parse_location(item),
            links=self._parse_links(item),
            source=self._parse_source(response),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_event(self, response):
    """
    `_parse_calendar` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    # Structured event data is embedded as JSON-LD in the page head
    data = json.loads(
        response.css(
            "script[type='application/ld+json']::text").extract_first())
    detail = {}
    # Extra detail (location, links) lives in a JS bootstrap object
    # assigned to window.tkf in an inline script
    script_str = " ".join(
        response.css(
            "head script[type='text/javascript']::text").extract())
    json_match = re.search(r"(?<=window\.tkf = ).*?(?=;\n)", script_str)
    if json_match:
        detail = json.loads(json_match.group())
    # Drill into the bootstrap object; every .get defaults so a missing
    # level yields an empty events list
    events = (detail.get("bootdata", {}).get("query",
                                             {}).get("detail",
                                                     {}).get("events", []))
    if len(events) > 0:
        detail = events[0]
    title = self._parse_title(data)
    classification = self._parse_classification(title)
    start = self._parse_dt(data["startDate"])
    links = []
    # Board meetings get pre-scraped documents keyed by "Month Year"
    if classification == BOARD:
        links = self.date_link_map[start.strftime("%B %Y")]
    meeting = Meeting(
        title=title,
        description="",
        classification=classification,
        start=start,
        end=self._parse_dt(data["endDate"]),
        all_day=False,
        time_notes="",
        location=self._parse_location(detail),
        links=links + self._parse_links(detail),
        source=response.url,
    )
    # The event name may carry cancellation text used for status
    meeting["status"] = self._get_status(meeting, text=data["name"])
    meeting["id"] = self._get_id(meeting)
    yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Time, location, and date lines all appear as sibling bold-text
    entries, so a single pass collects each in turn.
    """
    meetings = response.css(".field-items strong *::text").getall()
    dates = []
    # Defaults guard against a page where the Time/Location lines are
    # missing or malformed; previously those cases raised
    # NameError/IndexError when building the Meeting below
    start_time = None
    end_time = None
    address = None
    for i, item in enumerate(meetings):
        if "Time" in item:
            times = re.findall(
                "(1[0-2]|0?[1-9]):([0-5]\\d)\\s*([AaPp][Mm])", item)
            if times:
                start_time = times[0]
                # Only a start time may be listed
                end_time = times[1] if len(times) > 1 else None
        if "Location" in item:
            # "Location: <venue>"; the next two entries are street lines
            venue = item.split(": ")[1]
            street = meetings[i + 1] + "; " + meetings[i + 2]
            address = {
                "name": venue,
                "address": street.replace(u'\xa0', u' ')
            }
        # Full weekday-and-date strings mark individual meetings
        if re.match(
                "((Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day), "
                "((Jan|Febr)uary|March|April|May|June|July|August|"
                "(Septem|Octo|Novem|Decem)ber) "
                "[1-3]*\\d, "
                "\\d{4}", item):
            dates.append(item)
    for i, d in enumerate(dates):
        meeting = Meeting(
            title=self._parse_title(response),
            description=self._parse_description(i, response),
            classification=ADVISORY_COMMITTEE,
            start=self._parse_start(d, start_time),
            end=self._parse_end(d, end_time),
            time_notes='',
            all_day=False,
            location=address,
            links=self._parse_links(d),
            source=self._parse_source(response),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    self._validate_location(response)
    links_map = self._parse_links(response)
    for meeting in self._parse_upcoming_meetings(response, links_map):
        yield meeting
    # Cutoff for skipping old meetings unless archiving is enabled
    last_year = datetime.today().replace(year=datetime.today().year - 1)
    # Agenda list items: the ul after the "Agendas" h2 but before the
    # "Minutes" h2
    for item in response.xpath(
            "//h2[text()='Agendas']/following-sibling::ul"
            "[not(preceding-sibling::h2[text()='Minutes'])]"
            "/li/p"):
        start = self._parse_start(item)
        if start < last_year and not self.settings.getbool(
                "CITY_SCRAPERS_ARCHIVE"):
            continue
        meeting = Meeting(
            title=self._parse_title(item),
            description="",
            classification=self._parse_classification(item),
            start=start,
            end=None,
            all_day=False,
            time_notes="Confirm start time with agency.",
            location={
                "name": "James R. Thompson Center",
                "address":
                    "100 W. Randolph St., Room 16-503, Chicago, Illinois",
            },
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        # Documents are keyed by (date, meeting type) so agendas and
        # minutes for the same body line up with the right meeting
        meeting_type = self._parse_meeting_type(meeting["title"])
        meeting["links"] = links_map.get(
            (meeting["start"].date(), meeting_type), [])
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    meeting_types = [
        "admin-opp-committee-meeting",
        "audit-committee",
        "board-meeting",
    ]
    data = json.loads(response.text)
    for item in data:
        # "category" can be missing or None; guard before indexing.
        # The previous check only compared against [], so a missing key
        # raised TypeError on the [0] index.
        categories = item.get("category") or []
        if not categories or categories[0] not in meeting_types:
            continue
        title, dt_time = self._parse_title_time(item["title"])
        start = self._parse_dt_time(
            self._parse_datetime(item["start"]), dt_time)
        end = self._parse_dt_time(self._parse_datetime(item["end"]),
                                  dt_time)
        # Discard end times that are invalid or roll into the next day
        if end <= start or end.day != start.day:
            end = None
        meeting = Meeting(
            title=title,
            description="",
            classification=self._parse_classification(categories[0]),
            start=start,
            end=end,
            time_notes="",
            all_day=False,
            source=self._parse_source(item),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        # Request each relevant event page, including current data in meta attr
        req = scrapy.Request(
            item["url"],
            callback=self._parse_event,
            dont_filter=True,
        )
        req.meta["meeting"] = meeting
        req.meta["category"] = item["category"]
        yield req
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.

    Raises:
        ValueError: if the schedule year cannot be found on the page.
    """
    page_content = response.css("#content .field-items .field-item")[0]
    # The schedule year appears in bold text like "2020 Agenda"
    bold_text = " ".join(page_content.css("strong *::text").extract())
    year_match = re.search(r"\d{4}(?= Agenda)", bold_text)
    if year_match:
        year_str = year_match.group()
    else:
        # Fail loudly rather than guessing the current year: a wrong year
        # would silently produce incorrect meeting dates
        raise ValueError("Year not found")
    # Only the section before the first <hr> holds the schedule table
    content = scrapy.Selector(
        text=" ".join(re.split(r"\<hr.*?\>", page_content.extract())[:2]))
    self._validate_start_time(content)
    self._validate_location(content)
    # Each row begins with an abbreviated month, followed by date cells
    for row in content.css(".report tr"):
        month_str = row.css(
            "td:first-child::text").extract_first().replace(".", "")
        for date_cell in row.css("td:not(:first-child)"):
            start = self._parse_start(date_cell, year_str, month_str)
            if not start:
                continue
            meeting = Meeting(
                title="City Planning Commission",
                description="",
                classification=COMMISSION,
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=self.location,
                links=self._parse_links(date_cell, response),
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    self._validate_location(response)
    # Only parse the first few sections to avoid returning the whole archive each time
    for header in response.css(".page-full-description h3")[:3]:
        header_text = header.css("*::text").extract_first()
        if "Schedule" not in header_text:
            continue
        # Headers contain the schedule year, e.g. "2020 Meeting Schedule"
        year_str = re.search(r"\d{4}", header_text).group()
        for column in header.xpath("./following-sibling::table[1]").css(
                "td"):
            # Use the immediate child p element instead of the td el if it exists
            if len(column.css("p")) > 0:
                column = column.css("p")[0]
            # Because the markup is irregular and based on br tags, split the HTML content on br
            # tags and then create separate selectors for each one
            column_str = column.extract()
            if isinstance(column_str, list):
                column_str = " ".join(column_str)
            for item_str in re.split(r"\<br[\s\/]*?\>", column_str):
                # Collapse runs of whitespace before building the selector
                item = Selector(text=re.sub(r"\s+", " ", item_str).strip())
                start = self._parse_start(item, year_str)
                if not start:
                    continue
                meeting = Meeting(
                    title="Commission",
                    description="",
                    classification=COMMISSION,
                    start=start,
                    end=None,
                    time_notes="See details to confirm time",
                    all_day=False,
                    location=self.location,
                    links=self._parse_links(item, response),
                    source=response.url,
                )
                meeting["id"] = self._get_id(meeting)
                # Pass the raw fragment so cancellation text is detected
                meeting["status"] = self._get_status(meeting, text=item_str)
                yield meeting
def _parse_calendar(self, response):
    """Return partial Meeting items for each calendar day that has events.

    Remaining fields (start, etc.) are expected to be filled elsewhere.
    """
    day_cells = response.css(
        '.day-with-date:not(.no-events), .current-day:not(.no-events)')
    meetings = []
    for cell in day_cells:
        cell_title = self._parse_title(cell)
        meetings.append(
            Meeting(
                title=cell_title,
                description=self._parse_description(cell),
                classification=self._parse_classification(cell_title),
                all_day=False,
                links=[],
                source=self._parse_source(cell, response.url),
            ))
    return meetings
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    # TODO: Committees
    # Year headers are bolded and apply to everything that follows them.
    # Initialize before the loop so content appearing before the first
    # header is skipped instead of raising NameError.
    year_str = None
    for item in response.css(".page-full-description .col-xs-12 > *"):
        if item.css("strong"):
            year_match = re.search(
                r"\d{4}", item.css("strong::text").extract_first())
            if year_match:
                year_str = year_match.group()
        # Map each date string in the table cells to the links after it
        date_map = {}
        active_key = None
        for content in item.css("td::text, td a"):
            if isinstance(content.root, str) and content.root.strip():
                date_map[content.root] = []
                active_key = content.root
            elif active_key is not None:
                # Ignore content that appears before any date text
                # (previously this raised KeyError on a None key)
                date_map[active_key].append(content)
        for date_str, links in date_map.items():
            # Cannot build a start datetime without a year
            if not year_str:
                continue
            start = self._parse_start(date_str, year_str)
            if not start:
                continue
            meeting = Meeting(
                title="Board of Directors",
                description="",
                # TODO: Figure out committees
                classification=BOARD,
                start=start,
                end=None,
                all_day=False,
                time_notes="See agenda to confirm time",
                location=self.location,
                links=self._parse_links(links, response),
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting, text=date_str)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def parse(self, response): """ `parse` should always `yield` Meeting items. Change the `_parse_id`, `_parse_name`, etc methods to fit your scraping needs. """ # Address of Pace Headquarters. Where all meetings seem to be held hq_address = "550 W. Algonquin Rd., Arlington Heights, IL 60005" # Current year of meetings listed year = (response.xpath("//th[@class='rowheader']/em/strong/text()").re( r'(\d\d\d\d) Meetings')[0].strip()) # Get rows of meeting table meeting_rows = response.xpath("//tr/td[@class='rowy2']/parent::* | \ //tr/td[@class='rowl2']/parent::*") for item in meeting_rows: meeting = Meeting( title=self._parse_title(item), description="", # No description # classification -- do after based on title start=self._parse_start(item, year), end=None, # No end time all_day=False, # Probably not, usually starts in evening time_notes=None, location=self._parse_location(item, hq_address), # links -- do this after based on title and date, source=self.start_urls[0], ) # Figure out classification from meeting title meeting['classification'] = self._parse_classification( title=meeting['title']) # Figure out meeting documents from title and date meeting['links'] = self._parse_links(title=meeting['title'], date=meeting['start']) meeting["status"] = self._get_status( meeting, text=" ".join(item.css("*::text").extract())) meeting["id"] = self._get_id(meeting) yield meeting
def parse(self, response):
    """Yield Meetings for the 20 most recent Board of Control meetings,
    grouping the sidebar's agenda and minutes links by meeting date."""
    date_link_map = defaultdict(list)
    for link in response.css("#sidebar a"):
        href_lower = link.attrib["href"].lower()
        is_agenda = "agenda" in href_lower
        # Only agenda and minutes documents are relevant
        if not is_agenda and "minutes" not in href_lower:
            continue
        start = self._parse_start(link)
        if not start:
            continue
        date_link_map[start].append({
            "title": "Agenda" if is_agenda else "Minutes",
            "href": response.urljoin(link.attrib["href"]),
        })
    # Use the most recent 20 meetings
    recent_starts = sorted((k for k in date_link_map.keys() if k),
                           reverse=True)[:20]
    for start in recent_starts:
        meeting = Meeting(
            title="Board of Control",
            description="",
            classification=BOARD,
            start=start,
            end=None,
            all_day=False,
            time_notes="See agenda to confirm details",
            location=self.location,
            links=date_link_map[start],
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_minutes(self, response):
    """Yield Meetings from the minutes table.

    The table repeats a three-row pattern per category: a title row, a
    row of <select> dropdowns holding dated minutes links, and a spacer
    row that is skipped.
    """
    meeting_titles = []
    meeting_selects = []
    for idx, row in enumerate(response.css(".padding table tr")):
        check_idx = idx + 1
        # Ignore every third row
        if check_idx % 3 == 0:
            continue
        # Re-index the remaining rows so title rows come out odd and
        # select rows come out even
        if check_idx > 3:
            check_idx = check_idx - math.floor(check_idx / 3)
        if check_idx % 2 == 1:
            meeting_titles.extend([
                t.strip() for t in row.css("td *::text").extract()
                if t.strip()
            ])
        elif check_idx % 2 == 0:
            meeting_selects.extend(row.css("select"))
    # Titles and their dropdowns pair up positionally
    for title, select in zip(meeting_titles, meeting_selects):
        # Use most recent 3 meetings per category
        for option in select.css("option:not([selected])")[:3]:
            minutes_link = option.attrib["value"]
            # "#" marks a placeholder option with no document
            if minutes_link == "#":
                continue
            date_str = option.css("*::text").extract_first().strip()
            start_date = datetime.strptime(date_str, "%m/%d/%Y").date()
            meeting = Meeting(
                title=title.replace("Faith Based", "Faith-based"),
                description="",
                classification=self._parse_classification(title),
                # Dropdowns list only dates; 4 PM start per time_notes
                start=datetime.combine(start_date, time(16)),
                end=None,
                all_day=False,
                time_notes="See meeting source to confirm",
                location=self.location,
                links=[{
                    "title": "Minutes",
                    "href": response.urljoin(minutes_link)
                }],
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    self._validate_location(response)
    last_parsed_date = ""
    for item in response.css("div#element106 font"):
        # The date and times are contained within sibling divs that are
        # identical, so we continue the loop and only create the meeting
        # once both the date and the times have been parsed. (This note
        # was previously a no-op string literal inside the loop body; a
        # comment avoids creating a throwaway object each iteration.)
        if not last_parsed_date:
            last_parsed_date = self._parse_date(item)
            continue
        else:
            start_and_end = self._parse_time(item)
            if not start_and_end:
                continue
            start = last_parsed_date + " " + start_and_end[0].strip()
            start = datetime.strptime(start, "%B %d, %Y %I:%M%p")
            end = last_parsed_date + " " + start_and_end[1].strip()
            end = datetime.strptime(end, "%B %d, %Y %I:%M%p")
            # Reset so the next element is treated as a date again
            last_parsed_date = ""
        meeting = Meeting(
            title="Commission",
            description="",
            classification=COMMISSION,
            start=start,
            end=end,
            all_day=False,
            time_notes="",
            location=self.location,
            links=[],
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """Yield one Meeting per entry returned by `_parse_entries`."""
    for entry in self._parse_entries(response):
        # Cancellation/reschedule notes live in the row's table cells
        cell_text = ' '.join(entry.xpath('.//td//text()').extract())
        meeting = Meeting(
            title=self.meeting_name,
            description=self.description,
            classification=self.classification,
            start=self._parse_start(entry),
            end=None,
            time_notes='',
            all_day=False,
            location=self.location,
            links=self._parse_links(entry, response),
            source=response.url,
        )
        meeting['id'] = self._get_id(meeting)
        meeting['status'] = self._get_status(meeting, text=cell_text)
        yield meeting
def parse_legistar(self, events):
    """Convert Legistar event records into Meeting items."""
    for event, _ in events:
        meeting = Meeting(
            title=event["Name"],
            description="",
            classification=BOARD,
            start=self.legistar_start(event),
            end=None,
            time_notes="",
            all_day=False,
            location=self._parse_location(event),
            links=self.legistar_links(event),
            source=self.legistar_source(event),
        )
        # The raw location string often carries cancellation notices
        location_text = event["Meeting Location"]
        meeting["status"] = self._get_status(meeting, location_text)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_event(self, response):
    """Build a single commission Meeting from an event detail page."""
    meeting_fields = dict(
        title=self._parse_title(response),
        description="",
        classification=COMMISSION,
        start=self._parse_start(response),
        end=self._parse_end(response),
        all_day=False,
        time_notes=self._parse_time_notes(response),
        location=self._parse_location(response),
        links=self._parse_links(response),
        source=response.url,
    )
    meeting = Meeting(**meeting_fields)
    meeting["status"] = self._get_status(meeting)
    meeting["id"] = self._get_id(meeting)
    return meeting
def _parse_upcoming_meetings(self, response):
    """Yield a Meeting for each entry in the upcoming-meetings list."""
    for item in response.css(".MeetingListFuture .meeting"):
        title = self._parse_title(item)
        meeting = Meeting(
            title=title,
            description="",
            classification=self._parse_classification(title),
            start=self._parse_start(item),
            end=None,
            time_notes="",
            all_day=False,
            location=self._parse_location(item),
            links=self._parse_links(response, item),
            source=self._parse_source(response, item),
        )
        meeting["status"] = self._parse_status(meeting, item)
        # Use the shared mixin helper `_get_id`; the previous `get_id`
        # (no underscore) is not defined by city-scrapers spider mixins
        # and would raise AttributeError at runtime
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_detail(self, response):
    """Yield a Meeting from a detail page, attaching documents collected
    earlier for the same date via `self.link_date_map`."""
    start = self._parse_start(response)
    meeting = Meeting(
        title=self._parse_title(response),
        description="",
        classification=COMMISSION,
        start=start,
        end=self._parse_end(response),
        all_day=False,
        time_notes="",
        location=self._parse_location(response),
        links=self.link_date_map[start.date()],
        source=response.url,
    )
    # Derive status from the meeting itself; the previous call passed the
    # leftover placeholder string "TODO" as the status text, which could
    # never contain the cancellation keywords _get_status scans for
    meeting["status"] = self._get_status(meeting)
    meeting["id"] = self._get_id(meeting)
    yield meeting
def _parse_meeting(self, response):
    """Build and yield a board Meeting from an event detail page."""
    meeting_kwargs = {
        "title": self._parse_title(response),
        "description": self._parse_description(response),
        "classification": BOARD,
        "start": self._parse_start(response),
        "end": self._parse_end(response),
        "all_day": False,
        "time_notes": "",
        "location": self._parse_location(response),
        "links": self._parse_links(response),
        "source": response.url,
    }
    meeting = Meeting(**meeting_kwargs)
    meeting["status"] = self._get_status(meeting)
    meeting["id"] = self._get_id(meeting)
    yield meeting
def _parse_detail_api(self, response):
    """Yield a Meeting built from the JSON event-detail API payload."""
    item = loads(response.text)
    # The nested "Event" object holds the title/description source fields
    event_data = item["Event"]
    meeting = Meeting(
        title=self._parse_title(event_data),
        description=self._parse_description(event_data),
        classification=self._parse_classification(item),
        start=self._parse_start(item),
        end=self._parse_end(item),
        all_day=self._parse_all_day(item),
        time_notes=self._parse_time_notes(item),
        location=self._parse_location(item),
        links=self._parse_links(item),
        source=self._parse_source(item),
    )
    meeting["status"] = self._get_status(meeting)
    meeting["id"] = self._get_id(meeting)
    yield meeting
def _parse_event(self, response):
    """Parse a single event page into a board Meeting."""
    # Title is needed twice: as a field and (unused here) by convention
    event_title = self._parse_title(response)
    meeting_fields = dict(
        title=event_title,
        description=self._parse_description(response),
        classification=BOARD,
        start=self._parse_start(response),
        end=self._parse_end(response),
        time_notes="",
        all_day=self._parse_all_day(response),
        location=self._parse_location(response),
        links=self._parse_links(response),
        source=response.url,
    )
    meeting = Meeting(**meeting_fields)
    meeting["id"] = self._get_id(meeting)
    meeting["status"] = self._get_status(meeting)
    return meeting
def parse_legistar(self, events):
    """Yield a board Meeting for each Legistar event tuple."""
    for event, _ in events:
        # Location text may carry cancellation notes used for status
        status_text = event['Meeting Location']
        meeting = Meeting(
            title=event['Name'],
            description='',
            classification=BOARD,
            start=self.legistar_start(event),
            end=None,
            time_notes='',
            all_day=False,
            location=self._parse_location(event),
            links=self.legistar_links(event),
            source=self.legistar_source(event),
        )
        meeting['status'] = self._get_status(meeting, status_text)
        meeting['id'] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    # The default location and start time only appear in a prose
    # paragraph elsewhere on the page, so they are hard-coded here.
    DEFAULT_LOCATION = [
        "Pittsburgh International Airport",
        "Conference Room A, 4th Flr Mezzanine, Landside Terminal",
        "Pittsburgh International Airport"
    ]
    DEFAULT_TIME = [11, 30, 0]
    # Parse the page HTML into (datetime, location) entries
    datetimeLocationList = self.responseProcessing(response,
                                                   DEFAULT_LOCATION,
                                                   DEFAULT_TIME)
    for dtL in datetimeLocationList:
        meeting = Meeting(
            title="Allegheny County Airport Authority Board Meeting",
            description="",
            classification=self._parse_classification(dtL),
            start=dtL[0],
            end=None,
            all_day=self._parse_all_day(dtL),
            time_notes=self._parse_time_notes(dtL),
            location=self._parse_location(dtL),
            links=self._parse_links(response),
            source=self._parse_source(response),
        )
        # status and id are required on every Meeting by the
        # city-scrapers pipeline; they were previously commented out
        # (debug print statements have also been removed)
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your
    scraping needs.
    """
    for item in response.css(".meeting-row"):
        agenda_link = item.css(
            "td:last-child a::attr(href)").extract_first()
        if agenda_link:
            # Rewrite the meeting-view URL into the agenda-view URL
            agenda_link = agenda_link.replace(
                "Meetings/ViewMeeting?i", "Documents/ViewAgenda?meetingI")
            # The last link in the row is the downloadable agenda PDF
            pdf_link = re.sub(
                r"downloadfile",
                "ViewDocument",
                item.css("td:last-child a::attr(href)").extract()[-1],
                flags=re.I,
            )
            # Follow the agenda page; the PDF link is passed along so the
            # detail callback can attach it to the Meeting
            yield response.follow(
                agenda_link,
                callback=self._parse_detail,
                cb_kwargs={
                    "links": [{
                        "title": "Agenda",
                        "href": response.urljoin(pdf_link)
                    }]
                },
                dont_filter=True,
            )
        else:
            # No agenda yet: build the meeting from the listed time alone
            start_str = (item.css("[data-sortable-type='mtgTime']::text").
                         extract_first().strip())
            meeting = Meeting(
                title="City Council",
                start=datetime.strptime(start_str, "%m/%d/%Y %I:%M:%S %p"),
                links=[],
                source=response.url,
                **self.meeting_defaults,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting