from datetime import datetime

from city_scrapers_core.items import Meeting
from city_scrapers_core.pipelines import MeetingPipeline
from city_scrapers_core.spiders import CityScrapersSpider


def test_meeting_pipeline_sets_end():
    pipeline = MeetingPipeline()
    # A meeting without an end time should get a default end after its start
    meeting = pipeline.process_item(
        Meeting(title="Test", start=datetime.now()),
        CityScrapersSpider(name="test"))
    assert meeting["end"] > meeting["start"]
    # An end time that isn't after the start should also be replaced
    now = datetime.now()
    meeting = pipeline.process_item(
        Meeting(title="Test", start=now, end=now),
        CityScrapersSpider(name="test"))
    assert meeting["end"] > meeting["start"]
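
# For reference, a minimal sketch of the behavior this test exercises,
# assuming the pipeline backfills `end` with a fixed default duration; the
# three-hour offset is an assumption, not necessarily what MeetingPipeline
# actually uses.
from datetime import timedelta


class DefaultEndPipeline:
    """Hypothetical sketch of a pipeline that backfills missing end times."""

    def process_item(self, item, spider):
        # Assumed rule: replace a missing end, or one that isn't after the
        # start, with start plus a fixed default duration
        if item.get("end") is None or item["end"] <= item["start"]:
            item["end"] = item["start"] + timedelta(hours=3)
        return item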
def _parse_meetings(self, response):
    description = ' '.join(
        response.css('#post p')[0].css('*::text').extract())
    if '500 woodward ave' not in description.lower():
        raise ValueError('Meeting location has changed')
    meetings = []
    mtg_cls = self._parse_classification(response)
    meeting_kwargs = {
        'title': self._parse_title(response),
        'description': '',
        'classification': mtg_cls,
        'end': None,
        'all_day': False,
        'time_notes': '',
        'source': response.url,
    }
    default_start_time = None
    for item in response.css('#post table tr:not(:first-child)'):
        start = self._parse_start(item)
        # Set default meeting start time from first meeting
        if default_start_time is None:
            default_start_time = start.time()
        links = self.document_date_map.pop(start.date(), [])
        item_kwargs = {
            **meeting_kwargs,
            'title': self._parse_title(response, item=item),
        }
        meetings.append(
            Meeting(
                **item_kwargs,
                start=start,
                location=self._parse_location(item),
                links=links,
            ))
    # Create meetings for any remaining documents that don't match a
    # scheduled meeting row, using the default start time
    for doc_date, doc_links in self.document_date_map.items():
        meetings.append(
            Meeting(
                **meeting_kwargs,
                start=datetime.combine(doc_date, default_start_time),
                location=self.location,
                links=doc_links,
            ))
    for meeting in meetings:
        meeting['status'] = self._get_status(meeting)
        meeting['id'] = self._get_id(meeting)
        yield meeting
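
# `document_date_map` is built elsewhere in the spider before
# `_parse_meetings` runs; a minimal sketch of how it might be populated,
# assuming document links carry a date in their URLs. The selector and the
# date format here are assumptions.
import re
from collections import defaultdict
from datetime import datetime


def _parse_documents(self, response):
    """Hypothetical sketch: group document links by the date in their URL."""
    self.document_date_map = defaultdict(list)
    for link in response.css("#post a"):
        href = link.attrib.get("href", "")
        # Assumed URL date format, e.g. ".../minutes-1-15-2019.pdf"
        date_match = re.search(r"\d{1,2}-\d{1,2}-\d{4}", href)
        if not date_match:
            continue
        doc_date = datetime.strptime(date_match.group(), "%m-%d-%Y").date()
        self.document_date_map[doc_date].append({
            "href": response.urljoin(href),
            "title": " ".join(link.css("*::text").extract()).strip() or "Document",
        })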
def _prev_meeting(self, response, **kwargs):
    minutes_href = kwargs["minutes_href"]
    dt_object = kwargs["dt_object"]
    if response.status == 200:
        dt_str = " ".join([
            x.strip()
            for x in response.css("div.soi-event-data::text").extract()
        ])
        start = self._parse_start(dt_str)
        meeting = Meeting(
            title=self._parse_title(response),
            description="",
            classification=BOARD,
            start=start,
            end=self._parse_end(dt_str),
            all_day=False,
            time_notes="",
            location=self._parse_location(response),
            links=self._parse_link(response, start, minutes_href),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
    else:
        meeting = Meeting(
            title="Board Meeting",
            description="",
            classification=BOARD,
            start=dt_object + datetime.timedelta(hours=13),
            end=dt_object + datetime.timedelta(hours=16),
            all_day=False,
            time_notes="Meeting time is estimated.",
            location={
                "address": "100 West Randolph 9-040 Chicago, IL",
                "name": "",
            },
            links=self._parse_link(response, dt_object, minutes_href),
            source=minutes_href,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _next_meeting(self, response):
    text = response.xpath('//text()').extract()
    # Extract full date strings, like "9:30 a.m. on Dec. 11, 2018"
    date_strs = []
    for line in text:
        date_match = re.search(r'\d{1,2}:.*20\d{2}', line)
        if date_match:
            date_strs.append(date_match.group(0))
    # The page body contains the meeting location
    body_text = response.xpath('//div[@class="copy"]/text()').extract()
    # Check for "69" (as in 69 W Washington) in the body text; if it's
    # missing, the location may have changed, so raise an error
    if not any('69' in line for line in body_text):
        raise ValueError('The meeting address may have changed')
    for date_str in date_strs:
        meeting = Meeting(
            title='Electoral Board',
            description='',
            classification=COMMISSION,
            start=self._parse_start(date_str, ''),
            end=None,
            time_notes='Meeting end time is estimated',
            all_day=False,
            location=self.location,
            links=self._parse_links(response),
            source=response.url,
        )
        meeting['status'] = self._get_status(meeting)
        meeting['id'] = self._get_id(meeting)
        yield meeting
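
# A minimal sketch of what `_parse_start` might look like for the date
# strings above; the normalization steps are assumptions based on the
# "9:30 a.m. on Dec. 11, 2018" example, and abbreviated month names are
# assumed throughout.
from datetime import datetime


def _parse_start(self, date_str, time_notes):
    """Hypothetical sketch: parse strings like '9:30 a.m. on Dec. 11, 2018'."""
    # Normalize to "9:30 am Dec 11, 2018"
    cleaned = date_str.replace(".", "").replace(" on ", " ")
    return datetime.strptime(cleaned, "%I:%M %p %b %d, %Y")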
def _parse_current_year(self, response):
    section = response.css('.page-post-content h2:nth-of-type(2)')[0]
    items = []
    # Walk the child nodes, pairing each meeting's text with its agenda link
    for item in section.xpath('child::node()'):
        if isinstance(item.root, str):
            items.append({'text': item.root})
        elif item.root.tag == 'a':
            text_items = item.css('* ::text').extract()
            for item_text in text_items:
                if item_text and 'agenda' in item_text.lower():
                    items[-1]['agenda'] = item.root.get('href')
                elif item_text:
                    items.append({'text': item_text})
    meetings = []
    for item in items:
        meeting = Meeting(
            title=self._parse_title(item['text']),
            description='',
            classification=COMMISSION,
            start=self._parse_start(item['text']),
            end=None,
            time_notes='',
            all_day=False,
            location=self.location,
            links=self._parse_links(item.get('agenda')),
            source=response.url,
        )
        meeting['status'] = self._get_status(meeting)
        meeting['id'] = self._get_id(meeting)
        meetings.append(meeting)
    return meetings
def _parse_event(self, response):
    """Return a Meeting item for the event detail page."""
    start = self._parse_start(response)
    links_key = start.strftime("%m-%d")
    meeting = Meeting(
        title="Port Authority Commission",
        description="",
        classification=BOARD,
        start=start,
        end=self._parse_end(response),
        all_day=False,
        location=self._parse_location(response),
        source=response.url,
    )
    meeting["links"] = self.agenda_map.get(links_key, [])
    meeting["status"] = self._get_status(meeting)
    meeting["id"] = self._get_id(meeting)
    return meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.css(".entry-content a"):
        start = self._parse_start(item)
        if not start:
            continue
        meeting = Meeting(
            title="Commission",
            description="",
            classification=COMMISSION,
            start=start,
            end=None,
            all_day=False,
            time_notes="",
            location=self.location,
            links=self._parse_links(item),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.css("table:first-of-type tr:not(:first-child)"):
        start, end = self._parse_start_end(item)
        meeting = Meeting(
            title=self._parse_title(item),
            description="",
            classification=COMMISSION,
            start=start,
            end=end,
            time_notes="",
            all_day=False,
            location=self._parse_location(item),
            links=self._parse_links(response, item),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    year = response.css("div.entry-content h2::text").extract_first()
    for item in response.css("div.entry-content p"):
        if not item.css("strong"):
            continue
        start = self._parse_start(item, year)
        meeting = Meeting(
            title="Board of Directors",
            description="",
            classification=BOARD,
            start=start,
            end=None,
            time_notes="",
            all_day=False,
            location=self._parse_location(item),
            links=self._parse_links(start),
            source=response.url,
        )
        meeting["id"] = self._get_id(meeting)
        meeting["status"] = self._get_status(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.css("tr.eventspage"):
        title = self._parse_title(item)
        description = self._parse_description(item)
        # Skip meetings in certain categories
        if self.should_ignore_meeting(title, description):
            continue
        meeting = Meeting(
            title=title,
            description=description,
            classification=self._parse_classification(item),
            start=self._parse_start(item),
            end=self._parse_end(item),
            all_day=False,
            time_notes="",
            location=self._parse_location(description),
            links=self._parse_links(item),
            source=response.url,
        )
        meeting["status"] = self._get_status(
            meeting,
            text=item.css(".event_cancelled::text").extract_first() or "")
        meeting["id"] = self._get_id(meeting)
        yield meeting
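
# `should_ignore_meeting` is defined elsewhere on the spider; a plausible
# sketch, assuming it filters on keywords in the title and description. The
# specific terms below are hypothetical examples, not the spider's real list.
IGNORED_TERMS = ["training", "holiday"]  # assumed example terms


def should_ignore_meeting(self, title, description):
    """Hypothetical sketch: skip meetings matching any ignored term."""
    combined = " ".join([title or "", description or ""]).lower()
    return any(term in combined for term in IGNORED_TERMS)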
def parse(self, response):
    # Split the section markup into one chunk of HTML per meeting
    soup = response.xpath('//*[@id="main"]/section[3]').get().split(
        '<div class="links">')
    normal_location = self._parse_location(response)
    start_hour = self._parse_starting_hour(response)
    for item in soup[1:]:
        meeting = Meeting(
            title=self._parse_title(item),
            description=self._parse_description(item),
            classification=self._parse_classification(item),
            start=self._parse_start(item, start_hour),
            end=self._parse_end(item),
            all_day=self._parse_all_day(item),
            time_notes=self._parse_time_notes(item),
            location=normal_location,
            links=self._parse_links(item),
            source=self._parse_source(response),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.xpath("//a[@class='h2 accordion-toggle collapsed']"):
        title = self._parse_title(item)
        aria_control = item.xpath("@aria-controls").extract_first()
        # The collapsed panel for each accordion toggle lives elsewhere in the
        # document and is referenced by the toggle's aria-controls attribute
        item_uncollapsed = item.xpath(
            "//div[@id='{}']//tbody//td[@data-title='Meeting Information']"
            .format(aria_control))
        for subitem in item_uncollapsed:
            start, end = self._parse_times(subitem)
            meeting = Meeting(
                title=title,
                description='',
                classification=self._parse_classification(title),
                start=start,
                end=end,
                time_notes='',
                all_day=False,
                location=self._parse_location(subitem),
                links=self._parse_links(subitem),
                source=response.url,
            )
            meeting['status'] = self._get_status(meeting)
            meeting['id'] = self._get_id(meeting)
            yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.xpath("//meeting"):
        agenda_url = item.xpath("./link/text()").extract_first()
        links = []
        if agenda_url:
            links = [{"title": "Agenda", "href": agenda_url}]
        meeting = Meeting(
            title=self._parse_title(item),
            description="",
            classification=self._parse_classification(item),
            start=self._parse_start(item),
            end=None,
            all_day=False,
            time_notes="",
            location=self._parse_location(item),
            links=links,
            source=agenda_url or response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    events = self._build_list(response)
    for item in events:
        meeting = Meeting(
            title=self._parse_title(item),
            description=self._parse_description(item),
            classification=self._parse_classification(item),
            start=self._parse_start(item),
            end=self._parse_end(item),
            all_day=self._parse_all_day(item),
            time_notes=self._parse_time_notes(item),
            location=self._parse_location(item),
            links=self._parse_links(item),
            source=self._parse_source(response),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse_legistar(self, events):
    """
    `parse_legistar` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for event, _ in events:
        start = self.legistar_start(event)
        if not start:
            continue
        meeting = Meeting(
            title=event["Name"]["label"],
            description="",
            classification=CITY_COUNCIL,
            start=start,
            end=None,
            all_day=False,
            time_notes="Estimated 2 hour duration",
            location=self._parse_location(event),
            links=self.legistar_links(event),
            source=self.legistar_source(event),
        )
        meeting["status"] = self._get_status(
            meeting, text=event["Meeting Location"])
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    last_year = datetime.today().replace(year=datetime.today().year - 1)
    for item in response.css(".minutes"):
        start = self._parse_start(item)
        # Skip meetings more than a year old unless archiving is enabled
        if start < last_year and not self.settings.getbool(
                "CITY_SCRAPERS_ARCHIVE"):
            continue
        meeting = Meeting(
            title="Illinois Board of Examiners",
            description=self._parse_description(item),
            classification=self._parse_classification(item),
            start=start,
            end=None,
            all_day=False,
            time_notes="",
            location=self._parse_location(item),
            links=self._parse_links(item),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_detail(self, response):
    title = self._parse_title(response)
    start, end = self._parse_start_end(response)
    classification = self._parse_classification(title)
    if not start:
        return
    meeting = Meeting(
        title=title,
        description="",
        classification=classification,
        start=start,
        end=end,
        all_day=False,
        time_notes="",
        location=self._parse_location(response),
        links=self._parse_links(start, classification),
        source=response.url,
    )
    meeting["status"] = self._get_status(
        meeting, text=" ".join(response.css(".post header h1").extract()))
    meeting["id"] = self._get_id(meeting)
    yield meeting
def _parse_meetings(self, response):
    for item in response.css("article"):
        title = self._parse_title(item)
        summary = item.css(
            ".entry-summary p:first-child::text").extract_first()
        if not summary:
            continue
        detail_link = item.css("a")[0].attrib["href"]
        times = self._parse_times(summary)
        if len(times) == 0:
            continue
        start = times[0]
        end = times[1] if len(times) > 1 else None
        meeting = Meeting(
            title=title,
            description="",
            classification=self._parse_classification(title),
            start=start,
            end=end,
            all_day=False,
            time_notes="",
            location=self._parse_location(summary),
            links=[{"title": "Agenda", "href": detail_link}]
            + self.minutes_map[start.date()],
            source=detail_link,
        )
        meeting["status"] = self._get_status(meeting, text=summary)
        meeting["id"] = self._get_id(meeting)
        yield meeting
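
# A sketch of what `_parse_times` could look like, assuming summaries read
# something like "January 5, 2021, 9:00 am - 11:00 am"; the date and time
# formats here are assumptions, and the real helper may differ.
import re
from datetime import datetime


def _parse_times(self, summary):
    """Hypothetical sketch: return start (and optional end) datetimes."""
    date_match = re.search(r"[A-Z][a-z]+ \d{1,2}, \d{4}", summary)
    if not date_match:
        return []
    # Match times like "9:00 am" or "9:00 a.m."
    time_strs = re.findall(r"\d{1,2}:\d{2} ?[apm.]{2,4}", summary, flags=re.I)
    return [
        datetime.strptime(
            "{} {}".format(
                date_match.group(), t.replace(".", "").replace(" ", "")),
            "%B %d, %Y %I:%M%p",
        )
        for t in time_strs
    ]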
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    year = response.xpath("//u/font/strong/em/text()").get()[:4]
    for item in response.xpath("//ol/li"):
        date_text = item.xpath("./font/strong/em/text()").get()
        meeting = Meeting(
            title=self.title,
            description="",
            classification=COMMISSION,
            start=self._parse_start(date_text, year),
            end=None,
            all_day=False,
            time_notes="",
            location=self.location,
            links=self._parse_links(item, response),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting, text=date_text)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for year, months in self.get_year_month_pairs(response):
        for item in months.xpath(".//td/p"):
            if not self._pass_filter(item):
                continue
            meeting = Meeting(
                title="Committee on Standards and Tests",
                description="",
                classification=COMMITTEE,
                start=self._parse_start(item, year),
                end=None,
                all_day=False,
                time_notes="Confirm details with the agency",
                location={
                    "address":
                        "121 North LaSalle Street, Room 906, Chicago, IL 60602",
                    "name": "City Hall",
                },
                links=self._parse_links(item, response),
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    meeting_year = response.xpath("//div[@class='meeting-dates-block']/h2")
    meeting_date = response.xpath("//div[@class='meeting-dates-block']/h5")
    meeting_info = response.xpath("//div[@class='meeting-dates-block']/p")
    meeting_links = response.xpath("//div[@class='meeting-minutes-block']")
    for item_date, item_info in zip(meeting_date, meeting_info):
        start_time, end_time = self._parse_start_end(
            item_date, item_info, meeting_year)
        meeting = Meeting(
            title="Commission",
            description="",
            classification=COMMISSION,
            start=start_time,
            end=end_time,
            all_day=False,
            time_notes="",
            location=self._parse_location(item_info),
            links=self._parse_links(start_time, meeting_links),
            source=self._parse_source(response),
        )
        meeting["status"] = self._get_status(
            meeting, text=meeting_info.xpath(".//text()").get())
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    location = self._parse_location(response)
    meeting_map = {}
    for item in response.css(".layoutArea li"):
        start = self._parse_start(item)
        if not start:
            continue
        meeting = Meeting(
            title="State Street Commission",
            description="",
            classification=COMMISSION,
            start=start,
            end=None,
            time_notes="",
            all_day=False,
            location=location,
            links=self._parse_links(item, response),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting, text=item.extract())
        meeting["id"] = self._get_id(meeting)
        # Merge links for duplicate entries that share a start time
        if meeting["start"] in meeting_map:
            meeting_map[meeting["start"]]["links"].extend(meeting["links"])
        else:
            meeting_map[meeting["start"]] = meeting
    for meeting in meeting_map.values():
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for year_item in response.css(".ui.basic.vertical section > h3"):
        year = year_item.xpath("./text()").extract_first().strip()
        for item in year_item.xpath("following-sibling::table[1]").css(
                "tbody tr"):
            start = self._parse_start(item, year)
            if start is None:
                continue
            meeting = Meeting(
                title="Ethics Board",
                description="",
                classification=BOARD,
                start=start,
                end=None,
                all_day=False,
                time_notes="See agenda to confirm time",
                location=self.location,
                links=self._parse_links(item, response),
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def _parse_detail(self, response):
    # Parse the last JSON-LD schema block on the page
    schema_text = response.css(
        "script[type='application/ld+json']::text").extract()[-1]
    schema_json = json.loads(schema_text)
    if isinstance(schema_json, list):
        item = schema_json[0]
    else:
        item = schema_json
    start = self._parse_dt(item["startDate"])
    title = self._parse_title(item["name"])
    meeting = Meeting(
        title=title,
        description="",
        classification=self._parse_classification(title),
        start=start,
        end=self._parse_dt(item["endDate"]),
        all_day=False,
        time_notes="",
        location=self._parse_location(item),
        links=self.link_date_map[start.date()],
        source=response.url,
    )
    meeting["status"] = self._get_status(meeting, text=schema_text)
    meeting["id"] = self._get_id(meeting)
    yield meeting
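
# A minimal sketch of `_parse_dt`, assuming the schema.org values are
# ISO 8601 strings with a UTC offset (e.g. "2020-01-15T18:00:00-06:00");
# the actual helper may handle more variations.
from datetime import datetime


def _parse_dt(self, dt_str):
    """Hypothetical sketch: parse an ISO 8601 string to a naive datetime."""
    # Drop the UTC offset and keep the local wall-clock time
    return datetime.strptime(dt_str[:19], "%Y-%m-%dT%H:%M:%S")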
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for table in response.xpath("//table/tbody"):
        year = self._get_year(table)
        for item in table.xpath("./tr")[1:]:
            meeting = Meeting(
                title=self._parse_title(item),
                description=self._parse_description(item),
                classification=self._parse_classification(item),
                start=self._parse_start(item, year),
                end=self._parse_end(item),
                all_day=self._parse_all_day(item),
                time_notes=self._parse_time_notes(item),
                location=self._parse_location(item),
                links=self._parse_links(item, response),
                source=self._parse_source(response),
            )
            meeting_status = item.xpath("td[2]/text()").get() or ""
            meeting["status"] = self._get_status(meeting, text=meeting_status)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def parse(self, response):
    """`parse` should always `yield` Meeting items."""
    self.minutes_list = self.get_minutes_panel_items(response)
    self._validate_locations(response)
    commission_path = "div #content-232764 div.panel-body p"
    for item in response.css(commission_path)[1:]:
        title = self._parse_title(item)
        start = self._parse_start(item)
        if not start:
            continue
        links = self._parse_links(item)
        meeting = Meeting(
            title=title,
            description="",
            classification=COMMISSION,
            start=start,
            end=None,
            all_day=False,
            time_notes="",
            location=self.location,
            links=links,
            source=response.url,
        )
        # Include status text, but exclude "rescheduled from", which would
        # otherwise incorrectly trigger a cancelled status
        status_text = " ".join(item.css("*::text").extract())
        if "rescheduled from" in status_text:
            status_text = ""
        meeting["status"] = self._get_status(meeting, text=status_text)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse_legistar(self, events):
    """
    `parse_legistar` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for event, _ in events:
        start = self.legistar_start(event)
        meeting = Meeting(
            title=event["Name"]["label"],
            description="",
            classification=self._parse_classification(event),
            start=start,
            end=self._parse_end(start),
            all_day=False,
            time_notes="Estimated 3 hour meeting length",
            location=self._parse_location(event),
            links=self.legistar_links(event),
            source=self.legistar_source(event),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
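
# Consistent with the "Estimated 3 hour meeting length" note above,
# `_parse_end` is likely a fixed offset from the start time; a sketch under
# that assumption.
from datetime import timedelta


def _parse_end(self, start):
    """Hypothetical sketch: estimate the end as three hours after the start."""
    if start is None:
        return None
    return start + timedelta(hours=3)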
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    columns = self.parse_meetings(response)
    for column in columns:
        year = column.xpath(
            'preceding::strong[1]/text()').re_first(r'(\d{4})(.*)')
        meetings = column.xpath('text()[normalize-space()]').extract()
        for item in meetings:
            if not item.strip():
                continue
            meeting = Meeting(
                title='Zoning Board of Appeals',
                description='',
                classification=COMMISSION,
                start=self._parse_start(item, year),
                end=None,
                time_notes='',
                all_day=False,
                location={
                    'name': 'City Hall',
                    'address': '121 N LaSalle St Chicago, IL 60602',
                },
                source=response.url,
            )
            meeting['links'] = self._parse_links(column, meeting, response)
            meeting['id'] = self._get_id(meeting)
            meeting['status'] = self._get_status(meeting)
            yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.xpath("//p//a"):
        # Stop at the first link that isn't an agenda link
        if "Agenda" not in (item.xpath("text()").get() or ""):
            break
        meeting = Meeting(
            title=self._parse_title(item),
            description=self._parse_description(item),
            classification=self._parse_classification(item),
            start=self._parse_start(item),
            end=self._parse_end(item),
            all_day=self._parse_all_day(item),
            time_notes=self._parse_time_notes(item),
            location=self._parse_location(item),
            links=self._parse_links(item, response),
            source=self._parse_source(response),
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """
    `parse` should always `yield` Meeting items.

    Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
    needs.
    """
    for item in response.css(".page-content-row div:nth-child(2)"):
        title = self._parse_title(item)
        start, end = self._parse_start_end(item)
        # Only include board and committee meetings with a parsed start time
        if (not title or not start
                or ("Board" not in title and "Committee" not in title)):
            continue
        meeting = Meeting(
            title=title,
            description="",
            classification=self._parse_classification(title),
            start=start,
            end=end,
            all_day=False,
            time_notes="",
            location=self._parse_location(item),
            links=self._parse_links(item, response),
            source=self._parse_source(item, response),
        )
        meeting["status"] = self._get_status(meeting, text=item.extract())
        meeting["id"] = self._get_id(meeting)
        yield meeting