def parse_details(self, response):
    """Parse one detail page of a gazette, accumulating its events across
    paginated result pages.

    The partially-built gazette dict travels in ``response.meta["gazette"]``;
    each page appends its events, and only the last page emits the final
    ``GazetteItem`` (wrapped in a Request so the document URL is resolved by
    ``parse_document_url``).
    """
    gazette = response.meta["gazette"]
    # Second span.style4 text node holds the "year / edition" label.
    gazette["year_and_edition"] = (
        response.css("span.style4 ::text").extract()[1].strip())
    titles = response.xpath(
        "//tr/td/table/tr/td[@colspan='2']/text()").extract()
    descriptions = response.css("td.destaqt ::text").extract()
    events = []
    # Titles come in pairs (title text + a filler node); descriptions come in
    # pairs (secretariat + summary). Consume both queues front-to-back.
    while titles:
        events.append({
            "title": titles.pop(0).strip(),
            "secretariat": descriptions.pop(0).strip(),
            "summary": descriptions.pop(0).strip(),
        })
        # Discard the filler node that follows every real title.
        titles.pop(0)
    # First page initializes the event list; later pages extend it.
    if gazette.get("events") is None:
        gazette["events"] = events
    else:
        gazette["events"].extend(events.copy())
    current_page = response.css("ul li.current ::text").extract_first()
    last_page = response.css("ul li:last-child ::text").extract_first()
    if current_page:
        current_page = current_page.strip()
        last_page = last_page.strip()
        if current_page != last_page:
            # Not on the last page yet: follow the next page, carrying the
            # accumulated gazette dict along in meta.
            next_page = int(current_page) + 1
            url = response.css("ul li a::attr(href)").extract_first()
            url = replace_query_param(url, "p", next_page)
            yield Request(
                response.urljoin(url),
                callback=self.parse_details,
                meta={"gazette": gazette},
            )
        else:
            # Last page reached: build the final item and hand it to
            # parse_document_url to resolve the gazette file URL.
            # NOTE(review): this `else` is read as belonging to the inner
            # `current_page != last_page` check, so a page with no pagination
            # bar (current_page is None) yields nothing — confirm against the
            # live site's markup.
            gazette_item = GazetteItem(
                date=from_str_to_date(gazette["date"]),
                power=gazette["power"],
                year_and_edition=gazette["year_and_edition"],
                events=gazette["events"],
                crawled_at=datetime.now(),
                crawled_from=response.url,
            )
            yield Request(
                gazette["file_url"],
                callback=self.parse_document_url,
                meta={"gazette": gazette_item},
            )
def parse(self, response):
    """Yield one CityCouncilMinuteItem per table row.

    Dates, titles, and file links are scraped as three parallel node
    lists and paired up positionally; rows missing any of the three are
    dropped by ``zip``.
    """
    rows = zip(
        response.xpath("//table/tbody/tr/td[1]/strong/text()").getall(),
        response.xpath("//table/tbody/tr/td[2]/p/strong/text()").getall(),
        response.xpath("//p/a/@href").extract(),
    )
    for raw_date, raw_title, href in rows:
        yield CityCouncilMinuteItem(
            crawled_at=datetime.now(),
            crawled_from=response.url,
            date=from_str_to_date(raw_date),
            title=raw_title.strip(),
            # get_type sees the title as scraped (unstripped), matching the
            # original behavior.
            event_type=self.get_type(raw_title),
            files=[response.urljoin(href)],
        )
def parse_page(self, response):
    """Yield one CityCouncilAgendaItem per event listed on the page.

    Each ``div.feature-box`` carries the event details; dates and titles
    come from parallel table cells and are paired positionally.
    """
    boxes = response.css("div.feature-box")
    dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
    titles = response.xpath("//table/tbody/tr/td[2]/p/strong/text()").getall()
    for box, raw_date, raw_title in zip(boxes, dates, titles):
        # Collapse the box's paragraph text into clean, non-empty lines.
        detail_lines = [
            text.strip()
            for text in box.css("p ::text").getall()
            if text.strip() != ""
        ]
        yield CityCouncilAgendaItem(
            crawled_at=datetime.now(),
            crawled_from=response.url,
            date=from_str_to_date(raw_date),
            details=" ".join(detail_lines),
            title=raw_title.strip(),
            # get_type sees the title as scraped (unstripped).
            event_type=self.get_type(raw_title),
        )
def parse(self, response):
    """Yield LegacyGazetteItems for the current results page, then follow
    the next page while one exists.
    """
    # The site renders "SEM INFORMA..." when a query matches nothing.
    if "SEM INFORMA" in response.text:
        return

    events, urls = self.extract_events(response)
    for event, url in zip(events, urls):
        yield LegacyGazetteItem(
            title=event["event"],
            published_on=event["published_on"],
            date=from_str_to_date(event["date"]),
            details=url["details"],
            file_urls=[url["url"]],
            crawled_at=datetime.now(),
            crawled_from=response.url,
        )

    current_page = self.get_current_page(response)
    page_links = response.xpath("//table/tr[10]/td/ul/li/a/text()")
    if current_page and page_links:
        # The last pagination link holds the highest page number.
        highest_page = int(page_links[-1].get().strip())
        next_page = int(current_page.strip()) + 1
        if next_page <= highest_page:
            next_url = replace_query_param(response.url, "p", next_page)
            yield response.follow(next_url, callback=self.parse)
# Matches slash-separated digit runs such as "12/05/2020". Compiled once at
# import time instead of on every call.
DATE_PATTERN = re.compile(r"\d+\/\d+\/\d+")


def extract_date(str_with_date):
    """Return the first ``d/m/y``-style date found in *str_with_date*.

    The matched text is converted with ``from_str_to_date``. Returns
    ``None`` explicitly when no date-like substring is present (the
    original fell off a bare ``return``).
    """
    result = DATE_PATTERN.search(str_with_date)
    if result:
        return from_str_to_date(result.group(0))
    return None
def test_possible_date(date_str, expected_obj):
    """from_str_to_date parses *date_str* into the expected date object."""
    parsed = from_str_to_date(date_str)
    assert parsed == expected_obj