def parse_archive(self, response):
    archive_base_url = get_base_url(response)

    def get_agenda_url(relative_urls):
        # Note: this helper is defined but not used below.
        full_url = []
        if relative_urls:
            for url in relative_urls:
                url = urljoin(archive_base_url, url)
                full_url.append(url)
            return full_url
        else:
            return None

    table_body = response.xpath('//table/tbody/tr')
    for row in table_body:
        meeting_type = row.xpath(
            './/span[@itemprop="summary"]/text()').extract_first()
        date_time = row.xpath(
            './/td[@class="event_datetime"]/text()').extract_first()
        agenda_url = row.xpath(
            './/td[@class="event_agenda"]//a/@href').extract_first()
        event_minutes_url = row.xpath(
            './/td[@class="event_minutes"]/a/@href').extract_first()
        event = Event(
            _type='event',
            ocd_division_id=self.ocd_division_id,
            name='Belmont, CA City Council {}'.format(meeting_type),
            scraped_datetime=datetime.datetime.utcnow(),
            record_date=parse_date_string(date_time),
            source=self.name,
            source_url=response.url,
            meeting_type=meeting_type)
        documents = []
        if agenda_url is not None:
            # If the path to the agenda is relative, complete it with the base url
            if archive_base_url not in agenda_url:
                agenda_url = urljoin(archive_base_url, agenda_url)
            agenda_doc = {
                'url': agenda_url,
                'url_hash': url_to_md5(agenda_url),
                'category': 'agenda'
            }
            documents.append(agenda_doc)
        if event_minutes_url is not None:
            # If the path to the minutes is relative, complete it with the base url
            if archive_base_url not in event_minutes_url:
                event_minutes_url = urljoin(archive_base_url, event_minutes_url)
            minutes_doc = {
                'url': event_minutes_url,
                'url_hash': url_to_md5(event_minutes_url),
                'category': 'minutes'
            }
            documents.append(minutes_doc)
        event['documents'] = documents
        yield event
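# Every spider in this file calls a url_to_md5() helper that is imported from
# elsewhere. A minimal sketch of what it presumably does, assuming it simply
# hashes the URL string to give each document a stable identifier; this is an
# assumption, not the project's actual implementation:
import hashlib

def url_to_md5(url):
    """Return the hex MD5 digest of a URL string (assumed helper)."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()

# Usage: url_to_md5('http://example.com/agenda.pdf') returns a 32-character
# hex string that stays stable across scrapes of the same document.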
def parse_archive(self, response):

    def get_agenda_url(relative_urls):
        full_url = []
        if relative_urls:
            base_url = 'http://dublinca.gov'
            for url in relative_urls:
                url = urljoin(base_url, url)
                full_url.append(url)
            return full_url
        else:
            return None

    table_body = response.xpath('//table/tbody/tr')
    for row in table_body:
        record_date = row.xpath('.//td[@data-th="Date"]/text()').extract_first()
        record_date = datetime.datetime.strptime(record_date, '%B %d, %Y').date()
        meeting_type = row.xpath('.//td[@data-th="Meeting Type"]/text()').extract_first()
        agenda_urls = row.xpath('.//td[starts-with(@data-th,"Agenda")]/a/@href').extract()
        agenda_urls = get_agenda_url(agenda_urls)
        minutes_url = row.xpath('.//td[@data-th="Minutes"]/a/@href').extract_first()
        event = Event(
            _type='event',
            ocd_division_id=self.ocd_division_id,
            name='Dublin, CA City Council {}'.format(meeting_type).strip(),
            scraped_datetime=datetime.datetime.utcnow(),
            record_date=record_date,
            source=self.name.strip(),
            source_url=response.url.strip(),
            meeting_type=meeting_type.strip(),
        )
        # This block should be cleaned up later:
        # create a nested JSON obj for each doc related to the meeting
        documents = []
        if agenda_urls:  # get_agenda_url returns None when the row has no agenda links
            for url in agenda_urls:
                agenda_doc = {
                    'url': url,
                    'url_hash': url_to_md5(url),
                    'category': 'agenda'
                }
                documents.append(agenda_doc)
        if minutes_url:
            minutes_doc = {
                'url': minutes_url,
                'url_hash': url_to_md5(minutes_url),
                'category': 'minutes'
            }
            documents.append(minutes_doc)
        event['documents'] = documents
        yield event
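# A quick standalone illustration of why the Dublin spider can call urljoin()
# unconditionally, while other spiders in this file first check whether the
# href already contains the base URL: urljoin() leaves absolute URLs untouched
# and only resolves relative paths against the base. Python 3 stdlib only.
from urllib.parse import urljoin

base = 'http://dublinca.gov'
print(urljoin(base, '/docs/agenda.pdf'))
# -> http://dublinca.gov/docs/agenda.pdf
print(urljoin(base, 'http://example.com/other.pdf'))
# -> http://example.com/other.pdf  (absolute URL passes through unchanged)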
def parse_archive(self, response):

    def get_agenda_url(relative_urls):
        full_url = []
        if relative_urls:
            for url in relative_urls:
                if self.base_url not in url:
                    # Encode the path because several URLs in this crawler contain spaces
                    url = quote(url)
                    url = urljoin(self.base_url, url)
                full_url.append(url)
            return full_url
        else:
            return None

    table_body = response.xpath('//table/tbody/tr')
    for row in table_body:
        record_date = row.xpath('.//td[1]/text()').extract_first()
        record_date = parse_date_string(record_date)
        agenda_urls = row.xpath('.//td[1]/a/@href').extract()
        agenda_urls = get_agenda_url(agenda_urls)
        meeting_type = row.xpath('.//td[1]/a/text()').extract_first()
        minutes_url = row.xpath('.//td[2]/a/@href').extract_first()
        event = Event(
            _type='event',
            ocd_division_id=self.ocd_division_id,
            name='Moraga, CA City Council {}'.format(meeting_type),
            scraped_datetime=datetime.datetime.utcnow(),
            record_date=record_date,
            source=self.name,
            source_url=response.url,
            meeting_type=meeting_type,
        )
        # This block should be cleaned up later:
        # create a nested JSON obj for each doc related to the meeting
        documents = []
        if agenda_urls is not None:
            for url in agenda_urls:
                agenda_doc = {
                    'media_type': 'application/pdf',
                    'url': url,
                    'url_hash': url_to_md5(url),
                    'category': 'agenda'
                }
                documents.append(agenda_doc)
        if minutes_url is not None:
            if self.base_url not in minutes_url:
                # Encode the path because several URLs in this crawler contain spaces
                minutes_url = quote(minutes_url)
                minutes_url = urljoin(self.base_url, minutes_url)
            minutes_doc = {
                'media_type': 'application/pdf',
                'url': minutes_url,
                'url_hash': url_to_md5(minutes_url),
                'category': 'minutes'
            }
            documents.append(minutes_doc)
        event['documents'] = documents
        yield event
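# The Moraga spider quotes hrefs before joining because several paths on that
# site contain literal spaces, which are not legal in URLs. A standalone sketch
# of that behavior; the domain and path below are hypothetical examples, not
# values taken from the crawler:
from urllib.parse import quote, urljoin

raw_href = '/Agendas/City Council 2017-05-10.pdf'  # hypothetical relative path
encoded = quote(raw_href)  # quote() keeps '/' but percent-encodes spaces
print(encoded)
# -> /Agendas/City%20Council%202017-05-10.pdf
print(urljoin('http://www.example-city.gov', encoded))
# -> http://www.example-city.gov/Agendas/City%20Council%202017-05-10.pdf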
def parse_archive(self, response): archive_base_url = get_base_url(response) def get_agenda_url(relative_urls): full_url = [] if relative_urls: for url in relative_urls: url = urljoin(archive_base_url, url) full_url.append(url) return full_url table_body = response.xpath('//table[@class="rgMasterTable"]/tbody/tr') for row in table_body: # most elements are wrapped in <font> tags that aren't # visible when viewing in e.g. Chrome debugger meeting_type = row.xpath('.//td[1]/font/a/font/text()').extract_first() date = row.xpath('.//td[2]/font/text()').extract_first() time = row.xpath('.//td[4]/font/span/font/text()').extract_first() date_time = '{} {}'.format(date, time) agenda_url = row.xpath('.//td[7]/font/span/a/@href').extract_first() event_minutes_url = row.xpath('.//td[8]/font/span/a/font/text()').extract_first() # if there are no minutes the data will be 'Not\xa0available' (with unicode space) if event_minutes_url == 'Not\xa0available': event_minutes_url = None event = Event( _type='event', ocd_division_id=self.ocd_division_id, name='{} City Council {}'.format(self.formatted_city_name, meeting_type), scraped_datetime=datetime.datetime.utcnow(), record_date=parse_date_string(date_time), source=self.city_name, source_url=response.url, meeting_type=meeting_type ) documents = [] if agenda_url is not None: # If path to agenda is relative, complete it with the base url if archive_base_url not in agenda_url: agenda_url = urljoin(archive_base_url, agenda_url) agenda_doc = { 'url': agenda_url, 'url_hash': url_to_md5(agenda_url), 'category': 'agenda' } documents.append(agenda_doc) if event_minutes_url is not None: # If path to minutes is relative, complete it with the base url if archive_base_url not in event_minutes_url: event_minutes_url = urljoin(archive_base_url, event_minutes_url) minutes_doc = { 'url': event_minutes_url, 'url_hash': url_to_md5(event_minutes_url), 'category': 'minutes' } documents.append(minutes_doc) event['documents'] = documents yield event
def parse_archive(self, response):

    def get_agenda_url(relative_urls):
        full_url = []
        if relative_urls:
            for url in relative_urls:
                if self.base_url not in url:
                    url = urljoin(self.base_url, url)
                full_url.append(url)
            return full_url
        else:
            return None

    containers = response.xpath(
        '//div[contains(concat(" ", normalize-space(@class), " "), " listing ")]'
    )
    for table in containers:
        table_body = table.xpath('.//table/tbody/tr')
        meeting_type = table.xpath('.//h2/text()').extract_first()
        for row in table_body:
            # The date is split between an <abbr> month and the surrounding <strong> text
            record_date = row.xpath('.//td[1]/h4/a[2]/strong/abbr/text()').extract_first() + \
                " " + row.xpath('.//td[1]/h4/a[2]/strong/text()').extract_first()
            record_date = datetime.datetime.strptime(record_date, '%b %d, %Y').date()
            agenda_urls = row.xpath(
                './/td[@class="downloads"]/div/div/div/div/ol/li/a/@href'
            ).extract()
            agenda_urls = get_agenda_url(agenda_urls)
            minutes_url = row.xpath(
                './/td[@class="minutes"]/a/@href').extract_first()
            event = Event(
                _type='event',
                ocd_division_id=self.ocd_division_id,
                name='Fremont, CA City Council {}'.format(meeting_type),
                scraped_datetime=datetime.datetime.utcnow(),
                record_date=record_date,
                source=self.name,
                source_url=response.url,
                meeting_type=meeting_type,
            )
            # This block should be cleaned up later:
            # create a nested JSON obj for each doc related to the meeting
            documents = []
            if agenda_urls is not None:
                for url in agenda_urls:
                    agenda_doc = {
                        'url': url,
                        'url_hash': url_to_md5(url),
                        'category': 'agenda'
                    }
                    documents.append(agenda_doc)
            if minutes_url is not None:
                if self.base_url not in minutes_url:
                    minutes_url = urljoin(self.base_url, minutes_url)
                minutes_doc = {
                    'url': minutes_url,
                    'url_hash': url_to_md5(minutes_url),
                    'category': 'minutes'
                }
                documents.append(minutes_doc)
            event['documents'] = documents
            yield event
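# The "should be cleaned up later" blocks repeated in every spider above could
# be factored into a single shared helper. A sketch of that refactor; the name
# build_document is a suggestion, not existing project code:
def build_document(url, category, media_type=None):
    """Build the nested document dict that each spider attaches to its Event."""
    doc = {
        'url': url,
        'url_hash': url_to_md5(url),
        'category': category,
    }
    if media_type is not None:
        doc['media_type'] = media_type
    return doc

# Usage inside a spider:
#     documents = [build_document(u, 'agenda') for u in agenda_urls or []]
#     if minutes_url:
#         documents.append(build_document(minutes_url, 'minutes'))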