def parse(self, response):
    """Parse a listing page.

    Extract the event list from the page, filter out events that were
    already scraped on a previous run, and yield a follow-up request
    for each newly found event.

    :param response: scrapy Response for the event-listing page
    :yields: scrapy.Request for each event detail page not seen before
    """
    # Load previously scraped URLs; use a set so the per-event
    # membership test below is O(1) instead of O(n) on a list.
    existing_urls = set(load_existing_urls(self.name))

    # Each event sits in a "views-row" container inside the page wrapper.
    events_element = response.xpath(
        '//div[@class="wrapper clearfix "]//div[contains(@class, "views-row")]'
    )
    self.logger.info('Found %d events on url %s',
                     len(events_element), response.url)

    for event in events_element:
        title = event.xpath(
            './/div[@class="views-field-title"]//a/text()'
        ).extract_first()
        # Event links are relative; resolve against the listing page URL.
        url = urlparse.urljoin(
            response.url,
            event.xpath(
                './/div[@class="views-field-title"]//a/@href'
            ).extract_first(),
        )
        if url not in existing_urls:
            category = event.xpath(
                './/div[@class="views-field-field-event-category-value-1"]//a/text()'
            ).extract_first()
            # Pass the category through meta so the detail-page
            # callback can attach it to the parsed event.
            yield Request(url,
                          meta={"category": category},
                          callback=self.parse_one_event)
        else:
            self.logger.info('Event %s with url %s has already been parsed',
                             title, url)
def __init__(self):
    """Initialize the spider.

    Preloads the set of already-scraped URLs so ``parse`` can skip
    events that were handled on a previous run.
    """
    # Cache previously seen URLs once, at construction time.
    self.existing_urls = load_existing_urls(self.name)
def __init__(self):
    """Initialize the whatshappen spider.

    Loads the URLs scraped on earlier runs so duplicate events can be
    filtered out during parsing.
    """
    # Previously scraped URLs for this spider, keyed by spider name.
    self.existing_urls = load_existing_urls(self.name)
def __init__(self):
    """Initialize the eventfinda spider.

    Fetches the URLs already scraped for this source so the parse
    callbacks can skip known events.
    """
    # Load the existing-URL cache for the eventfinda source.
    self.existing_urls = load_existing_urls(self.name)