def parse(self, response):
    """Scrape one Elance job listing page into an ElancefItem.

    Populates name, link, tagline, location and skills from the page
    markup and yields the loaded item.
    """
    loader = CleanLoader(item=ElancefItem(), response=response)
    loader.add_xpath("name", "//a[@class='title-link']/text()")
    # TakeFirst collapses the extracted list to a single URL; the regex
    # keeps only the absolute https portion before any fragment.
    loader.add_xpath("link", "//a[@class='title-link']/@href",
                     TakeFirst(), re="https://[^#]*")
    loader.add_xpath("tagline", "//div[@class='tagline']/text()")
    loader.add_xpath("location", "//span[@class='location']/text()")
    # Multiple skill anchors match, so this field loads as a list.
    loader.add_xpath("skills", "//div[@class='skills-bar left']/a/text()")
    yield loader.load_item()
def parse(self, response):
    """Parse an Indeed search-results page.

    Yields one IndeedItem per organic job result on the page, then
    follows the pagination link (if any) to the next results page.
    """
    for source in response.xpath("//div[@data-tn-component='organicJob']"):
        item = CleanLoader(item=IndeedItem(), selector=source)
        item.add_value("source", u"indeed")
        item.add_xpath("title", "h2/a[@itemprop='title']/@title")
        # TakeFirst collapses the list; response.urljoin resolves the
        # relative href against the current page URL.
        item.add_xpath("link", "h2/a[@itemprop='title']/@href",
                       TakeFirst(), response.urljoin)
        item.add_xpath("company", "span/span[@itemprop='name']/text()")
        item.add_xpath("location",
                       "span/span/span[@itemprop='addressLocality']/text()")
        # Raw string avoids the invalid "\d" escape warning under Python 3.
        item.add_xpath("date", "table//span[@class='date']/text()",
                       re=r"(\d)\w+")
        # Bug fix: previously the item was only printed (a Python 2 print
        # statement) and the yield was commented out, so no items ever
        # reached the item pipeline.
        yield item.load_item()

    next_page = response.xpath(
        "//div[@class='pagination']/b/following-sibling::a[1]/@href"
    ).extract()
    if next_page:
        url = response.urljoin("".join(next_page))
        yield scrapy.Request(url, callback=self.parse)