def parse_job_detail(self, response):
    """Parse a single job-ad detail page and yield one populated JobItem.

    Yields nothing when the title or company cannot be extracted (the
    page is then assumed not to be a valid job ad).
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//div[@id='businessmyaccountjobadpreviewmain']/div/font/text()").extract_unquoted()
    company = hxs.select("//li[@class='propertiesleft' and contains(text(),'Podjetje:')]/following-sibling::li/text()").extract_unquoted()
    if title and company:
        city = hxs.select("//li[@class='propertiesleft' and contains(text(),'Regija in kraj dela:')]/following-sibling::li/text()").extract_unquoted()
        # BUGFIX: pass the XPath as unicode. Encoding it to UTF-8 bytes
        # (as before) makes the lxml-backed selector reject the
        # non-ASCII expression ("Področje dela:") at query time.
        category = hxs.select(u"//li[@class='propertiesleft2' and contains(text(),'Področje dela:')]/following-sibling::li/text()").extract_unquoted()
        images_url = hxs.select("//img[@id='mainimage']/@src").extract()
        item = JobItem()
        if images_url:
            # Company/ad logo, resolved against the page URL.
            item.load_image(self.get_base_url(response, images_url[0]))
        loader = JobLoader(item)
        loader.add_value('title', title)
        loader.add_value('company', company)
        loader.add_value('category', category)
        loader.add_value('city', city)
        loader.add_value('details_url', url_query_cleaner(response.url))
        # Date appears after a "label: value" prefix; keep only the value.
        loader.add_value('published_date', hxs.select("//li[@class='dates']/text()").re(r".*:\s+(.*)"))
        loader.add_value('id', self.generate_id(response.url))
        loader.add_value('content', response.body_as_unicode())
        loader.add_value('source', self.name)
        loader.add_value('source_label', self.label)
        yield loader.load_item()
def parse_job(self, response):
    """Walk a paginated job listing.

    Yields one Request for the next listing page (propagating the
    'category' meta value) and one Request per job ad, each carrying a
    pre-filled JobItem in its meta for parse_job_detail to finish.
    """
    hxs = HtmlXPathSelector(response)

    # Pagination: the link immediately after the selected page number.
    pagination = hxs.select("//ul[@class='pagination']/li[@class='selected']//following-sibling::li[1]/a/@href").extract()
    if pagination:
        next_url = self.get_base_url(response, pagination[0])
        # Skip self-links so we never re-enqueue the current page.
        if next_url != self.get_base_url(response, response.request.url):
            yield Request(url=next_url, callback=self.parse_job,
                          meta={'category': response.request.meta['category']})

    for node in hxs.select("//ul[@id='newJobs']/li"):
        heading = node.select("p[@class='jobTitle']/a/text()").extract_unquoted()
        employer = node.select("strong/text()").extract_unquoted()
        if not (heading and employer):
            continue
        hrefs = node.select("p[@class='jobTitle']/a/@href").extract()
        detail_url = self.get_base_url(response, hrefs[0])
        if not detail_url:
            continue
        item = JobItem()
        item['title'] = heading
        item['company'] = employer
        item['category'] = response.request.meta['category']
        item['summary'] = node.select("p[2]/text()").extract_unquoted()
        item['details_url'] = url_query_cleaner(detail_url)
        # Date sits after a "label: value" prefix; keep only the value.
        item['published_date'] = node.select("span[1]/text()").re(r".*:\s(.*)")
        logos = node.select("div[@class='jobImgDiv']/img/@src").extract()
        if logos:
            item.load_image(self.get_base_url(response, logos[0]))
        yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_job_detail(self, response):
    """Extract one job ad from its detail page and yield a JobItem.

    Yields nothing when the title or company cannot be found on the page.
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//span[@class='header']/text()").extract_unquoted()
    company = hxs.select("//span[@class='fontblacklarge']/b/text()").extract_unquoted()
    if not (title and company):
        return

    city = hxs.select("//span[@class='fontblacklarge']/b[2]/text()").extract_unquoted()
    published = hxs.select("//span[@class='fontblacklarge']/../../following-sibling::tr[2]/td/b/text()").extract_unquoted()
    logos = hxs.select("//span[@class='fontblacklarge']/../following-sibling::td[2]/img/@src").extract()

    item = JobItem()
    if logos:
        item.load_image(self.get_base_url(response, logos[0]))

    loader = JobLoader(item)
    loader.add_value('title', title)
    loader.add_value('company', company)
    # Category was determined by the listing page and travels via meta.
    loader.add_value('category', response.request.meta['category'])
    loader.add_value('city', city)
    # Keep only the query parameters that identify the ad.
    loader.add_value('details_url', url_query_cleaner(response.url, ('najdi', 'id')))
    loader.add_value('published_date', published)
    loader.add_value('id', self.generate_id(response.url, ('najdi', 'id')))
    loader.add_value('content', response.body_as_unicode())
    loader.add_value('source', self.name)
    loader.add_value('source_label', self.label)
    yield loader.load_item()
def parse_job(self, response):
    """Walk a paginated job listing (PagedList-style pager).

    Yields one Request for the next listing page and, per job entry, a
    Request to the detail page carrying a pre-filled JobItem in meta.
    """
    hxs = HtmlXPathSelector(response)

    # Pagination: link right after the currently highlighted page.
    pager = hxs.select("//div[@class='PagedList-pager']//ul/li[contains(@class, 'PagedList-currentPage')]/following-sibling::li[1]/a/@href").extract()
    if pager:
        next_url = self.get_base_url(response, pager[0])
        # Skip self-links so the current page is never re-enqueued.
        if next_url != self.get_base_url(response, response.request.url):
            yield Request(url=next_url, callback=self.parse_job)

    # Category comes from the currently selected option of the search form.
    category = hxs.select("//form[@id='searchForm']//select[@name='wfid']/option[@selected='selected']/text()").extract()

    for entry in hxs.select("//table[@class='job-add-listing']//tr//div[@class='job-add-item-inner']"):
        heading = entry.select("h2/a/text()").extract()
        employer = entry.select("p[3]/strong/text()").extract()
        if not (heading and employer):
            continue
        hrefs = entry.select("h2/a/@href").extract()
        detail_url = self.get_base_url(response, hrefs[0])
        if not detail_url:
            continue
        item = JobItem()
        item['title'] = heading
        item['company'] = employer
        item['category'] = category
        item['summary'] = entry.select("p[2]/text()").extract()
        item['city'] = entry.select("p[1]/text()").extract()
        item['details_url'] = url_query_cleaner(detail_url)
        item['published_date'] = entry.select("div[contains(@class,'city-logo')]/div/text()").extract()
        logos = entry.select("div[contains(@class,'city-logo')]/img/@src").extract()
        if logos:
            item.load_image(self.get_base_url(response, logos[0]))
        yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_job(self, response):
    """Walk a paginated job listing (numbered pager).

    Yields one Request for the next listing page and, per job row, a
    Request to the detail page carrying a pre-filled JobItem in meta.
    """
    hxs = HtmlXPathSelector(response)

    # Pagination: anchor immediately after the active page number.
    next_page = hxs.select("//span[@class='stevilke']/a[contains(@class, 'active')]/following-sibling::a[1]/@href").extract()
    if next_page:
        next_url = self.get_base_url(response, next_page[0])
        # Skip self-links so the current page is never re-enqueued.
        if next_url != self.get_base_url(response, response.request.url):
            yield Request(url=next_url, callback=self.parse_job)

    # Category = labels of the sector checkboxes currently ticked.
    category = hxs.select(
        "//input[@type='checkbox' and contains(@class, 'iskalnik_kriteriji_tip_sektor') and @checked]"
        + "//following-sibling::label[1]/text()"
    ).extract_unquoted()

    for job in hxs.select("//tr[@class='bg_oglas_dm']"):
        name = job.select("td[@class='ena']/div/a/b/text()").extract_unquoted()
        company = job.select("td[@class='dva']/a/text()").extract_unquoted()
        if name and company:
            detail_url = job.select("td[@class='ena']/div/a/@href").extract()
            detail_url = self.get_base_url(response, detail_url[0])
            if detail_url:
                images_url = job.select("td[@class='stiri']//img/@src").extract()
                item = JobItem()
                item['title'] = name
                item['company'] = company
                item['category'] = category
                item['city'] = job.select("td[@class='tri']/a/text()").extract_unquoted()
                item['details_url'] = url_query_cleaner(detail_url)
                # BUGFIX: escape the dots in the date pattern — an
                # unescaped '.' matches any character, so non-date text
                # such as "12x34y2014" would previously slip through.
                item['published_date'] = job.select("td[@class='stiri']//div[2]/text()").re(r"\s+(\d{2}\.\d{2}\.\d{4})\s+")
                if images_url:
                    item.load_image(self.get_base_url(response, images_url[0]))
                yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})