def parse_job_detail(self, response): hxs = HtmlXPathSelector(response) title = hxs.select("//div[@id='businessmyaccountjobadpreviewmain']/div/font/text()").extract_unquoted() company = hxs.select("//li[@class='propertiesleft' and contains(text(),'Podjetje:')]/following-sibling::li/text()").extract_unquoted() if title and company: city = hxs.select("//li[@class='propertiesleft' and contains(text(),'Regija in kraj dela:')]/following-sibling::li/text()").extract_unquoted() category = hxs.select(u"//li[@class='propertiesleft2' and contains(text(),'Področje dela:')]/following-sibling::li/text()".encode('utf-8')).extract_unquoted() images_url = hxs.select("//img[@id='mainimage']/@src").extract() item=JobItem() if images_url: item.load_image(self.get_base_url(response, images_url[0])) loader = JobLoader(item) loader.add_value('title', title) loader.add_value('company', company) loader.add_value('category', category) loader.add_value('city', city) loader.add_value('details_url', url_query_cleaner(response.url)) loader.add_value('published_date', hxs.select("//li[@class='dates']/text()").re(r".*:\s+(.*)")) loader.add_value('id', self.generate_id(response.url)) loader.add_value('content', response.body_as_unicode()) loader.add_value('source', self.name) loader.add_value('source_label', self.label) yield loader.load_item()
def parse_job_detail(self, response): hxs = HtmlXPathSelector(response) title = hxs.select("//span[@class='header']/text()").extract_unquoted() company = hxs.select("//span[@class='fontblacklarge']/b/text()").extract_unquoted() if title and company: city = hxs.select("//span[@class='fontblacklarge']/b[2]/text()").extract_unquoted() category = response.request.meta['category'] published_date = hxs.select("//span[@class='fontblacklarge']/../../following-sibling::tr[2]/td/b/text()").extract_unquoted() item=JobItem() images_url = hxs.select("//span[@class='fontblacklarge']/../following-sibling::td[2]/img/@src").extract() if images_url: item.load_image(self.get_base_url(response, images_url[0])) loader = JobLoader(item) loader.add_value('title', title) loader.add_value('company', company) loader.add_value('category', category) loader.add_value('city', city) loader.add_value('details_url', url_query_cleaner(response.url, ('najdi', 'id'))) loader.add_value('published_date', published_date) loader.add_value('id', self.generate_id(response.url, ('najdi', 'id'))) loader.add_value('content', response.body_as_unicode()) loader.add_value('source', self.name) loader.add_value('source_label', self.label) yield loader.load_item()
def parse_job(self, response): hxs = HtmlXPathSelector(response) next_page = hxs.select("//ul[@class='pagination']/li[@class='selected']//following-sibling::li[1]/a/@href").extract() if next_page: next_page = self.get_base_url(response, next_page[0]) if next_page != self.get_base_url(response, response.request.url): yield Request(url=next_page, callback=self.parse_job, meta={'category': response.request.meta['category']}) for job in hxs.select("//ul[@id='newJobs']/li"): name = job.select("p[@class='jobTitle']/a/text()").extract_unquoted() company = job.select("strong/text()").extract_unquoted() if name and company: detail_url = job.select("p[@class='jobTitle']/a/@href").extract() detail_url = self.get_base_url(response, detail_url[0]) if detail_url: images_url = job.select("div[@class='jobImgDiv']/img/@src").extract() item = JobItem() item['title'] = name item['company'] = company item['category'] = response.request.meta['category'] item['summary'] = job.select("p[2]/text()").extract_unquoted() item['details_url'] = url_query_cleaner(detail_url) item['published_date'] = job.select("span[1]/text()").re(r".*:\s(.*)") if images_url: item.load_image(self.get_base_url(response, images_url[0])) yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_detail_pages(self, response): hxs = Selector(response) jobs = hxs.xpath('//div[contains(@class, "content")]') items = [] for job in jobs: item = JobItem() item["title"] = job.xpath( '//div[contains(@class, "posting-headline")]/h2/text()' ).extract_first() item["company"] = job.xpath( '//div[contains(@class, "main-footer-text page-centered")]/p/a/text()' ).extract() item["company_url"] = job.xpath( '//div[contains(@class, "main-footer-text page-centered")]/p/a/@href' ).extract() item["body"] = job.xpath( '//div[contains(@class, "section page-centered")]').extract() item["location"] = job.xpath( '//div[contains(@class, "sort-by-time posting-category medium-category-label")]' ).extract_first() item["url"] = response.request.url item["pub_date"] = str('n/a') item["email"] = str('n/a') item["salary"] = str('n/a') item["scrape_date"] = timezone.now() item["job_board"] = "Lever" item["board_url"] = "lever.co" items.append(item) return items
def parse_node(self, response, node):
    item = JobItem()
    item['title'] = node.xpath('title/text()').extract_first()
    item['company'] = 'Remote Python'
    item['body'] = node.xpath('description/text()').extract()
    item['pub_date'] = 'n/a'
    item['url'] = node.xpath('link/text()').extract_first()
    item["scrape_date"] = timezone.now()
    item["job_board"] = "Remote Python"
    item["board_url"] = "www.remotepython.com"
    item["email"] = 'n/a'
    item["salary"] = 'n/a'
    item['location'] = 'n/a'
    return item
def parse_node(self, response, node):
    item = JobItem()
    item['title'] = node.xpath('title/text()').extract_first()
    item['company'] = node.xpath('source/text()').extract_first()
    item['body'] = node.xpath('description/text()').extract()
    item['pub_date'] = node.xpath('pubDate/text()').extract_first()
    item['url'] = node.xpath('link/text()').extract_first()
    item["scrape_date"] = timezone.now()
    item["job_board"] = "Indeed"
    item["board_url"] = "www.indeed.com"
    item["email"] = 'n/a'
    item["salary"] = 'n/a'
    item['location'] = 'n/a'
    return item
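# The parse_node callbacks above are XMLFeedSpider hooks: Scrapy walks the
# feed, selects every element matching itertag, and calls parse_node once per
# node. A minimal sketch of the surrounding spider; the class name and feed
# URL are placeholders, not the project's actual values.
from scrapy.spiders import XMLFeedSpider


class ExampleFeedSpider(XMLFeedSpider):
    name = 'example_feed'
    start_urls = ['https://www.example.com/jobs/rss']  # placeholder feed URL
    iterator = 'iternodes'  # fast, regex-based node iterator (the default)
    itertag = 'item'        # RSS entries live in <item> elements

    def parse_node(self, response, node):
        # Per-node extraction, as in the callbacks above.
        pass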
def parse_job(self, response): hxs = HtmlXPathSelector(response) next_page = hxs.select("//div[@class='PagedList-pager']//ul/li[contains(@class, 'PagedList-currentPage')]/following-sibling::li[1]/a/@href").extract() if next_page: next_page = self.get_base_url(response, next_page[0]) if next_page != self.get_base_url(response, response.request.url): yield Request(url=next_page, callback=self.parse_job) category = hxs.select("//form[@id='searchForm']//select[@name='wfid']/option[@selected='selected']/text()").extract() informations = hxs.select("//table[@class='job-add-listing']//tr//div[@class='job-add-item-inner']") for information in informations: name = information.select("h2/a/text()").extract() company = information.select("p[3]/strong/text()").extract() if name and company: detail_url = information.select("h2/a/@href").extract() detail_url = self.get_base_url(response, detail_url[0]) if detail_url: images_url = information.select("div[contains(@class,'city-logo')]/img/@src").extract() item = JobItem() item['title'] = name item['company'] = company item['category'] = category item['summary'] = information.select("p[2]/text()").extract() item['city'] = information.select("p[1]/text()").extract() item['details_url'] = url_query_cleaner(detail_url) item['published_date'] = information.select("div[contains(@class,'city-logo')]/div/text()").extract() if images_url: item.load_image(self.get_base_url(response, images_url[0])) yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_job(self, response): hxs = HtmlXPathSelector(response) next_page = hxs.select("//span[@class='stevilke']/a[contains(@class, 'active')]/following-sibling::a[1]/@href").extract() if next_page: next_page = self.get_base_url(response, next_page[0]) if next_page != self.get_base_url(response, response.request.url): yield Request(url=next_page, callback=self.parse_job) category = hxs.select( "//input[@type='checkbox' and contains(@class, 'iskalnik_kriteriji_tip_sektor') and @checked]" + "//following-sibling::label[1]/text()" ).extract_unquoted() for job in hxs.select("//tr[@class='bg_oglas_dm']"): name = job.select("td[@class='ena']/div/a/b/text()").extract_unquoted() company = job.select("td[@class='dva']/a/text()").extract_unquoted() if name and company: detail_url = job.select("td[@class='ena']/div/a/@href").extract() detail_url = self.get_base_url(response, detail_url[0]) if detail_url: images_url = job.select("td[@class='stiri']//img/@src").extract() item = JobItem() item['title'] = name item['company'] = company item['category'] = category item['city'] = job.select("td[@class='tri']/a/text()").extract_unquoted() item['details_url'] = url_query_cleaner(detail_url) item['published_date'] = job.select("td[@class='stiri']//div[2]/text()").re(r"\s+(\d{2}.\d{2}.\d{4})\s+") if images_url: item.load_image(self.get_base_url(response, images_url[0])) yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_item(self, response):
    job = JobItem()
    job['title'] = response.xpath('/html/head/title/text()').extract()
    job['body'] = response.xpath('/html/body/blockquote/font').extract()
    job['url'] = response.url
    job['pub_date'] = response.xpath(
        '(/html/body/p)[2]/strong/br/following-sibling::text()').extract()
    job['scrape_date'] = timezone.now()
    job['board_title'] = self.board_title
    job['board_url'] = self.board_url
    job['org_title'] = response.xpath(
        '(/html/body/p)[2]/strong/big/text()').extract()
    job['org_email'] = response.xpath(
        '(/html/body/p)[2]/strong/a/text()').extract()
    yield job
def parse_node(self, response, node):
    item = JobItem()
    item['title'] = node.xpath('title/text()').extract_first()
    item['company'] = 'Django Gigs'
    item['body'] = node.xpath('description/text()').extract()
    item['pub_date'] = node.xpath('pubDate/text()').extract_first()
    item['url'] = node.xpath('link/text()').extract_first()
    item['scrape_date'] = timezone.now()
    item['job_board'] = "Django Gigs"
    item['board_url'] = "www.djangogigs.com"
    item['email'] = 'n/a'
    item['salary'] = 'n/a'
    item['location'] = 'n/a'
    return item
def parse_node(self, response, node):
    item = JobItem()
    # extract_first(default='') keeps the split from raising AttributeError
    # when the node has no <title>; the split drops the parenthesised suffix
    # from the feed title.
    item['title'] = node.xpath('title/text()').extract_first(default='').split('(', 1)[0]
    item['company'] = node.xpath('name/text()').extract_first()
    item['body'] = node.xpath('description/text()').extract()
    item['pub_date'] = node.xpath('pubDate/text()').extract_first()
    item['url'] = node.xpath('link/text()').extract_first()
    item["scrape_date"] = timezone.now()
    item["job_board"] = "Stack Overflow"
    item["board_url"] = "www.stackoverflow.com"
    item["email"] = 'n/a'
    item["salary"] = 'n/a'
    if node.xpath('location/text()'):
        item['location'] = node.xpath('location/text()').extract_first()
    else:
        item['location'] = 'n/a'
    return item
def parse_detail_pages(self, response): hxs = Selector(response) jobs = hxs.xpath('//main[contains(@class, "stacked")]') items = [] for job in jobs: item = JobItem() item["title"] = str('n/a') item["company"] = str('n/a') item["body"] = job.xpath( '//main[contains(@class, "stacked")]').extract() item["location"] = job.xpath( '//p[contains(@class, "meta")]').extract_first() item["url"] = response.request.url item["pub_date"] = str('n/a') item["email"] = str('n/a') item["salary"] = str('n/a') # item["tags"] = job.css('.-tags p a.post-tag::text').extract() item["scrape_date"] = timezone.now() item["job_board"] = "Workable" item["board_url"] = "www.workable.com" items.append(item) return items
def parse_detail_pages(self, response): hxs = Selector(response) jobs = hxs.xpath('//div[contains(@id, "content")]') items = [] for job in jobs: item = JobItem() item["title"] = job.xpath( '//h1[contains(@class, "jobtitle")]/text()').extract_first() item["company"] = str('n/a') item["body"] = job.xpath( '//div[contains(@class, "jobdesciption")]').extract() item["location"] = job.xpath( '//span[contains(@class, "meta-job-location-city")]').extract( ) item["url"] = response.request.url item["pub_date"] = str('n/a') item["email"] = str('n/a') item["salary"] = str('n/a') item["scrape_date"] = timezone.now() item["job_board"] = "Recruiter Box" item["board_url"] = "www.recruiterbox.com" items.append(item) return items
def parse_detail_pages(self, response): hxs = Selector(response) jobs = hxs.xpath('//div[contains(@id, "app_body")]') items = [] for job in jobs: item = JobItem() item["title"] = job.xpath( '//h1[contains(@class, "app-title")]/text()').extract_first() item["company"] = job.xpath( '//span[contains(@class, "company-name")]/text()' ).extract_first() item["body"] = job.xpath( '//div[contains(@id, "content")]').extract() item["location"] = job.xpath( '//div[contains(@class, "location")]').extract_first() item["url"] = response.request.url item["pub_date"] = str('n/a') item["email"] = str('n/a') item["salary"] = str('n/a') item["scrape_date"] = timezone.now() item["job_board"] = "Greenhouse" item["board_url"] = "www.greenhouse.io" items.append(item) return items