Beispiel #1
0
 def parse_job_detail(self, response):
     """Build a job item from a job detail page.

     Yields a single loaded item when both a title and a company can be
     extracted from ``response``; yields nothing otherwise.
     """
     selector = HtmlXPathSelector(response)

     job_title = selector.select("//div[@id='businessmyaccountjobadpreviewmain']/div/font/text()").extract_unquoted()
     employer = selector.select("//li[@class='propertiesleft' and contains(text(),'Podjetje:')]/following-sibling::li/text()").extract_unquoted()

     # Title and company are both mandatory; stop early if either is missing.
     if not (job_title and employer):
         return

     location = selector.select("//li[@class='propertiesleft' and contains(text(),'Regija in kraj dela:')]/following-sibling::li/text()").extract_unquoted()
     # The category label contains a non-ASCII character, hence the explicit
     # UTF-8 encoding of the XPath expression.
     work_area = selector.select(u"//li[@class='propertiesleft2' and contains(text(),'Področje dela:')]/following-sibling::li/text()".encode('utf-8')).extract_unquoted()
     image_sources = selector.select("//img[@id='mainimage']/@src").extract()

     job = JobItem()
     if image_sources:
         # Only the first matched image is loaded.
         job.load_image(self.get_base_url(response, image_sources[0]))

     item_loader = JobLoader(job)
     item_loader.add_value('title', job_title)
     item_loader.add_value('company', employer)
     item_loader.add_value('category', work_area)
     item_loader.add_value('city', location)
     item_loader.add_value('details_url', url_query_cleaner(response.url))

     # The regex keeps whatever follows the first "<label>:" prefix.
     date_parts = selector.select("//li[@class='dates']/text()").re(r".*:\s+(.*)")
     item_loader.add_value('published_date', date_parts)

     item_loader.add_value('id', self.generate_id(response.url))
     item_loader.add_value('content', response.body_as_unicode())
     item_loader.add_value('source', self.name)
     item_loader.add_value('source_label', self.label)

     yield item_loader.load_item()
Beispiel #2
0
    def parse_job(self, response):
        """Parse a job listing page.

        Yields a ``Request`` for the next listing page when the pager links
        to a URL different from the current one, and one ``Request`` per
        listed job that has a title, a company and a detail link. Each job
        request carries a partially filled ``JobItem`` in ``meta['item']``
        and is handled by ``parse_job_detail``.

        The job category is read from ``response.request.meta['category']``
        and is forwarded unchanged to the next listing page.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='pagination']/li[@class='selected']//following-sibling::li[1]/a/@href").extract()

        if next_page:
            next_page = self.get_base_url(response, next_page[0])
            # Do not re-request the page that was just fetched.
            if next_page != self.get_base_url(response, response.request.url):
                yield Request(url=next_page, callback=self.parse_job, meta={'category': response.request.meta['category']})

        for job in hxs.select("//ul[@id='newJobs']/li"):
            name = job.select("p[@class='jobTitle']/a/text()").extract_unquoted()
            company = job.select("strong/text()").extract_unquoted()

            if not (name and company):
                continue

            detail_links = job.select("p[@class='jobTitle']/a/@href").extract()
            if not detail_links:
                # A title anchor without an href would otherwise raise
                # IndexError and abort the remaining entries of this page.
                continue

            detail_url = self.get_base_url(response, detail_links[0])
            if not detail_url:
                continue

            images_url = job.select("div[@class='jobImgDiv']/img/@src").extract()
            item = JobItem()

            item['title'] = name
            item['company'] = company
            item['category'] = response.request.meta['category']
            item['summary'] = job.select("p[2]/text()").extract_unquoted()
            item['details_url'] = url_query_cleaner(detail_url)
            item['published_date'] = job.select("span[1]/text()").re(r".*:\s(.*)")

            if images_url:
                # Only the first matched image is loaded.
                item.load_image(self.get_base_url(response, images_url[0]))

            yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
Beispiel #3
0
 def parse_job_detail(self, response):
     """Build a job item from a job detail page.

     Yields a single loaded item when both a title and a company can be
     extracted from ``response``; yields nothing otherwise. The category
     is taken from ``response.request.meta['category']``.
     """
     selector = HtmlXPathSelector(response)

     job_title = selector.select("//span[@class='header']/text()").extract_unquoted()
     employer = selector.select("//span[@class='fontblacklarge']/b/text()").extract_unquoted()

     # Title and company are both mandatory; stop early if either is missing.
     if not (job_title and employer):
         return

     location = selector.select("//span[@class='fontblacklarge']/b[2]/text()").extract_unquoted()
     job_category = response.request.meta['category']
     posted_on = selector.select("//span[@class='fontblacklarge']/../../following-sibling::tr[2]/td/b/text()").extract_unquoted()

     job = JobItem()
     image_sources = selector.select("//span[@class='fontblacklarge']/../following-sibling::td[2]/img/@src").extract()
     if image_sources:
         # Only the first matched image is loaded.
         job.load_image(self.get_base_url(response, image_sources[0]))

     # The same pair of query parameters is passed to both the URL cleaner
     # and the id generator.
     kept_params = ('najdi', 'id')

     item_loader = JobLoader(job)
     item_loader.add_value('title', job_title)
     item_loader.add_value('company', employer)
     item_loader.add_value('category', job_category)
     item_loader.add_value('city', location)
     item_loader.add_value('details_url', url_query_cleaner(response.url, kept_params))
     item_loader.add_value('published_date', posted_on)
     item_loader.add_value('id', self.generate_id(response.url, kept_params))
     item_loader.add_value('content', response.body_as_unicode())
     item_loader.add_value('source', self.name)
     item_loader.add_value('source_label', self.label)

     yield item_loader.load_item()
Beispiel #4
0
    def parse_job(self, response):
        """Parse a job listing page.

        Yields a ``Request`` for the next listing page when the pager links
        to a URL different from the current one, and one ``Request`` per
        listed job that has a title, a company and a detail link. Each job
        request carries a partially filled ``JobItem`` in ``meta['item']``
        and is handled by ``parse_job_detail``.

        The category stored on every item is the text of the selected
        option of the search form's ``wfid`` select element.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//div[@class='PagedList-pager']//ul/li[contains(@class, 'PagedList-currentPage')]/following-sibling::li[1]/a/@href").extract()

        if next_page:
            next_page = self.get_base_url(response, next_page[0])

            # Do not re-request the page that was just fetched.
            if next_page != self.get_base_url(response, response.request.url):
                yield Request(url=next_page, callback=self.parse_job)

        category = hxs.select("//form[@id='searchForm']//select[@name='wfid']/option[@selected='selected']/text()").extract()
        informations = hxs.select("//table[@class='job-add-listing']//tr//div[@class='job-add-item-inner']")

        for information in informations:
            name = information.select("h2/a/text()").extract()
            company = information.select("p[3]/strong/text()").extract()

            if not (name and company):
                continue

            detail_links = information.select("h2/a/@href").extract()
            if not detail_links:
                # A title anchor without an href would otherwise raise
                # IndexError and abort the remaining entries of this page.
                continue

            detail_url = self.get_base_url(response, detail_links[0])
            if not detail_url:
                continue

            images_url = information.select("div[contains(@class,'city-logo')]/img/@src").extract()
            item = JobItem()

            item['title'] = name
            item['company'] = company
            item['category'] = category
            item['summary'] = information.select("p[2]/text()").extract()
            item['city'] = information.select("p[1]/text()").extract()
            item['details_url'] = url_query_cleaner(detail_url)
            item['published_date'] = information.select("div[contains(@class,'city-logo')]/div/text()").extract()

            if images_url:
                # Only the first matched image is loaded.
                item.load_image(self.get_base_url(response, images_url[0]))

            yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
Beispiel #5
0
    def parse_job(self, response):
        """Parse a job listing page.

        Yields a ``Request`` for the next listing page when the pager links
        to a URL different from the current one, and one ``Request`` per
        listed job that has a title, a company and a detail link. Each job
        request carries a partially filled ``JobItem`` in ``meta['item']``
        and is handled by ``parse_job_detail``.

        The category stored on every item is the label text following the
        checked ``iskalnik_kriteriji_tip_sektor`` checkbox(es) on the page.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//span[@class='stevilke']/a[contains(@class, 'active')]/following-sibling::a[1]/@href").extract()

        if next_page:
            next_page = self.get_base_url(response, next_page[0])
            # Do not re-request the page that was just fetched.
            if next_page != self.get_base_url(response, response.request.url):
                yield Request(url=next_page, callback=self.parse_job)

        category = hxs.select(
            "//input[@type='checkbox' and contains(@class, 'iskalnik_kriteriji_tip_sektor') and @checked]" +
            "//following-sibling::label[1]/text()"
        ).extract_unquoted()

        for job in hxs.select("//tr[@class='bg_oglas_dm']"):
            name = job.select("td[@class='ena']/div/a/b/text()").extract_unquoted()
            company = job.select("td[@class='dva']/a/text()").extract_unquoted()

            if not (name and company):
                continue

            detail_links = job.select("td[@class='ena']/div/a/@href").extract()
            if not detail_links:
                # A title anchor without an href would otherwise raise
                # IndexError and abort the remaining entries of this page.
                continue

            detail_url = self.get_base_url(response, detail_links[0])
            if not detail_url:
                continue

            images_url = job.select("td[@class='stiri']//img/@src").extract()
            item = JobItem()

            item['title'] = name
            item['company'] = company
            item['category'] = category
            item['city'] = job.select("td[@class='tri']/a/text()").extract_unquoted()
            item['details_url'] = url_query_cleaner(detail_url)
            # NOTE(review): the dots in this pattern are unescaped, so they
            # match any separator character, not only '.' - confirm intent.
            item['published_date'] = job.select("td[@class='stiri']//div[2]/text()").re(r"\s+(\d{2}.\d{2}.\d{4})\s+")

            if images_url:
                # Only the first matched image is loaded.
                item.load_image(self.get_base_url(response, images_url[0]))

            yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})