    def parse(self, response):
        # Run the shared base-class parse first, then scrape this page.
        super(SimplyhiredSpider, self).parse(response)
        # Alternative with a Scrapy selector:
        # for sel in response.xpath('//div[@class="job"]'):
        # for sel in response.css('div.job'):
        #     item = self.init_item(response)
        #     item['keyword'] = response.meta['keyword']
        #     item['date_search'] = current_datetime()
        #     # this works, but returns a different url:
        #     # item['item_url'] = sel.css('a.title::attr(href)').extract()[0]
        #     item['item_url'] = [tool.css('a::attr(href)')[0].extract() for tool in sel.css('div.tools')][0]
        #     item['title'] = sel.css('a.title::text').extract()[0].strip()

        # with bs4
        soup = bs4.BeautifulSoup(response.body, 'html.parser')
        soupitems = soup.select('div.job')
        if len(soupitems) < 1:
            # Record pages that return no results so they can be retried.
            append(self.fail_url_path, 'no data:' + response.url)
            return
        for soupitem in soupitems:
            item = self.init_item(response)
            item['item_url'] = [a.attrs.get('href') for a in soupitem.select('div.tools > a') if a.attrs.get('href')][0]
            item['title'] = soupitem.select('h2')[0].text.strip()
            try:
                item['company'] = soupitem.h4.text
            except AttributeError:
                # Some listings have no h4/company tag.
                pass
            if soupitem.find('span', itemprop="addressLocality"):
                item['locality'] = soupitem.find('span', itemprop="addressLocality").text
            if soupitem.find('span', itemprop="addressRegion"):
                item['region'] = soupitem.find('span', itemprop="addressRegion").text
            item['short_description'] = soupitem.find('p', itemprop="description").text
            item['published'] = timeago2datetimestr(item['date_search'], soupitem.select('span.ago')[0].text)
            # data not available on this website
            item['salary'] = ''
            item['clearance'] = ''
            item['department'] = ''
            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})

        # for url in response.xpath('//link[@rel="next"]/@href').extract():
        next_page = response.css('a.next::attr(href)').extract()
        if next_page:
            self.logger.debug('next url: %s' % next_page[0])
            yield Request(
                next_page[0],
                callback=self.parse,
                meta={'keyword': response.meta['keyword'], 'location': response.meta['location']}
            )
        else:
            self.logger.debug('no next url')
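
# The examples above and below share a few helpers that this listing never
# shows (init_item, append, current_datetime) and the ScrapyscrappersItem
# fields. A minimal, hypothetical sketch of those pieces, inferred from
# their call sites; the project's real definitions may differ:
import datetime

import scrapy


class ScrapyscrappersItem(scrapy.Item):
    # Fields inferred from the assignments in the examples.
    keyword = scrapy.Field()
    date_search = scrapy.Field()
    item_url = scrapy.Field()
    title = scrapy.Field()
    company = scrapy.Field()
    locality = scrapy.Field()
    region = scrapy.Field()
    short_description = scrapy.Field()
    published = scrapy.Field()
    salary = scrapy.Field()
    clearance = scrapy.Field()
    department = scrapy.Field()


def current_datetime():
    # Timestamp string stored in date_search.
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def append(path, line):
    # Append one line to a log file (used to record failed URLs).
    with open(path, 'a') as f:
        f.write(line + '\n')


class BaseJobSpider(scrapy.Spider):
    fail_url_path = 'failed_urls.txt'

    def parse(self, response):
        # Subclasses call super().parse(response) first; assume it only
        # logs the page being parsed.
        self.logger.debug('parsing %s', response.url)

    def init_item(self, response):
        # Pre-fill the fields every spider sets identically.
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        return item
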
# Example #2
    def parse(self, response):
        super(GlassdoorSpider, self).parse(response)
        # listing = response.css('.standardJobListings')
        listing = response.css('.jlGrid')
        rows = listing.xpath('.//div[@itemtype="http://schema.org/JobPosting"]')
        if len(rows) < 1:
            append(self.fail_url_path, 'no data:' + response.url)
            return
        for row in rows:
            item = self.init_item(response)
            url = row.xpath('.//h3[@itemprop="title"]/a/@href').extract()[0]
            item['item_url'] = self.base_url + url
            # string() flattens the <a>, which can have a <span> inside
            item['title'] = row.xpath('string(.//h3[@itemprop="title"]/a)').extract()[0].strip()
            item['company'] = row.xpath('string(.//span[@class="employerName"])').extract()[0].strip()
            published = row.css('.logo').css('.minor::text').extract()[0].strip()
            item['published'] = timeago2datetimestr(item['date_search'], published)
            item['short_description'] = row.xpath('string(.//p[@itemprop="description"])').extract()[0].strip()
            loc = row.xpath('string(.//span[@itemprop="addressLocality"])').extract()[0].strip()
            try:
                item['locality'], item['region'] = loc.split(', ')
            except ValueError:
                # No comma: the whole string is the locality.
                item['locality'] = loc
                item['region'] = ''
            # data not available on this website
            item['salary'] = ''
            item['clearance'] = ''
            item['department'] = ''
            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})
        next_page = response.css('.next a::attr(href)').extract()
        if next_page:
            self.logger.debug('next url: %s' % (self.base_url + next_page[0]))
            yield Request(
                self.base_url + next_page[0],
                callback=self.parse,
                meta={'keyword': response.meta['keyword'], 'location': response.meta['location']}
            )
        else:
            self.logger.debug('no next url')
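
# The string(...) XPath wrapper above is what lets title/company survive
# markup nested inside the element. A quick self-contained illustration:
from scrapy.selector import Selector

sel = Selector(text='<h3 itemprop="title"><a>Senior <span>Python</span> Dev</a></h3>')
# text() returns only the direct text nodes of the <a>:
sel.xpath('.//h3[@itemprop="title"]/a/text()').extract()   # ['Senior ', ' Dev']
# string() concatenates all descendant text into one value:
sel.xpath('string(.//h3[@itemprop="title"]/a)').extract()  # ['Senior Python Dev']
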
    def parse(self, response):
        self.logger.debug('in parse')
        #        # for sel in response.xpath('//div[@class="job"]'):
        #        for sel in response.css('div.job'):
        #            self.logger.debug('parsing')
        #            item = ScrapyscrappersItem()
        #            item['keyword'] = response.meta['keyword']
        #            item['date_search'] = current_datetime()
        #            # item['item_url'] = sel.xpath('.//div[@class="tools"]/@href').extract()[0]
        #            #item['item_url'] = sel.css('a.title::attr(href)').extract()[0]
        #            item['item_url'] =[tool.css('a::attr(href)')[0].extract() for tool in sel.css('div.tools')][0]
        #            # item['title'] = sel.xpath('.//h2')
        #            item['title'] = sel.css('a.title::text').extract()[0].strip()

        soup = bs4.BeautifulSoup(response.body, 'html.parser')
        soupitems = soup.select('div.job')
        for soupitem in soupitems:
            self.logger.debug('parsing')
            item = ScrapyscrappersItem()
            item['keyword'] = response.meta['keyword']
            item['date_search'] = current_datetime()
            item['item_url'] = [
                a.attrs.get('href') for a in soupitem.select('div.tools > a')
                if a.attrs.get('href')
            ][0]
            item['title'] = soupitem.select('h2')[0].text.strip()
            try:
                item['company'] = soupitem.h4.text
            except AttributeError:
                # Some listings have no h4/company tag.
                pass
            if soupitem.find('span', itemprop="addressLocality"):
                item['locality'] = soupitem.find(
                    'span', itemprop="addressLocality").text
            if soupitem.find('span', itemprop="addressRegion"):
                item['region'] = soupitem.find('span',
                                               itemprop="addressRegion").text
            item['short_description'] = soupitem.find(
                'p', itemprop="description").text
            item['published'] = timeago2datetimestr(
                item['date_search'],
                soupitem.select('span.ago')[0].text)
            # salary, clearance, department, description: not set for
            # this site

            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'],
                          callback=self.parse_item,
                          meta={'item': item})
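
# timeago2datetimestr is never defined in these snippets; judging from its
# call sites it converts a relative time such as '5 days ago' into an
# absolute datetime string counted back from date_search. A minimal sketch
# under that assumption (the date format is a guess):
import datetime
import re


def timeago2datetimestr(date_search, ago_text):
    base = datetime.datetime.strptime(date_search, '%Y-%m-%d %H:%M:%S')
    # Parse the relative part, e.g. '5 days ago' -> (5, 'day').
    match = re.search(r'(\d+)\s*(minute|hour|day|week|month)', ago_text)
    if not match:
        # 'Today', 'Just posted', etc.: keep the search timestamp.
        return date_search
    count, unit = int(match.group(1)), match.group(2)
    delta = {
        'minute': datetime.timedelta(minutes=count),
        'hour': datetime.timedelta(hours=count),
        'day': datetime.timedelta(days=count),
        'week': datetime.timedelta(weeks=count),
        'month': datetime.timedelta(days=30 * count),  # rough approximation
    }[unit]
    return (base - delta).strftime('%Y-%m-%d %H:%M:%S')
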
# Example #4
    def parse(self, response):
        super(CareerbuilderSpider, self).parse(response)
        rows = response.css('.gs-job-result-abstract')
        if len(rows) < 1:
            append(self.fail_url_path, 'no data:' + response.url)
            return
        for row in rows:
            item = self.init_item(response)
            item['item_url'] = row.css('.jt').xpath('@href').extract()[0]
            item['title'] = row.css('.jt::text').extract()[0]
            # item['short_description'] = row.css('span[itemprop="description"]::text').extract()[0]
            # the same with xpath:
            item['short_description'] = row.xpath('.//span[@itemprop="description"]/text()').extract()[0]
            item['company'] = row.xpath('.//a/@companyname').extract()[0]
            # or:
            # try:
            #     item['company'] = row.xpath('.//td[@itemprop="hiringOrganization"]/a/text()').extract()[0]
            # except IndexError:
            #     self.logger.debug(row.xpath('.//td[@itemprop="hiringOrganization"]/a/text()').extract())
            location = row.xpath('.//div[@itemprop="jobLocation"]/span/text()').extract()[0]
            try:
                item['region'], item['locality'] = location.split(' - ')
            except ValueError:
                # No ' - ' separator: the whole string is the locality.
                item['locality'] = location
                item['region'] = ''
            teaser = [i.strip() for i in row.xpath('.//div[contains(@id, "pnlTeaser")]/p/text()').extract()[0].split('|')]
            if len(teaser) == 2:
                item['salary'] = teaser[1].split(':')[1]
            else:
                item['salary'] = ''
            item['department'] = ''
            ago = row.css('.jl_rslt_posted_cell span::text').extract()[0]
            item['published'] = timeago2datetimestr(item['date_search'], ago)
            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})
        next_page = response.css('.JL_MXDLPagination2_next').xpath('@href').extract()
        if next_page:
            self.logger.debug('next url: %s' % next_page[0])
            yield Request(
                next_page[0],
                callback=self.parse,
                meta={'keyword': response.meta['keyword'], 'location': response.meta['location']}
            )
        else:
            self.logger.debug('no next url')
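
# Every parse() above reads keyword and location from response.meta, so the
# initial requests must seed them. A hypothetical start_requests showing one
# way to do that; the class, URL and query parameters are illustrative
# (BaseJobSpider is the sketch earlier), not the project's real code:
from urllib.parse import urlencode

import scrapy


class ExampleJobSpider(BaseJobSpider):
    name = 'example_jobs'
    base_url = 'https://www.example-jobs.com'  # assumed

    def start_requests(self):
        keyword, location = 'python developer', 'Boston, MA'
        url = self.base_url + '/search?' + urlencode({'q': keyword, 'l': location})
        # Seed the meta that parse() and every pagination request rely on.
        yield scrapy.Request(url,
                             callback=self.parse,
                             meta={'keyword': keyword, 'location': location})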