Code example #1
    def parse(self, response):
        """Parse a job-search results page and follow each listing.

        For every ``div.job`` element a partially-filled
        ScrapyscrappersItem is built, then a Request for the job's
        detail page is yielded with the item attached in ``meta`` so
        ``parse_item`` can complete it.
        """
        self.logger.debug('in parse')

        # Name the parser explicitly: without it bs4 emits
        # GuessedAtParserWarning and may pick different parsers on
        # different machines, changing the parsed tree.
        soup = bs4.BeautifulSoup(response.body, 'html.parser')
        for soupitem in soup.select('div.job'):
            self.logger.debug('parsing')
            item = ScrapyscrappersItem()
            item['keyword'] = response.meta['keyword']
            item['date_search'] = current_datetime()
            # First anchor carrying an href inside div.tools is the
            # listing's detail URL.
            item['item_url'] = [
                a.attrs.get('href') for a in soupitem.select('div.tools > a')
                if a.attrs.get('href')
            ][0]
            item['title'] = soupitem.select('h2')[0].text.strip()
            try:
                item['company'] = soupitem.h4.text
            except AttributeError:
                # Listing has no <h4> company tag; leave the field unset.
                pass
            # Look each span up once instead of find()-ing twice.
            locality = soupitem.find('span', itemprop="addressLocality")
            if locality:
                item['locality'] = locality.text
            region = soupitem.find('span', itemprop="addressRegion")
            if region:
                item['region'] = region.text
            item['short_description'] = soupitem.find(
                'p', itemprop="description").text
            item['published'] = timeago2datetimestr(
                item['date_search'],
                soupitem.select('span.ago')[0].text)
            # salary / clearance / department / description are filled
            # in later by parse_item from the detail page.

            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'],
                          callback=self.parse_item,
                          meta={'item': item})
Code example #2
    def parse(self, response):
        """Parse a job-search results page and follow each listing.

        Builds one ScrapyscrappersItem per ``div.job`` element and
        yields a Request for the detail page with the item in ``meta``
        for ``parse_item`` to complete.
        """
        self.logger.debug('in parse')

        # Explicit parser avoids bs4's GuessedAtParserWarning and keeps
        # the parsed tree identical across environments.
        soup = bs4.BeautifulSoup(response.body, 'html.parser')
        for soupitem in soup.select('div.job'):
            self.logger.debug('parsing')
            item = ScrapyscrappersItem()
            item['keyword'] = response.meta['keyword']
            item['date_search'] = current_datetime()
            # First anchor with an href inside div.tools is the detail URL.
            item['item_url'] = [
                a.attrs.get('href') for a in soupitem.select('div.tools > a')
                if a.attrs.get('href')
            ][0]
            item['title'] = soupitem.select('h2')[0].text.strip()
            try:
                item['company'] = soupitem.h4.text
            except AttributeError:
                # No <h4> company tag on this listing; leave field unset.
                pass
            # Single lookup per span instead of calling find() twice.
            locality = soupitem.find('span', itemprop="addressLocality")
            if locality:
                item['locality'] = locality.text
            region = soupitem.find('span', itemprop="addressRegion")
            if region:
                item['region'] = region.text
            item['short_description'] = soupitem.find(
                'p', itemprop="description").text
            item['published'] = timeago2datetimestr(
                item['date_search'],
                soupitem.select('span.ago')[0].text)
            # salary / clearance / department / description come from the
            # detail page in parse_item.

            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'],
                          callback=self.parse_item,
                          meta={'item': item})
Code example #3
 def parse(self, response):
     """Parse a results page of ``div#jobResultNew`` entries.

     Builds one ScrapyscrappersItem per entry (title, URL, summary and
     the key/value details table) and yields a Request to the detail
     page with the item attached in ``meta``.
     """
     # Bug fix: logging methods take no ``log_level`` kwarg — the level
     # is the method itself; passing one raises TypeError at runtime.
     self.logger.debug('in parse')
     # Explicit parser keeps bs4 deterministic and silences its
     # GuessedAtParserWarning.
     soup = bs4.BeautifulSoup(response.body, 'html.parser')
     for soupitem in soup.select('div#jobResultNew'):
         item = ScrapyscrappersItem()
         item['keyword'] = response.meta['keyword']
         item['date_search'] = current_datetime()
         # Look the title link up once; it supplies both URL and title.
         title_link = soupitem.select('a.jobTitleLink')[0]
         item['item_url'] = self.base_url + title_link.attrs.get('href')
         item['title'] = title_link.text
         item['short_description'] = soupitem.select('p.summary')[0].text.strip()
         details = table2dict(soupitem, 'table.joaResultsDetailsTable')
         item['company'] = details.get('Agency', '')
         location_region = details.get('Location(s)', '').split(', ')
         item['locality'] = location_region[0]
         try:
             item['region'] = location_region[1]
         except IndexError:
             # Single-part location string: no region component.
             pass
         item['salary'] = details.get('Salary', '')
         item['department'] = details.get('Department', '')
         # 'published' is not available on the results page.
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'], callback=self.parse_item,
                       meta={'item': item})
Code example #4
File: basespider.py  Project: davete/scrapyscrappers
 def init_item(self, response):
     """Return a new ScrapyscrappersItem seeded from the search request.

     The search keyword and location travel on ``response.meta``; the
     search timestamp is taken at call time.
     """
     meta = response.meta
     item = ScrapyscrappersItem()
     item["date_search"] = current_datetime()
     item["keyword"] = meta["keyword"]
     item["location_search"] = meta["location"]
     return item