def parse(self, response):
    """Parse a job-search results page and yield a Request per listing.

    For every ``div.job`` element, builds a partially-filled
    ScrapyscrappersItem (keyword, search date, listing URL, title, and
    whatever optional fields the markup provides) and follows the listing
    URL to ``parse_item`` with the item attached in ``meta``.
    """
    self.logger.debug('in parse')
    # Explicit parser avoids bs4's "no parser was explicitly specified"
    # warning and keeps parse results consistent across environments.
    soup = bs4.BeautifulSoup(response.body, 'html.parser')
    for soupitem in soup.select('div.job'):
        self.logger.debug('parsing')
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        # First anchor with an href inside div.tools is the listing URL.
        hrefs = [a.attrs.get('href')
                 for a in soupitem.select('div.tools > a')
                 if a.attrs.get('href')]
        if not hrefs:
            # Nothing to follow; skip instead of raising IndexError.
            self.logger.debug('skipping job block with no tools link')
            continue
        item['item_url'] = hrefs[0]
        item['title'] = soupitem.select('h2')[0].text.strip()
        try:
            item['company'] = soupitem.h4.text
        except AttributeError:
            # Some listings omit the company <h4>; leave the field unset.
            pass
        locality = soupitem.find('span', itemprop='addressLocality')
        if locality:
            item['locality'] = locality.text
        region = soupitem.find('span', itemprop='addressRegion')
        if region:
            item['region'] = region.text
        description = soupitem.find('p', itemprop='description')
        if description is not None:
            # Original dereferenced .text unconditionally and crashed with
            # AttributeError when the description paragraph was missing.
            item['short_description'] = description.text
        item['published'] = timeago2datetimestr(
            item['date_search'], soupitem.select('span.ago')[0].text)
        # TODO: salary, clearance, department, full description.
        # Lazy %-style args: formatting is skipped when DEBUG is disabled.
        self.logger.debug('title %s', item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
def parse(self, response):
    """Parse a job-search results page and yield a Request per listing.

    Builds a ScrapyscrappersItem from each ``div.job`` element and follows
    the listing URL to ``parse_item``, passing the item along in ``meta``.
    """
    self.logger.debug('in parse')
    # Name the parser explicitly: silences bs4's warning and makes the
    # markup interpretation deterministic across installs.
    soup = bs4.BeautifulSoup(response.body, 'html.parser')
    for soupitem in soup.select('div.job'):
        self.logger.debug('parsing')
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        # Listing URL = first href-carrying anchor under div.tools.
        links = [a.attrs.get('href')
                 for a in soupitem.select('div.tools > a')
                 if a.attrs.get('href')]
        if not links:
            # No URL means no detail page to request; skip the block
            # rather than raising IndexError on [0].
            self.logger.debug('skipping job block with no tools link')
            continue
        item['item_url'] = links[0]
        item['title'] = soupitem.select('h2')[0].text.strip()
        try:
            item['company'] = soupitem.h4.text
        except AttributeError:
            # Company tag is optional in this markup.
            pass
        locality = soupitem.find('span', itemprop='addressLocality')
        if locality:
            item['locality'] = locality.text
        region = soupitem.find('span', itemprop='addressRegion')
        if region:
            item['region'] = region.text
        description = soupitem.find('p', itemprop='description')
        if description is not None:
            # Guard added: the original crashed with AttributeError when
            # the itemprop="description" paragraph was absent.
            item['short_description'] = description.text
        item['published'] = timeago2datetimestr(
            item['date_search'], soupitem.select('span.ago')[0].text)
        # TODO: salary, clearance, department, full description.
        # Lazy log args avoid formatting when DEBUG is off.
        self.logger.debug('title %s', item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
def parse(self, response):
    """Parse a USAJobs-style results page and yield a Request per listing.

    Each ``div#jobResultNew`` block contributes a ScrapyscrappersItem with
    the search keyword/date, listing URL and title, a summary, and fields
    pulled from the details table (agency, location, salary, department).
    """
    # BUG FIX: Logger.debug() takes no ``log_level`` kwarg — the original
    # ``self.logger.debug('in parse', log_level=log.DEBUG)`` raised
    # TypeError on the stdlib logger (and scrapy.log is deprecated).
    self.logger.debug('in parse')
    # Explicit parser avoids bs4's "no parser specified" warning.
    soup = bs4.BeautifulSoup(response.body, 'html.parser')
    for soupitem in soup.select('div#jobResultNew'):
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        # Title anchor carries both the link target and the display text;
        # look it up once instead of twice.
        title_link = soupitem.select('a.jobTitleLink')[0]
        item['item_url'] = self.base_url + title_link.attrs.get('href')
        item['title'] = title_link.text
        item['short_description'] = soupitem.select('p.summary')[0].text.strip()
        details = table2dict(soupitem, 'table.joaResultsDetailsTable')
        item['company'] = details.get('Agency', '')
        # "Location(s)" looks like "City, Region"; the region part is
        # optional, so test the length instead of catching IndexError.
        location_region = details.get('Location(s)', '').split(', ')
        item['locality'] = location_region[0]
        if len(location_region) > 1:
            item['region'] = location_region[1]
        item['salary'] = details.get('Salary', '')
        item['department'] = details.get('Department', '')
        # 'published' is not available on this site's results page.
        # Lazy %-style args; also dropped the invalid log_level kwarg here.
        self.logger.debug('title %s', item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
def init_item(self, response):
    """Return a fresh item pre-populated with the search context carried
    in *response*'s meta (keyword, location) plus the current timestamp."""
    meta = response.meta
    fresh = ScrapyscrappersItem()
    fresh["keyword"] = meta["keyword"]
    fresh["location_search"] = meta["location"]
    fresh["date_search"] = current_datetime()
    return fresh