def parse_item(self, response):
    """Attach the full job description (the .cj-job-details div) to the item."""
    item = super(ClearancejobsSpider, self).parse_item(response)
    details = response.css('.cj-job-details').extract()
    if details:
        item['description'] = details[0]
    else:
        # record the URL so the failed page can be re-crawled later
        append(self.fail_url_path, 'failed to parse:' + response.url)
    yield item
def parse_item(self, response):
    """Attach the full job description (div.detail) to the item."""
    item = super(SimplyhiredSpider, self).parse_item(response)
    detail_nodes = response.css('div.detail')
    if detail_nodes:
        item['description'] = detail_nodes[0].extract()
    else:
        # record the URL so the failed page can be re-crawled later
        append(self.fail_url_path, 'failed to parse:' + response.url)
    yield item
def parse_item(self, response):
    """Fill in description and salary from the ClearedJobs detail page."""
    item = super(ClearedjobsSpider, self).parse_item(response)
    salary_xpath = './/div//div[contains(.,"Salary:")]/following-sibling::div[1]/text()'
    try:
        # either lookup may come back empty; one failure log covers both
        item['description'] = response.css('.view_long').extract()[0]
        raw_salary = response.css('.view_job').xpath(salary_xpath).extract()[0]
        item['salary'] = raw_salary.strip()
    except IndexError:
        append(self.fail_url_path, 'failed to parse:' + response.url)
    yield item
def parse_item(self, response):
    """Scrape the long description and the salary row of the job view page."""
    item = super(ClearedjobsSpider, self).parse_item(response)
    try:
        item['description'] = response.css('.view_long').extract()[0]
        salary_texts = response.css('.view_job').xpath(
            './/div//div[contains(.,"Salary:")]/following-sibling::div[1]/text()'
        ).extract()
        item['salary'] = salary_texts[0].strip()
    except IndexError:
        # either field missing -> log once and yield what we have
        append(self.fail_url_path, 'failed to parse:' + response.url)
    yield item
def handle_response(self, response):
    """Log the crawl outcome of a URL to the ok/fail log files.

    HTTP 200 responses are appended to the OK log; everything else goes
    to the fail log prefixed with its status code.

    Fix: ``except Exception, e`` is Python-2-only syntax; replaced with
    ``except Exception as e`` which is valid on Python 2.6+ and Python 3.
    """
    self.logger.debug("in handle response")
    try:
        if response.status == 200:
            append(settings.get("LOG_OK_URL_FULLPATH"), response.url)
        else:
            self.logger.debug("error http code: %s", response.status)
            append(settings.get("LOG_FAIL_URL_FULLPATH"),
                   str(response.status) + ": " + response.url)
    except Exception as e:  # broad on purpose: logging must never kill the spider
        self.logger.exception(e)
def parse(self, response):
    """Parse a ClearedConnections search-results page.

    Yields one Request per job row (handled by parse_item) and, when a
    "Next" link is present, a Request for the following results page.
    """
    super(ClearedconnectionsSpider, self).parse(response)
    table = response.xpath('//table[@class="jstext"]')
    # first three rows are header/filler rows — skip them
    rows = table.xpath('.//tr')[3:]
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        cols = row.xpath('.//td')
        col_url = cols[1]
        # the detail URL only exists inside the link's onclick JS handler
        onclick = col_url.xpath('.//a/@onclick').extract()[0]
        m = re.search("PopupViewWindow\('(.+?)'", onclick)
        if m:
            url = m.group(1)
            item['item_url'] = 'https://www.clearedconnections.com/JobSeekerX/' + url
        item['title'] = col_url.xpath('.//a/text()').extract()[0]
        col_published = cols[0]
        published = col_published.xpath('.//p/text()').extract()[0].strip()
        item['published'] = datetime2datetimestr(
            datetime.strptime(published, '%m/%d/%Y'))
        col_company = cols[3]
        try:
            item['company'] = col_company.xpath(
                './/a/text()').extract()[0].strip()
        except IndexError:
            # some rows show the company as plain text, not a link
            item['company'] = col_company.xpath(
                './/p/text()').extract()[0].strip()
        col_loc = cols[2]
        loc = col_loc.xpath('./text()').extract()[0]
        try:
            # assumes "locality-region-..." format — TODO confirm on live data
            item['locality'], item['region'], _ = loc.split('-')
        except ValueError:
            item['locality'] = loc
        # data not available in this website
        item['short_description'] = ''
        item['salary'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next = response.css('.pagination').xpath(
        './/a[contains(.,"Next")]/@href').extract()
    if next:
        self.logger.debug('next url: %s' % self.base_url + next[0])
        # carry the search terms forward so init_item can re-tag new items
        yield Request(self.base_url + next[0], callback=self.parse,
                      meta={
                          'keyword': response.meta['keyword'],
                          'location': response.meta['location']
                      })
    else:
        self.logger.debug('no next url')
def parse_item(self, response):
    """Add description and security-clearance info from the job view page."""
    item = super(ClearedconnectionsSpider, self).parse_item(response)
    descriptions = response.xpath('//div[@class="viewjob"]').extract()
    if not descriptions:
        append(self.fail_url_path, 'failed to parse:' + response.url)
    else:
        item['description'] = descriptions[0]
        # the label/value table carries the clearance requirement
        details = tablexpath2dict(
            response.xpath('//table[@class="LabelValueTable"]'))
        item['clearance'] = details.get('Security Clearances', '')
    yield item
def parse_item(self, response):
    """Scrape description and clearance from a USAJobs detail page."""
    item = super(UsajobsSpider, self).parse_item(response)
    soup = bs4.BeautifulSoup(response.body)
    detail_nodes = response.css('div.jobdetail')
    if not detail_nodes:
        append(self.fail_url_path, 'failed to parse:' + response.url)
    else:
        item['description'] = detail_nodes[0].extract()
        # the #jobinfo2 table holds key/value job attributes
        infodict = table2dict(soup, 'div#jobinfo2')
        item['clearance'] = infodict.get('SECURITY CLEARANCE')
    yield item
def parse(self, response):
    """Parse a SimplyHired search-results page (via BeautifulSoup).

    Yields one Request per 'div.job' result (handled by parse_item) and a
    Request for the next results page when an 'a.next' link exists.
    """
    super(SimplyhiredSpider, self).parse(response)
    # parsing with bs4 rather than scrapy selectors; the 'tools' link gives
    # a more direct URL than 'a.title'
    soup = bs4.BeautifulSoup(response.body)
    soupitems = soup.select('div.job')
    if len(soupitems) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for soupitem in soupitems:
        item = self.init_item(response)
        item['item_url'] = [a.attrs.get('href')
                            for a in soupitem.select('div.tools > a')
                            if a.attrs.get('href')][0]
        item['title'] = soupitem.select('h2')[0].text.strip()
        try:
            item['company'] = soupitem.h4.text
        except AttributeError:
            # item has no h4 tag — company stays unset
            pass
        if soupitem.find('span', itemprop="addressLocality"):
            item['locality'] = soupitem.find('span', itemprop="addressLocality").text
        if soupitem.find('span', itemprop="addressRegion"):
            item['region'] = soupitem.find('span', itemprop="addressRegion").text
        item['short_description'] = soupitem.find('p', itemprop="description").text
        # convert the relative "x days ago" text into an absolute timestamp
        item['published'] = timeago2datetimestr(item['date_search'],
                                                soupitem.select('span.ago')[0].text)
        # data not available in this website
        item['salary'] = ''
        item['clearance'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next = response.css('a.next::attr(href)').extract()
    if next:
        self.logger.debug('next url: %s' % next[0])
        # next[0] is already an absolute URL on this site
        yield Request(next[0],
                      callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
def parse(self, response):
    """Parse a ClearedJobs search-results page.

    Yields one Request per job row (handled by parse_item) and a Request
    for the next page when a ">" pagination link is present.

    Bug fixed: the scraped ``published`` text was computed but ignored —
    every item stored a hard-coded date parsed from u'July 8, 2015'.
    The scraped value is now parsed instead.
    """
    super(ClearedjobsSpider, self).parse(response)
    table = response.xpath('//table[@class="search_res"]//tbody')
    rows = table.xpath('.//tr')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        cols = row.xpath('.//td')
        col0 = cols[0]
        url = col0.xpath('.//a[@class="search"]/@href').extract()[0]
        item['item_url'] = self.base_url + "/" + url
        # a text could have span tag inside, so take string() of the node
        item['title'] = col0.xpath('string(.//a[@class="search"])').extract()[0].strip()
        item['company'] = col0.xpath('.//div[@class="info"]//a/text()').extract()[0]
        try:
            item['clearance'] = col0.xpath('.//div[@class="desc"]/text()').extract()[0]
        except IndexError:
            # clearance not listed for this row — leave the item default
            pass
        published = col0.xpath('.//div[@class=""]/text()').extract()[0].replace('Posted - ', '')
        # site format assumed like "July 8, 2015" — matches the old literal
        item['published'] = datetime2datetimestr(
            datetime.strptime(published, '%B %d, %Y'))
        col1 = cols[1]
        loc = col1.xpath('./text()').extract()[0].strip()
        try:
            item['locality'], item['region'] = loc.split(', ')
        except ValueError:
            item['locality'] = loc
            item['region'] = ''
        # data not available in this website
        item['short_description'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next_href = response.css('.navbar_bottom').xpath('.//a[text()=">"]/@href').extract()
    if next_href:
        # hrefs are sometimes root-relative, sometimes missing the slash
        sep = '' if next_href[0].startswith('/') else '/'
        next_url = self.base_url + sep + next_href[0]
        self.logger.debug('next url: %s' % next_url)
        yield Request(next_url,
                      callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
def parse_item(self, response):
    """Populate description and clearance from the viewjob page."""
    item = super(ClearedconnectionsSpider, self).parse_item(response)
    try:
        item['description'] = response.xpath('//div[@class="viewjob"]').extract()[0]
    except IndexError:
        append(self.fail_url_path, 'failed to parse:' + response.url)
    else:
        # clearance lives in the page's label/value table
        label_table = response.xpath('//table[@class="LabelValueTable"]')
        item['clearance'] = tablexpath2dict(label_table).get(
            'Security Clearances', '')
    yield item
def parse(self, response): super(GlassdoorSpider, self).parse(response) # list = response.css('.standardJobListings') list = response.css('.jlGrid') rows = list.xpath('.//div[@itemtype="http://schema.org/JobPosting"]') if len(rows) < 1: append(self.fail_url_path, 'no data:' + response.url) return for row in rows: item = self.init_item(response) url = row.xpath('.//h3[@itemprop="title"]/a/@href').extract()[0] item['item_url'] = self.base_url + url # a text could have span tag inside item['title'] = row.xpath( 'string(.//h3[@itemprop="title"]/a)').extract()[0].strip() item['company'] = row.xpath( 'string(.//span[@class="employerName"])').extract()[0].strip() published = row.css('.logo').css( '.minor::text').extract()[0].strip() item['published'] = timeago2datetimestr(item['date_search'], published) item['short_description'] = row.xpath( 'string(.//p[@itemprop="description"])').extract()[0].strip() loc = row.xpath('string(.//span[@itemprop="addressLocality"])' ).extract()[0].strip() try: item['locality'], item['region'] = loc.split(', ') except ValueError: item['locality'] = loc item['region'] = '' # data not available in this website item['salary'] = '' item['clearance'] = '' item['department'] = '' self.logger.debug('title %s' % item['title']) yield Request(item['item_url'], callback=self.parse_item, meta={'item': item}) next = response.css('.next a::attr(href)').extract() if next: self.logger.debug('next url: %s' % self.base_url + next[0]) yield Request( # self.base_url + next[0]['href'], # with soup self.base_url + next[0], callback=self.parse, meta={ 'keyword': response.meta['keyword'], 'location': response.meta['location'] }) else: self.logger.debug('no next url')
def parse(self, response):
    """Parse a ClearanceJobs search-results page.

    Yields one Request per result card (handled by parse_item); pagination
    is JavaScript-driven, so the next-page lookup usually finds nothing.
    """
    super(ClearancejobsSpider, self).parse(response)
    results = response.css('#search-results')  # div
    rows = results.css('.cj-search-result-item')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        item['title'] = row.xpath('.//strong[@class="cj-search-result-item-title"]/a/text()').extract()[0].strip()
        item['item_url'] = row.xpath('.//strong[@class="cj-search-result-item-title"]/a/@href').extract()[0]
        try:
            item['company'] = row.xpath('.//span[@class="cj-company-name"]/a/text()').extract()[0].strip()
        except IndexError:
            item['company'] = ''
        # second text node of the primary-info block holds the location
        location = row.css('.cj-job-primary-info::text').extract()[1].strip()
        try:
            item['locality'], item['region'] = location.split(', ')
        except ValueError:
            item['locality'] = location
            item['region'] = ''
        # attributes not provided by this site
        item['short_description'] = ''
        item['salary'] = ''
        item['department'] = ''
        updated = row.xpath('.//span[@class="cj-text-sm cj-color-mediumgray"]//text()').extract()[1].strip()
        try:
            dt = datetime.strptime(updated, '%m/%d/%y')
        # FIXME: check if exception not only cause of Today string but other string like yesterday
        except ValueError:
            dt = datetime.now()
        item['published'] = datetime2datetimestr(dt)
        item['clearance'] = row.css('.cj-card-data::text').extract()[1]
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    # FIXME: no <a> link — pagination is javascript, so this rarely matches
    next = response.xpath('button[@class="cj-table-pagination-next"]/a/@href').extract()
    if next:
        self.logger.debug('next url: %s' % next[0])
        yield Request(next[0],
                      callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
def parse_item(self, response):
    """Attach the full job description (the <article> element) to the item."""
    item = super(CareerbuilderSpider, self).parse_item(response)
    articles = response.xpath('//article').extract()
    if articles:
        item['description'] = articles[0]
    else:
        append(self.fail_url_path, 'failed to parse:' + response.url)
    # clearance attribute is not provided by this site
    yield item
def parse(self, response):
    """Parse a ClearedConnections results table (duplicate listing variant).

    Yields a Request per job row for parse_item, then follows the "Next"
    pagination link if present.
    """
    super(ClearedconnectionsSpider, self).parse(response)
    table = response.xpath('//table[@class="jstext"]')
    # skip the three header/filler rows at the top of the table
    rows = table.xpath('.//tr')[3:]
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        cols = row.xpath('.//td')
        col_url = cols[1]
        # job URL is embedded in the onclick JS popup call
        onclick = col_url.xpath('.//a/@onclick').extract()[0]
        m = re.search("PopupViewWindow\('(.+?)'", onclick)
        if m:
            url = m.group(1)
            item['item_url'] = 'https://www.clearedconnections.com/JobSeekerX/' + url
        item['title'] = col_url.xpath('.//a/text()').extract()[0]
        col_published = cols[0]
        published = col_published.xpath('.//p/text()').extract()[0].strip()
        item['published'] = datetime2datetimestr(datetime.strptime(published, '%m/%d/%Y'))
        col_company = cols[3]
        try:
            item['company'] = col_company.xpath('.//a/text()').extract()[0].strip()
        except IndexError:
            # some rows show the company as plain text, not a link
            item['company'] = col_company.xpath('.//p/text()').extract()[0].strip()
        col_loc = cols[2]
        loc = col_loc.xpath('./text()').extract()[0]
        try:
            # assumes "locality-region-..." — TODO confirm on live data
            item['locality'], item['region'], _ = loc.split('-')
        except ValueError:
            item['locality'] = loc
        # data not available in this website
        item['short_description'] = ''
        item['salary'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next = response.css('.pagination').xpath('.//a[contains(.,"Next")]/@href').extract()
    if next:
        self.logger.debug('next url: %s' % self.base_url + next[0])
        yield Request(
            self.base_url + next[0],
            callback=self.parse,
            meta={'keyword': response.meta['keyword'],
                  'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
def parse(self, response):
    """Parse a CareerBuilder search-results page.

    Yields one Request per abstract row (handled by parse_item) plus a
    Request for the next page when the pagination link exists.
    """
    super(CareerbuilderSpider, self).parse(response)
    rows = response.css('.gs-job-result-abstract')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        item['item_url'] = row.css('.jt').xpath('@href').extract()[0]
        item['title'] = row.css('.jt::text').extract()[0]
        item['short_description'] = row.xpath('.//span[@itemprop="description"]/text()').extract()[0]
        # the anchor carries the company name in a custom attribute
        item['company'] = row.xpath('.//a/@companyname').extract()[0]
        location = row.xpath('.//div[@itemprop="jobLocation"]/span/text()').extract()[0]
        try:
            # location format assumed "REGION - locality" — TODO confirm
            item['region'], item['locality'] = location.split(' - ')
        except ValueError:
            item['locality'] = location
        # teaser holds "pay type | salary" when a salary is listed
        teaser = [i.strip() for i in row.xpath('.//div[contains(@id, "pnlTeaser")]/p/text()').extract()[0].split('|')]
        if len(teaser) == 2:
            item['salary'] = teaser[1].split(':')[1]
        else:
            item['salary'] = ''
        item['department'] = ''
        ago = row.css('.jl_rslt_posted_cell span::text').extract()[0]
        # convert relative "x days ago" text into an absolute timestamp
        item['published'] = timeago2datetimestr(item['date_search'], ago)
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next = response.css('.JL_MXDLPagination2_next').xpath('@href').extract()
    if next:
        self.logger.debug('next url: %s' % next[0])
        # pagination hrefs are already absolute on this site
        yield Request(next[0],
                      callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
def parse(self, response): super(GlassdoorSpider, self).parse(response) # list = response.css('.standardJobListings') list = response.css('.jlGrid') rows = list.xpath('.//div[@itemtype="http://schema.org/JobPosting"]') if len(rows) < 1: append(self.fail_url_path, 'no data:' + response.url) return for row in rows: item = self.init_item(response) url = row.xpath('.//h3[@itemprop="title"]/a/@href').extract()[0] item['item_url'] = self.base_url + url # a text could have span tag inside item['title'] = row.xpath('string(.//h3[@itemprop="title"]/a)').extract()[0].strip() item['company'] = row.xpath('string(.//span[@class="employerName"])').extract()[0].strip() published = row.css('.logo').css('.minor::text').extract()[0].strip() item['published'] = timeago2datetimestr(item['date_search'], published) item['short_description'] = row.xpath('string(.//p[@itemprop="description"])').extract()[0].strip() loc = row.xpath('string(.//span[@itemprop="addressLocality"])').extract()[0].strip() try: item['locality'] , item['region'] = loc.split(', ') except ValueError: item['locality'] = loc item['region'] = '' # data not available in this website item['salary'] = '' item['clearance'] = '' item['department'] = '' self.logger.debug('title %s' % item['title']) yield Request(item['item_url'], callback=self.parse_item, meta={'item': item} ) next = response.css('.next a::attr(href)').extract() if next: self.logger.debug('next url: %s' % self.base_url + next[0]) yield Request( # self.base_url + next[0]['href'], # with soup self.base_url + next[0], callback=self.parse, meta={'keyword': response.meta['keyword'], 'location': response.meta['location']} ) else: self.logger.debug('no next url')
def parse(self, response):
    """Parse a USAJobs search-results page (via BeautifulSoup).

    Yields one Request per result block (handled by parse_item) and a
    Request for the next results page when an 'a.nextPage' link exists.
    """
    super(UsajobsSpider, self).parse(response)
    soup = bs4.BeautifulSoup(response.body)
    soupitems = soup.select('div#jobResultNew')
    if len(soupitems) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for soupitem in soupitems:
        item = self.init_item(response)
        item['item_url'] = self.base_url + soupitem.select('a.jobTitleLink')[0].attrs.get('href')
        item['title'] = soupitem.select('a.jobTitleLink')[0].text
        item['short_description'] = soupitem.select('p.summary')[0].text.strip()
        # most attributes live in the per-result details table
        details = table2dict(soupitem, 'table.joaResultsDetailsTable')
        item['company'] = details.get('Agency', '')
        location_region = details.get('Location(s)', '').split(', ')
        item['locality'] = location_region[0]
        try:
            item['region'] = location_region[1]
        except IndexError:
            # single-part location — no region component
            pass
        item['salary'] = details.get('Salary', '')
        item['department'] = details.get('Department', '')
        # data not available in this website
        item['published'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    # next = soup.select('a.nextPage')  # with soup
    next = response.css('a.nextPage::attr(href)').extract()
    if next:
        self.logger.debug('next url: %s' % self.base_url + next[0])
        yield Request(
            self.base_url + next[0],
            callback=self.parse,
            meta={'keyword': response.meta['keyword'],
                  'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
def parse(self, response):
    """Parse a ClearedJobs search-results page (formatted variant).

    Yields one Request per job row (handled by parse_item) and a Request
    for the next page when a ">" pagination link is present.

    Bug fixed: the scraped ``published`` string was never used — every
    item stored a date parsed from the hard-coded literal u'July 8, 2015'.
    The scraped value is now parsed instead.
    """
    super(ClearedjobsSpider, self).parse(response)
    table = response.xpath('//table[@class="search_res"]//tbody')
    rows = table.xpath('.//tr')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        cols = row.xpath('.//td')
        col0 = cols[0]
        url = col0.xpath('.//a[@class="search"]/@href').extract()[0]
        item['item_url'] = self.base_url + "/" + url
        # a text could have span tag inside, so take string() of the node
        item['title'] = col0.xpath(
            'string(.//a[@class="search"])').extract()[0].strip()
        item['company'] = col0.xpath(
            './/div[@class="info"]//a/text()').extract()[0]
        try:
            item['clearance'] = col0.xpath(
                './/div[@class="desc"]/text()').extract()[0]
        except IndexError:
            # clearance not listed for this row — leave the item default
            pass
        published = col0.xpath(
            './/div[@class=""]/text()').extract()[0].replace(
                'Posted - ', '')
        # site format assumed like "July 8, 2015" — matches the old literal
        item['published'] = datetime2datetimestr(
            datetime.strptime(published, '%B %d, %Y'))
        col1 = cols[1]
        loc = col1.xpath('./text()').extract()[0].strip()
        try:
            item['locality'], item['region'] = loc.split(', ')
        except ValueError:
            item['locality'] = loc
            item['region'] = ''
        # data not available in this website
        item['short_description'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next_href = response.css('.navbar_bottom').xpath(
        './/a[text()=">"]/@href').extract()
    if next_href:
        # hrefs are sometimes root-relative, sometimes missing the slash
        sep = '' if next_href[0].startswith('/') else '/'
        next_url = self.base_url + sep + next_href[0]
        self.logger.debug('next url: %s' % next_url)
        yield Request(next_url,
                      callback=self.parse,
                      meta={
                          'keyword': response.meta['keyword'],
                          'location': response.meta['location']
                      })
    else:
        self.logger.debug('no next url')