def parse_item(self, response):
    # 智联招聘 (Zhaopin) job detail page parser.
    item = ScrapyItem()
    link = response.url
    title = response.css('div.fixed-inner-box h1::text').extract()[0]
    company = response.css('div.fixed-inner-box h2 a::text').extract()[0]
    info = response.xpath(
        '//div[@class="terminalpage-left"]/ul/li/strong//text()').extract()
    content = response.xpath(
        '//div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]/p/text()'
    ).extract()
    # Strip line breaks, tabs and non-breaking spaces from every text node.
    content = ''.join(
        [re.sub(r'[\n\r\t\xa0]', '', i).strip() for i in content])
    salary = exp = None
    # The detail list has 8 entries when no street address is given, 9 when it is.
    if len(info) == 8:
        (salary, item['location'], item['date_time'], item['job_type'],
         exp, item['degree'], _, _) = info
    elif len(info) == 9:
        (salary, item['location'], item['addr'], item['date_time'],
         item['job_type'], exp, item['degree'], _, _) = info
    item['salary_l'], item['salary_h'] = self.toInt(salary)
    # Pull the number of years out of strings such as "3-5年".
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if exp:
        exp = exp.group(1)
    else:
        exp = 0
    item['exp'] = int(exp)
    if item.get('addr'):
        item['addr'] = item['addr'].strip('-')
    else:
        item['addr'] = ''
    item['date_time'] = item['date_time'].split(' ')[0]
    item['aid'] = self.trmd5(link)
    item['content'] = content
    item['title'] = title
    item['company'] = company
    item['link'] = link
    item['referer'] = '智联招聘'
    yield item
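# ---------------------------------------------------------------------------
# All of the parse_item callbacks in this project populate the same
# ScrapyItem and assume module-level imports of re, BeautifulSoup and the
# item class.  The item definition itself is not part of this section; the
# sketch below is a minimal reconstruction inferred from the fields the
# parsers assign, using the standard scrapy.Item / scrapy.Field pattern.
# ---------------------------------------------------------------------------
import scrapy


class ScrapyItem(scrapy.Item):
    aid = scrapy.Field()        # md5 of the detail-page URL, used for deduplication
    title = scrapy.Field()      # job title
    company = scrapy.Field()    # company name
    link = scrapy.Field()       # detail-page URL
    salary_l = scrapy.Field()   # lower bound of the salary range
    salary_h = scrapy.Field()   # upper bound of the salary range
    location = scrapy.Field()   # city
    addr = scrapy.Field()       # street-level address, '' when missing
    job_type = scrapy.Field()   # e.g. 全职 (full time)
    exp = scrapy.Field()        # required experience in years (int)
    degree = scrapy.Field()     # required degree
    date_time = scrapy.Field()  # publish/update date
    content = scrapy.Field()    # job description text
    referer = scrapy.Field()    # source site name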
def parse_item(self, response):
    # 拉勾网-style job detail page parser (inferred from the selectors used).
    item = ScrapyItem()
    title = response.css('span.name::text').extract()[0]
    link = response.url
    info = [
        i.strip('/').strip()
        for i in response.css('dd.job_request p')[0].css('span::text').extract()
    ]
    date_time = response.css('p.publish_time::text').extract()[0].split()[0]
    company = response.css('dl#job_company dt img::attr(alt)').extract()[0]
    date_time = self.Strfdate(date_time)
    aid = self.trmd5(link)
    salary, item['location'], exp, item['degree'], item['job_type'] = info
    # Salaries look like "15k-25k"; convert both ends to integer yuan.
    salary_l, salary_h = [int(i.strip('kK')) * 1000 for i in salary.split('-')]
    addr = response.xpath('//div[@class="work_addr"]//text()').extract()
    # Join the address fragments and drop the trailing "查看地图" (view map) link text.
    addr = ''.join(
        [re.sub(r'[\n\r\t\xa0]', '', i).strip() for i in addr]).strip('查看地图- ')
    # The original function breaks off at this point; the assignments below are
    # reconstructed to follow the pattern of the other parsers, and the job
    # description (item['content']) is never extracted in the source.
    m = re.search(r'(\d+)', exp)
    item['exp'] = int(m.group(1)) if m else 0
    item['salary_l'], item['salary_h'] = salary_l, salary_h
    item['addr'] = addr
    item['date_time'] = date_time
    item['aid'] = aid
    item['title'] = title
    item['company'] = company
    item['link'] = link
    item['referer'] = '拉勾网'  # assumed from the selectors; not stated in the source
    yield item
def parse_item(self, response):
    # 前程无忧 (51Job) job detail page parser.
    item = ScrapyItem()
    html = BeautifulSoup(response.text, 'html.parser')
    link = response.url
    title = response.css('div.cn h1::text').extract()
    adinfo = response.css('div.cn span.lname::text').extract()
    salary = response.css('div.cn strong::text').extract()
    company = response.css('div.cn p.cname a::text').extract()
    title, adinfo, company = self.default_value([title, adinfo, company])
    if salary:
        salary = salary[0]
        item['salary_l'], item['salary_h'] = self.toInt(salary)
    else:
        item['salary_l'] = item['salary_h'] = 0
    # The i1..i4 <em> tags carry experience, degree and the publish date
    # (the third field is unused here).
    info = [
        html.find('em', {'class': 'i{}'.format(i)}) for i in range(1, 5)
    ]
    info = self.default_info(info)
    exp, degree, _, date_time = info
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if not exp:
        exp = 0
    else:
        exp = int(exp.group(1))
    date_time = self.Strfdate(date_time)
    content = response.xpath(
        '//div[@class="bmsg job_msg inbox"]//text()').extract()
    # Strip line breaks, tabs and non-breaking spaces from every text node.
    content = ''.join(
        [re.sub(r'[\n\r\t\xa0]', '', i).strip() for i in content])
    # Strings such as "北京-朝阳区" carry both the city and the district.
    if '-' in adinfo:
        location, addr = adinfo.split('-')
        item['addr'] = addr
    else:
        location = adinfo
        item['addr'] = ''
    item['job_type'] = '全职'
    item['link'] = link
    item['title'] = title
    item['company'] = company
    item['location'] = location
    item['content'] = content
    item['exp'] = exp
    item['degree'] = degree
    item['date_time'] = date_time
    item['aid'] = self.trmd5(link)
    item['referer'] = '51Job'
    yield item
def parse_item(self, response):
    # 大街网 (Dajie) job detail page parser.
    item = ScrapyItem()
    link = response.url
    title = response.css(
        '.job-msg-top-text span.job-name::text').extract()[0]
    job_type = response.css(
        '.job-msg-top-text span.blue-icon::text').extract()[0].strip('()')
    salary = response.css('.job-msg-top span.job-money::text').extract()[0]
    salary_l, salary_h = self.toInt(salary)
    location = response.css('li.ads span::text').extract()
    exp = response.css('li.exp span::text').extract()
    degree = response.css('li.edu span::text').extract()
    content = response.css('div#jp_maskit pre::text').extract()
    date_time = response.css('.job-msg-bottom .date::text').extract()
    addr = response.css('div.ads-msg span::text').extract()
    company = response.css('div.p-side-right p.title a::text').extract()
    location, exp, degree, content, date_time, addr, company = self.default_value(
        [location, exp, degree, content, date_time, addr, company])
    # Strip line breaks, tabs and non-breaking spaces from the description.
    content = ''.join(
        [re.sub(r'[\n\r\t\xa0]', '', i).strip() for i in content if i])
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if not exp:
        exp = 0
    else:
        exp = int(exp.group(1))
    # Drop the "发布于" (published on) prefix from the date string.
    date_time = date_time.strip('发布于')
    aid = self.trmd5(link)
    item['aid'] = aid
    item['job_type'] = job_type
    item['salary_l'] = salary_l
    item['salary_h'] = salary_h
    item['location'] = location
    item['exp'] = exp
    item['degree'] = degree
    item['content'] = content
    item['date_time'] = date_time
    item['title'] = title
    item['link'] = link
    item['addr'] = addr
    item['company'] = company
    item['referer'] = '大街网'
    yield item
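# ---------------------------------------------------------------------------
# Each parse_item above and below is the detail-page callback of a scrapy
# CrawlSpider; the surrounding class boilerplate is not shown in this
# section.  The skeleton below is only a sketch of how such a spider is
# typically wired up -- the spider name, allowed domain and URL patterns are
# placeholders, not values taken from this project.
# ---------------------------------------------------------------------------
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleJobSpider(CrawlSpider):
    name = 'example_job'                         # placeholder
    allowed_domains = ['example.com']            # placeholder
    start_urls = ['https://example.com/jobs']    # placeholder

    rules = (
        # Follow list/pagination pages without parsing them.
        Rule(LinkExtractor(allow=r'/jobs\?page=\d+')),
        # Hand every job detail page to parse_item.
        Rule(LinkExtractor(allow=r'/job/\d+\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        ...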
def parse_item(self, response):
    # 中华英才网 (ChinaHR) job detail page parser.
    item = ScrapyItem()
    link = response.url
    aid = self.trmd5(link)
    html = BeautifulSoup(response.text, 'html.parser')
    title = html.find('span', {'class': 'job_name'}).text
    salary = html.find('span', {'class': 'job_price'}).text
    # "面议" means the salary is negotiable, so no range is given.
    if '面议' in salary:
        salary_l = salary_h = 0
    else:
        salary_l, salary_h = [int(i) for i in salary.split('-')]
    adinfo = html.find('span', {'class': 'job_loc'})
    job_type = adinfo.find_next_sibling().find_next_sibling()
    degree = job_type.find_next_sibling().find_next_sibling().text
    exp = html.find('span', {'class': 'job_exp'}).text
    company = html.find('div', {'class': 'job-company'}).find('h4').text
    # Drop the "更新" (updated) suffix before normalising the date.
    date_time = html.find('p', {'class': 'updatetime'}).text.strip('更新')
    date_time = self.Strfdate(date_time)
    content = html.find('div', {
        'class': 'job_intro_info'
    }).text.replace('\xa0', '').strip()
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if not exp:
        exp = 0
    else:
        exp = int(exp.group(1))
    # Split on the first space only: the city comes first, the street
    # address (which may itself contain spaces) follows.
    location, addr = adinfo.text.split(' ', 1)
    item['aid'] = aid
    item['title'] = title
    item['link'] = link
    item['salary_l'] = salary_l
    item['salary_h'] = salary_h
    item['location'] = location
    item['addr'] = addr
    item['job_type'] = job_type.text
    item['degree'] = degree
    item['exp'] = exp
    item['date_time'] = date_time
    item['content'] = content
    item['company'] = company
    item['referer'] = '中华英才网'
    yield item
def parse_item(self, response):
    # 58同城 (58.com) job detail page parser.
    item = ScrapyItem()
    link = response.url
    title = response.css('div.pos_base_info .pos_title::text').extract()[0]
    salary = response.css(
        'div.pos_base_info .pos_salary::text').extract()[0]
    info = response.css('div.pos_base_condition span::text').extract()
    adinfo = response.xpath(
        '//div[@class="pos-area"]/span//text()').extract()
    company = response.css(
        'div.comp_baseInfo_title .baseInfo_link a::text').extract()[0]
    content = response.css('div.des::text').extract()
    # Strip line breaks, tabs and non-breaking spaces from the description.
    content = ''.join(
        [re.sub(r'[\n\r\t\xa0]', '', i).strip() for i in content if i])
    # The city sits at index 1 of the area text nodes; the street address
    # is pieced together from nodes 3 and 5.
    location = adinfo[1]
    addr = ''.join(adinfo[3:6:2])
    date_time = response.css(
        'span.pos_base_update span::text').extract()[0]
    date_time = self.Strfdate(date_time)
    _, degree, exp = info
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if exp:
        exp = exp.group(1)
    else:
        exp = 0
    item['location'] = location
    item['addr'] = addr
    item['job_type'] = '全职'
    item['date_time'] = date_time
    item['exp'] = int(exp)
    item['degree'] = degree
    item['salary_l'], item['salary_h'] = self.toInt(salary)
    item['title'] = title
    item['company'] = company
    item['aid'] = self.trmd5(link)
    item['link'] = link
    item['content'] = content
    item['referer'] = '58同城'
    yield item
def parse_item(self, response):
    # Boss直聘 job detail page parser.
    item = ScrapyItem()
    link = response.url
    title = response.css('div.job-banner .name::text').extract()[0]
    salary = response.css(
        'div.job-banner .name .salary::text').extract()[0]
    info = response.css('div.job-banner p::text').extract()[:3]
    location, exp, degree = info
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if exp:
        exp = exp.group(1)
    else:
        exp = 0
    # Drop the "发布于" (published on) prefix before normalising the date.
    date_time = response.css('div.job-tags span.time::text').extract()[0]
    date_time = date_time.strip('发布于')
    date_time = self.Strfdate(date_time)
    company = response.css('div.info-primary .name::text').extract()[0]
    content = response.css(
        'div.detail-content .job-sec .text::text').extract()
    # Strip line breaks, tabs and non-breaking spaces from the description.
    content = ''.join(
        [re.sub(r'[\n\r\t\xa0]', '', i).strip() for i in content])
    addr = response.css(
        'div.job-location .location-address::text').extract()[0]
    item['location'] = location
    item['addr'] = addr
    item['job_type'] = '全职'
    item['date_time'] = date_time
    item['exp'] = int(exp)
    item['degree'] = degree
    item['salary_l'], item['salary_h'] = self.toInt(salary)
    item['title'] = title
    item['company'] = company
    item['aid'] = self.trmd5(link)
    item['link'] = link
    item['content'] = content
    item['referer'] = 'boss直聘'
    yield item
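# ---------------------------------------------------------------------------
# The parsers above all rely on a handful of shared helpers (self.toInt,
# self.trmd5, self.Strfdate, self.default_value, self.default_info) whose
# implementations are not shown in this section.  The mixin below is a
# reconstruction that is merely consistent with how the helpers are called;
# it is an illustrative sketch, not the project's actual code.
# ---------------------------------------------------------------------------
import hashlib
import re
from datetime import datetime


class SpiderHelpersMixin:

    def trmd5(self, link):
        # Stable record id: hex md5 of the detail-page URL.
        return hashlib.md5(link.encode('utf-8')).hexdigest()

    def toInt(self, salary):
        # Turn a salary string such as "8千-1.5万/月" into a (low, high)
        # pair of ints; return (0, 0) when no usable range is found.
        if not salary:
            return 0, 0
        nums = re.findall(r'\d+(?:\.\d+)?', salary)
        if len(nums) < 2:
            return 0, 0
        low, high = float(nums[0]), float(nums[1])
        if '万' in salary:
            low, high = low * 10000, high * 10000
        elif '千' in salary or 'k' in salary.lower():
            low, high = low * 1000, high * 1000
        return int(low), int(high)

    def Strfdate(self, date_time):
        # Normalise a date string to "YYYY-MM-DD"; fall back to today.
        for fmt in ('%Y-%m-%d', '%Y.%m.%d', '%m-%d'):
            try:
                parsed = datetime.strptime(date_time.strip(), fmt)
                if fmt == '%m-%d':
                    parsed = parsed.replace(year=datetime.now().year)
                return parsed.strftime('%Y-%m-%d')
            except ValueError:
                continue
        return datetime.now().strftime('%Y-%m-%d')

    def default_value(self, fields, default=''):
        # extract() returns a list; keep the first element or a default.
        return [f[0] if f else default for f in fields]

    def default_info(self, tags, default=''):
        # Pull the text out of BeautifulSoup tags, tolerating missing ones.
        return [t.text.strip() if t is not None else default for t in tags]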