def parse(self, response):
    """Parse a listing page: yield one detail-page request per job row,
    then follow pagination.

    :param response: downloader response (downloader ==> engine ==> spider)
    :return: generator of scrapy.Request
    """
    # Skip the header rows; a CSS selector would work equally well here.
    div_list = response.xpath('//div[@class="el"]')[4:]
    for div in div_list:
        item = JobItem()
        # extract_first() may return None on malformed rows; the original
        # called .strip() on it unconditionally and would crash there.
        item["name"] = (div.xpath('./p/span/a/text()').extract_first() or '').strip()
        item["job_url"] = div.xpath('./p/span/a/@href').extract_first()
        item["company_name"] = div.xpath('./span[1]/a/text()').extract_first()
        item["location"] = div.xpath('./span[2]/text()').extract_first()
        item["salary"] = div.xpath('./span[3]/text()').extract_first()
        item["release_date"] = div.xpath('./span[4]/text()').extract_first()
        if item["job_url"]:  # scrapy.Request(None) would raise ValueError
            yield scrapy.Request(
                url=item["job_url"],
                callback=self.parse_detail,
                meta={"item": item},
                dont_filter=True,
            )
        # break  # testing: stop after one row
    # Follow the "下一页" (next page) link; it is absent on the last page,
    # where extract_first() returns None and we must not build a Request.
    next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
    if next_url:
        yield scrapy.Request(url=next_url)
def parse_job(self, response):
    """Parse a job-detail page into a JobItem.

    Bails out (yields nothing) as soon as any required field is missing.
    """
    item = JobItem()
    item['LINK'] = response.url
    # Extract the title once and reuse it — the original ran the same
    # XPath twice (once for the emptiness check, once for the value).
    title = response.xpath('//h1[@class="job_title"]/text()').extract_first()
    if not title:
        return
    item['TITLE'] = title.replace('\n', ' ')
    item['SALARY'] = response.xpath(
        '//div[@class="salary"]/span[@class="salary-text"]/text()'
    ).extract_first()
    item['COMPANY'] = response.xpath(
        '//h3[@class="name"]/a/text()').extract_first()
    if not item['COMPANY']:
        return
    # Keep only the last comma-separated address component (city/region).
    item['ADDRESS'] = response.xpath(
        '//div[@class="address__full-address"]/span/text()').extract_first(
        ).rsplit(',')[-1].strip()
    if not item['ADDRESS']:
        return
    skill = response.xpath(
        '//div[@class="tag-list"]/a[@class="big ilabel mkt-track"]/span/text()'
    ).extract()
    item['SKILL'] = ''.join(skill).replace('\n', ' ').split()
    if not item['SKILL']:
        return
    item['TYPE'] = response.xpath(
        '//p[@class="gear-icon"]/text()').extract_first().replace('\n', ' ')
    if not item['TYPE']:
        return
    req = response.xpath(
        '//div[@class="experience"]/ul/li/text()').extract()
    if not req:
        return
    for text in req:
        # Years of experience, e.g. "3+ years of experience" / "2 năm kinh nghiệm".
        year = re.findall(
            r"(\d.*) (?:years of experience|years of|năm kinh nghiệm|of experience)",
            text)
        if year:  # the original tested `if year:` twice in a row — once suffices
            item['EXP'] = year
        degree = re.findall(r'(?:Degree in|degree in|Tốt nghiệp) (.*)', text)
        if degree:
            item['DEGREE'] = degree[0]
    yield item
def parse_item(self, response):
    """Scrape one vacancy page into a JobItem (raw text-node lists, no cleanup)."""
    content = response.xpath(
        '(//div[@class="main-content"]//div[contains(@class, "bloko-column_container")])[1]'
    )
    vacancy_section = content.xpath(
        '(//div[@class="vacancy-description"]/div[@class="vacancy-section"])[1]/div[1]'
    )
    item = JobItem()
    item['name'] = content.xpath(
        './/div[contains(@class, "vacancy-title")]/h1//text()').get()
    # Every remaining field is a list of raw text nodes under `content`.
    multi_valued = {
        'salary': './/p[@class="vacancy-salary"]//*/text()',
        'company': './/a[@data-qa="vacancy-company-name"]//*/text()',
        'address': './/p[@data-qa="vacancy-view-location"]//text()',
        'experience': './/*[@data-qa="vacancy-experience"]//text()',
        'employment_mode': './/*[@data-qa="vacancy-view-employment-mode"]//text()',
        'skills': './/*[contains(@data-qa, "skills-element")]/span/text()',
    }
    for field, query in multi_valued.items():
        item[field] = content.xpath(query).getall()
    item['description'] = vacancy_section.get()
    item['url'] = response.request.url
    yield item
def parse_page(self, response):
    """Extract job and company fields via absolute XPaths and emit one JobItem."""
    field_xpaths = (
        ('profession', '/html/body/div[3]/div[3]/div[1]/span/text()'),
        ('description', '/html/body/div[3]/div[3]/div[2]/div[1]/div[1]/div[1]/text()'),
        ('requirement', '/html/body/div[3]/div[3]/div[1]/div[4]/text()'),
        ('companyName', '/html/body/div[3]/div[4]/div[1]/div/div[1]/div[1]/a/text()'),
        ('companyDescription', '/html/body/div[3]/div[3]/div[2]/div[2]/div[1]/div/div/div/p/text()'),
        ('scale', '/html/body/div[3]/div[4]/div[1]/div/p[2]/text()'),
        ('companyCategory', '/html/body/div[3]/div[4]/div[1]/div/p[1]/a/text()'),
    )
    item = JobItem()
    # Each field keeps the full extract() list, exactly as the original did.
    for name, query in field_xpaths:
        item[name] = response.xpath(query).extract()
    yield item
def parse_item(self, response):
    """Fill a JobItem from a 51job detail page using absolute XPaths.

    Returns the item (scrapy treats a returned item like a yielded one).
    """
    item = JobItem()
    item['type'] = "1"
    base = '/html/body/div[3]/div[2]/div[2]/div/div[1]'
    item['ename'] = str(response.xpath(base + '/h1/@title').extract()[0])
    # NOTE(review): on some pages this tag is laid out differently and the
    # XPath needs adjusting — confirm against more sample pages.
    item['postinfo'] = str(response.xpath(base + '/p[1]/a/@title').extract()[0])
    item['salary'] = str(response.xpath(base + '/strong/text()').extract()[0])
    item['oldurl'] = response.url
    # Some 51job detail pages omit the education requirement and carry only
    # three <span> tags, so span[4] can be out of range; fall back to span[3].
    try:
        item['releasetime'] = str(response.xpath(
            '/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span[4]/text()'
        ).extract()[0])
    except IndexError:
        item['releasetime'] = str(response.xpath(
            '/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span[3]/text()'
        ).extract()[0])
    return item
def parse(self, response):
    """Parse a 51job result page: yield one JobItem per row, then request
    each row's detail page."""
    node_list = response.xpath("//div[@class='el' and not(@id)]")
    for node in node_list:
        item = JobItem()
        item["position_name"] = node.xpath(
            "./p[@class='t1 ']/span/a/@title").extract_first()
        item["position_link"] = node.xpath(
            "./p[@class='t1 ']/span/a/@href").extract_first()
        item["company_name"] = node.xpath(
            "./span[@class='t2']/a/@title").extract_first()
        item["postiton_pay"] = node.xpath(
            "./span[@class='t4']/text()").extract_first()
        item["work_location"] = node.xpath(
            "./span[@class='t3']/text()").extract_first()
        item["publish_times"] = node.xpath(
            "./span[@class='t5']/text()").extract_first()
        # Items go to the pipelines; Requests go to the scheduler.
        yield item
        logging.info(item['position_link'])
        logging.info(type(item['position_link']))
        # extract_first() returns None for malformed rows, and
        # scrapy.Request(None) raises — guard before following the link.
        if item['position_link']:
            yield scrapy.Request(item['position_link'], callback=self.parse_page)
def saveData(self, jobdict):
    """Hand one scraped record to the item pipeline.

    :param jobdict: dict with one scraped record
    """
    # The original built an empty JobItem() and immediately rebound the name
    # to the raw dict, so the JobItem was never used (dead assignment).
    # Scrapy pipelines accept plain dicts, so yield the dict directly; if
    # typed items are wanted, construct JobItem(**jobdict) instead — confirm
    # its declared fields first.
    yield jobdict
def parse_item(self, response):
    """Parse an hh.ru vacancy page into a JobItem.

    Salary text nodes look like ['от ', '100\xa0000', ' до ', '150\xa0000', ...]:
    index 1 is the minimum and, when ' до ' follows at index 2, index 3 is
    the maximum. The original re-ran the salary XPath five times; it is now
    queried once.
    """
    item = JobItem()
    content = response.xpath(
        '(//div[@class="main-content"]//div[contains(@class, "bloko-column_container")])[1]'
    )
    vacancy_section = content.xpath(
        '(//div[@class="vacancy-description"]/div[@class="vacancy-section"])[1]/div[1]'
    )
    item['name'] = content.xpath(
        './/div[contains(@class, "vacancy-title")]/h1//text()').get()
    salary_parts = content.xpath(
        './/p[@class="vacancy-salary"]//*/text()').getall()
    if len(salary_parts) > 1:
        # '\xa0' is the thousands separator in the rendered numbers.
        item['salaryMIN'] = int(salary_parts[1].replace('\xa0', ''))
        if len(salary_parts) > 2:
            if salary_parts[2] == ' до ':
                item['salaryMAX'] = int(salary_parts[3].replace('\xa0', ''))
    # Bullet lists following the "Обязанности/Требования/Условия" headings.
    xpath_t = './p/strong[contains(text(), "{}")]/ancestor::p/following::ul[1]/li/text()'
    item['duty'] = vacancy_section.xpath(
        xpath_t.format('Обязанности')).getall()
    item['requirements'] = vacancy_section.xpath(
        xpath_t.format('Требования')).getall()
    item['conditions'] = vacancy_section.xpath(
        xpath_t.format('Условия')).getall()
    item['company'] = content.xpath(
        './/a[@data-qa="vacancy-company-name"]//*/text()').getall()[-1]
    item['address'] = content.xpath(
        './/p[@data-qa="vacancy-view-location"]//text()').extract_first()
    item['experience'] = content.xpath(
        './/*[@data-qa="vacancy-experience"]//text()').getall()
    # First text node is the employment type, last is the schedule.
    employment = content.xpath(
        './/*[@data-qa="vacancy-view-employment-mode"]//text()').getall()
    item['type_of_employment'] = employment[0]
    item['schedule'] = employment[-1]
    item['date'] = content.xpath(
        './/*[@class="vacancy-creation-time"]//text()').getall()[1]
    item['skills'] = content.xpath(
        './/*[contains(@data-qa, "skills-element")]/span/text()').getall()
    item['description'] = vacancy_section.get() or ''
    item['url'] = response.request.url
    yield item
def parse(self, response):
    """Emit one JobItem per '.content' card on the listing page."""
    for card in response.css('.content'):
        item = JobItem()
        # First matching text node of each sub-selector, exactly as before.
        for field, selector in (
            ('designation', '.desig::text'),
            ('company', '.org::text'),
            ('skill', '.skill::text'),
            ('jobDescription', '.desc::text'),
        ):
            item[field] = card.css(selector).extract()[0]
        yield item
def parse(self, response):
    """Parse a 51job listing page with BeautifulSoup and paginate.

    Fixes vs. the original: a fresh JobItem is built per row (the original
    mutated and re-yielded a single shared item, so pipelines holding
    references saw them all change), and the stray ``return self.parse`` at
    the end was dropped (a generator's return value is discarded anyway).
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    # Total page count, parsed from the "共N页,到第" pager text.
    allpage = int(
        re.match('共(\d+)页,到第', soup.find(attrs={
            'class': 'td'
        }).string).group(1))
    # Job rows; the first 16 'el' divs are header/filter chrome.
    result = soup.find_all(attrs={'class': 'el'})[16:]
    print(result)
    for message in result:
        job_soup = BeautifulSoup(str(message), 'html.parser')
        item = JobItem()
        # "city-district" in t3; no '-' means city only.
        t3_text = str(job_soup.find(attrs={'class': 't3'}).string)
        if '-' in t3_text:
            split = t3_text.split('-')
            item['city'] = split[0]     # city
            item['address'] = split[1]  # district
        else:
            item['city'] = t3_text
            item['address'] = ''
        item['jobname'] = ''.join(re.findall('\S+', str(
            job_soup.a.string)))  # job title, whitespace stripped
        item['company'] = str(job_soup.find(attrs={'class': 't2'}).string)
        item['salary'] = str(job_soup.find(attrs={'class': 't4'}).string)
        item['welfare'] = ''      # benefits: not on the list page
        item['education'] = '不限'  # education: not on the list page
        item['workyear'] = '不限'  # experience: not on the list page
        item['link'] = str(job_soup.a['href'])  # detail-page URL
        item['createtime'] = '2018-' + str(
            job_soup.find(attrs={'class': 't5'}).string)
        yield item
    print('正在请求第%s页' % self.page)
    time.sleep(random.randint(1, 10))  # random sleep to avoid throttling
    if self.page < allpage:  # stop after the last page
        self.page += 1
        yield scrapy.FormRequest(url=self.base_url + self.kw + ',2,' +
                                 str(self.page) + self.w_url,
                                 headers=self.headers,
                                 method='GET',
                                 callback=self.parse,
                                 dont_filter=True)
def parse_detail(self, response):
    # Visit each BOSS直聘 job-detail URL in a fresh browser tab via the
    # shared Selenium driver and scrape it with lxml; one JobItem per URL.
    # meta['params'] carries (detail URLs, webdriver instance, search keyword).
    nodes, driver, kw = response.meta.get("params")
    for node in nodes:
        print(node)
        # Open the detail page in a new tab and switch focus to it.
        driver.execute_script("window.open('%s')" % node)
        time.sleep(2)
        driver.switch_to.window(driver.window_handles[1])
        # Wait until the job-description container has rendered.
        WebDriverWait(driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='detail-content']")))
        html = etree.HTML(driver.page_source)
        # Close the tab and return to the original window before parsing.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        item = JobItem()
        item['recruitment_position'] = html.xpath(
            "//div[@class='job-primary detail-box']/div[@class='info-primary']/div[@class='name']/h1/text()"
        )[0]
        item['salary'] = html.xpath(
            "//div[@class='job-primary detail-box']/div[@class='info-primary']/div[@class='name']/span/text()"
        )[0]
        item['keyword'] = kw
        item['url'] = node
        item['source'] = "BOSS直聘"
        item['update_date'] = html.xpath(
            '//div[@class="sider-company"]/p[last()]/text()')[0]
        # Company name from the link's title attribute, dropping the
        # trailing "\n招聘" ("recruiting") suffix.
        item['company_name'] = html.xpath(
            '//a[@ka="job-detail-company_custompage"]')[0].attrib.get(
                'title').strip().replace("\n招聘", "")
        # item['company_name'] = html.xpath('//div[@class="level-list"]/preceding-sibling::div[1]/text()')[0]
        # Second/third <p> text nodes hold experience and education.
        item['work_experience'] = html.xpath(
            '//*[@class="job-primary detail-box"]/div[2]/p/text()')[1]
        item['education_background'] = html.xpath(
            '//*[@class="job-primary detail-box"]/div[2]/p/text()')[2]
        item['job_requirements'] = "".join(
            html.xpath(
                '//div[@class="detail-content"]/div[@class="job-sec"]/div[@class="text"]/text()'
            ))
        item['company_info'] = "".join(
            html.xpath(
                '//div[@class="job-sec company-info"]//div[@class="text"]/text()'
            ))
        item['company_address'] = html.xpath(
            '//*[@class="location-address"]/text()')[0]
        item['company_welfare'] = ",".join(
            html.xpath(
                '//div[@class="job-banner"]/div[@class="job-primary detail-box"]/div[@class="info-primary"]/div[@class="tag-container"]/div[@class="job-tags"]/text()'
            ))
        # Deterministic primary key: md5 of the detail URL.
        item['id'] = get_md5(node)
        item['crawl_date'] = datetime.now().strftime("%Y-%m-%d")
        yield item
def parse_job_info(self, response):
    # Parse a 前程无忧 (51job) detail page into a JobItem. Fields this page
    # does not expose are filled with the literal string "NULL".
    item = JobItem()
    print("parse job info from %s" % response.url)
    item['posted_url'] = response.url
    item['posted_website'] = '前程无忧'
    job_info = response.xpath('//div[@class="cn"]')
    item['company'] = job_info.xpath('p/a/text()').extract_first()
    item['position'] = job_info.xpath('h1/text()').extract_first()
    item['city'] = job_info.xpath(
        './/span[@class="lname"]/text()').extract_first()
    item['salary'] = job_info.xpath('./strong/text()').extract_first()
    # The "msg ltype" line packs company type | size | industry separated by
    # '|'; strip whitespace and &nbsp; first, then classify each segment.
    company_str = job_info.xpath(
        './p[@class="msg ltype"]/text()').extract_first()
    company_str_list = re.sub('[\t\r\n \xa0]', '', company_str).split('|')
    for company_item in company_str_list:
        # Ownership keywords mark the company-type segment.
        if '公司' in company_item or '合资' in company_item \
                or r'外资' in company_item or r'国企' in company_item \
                or r'单位' in company_item:
            item['company_type'] = company_item
        # Headcount ranges like "150-500人" contain '人' plus a digit 0.
        elif '人' in company_item and '0' in company_item:
            item['company_size'] = company_item
        else:
            item['company_industry'] = company_item
    # em.i1..i4 icons mark experience / education / openings / posted date.
    item['experience_requirement'] = response.xpath(
        '//span[@class="sp4"][em[@class="i1"]]/text()').extract_first()
    item['education_requirement'] = response.xpath(
        '//span[@class="sp4"][em[@class="i2"]]/text()').extract_first()
    item['recruiting_number'] = response.xpath(
        '//span[@class="sp4"][em[@class="i3"]]/text()').extract_first()
    item['posted_date'] = response.xpath(
        '//span[@class="sp4"][em[@class="i4"]]/text()').extract_first()
    position_advantage_list = response.xpath(
        '//div[@class="jtag inbox"]/p/span/text()').extract()
    item['position_advantage'] = merge_string(position_advantage_list,
                                              delimiter=",")
    position_info_list = response.xpath(
        '//div[@class="bmsg job_msg inbox"]/text()').extract()
    item['position_info'] = merge_string(position_info_list)
    # Placeholders for fields 51job does not provide.
    item['position_tag'] = "NULL"
    item['department'] = "NULL"
    item['position_type'] = "NULL"
    item["major_requirement"] = "NULL"
    item['company_finance'] = "NULL"
    item['company_url'] = "NULL"
    yield item
def parse(self, response): print("request -> " + response.url) # 如果发生输入验证码的情况 if (len(response.url) > 40): # 发出叫声 duration = 10000 # millisecond freq = 440 # Hz winsound.Beep(freq, duration) # 暂停 os.system("pause") return self.this_request() print(response.css('i.total_page::text')) self.currentpagetotal = int(response.css('i.total_page::text').extract()[0]) print(self.currentpagetotal) job_list = response.css('li.job_item') if (len(job_list) > 0): print("job58 Nums:" + str(len(job_list))) for job in job_list: item = JobItem() item['Jname'] = job.css('div.job_title > div.job_name > a > span::text').extract()[-1].strip() item['Jarea'] = response.css('div.zp_crumb>div.crumb_item>a::text').extract()[0].split('58')[0] item['Jtype'] = job.css("div.job_comp > p.job_require > span::text").extract()[0].strip() item['Jcompany'] = job.css('div.comp_name > a::text').extract_first().strip() item['Jeducation'] = job.css("div.job_comp > p.job_require > span::text").extract()[1].strip() item['Jexperience'] = job.css("div.job_comp > p.job_require > span::text").extract()[2].strip().split('年')[0] welfare = job.css('div.job_title > div.job_wel > span::text').extract() item['Jwelfare'] = ','.join(welfare) salary = job.css('div.job_title > p::text').extract()[0].split('-') try: item['JminSalary'] = float(salary[0])/1000 item['JmaxSalary'] = float(salary[-1])/1000 except: pass # #公司规模 # item['JcomSize'] = '' # #融资情况 # item['JcomFinanceStage'] = '' #爬取的网站名 item['Jsource']='58同城' print(item) yield item yield self.next_request()
def parse(self, response):
    """Parse a BOSS直聘 listing page into JobItems, then follow pagination."""
    body = response.css(".job-primary")
    for head in body:
        item = JobItem()
        item["title"] = head.css(".job-title::text").extract()[0]
        item["wage"] = head.css(".red::text").extract()[0]
        # The three ".info-primary p" text nodes are site / experience / education.
        item["site"] = head.css(".info-primary p::text").extract()[0]
        item["name"] = head.css(".company-text .name a::text").extract()[0]
        item["expert"] = head.css(".info-primary p::text").extract()[1]
        item["edu"] = head.css(".info-primary p::text").extract()[2]
        yield item
    # Pagination. The original used extract()[0], which raises IndexError on
    # the last page (no ".next" link) and made its None check unreachable;
    # extract_first() returns None instead, so the check now actually works.
    next_page = response.css(".page .next::attr(href)").extract_first()
    if next_page is not None:
        yield response.follow("https://www.zhipin.com" + next_page,
                              callback=self.parse)
def parse_content(self, response):
    '''
    Parse a post's detail page.
    @param response: downloaded page content
    yields a JobItem holding the extracted fields
    '''
    jobitem = JobItem()
    jobitem['link'] = response.url
    # Delegate each page-level field to its dedicated extractor method.
    for field, extractor in (('author', self._get_author),
                             ('title', self._get_title),
                             ('pub_date', self._get_date),
                             ('content', self._get_content)):
        jobitem[field] = extractor(response)
    # Phone and e-mail are mined out of the already-extracted body text.
    jobitem['phone'] = self._get_phone(jobitem['content'])
    jobitem['email'] = self._get_email(jobitem['content'])
    jobitem['crawl_date'] = datetime.today().strftime('%Y-%m-%d')
    yield jobitem
def parse(self, response):
    """Parse one page of Zhaopin's JSON search API and paginate.

    Fixes vs. the original: a fresh JobItem per result (the original shared
    one mutable item across all yields) and the stray ``return self.parse``
    after the yields was dropped (a generator's return value is discarded).
    """
    data = json.loads(response.text)
    result = data['data']['results']     # result set
    numfound = data['data']['numFound']  # total hit count
    for message in result:
        item = JobItem()
        item['city'] = message['city']['items'][0]['name']  # city
        item['jobname'] = message['jobName']                # job title
        item['company'] = message['company']['name']        # company name
        item['salary'] = message['salary']                  # salary
        item['welfare'] = ''
        for i in message['welfare']:
            item['welfare'] += i + ' '                      # benefits
        # A second city item, when present, is the district.
        if len(message['city']['items']) == 2:
            item['address'] = message['city']['items'][1]['name']
        else:
            item['address'] = ''
        item['education'] = message['eduLevel']['name']     # education requirement
        item['workyear'] = message['workingExp']['name']    # experience requirement
        item['link'] = message['positionURL']               # detail-page URL
        item['createtime'] = message['updateDate']          # post/update time
        yield item
    print('正在请求第%s页' % self.page)
    time.sleep(random.randint(1, 10))  # random sleep to avoid throttling
    if self.page < int(numfound / 90):  # 90 results per page; stop at the end
        self.page += 1
        yield scrapy.FormRequest(
            url=self.url,
            headers=self.headers,
            method='GET',
            formdata={
                'start': str((self.page - 1) * 90),
                'pageSize': '90',
                'workExperience': '-1',
                'education': '-1',
                'companyType': '-1',
                'employmentType': '-1',
                'jobWelfareTag': '-1',
                'kw': self.kw,
                'kt': '3',
                '_v': '0.84269824',
                'x-zp-page-request-id': '081b0f20bc5a4752a7db7315bce7fc06-1545653322592-595069'
            },
            callback=self.parse
        )
def parse(self, response):
    """Emit a JobItem per result row, then follow the next-page link."""
    for row in response.xpath('//div[@class="el"]'):
        record = JobItem()
        record['post'] = self.extract_with_xpath(row, 'p/span/a/@title')
        record['company'] = self.extract_with_xpath(row, 'span[@class="t2"]/a/text()')
        record['city'] = self.extract_with_xpath(row, 'span[@class="t3"]/text()')
        record['salary'] = self.extract_with_xpath(row, 'span[@class="t4"]/text()')
        record['publish_date'] = self.extract_with_xpath(row, 'span[@class="t5"]/text()')
        record['href'] = self.extract_with_xpath(row, 'p[@class="t1 "]/span/a/@href')
        yield record
    # Pagination (alternative selector: //div[@class="dw_page"]//ul/li[last()]/a/@href)
    for next_page in response.xpath('//li[@class="bk"][2]/a/@href'):
        yield response.follow(next_page, self.parse)
def parse_detail(self, response):
    """Populate a JobItem via ItemLoader from a 51job detail page."""
    loader = ItemLoader(JobItem(), response=response)
    # Stable record id: md5 hex digest of the page URL.
    digest = hashlib.md5()
    digest.update(response.url.encode(encoding='utf-8'))
    css_fields = (
        ('job_name', '.tHjob .cn h1::attr(title)'),
        ('company_name', '.tHjob .cname a::attr(title)'),
        ('experience', '.tBorderTop_box .jtag .t1 span:nth-child(1)::text'),
        ('Education', '.tBorderTop_box .jtag .t1 span:nth-child(2)::text'),
        ('workplace', '.tHeader .lname::text'),
        # Min and max salary are both parsed out of the same <strong> text.
        ('salary_min', '.tHjob .cn strong::text'),
        ('salary_max', '.tHjob .cn strong::text'),
    )
    for field, selector in css_fields:
        loader.add_css(field, selector)
    loader.add_value('url', response.url)
    loader.add_value('object_url', digest.hexdigest())
    loader.add_css('jb_description',
                   '.tCompany_main div[class=tBorderTop_box] .job_msg')
    # The four company_* fields all parse pieces of the same '.ltype' text.
    for field in ('company_type', 'company_people_min',
                  'company_people_max', 'company_work'):
        loader.add_css(field, '.tHjob .in .cn .ltype::text')
    return loader.load_item()
def parse(self, response):
    """Parse listing articles into JobItems.

    Fixes vs. the original: a fresh JobItem is created per <article>/<div>
    (the original mutated one shared item across every yield), and the large
    block of commented-out placeholder fields was removed — only the fields
    actually populated remain.
    """
    for post in response.xpath("//article/div"):
        item = JobItem()
        # Job title
        item['Jname'] = post.xpath('./header/h1/a/text()').extract()[0].strip()
        # Job category/type
        item['Jtype'] = post.xpath('./div/text()').extract()[0].strip()
        # Hard-coded metadata — the list page does not expose these.
        item['JcreatedTime'] = '2020-12-03'
        item['JisSchoolJob'] = 1
        item['Jlocation'] = '(23.45443,135.74434)'
        yield item
def parse_item(self, response):
    """Parse an article page: title, post time, body text, and URL."""
    item = JobItem()
    # Page title
    item['title'] = response.xpath('/html/head/title/text()').extract()
    addtime = response.xpath(
        '//*[@id="wrapper"]/div[3]/div[1]/div[2]/p/text()').extract()
    item['addtime'] = "".join(addtime).strip()
    # Body: first try the with-image layout; if that yields nothing, fall
    # back to the no-image layout. (The original duplicated the identical
    # join/strip assignment in both branches of an if/else.)
    content = response.xpath(
        '//*[@id="wrapper"]/div[3]/div[1]/div[3]/p/text()').extract()
    if len(content) == 0:
        content = response.xpath(
            '//*[@id="wrapper"]/div[3]/div[1]/div[3]/p[1]/text()').extract()
    item['content'] = "".join(content).strip()
    # Page link
    item['url'] = response.url
    yield item
def parse(self, response):
    # Parse one 51job result page (Python 2 — note the print statements).
    print response.url
    div = response.xpath("//div[@id='resultList']/div[@class='el']")
    # NOTE(review): the loop indexes page-wide absolute-XPath result lists by
    # j instead of querying each row node relatively; this only stays aligned
    # while every row contains every field — confirm against sparse rows.
    # The local name `time` also shadows any imported time module here.
    for j in range(0, len(div)):
        item = JobItem()
        jobname = response.xpath(
            "//div[@class='el']/p/span/a/@title").extract()[j]
        companyname = response.xpath(
            "//div[@class='el']/span[@class='t2']/a/@title").extract()[j]
        location = response.xpath(
            "//div[@class='el']/span[@class='t3']/text()").extract()[j]
        pay = response.xpath(
            "//div[@class='el']/span[@class='t4']").extract()[j]
        time = response.xpath(
            "//div[@class='el']/span[@class='t5']/text()").extract()[j]
        item['jobname'] = jobname
        item['companyname'] = companyname
        item['location'] = location
        # pay holds the raw <span> markup; [17:-7] slices off the tags.
        item['pay'] = pay[17:-7]
        item['time'] = time
        yield item
def parse(self, response):
    """Parse a listing page: one JobItem per <li>, then follow pagination.

    Fix vs. the original: the populated item was never emitted — the detail
    request that would have carried it via meta is commented out, so every
    record was silently dropped. The item is now yielded directly.
    """
    for result in response.xpath('//*[@id="main"]/div/div[2]/ul/li'):
        item = JobItem()
        item['jobName'] = result.xpath('div/div[1]/h3/a/div[1]/text()').extract()[0]
        item['salary'] = result.xpath('div/div[1]/h3/a/span/text()').extract()[0]
        # Combined "address · experience · education" summary line.
        item['info'] = result.xpath('div/div[1]/p/text()').extract()[0]
        item['company'] = result.xpath('div/div[2]/div/h3/a/text()').extract()[0]
        url = result.xpath('div/div[1]/h3/a/@href').extract()[0]
        # Detail crawl (currently disabled):
        # yield Request(url, callback=self.parse_item, meta={'item': item})
        yield item
    # Pagination: the "next" anchor is javascript:void(0) on the last page.
    next_page_url = response.xpath(
        '//*[@id="main"]/div/div[2]/div[2]/a[5]/@href').extract()[0]
    if next_page_url != 'javascript:void(0)':
        yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response):
    """Parse a Ganji listing page with BeautifulSoup; paginate to page 100.

    Fixes vs. the original: a fresh JobItem per row (one shared mutable item
    was re-yielded for every row), the publish-time node is queried once
    instead of three times, the bare ``except`` is narrowed to the
    AttributeError that a missing tag actually raises, and the stray
    ``return self.parse`` was dropped.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find_all(attrs={'class': 'list-noimg job-list clearfix new-dl'})
    print(result[1])
    for message in result:
        job_soup = BeautifulSoup(str(message), 'html.parser')
        item = JobItem()
        item['city'] = '北京'       # city: fixed to Beijing for this spider
        item['education'] = '不限'  # education: not on the list page
        item['workyear'] = '不限'   # experience: not on the list page
        item['jobname'] = job_soup.find(attrs={'list_title gj_tongji'}).string
        item['company'] = job_soup.find(attrs={'new-dl-company'}).a['title']
        item['salary'] = ''.join(re.findall(
            '\S+', str(job_soup.find(attrs={'class': 'new-dl-salary'}).string)))
        item['link'] = job_soup.find(attrs={'class': 'list_title gj_tongji'})['href']
        # Publication date: "MM-DD" gets a year prefix; the relative labels
        # "昨天" (yesterday) and otherwise "today" are hard-coded dates.
        pub = str(job_soup.find(attrs={'pub-time'}).span.string)
        if '-' in pub:
            item['createtime'] = '2018-' + pub
        elif pub == '昨天':
            item['createtime'] = '2018-12-25'
        else:
            item['createtime'] = '2018-12-26'
        item['address'] = job_soup.find(attrs={'class': 'pay'}).string
        try:
            item['welfare'] = job_soup.find(
                attrs={'class': 'new-dl-tags'}).find('i').string
        except AttributeError:
            # No tag container / no <i> child: find() returned None.
            item['welfare'] = ''
        yield item
    print('正在请求第%s页' % self.page)
    time.sleep(random.randint(1, 10))  # random sleep to avoid throttling
    if self.page < 100:
        self.page += 1
        yield scrapy.FormRequest(
            url='http://bj.ganji.com/zpjisuanjiwangluo/o%s/' % str(self.page),
            method='GET',
            callback=self.parse,
            dont_filter=True
        )
def parse1(self, response):
    # Parse a 58.com per-city category page: create the city's DB table,
    # then request every sub-category listing page.
    # meta['city'] is the 58.com subdomain; meta['city2'] is an
    # identifier-safe city name used as the MySQL table name.
    item = JobItem()
    city = response.meta['city']
    city2 = response.meta['city2']  # used as the table name
    ul_list = response.xpath('//*[@id="sidebar-right"]/ul')
    # One table per city. NOTE(review): the table name is interpolated
    # directly into the DDL — safe only while city2 comes from a trusted
    # internal list, never from user input.
    sql = 'create table if not exists {} (title varchar(30) null,' \
          'price varchar(25) null,requie varchar(30) null,' \
          'company varchar(25) null)charset=utf8mb4;'.format(city2)
    creat_table(sql)
    print('CREAT TABLE SUSECESS')
    item['city_table'] = city2
    time.sleep(2)
    for ur in ul_list:
        li_list = ur.xpath('./li')
        for li in li_list:
            print('https://{}.58.com'.format(city) +
                  li.xpath('./strong/a/@href').extract_first())
            # The shared item (carrying only city_table) rides along in meta.
            yield scrapy.Request(
                'https://{}.58.com'.format(city) +
                li.xpath('./strong/a/@href').extract_first(),
                callback=self.parse2,
                meta={'item': item})
def parse(self, response):
    """Scrape a 51job result page by regexing each row's raw HTML, then
    queue pages 2..2000.

    Fix vs. the original: a fresh JobItem per row (one shared item was
    mutated and re-yielded for every row); each regex runs once per row
    instead of twice.
    """
    elist = response.xpath(
        '//*[@id="resultList"]//div[@class="el"]').extract()
    for el in elist:
        item = JobItem()
        # Each row carries two title= attributes (job, company) and two
        # href= attributes (job URL, company URL), in that order.
        titles = re.findall('title="(.*?)"', el, re.S)
        hrefs = re.findall('href="(.*?)"', el, re.S)
        item['title'] = titles[0]
        item['url'] = hrefs[0]
        item['company'] = titles[1]
        item['company_url'] = hrefs[1]
        item['loc'] = re.findall('<span class="t3">(.*?)</span>', el, re.S)[0]
        item['money'] = re.findall('<span class="t4">(.*?)</span>', el, re.S)[0]
        item['time'] = re.findall('<span class="t5">(.*?)</span>', el, re.S)[0]
        print(item)
        yield item
    # Blind pagination: hard-coded page range for this search.
    for p in range(2, 2001):
        url = ("https://search.51job.com/list/180200,000000,0000,00,9,99,%2520,2,"
               + str(p) + ".html")
        yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Walk career-talk listing rows, request company detail pages, paginate."""
    for row in response.css('.infoBox ul.infoList.teachinList'):
        item = JobItem()
        item['company_name'] = row.css('li.span1 a::text').extract_first().encode('utf-8')
        item['company_link'] = row.css('li.span1 a::attr(href)').extract_first()
        item['briefing_city'] = row.css('li.span2::text').extract_first()
        item['school'] = row.css('li.span5::text').extract_first()
        item['room'] = row.css('li.span4::text').extract_first()
        item['time'] = row.css('li.span5::text').extract()[1]
        # yield item
        # Relative link of the form /teachin/view/id/{company_page_id}
        company_detail = row.css('li.span1 a::attr(href)').extract_first()
        if company_detail is not None:
            yield response.follow(url=self.base_url + company_detail,
                                  callback=self.detail)
    # Relative link of the form /teachin/index?page={page_num}
    next_page = response.css('div.ctl ul.page li.next a::attr(href)').extract_first()
    if next_page is not None:
        yield scrapy.Request(self.base_url + next_page, callback=self.parse)
def parse_job_info(self, response):
    """Parse a Lagou job page; abort when redirected to the block page."""
    url = response.url
    host = url.split("//")[1].split('/')[0]
    print('parsing: %s' % response.url)
    # Lagou redirects blocked clients to forbidden.lagou.com — give up.
    if host == 'forbidden.lagou.com':
        print("Oh, I'm blocked.")
        return None
    item = JobItem()
    item['url'] = url
    item['company_name'] = response.xpath(
        '//div[@class="company"]/text()').extract_first()
    item['job_position'] = response.xpath(
        '//div[@class="job-name"]/span[@class="name"]/text()'
    ).extract_first()
    # The job_request spans mix salary / experience / education / job type /
    # city; classify each fragment by its tell-tale substring.
    for fragment in response.xpath(
            '//dd[@class="job_request"]/p/span/text()').extract():
        if 'k-' in fragment:
            item['salary'] = fragment
        elif '经验' in fragment:
            item['experience_requirement'] = fragment
        elif any(word in fragment
                 for word in ('学历', '大专', '本科', '硕士', '博士')):
            item['education_requirement'] = fragment
        elif any(word in fragment for word in ('全职', '兼职', '实习')):
            item['job_type'] = fragment
        else:
            # Whatever is left is the city; drop spaces and slashes.
            item['city'] = re.sub('[\ /]', '', fragment)
    # other field
    yield item
def parse(self, response):
    """Parse the window.__SEARCH_RESULT__ JSON blob on a 51job result page.

    Fix vs. the original: the hand-built pagination URL contained
    "°reefrom=" — an HTML-entity mangling of "&degreefrom=" (the "&deg"
    prefix had been decoded to "°") — which sent a corrupted query string;
    restored to "&degreefrom=99".
    """
    json_data = response.selector.re(
        r"(window.__SEARCH_RESULT__\s=\s)(\{.*\})")
    json_data = json.loads(json_data[1])
    total_page = json_data['total_page']
    for data in json_data["engine_search_result"]:
        item = JobItem()
        page_job_url = data['job_href']
        print(page_job_url)
        item['job_name'] = data['job_name']                      # job title
        item['company_name'] = data['company_name']              # company
        item['providesalary_text'] = data['providesalary_text']  # salary
        item['updatedate'] = data['updatedate']                  # posted date
        item['companytype_text'] = data['companytype_text']      # company type
        item['companysize_text'] = data['companysize_text']      # company size
        item['companyind_text'] = data['companyind_text']        # main business
        item['jobwelf'] = data['jobwelf']                        # job perks
        item['attribute_text'] = '|'.join(data['attribute_text'])  # requirements
        yield scrapy.Request(page_job_url, callback=self.parse_info,
                             meta={'item': item})
    # Page count also available via xpath //div[@class='p_in']/span[1]/text()
    for count in range(2, int(total_page) + 1):
        next_url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
                    '%25E7%25A8%258B%25E5%25BA%258F%25E5%2591%2598,2,{}.html'
                    '?lang=c%2F&postchannel=0000&workyear=99&cotype=99'
                    '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
                    '&dibiaoid=0&line=&welfare=').format(count)
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Per result row: fill a JobItem and request its detail page, passing
    the partially-filled item along in meta."""
    for row in response.xpath('//div[@class="el"]'):
        job = JobItem()
        # job['post'] = row.xpath('normalize-space(//p/span/a)').extract()
        job['post'] = row.xpath('p/span/a/@title').extract()
        job['company'] = row.xpath('span[@class="t2"]/a/text()').extract()
        job['position'] = row.xpath('span[@class="t3"]/text()').extract()
        job['salary'] = row.xpath('span[@class="t4"]/text()').extract()
        job['time'] = row.xpath('span[@class="t5"]/text()').extract()
        # Empty salary list -> placeholder meaning "to be negotiated".
        if not job['salary']:
            job['salary'] = "商议决定"
        url = row.xpath('p/span/a/@href').extract()
        if url:
            # Second-level (detail) page; dont_filter skips dedup filtering.
            yield scrapy.Request(url[0], meta={'item': job},
                                 callback=self.parse_msg,
                                 dont_filter=True)
def parse_items(self, response):
    # Parse a 51job detail page (Python 2 — print statements throughout).
    # Company-profile pages are skipped: only pages exposing p.cname (i.e.
    # job postings) are processed; otherwise we fall through to `pass`.
    # print response.body.decode('gb2312')
    print '开始处理item'
    item = JobItem()
    url = response.url
    print url
    # Helper: first element of an xpath result list, or '' when absent.
    f = lambda x: x[0] if x else ''
    company = response.xpath('//p[@class="cname"]/a/@title').extract()  # company
    # Job pages have a company link; company pages do not.
    if company:
        company = company[0]
        # print company
        position = response.xpath('//h1/text()').extract()[0]  # job title
        # print position
        salary = response.xpath('//div[@class="cn"]//strong/text()').extract()[0]  # salary
        location = response.xpath('//span[@class="lname"]/text()').extract()[0].strip()  # region
        work_address = location
        # print location
        work_years = f(response.xpath('//div[@class="t1"]//span[1]/text()').extract())  # experience
        # print work_years
        # Education span only exists when the em.i2 icon is present;
        # otherwise default to "no education requirement".
        ie = response.xpath('//span/em[@class="i2"]').extract()
        if ie:
            degree = response.xpath('//div[@class="t1"]//span[2]/text()').extract()[0]  # degree
            # print degree
        else:
            degree = '不限学历'
        # print degree
        position_type = response.xpath('//p[@class="msg ltype"]/text()').extract()[0].strip()  # job category
        print response.xpath('//p[@class="msg ltype"]/text()').extract()[0]
        # Split the category string on '|' and strip whitespace per part.
        position_type = position_type.split('|')
        list_type = []
        for i in position_type:
            i = i.strip()
            list_type.append(i)
        position_type = list_type
        print position_type
        tags = f(response.xpath('//p[@class="t2"]/span/text()').extract())  # tags
        print tags
        pub_date = response.xpath('//div[@class="t1"]//span/text()').extract()  # publish date
        # print str(pub_date)
        # Pull the MM-DD date out of the concatenated span texts.
        date = re.compile(r'\d{2}-\d{2}')
        datep = date.findall(str(pub_date))
        # print datep[0]
        desc = response.xpath('//div[@class="bmsg job_msg inbox"]//p/text()').extract()
        # Description markup varies: sometimes <p> children, sometimes bare
        # text nodes directly under the div.
        if desc:
            position_desc1 = desc
        else:
            position_desc1 = response.xpath('//div[@class="bmsg job_msg inbox"]/text()').extract()  # job description
        position_desc = []
        # Drop empty fragments and surrounding whitespace.
        for i in position_desc1:
            i = i.strip()
            print i
            if i != '':
                position_desc.append(i.strip())
        print position_desc
        # work_address = ''.join(response.xpath('//div[@class="bmsg inbox"]/a/@onclick').extract())  # work address
        # if len(work_address) > 1:
        #     adds = work_address.split(',')[1]
        #     work_address = adds.split(')')[0]
        #     print work_address
        # else:
        #     work_address = work_address[0]
        item['url'] = self.md5(url)
        item['company'] = company
        item['tags'] = tags
        item['position'] = position
        item['salary'] = salary
        item['degree'] = degree
        item['location'] = location
        item['work_years'] = work_years
        item['position_type'] = position_type
        item['pub_date'] = datep[0]
        item['position_desc'] = position_desc
        item['work_address'] = work_address
        print item
        yield item
    else:
        pass