def Spider(self):
    """Crawl every queued 51job list page, scrape each posting's detail
    page, and push one record dict per job onto ``self.jobqueue``.

    Fixes vs. original: ``is None`` instead of ``== None``; bare
    ``except:`` narrowed to ``except Exception``; the list-page URL is
    no longer shadowed by the per-row detail URL.
    """
    while not self.pagequeue.empty():
        page_url = self.pagequeue.get()
        print('正在爬取:{}'.format(page_url))
        req = requests.get(page_url, headers=get_header())
        req.encoding = 'gbk'  # 51job list pages are GBK-encoded
        html = etree.HTML(req.text)
        # Result rows live in div[4]..div[53] of #resultList.
        for i in range(4, 54):
            try:
                title = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@title'.format(i))
                # An absent row raises IndexError here, handled below.
                if title[0] is None:
                    break
                name = html.xpath('//*[@id="resultList"]/div[{}]/span[1]/a/text()'.format(i))
                url = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@href'.format(i))
                print(url[0])
                area = html.xpath('//*[@id="resultList"]/div[{}]/span[2]/text()'.format(i))
                salery = html.xpath('//*[@id="resultList"]/div[{}]/span[3]/text()'.format(i))
                time = html.xpath('//*[@id="resultList"]/div[{}]/span[4]/text()'.format(i))
                # Fetch the posting's detail page.
                req1 = requests.get(url[0], headers=get_header())
                req1.encoding = 'gb2312'
                html1 = etree.HTML(req1.text)
                detail = ''.join(html1.xpath('//*[@class="bmsg job_msg inbox"]//*/text()'))
                if detail.isspace():
                    # Some pages keep the text directly in the container node.
                    detail = ''.join(html1.xpath('//*[@class="bmsg job_msg inbox"]/text()'))
                print(detail)
                gongsi = ''.join(html1.xpath('//*[@class="tmsg inbox"]/text()'))
                if gongsi.isspace():
                    gongsi = ''.join(html1.xpath('//*[@class="tmsg inbox"]//*/text()'))
                data = {
                    "职位名称": title[0],
                    "详细链接": url[0],
                    "公司名称": name[0],
                    "工作地点": area[0],
                    "薪资": salery[0] if len(salery) != 0 else None,
                    "发布时间": time[0],
                    "职位信息": detail,
                    "公司信息": gongsi
                }
                self.jobqueue.put(data)
            except Exception:
                # Missing fields (IndexError) or a failed detail request:
                # skip this row, keep crawling the page.
                continue
def Spider(self):
    """Scrape 30 pages of Lagou search results for ``self.keyword`` /
    ``self.city`` and push one record dict per job onto ``self.data``.

    Fix vs. original: the ``for data in datas`` loop variable was
    shadowed by the result dict also named ``data``; the record is now
    a separate name (consistent with the ``datax`` sibling spider).
    """
    list_url = 'https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput='
    for i in range(1, 31):
        s = requests.Session()
        # Warm-up GET: Lagou's JSON API rejects requests lacking the
        # cookies set by the HTML search page (anti-crawler measure).
        s.get(url=list_url, headers=get_header(), timeout=3)
        cookie = s.cookies
        req = requests.post(self.baseurl,
                            headers=self.header,
                            data={'first': True, 'pn': i, 'kd': self.keyword},
                            params={'px': 'default', 'city': self.city,
                                    'needAddtionalResult': 'false'},
                            cookies=cookie,
                            timeout=3)
        text = req.json()
        datas = text['content']['positionResult']['result']
        for job in datas:
            s = requests.Session()
            s.get(url=list_url, headers=get_header(), timeout=3)
            cookie1 = s.cookies
            url = 'https://www.lagou.com/jobs/' + str(job.get('positionId')) + '.html'
            req1 = requests.get(url, headers=self.header, cookies=cookie1)
            req1.encoding = 'utf-8'
            html = etree.HTML(req1.text)
            detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
            if detail.isspace():
                detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
            print(detail)
            record = {
                "职位名称": job.get('positionName'),
                "工作地点": job.get('district'),
                "薪资": job.get('salary'),
                "公司名称": job.get('companyFullName'),
                "经验要求": job.get('workYear'),
                "学历": job.get('education'),
                "福利": job.get('positionAdvantage'),
                "详细链接": url,
                "职位信息": detail
            }
            self.data.put(record)
def Spider(self):
    """Scrape 30 pages of Lagou results, queue each job on ``self.data``
    and persist it into the ``lagou_job`` table (module-level
    ``cursor``/``conn``; committed per row, as before).

    Cleanup vs. original: dead commented-out dict removed; the
    loop-invariant SQL text and warm-up URL are hoisted out of the loops.
    """
    list_url = 'https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput='
    insert_sql = """
        insert into lagou_job(keyword,city,positionName,district,salary,companyFullName,workYear,education,positionAdvantage,url,detail)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    for i in range(1, 31):
        s = requests.Session()
        # Warm-up GET: Lagou's JSON API requires cookies from the HTML page.
        s.get(url=list_url, headers=get_header(), timeout=3)
        cookie = s.cookies
        req = requests.post(self.baseurl,
                            headers=self.header,
                            data={'first': True, 'pn': i, 'kd': self.keyword},
                            params={'px': 'default', 'city': self.city,
                                    'needAddtionalResult': 'false'},
                            cookies=cookie,
                            timeout=3)
        text = req.json()
        datas = text['content']['positionResult']['result']
        for data in datas:
            s = requests.Session()
            s.get(url=list_url, headers=get_header(), timeout=3)
            cookie1 = s.cookies
            url = 'https://www.lagou.com/jobs/' + str(data.get('positionId')) + '.html'
            req1 = requests.get(url, headers=self.header, cookies=cookie1)
            req1.encoding = 'utf-8'
            html = etree.HTML(req1.text)
            detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
            if detail.isspace():
                detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
            print(detail)
            datax = {
                "positionName": data.get('positionName'),
                "district": data.get('district'),
                "salary": data.get('salary'),
                "companyFullName": data.get('companyFullName'),
                "workYear": data.get('workYear'),
                "education": data.get('education'),
                "positionAdvantage": data.get('positionAdvantage'),
                "url": url,
                "detail": detail
            }
            self.data.put(datax)
            # Parameterized insert; committed per row to keep the old
            # crash-recovery behavior.
            cursor.execute(insert_sql,
                           (self.keyword, self.city, datax['positionName'],
                            datax['district'], datax['salary'],
                            datax['companyFullName'], datax['workYear'],
                            datax['education'], datax['positionAdvantage'],
                            datax['url'], datax['detail']))
            conn.commit()
def Spider(self):
    """Crawl every queued 51job list page (list info only — no detail
    pages) and push one record dict per job onto ``self.jobqueue``.

    Fixes vs. original: ``is None`` instead of ``== None``; bare
    ``except:`` narrowed to ``except Exception`` (still breaks out of
    the row loop on the first missing row, as before).
    """
    while not self.pagequeue.empty():
        url = self.pagequeue.get()
        print('正在爬取:{}'.format(url))
        req = requests.get(url, headers=get_header())
        req.encoding = 'gbk'  # 51job list pages are GBK-encoded
        html = etree.HTML(req.text)
        # Result rows live in div[4]..div[53] of #resultList.
        for i in range(4, 54):
            try:
                title = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@title'.format(i))
                # An absent row raises IndexError -> caught below -> break.
                if title[0] is None:
                    break
                name = html.xpath('//*[@id="resultList"]/div[{}]/span[1]/a/text()'.format(i))
                url = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@href'.format(i))
                area = html.xpath('//*[@id="resultList"]/div[{}]/span[2]/text()'.format(i))
                salery = html.xpath('//*[@id="resultList"]/div[{}]/span[3]/text()'.format(i))
                time = html.xpath('//*[@id="resultList"]/div[{}]/span[4]/text()'.format(i))
                data = {
                    "职位名称": title[0],
                    "详细链接": url[0],
                    "公司名称": name[0],
                    "工作地点": area[0],
                    "薪资": salery[0],
                    "发布时间": time[0]
                }
                self.jobqueue.put(data)
            except Exception:
                break
def Spider(self):
    """Page through Boss Zhipin search results for ``self.keyword`` in
    ``self.city`` until the API stops returning an HTML fragment.

    Returns the jobs as a list of dicts; also fills one ``BossModel``
    instance per job (persistence is disabled).
    """
    collected = []
    city_code = self._get_city_code()
    page = 1
    while True:
        resp = requests.get(url=self.base_url,
                            params={'query': self.keyword,
                                    'page': page,
                                    'city': city_code},
                            headers=get_header())
        print(resp.url)
        resp.encoding = resp.apparent_encoding
        fragment = resp.json().get('html')
        if not fragment:
            break
        doc = etree.HTML(fragment)
        titles = doc.xpath('//*[@class="title"]/h4/text()')
        links = ['https://www.zhipin.com' + p
                 for p in doc.xpath('//*[@class="item"]/a/@href')]
        salaries = doc.xpath('//*[@class="salary"]/text()')
        companies = doc.xpath('//*[@class="name"]/text()')
        areas = doc.xpath('//*[@class="msg"]/em[1]/text()')
        exps = doc.xpath('//*[@class="msg"]/em[2]/text()')
        edus = doc.xpath('//*[@class="msg"]/em[3]/text()')
        for title, salary, company, area, exp, edu, link in zip(
                titles, salaries, companies, areas, exps, edus, links):
            job = {
                '职位名称': title,
                '职位链接': link,
                '公司名称': company,
                '工作地点': area,
                '薪资': salary,
                '工作经验': exp,
                '学历要求': edu,
            }
            model = BossModel()
            model.name = title
            model.url = link
            model.company_name = company
            model.area = area
            model.salary = salary
            model.work_exp = exp
            model.edu_leve = edu  # (sic) attribute name matches the model
            model.city = self.city
            model.key_word = self.keyword
            bounds = salary.split("-", 1)
            if len(bounds) >= 2:
                model.salary_max = bounds[1]
                model.salary_min = bounds[0]
            # model.save()  # persistence intentionally disabled
            collected.append(job)
        page += 1
    return collected
def _get_city_code(self):
    """Resolve ``self.city`` to Boss Zhipin's numeric city code.

    Falls back to ``'100010000'`` (nationwide) when the lookup fails or
    the city name is not found in the API's city list.
    """
    resp = requests.get(url='https://www.zhipin.com/wapi/zpCommon/data/city.json',
                        headers=get_header()).json()
    if resp['message'] == 'Success':
        # Provinces each carry their cities in subLevelModelList.
        for province in resp.get('zpData').get('cityList'):
            for entry in province['subLevelModelList']:
                if entry['name'] == self.city:
                    return str(entry['code'])
    return '100010000'  # 全国
def __init__(self, keyword, city='北京', thread=10, path=None):
    """Initialize the 51job spider.

    :param keyword: search keyword.
    :param city: target city name (default 北京).
    :param thread: worker thread count.
    :param path: output directory; defaults to the current working
        directory *at call time*. (The old ``path=os.getcwd()`` default
        was evaluated once at import time, freezing whatever cwd the
        importing process happened to have.)
    """
    self.keyword = keyword
    self.city = city
    self.thread = thread
    # Column order for the CSV export.
    self.csv_header = ['职位名称', '详细链接', '公司名称', '工作地点', '薪资', '发布时间']
    self.baseurl = 'https://search.51job.com/list/'
    self.header = get_header()
    self.path = path if path is not None else os.getcwd()
    self.pagequeue = queue.Queue()  # list-page URLs to crawl
    self.jobqueue = queue.Queue()   # scraped job records
def searchRequests(self, url):
    """
    发起查询请求 (issue a Liepin job-search request).

    :param url: 猎聘网搜索职位的url — an absolute URL, or a site-relative
        path used by the fallback request.
    :return: 返回搜索到的text (the response body).
    """
    try:
        r = requests.get(url, headers=get_header())
        r.encoding = 'utf-8'
    except Exception:
        # Fallback: treat `url` as a path under liepin.com.
        # NOTE(review): the fallback sends no headers and keeps the
        # auto-detected encoding — confirm that is intentional.
        r = requests.get("https://www.liepin.com" + url)
    # The original returned from a `finally` block, which silently
    # swallowed any exception raised by the fallback request; a plain
    # return lets a double failure propagate to the caller.
    return r.text
def get_one_page(query, n):
    """Fetch page *n* of Kanzhun interview-search results for *query*.

    :param query: search keyword.
    :param n: 1-based page number.
    :return: raw response text on HTTP 200, otherwise ``None`` (also on
        any request failure).
    """
    data = {
        'query': query,
        'cityCode': 0,
        'pageNum': n,
    }
    url = 'https://www.kanzhun.com/search/interview.json?' + urlencode(data)
    try:
        response = requests.get(url, headers=get_header(), timeout=3)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # The old `except ConnectionError` only caught the *builtin*
        # ConnectionError; requests' ConnectionError and Timeout (we pass
        # timeout=3) derive from RequestException and escaped the handler.
        print('请求网页出错')
        return None
def Spider(self):
    """Fetch up to ``self.page`` pages of Zhilian search results via the
    JSON API and return them as a list of job dicts.

    Fix vs. original: the bare ``except:`` around the workingExp chain is
    narrowed to ``AttributeError`` (raised when ``workingExp`` is None).
    """
    jobl = []
    for page in range(self.page):
        params = {
            "start": 90 * page,   # API pages are 90 results wide
            "pageSize": 90,
            "workExperience": -1,  # -1 == no filter
            "education": -1,
            "companyType": -1,
            "employmentType": -1,
            "jobWelfareTag": -1,
            "kw": self.keyword,
            "kt": 3,
            "cityId": self.city,
            "salary": '0, 0'
        }
        req = requests.get(url=self.base_url, params=params, headers=get_header())
        data = req.json()['data']['results']
        if len(data) == 0:
            # Empty page -> past the last page of results.
            break
        for job in data:
            print(job)
            company = job.get('company')
            jobd = {}
            jobd['ID'] = job.get('number')
            jobd['工作名称'] = job.get('jobName')
            jobd['招聘详细链接'] = job.get('positionURL')
            jobd['公司名称'] = company.get('name')
            jobd['公司ID'] = company.get('number')
            jobd['公司性质'] = company.get('type').get('name')
            jobd['公司规模'] = company.get('size').get('name')
            jobd['公司招聘主页'] = company.get('url')
            jobd['公司地点'] = job.get('city').get('display')
            jobd['薪资'] = job.get('salary')
            jobd['学历要求'] = job.get('eduLevel').get('name')
            try:
                jobd['工作经历'] = job.get('workingExp').get('name')
            except AttributeError:
                # workingExp may be absent (None) for some postings.
                jobd['工作经历'] = '经验不限'
            jobd['职位类型'] = job.get('emplType')
            jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
            jobd['工作发布标签'] = job.get('timeState')
            jobd['更新时间'] = job.get('updateDate')
            jobl.append(jobd)
    return jobl
def Spider(self):
    """Collect Boss Zhipin search results page by page until the API
    returns no HTML fragment, and return them as a list of job dicts.
    """
    results = []
    city_code = self._get_city_code()
    page = 1
    while True:
        resp = requests.get(url=self.base_url,
                            params={'query': self.keyword,
                                    'page': page,
                                    'city': city_code},
                            headers=get_header())
        print(resp.url)
        resp.encoding = resp.apparent_encoding
        fragment = resp.json().get('html')
        if not fragment:
            break
        doc = etree.HTML(fragment)
        rows = zip(doc.xpath('//*[@class="title"]/h4/text()'),
                   doc.xpath('//*[@class="salary"]/text()'),
                   doc.xpath('//*[@class="name"]/text()'),
                   doc.xpath('//*[@class="msg"]/em[1]/text()'),
                   doc.xpath('//*[@class="msg"]/em[2]/text()'),
                   doc.xpath('//*[@class="msg"]/em[3]/text()'),
                   ('https://www.zhipin.com' + p
                    for p in doc.xpath('//*[@class="item"]/a/@href')))
        for title, salary, company, area, exp, edu, link in rows:
            results.append({
                '职位名称': title,
                '职位链接': link,
                '公司名称': company,
                '工作地点': area,
                '薪资': salary,
                '工作经验': exp,
                '学历要求': edu,
            })
        page += 1
    return results
def Spider(self):
    """Crawl queued 51job list pages, scrape each posting's detail page,
    fill a ``Qcwy`` model per job (persistence disabled), and push a
    record dict onto ``self.jobqueue``.

    Fixes vs. original: ``is None`` instead of ``== None``; bare
    ``except:`` narrowed to ``except Exception``; the list-page URL is
    no longer shadowed by the per-row detail URL.
    """
    while not self.pagequeue.empty():
        page_url = self.pagequeue.get()
        print('正在爬取:{}'.format(page_url))
        req = requests.get(page_url, headers=get_header())
        req.encoding = 'gbk'  # 51job list pages are GBK-encoded
        html = etree.HTML(req.text)
        # Result rows live in div[4]..div[53] of #resultList.
        for i in range(4, 54):
            try:
                title = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@title'.format(i))
                # An absent row raises IndexError here, handled below.
                if title[0] is None:
                    break
                name = html.xpath('//*[@id="resultList"]/div[{}]/span[1]/a/text()'.format(i))
                url = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@href'.format(i))
                print(url[0])
                area = html.xpath('//*[@id="resultList"]/div[{}]/span[2]/text()'.format(i))
                salery = html.xpath('//*[@id="resultList"]/div[{}]/span[3]/text()'.format(i))
                time = html.xpath('//*[@id="resultList"]/div[{}]/span[4]/text()'.format(i))
                req1 = requests.get(url[0], headers=get_header())
                req1.encoding = 'gb2312'
                html1 = etree.HTML(req1.text)
                detail = ''.join(html1.xpath('//*[@class="bmsg job_msg inbox"]//*/text()'))
                if detail.isspace():
                    detail = ''.join(html1.xpath('//*[@class="bmsg job_msg inbox"]/text()'))
                print(detail)
                gongsi = ''.join(html1.xpath('//*[@class="tmsg inbox"]/text()'))
                if gongsi.isspace():
                    gongsi = ''.join(html1.xpath('//*[@class="tmsg inbox"]//*/text()'))
                jobDetail = Qcwy()
                jobDetail.title = title[0]
                jobDetail.url = url[0]
                jobDetail.company_name = name[0]
                jobDetail.area = area[0]
                jobDetail.salery = salery[0] if len(salery) != 0 else None
                jobDetail.time = time[0]
                jobDetail.detail = detail
                jobDetail.company_info = gongsi
                jobDetail.city = self.city
                jobDetail.key_word = self.keyword
                if len(salery) > 0:
                    # Split "low-high" into min/max bounds when present.
                    bounds = salery[0].split("-", 1)
                    if len(bounds) >= 2:
                        jobDetail.salery_max = bounds[1]
                        jobDetail.salery_min = bounds[0]
                # jobDetail.save()  # persistence intentionally disabled
                data = {
                    "职位名称": title[0],
                    "详细链接": url[0],
                    "公司名称": name[0],
                    "工作地点": area[0],
                    "薪资": salery[0] if len(salery) != 0 else None,
                    "发布时间": time[0],
                    "职位信息": detail,
                    "公司信息": gongsi
                }
                self.jobqueue.put(data)
            except Exception:
                continue
def Spider(self):
    """Scrape 10 pages of Boss Zhipin results and queue one dict per job.

    Page 1 is read through the Selenium ``browser``; later pages are
    parsed from the page source. Each job's detail page is then fetched
    with requests and the record is queued on ``self.downloadqueue``;
    the parsed-page branch also inserts into the ``boss`` table.

    Fixes vs. original: the else-branch ``job`` xpath had the row index
    hard-coded as ``li[1]`` (``.format(i)`` was a no-op); bare excepts
    narrowed; dead commented-out code and the unused ``params`` removed.

    NOTE(review): ``html = pq(htmlStr)`` builds a pyquery document but
    the else branch calls ``html.xpath(...)``, which pyquery does not
    provide — every page > 1 iteration raises and is swallowed by the
    outer except. Confirm whether ``etree.HTML(htmlStr)`` was intended.
    """
    for page in range(1, 11):
        browser.get("https://www.zhipin.com/c101280600?query=java&page=" + str(page))
        htmlStr = browser.page_source
        html = pq(htmlStr)
        if page == 1:
            for i in range(1, 30):
                title = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/div[1]'.format(i)).text
                link = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a'.format(i)).get_attribute('href')
                name = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/h3/a'.format(i)).text
                # "area exp study" live in one space-separated <p>.
                data1 = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/p'.format(i)).text.split(' ')
                area = data1[0]
                salery = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/span'.format(i)).text
                exp = data1[1]
                study = data1[2]
                # NOTE(review): belong/status/size all read the same node —
                # looks like a copy-paste; confirm the intended xpaths.
                belong = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p'.format(i)).text
                status = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p'.format(i)).text
                try:
                    size = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p'.format(i)).text
                except Exception:
                    size = '无'
                Hr = browser.find_element_by_xpath(
                    '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3'.format(i)).text
                req1 = requests.get(url=link, headers=get_header())
                req1.encoding = 'utf-8'
                html1 = etree.HTML(req1.text)
                detail = ''.join(html1.xpath('//*[@class="job-sec"][1]//*/text()')).strip()
                if detail.isspace():
                    detail = ''.join(html1.xpath('//*[@class="job-sec"][1]/text()')).strip()
                print(detail)
                gongsi = ''.join(html1.xpath('//*[@class="job-sec company-info"]//*/text()')).strip()
                gongshang = ''.join(html1.xpath('//*[@class="job-sec"][3]//*/text()')).strip()
                if '点击查看地图' in gongshang:
                    # Third section was the map; business info is section 2.
                    gongshang = ''.join(html1.xpath('//*[@class="job-sec"][2]//*/text()')).strip()
                data = {}
                data.update(职位名称=title, 公司名称=name, 职位链接=link,
                            工作地点=area, 薪资=salery, 工作经验=exp,
                            学历要求=study, 所属领域=belong, 公司状态=status,
                            公司规模=size, 发布人=Hr, 职位信息=detail,
                            公司介绍=gongsi, 工商信息=gongshang)
                self.downloadqueue.put(data)
        else:
            try:
                for i in range(1, 31):
                    title = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                    name = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                    link = self.base_url.rstrip('/') + html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/@href'.format(i))[0]
                    data1 = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                    area = data1[0]
                    salery = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                    exp = data1[1]
                    study = data1[2]
                    belong = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                    status = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                    try:
                        size = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                    except Exception:
                        size = '无'
                    who = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                    try:
                        # BUG FIX: the row index was hard-coded as li[1].
                        job = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                    except Exception:
                        job = '无'
                    Hr = '{}/{}'.format(who, job)
                    req1 = requests.get(url=link, headers=get_header())
                    req1.encoding = 'utf-8'
                    html1 = etree.HTML(req1.text)
                    detail = ''.join(html1.xpath('//*[@class="job-sec"][1]//*/text()')).strip()
                    gongsi = ''.join(html1.xpath('//*[@class="job-sec company-info"]//*/text()')).strip()
                    gongshang = ''.join(html1.xpath('//*[@class="job-sec"][3]//*/text()')).strip()
                    if '点击查看地图' in gongshang:
                        gongshang = ''.join(html1.xpath('//*[@class="job-sec"][2]//*/text()')).strip()
                    print(detail)
                    data = {}
                    data.update(职位名称=title, 公司名称=name, 职位链接=link,
                                工作地点=area, 薪资=salery, 工作经验=exp,
                                学历要求=study, 所属领域=belong, 公司状态=status,
                                公司规模=size, 发布人=Hr, 职位信息=detail,
                                公司介绍=gongsi, 工商信息=gongshang)
                    self.downloadqueue.put(data)
                    insert_sql = """
                        insert into boss(title,name,link,area,salery,exp,study,belong,status,size,Hr,detail,gongsi,gongshang)
                        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    """
                    cursor.execute(insert_sql,
                                   (title, name, link, area, salery, exp, study,
                                    belong, status, size, Hr, detail, gongsi,
                                    gongshang))
                    conn.commit()
            except Exception as e:
                continue
def Spider(self):
    """Fetch Zhilian search results, scrape each position's detail page,
    insert every job into the ``zhilian`` table and return the job dicts.

    Fixes vs. original: the huge session cookie is hoisted out of the
    inner loop into one constant; the bare ``except:`` is narrowed; the
    commented-out ``config.ZHILIAN_COOKIE`` line is turned into a note.

    NOTE(review): the hard-coded session cookie below will go stale and
    then detail pages return a login/anti-bot page — it should come from
    config (config.ZHILIAN_COOKIE) or a fresh login.
    """
    ZHILIAN_COOKIE = 'adfbid2=0; x-zp-client-id=912a3ef6-bb0d-46c9-a396-ea9566250f8a; sts_deviceid=16bd0b9394299-0b310c2e8d3a19-e343166-2073600-16bd0b93943668; acw_tc=2760824a15648151356115930e8773f8ab7c33fc21043ce1c44b11cd7f5da1; _uab_collina=156481966519005574477937; urlfrom2=121114583; adfcid2=www.baidu.com; sou_experiment=capi; ZP_OLD_FLAG=false; CANCELALL=1; LastCity=%E5%8C%97%E4%BA%AC; LastCity%5Fid=530; dywea=95841923.417992036732763400.1562574932.1564819776.1564824996.3; dywez=95841923.1564824996.3.3.dywecsr=baidu|dyweccn=(organic)|dywecmd=organic; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1562574933,1564819776,1564824996; __utma=269921210.1630440294.1562574933.1564819776.1564824996.3; __utmz=269921210.1564824996.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216bd0b9395e473-06c4ddd9790ee2-e343166-2073600-16bd0b9395f866%22%2C%22%24device_id%22%3A%2216bd0b9395e473-06c4ddd9790ee2-e343166-2073600-16bd0b9395f866%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baiduPC%22%2C%22%24latest_utm_medium%22%3A%22CPC%22%2C%22%24latest_utm_campaign%22%3A%22pp%22%2C%22%24latest_utm_content%22%3A%22pp%22%2C%22%24latest_utm_term%22%3A%228804373%22%7D%7D; sts_sg=1; sts_chnlsid=Unknown; jobRiskWarning=true; sts_sid=16c5fc1dd9645-03591d4b89f0ab-e343166-2073600-16c5fc1dd97a91; acw_sc__=5d479f892414863f1ea95e2ce7352ba574fd9d2f; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCC120116927J00122526309.htm; ZL_REPORT_GLOBAL={%22jobs%22:{%22recommandActionidShare%22:%22de9c9056-061b-4d15-9b59-d930c831f19e-job%22%2C%22funczoneShare%22:%22dtl_best_for_you%22}%2C%22company%22:{%22actionid%22:%22fe6b5ca0-7e6b-4587-874f-4f39773534a2-company%22%2C%22funczone%22:%22hiring_jd%22}}; sts_evtseq=14'
    insert_sql = """
        insert into zhilian(number ,jobName ,positionURL ,comName ,comNumber ,comType ,comSize ,comUrl ,comCity ,salary ,eduLevel ,workingExp ,emplType ,walfare ,timeState ,updateDate ,detail)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    jobl = []
    for page in range(self.page):
        params = {
            "start": 90 * page,
            "pageSize": 90,
            "workExperience": -1,
            "education": -1,
            "companyType": -1,
            "employmentType": -1,
            "jobWelfareTag": -1,
            "kw": self.keyword,
            "kt": 3,
            "cityId": self.city,
            "salary": '0, 0'
        }
        req = requests.get(url=self.base_url, params=params, headers=get_header())
        cookie = req.cookies
        print(cookie)
        data = req.json()['data']['results']
        if len(data) == 0:
            break
        for job in data:
            company = job.get('company')
            jobd = {}
            jobd['ID'] = job.get('number')
            jobd['工作名称'] = job.get('jobName')
            jobd['招聘详细链接'] = job.get('positionURL')
            jobd['公司名称'] = company.get('name')
            jobd['公司ID'] = company.get('number')
            jobd['公司性质'] = company.get('type').get('name')
            jobd['公司规模'] = company.get('size').get('name')
            jobd['公司招聘主页'] = company.get('url')
            jobd['公司地点'] = job.get('city').get('display')
            jobd['薪资'] = job.get('salary')
            jobd['学历要求'] = job.get('eduLevel').get('name')
            try:
                jobd['工作经历'] = job.get('workingExp').get('name')
            except AttributeError:
                # workingExp may be absent (None) for some postings.
                jobd['工作经历'] = '经验不限'
            jobd['职位类型'] = job.get('emplType')
            jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
            jobd['工作发布标签'] = job.get('timeState')
            jobd['更新时间'] = job.get('updateDate')
            # The detail page requires a referer and a session cookie.
            header = get_header()
            header['referer'] = job.get('positionURL')
            header['upgrade-insecure-requests'] = '1'
            header['cookie'] = ZHILIAN_COOKIE  # TODO: use config.ZHILIAN_COOKIE
            req1 = requests.get(job.get('positionURL'), headers=header)
            req1.encoding = 'utf-8'
            html = etree.HTML(req1.text)
            detail = ''.join(html.xpath('//*[@class="describtion__detail-content"]//*/text()'))
            if not detail:
                detail = ''.join(html.xpath('//*[@class="describtion__detail-content"]/text()'))
            print(detail)
            jobd['职位描述'] = detail.strip()
            cursor.execute(
                insert_sql,
                (job.get('number'), job.get('jobName'), job.get('positionURL'),
                 company.get('name'), company.get('number'),
                 company.get('type').get('name'), company.get('size').get('name'),
                 company.get('url'), job.get('city').get('display'),
                 job.get('salary'), job.get('eduLevel').get('name'),
                 job.get('workingExp').get('name')
                 if job.get('workingExp') and job.get('workingExp').get('name')
                 else '经验不限',
                 job.get('emplType'), '、'.join(job.get('welfare')) or '无',
                 job.get('timeState'), job.get('updateDate'), detail.strip()))
            conn.commit()
            jobl.append(jobd)
    return jobl
def Spider(self):
    """Scrape 10 pages of Boss Zhipin search results (list info only —
    no detail pages) and queue one dict per job on ``self.downloadqueue``.

    Page 1's result container is div[3]; later pages use div[2].

    Fixes vs. original: the else-branch ``job`` xpath had the row index
    hard-coded as ``li[1]`` (``.format(i)`` was a no-op); bare excepts
    narrowed to ``except Exception``.
    """
    for page in range(1, 11):
        params = {'query': self.keyword, 'page': page}
        req = requests.get(url=self.base_url + 'c' + self._get_city_code(),
                           params=params,
                           headers=get_header())
        print(req.url)
        html = etree.HTML(req.text)
        if page == 1:
            for i in range(1, 30):
                title = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                name = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                # area / exp / study share one <p> as separate text nodes.
                data1 = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                area = data1[0]
                salery = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                exp = data1[1]
                study = data1[2]
                belong = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                status = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                try:
                    size = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                except Exception:
                    size = '无'
                who = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                try:
                    job = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                except Exception:
                    job = '无'
                Hr = '{}/{}'.format(who, job)
                data = {}
                data.update(职位名称=title, 公司名称=name, 工作地点=area,
                            薪资=salery, 工作经验=exp, 学历要求=study,
                            所属领域=belong, 公司状态=status, 公司规模=size,
                            发布人=Hr)
                self.downloadqueue.put(data)
        else:
            try:
                for i in range(1, 31):
                    title = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                    name = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                    data1 = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                    area = data1[0]
                    salery = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                    exp = data1[1]
                    study = data1[2]
                    belong = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                    status = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                    try:
                        size = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                    except Exception:
                        size = '无'
                    who = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                    try:
                        # BUG FIX: the row index was hard-coded as li[1].
                        job = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                    except Exception:
                        job = '无'
                    Hr = '{}/{}'.format(who, job)
                    data = {}
                    data.update(职位名称=title, 公司名称=name, 工作地点=area,
                                薪资=salery, 工作经验=exp, 学历要求=study,
                                所属领域=belong, 公司状态=status, 公司规模=size,
                                发布人=Hr)
                    self.downloadqueue.put(data)
            except Exception:
                continue
def Spider(self):
    """Scrape 10 pages of Boss Zhipin results plus each job's detail
    page, and queue one dict per job on ``self.downloadqueue``.

    Page 1's result container is div[3]; later pages use div[2].

    Fixes vs. original: the else-branch ``job`` xpath had the row index
    hard-coded as ``li[1]`` (``.format(i)`` was a no-op); bare excepts
    narrowed to ``except Exception``.
    """
    for page in range(1, 11):
        params = {'query': self.keyword, 'page': page}
        req = requests.get(url=self.base_url + 'c' + self._get_city_code(),
                           params=params,
                           headers=get_header())
        print(req.url)
        html = etree.HTML(req.text)
        if page == 1:
            for i in range(1, 30):
                title = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                link = self.base_url.rstrip('/') + html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/@href'.format(i))[0]
                name = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                # area / exp / study share one <p> as separate text nodes.
                data1 = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                area = data1[0]
                salery = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                exp = data1[1]
                study = data1[2]
                belong = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                status = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                try:
                    size = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                except Exception:
                    size = '无'
                who = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                try:
                    job = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                except Exception:
                    job = '无'
                Hr = '{}/{}'.format(who, job)
                req1 = requests.get(url=link, headers=get_header())
                req1.encoding = 'utf-8'
                html1 = etree.HTML(req1.text)
                detail = ''.join(html1.xpath('//*[@class="job-sec"][1]//*/text()')).strip()
                if detail.isspace():
                    detail = ''.join(html1.xpath('//*[@class="job-sec"][1]/text()')).strip()
                print(detail)
                gongsi = ''.join(html1.xpath('//*[@class="job-sec company-info"]//*/text()')).strip()
                gongshang = ''.join(html1.xpath('//*[@class="job-sec"][3]//*/text()')).strip()
                if '点击查看地图' in gongshang:
                    # Third section was the map; business info is section 2.
                    gongshang = ''.join(html1.xpath('//*[@class="job-sec"][2]//*/text()')).strip()
                data = {}
                data.update(职位名称=title, 公司名称=name, 职位链接=link,
                            工作地点=area, 薪资=salery, 工作经验=exp,
                            学历要求=study, 所属领域=belong, 公司状态=status,
                            公司规模=size, 发布人=Hr, 职位信息=detail,
                            公司介绍=gongsi, 工商信息=gongshang)
                self.downloadqueue.put(data)
        else:
            try:
                for i in range(1, 31):
                    title = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                    name = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                    link = self.base_url.rstrip('/') + html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/@href'.format(i))[0]
                    data1 = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                    area = data1[0]
                    salery = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                    exp = data1[1]
                    study = data1[2]
                    belong = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                    status = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                    try:
                        size = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                    except Exception:
                        size = '无'
                    who = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                    try:
                        # BUG FIX: the row index was hard-coded as li[1].
                        job = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                    except Exception:
                        job = '无'
                    Hr = '{}/{}'.format(who, job)
                    req1 = requests.get(url=link, headers=get_header())
                    req1.encoding = 'utf-8'
                    html1 = etree.HTML(req1.text)
                    detail = ''.join(html1.xpath('//*[@class="job-sec"][1]//*/text()')).strip()
                    gongsi = ''.join(html1.xpath('//*[@class="job-sec company-info"]//*/text()')).strip()
                    gongshang = ''.join(html1.xpath('//*[@class="job-sec"][3]//*/text()')).strip()
                    if '点击查看地图' in gongshang:
                        gongshang = ''.join(html1.xpath('//*[@class="job-sec"][2]//*/text()')).strip()
                    print(detail)
                    data = {}
                    data.update(职位名称=title, 公司名称=name, 职位链接=link,
                                工作地点=area, 薪资=salery, 工作经验=exp,
                                学历要求=study, 所属领域=belong, 公司状态=status,
                                公司规模=size, 发布人=Hr, 职位信息=detail,
                                公司介绍=gongsi, 工商信息=gongshang)
                    self.downloadqueue.put(data)
            except Exception as e:
                continue
def Spider(self):
    """Fetch Zhilian search results via the JSON API, scrape each
    position's detail page (using the session cookie from config), and
    return the jobs as a list of dicts.

    Fix vs. original: the bare ``except:`` around the workingExp chain
    is narrowed to ``AttributeError`` (raised when workingExp is None).
    """
    jobl = []
    for page in range(self.page):
        params = {
            "start": 90 * page,   # API pages are 90 results wide
            "pageSize": 90,
            "workExperience": -1,  # -1 == no filter
            "education": -1,
            "companyType": -1,
            "employmentType": -1,
            "jobWelfareTag": -1,
            "kw": self.keyword,
            "kt": 3,
            "cityId": self.city,
            "salary": '0, 0'
        }
        req = requests.get(url=self.base_url, params=params, headers=get_header())
        cookie = req.cookies
        print(cookie)
        data = req.json()['data']['results']
        if len(data) == 0:
            # Empty page -> past the last page of results.
            break
        for job in data:
            company = job.get('company')
            jobd = {}
            jobd['ID'] = job.get('number')
            jobd['工作名称'] = job.get('jobName')
            jobd['招聘详细链接'] = job.get('positionURL')
            jobd['公司名称'] = company.get('name')
            jobd['公司ID'] = company.get('number')
            jobd['公司性质'] = company.get('type').get('name')
            jobd['公司规模'] = company.get('size').get('name')
            jobd['公司招聘主页'] = company.get('url')
            jobd['公司地点'] = job.get('city').get('display')
            jobd['薪资'] = job.get('salary')
            jobd['学历要求'] = job.get('eduLevel').get('name')
            try:
                jobd['工作经历'] = job.get('workingExp').get('name')
            except AttributeError:
                # workingExp may be absent (None) for some postings.
                jobd['工作经历'] = '经验不限'
            jobd['职位类型'] = job.get('emplType')
            jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
            jobd['工作发布标签'] = job.get('timeState')
            jobd['更新时间'] = job.get('updateDate')
            # Detail pages require a referer plus a logged-in cookie.
            header = get_header()
            header['referer'] = job.get('positionURL')
            header['upgrade-insecure-requests'] = '1'
            header['cookie'] = config.ZHILIAN_COOKIE
            req1 = requests.get(
                job.get('positionURL'),
                headers=header,
            )
            req1.encoding = 'utf-8'
            html = etree.HTML(req1.text)
            detail = ''.join(html.xpath('//*[@class="describtion__detail-content"]//*/text()'))
            if not detail:
                detail = ''.join(html.xpath('//*[@class="describtion__detail-content"]/text()'))
            print(job.get('positionURL'))
            print(detail)
            jobd['职位描述'] = detail.strip()
            jobl.append(jobd)
    return jobl