def __main(self, url):
    """Crawl one 51job listing page and persist every position found.

    Fetches *url*, retries once on failure, extracts (title, url, company,
    salary) tuples with a regex, then de-duplicates by URL, throttles, pulls
    the detail page and inserts each record.  Returns False when both fetch
    attempts fail; otherwise returns None.
    """
    util.logger.warning("正在爬取%s" % url)
    r = util.get(url)
    # If the first request yields nothing, retry once; give up with False.
    if r[0] == 0:
        r = util.get(url)
        if r[0] == 0:
            return False
    # Page is GBK-encoded; strip newlines/tabs so the regex can span markup
    # that originally wrapped across lines (pattern taken from browser
    # network/DOM inspection).
    body = r[1].decode("gbk").replace('\n', '').replace('\r', '').replace('\t', '')
    result = re.findall(
        '<div class="el">.*?_blank" title="(.*?)" href="(.*?)" onmousedown="">.*?<a target="_blank" title="(.*?)".*? <span class="t4">(.*?)</span>',
        body)
    for item in result:
        self.posName, self.url, self.company, self.salary = item
        # 1. Skip URLs already stored (de-duplication).
        if self.getDataByUrl():
            continue
        # 2. Throttle between requests to avoid anti-crawler blocking.
        time.sleep(5)
        self.__detail()    # fetch the job-description detail page
        self.insertData()  # persist the record to the database
def __main(self, url):
    """Crawl one list page and persist every (date, url, title) entry.

    Retries the fetch once; returns False when both attempts fail,
    otherwise returns None.
    """
    util.logger.warning("正在爬取%s" % url)
    r = util.get(url)
    # One retry before giving up.
    if r[0] == 0:
        r = util.get(url)
        if r[0] == 0:
            return False
    # Strip layout whitespace so the regex can span wrapped markup.
    body = r[1].decode().replace('\n', '').replace('\r', '').replace('\t', '')
    result = re.findall(
        '<li><h4>(.*?)</h4><a href=(.*?)target="_blank">(.*?)</a></li>',
        body)
    for item in result:
        self.date, self.url, self.title = item
        # 1. Skip URLs already stored (de-duplication).
        if self.getDataByUrl():
            continue
        # 2. Throttle between requests to avoid anti-crawler blocking.
        time.sleep(5)
        self.insertData()
def __main(self, url, parms):
    """Crawl one page of the Zhaopin JSON search API and persist each job.

    *parms* is passed through as the request query parameters.  Retries the
    fetch once; returns False when both attempts fail, otherwise None.
    """
    util.logger.warning("正在爬取%s" % url)
    r = util.get(url, params=parms)
    # One retry before giving up.
    if r[0] == 0:
        r = util.get(url, params=parms)
        if r[0] == 0:
            return False
    # Response shape taken from browser network inspection:
    # {"data": {"results": [ {...job...}, ... ]}}
    jobs = json.loads(r[1].decode())["data"]["results"]
    for item in jobs:
        self.posName = item["jobName"]
        self.url = item["positionURL"]
        self.salary = item["salary"]
        self.workExp = item["workingExp"]["name"]
        self.edu = item["eduLevel"]["name"]
        self.company = item["company"]["name"]
        # 1. Skip URLs already stored (de-duplication).
        if self.getDataByUrl():
            continue
        # 2. Throttle between requests to avoid anti-crawler blocking.
        time.sleep(2)
        self.__detail()    # fetch the job-description detail page
        self.insertData()  # persist the record to the database
def __main(self, url):
    """Crawl one Boss Zhipin listing page and persist every position found.

    Retries the fetch once; returns False when both attempts fail,
    otherwise returns None.
    """
    util.logger.warning("正在爬取%s" % url)
    r = util.get(url, headers=self.__headers)
    if r[0] == 0:
        # BUGFIX: the retry previously dropped headers=self.__headers,
        # making the second attempt much more likely to be rejected.
        r = util.get(url, headers=self.__headers)
        if r[0] == 0:
            return False
    # Strip layout whitespace so the regex can span wrapped markup.
    body = r[1].decode().replace('\n', '').replace('\r', '').replace('\t', '')
    result = re.findall(
        'class="job-primary">.*?href="(.*?)".*?job-title">(.*?)</div>.*?"red">(.*?)</span>.*?</em>(.*?)<em.*?/em>(.*?)</p>.*?blank">(.*?)</a>',
        body)
    for item in result:
        self.posName = item[1]
        # Listing hrefs are site-relative; prefix the host.
        self.url = "https://www.zhipin.com%s" % item[0]
        self.salary = item[2]
        self.workExp = item[3]
        self.edu = item[4]
        self.company = item[5]
        # 1. Skip URLs already stored (de-duplication).
        if self.getDataByUrl():
            continue
        # 2. Throttle between requests to avoid anti-crawler blocking.
        time.sleep(2)
        self.__detail()    # fetch the job-description detail page
        self.insertData()  # persist the record to the database
def __detail(self):
    """Fetch the detail page at self.url and store its content in self.body.

    Retries the fetch once; returns False when both attempts fail,
    otherwise returns None.  self.body is only assigned when the regex
    matches, so a stale value may remain on extraction failure.
    """
    util.logger.warning("正在爬取明细页面%s" % self.url)
    r = util.get(self.url, headers=self.__headers)
    # One retry before giving up.
    if r[0] == 0:
        r = util.get(self.url, headers=self.__headers)
        if r[0] == 0:
            return False
    # Strip layout whitespace so the regex can span wrapped markup.
    body = r[1].decode().replace('\n', '').replace('\r', '').replace('\t', '')
    matches = re.findall('class="detail_content">(.*?)<script>', body)
    # Extraction can fail on unexpected page layouts; only assign on match.
    if matches:
        self.body = matches[0]
def __detail(self):
    """Fetch the job-description page at self.url and store it in self.detail.

    Retries the fetch once; returns False when both attempts fail,
    otherwise returns None.  self.detail is only assigned when the regex
    matches, so a stale value may remain on extraction failure.
    """
    util.logger.warning("正在爬取明细页面%s" % self.url)
    r = util.get(self.url)
    # One retry before giving up.
    if r[0] == 0:
        r = util.get(self.url)
        if r[0] == 0:
            return False
    # Strip layout whitespace so the regex can span wrapped markup.
    body = r[1].decode().replace('\n', '').replace('\r', '').replace('\t', '')
    matches = re.findall('class="pos-ul">(.*?)</div>', body)
    # Extraction can fail on unexpected page layouts; only assign on match.
    if matches:
        self.detail = matches[0]
def __detail(self):
    """Fetch the job-description page at self.url and store it in self.detail.

    Retries the fetch once; returns False when both attempts fail,
    otherwise returns None.  self.detail is only assigned when the regex
    matches, so a stale value may remain on extraction failure.
    """
    util.logger.warning("正在爬取明细页面%s" % self.url)
    r = util.get(self.url, headers=self.__headers)
    # One retry before giving up.
    if r[0] == 0:
        r = util.get(self.url, headers=self.__headers)
        if r[0] == 0:
            return False
    # Strip layout whitespace so the regex can span wrapped markup.
    body = r[1].decode().replace('\n', '').replace('\r', '').replace('\t', '')
    matches = re.findall('<h3>职位描述</h3>(.*?)</div>', body)
    # Extraction can fail on unexpected page layouts; only assign on match.
    if matches:
        self.detail = matches[0]
def __main(self, url):
    """Crawl one job-listing page with BeautifulSoup and persist each job.

    Retries the fetch once; returns False when both attempts fail,
    otherwise returns None.

    NOTE(review): unlike the sibling crawlers, URL de-duplication
    (getDataByUrl), the detail-page fetch and the company/education
    extraction are disabled here — confirm whether that is intentional
    work-in-progress before relying on this crawler.
    """
    util.logger.warning("正在爬取%s" % url)
    r = util.get(url)
    # One retry before giving up.
    if r[0] == 0:
        r = util.get(url)
        if r[0] == 0:
            return False
    soup = BeautifulSoup(r[1].decode(), 'lxml')
    for item in soup.find_all("div", class_="job_name"):
        self.posName = item.text
        # The salary element is the sibling immediately after the name div —
        # TODO confirm this holds for every row of the page.
        self.salary = item.next_sibling.next
        print(self.salary)
        print(self.posName)
        # Throttle between inserts to avoid anti-crawler blocking.
        time.sleep(2)
        self.insertData()  # persist the record to the database