Example No. 1
0
    def __main(self, url):
        """Crawl one listing page and store every job row matched on it.

        For each regex match this dedupes by url, crawls the detail page,
        then inserts the record. Returns False when the page could not be
        fetched after one retry; otherwise returns None.
        """
        util.logger.warning("正在爬取%s" % url)
        r = util.get(url)
        # If the first fetch yielded nothing, retry once; give up (return
        # False) after a second miss.
        if r[0] == 0:
            r = util.get(url)
        if r[0] == 0:
            return False
        # Page is GBK-encoded; flatten newlines/tabs so the regex can match
        # across the original layout. Pattern derived from the page's
        # network capture (see browser dev tools).
        body = r[1].decode("gbk").replace('\n',
                                          '').replace('\r',
                                                      '').replace('\t', '')

        result = re.findall(
            '<div class="el">.*?_blank" title="(.*?)" href="(.*?)" onmousedown="">.*?<a target="_blank" title="(.*?)".*? <span class="t4">(.*?)</span>',
            body)

        # An empty result list simply skips the loop; no guard needed.
        for item in result:
            self.posName, self.url, self.company, self.salary = item
            # 1. Dedupe on url: skip rows already stored.
            #    (idiom fix: truthiness test instead of `== True`)
            if self.getDataByUrl():
                continue
            # 2. Anti-scraping delay between requests.
            time.sleep(5)
            self.__detail()    # crawl the job-description detail page
            self.insertData()  # persist the record to the database
Example No. 2
0
 def __main(self, url):
     """Crawl the listing page at *url* and insert each matched entry.

     Returns False when the page could not be fetched after one retry;
     otherwise returns None.
     """
     util.logger.warning("正在爬取%s" % url)
     resp = util.get(url)
     # One retry on failure, then give up.
     if resp[0] == 0:
         resp = util.get(url)
     if resp[0] == 0:
         return False
     # Flatten the page so the regex can match across line breaks.
     page = resp[1].decode()
     for ch in ('\n', '\r', '\t'):
         page = page.replace(ch, '')
     matches = re.findall(
         '<li><h4>(.*?)</h4><a href=(.*?)target="_blank">(.*?)</a></li>',
         page)
     for date_text, link, title in matches:
         self.date = date_text
         self.url = link
         self.title = title
         # 1. Dedupe: skip entries whose url is already stored.
         if self.getDataByUrl() == True:
             continue
         # 2. Pause between requests to avoid anti-scraping throttling.
         time.sleep(5)
         self.insertData()
Example No. 3
0
 def __main(self, url, parms):
     """Crawl one JSON API page and insert each job record found.

     Returns False when the endpoint could not be fetched after one retry;
     otherwise returns None.
     """
     util.logger.warning("正在爬取%s" % url)
     resp = util.get(url, params=parms)
     # If nothing came back, retry once; stop with False on a second miss.
     if resp[0] == 0:
         resp = util.get(url, params=parms)
     if resp[0] == 0:
         return False
     # JSON payload layout captured from the site's network traffic.
     records = json.loads(resp[1].decode())["data"]["results"]
     for rec in records:
         self.posName = rec["jobName"]
         self.url = rec["positionURL"]
         self.salary = rec["salary"]
         self.workExp = rec["workingExp"]["name"]
         self.edu = rec["eduLevel"]["name"]
         self.company = rec["company"]["name"]
         # 1. Dedupe on the url field.
         if self.getDataByUrl() == True:
             continue
         # 2. Anti-scraping delay.
         time.sleep(2)
         self.__detail()    # crawl the job-description detail page
         self.insertData()  # persist the record to the database
Example No. 4
0
 def __main(self, url):
     """Crawl one zhipin.com listing page and store every job row found.

     Returns False when the page could not be fetched after one retry;
     otherwise returns None.
     """
     util.logger.warning("正在爬取%s" % url)
     r = util.get(url, headers=self.__headers)
     if r[0] == 0:
         # BUG FIX: the retry previously called util.get(url) without
         # headers=self.__headers, so the fallback request was sent without
         # the headers the first request needed. Keep them on the retry.
         r = util.get(url, headers=self.__headers)
     if r[0] == 0:
         return False
     # Flatten newlines/tabs so the regex matches across line breaks.
     body = r[1].decode().replace('\n', '').replace('\r',
                                                    '').replace('\t', '')
     result = re.findall(
         'class="job-primary">.*?href="(.*?)".*?job-title">(.*?)</div>.*?"red">(.*?)</span>.*?</em>(.*?)<em.*?/em>(.*?)</p>.*?blank">(.*?)</a>',
         body)
     # An empty result list simply skips the loop; no guard needed.
     for item in result:
         self.posName = item[1]
         # Listing hrefs are relative; prefix the site root.
         self.url = "https://www.zhipin.com%s" % item[0]
         self.salary = item[2]
         self.workExp = item[3]
         self.edu = item[4]
         self.company = item[5]
         # 1. Dedupe on url: skip rows already stored.
         if self.getDataByUrl():
             continue
         # 2. Anti-scraping delay between requests.
         time.sleep(2)
         self.__detail()    # crawl the job-description detail page
         self.insertData()  # persist the record to the database
Example No. 5
0
 def __detail(self):
     """Fetch the detail page at self.url and capture its content block.

     Returns False when the page could not be fetched after one retry;
     otherwise stores the matched fragment on self.body (if any).
     """
     util.logger.warning("正在爬取明细页面%s" % self.url)
     resp = util.get(self.url, headers=self.__headers)
     if resp[0] == 0:
         # One retry on failure, then give up.
         resp = util.get(self.url, headers=self.__headers)
     if resp[0] == 0:
         return False
     page = resp[1].decode()
     for ch in ('\n', '\r', '\t'):
         page = page.replace(ch, '')
     # The regex may fail to match; only assign when it found something.
     found = re.findall('class="detail_content">(.*?)<script>', page)
     if found:
         self.body = found[0]
Example No. 6
0
 def __detail(self):
     """Crawl the job-description page at self.url.

     Returns False when the page could not be fetched after one retry;
     otherwise stores the matched fragment on self.detail (if any).
     """
     util.logger.warning("正在爬取明细页面%s" % self.url)
     resp = util.get(self.url)
     if resp[0] == 0:
         # One retry on failure, then give up.
         resp = util.get(self.url)
     if resp[0] == 0:
         return False
     page = resp[1].decode()
     for ch in ('\n', '\r', '\t'):
         page = page.replace(ch, '')
     # Extraction can miss; only assign on a successful match.
     found = re.findall('class="pos-ul">(.*?)</div>', page)
     if found:
         self.detail = found[0]
Example No. 7
0
 def __detail(self):
     """Fetch the detail page at self.url and capture the description text.

     Returns False when the page could not be fetched after one retry;
     otherwise stores the matched fragment on self.detail (if any).
     """
     util.logger.warning("正在爬取明细页面%s" % self.url)
     resp = util.get(self.url, headers=self.__headers)
     if resp[0] == 0:
         # One retry on failure, then give up.
         resp = util.get(self.url, headers=self.__headers)
     if resp[0] == 0:
         return False
     page = resp[1].decode()
     for ch in ('\n', '\r', '\t'):
         page = page.replace(ch, '')
     # Extraction can miss; only assign on a successful match.
     found = re.findall('<h3>职位描述</h3>(.*?)</div>', page)
     if found:
         self.detail = found[0]
Example No. 8
0
    def __main(self, url):
        """Crawl the listing page at *url* and insert each job row found.

        Work in progress: earlier regex and xpath extraction attempts are
        kept below as commented-out code; the live path parses with
        BeautifulSoup. Dedupe and detail-page crawling are currently
        disabled (commented out), and debug prints are still active.
        Returns False when the page could not be fetched after one retry.
        """
        util.logger.warning("正在爬取%s" % url)
        r = util.get(url)
        # If the first fetch yielded nothing, retry once; return False if
        # the second attempt also fails.
        if r[0] == 0:
            r = util.get(url)
        if r[0] == 0:
            return False
        # Data layout captured from the page; see the browser network panel.
        # body = r[1].decode().replace('\n', '').replace('\r', '').replace('\t', '')
        # result = re.findall('<div class="job_name clearfix">.*?<a href="(.*?)".*?class="name">(.*?)</span></a>.*?="job_salary">(.*?)<i class="unit">.*?title="(.*?)"></a>.*?class="xueli">(.*?)</span>',body)
        # print(result)

        # Earlier xpath attempt: //p[@class="job_salary"]
        # body=r[1].decode()
        # bodytree = etree.HTML(body)
        # result=bodytree.xpath("//p[@class='job_salary']/text()")

        body = r[1].decode()
        soup = BeautifulSoup(body, 'lxml')
        # result = soup.select("div.job_name > a > span.name ")
        result = soup.find_all("div", class_="job_name")
        # result1 = soup.find_all("div", class_="job_comp")
        # print(result1)
        if len(result) > 0:
            for item in result:
                self.posName = item.text
                # NOTE(review): next_sibling.next assumes the salary node
                # directly follows the job_name div — confirm against the
                # live page markup.
                self.salary = item.next_sibling.next
                # self.url=item.contents[0].attrs["href"]
                print(self.salary)
                print(self.posName)
                # if len(result1) > 0:
                # for item1 in result1:
                #     self.company=item1.contents[2].text
                #     self.edu=item1.contents[4].contents[2].next
                #     print(self.company)
                #     print(self.edu)

                # 1. Dedupe on url address (currently disabled)
                #  rs = self.getDataByUrl()
                #  if rs == True:
                #      continue
                # 2. Anti-scraping delay
                time.sleep(2)
                # self.__detail()  # crawl the job description (disabled)
                self.insertData()  # insert the record into the database